blob: f712e234fe491761fc5c63481ea65eed2a003507 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/Importing/Crawler/Web,SMILA/Documentation/Importing/Concept,SMILA/Documentation/Importing/DeltaCheck,SMILA/Documentation/Importing/SimpleCompoundExtractorService,SMILA/Documentation/TaskGenerators" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/Importing/Crawler/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/Importing/Crawler/Web - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/Importing/Crawler/Web";
var wgTitle = "SMILA/Documentation/Importing/Crawler/Web";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "35060";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "307873";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<link rel="stylesheet" type="text/css" href="Web.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_Importing_Crawler_Web">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Importing/Crawler/Web">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type='submit' onclick="this.submit();" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' onclick="this.submit();" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Importing/Crawler/Web">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Importing/Crawler/Web">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/Web&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/Web&amp;oldid=307873">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="Web.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Importing/Crawler/Web&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/Web&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/Web&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Importing/Crawler/Web"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/Importing/Crawler/Web</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../../../SMILA.html" title="SMILA">SMILA</a> | <a href="../../../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="Web.html#column-one">navigation</a>, <a href="Web.html#searchInput">search</a></div> <!-- start content -->
<p>The WebCrawler, WebFetcher and WebExtractor workers are used for importing files from a web server. For the big picture and the workers' interaction, have a look at the <a href="../Concept.html" title="SMILA/Documentation/Importing/Concept"> Importing Concept</a>.
</p>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="Web.html#Web_Crawler_Worker"><span class="tocnumber">1</span> <span class="toctext">Web Crawler Worker</span></a>
<ul>
<li class="toclevel-2"><a href="Web.html#Filter_patterns_and_normalization"><span class="tocnumber">1.1</span> <span class="toctext">Filter patterns and normalization</span></a></li>
<li class="toclevel-2"><a href="Web.html#Configuration"><span class="tocnumber">1.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="Web.html#Configuring_a_proxy"><span class="tocnumber">1.2.1</span> <span class="toctext">Configuring a proxy</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="Web.html#Internal_structure"><span class="tocnumber">1.3</span> <span class="toctext">Internal structure</span></a>
<ul>
<li class="toclevel-3"><a href="Web.html#Scaling"><span class="tocnumber">1.3.1</span> <span class="toctext">Scaling</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="Web.html#Implementation_details"><span class="tocnumber">1.4</span> <span class="toctext">Implementation details</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="Web.html#Web_Fetcher_Worker"><span class="tocnumber">2</span> <span class="toctext">Web Fetcher Worker</span></a></li>
<li class="toclevel-1"><a href="Web.html#Web_Extractor_Worker"><span class="tocnumber">3</span> <span class="toctext">Web Extractor Worker</span></a></li>
<li class="toclevel-1"><a href="Web.html#Sample_web_crawl_job"><span class="tocnumber">4</span> <span class="toctext">Sample web crawl job</span></a></li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Web_Crawler_Worker"></a><h3> <span class="mw-headline"> Web Crawler Worker </span></h3>
<ul><li> Worker name: <tt>webCrawler</tt>
</li><li> Parameters:
<ul><li> <tt>dataSource</tt>: <i>(req.)</i> name of data source, used only to mark produced records currently.
</li><li> <tt>startUrl</tt>: <i>(req.)</i> URL to start crawling at. Must be a valid URL, no additional escaping is done.
</li><li> <tt>waitBetweenRequests</tt>: <i>(opt.)</i> long value in milliseconds on how long to wait between HTTP requests (default: 0).
</li><li> <tt>linksPerBulk</tt>: <i>(opt.)</i> number of links in one bulk object for follow-up tasks (default: 10)
</li><li> <tt>filters</tt>: <i>(opt.)</i> A map containing filter settings, i.e. instructions which links to include or exclude from the crawl. This parameter is optional.
<ul><li> <tt>maxCrawlDepth</tt>: the maximum crawl depth when following links.
</li><li> <tt>followRedirects</tt>: whether to follow redirects or not (default: false).
</li><li> <tt>maxRedirects</tt>: maximum number of allowed redirects when following redirects is enabled (default: 1).
</li><li> <tt>urlPatterns</tt>: regex patterns for filtering crawled elements on the basis of their URL
<ul><li> <tt>include</tt>: if include patterns are specified, at least one of them must match the URL. If no include patterns are specified, this is handled as if all URLs are included.
</li><li> <tt>exclude</tt>: if at least one exclude pattern matches the URL, the crawled element is filtered out
</li></ul>
</li></ul>
</li><li> <tt>mapping</tt> <i>(req.)</i> specifies how to map link properties to record attributes
<ul><li> <tt>httpUrl</tt> <i>(req.)</i> mapping attribute for the URL
</li><li> <tt>httpMimetype</tt> <i>(opt.)</i> mapping attribute for the mime type
</li><li> <tt>httpCharset</tt> <i>(opt.)</i> mapping attribute for character set
</li><li> <tt>httpContenttype</tt> <i>(opt.)</i> mapping attribute for the content type
</li><li> <tt>httpLastModified</tt> <i>(opt.)</i> mapping attribute for the link's last modified date
</li><li> <tt>httpSize</tt> <i>(opt.)</i> mapping attribute for the link content's size (in bytes)
</li><li> <tt>httpContent</tt> <i>(opt.)</i> attachment name where the link content is written to
</li></ul>
</li></ul>
</li><li> Task generator: <tt><a href="../../TaskGenerators.html#RunOnceTriggerTaskGenerator" title="SMILA/Documentation/TaskGenerators">runOnceTrigger</a></tt>
</li><li> Input slots:
<ul><li> <tt>linksToCrawl</tt>: Records describing links to crawl.
</li></ul>
</li><li> Output slots:
<ul><li> <tt>linksToCrawl</tt>: Records describing outgoing links from the crawled resources. Should be connected to the same bucket as the input slot.
</li><li> <tt>crawledRecords</tt>: Records describing crawled resources. For resources of mimetype <tt>text/html</tt> the records have the content attached. For other resources, use a webFetcher worker later in the workflow to get the content.
</li></ul>
</li></ul>
<a name="Filter_patterns_and_normalization"></a><h5> <span class="mw-headline"> Filter patterns and normalization </span></h5>
<p>When defining filter patterns, keep in mind that URLs are normalized <i>before</i> filters are applied. Normalization means:
</p>
<ul><li> the URL will be made absolute when it's relative (e.g. /relative/link -&gt; <a href="http://my.domain.de/relative/link" class="external free" title="http://my.domain.de/relative/link" rel="nofollow">http://my.domain.de/relative/link</a>)
</li><li> paths will be normalized (e.g. host/path/../path2 -&gt; host/path2)
</li><li> scheme and host will be converted to lower case (e.g. HTTP://WWW.Host.de/Path -&gt; <a href="http://www.host.de/Path" class="external free" title="http://www.host.de/Path" rel="nofollow">http://www.host.de/Path</a>)
<ul><li> <i>Hint: The path will not be converted to lower case!</i>
</li></ul>
</li><li> fragments will be removed (e.g. host/path#fragment -&gt; host/path)
</li><li> the default port 80 will be removed (e.g. host:80 -&gt; host)
</li><li> 'opaque' URIs can not be handled and will be filtered out automatically (e.g. javascript:void(0), <a href="mailto:andreas.weber@empolis.com" class="external free" title="mailto:andreas.weber@empolis.com" rel="nofollow">mailto:andreas.weber@empolis.com</a>)
</li></ul>
<a name="Configuration"></a><h4> <span class="mw-headline"> Configuration </span></h4>
<p>The configuration directory <tt>org.eclipse.smila.importing.crawler.web</tt> contains the configuration file <tt>webcrawler.properties</tt>.
</p><p>The configuration file can contain the following properties:
</p>
<ul><li> proxyHost (default: none)
</li><li> proxyPort (default: 80)
</li><li> socketTimeout (default: none, i.e. no socket timeout)
</li></ul>
<p>The configuration properties <tt>proxyHost</tt> and <tt>proxyPort</tt> are used to define a proxy for the web crawler (i.e. the <tt>DefaultFetcher</tt> class uses these settings to configure its HTTP client), whereas the <tt>socketTimeout</tt> parameter defines the fetcher's socket timeout while retrieving data from the server. If you omit the <tt>socketTimeout</tt> parameter, the fetcher will set no timeout.
</p>
<a name="Configuring_a_proxy"></a><h5> <span class="mw-headline"> Configuring a proxy </span></h5>
<p>You can configure the proxy the web crawler should use by defining the proxy in the configuration file (see above). E.g. to set up the web crawler to use a proxy at proxy-host:3128, use the following configuration:
</p>
<pre>
proxyHost=proxy-host
proxyPort=3128
</pre>
<p>Alternatively you can also use the JRE system properties <tt>http.proxyHost</tt> and <tt>http.proxyPort</tt> (see <a href="http://docs.oracle.com/javase/7/docs/technotes/guides/net/proxies.html" class="external free" title="http://docs.oracle.com/javase/7/docs/technotes/guides/net/proxies.html" rel="nofollow">http://docs.oracle.com/javase/7/docs/technotes/guides/net/proxies.html</a> for more information on proxy system properties).
</p>
<a name="Internal_structure"></a><h4> <span class="mw-headline"> Internal structure </span></h4>
<p>To make it easier to extend and improve the web crawler it is divided internally into components. Each of them is a single OSGi service that handles one part of the crawl functionality and can be exchanged individually to improve a single part of the functionality. The architecture looks like this:
</p><p><a href="http://wiki.eclipse.org/Image:SMILA-Importing-Web-Crawler-Internal.png" class="image" title="Image:SMILA-Importing-Web-Crawler-Internal.png"><img alt="Image:SMILA-Importing-Web-Crawler-Internal.png" src="http://wiki.eclipse.org/images/5/5c/SMILA-Importing-Web-Crawler-Internal.png" width="960" height="720" border="0" /></a>
</p><p>The WebCrawler worker is started with an input bulk that contains records with URLs to crawl. (The exception to this rule is the start of the crawl process where it gets a task without an input bulk, which causes it to generate an input record from its configured <tt>startUrl</tt> parameter). Then the components are executed like this:
</p>
<ul><li> First a <tt>VisitedLinksService</tt> is asked if this link was already crawled by someone else in this crawl job run. If so, the record is just dropped and no output is produced. Otherwise the link is marked as visited in the <tt>VisitedLinksService</tt> and processing goes on.
</li><li> The <tt>Fetcher</tt> is called to get the metadata (e.g. the mime type). If the mime type of the resource is suitable for link extraction, the Fetcher also gets the content. Otherwise the content will only be fetched in the WebFetcher worker later in the crawl workflow to save IO load.
</li><li> If the content of the resource was fetched, the <tt>LinkExtractor</tt> is called to extract outgoing links (e.g. look for &lt;A&gt; tags). It can produce multiple link records containing one absolute outgoing URL each.
</li><li> If outgoing links were found the current crawl depth is checked and if a maximum crawl depth is configured for this job and it is exceeded the links are discarded. The current crawl depth is stored in each link record (using the attribute _crawlDepth).
</li><li> The <tt>LinkFilter</tt> is called next to remove links that should not be followed (e.g. because they are on a different site) or remove duplicates.
</li><li> In a last step the <tt>RecordProducer</tt> is called to decide how the processed record should be written to the <tt>recordBulks</tt> output bulk. The producer could modify the records or split them into multiple records, if necessary for the use case.
</li></ul>
<a name="Scaling"></a><h5> <span class="mw-headline"> Scaling </span></h5>
<p>Outgoing links are separated into multiple bulks to improve scaling: The outgoing links from the initial task that crawls the <tt>startUrl</tt> will each be written to a separate bulk, while outgoing links from later tasks will be written to separate bulks according to the <tt>linksPerBulk</tt> parameter. The outgoing crawled records are divided into bulks of 100 records at most.
</p>
<a name="Implementation_details"></a><h4> <span class="mw-headline"> Implementation details </span></h4>
<ul><li> <tt>ObjectStoreVisitedLinksService</tt> (implements <tt>VisitedLinksService</tt>): Uses the <tt>ObjectStoreService</tt> to store which links have been visited, similar to the <tt><a href="../DeltaCheck.html#ObjectStoreDeltaService" title="SMILA/Documentation/Importing/DeltaCheck">ObjectStoreDeltaService</a></tt>. It uses a configuration file with the same properties in the same configuration directory, but named <tt>visitedlinksstore.properties</tt>.
</li><li> <tt>DefaultFetcher</tt>: Uses a GET request to read the URL. Currently, authentication is not supported. Writes content to attachment <tt>httpContent</tt>, if the resource is of mimetype <tt>text/html</tt> and sets the following attributes:
<ul><li> <tt>httpSize</tt>: value from HTTP header <tt>Content-Length</tt> (-1, if not set), as a Long value.
</li><li> <tt>httpContenttype</tt>: value from HTTP header <tt>Content-Type</tt>, if set.
</li><li> <tt>httpMimetype</tt>: mimetype part of HTTP header <tt>Content-Type</tt>, if set.
</li><li> <tt>httpCharset</tt>: charset part of HTTP header <tt>Content-Type</tt>, if set.
</li><li> <tt>httpLastModified</tt>: value from HTTP header <tt>Last-Modified</tt>, if set, as a DateTime value.
</li><li> <tt>_isCompound</tt>: set to <tt>true</tt> for resources that are identified as extractable compound objects by the running CompoundExtractor service.
</li></ul>
</li><li> <tt>DefaultRecordProducer</tt>: Sets the record source and calculates the <tt>_deltaHash</tt> value for the DeltaChecker worker (first wins):
<ul><li> if content is attached, calculate a digest.
</li><li> if <tt>httpLastModified</tt> attribute is set, use it as the hash.
</li><li> if <tt>httpSize</tt> attribute is set, concatenate value of <tt>httpMimetype</tt> attribute and use it as hash
</li><li> if nothing works, create a UUID to force updating.
</li></ul>
</li><li> <tt>DefaultLinkExtractor</tt> (implements <tt>LinkExtractor</tt>): Simple link extraction from HTML <tt>&lt;A href="..."&gt;</tt> tags using the tagsoup HTML parser.
</li><li> <tt>DefaultLinkFilter</tt>: Links are normalized (e.g. fragment parts from URLs ("#...") are removed) and filtered against the specified filter configuration.
</li><li> The internal attribute <tt>_crawlDepth</tt> is used to track the crawl depth of each link to support checking the crawl depth with the <tt>maxCrawlDepth</tt> filter.
</li></ul>
<a name="Web_Fetcher_Worker"></a><h3> <span class="mw-headline"> Web Fetcher Worker </span></h3>
<ul><li> Worker name: <tt>webFetcher</tt>
</li><li> Parameters:
<ul><li> <tt>waitBetweenRequests</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>filters</tt>:
<ul><li> <tt>followRedirects</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>maxRedirects</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>urlPatterns</tt>: <i>(opt., see Web Crawler) applied to resulting URL of a redirect</i>
<ul><li> <tt>include</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>exclude</tt>: <i>(opt., see Web Crawler)</i>
</li></ul>
</li></ul>
</li><li> <tt>mapping</tt> <i>(req., see Web Crawler)</i>
<ul><li> <tt>httpUrl</tt> <i>(req.)</i> to read the attribute that contains the URL where to fetch the content
</li><li> <tt>httpContent</tt> <i>(req.)</i> attachment name where the file content is written to
</li><li> <tt>httpMimetype</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpCharset</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpContenttype</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpLastModified</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpSize</tt> <i>(opt., see Web Crawler)</i>
</li></ul>
</li></ul>
</li><li> Input slots:
<ul><li> <tt>linksToFetch</tt>: Records describing crawled resources, with or without the content of the resource.
</li></ul>
</li><li> Output slots:
<ul><li> <tt>fetchedLinks</tt>: The incoming records with the content of the resource attached.
</li></ul>
</li></ul>
<p>The fetcher tries to get the content of a web resource identified by attribute <tt>httpUrl</tt>, if attachment <tt>httpContent</tt> is not yet set. Like the <tt>DefaultFetcher</tt> above it does not do authentication to read the resource.
</p>
<a name="Web_Extractor_Worker"></a><h3> <span class="mw-headline"> Web Extractor Worker </span></h3>
<ul><li> Worker name: <tt>webExtractor</tt>
</li><li> Parameters:
<ul><li> <tt>filters</tt>: <i>(opt., see Web Crawler)</i>
<ul><li> <tt>followRedirects</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>maxRedirects</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>urlPatterns</tt>: <i>(opt., see Web Crawler)</i>
<ul><li> <tt>include</tt>: <i>(opt., see Web Crawler)</i>
</li><li> <tt>exclude</tt>: <i>(opt., see Web Crawler)</i>
</li></ul>
</li></ul>
</li><li> <tt>mapping</tt> <i>(req., see Web Crawler)</i>
<ul><li> <tt>httpUrl</tt> <i>(req., see Web Crawler)</i> URLs of compounds have the compound link as prefix, e.g. <tt><a href="http://example.com/compound.zip/compound-element.txt" class="external free" title="http://example.com/compound.zip/compound-element.txt" rel="nofollow">http://example.com/compound.zip/compound-element.txt</a></tt>
</li><li> <tt>httpMimetype</tt> <i>(req., see Web Crawler)</i>
</li><li> <tt>httpCharset</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpContenttype</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpLastModified</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpSize</tt> <i>(opt., see Web Crawler)</i>
</li><li> <tt>httpContent</tt> <i>(opt., see Web Crawler)</i>
</li></ul>
</li></ul>
</li><li> Input slots:
<ul><li> <tt>compounds</tt>
</li></ul>
</li><li> Output slots:
<ul><li> <tt>files</tt>
</li></ul>
</li></ul>
<ul><li> Dependency: <a href="../SimpleCompoundExtractorService.html" class="mw-redirect" title="SMILA/Documentation/Importing/SimpleCompoundExtractorService">CompoundExtractor service</a>
</li></ul>
<p>For each input record, an input stream to the described web resource is created and fed into the CompoundExtractor service. The produced records are converted to look like records produced by the file crawler. Additional internal attributes that are set:
</p>
<ul><li> <tt>_deltaHash</tt>: computed as in the WebCrawler worker
</li><li> <tt>_compoundRecordId</tt>: record ID of top-level compound this element was extracted from
</li><li> <tt>_isCompound</tt>: set to <tt>true</tt> for elements that are compounds themselves.
</li><li> <tt>_compoundPath</tt>: sequence of <tt>httpUrl</tt> attribute values of the compound objects needed to navigate to the compound element.
</li></ul>
<p>The crawler attributes <tt>httpContenttype</tt>, <tt>httpMimetype</tt> and <tt>httpCharset</tt> are currently not set by the WebExtractor worker.
</p><p>If the element is not a compound itself, its content is added as attachment <tt>httpContent</tt>.
</p>
<a name="Sample_web_crawl_job"></a><h3> <span class="mw-headline"> Sample web crawl job </span></h3>
<p>Job definition for crawling from start URL "<a href="../../../../SMILA.html" class="external free" title="http://wiki.eclipse.org/SMILA" rel="nofollow">http://wiki.eclipse.org/SMILA</a>", pushing the imported records to job "indexUpdateJob". An include pattern is defined to make sure that we only crawl URLs from "below" our start URL.
</p>
<pre>
{
&quot;name&quot;:&quot;crawlWebJob&quot;,
&quot;workflow&quot;:&quot;webCrawling&quot;,
&quot;parameters&quot;:{
&quot;tempStore&quot;:&quot;temp&quot;,
&quot;dataSource&quot;:&quot;web&quot;,
&quot;startUrl&quot;:&quot;http://wiki.eclipse.org/SMILA&quot;,
&quot;jobToPushTo&quot;:&quot;indexUpdateJob&quot;,
&quot;waitBetweenRequests&quot;: 100,
&quot;mapping&quot;:{
&quot;httpContent&quot;:&quot;Content&quot;,
&quot;httpUrl&quot;:&quot;Path&quot;
},
&quot;filters&quot;:{
&quot;urlPatterns&quot;:{
&quot;include&quot;:[&quot;http://wiki\\.eclipse\\.org/SMILA/.*&quot;]
}
}
}
}
</pre>
<!--
NewPP limit report
Preprocessor node count: 141/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:35060-0!1!0!!en!2!edit=0 and timestamp 20120710093518 -->
<div class="printfooter">
Retrieved from "<a href="Web.html">http://wiki.eclipse.org/SMILA/Documentation/Importing/Crawler/Web</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 19:32, 20 June 2012 by <a href="http://wiki.eclipse.org/User:Igor.novakovic.empolis.com" title="User:Igor.novakovic.empolis.com">Igor Novakovic</a>. Based on work by <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a>, <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.attensity.com&amp;action=edit" class="new" title="User:Nadine.auslaender.attensity.com"> </a> and <a href="http://wiki.eclipse.org/index.php?title=User:Andreas.weber.attensity.com&amp;action=edit" class="new" title="User:Andreas.weber.attensity.com">Andreas Weber</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/Web&amp;action=credits" title="SMILA/Documentation/Importing/Crawler/Web">others</a>.</p>
<p id="footerviews">This page has been accessed 1,531 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.055 secs. --></body></html>