blob: 937add7cb1faeb455820eec11f1200312549acb4 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/Web Crawler,SMILA/Documentation/Crawler,SMILA/Documentation/Filesystem Crawler,SMILA/Documentation/JDBC Crawler" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/Web Crawler - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
// Page-level global variables for this wiki page. They are declared inline so
// that scripts included later in <head> can read them.
// NOTE(review): presumably emitted by the wiki software and consumed by
// wikibits.js / ajax.js - confirm before renaming or removing any of them.

// Skin name and URL path settings.
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1"; // presumably "$1" is a placeholder for the page name - TODO confirm
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";

// Identity of the page being shown.
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/Web_Crawler"; // underscore form of the title
var wgTitle = "SMILA/Documentation/Web Crawler"; // same title written with a space
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "15257"; // numeric id stored as a string
var wgIsArticle = true;

// Viewer information; null here, and the page header offers a "Log in" link.
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";

// Revision, version and feature flags.
var wgBreakFrames = false;
var wgCurRevisionId = "285987"; // same value as the oldid in the "Permanent link" URL
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="Web_Crawler.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_Web_Crawler">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Web_Crawler">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<!-- Header search form. Both buttons are plain submit inputs and rely on the
     browser's native form submission. No inline onclick handlers are used:
     calling this.submit() on an input element fails because only the form
     element has a submit() method. -->
<form action="http://wiki.eclipse.org/Special:Search">
<!-- id="searchInput" is the target of the "Jump to: search" link in the page body. -->
<input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" title="Search Eclipsepedia" />
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Web_Crawler">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Web_Crawler">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Web_Crawler&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Web_Crawler&amp;oldid=285987">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="Web_Crawler.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/Talk:SMILA/Documentation/Web_Crawler"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Web_Crawler&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Web_Crawler&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Web&#32;Crawler"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/Web Crawler</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="Web_Crawler.html#column-one">navigation</a>, <a href="Web_Crawler.html#searchInput">search</a></div> <!-- start content -->
<div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;">
<div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Note.png" class="image" title="Note.png"><img alt="" src="http://wiki.eclipse.org/images/c/cc/Note.png" width="35" height="35" border="0" /></a></div>
<div><b>This is deprecated for SMILA 1.0. The connectivity framework is still functional but is intended to be replaced by scalable import based on SMILA's job management.</b><br /></div>
</div>
<p><br />
</p>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="Web_Crawler.html#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a></li>
<li class="toclevel-1"><a href="Web_Crawler.html#Crawling_configuration"><span class="tocnumber">2</span> <span class="toctext">Crawling configuration</span></a></li>
<li class="toclevel-1"><a href="Web_Crawler.html#Crawling_configuration_explanation"><span class="tocnumber">3</span> <span class="toctext">Crawling configuration explanation</span></a></li>
<li class="toclevel-1"><a href="Web_Crawler.html#Crawling_configuration_example"><span class="tocnumber">4</span> <span class="toctext">Crawling configuration example</span></a>
<ul>
<li class="toclevel-2"><a href="Web_Crawler.html#Minimal_configuration_example"><span class="tocnumber">4.1</span> <span class="toctext">Minimal configuration example</span></a></li>
<li class="toclevel-2"><a href="Web_Crawler.html#Html_form_login_example"><span class="tocnumber">4.2</span> <span class="toctext">Html form login example</span></a></li>
<li class="toclevel-2"><a href="Web_Crawler.html#Multiple_website_configuration"><span class="tocnumber">4.3</span> <span class="toctext">Multiple website configuration</span></a></li>
<li class="toclevel-2"><a href="Web_Crawler.html#Complex_website_configuration_example"><span class="tocnumber">4.4</span> <span class="toctext">Complex website configuration example</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="Web_Crawler.html#Output_example_for_default_configuration"><span class="tocnumber">5</span> <span class="toctext">Output example for default configuration</span></a></li>
<li class="toclevel-1"><a href="Web_Crawler.html#Additional_performance_counters"><span class="tocnumber">6</span> <span class="toctext">Additional performance counters</span></a></li>
<li class="toclevel-1"><a href="Web_Crawler.html#See_also"><span class="tocnumber">7</span> <span class="toctext">See also</span></a></li>
<li class="toclevel-1"><a href="Web_Crawler.html#External_links"><span class="tocnumber">8</span> <span class="toctext">External links</span></a></li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Overview"></a><h2> <span class="mw-headline"> Overview </span></h2>
<p>The Web crawler fetches data from HTTP servers. Starting with an initial URL, it crawls all linked websites recursively.
</p>
<a name="Crawling_configuration"></a><h2> <span class="mw-headline"> Crawling configuration </span></h2>
<p>The example configuration file is located at <tt>configuration/org.eclipse.smila.connectivity.framework/web.xml</tt>
</p><p>Defining Schema: <tt>org.eclipse.smila.connectivity.framework.crawler.web/schemas/WebDataSourceConnectionConfigSchema.xsd</tt>
</p>
<a name="Crawling_configuration_explanation"></a><h2> <span class="mw-headline"> Crawling configuration explanation </span></h2>
<p>See <a href="Crawler.html#Configuration" title="SMILA/Documentation/Crawler">SMILA/Documentation/Crawler#Configuration</a> for the generic parts of the configuration file.
</p><p>The root element of the configuration is <tt>DataSourceConnectionConfig</tt> and contains the following sub elements:
</p>
<ul><li> <tt>DataSourceID</tt> – the identification of a data source.
</li><li> <tt>SchemaID</tt> – specify the schema for a crawler job.
</li><li> <tt>DataConnectionID</tt> – describes which agent or crawler should be used.
<ul><li> <tt>Crawler</tt> – implementation class of a crawler.
</li><li> <tt>Agent</tt> – implementation class of an agent.
</li></ul>
</li><li> <tt>CompoundHandling</tt> – specify if packed data (like a ZIP containing files) should be unpacked and the files within should be crawled (YES or NO).
</li><li> <tt>Attributes</tt> – list all attributes which describe a website.
<ul><li> <tt>Attribute</tt>:
<ul><li> attributes:
<ul><li> <tt>Type</tt> (required) – the data type (String, Integer or Date).
</li><li> <tt>Name</tt> (required) – the attribute's name.
</li><li> <tt>HashAttribute</tt> – specify if the attribute is used for the hash used for delta indexing (<i>true</i> or <i>false</i>). Must be true for at least one attribute which must always have a value.
</li><li> <tt>KeyAttribute</tt> – specify if the attribute is used for creating the record ID (<i>true</i> or <i>false</i>). Must be true for at least one attribute. All key attributes must identify the file uniquely, so usually you will set it <i>true</i> for the attribute containing <i>Url</i> FieldAttribute.
</li><li> <tt>Attachment</tt> – specify if the attribute returns the data as an attachment of the record.
</li></ul>
</li><li> sub elements:
<ul><li> <tt>FieldAttribute</tt>: Content of element is one of
<ul><li> <i>Url</i>: URL of the web page. NOTE: Must currently be mapped to an attribute named "Url". Mapping to additional attributes is allowed.
</li><li> <i>Title</i>: The title of the web page from the &lt;title&gt; tag.
</li><li> <i>Content</i>: The content of the web page. Original binary content if mapped to an attachment; otherwise an attempt is made to convert it to a string using the encoding reported in the response headers.
</li><li> <i>MimeType</i>: Mime type of website as reported in response headers.
</li></ul>
</li><li> <tt>MetaAttribute</tt>
<ul><li> sub elements <tt>MetaName</tt>: Key of value to get from metadata.
</li><li> attribute <tt>Type</tt>: one of <i>MetaData</i>, <i>ResponseHeader</i>, <i>MetaDataWithResponseHeaderFallBack</i>: read from HTML meta tags, response header or both
</li><li> attribute <tt>ReturnType</tt>: the structure in which the metadata will be returned. One of:
</li></ul>
</li></ul>
</li></ul>
</li></ul>
</li></ul>
<dl><dd><dl><dd><dl><dd><dl><dd><ul><li> <tt>MetaDataString</tt>: default structure, metadata is returned as single string, for example:
</li></ul>
</dd></dl>
</dd></dl>
</dd></dl>
</dd></dl>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;ResponseHeader&quot;</span><span class="re2">&gt;</span></span>Content-type: text/html<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span></pre></div>
<dl><dd><dl><dd><dl><dd><dl><dd><ul><li> <tt>MetaDataValue</tt>: only values of metadata are returned, for example:
</li></ul>
</dd></dl>
</dd></dl>
</dd></dl>
</dd></dl>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;ResponseHeader&quot;</span><span class="re2">&gt;</span></span>text/html<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span></pre></div>
<dl><dd><dl><dd><dl><dd><dl><dd><ul><li> <tt>MetaDataMObject</tt>: metadata is returned as MObject containing attributes with metadata names and values, for example:
</li></ul>
</dd></dl>
</dd></dl>
</dd></dl>
</dd></dl>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;Map</span> <span class="re0">key</span>=<span class="st0">&quot;ResponseHeader&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;Content-Type&quot;</span><span class="re2">&gt;</span></span>text/html<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
...
<span class="sc3"><span class="re1">&lt;/Map<span class="re2">&gt;</span></span></span></pre></div>
<ul><li> <tt>Process</tt> – this element is responsible for selecting data
<ul><li> <tt>Website</tt> - contains all important information for accessing and crawling a website.
<ul><li> <tt>ProjectName</tt> - defines project name
</li><li> <tt>Sitemaps</tt> - for supporting Google site maps. <tt>sitemap.xml</tt>, <tt>sitemap.xml.gz</tt> and <tt>sitemap.gz</tt> formats are supported. See [<a href="https://www.google.com/webmasters/tools/docs/en/protocol.html" class="external autonumber" title="https://www.google.com/webmasters/tools/docs/en/protocol.html" rel="nofollow">[1]</a>]. Links extracted from <tt>&lt;loc&gt;</tt> tags are added to the current level links. Crawler looks for the sitemap file at the root directory of the web server and then caches it for the particular host to avoid parsing the sitemap again for the URL already processed.
</li><li> <tt>Header</tt> - request headers in the format <tt>"&lt;header_name&gt;:&lt;header_content&gt;"</tt>, separated by semicolons.
</li><li> <tt>Referer</tt> - to include <tt>"Referer: URL"</tt> header in HTTP request. See [<a href="http://en.wikipedia.org/wiki/Referer" class="external autonumber" title="http://en.wikipedia.org/wiki/Referer" rel="nofollow">[2]</a>]
</li><li> <tt>EnableCookies</tt> - enable or disable cookies for crawling process (true or false). See [<a href="http://en.wikipedia.org/wiki/HTTP_cookie" class="external autonumber" title="http://en.wikipedia.org/wiki/HTTP_cookie" rel="nofollow">[3]</a>]
</li><li> <tt>UserAgent</tt> - element used to identify the crawler to the server as the specific user agent originating the request. The generated <tt>UserAgent</tt> string looks like the following: <tt>Name/Version (Description, Url, Email)</tt>
<ul><li> <tt>Name</tt> (required)
</li><li> <tt>Version</tt>
</li><li> <tt>Description</tt>
</li><li> <tt>URL</tt>
</li><li> <tt>Email</tt>
</li></ul>
</li><li> <tt>Robotstxt</tt> element used for supporting <tt>robots.txt</tt> information. The Robots Exclusion Standard tells crawler how to crawl a website – or rather which resources should not be crawled. See [<a href="http://www.robotstxt.org/" class="external autonumber" title="http://www.robotstxt.org/" rel="nofollow">[4]</a>]
<ul><li> <tt>Policy</tt>: the following policies are offered on how to deal with robots.txt rules:
<ol><li> <tt>Classic</tt>. Simply obey the robots.txt rules. Recommended unless you have special permission to collect a site more aggressively.
</li><li> <tt>Ignore</tt>. Completely ignore robots.txt rules.
</li><li> <tt>Custom</tt>. Obey your own, custom, robots.txt instead of those discovered on the relevant site. The attribute Value must contain the path to a locally available robots.txt file in this case.
</li><li> <tt>Set</tt>. Limit the robot names whose rules are followed to the given set. The Value attribute must contain the robot names, separated by semicolons, in this case.
</li></ol>
</li><li> <tt>Value</tt>: specifies the filename with the robots.txt rules for Custom policy and set of agent names for the Set policy.
</li><li> <tt>AgentNames</tt>: specifies the list of agents we advertise. This list should start with the same name as the UserAgent Name (for example: the crawler user-agent name that is used for the crawl job)
</li></ul>
</li><li> <tt>CrawlingModel</tt>: there are two models available:
<ul><li> <tt>Type</tt>: the model type (MaxBreadth or MaxDepth)
</li></ul>
<ol><li> <tt>MaxBreadth</tt>: crawling a web site through a limited number of links.
</li><li> <tt>MaxDepth</tt>: crawling a web site with specifying the maximum crawling depth.
</li></ol>
<ul><li> <tt>Value</tt>: parameter (Integer)
</li></ul>
</li><li> <tt>CrawlScope</tt>: decides for each discovered URI if it is within the scope of the current crawl.
</li><li> <tt>Type</tt>: the following scopes are provided:
<ol><li> <tt>Broad</tt>: accept all. This scope does not impose any limits on the hosts, domains, or URI paths crawled.
</li><li> <tt>Domain</tt>: accept if on same 'domain' as seeds (start URL). This scope limits discovered URIs to the set of domains defined by the provided seeds. That is, any URI discovered belonging to a domain from which one of the seeds came is within scope. Using the seed 'brox.de', a domain scope will fetch 'bugs.brox.de', 'confluence.brox.de', etc. It will fetch all discovered URIs from 'brox.de' and from any subdomain of 'brox.de'.
</li><li> <tt>Host</tt>: accept if on exact host as seeds. This scope limits discovered URIs to the set of hosts defined by the provided seeds. If the seed is <tt>'www.brox.de'</tt>, then we'll only fetch items discovered on this host. The crawler will not go to <tt>'bugs.brox.de'</tt>.
</li><li> <tt>Path</tt>: accept if on same host and a shared path-prefix as seeds. This scope goes yet further and limits the discovered URIs to a section of paths on hosts defined by the seeds. Of course any host that has a seed pointing at its root (i.e. www.sample.com/index.html) will be included in full, whereas a host whose only seed is www.sample2.com/path/index.html will be limited to URIs under /path/.
</li></ol>
<ul><li> <tt>Filters</tt>: every scope can have additional filters to select URIs that will be considered to be within or out of scope (see the Filters section for details)
</li></ul>
</li><li> <tt>CrawlLimits</tt>: In addition to limits imposed on the scope of the crawl it is possible to enforce arbitrary limits on the duration and extent of the crawling process with the following setting:
<ul><li> <tt>SizeLimits</tt>:
<ul><li> <tt>MaxBytesDownload</tt>: stop after a fixed number of bytes have been downloaded (0 means unlimited).
</li><li> <tt>MaxDocumentDownload</tt>: stop after downloading a fixed number of documents (0 means unlimited).
</li><li> <tt>MaxTimeSec</tt>: stop after a certain number of seconds have elapsed (0 means unlimited). These are not supposed to be hard limits. Once one of these limits is reached, it will trigger a graceful termination of the crawl job, which means that URIs already being crawled will be completed. As a result the set limit will be exceeded by some amount.
</li><li> <tt>MaxLengthBytes</tt>: maximum number of bytes to download per document. Will truncate file once this limit is reached.
</li></ul>
</li><li> <tt>TimeoutLimits</tt>: Whenever crawler connects to or reads from a remote host, it checks the timeouts and aborts the operation if any is exceeded. This prevents anomalous occurrences such as hanging reads or infinite connects.
<ul><li> <tt>Timeout</tt>: This limit is the total time needed to connect to and download the website, and as such represents the sum of ConnectTimeout and ReadTimeout.
</li><li> <tt>ConnectTimeout</tt>: Connect timeout in seconds. TCP connections that take longer to establish will be aborted.
</li><li> <tt>ReadTimeout</tt>: Read (and write) timeout in seconds. Reads that take longer will fail. The default value for read timeout is 900 seconds.
</li></ul>
</li><li> <tt>WaitLimits</tt>:
<ul><li> <tt>Wait</tt>: Wait the specified number of seconds between the retrievals. Use of this option is recommended, as it lightens the server load by making the requests less frequent. Specifying a large value for this option is useful if the network or the destination host is down, so that the crawler can wait long enough to reasonably expect the network error to be fixed before the retry.
</li><li> <tt>RandomWait</tt>: Some web sites may perform log analysis to identify retrieval programs by looking for statistically significant similarities in the time between requests. This option causes the time between requests to vary between 0 and 2 * wait seconds, where wait was specified using the wait setting, in order to mask crawler's presence from such analysis.
</li><li> <tt>MaxRetries</tt>: How often to retry URLs that failed.
</li><li> <tt>WaitRetry</tt>: How long to wait between such retries.
</li></ul>
</li></ul>
</li><li> <tt>Proxy</tt>: specifies the HTTP proxy server to be used.
<ul><li> <tt>ProxyServer</tt>:
<ul><li> <tt>Host</tt>
</li><li> <tt>Port</tt>
</li><li> <tt>Login</tt>
</li><li> <tt>Password</tt>
</li></ul>
</li></ul>
</li><li> <tt>Authentication</tt>: The Authentication element is used to gain access to areas of websites requiring authentication. Three types of authentication are available: RFC2617 (BASIC and DIGEST types of authentication), HTTP POST or GET of an HTML Form and SSL Certificate based client authentication.
<ul><li> <tt>RFC2617</tt>:
<ul><li> <tt>Host</tt> and
</li><li> <tt>Port</tt>: equate to the canonical root URI of RFC2617.
</li><li> <tt>Realm</tt>: realm as per RFC2617. The realm string must match exactly the realm name presented in the authentication challenge served up by the web server.
</li><li> <tt>Login</tt>: username for login.
</li><li> <tt>Password</tt>: password to this restricted area.
</li></ul>
</li><li> <tt>HTMLForm</tt>:
<ul><li> <tt>CredentialDomain</tt>: same as the canonical root URI of RFC2617.
</li><li> <tt>HttpMethod</tt>: POST or GET
</li><li> <tt>LoginUrl</tt>: relative or absolute URI to the page that the HTML Form submits to (Not the page that contains the HTML Form)
</li><li> <tt>FormItems</tt>: listing of HTML Form key/value pairs
</li></ul>
</li><li> <tt>SSLCertificate</tt>:
<ul><li> <tt>ProtocolName</tt>: name of the protocol to be used, e.g. "https".
</li><li> <tt>Port</tt>: port number
</li><li> <tt>TruststoreUrl</tt>: location of the file containing one or several trusted certificates.
</li><li> <tt>TruststorePassword</tt>
</li><li> <tt>KeystoreUrl</tt>: location of the file containing a private key/public certificate pair.
</li><li> <tt>KeystorePassword</tt>
</li></ul>
</li></ul>
</li><li> <tt>Seeds</tt>: contains a list of Seed elements
<ul><li> <tt>FollowLinks</tt>: enables analyzing URL of pages that otherwise would be ignored:
</li></ul>
<ol><li> <tt>NoFollow</tt>: do not analyze anything that matches any "Unselect" filter.
</li><li> <tt>Follow</tt>: analyze everything that matches some "Unselect" filter, do not index anything
</li><li> <tt>FollowLinksWithCorrespondingSelectFilter</tt>: index pages that match both "Select" and "Unselect" filters, and analyze everything else that matches some "Unselect" filter.
</li></ol>
<ul><li> <tt>Seed</tt>: defines the site’s start path from which the crawling process begins.
</li></ul>
</li><li> <tt>Filters</tt>: contains a list of Filter elements and optional refinements elements.
<ul><li> <tt>Filter</tt>: used to define filters for pages that should be crawled and indexed.
<ul><li> <tt>Type</tt>: the following filter types are available:
</li></ul>
<ol><li> <tt>BeginningPath</tt>: filters paths which begin with the specified characters.
</li><li> <tt>RegExp</tt>: filters urls based on a regular expression.
</li><li> <tt>ContentType</tt>: filters by content type based on a regular expression. Use this filter to abort the download of content-types other than those wanted.
</li></ol>
<ul><li> <tt>WorkType</tt>: Select or Unselect, the way the filter should work.
</li><li> <tt>Value</tt>: the filter value that will be used to check if the given value matches the filter or not.
</li></ul>
</li><li> <tt>Refinements</tt>: must be nested into the Filter element. It allows filter settings to be modified under certain circumstances. The following refinements may be applied to the filters:
</li></ul>
<ol><li> <tt>Port</tt>: match only those URIs for the given port number.
</li><li> <tt>TimeOfDay</tt>: if this refinement is applied, the filter will only be in effect between the hours specified each day. From and To attributes must be in HH:mm:ss format (e.g. 23:00:00)
<ul><li> <tt>From</tt>: time when filter becomes enabled.
</li><li> <tt>To</tt>: till this time the filter will be enabled.
</li></ul>
</li></ol>
</li><li> <tt>MetaTagFilters</tt>: contains a list of <tt>MetaTagFilter</tt> elements.
<ul><li> <tt>MetaTagFilter</tt>: defines filter for omitting content by meta tags.
<ul><li> <tt>Type</tt>: type of meta-tag to match: <tt>Name</tt> or <tt>Http-Equiv</tt>.
</li><li> <tt>Name</tt>: name of the tag e.g. "author" for the Type "Name".
</li><li> <tt>Content</tt>: the tag contents.
</li><li> <tt>WorkType</tt>: <tt>Select</tt> or <tt>Unselect</tt>
</li></ul>
</li></ul>
</li></ul>
</li></ul>
</li></ul>
<a name="Crawling_configuration_example"></a><h2> <span class="mw-headline"> Crawling configuration example </span></h2>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;DataSourceConnectionConfig</span>
<span class="re0">xmlns:xsi</span>=<span class="st0">&quot;http://www.w3.org/2001/XMLSchema-instance&quot;</span>
<span class="re0">xsi:noNamespaceSchemaLocation</span>=<span class="st0">&quot;../org.eclipse.smila.connectivity.framework.crawler.web/schemas/WebDataSourceConnectionConfigSchema.xsd&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;DataSourceID<span class="re2">&gt;</span></span></span>web<span class="sc3"><span class="re1">&lt;/DataSourceID<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;SchemaID<span class="re2">&gt;</span></span></span>org.eclipse.smila.connectivity.framework.crawler.web<span class="sc3"><span class="re1">&lt;/SchemaID<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;DataConnectionID<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Crawler<span class="re2">&gt;</span></span></span>WebCrawlerDS<span class="sc3"><span class="re1">&lt;/Crawler<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/DataConnectionID<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundHandling<span class="re2">&gt;</span></span></span>No<span class="sc3"><span class="re1">&lt;/CompoundHandling<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Url&quot;</span> <span class="re0">KeyAttribute</span>=<span class="st0">&quot;true&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FieldAttribute<span class="re2">&gt;</span></span></span>Url<span class="sc3"><span class="re1">&lt;/FieldAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Title&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FieldAttribute<span class="re2">&gt;</span></span></span>Title<span class="sc3"><span class="re1">&lt;/FieldAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Content&quot;</span> <span class="re0">HashAttribute</span>=<span class="st0">&quot;true&quot;</span> <span class="re0">Attachment</span>=<span class="st0">&quot;true&quot;</span> <span class="re0">MimeTypeAttribute</span>=<span class="st0">&quot;Content&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FieldAttribute<span class="re2">&gt;</span></span></span>Content<span class="sc3"><span class="re1">&lt;/FieldAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;MimeType&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FieldAttribute<span class="re2">&gt;</span></span></span>MimeType<span class="sc3"><span class="re1">&lt;/FieldAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;MetaData&quot;</span> <span class="re0">Attachment</span>=<span class="st0">&quot;false&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;MetaAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;MetaData&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;ResponseHeader&quot;</span> <span class="re0">Attachment</span>=<span class="st0">&quot;false&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;MetaAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;ResponseHeader&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;MetaName<span class="re2">&gt;</span></span></span>Date<span class="sc3"><span class="re1">&lt;/MetaName<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;MetaName<span class="re2">&gt;</span></span></span>Server<span class="sc3"><span class="re1">&lt;/MetaName<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/MetaAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;MetaDataWithResponseHeaderFallBack&quot;</span> <span class="re0">Attachment</span>=<span class="st0">&quot;false&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;MetaAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;MetaDataWithResponseHeaderFallBack&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Process<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;WebSite</span> <span class="re0">ProjectName</span>=<span class="st0">&quot;Example Crawler Configuration&quot;</span> <span class="re0">Header</span>=<span class="st0">&quot;Accept-Encoding: gzip,deflate; Via: myProxy&quot;</span> <span class="re0">Referer</span>=<span class="st0">&quot;http://myReferer&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;UserAgent</span> <span class="re0">Name</span>=<span class="st0">&quot;Crawler&quot;</span> <span class="re0">Version</span>=<span class="st0">&quot;1.0&quot;</span> <span class="re0">Description</span>=<span class="st0">&quot;teddy crawler&quot;</span> <span class="re0">Url</span>=<span class="st0">&quot;http://www.teddy.com&quot;</span> <span class="re0">Email</span>=<span class="st0">&quot;crawler@teddy.com&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlingModel</span> <span class="re0">Type</span>=<span class="st0">&quot;MaxDepth&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;1000&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlScope</span> <span class="re0">Type</span>=<span class="st0">&quot;Domain&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;BeginningPath&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Select&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;/&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CrawlScope<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="coMULTI">&lt;!-- Warning: The amount of files returned is limited to 1000 --&gt;</span></span>
<span class="sc3"><span class="re1">&lt;SizeLimits</span> <span class="re0">MaxBytesDownload</span>=<span class="st0">&quot;0&quot;</span> <span class="re0">MaxDocumentDownload</span>=<span class="st0">&quot;1000&quot;</span> <span class="re0">MaxTimeSec</span>=<span class="st0">&quot;3600&quot;</span> <span class="re0">MaxLengthBytes</span>=<span class="st0">&quot;100000&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;TimeoutLimits</span> <span class="re0">Timeout</span>=<span class="st0">&quot;10000&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;WaitLimits</span> <span class="re0">Wait</span>=<span class="st0">&quot;0&quot;</span> <span class="re0">RandomWait</span>=<span class="st0">&quot;false&quot;</span> <span class="re0">MaxRetries</span>=<span class="st0">&quot;8&quot;</span> <span class="re0">WaitRetry</span>=<span class="st0">&quot;0&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seeds</span> <span class="re0">FollowLinks</span>=<span class="st0">&quot;Follow&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span>http://en.wikipedia.org/<span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;RegExp&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;.*action=edit.*&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/WebSite<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Process<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/DataSourceConnectionConfig<span class="re2">&gt;</span></span></span></pre></div>
<a name="Minimal_configuration_example"></a><h3> <span class="mw-headline"> Minimal configuration example </span></h3>
<p>This example demonstrates the minimal configuration required for the crawler.
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;WebSite</span> <span class="re0">ProjectName</span>=<span class="st0">&quot;Minimal Configuration&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span>http://localhost/test/<span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/WebSite<span class="re2">&gt;</span></span></span></pre></div>
<a name="Html_form_login_example"></a><h3> <span class="mw-headline"> Html form login example </span></h3>
<p>This example demonstrates how to log in to an Invision Power Board powered forum. The number of downloaded pages is limited to 15. The robots.txt information is ignored. The crawler will advertise itself as Mozilla/5.0.
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;WebSite</span> <span class="re0">ProjectName</span>=<span class="st0">&quot;Login To Invision Powerboard Forum Example&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;UserAgent</span> <span class="re0">Name</span>=<span class="st0">&quot;Mozilla&quot;</span> <span class="re0">Version</span>=<span class="st0">&quot;5.0&quot;</span> <span class="re0">Description</span>=<span class="st0">&quot;&quot;</span> <span class="re0">Url</span>=<span class="st0">&quot;&quot;</span> <span class="re0">Email</span>=<span class="st0">&quot;&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Robotstxt</span> <span class="re0">Policy</span>=<span class="st0">&quot;Ignore&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;SizeLimits</span> <span class="re0">MaxDocumentDownload</span>=<span class="st0">&quot;15&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Authentication<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;HtmlForm</span> <span class="re0">CredentialDomain</span>=<span class="st0">&quot;http://forum.example.com/index.php?act=Login&amp;amp;CODE=00&quot;</span> <span class="re0">LoginUri</span>=<span class="st0">&quot;http://forum.example.com/index.php?act=Login&amp;amp;CODE=01&quot;</span> <span class="re0">HttpMethod</span>=<span class="st0">&quot;POST&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;referer&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;CookieDate&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;1&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;Privacy&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;1&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;UserName&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;User&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;PassWord&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;Password&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;submit&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;Enter&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/FormElements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/HtmlForm<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Authentication<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seeds</span> <span class="re0">FollowLinks</span>=<span class="st0">&quot;Follow&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span><span class="sc2">&lt;![CDATA[http://forum.example.com/index.php?act=Login&amp;CODE=00]]&gt;</span><span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/WebSite<span class="re2">&gt;</span></span></span></pre></div>
<a name="Multiple_website_configuration"></a><h3> <span class="mw-headline"> Multiple website configuration </span></h3>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;WebSite</span> <span class="re0">ProjectName</span>=<span class="st0">&quot;First WebSite&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;UserAgent</span> <span class="re0">Name</span>=<span class="st0">&quot;Brox Crawler&quot;</span> <span class="re0">Version</span>=<span class="st0">&quot;1.0&quot;</span> <span class="re0">Description</span>=<span class="st0">&quot;Brox Crawler&quot;</span> <span class="re0">Url</span>=<span class="st0">&quot;http://www.example.com&quot;</span> <span class="re0">Email</span>=<span class="st0">&quot;crawler@example.com&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlingModel</span> <span class="re0">Type</span>=<span class="st0">&quot;MaxIterations&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;20&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlScope</span> <span class="re0">Type</span>=<span class="st0">&quot;Broad&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;SizeLimits</span> <span class="re0">MaxBytesDownload</span>=<span class="st0">&quot;0&quot;</span> <span class="re0">MaxDocumentDownload</span>=<span class="st0">&quot;100&quot;</span> <span class="re0">MaxTimeSec</span>=<span class="st0">&quot;3600&quot;</span> <span class="re0">MaxLengthBytes</span>=<span class="st0">&quot;1000000&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;TimeoutLimits</span> <span class="re0">Timeout</span>=<span class="st0">&quot;10000&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;WaitLimits</span> <span class="re0">Wait</span>=<span class="st0">&quot;0&quot;</span> <span class="re0">RandomWait</span>=<span class="st0">&quot;false&quot;</span> <span class="re0">MaxRetries</span>=<span class="st0">&quot;8&quot;</span> <span class="re0">WaitRetry</span>=<span class="st0">&quot;0&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seeds</span> <span class="re0">FollowLinks</span>=<span class="st0">&quot;Follow&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span>http://localhost/<span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span>http://localhost/otherseed<span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Authentication<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Rfc2617</span> <span class="re0">Host</span>=<span class="st0">&quot;localhost&quot;</span> <span class="re0">Port</span>=<span class="st0">&quot;80&quot;</span> <span class="re0">Realm</span>=<span class="st0">&quot;Restricted area&quot;</span> <span class="re0">Login</span>=<span class="st0">&quot;user&quot;</span> <span class="re0">Password</span>=<span class="st0">&quot;pass&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;HtmlForm</span> <span class="re0">CredentialDomain</span>=<span class="st0">&quot;http://localhost:8081/admin/&quot;</span> <span class="re0">LoginUri</span>=<span class="st0">&quot;/j_security_check&quot;</span> <span class="re0">HttpMethod</span>=<span class="st0">&quot;GET&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;j_username&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;admin&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;j_password&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FormElement</span> <span class="re0">Key</span>=<span class="st0">&quot;submit&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;Login&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/FormElements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/HtmlForm<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Authentication<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/WebSite<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;WebSite</span> <span class="re0">ProjectName</span>=<span class="st0">&quot;Second WebSite&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;UserAgent</span> <span class="re0">Name</span>=<span class="st0">&quot;Mozilla&quot;</span> <span class="re0">Version</span>=<span class="st0">&quot;5.0&quot;</span> <span class="re0">Description</span>=<span class="st0">&quot;X11; U; Linux x86_64; en-US; rv:1.8.1.4&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Robotstxt</span> <span class="re0">Policy</span>=<span class="st0">&quot;Classic&quot;</span> <span class="re0">AgentNames</span>=<span class="st0">&quot;mozilla, googlebot&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlingModel</span> <span class="re0">Type</span>=<span class="st0">&quot;MaxDepth&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;100&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlScope</span> <span class="re0">Type</span>=<span class="st0">&quot;Host&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;WaitLimits</span> <span class="re0">Wait</span>=<span class="st0">&quot;5&quot;</span> <span class="re0">RandomWait</span>=<span class="st0">&quot;true&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seeds</span> <span class="re0">FollowLinks</span>=<span class="st0">&quot;NoFollow&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span>http://example.com<span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;BeginningPath&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;/something/&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Refinements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;TimeOfDay</span> <span class="re0">From</span>=<span class="st0">&quot;09:00:00&quot;</span> <span class="re0">To</span>=<span class="st0">&quot;23:00:00&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Port</span> <span class="re0">Number</span>=<span class="st0">&quot;80&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Refinements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Filter<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;RegExp&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;news&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;ContentType&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;image/jpeg&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/WebSite<span class="re2">&gt;</span></span></span></pre></div>
<a name="Complex_website_configuration_example"></a><h3> <span class="mw-headline"> Complex website configuration example </span></h3>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;WebSite</span> <span class="re0">ProjectName</span>=<span class="st0">&quot;Example Crawler Configuration&quot;</span> <span class="re0">Header</span>=<span class="st0">&quot;Accept-Encoding: gzip,deflate; Via: myProxy&quot;</span> <span class="re0">Referer</span>=<span class="st0">&quot;http://myReferer&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;UserAgent</span> <span class="re0">Name</span>=<span class="st0">&quot;Crawler&quot;</span> <span class="re0">Version</span>=<span class="st0">&quot;1.0&quot;</span> <span class="re0">Description</span>=<span class="st0">&quot;Test crawler&quot;</span> <span class="re0">Url</span>=<span class="st0">&quot;http://www.example.com&quot;</span> <span class="re0">Email</span>=<span class="st0">&quot;crawler@example.com&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Robotstxt</span> <span class="re0">Policy</span>=<span class="st0">&quot;Custom&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;/home/user/customRobotRules.txt&quot;</span> <span class="re0">AgentNames</span>=<span class="st0">&quot;agent1;agent2&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlingModel</span> <span class="re0">Type</span>=<span class="st0">&quot;MaxIterations&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;20&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CrawlScope</span> <span class="re0">Type</span>=<span class="st0">&quot;Broad&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;BeginningPath&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Select&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;/test.html&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CrawlScope<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;SizeLimits</span> <span class="re0">MaxBytesDownload</span>=<span class="st0">&quot;0&quot;</span> <span class="re0">MaxDocumentDownload</span>=<span class="st0">&quot;1&quot;</span> <span class="re0">MaxTimeSec</span>=<span class="st0">&quot;3600&quot;</span> <span class="re0">MaxLengthBytes</span>=<span class="st0">&quot;1000000&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;TimeoutLimits</span> <span class="re0">Timeout</span>=<span class="st0">&quot;10000&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;WaitLimits</span> <span class="re0">Wait</span>=<span class="st0">&quot;0&quot;</span> <span class="re0">RandomWait</span>=<span class="st0">&quot;false&quot;</span> <span class="re0">MaxRetries</span>=<span class="st0">&quot;8&quot;</span> <span class="re0">WaitRetry</span>=<span class="st0">&quot;0&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/CrawlLimits<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Proxy<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;ProxyServer</span> <span class="re0">Host</span>=<span class="st0">&quot;example.com&quot;</span> <span class="re0">Port</span>=<span class="st0">&quot;3128&quot;</span> <span class="re0">Login</span>=<span class="st0">&quot;user&quot;</span> <span class="re0">Password</span>=<span class="st0">&quot;pass&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Proxy<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Authentication<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Rfc2617</span> <span class="re0">Host</span>=<span class="st0">&quot;somehost.com&quot;</span> <span class="re0">Port</span>=<span class="st0">&quot;80&quot;</span> <span class="re0">Realm</span>=<span class="st0">&quot;realm string&quot;</span> <span class="re0">Login</span>=<span class="st0">&quot;user&quot;</span> <span class="re0">Password</span>=<span class="st0">&quot;pass&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Authentication<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seeds</span> <span class="re0">FollowLinks</span>=<span class="st0">&quot;NoFollow&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Seed<span class="re2">&gt;</span></span></span>http://example.com<span class="sc3"><span class="re1">&lt;/Seed<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seeds<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;BeginningPath&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;/something/&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Refinements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;TimeOfDay</span> <span class="re0">From</span>=<span class="st0">&quot;09:00:00&quot;</span> <span class="re0">To</span>=<span class="st0">&quot;23:00:00&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Port</span> <span class="re0">Number</span>=<span class="st0">&quot;80&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Refinements<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Filter<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;RegExp&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;news&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Type</span>=<span class="st0">&quot;ContentType&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span> <span class="re0">Value</span>=<span class="st0">&quot;image/jpeg&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;MetaTagFilters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;MetaTagFilter</span> <span class="re0">Type</span>=<span class="st0">&quot;Name&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;author&quot;</span> <span class="re0">Content</span>=<span class="st0">&quot;Blocked Author&quot;</span> <span class="re0">WorkType</span>=<span class="st0">&quot;Unselect&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/MetaTagFilters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/WebSite<span class="re2">&gt;</span></span></span></pre></div>
<a name="Output_example_for_default_configuration"></a><h2> <span class="mw-headline"> Output example for default configuration </span></h2>
<p>If you crawl with the default configuration file, you’ll receive the following record:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;Record</span> <span class="re0">xmlns</span>=<span class="st0">&quot;http://www.eclipse.org/smila/record&quot;</span> <span class="re0">version</span>=<span class="st0">&quot;1.0&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;_recordid&quot;</span><span class="re2">&gt;</span></span>web:<span class="sc1">&amp;lt;</span>Url=http://en.wikipedia.org/wiki/Main_Page<span class="sc1">&amp;gt;</span><span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;Url&quot;</span><span class="re2">&gt;</span></span>http://en.wikipedia.org/wiki/Main_Page<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;Content&quot;</span><span class="re2">&gt;</span></span>
Whole content of wikipedia main page.
Too much to post here.
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;Title&quot;</span><span class="re2">&gt;</span></span>Wikipedia, the free encyclopedia<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;MetaData&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>base:null<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>noCache:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>noFollow:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>noIndex:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>refresh:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>refreshHref:null<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>
keywords:Main Page,1266,1815,1919,1935,1948 NCAA Men's
Division I Ice Hockey Tournament,1991,1993,2009,2009
Bangladesh Rifles revolt,Althea Byfield
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>generator:MediaWiki 1.15alpha<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>content-type:text/html; charset=utf-8<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>content-style-type:text/css<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;MimeType&quot;</span><span class="re2">&gt;</span></span>text/html<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;ResponseHeader&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Server:Apache<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Date:Thu, 26 Feb 2009 14:33:37 GMT<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;MetaDataWithResponseHeaderFallBack&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Age:2<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Content-Language:en<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Content-Length:57974<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Last-Modified:Thu, 26 Feb 2009 14:31:46 GMT<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>
X-Cache-Lookup:MISS from knsq25.knams.wikimedia.org:80
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Connection:Keep-Alive<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>X-Cache:MISS from knsq25.knams.wikimedia.org<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Server:Apache<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>X-Powered-By:PHP/5.2.4-2ubuntu5wm1<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>
Cache-Control:private, s-maxage=0, max-age=0,
must-revalidate
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Date:Thu, 26 Feb 2009 14:33:37 GMT<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Vary:Accept-Encoding,Cookie<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>
X-Vary-Options:Accept-Encoding;list-contains=gzip,Cookie;string-contains=enwikiToken;string-contains=enwikiLoggedOut;string-contains=enwiki_session;string-contains=centralauth_Token;string-contains=centralauth_Session;string-contains=centralauth_LoggedOut
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>
Via:1.1 sq39.wikimedia.org:3128 (squid/2.7.STABLE6), 1.0
knsq29.knams.wikimedia.org:3128 (squid/2.7.STABLE6), 1.0
knsq25.knams.wikimedia.org:80 (squid/2.7.STABLE6), 1.0
HAN-HB-FW-001
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Content-Type:text/html; charset=utf-8<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>Proxy-Connection:Keep-Alive<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>base:null<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>noCache:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>noFollow:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>noIndex:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>refresh:false<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>refreshHref:null<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>
keywords:Main Page,1266,1815,1919,1935,1948 NCAA Men's
Division I Ice Hockey Tournament,1991,1993,2009,2009
Bangladesh Rifles revolt,Althea Byfield
<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>generator:MediaWiki 1.15alpha<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>content-type:text/html; charset=utf-8<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>content-style-type:text/css<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;_HASH_TOKEN&quot;</span><span class="re2">&gt;</span></span>eb1eff85a3e3d4ad4ffd0dd9d4883e3d1f7f988019ca9bfa4a4df2e7659aa6<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attachment<span class="re2">&gt;</span></span></span>Content<span class="sc3"><span class="re1">&lt;/Attachment<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Record<span class="re2">&gt;</span></span></span></pre></div>
<a name="Additional_performance_counters"></a><h2> <span class="mw-headline"> Additional performance counters </span></h2>
<p>The Web Crawler adds some specific counters to the common counters:
</p>
<ul><li> bytes: number of bytes read from web server
</li><li> pages: number of web pages read
</li><li> averageHttpFetchTime: average time for fetching a page from the server.
</li><li> producerExceptions: number of webserver related errors
</li></ul>
<a name="See_also"></a><h2> <span class="mw-headline"> See also </span></h2>
<ul><li> <a href="Crawler.html" title="SMILA/Documentation/Crawler">Crawler</a>
</li><li> <a href="Filesystem_Crawler.html" title="SMILA/Documentation/Filesystem Crawler">Filesystem Crawler</a>
</li><li> <a href="JDBC_Crawler.html" title="SMILA/Documentation/JDBC Crawler">JDBC Crawler</a>
</li></ul>
<a name="External_links"></a><h2> <span class="mw-headline"> External links </span></h2>
<ul><li> <a href="http://www.robotstxt.org/robotstxt.html" class="external text" title="http://www.robotstxt.org/robotstxt.html" rel="nofollow">The Web Robots Pages - robots.txt reference</a>
</li><li> <a href="https://www.google.com/webmasters/tools/docs/en/protocol.html" class="external text" title="https://www.google.com/webmasters/tools/docs/en/protocol.html" rel="nofollow">Google Sitemap Protocol</a>
</li><li> <a href="http://en.wikipedia.org/wiki/Referer" class="external text" title="http://en.wikipedia.org/wiki/Referer" rel="nofollow">HTTP Referer Header</a>
</li><li> <a href="http://en.wikipedia.org/wiki/HTTP_cookie" class="external text" title="http://en.wikipedia.org/wiki/HTTP_cookie" rel="nofollow">HTTP Cookie Header</a>
</li></ul>
<p><br />
</p>
<!--
NewPP limit report
Preprocessor node count: 214/1000000
Post-expand include size: 1047/2097152 bytes
Template argument size: 515/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:15257-0!1!0!!en!2!edit=0 and timestamp 20130416061001 -->
<div class="printfooter">
Retrieved from "<a href="Web_Crawler.html">http://wiki.eclipse.org/SMILA/Documentation/Web_Crawler</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2013 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 09:38, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.attensity.com&amp;action=edit" class="new" title="User:Nadine.auslaender.attensity.com"> </a>, <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Bjoern.decker.attensity.com&amp;action=edit" class="new" title="User:Bjoern.decker.attensity.com">Björn Decker</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Web_Crawler&amp;action=credits" title="SMILA/Documentation/Web Crawler">others</a>.</p>
<p id="footerviews">This page has been accessed 6,713 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.070 secs. --></body></html>