<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/Crawler,SMILA/Development Guidelines/How to implement a crawler,SMILA/Documentation/CompoundManagement,SMILA/Documentation/Filesystem Crawler,SMILA/Documentation/JDBC Crawler,SMILA/Documentation/Web Crawler" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/Crawler - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/Crawler";
var wgTitle = "SMILA/Documentation/Crawler";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "18589";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "285985";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_Crawler">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Crawler">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" id="searchInput" type="text" accesskey="f" value="" />
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Crawler">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Crawler">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Crawler&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Crawler&amp;oldid=285985">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="Crawler.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Crawler&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Crawler&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Crawler&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Crawler"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/Crawler</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="Crawler.html#column-one">navigation</a>, <a href="Crawler.html#searchInput">search</a></div> <!-- start content -->
<div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;">
<div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Note.png" class="image" title="Note.png"><img alt="" src="http://wiki.eclipse.org/images/c/cc/Note.png" width="35" height="35" border="0" /></a></div>
<div><b>This is deprecated for SMILA 1.0. The connectivity framework is still functional, but it is intended to be replaced by scalable import based on SMILA's job management.</b><br /></div>
</div>
<div class="thumb tright"><div class="thumbinner" style="width:182px;"><a href="http://wiki.eclipse.org/Image:CrawlerWorkflow.png" class="image" title="Crawler Workflow"><img alt="Crawler Workflow" src="http://wiki.eclipse.org/images/thumb/6/66/CrawlerWorkflow.png/180px-CrawlerWorkflow.png" width="180" height="99" border="0" class="thumbimage" /></a> <div class="thumbcaption"><div class="magnify"><a href="http://wiki.eclipse.org/Image:CrawlerWorkflow.png" class="internal" title="Enlarge"><img src="http://wiki.eclipse.org/skins/common/images/magnify-clip.png" width="15" height="11" alt="" /></a></div>Crawler Workflow</div></div></div>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="Crawler.html#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a></li>
<li class="toclevel-1"><a href="Crawler.html#API"><span class="tocnumber">2</span> <span class="toctext">API</span></a></li>
<li class="toclevel-1"><a href="Crawler.html#Architecture"><span class="tocnumber">3</span> <span class="toctext">Architecture</span></a></li>
<li class="toclevel-1"><a href="Crawler.html#Configuration"><span class="tocnumber">4</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-2"><a href="Crawler.html#Further_Information:"><span class="tocnumber">4.1</span> <span class="toctext">Further Information:</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="Crawler.html#Crawler_lifecycle"><span class="tocnumber">5</span> <span class="toctext">Crawler lifecycle</span></a></li>
<li class="toclevel-1"><a href="Crawler.html#See_also"><span class="tocnumber">6</span> <span class="toctext">See also</span></a></li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Overview"></a><h2> <span class="mw-headline"> Overview </span></h2>
<p>A crawler gathers information about resources, both content and metadata of interest like size or MIME type. SMILA currently comes with three types of crawlers, each adequate for a different datasource type, namely Web crawler, JDBC Database crawler, and File System crawler, to allow gathering information from the internet, databases, or files from a hard disk. Furthermore, the Connectivity Framework provides an API for developers which allows them to create their own crawlers.
</p>
<a name="API"></a><h2> <span class="mw-headline"> API </span></h2>
<p>A crawler has to implement two interfaces: <tt>Crawler</tt> and <tt>CrawlerCallback</tt>. The easiest way to achieve this is to extend the abstract base class <tt>AbstractCrawler</tt> located in bundle <tt>org.eclipse.smila.connectivity.framework</tt>. This class already contains handling for the crawler's Id and an OSGi service activate method. The crawler method <tt>getNext()</tt> is designed to support an array of DataReference objects, as this reduces the number of method calls. In general there are no restrictions on the size of the array; in fact, the size could vary on multiple method calls. This allows a crawler to internally implement a Producer/Consumer pattern. A <tt>Crawler</tt> implementation that is restricted to work as an iterator only can also enforce this by always returning an array of size one.
</p><p>Javadoc: <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/Crawler.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/Crawler.html" rel="nofollow">org.eclipse.smila.connectivity.framework.Crawler</a>
</p>
<a name="Architecture"></a><h2> <span class="mw-headline"> Architecture </span></h2>
<p>Crawlers are managed and instantiated by the CrawlerController. The CrawlerController communicates with the crawler via interface <tt>Crawler</tt>, only. The crawler's <tt>getNext()</tt> method returns <tt>DataReference</tt> objects to the CrawlerController. <tt>DataReference</tt> is also an interface implemented by class <tt>org.eclipse.smila.connectivity.framework.util.internal.DataReferenceImpl</tt>. A DataReference, as the name suggests, is only a reference to data provided by the crawler. This is mainly a performance issue, as due to the use of DeltaIndexing it may not be necessary to transfer all the data from the crawler to the CrawlerController and to ConnectivityManager. Therefore a DataReference contains only the minimum data needed to perform DeltaIndexing: an Id and a hash token. To access the whole object it provides method <tt>getRecord()</tt> that returns a complete Record object containing Id, attributes, annotations and attachments. To create the Record object, the DataReference communicates with the crawler via interface <tt>CrawlerCallback</tt>, as each DataReference has a reference to the crawler that created it.
</p><p>The following chart shows the crawler architecture and how data is shared with the CrawlerController:
<a href="http://wiki.eclipse.org/Image:Crawler_Architecture.png" class="image" title="Crawler Architecture"><img alt="Crawler Architecture" src="http://wiki.eclipse.org/images/3/3d/Crawler_Architecture.png" width="960" height="720" border="0" /></a>
</p><p>Package <tt>org.eclipse.smila.connectivity.framework.util</tt> provides some factory classes for crawlers to create Ids, hashes and DataReference objects. More utility classes are planned to be implemented, that allow easy realization of crawlers using an iterator or producer/consumer pattern.
</p>
<a name="Configuration"></a><h2> <span class="mw-headline"> Configuration </span></h2>
<p>A crawler is started with a specific, named configuration, that defines what information is to be crawled (e.g. content, kinds of metadata) and where to find that data (e.g. file system path, JDBC Connection String). See each crawler documentation for details on configuration options.
</p><p><br />
Each crawler can define its own configuration because crawlers need different information to execute specific crawl jobs. As an example, a JDBC crawler needs information about which database and which table should be crawled and which columns should be returned.
</p><p>Therefore the crawler developer defines a schema that contains all interesting information. This schema is based on a root schema that is supported by the SMILA framework. It declares the generic framework/frame which has to be used to send DataSourceConnectionConfigs (a crawl task) to the SMILA framework.
The root-schema can be found in:
configuration/org.eclipse.smila.connectivity.framework.schema/schemas/RootDataSourceConnectionConfigSchema.xsd.
</p><p>The root schema looks as follows:
</p><p><a href="http://wiki.eclipse.org/Image:RootdatasourceConnectionConfig.png" class="image" title="Image:RootdatasourceConnectionConfig.png"><img alt="Image:RootdatasourceConnectionConfig.png" src="http://wiki.eclipse.org/images/a/ab/RootdatasourceConnectionConfig.png" width="1065" height="514" border="0" /></a>
</p>
<dl><dt>DataSourceID
</dt><dd>A description string that is used in the whole framework to separate and address information that applies to the same crawl job
</dd></dl>
<dl><dt>SchemaID
</dt><dd>The SchemaID contains the whole bundle name of the crawler (e.g. File System crawler: org.eclipse.smila.connectivity.framework.crawler.filesystem).<br /> The SMILA Framework uses this information to gather the schema for the validation of the DataSourceConnectionConfig that should be executed.
</dd></dl>
<dl><dt>DataConnectionID
</dt><dd>This tag describes if an agent or crawler should be used. It contains either of the following tags:
<ul><li><b>Agent</b>
</li><li><b>Crawler</b>
</li></ul>
</dd><dd>The name that is used in these tags is the Service name of the agent/crawler.
</dd></dl>
<dl><dt>RecordBuffer
</dt><dd>Here you can specify settings to optimize record transfer to ConnectivityManager
<ul><li>Size - the number of records to be sent to ConnectivityManager in one block. Default is 1.
</li><li>FlushInterval - a time interval in milliseconds after which to send the current elements of the RecordBuffer to ConnectivityManager. Default is 1000.
</li></ul>
</dd></dl>
<dl><dt>DeltaIndexing</dt><dd>
</dd><dd>Configuration options for delta indexing that are to be interpreted by the CrawlerController. The following values are supported:
<ul><li><tt>full</tt> - delta indexing is fully activated. Records are checked if they need to be updated, entries for new/updated records are added to the deltaIndexingManager, delta-delete is executed if no error occurred
</li><li><tt>additive</tt> - as <tt>full</tt> but delta-delete is not executed
</li><li><tt>initial</tt> - For an initial import in an empty index or a new source in an existing index performance can be optimized by NOT checking if a record needs to be updated (we know that all records are new) but adding an entry in the DeltaIndexingManager for each Record. This allows later runs using <tt>full</tt> or <tt>additive</tt> to make use of DeltaIndexing information.
</li><li><tt>disabled</tt> - delta indexing is fully disabled. No checks are done, no entries are created/updated, no Delta-Delete is executed. Later runs cannot benefit from DeltaIndexing
</li></ul>
</dd></dl>
<dl><dt>CompoundHandling</dt><dd>
</dd><dd>Configuration options for CompoundHandling. See <a href="CompoundManagement.html#Configuration" title="SMILA/Documentation/CompoundManagement">CompoundManagement</a> for details.
</dd></dl>
<dl><dt>Attributes
</dt><dd>Placeholder for each crawler's attribute definition. <br />Each crawler can define here which attributes it can return. An attribute is a specific piece of information about an entry in the datasource that is crawled by the crawler (e.g. in a filesystem an entry is a file, and attributes of a file are Size, Content, etc.).
</dd></dl>
<dl><dt>Process
</dt><dd>Placeholder for tags that the crawler developer can define. <br /> This tag can carry all the information for a crawl task that is necessary to start a crawling process, for example the starting URLs/folders and which entries should be crawled (e.g. queries/wildcards/includes/excludes).
</dd></dl>
<p><br />
</p>
<a name="Further_Information:"></a><h3> <span class="mw-headline"> Further Information: </span></h3>
<ol><li> See each crawler's documentation for its attributes and process tags
</li><li> <a href="../Development_Guidelines/How_to_implement_a_crawler.html" class="mw-redirect" title="SMILA/Development Guidelines/How to implement a crawler">How to implement a crawler</a>
</li></ol>
<a name="Crawler_lifecycle"></a><h2> <span class="mw-headline"> Crawler lifecycle </span></h2>
<p>The CrawlerController manages the life cycle of the crawler (e.g. start, stop, abort) and may instantiate multiple crawlers concurrently, even of the same type. This is realised by using OSGi ComponentFactories. Each crawler does not automatically start an OSGi service, but registers only a crawler ComponentFactory with the CrawlerController. Via the ComponentFactory the CrawlerController can instantiate crawlers on demand.
</p><p>Here is a template for a crawler OSGi component definition:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;component</span> <span class="re0">name</span>=<span class="st0">&quot;%CRAWLER_TYPE%&quot;</span> <span class="re0">immediate</span>=<span class="st0">&quot;false&quot;</span> <span class="re0">factory</span>=<span class="st0">&quot;CrawlerFactory&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;implementation</span> <span class="re0">class</span>=<span class="st0">&quot;%CRAWLER_IMPLEMENTATION_CLASS%&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;service<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;provide</span> <span class="re0">interface</span>=<span class="st0">&quot;org.eclipse.smila.connectivity.framework.Crawler&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/service<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/component<span class="re2">&gt;</span></span></span></pre></div>
<a name="See_also"></a><h2> <span class="mw-headline"> See also </span></h2>
<p>More information about the different crawlers can be found here:
</p>
<ul><li> <a href="Filesystem_Crawler.html" title="SMILA/Documentation/Filesystem Crawler">File System crawler</a>
</li><li> <a href="Web_Crawler.html" title="SMILA/Documentation/Web Crawler">Web crawler</a>
</li><li> <a href="JDBC_Crawler.html" title="SMILA/Documentation/JDBC Crawler">JDBC crawler</a>
</li></ul>
<!--
NewPP limit report
Preprocessor node count: 73/1000000
Post-expand include size: 1045/2097152 bytes
Template argument size: 515/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:18589-0!1!0!!en!2!edit=0 and timestamp 20120202171431 -->
<div class="printfooter">
Retrieved from "<a href="Crawler.html">http://wiki.eclipse.org/SMILA/Documentation/Crawler</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 09:38, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.attensity.com&amp;action=edit" class="new" title="User:Nadine.auslaender.attensity.com">Nadine.auslaender.attensity.com</a>, <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/User:Igor.novakovic.empolis.com" title="User:Igor.novakovic.empolis.com">Igor Novakovic</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Crawler&amp;action=credits" title="SMILA/Documentation/Crawler">others</a>.</p>
<p id="footerviews">This page has been accessed 4,352 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.061 secs. --></body></html>