<!-- blob: 667361c6da3f655c96c8158909923941cb980cab [file] [log] [blame] -->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/CrawlerController,SMILA/Documentation/CompoundManagement" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/CrawlerController - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/CrawlerController";
var wgTitle = "SMILA/Documentation/CrawlerController";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "18820";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "247447";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<link rel="stylesheet" type="text/css" href="CrawlerController.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_CrawlerController">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/CrawlerController">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" />
<input type='submit' name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/CrawlerController">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/CrawlerController">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&amp;oldid=247447">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="CrawlerController.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/CrawlerController&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/CrawlerController"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/CrawlerController</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="CrawlerController.html#column-one">navigation</a>, <a href="CrawlerController.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="CrawlerController.html#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a></li>
<li class="toclevel-1"><a href="CrawlerController.html#API"><span class="tocnumber">2</span> <span class="toctext">API</span></a></li>
<li class="toclevel-1"><a href="CrawlerController.html#Implementations"><span class="tocnumber">3</span> <span class="toctext">Implementations</span></a>
<ul>
<li class="toclevel-2"><a href="CrawlerController.html#org.eclipse.smila.connectivity.framework.impl"><span class="tocnumber">3.1</span> <span class="toctext">org.eclipse.smila.connectivity.framework.impl</span></a></li>
<li class="toclevel-2"><a href="CrawlerController.html#Configuration"><span class="tocnumber">3.2</span> <span class="toctext">Configuration</span></a></li>
<li class="toclevel-2"><a href="CrawlerController.html#JMX_interface"><span class="tocnumber">3.3</span> <span class="toctext">JMX interface</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Overview"></a><h2> <span class="mw-headline"> Overview </span></h2>
<p>The CrawlerController is a component that manages and monitors Crawlers. Whenever a new crawl is triggered (via <tt>startCrawl()</tt>) a new instance of the used Crawler is created and the crawler object hash value is used as an id (called jobId) to identify records created by this crawler instance. This jobId is set as an annotation on all records and is also visible on the crawler instance in the JMX console.
</p>
<a name="API"></a><h2> <span class="mw-headline"> API </span></h2>
<p>Current javadoc:
</p>
<ul><li> <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerController.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerController.html" rel="nofollow">org.eclipse.smila.connectivity.framework.CrawlerController</a>
</li><li> <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/util/CrawlerControllerCallback.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/util/CrawlerControllerCallback.html" rel="nofollow">org.eclipse.smila.connectivity.framework.util.CrawlerControllerCallback</a>
</li></ul>
<a name="Implementations"></a><h2> <span class="mw-headline"> Implementations </span></h2>
<p>It is possible to provide different implementations for the CrawlerController interface. At the moment there is one implementation available.
</p>
<a name="org.eclipse.smila.connectivity.framework.impl"></a><h3> <span class="mw-headline">org.eclipse.smila.connectivity.framework.impl</span></h3>
<p>This bundle contains the default implementation of the CrawlerController interface.
</p><p>The CrawlerController implements the general processing logic common for all types of Crawlers. Its interface is a pure management interface that can be accessed by its Java interface or its wrapping JMX interface. It has references to the following OSGi services:
</p>
<ul><li> Crawler ComponentFactory
</li><li> ConnectivityManager
</li><li> DeltaIndexingManager (optional)
</li><li> CompoundManager
</li><li> ConfigurationManagement (t.b.d.)
</li></ul>
<p>Crawler Factories register themselves at the CrawlerController. Each time a crawl for a certain type of crawler is initiated, a new instance of that Crawler type is created via the Crawler ComponentFactory. This allows parallel crawling of datasources with the same type (e.g. several websites). Note that it is not possible to crawl the same data source concurrently!
</p><p><br />
This chart shows the current CrawlerController processing logic for one crawl run:
<a href="http://wiki.eclipse.org/Image:CrawlerControllerProcessingLogic.png" class="image" title="Image:CrawlerControllerProcessingLogic.png"><img alt="Image:CrawlerControllerProcessingLogic.png" src="http://wiki.eclipse.org/images/6/67/CrawlerControllerProcessingLogic.png" width="960" height="720" border="0" /></a>
</p>
<ul><li> First the CrawlerController initializes DeltaIndexing for the current data source by calling <tt>DeltaIndexingManager::init(String)</tt> and also initializes a new Crawler (not shown)
</li><li> it then executes subprocess <b>process crawler</b> with the initialized Crawler
</li><li> if no error occurred so far it performs the subprocess <b>delete delta</b>
</li><li> finally it finishes the run by calling <tt>DeltaIndexingManager::finish(String)</tt>
</li></ul>
<p><br />
</p>
<dl><dt>Process Crawler
</dt></dl>
<ul><li> the CrawlerController checks if the given Crawler has more data available
</li><li> YES: the CrawlerController checks each received DataReference sent by the Crawler if it needs to be updated by calling <tt>DeltaIndexingManager::checkForUpdate(...)</tt>
<ul><li> YES: the CrawlerController requests the complete record from the Crawler and checks if the record is a compound
<ul><li> YES: the subprocess <b>process compounds</b> is executed.
</li><li> NO: no special actions are taken
</li></ul>
</li><li>the record is added to the Queue by calling <tt>ConnectivityManager::add(...)</tt> and is marked as visited in the DeltaIndexingManager by calling <tt>DeltaIndexingManager::visit(...)</tt>
</li><li> NO: the DataReference is skipped. DeltaIndexingManager internally already set the visited flag for this Id
</li></ul>
</li><li> NO: return to the calling process
</li></ul>
<p><br />
</p>
<dl><dt>Process Compounds
</dt></dl>
<p>Please see <a href="CompoundManagement.html" title="SMILA/Documentation/CompoundManagement">CompoundManagement</a> for details on compound handling.
</p>
<ul><li> by calling <tt>CompoundManager::extract(Record, DataSourceConnectionConfig)</tt> the subprocess receives a CompoundCrawler that iterates over the elements of the compound record
</li><li> the subprocess recursively calls subprocess <b>process crawler</b> using the CompoundCrawler
</li><li> the compound record is adapted according to the configuration (set to null, modified, left unmodified) by calling <tt>CompoundManager::adaptCompoundRecord(Record, DataSourceConnectionConfig)</tt>
</li><li> return to the calling process
</li></ul>
<p><br />
</p>
<dl><dt>Delete Delta
</dt></dl>
<ul><li> by calling <tt>DeltaIndexingManager::obsoleteIdIterator(...)</tt> the subprocess receives an Iterator over all Ids that have to be deleted
</li><li> for each Id <tt>ConnectivityManager::delete(...)</tt> is called
</li><li> return to the calling process
</li></ul>
<p><br />
</p>
<dl><dt>Note</dt><dd> The exact logic depends on the settings of <tt>DeltaIndexing</tt> in the data source configuration. Depending on the configured value, delta indexing logic is executed fully, partially or not at all.
</dd></dl>
<a name="Configuration"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<p>There are no configuration options available for this bundle.
</p>
<a name="JMX_interface"></a><h3> <span class="mw-headline"> JMX interface </span></h3>
<p>Javadoc: <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerControllerAgent.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerControllerAgent.html" rel="nofollow">org.eclipse.smila.connectivity.framework.CrawlerControllerAgent</a>
</p><p>Here is a screenshot of the CrawlerController in the JMX Console:
</p><p><a href="http://wiki.eclipse.org/Image:CrawlerControllerJMX.png" class="image" title="Image:CrawlerControllerJMX.png"><img alt="Image:CrawlerControllerJMX.png" src="http://wiki.eclipse.org/images/7/7b/CrawlerControllerJMX.png" width="584" height="450" border="0" /></a>
</p>
<!--
NewPP limit report
Preprocessor node count: 22/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:18820-0!1!0!!en!2!edit=0 and timestamp 20110617075004 -->
<div class="printfooter">
Retrieved from "<a href="CrawlerController.html">http://wiki.eclipse.org/SMILA/Documentation/CrawlerController</a>"</div>
<!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2011 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 07:56, 21 April 2011 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/User:Igor.novakovic.empolis.com" title="User:Igor.novakovic.empolis.com">Igor Novakovic</a>.</p>
<p id="footerviews">This page has been accessed 1,698 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.147 secs. --></body></html>