| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| <meta name="keywords" content="SMILA/Documentation/CrawlerController,SMILA/Documentation/CompoundManagement,SMILA/Documentation/Crawler" /> |
| <link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" /> |
| <link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" /> |
| <link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&feed=rss" /> |
| <link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&feed=atom" /> |
| |
| |
| <title>SMILA/Documentation/CrawlerController - Eclipsepedia</title> |
| |
| <style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style> |
| <link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" /> |
| <link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" /> |
| <!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]--> |
| <!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]--> |
| <!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]--> |
| <!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]--> |
| <!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]--> |
| <!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script> |
| <meta http-equiv="imagetoolbar" content="no" /><![endif]--> |
| <script type= "text/javascript">/*<![CDATA[*/ |
| var skin = "eclipsenova"; |
| var stylepath = "/skins"; |
| var wgArticlePath = "/$1"; |
| var wgScriptPath = ""; |
| var wgScript = "/index.php"; |
| var wgServer = "http://wiki.eclipse.org"; |
| var wgCanonicalNamespace = ""; |
| var wgCanonicalSpecialPageName = false; |
| var wgNamespaceNumber = 0; |
| var wgPageName = "SMILA/Documentation/CrawlerController"; |
| var wgTitle = "SMILA/Documentation/CrawlerController"; |
| var wgAction = "view"; |
| var wgRestrictionEdit = []; |
| var wgRestrictionMove = []; |
| var wgArticleId = "18820"; |
| var wgIsArticle = true; |
| var wgUserName = null; |
| var wgUserGroups = null; |
| var wgUserLanguage = "en"; |
| var wgContentLanguage = "en"; |
| var wgBreakFrames = false; |
| var wgCurRevisionId = "285984"; |
| var wgVersion = "1.12.0"; |
| var wgEnableAPI = true; |
| var wgEnableWriteAPI = false; |
| /*]]>*/</script> |
| |
| <script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script> |
| |
| <!-- Performance mods similar to those for bug 166401 --> |
| <script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&action=raw&gen=js&useskin=eclipsenova"><!-- site js --></script> |
| |
| <!-- Head Scripts --> |
| <script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script> |
| <style type="text/css">/*<![CDATA[*/ |
| .source-javascript {line-height: normal; font-size: medium;} |
| .source-javascript li {line-height: normal;} |
| /** |
| * GeSHi Dynamically Generated Stylesheet |
| * -------------------------------------- |
| * Dynamically generated stylesheet for javascript |
| * CSS class: source-javascript, CSS id: |
| * GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter) |
| */ |
| .source-javascript .de1, .source-javascript .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;} |
| .source-javascript {} |
| .source-javascript .head {} |
| .source-javascript .foot {} |
| .source-javascript .imp {font-weight: bold; color: red;} |
| .source-javascript .ln-xtra {color: #cc0; background-color: #ffc;} |
| .source-javascript li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;} |
| .source-javascript li.li2 {font-weight: bold;} |
| .source-javascript .kw1 {color: #000066; font-weight: bold;} |
| .source-javascript .kw2 {color: #003366; font-weight: bold;} |
| .source-javascript .kw3 {color: #000066;} |
| .source-javascript .co1 {color: #009900; font-style: italic;} |
| .source-javascript .coMULTI {color: #009900; font-style: italic;} |
| .source-javascript .es0 {color: #000099; font-weight: bold;} |
| .source-javascript .br0 {color: #66cc66;} |
| .source-javascript .st0 {color: #3366CC;} |
| .source-javascript .nu0 {color: #CC0000;} |
| .source-javascript .me1 {color: #006600;} |
| .source-javascript .sc0 {} |
| .source-javascript .sc1 {} |
| .source-javascript .sc2 {} |
| .source-javascript .sc3 {} |
| .source-javascript .re0 {color: #0066FF;} |
| |
| /*]]>*/ |
| </style> |
| <style type="text/css">/*<![CDATA[*/ |
| @import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000"; |
| /*]]>*/ |
| </style><link rel="stylesheet" type="text/css" href="CrawlerController.html" /> </head> |
| <body class="mediawiki ns-0 ltr page-SMILA_Documentation_CrawlerController"> |
| <div id="globalWrapper"> |
| |
| |
| <div id="column-one"> |
| <!-- Eclipse Additions for the Top Nav start here M. Ward--> |
| |
| <div id="header"> |
| <div id="header-graphic"> |
| <img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" /> |
| </div> |
| <!-- Pulled 101409 Mward --> |
| |
| <div class="portlet" id="p-personal"> |
| <div class="pBody"> |
| <ul> |
| <li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&returnto=SMILA/Documentation/CrawlerController">Log in</a></li> |
| </ul> |
| </div> |
| </div> |
| |
| <div id="header-icons"> |
| <div id="sites"> |
| <ul id="sitesUL"> |
| <li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li> |
| <li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li> |
| <li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li> |
| <li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li> |
| <li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li> |
| <li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| <!-- NEW HEADER STUFF HERE --> |
| <div id="header-menu"> |
| <div id="header-nav"> |
| <ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li> |
| <li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li> |
| <li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li> |
| <li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li> |
| <li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li> |
| <li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li> |
| <li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li> |
| </ul> |
| </div> |
| <div id="header-utils"> |
| <!-- moved the search window here --> |
| <form action="http://wiki.eclipse.org/Special:Search" > |
| <!-- The text field carries id="searchInput" so that the in-page "Jump to: search" link (href ending in #searchInput) has a target. Both buttons are native submit controls, so no script is needed to submit the form. --> |
| <input class="input" name="search" id="searchInput" type="text" accesskey="f" value="" /> |
| <input type='submit' name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" /> |
| <input type='submit' name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" /> |
| </form> |
| </div> |
| </div> |
| |
| |
| <!-- Eclipse Additions for the Header stop here --> |
| <!-- Additions and mods for leftside nav Start here --> |
| |
| <!--Started nav rip here--> |
| <!-- these are the nav controls main page, changes etc --> |
| <div id="novaContent" class="faux"> |
| <div id="leftcol"> |
| <ul id="leftnav"> |
| <!-- these are the page controls, edit history etc --> |
| <li class="separator"><a class="separator">Navigation   </a></li> |
| <li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li> |
| <li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li> |
| <li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li> |
| <li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li> |
| <li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li> |
| <li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li> |
| <li class="separator"><a class="separator">Toolbox   </a></li> |
| |
| <li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/CrawlerController">What links here</a></li> |
| <li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/CrawlerController">Related changes</a></li> |
| <!-- This is the toolbox section --> |
| <li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li> |
| <li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li> |
| <li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&oldid=285984">Permanent link</a></li> </ul> |
| </div> |
| |
| |
| <!-- Additions and mods for leftside nav End here --> |
| |
| |
| <div id="column-content"> |
| <div id="content"> |
| <a name="top" id="top"></a> |
| |
| <div id="tabs"> |
| <ul class="primary"> |
| <li class="active"><a href="CrawlerController.html"><span class="tab">Page</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/CrawlerController&action=edit"><span class="tab">Discussion</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&action=edit"><span class="tab">View source</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&action=history"><span class="tab">History</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&returnto=SMILA/Documentation/CrawlerController"><span class="tab">Edit</span></a></li> |
| </ul> |
| </div> |
| |
| |
| <script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script> |
| <h1 class="firstHeading">SMILA/Documentation/CrawlerController</h1> |
| <div id="bodyContent"> |
| <h3 id="siteSub">From Eclipsepedia</h3> |
| <div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div> |
| <div id="jump-to-nav">Jump to: <a href="CrawlerController.html#column-one">navigation</a>, <a href="CrawlerController.html#searchInput">search</a></div> <!-- start content --> |
| <div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;"> |
| <div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Note.png" class="image" title="Note.png"><img alt="" src="http://wiki.eclipse.org/images/c/cc/Note.png" width="35" height="35" border="0" /></a></div> |
| <div><b>This is deprecated for SMILA 1.0. The connectivity framework is still functional but is intended to be replaced by scalable import based on SMILA's job management.</b><br /></div> |
| </div> |
| <p><br /> |
| </p> |
| <table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div> |
| <ul> |
| <li class="toclevel-1"><a href="CrawlerController.html#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a></li> |
| <li class="toclevel-1"><a href="CrawlerController.html#API"><span class="tocnumber">2</span> <span class="toctext">API</span></a></li> |
| <li class="toclevel-1"><a href="CrawlerController.html#Implementations"><span class="tocnumber">3</span> <span class="toctext">Implementations</span></a> |
| <ul> |
| <li class="toclevel-2"><a href="CrawlerController.html#org.eclipse.smila.connectivity.framework.impl"><span class="tocnumber">3.1</span> <span class="toctext">org.eclipse.smila.connectivity.framework.impl</span></a></li> |
| <li class="toclevel-2"><a href="CrawlerController.html#Configuration"><span class="tocnumber">3.2</span> <span class="toctext">Configuration</span></a></li> |
| <li class="toclevel-2"><a href="CrawlerController.html#JMX_interface"><span class="tocnumber">3.3</span> <span class="toctext">JMX interface</span></a></li> |
| <li class="toclevel-2"><a href="CrawlerController.html#HTTP_ReST_JSON_interface"><span class="tocnumber">3.4</span> <span class="toctext">HTTP ReST JSON interface</span></a> |
| <ul> |
| <li class="toclevel-3"><a href="CrawlerController.html#Crawler_Datasource_Listing"><span class="tocnumber">3.4.1</span> <span class="toctext">Crawler Datasource Listing</span></a></li> |
| <li class="toclevel-3"><a href="CrawlerController.html#Start_a_Crawler"><span class="tocnumber">3.4.2</span> <span class="toctext">Start a Crawler</span></a></li> |
| <li class="toclevel-3"><a href="CrawlerController.html#Get_Crawler_Statistics"><span class="tocnumber">3.4.3</span> <span class="toctext">Get Crawler Statistics</span></a></li> |
| <li class="toclevel-3"><a href="CrawlerController.html#Stop_a_Crawler"><span class="tocnumber">3.4.4</span> <span class="toctext">Stop a Crawler</span></a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script> |
| <a name="Overview"></a><h2> <span class="mw-headline"> Overview </span></h2> |
| <p>The CrawlerController is a component that manages and monitors <a href="Crawler.html" title="SMILA/Documentation/Crawler">Crawlers</a>. Whenever a new crawl is triggered (via <tt>startCrawl()</tt>) a new instance of the used Crawler is created and the crawler object hash value is used as an id (called <i>import run id</i>) to identify records created by this crawler instance. This import run id is set as an attribute <i>_importRunId</i> on all records and is also visible on the crawler instance in the JMX console. |
| </p> |
| <a name="API"></a><h2> <span class="mw-headline"> API </span></h2> |
| <p>Current javadoc: |
| </p> |
| <ul><li> <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerController.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerController.html" rel="nofollow">org.eclipse.smila.connectivity.framework.CrawlerController</a> |
| </li><li> <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/util/CrawlerControllerCallback.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/util/CrawlerControllerCallback.html" rel="nofollow">org.eclipse.smila.connectivity.framework.util.CrawlerControllerCallback</a> |
| </li></ul> |
| <a name="Implementations"></a><h2> <span class="mw-headline"> Implementations </span></h2> |
| <p>It is possible to provide different implementations for the CrawlerController interface. At the moment there is one implementation available. |
| </p> |
| <a name="org.eclipse.smila.connectivity.framework.impl"></a><h3> <span class="mw-headline">org.eclipse.smila.connectivity.framework.impl</span></h3> |
| <p>This bundle contains the default implementation of the CrawlerController interface. |
| </p><p>The CrawlerController implements the general processing logic common for all types of Crawlers. Its interface is a pure management interface that can be accessed by its Java interface or its wrapping JMX interface. It has references to the following OSGi services: |
| </p> |
| <ul><li> Crawler ComponentFactory |
| </li><li> ConnectivityManager |
| </li><li> DeltaIndexingManager (optional) |
| </li><li> CompoundManager |
| </li><li> ConfigurationManagement (t.b.d.) |
| </li></ul> |
| <p>Crawler Factories register themselves at the CrawlerController. Each time a crawl for a certain type of crawler is initiated, a new instance of that Crawler type is created via the Crawler ComponentFactory. This allows parallel crawling of datasources with the same type (e.g. several websites). Note that it is not possible to crawl the same data source concurrently! |
| </p><p><br /> |
| This chart shows the current CrawlerController processing logic for one crawl run: |
| <a href="http://wiki.eclipse.org/Image:CrawlerControllerProcessingLogic.png" class="image" title="Image:CrawlerControllerProcessingLogic.png"><img alt="Image:CrawlerControllerProcessingLogic.png" src="http://wiki.eclipse.org/images/6/67/CrawlerControllerProcessingLogic.png" width="960" height="720" border="0" /></a> |
| </p> |
| <ul><li> First the CrawlerController initializes DeltaIndexing for the current data source by calling <tt>DeltaIndexingManager::init(String)</tt> and also initializes a new Crawler (not shown) |
| </li><li> it then executes the subprocess <b>process crawler</b> with the initialized Crawler |
| </li><li> if no error occurred so far it performs the subprocess <b>delete delta</b> |
| </li><li> finally it finishes the run by calling <tt>DeltaIndexingManager::finish(String)</tt> |
| </li></ul> |
| <p><br /> |
| </p> |
| <dl><dt>Process Crawler |
| </dt></dl> |
| <ul><li> the CrawlerController checks if the given Crawler has more data available |
| </li><li> YES: the CrawlerController checks each received DataReference sent by the Crawler to see if it needs to be updated by calling <tt>DeltaIndexingManager::checkForUpdate(...)</tt> |
| <ul><li> YES: the CrawlerController requests the complete record from the Crawler and checks if the record is a compound |
| <ul><li> YES: the subprocess <b>process compounds</b> is executed. |
| </li><li> NO: no special actions are taken |
| </li></ul> |
| </li><li>the record is added to the Queue by calling <tt>ConnectivityManager::add(...)</tt> and is marked as visited in the DeltaIndexingManager by calling <tt>DeltaIndexingManager::visit(...)</tt> |
| </li><li> NO: the DataReference is skipped. DeltaIndexingManager internally already set the visited flag for this Id |
| </li></ul> |
| </li><li> NO: return to the calling process |
| </li></ul> |
| <p><br /> |
| </p> |
| <dl><dt>Process Compounds |
| </dt></dl> |
| <p>Please see <a href="CompoundManagement.html" title="SMILA/Documentation/CompoundManagement">CompoundManagement</a> for details on compound handling. |
| </p> |
| <ul><li> by calling <tt>CompoundManager::extract(Record, DataSourceConnectionConfig)</tt> the subprocess receives a CompoundCrawler that iterates over the elements of the compound record |
| </li><li> the subprocess recursively calls subprocess <b>process crawler</b> using the CompoundCrawler |
| </li><li> the compound record is adapted according to the configuration (set to null, modified, left unmodified) by calling <tt>CompoundManager::adaptCompoundRecord(Record, DataSourceConnectionConfig)</tt> |
| </li><li> return to the calling process |
| </li></ul> |
| <p><br /> |
| </p> |
| <dl><dt>Delete Delta |
| </dt></dl> |
| <ul><li> by calling <tt>DeltaIndexingManager::obsoleteIdIterator(...)</tt> the subprocess receives an Iterator over all Ids that have to be deleted |
| </li><li> for each Id <tt>ConnectivityManager::delete(...)</tt> is called |
| </li><li> return to the calling process |
| </li></ul> |
| <p><br /> |
| </p> |
| <dl><dt>Note</dt><dd> The exact logic depends on the settings of <tt>DeltaIndexing</tt> in the data source configuration. Depending on the configured value, delta indexing logic is executed fully, partially or not at all. |
| </dd></dl> |
| <a name="Configuration"></a><h3> <span class="mw-headline"> Configuration </span></h3> |
| <p>There are no configuration options available for this bundle. |
| </p> |
| <a name="JMX_interface"></a><h3> <span class="mw-headline"> JMX interface </span></h3> |
| <p>Javadoc: <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerControllerAgent.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/connectivity/framework/CrawlerControllerAgent.html" rel="nofollow">org.eclipse.smila.connectivity.framework.CrawlerControllerAgent</a> |
| </p><p>Here is a screenshot of the CrawlerController in the JMX Console: |
| </p><p><a href="http://wiki.eclipse.org/Image:CrawlerControllerJMX.png" class="image" title="Image:CrawlerControllerJMX.png"><img alt="Image:CrawlerControllerJMX.png" src="http://wiki.eclipse.org/images/7/7b/CrawlerControllerJMX.png" width="713" height="443" border="0" /></a> |
| </p> |
| <a name="HTTP_ReST_JSON_interface"></a><h3> <span class="mw-headline"> HTTP ReST JSON interface </span></h3> |
| <p>Since version 0.9 the CrawlerController can also be controlled via the SMILA ReST API. It provides the following endpoints: |
| </p> |
| <table border="1px solid #CCC" cellpadding="5px" style="border:1px solid #CCC; border-collapse:collapse; font:small/1.5 Tahoma, Bitstream Vera Sans, Verdana, Helvetica, sans-serif;"> |
| <tr> |
| <th> endpoint </th><th> method </th><th> description |
| </th></tr> |
| <tr> |
| <td> /smila/crawlers </td><td> GET </td><td> list data sources available for crawling and the current crawl state |
| </td></tr> |
| <tr> |
| <td> /smila/crawlers/&lt;datasource-id&gt; </td><td> GET </td><td> get statistics of current or last crawl run, if one exists. |
| </td></tr> |
| <tr> |
| <td> /smila/crawlers/&lt;datasource-id&gt; </td><td> POST + JSON-Body </td><td> start crawler |
| </td></tr> |
| <tr> |
| <td> /smila/crawlers/&lt;datasource-id&gt;/finish </td><td> POST </td><td> stop crawler |
| </td></tr> |
| </table> |
| <a name="Crawler_Datasource_Listing"></a><h4> <span class="mw-headline"> Crawler Datasource Listing </span></h4> |
| <p>The listing contains the available data sources that can be used for crawling and the current crawl state. State "Undefined" means that no crawl run for the datasource has yet been started. Other states can be |
| </p> |
| <ul><li> Running: A crawler is currently working on this datasource. |
| </li><li> Finished: The crawler has crawled the datasource completely. |
| </li><li> Stopped: The crawler was stopped by the user before it could finish crawling the datasource. |
| </li><li> Aborted: A fatal error occurred while crawling the datasource. |
| </li></ul> |
| <p>If the state has one of these four values, it is possible to read statistics for the datasource by using the given URL. Example: |
| </p> |
| <div dir="ltr" style="text-align: left;"><pre class="source-javascript">GET <span class="re0">/smila/crawlers/</span> |
| --> |
| <span class="nu0">200</span> OK |
| <span class="br0">{</span> |
| <span class="st0">"crawlers"</span>: <span class="br0">[</span> |
| <span class="br0">{</span> |
| <span class="st0">"name"</span>: <span class="st0">"web"</span>, |
| <span class="st0">"state"</span>: <span class="st0">"Undefined"</span>, |
| <span class="st0">"url"</span>: <span class="st0">"http://localhost:8080/smila/crawlers/web/"</span> |
| <span class="br0">}</span>, |
| <span class="br0">{</span> |
| <span class="st0">"name"</span>: <span class="st0">"file"</span>, |
| <span class="st0">"state"</span>: <span class="st0">"Finished"</span>, |
| <span class="st0">"url"</span>: <span class="st0">"http://localhost:8080/smila/crawlers/file/"</span> |
| <span class="br0">}</span>, |
| <span class="br0">{</span> |
| <span class="st0">"name"</span>: <span class="st0">"xmldump"</span>, |
| <span class="st0">"state"</span>: <span class="st0">"Undefined"</span>, |
| <span class="st0">"url"</span>: <span class="st0">"http://localhost:8080/smila/crawlers/xmldump/"</span> |
| <span class="br0">}</span> |
| <span class="br0">]</span> |
| <span class="br0">}</span></pre></div> |
| <a name="Start_a_Crawler"></a><h4> <span class="mw-headline"> Start a Crawler </span></h4> |
| <p>If a datasource is not in crawl state "Running" it can be started using the URL given in the datasource listing. The request must contain a JSON body describing the destination job to submit records to. In case of success the response contains the internal import run ID. |
| </p> |
| <div dir="ltr" style="text-align: left;"><pre class="source-javascript">POST <span class="re0">/smila/crawlers/file/</span> |
| <span class="br0">{</span> |
| <span class="st0">"jobName"</span>: <span class="st0">"indexUpdateJob"</span> |
| <span class="br0">}</span> |
| --> |
| <span class="nu0">200</span> OK |
| <span class="br0">{</span> |
| <span class="st0">"importRunId"</span>: <span class="nu0">1992135396</span> |
| <span class="br0">}</span></pre></div> |
| <p>Other response codes: |
| </p> |
| <ul><li> 400 Bad Request: datasource ID does not exist, destination job is not active, datasource is not a crawler source or a crawler is already running for the datasource. |
| </li><li> 500 Internal Server Error: Other errors. |
| </li></ul> |
| <a name="Get_Crawler_Statistics"></a><h4> <span class="mw-headline"> Get Crawler Statistics </span></h4> |
| <p>If a datasource has been crawled or is currently being crawled, you can read the performance counters using the datasource URL: |
| </p> |
| <div dir="ltr" style="text-align: left;"><pre class="source-javascript">GET <span class="re0">/smila/crawlers/file/</span> |
| --> |
| <span class="nu0">200</span> OK |
| <span class="br0">{</span> |
| <span class="st0">"jobName"</span>: <span class="st0">"job"</span>, |
| <span class="st0">"attachmentBytesTransfered"</span>: <span class="nu0">0</span>, |
| <span class="st0">"attachmentTransferRate"</span>: <span class="nu0">0</span>, |
| <span class="st0">"averageAttachmentTransferRate"</span>: <span class="nu0">0</span>, |
| <span class="st0">"averageDeltaIndicesProcessingTime"</span>: <span class="nu0">0</span>, |
| <span class="st0">"averageRecordsProcessingTime"</span>: <span class="nu0">0</span>, |
| <span class="st0">"deltaIndices"</span>: <span class="nu0">569</span>, |
| <span class="st0">"endDate"</span>: <span class="st0">"2011-09-06"</span>, |
| <span class="st0">"errorBuffer"</span>: <span class="st0">"[]"</span>, |
| <span class="st0">"exceptions"</span>: <span class="nu0">0</span>, |
| <span class="st0">"exceptionsCritical"</span>: <span class="nu0">0</span>, |
| <span class="st0">"importRunId"</span>: <span class="st0">"786625416"</span>, |
| <span class="st0">"overallAverageDeltaIndicesProcessingTime"</span>: <span class="nu0">10.06854130052724</span>, |
| <span class="st0">"overallAverageRecordsProcessingTime"</span>: <span class="st0">"Infinity"</span>, |
| <span class="st0">"records"</span>: <span class="nu0">0</span>, |
| <span class="st0">"startDate"</span>: <span class="st0">"2011-09-06"</span>, |
| <span class="st0">"files"</span>: <span class="nu0">0</span>, |
| <span class="st0">"folders"</span>: <span class="nu0">0</span>, |
| <span class="st0">"producerExceptions"</span>: <span class="nu0">0</span>, |
| <span class="st0">"dataSourceId"</span>: <span class="st0">"file"</span>, |
| <span class="st0">"state"</span>: <span class="st0">"Finished"</span> |
| <span class="br0">}</span></pre></div> |
| <p>Other responses are |
| </p> |
| <ul><li>400 Bad Request: Invalid datasource ID |
| </li><li>404 Not Found: No statistics available for given datasource |
| </li><li>500 Internal Server Error: Other error. |
| </li></ul> |
| <a name="Stop_a_Crawler"></a><h4> <span class="mw-headline"> Stop a Crawler </span></h4> |
| <p>To stop a running crawler, use the following HTTP request. The response will be empty, just the response code will be "OK". |
| </p> |
| <div dir="ltr" style="text-align: left;"><pre class="source-javascript">POST <span class="re0">/smila/crawlers/file/finish/</span> |
| --> |
| <span class="nu0">200</span> OK</pre></div> |
| <p>Other responses are: |
| </p> |
| <ul><li> 400 Bad Request: No crawler is running for this datasource. |
| </li><li> 500 Internal Server Error: Other errors. |
| </li></ul> |
| <p>, |
| </p> |
| <!-- |
| NewPP limit report |
| Preprocessor node count: 77/1000000 |
| Post-expand include size: 1221/2097152 bytes |
| Template argument size: 515/2097152 bytes |
| #ifexist count: 0/100 |
| --> |
| |
| <!-- Saved in parser cache with key wikidb:pcache:idhash:18820-0!1!0!!en!2!edit=0 and timestamp 20130416060949 --> |
| <div class="printfooter"> |
| Retrieved from "<a href="CrawlerController.html">http://wiki.eclipse.org/SMILA/Documentation/CrawlerController</a>"</div> |
| <!-- end content --> |
| <div class="visualClear"></div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| |
| <!-- Yoink of toolbox for phoenix moved up --> |
| |
| |
| </div> |
| </div> |
| <div id="clearFooter"></div> |
| <div id="footer" > |
| <ul id="footernav"> |
| <li class="first"><a href="http://www.eclipse.org/">Home</a></li> |
| <li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li> |
| <li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li> |
| <li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li> |
| <li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li> |
| <li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li> |
| </ul> |
| <span id="copyright">Copyright © 2013 The Eclipse Foundation. All Rights Reserved</span> |
| <p id="footercredit">This page was last modified 09:37, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.attensity.com&action=edit" class="new" title="User:Nadine.auslaender.attensity.com"> </a>, <a href="http://wiki.eclipse.org/User:Andreas.Weber.empolis.com" title="User:Andreas.Weber.empolis.com">Andreas Weber</a> and <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CrawlerController&action=credits" title="SMILA/Documentation/CrawlerController">others</a>.</p> |
| <p id="footerviews">This page has been accessed 3,685 times.</p> |
| </div> |
| |
| <script type="text/javascript"> |
| var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www."); |
| document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E")); |
| </script> |
| <script type="text/javascript"> |
| var pageTracker = _gat._getTracker("UA-910670-4"); |
| pageTracker._trackPageview(); |
| </script> |
| |
| |
| |
| |
| |
| |
| |
| <!-- <div class="visualClear"></div> --> |
| |
| <script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script> |
| </div> |
| |
| <!-- Served in 0.059 secs. --></body></html> |