blob: 0f5749213a985bae3a649033eefd5efbbc267881 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework,SMILA/Development Guidelines/Create a bundle (plug-in),SMILA/Development Guidelines/Create a test bundle (plug-in),SMILA/Development Guidelines/How to write a Worker,SMILA/Development Guidelines/Howto set up dev environment,SMILA/Documentation/Importing/Concept,SMILA/Documentation/JobManager" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/HowTo/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework";
var wgTitle = "SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "35477";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "286557";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal; font-size: medium;}
.source-java li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-java .de1, .source-java .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-java {}
.source-java .head {}
.source-java .foot {}
.source-java .imp {font-weight: bold; color: red;}
.source-java .ln-xtra {color: #cc0; background-color: #ffc;}
.source-java li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-java li.li2 {font-weight: bold;}
.source-java .kw1 {color: #7F0055; font-weight: bold;}
.source-java .kw2 {color: #7F0055; font-weight: bold;}
.source-java .kw3 {color: #000000; font-weight: normal}
.source-java .kw4 {color: #7F0055; font-weight: bold;}
.source-java .co1 {color: #3F7F5F; font-style: italic;}
.source-java .co2 {color: #3F7F5F;}
.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.source-java .es0 {color: #000000;}
.source-java .br0 {color: #000000;}
.source-java .st0 {color: #2A00ff;}
.source-java .nu0 {color: #000000;}
.source-java .me1 {color: #000000;}
.source-java .me2 {color: #000000;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-javascript {line-height: normal; font-size: medium;}
.source-javascript li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for javascript
* CSS class: source-javascript, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-javascript .de1, .source-javascript .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-javascript {}
.source-javascript .head {}
.source-javascript .foot {}
.source-javascript .imp {font-weight: bold; color: red;}
.source-javascript .ln-xtra {color: #cc0; background-color: #ffc;}
.source-javascript li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-javascript li.li2 {font-weight: bold;}
.source-javascript .kw1 {color: #000066; font-weight: bold;}
.source-javascript .kw2 {color: #003366; font-weight: bold;}
.source-javascript .kw3 {color: #000066;}
.source-javascript .co1 {color: #009900; font-style: italic;}
.source-javascript .coMULTI {color: #009900; font-style: italic;}
.source-javascript .es0 {color: #000099; font-weight: bold;}
.source-javascript .br0 {color: #66cc66;}
.source-javascript .st0 {color: #3366CC;}
.source-javascript .nu0 {color: #CC0000;}
.source-javascript .me1 {color: #006600;}
.source-javascript .sc0 {}
.source-javascript .sc1 {}
.source-javascript .sc2 {}
.source-javascript .sc3 {}
.source-javascript .re0 {color: #0066FF;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-text {line-height: normal; font-size: medium;}
.source-text li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for text
* CSS class: source-text, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-text .de1, .source-text .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-text {}
.source-text .head {}
.source-text .foot {}
.source-text .imp {font-weight: bold; color: red;}
.source-text .ln-xtra {color: #cc0; background-color: #ffc;}
.source-text li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-text li.li2 {font-weight: bold;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="How_to_add_a_new_Data_Source_to_the_importing_framework.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_HowTo_How_to_add_a_new_Data_Source_to_the_importing_framework">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;oldid=286557">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/HowTo/How%20to%20add%20a%20new%20Data%20Source%20to%20the%20importing%20framework"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../../SMILA.html" title="SMILA">SMILA</a> | <a href="../../Documentation.1.html" title="SMILA/Documentation">Documentation</a> | <a href="../HowTo.html" title="SMILA/Documentation/HowTo">HowTo</a></span></div>
<div id="jump-to-nav">Jump to: <a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#column-one">navigation</a>, <a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#searchInput">search</a></div> <!-- start content -->
<p>This how-to manual shows how you can add a new data source (e.g. database, connectors, etc.) for the new SMILA importing framework (see <a href="../Importing/Concept.html" title="SMILA/Documentation/Importing/Concept">Importing Concept</a> for more information about the framework).
</p><p>The steps necessary to include the bundles and workers into the builds or launchers won't be covered here, as they are covered in detail in other how-tos (see Preconditions).
</p>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Preconditions"><span class="tocnumber">1</span> <span class="toctext">Preconditions</span></a></li>
<li class="toclevel-1"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Prepare_the_bundle"><span class="tocnumber">2</span> <span class="toctext">Prepare the bundle</span></a></li>
<li class="toclevel-1"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Writing_the_workers"><span class="tocnumber">3</span> <span class="toctext">Writing the workers</span></a>
<ul>
<li class="toclevel-2"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#The_Crawler"><span class="tocnumber">3.1</span> <span class="toctext">The Crawler</span></a>
<ul>
<li class="toclevel-3"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#What_is_the_crawler_worker_supposed_to_do.3F"><span class="tocnumber">3.1.1</span> <span class="toctext">What is the crawler worker supposed to do?</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#The_Fetcher"><span class="tocnumber">3.2</span> <span class="toctext">The Fetcher</span></a>
<ul>
<li class="toclevel-3"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#What_is_the_fetcher_worker_supposed_to_do.3F"><span class="tocnumber">3.2.1</span> <span class="toctext">What is the fetcher worker supposed to do?</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Plugging_it_up"><span class="tocnumber">3.3</span> <span class="toctext">Plugging it up</span></a></li>
<li class="toclevel-2"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Mapping_to_Solr"><span class="tocnumber">3.4</span> <span class="toctext">Mapping to Solr</span></a></li>
<li class="toclevel-2"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#And_....Action.21"><span class="tocnumber">3.5</span> <span class="toctext">And ....Action!</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Preconditions"></a><h1> <span class="mw-headline"> Preconditions </span></h1>
<ul><li> Set up your development environment, see <a href="../../Development_Guidelines/Howto_set_up_dev_environment.html" class="mw-redirect" title="SMILA/Development Guidelines/Howto set up dev environment">How to set up the development environment</a>.
</li><li> You should have read and understood the documentation about the <a href="../JobManager.html" title="SMILA/Documentation/JobManager">JobManager</a>, especially the configuration of workers and workflows if you want to create new workers.
</li><li> You should have at least a basic idea of the OSGi framework and OSGi services. For links to introductory articles and tutorials see <a href="http://www.osgi.org/About/HowOSGi" class="external autonumber" title="http://www.osgi.org/About/HowOSGi" rel="nofollow">[1]</a>. For a quite comprehensive overview of OSGi see <a href="http://njbartlett.name/osgibook.html" class="external autonumber" title="http://njbartlett.name/osgibook.html" rel="nofollow">[2]</a>. In particular, SMILA makes extensive use of OSGi's Declarative Services facility, so you may want to have at least a quick look at it.
</li><li> You should already have gone through the <a href="../../Development_Guidelines/How_to_write_a_Worker.html" class="mw-redirect" title="SMILA/Development Guidelines/How to write a Worker">How to write a Worker</a> tutorial, since you need a Crawler and a Fetcher worker in order to be able to crawl a new Data Source.
</li></ul>
<a name="Prepare_the_bundle"></a><h1> <span class="mw-headline"> Prepare the bundle </span></h1>
<p>Please follow the <a href="../../Development_Guidelines/Create_a_bundle_(plug-in).html" class="mw-redirect" title="SMILA/Development Guidelines/Create a bundle (plug-in)">How to create a bundle (plug-in)</a> manual to create a new bundle.
</p><p>Add the following bundles to the <i>Imported Packages</i> list:
</p>
<ul><li> org.eclipse.smila.datamodel: For the Record class.
</li><li> org.eclipse.smila.objectstore: Possible exceptions when accessing input/output streams.
</li><li> org.eclipse.smila.taskmanager: To access the Task.
</li><li> org.eclipse.smila.taskworker: The TaskWorker bundle containing the Worker and TaskContext interfaces.
</li><li> org.eclipse.smila.taskworker.input: Input streams of the TaskWorker bundle.
</li><li> org.eclipse.smila.taskworker.output: Output streams of the TaskWorker bundle.
</li><li> org.eclipse.smila.importing: The importing framework bundle.
</li></ul>
<p>You should also add a test bundle (see <a href="../../Development_Guidelines/Create_a_test_bundle_(plug-in).html" class="mw-redirect" title="SMILA/Development Guidelines/Create a test bundle (plug-in)">How to create a test bundle (plug-in)</a>).
</p>
<a name="Writing_the_workers"></a><h1> <span class="mw-headline"> Writing the workers </span></h1>
<p>You should also have a look at the two existing crawlers in SMILA, <tt>org.eclipse.smila.importing.crawler.file</tt> and <tt>org.eclipse.smila.importing.crawler.web</tt>.
</p>
<a name="The_Crawler"></a><h2> <span class="mw-headline"> The Crawler </span></h2>
<p>The crawler worker is responsible for retrieving or producing the IDs (e.g. URLs etc.) used to address or identify the data in the data source.
</p><p>The only interface the worker has to implement is <span style="font-family:monospace;">org.eclipse.smila.taskworker.Worker</span>.
</p>
<a name="What_is_the_crawler_worker_supposed_to_do.3F"></a><h3> <span class="mw-headline"> What is the crawler worker supposed to do? </span></h3>
<p>The crawler worker is supposed to do the following:
</p>
<ul><li> be invoked by the task generator when the crawl job is started (as a Run-Once job!)
</li><li> optionally get some information about what to crawl (some seed id or base URL or SQL query or whatever)
</li><li> iterate over the data source according to that information
</li><li> and for each entry generate an output record
<ul><li> with the data source property set
</li><li> with the id set (e.g. to the ID of the data source's data record, to make things easier)
</li><li> optionally with the attribute <span style="font-family:monospace;">_deltaHash</span> (ImportingConstants.ATTRIBUTE_DELTA_HASH) set to some information that indicates whether the data has changed in the meantime (a hash over the content, a timestamp of the last modification, etc.), so the delta checker can determine whether the record has to be processed or the data in the index is already up to date.
</li><li> optionally with every other information it can easily gather for the data (e.g. for a file crawler these would be the file's metadata that are quickly available without actually reading the file).
</li></ul>
</li></ul>
<p>So the worker could look something like the following:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">public</span> <span class="kw1">class</span> WhatsoeverCrawlerWorker <span class="kw1">implements</span> Worker <span class="br0">&#123;</span>
&nbsp;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> NAME = <span class="st0">&quot;whatsoeverCrawler&quot;</span>;
&nbsp;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> OUTPUT_SLOT = <span class="st0">&quot;output&quot;</span>;
&nbsp;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> PROPERTY_SEED = <span class="st0">&quot;seed&quot;</span>;
&nbsp;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw4">int</span> MAX_IDS_PER_BULK = <span class="nu0">1024</span>;
&nbsp;
<span class="kw1">private</span> Log _log = LogFactory.<span class="me1">getLog</span><span class="br0">&#40;</span>getClass<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="coMULTI">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw4">void</span> perform<span class="br0">&#40;</span>TaskContext taskContext<span class="br0">&#41;</span> <span class="kw1">throws</span> <span class="kw3">Exception</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> AnyMap taskParams = taskContext.<span class="me1">getTaskParameters</span><span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="kw1">final</span> <span class="kw3">String</span> dataSource = taskParams
.<span class="me1">getStringValue</span><span class="br0">&#40;</span>ImportingConstants.<span class="me1">TASK_PARAM_DATA_SOURCE</span><span class="br0">&#41;</span>;
<span class="kw1">if</span> <span class="br0">&#40;</span>dataSource == <span class="kw2">null</span> || dataSource.<span class="me1">trim</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">length</span><span class="br0">&#40;</span><span class="br0">&#41;</span> == <span class="nu0">0</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">throw</span> <span class="kw1">new</span> <span class="kw3">IllegalArgumentException</span><span class="br0">&#40;</span><span class="st0">&quot;Parameter '&quot;</span>
+ ImportingConstants.<span class="me1">TASK_PARAM_DATA_SOURCE</span> + <span class="st0">&quot;' of task &quot;</span>
+ taskContext.<span class="me1">getTask</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getTaskId</span><span class="br0">&#40;</span><span class="br0">&#41;</span> + <span class="st0">&quot; is null or empty&quot;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
<span class="kw1">final</span> <span class="kw3">String</span> seedId = taskParams.<span class="me1">getStringValue</span><span class="br0">&#40;</span>PROPERTY_SEED<span class="br0">&#41;</span>;
<span class="kw1">if</span> <span class="br0">&#40;</span>seedId == <span class="kw2">null</span> || seedId.<span class="me1">trim</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">length</span><span class="br0">&#40;</span><span class="br0">&#41;</span> == <span class="nu0">0</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">throw</span> <span class="kw1">new</span> <span class="kw3">IllegalArgumentException</span><span class="br0">&#40;</span><span class="st0">&quot;Parameter '&quot;</span> + PROPERTY_SEED
+ <span class="st0">&quot;' of task &quot;</span> + taskContext.<span class="me1">getTask</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getTaskId</span><span class="br0">&#40;</span><span class="br0">&#41;</span>
+ <span class="st0">&quot; is null or empty&quot;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
&nbsp;
<span class="kw4">int</span> recordCount = <span class="nu0">0</span>;
<span class="kw4">int</span> recordOutputIndex = <span class="nu0">0</span>;
RecordOutput recordOutput = taskContext.<span class="me1">getOutputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getAsRecordOutput</span><span class="br0">&#40;</span>
OUTPUT_SLOT, recordOutputIndex<span class="br0">&#41;</span>;
<span class="kw1">for</span> <span class="br0">&#40;</span>Record record&nbsp;: getRecordsBySeed<span class="br0">&#40;</span>seedId, dataSource<span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
recordOutput.<span class="me1">writeRecord</span><span class="br0">&#40;</span>record<span class="br0">&#41;</span>;
recordCount++;
<span class="kw1">if</span> <span class="br0">&#40;</span>_log.<span class="me1">isDebugEnabled</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
_log.<span class="me1">debug</span><span class="br0">&#40;</span><span class="st0">&quot;added id &quot;</span> + record.<span class="me1">getId</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>recordCount&nbsp;% MAX_IDS_PER_BULK == <span class="nu0">0</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
recordOutput.<span class="me1">commit</span><span class="br0">&#40;</span><span class="br0">&#41;</span>;
recordOutputIndex++;
recordOutput = taskContext.<span class="me1">getOutputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getAsRecordOutput</span><span class="br0">&#40;</span>
OUTPUT_SLOT, recordOutputIndex<span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
_log.<span class="me1">info</span><span class="br0">&#40;</span><span class="st0">&quot;Found &quot;</span> + recordCount + <span class="st0">&quot; records for seed id &quot;</span> + seedId + <span class="st0">&quot;.&quot;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
&nbsp;
<span class="coMULTI">/**
* gets records from the data source, if possible fills the
* {@link ImportingConstants#ATTRIBUTE_DELTA_HASH} attribute for the delta
* checker to be able to determine if the record has to be updated/inserted
* at all.
*
* @param seedId
* the seed id to know where/what to crawl.
* @param dataSource
* the data source to crawl.
* @return a list of records containing the ID of the data source's data and
* optionally a delta hash.
*/</span>
<span class="kw1">private</span> List&lt;Record&gt; getRecordsBySeed<span class="br0">&#40;</span><span class="kw1">final</span> <span class="kw3">String</span> seedId, <span class="kw1">final</span> <span class="kw3">String</span> dataSource<span class="br0">&#41;</span> <span class="br0">&#123;</span>
ArrayList&lt;Record&gt; recordsToCrawl = <span class="kw1">new</span> ArrayList&lt;Record&gt;<span class="br0">&#40;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="co1">// iterate over the entries in the data source determined by the seed id</span>
<span class="kw1">while</span><span class="br0">&#40;</span>...<span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="co1">// id: the id of the data</span>
<span class="co1">// lastModified: the last modified date of the record (omit if it cannot be determined)</span>
<span class="kw1">final</span> Record record = DataFactory.<span class="kw1">DEFAULT</span>.<span class="me1">createRecord</span><span class="br0">&#40;</span>id, dataSource<span class="br0">&#41;</span>;
record.<span class="me1">getMetadata</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">put</span><span class="br0">&#40;</span>ImportingConstants.<span class="me1">ATTRIBUTE_DELTA_HASH</span>, lastModified<span class="br0">&#41;</span>;
recordsToCrawl.<span class="me1">add</span><span class="br0">&#40;</span>record<span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
<span class="kw1">return</span> recordsToCrawl;
<span class="br0">&#125;</span>
&nbsp;
<span class="coMULTI">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw3">String</span> getName<span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">return</span> NAME;
<span class="br0">&#125;</span>
<span class="br0">&#125;</span></pre></div>
<p>If your data source is a bit more complex, e.g. hierarchical (file system, etc.), or you have to follow a linked source (e.g. a web site), you might just have a look at how the sample implementations of the file and web crawlers work (e.g. using the VisitedLinks service or looping back to the crawler to visit the next hierarchy stage, etc.).
</p>
<a name="The_Fetcher"></a><h2> <span class="mw-headline"> The Fetcher </span></h2>
<p>Now that we've created bulks of records pointing to the data to be imported into SMILA, we need a worker that actually fetches the data from the data source using the IDs the crawler provided.
</p><p>The only interface the worker has to implement is <span style="font-family:monospace;">org.eclipse.smila.taskworker.Worker</span>.
</p>
<a name="What_is_the_fetcher_worker_supposed_to_do.3F"></a><h3> <span class="mw-headline"> What is the fetcher worker supposed to do? </span></h3>
<ul><li> Read the records sent from the crawler and filtered by the delta checker
</li><li> get the data to be processed by SMILA out of the data source for each ID in the record bulk
</li><li> create records from that data
</li><li> hand that data over to the update pusher, which in turn hands it over to the import workflow (i.e. the bulk builder)
</li><li> optionally (if supported) extract compounds or send them to a compound extractor worker to do so.
</li></ul>
<p>So the fetcher worker would look something like the following, with the magic happening in the <span style="font-family:monospace;">fetch(...)</span> method, which has to access the data source, retrieve the data, add it as an attachment and fill in other metadata as needed (you might have a look at the <span style="font-family:monospace;">FileFetcherWorker</span> or the web crawler's <span style="font-family:monospace;">SimpleFetcher</span> implementation for inspiration).
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">public</span> <span class="kw1">class</span> WhatsoeverFetcherWorker <span class="kw1">implements</span> Worker <span class="br0">&#123;</span>
&nbsp;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> NAME = <span class="st0">&quot;whatsoeverFetcher&quot;</span>;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> INPUT_SLOT = <span class="st0">&quot;input&quot;</span>;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> OUTPUT_SLOT = <span class="st0">&quot;output&quot;</span>;
<span class="kw1">protected</span> <span class="kw1">final</span> Log _log = LogFactory.<span class="me1">getLog</span><span class="br0">&#40;</span>getClass<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="coMULTI">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw4">void</span> perform<span class="br0">&#40;</span>TaskContext taskContext<span class="br0">&#41;</span> <span class="kw1">throws</span> <span class="kw3">Exception</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> RecordInput recordInput = taskContext.<span class="me1">getInputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>
.<span class="me1">getAsRecordInput</span><span class="br0">&#40;</span>INPUT_SLOT<span class="br0">&#41;</span>;
<span class="kw1">final</span> RecordOutput recordOutput = taskContext.<span class="me1">getOutputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>
.<span class="me1">getAsRecordOutput</span><span class="br0">&#40;</span>OUTPUT_SLOT<span class="br0">&#41;</span>;
Record record;
<span class="kw1">do</span> <span class="br0">&#123;</span>
record = recordInput.<span class="me1">getRecord</span><span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="kw1">if</span> <span class="br0">&#40;</span>record&nbsp;!= <span class="kw2">null</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>_log.<span class="me1">isDebugEnabled</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
_log.<span class="me1">debug</span><span class="br0">&#40;</span><span class="st0">&quot;fetching content for record &quot;</span> + record.<span class="me1">getId</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
fetch<span class="br0">&#40;</span>record, taskContext<span class="br0">&#41;</span>;
recordOutput.<span class="me1">writeRecord</span><span class="br0">&#40;</span>record<span class="br0">&#41;</span>;
<span class="kw1">if</span> <span class="br0">&#40;</span>_log.<span class="me1">isDebugEnabled</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
_log.<span class="me1">debug</span><span class="br0">&#40;</span><span class="st0">&quot;added record &quot;</span> + record.<span class="me1">getId</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span> <span class="kw1">while</span> <span class="br0">&#40;</span>record&nbsp;!= <span class="kw2">null</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
&nbsp;
<span class="coMULTI">/**
* Actually retrieves the data from the source based on the ID of the record
* and fills in the record's meta data and/or attachments.
*
* @param record
* the record to be completed with information from the data
* source
* @param taskContext
* the tasks context.
*/</span>
<span class="kw1">private</span> <span class="kw4">void</span> fetch<span class="br0">&#40;</span>Record record, TaskContext taskContext<span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> <span class="kw4">long</span> time = taskContext.<span class="me1">getTimestamp</span><span class="br0">&#40;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="co1">// go and fetch the content and fill the record's content, metadata and/or</span>
<span class="co1">// attachments with it.</span>
record.<span class="me1">getMetadata</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">put</span><span class="br0">&#40;</span>..., ...<span class="br0">&#41;</span>;
...
&nbsp;
<span class="me1">taskContext</span>.<span class="me1">measureTime</span><span class="br0">&#40;</span><span class="st0">&quot;fetchContent&quot;</span>, time<span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
&nbsp;
<span class="coMULTI">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw3">String</span> getName<span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">return</span> NAME;
<span class="br0">&#125;</span>
<span class="br0">&#125;</span></pre></div>
<a name="Plugging_it_up"></a><h2> <span class="mw-headline"> Plugging it up </span></h2>
<p>So, now we have to plug it all together.
</p>
<ul><li> Write component definitions for your workers (and as well for your service if one is needed to access your data source).
</li><li> Add the bundle to the launcher and the config.ini file.
</li><li> Set your Scale-Up limits
</li><li> add worker descriptions to the <span style="font-family:monospace;">workers.json</span> file for your workers, these could look something like the following code snippet.
<ul><li> <b>Please note:</b> we need the task generator here for the runOnce triggering!
</li></ul>
</li></ul>
<div dir="ltr" style="text-align: left;"><pre class="source-javascript"><span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span>: <span class="st0">&quot;whatsoeverCrawler&quot;</span>,
<span class="st0">&quot;taskGenerator&quot;</span>:<span class="st0">&quot;runOnceTrigger&quot;</span>,
<span class="st0">&quot;parameters&quot;</span>:<span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span>:<span class="st0">&quot;dataSource&quot;</span>
<span class="br0">&#125;</span>,
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span>:<span class="st0">&quot;seed&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#93;</span>,
<span class="st0">&quot;input&quot;</span>: <span class="br0">&#91;</span><span class="br0">&#93;</span>,
<span class="st0">&quot;output&quot;</span>: <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span>: <span class="st0">&quot;output&quot;</span>,
<span class="st0">&quot;type&quot;</span>: <span class="st0">&quot;recordBulks&quot;</span>,
<span class="st0">&quot;modes&quot;</span>:<span class="br0">&#91;</span>
<span class="st0">&quot;maybeEmpty&quot;</span>,
<span class="st0">&quot;multiple&quot;</span>
<span class="br0">&#93;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>
<span class="br0">&#125;</span>,
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span>: <span class="st0">&quot;whatsoeverFetcher&quot;</span>,
<span class="st0">&quot;input&quot;</span>: <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span>: <span class="st0">&quot;input&quot;</span>,
<span class="st0">&quot;type&quot;</span>: <span class="st0">&quot;recordBulks&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>,
<span class="st0">&quot;output&quot;</span>: <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span>: <span class="st0">&quot;output&quot;</span>,
<span class="st0">&quot;type&quot;</span>: <span class="st0">&quot;recordBulks&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div>
<ul><li> add the workers to a sensible workflow like e.g.
</li></ul>
<div dir="ltr" style="text-align: left;"><pre class="source-javascript"><span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span>:<span class="st0">&quot;whatsoeverCrawling&quot;</span>,
<span class="st0">&quot;startAction&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span>:<span class="st0">&quot;whatsoeverCrawler&quot;</span>,
<span class="st0">&quot;output&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;output&quot;</span>:<span class="st0">&quot;somethingToCrawlBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>,
<span class="st0">&quot;actions&quot;</span>:<span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span>:<span class="st0">&quot;deltaChecker&quot;</span>,
<span class="st0">&quot;input&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;recordsToCheck&quot;</span>:<span class="st0">&quot;somethingToCrawlBucket&quot;</span>
<span class="br0">&#125;</span>,
<span class="st0">&quot;output&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;updatedRecords&quot;</span>:<span class="st0">&quot;somethingToFetchBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>,
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span>:<span class="st0">&quot;whatsoeverFetcher&quot;</span>,
<span class="st0">&quot;input&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;input&quot;</span>:<span class="st0">&quot;somethingToFetchBucket&quot;</span>
<span class="br0">&#125;</span>,
<span class="st0">&quot;output&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;output&quot;</span>:<span class="st0">&quot;somethingToPushBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>,
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span>:<span class="st0">&quot;updatePusher&quot;</span>,
<span class="st0">&quot;input&quot;</span>:<span class="br0">&#123;</span>
<span class="st0">&quot;recordsToPush&quot;</span>:<span class="st0">&quot;somethingToPushBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
<span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div>
<ul><li> For your convenience you can also create a predefined job in the jobs.json, like the following snippet (note that the seed parameter is fixed if you choose to use a predefined job)
</li></ul>
<div dir="ltr" style="text-align: left;"><pre class="source-javascript"><span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span>: <span class="st0">&quot;crawlWhatsoever&quot;</span>,
<span class="st0">&quot;workflow&quot;</span>: <span class="st0">&quot;whatsoeverCrawling&quot;</span>,
<span class="st0">&quot;parameters&quot;</span>: <span class="br0">&#123;</span>
<span class="st0">&quot;tempStore&quot;</span>: <span class="st0">&quot;temp&quot;</span>,
<span class="st0">&quot;dataSource&quot;</span>: <span class="st0">&quot;whatsoever&quot;</span>,
<span class="st0">&quot;seed&quot;</span>: <span class="st0">&quot;your seed data&quot;</span>,
<span class="st0">&quot;jobToPushTo&quot;</span>: <span class="st0">&quot;indexUpdate&quot;</span>
<span class="br0">&#125;</span></pre></div>
<a name="Mapping_to_Solr"></a><h2> <span class="mw-headline"> Mapping to Solr </span></h2>
<p>You may have to map your record's attributes to Solr.
See AdaptFileCrawlerWorkerOutput.bpel as well as AddPipeline.bpel in <tt>SMILA.application/configuration/org.eclipse.smila.processing.bpel/pipelines/</tt>.
</p><p>If you have used attributes that do not match your Solr attributes, you have to adapt them in order to index them. You do not have to do so, if you used attributes that match your Solr schema.
</p><p>In <tt>SMILA.application</tt> you can find existing mappings for file and web crawling in the configuration folder <tt>configuration/org.eclipse.smila.processing.bpel/pipelines/</tt>.
</p><p>Just add your own mapping pipeline, e.g. by copying, renaming and adapting <tt>AdaptFileCrawlerWorkerOutput.bpel</tt> or by building it from scratch (the key component to use within is the <tt>org.eclipse.smila.processing.pipelets.CopyPipelet</tt>).
</p><p>Then extend the <tt>AddPipeline</tt> by inserting your new mapping pipeline.
</p><p>Don't forget to add your adaption pipeline to the deployment descriptor <tt>deploy.xml</tt> (as a process and as a partner link in the AddPipeline as well!).
</p><p>Now the attributes of the records your fetcher fetched will be mapped according to your Solr configuration.
</p>
<a name="And_....Action.21"></a><h2> <span class="mw-headline"> And ....Action! </span></h2>
<p>So now it's time to check if everything went right.
</p>
<ul><li> Start SMILA
</li><li> check if you can access your worker definitions, workflow and job via the REST API. If not, check for errors (syntax errors in the json files, others in SMILA log).
</li><li> check in SMILA's log if your workers were added
</li><li> check if your adaption pipeline is visible in the pipeline list (<tt><a href="http://localhost:8080/smila/pipeline/" class="external free" title="http://localhost:8080/smila/pipeline/" rel="nofollow">http://localhost:8080/smila/pipeline/</a></tt>)
</li><li> start the indexing job: POST <tt><a href="http://localhost:8080/smila/jobmanager/jobs/indexUpdate/" class="external free" title="http://localhost:8080/smila/jobmanager/jobs/indexUpdate/" rel="nofollow">http://localhost:8080/smila/jobmanager/jobs/indexUpdate/</a></tt>
</li><li> start your crawling job (remember: it has to be started as a RunOnce Job!)
</li></ul>
<div dir="ltr" style="text-align: left;"><pre class="source-text">POST http://localhost:8080/smila/jobmanager/jobs/crawlWhatsoever/
{
&quot;mode&quot;: &quot;runOnce&quot;
}</pre></div>
<ul><li> Check your jobs: after your crawl job has succeeded, you can finish your input job. After the input job has succeeded (if you finished it), you should wait some seconds (up to 60, because the autocommit takes some time) before checking whether your data was indexed (see <tt><a href="http://localhost:8080/SMILA/search" class="external free" title="http://localhost:8080/SMILA/search" rel="nofollow">http://localhost:8080/SMILA/search</a></tt>).
</li></ul>
<p>So now you should be able to search in your content.
</p><p>If you can find your records, you have just successfully added a new datasource to your SMILA application. Congratulations!
</p>
<!--
NewPP limit report
Preprocessor node count: 77/1000000
Post-expand include size: 440/2097152 bytes
Template argument size: 132/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:35477-0!1!0!!en!2!edit=0 and timestamp 20120202221648 -->
<div class="printfooter">
Retrieved from "<a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html">http://wiki.eclipse.org/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework</a>"</div>
<!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"/>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 14:05, 25 January 2012 by <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>.</p>
<p id="footerviews">This page has been accessed 161 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.051 secs. --></body></html>