<!-- blob: f0a8aae098f32e3915896bd20990b2cf4f187b14 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en" dir="ltr" class="client-nojs">
<head>
<meta charset="UTF-8" />
<title>SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework - Eclipsepedia</title>
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="generator" content="MediaWiki 1.23.2" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/eclipse.org-common/themes/solstice/public/images/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (en)" />
<link rel="EditURI" type="application/rsd+xml" href="http://wiki.eclipse.org/api.php?action=rsd" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom feed" href="http://wiki.eclipse.org/index.php?title=Special:RecentChanges&amp;feed=atom" />
<link rel="stylesheet" href="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.ui.button&amp;only=styles&amp;skin=solstice&amp;*" />
<link rel="stylesheet" href="http://wiki.eclipse.org/skins/solstice/public/stylesheets/styles.min.css?303" media="screen, print" /><meta name="ResourceLoaderDynamicStyles" content="" />
<style>a:lang(ar),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none}
/* cache key: my_wiki:resourceloader:filter:minify-css:7:14ece53a42aa314864e5fd8c57f0d98f */</style>
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=startup&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework","wgTitle":"SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework","wgCurRevisionId":326097,"wgRevisionId":326097,"wgArticleId":35477,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRelevantPageName":"SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework","wgIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgWikiEditorEnabledModules":{"toolbar":false,"dialogs":false,"hidesig":true,"preview":false,"previewDialog":false,"publish":false},"wgCategoryTreePageCategoryOptions":"{\"mode\":0,\"hideprefix\":20,\"showcount\":true,\"namespaces\":false}"});
}</script><script>if(window.mw){
mw.loader.implement("user.options",function($,jQuery){mw.user.options.set({"ccmeonemails":0,"cols":80,"date":"default","diffonly":0,"disablemail":0,"editfont":"default","editondblclick":0,"editsectiononrightclick":0,"enotifminoredits":0,"enotifrevealaddr":0,"enotifusertalkpages":1,"enotifwatchlistpages":1,"extendwatchlist":0,"fancysig":0,"forceeditsummary":0,"gender":"unknown","hideminor":0,"hidepatrolled":0,"imagesize":2,"math":1,"minordefault":0,"newpageshidepatrolled":0,"nickname":"","norollbackdiff":0,"numberheadings":0,"previewonfirst":0,"previewontop":1,"rcdays":7,"rclimit":50,"rows":25,"showhiddencats":0,"shownumberswatching":1,"showtoolbar":1,"skin":"solstice","stubthreshold":0,"thumbsize":2,"underline":2,"uselivepreview":0,"usenewrc":0,"watchcreations":1,"watchdefault":1,"watchdeletion":0,"watchlistdays":3,"watchlisthideanons":0,"watchlisthidebots":0,"watchlisthideliu":0,"watchlisthideminor":0,"watchlisthideown":0,"watchlisthidepatrolled":0,"watchmoves":0,"wllimit":250,
"useeditwarning":1,"prefershttps":1,"language":"en","variant-gan":"gan","variant-iu":"iu","variant-kk":"kk","variant-ku":"ku","variant-shi":"shi","variant-sr":"sr","variant-tg":"tg","variant-uz":"uz","variant-zh":"zh","searchNs0":true,"searchNs1":false,"searchNs2":false,"searchNs3":false,"searchNs4":false,"searchNs5":false,"searchNs6":false,"searchNs7":false,"searchNs8":false,"searchNs9":false,"searchNs10":false,"searchNs11":false,"searchNs12":false,"searchNs13":false,"searchNs14":false,"searchNs15":false,"variant":"en"});},{},{});mw.loader.implement("user.tokens",function($,jQuery){mw.user.tokens.set({"editToken":"+\\","patrolToken":false,"watchToken":false});},{},{});
/* cache key: my_wiki:resourceloader:filter:minify-js:7:70d74423d3fc1e1c18fa9a1ff645a84a */
}</script>
<script>if(window.mw){
mw.loader.load(["mediawiki.page.startup","mediawiki.legacy.wikibits","mediawiki.legacy.ajax"]);
}</script>
<style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal;}
.source-java li, .source-java pre {
line-height: normal; border: 0px none white;
}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
* (http://qbnz.com/highlighter/ and http://geshi.org/)
* --------------------------------------
*/
.java.source-java .de1, .java.source-java .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
.java.source-java {font-family:monospace;}
.java.source-java .imp {font-weight: bold; color: red;}
.java.source-java li, .java.source-java .li1 {font-weight: normal; vertical-align:top;}
.java.source-java .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
.java.source-java .li2 {font-weight: bold; vertical-align:top;}
.java.source-java .kw1 {color: #7F0055; font-weight: bold;}
.java.source-java .kw2 {color: #7F0055; font-weight: bold;}
.java.source-java .kw3 {color: #000000; font-weight: normal}
.java.source-java .kw4 {color: #7F0055; font-weight: bold;}
.java.source-java .co1 {color: #3F7F5F; font-style: italic;}
.java.source-java .co2 {color: #3F7F5F;}
.java.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.java.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.java.source-java .es0 {color: #000000;}
.java.source-java .br0 {color: #000000;}
.java.source-java .sy0 {color: #000000;}
.java.source-java .st0 {color: #2A00ff;}
.java.source-java .nu0 {color: #000000;}
.java.source-java .me1 {color: #000000;}
.java.source-java .me2 {color: #000000;}
.java.source-java .ln-xtra, .java.source-java li.ln-xtra, .java.source-java div.ln-xtra {background-color: #ffc;}
.java.source-java span.xtra { display:block; }
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal;}
.source-xml li, .source-xml pre {
line-height: normal; border: 0px none white;
}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
* (http://qbnz.com/highlighter/ and http://geshi.org/)
* --------------------------------------
*/
.xml.source-xml .de1, .xml.source-xml .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
.xml.source-xml {font-family:monospace;}
.xml.source-xml .imp {font-weight: bold; color: red;}
.xml.source-xml li, .xml.source-xml .li1 {font-weight: normal; vertical-align:top;}
.xml.source-xml .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
.xml.source-xml .li2 {font-weight: bold; vertical-align:top;}
.xml.source-xml .es0 {color: #000099; font-weight: bold;}
.xml.source-xml .br0 {color: #66cc66;}
.xml.source-xml .sy0 {color: #66cc66;}
.xml.source-xml .st0 {color: #ff0000;}
.xml.source-xml .nu0 {color: #cc66cc;}
.xml.source-xml .sc-1 {color: #808080; font-style: italic;}
.xml.source-xml .sc0 {color: #00bbdd;}
.xml.source-xml .sc1 {color: #ddbb00;}
.xml.source-xml .sc2 {color: #339933;}
.xml.source-xml .sc3 {color: #009900;}
.xml.source-xml .re0 {color: #000066;}
.xml.source-xml .re1 {color: #000000; font-weight: bold;}
.xml.source-xml .re2 {color: #000000; font-weight: bold;}
.xml.source-xml .ln-xtra, .xml.source-xml li.ln-xtra, .xml.source-xml div.ln-xtra {background-color: #ffc;}
.xml.source-xml span.xtra { display:block; }
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-javascript {line-height: normal;}
.source-javascript li, .source-javascript pre {
line-height: normal; border: 0px none white;
}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for javascript
* CSS class: source-javascript, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
* (http://qbnz.com/highlighter/ and http://geshi.org/)
* --------------------------------------
*/
.javascript.source-javascript .de1, .javascript.source-javascript .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
.javascript.source-javascript {font-family:monospace;}
.javascript.source-javascript .imp {font-weight: bold; color: red;}
.javascript.source-javascript li, .javascript.source-javascript .li1 {font-weight: normal; vertical-align:top;}
.javascript.source-javascript .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
.javascript.source-javascript .li2 {font-weight: bold; vertical-align:top;}
.javascript.source-javascript .kw1 {color: #000066; font-weight: bold;}
.javascript.source-javascript .kw2 {color: #003366; font-weight: bold;}
.javascript.source-javascript .kw3 {color: #000066;}
.javascript.source-javascript .kw5 {color: #FF0000;}
.javascript.source-javascript .co1 {color: #006600; font-style: italic;}
.javascript.source-javascript .co2 {color: #009966; font-style: italic;}
.javascript.source-javascript .coMULTI {color: #006600; font-style: italic;}
.javascript.source-javascript .es0 {color: #000099; font-weight: bold;}
.javascript.source-javascript .br0 {color: #009900;}
.javascript.source-javascript .sy0 {color: #339933;}
.javascript.source-javascript .st0 {color: #3366CC;}
.javascript.source-javascript .nu0 {color: #CC0000;}
.javascript.source-javascript .me1 {color: #660066;}
.javascript.source-javascript .ln-xtra, .javascript.source-javascript li.ln-xtra, .javascript.source-javascript div.ln-xtra {background-color: #ffc;}
.javascript.source-javascript span.xtra { display:block; }
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-text {line-height: normal;}
.source-text li, .source-text pre {
line-height: normal; border: 0px none white;
}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for text
* CSS class: source-text, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
* (http://qbnz.com/highlighter/ and http://geshi.org/)
* --------------------------------------
*/
.text.source-text .de1, .text.source-text .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
.text.source-text {font-family:monospace;}
.text.source-text .imp {font-weight: bold; color: red;}
.text.source-text li, .text.source-text .li1 {font-weight: normal; vertical-align:top;}
.text.source-text .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
.text.source-text .li2 {font-weight: bold; vertical-align:top;}
.text.source-text .ln-xtra, .text.source-text li.ln-xtra, .text.source-text div.ln-xtra {background-color: #ffc;}
.text.source-text span.xtra { display:block; }
/*]]>*/
</style><meta name="viewport" content="width=device-width, initial-scale=1.0"></head>
<body class="mediawiki ltr sitedir-ltr ns-0 ns-subject page-SMILA_Documentation_HowTo_How_to_add_a_new_Data_Source_to_the_importing_framework skin-solstice action-view" id="solstice">
<a class="sr-only" href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#content">Skip to main content</a>
<div class="thin-header">
<header role="banner" class="hidden-print noprint">
<div class="container-fluid">
<div id="row-logo-search">
<div id="header-left">
<div class="row">
<div class="hidden-xs col-sm-6 logo-container">
<a href="https://www.eclipse.org/" ><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia"></a>
</div>
<div class="navbar col-sm-18 yamm" id="main-menu">
<div class="navbar-collapse collapse" id="navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a target="_self" href="https://eclipse.org/downloads/">Download</a></li>
<li><a target="_self" href="https://eclipse.org/users/">Getting Started </a></li>
<li><a target="_self" href="https://eclipse.org/membership/">Members</a></li>
<li><a target="_self" href="https://eclipse.org/projects/">Projects</a></li>
<li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#">Community <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#">Participate <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#">Working Groups <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul></li><!-- More -->
<li class="dropdown hidden-xs"><a class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
<ul class="dropdown-menu">
<li>
<!-- Content container to add padding -->
<div class="yamm-content">
<div class="row">
<ul class="col-sm-8 list-unstyled"><li><p><strong>Community</strong></p></li><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Participate</strong></p></li><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Working Groups</strong></p></li><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul> </div>
</div>
</li>
</ul>
</li>
</ul>
</div>
<div class="navbar-header">
<button data-target="#navbar-collapse-1" data-toggle="collapse" class="navbar-toggle" type="button">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="https://www.eclipse.org/" class="visible-xs navbar-brand"><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia" width="174"></a>
</div>
</div>
</div>
</div>
</div>
</div>
</header>
<section class="defaut-breadcrumbs hidden-print noprint hidden-print clearfix" id="breadcrumb">
<div>
<ol class="breadcrumb">
<li><a href="https://www.eclipse.org/">Home</a></li>
<li><a href="http://wiki.eclipse.org/Main_Page">Eclipse Wiki</a></li>
<li class="active">SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework</li></ol>
</div>
</section>
</div>
<div class="toolbar-menu breadcrumbs-offset noprint hidden-print margin-bottom-0 clearfix">
<div class="col-md-24">
<ol class="breadcrumb" role="navigation">
<li id="pt-login">
<a href="http://wiki.eclipse.org/index.php?title=Special:UserLogin&amp;returnto=SMILA%2FDocumentation%2FHowTo%2FHow+to+add+a+new+Data+Source+to+the+importing+framework">
<i class="fa fa-sign-in fa-fw orange"></i> Log in </a>
</li>
</ol>
</div>
</div>
<main role="main" class="background-grey">
<div class="container-full padding-top-25">
<!-- content -->
<section id="content" class="mw-body container-full clearfix 0">
<div id="mw-js-message" style="display:none;"></div>
<!-- bodyContent -->
<div id="bodyContent">
<!-- jumpto -->
<div id="jump-to-nav" class="mw-jump">
Jump to: <a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#mw-head">navigation</a>,
<a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#p-search">search</a>
</div>
<!-- /jumpto -->
<!-- leftcol -->
<aside class="col-md-4 noprint hidden-print" id="leftcol">
<form class="input-group" role="form" id="form-eclipse-search" action="http://wiki.eclipse.org/index.php">
<input id="searchInput" class="search-query form-control" type="search" accesskey="f" title="Special:Search" placeholder="Search" name="search" value="">
<span class="input-group-btn">
<button value="search" id="mw-searchButton" type="submit" class="btn btn-default" title="Search the pages for this text" name="fulltext">
<i class="fa fa-search"></i>
</button>
</span>
</form>
<select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Navigation---</span></option><option value="/Main_Page">Main Page</option><option value="/Eclipsepedia:Community_portal">Community portal</option><option value="/Eclipsepedia:Current_events">Current events</option><option value="/Special:RecentChanges">Recent changes</option><option value="/Special:Random">Random page</option><option value="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents">Help</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Navigation</span></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Main_Page" id="n-mainpage" title="Visit the main page [z]" accesskey="z">Main Page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Community_portal" id="n-portal" title="About the project, what you can do, where to find things">Community portal</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Current_events" id="n-currentevents" title="Find background information on current events">Current events</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChanges" id="n-recentchanges" title="A list of recent changes in the wiki [r]" accesskey="r">Recent changes</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:Random" id="n-randompage" title="Load a random page [x]" accesskey="x">Random page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents" id="n-help" title="The place to find out">Help</a></li></ul> <select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Toolbox---</span></option><option value="/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=info">Page information</option><option value="/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;oldid=326097">Permanent link</option><option value="/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;printable=yes">Printable version</option><option value="/Special:SpecialPages">Special pages</option><option value="/Special:RecentChangesLinked/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework">Related changes</option><option value="/Special:WhatLinksHere/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework">What links here</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Toolbox</span></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=info" id="t-info">Page information</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;oldid=326097" id="t-permalink" title="Permanent link to this revision of the page">Permanent link</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;printable=yes" id="t-print" rel="alternate" title="Printable version of this page [p]" accesskey="p">Printable version</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:SpecialPages" id="t-specialpages" title="A list of all special pages [q]" accesskey="q">Special pages</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChangesLinked/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework" id="t-recentchangeslinked" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:WhatLinksHere/SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework" id="t-whatlinkshere" title="A list of all wiki pages that link here [j]" accesskey="j">What links here</a></li></ul> </aside>
<!-- /leftcol -->
<!-- mainContent -->
<div id="mainContent" class="col-md-20">
<ul class="nav nav-tabs noprint hidden-print" role="tablist">
<li id="ca-nstab-main" class="active"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html" title="View the content page [c]" accesskey="c" tabindex="-1">Page</a></li>
<li id="ca-talk" class="new"><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=edit&amp;redlink=1" title="Discussion about the content page [t]" accesskey="t" tabindex="-1">Discussion</a></li>
<li id="ca-viewsource"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e" tabindex="-1">View source</a></li>
<li id="ca-history" class="collapsible"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;action=history" title="Past revisions of this page [h]" accesskey="h" tabindex="-1">History</a></li>
</ul> <div class="tab-content background-white">
<div id="tab-pane-main-page-content" class="tab-pane active">
<h1 id="firstHeading" class="firstHeading page-header">
<span dir="auto">SMILA/Documentation/HowTo/How to add a new Data Source to the importing framework</span>
</h1>
<div id="main-page-content">
<!-- subtitle -->
<div id="contentSub" class="alert alert-small alert-warning"><span class="subpages">&lt; <a href="../../../SMILA.html" title="SMILA">SMILA</a>&lrm; | <a href="../../Documentation.1.html" title="SMILA/Documentation">Documentation</a>&lrm; | <a href="../HowTo.html" title="SMILA/Documentation/HowTo">HowTo</a></span></div>
<!-- /subtitle -->
<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr"><p>This how-to manual shows how you can add a new data source (e.g. database, connectors, etc.) to the new SMILA importing framework (see <a href="../Importing/Concept.html" title="SMILA/Documentation/Importing/Concept">Importing Concept</a> for more information about the framework).
</p><p>The steps necessary to include the bundles and workers into the builds or launchers won't be covered here, as they are covered in detail in other how-tos (see Preconditions).
</p>
<div id="toc" class="toc"><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1 tocsection-1"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Preconditions"><span class="tocnumber">1</span> <span class="toctext">Preconditions</span></a></li>
<li class="toclevel-1 tocsection-2"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Prepare_the_bundle"><span class="tocnumber">2</span> <span class="toctext">Prepare the bundle</span></a></li>
<li class="toclevel-1 tocsection-3"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Writing_the_workers"><span class="tocnumber">3</span> <span class="toctext">Writing the workers</span></a>
<ul>
<li class="toclevel-2 tocsection-4"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#The_Crawler"><span class="tocnumber">3.1</span> <span class="toctext">The Crawler</span></a>
<ul>
<li class="toclevel-3 tocsection-5"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#What_is_the_crawler_worker_supposed_to_do.3F"><span class="tocnumber">3.1.1</span> <span class="toctext">What is the crawler worker supposed to do?</span></a></li>
</ul>
</li>
<li class="toclevel-2 tocsection-6"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#The_Fetcher"><span class="tocnumber">3.2</span> <span class="toctext">The Fetcher</span></a>
<ul>
<li class="toclevel-3 tocsection-7"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#What_is_the_fetcher_worker_supposed_to_do.3F"><span class="tocnumber">3.2.1</span> <span class="toctext">What is the fetcher worker supposed to do?</span></a></li>
</ul>
</li>
<li class="toclevel-2 tocsection-8"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#The_Extractor"><span class="tocnumber">3.3</span> <span class="toctext">The Extractor</span></a>
<ul>
<li class="toclevel-3 tocsection-9"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#What.27s_the_extractor_worker_supposed_to_do.3F"><span class="tocnumber">3.3.1</span> <span class="toctext">What's the extractor worker supposed to do?</span></a></li>
<li class="toclevel-3 tocsection-10"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Creating_an_extractor_worker_using_the_base_classes"><span class="tocnumber">3.3.2</span> <span class="toctext">Creating an extractor worker using the base classes</span></a></li>
</ul>
</li>
<li class="toclevel-2 tocsection-11"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#Plugging_it_up"><span class="tocnumber">3.4</span> <span class="toctext">Plugging it up</span></a></li>
<li class="toclevel-2 tocsection-12"><a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#And_....Action.21"><span class="tocnumber">3.5</span> <span class="toctext">And ....Action!</span></a></li>
</ul>
</li>
</ul>
</div>
<h1><span class="mw-headline" id="Preconditions">Preconditions</span></h1>
<ul>
<li> Set up your development environment, see <a href="../../Development_Guidelines/Howto_set_up_dev_environment.html" title="SMILA/Development Guidelines/Howto set up dev environment" class="mw-redirect">How to set up the development environment</a>.
</li>
<li> You should have read and understood the documentation about the <a href="../JobManager.html" title="SMILA/Documentation/JobManager">JobManager</a>, especially the configuration of workers and workflows if you want to create new workers.
</li>
<li> You should have at least an idea about the OSGi framework and OSGi services. For links to introductory articles and tutorials see <a rel="nofollow" class="external autonumber" href="http://www.osgi.org/About/HowOSGi">[1]</a>. For a quite comprehensive overview on OSGi see <a rel="nofollow" class="external autonumber" href="http://njbartlett.name/osgibook.html">[2]</a>. Especially, SMILA makes intensive use of OSGi's Declarative Services facility, so you may want to have at least a quick look at it.
</li>
<li> You should already have gone through the <a href="../../Development_Guidelines/How_to_write_a_Worker.html" title="SMILA/Development Guidelines/How to write a Worker" class="mw-redirect">How to write a Worker</a> tutorial, since you need a Crawler and a Fetcher worker in order to be able to crawl a new Data Source.
</li>
</ul>
<h1><span class="mw-headline" id="Prepare_the_bundle">Prepare the bundle</span></h1>
<p>Please follow the <a href="../../Development_Guidelines/Create_a_bundle_(plug-in).html" title="SMILA/Development Guidelines/Create a bundle (plug-in)" class="mw-redirect">How to create a bundle (plug-in)</a> manual to create a new bundle.
</p><p>Add the following bundles to the <i>Imported Packages</i> list:
</p>
<ul>
<li> org.eclipse.smila.datamodel: For the Record class.
</li>
<li> org.eclipse.smila.objectstore: Possible exceptions when accessing input/output streams.
</li>
<li> org.eclipse.smila.taskmanager: To access the Task.
</li>
<li> org.eclipse.smila.taskworker: The TaskWorker bundle containing the Worker and TaskContext interfaces.
</li>
<li> org.eclipse.smila.taskworker.input: Input streams of the TaskWorker bundle.
</li>
<li> org.eclipse.smila.taskworker.output: Output streams of the TaskWorker bundle.
</li>
<li> org.eclipse.smila.importing: The importing framework bundle.
</li>
</ul>
<p>You should also add a test bundle (see <a href="../../Development_Guidelines/Create_a_test_bundle_(plug-in).html" title="SMILA/Development Guidelines/Create a test bundle (plug-in)" class="mw-redirect">How to create a test bundle (plug-in)</a>).
</p>
<h1><span class="mw-headline" id="Writing_the_workers">Writing the workers</span></h1>
<p>You should also have a look at the existing crawlers in SMILA: <tt>org.eclipse.smila.importing.crawler.file</tt>, <tt>org.eclipse.smila.importing.crawler.web</tt>, <tt>org.eclipse.smila.importing.crawler.jdbc</tt> and <tt>org.eclipse.smila.importing.crawler.feed</tt>.
</p>
<h2><span class="mw-headline" id="The_Crawler">The Crawler</span></h2>
<p>The crawler worker is responsible for retrieving or producing the IDs (e.g. URLs etc.) to address or identify the data in the data source.
</p><p>The only interface the worker has to implement is <span style="font-family:monospace;">org.eclipse.smila.taskworker.Worker</span>.
</p>
<h3><span class="mw-headline" id="What_is_the_crawler_worker_supposed_to_do.3F">What is the crawler worker supposed to do?</span></h3>
<p>The crawler worker is supposed to do the following:
</p>
<ul>
<li> be invoked by the task generator when the crawl job is started (as Run-Once job!)
</li>
<li> optionally get some information about what to crawl (some seed id or base URL or SQL query or whatever)
</li>
<li> iterate over the data source according to that information
</li>
<li> and for each entry generate an output record
<ul>
<li> with the data source property set
</li>
<li> with the id set (e.g. to the ID of the data source's data record, to make things easier)
</li>
<li> optionally with the attribute <span style="font-family:monospace;">_deltaHash</span> (ImportingConstants.ATTRIBUTE_DELTA_HASH) set to some information that indicates if the data has been changed meanwhile (a hash over the content or a timestamp of the last modification etc.), so the delta checker can determine if the record has to be processed or the data in the index is up-to-date.
</li>
<li> optionally with data source properties it can easily gather for the data (e.g. for a file crawler these would be the file's metadata that are quickly available without actually reading the file).
</li>
<li> with metadata attributes that were mapped from the data source properties so that they fit to the following processing, e.g. the search index schema
</li>
<li> if the data source contains compound objects, it must set the attribute <span style="font-family:monospace;">_isCompound</span> (ImportingConstants.ATTRIBUTE_COMPOUNDFLAG) to <span style="font-family:monospace;">true</span>, so that the DeltaChecker worker can route compound records to a special compound extractor worker. See below for more details.
</li>
</ul>
</li>
</ul>
<p>So the worker could look something like the following:
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="kw1">public</span> <span class="kw1">class</span> WhatsoeverCrawlerWorker <span class="kw1">implements</span> Worker <span class="br0">&#123;</span>
&#160;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> NAME <span class="sy0">=</span> <span class="st0">&quot;whatsoeverCrawler&quot;</span><span class="sy0">;</span>
&#160;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> OUTPUT_SLOT <span class="sy0">=</span> <span class="st0">&quot;output&quot;</span><span class="sy0">;</span>
&#160;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> PROPERTY_SEED <span class="sy0">=</span> <span class="st0">&quot;seed&quot;</span><span class="sy0">;</span>
&#160;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw4">int</span> MAX_IDS_PER_BULK <span class="sy0">=</span> <span class="nu0">1024</span><span class="sy0">;</span>
&#160;
<span class="kw1">private</span> Log _log <span class="sy0">=</span> LogFactory.<span class="me1">getLog</span><span class="br0">&#40;</span>getClass<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span><span class="sy0">;</span>
&#160;
<span class="co3">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw4">void</span> perform<span class="br0">&#40;</span>TaskContext taskContext<span class="br0">&#41;</span> <span class="kw1">throws</span> <span class="kw3">Exception</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> AnyMap taskParams <span class="sy0">=</span> taskContext.<span class="me1">getTaskParameters</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">final</span> <span class="kw3">String</span> dataSource <span class="sy0">=</span> taskParams
.<span class="me1">getStringValue</span><span class="br0">&#40;</span>ImportingConstants.<span class="me1">TASK_PARAM_DATA_SOURCE</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>dataSource <span class="sy0">==</span> <span class="kw2">null</span> <span class="sy0">||</span> dataSource.<span class="me1">trim</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">length</span><span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="sy0">==</span> <span class="nu0">0</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">throw</span> <span class="kw1">new</span> <span class="kw3">IllegalArgumentException</span><span class="br0">&#40;</span><span class="st0">&quot;Parameter '&quot;</span>
<span class="sy0">+</span> ImportingConstants.<span class="me1">TASK_PARAM_DATA_SOURCE</span> <span class="sy0">+</span> <span class="st0">&quot;' of task &quot;</span>
<span class="sy0">+</span> taskContext.<span class="me1">getTask</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getTaskId</span><span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="sy0">+</span> <span class="st0">&quot; is null or empty&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="kw1">final</span> <span class="kw3">String</span> seedId <span class="sy0">=</span> taskParams.<span class="me1">getStringValue</span><span class="br0">&#40;</span>PROPERTY_SEED<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>seedId <span class="sy0">==</span> <span class="kw2">null</span> <span class="sy0">||</span> seedId.<span class="me1">trim</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">length</span><span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="sy0">==</span> <span class="nu0">0</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">throw</span> <span class="kw1">new</span> <span class="kw3">IllegalArgumentException</span><span class="br0">&#40;</span><span class="st0">&quot;Parameter '&quot;</span> <span class="sy0">+</span> PROPERTY_SEED
<span class="sy0">+</span> <span class="st0">&quot;' of task &quot;</span> <span class="sy0">+</span> taskContext.<span class="me1">getTask</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getTaskId</span><span class="br0">&#40;</span><span class="br0">&#41;</span>
<span class="sy0">+</span> <span class="st0">&quot; is null or empty&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="kw4">int</span> recordCount <span class="sy0">=</span> <span class="nu0">0</span><span class="sy0">;</span>
<span class="kw4">int</span> recordOutputIndex <span class="sy0">=</span> <span class="nu0">0</span><span class="sy0">;</span>
RecordOutput recordOutput <span class="sy0">=</span> taskContext.<span class="me1">getOutputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getAsRecordOutput</span><span class="br0">&#40;</span>
OUTPUT_SLOT, recordOutputIndex<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">for</span> <span class="br0">&#40;</span>Record record <span class="sy0">:</span> getRecordsBySeed<span class="br0">&#40;</span>seedId, dataSource<span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
recordOutput.<span class="me1">writeRecord</span><span class="br0">&#40;</span>record<span class="br0">&#41;</span><span class="sy0">;</span>
recordCount<span class="sy0">++;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>_log.<span class="me1">isDebugEnabled</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
_log.<span class="me1">debug</span><span class="br0">&#40;</span><span class="st0">&quot;added id &quot;</span> <span class="sy0">+</span> record.<span class="me1">getId</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>recordCount <span class="sy0">%</span> MAX_IDS_PER_BULK <span class="sy0">==</span> <span class="nu0">0</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
recordOutput.<span class="me1">commit</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
recordOutputIndex<span class="sy0">++;</span>
recordOutput <span class="sy0">=</span> taskContext.<span class="me1">getOutputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getAsRecordOutput</span><span class="br0">&#40;</span>
OUTPUT_SLOT, recordOutputIndex<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
_log.<span class="me1">info</span><span class="br0">&#40;</span><span class="st0">&quot;Found &quot;</span> <span class="sy0">+</span> recordCount <span class="sy0">+</span> <span class="st0">&quot; records for seed id &quot;</span> <span class="sy0">+</span> seedId <span class="sy0">+</span> <span class="st0">&quot;.&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="co3">/**
* gets records from the data source, if possible fills the
* {@link ImportingConstants#ATTRIBUTE_DELTA_HASH} attribute for the delta
* checker to be able to determine if the record has to be updated/inserted
* at all.
*
* @param seedId
* the seed id to know where/what to crawl.
* @param dataSource
* the data source to crawl.
* @return a list of records containing the ID of the data source's data and
* optionally a delta hash.
*/</span>
<span class="kw1">private</span> List<span class="sy0">&lt;</span>Record<span class="sy0">&gt;</span> getRecordsBySeed<span class="br0">&#40;</span><span class="kw1">final</span> <span class="kw3">String</span> seedId, <span class="kw1">final</span> <span class="kw3">String</span> dataSource<span class="br0">&#41;</span> <span class="br0">&#123;</span>
ArrayList<span class="sy0">&lt;</span>Record<span class="sy0">&gt;</span> recordsToCrawl <span class="sy0">=</span> <span class="kw1">new</span> ArrayList<span class="sy0">&lt;</span>Record<span class="sy0">&gt;</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
&#160;
<span class="co1">// iterate over the entries in the data source determined by the seed id</span>
<span class="kw1">while</span><span class="br0">&#40;</span>...<span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="co1">// id: the id of the data</span>
<span class="co1">// lastModified: the last modified date of the record (omit if it cannot be determined)</span>
<span class="kw1">final</span> Record record <span class="sy0">=</span> DataFactory.<span class="kw1">DEFAULT</span>.<span class="me1">createRecord</span><span class="br0">&#40;</span>id, dataSource<span class="br0">&#41;</span><span class="sy0">;</span>
record.<span class="me1">getMetadata</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">put</span><span class="br0">&#40;</span>ImportingConstants.<span class="me1">ATTRIBUTE_DELTA_HASH</span>, lastModified<span class="br0">&#41;</span><span class="sy0">;</span>
&#160;
<span class="co1">// map internal property names to attribute names e.g. by using org.eclipse.smila.importing.util.PropertyNameMapper </span>
<span class="co1">// mapper.mapNames(record, ...); // (see implementation of FileCrawlerWorker, WebCrawlerWorker)</span>
&#160;
recordsToCrawl.<span class="me1">add</span><span class="br0">&#40;</span>record<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="kw1">return</span> recordsToCrawl<span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="co3">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw3">String</span> getName<span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">return</span> NAME<span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span></pre></div></div>
<p>If your data source is a bit more complex, e.g. hierarchical (file system, etc.) or you have to follow a linked source (like e.g. a web site), you might just have a look at how the sample implementations of file and web crawler work (e.g. using the VisitedLinks service or looping back to the crawler to visit the next hierarchy stage, etc.).
</p>
<h2><span class="mw-headline" id="The_Fetcher">The Fetcher</span></h2>
<p>Now that we've created bulks of records pointing to the data to be imported into SMILA, we need a worker that actually fetches the data from the data source using the IDs the crawler provided.
</p><p>The only interface the worker has to implement is <span style="font-family:monospace;">org.eclipse.smila.taskworker.Worker</span>.
</p>
<h3><span class="mw-headline" id="What_is_the_fetcher_worker_supposed_to_do.3F">What is the fetcher worker supposed to do?</span></h3>
<ul>
<li> Read the records sent from the crawler and filtered by the delta checker
</li>
<li> get the data to be processed by SMILA out of the data source for each ID in the record bulk
</li>
<li> create records from that data
</li>
<li> hand that data over to the update pusher, which in turn hands it over to the import workflow (i.e. the bulk builder)
</li>
<li> optionally (if supported) extract compounds or send them to a compound extractor worker to do so.
</li>
</ul>
<p>So the fetcher worker would look something like the following, with the magic happening in the <span style="font-family:monospace;">fetch(...)</span> method, which has to access the data source, retrieve the data, add it as an attachment and fill in other metadata as needed (you might have a look at the <span style="font-family:monospace;">FileFetcherWorker</span> or the web crawler's <span style="font-family:monospace;">SimpleFetcher</span> implementation for inspiration).
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="kw1">public</span> <span class="kw1">class</span> WhatsoeverFetcherWorker <span class="kw1">implements</span> Worker <span class="br0">&#123;</span>
&#160;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> NAME <span class="sy0">=</span> <span class="st0">&quot;whatsoeverFetcher&quot;</span><span class="sy0">;</span>
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> INPUT_SLOT <span class="sy0">=</span> <span class="st0">&quot;input&quot;</span><span class="sy0">;</span>
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> OUTPUT_SLOT <span class="sy0">=</span> <span class="st0">&quot;output&quot;</span><span class="sy0">;</span>
<span class="kw1">protected</span> <span class="kw1">final</span> Log _log <span class="sy0">=</span> LogFactory.<span class="me1">getLog</span><span class="br0">&#40;</span>getClass<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span><span class="sy0">;</span>
&#160;
<span class="co3">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw4">void</span> perform<span class="br0">&#40;</span>TaskContext taskContext<span class="br0">&#41;</span> <span class="kw1">throws</span> <span class="kw3">Exception</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> RecordInput recordInput <span class="sy0">=</span> taskContext.<span class="me1">getInputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>
.<span class="me1">getAsRecordInput</span><span class="br0">&#40;</span>INPUT_SLOT<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">final</span> RecordOutput recordOutput <span class="sy0">=</span> taskContext.<span class="me1">getOutputs</span><span class="br0">&#40;</span><span class="br0">&#41;</span>
.<span class="me1">getAsRecordOutput</span><span class="br0">&#40;</span>OUTPUT_SLOT<span class="br0">&#41;</span><span class="sy0">;</span>
Record record<span class="sy0">;</span>
<span class="kw1">do</span> <span class="br0">&#123;</span>
record <span class="sy0">=</span> recordInput.<span class="me1">getRecord</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>record <span class="sy0">!=</span> <span class="kw2">null</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>_log.<span class="me1">isDebugEnabled</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
_log.<span class="me1">debug</span><span class="br0">&#40;</span><span class="st0">&quot;fetching content for record &quot;</span> <span class="sy0">+</span> record.<span class="me1">getId</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
fetch<span class="br0">&#40;</span>record, taskContext<span class="br0">&#41;</span><span class="sy0">;</span>
&#160;
<span class="co1">// map internal property names to attribute/attachment names e.g. by using org.eclipse.smila.importing.util.PropertyNameMapper </span>
<span class="co1">// mapper.mapNames(record, ...); // (see implementation of FileCrawlerWorker, WebCrawlerWorker)</span>
&#160;
recordOutput.<span class="me1">writeRecord</span><span class="br0">&#40;</span>record<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">if</span> <span class="br0">&#40;</span>_log.<span class="me1">isDebugEnabled</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
_log.<span class="me1">debug</span><span class="br0">&#40;</span><span class="st0">&quot;added record &quot;</span> <span class="sy0">+</span> record.<span class="me1">getId</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span> <span class="kw1">while</span> <span class="br0">&#40;</span>record <span class="sy0">!=</span> <span class="kw2">null</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="co3">/**
* Actually retrieves the data from the source based on the ID of the record
* and fills in the record's meta data and/or attachments.
*
* @param record
* the record to be completed with information from the data
* source
* @param taskContext
* the tasks context.
*/</span>
<span class="kw1">private</span> <span class="kw4">void</span> fetch<span class="br0">&#40;</span>Record record, TaskContext taskContext<span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> <span class="kw4">long</span> time <span class="sy0">=</span> taskContext.<span class="me1">getTimestamp</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
&#160;
<span class="co1">// go and fetch the content and fill the record's content, metadata and/or</span>
<span class="co1">// attachments with it.</span>
record.<span class="me1">getMetadata</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">put</span><span class="br0">&#40;</span>..., ...<span class="br0">&#41;</span><span class="sy0">;</span>
...
&#160;
<span class="me1">taskContext</span>.<span class="me1">measureTime</span><span class="br0">&#40;</span><span class="st0">&quot;fetchContent&quot;</span>, time<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="co3">/** {@inheritDoc} */</span>
@Override
<span class="kw1">public</span> <span class="kw3">String</span> getName<span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">return</span> NAME<span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span></pre></div></div>
<h2><span class="mw-headline" id="The_Extractor">The Extractor</span></h2>
<p>If your data source contains compound documents like ZIP archives or similar that should be decomposed into the real documents you need to implement an additional compound extractor worker.
</p><p>Basically, you need to implement the interface <code>org.eclipse.smila.taskworker.Worker</code>, again.
</p>
<h3><span class="mw-headline" id="What.27s_the_extractor_worker_supposed_to_do.3F">What's the extractor worker supposed to do?</span></h3>
<p>The extractor worker should for each input object fetch the content from the data source, extract the compound object and create records for each contained object. Additionally, the crawler worker must mark compound objects by setting a special attribute so that the DeltaChecker worker can route the compound to the extractor worker instead of the fetcher worker.
</p><p>There are some things in SMILA that can make this stuff more simple:
</p>
<ul>
<li> An <code>org.eclipse.smila.importing.compounds.CompoundExtractor</code> service can handle the actual extraction process: Just give it a <code>java.io.InputStream</code> to a supported compound and it returns records for each contained object. Additionally, it can handle the compound identification for the crawler worker by checking if a filename or a mimetype denotes a supported compound type. See <a href="../Importing/CompoundExtractorService.html" title="SMILA/Documentation/Importing/CompoundExtractorService">SMILA/Documentation/Importing/CompoundExtractorService</a> for more details on this service.
</li>
<li> The <code>org.eclipse.smila.importing.compounds.ExtractorWorkerBase</code> is a base class for extractor workers that use the <code>CompoundExtractor</code>.
</li>
</ul>
<h3><span class="mw-headline" id="Creating_an_extractor_worker_using_the_base_classes">Creating an extractor worker using the base classes</span></h3>
<p>If using the <code>ExtractorWorkerBase</code>, your extractor worker has the following tasks to do:
</p>
<ul>
<li> First, provide a <code>ContentFetcher</code> that creates an InputStream to the compound content. In the simplest case, the worker can be the ContentFetcher itself. Then it has to implement a <code>getContent</code> method to get the content based on attributes set in the current compound record and the task parameters. Of course, this method will likely be very similar to some of the fetcher worker's code, so it makes sense to share this code, and the shared component then might implement the <code>ContentFetcher</code> interface.
</li>
<li> Second, invoke a given <code>CompoundExtractor</code> service. This is necessary, because the extractor service needs more than only the content to be able to extract it: the mime type or the filename (especially the file extension) is needed to identify which type of compound it has to extract. So the <code>invokeExtractor</code> method is given the compound record so that it can get this meta information (it hopefully knows where the associated worker has put it&#160;;-) and calls the appropriate extractor service method. The extractor also needs the name of the attachment to put the object content in.
</li>
<li> Then it implements a method that converts the records produced by the extractor service to records that look like records produced by the associated crawler. Especially, it has to create a record ID that matches those produced by the crawler, copy the attachment and attributes produced by the extractor to the corresponding names called like the ones the crawler would have produced (or use the task parameters to determine the correct names) and set the _deltaHash attribute. It also can copy attributes from the compound record, if there is a need to "inherit" them.
</li>
<li> Finally it implements a method <code>mapRecord()</code> that is used for the mapping of internal data source property names to attribute names.
</li>
<li> Of course, like any worker, it has to provide a <code>getName()</code> method returning the worker name.
</li>
</ul>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="kw1">public</span> <span class="kw1">class</span> WhatsoeverExtractorWorker <span class="kw1">extends</span> ExtractorWorkerBase <span class="kw1">implements</span> ContentFetcher <span class="br0">&#123;</span>
&#160;
<span class="kw1">private</span> <span class="kw1">static</span> <span class="kw1">final</span> <span class="kw3">String</span> NAME <span class="sy0">=</span> <span class="st0">&quot;whatsoeverExtractor&quot;</span><span class="sy0">;</span>
&#160;
@Override
<span class="kw1">public</span> <span class="kw3">String</span> getName<span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">return</span> NAME<span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
@Override
<span class="kw1">protected</span> ContentFetcher getContentFetcher<span class="br0">&#40;</span><span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">return</span> <span class="kw1">this</span><span class="sy0">;</span> <span class="co1">// easiest thing, if you don't have created a separate fetcher component.</span>
<span class="br0">&#125;</span>
&#160;
@Override
<span class="kw1">public</span> <span class="kw3">InputStream</span> getContent<span class="br0">&#40;</span><span class="kw1">final</span> Record record, <span class="kw1">final</span> AnyMap parameters<span class="br0">&#41;</span> <span class="kw1">throws</span> ImportingException <span class="br0">&#123;</span>
<span class="kw1">return</span> ...<span class="sy0">;</span> <span class="co1">// get a stream to the compound content. Might share code with associated Fetcher worker. </span>
<span class="br0">&#125;</span>
&#160;
@Override
<span class="kw1">protected</span> Iterator<span class="sy0">&lt;</span>Record<span class="sy0">&gt;</span> invokeExtractor<span class="br0">&#40;</span><span class="kw1">final</span> CompoundExtractor extractor, <span class="kw1">final</span> Record compoundRecord,
<span class="kw1">final</span> <span class="kw3">InputStream</span> compoundContent, <span class="kw1">final</span> TaskContext taskContext<span class="br0">&#41;</span> <span class="kw1">throws</span> CompoundExtractorException <span class="br0">&#123;</span>
<span class="kw1">final</span> <span class="kw3">String</span> mimeType <span class="sy0">=</span> compoundRecord.<span class="me1">getMetadata</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">getStringValue</span><span class="br0">&#40;</span>WhatsoeverCrawlerWorker.<span class="me1">ATTRIBUTE_MIMETYPE</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">return</span> extractor.<span class="me1">extract</span><span class="br0">&#40;</span>compoundContent, <span class="kw2">null</span>, mimeType, WhatsoeverCrawlerWorker.<span class="me1">ATTACHMENT_CONTENT</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
@Override
<span class="kw1">protected</span> Record convertRecord<span class="br0">&#40;</span><span class="kw1">final</span> Record compoundRecord, <span class="kw1">final</span> Record extractedRecord,
<span class="kw1">final</span> TaskContext taskContext<span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> <span class="kw3">String</span> dataSource <span class="sy0">=</span> compoundRecord.<span class="me1">getSource</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">final</span> <span class="kw3">String</span> id <span class="sy0">=</span> ...<span class="sy0">;</span> <span class="co1">// create a unique ID from the compound record attributes, according to the conventions of associated crawler worker.</span>
<span class="kw1">final</span> Record convertedRecord <span class="sy0">=</span> extractedRecord.<span class="me1">getFactory</span><span class="br0">&#40;</span><span class="br0">&#41;</span>.<span class="me1">createRecord</span><span class="br0">&#40;</span>id, dataSource<span class="br0">&#41;</span><span class="sy0">;</span>
copyAttachment<span class="br0">&#40;</span>extractedRecord, convertedRecord, WhatsoeverCrawlerWorker.<span class="me1">ATTACHMENT_CONTENT</span><span class="br0">&#41;</span><span class="sy0">;</span>
copyAttribute<span class="br0">&#40;</span>extractedRecord, CompoundExtractor.<span class="me1">KEY_SIZE</span>, convertedRecord, WhatsoeverCrawlerWorker.<span class="me1">ATTRIBUTE_SIZE</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="co1">// copy more attributes from compound record or extracted record to the final record according to the conventions of your crawler worker.</span>
<span class="co1">// ...</span>
copyAttribute<span class="br0">&#40;</span>extractedRecord, CompoundExtractor.<span class="me1">KEY_TIME</span>, convertedRecord, ImportingConstants.<span class="me1">ATTRIBUTE_DELTA_HASH</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">return</span> convertedRecord<span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
@Override
<span class="kw1">protected</span> <span class="kw4">void</span> mapRecord<span class="br0">&#40;</span><span class="kw1">final</span> Record record, <span class="kw1">final</span> TaskContext taskContext<span class="br0">&#41;</span> <span class="br0">&#123;</span>
<span class="kw1">final</span> PropertyNameMapper mapper <span class="sy0">=</span> PropertyNameMapper.<span class="me1">createFrom</span><span class="br0">&#40;</span>taskContext<span class="br0">&#41;</span><span class="sy0">;</span>
mapper.<span class="me1">mapNames</span><span class="br0">&#40;</span>record, ...<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="br0">&#125;</span></pre></div></div>
<p>That's it, basically. You might want to have a look at the web or file crawler implementations in SMILA to see how this scheme is implemented there.
</p><p>Don't forget that the base class requires a service reference to the extractor service, so you have to add it to the OSGi DS component definition, like this (you can of course add more references if necessary):
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1">&lt;?xml</span> <span class="re0">version</span>=<span class="st0">&quot;1.0&quot;</span> <span class="re0">encoding</span>=<span class="st0">&quot;UTF-8&quot;</span><span class="re2">?&gt;</span></span>
<span class="sc3"><span class="re1">&lt;scr:component</span> <span class="re0">xmlns:scr</span>=<span class="st0">&quot;http://www.osgi.org/xmlns/scr/v1.1.0&quot;</span> <span class="re0">name</span>=<span class="st0">&quot;WhatsoeverExtractorWorker&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;implementation</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.importing.crawler.whatever.WhatsoeverExtractorWorker&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;service<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;provide</span> <span class="re0">interface</span>=<span class="st0">&quot;org.eclipse.smila.taskworker.Worker&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/service<span class="re2">&gt;</span></span></span>
...
<span class="sc3"><span class="re1">&lt;reference</span> </span>
<span class="sc3"> <span class="re0">name</span>=<span class="st0">&quot;compoundExtractor&quot;</span></span>
<span class="sc3"> <span class="re0">interface</span>=<span class="st0">&quot;org.eclipse.smila.importing.compounds.CompoundExtractor&quot;</span> </span>
<span class="sc3"> <span class="re0">bind</span>=<span class="st0">&quot;setCompoundExtractor&quot;</span></span>
<span class="sc3"> <span class="re0">unbind</span>=<span class="st0">&quot;unsetCompoundExtractor&quot;</span></span>
<span class="sc3"> <span class="re0">cardinality</span>=<span class="st0">&quot;1..1&quot;</span></span>
<span class="sc3"> <span class="re0">policy</span>=<span class="st0">&quot;static&quot;</span> </span>
<span class="sc3"> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/scr:component<span class="re2">&gt;</span></span></span></pre></div></div>
<p>If your data source contains compound types not supported by the SMILA extractor services you can either extend them or implement the compound extraction completely on your own. We will not go into detail about this here, however.
</p>
<h2><span class="mw-headline" id="Plugging_it_up">Plugging it up</span></h2>
<p>So, now we have to plug it all together.
</p>
<ul>
<li> Write component definitions for your workers (and as well for your service if one is needed to access your data source).
</li>
<li> Add the bundle to the launcher and the config.ini file.
</li>
<li> Set your Scale-Up limits
</li>
<li> Add worker descriptions for your workers to the <span style="font-family:monospace;">workers.json</span> file; these could look something like the code snippet below.
</li>
</ul>
<p><b>Please note:</b>
</p>
<ul>
<li> The crawler worker needs the "runOnceTrigger" task generator for the runOnce triggering!
</li>
<li> An extractor worker based on the ExtractorWorkerBase class always has an input slot named "compounds" and an output slot named "files".
</li>
</ul>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="javascript source-javascript"><pre class="de1"><span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;whatsoeverCrawler&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;taskGenerator&quot;</span><span class="sy0">:</span><span class="st0">&quot;runOnceTrigger&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;parameters&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;dataSource&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;seed&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;mapping&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span><span class="st0">&quot;map&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;entries&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverProperty_1&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverProperty_2&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
...
<span class="br0">&#93;</span>
<span class="br0">&#125;</span>
<span class="br0">&#93;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span> <span class="br0">&#91;</span><span class="br0">&#93;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span> <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;output&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span> <span class="st0">&quot;recordBulks&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;modes&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="st0">&quot;maybeEmpty&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;multiple&quot;</span>
<span class="br0">&#93;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;whatsoeverFetcher&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;parameters&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;mapping&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span><span class="st0">&quot;map&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;entries&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverProperty_A&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
...
<span class="br0">&#93;</span>
<span class="br0">&#125;</span>
<span class="br0">&#93;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span> <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;input&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span> <span class="st0">&quot;recordBulks&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span> <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;output&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span> <span class="st0">&quot;recordBulks&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;whatsoeverExtractor&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;parameters&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;mapping&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span><span class="st0">&quot;map&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;entries&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverProperty_1&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverProperty_2&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
...
<span class="br0">&#93;</span>
<span class="br0">&#125;</span>
<span class="br0">&#93;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span> <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;compounds&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span> <span class="st0">&quot;recordBulks&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span> <span class="br0">&#91;</span>
<span class="br0">&#123;</span> <span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;files&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;type&quot;</span><span class="sy0">:</span> <span class="st0">&quot;recordBulks&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div></div>
<ul>
<li> add the workers to a sensible workflow like e.g.
</li>
</ul>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="javascript source-javascript"><pre class="de1"> <span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverCrawling&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;startAction&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverCrawler&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToCrawlBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="st0">&quot;actions&quot;</span><span class="sy0">:</span><span class="br0">&#91;</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span><span class="sy0">:</span><span class="st0">&quot;deltaChecker&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;recordsToCheck&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToCrawlBucket&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;updatedRecords&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToFetchBucket&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;updatedCompounds&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToExtractBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverFetcher&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToFetchBucket&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToPushBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span><span class="sy0">:</span><span class="st0">&quot;whatsoeverExtractor&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;compounds&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToExtractBucket&quot;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="st0">&quot;output&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;files&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToPushBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span><span class="sy0">,</span>
<span class="br0">&#123;</span>
<span class="st0">&quot;worker&quot;</span><span class="sy0">:</span><span class="st0">&quot;updatePusher&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;input&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;recordsToPush&quot;</span><span class="sy0">:</span><span class="st0">&quot;somethingToPushBucket&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
<span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div></div>
<ul>
<li> For your convenience you can also create a predefined job in the jobs.json, like the following snippet (you should notice that the seed parameter is fixed if you choose to use a predefined job)
</li>
</ul>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="javascript source-javascript"><pre class="de1"> <span class="br0">&#123;</span>
<span class="st0">&quot;name&quot;</span><span class="sy0">:</span> <span class="st0">&quot;crawlWhatsoever&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;workflow&quot;</span><span class="sy0">:</span> <span class="st0">&quot;whatsoeverCrawling&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;parameters&quot;</span><span class="sy0">:</span> <span class="br0">&#123;</span>
<span class="st0">&quot;tempStore&quot;</span><span class="sy0">:</span> <span class="st0">&quot;temp&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;dataSource&quot;</span><span class="sy0">:</span> <span class="st0">&quot;whatsoever&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;seed&quot;</span><span class="sy0">:</span> <span class="st0">&quot;your seed data&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;jobToPushTo&quot;</span><span class="sy0">:</span> <span class="st0">&quot;indexUpdate&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;mapping&quot;</span><span class="sy0">:</span><span class="br0">&#123;</span>
<span class="st0">&quot;whatsoeverProperty_1&quot;</span><span class="sy0">:</span><span class="st0">&quot;Title&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;whatsoeverProperty_2&quot;</span><span class="sy0">:</span><span class="st0">&quot;Path&quot;</span><span class="sy0">,</span>
<span class="st0">&quot;whatsoeverProperty_A&quot;</span><span class="sy0">:</span><span class="st0">&quot;Content&quot;</span><span class="sy0">,</span>
...
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span></pre></div></div>
<h2><span class="mw-headline" id="And_....Action.21">And ....Action!</span></h2>
<p>So now it's time to check if everything went right.
</p>
<ul>
<li> Start SMILA
</li>
<li> Check whether you can access your worker definitions, workflow and job via the REST API. If not, check for errors (syntax errors in the JSON files, other errors in the SMILA log).
</li>
<li> check in SMILA's log if your workers were added
</li>
<li> start the indexing job: POST <tt><a rel="nofollow" class="external free" href="http://localhost:8080/smila/jobmanager/jobs/indexUpdate/">http://localhost:8080/smila/jobmanager/jobs/indexUpdate/</a></tt>
</li>
<li> start your crawling job (remember: it has to be started as a RunOnce Job!)
</li>
</ul>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="text source-text"><pre class="de1">POST http://localhost:8080/smila/jobmanager/jobs/crawlWhatsoever/
{
&quot;mode&quot;: &quot;runOnce&quot;
}</pre></div></div>
<ul>
<li> Check your jobs: once your crawl job has succeeded, you can finish your input job. After the input job has succeeded (if you finished it), wait a few seconds (up to 60, because the autocommit takes some time) before checking whether your data was indexed (see <tt><a rel="nofollow" class="external free" href="http://localhost:8080/SMILA/search">http://localhost:8080/SMILA/search</a></tt>).
</li>
</ul>
<p>So now you should be able to search in your content.
</p><p>If you can find your records, you have just successfully added a new datasource to your SMILA application. Congratulations!
</p>
<!--
NewPP limit report
CPU time usage: 0.284 seconds
Real time usage: 0.294 seconds
Preprocessor visited node count: 176/1000000
Preprocessor generated node count: 400/1000000
Post‐expand include size: 543/2097152 bytes
Template argument size: 147/2097152 bytes
Highest expansion depth: 2/40
Expensive parser function count: 0/100
-->
<!-- Saved in parser cache with key my_wiki:pcache:idhash:35477-0!*!0!!en!*!* and timestamp 20150414084527 and revision id 326097
-->
</div>
<!-- catlinks -->
<div id='catlinks' class='catlinks catlinks-allhidden'></div> <!-- /catlinks -->
</div>
</div>
</div>
</div>
<!-- /maincontent -->
<!-- printfooter -->
<div class="printfooter">
Retrieved from "<a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;oldid=326097">http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_add_a_new_Data_Source_to_the_importing_framework&amp;oldid=326097</a>" </div>
<!-- /printfooter -->
<!-- debughtml -->
<!-- /debughtml -->
</div>
<!-- /bodyContent -->
</section>
<!-- /content -->
<!-- footer -->
</div> <section id="footer-contribution-info" style="border-top:1px solid #ccc;" class="footer-offset background-white margin-top-25"><div class="container text-center padding-top-10 padding-bottom-10"><p id="footercredit">This page was last modified 06:14, 10 January 2013 by <a href="http://wiki.eclipse.org/index.php?title=User:Dhaenssgen.brox.de&amp;action=edit&amp;redlink=1" class="new" title="User:Dhaenssgen.brox.de (page does not exist)">Daniel Hänßgen</a>. Based on work by <a href="http://wiki.eclipse.org/User:Andreas.weber.empolis.com" title="User:Andreas.weber.empolis.com">Andreas Weber</a>, <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit&amp;redlink=1" class="new" title="User:Juergen.schumacher.attensity.com (page does not exist)">Juergen Schumacher</a> and <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a>.</p><p id="footerviews">This page has been accessed 2,629 times.</p></div></section> </main> <!-- /#main-content-container-row -->
<p id="back-to-top" class="noprint hidden-print">
<a class="visible-xs" href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#top">Back to the top</a>
</p>
<!-- Site footer: four link sections (Eclipse Foundation, Legal, Useful Links, Other), social-media links and the copyright notice. -->
<footer role="contentinfo" class="noprint hidden-print">
<div class="container">
<div class="row">
<section id="footer-eclipse-foundation" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Eclipse Foundation</h2>
<ul class="nav">
<li><a href="https://eclipse.org/org/">About us</a></li>
<li><a href="https://eclipse.org/org/foundation/contact.php">Contact Us</a></li>
<li><a href="https://eclipse.org/donate">Donate</a></li>
<li><a href="https://eclipse.org/org/documents/">Governance</a></li>
<li><a href="https://eclipse.org/artwork/">Logo and Artwork</a></li>
<li><a href="https://eclipse.org/org/foundation/directors.php">Board of Directors</a></li>
</ul>
</section>
<section id="footer-legal" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Legal</h2>
<ul class="nav">
<li><a href="https://eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="https://eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="https://eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="https://eclipse.org/org/documents/epl-v10.php">Eclipse Public License </a></li>
<li><a href="https://eclipse.org/legal/">Legal Resources </a></li>
</ul>
</section>
<section id="footer-useful-links" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Useful Links</h2>
<ul class="nav">
<li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li>
<li><a href="http://help.eclipse.org/">Documentation</a></li>
<li><a href="https://eclipse.org/contribute/">How to Contribute</a></li>
<li><a href="https://eclipse.org/mail/">Mailing Lists</a></li>
<li><a href="https://eclipse.org/forums/">Forums</a></li>
<li><a href="http://marketplace.eclipse.org/">Marketplace</a></li>
</ul>
</section>
<section id="footer-other" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Other</h2>
<ul class="nav">
<li><a href="https://eclipse.org/ide/">IDE and Tools</a></li>
<li><a href="https://eclipse.org/projects">Community of Projects</a></li>
<li><a href="https://eclipse.org/org/workinggroups/">Working Groups</a></li>
</ul>
<!-- Icon-only links: the aria-label supplies the accessible name; the icon element itself is hidden from assistive technology. -->
<ul class="list-inline social-media">
<li><a href="https://twitter.com/EclipseFdn" aria-label="Eclipse Foundation on Twitter"><i class="fa fa-twitter-square" aria-hidden="true"></i></a></li>
<li><a href="https://plus.google.com/+Eclipse" aria-label="Eclipse on Google+"><i class="fa fa-google-plus-square" aria-hidden="true"></i></a></li>
<li><a href="https://www.facebook.com/eclipse.org" aria-label="Eclipse on Facebook"><i class="fa fa-facebook-square" aria-hidden="true"></i></a></li>
<li><a href="https://www.youtube.com/user/EclipseFdn" aria-label="Eclipse Foundation on YouTube"><i class="fa fa-youtube-square" aria-hidden="true"></i></a></li>
</ul>
</section>
<div id="copyright" class="col-xs-offset-1 col-sm-14 col-md-24 col-md-offset-0">
<div>
<span><img src="http://eclipse.org/eclipse.org-common/themes/solstice/public/images/logo/eclipse-logo-bw-800x188.png" alt="Eclipse.org black and white logo" width="166" height="39" id="logo-eclipse-white"/></span>
<p id="copyright-text">Copyright &copy; 2014 The Eclipse Foundation. All Rights Reserved.</p>
</div>
</div>
<a href="How_to_add_a_new_Data_Source_to_the_importing_framework.html#" class="scrollup">Back to the top</a>
</div>
</div>
</footer>
<script src="http://wiki.eclipse.org/skins/solstice/public/javascript/main.min.js"></script>
<!-- Placed at the end of the document so the pages load faster -->
<script type="text/javascript">
// Reuse an existing command queue if one is already defined, otherwise start an empty one.
var _gaq = _gaq || [];
// Queue the account id and the page-view command.
_gaq.push(['_setAccount', 'UA-910670-2']);
_gaq.push(['_trackPageview']);
(function() {
  // Build a <script> element pointing at ga.js and mark it async.
  var tracker = document.createElement('script');
  tracker.type = 'text/javascript';
  tracker.async = true;
  // Choose the host prefix from the protocol the current page was loaded with.
  var hostPrefix = 'https:' == document.location.protocol ? 'https://ssl' : 'http://www';
  tracker.src = hostPrefix + '.google-analytics.com/ga.js';
  // Insert the new element in front of the first <script> in the document.
  var firstScript = document.getElementsByTagName('script')[0];
  firstScript.parentNode.insertBefore(tracker, firstScript);
})();
</script> <!-- /footer -->
<!-- Inline bootstrap: calls jQuery.ready() when window.jQuery is present; then, if window.mw exists, passes the listed module states to mw.loader.state. -->
<script>/*<![CDATA[*/window.jQuery && jQuery.ready();/*]]>*/</script><script>if(window.mw){
mw.loader.state({"skins.solstice":"loading","site":"ready","user":"ready","user.groups":"ready"});
}</script>
<!-- Requests the scripts of the skins.solstice module from the wiki's load.php endpoint. -->
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=skins.solstice&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<!-- If window.mw exists, asks mw.loader.load for the listed MediaWiki modules. -->
<script>if(window.mw){
mw.loader.load(["mediawiki.action.view.postEdit","mediawiki.user","mediawiki.hidpi","mediawiki.page.ready","mediawiki.searchSuggest"],null,true);
}</script>
<!-- If window.mw exists, stores the wgBackendResponseTime value (354) in mw.config. -->
<script>if(window.mw){
mw.config.set({"wgBackendResponseTime":354});
}</script> </body>
</html>