<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/5 more minutes to change the workflow,SMILA/Documentation/BPEL Workflow Processor,SMILA/Documentation/JobManager,SMILA/Documentation/Processing/JSON REST API for BPEL pipelines,SMILA/Documentation/Solr,SMILA/Documentation/Worker/PipelineProcessorWorker,SMILA/Documentation for 5 Minutes to Success" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/5 more minutes to change the workflow - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
// Page-context globals emitted inline, declared as a single statement.
// NOTE(review): presumably read by the skin scripts loaded after this block
// (wikibits.js, ajax.js) - confirm before renaming or removing any of them.
var
    // Skin and path settings
    skin = "eclipsenova",
    stylepath = "/skins",
    wgArticlePath = "/$1",
    wgScriptPath = "",
    wgScript = "/index.php",
    wgServer = "http://wiki.eclipse.org",

    // Namespace and identity of the page being shown
    wgCanonicalNamespace = "",
    wgCanonicalSpecialPageName = false,
    wgNamespaceNumber = 0,
    wgPageName = "SMILA/Documentation/5_more_minutes_to_change_the_workflow",
    wgTitle = "SMILA/Documentation/5 more minutes to change the workflow",
    wgAction = "view",
    wgRestrictionEdit = [],
    wgRestrictionMove = [],
    wgArticleId = "35503",
    wgIsArticle = true,

    // Viewer and language settings (null user values as emitted for this page)
    wgUserName = null,
    wgUserGroups = null,
    wgUserLanguage = "en",
    wgContentLanguage = "en",
    wgBreakFrames = false,

    // Revision, software version and API switches
    wgCurRevisionId = "286060",
    wgVersion = "1.12.0",
    wgEnableAPI = true,
    wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_5_more_minutes_to_change_the_workflow">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/5_more_minutes_to_change_the_workflow">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type='submit' name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/5_more_minutes_to_change_the_workflow">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/5_more_minutes_to_change_the_workflow">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/5_more_minutes_to_change_the_workflow&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/5_more_minutes_to_change_the_workflow&amp;oldid=286060">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="5_more_minutes_to_change_the_workflow.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/5_more_minutes_to_change_the_workflow&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/5_more_minutes_to_change_the_workflow&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/5_more_minutes_to_change_the_workflow&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/5%20more%20minutes%20to%20change%20the%20workflow"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/5 more minutes to change the workflow</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="5_more_minutes_to_change_the_workflow.html#column-one">navigation</a>, <a href="5_more_minutes_to_change_the_workflow.html#header-utils">search</a></div> <!-- start content -->
<p>
</p>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="5_more_minutes_to_change_the_workflow.html#Just_another_5_minutes_to_change_the_workflow"><span class="tocnumber">1</span> <span class="toctext">Just another 5 minutes to change the workflow</span></a>
<ul>
<li class="toclevel-2"><a href="5_more_minutes_to_change_the_workflow.html#Configure_new_solr_index"><span class="tocnumber">1.1</span> <span class="toctext">Configure new solr index</span></a></li>
<li class="toclevel-2"><a href="5_more_minutes_to_change_the_workflow.html#Create_a_new_BPEL_pipeline"><span class="tocnumber">1.2</span> <span class="toctext">Create a new BPEL pipeline</span></a></li>
<li class="toclevel-2"><a href="5_more_minutes_to_change_the_workflow.html#Create_and_start_a_new_indexing_job"><span class="tocnumber">1.3</span> <span class="toctext">Create and start a new indexing job</span></a></li>
<li class="toclevel-2"><a href="5_more_minutes_to_change_the_workflow.html#Update_the_web_crawl_job"><span class="tocnumber">1.4</span> <span class="toctext">Update the web crawl job</span></a></li>
<li class="toclevel-2"><a href="5_more_minutes_to_change_the_workflow.html#Put_it__all_together"><span class="tocnumber">1.5</span> <span class="toctext">Put it all together</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="5_more_minutes_to_change_the_workflow.html#Configuration_overview"><span class="tocnumber">2</span> <span class="toctext">Configuration overview</span></a></li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Just_another_5_minutes_to_change_the_workflow"></a><h1> <span class="mw-headline"> Just another 5 minutes to change the workflow </span></h1>
<p>In the <a href="../Documentation_for_5_Minutes_to_Success.html" title="SMILA/Documentation for 5 Minutes to Success">5 minutes to success</a> all data collected by crawlers was processed with the same asynchronous "indexUpdate" workflow using the BPEL pipeline "AddPipeline". All data was indexed into the same solr/lucene index "DefaultCore".
It is possible, however, to configure SMILA so that data from different data sources will go through different workflows and pipelines and will be indexed into different indices. This requires more advanced configuration features than before, but they are still quite simple.
</p><p>In the following sections we are going to use the generic asynchronous "importToPipeline" workflow, which lets you specify the BPEL pipeline used to process the data. We create an additional BPEL pipeline for webcrawler records so that webcrawler data will be indexed into a separate index named "WebCore".
</p>
<a name="Configure_new_solr_index"></a><h2> <span class="mw-headline"> Configure new solr index </span></h2>
<table width="100%" style="background-color:#d8e4f1; padding-left:30px;">
<tr>
<td>
<p>It's very important to shut down and restart the SMILA engine after the following configuration changes are done because modified configurations are loaded during startup only.
</p>
</td></tr></table>
<p>To configure your own index "WebCore" follow the description in the SMILA documentation for <a href="Solr.html#Setup_another_core" title="SMILA/Documentation/Solr">creating your own solr index</a>.
</p>
<table width="100%" style="background-color:#d8e4f1; padding-left:30px;">
<tr>
<td>If you have already started SMILA before (as we suppose you did), please copy your new core configuration and the modified <tt>solr.xml</tt> file to the folder <tt>workspace\.metadata\.plugins\org.eclipse.smila.solr</tt>, because the configuration is not copied again after the first start of the Solr bundle.
</td></tr></table>
<p>For more information about the solr indexing, please see the <a href="Solr.html" title="SMILA/Documentation/Solr">SMILA solr documentation</a>.
</p>
<a name="Create_a_new_BPEL_pipeline"></a><h2> <span class="mw-headline"> Create a new BPEL pipeline </span></h2>
<p>We need to add the <i>AddWebPipeline</i> pipeline to the BPEL WorkflowProcessor. For more information about BPEL WorkflowProcessor please check the <a href="BPEL_Workflow_Processor.html" title="SMILA/Documentation/BPEL Workflow Processor">BPEL WorkflowProcessor</a> documentation.
Predefined BPEL WorkflowProcessor configuration files are contained in the <tt>configuration/org.eclipse.smila.processing.bpel/pipelines</tt> directory. However, we can add new BPEL pipelines with the SMILA REST API.
</p><p>Start SMILA if it's not yet running, and use your favourite REST client to add the "AddWebPipeline" BPEL pipeline. (The BPEL XML is a little hard to read because we have to escape it to make it valid JSON content; after posting the new pipeline you can get a readable version via the monitoring REST API, see below.)
</p>
<pre>
POST http://localhost:8080/smila/pipeline
{
&quot;name&quot;:&quot;AddWebPipeline&quot;,
&quot;definition&quot;:&quot;&lt;?xml version=\&quot;1.0\&quot; encoding=\&quot;utf-8\&quot; ?&gt;\r\n&lt;process name=\&quot;AddWebPipeline\&quot; targetNamespace=\&quot;http://www.eclipse.org/smila/processor\&quot; xmlns=\&quot;http://docs.oasis-open.org/wsbpel/2.0/process/executable\&quot;\r\n xmlns:xsd=\&quot;http://www.w3.org/2001/XMLSchema\&quot; xmlns:proc=\&quot;http://www.eclipse.org/smila/processor\&quot; xmlns:rec=\&quot;http://www.eclipse.org/smila/record\&quot;\r\n xmlns:bpel=\&quot;http://docs.oasis-open.org/wsbpel/2.0/process/executable\&quot;&gt;\r\n\r\n &lt;import location=\&quot;processor.wsdl\&quot; namespace=\&quot;http://www.eclipse.org/smila/processor\&quot; importType=\&quot;http://schemas.xmlsoap.org/wsdl/\&quot; /&gt;\r\n\r\n &lt;partnerLinks&gt;\r\n &lt;partnerLink name=\&quot;Pipeline\&quot; partnerLinkType=\&quot;proc:ProcessorPartnerLinkType\&quot; myRole=\&quot;service\&quot; /&gt;\r\n &lt;partnerLink name=\&quot;AdaptWebCrawlerWorkerOutput\&quot; partnerLinkType=\&quot;proc:ProcessorPartnerLinkType\&quot; partnerRole=\&quot;service\&quot; /&gt;\r\n &lt;/partnerLinks&gt;\r\n\r\n &lt;extensions&gt;\r\n &lt;extension namespace=\&quot;http://www.eclipse.org/smila/processor\&quot; mustUnderstand=\&quot;no\&quot; /&gt;\r\n &lt;/extensions&gt;\r\n\r\n &lt;variables&gt;\r\n &lt;variable name=\&quot;request\&quot; messageType=\&quot;proc:ProcessorMessage\&quot; /&gt;\r\n &lt;/variables&gt;\r\n\r\n &lt;sequence name=\&quot;AddWebPipeline\&quot;&gt;\r\n &lt;receive name=\&quot;start\&quot; partnerLink=\&quot;Pipeline\&quot; portType=\&quot;proc:ProcessorPortType\&quot; operation=\&quot;process\&quot; variable=\&quot;request\&quot;\r\n createInstance=\&quot;yes\&quot; /&gt;\r\n\r\n &lt;invoke name=\&quot;adaptWebCrawlerWorkerOutput\&quot; inputVariable=\&quot;request\&quot; partnerLink=\&quot;AdaptWebCrawlerWorkerOutput\&quot;\r\n outputVariable=\&quot;request\&quot; operation=\&quot;process\&quot; 
portType=\&quot;proc:ProcessorPortType\&quot; /&gt;\r\n\r\n &lt;forEach counterName=\&quot;index\&quot; parallel=\&quot;yes\&quot; name=\&quot;iterateRecords\&quot;&gt;\r\n &lt;startCounterValue&gt;1&lt;/startCounterValue&gt;\r\n &lt;finalCounterValue&gt;count($request.records/rec:Record)&lt;/finalCounterValue&gt;\r\n &lt;scope&gt;\r\n &lt;variables&gt;\r\n &lt;variable name=\&quot;identifiedRecord\&quot; messageType=\&quot;proc:ProcessorMessage\&quot; /&gt;\r\n &lt;/variables&gt;\r\n &lt;sequence&gt;\r\n &lt;if name=\&quot;MimeTypeNotSet\&quot;&gt;\r\n &lt;condition&gt;not($request.records/rec:Record[position()=$index]/rec:Val[@key=\&quot;MimeType\&quot;])&lt;/condition&gt;\r\n &lt;sequence&gt;\r\n &lt;extensionActivity&gt;\r\n &lt;proc:invokePipelet name=\&quot;detectMimeType\&quot;&gt;\r\n &lt;proc:pipelet class=\&quot;org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet\&quot; /&gt;\r\n &lt;proc:variables input=\&quot;request\&quot; index=\&quot;index\&quot; output=\&quot;identifiedRecord\&quot; /&gt;\r\n &lt;proc:configuration&gt;\r\n &lt;rec:Val key=\&quot;FileExtensionAttribute\&quot;&gt;Extension&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;MetaDataAttribute\&quot;&gt;MetaData&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;MimeTypeAttribute\&quot;&gt;MimeType&lt;/rec:Val&gt;\r\n &lt;/proc:configuration&gt;\r\n &lt;/proc:invokePipelet&gt;\r\n &lt;/extensionActivity&gt;\r\n &lt;assign name=\&quot;copy result into original variable for next tests\&quot;&gt;\r\n &lt;copy&gt;\r\n &lt;from&gt;$identifiedRecord.records/rec:Record[1]&lt;/from&gt;\r\n &lt;to&gt;$request.records/rec:Record[position()=$index]&lt;/to&gt;\r\n &lt;/copy&gt;\r\n &lt;/assign&gt;\r\n &lt;/sequence&gt;\r\n &lt;/if&gt;\r\n\r\n &lt;!-- only process text based content, skip everything else --&gt;\r\n &lt;if name=\&quot;IsText\&quot;&gt;\r\n 
&lt;condition&gt;starts-with($request.records/rec:Record[position()=$index]/rec:Val[@key=\&quot;MimeType\&quot;],\&quot;text/\&quot;)&lt;/condition&gt;\r\n &lt;if name=\&quot;IsHTML\&quot;&gt;\r\n &lt;condition&gt;$request.records/rec:Record[position()=$index]/rec:Val[@key=\&quot;MimeType\&quot;] = \&quot;text/html\&quot; or\r\n $request.records/rec:Record[position()=$index]/rec:Val[@key=\&quot;MimeType\&quot;] = \&quot;text/xml\&quot;\r\n &lt;/condition&gt;\r\n &lt;!-- extract txt from html and xml files --&gt;\r\n &lt;extensionActivity&gt;\r\n &lt;proc:invokePipelet name=\&quot;invokeHtml2Txt\&quot;&gt;\r\n &lt;proc:pipelet class=\&quot;org.eclipse.smila.processing.pipelets.HtmlToTextPipelet\&quot; /&gt;\r\n &lt;proc:variables input=\&quot;request\&quot; index=\&quot;index\&quot; /&gt;\r\n &lt;proc:configuration&gt;\r\n &lt;rec:Val key=\&quot;inputType\&quot;&gt;ATTACHMENT&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;outputType\&quot;&gt;ATTRIBUTE&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;inputName\&quot;&gt;Content&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;outputName\&quot;&gt;Content&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;meta:title\&quot;&gt;Title&lt;/rec:Val&gt;\r\n &lt;/proc:configuration&gt;\r\n &lt;/proc:invokePipelet&gt;\r\n &lt;/extensionActivity&gt;\r\n &lt;else&gt;\r\n &lt;!-- copy txt from attachment to attribute --&gt;\r\n &lt;extensionActivity&gt;\r\n &lt;proc:invokePipelet name=\&quot;invokeCopyContent\&quot;&gt;\r\n &lt;proc:pipelet class=\&quot;org.eclipse.smila.processing.pipelets.CopyPipelet\&quot; /&gt;\r\n &lt;proc:variables input=\&quot;request\&quot; index=\&quot;index\&quot; /&gt;\r\n &lt;proc:configuration&gt;\r\n &lt;rec:Val key=\&quot;inputType\&quot;&gt;ATTACHMENT&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;outputType\&quot;&gt;ATTRIBUTE&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;inputName\&quot;&gt;Content&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;outputName\&quot;&gt;Content&lt;/rec:Val&gt;\r\n &lt;rec:Val 
key=\&quot;mode\&quot;&gt;COPY&lt;/rec:Val&gt;\r\n &lt;/proc:configuration&gt;\r\n &lt;/proc:invokePipelet&gt;\r\n &lt;/extensionActivity&gt;\r\n &lt;/else&gt;\r\n &lt;/if&gt;\r\n &lt;/if&gt;\r\n &lt;/sequence&gt;\r\n &lt;/scope&gt;\r\n &lt;/forEach&gt;\r\n\r\n &lt;extensionActivity&gt;\r\n &lt;proc:invokePipelet name=\&quot;SolrIndexPipelet\&quot;&gt;\r\n &lt;proc:pipelet class=\&quot;org.eclipse.smila.solr.index.SolrIndexPipelet\&quot; /&gt;\r\n &lt;proc:variables input=\&quot;request\&quot; output=\&quot;request\&quot; /&gt;\r\n &lt;proc:configuration&gt;\r\n &lt;rec:Val key=\&quot;ExecutionMode\&quot;&gt;ADD&lt;/rec:Val&gt;\r\n &lt;rec:Val key=\&quot;CoreName\&quot;&gt;WebCore&lt;/rec:Val&gt;\r\n &lt;rec:Seq key=\&quot;CoreFields\&quot;&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;_source&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Path&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Url&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Filename&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;MimeType&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Size&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;LastModifiedDate&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Content&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Extension&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Title&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n &lt;rec:Map&gt;\r\n &lt;rec:Val key=\&quot;FieldName\&quot;&gt;Author&lt;/rec:Val&gt;\r\n &lt;/rec:Map&gt;\r\n 
&lt;/rec:Seq&gt;\r\n &lt;/proc:configuration&gt;\r\n &lt;/proc:invokePipelet&gt;\r\n &lt;/extensionActivity&gt;\r\n\r\n &lt;reply name=\&quot;end\&quot; partnerLink=\&quot;Pipeline\&quot; portType=\&quot;proc:ProcessorPortType\&quot; operation=\&quot;process\&quot; variable=\&quot;request\&quot; /&gt;\r\n &lt;/sequence&gt;\r\n&lt;/process&gt;\r\n&quot;
}
</pre>
<p>You can monitor the defined BPEL pipelines via browser, so you should find your new pipeline there:
</p>
<pre>
http://localhost:8080/smila/pipeline
</pre>
<p>Note that we used the "WebCore" index name for the Solr index in the BPEL above:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml">...
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;CoreName&quot;</span><span class="re2">&gt;</span></span>WebCore<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
...
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
...</pre></div>
<a name="Create_and_start_a_new_indexing_job"></a><h2> <span class="mw-headline"> Create and start a new indexing job </span></h2>
<p>We define an indexing job based on the predefined asynchronous workflow "importToPipeline" (see <tt>SMILA/configuration/org.eclipse.smila.jobmanager/workflows.json</tt>). This indexing job will process the imported data by using our new BPEL pipeline "AddWebPipeline".
</p><p>The "importToPipeline" workflow contains a <a href="Worker/PipelineProcessorWorker.html" title="SMILA/Documentation/Worker/PipelineProcessorWorker">PipelineProcessorWorker worker</a> which is not configured for dedicated BPEL pipelines, so the BPEL pipelines handling adds and deletes have to be set via job parameter.
</p><p>Use your favourite REST Client to create an appropriate job definition:
</p>
<pre>
POST http://localhost:8080/smila/jobmanager/jobs/
{
&quot;name&quot;:&quot;indexWebJob&quot;,
&quot;parameters&quot;:{
&quot;tempStore&quot;: &quot;temp&quot;,
&quot;addPipeline&quot;: &quot;AddWebPipeline&quot;,
&quot;deletePipeline&quot;: &quot;DeletePipeline&quot;
},
&quot;workflow&quot;:&quot;importToPipeline&quot;
}
</pre>
<p>Note that the "DeletePipeline" is not needed for our test scenario here, but we must provide a value for every workflow parameter that is still undefined.
</p><p>Afterwards, start a job run for the defined job:
</p>
<pre>
POST http://localhost:8080/smila/jobmanager/jobs/indexWebJob
</pre>
<a name="Update_the_web_crawl_job"></a><h2> <span class="mw-headline"> Update the web crawl job </span></h2>
<p>Since the web crawl job is already predefined to push the crawled records to the <tt>indexUpdate</tt> job, we now must either define a new job or update the crawl job's definition in the <span style="font-family:monospace;">job.json</span> file. Here we choose the <i>new job</i> option.
</p><p>POST the following job definition using your favourite REST client:
</p>
<pre>
POST http://localhost:8080/smila/jobmanager/jobs/
{
&quot;name&quot;:&quot;crawlWikiToWebCore&quot;,
&quot;workflow&quot;:&quot;webCrawling&quot;,
&quot;parameters&quot;:{
&quot;tempStore&quot;:&quot;temp&quot;,
&quot;dataSource&quot;:&quot;web&quot;,
&quot;startUrl&quot;:&quot;http://wiki.eclipse.org/SMILA&quot;,
&quot;filter&quot;:{
&quot;urlPrefix&quot;:&quot;http://wiki.eclipse.org/SMILA&quot;
},
&quot;jobToPushTo&quot;:&quot;indexWebJob&quot;
}
}
</pre>
<p>Please note that we used the following line to let the crawl job push the records to our new job:
</p>
<pre>
&quot;jobToPushTo&quot;:&quot;indexWebJob&quot;
</pre>
<p>Now start the crawl job (don't forget runOnce!):
</p>
<pre>
POST http://localhost:8080/smila/jobmanager/jobs/crawlWikiToWebCore
{
&quot;mode&quot;: &quot;runOnce&quot;
}
</pre>
<p>After a sufficiently long time to crawl, process and commit the data, you can have another look at the <a href="http://localhost:8080/SMILA/search" class="external text" title="http://localhost:8080/SMILA/search" rel="nofollow">SMILA search page</a> to find your new core listed among the available cores, and if you choose it, you can search for e.g. <i>SMILA</i> in the new WebCore.
</p>
<a name="Put_it__all_together"></a><h2> <span class="mw-headline"> Put it all together </span></h2>
<p>OK, it seems that we have finally finished configuring SMILA to use separate BPEL pipelines for file system and web crawling and to index the data from these crawlers into different indices.
Here is what we have done so far:
</p>
<ol><li> We added the <tt>WebCore</tt> index to the Solr configuration and copied it to the workspace.
</li><li> We created a new BPEL pipeline for Web crawler data that references the new Solr index.
</li>
</li><li> We used a separate job for web indexing that references the new BPEL pipeline.
</li><li> We created a new web crawl job that pushes the records to the new indexing job, which references the new BPEL pipeline.
</li></ol>
<a name="Configuration_overview"></a><h1> <span class="mw-headline"> Configuration overview </span></h1>
<p>SMILA configuration files are located in the <tt>configuration</tt> directory of the SMILA application.
The following lists the configuration files and documentation links relevant to this tutorial, regarding SMILA components:
</p><p><b>Jobmanager</b>
</p>
<ul><li> configuration folder: <tt>org.eclipse.smila.jobmanager</tt>
<ul><li> <tt>workflows.json</tt> (Predefined asynchronous workflows)
</li></ul>
</li><li> Documentation
<ul><li> <a href="JobManager.html" title="SMILA/Documentation/JobManager">JobManager</a>
</li><li> <a href="Worker/PipelineProcessorWorker.html" title="SMILA/Documentation/Worker/PipelineProcessorWorker">PipelineProcessorWorker</a>
</li></ul>
</li><li> REST API: <a href="http://localhost:8080/smila/jobmanager" class="external free" title="http://localhost:8080/smila/jobmanager" rel="nofollow">http://localhost:8080/smila/jobmanager</a>
</li></ul>
<p><b>BPEL Pipelines</b>
</p>
<ul><li> configuration folder: <tt>org.eclipse.smila.processing.bpel</tt>
<ul><li> <tt>pipelines/*.bpel</tt> (Predefined BPEL pipelines)
</li></ul>
</li><li> Documentation
<ul><li> <a href="BPEL_Workflow_Processor.html" title="SMILA/Documentation/BPEL Workflow Processor">BPELWorkflowProcessor</a>
</li><li> <a href="Processing/JSON_REST_API_for_BPEL_pipelines.html" title="SMILA/Documentation/Processing/JSON REST API for BPEL pipelines">JSON REST API for BPEL pipelines</a>
</li></ul>
</li><li> REST API: <a href="http://localhost:8080/smila/pipeline" class="external free" title="http://localhost:8080/smila/pipeline" rel="nofollow">http://localhost:8080/smila/pipeline</a>
</li></ul>
<p><b>Solr</b>
</p>
<ul><li> DataDictionary
<ul><li> configuration folder: <tt>org.eclipse.smila.solr</tt>
</li></ul>
</li><li> Documentation
<ul><li> <a href="Solr.html" title="SMILA/Documentation/Solr">SMILA/Documentation/Solr</a>
</li></ul>
</li></ul>
<!--
NewPP limit report
Preprocessor node count: 49/1000000
Post-expand include size: 52/2097152 bytes
Template argument size: 8/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:35503-0!1!0!!en!2!edit=0 and timestamp 20120203101128 -->
<div class="printfooter">
Retrieved from "<a href="5_more_minutes_to_change_the_workflow.html">http://wiki.eclipse.org/SMILA/Documentation/5_more_minutes_to_change_the_workflow</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Categories</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span> | <span dir='ltr'><a href="http://wiki.eclipse.org/Category:HowTo" title="Category:HowTo">HowTo</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 12:23, 24 January 2012 by <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a>. </p>
<p id="footerviews">This page has been accessed 97 times.</p>
</div>
<script type="text/javascript">
// Choose the Google Analytics host prefix that matches the protocol this page
// was loaded over, then write the ga.js script tag into the document.
var gaJsHost;
if (document.location.protocol == "https:") {
  gaJsHost = "https://ssl.";
} else {
  gaJsHost = "http://www.";
}
// The tag is percent-encoded and decoded at runtime so that no literal
// closing script tag appears inside this inline script.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Report this page view to Google Analytics.
// _gat is not defined in this page; it is expected to come from the ga.js
// script written out by the preceding inline script. If that script is blocked
// or fails to load, _gat is undefined and the calls below would raise a
// ReferenceError, so they are guarded.
try {
  var pageTracker = _gat._getTracker("UA-910670-4");
  pageTracker._trackPageview();
} catch (err) {
  // Analytics is best-effort: the page must keep working without it.
}
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.155 secs. --></body></html>