<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
				<meta name="keywords" content="SMILA/Documentation/Architecture Overview,SMILA/Documentation/Importing/Concept,SMILA/Documentation/JobManager,SMILA/Documentation/ObjectStore/Bundle org.eclipse.smila.objectstore,SMILA/Documentation/Pipelets,SMILA/Documentation for 5 Minutes to Success,SMILA/Manual" />
		<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/favicon.ico" />
		<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
	
		
		<title>SMILA/Documentation/Architecture Overview - Eclipsepedia</title>

		<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
		<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
		<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
	        <!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
		<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
		<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
		<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
		<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
		<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
		<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
		<script type="text/javascript">/*<![CDATA[*/
/* Page-level globals, declared in one statement. Presumably consumed by the
   skin scripts included further down in the head (verify against those files). */
var
	/* skin name and URL/path settings */
	skin = "eclipsenova",
	stylepath = "/skins",
	wgArticlePath = "/$1",
	wgScriptPath = "",
	wgScript = "/index.php",
	wgServer = "http://wiki.eclipse.org",

	/* the page being shown */
	wgCanonicalNamespace = "",
	wgCanonicalSpecialPageName = false,
	wgNamespaceNumber = 0,
	wgPageName = "SMILA/Documentation/Architecture_Overview",
	wgTitle = "SMILA/Documentation/Architecture Overview",
	wgAction = "view",
	wgRestrictionEdit = [],
	wgRestrictionMove = [],
	wgArticleId = "19260",
	wgIsArticle = true,

	/* viewer; name and groups are null in this copy */
	wgUserName = null,
	wgUserGroups = null,
	wgUserLanguage = "en",

	/* content language, revision and feature flags */
	wgContentLanguage = "en",
	wgBreakFrames = false,
	wgCurRevisionId = "286692",
	wgVersion = "1.12.0",
	wgEnableAPI = true,
	wgEnableWriteAPI = false;
/*]]>*/</script>
                
		<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>

<!-- Performance mods similar to those for bug 166401 -->
		<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>

		<!-- Head Scripts -->
				<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<link rel="stylesheet" type="text/css" href="Architecture_Overview.html" />	</head>
<body  class="mediawiki ns-0 ltr page-SMILA_Documentation_Architecture_Overview">
	<div id="globalWrapper">


		<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->

		      <div id="header">
               <div id="header-graphic">
                 <img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
               </div>
<!-- Pulled 101409 Mward	   -->
            
            <div class="portlet" id="p-personal">
              <div class="pBody">
              <ul>
                <li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Architecture_Overview">Log in</a></li>
                      </ul>
              </div>
            </div>

            <div id="header-icons">
	      <div id="sites">
              <ul id="sitesUL">
                <li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
                <li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
                <li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
                <li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
                <li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
                <li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
              </ul>
              </div>
            </div>
        </div> 
<!-- NEW HEADER STUFF HERE -->
       <div id="header-menu">
         <div id="header-nav">
               <ul> 		 <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li>                  <li><a  href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li> 
                 <li><a  href="http://www.eclipse.org/users/" target="_self">Users</a></li> 
                 <li><a  href="http://www.eclipse.org/membership/" target="_self">Members</a></li> 
                 <li><a  href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li> 
                 <li><a  href="http://www.eclipse.org/resources/" target="_self">Resources</a></li> 
                 <li><a  href="http://www.eclipse.org/projects/" target="_self">Projects</a></li> 
                 <li><a  href="http://www.eclipse.org/org/" target="_self">About Us</a></li> 
                </ul>
         </div>
         <div id="header-utils">
<!-- moved the search window here -->
                           <!-- Site search form. The buttons need no onclick handler: type="submit" already
                                submits the enclosing form, and an input element has no submit() method to call.
                                id="searchInput" is the target of the "Jump to: search" link in the page body. -->
                           <form action="http://wiki.eclipse.org/Special:Search" method="get">
                  <input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" title="Search Eclipsepedia" />
                    <input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
                    <input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
                </form>
		 </div>
      </div>


<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->

<!--Started nav  rip here-->
<!-- these are the nav controls main page, changes etc -->
    <div id="novaContent" class="faux">
            <div id="leftcol">
          <ul id="leftnav">
<!-- these are the page controls, edit history etc -->
            	  <li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
                                <li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
	                                <li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
	                                <li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
	                                <li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
	                                <li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
	                                <li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
		  <li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>

				<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Architecture_Overview">What links here</a></li>
				<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Architecture_Overview">Related changes</a></li>
                <!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
				<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Architecture_Overview&amp;printable=yes">Printable version</a></li>				<li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Architecture_Overview&amp;oldid=286692">Permanent link</a></li>	  </ul>
        </div>


<!-- Additions and mods for leftside nav End here -->


  <div id="column-content">
    <div id="content">
      <a name="top" id="top"></a>

              <div id="tabs">
         <ul class="primary">
                                         <li class="active"><a href="Documentation/Architecture_Overview.html"><span class="tab">Page</span></a></li>
                                                         <li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Architecture_Overview&amp;action=edit"><span class="tab">Discussion</span></a></li>
                                         <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Architecture_Overview&amp;action=edit"><span class="tab">View source</span></a></li>
                                         <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Architecture_Overview&amp;action=history"><span class="tab">History</span></a></li>
                                         <li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Architecture%20Overview"><span class="tab">Edit</span></a></li>
                  </ul>
        </div>


          <script type="text/javascript">
            /* Call fixalpha() only when the isMSIE55 flag has been set on window. */
            if (window.isMSIE55) {
              fixalpha();
            }
          </script>
            <h1 class="firstHeading">SMILA/Documentation/Architecture Overview</h1>
      <div id="bodyContent">
        <h3 id="siteSub">From Eclipsepedia</h3>
        <div id="contentSub"><span class="subpages">&lt; <a href="../SMILA.html" title="SMILA">SMILA</a> | <a href="Documentation.1.html" title="SMILA/Documentation">Documentation</a></span>(Redirected from <a href="http://wiki.eclipse.org/index.php?title=SMILA/Architecture_Overview&amp;redirect=no" title="SMILA/Architecture Overview">SMILA/Architecture Overview</a>)</div>
                              <div id="jump-to-nav">Jump to: <a href="Architecture_Overview.html#column-one">navigation</a>, <a href="Architecture_Overview.html#searchInput">search</a></div>          <!-- start content -->
          <table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="Architecture_Overview.html#What_is_SMILA.3F"><span class="tocnumber">1</span> <span class="toctext">What is SMILA?</span></a>
<ul>
<li class="toclevel-2"><a href="Architecture_Overview.html#Introduction"><span class="tocnumber">1.1</span> <span class="toctext">Introduction</span></a></li>
<li class="toclevel-2"><a href="Architecture_Overview.html#Architecture_Overview"><span class="tocnumber">1.2</span> <span class="toctext">Architecture Overview</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="Architecture_Overview.html#Want_to_know_more.3F"><span class="tocnumber">2</span> <span class="toctext">Want to know more?</span></a></li>
</ul>
</td></tr></table><script type="text/javascript">
/* Set the toggle labels and call showTocToggle(), but only if that function exists on window
   (presumably provided by one of the scripts loaded in the head; verify). */
if (window.showTocToggle) {
	var tocShowText = "show";
	var tocHideText = "hide";
	showTocToggle();
}
</script>
<a name="What_is_SMILA.3F"></a><h2> <span class="mw-headline"> What is SMILA? </span></h2>
<a name="Introduction"></a><h3> <span class="mw-headline"> Introduction </span></h3>
<p>SMILA is a <i>framework</i> for creating scalable server-side systems that process large amounts of unstructured data in order to build applications in the area of search, linguistic analysis, information mining or similar. The goal is to enable you to easily integrate data source connectors, search engines, sophisticated analysis methods and more, and to gain scalability and reliability out-of-the-box.
</p><p>As such, SMILA provides these main parts:
</p>
<ul><li> <a href="Documentation/JobManager.html" title="SMILA/Documentation/JobManager"><b>JobManager</b></a>: a system for asynchronous, scalable processing of data using configurable <i>workflows</i>. The system is able to reliably distribute the <i>tasks</i> to be done on big clusters of hosts. The workflows orchestrate easy-to-implement <i>workers</i> that can be used to integrate application-specific processing logic.
</li><li> <a href="Documentation/Importing/Concept.html" title="SMILA/Documentation/Importing/Concept"><b>Crawlers</b></a>: concepts and basic implementations for scalable components that extract data from data sources. 
</li><li> <a href="Documentation/Pipelets.html" title="SMILA/Documentation/Pipelets"><b>Pipelines</b></a>: a system for processing synchronous requests (e.g. search requests) by orchestrating easy-to-implement components (<i>pipelets</i>) in workflows defined in BPEL.
</li><li> <a href="Documentation/ObjectStore/Bundle_org.eclipse.smila.objectstore.html" title="SMILA/Documentation/ObjectStore/Bundle org.eclipse.smila.objectstore"><b>Storage</b></a>: concepts for integrating big-data storages for efficient persistence of the processed data.
</li></ul>
<p>Eventually, all SMILA functionality will be accessible for external clients via an <i>HTTP ReST API</i> using <i>JSON</i> as the exchange data format.
</p><p>As an Eclipse system, SMILA is built in <i>OSGi</i> and makes heavy use of the OSGi <i>service</i> component model.
</p>
<a name="Architecture_Overview"></a><h3> <span class="mw-headline"> Architecture Overview </span></h3>
<p><a href="http://wiki.eclipse.org/Image:SMILA_Architecture_Overview_1.0.png" class="image" title="Image:SMILA Architecture Overview_1.0.png"><img alt="Image:SMILA Architecture Overview_1.0.png" src="http://wiki.eclipse.org/images/b/b3/SMILA_Architecture_Overview_1.0.png" width="960" height="720" border="0" /></a>
</p><p><font size="-1">
Download <a href="http://wiki.eclipse.org/images/7/79/SMILA_Architecture_1.0.zip" class="internal" title="SMILA Architecture 1.0.zip">this zip file</a> containing the original PowerPoint file of this slide.
</font>
</p><p>A SMILA system usually consists of two parts:
</p>
<ul><li> First, data has to be imported into the system and processed to produce a search index or an ontology or whatever can be learned from the data. 
</li><li> Second, the learned information is used to answer retrieval requests from users, for example search or ontology exploration requests.
</li></ul>
<p>In the first process usually some data source is crawled or an external client pushes data from the source into the SMILA system using the HTTP ReST API. Often the data consists of a large number of documents (e.g. a file system, web site, or content management system). To be processed each document is represented in SMILA by a <i>record</i> describing the metadata of the document (name, size, access rights, authors, keywords, ...) and the original content of the document itself.
</p><p>To process large amounts of data, SMILA must be able to distribute the work to be done on multiple SMILA nodes (computers). Therefore the <i>bulkbuilder</i> separates the incoming data into <i>bulks</i> of records of a configurable size and writes them to an ObjectStore. For each of these bulks the <i>JobManager</i> creates <i>tasks</i> for <i>workers</i> to process such a bulk and produce other bulks with the result. When such a worker is available it asks the <i>TaskManager</i> for tasks to do, does the work and finally notifies the TaskManager about the result. <i>Workflows</i> define which workers should process a bulk in what sequence. Whenever a worker finishes a task for a bulk successfully, the JobManager can create follow-up tasks based on such a workflow definition. In case a worker fails its task (because the process or machine crashes or because of a network problem) the JobManager can decide to retry the task later and so ensure that the data is processed even in error conditions. The processing of the complete data set using such a workflow is called a <i>job run</i> and monitoring of the current state of such a job run is easily possible via the HTTP ReST API.
</p><p>JobManager and TaskManager use <a href="http://zookeeper.apache.org" class="external text" title="http://zookeeper.apache.org" rel="nofollow">Apache Zookeeper</a> to coordinate the state of a job run and the to-do and in-progress tasks over multiple computers. So the job processing is distributed.
</p><p>To make implementing workers easy, the SMILA JobManager system contains the <i>WorkerManager</i> that enables you to concentrate on the actual worker functionality without having to worry about getting the TaskManager and ObjectStore interaction right.
</p><p>To extract large amounts of data from the data source, the asynchronous job framework can also be used to implement highly scalable <i>crawlers</i>. Crawling can be divided into several steps: 
</p>
<ul><li> getting names of elements from the datasource
</li><li> checking if the element has changed since a previous crawl run (delta check)
</li><li> getting the content of changed or new elements
</li><li> pushing the element to a processing job.
</li></ul>
<p>These steps can be implemented as separate workers, too, so the crawl work can be parallelized and distributed quite easily. By using the JobManager to control the crawling we gain the same reliability and scalability from the processing for the crawling, too. And: Implementing new crawlers is just as easy as implementing new workers.
</p><p>Eventually, the final step of such an asynchronous processing workflow will write the processed data to some target system, for example a search engine or an ontology manager or a database where it can be used to process retrieval requests, and so we get to the second part of the system. Such requests are coming from an external client application via the HTTP ReST API. They are usually of a synchronous nature, meaning that a client sends a request and waits for the result to present it to a user, and it expects the result to be produced rather quickly. On the other hand we want to have a similar flexibility to configure the processing of such synchronous requests as we have for the asynchronous job processing. Therefore we use a different workflow processor here which is based on a BPEL engine. The BPEL workflows (which we call <i>pipelines</i>) in this processor orchestrate so-called <i>pipelets</i> to perform the different steps needed to enrich and refine the original requests and to produce the result. Implementing such a pipelet is probably even easier than implementing a worker&nbsp;;-)
</p><p>Finally, it's even possible to combine both workflow variants because there is a <i>PipelineProcessing</i> worker in the asynchronous system that performs a task by executing a synchronous pipeline. So it's possible to implement only a pipelet and have the functionality available in both kinds of workflows. Additionally, there is a <i>PipeletProcessing</i> worker available that executes just a single pipelet and so saves the overhead of the synchronous workflow processor if one pipelet is sufficient to execute tasks.
</p>
<a name="Want_to_know_more.3F"></a><h2> <span class="mw-headline"> Want to know more? </span></h2>
<p>For further up-to-date documentation of all implemented components please see:
</p>
<ul><li> See SMILA in action: <a href="Documentation_for_5_Minutes_to_Success.html" title="SMILA/Documentation for 5 Minutes to Success">SMILA in 5 Minutes</a>
</li><li> Read the <a href="Manual.html" class="mw-redirect" title="SMILA/Manual">Manual</a>
</li></ul>

<!-- 
NewPP limit report
Preprocessor node count: 6/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->

<!-- Saved in parser cache with key wikidb:pcache:idhash:19260-0!1!0!!en!2!edit=0 and timestamp 20120202144715 -->
<div class="printfooter">
Retrieved from "<a href="Documentation/Architecture_Overview.html">http://wiki.eclipse.org/SMILA/Documentation/Architecture_Overview</a>"</div>
          <div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div>          <!-- end content -->
          <div class="visualClear"></div>
        </div>
      </div>


    </div>


	<!-- Yoink of toolbox for phoenix moved up -->


  </div>
  </div>
              <div id="clearFooter"></div>
            <div id="footer" >  
	      <ul id="footernav">
                <li class="first"><a href="http://www.eclipse.org/">Home</a></li>
               	<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
	        <li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
		<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
                <li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
               	<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
	      </ul>
              <span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
	      <p id="footercredit">This page was last modified 07:57, 26 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Drazen.cindric.attensity.com" title="User:Drazen.cindric.attensity.com">Drazen Cindric</a>, <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a> and <a href="http://wiki.eclipse.org/User:Igor.novakovic.attensity.com" title="User:Igor.novakovic.attensity.com">Igor Novakovic</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Architecture_Overview&amp;action=credits" title="SMILA/Documentation/Architecture Overview">others</a>.</p>
	      <p id="footerviews">This page has been accessed 7,948 times.</p>
	    </div>

            <script type="text/javascript">
              /* Write a script tag for Google Analytics' ga.js, choosing the host by the page protocol. */
              var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
              document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
            </script>
            <script type="text/javascript">
              /* ga.js may not have loaded (blocked, offline copy, network error), leaving _gat undefined.
                 Tracking is best-effort, so a failure here is ignored rather than raised as a script error. */
              try {
                var pageTracker = _gat._getTracker("UA-910670-4");
                pageTracker._trackPageview();
              } catch (err) {}
            </script>
 
 	



		

<!-- 			<div class="visualClear"></div> -->
	
		<script type="text/javascript">
			/* Invoke runOnloadHook() if it has been defined on window. */
			if (window.runOnloadHook) {
				runOnloadHook();
			}
		</script>
</div>

<!-- Served in 0.103 secs. --></body></html>
