<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
				<meta name="keywords" content="SMILA/Documentation/Importing/Concept,SMILA/Documentation/Importing/Crawler/File,SMILA/Documentation/Importing/Crawler/Web,SMILA/Documentation/Importing/DeltaCheck,SMILA/Documentation/Importing/SimpleCompoundExtractorService,SMILA/Documentation/Importing/UpdatePusher" />
		<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/Importing/favicon.ico" />
		<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
	
		
		<title>SMILA/Documentation/Importing/Concept - Eclipsepedia</title>

		<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
		<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
		<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
		<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
	        <!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
		<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
		<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
		<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
		<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
		<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
		<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
		<!-- Page-level JavaScript globals describing this wiki page: skin and script paths,
		     page name/title, article and revision ids, viewer state and API flags.
		     They are declared inline here, ahead of the external skin scripts loaded further
		     down in the head; presumably those scripts read them, so confirm before renaming
		     or removing any of them. -->
		<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/Importing/Concept";
var wgTitle = "SMILA/Documentation/Importing/Concept";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "34796";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "291529";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
                
		<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>

<!-- Performance mods similar to those for bug 166401 -->
		<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>

		<!-- Head Scripts -->
				<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<link rel="stylesheet" type="text/css" href="Concept.html" />	</head>
<body  class="mediawiki ns-0 ltr page-SMILA_Documentation_Importing_Concept">
	<div id="globalWrapper">


		<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->

		      <div id="header">
               <div id="header-graphic">
                 <img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
               </div>
<!-- Pulled 101409 Mward	   -->
            
            <div class="portlet" id="p-personal">
              <div class="pBody">
              <ul>
                <li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Importing/Concept">Log in</a></li>
                      </ul>
              </div>
            </div>

            <div id="header-icons">
	      <div id="sites">
              <ul id="sitesUL">
                <li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
                <li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
                <li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
                <li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
                <li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
                <li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
              </ul>
              </div>
            </div>
        </div> 
<!-- NEW HEADER STUFF HERE -->
       <div id="header-menu">
         <div id="header-nav">
               <ul> 		 <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li>                  <li><a  href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li> 
                 <li><a  href="http://www.eclipse.org/users/" target="_self">Users</a></li> 
                 <li><a  href="http://www.eclipse.org/membership/" target="_self">Members</a></li> 
                 <li><a  href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li> 
                 <li><a  href="http://www.eclipse.org/resources/" target="_self">Resources</a></li> 
                 <li><a  href="http://www.eclipse.org/projects/" target="_self">Projects</a></li> 
                 <li><a  href="http://www.eclipse.org/org/" target="_self">About Us</a></li> 
                </ul>
         </div>
         <div id="header-utils">
<!-- moved the search window here -->
                           <!-- Site search form. Both buttons are native submit inputs, so no click
                                handler is needed; the button that was pressed is sent as "go" or
                                "fulltext". The text field carries id="searchInput" so that the
                                "Jump to: search" link in the page body has a target. -->
                           <form action="http://wiki.eclipse.org/Special:Search" method="get">
                  <input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" />
                    <input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
                    <input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text"  value="Search" />
                </form>
		 </div>
      </div>


<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->

<!--Started nav  rip here-->
<!-- these are the nav controls main page, changes etc -->
    <div id="novaContent" class="faux">
            <div id="leftcol">
          <ul id="leftnav">
<!-- these are the page controls, edit history etc -->
            	  <li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
                                <li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
	                                <li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
	                                <li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
	                                <li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
	                                <li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
	                                <li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
		  <li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>

				<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Importing/Concept">What links here</a></li>
				<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Importing/Concept">Related changes</a></li>
                <!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
				<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Concept&amp;printable=yes">Printable version</a></li>				<li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Concept&amp;oldid=291529">Permanent link</a></li>	  </ul>
        </div>


<!-- Additions and mods for leftside nav End here -->


  <div id="column-content">
    <div id="content">
      <a name="top" id="top"></a>

              <div id="tabs">
         <ul class="primary">
                                         <li class="active"><a href="Concept.html"><span class="tab">Page</span></a></li>
                                                         <li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Importing/Concept&amp;action=edit"><span class="tab">Discussion</span></a></li>
                                         <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Concept&amp;action=edit"><span class="tab">View source</span></a></li>
                                         <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Concept&amp;action=history"><span class="tab">History</span></a></li>
                                         <li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Importing/Concept"><span class="tab">Edit</span></a></li>
                  </ul>
        </div>


          <script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
            <h1 class="firstHeading">SMILA/Documentation/Importing/Concept</h1>
      <div id="bodyContent">
        <h3 id="siteSub">From Eclipsepedia</h3>
        <div id="contentSub"><span class="subpages">&lt; <a href="../../../SMILA.html" title="SMILA">SMILA</a> | <a href="../../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
                              <div id="jump-to-nav">Jump to: <a href="Concept.html#column-one">navigation</a>, <a href="Concept.html#searchInput">search</a></div>          <!-- start content -->
          <table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="Concept.html#Using_JobManager_to_run_imports"><span class="tocnumber">1</span> <span class="toctext">Using JobManager to run imports</span></a>
<ul>
<li class="toclevel-2"><a href="Concept.html#Overview"><span class="tocnumber">1.1</span> <span class="toctext">Overview</span></a></li>
<li class="toclevel-2"><a href="Concept.html#Components"><span class="tocnumber">1.2</span> <span class="toctext">Components</span></a></li>
<li class="toclevel-2"><a href="Concept.html#Delta_Delete"><span class="tocnumber">1.3</span> <span class="toctext">Delta Delete</span></a></li>
<li class="toclevel-2"><a href="Concept.html#Compound_Handling"><span class="tocnumber">1.4</span> <span class="toctext">Compound Handling</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Using_JobManager_to_run_imports"></a><h2> <span class="mw-headline"> Using JobManager to run imports </span></h2>
<p>The idea is to apply the jobmanagement framework for doing crawl jobs, too. The advantages are:
</p>
<ul><li> we don't need a separate execution framework for crawling anymore
</li><li> integrators can use the same programming model for creating crawler components as for processing workers.
</li><li> same control and monitoring APIs for crawling and processing
</li><li> better performance through inherent asynchronicity
</li><li> better error tolerance through inherent failsafety
</li><li> Parallelization of crawling process possible
</li></ul>
<a name="Overview"></a><h3> <span class="mw-headline"> Overview </span></h3>
<p>We can reach this goal by splitting up the crawl process into several workers. Basically, a crawling workflow always looks like this:
</p><p><a href="http://wiki.eclipse.org/Image:SMILA-importing-workflow-1.1.png" class="image" title="Image:SMILA-importing-workflow-1.1.png"><img alt="Image:SMILA-importing-workflow-1.1.png" src="http://wiki.eclipse.org/images/8/88/SMILA-importing-workflow-1.1.png" width="960" height="720" border="0" /></a>
</p><p>Workers with names starting with "(DS)" are specific for the crawled data source type. E.g. to crawl a file system you apparently need a different crawler worker than for a web server. Not each component may be necessary for each data source type, and it is possible to adapt components to add or remove functionality.
</p><p>The crawling job is separated from the processing (e.g. indexing) workflow. A final worker in the crawl workflow pushes all records to the other workflow. This makes it possible to have several datasources being crawled into a single index. Also, in update crawl runs it is easier to detect when the actual crawling is done so that it can be determined which records have to be deleted because they were not visited in this run. We assume that the processing job is just running all the time.
</p>
<a name="Components"></a><h3> <span class="mw-headline"> Components  </span></h3>
<ul><li>Crawler: (data source specific, see <a href="Crawler/File.html" title="SMILA/Documentation/Importing/Crawler/File">File Crawler</a> and <a href="Crawler/Web.html" title="SMILA/Documentation/Importing/Crawler/Web">Web Crawler</a>) 
<ul><li>two output slots: 
<ul><li>one for crawled resources (files, web pages), that match configurable filters (name patterns, extensions, size, mimetype, ...) 
</li><li>one for resources still to crawl (outgoing links, sub-directories). This one leads to follow-up tasks for the same worker. 
</li></ul>
</li><li>The worker can create multiple output bulks for each slot per task so that the following workers can parallelize better. 
</li><li>In general, it doesn't get the content of a resource, but only the path or URL (or whatever identifies it) and metadata of the resources, to minimize IO load especially during "update runs" where most of the resources have not changed and therefore do not need to be fetched. 
</li><li>If it has to fetch the content anyway (e.g. Web Crawler has to parse HTML to find follow-up links), it may add it to the crawled records to prevent additional fetching. 
</li><li>The crawler can use or contain a "VisitedLinks" service if the data source can have cycles or multiple paths to the same resource (typical: web crawler) so that it does not produce multiple records that represent the same resource.
</li><li>Identifies compound objects: Set a special attribute on compound objects so that they can be routed for extraction later in the workflow. See below for details on compound handling.
</li></ul>
</li></ul>
<ul><li><a href="DeltaCheck.html" title="SMILA/Documentation/Importing/DeltaCheck">DeltaChecker</a>: 
<ul><li>Checks with DeltaService whether a resource is new in the data source or has been changed since the last run, depending on some of the metadata produced by the crawler (modification date from file system or HTTP headers). 
</li><li>If resource has not changed since the last run, this is marked in DeltaService and the record is not written to the output bulk. 
</li><li>Else the record is written to the output bulk (an additional attribute describes if it's a new record or one to update) and must be pushed to the processing workflow in the end.
</li><li>The DeltaChecker can write identified compound objects to a separate output slot to route them to an extractor worker.
</li></ul>
</li></ul>
<ul><li>Fetcher: (data source specific, see <a href="Crawler/File.html#File_Fetcher" title="SMILA/Documentation/Importing/Crawler/File">File Fetcher</a> and <a href="Crawler/Web.html#Web_Fetcher_Worker" title="SMILA/Documentation/Importing/Crawler/Web">Web Fetcher</a>) 
<ul><li>Worker that gets the content of the resource, if the record does not contain it already. 
</li><li>Detects compounds (like archive files (zip, tgz), for example) and does not fetch their content, but just copies the records containing the IDs (URL/file name etc.) to a compound output bulk for later extraction, as we do not want to put extremely large compound objects into bulks.
</li></ul>
</li></ul>
<ul><li>Compound extractor: (data source specific, see <a href="Crawler/File.html" title="SMILA/Documentation/Importing/Crawler/File">File Crawler</a> and <a href="Crawler/Web.html" title="SMILA/Documentation/Importing/Crawler/Web">Web Crawler</a>) 
<ul><li>for handling compounds: fetch the compound data to a local temp filesystem, extract it and add the records to output bulks, just like the ones written by the fetcher. 
</li><li>The compound extractor is data source specific for two reasons:
<ul><li>it must access the data source itself to fetch the compound content because we want to avoid the overhead of putting really large objects in the workflow bulks.
</li><li>it must produce records that look as if they were produced by the corresponding data source crawler and fetcher workers, so it must understand the same configuration.
</li></ul>
</li></ul>
</li></ul>
<ul><li><a href="UpdatePusher.html" title="SMILA/Documentation/Importing/UpdatePusher">Update Pusher</a>: 
<ul><li>Push resulting records to BulkBuilder and mark them as updated in the DeltaService. To prevent duplicates and to skip unchanged objects extracted from compounds, it checks the DeltaService again if this check is enabled. In the completion phase of the job run, it can perform the "delta-delete" operation (see below).
</li></ul>
</li></ul>
<p>The crawl job is started as a <i>runOnce</i> job, i.e. the jobmanager creates an initial task that causes the crawler to start crawling (data source configuration and start links would be given as job parameters, we may need some additional component to manage data source configurations), which creates follow-up crawl and record tasks. When all tasks have been processed the job run finishes automatically (because it was started in runOnce mode).
</p>
<a name="Delta_Delete"></a><h3> <span class="mw-headline"> Delta Delete </span></h3>
<p>Optionally, a final "completion run" can be triggered that causes the UpdatePusher to examine the DeltaService for records that have not been visited in this job run, because they have been removed from the data source and therefore should be removed from the target of the import process, too. For each of these records one "to-delete" record is pushed to the BulkBuilder. To support an efficient delta-delete on large data sources it will be necessary to parallelize this "delta-delete" operation, because it could take rather long to scan the complete DeltaService and create the to-delete records sequentially. To support this the DeltaService should put the state entries for one source in different partitions or <i>shards</i> that can be scanned independently. Then one task can be created for each shard to scan it for deleted records, and they can be processed in parallel by multiple instances of the UpdatePusher.
</p><p>The "delta-delete" run is not triggered if there have been tasks with fatal errors before in the crawl phase of the job run. Therefore it cannot happen that the complete import target is cleared after nothing could be imported from the data source due to errors.
</p><p>The usage of the DeltaService is determined by a job parameter named <tt>deltaImportStrategy</tt>. It can have one of four values:
</p>
<ul><li> <tt>disabled</tt>: the DeltaService is disabled, no entries are written for imported records, and no delta-delete is performed. This reduces the import time if the delta information is not needed for the application. However, as no initial delta information is recorded in this mode, switching to another mode for incremental updates does usually not make sense.
</li><li> <tt>initial</tt>: the delta information for each imported record is written to the DeltaService, but neither a check of the already existing information nor the delta-delete is performed. This is useful to make an initial import faster. Afterwards incremental updates can be done using one of the following modes.
</li><li> <tt>additive</tt>: existing delta information is checked to prevent unchanged objects from being re-imported unnecessarily, and the delta information for all imported records is written. Only the final delta-delete step is omitted. This is useful if objects removed from the data source do not need to be removed from the import target, too. 
</li><li> <tt>full</tt>: This is the default mode with the aim to keep the import target in sync with the data source: delta information is checked and recorded, and the delta-delete is done.
</li></ul>
<p>If one of the first two values (<tt>disabled</tt> and <tt>initial</tt>) is used, the DeltaChecker worker just writes every record from the input bulk to the output bulk without really doing anything. Thus, to improve the performance even more, for jobs using these modes the DeltaChecker worker can be removed from the workflow completely and the output of the crawler worker can be processed by the fetcher worker immediately.
</p>
<a name="Compound_Handling"></a><h3> <span class="mw-headline"> Compound Handling </span></h3>
<p><i>Compounds</i> are objects that consist of many smaller objects that should be imported as individual objects. Typical examples are archive files like ZIP or TAR.GZ files that contain a lot of documents that should be indexed, another type would be an email with attachments. To import these compound elements they must be <i>extracted</i> from the compound objects. This can be a recursive process as a compound can contain compound objects again. To prevent the extraction process from thwarting the import of "ordinary" objects we decided to handle them in specialized workers, not in the crawler or fetcher workers themselves. These workers are called <i>extractor workers</i>.
</p><p>Extractor workers are data source specific for two reasons:
</p>
<ul><li> They must access the data source to fetch the content. Because compounds can be very large, we want to prevent the overhead of writing them to the workflow bulks before extracting them.
</li><li> They must produce records that look like records produced by the corresponding crawler so that they can be processed in the same way. So the extractor must understand the same configuration as the crawler and fetcher workers.
</li></ul>
<p>However, the actual extraction process is usually generic, so we provide a generic <i>Compound Extractor Service</i> that can be shared by the specific extraction workers (see <a href="SimpleCompoundExtractorService.html" class="mw-redirect" title="SMILA/Documentation/Importing/SimpleCompoundExtractorService">current implementation</a>). There is also a generic base class for such workers that should make implementing an extractor worker for new data source types easy.
Currently, we have implemented extractor workers to support the <a href="Crawler/File.html" title="SMILA/Documentation/Importing/Crawler/File">file crawler</a> and <a href="Crawler/Web.html" title="SMILA/Documentation/Importing/Crawler/Web">web crawler</a> implementations, see the respective pages for more information.
</p><p>Therefore the compound handling consists of two parts:
</p>
<ul><li> Identification: to be able to handle compounds separately in the workflow they must be marked as being compounds. We expect this to be done in the crawler worker that can use metadata of the object to classify it as a compound, e.g. the filename (especially the extension) or the mimetype, if reported by the data source. The crawler worker can use methods of the Compound Extractor Service to decide if an object is a compound that is also supported by the extractor service. To mark an object as a compound the crawler sets the system attribute <tt>_isCompound</tt> to <tt>true</tt>. This is also necessary to handle compounds correctly in the delta-check-and-delete logic:
<ul><li> if a compound has been removed from the data source, the import process must also delete all previously imported compound elements. Therefore compound elements must be stored with individual entries in the delta service.
</li><li> if a compound itself was not changed all previously imported elements must be marked as visited, too, so that they are not deleted afterwards: We do not want to fetch and extract the complete unchanged compound in this case just to update the element entries, of course.
</li></ul>
</li><li> Extraction: Fetch the compound content and create an additional record for each compound element (make sure that the record ID is still unique for each element wrt. the data source). Contained compounds are extracted immediately, too. To support the delta logic, in each element record of the compound the system attribute <tt>_compoundRecordId</tt> must be set to the record ID of the top-level compound object. Note that the DeltaService does currently not support the individual management of sub-compounds, so to handle the delta logic for contained compounds correctly they must all be extracted and checked in the UpdatePusher individually.
</li></ul>
<p>The branching of the workflow for compounds is currently done by the DeltaChecker: It has a second output slot named "updatedCompounds". If this slot is connected to a bucket, it will write all new and updated objects marked as being a compound object by the crawler to this slot instead of the original "updatedRecords" slot. However, this means that correct compound handling is currently only possible if the DeltaCheckerWorker is part of the workflow, even if it is not needed to implement the chosen <tt>deltaImportStrategy</tt>.
</p>
<!-- 
NewPP limit report
Preprocessor node count: 17/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->

<!-- Saved in parser cache with key wikidb:pcache:idhash:34796-0!1!0!!en!2!edit=0 and timestamp 20120710093507 -->
<div class="printfooter">
Retrieved from "<a href="Concept.html">http://wiki.eclipse.org/SMILA/Documentation/Importing/Concept</a>"</div>
          <div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div>          <!-- end content -->
          <div class="visualClear"></div>
        </div>
      </div>


    </div>


	<!-- Yoink of toolbox for phoenix moved up -->


  </div>
  </div>
              <div id="clearFooter"></div>
            <div id="footer" >  
	      <ul id="footernav">
                <li class="first"><a href="http://www.eclipse.org/">Home</a></li>
               	<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
	        <li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
		<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
                <li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
               	<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
	      </ul>
              <span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
	      <p id="footercredit">This page was last modified 09:57, 22 February 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.attensity.com&amp;action=edit" class="new" title="User:Nadine.auslaender.attensity.com"> </a>.</p>
	      <p id="footerviews">This page has been accessed 1,379 times.</p>
	    </div>

            <script type="text/javascript">
              // Load Google Analytics (ga.js) from the host that matches the page protocol,
              // so a page served over https does not pull in an insecure script.
              var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
              document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
            </script>
            <script type="text/javascript">
              // Kept in its own script element: the ga.js script written above has to be
              // fetched and executed before _gat exists.
              // Tracking is best-effort. If ga.js was blocked or failed to load, _gat is
              // undefined; the guard stops that error from surfacing on the page.
              try {
                var pageTracker = _gat._getTracker("UA-910670-4");
                pageTracker._trackPageview();
              } catch (err) {
                // Analytics unavailable; nothing to report and nothing to recover.
              }
            </script>
 
 	



		

<!-- 			<div class="visualClear"></div> -->
	
		<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>

<!-- Served in 0.053 secs. --></body></html>
