blob: b84914be2ec8d8ecbf5432c88387c1bfb19a8a5d [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Specifications/CompoundManagementDiscussion" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Specifications/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Specifications/CompoundManagementDiscussion - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
// Page-level global variables describing the skin, server paths, the current
// page (name, title, article and revision ids), the viewing user and the
// wiki software version. They are declared as plain globals before the
// external scripts below are loaded.
// NOTE(review): presumably read by wikibits.js / ajax.js and the site JS;
// confirm before renaming or removing any of them.
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Specifications/CompoundManagementDiscussion";
var wgTitle = "SMILA/Specifications/CompoundManagementDiscussion";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "19821";
var wgIsArticle = true;
// null user name and groups: no user is logged in for this saved view.
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "156307";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<link rel="stylesheet" type="text/css" href="CompoundManagementDiscussion.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Specifications_CompoundManagementDiscussion">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<!-- Site logo. The img is explicitly self-closed so the XHTML document stays well-formed. -->
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Specifications/CompoundManagementDiscussion">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<!-- Site search form.
     The text field carries id="searchInput" so that the "Jump to: search" link (#searchInput) has a target.
     Both buttons rely on native form submission; the former onclick="this.submit();" handlers were removed
     because "this" is the input element, which has no submit() method, so every click raised a script error. -->
<form action="http://wiki.eclipse.org/Special:Search" method="get">
<input class="input" id="searchInput" name="search" type="text" accesskey="f" title="Search Eclipsepedia" value="" />
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- Navigation section: site-wide links. The heading anchor is a placeholder without href. -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<!-- Toolbox section: links relating to this page first, then general tools -->
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Specifications/CompoundManagementDiscussion">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Specifications/CompoundManagementDiscussion">Related changes</a></li>
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CompoundManagementDiscussion&amp;printable=yes">Printable version</a></li>
<li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CompoundManagementDiscussion&amp;oldid=156307">Permanent link</a></li>
</ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="CompoundManagementDiscussion.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/Talk:SMILA/Specifications/CompoundManagementDiscussion"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CompoundManagementDiscussion&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CompoundManagementDiscussion&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Specifications/CompoundManagementDiscussion"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Specifications/CompoundManagementDiscussion</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Specifications.html" title="SMILA/Specifications">Specifications</a></span></div>
<div id="jump-to-nav">Jump to: <a href="CompoundManagementDiscussion.html#column-one">navigation</a>, <a href="CompoundManagementDiscussion.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="CompoundManagementDiscussion.html#CompoundManagement_improvements"><span class="tocnumber">1</span> <span class="toctext">CompoundManagement improvements</span></a>
<ul>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#Integration_in_DeltaIndexingManager"><span class="tocnumber">1.1</span> <span class="toctext">Integration in DeltaIndexingManager</span></a></li>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#Adapting_Compound_Records"><span class="tocnumber">1.2</span> <span class="toctext">Adapting Compound Records</span></a></li>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#Compound_Inheritance"><span class="tocnumber">1.3</span> <span class="toctext">Compound Inheritance</span></a></li>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#Filtering_of_compound_elements"><span class="tocnumber">1.4</span> <span class="toctext">Filtering of compound elements</span></a></li>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#CompoundHandling_for_Agents"><span class="tocnumber">1.5</span> <span class="toctext">CompoundHandling for Agents</span></a></li>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#Dependency_to_MimeTypeIdentifier"><span class="tocnumber">1.6</span> <span class="toctext">Dependency to MimeTypeIdentifier</span></a></li>
<li class="toclevel-2"><a href="CompoundManagementDiscussion.html#Tutorial"><span class="tocnumber">1.7</span> <span class="toctext">Tutorial</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="CompoundManagement_improvements"></a><h2> <span class="mw-headline"> CompoundManagement improvements </span></h2>
<p>The current CompoundManagement implementation is by no means finished and final. Below are some ideas and already known issues that could or even have to be addressed in the future:
</p>
<a name="Integration_in_DeltaIndexingManager"></a><h3> <span class="mw-headline"> Integration in DeltaIndexingManager </span></h3>
<dl><dt>Priority</dt><dd> SHOWSTOPPER
</dd><dt>STATUS</dt><dd> DONE see <a href="https://bugs.eclipse.org/bugs/show_bug.cgi?id=278360" class="external free" title="https://bugs.eclipse.org/bugs/show_bug.cgi?id=278360" rel="nofollow">https://bugs.eclipse.org/bugs/show_bug.cgi?id=278360</a>
</dd></dl>
<p>The current implementations of the DeltaIndexingManager do not handle compound elements correctly. We have to store dependencies between compound records and their elements. If a compound record is checked for update and DeltaIndexing determines that it needs no update then all elements of the compound record have to be marked as visited as well as the compound record. This has to be done recursively for nested compounds (zip in zip in ...).
</p><p>We have to store two more pieces of information with the DeltaIndexingManager:
</p>
<ul><li> <tt>boolean isCompound</tt>: a flag that specifies if an entry is a compound record (true) or not (false)
</li><li> <tt>String parentIdHash</tt>: the hash of the parentId. This is only set for compound elements that reference their direct parent compound record. For top level compounds or non compound records it is set to NULL.
</li></ul>
<p>To speed up DeltaIndexing we should not set the VISITED flag for all elements of a compound record, but only for the ones that are containers themselves. In this way we can save lots of modifications on existing entries (especially useful for JPA implementation). We also need an additional flag <tt>MODIFIED</tt> to differentiate between unchanged and changed compound objects. Therefore we have to spend some more logic when determining the records for DeltaIndexing Delete. Here we have to select only those records whose visited flag is false and that either don't have a parentId (they are not part of a compound hierarchy) or have a parentId whose MODIFIED flag is set to true.
</p><p><br />
</p>
<a name="Adapting_Compound_Records"></a><h3> <span class="mw-headline"> Adapting Compound Records </span></h3>
<dl><dt>Priority</dt><dd> LOW
</dd><dt>STATUS</dt><dd> OPEN
</dd></dl>
<p>CompoundManager offers method <tt>adaptCompoundRecord(...)</tt> to adapt the compound record after its elements were extracted. This is useful for the following scenarios:
</p>
<ul><li> if we do not want SMILA to process and index the compound records themselves we could delete the record
</li><li> if we want to index the compound record (its metadata, the content makes no sense for any search engine) we can do so but we may want to remove the big content object before sending it to the workflow engine
</li><li> anything else ...
</li></ul>
<p>At the moment this method is not implemented; it returns the unmodified record. Of course the adaptation should be configurable. Both of the above described options should be easy to implement.
</p>
<a name="Compound_Inheritance"></a><h3> <span class="mw-headline"> Compound Inheritance </span></h3>
<dl><dt>Priority</dt><dd> LOW
</dd><dt>STATUS</dt><dd> OPEN
</dd></dl>
<p>It should be possible to "inherit" attributes, attachments and annotations from a compound record to its elements. A good example are access rights that are associated with the compound record but are lost when the elements are processed. The inheritance should be configurable:
</p>
<ul><li> what attributes/attachments/annotations are inherited
</li><li> how are they inherited (execution mode)
<ul><li> add: the compound record values are added to existing element values
</li><li> replace: the element's values (if any exist) are replaced by the compound record's values
</li><li> setIfEmpty: the values from the compound record are set on the element if no values exist
</li></ul>
</li></ul>
<p><br />
</p>
<a name="Filtering_of_compound_elements"></a><h3> <span class="mw-headline"> Filtering of compound elements </span></h3>
<dl><dt>Priority</dt><dd> LOW
</dd><dt>STATUS</dt><dd> OPEN
</dd></dl>
<p>It should be possible to configure filters for compound elements so that certain elements of a compound record are ignored just as within regular crawlers. It would be great if the filters of the data source the compound record originates from could be reused but I guess that Crawlers/Agents configuration may get too heterogeneous. So a separate filter mechanism could be applied that works only on the commonly defined CompoundAttributes (those are anyway the only available attributes).
</p><p><br />
</p>
<a name="CompoundHandling_for_Agents"></a><h3> <span class="mw-headline"> CompoundHandling for Agents </span></h3>
<dl><dt>Priority</dt><dd> LOW
</dd><dt>STATUS</dt><dd> OPEN
</dd></dl>
<p>At the moment CompoundHandling is only used in the CrawlerController (to be precise in the class CrawlThread). It should also be available in the AgentController processing logic although we currently do not have an Agent that provides compound records. Perhaps we can enhance the mock agent to send compound records if so desired to allow testing.
</p>
<a name="Dependency_to_MimeTypeIdentifier"></a><h3> <span class="mw-headline"> Dependency to MimeTypeIdentifier </span></h3>
<dl><dt>Priority</dt><dd> HIGH
</dd><dt>STATUS</dt><dd> OPEN
</dd></dl>
<p>The CompoundManager needs a MimeTypeIdentifier service to be able to identify the mime type of an incoming object and to decide whether it's a compound or not. This already works fine. However, the MimeTypeIdentifier interface and the SimpleMimeTypeIdentifier service are located in bundle <tt>org.eclipse.smila.processing.pipelets.mimetype</tt> which entails dependencies to <tt>org.eclipse.smila.processing</tt> and some of its sub-bundles. We should move the MimeTypeIdentifier interface and the SimpleMimeTypeIdentifier into different packages outside of processing. Perhaps utils is a good place, but we have to separate interface and implementation to allow for other implementations (ApertureMimetypeidentifier will definitely come).
Then the <tt>SimpleMimeTypeIdentifier</tt> should also be separated into a pure service and a ProcessingService. The ProcessingService should be located in <tt>org.eclipse.smila.processing.pipelets</tt> and it should be independent of the MimeTypeIdentifier service used. It should work with any MimeTypeIdentifier and contain appropriate logic to find mime type and/or extension information about the files (e.g. the web crawler metadata).
</p>
<a name="Tutorial"></a><h3> <span class="mw-headline"> Tutorial </span></h3>
<dl><dt>Priority</dt><dd> HIGH
</dd><dt>STATUS</dt><dd> OPEN
</dd></dl>
<p>We should add a Tutorial on "How to implement a CompoundHandler and CompoundCrawler" as it is a common place for contributors to extend SMILA with their own functionality.
</p>
<!--
NewPP limit report
Preprocessor node count: 17/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:19821-0!1!0!!en!2!edit=0 and timestamp 20110617075241 -->
<div class="printfooter">
Retrieved from "<a href="CompoundManagementDiscussion.html">http://wiki.eclipse.org/SMILA/Specifications/CompoundManagementDiscussion</a>"</div>
<!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<!-- Empty clearing element before the footer. Written with an explicit end tag: HTML parsers ignore "/>" on a div, which would leave it open and nest the footer inside it. -->
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2011 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 14:58, 2 June 2009 by <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>. Based on work by <a href="http://wiki.eclipse.org/User:Juergen.schumacher.empolis.com" title="User:Juergen.schumacher.empolis.com">Juergen Schumacher</a>.</p>
<p id="footerviews">This page has been accessed 1,212 times.</p>
</div>
<script type="text/javascript">
// Choose the Google Analytics host that matches the protocol of the current
// page, then write a script tag for ga.js into the document.
var gaJsHost;
if ("https:" == document.location.protocol) {
  gaJsHost = "https://ssl.";
} else {
  gaJsHost = "http://www.";
}
// The tag is percent-encoded and decoded with unescape() so that no literal
// markup appears inside this inline script.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Record a page view with Google Analytics.
// _gat is only defined once ga.js has loaded; if that script is blocked or
// unreachable, the bare reference would throw and surface a script error.
// Tracking is best-effort, so any failure here is deliberately ignored.
try {
  var pageTracker = _gat._getTracker("UA-910670-4");
  pageTracker._trackPageview();
} catch (err) {}
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.121 secs. --></body></html>