blob: 74ad591ab56a9235596a622e5aa3d3d015f47161 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Project Concepts/CompoundManagement,Daniel.stucky.empolis.com,Sebastian.voigt.brox.de" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Project_Concepts/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Project Concepts/CompoundManagement - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Project_Concepts/CompoundManagement";
var wgTitle = "SMILA/Project Concepts/CompoundManagement";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "15227";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "153026";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal; font-size: medium;}
.source-java li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-java .de1, .source-java .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-java {}
.source-java .head {}
.source-java .foot {}
.source-java .imp {font-weight: bold; color: red;}
.source-java .ln-xtra {color: #cc0; background-color: #ffc;}
.source-java li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-java li.li2 {font-weight: bold;}
.source-java .kw1 {color: #7F0055; font-weight: bold;}
.source-java .kw2 {color: #7F0055; font-weight: bold;}
.source-java .kw3 {color: #000000; font-weight: normal}
.source-java .kw4 {color: #7F0055; font-weight: bold;}
.source-java .co1 {color: #3F7F5F; font-style: italic;}
.source-java .co2 {color: #3F7F5F;}
.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.source-java .es0 {color: #000000;}
.source-java .br0 {color: #000000;}
.source-java .st0 {color: #2A00ff;}
.source-java .nu0 {color: #000000;}
.source-java .me1 {color: #000000;}
.source-java .me2 {color: #000000;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="CompoundManagement.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Project_Concepts_CompoundManagement">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Project_Concepts/CompoundManagement">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Project_Concepts/CompoundManagement">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Project_Concepts/CompoundManagement">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/CompoundManagement&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/CompoundManagement&amp;oldid=153026">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="CompoundManagement.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Project_Concepts/CompoundManagement&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/CompoundManagement&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/CompoundManagement&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Project%20Concepts/CompoundManagement"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Project Concepts/CompoundManagement</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Project_Concepts.1.html" title="SMILA/Project Concepts">Project Concepts</a></span></div>
<div id="jump-to-nav">Jump to: <a href="CompoundManagement.html#column-one">navigation</a>, <a href="CompoundManagement.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="CompoundManagement.html#Description"><span class="tocnumber">1</span> <span class="toctext">Description</span></a></li>
<li class="toclevel-1"><a href="CompoundManagement.html#Technical_proposal"><span class="tocnumber">2</span> <span class="toctext">Technical proposal</span></a>
<ul>
<li class="toclevel-2"><a href="CompoundManagement.html#Overview"><span class="tocnumber">2.1</span> <span class="toctext">Overview</span></a></li>
<li class="toclevel-2"><a href="CompoundManagement.html#Configuration"><span class="tocnumber">2.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="CompoundManagement.html#CompoundHandlerRegistry_Configuration"><span class="tocnumber">2.2.1</span> <span class="toctext">CompoundHandlerRegistry Configuration</span></a></li>
<li class="toclevel-3"><a href="CompoundManagement.html#CompoundHandler_Runtime_Configuration"><span class="tocnumber">2.2.2</span> <span class="toctext">CompoundHandler Runtime Configuration</span></a></li>
<li class="toclevel-3"><a href="CompoundManagement.html#Alternative_Compound_Configuration"><span class="tocnumber">2.2.3</span> <span class="toctext">Alternative Compound Configuration</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="CompoundManagement.html#Interfaces"><span class="tocnumber">2.3</span> <span class="toctext">Interfaces</span></a></li>
<li class="toclevel-2"><a href="CompoundManagement.html#Implementation"><span class="tocnumber">2.4</span> <span class="toctext">Implementation</span></a></li>
<li class="toclevel-2"><a href="CompoundManagement.html#CompoundManagement_vs._Splitter"><span class="tocnumber">2.5</span> <span class="toctext">CompoundManagement vs. Splitter</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Description"></a><h2> <span class="mw-headline"> Description </span></h2>
<p>Work out a concept to handle compound objects (objects that contain or can be split up into multiple objects).
</p>
<a name="Technical_proposal"></a><h2> <span class="mw-headline"> Technical proposal </span></h2>
<a name="Overview"></a><h3> <span class="mw-headline"> Overview </span></h3>
<p>The CompoundManagement is responsible for extraction of elements from compound objects of various mimetypes (like zip archives, Windows Help Files (hlp), etc.). The CompoundManagement provides a Crawler interface over the extracted elements that is identical to the ones provided by CrawlerFactories; thus it provides delta indexing support.
The processing of the various types of compound objects and creation of "CompoundCrawlers" is delegated to so called CompoundHandlers. Each CompoundHandler implementation is associated with specific mimetypes.
</p><p>This chart shows the architecture of the CompoundManagement:
<a href="http://wiki.eclipse.org/Image:Compound_management_architecture.png" class="image" title="Image:compound_management_architecture.png"><img alt="Image:compound_management_architecture.png" src="http://wiki.eclipse.org/images/5/5d/Compound_management_architecture.png" width="960" height="720" border="0" /></a>
<b>Note</b>: The component CompoundHandlerRegistry is most likely obsolete, as its functionality (registration of CompoundHandlers) can be achieved by using OSGi technologies.
</p>
<a name="Configuration"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<a name="CompoundHandlerRegistry_Configuration"></a><h4> <span class="mw-headline"> CompoundHandlerRegistry Configuration </span></h4>
<p>At first we need a configuration for the CompoundHandlerRegistry that associates a CompoundHandler implementation (there may be multiple supporting the same mimetype) with a mimetype. This could look like this:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;CompoundHandlerRegistry<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundHandler</span> <span class="re0">mimetype</span>=<span class="st0">&quot;application/zip&quot;</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.irm.compoundmanagement.ZipCompoundHandler&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CompoundHandler</span> <span class="re0">mimetype</span>=<span class="st0">&quot;application/mshelp&quot;</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.irm.compoundmanagement.HlpCompoundHandler&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;CompoundHandler</span> <span class="re0">mimetype</span>=<span class="st0">&quot;application/java-archive&quot;</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.irm.compoundmanagement.ZipCompoundHandler&quot;</span><span class="re2">&gt;</span></span>
...
<span class="sc3"><span class="re1">&lt;/CompoundHandlerRegistry<span class="re2">&gt;</span></span></span></pre></div>
<p><a href="http://wiki.eclipse.org/index.php?title=User:Sebastian.voigt.brox.de&amp;action=edit" class="new" title="User:Sebastian.voigt.brox.de">Sebastian Voigt</a>: This configuration could be omitted. The CompoundManagement should resolve automatically which Compound "Handler" is installed. I would call them Compound Bundle or something else because each Handler is deployed with a bundle.
The CompoundManagement can use a defined extension point to find "installed" compound bundles. The Extension Point can be called org.eccenca.irm.compound.
</p><p>This Extension Point should offer the following Interface:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="br0">&#123;</span>
<span class="kw3">String</span> getMimeType<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="kw3">String</span> getCompoundHandlerName<span class="br0">&#40;</span><span class="br0">&#41;</span> - returns a Description of the Compound Handler <span class="br0">&#40;</span>used <span class="kw1">for</span> Logging<span class="br0">&#41;</span>
<span class="br0">&#125;</span></pre></div>
<p>Before each indexing job the Compound Manager should retrieve all installed bundles that implement the Compound Extension point and should warn the user if there are bundles installed that address the same mimetype.
</p>
<a name="CompoundHandler_Runtime_Configuration"></a><h4> <span class="mw-headline"> CompoundHandler Runtime Configuration </span></h4>
<p>Then during runtime we have to provide a configuration to the CompoundManagement that is passed to the CompoundHandler implementations. It contains information about how to process extracted data. This could/should contain
</p>
<ul><li> information about working directories where to extract the data to
</li><li> information about attributes that should be inherited from the compound object. During inheritance there may be special actions required, like
<ul><li> replace existing values
</li><li> append to existing values
</li><li> set value, if no value exists
</li></ul>
</li></ul>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;inheritedAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">name</span>=<span class="st0">&quot;accessRights&quot;</span> <span class="re0">action</span>=<span class="st0">&quot;replace&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">name</span>=<span class="st0">&quot;lastModified&quot;</span> <span class="re0">action</span>=<span class="st0">&quot;replace&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">name</span>=<span class="st0">&quot;abc&quot;</span> <span class="re0">action</span>=<span class="st0">&quot;append&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">name</span>=<span class="st0">&quot;xyz&quot;</span> <span class="re0">action</span>=<span class="st0">&quot;set&quot;</span><span class="re2">&gt;</span></span>
...
<span class="sc3"><span class="re1">&lt;/inheritedAttributes<span class="re2">&gt;</span></span></span></pre></div>
<ul><li> information about filters. It would be great if the filters of an IRM configuration could be applied to CompoundHandlers (e.g. a filesystem is crawled and .log files are excluded, so we also want to exclude .log files contained in zips.)
<ul><li> another option would be to let the Agent/Crawler Controller apply filtering logic on the Records returned by the CompoundCrawler by delegating it back to the Agent/Crawler. So filtering logic has to be part of the Crawler interface.
</li></ul>
</li><li> information on how to create Record IDs&nbsp;? Or is this logic up to the implementation&nbsp;?
</li><li> information on how to create Delta Indexing hash key (what attributes to use)
</li><li> information on which attribute contains the content to be extracted
</li></ul>
<p>As different CompoundHandler implementations may need different configuration we should make the configuration schema extendable as done in the IRM configuration. Some configurations will be needed in all cases (like Inheritance of attributes, delta indexing hash), some may be optional or different (like configuration of working environment and filters).
</p>
<a name="Alternative_Compound_Configuration"></a><h4> <span class="mw-headline"> Alternative Compound Configuration </span></h4>
<p><a href="http://wiki.eclipse.org/index.php?title=User:Sebastian.voigt.brox.de&amp;action=edit" class="new" title="User:Sebastian.voigt.brox.de">Sebastian Voigt</a>:
A CompoundHandler behaves like a Crawler and has the same workflow. Therefore it should use the same configuration file.
The Compound Manager defines a Compound config schema, and each Compound Handler can redefine the Attributes and the Process Tags (like in the workflow for the irm configuration).
Process can be used to define behavior like filtering etc. for the extracting Job. HashAttributes and KeyAttributes are used to build the Record and ID (built by the Controller).
</p><p>The Compound configuration should contain additionally a description of an Index Job. The Configuration is only used for this index job. Thus for each CompoundHandler and for each Index Job Configuration there could be a config for the compound handler (different behavior for different index jobs)
</p><p>I would not add action tags like those defined above. The IRM Framework should not change attributes or the information. It is only responsible for returning information from a specific data source. Therefore I think Attributes should not be joined or replaced.
Usually the Compound contents don't fit the data source, e.g. Sharepoint and Zips. Sharepoint Objects have no path in a file system, and zips have only a sub path. There is no need to join/replace any information.
The purpose of a Compound handler is to return further/additional Attributes that describe the Object in a Compound in more detail.
</p>
<ul><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: we agreed that adaptation of attributes in compound elements is needed and that we should do it as early as possible - inside a CompoundHandler (the alternative was during BPEL, but as the data of the parent object is needed we would have to store both the element's and the parent's attributes in the object's EILRecord). This "attribute inheritance" should be implemented once (abstract base class). Whether we really need different actions will be seen during implementation.
</li></ul>
<p><a href="http://wiki.eclipse.org/index.php?title=User:Sebastian.voigt.brox.de&amp;action=edit" class="new" title="User:Sebastian.voigt.brox.de">Sebastian Voigt</a>: Ok. We can adopt an IRM Configuration for this job. We need the following information:
</p><p>1) which attributes have to be gathered from the compound
2) where should they be stored
3) which operation is used when it is stored in an attribute that&nbsp;has been&nbsp;inherited
</p><p>\--&gt; CompoundConfiguration
Attribute: Which information should be gathered from the Compound ( Compound defines with schema itself what is possible)
Name: The Attribute in the Record in which the information should be stored&nbsp; (if this Attribute already exists in the Record it will be overwritten)
Attributes that should not be gathered but inherited are selected with an &lt;Inherited/&gt; tag.
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;CompoundConfiguration</span> <span class="re0">xmlns:xsi</span>=<span class="st0">&quot;http://www.w3.org/2001/XMLSchema-instance&quot;</span> <span class="re0">xsi:noNamespaceSchemaLocation</span>=<span class="st0">&quot;CompoundHandlerZip.xsd&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;IndexJob<span class="re2">&gt;</span></span></span>
FileSystemIndexJob
<span class="sc3"><span class="re1">&lt;/IndexJob<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;Date&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Date&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FileAttributes<span class="re2">&gt;</span></span></span>FileDate<span class="sc3"><span class="re1">&lt;/FileAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Filename&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FileAttributes<span class="re2">&gt;</span></span></span>Name<span class="sc3"><span class="re1">&lt;/FileAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Path&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FileAttributes<span class="re2">&gt;</span></span></span>Path<span class="sc3"><span class="re1">&lt;/FileAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;PermissionUsers&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Inherited</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;StringCollection&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;PermissionGroup&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Inherited</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Content&quot;</span> <span class="re0">MimeTypeAttribute</span>=<span class="st0">&quot;Content&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FileAttributes<span class="re2">&gt;</span></span></span>Content<span class="sc3"><span class="re1">&lt;/FileAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Extension&quot;</span> <span class="re0">MimeTypeAttribute</span>=<span class="st0">&quot;FileExtension&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;FileAttributes<span class="re2">&gt;</span></span></span>FileExtension<span class="sc3"><span class="re1">&lt;/FileAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Attributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Process<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">Recursive</span>=<span class="st0">&quot;true&quot;</span> <span class="re0">CaseSensitive</span>=<span class="st0">&quot;false&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Include</span> <span class="re0">Name</span>=<span class="st0">&quot;*.txt&quot;</span><span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filter<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Process<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundConfiguration<span class="re2">&gt;</span></span></span></pre></div>
<p>e.g.
The Date, the filename, path, content and the extension are gathered from the compound and the permissions are inherited from the compound itself.
\\
</p>
<a name="Interfaces"></a><h3> <span class="mw-headline"> Interfaces </span></h3>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> CompoundManagement
<span class="br0">&#123;</span>
Crawler extract<span class="br0">&#40;</span> Record compound, CMConfig config, <span class="kw3">String</span> mimetype <span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> CompoundHandler
<span class="br0">&#123;</span>
Crawler extract<span class="br0">&#40;</span> Record compound, CMConfig config <span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> CompoundHandlerRegistry
<span class="br0">&#123;</span>
CompoundHandler getCompoundHandler<span class="br0">&#40;</span> <span class="kw3">String</span> mimetype <span class="br0">&#41;</span>;
<span class="kw4">void</span> register<span class="br0">&#40;</span> <span class="kw3">String</span> mimetype, <span class="kw3">String</span> clazz <span class="br0">&#41;</span>;
<span class="kw4">void</span> unregister<span class="br0">&#40;</span> <span class="kw3">String</span> mimetype <span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<a name="Implementation"></a><h3> <span class="mw-headline"> Implementation </span></h3>
<p>CompoundManagement and CompoundHandlerRegistry are kind of fixed components that do not need to be reimplemented by SMILA users.
CompoundHandler implementations do the real work and contributions are expected here. We should provide one or two sample implementations (I suggest one for zip files). Each CompoundHandler implementation is free in how it implements its functionality. It can be done in-process using Java libs or in external processes (like executing unzip.exe). There are no restrictions on these implementations.
</p><p>The CompoundHandler interface could support SCA but except for the technology independence I do not see a big gain here. CompoundHandlers should not be executed remotely!
</p>
<a name="CompoundManagement_vs._Splitter"></a><h3> <span class="mw-headline"> CompoundManagement vs. Splitter </span></h3>
<p>CompoundManagement and Splitter functionality basically offer the same functionality:
</p>
<ul><li> input: one object
</li><li> output N objects
</li></ul>
<p>The usage of both is slightly different:
</p>
<ul><li> CompoundManagement
<ul><li> is used in the IRM (in general "near" the data source)
</li><li> multiple types of compounds must be processed dynamically
</li></ul>
</li><li> Splitter
<ul><li> is used in BPEL to provide Chapter or Page wise indexing
</li><li> usually only a single object type is split, because splitting is most likely based on INSO output and not done on the raw data
</li></ul>
</li></ul>
<p>Therefore we should provide a BPEL service for Splitting. This service should be configurable to support splitting of one concrete type. Internally we can reuse the concept of CompoundManagement, registering just a single CompoundHandler.
</p>
<!--
NewPP limit report
Preprocessor node count: 33/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:15227-0!1!0!!en!2!edit=0 and timestamp 20120203101503 -->
<div class="printfooter">
Retrieved from "<a href="CompoundManagement.html">http://wiki.eclipse.org/SMILA/Project_Concepts/CompoundManagement</a>"</div>
<!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 07:01, 12 May 2009 by <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>. </p>
<p id="footerviews">This page has been accessed 2,020 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.250 secs. --></body></html>