blob: af5b483f3da7539fde8fe759ca5a0df0675544bf [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/CompoundManagement,SMILA/Documentation/Crawler" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/CompoundManagement - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/CompoundManagement";
var wgTitle = "SMILA/Documentation/CompoundManagement";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "19793";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "285992";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal; font-size: medium;}
.source-java li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-java .de1, .source-java .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-java {}
.source-java .head {}
.source-java .foot {}
.source-java .imp {font-weight: bold; color: red;}
.source-java .ln-xtra {color: #cc0; background-color: #ffc;}
.source-java li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-java li.li2 {font-weight: bold;}
.source-java .kw1 {color: #7F0055; font-weight: bold;}
.source-java .kw2 {color: #7F0055; font-weight: bold;}
.source-java .kw3 {color: #000000; font-weight: normal}
.source-java .kw4 {color: #7F0055; font-weight: bold;}
.source-java .co1 {color: #3F7F5F; font-style: italic;}
.source-java .co2 {color: #3F7F5F;}
.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.source-java .es0 {color: #000000;}
.source-java .br0 {color: #000000;}
.source-java .st0 {color: #2A00ff;}
.source-java .nu0 {color: #000000;}
.source-java .me1 {color: #000000;}
.source-java .me2 {color: #000000;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="CompoundManagement.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_CompoundManagement">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/CompoundManagement">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" />
<input type='submit' name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/CompoundManagement">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/CompoundManagement">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CompoundManagement&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CompoundManagement&amp;oldid=285992">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="CompoundManagement.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/CompoundManagement&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CompoundManagement&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CompoundManagement&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/CompoundManagement"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/CompoundManagement</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="CompoundManagement.html#column-one">navigation</a>, <a href="CompoundManagement.html#searchInput">search</a></div> <!-- start content -->
<div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;">
<div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Note.png" class="image" title="Note.png"><img alt="" src="http://wiki.eclipse.org/images/c/cc/Note.png" width="35" height="35" border="0" /></a></div>
<div><b>This is deprecated for SMILA 1.0. The connectivity framework is still functional but is planned to be replaced by a scalable import based on SMILA's job management.</b><br /></div>
</div>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="CompoundManagement.html#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a></li>
<li class="toclevel-1"><a href="CompoundManagement.html#API"><span class="tocnumber">2</span> <span class="toctext">API</span></a></li>
<li class="toclevel-1"><a href="CompoundManagement.html#Implementations"><span class="tocnumber">3</span> <span class="toctext">Implementations</span></a>
<ul>
<li class="toclevel-2"><a href="CompoundManagement.html#org.eclipse.smila.connectivity.framework.impl"><span class="tocnumber">3.1</span> <span class="toctext">org.eclipse.smila.connectivity.framework.impl</span></a></li>
<li class="toclevel-2"><a href="CompoundManagement.html#org.eclipse.smila.connectivity.framework.compound.zip"><span class="tocnumber">3.2</span> <span class="toctext">org.eclipse.smila.connectivity.framework.compound.zip</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="CompoundManagement.html#Configuration"><span class="tocnumber">4</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-2"><a href="CompoundManagement.html#Configuration_example"><span class="tocnumber">4.1</span> <span class="toctext">Configuration example</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Overview"></a><h2> <span class="mw-headline"> Overview </span></h2>
<p>CompoundManagement in SMILA is an extendable set of components. The central component is the CompoundManager. It manages CompoundHandlers that are each capable of extracting elements of certain types of files (like zip or chm). Each CompoundHandler registers itself at the CompoundManager providing a list of supported mime types. The CompoundManager provides functionality to check if a given record contains a compound. It uses a MimetypeIdentifier to identify the mime type of the given record and checks if any registered CompoundHandler is capable of processing records of this mime type. It then delegates the processing to the CompoundHandler which in turn creates a CompoundCrawler over the extracted elements of the compound record and passes the CompoundCrawler back.
CompoundCrawlers are just like regular Crawlers. The difference is that they work on the given compound record only and not on an external data source.
</p><p>The following chart shows all CompoundManagement components:
</p><p><a href="http://wiki.eclipse.org/Image:CompoundManagement.png" class="image" title="Image:CompoundManagement.png"><img alt="Image:CompoundManagement.png" src="http://wiki.eclipse.org/images/3/30/CompoundManagement.png" width="960" height="720" border="0" /></a>
</p><p><br />
</p>
<a name="API"></a><h2> <span class="mw-headline"> API </span></h2>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="coMULTI">/**
* The Interface CompoundManager.
*/</span>
<span class="kw1">public</span> <span class="kw1">interface</span> CompoundManager <span class="br0">&#123;</span>
&nbsp;
<span class="coMULTI">/**
* Checks if a record is a compound object.
*
* @param record
* the Record
* @param config
* the DataSourceConnectionConfig
* @return true if the record is a compound object and is extractable by this CompoundManager, false otherwise
* @throws CompoundException
* if any error occurs
*/</span>
<span class="kw4">boolean</span> isCompound<span class="br0">&#40;</span><span class="kw1">final</span> Record record, <span class="kw1">final</span> DataSourceConnectionConfig config<span class="br0">&#41;</span> <span class="kw1">throws</span> CompoundException;
&nbsp;
<span class="coMULTI">/**
* Extracts the elements of the given record and returns a Crawler over the extracted elements.
*
* @param record
* the Record
* @param config
* the DataSourceConnectionConfig
* @return a Crawler interface over the extracted elements
* @throws CompoundException
* if any error occurs
*/</span>
Crawler extract<span class="br0">&#40;</span><span class="kw1">final</span> Record record, <span class="kw1">final</span> DataSourceConnectionConfig config<span class="br0">&#41;</span> <span class="kw1">throws</span> CompoundException;
&nbsp;
<span class="coMULTI">/**
* Adapts the input record according to the given configuration. The record may be left unmodified, modified or even
* set to null.
*
* @param record
* the Record
* @param config
* the DataSourceConnectionConfig
* @return the adapted record
* @throws CompoundException
* if any error occurs
*/</span>
Record adaptCompoundRecord<span class="br0">&#40;</span><span class="kw1">final</span> Record record, <span class="kw1">final</span> DataSourceConnectionConfig config<span class="br0">&#41;</span> <span class="kw1">throws</span> CompoundException;
<span class="br0">&#125;</span></pre></div>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="coMULTI">/**
* The Interface CompoundHandler.
*/</span>
<span class="kw1">public</span> <span class="kw1">interface</span> CompoundHandler <span class="br0">&#123;</span>
&nbsp;
<span class="coMULTI">/**
* Gets the mime types the CompoundHandler is capable to extract.
* @return a Collection of mime types the CompoundHandler is capable to extract.
*/</span>
Collection&lt;String&gt; getSupportedMimeTypes<span class="br0">&#40;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="coMULTI">/**
* Extracts the elements of the given record and returns a Crawler over the extracted elements.
* @param record
* the Record
* @param config
* the DataSourceConnectionConfig
* @return a Crawler interface over the extracted elements
* @throws CompoundException
* if any error occurs
*/</span>
Crawler extract<span class="br0">&#40;</span><span class="kw1">final</span> Record record, <span class="kw1">final</span> DataSourceConnectionConfig config<span class="br0">&#41;</span> <span class="kw1">throws</span> CompoundException;
<span class="br0">&#125;</span></pre></div>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="coMULTI">/**
* The Interface CompoundCrawler.
*/</span>
<span class="kw1">public</span> <span class="kw1">interface</span> CompoundCrawler <span class="kw1">extends</span> Crawler <span class="br0">&#123;</span>
&nbsp;
<span class="coMULTI">/**
* Sets the compound record to extract data from.
*
* @param record
* the compound Record
* @throws CrawlerException
* if parameter record is null
*/</span>
<span class="kw4">void</span> setCompoundRecord<span class="br0">&#40;</span><span class="kw1">final</span> Record record<span class="br0">&#41;</span> <span class="kw1">throws</span> CrawlerException;
&nbsp;
<span class="coMULTI">/**
* Gets the compound record.
*
* @return the compound record.
*/</span>
Record getCompoundRecord<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<p><br />
</p>
<a name="Implementations"></a><h2> <span class="mw-headline"> Implementations </span></h2>
<p>It is possible to provide different implementations for all components. Most important is that it is easy to extend CompoundHandling by providing new CompoundHandler implementations.
</p>
<a name="org.eclipse.smila.connectivity.framework.impl"></a><h3> <span class="mw-headline">org.eclipse.smila.connectivity.framework.impl</span></h3>
<p>This bundle contains the default implementation of the CompoundManager interface as well as some abstract base classes for CompoundHandlers and CompoundCrawlers.
</p><p>The CrawlerController implements the general processing logic common for all types of Crawlers. Its interface is a pure management interface that can be accessed by its Java interface or its wrapping JMX interface. It has references to the following OSGi services:
</p>
<ul><li> MimeTypeIdentifier (1..1)
</li><li> CompoundHandler (0..n)
</li></ul>
<p>CompoundHandlers register themselves at the CompoundManager.
</p><p>The method <tt>adaptCompoundRecord()</tt> is not implemented yet. It just returns the unmodified input record.
</p>
<dl><dt>Configuration
</dt></dl>
<p>There are no configuration options available for this bundle.
</p><p><br />
</p>
<a name="org.eclipse.smila.connectivity.framework.compound.zip"></a><h3> <span class="mw-headline">org.eclipse.smila.connectivity.framework.compound.zip</span></h3>
<p>This bundle contains an implementation to handle zip archives. It can handle the mime types
</p>
<ul><li>application/zip
</li><li>application/java-archive
</li></ul>
<p>It provides the OSGi Declarative Services <tt>ZipCompoundHandler</tt> and <tt>ZipCompoundCrawler</tt>. As with regular Crawlers the <tt>ZipCompoundCrawler</tt> is a ComponentFactory. Each time method <tt>extract(...)</tt> is called on the <tt>ZipCompoundHandler</tt> a new instance of a <tt>ZipCompoundCrawler</tt> is created. Both services don't have any dependencies to other services, except that <tt>ZipCompoundHandler</tt> references the <tt>ZipCompoundCrawler</tt>.
</p><p>For Id creation the ElementAttribute Path is used, for hash creation it's ElementAttribute LastModifiedDate.
</p><p>The generated records will contain a metadata element called <b>_compounds</b> that contains the (ordered) path through compounds to the last compound the file is contained within.
</p><p>E.g. consider the following scenario:
inside the Zip <code>/path/to/data/folder/compressed_data.zip</code> exists another zip <code>path within zip/second.zip</code> and within that zip there is a file <code>path within second zip/myfile.txt</code> then the Record would contain (among others) the following metadata elements:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;Path&quot;</span><span class="re2">&gt;</span></span>path within second zip/myfile.txt<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;Filename&quot;</span><span class="re2">&gt;</span></span>myfile.txt<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;_compounds&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>/path/to/data/folder/compressed_data.zip<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>path within zip/second.zip<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seq<span class="re2">&gt;</span></span></span></pre></div>
<p>With that information an application could work its way through the compounds to the contained file.
</p><p><b>Note</b>
The extract functionality is implemented using standard JDK zip file handling. Therefore the archives must only contain filenames in UTF-8 encoding. A lot of zip tools do not use UTF-8 but the platform default encoding. This will lead to errors for some characters (e.g. German umlauts).
</p>
<dl><dt>Configuration
</dt></dl>
<p>There are no configuration options available for this bundle.
</p>
<a name="Configuration"></a><h2> <span class="mw-headline"> Configuration </span></h2>
<p>Whether and how CompoundHandling works is configured within each DataSourceConnectionConfig. There is a special element <b>CompoundHandling</b> that contains this configuration. If this element is omitted, no CompoundHandling is done (compound records are processed as single documents). In contrast to regular Crawlers, the CompoundHandling configuration may not be overwritten by each CompoundCrawler; they all share the same configuration. In addition, it is not configurable how the keys and hashes of compound elements are created. This is determined by each CompoundCrawler implementation.
</p><p>CompoundHandling configuration contains the following sub elements:
</p>
<dl><dt>MimeTypeAttribute
</dt><dd>The name of the attribute of the compound record containing the mime type of the <i>ContentAttachment</i>. If no mime type is set, any mime type detected by CompoundHandling is stored in an attribute using this name. This parameter is optional. If not specified then ExtensionAttribute must be set!
</dd><dt>ExtensionAttribute
</dt><dd>The name of the attribute of the compound record containing the file extension. This parameter is optional. If not specified then MimeTypeAttribute must be set!
</dd><dt>ContentAttachment (required)
</dt><dd>The name of the attachment of the compound record containing the content of the compound
</dd><dt>CompoundAttributes
</dt><dd>A list of CompoundAttribute to be set on extracted compound elements
</dd><dt>CompoundAttribute
</dt><dd>Type (required) – the data type (String, Integer or Date)
</dd><dd>Name (required) – the attribute's name
</dd><dd>Attachment – specify if the attribute returns the data as an attachment instead of an attribute
</dd><dt>ElementAttribute
</dt><dd>The supported ElementAttribute types are LastModifiedDate, Path, Content, Size, FileExtension, Name
</dd></dl>
<p><br />
</p>
<a name="Configuration_example"></a><h3> <span class="mw-headline"> Configuration example </span></h3>
<p>Here is a sample snippet of a CompoundHandling configuration:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;CompoundHandling<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;MimeTypeAttribute<span class="re2">&gt;</span></span></span>MimeType<span class="sc3"><span class="re1">&lt;/MimeTypeAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;ExtensionAttribute<span class="re2">&gt;</span></span></span>Extension<span class="sc3"><span class="re1">&lt;/ExtensionAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;ContentAttachment<span class="re2">&gt;</span></span></span>Content<span class="sc3"><span class="re1">&lt;/ContentAttachment<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;Date&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;LastModifiedDate&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;ElementAttribute<span class="re2">&gt;</span></span></span>LastModifiedDate<span class="sc3"><span class="re1">&lt;/ElementAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Path&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;ElementAttribute<span class="re2">&gt;</span></span></span>Path<span class="sc3"><span class="re1">&lt;/ElementAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Content&quot;</span> <span class="re0">Attachment</span>=<span class="st0">&quot;true&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;ElementAttribute<span class="re2">&gt;</span></span></span>Content<span class="sc3"><span class="re1">&lt;/ElementAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Size&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;ElementAttribute<span class="re2">&gt;</span></span></span>Size<span class="sc3"><span class="re1">&lt;/ElementAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Extension&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;ElementAttribute<span class="re2">&gt;</span></span></span>FileExtension<span class="sc3"><span class="re1">&lt;/ElementAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;CompoundAttribute</span> <span class="re0">Type</span>=<span class="st0">&quot;String&quot;</span> <span class="re0">Name</span>=<span class="st0">&quot;Filename&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;ElementAttribute<span class="re2">&gt;</span></span></span>Name<span class="sc3"><span class="re1">&lt;/ElementAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttribute<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundAttributes<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/CompoundHandling<span class="re2">&gt;</span></span></span></pre></div>
<p>For details about the integration of this configuration part in some crawler's configuration please see <a href="Crawler.html" title="SMILA/Documentation/Crawler">Crawler</a> documentation.
</p>
<!--
NewPP limit report
Preprocessor node count: 75/1000000
Post-expand include size: 1036/2097152 bytes
Template argument size: 506/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:19793-0!1!0!!en!2!edit=0 and timestamp 20120202190336 -->
<div class="printfooter">
Retrieved from "<a href="CompoundManagement.html">http://wiki.eclipse.org/SMILA/Documentation/CompoundManagement</a>"</div>
<!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 09:44, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Drazen.cindric.attensity.com" title="User:Drazen.cindric.attensity.com">Drazen Cindric</a>, <a href="http://wiki.eclipse.org/User:Igor.novakovic.empolis.com" title="User:Igor.novakovic.empolis.com">Igor Novakovic</a> and <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/CompoundManagement&amp;action=credits" title="SMILA/Documentation/CompoundManagement">others</a>.</p>
<p id="footerviews">This page has been accessed 2,083 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.051 secs. --></body></html>