blob: f0842c0207dd995cb8453cd4aa9a0077d3a259ec [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/Importing/CompoundExtractorService,SMILA/Documentation/MimeTypeIdentifier" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/Importing/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/Importing/CompoundExtractorService - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
// Global page and site settings exposed as script variables.
var
// Skin and path settings.
skin = "eclipsenova",
stylepath = "/skins",
wgArticlePath = "/$1",
wgScriptPath = "",
wgScript = "/index.php",
wgServer = "http://wiki.eclipse.org",
// Namespace and identity of the current page.
wgCanonicalNamespace = "",
wgCanonicalSpecialPageName = false,
wgNamespaceNumber = 0,
wgPageName = "SMILA/Documentation/Importing/CompoundExtractorService",
wgTitle = "SMILA/Documentation/Importing/CompoundExtractorService",
wgAction = "view",
wgRestrictionEdit = [],
wgRestrictionMove = [],
wgArticleId = "35983",
wgIsArticle = true,
// User-related values (no user name or groups are set here).
wgUserName = null,
wgUserGroups = null,
wgUserLanguage = "en",
wgContentLanguage = "en",
wgBreakFrames = false,
// Revision, software version and API switches.
wgCurRevisionId = "302634",
wgVersion = "1.12.0",
wgEnableAPI = true,
wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<link rel="stylesheet" type="text/css" href="CompoundExtractorService.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_Importing_CompoundExtractorService">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Importing/CompoundExtractorService">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<!-- Search form: "Go" opens a page with the exact name if one exists, "Search" runs a text search. -->
<form action="http://wiki.eclipse.org/Special:Search" >
<!-- id="searchInput" is the anchor targeted by the "Jump to: search" link in the page body. -->
<input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" />
<!-- No onclick handlers: type="submit" inputs submit the form natively, and input elements have no submit() method. -->
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Importing/CompoundExtractorService">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Importing/CompoundExtractorService">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/CompoundExtractorService&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/CompoundExtractorService&amp;oldid=302634">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="CompoundExtractorService.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Importing/CompoundExtractorService&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/CompoundExtractorService&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/CompoundExtractorService&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Importing/CompoundExtractorService"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/Importing/CompoundExtractorService</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../../SMILA.html" title="SMILA">SMILA</a> | <a href="../../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="CompoundExtractorService.html#column-one">navigation</a>, <a href="CompoundExtractorService.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="CompoundExtractorService.html#CompoundExtractor_Service"><span class="tocnumber">1</span> <span class="toctext">CompoundExtractor Service</span></a></li>
<li class="toclevel-1"><a href="CompoundExtractorService.html#Implementations"><span class="tocnumber">2</span> <span class="toctext">Implementations</span></a>
<ul>
<li class="toclevel-2"><a href="CompoundExtractorService.html#SimpleCompoundExtractorService"><span class="tocnumber">2.1</span> <span class="toctext">SimpleCompoundExtractorService</span></a>
<ul>
<li class="toclevel-3"><a href="CompoundExtractorService.html#Configuration"><span class="tocnumber">2.1.1</span> <span class="toctext">Configuration</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="CompoundExtractorService.html#CommonsCompressCompoundExtractorService"><span class="tocnumber">2.2</span> <span class="toctext">CommonsCompressCompoundExtractorService</span></a>
<ul>
<li class="toclevel-3"><a href="CompoundExtractorService.html#Configuration_2"><span class="tocnumber">2.2.1</span> <span class="toctext">Configuration</span></a></li>
</ul>
</li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="CompoundExtractor_Service"></a><h2> <span class="mw-headline"> CompoundExtractor Service </span></h2>
<p>Interface: <tt>org.eclipse.smila.importing.CompoundExtractor</tt>
</p><p>A CompoundExtractor service provides two kinds of methods:
</p>
<ul><li> check if an object's filename, URL or mimetype identifies it as a compound object that can be extracted by the service.
</li><li> extract the compound: Given an InputStream with the compound content produce records for the elements.
</li></ul>
<p>The element records can contain the following attributes:
</p>
<ul><li> <tt>fileName</tt>: the complete name of the entry in the compound object, usually something like a filesystem path
</li><li> <tt>isCompound</tt>: set to <tt>true</tt> if the element is a supported compound object itself.
</li><li> <tt>size</tt>: uncompressed size of the element
</li><li> <tt>time</tt>: last modification timestamp, as a datetime value.
</li><li> <tt>compressedSize</tt>: compressed size of the element
</li><li> <tt>comment</tt>: a comment for the element in the compound
</li><li> <tt>isRootCompound</tt>: set to <tt>true</tt> if the record describes the processed compound object itself.
</li><li> <tt>compounds</tt>: a sequence of the compound files to look into to reach this element. For example, if the compound <tt>/data/compound.zip</tt> contains a file <tt>archived/subcompound.zip</tt> which contains a file <tt>x.html</tt>, the <tt>compounds</tt> list for <tt>x.html</tt> would be: <pre>[/data/compound.zip, archived/subcompound.zip]</pre>
</li></ul>
<p>Not all attributes need to be set for all compound types.
</p>
<a name="Implementations"></a><h2> <span class="mw-headline"> Implementations </span></h2>
<a name="SimpleCompoundExtractorService"></a><h3> <span class="mw-headline"> SimpleCompoundExtractorService </span></h3>
<p>Bundle: <tt>org.eclipse.smila.importing.compounds.simple</tt>
</p><p>This extractor service uses the classes provided by the JDK's <tt>java.util.zip</tt> package to extract compound objects. This means that it can currently support ZIP files and GZ files (not TAR.GZ, though).
</p><p>Supported Mimetypes:
</p>
<ul><li> <tt>application/zip</tt>
</li><li> <tt>application/x-gunzip</tt>
</li><li> <tt>application/x-gzip</tt>
</li></ul>
<p>If the mimetype is not provided by the caller at all or it is only <tt>application/octet-stream</tt> it uses the current <a href="../MimeTypeIdentifier.html" title="SMILA/Documentation/MimeTypeIdentifier">MimeType Identifier service</a> to recognize the real mimetype from the filename extension.
</p><p>The compound types are treated differently:
</p>
<ul><li> For ZIP files, it creates one record for the ZIP file itself and one record for each contained element.
</li><li> For GZ files, it creates one record for the compressed file with the original filename of the GZ file but no content, and one record for the content of the uncompressed file.
</li></ul>
<a name="Configuration"></a><h4> <span class="mw-headline"> Configuration </span></h4>
<p>The simple extractor service can be configured by means of a properties file <tt>configuration/org.eclipse.smila.importing.compounds.simple/extractor.properties</tt>.
</p><p>The configuration properties are as follows:
</p>
<ul><li> <tt>zip.encoding</tt> the encoding to use when extracting ZIP files that do not use UTF-8 (default: <tt>UTF-8</tt>)
<ul><li> example: <tt>zip.encoding=CP850</tt>
</li><li> This property will only yield an effect when SMILA is run on a JRE prior to JRE 7. Since JRE 7, using non-UTF-8 ZIPs with special characters will almost always throw an IllegalArgumentException, since JRE 7's zip solution does not honor the properties used by the previous solution, but software written for Java prior to Java 7 has no other means of configuring another code page to use for such zip files.
</li></ul>
</li><li> <tt>tmp.dir</tt> the temporary directory to extract compounds to, per default a directory named <tt>org.eclipse.smila.importing.compounds.simple</tt> is created in the user's temporary folder (e.g. on Windows 7 something like <tt>C:\Users\&lt;username&gt;\AppData\Local\Temp\org.eclipse.smila.importing.compounds.simple</tt>).
<ul><li> example: <tt>tmp.dir=/temp/SMILA.compound.extractor/</tt>
</li></ul>
</li></ul>
<a name="CommonsCompressCompoundExtractorService"></a><h3> <span class="mw-headline"> CommonsCompressCompoundExtractorService </span></h3>
<div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;">
<div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Idea.png" class="image" title="Idea.png"><img alt="" src="http://wiki.eclipse.org/images/a/a4/Idea.png" width="35" height="35" border="0" /></a></div>
<div><b>Not yet in SMILA!</b>
As soon as we are allowed to use the <a href="http://commons.apache.org/compress" class="external text" title="http://commons.apache.org/compress" rel="nofollow">Apache Commons Compress</a> library we will provide an extractor service based on this library and then be able to support more compound types.<br /></div>
</div>
<p>Bundle: <tt>org.eclipse.smila.importing.compounds.compress</tt>
</p><p>Supported Compound Formats:
</p>
<ul><li> Archives
<ul><li> <tt>zip</tt>
</li><li> <tt>tar</tt>
</li><li> <tt>cpio</tt>
</li><li> <tt>java-archive</tt>
</li></ul>
</li><li> Compressions
<ul><li> <tt>bzip2</tt>
</li><li> <tt>gzip</tt>
</li></ul>
</li></ul>
<a name="Configuration_2"></a><h4> <span class="mw-headline"> Configuration </span></h4>
<p>This extractor service can be configured by means of a properties file <tt>configuration/org.eclipse.smila.importing.compounds.compress/extractor.properties</tt>.
</p><p>The configuration properties are as follows:
</p>
<ul><li> <tt>zip.encoding</tt> the encoding to use when extracting ZIP files that do not use UTF-8 (default: <tt>UTF-8</tt>)
<ul><li> example: <tt>zip.encoding=CP850</tt>
</li><li> This property will only yield an effect when SMILA is run on a JRE prior to JRE 7. Since JRE 7, using non-UTF-8 ZIPs with special characters will almost always throw an IllegalArgumentException, since JRE 7's zip solution does not honor the properties used by the previous solution, but software written for Java prior to Java 7 has no other means of configuring another code page to use for such zip files.
</li></ul>
</li><li> <tt>tmp.dir</tt> the temporary directory to extract compounds to, per default a directory named <tt>org.eclipse.smila.importing.compounds.compress</tt> is created in the user's temporary folder (e.g. on Windows 7 something like <tt>C:\Users\&lt;username&gt;\AppData\Local\Temp\org.eclipse.smila.importing.compounds.compress</tt>).
<ul><li> example: <tt>tmp.dir=/temp/SMILA.compound.extractor/</tt>
</li></ul>
</li></ul>
<!--
NewPP limit report
Preprocessor node count: 90/1000000
Post-expand include size: 1252/2097152 bytes
Template argument size: 722/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:35983-0!1!0!!en!2!edit=0 and timestamp 20120710135642 -->
<div class="printfooter">
Retrieved from "<a href="CompoundExtractorService.html">http://wiki.eclipse.org/SMILA/Documentation/Importing/CompoundExtractorService</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 12:49, 18 May 2012 by <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>.</p>
<p id="footerviews">This page has been accessed 701 times.</p>
</div>
<script type="text/javascript">
// Choose the analytics host prefix that matches the protocol of the current page.
var gaJsHost = (document.location.protocol == "https:") ? "https://ssl." : "http://www.";
// Write a script tag for ga.js into the document; the %3C / %3E escapes keep a
// literal closing script tag out of this inline script.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Report the page view only when the ga.js tracker object is available. If the
// script was blocked or failed to load, _gat is undefined and using it would
// raise an uncaught ReferenceError.
if (typeof _gat !== "undefined") {
  // pageTracker stays a global (var is hoisted out of the if block).
  var pageTracker = _gat._getTracker("UA-910670-4");
  pageTracker._trackPageview();
}
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.101 secs. --></body></html>