blob: e379eb8015237df923f147ee979e7f119a4fe9fd [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/TikaPipelet,SMILA/Documentation/TikaPipelet,SMILA/Development Guidelines/How to write a Pipelet,SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets,SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets.boilerpipe,SMILA/Glossary" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/TikaPipelet - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/TikaPipelet";
var wgTitle = "SMILA/Documentation/TikaPipelet";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "39987";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "333383";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="TikaPipelet.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_TikaPipelet">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/TikaPipelet">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type='submit' onclick="this.submit();" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' onclick="this.submit();" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/TikaPipelet">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/TikaPipelet">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/TikaPipelet&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/TikaPipelet&amp;oldid=333383">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="TikaPipelet.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/TikaPipelet&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/TikaPipelet&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/TikaPipelet&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/TikaPipelet"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/TikaPipelet</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="TikaPipelet.html#column-one">navigation</a>, <a href="TikaPipelet.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="TikaPipelet.html#Bundle:_org.eclipse.smila.tika"><span class="tocnumber">1</span> <span class="toctext">Bundle: org.eclipse.smila.tika</span></a>
<ul>
<li class="toclevel-2"><a href="TikaPipelet.html#Description"><span class="tocnumber">1.1</span> <span class="toctext">Description</span></a>
<ul>
<li class="toclevel-3"><a href="TikaPipelet.html#Supported_document_types"><span class="tocnumber">1.1.1</span> <span class="toctext">Supported document types</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="TikaPipelet.html#Configuration"><span class="tocnumber">1.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="TikaPipelet.html#Configuring_the_Property_Mapping"><span class="tocnumber">1.2.1</span> <span class="toctext">Configuring the Property Mapping</span></a></li>
<li class="toclevel-3"><a href="TikaPipelet.html#Example"><span class="tocnumber">1.2.2</span> <span class="toctext">Example</span></a></li>
<li class="toclevel-3"><a href="TikaPipelet.html#Typical_Property-Names"><span class="tocnumber">1.2.3</span> <span class="toctext">Typical Property-Names</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="TikaPipelet.html#Extending_Tika"><span class="tocnumber">1.3</span> <span class="toctext">Extending Tika</span></a></li>
</ul>
</li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Bundle:_org.eclipse.smila.tika"></a><h2> <span class="mw-headline"> Bundle: <tt>org.eclipse.smila.tika</tt> </span></h2>
<a name="Description"></a><h3> <span class="mw-headline"> Description </span></h3>
<p>The TikaPipelet converts various document formats (such as PDF, Microsoft Office, OpenOffice, etc.) to plain text using <a href="../Glossary.html#Tika" title="SMILA/Glossary">Tika</a> technology: A record attachment containing the binary content can thus be converted to plain text and stored in an attribute. In addition to that, metadata properties of the document (like title, author, etc.) can be extracted and written to record attributes. To improve the Tika parsing process, it is possible to optionally pass the content-type and filename of the document stored in other record attributes via parameters <i>contentTypeAttribute</i> and <i>fileNameAttribute</i>.
</p><p>The TikaPipelet supports the configurable error handling as described in <a href="../Development_Guidelines/How_to_write_a_Pipelet.html#Implementation" class="mw-redirect" title="SMILA/Development Guidelines/How to write a Pipelet">SMILA/Development_Guidelines/How_to_write_a_Pipelet#Implementation</a>. When used in JobManager workflows, records causing errors are dropped.
</p>
<a name="Supported_document_types"></a><h4> <span class="mw-headline"> Supported document types </span></h4>
<p>By default, SMILA contains only a subset of Tika. Therefore not all document formats can be converted out-of-the-box by using the TikaPipelet. However, it's easy to extend SMILA so that the TikaPipelet supports <i>all</i> document formats, see the <a href="TikaPipelet.html#Extending_Tika" title="SMILA/Documentation/TikaPipelet"> "Extending Tika"</a> section below.
</p>
<table border="1">
<tr>
<th>Document format</th><th>supported out-of-the-box</th><th>supported by using</th><th>Hints
</th></tr>
<tr>
<td><i>Microsoft Office</i></td><td>yes</td><td>TikaPipelet</td><td>---
</td></tr>
<tr>
<td><i>OpenOffice (OpenDocument formats)</i></td><td>yes</td><td>TikaPipelet</td><td>---
</td></tr>
<tr>
<td><i>RTF</i></td><td>yes</td><td>TikaPipelet</td><td>---
</td></tr>
<tr>
<td><i>Plain text</i></td><td>yes</td><td>---</td><td>no conversion, given input text is used as "converted" text
</td></tr>
<tr>
<td><i>HTML/XML</i></td><td>yes</td><td><a href="Bundle_org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.HtmlToTextPipelet" title="SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets">HtmlToTextPipelet</a></td><td> <a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html" title="SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets.boilerpipe">BoilerpipePipelet</a> can also be used for HTML text extraction
</td></tr>
<tr>
<td><i>PDF</i></td><td>no</td><td><a href="TikaPipelet.html#Extending_Tika" title="SMILA/Documentation/TikaPipelet">Tika extension</a> </td><td> converted text will be empty with out-of-the-box SMILA, a warning will be written to the log
</td></tr>
</table>
<p>As you can see, SMILA (or rather its 'AddPipeline', which is the default indexing pipeline) by default uses the TikaPipelet only for converting <i>binary</i> document formats. When indexing text-based documents, another pipelet (<a href="Bundle_org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.HtmlToTextPipelet" title="SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets">HtmlToTextPipelet</a>) is used. However, after <a href="TikaPipelet.html#Extending_Tika" title="SMILA/Documentation/TikaPipelet"> extending Tika</a> this can be simplified by using TikaPipelet for <i>all</i> document formats.
</p>
<a name="Configuration"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<table border="1">
<tr>
<th>Property</th><th>Type</th><th>Read Type</th><th>Required</th><th>Description
</th></tr>
<tr>
<td><i>inputType</i></td><td>String&nbsp;: <i>ATTACHMENT, ATTRIBUTE</i></td><td>runtime</td><td>yes</td><td>Selects if the input is found in an attachment or attribute of the record. Usually it doesn't make sense to use "ATTRIBUTE" here because the documents to convert are binary content.
</td></tr>
<tr>
<td><i>outputType</i></td><td>String&nbsp;: <i>ATTACHMENT, ATTRIBUTE</i></td><td>runtime</td><td>yes</td><td>Selects if output should be stored in an attachment or attribute of the record
</td></tr>
<tr>
<td><i>inputName</i></td><td>String</td><td>runtime</td><td>yes</td><td>Name of input attachment or path to input attribute (process a String literal of attribute)
</td></tr>
<tr>
<td><i>outputName</i></td><td>String</td><td>runtime</td><td> yes</td><td>Name of output attachment or path to output attribute for plain text (store result as String literal of attribute)
</td></tr>
<tr>
<td><i>extractProperties</i></td><td>String</td><td>runtime</td><td>no</td><td>Specifies which metadata properties reported by Tika for the document should be written to which record attribute. See below for details.
</td></tr>
<tr>
<td><i>contentTypeAttribute</i></td><td>String</td><td>runtime</td><td>no</td><td>Parameter referencing the attribute that contains the content-type of the document. If specified the content-type is used to better guide the Tika parsing process. Tika also performs a MimeType detection and the resulting value is stored in this attribute.
</td></tr>
<tr>
<td><i>fileNameAttribute</i></td><td>String</td><td>runtime</td><td>no</td><td>Parameter referencing the attribute that contains the name of the file that was the source of the attachment content. If specified the filename is used to better guide the Tika parsing process.
</td></tr>
<tr>
<td><i>exportAsHtml</i></td><td>Boolean</td><td>runtime</td><td>no</td><td>Flag that specifies if the output should be in HTML format (true) or not (false). Plain text output (false) is default.
</td></tr>
<tr>
<td><i>pageBreak</i></td><td>Boolean</td><td>runtime</td><td>no</td><td>Flag that specifies if pageBreaks should be used to split the content into multiple output records (true) or not (false). The recordId of the output records is generated by concatenating the recordId of the input record with the pageNumber, separated by <i>#</i>, e.g. (testdoc.pdf#1). This parameter is only interpreted if exportAsHtml is <i>false</i>. Default is (false).
</td></tr>
<tr>
<td><i>pageNumberAttribute</i></td><td>String</td><td>runtime</td><td>no</td><td>Parameter that specifies the name of the attribute that should contain the extracted page number. This parameter is only interpreted if pageBreak is <i>true</i>. If not set, the page number is not set (default)
</td></tr>
<tr>
<td><i>keepHyphens</i></td><td>Boolean</td><td>runtime</td><td>no</td><td>If set to "false", hyphens are removed from words at line breaks so that the separated syllables are contracted to one word ("charac-&lt;newline&gt;teristics" becomes "characteristics"). If set to "true", this dehyphenation is disabled. Default is (false).
</td></tr>
<tr>
<td><i>maxLength</i></td><td>Long</td><td>runtime</td><td>no</td><td>The maximum number of characters to extract. If a document contains more characters than specified all remaining characters are omitted. To get all available characters just omit this Parameter. This may lead to OutOfMemory Exceptions with big documents. Default is -1 (unlimited).
</td></tr>
</table>
<p>Some notes on "maxLength" in combination with other parameters:
</p>
<ul><li> If "exportAsHtml" is set to "true", the HTML tags will not be counted when checking the limit, so the actual output will be longer than maxLength characters: The output creation stops when the "real" text content of the HTML reaches maxLength characters. After this, no additional tags will be appended either.
</li><li> The extracted text is "trimmed", so the actual output can be shorter than maxLength characters because leading and trailing whitespace is removed.
</li><li> When "keepHyphens" and "exportAsHtml" are set to "false", the actual output can be shorter than maxLength characters, because the hyphens and linebreaks are removed from the limited output. With "exportAsHtml=true", this effect will probably not be noticeable because the output will usually get longer due to the HTML tags.
</li></ul>
<p><br />
</p>
<a name="Configuring_the_Property_Mapping"></a><h4> <span class="mw-headline"> Configuring the Property Mapping </span></h4>
<p>In addition to the plain text content, Tika can extract metadata properties from documents, such as title, author, publisher, dates of publication, etc. The names of these properties depend very much on the documents and what is actually extracted. Some well-known names like Dublin Core (dc, dcterms) are used. For a complete list please refer to the <a href="../Glossary.html#Tika" title="SMILA/Glossary">Tika</a> documentation. To check with your documents you can download Tika and use the Tika Application to see all extracted metadata.
</p><p>To store such metadata properties in SMILA records, you must specify the names of the properties you want to store in the <i>extractProperties</i> parameter. Usually this parameter contains a sequence of maps. The map values have the following format:
</p>
<table border="1">
<tr>
<th>Property</th><th>Type</th><th>Read Type</th><th>Required</th><th>Description
</th></tr>
<tr>
<td><i>metadataName</i></td><td>String</td><td>runtime</td><td>yes</td><td>The name of the metadata property. This will be matched with the extracted metadata property names in a case-insensitive manner.
</td></tr>
<tr>
<td><i>targetAttribute</i></td><td>String</td><td>runtime</td><td>no</td><td>The name of the record attribute to store the metadata value(s) in. If not set, the string provided in <i>metadataName</i> will be used as the attribute name.
</td></tr>
<tr>
<td><i>singleResult</i></td><td>Boolean</td><td>runtime</td><td>no</td><td> Flag that specifies if only the first value (if multiple values exist) is used in the result (true) or if all values are used (false). Default is false.
</td></tr>
<tr>
<td><i>storeMode</i></td><td>String </td><td>runtime</td><td>no</td><td> Specifies whether attributes already stored in the record target attribute will be left unchanged ("leave"), overwritten ("overwrite") or if the extracted properties will be added to potentially existing ones ("add"). Default is "add".
</td></tr>
</table>
<a name="Example"></a><h4> <span class="mw-headline"> Example </span></h4>
<p>The following example shows how to configure the pipelet to extract the text from the attachment called <i>Content</i> and store the extracted text in the attribute <i>Text</i>. Additionally, the metadata properties Company, Creator and Title, if present in the document, will be stored in record attributes.
</p><p>E.g. if a Word document with the value "ACME" as company and "John Doe" as creator is processed, the resulting record would contain the plain text in the attribute <tt>Text</tt>, the value <tt>ACME</tt> in the attribute <tt>Company</tt>, as well as the value <tt>John Doe</tt> in an attribute <tt>Creator</tt>.
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputName&quot;</span><span class="re2">&gt;</span></span>Content<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputType&quot;</span><span class="re2">&gt;</span></span>ATTACHMENT<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputName&quot;</span><span class="re2">&gt;</span></span>Text<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputType&quot;</span><span class="re2">&gt;</span></span>ATTRIBUTE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;contentTypeAttribute&quot;</span><span class="re2">&gt;</span></span>MimeType<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;fileNameAttribute&quot;</span><span class="re2">&gt;</span></span>FileName<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;exportAsHtml&quot;</span><span class="re2">&gt;</span></span>false<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;pageBreak&quot;</span><span class="re2">&gt;</span></span>false<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;keepHyphens&quot;</span><span class="re2">&gt;</span></span>false<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;maxLength&quot;</span><span class="re2">&gt;</span></span>100000<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Seq</span> <span class="re0">key</span>=<span class="st0">&quot;extractProperties&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;rec:Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;metadataName&quot;</span><span class="re2">&gt;</span></span>company<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;targetAttribute&quot;</span><span class="re2">&gt;</span></span>Company<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;singleResult&quot;</span><span class="re2">&gt;</span></span>false<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/rec:Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;metadataName&quot;</span><span class="re2">&gt;</span></span>creator<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;targetAttribute&quot;</span><span class="re2">&gt;</span></span>Creator<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;singleResult&quot;</span><span class="re2">&gt;</span></span>false<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/rec:Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;metadataName&quot;</span><span class="re2">&gt;</span></span>title<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;targetAttribute&quot;</span><span class="re2">&gt;</span></span>Title<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;singleResult&quot;</span><span class="re2">&gt;</span></span>true<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/rec:Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/rec:Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span></pre></div>
<a name="Typical_Property-Names"></a><h4> <span class="mw-headline"> Typical Property-Names </span></h4>
<ul><li> Generic
<ul><li>"contributor"
</li><li>"coverage"
</li><li>"creator"
</li><li>"description"
</li><li>"format"
</li><li>"identifier"
</li><li>"language"
</li><li>"modified"
</li><li>"publisher"
</li><li>"relation"
</li><li>"rights"
</li><li>"source"
</li><li>"subject"
</li><li>"title"
</li><li>"type"
</li></ul>
</li></ul>
<ul><li> MS Office
<ul><li>"Application-Name"
</li><li>"Application-Version"
</li><li>"Author"
</li><li>"Category"
</li><li>"Comments"
</li><li>"Company"
</li><li>"Content-Status"
</li><li>"Edit-Time"
</li><li>"Keywords"
</li><li>"Last-Author"
</li><li>"Manager"
</li><li>"Notes"
</li><li>"Presentation-Format"
</li><li>"Revision-Number"
</li><li>"Security"
</li><li>"Template"
</li><li>"Total-Time"
</li><li>"custom:"
</li><li>"Version"
</li></ul>
</li></ul>
<p><br />
</p>
<a name="Extending_Tika"></a><h3> <span class="mw-headline"> Extending Tika </span></h3>
<p>SMILA does not contain the complete Tika distribution, because some converters need third party libraries with licenses that we are not allowed to distribute. However, it is easy (and absolutely legal!) to include those parts of Tika into your SMILA installation yourself:
</p>
<ul><li> Download org.eclipse.smila.tika.deps bundle from <a href="http://ubuntuone.com/1n9PNxx6akZ0X1Bc7ahYrm" class="external text" title="http://ubuntuone.com/1n9PNxx6akZ0X1Bc7ahYrm" rel="nofollow">here</a>
</li><li> Replace the appropriate bundle of your SMILA distribution with the downloaded bundle by just copying the downloaded bundle to <tt>&lt;path-to-your-SMILA&gt;/plugins</tt> folder.
</li></ul>
<p>That's it! After restarting SMILA, all document formats supported by Tika will also be supported by SMILA's TikaPipelet.
</p>
<!--
NewPP limit report
Preprocessor node count: 19/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:39987-0!1!0!!en!2!edit=0 and timestamp 20130416060940 -->
<div class="printfooter">
Retrieved from "<a href="TikaPipelet.html">http://wiki.eclipse.org/SMILA/Documentation/TikaPipelet</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Categories</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span> | <span dir='ltr'><a href="http://wiki.eclipse.org/index.php?title=Category:SMILA/Pipelet&amp;action=edit" class="new" title="Category:SMILA/Pipelet">SMILA/Pipelet</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2013 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 11:13, 10 April 2013 by <a href="http://wiki.eclipse.org/User:Andreas.weber.empolis.com" title="User:Andreas.weber.empolis.com">Andreas Weber</a>. Based on work by <a href="http://wiki.eclipse.org/User:Juergen.schumacher.empolis.com" title="User:Juergen.schumacher.empolis.com">Juergen Schumacher</a>, <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.empolis.com&amp;action=edit" class="new" title="User:Nadine.auslaender.empolis.com">Nadine Ausländer</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Daniel.stucky.attensity.com&amp;action=edit" class="new" title="User:Daniel.stucky.attensity.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/TikaPipelet&amp;action=credits" title="SMILA/Documentation/TikaPipelet">others</a>.</p>
<p id="footerviews">This page has been accessed 440 times.</p>
</div>
<script type="text/javascript">
// Choose the Google Analytics host prefix that matches the page's protocol:
// the plain-HTTP host by default, the SSL host when the page is served over HTTPS.
var gaJsHost = "http://www.";
if (document.location.protocol == "https:") {
  gaJsHost = "https://ssl.";
}
// Write the ga.js <script> tag into the document while it is being parsed.
// The tag is kept percent-escaped and decoded with unescape() so that no
// literal closing script tag appears inside this inline script.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Record a page view with Google Analytics.
// _gat is only defined once ga.js (written into the page by the preceding
// script) has loaded. If that request is blocked or fails, referencing _gat
// would throw a ReferenceError, so tracking is skipped in that case.
// pageTracker stays a global `var`, as before.
if (typeof _gat !== "undefined") {
  var pageTracker = _gat._getTracker("UA-910670-4");
  pageTracker._trackPageview();
}
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">
// Call the page's onload hook, but only when one has been defined on window.
if (window.runOnloadHook) {
  window.runOnloadHook();
}
</script>
</div>
<!-- Served in 0.126 secs. --></body></html>