blob: be8c672dd58c3e1972767ea6ce6fa85d0b683c80 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/2011.Simplification/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
// Page-level configuration values for this wiki page view.
// Each is declared with a top-level `var`; presumably the external scripts
// loaded after this block (wikibits.js, ajax.js) read them as globals -- TODO confirm.

// Skin name and URL path fragments.
var skin = "eclipsenova";
var stylepath = "/skins";
// NOTE(review): "$1" looks like a placeholder for the page title -- confirm against consumers.
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
// Namespace / special-page information for the current page.
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
// Page name and title hold the same string for this page.
var wgPageName = "SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets";
var wgTitle = "SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets";
var wgAction = "view";
// Empty restriction lists for edit and move.
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
// Note: the article id is a string literal, not a number.
var wgArticleId = "30064";
var wgIsArticle = true;
// Both user values are null here; presumably this means an anonymous visitor -- verify.
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
// Note: the revision id is a string literal, not a number.
var wgCurRevisionId = "242421";
var wgVersion = "1.12.0";
// API flags: read API enabled, write API disabled.
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="org.eclipse.smila.processing.pipelets.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_2011_Simplification_org_eclipse_smila_processing_pipelets">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type='submit' onclick="this.submit();" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' onclick="this.submit();" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets&amp;oldid=242421">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="org.eclipse.smila.processing.pipelets.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../../SMILA.html" title="SMILA">SMILA</a> | <a href="../../Documentation.html" title="SMILA/Documentation">Documentation</a> | <a href="../2011.Simplification.html" title="SMILA/Documentation/2011.Simplification">2011.Simplification</a></span></div>
<div id="jump-to-nav">Jump to: <a href="org.eclipse.smila.processing.pipelets.html#column-one">navigation</a>, <a href="org.eclipse.smila.processing.pipelets.html#searchInput">search</a></div> <!-- start content -->
<p>This page describes the SMILA pipelets provided by bundle <tt>org.eclipse.smila.processing.pipelets</tt>.
</p>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.CommitRecordsPipelet"><span class="tocnumber">1</span> <span class="toctext">org.eclipse.smila.processing.pipelets.CommitRecordsPipelet</span></a>
<ul>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Description"><span class="tocnumber">1.1</span> <span class="toctext">Description</span></a></li>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Configuration"><span class="tocnumber">1.2</span> <span class="toctext">Configuration</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.AddValuesPipelet"><span class="tocnumber">2</span> <span class="toctext">org.eclipse.smila.processing.pipelets.AddValuesPipelet</span></a></li>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#Configuration_2"><span class="tocnumber">3</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Example"><span class="tocnumber">3.1</span> <span class="toctext">Example</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.HtmlToTextPipelet"><span class="tocnumber">4</span> <span class="toctext">org.eclipse.smila.processing.pipelets.HtmlToTextPipelet</span></a>
<ul>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Description_2"><span class="tocnumber">4.1</span> <span class="toctext">Description</span></a></li>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Configuration_3"><span class="tocnumber">4.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="org.eclipse.smila.processing.pipelets.html#Example_2"><span class="tocnumber">4.2.1</span> <span class="toctext">Example</span></a></li>
</ul>
</li>
</ul>
</li>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.CopyPipelet"><span class="tocnumber">5</span> <span class="toctext">org.eclipse.smila.processing.pipelets.CopyPipelet</span></a>
<ul>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Description_3"><span class="tocnumber">5.1</span> <span class="toctext">Description</span></a></li>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Configuration_4"><span class="tocnumber">5.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="org.eclipse.smila.processing.pipelets.html#Example_3"><span class="tocnumber">5.2.1</span> <span class="toctext">Example</span></a></li>
</ul>
</li>
</ul>
</li>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.SubAttributeExtractorPipelet"><span class="tocnumber">6</span> <span class="toctext">org.eclipse.smila.processing.pipelets.SubAttributeExtractorPipelet</span></a>
<ul>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Description_4"><span class="tocnumber">6.1</span> <span class="toctext">Description</span></a></li>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Configuration_5"><span class="tocnumber">6.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="org.eclipse.smila.processing.pipelets.html#Example_4"><span class="tocnumber">6.2.1</span> <span class="toctext">Example</span></a></li>
</ul>
</li>
</ul>
</li>
<li class="toclevel-1"><a href="org.eclipse.smila.processing.pipelets.html#Bundle:_org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet"><span class="tocnumber">7</span> <span class="toctext">Bundle: org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet</span></a>
<ul>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Description_5"><span class="tocnumber">7.1</span> <span class="toctext">Description</span></a></li>
<li class="toclevel-2"><a href="org.eclipse.smila.processing.pipelets.html#Configuration_6"><span class="tocnumber">7.2</span> <span class="toctext">Configuration</span></a>
<ul>
<li class="toclevel-3"><a href="org.eclipse.smila.processing.pipelets.html#Example_5"><span class="tocnumber">7.2.1</span> <span class="toctext">Example</span></a></li>
</ul>
</li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="org.eclipse.smila.processing.pipelets.CommitRecordsPipelet"></a><h2> <span class="mw-headline"> org.eclipse.smila.processing.pipelets.CommitRecordsPipelet </span></h2>
<a name="Description"></a><h3> <span class="mw-headline"> Description </span></h3>
<p>Commits each record in the <i>input</i> variable on the blackboard to the storages. Can be used to save the records immediately during the workflow instead of only when a workflow has been finished.
</p>
<a name="Configuration"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<p>none.
</p>
<a name="org.eclipse.smila.processing.pipelets.AddValuesPipelet"></a><h2> <span class="mw-headline"> org.eclipse.smila.processing.pipelets.AddValuesPipelet </span></h2>
<p>Adds values to an attribute in the processed records. If the attribute does not contain a sequence already, the current value will be wrapped in one before the new values are added.
</p>
<a name="Configuration_2"></a><h2> <span class="mw-headline"> Configuration </span></h2>
<table border="1">
<tr>
<th>Property
</th><th>Type
</th><th>Description
</th></tr>
<tr>
<td><i>outputAttribute</i>
</td><td>a string value
</td><td>name of attribute to add values to.
</td></tr>
<tr>
<td><i>valuesToAdd</i>
</td><td>anything, usually a value or a sequence of values
</td><td>the values to add
</td></tr></table>
<a name="Example"></a><h3> <span class="mw-headline"> Example </span></h3>
<p>From a test pipeline: This adds two string values to whatever already exists in attribute "out" of the processed records.
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;addValuesToNonExistingAttribute&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.AddValuesPipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputAttribute&quot;</span><span class="re2">&gt;</span></span>out<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Seq</span> <span class="re0">key</span>=<span class="st0">&quot;valuesToAdd&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;rec:Val<span class="re2">&gt;</span></span></span>value1<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val<span class="re2">&gt;</span></span></span>value2<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/rec:Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span></pre></div>
<a name="org.eclipse.smila.processing.pipelets.HtmlToTextPipelet"></a><h2> <span class="mw-headline"> org.eclipse.smila.processing.pipelets.HtmlToTextPipelet </span></h2>
<a name="Description_2"></a><h3> <span class="mw-headline"> Description </span></h3>
<p>Extracts plain text and metadata from an HTML document in an attribute or attachment of each record and writes it to configurable attributes or attachments.
</p><p>The pipelet uses the CyberNeko HTML parser <a href="http://nekohtml.sourceforge.net/" class="external text" title="http://nekohtml.sourceforge.net/" rel="nofollow">NekoHTML</a> to parse HTML documents.
</p>
<a name="Configuration_3"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<table border="1">
<tr>
<th>Property
</th><th>Type
</th><th>Description
</th></tr>
<tr>
<td><i>inputType</i>
</td><td>String&nbsp;: <i>ATTACHMENT, ATTRIBUTE</i>
</td><td>selects if the HTML input is found in an attachment or attribute of the record
</td></tr>
<tr>
<td><i>outputType</i>
</td><td>String&nbsp;: <i>ATTACHMENT, ATTRIBUTE</i>
</td><td>selects if the plain text should be stored in an attachment or attribute of the record
</td></tr>
<tr>
<td><i>inputName</i>
</td><td>String
</td><td>name of input attachment or path to input attribute (process literals of attribute)
</td></tr>
<tr>
<td><i>outputName</i>
</td><td>String
</td><td> name of output attachment or path to output attribute for plain text (store result as literals of attribute)
</td></tr>
<tr>
<td><i>removeContentTags</i>
</td><td>String
</td><td>comma separated list of HTML tags (case insensitive) for which the complete content should be removed from the resulting plain text. If not set, it defaults to <i>"applet,frame,object,script,style"</i>. If the value is set, you must add the default tags explicitly to have their contents removed, too.
</td></tr>
<tr>
<td><i>meta:&lt;name&gt;</i>
</td><td>String: attribute path
</td><td>store the content of the <tt>&lt;META&gt;</tt> tag with <i>name="&lt;name&gt;"</i> (case insensitive) to the attribute named as the value of the property. E.g. a property named <i>"meta:author"</i> with value "authors" causes the content attributes of <tt>&lt;META name="author" content="..."&gt;</tt> tags to be stored in the attribute <i>authors</i> of the respective record.
</td></tr>
<tr>
<td><i>tag:title</i>
</td><td>String: attribute path
</td><td>store the content of the <tt>&lt;TITLE&gt;</tt> tag to the attribute named as the value of the property.
</td></tr></table>
<a name="Example_2"></a><h4> <span class="mw-headline"> Example </span></h4>
<p>This configuration extracts plain text from the HTML document in attachment <i>"html"</i> and stores it in the attribute <i>"text"</i>. It removes the complete content of heading tags <tt>&lt;h1&gt;, ..., &lt;h4&gt;</tt>. Additionally it looks for <tt>&lt;meta&gt;</tt> tags with names <i>"author"</i>, <i>"keywords"</i> and <i>"title"</i> and stores their contents in attributes <i>"author"</i>, <i>"keywords"</i> and <i>"title"</i>, respectively:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;extensionActivity<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;invokeHtml2Txt&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.HtmlToTextPipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re0">output</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputType&quot;</span><span class="re2">&gt;</span></span>ATTACHMENT<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputType&quot;</span><span class="re2">&gt;</span></span>ATTRIBUTE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputName&quot;</span><span class="re2">&gt;</span></span>html<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputName&quot;</span><span class="re2">&gt;</span></span>text<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;meta:author&quot;</span><span class="re2">&gt;</span></span>author<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;meta:keywords&quot;</span><span class="re2">&gt;</span></span>keywords<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;meta:title&quot;</span><span class="re2">&gt;</span></span>title<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;removeContentTags&quot;</span><span class="re2">&gt;</span></span>h1,h2,h3,h4<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/extensionActivity<span class="re2">&gt;</span></span></span></pre></div>
<a name="org.eclipse.smila.processing.pipelets.CopyPipelet"></a><h2> <span class="mw-headline"> org.eclipse.smila.processing.pipelets.CopyPipelet </span></h2>
<a name="Description_3"></a><h3> <span class="mw-headline"> Description </span></h3>
<p>This pipelet can be used to copy a String value between attributes and/or attachments. It supports two execution modes:
</p>
<ul><li> COPY: copy the value from the input attribute/attachment to the output attribute/attachment
</li><li> MOVE: same as COPY, but after that delete the value from the input attribute/attachment
</li></ul>
<a name="Configuration_4"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<table border="1">
<tr>
<th>Property
</th><th>Type
</th><th>Description
</th></tr>
<tr>
<td><i>inputType</i>
</td><td>String&nbsp;: <i>ATTACHMENT, ATTRIBUTE</i>
</td><td>selects if the input is found in an attachment or attribute of the record
</td></tr>
<tr>
<td><i>outputType</i>
</td><td>String&nbsp;: <i>ATTACHMENT, ATTRIBUTE</i>
</td><td>selects if output should be stored in an attachment or attribute of the record
</td></tr>
<tr>
<td><i>inputName</i>
</td><td>String
</td><td>name of input attachment or path to input attribute (process a String literal of attribute)
</td></tr>
<tr>
<td><i>outputName</i>
</td><td>String
</td><td> name of output attachment or path to output attribute for plain text (store result as String literal of attribute)
</td></tr>
<tr>
<td><i>mode</i>
</td><td>String&nbsp;: <i>COPY, MOVE</i>
</td><td> execution mode. Copy the value or move (copy and delete) the value. Default is COPY.
</td></tr>
</table>
<a name="Example_3"></a><h4> <span class="mw-headline"> Example </span></h4>
<p>This configuration shows how to copy the value of attachment 'Content' into the attribute 'TextContent':
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="coMULTI">&lt;!-- copy txt from attachment to attribute --&gt;</span></span>
<span class="sc3"><span class="re1">&lt;extensionActivity<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;invokeCopyContent&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.CopyPipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re0">output</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputType&quot;</span><span class="re2">&gt;</span></span>ATTACHMENT<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputType&quot;</span><span class="re2">&gt;</span></span>ATTRIBUTE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputName&quot;</span><span class="re2">&gt;</span></span>Content<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputName&quot;</span><span class="re2">&gt;</span></span>TextContent<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;mode&quot;</span><span class="re2">&gt;</span></span>COPY<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/extensionActivity<span class="re2">&gt;</span></span></span></pre></div>
<a name="org.eclipse.smila.processing.pipelets.SubAttributeExtractorPipelet"></a><h2> <span class="mw-headline"> org.eclipse.smila.processing.pipelets.SubAttributeExtractorPipelet </span></h2>
<a name="Description_4"></a><h3> <span class="mw-headline"> Description </span></h3>
<p>Extracts Literal values from an attribute that has nested maps. The attributes in a nested map can have nested maps themselves. To address an attribute in the nested structure, a path needs to be specified. The pipelet supports different execution modes:
</p>
<ul><li>FIRST: selects only the first literal of the specified attribute
</li><li>LAST: selects only the last literal of the specified attribute
</li><li>ALL_AS_LIST: selects all literal values of the specified attribute and returns a list
</li><li>ALL_AS_ONE: selects all literal values of the specified attribute and concatenates them to a single string, using a separator (default is blank)
</li></ul>
<p>This pipelet works only on attributes, not on attachments!
</p><p><b>Note</b>:
If the maps on the path are nested in sequences, the pipelet uses the first element of such a sequence.
</p>
<a name="Configuration_5"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<table border="1">
<tr>
<th>Property
</th><th>Type
</th><th>Description
</th></tr>
<tr>
<td><i>inputPath</i>
</td><td>String
</td><td>the path to the input attribute with Literals
</td></tr>
<tr>
<td><i>outputPath</i>
</td><td>String
</td><td>the name of the attribute in which to store the extracted value(s) as Literals (currently only a top-level attribute name, not a path)
</td></tr>
<tr>
<td><i>mode</i>
</td><td>String&nbsp;: <i>FIRST, LAST, ALL_AS_LIST, ALL_AS_ONE</i>
</td><td> execution mode. See above for details.
</td></tr>
<tr>
<td><i>separator</i>
</td><td>String
</td><td> the separation string used for mode ALL_AS_ONE. Default is a blank
</td></tr>
</table>
<a name="Example_4"></a><h4> <span class="mw-headline"> Example </span></h4>
<p>This configuration can be applied to records provided by the FeedAgent. It shows how to access the subattribute 'Value' of the attribute 'Contents', concatenating all of its values into a single string:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="coMULTI">&lt;!-- extract content --&gt;</span></span>
<span class="sc3"><span class="re1">&lt;extensionActivity<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;extract content&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.SubAttributeExtractorPipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re0">output</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputPath&quot;</span><span class="re2">&gt;</span></span>Contents/Value<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputPath&quot;</span><span class="re2">&gt;</span></span>Content<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;mode&quot;</span><span class="re2">&gt;</span></span>ALL_AS_ONE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/extensionActivity<span class="re2">&gt;</span></span></span></pre></div>
<a name="Bundle:_org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet"></a><h2> <span class="mw-headline"> Bundle: org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet </span></h2>
<a name="Description_5"></a><h3> <span class="mw-headline"> Description </span></h3>
<p>This pipelet is used to identify the MIME type of a document.
It uses an <tt>org.eclipse.smila.processing.pipelets.mimetype.MimeTypeIdentifier</tt> service to perform the actual identification of the MIME type. Depending on the specified properties, the MIME type is detected from the file content, from the file extension, or from both. If the identification does not return a MIME type - and if configured accordingly - the service will search the metadata for this information. The identified MIME type is then stored to an attribute in the record.
</p><p><br />
</p>
<a name="Configuration_6"></a><h3> <span class="mw-headline"> Configuration </span></h3>
<p>The pipelet is configured using the <tt>&lt;configuration&gt;</tt> section inside the <tt>&lt;invokePipelet&gt;</tt> activity of the corresponding BPEL file. It provides the following properties:
</p>
<table border="1">
<tr>
<th>Property</th><th>Type</th><th>Usage</th><th>Description
</th></tr>
<tr>
<td><i>FileExtensionAttribute</i></td><td>String</td><td>Optional</td><td>Name of the attribute containing the file extension
</td></tr>
<tr>
<td><i>ContentAttachment</i></td><td>String</td><td>Optional</td><td>Name of the attachment containing the file content
</td></tr>
<tr>
<td><i>MetaDataAttribute</i></td><td>String</td><td>Optional</td><td>Name of the attribute containing metadata information, e.g. a Web Crawler returns a response header containing applicable MIME type information
</td></tr>
<tr>
<td><i>MimeTypeAttribute</i></td><td>String</td><td>Required</td><td>Name of the attribute to store the identified MIME type to
</td></tr></table>
<p>Note that at least one of the properties <i>FileExtensionAttribute</i>, <i>ContentAttachment</i>, and <i>MetaDataAttribute</i> must be specified!
</p>
<a name="Example_5"></a><h4> <span class="mw-headline"> Example </span></h4>
<p>The following example is used in the SMILA example application to identify the MIME types of documents that are delivered by the File System Crawler or Web Crawler.
</p><p><b>addpipeline.bpel</b>
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;extensionActivity<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;detect MimeType&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re0">output</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;FileExtensionAttribute&quot;</span><span class="re2">&gt;</span></span>Extension<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;MetaDataAttribute&quot;</span><span class="re2">&gt;</span></span>MetaData<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;MimeTypeAttribute&quot;</span><span class="re2">&gt;</span></span>MimeType<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/extensionActivity<span class="re2">&gt;</span></span></span></pre></div>
<!--
NewPP limit report
Preprocessor node count: 51/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:30064-0!1!0!!en!2!edit=0 and timestamp 20110328164123 -->
<div class="printfooter">
Retrieved from "<a href="org.eclipse.smila.processing.pipelets.html">http://wiki.eclipse.org/SMILA/Documentation/2011.Simplification/org.eclipse.smila.processing.pipelets</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Categories</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span> | <span dir='ltr'><a href="http://wiki.eclipse.org/index.php?title=Category:SMILA/Pipelet&amp;action=edit" class="new" title="Category:SMILA/Pipelet">SMILA/Pipelet</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"/>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2011 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 16:31, 11 March 2011 by <a href="http://wiki.eclipse.org/index.php?title=User:Nadine.auslaender.attensity.com&amp;action=edit" class="new" title="User:Nadine.auslaender.attensity.com"> </a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>.</p>
<p id="footerviews">This page has been accessed 269 times.</p>
</div>
<script type="text/javascript">
// Choose the Google Analytics host prefix that matches the protocol of the
// current page: the "ssl." host for HTTPS pages, the "www." host otherwise.
var gaJsHost = "http://www.";
if ("https:" == document.location.protocol) {
  gaJsHost = "https://ssl.";
}
// The markup for the ga.js script element is percent-encoded and decoded at
// runtime, so the literal closing tag never appears inside this inline script.
var gaScriptMarkup = unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E");
document.write(gaScriptMarkup);
</script>
<script type="text/javascript">
// Record a page view with Google Analytics.
// _gat is only defined once ga.js has been loaded and executed; if that script
// is blocked or fails to load, referencing _gat would throw an uncaught
// ReferenceError, so the tracker is only created when _gat exists.
if (typeof _gat !== "undefined") {
  var pageTracker = _gat._getTracker("UA-910670-4");
  pageTracker._trackPageview();
}
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">/* Call the onload hook runner, but only if an earlier script defined it on window. */ if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.068 secs. --></body></html>