blob: 9ef3a3ce67c4512bc07fb33bce240c06521948b1 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en" dir="ltr" class="client-nojs">
<head>
<meta charset="UTF-8" />
<title>SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets.boilerpipe - Eclipsepedia</title>
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="generator" content="MediaWiki 1.23.2" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/eclipse.org-common/themes/solstice/public/images/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (en)" />
<link rel="EditURI" type="application/rsd+xml" href="http://wiki.eclipse.org/api.php?action=rsd" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom feed" href="http://wiki.eclipse.org/index.php?title=Special:RecentChanges&amp;feed=atom" />
<link rel="stylesheet" href="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.ui.button&amp;only=styles&amp;skin=solstice&amp;*" />
<link rel="stylesheet" href="http://wiki.eclipse.org/skins/solstice/public/stylesheets/styles.min.css?303" media="screen, print" /><meta name="ResourceLoaderDynamicStyles" content="" />
<style>a:lang(ar),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none}
/* cache key: my_wiki:resourceloader:filter:minify-css:7:14ece53a42aa314864e5fd8c57f0d98f */</style>
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=startup&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe","wgTitle":"SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets.boilerpipe","wgCurRevisionId":352422,"wgRevisionId":352422,"wgArticleId":37346,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["SMILA","SMILA/Pipelet"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRelevantPageName":"SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe","wgIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgWikiEditorEnabledModules":{"toolbar":false,"dialogs":false,"hidesig":true,"preview":false,"previewDialog":false,"publish":false},"wgCategoryTreePageCategoryOptions":"{\"mode\":0,\"hideprefix\":20,\"showcount\":true,\"namespaces\":false}"});
}</script><script>if(window.mw){
mw.loader.implement("user.options",function($,jQuery){mw.user.options.set({"ccmeonemails":0,"cols":80,"date":"default","diffonly":0,"disablemail":0,"editfont":"default","editondblclick":0,"editsectiononrightclick":0,"enotifminoredits":0,"enotifrevealaddr":0,"enotifusertalkpages":1,"enotifwatchlistpages":1,"extendwatchlist":0,"fancysig":0,"forceeditsummary":0,"gender":"unknown","hideminor":0,"hidepatrolled":0,"imagesize":2,"math":1,"minordefault":0,"newpageshidepatrolled":0,"nickname":"","norollbackdiff":0,"numberheadings":0,"previewonfirst":0,"previewontop":1,"rcdays":7,"rclimit":50,"rows":25,"showhiddencats":0,"shownumberswatching":1,"showtoolbar":1,"skin":"solstice","stubthreshold":0,"thumbsize":2,"underline":2,"uselivepreview":0,"usenewrc":0,"watchcreations":1,"watchdefault":1,"watchdeletion":0,"watchlistdays":3,"watchlisthideanons":0,"watchlisthidebots":0,"watchlisthideliu":0,"watchlisthideminor":0,"watchlisthideown":0,"watchlisthidepatrolled":0,"watchmoves":0,"wllimit":250,
"useeditwarning":1,"prefershttps":1,"language":"en","variant-gan":"gan","variant-iu":"iu","variant-kk":"kk","variant-ku":"ku","variant-shi":"shi","variant-sr":"sr","variant-tg":"tg","variant-uz":"uz","variant-zh":"zh","searchNs0":true,"searchNs1":false,"searchNs2":false,"searchNs3":false,"searchNs4":false,"searchNs5":false,"searchNs6":false,"searchNs7":false,"searchNs8":false,"searchNs9":false,"searchNs10":false,"searchNs11":false,"searchNs12":false,"searchNs13":false,"searchNs14":false,"searchNs15":false,"variant":"en"});},{},{});mw.loader.implement("user.tokens",function($,jQuery){mw.user.tokens.set({"editToken":"+\\","patrolToken":false,"watchToken":false});},{},{});
/* cache key: my_wiki:resourceloader:filter:minify-js:7:70d74423d3fc1e1c18fa9a1ff645a84a */
}</script>
<script>if(window.mw){
mw.loader.load(["mediawiki.page.startup","mediawiki.legacy.wikibits","mediawiki.legacy.ajax"]);
}</script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal;}
.source-xml li, .source-xml pre {
line-height: normal; border: 0px none white;
}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
* (http://qbnz.com/highlighter/ and http://geshi.org/)
* --------------------------------------
*/
.xml.source-xml .de1, .xml.source-xml .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
.xml.source-xml {font-family:monospace;}
.xml.source-xml .imp {font-weight: bold; color: red;}
.xml.source-xml li, .xml.source-xml .li1 {font-weight: normal; vertical-align:top;}
.xml.source-xml .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
.xml.source-xml .li2 {font-weight: bold; vertical-align:top;}
.xml.source-xml .es0 {color: #000099; font-weight: bold;}
.xml.source-xml .br0 {color: #66cc66;}
.xml.source-xml .sy0 {color: #66cc66;}
.xml.source-xml .st0 {color: #ff0000;}
.xml.source-xml .nu0 {color: #cc66cc;}
.xml.source-xml .sc-1 {color: #808080; font-style: italic;}
.xml.source-xml .sc0 {color: #00bbdd;}
.xml.source-xml .sc1 {color: #ddbb00;}
.xml.source-xml .sc2 {color: #339933;}
.xml.source-xml .sc3 {color: #009900;}
.xml.source-xml .re0 {color: #000066;}
.xml.source-xml .re1 {color: #000000; font-weight: bold;}
.xml.source-xml .re2 {color: #000000; font-weight: bold;}
.xml.source-xml .ln-xtra, .xml.source-xml li.ln-xtra, .xml.source-xml div.ln-xtra {background-color: #ffc;}
.xml.source-xml span.xtra { display:block; }
/*]]>*/
</style><meta name="viewport" content="width=device-width, initial-scale=1.0"></head>
<body class="mediawiki ltr sitedir-ltr ns-0 ns-subject page-SMILA_Documentation_Bundle_org_eclipse_smila_processing_pipelets_boilerpipe skin-solstice action-view" id="solstice">
<a class="sr-only" href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#content">Skip to main content</a>
<div class="thin-header">
<header role="banner" class="hidden-print noprint">
<div class="container-fluid">
<div id="row-logo-search">
<div id="header-left">
<div class="row">
<div class="hidden-xs col-sm-6 logo-container">
<a href="https://www.eclipse.org/" ><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia"></a>
</div>
<div class="navbar col-sm-18 yamm" id="main-menu">
<div class="navbar-collapse collapse" id="navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a target="_self" href="https://eclipse.org/downloads/">Download</a></li>
<li><a target="_self" href="https://eclipse.org/users/">Getting Started </a></li>
<li><a target="_self" href="https://eclipse.org/membership/">Members</a></li>
<li><a target="_self" href="https://eclipse.org/projects/">Projects</a></li>
<li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#">Community <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#">Participate <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#">Working Groups <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul></li><!-- More -->
<li class="dropdown hidden-xs"><a class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
<ul class="dropdown-menu">
<li>
<!-- Content container to add padding -->
<div class="yamm-content">
<div class="row">
<ul class="col-sm-8 list-unstyled"><li><p><strong>Community</strong></p></li><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Participate</strong></p></li><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Working Groups</strong></p></li><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul> </div>
</div>
</li>
</ul>
</li>
</ul>
</div>
<div class="navbar-header">
<button data-target="#navbar-collapse-1" data-toggle="collapse" class="navbar-toggle" type="button">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="https://www.eclipse.org/" class="visible-xs navbar-brand"><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia" width="174"></a>
</div>
</div>
</div>
</div>
</div>
</div>
</header>
<section class="defaut-breadcrumbs hidden-print noprint hidden-print clearfix" id="breadcrumb">
<div>
<ol class="breadcrumb">
<li><a href="https://www.eclipse.org/">Home</a></li>
<li><a href="http://wiki.eclipse.org/Main_Page">Eclipse Wiki</a></li>
<li class="active">SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets.boilerpipe</li></ol>
</div>
</section>
</div>
<div class="toolbar-menu breadcrumbs-offset noprint hidden-print margin-bottom-0 clearfix">
<div class="col-md-24">
<ol class="breadcrumb" role="navigation">
<li id="pt-login">
<a href="http://wiki.eclipse.org/index.php?title=Special:UserLogin&amp;returnto=SMILA%2FDocumentation%2FBundle+org.eclipse.smila.processing.pipelets.boilerpipe">
<i class="fa fa-sign-in fa-fw orange"></i> Log in </a>
</li>
</ul>
</div>
</div>
<main role="main" class="background-grey">
<div class="container-full padding-top-25">
<!-- content -->
<section id="content" class="mw-body container-full clearfix 0">
<div id="mw-js-message" style="display:none;"></div>
<!-- bodyContent -->
<div id="bodyContent">
<!-- jumpto -->
<div id="jump-to-nav" class="mw-jump">
Jump to: <a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#mw-head">navigation</a>,
<a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#p-search">search</a>
</div>
<!-- /jumpto -->
<!-- leftcol -->
<aside class="col-md-4 noprint hidden-print" id="leftcol">
<form class="input-group" role="form" id="form-eclipse-search" action="http://wiki.eclipse.org/index.php" id="searchform">
<input id="searchInput" class="search-query form-control" type="search" accesskey="f" title="Special:Search" placeholder="Search" name="search" value="">
<span class="input-group-btn">
<button value="search" id="mw-searchButton" type="submit" class="btn btn-default" title="Search the pages for this text" name="fulltext">
<i class="fa fa-search"></i>
</button>
</span>
</form>
<select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Navigation---</span></option><option value="/Main_Page">Main Page</option><option value="/Eclipsepedia:Community_portal">Community portal</option><option value="/Eclipsepedia:Current_events">Current events</option><option value="/Special:RecentChanges">Recent changes</option><option value="/Special:Random">Random page</option><option value="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents">Help</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Navigation</span></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Main_Page" id="n-mainpage" title="Visit the main page [z]" accesskey="z">Main Page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Community_portal" id="n-portal" title="About the project, what you can do, where to find things">Community portal</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Current_events" id="n-currentevents" title="Find background information on current events">Current events</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChanges" id="n-recentchanges" title="A list of recent changes in the wiki [r]" accesskey="r">Recent changes</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:Random" id="n-randompage" title="Load a random page [x]" accesskey="x">Random page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents" id="n-help" title="The place to find out">Help</a></li></ul> <select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Toolbox---</span></option><option value="/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;action=info">Page information</option><option value="/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;oldid=352422">Permanent link</option><option value="/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;printable=yes">Printable version</option><option value="/Special:SpecialPages">Special pages</option><option value="/Special:RecentChangesLinked/SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe">Related changes</option><option value="/Special:WhatLinksHere/SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe">What links here</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Toolbox</span></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;action=info" id="t-info">Page information</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;oldid=352422" id="t-permalink" title="Permanent link to this revision of the page">Permanent link</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;printable=yes" id="t-print" rel="alternate" title="Printable version of this page [p]" accesskey="p">Printable version</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:SpecialPages" id="t-specialpages" title="A list of all special pages [q]" accesskey="q">Special pages</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChangesLinked/SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe" id="t-recentchangeslinked" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:WhatLinksHere/SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe" id="t-whatlinkshere" title="A list of all wiki pages that link here [j]" accesskey="j">What links here</a></li></ul> </aside>
<!-- /leftcol -->
<!-- mainContent -->
<div id="mainContent" class="col-md-20">
<ul class="nav nav-tabs noprint hidden-print" role="tablist">
<li id="ca-nstab-main" class="active"><a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html" title="View the content page [c]" accesskey="c" tabindex="-1">Page</a></li>
<li id="ca-talk" class="new"><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;action=edit&amp;redlink=1" title="Discussion about the content page [t]" accesskey="t" tabindex="-1">Discussion</a></li>
<li id="ca-viewsource"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e" tabindex="-1">View source</a></li>
<li id="ca-history" class="collapsible"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;action=history" title="Past revisions of this page [h]" accesskey="h" tabindex="-1">History</a></li>
</ul> <div class="tab-content background-white">
<div id="tab-pane-main-page-content" class="tab-pane active">
<h1 id="firstHeading" class="firstHeading page-header">
<span dir="auto">SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets.boilerpipe</span>
</h1>
<div id="main-page-content">
<!-- subtitle -->
<div id="contentSub" class="alert alert-small alert-warning"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a>&lrm; | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<!-- /subtitle -->
<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr"><p>This page describes the SMILA pipelets provided by bundle <tt>org.eclipse.smila.processing.pipelets.boilerpipe</tt>.
</p>
<div id="toc" class="toc"><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1 tocsection-1"><a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#General"><span class="tocnumber">1</span> <span class="toctext">General</span></a></li>
<li class="toclevel-1 tocsection-2"><a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet"><span class="tocnumber">2</span> <span class="toctext">org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet</span></a>
<ul>
<li class="toclevel-2 tocsection-3"><a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#Configuration"><span class="tocnumber">2.1</span> <span class="toctext">Configuration</span></a></li>
<li class="toclevel-2 tocsection-4"><a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#Note_on_Boilerpipe.27s_extraction_process_and_memory_consumption"><span class="tocnumber">2.2</span> <span class="toctext">Note on Boilerpipe's extraction process and memory consumption</span></a></li>
<li class="toclevel-2 tocsection-5"><a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#Example"><span class="tocnumber">2.3</span> <span class="toctext">Example</span></a></li>
</ul>
</li>
</ul>
</div>
<h2><span class="mw-headline" id="General">General</span></h2>
<p>All pipelets in this bundle support the configurable error handling as described in <a href="../Development_Guidelines/How_to_write_a_Pipelet.html#Implementation" title="SMILA/Development Guidelines/How to write a Pipelet" class="mw-redirect">SMILA/Development_Guidelines/How_to_write_a_Pipelet#Implementation</a>. When used in jobmanager workflows, records causing errors are dropped.
</p><p><b> Read Type </b>
</p>
<ul>
<li> <i>runtime</i>: Parameters are read when processing records. Parameter value can be set per Record.
</li>
<li> <i>init</i>: Parameters are read once from Pipelet configuration when initializing the Pipelet. Parameter value can not be overwritten in Record.
</li>
</ul>
<h2><span class="mw-headline" id="org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet">org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet</span></h2>
<p>Extracts text from an HTML input using the <a rel="nofollow" class="external text" href="http://code.google.com/p/boilerpipe/">Boilerpipe library</a>. In contrast to the <a href="Bundle_org.eclipse.smila.processing.pipelets.html#org.eclipse.smila.processing.pipelets.HtmlToTextPipelet" title="SMILA/Documentation/Bundle org.eclipse.smila.processing.pipelets"> HtmlToTextPipelet</a> it offers different algorithms for textual content extraction but does not extract HTML metadata.
</p>
<h3><span class="mw-headline" id="Configuration">Configuration</span></h3>
<table border="1">
<tr>
<th>Property
</th>
<th>Type
</th>
<th>Read Type
</th>
<th>Description
</th></tr>
<tr>
<td><i>inputType</i>
</td>
<td>String&#160;: <i>ATTACHMENT, ATTRIBUTE</i>
</td>
<td>runtime
</td>
<td>Defines whether the HTML input is found in an attachment or in an attribute of the record
</td></tr>
<tr>
<td><i>outputType</i>
</td>
<td>String&#160;: <i>ATTACHMENT, ATTRIBUTE</i>
</td>
<td>runtime
</td>
<td>Defines whether the plain text should be stored in an attachment or in an attribute of the record
</td></tr>
<tr>
<td><i>inputName</i>
</td>
<td>String
</td>
<td>runtime
</td>
<td>Name of attachment or attribute that contains the HTML input
</td></tr>
<tr>
<td><i>outputName</i>
</td>
<td>String
</td>
<td>runtime
</td>
<td>Name of attachment or attribute for plain text output
</td></tr>
<tr>
<td><i>encodingAttribute</i>
</td>
<td>String
</td>
<td>runtime
</td>
<td>Optional name of the attribute with the encoding of the input attachment.
</td></tr>
<tr>
<td><i>defaultEncoding</i>
</td>
<td>String
</td>
<td>runtime
</td>
<td>Optional fallback encoding, if anything else fails.
</td></tr>
<tr>
<td><i>maxParserBlocks</i>
</td>
<td>integer
</td>
<td>runtime
</td>
<td>Limits Boilerpipe's parsing process, default value is 20.000. Negative values disable the limit. See below for addition information.
</td></tr>
<tr>
<td><i>filter</i>
</td>
<td>Sequence of String
</td>
<td>init
</td>
<td>A list of boiler pipe filters to use. This may contain class names, static method or static variable references. Default is <tt>de.l3s.boilerpipe.extractors.ArticleExtractor.INSTANCE</tt>. Please note that BoilerpipeExtractors implement the interface BoilerpipeFilter and are pipelines of BoilerpipeFilters. Therefore you should not use multiple BoilerpipeExtractors! Also please note that some Extractors and Filters do not have a default Constructor and therefore cannot be used by this Pipelet. Others may not have a public Constructor but a public static instance member.
</td></tr></table>
<h3><span class="mw-headline" id="Note_on_Boilerpipe.27s_extraction_process_and_memory_consumption">Note on Boilerpipe's extraction process and memory consumption</span></h3>
<p>Boilerpipe is not a generic HTML-to-Text-converter that works great with every HTML page. It is specialized on extracting the <i>relevant</i> portion of text from HTML pages with "typical" layouts: For example, news sites usually have a content area in the middle where the actual article is, and around that you'll find areas with navigation links, advertising, header and footer content, etc. On such pages boilerpipe is quite good at extracting only the actual article text.
</p><p>The cost of this is that Boilerpipe needs more memory than other HTML parsers, because it creates a special document object model of the HTML page to be able to identify the content area. This can lead to problems on "non-typical" pages, for example large pages that consists only of list of links (a quite extreme example is <a rel="nofollow" class="external autonumber" href="http://worldwidescience.org/topicpages/s.html">[1]</a>). Such pages often lead to OutOfMemory errors when a vanilla Boilerpipe is used to parse them (8 GB of heap space is not enough to parse the "extreme" example).
</p><p>To prevent this we have patched Boilerpipe for SMILA so that we can set an upper limit for the extraction process. The configuration property for this is named "maxParserBlocks". It limits the amounts of "TextBlock" objects created by Boilerpipe for it's document model - one TextBlock represents, roughly speaking, the content between two consecutive HTML tags. The default value for this is 20.000 which in our tests was high enough to handle typical news sites or Wikipedia (with still a lot of reserve left). If you get incomplete results from processing your HTML pages with the default setting, you may want to increase the value of this property and additionally you should probably increase the -Xmx setting for your SMILA VM. To disable the limit (proceed at your own risk) set the property to a negative value.
</p><p>Patch submitted to Boilerpipe project: <a rel="nofollow" class="external autonumber" href="http://code.google.com/p/boilerpipe/issues/detail?id=71">[2]</a>.
</p>
<h3><span class="mw-headline" id="Example">Example</span></h3>
<p>Extract text from the HTML input in attachment "html" into the attribute "text" using the encoding given in attribute "http.encoding" and using the extractor ArticleSentencesExtractor:
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;extractText&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputType&quot;</span><span class="re2">&gt;</span></span>ATTACHMENT<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputName&quot;</span><span class="re2">&gt;</span></span>html<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputType&quot;</span><span class="re2">&gt;</span></span>ATTRIBUTE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputName&quot;</span><span class="re2">&gt;</span></span>text<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;encodingAttribute&quot;</span><span class="re2">&gt;</span></span>http.encoding<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;filter&quot;</span><span class="re2">&gt;</span></span>de.l3s.boilerpipe.extractors.ArticleSentencesExtractor.INSTANCE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span></pre></div></div>
<p><br />
The same example but using the simple filter MarkEverythingContentFilter:
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1">&lt;proc:invokePipelet</span> <span class="re0">name</span>=<span class="st0">&quot;extractText&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:pipelet</span> <span class="re0">class</span>=<span class="st0">&quot;org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:variables</span> <span class="re0">input</span>=<span class="st0">&quot;request&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputType&quot;</span><span class="re2">&gt;</span></span>ATTACHMENT<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;inputName&quot;</span><span class="re2">&gt;</span></span>html<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputType&quot;</span><span class="re2">&gt;</span></span>ATTRIBUTE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;outputName&quot;</span><span class="re2">&gt;</span></span>text<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;encodingAttribute&quot;</span><span class="re2">&gt;</span></span>http.encoding<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;rec:Val</span> <span class="re0">key</span>=<span class="st0">&quot;filter&quot;</span><span class="re2">&gt;</span></span>de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter.INSTANCE<span class="sc3"><span class="re1">&lt;/rec:Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:configuration<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/proc:invokePipelet<span class="re2">&gt;</span></span></span></pre></div></div>
<!--
NewPP limit report
CPU time usage: 0.060 seconds
Real time usage: 0.066 seconds
Preprocessor visited node count: 25/1000000
Preprocessor generated node count: 64/1000000
Post‐expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
Highest expansion depth: 2/40
Expensive parser function count: 0/100
-->
<!-- Saved in parser cache with key my_wiki:pcache:idhash:37346-0!*!0!!en!*!* and timestamp 20150414084533 and revision id 352422
-->
</div>
<!-- catlinks -->
<div id='catlinks' class='catlinks'><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Categories</a>: <ul><li><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></li><li><a href="http://wiki.eclipse.org/index.php?title=Category:SMILA/Pipelet&amp;action=edit&amp;redlink=1" class="new" title="Category:SMILA/Pipelet (page does not exist)">SMILA/Pipelet</a></li></ul></div></div> <!-- /catlinks -->
</div>
</div>
</div>
</div>
<!-- /maincontent -->
<!-- printfooter -->
<div class="printfooter">
Retrieved from "<a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;oldid=352422">http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Bundle_org.eclipse.smila.processing.pipelets.boilerpipe&amp;oldid=352422</a>" </div>
<!-- /printfooter -->
<!-- debughtml -->
<!-- /debughtml -->
</div>
<!-- /bodyContent -->
</section>
<!-- /content -->
<!-- footer -->
</div> <section id="footer-contribution-info" style="border-top:1px solid #ccc;" class="footer-offset background-white margin-top-25"><div class="container text-center padding-top-10 padding-bottom-10"><p id="footercredit">This page was last modified 09:02, 28 November 2013 by <a href="http://wiki.eclipse.org/User:Juergen.schumacher.empolis.com" title="User:Juergen.schumacher.empolis.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Daniel.stucky.attensity.com&amp;action=edit&amp;redlink=1" class="new" title="User:Daniel.stucky.attensity.com (page does not exist)">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/User:Eclipse.liefke.biz" title="User:Eclipse.liefke.biz">Tobias Liefke</a>.</p><p id="footerviews">This page has been accessed 1,823 times.</p></div></section> </main> <!-- /#main-content-container-row -->
<p id="back-to-top" class="noprint hidden-print">
<a class="visible-xs" href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#top">Back to the top</a>
</p>
<footer role="contentinfo" class="noprint hidden-print">
<div class="container">
<div class="row">
<section id="footer-eclipse-foundation" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Eclipse Foundation</h2>
<ul class="nav">
<li><a href="https://eclipse.org/org/">About us</a></li>
<li><a href="https://eclipse.org/org/foundation/contact.php">Contact Us</a></li>
<li><a href="https://eclipse.org/donate">Donate</a></li>
<li><a href="https://eclipse.org/org/documents/">Governance</a></li>
<li><a href="https://eclipse.org/artwork/">Logo and Artwork</a></li>
<li><a href="https://eclipse.org/org/foundation/directors.php">Board of Directors</a></li>
</ul>
</section>
<section id="footer-legal" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Legal</h2>
<ul class="nav">
<li><a href="https://eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="https://eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="https://eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="https://eclipse.org/org/documents/epl-v10.php">Eclipse Public License </a></li>
<li><a href="https://eclipse.org/legal/">Legal Resources </a></li>
</ul>
</section>
<section id="footer-useful-links" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Useful Links</h2>
<ul class="nav">
<li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li>
<li><a href="http://help.eclipse.org/">Documentation</a></li>
<li><a href="https://eclipse.org/contribute/">How to Contribute</a></li>
<li><a href="https://eclipse.org/mail/">Mailing Lists</a></li>
<li><a href="https://eclipse.org/forums/">Forums</a></li>
<li><a href="http://marketplace.eclipse.org/">Marketplace</a></li>
</ul>
</section>
<section id="footer-other" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Other</h2>
<ul class="nav">
<li><a href="https://eclipse.org/ide/">IDE and Tools</a></li>
<li><a href="https://eclipse.org/projects">Community of Projects</a></li>
<li><a href="https://eclipse.org/org/workinggroups/">Working Groups</a></li>
</ul>
<ul class="list-inline social-media">
<li><a href="https://twitter.com/EclipseFdn"><i class="fa fa-twitter-square"></i></a></li>
<li><a href="https://plus.google.com/+Eclipse"><i class="fa fa-google-plus-square"></i></a></li>
<li><a href="https://www.facebook.com/eclipse.org"><i class="fa fa-facebook-square"></i> </a></li>
<li><a href="https://www.youtube.com/user/EclipseFdn"><i class="fa fa-youtube-square"></i></a></li>
</ul>
</section>
<div id="copyright" class="col-xs-offset-1 col-sm-14 col-md-24 col-md-offset-0">
<div>
<span><img src="http://eclipse.org/eclipse.org-common/themes/solstice/public/images/logo/eclipse-logo-bw-800x188.png" alt="Eclipse.org black and white logo" width="166" height="39" id="logo-eclipse-white"/></span>
<p id="copyright-text">Copyright &copy; 2014 The Eclipse Foundation. All Rights Reserved.</p>
</div>
</div>
<a href="Bundle_org.eclipse.smila.processing.pipelets.boilerpipe.html#" class="scrollup">Back to the top</a>
</div>
</div>
</footer>
<script src="http://wiki.eclipse.org/skins/solstice/public/javascript/main.min.js"></script>
<!-- Placed at the end of the document so the pages load faster -->
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-910670-2']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script> <!-- /footer -->
<script>/*<![CDATA[*/window.jQuery && jQuery.ready();/*]]>*/</script><script>if(window.mw){
mw.loader.state({"skins.solstice":"loading","site":"ready","user":"ready","user.groups":"ready"});
}</script>
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=skins.solstice&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
mw.loader.load(["mediawiki.action.view.postEdit","mediawiki.user","mediawiki.hidpi","mediawiki.page.ready","mediawiki.searchSuggest"],null,true);
}</script>
<script>if(window.mw){
mw.config.set({"wgBackendResponseTime":339});
}</script> </body>
</html>