blob: b26933b637b336eb7563b1e6185e393b378aa0ba [file] [log] [blame]
<!DOCTYPE html>
<html lang="en" dir="ltr" class="client-nojs">
<head>
<meta charset="UTF-8" />
<title>SMILA/Documentation/Importing/Crawler/File - Eclipsepedia</title>
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="generator" content="MediaWiki 1.23.2" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/eclipse.org-common/themes/solstice/public/images/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (en)" />
<link rel="EditURI" type="application/rsd+xml" href="http://wiki.eclipse.org/api.php?action=rsd" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom feed" href="http://wiki.eclipse.org/index.php?title=Special:RecentChanges&amp;feed=atom" />
<link rel="stylesheet" href="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.ui.button&amp;only=styles&amp;skin=solstice&amp;*" />
<link rel="stylesheet" href="http://wiki.eclipse.org/skins/solstice/public/stylesheets/styles.min.css?303" media="screen, print" /><meta name="ResourceLoaderDynamicStyles" content="" />
<style>a:lang(ar),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none}
/* cache key: my_wiki:resourceloader:filter:minify-css:7:14ece53a42aa314864e5fd8c57f0d98f */</style>
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=startup&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"SMILA/Documentation/Importing/Crawler/File","wgTitle":"SMILA/Documentation/Importing/Crawler/File","wgCurRevisionId":378630,"wgRevisionId":378630,"wgArticleId":34874,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["SMILA"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRelevantPageName":"SMILA/Documentation/Importing/Crawler/File","wgIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgWikiEditorEnabledModules":{"toolbar":false,"dialogs":false,"hidesig":true,"preview":false,"previewDialog":false,"publish":false},"wgCategoryTreePageCategoryOptions":"{\"mode\":0,\"hideprefix\":20,\"showcount\":true,\"namespaces\":false}"});
}</script><script>if(window.mw){
mw.loader.implement("user.options",function($,jQuery){mw.user.options.set({"ccmeonemails":0,"cols":80,"date":"default","diffonly":0,"disablemail":0,"editfont":"default","editondblclick":0,"editsectiononrightclick":0,"enotifminoredits":0,"enotifrevealaddr":0,"enotifusertalkpages":1,"enotifwatchlistpages":1,"extendwatchlist":0,"fancysig":0,"forceeditsummary":0,"gender":"unknown","hideminor":0,"hidepatrolled":0,"imagesize":2,"math":1,"minordefault":0,"newpageshidepatrolled":0,"nickname":"","norollbackdiff":0,"numberheadings":0,"previewonfirst":0,"previewontop":1,"rcdays":7,"rclimit":50,"rows":25,"showhiddencats":0,"shownumberswatching":1,"showtoolbar":1,"skin":"solstice","stubthreshold":0,"thumbsize":2,"underline":2,"uselivepreview":0,"usenewrc":0,"watchcreations":1,"watchdefault":1,"watchdeletion":0,"watchlistdays":3,"watchlisthideanons":0,"watchlisthidebots":0,"watchlisthideliu":0,"watchlisthideminor":0,"watchlisthideown":0,"watchlisthidepatrolled":0,"watchmoves":0,"wllimit":250,
"useeditwarning":1,"prefershttps":1,"language":"en","variant-gan":"gan","variant-iu":"iu","variant-kk":"kk","variant-ku":"ku","variant-shi":"shi","variant-sr":"sr","variant-tg":"tg","variant-uz":"uz","variant-zh":"zh","searchNs0":true,"searchNs1":false,"searchNs2":false,"searchNs3":false,"searchNs4":false,"searchNs5":false,"searchNs6":false,"searchNs7":false,"searchNs8":false,"searchNs9":false,"searchNs10":false,"searchNs11":false,"searchNs12":false,"searchNs13":false,"searchNs14":false,"searchNs15":false,"variant":"en"});},{},{});mw.loader.implement("user.tokens",function($,jQuery){mw.user.tokens.set({"editToken":"+\\","patrolToken":false,"watchToken":false});},{},{});
/* cache key: my_wiki:resourceloader:filter:minify-js:7:70d74423d3fc1e1c18fa9a1ff645a84a */
}</script>
<script>if(window.mw){
mw.loader.load(["mediawiki.page.startup","mediawiki.legacy.wikibits","mediawiki.legacy.ajax"]);
}</script>
<meta name="viewport" content="width=device-width, initial-scale=1.0"></head>
<body class="mediawiki ltr sitedir-ltr ns-0 ns-subject page-SMILA_Documentation_Importing_Crawler_File skin-solstice action-view" id="solstice">
<a class="sr-only" href="File.html#content">Skip to main content</a>
<div class="thin-header">
<header role="banner" class="hidden-print noprint">
<div class="container-fluid">
<div id="row-logo-search">
<div id="header-left">
<div class="row">
<div class="hidden-xs col-sm-6 logo-container">
<a href="https://www.eclipse.org/" ><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia"></a>
</div>
<div class="navbar col-sm-18 yamm" id="main-menu">
<div class="navbar-collapse collapse" id="navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a target="_self" href="https://eclipse.org/downloads/">Download</a></li>
<li><a target="_self" href="https://eclipse.org/users/">Getting Started </a></li>
<li><a target="_self" href="https://eclipse.org/membership/">Members</a></li>
<li><a target="_self" href="https://eclipse.org/projects/">Projects</a></li>
<li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="File.html#">Community <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="File.html#">Participate <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="File.html#">Working Groups <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul></li><!-- More -->
<li class="dropdown hidden-xs"><a class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
<ul class="dropdown-menu">
<li>
<!-- Content container to add padding -->
<div class="yamm-content">
<div class="row">
<ul class="col-sm-8 list-unstyled"><li><p><strong>Community</strong></p></li><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Participate</strong></p></li><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Working Groups</strong></p></li><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul> </div>
</div>
</li>
</ul>
</li>
</ul>
</div>
<div class="navbar-header">
<button data-target="#navbar-collapse-1" data-toggle="collapse" class="navbar-toggle" type="button">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="https://www.eclipse.org/" class="visible-xs navbar-brand"><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia" width="174"></a>
</div>
</div>
</div>
</div>
</div>
</div>
</header>
<section class="defaut-breadcrumbs hidden-print noprint hidden-print clearfix" id="breadcrumb">
<div>
<ol class="breadcrumb">
<li><a href="https://www.eclipse.org/">Home</a></li>
<li><a href="http://wiki.eclipse.org/Main_Page">Eclipse Wiki</a></li>
<li class="active">SMILA/Documentation/Importing/Crawler/File</li></ol>
</div>
</section>
</div>
<div class="toolbar-menu breadcrumbs-offset noprint hidden-print margin-bottom-0 clearfix">
<div class="col-md-24">
<ol class="breadcrumb" role="navigation">
<li id="pt-login">
<a href="http://wiki.eclipse.org/index.php?title=Special:UserLogin&amp;returnto=SMILA%2FDocumentation%2FImporting%2FCrawler%2FFile">
<i class="fa fa-sign-in fa-fw orange"></i> Log in </a>
</li>
</ol>
</div>
</div>
<main role="main" class="background-grey">
<div class="container-full padding-top-25">
<!-- content -->
<section id="content" class="mw-body container-full clearfix 0">
<div id="mw-js-message" style="display:none;"></div>
<!-- bodyContent -->
<div id="bodyContent">
<!-- jumpto -->
<div id="jump-to-nav" class="mw-jump">
Jump to: <a href="File.html#mw-head">navigation</a>,
<a href="File.html#p-search">search</a>
</div>
<!-- /jumpto -->
<!-- leftcol -->
<aside class="col-md-4 noprint hidden-print" id="leftcol">
<form class="input-group" role="form" id="form-eclipse-search" action="http://wiki.eclipse.org/index.php">
<input id="searchInput" class="search-query form-control" type="search" accesskey="f" title="Special:Search" placeholder="Search" name="search" value="">
<span class="input-group-btn">
<button value="search" id="mw-searchButton" type="submit" class="btn btn-default" title="Search the pages for this text" name="fulltext">
<i class="fa fa-search"></i>
</button>
</span>
</form>
<select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Navigation---</span></option><option value="/Main_Page">Main Page</option><option value="/Eclipsepedia:Community_portal">Community portal</option><option value="/Eclipsepedia:Current_events">Current events</option><option value="/Special:RecentChanges">Recent changes</option><option value="/Special:Random">Random page</option><option value="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents">Help</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Navigation</span></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Main_Page" id="n-mainpage" title="Visit the main page [z]" accesskey="z">Main Page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Community_portal" id="n-portal" title="About the project, what you can do, where to find things">Community portal</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Current_events" id="n-currentevents" title="Find background information on current events">Current events</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChanges" id="n-recentchanges" title="A list of recent changes in the wiki [r]" accesskey="r">Recent changes</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:Random" id="n-randompage" title="Load a random page [x]" accesskey="x">Random page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents" id="n-help" title="The place to find out">Help</a></li></ul> <select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Toolbox---</span></option><option value="/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;action=info">Page information</option><option value="/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;oldid=378630">Permanent link</option><option value="/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;printable=yes">Printable version</option><option value="/Special:SpecialPages">Special pages</option><option value="/Special:RecentChangesLinked/SMILA/Documentation/Importing/Crawler/File">Related changes</option><option value="/Special:WhatLinksHere/SMILA/Documentation/Importing/Crawler/File">What links here</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Toolbox</span></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;action=info" id="t-info">Page information</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;oldid=378630" id="t-permalink" title="Permanent link to this revision of the page">Permanent link</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;printable=yes" id="t-print" rel="alternate" title="Printable version of this page [p]" accesskey="p">Printable version</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="http://wiki.eclipse.org/Special:SpecialPages" id="t-specialpages" title="A list of all special pages [q]" accesskey="q">Special pages</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChangesLinked/SMILA/Documentation/Importing/Crawler/File" id="t-recentchangeslinked" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:WhatLinksHere/SMILA/Documentation/Importing/Crawler/File" id="t-whatlinkshere" title="A list of all wiki pages that link here [j]" accesskey="j">What links here</a></li></ul> </aside>
<!-- /leftcol -->
<!-- mainContent -->
<div id="mainContent" class="col-md-20">
<ul class="nav nav-tabs noprint hidden-print" role="tablist">
<li id="ca-nstab-main" class="active"><a href="File.html" title="View the content page [c]" accesskey="c" tabindex="-1">Page</a></li>
<li id="ca-talk" class="new"><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Importing/Crawler/File&amp;action=edit&amp;redlink=1" title="Discussion about the content page [t]" accesskey="t" tabindex="-1">Discussion</a></li>
<li id="ca-viewsource"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e" tabindex="-1">View source</a></li>
<li id="ca-history" class="collapsible"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;action=history" title="Past revisions of this page [h]" accesskey="h" tabindex="-1">History</a></li>
</ul> <div class="tab-content background-white">
<div id="tab-pane-main-page-content" class="tab-pane active">
<h1 id="firstHeading" class="firstHeading page-header">
<span dir="auto">SMILA/Documentation/Importing/Crawler/File</span>
</h1>
<div id="main-page-content">
<!-- subtitle -->
<div id="contentSub" class="alert alert-small alert-warning"><span class="subpages">&lt; <a href="../../../../SMILA.html" title="SMILA">SMILA</a>&lrm; | <a href="../../../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<!-- /subtitle -->
<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr"><p>The File Crawler, File Fetcher and File Extractor workers are used for importing files from a file system. For the big picture and the workers' interaction, have a look at the <a href="../Concept.html" title="SMILA/Documentation/Importing/Concept"> Importing Concept</a>.
</p>
<div id="toc" class="toc"><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1 tocsection-1"><a href="File.html#File_Crawler"><span class="tocnumber">1</span> <span class="toctext">File Crawler</span></a>
<ul>
<li class="toclevel-2 tocsection-2"><a href="File.html#Configuration"><span class="tocnumber">1.1</span> <span class="toctext">Configuration</span></a></li>
<li class="toclevel-2 tocsection-3"><a href="File.html#Processing"><span class="tocnumber">1.2</span> <span class="toctext">Processing</span></a></li>
<li class="toclevel-2 tocsection-4"><a href="File.html#File_permission_info"><span class="tocnumber">1.3</span> <span class="toctext">File permission info</span></a></li>
</ul>
</li>
<li class="toclevel-1 tocsection-5"><a href="File.html#File_Fetcher"><span class="tocnumber">2</span> <span class="toctext">File Fetcher</span></a>
<ul>
<li class="toclevel-2 tocsection-6"><a href="File.html#Configuration_2"><span class="tocnumber">2.1</span> <span class="toctext">Configuration</span></a></li>
</ul>
</li>
<li class="toclevel-1 tocsection-7"><a href="File.html#File_Extractor_Worker"><span class="tocnumber">3</span> <span class="toctext">File Extractor Worker</span></a>
<ul>
<li class="toclevel-2 tocsection-8"><a href="File.html#Configuration_3"><span class="tocnumber">3.1</span> <span class="toctext">Configuration</span></a></li>
<li class="toclevel-2 tocsection-9"><a href="File.html#Processing_2"><span class="tocnumber">3.2</span> <span class="toctext">Processing</span></a></li>
</ul>
</li>
<li class="toclevel-1 tocsection-10"><a href="File.html#Sample_file_crawl_job"><span class="tocnumber">4</span> <span class="toctext">Sample file crawl job</span></a></li>
</ul>
</div>
<h3><span class="mw-headline" id="File_Crawler">File Crawler</span></h3>
<p>The File Crawler crawls files from a root folder and the subdirectories below.
</p>
<h5><span class="mw-headline" id="Configuration">Configuration</span></h5>
<p>The File Crawler worker is usually the first worker in a workflow and the job is started in <tt>runOnce</tt> mode.
</p>
<ul>
<li> Worker name: <tt>fileCrawler</tt>
</li>
<li> Parameters:
<ul>
<li> <tt>dataSource</tt>: <i>(req.)</i> value for attribute <tt>_source</tt>, needed e.g. by the delta service
</li>
<li> <tt>rootFolder</tt>: <i>(req.)</i> crawl starting point
</li>
<li> <tt>filters</tt> <i>(opt.)</i> filters with conditions to in- or exclude files and folders from import
<ul>
<li> <tt>maxFileSize</tt>: maximum file size, files that are bigger are filtered out
</li>
<li> <tt>maxFolderDepth</tt>: starting from the root folder, this is the maximum depth to crawl into subdirectories. <i>(Hint: Folder structures in compounds are not taken into account here)</i>
</li>
<li> <tt>followSymbolicLinks</tt>: whether to follow symbolic links to files/folders or not
</li>
<li> <tt>filePatterns</tt>: regex patterns for filtering crawled files on the basis of their file name
<ul>
<li> <tt>include</tt>: if include patterns are specified, at least one of them must match the file name. If no include patterns are specified, this is handled as if all file names are included.
</li>
<li> <tt>exclude</tt>: if at least one exclude pattern matches the file name, the crawled file is filtered out
</li>
<li> <b>(Hint: the patterns need to use forward slashes as directory separators, even if your file system uses backslashes as folder delimiters)</b>
</li>
</ul>
</li>
<li> <tt>folderPatterns</tt>: regex patterns for filtering crawled folders and files on the basis of their complete folder path. <b>(Hint: Contrary to the file patterns a folder pattern must match the complete path, it doesn't work if it just matches the folder name!)</b>
<ul>
<li> <tt>include</tt>: Only relevant for crawled files: If include patterns are specified, at least one of them must match the file path. If no include patterns are specified, this is handled as if all file paths are included.
</li>
<li> <tt>exclude</tt>: Only relevant for crawled folders: If at least one exclude pattern matches the folder name, the folder (and its subdirectories) will not be imported.
</li>
<li> <b>(Hint: the patterns need to use forward slashes as directory separators, even if your file system uses backslashes as folder delimiters)</b>
</li>
</ul>
</li>
</ul>
</li>
<li> <tt>mapping</tt> <i>(req.)</i> specifies how to map file properties to record attributes
<ul>
<li> <tt>filePath</tt> <i>(opt.)</i> mapping attribute for the complete file path <i>(Hint: required for standard import workflow because File Fetcher and File Extractor worker need this, see below)</i>
</li>
<li> <tt>fileFolder</tt> <i>(opt.)</i> mapping attribute for the file folder (complete path without file name)
</li>
<li> <tt>fileName</tt> <i>(opt.)</i> mapping attribute for the file name
</li>
<li> <tt>fileExtension</tt> <i>(opt.)</i> mapping attribute for the file extension
</li>
<li> <tt>fileSize</tt> <i>(opt.)</i> mapping attribute for the file size (in bytes)
</li>
<li> <tt>fileLastModified</tt> <i>(opt.)</i> mapping attribute for the file's last modified date
</li>
<li> <tt>fileReadAccess</tt> <i>(opt.)</i> mapping attribute for the read access from the file permission info (Access Control List)
</li>
<li> <tt>fileWriteAccess</tt> <i>(opt.)</i> mapping attribute for the write access from the file permission info (Access Control List)
</li>
</ul>
</li>
<li> parameters to control size of output bulks, see below for details
<ul>
<li> <tt>maxFilesPerBulk</tt> <i>(opt.)</i> maximum number of files in one bulk. (default: 1000)
</li>
<li> <tt>minFilesPerBulk</tt> <i>(opt.)</i> minimum number of files in one bulk. (default: 100)
</li>
<li> <tt>directoriesPerBulk</tt> <i>(opt.)</i> number of directories written to one bulk for follow-up crawl tasks. (default: 10)
</li>
</ul>
</li>
</ul>
</li>
<li> Task generator: <tt><a href="../../TaskGenerators.html#RunOnceTriggerTaskGenerator" title="SMILA/Documentation/TaskGenerators">runOnceTrigger</a></tt>
</li>
<li> Input slots:
<ul>
<li> <tt>directoriesToCrawl</tt>
</li>
</ul>
</li>
<li> Output slots:
<ul>
<li> <tt>directoriesToCrawl</tt>
</li>
<li> <tt>filesToCrawl</tt>
</li>
</ul>
</li>
</ul>
<h5><span class="mw-headline" id="Processing">Processing</span></h5>
<p>The File Crawler starts crawling in the <tt>rootFolder</tt>. It produces one record for each subdirectory in the bucket connected to <tt>directoriesToCrawl</tt> and one record per file in the bucket connected to <tt>filesToCrawl</tt>. The bucket in slot <tt>directoriesToCrawl</tt> should be connected to the input slot of the File Crawler so that the subdirectories are crawled in followup tasks. The resulting records do not yet contain the file content but only metadata attributes configured in the <tt>mapping</tt>.
</p><p>The directory and file records are collected in bulks, whose size can be configured via the parameters <tt>maxFilesPerBulk</tt>, <tt>minFilesPerBulk</tt> and <tt>directoriesPerBulk</tt>:
</p>
<ul>
<li> <tt>maxFilesPerBulk</tt> has the same effect in any of the following cases:
<ul>
<li> <i>not configured:</i> a new <tt>filesToCrawl</tt> bulk is started every 1000 files.
</li>
<li> <i>configured:</i> a new <tt>filesToCrawl</tt> bulk is started when the configured value is reached.
</li>
</ul>
</li>
<li> <tt>minFilesPerBulk</tt>
<ul>
<li> <i>not configured:</i> only files in the crawled directory are added to <tt>filesToCrawl</tt> bulks, all subdirectories are written to <tt>directoriesToCrawl</tt> bulks.
</li>
<li> <i>configured:</i> when <tt>minFilesPerBulk</tt> is not reached with all files of the current folder, we step into the subfolder(s) to reach the configured minimum size. If min size is reached, all remaining files of the current subfolder are also written to <tt>filesToCrawl</tt> bulk(s). Remaining subfolders of the current folder and subfolders of already crawled subfolders are written to <tt>directoriesToCrawl</tt> bulks.
</li>
</ul>
</li>
<li> <tt>directoriesPerBulk</tt>
<ul>
<li> <i>not configured:</i> each sub-directory that is not read directly will be written to a separate <tt>directoriesToCrawl</tt> bulk
</li>
<li> <i>configured:</i> the given number of sub-directories will be written to the same <tt>directoriesToCrawl</tt> bulk before a new one is started.
</li>
</ul>
</li>
</ul>
<p>Please note that both parameters must be &gt;= 0 and also that <tt>minFilesPerBulk</tt> must be &lt; <tt>maxFilesPerBulk</tt>. Otherwise your job will fail.
</p><p>Since SMILA 1.3 these parameters are used in the initial crawl task, too. There is no special logic anymore in this task.
</p><p><i>Source</i>:
</p><p>The attribute <tt>_source</tt> is set from the task parameter <tt>dataSource</tt> which has no further meaning currently, but it is needed by the delta service.
</p><p><i>Compounds</i>:
</p><p>If the running CompoundExtractor service identifies an object as an extractable compound, it is marked with attribute <tt>_isCompound</tt> set to <tt>true</tt>.
</p>
<ul>
<li> Dependency: <a href="../SimpleCompoundExtractorService.html" title="SMILA/Documentation/Importing/SimpleCompoundExtractorService" class="mw-redirect">CompoundExtractor service</a>
</li>
</ul>
<h4><span class="mw-headline" id="File_permission_info">File permission info</span></h4>
<p>If metadata attributes for file access info are configured in <tt>mapping</tt>:
</p>
<pre>
{
...
&quot;mapping&quot;:{
...
&quot;fileReadAccess&quot;:&quot;ReadAccess&quot;,
&quot;fileWriteAccess&quot;:&quot;WriteAccess&quot;
},
...
}
</pre>
<p>Then each resulting record contains file access attributes set with the following exemplary values:
</p>
<ul>
<li> Linux: The properties will contain the names of the file's owner and group if they have read/write access, and the special value "_OTHERS_" if all users have read/write access. If a user/group name cannot be resolved by the operating system, the value will contain the numeric user/group id instead. For example:
</li>
</ul>
<pre>
&gt; ls -l file
-rw-r--r-- 1 johndoe users 1 29. Jan 16:06 file
--&gt;
{
...
&quot;ReadAccess&quot;: [
&quot;johndoe&quot;,
&quot;users&quot;,
&quot;_OTHERS_&quot;
],
&quot;WriteAccess&quot;: [
&quot;johndoe&quot;
]
}
</pre>
<ul>
<li> Windows:
</li>
</ul>
<pre>
{
...
&quot;ReadAccess&quot;: [
&quot;BUILTIN\Administrators&quot;,
&quot;NT AUTHORITY\SYSTEM&quot;,
&quot;BUILTIN\Users&quot;,
...
],
&quot;WriteAccess&quot;: [
&quot;BUILTIN\Administrators&quot;,
&quot;NT AUTHORITY\SYSTEM&quot;,
&quot;NT AUTHORITY\Authenticated Users&quot;,
...
]
}
</pre>
<h3><span class="mw-headline" id="File_Fetcher">File Fetcher</span></h3>
<p>For each input record, reads the file referenced in attribute <tt>filePath</tt> and adds the content as attachment <tt>fileContent</tt> and optionally further file properties.
The File Fetcher can be used in combination with the File Crawler where the File Crawler extracts the metadata of files and the Fetcher adds the file content or it can be used individually to get both the file content and metadata properties.
</p>
<h5><span class="mw-headline" id="Configuration_2">Configuration</span></h5>
<ul>
<li> Worker name: <tt>fileFetcher</tt>
</li>
<li> Parameters:
<ul>
<li> <tt>mapping</tt> <i>(req.)</i> needed to get the file path and to add the fetched file content
<ul>
<li> <tt>filePath</tt> <i>(req.)</i> to read the attribute that contains the file path
</li>
<li> <tt>fileContent</tt> <i>(req.)</i> attachment name where the file content is written to
</li>
<li> <tt>fileFolder</tt> <i>(opt.)</i> mapping attribute for the file folder (complete path without file name)
</li>
<li> <tt>fileName</tt> <i>(opt.)</i> mapping attribute for the file name
</li>
<li> <tt>fileExtension</tt> <i>(opt.)</i> mapping attribute for the file extension
</li>
<li> <tt>fileSize</tt> <i>(opt.)</i> mapping attribute for the file size (in bytes)
</li>
<li> <tt>fileLastModified</tt> <i>(opt.)</i> mapping attribute for the file's last modified date
</li>
</ul>
</li>
</ul>
</li>
<li> Input slots:
<ul>
<li> <tt>filesToFetch</tt>
</li>
</ul>
</li>
<li> Output slots:
<ul>
<li> <tt>files</tt>
</li>
</ul>
</li>
</ul>
<p><br />
</p>
<h3><span class="mw-headline" id="File_Extractor_Worker">File Extractor Worker</span></h3>
<p>Used for extracting compounds (zip, tgz, etc.) in file crawling.
</p>
<h5><span class="mw-headline" id="Configuration_3">Configuration</span></h5>
<ul>
<li> Worker name: <tt>fileExtractor</tt>
</li>
<li> Parameters:
<ul>
<li> <tt>filters</tt> <i>(opt., see File Crawler)</i>
<ul>
<li> <tt>maxFileSize</tt>: <i>(opt., see File Crawler)</i>
</li>
<li> <tt>filePatterns</tt>: <i>(opt., see File Crawler)</i>
<ul>
<li> <tt>include</tt>: <i>(opt., see File Crawler)</i>
</li>
<li> <tt>exclude</tt>: <i>(opt., see File Crawler)</i>
</li>
</ul>
</li>
<li> <tt>folderPatterns</tt>: <i>(opt., see File Crawler)</i>
<ul>
<li> <tt>include</tt>: <i>(opt., see File Crawler)</i>
</li>
<li> <tt>exclude</tt>: <i>(opt.)</i> The behaviour is slightly different here to that of the File Crawler: If an exclude pattern matches the folder path of an extracted file, then the file is filtered out. But according to the pattern, files from subdirectories may be imported!
</li>
</ul>
</li>
</ul>
</li>
<li> <tt>mapping</tt> <i>(req.)</i>
<ul>
<li> <tt>filePath</tt> <i>(req., see File Crawler)</i>: needed to get the file path of the compound file to extract
</li>
<li> <tt>fileFolder</tt> <i>(opt., see File Crawler)</i>
</li>
<li> <tt>fileName</tt> <i>(opt., see File Crawler)</i>
</li>
<li> <tt>fileExtension</tt> <i>(opt., see File Crawler)</i>
</li>
<li> <tt>fileSize</tt> <i>(opt., see File Crawler)</i>
</li>
<li> <tt>fileLastModified</tt> <i>(opt., see File Crawler)</i>
</li>
<li> <tt>fileContent</tt> <i>(req., see File Fetcher)</i>
</li>
</ul>
</li>
</ul>
</li>
<li> Input slots:
<ul>
<li> <tt>compounds</tt>
</li>
</ul>
</li>
<li> Output slots:
<ul>
<li> <tt>files</tt>
</li>
</ul>
</li>
</ul>
<h5><span class="mw-headline" id="Processing_2">Processing</span></h5>
<p>For each input record, an input stream to the described file is created and fed into the CompoundExtractor service to extract the compound elements. If an element is a compound itself, it is also extracted. If it is not a compound, a new record is created. The produced records are converted to look like records produced by the file crawler or the file fetcher, respectively, with the attributes and attachment set as specified in the <tt>mapping</tt> configuration. Additionally, the following attributes are set:
</p>
<ul>
<li> <tt>_deltaHash</tt>: computed in the same way as by the File Crawler worker
</li>
<li> <tt>_compoundRecordId</tt>: record ID of top-level compound this element was extracted from
</li>
<li> <tt>_isCompound</tt>: set to <tt>true</tt> for elements that are compounds themselves.
</li>
<li> <tt>_compoundPath</tt>: sequence of <tt>filePath</tt> attribute values of the compound objects needed to navigate to the compound element.
</li>
</ul>
<ul>
<li> Dependency: <a href="../SimpleCompoundExtractorService.html" title="SMILA/Documentation/Importing/SimpleCompoundExtractorService" class="mw-redirect">CompoundExtractor service</a>
</li>
</ul>
<h3><span class="mw-headline" id="Sample_file_crawl_job">Sample file crawl job</span></h3>
<p>Job definition that imports all files from the root folder "/workspace-SMILA" and pushes the imported records to the job "indexUpdateJob". The following files/folders are filtered out:
</p>
<ul>
<li> files ending with ".class"
</li>
<li> files starting with "."
</li>
<li> folder(-path)s ending with ".svn"
</li>
</ul>
<pre>
{
&quot;name&quot;:&quot;crawlFileJob&quot;,
&quot;workflow&quot;:&quot;fileCrawling&quot;,
&quot;parameters&quot;:{
&quot;tempStore&quot;:&quot;temp&quot;,
&quot;dataSource&quot;:&quot;files&quot;,
&quot;rootFolder&quot;:&quot;/workspace-SMILA&quot;,
&quot;jobToPushTo&quot;:&quot;indexUpdateJob&quot;,
&quot;mapping&quot;:{
&quot;fileContent&quot;:&quot;Content&quot;,
&quot;filePath&quot;:&quot;Path&quot;,
&quot;fileName&quot;:&quot;FileName&quot;,
&quot;fileExtension&quot;:&quot;FileExtension&quot;,
&quot;fileReadAccess&quot;:&quot;ReadAccess&quot;,
&quot;fileWriteAccess&quot;:&quot;WriteAccess&quot;
},
&quot;filters&quot;:{
&quot;filePatterns&quot;:{
&quot;exclude&quot;:[&quot;\\..*&quot;, &quot;.*\\.class&quot;]
},
&quot;folderPatterns&quot;:{
&quot;exclude&quot;:[&quot;.*\\.svn&quot;]
}
}
}
}
</pre>
<!--
NewPP limit report
CPU time usage: 0.044 seconds
Real time usage: 0.050 seconds
Preprocessor visited node count: 69/1000000
Preprocessor generated node count: 124/1000000
Post‐expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
Highest expansion depth: 2/40
Expensive parser function count: 0/100
-->
<!-- Saved in parser cache with key my_wiki:pcache:idhash:34874-0!*!0!!en!*!* and timestamp 20150414143627 and revision id 378630
-->
</div>
<!-- catlinks -->
<div id='catlinks' class='catlinks'><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <ul><li><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></li></ul></div></div> <!-- /catlinks -->
</div>
</div>
</div>
</div>
<!-- /maincontent -->
<!-- printfooter -->
<div class="printfooter">
Retrieved from "<a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;oldid=378630">http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;oldid=378630</a>" </div>
<!-- /printfooter -->
<!-- debughtml -->
<!-- /debughtml -->
</div>
<!-- /bodyContent -->
</section>
<!-- /content -->
<!-- footer -->
</div> <section id="footer-contribution-info" style="border-top:1px solid #ccc;" class="footer-offset background-white margin-top-25"><div class="container text-center padding-top-10 padding-bottom-10"><p id="footercredit">This page was last modified 04:57, 19 February 2015 by <a href="http://wiki.eclipse.org/User:Juergen.schumacher.empolis.com" title="User:Juergen.schumacher.empolis.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/index.php?title=User:Julia.kudrin.empolis.com&amp;action=edit&amp;redlink=1" class="new" title="User:Julia.kudrin.empolis.com (page does not exist)">Julia Kudrin</a>, <a href="http://wiki.eclipse.org/User:Andreas.weber.empolis.com" title="User:Andreas.weber.empolis.com">Andreas Weber</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Daniel.stucky.attensity.com&amp;action=edit&amp;redlink=1" class="new" title="User:Daniel.stucky.attensity.com (page does not exist)">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Importing/Crawler/File&amp;action=credits" title="SMILA/Documentation/Importing/Crawler/File">others</a>.</p><p id="footerviews">This page has been accessed 3,986 times.</p></div></section> </main> <!-- /#main-content-container-row -->
<p id="back-to-top" class="noprint hidden-print">
<a class="visible-xs" href="File.html#top">Back to the top</a>
</p>
<footer role="contentinfo" class="noprint hidden-print">
<div class="container">
<div class="row">
<section id="footer-eclipse-foundation" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Eclipse Foundation</h2>
<ul class="nav">
<li><a href="https://eclipse.org/org/">About us</a></li>
<li><a href="https://eclipse.org/org/foundation/contact.php">Contact Us</a></li>
<li><a href="https://eclipse.org/donate">Donate</a></li>
<li><a href="https://eclipse.org/org/documents/">Governance</a></li>
<li><a href="https://eclipse.org/artwork/">Logo and Artwork</a></li>
<li><a href="https://eclipse.org/org/foundation/directors.php">Board of Directors</a></li>
</ul>
</section>
<section id="footer-legal" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Legal</h2>
<ul class="nav">
<li><a href="https://eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="https://eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="https://eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="https://eclipse.org/org/documents/epl-v10.php">Eclipse Public License </a></li>
<li><a href="https://eclipse.org/legal/">Legal Resources </a></li>
</ul>
</section>
<section id="footer-useful-links" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Useful Links</h2>
<ul class="nav">
<li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li>
<li><a href="http://help.eclipse.org/">Documentation</a></li>
<li><a href="https://eclipse.org/contribute/">How to Contribute</a></li>
<li><a href="https://eclipse.org/mail/">Mailing Lists</a></li>
<li><a href="https://eclipse.org/forums/">Forums</a></li>
<li><a href="http://marketplace.eclipse.org/">Marketplace</a></li>
</ul>
</section>
<section id="footer-other" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Other</h2>
<ul class="nav">
<li><a href="https://eclipse.org/ide/">IDE and Tools</a></li>
<li><a href="https://eclipse.org/projects">Community of Projects</a></li>
<li><a href="https://eclipse.org/org/workinggroups/">Working Groups</a></li>
</ul>
<ul class="list-inline social-media">
<!-- Icon-only links: aria-label gives each link an accessible name, and the purely decorative icon glyph is hidden from assistive technology. -->
<li><a href="https://twitter.com/EclipseFdn" aria-label="Eclipse Foundation on Twitter"><i class="fa fa-twitter-square" aria-hidden="true"></i></a></li>
<li><a href="https://plus.google.com/+Eclipse" aria-label="Eclipse on Google+"><i class="fa fa-google-plus-square" aria-hidden="true"></i></a></li>
<li><a href="https://www.facebook.com/eclipse.org" aria-label="Eclipse on Facebook"><i class="fa fa-facebook-square" aria-hidden="true"></i> </a></li>
<li><a href="https://www.youtube.com/user/EclipseFdn" aria-label="Eclipse Foundation on YouTube"><i class="fa fa-youtube-square" aria-hidden="true"></i></a></li>
</ul>
</section>
<div id="copyright" class="col-xs-offset-1 col-sm-14 col-md-24 col-md-offset-0">
<div>
<span><img src="http://eclipse.org/eclipse.org-common/themes/solstice/public/images/logo/eclipse-logo-bw-800x188.png" alt="Eclipse.org black and white logo" width="166" height="39" id="logo-eclipse-white"/></span>
<p id="copyright-text">Copyright &copy; 2014 The Eclipse Foundation. All Rights Reserved.</p>
</div>
</div>
<a href="File.html#" class="scrollup">Back to the top</a>
</div>
</div>
</footer>
<script src="http://wiki.eclipse.org/skins/solstice/public/javascript/main.min.js"></script>
<!-- Placed at the end of the document so the pages load faster -->
<script type="text/javascript">
// Google Analytics (ga.js) asynchronous tracking snippet.
// Reuse an existing _gaq command queue if one is already defined; otherwise start with an empty array.
var _gaq = _gaq || [];
// Queue the '_setAccount' command (with 'UA-910670-2') and the '_trackPageview' command.
_gaq.push(['_setAccount', 'UA-910670-2']);
_gaq.push(['_trackPageview']);
// Immediately-invoked function so the loader's locals (ga, s) stay out of the global scope.
(function() {
// Create a script element flagged as async.
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
// Choose the host by page protocol: https://ssl.google-analytics.com on https pages, http://www.google-analytics.com otherwise.
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
// Insert the new element immediately before the first script element in the document.
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script> <!-- /footer -->
<!-- Inline bootstrap scripts. The first calls jQuery.ready() only when window.jQuery exists; the others do nothing unless window.mw exists. -->
<script>/*<![CDATA[*/window.jQuery && jQuery.ready();/*]]>*/</script><script>if(window.mw){
// Register loader states: skins.solstice as "loading"; site, user and user.groups as "ready".
mw.loader.state({"skins.solstice":"loading","site":"ready","user":"ready","user.groups":"ready"});
}</script>
<!-- External request to load.php with modules=skins.solstice and only=scripts. -->
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=skins.solstice&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
// Ask the loader for the listed page modules. NOTE(review): the meaning of the trailing (null, true) arguments is not visible here; confirm against the mw.loader.load documentation.
mw.loader.load(["mediawiki.action.view.postEdit","mediawiki.user","mediawiki.hidpi","mediawiki.page.ready","mediawiki.searchSuggest"],null,true);
}</script>
<script>if(window.mw){
// Store 383 under the wgBackendResponseTime key in mw.config (units are not stated in this file).
mw.config.set({"wgBackendResponseTime":383});
}</script> </body>
</html>