| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| <meta name="keywords" content="SMILA/Documentation/Filesystem Crawler,SMILA/Documentation/Crawler,SMILA/Documentation/JDBC Crawler,SMILA/Documentation/Web Crawler" /> |
| <link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" /> |
| <link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" /> |
| <link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&feed=rss" /> |
| <link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&feed=atom" /> |
| |
| |
| <title>SMILA/Documentation/Filesystem Crawler - Eclipsepedia</title> |
| |
| <style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style> |
| <link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" /> |
| <link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" /> |
| <link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" /> |
| <!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]--> |
| <!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]--> |
| <!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]--> |
| <!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]--> |
| <!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]--> |
| <!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script> |
| <meta http-equiv="imagetoolbar" content="no" /><![endif]--> |
| <script type= "text/javascript">/*<![CDATA[*/ |
| var skin = "eclipsenova"; |
| var stylepath = "/skins"; |
| var wgArticlePath = "/$1"; |
| var wgScriptPath = ""; |
| var wgScript = "/index.php"; |
| var wgServer = "http://wiki.eclipse.org"; |
| var wgCanonicalNamespace = ""; |
| var wgCanonicalSpecialPageName = false; |
| var wgNamespaceNumber = 0; |
| var wgPageName = "SMILA/Documentation/Filesystem_Crawler"; |
| var wgTitle = "SMILA/Documentation/Filesystem Crawler"; |
| var wgAction = "view"; |
| var wgRestrictionEdit = []; |
| var wgRestrictionMove = []; |
| var wgArticleId = "17587"; |
| var wgIsArticle = true; |
| var wgUserName = null; |
| var wgUserGroups = null; |
| var wgUserLanguage = "en"; |
| var wgContentLanguage = "en"; |
| var wgBreakFrames = false; |
| var wgCurRevisionId = "285986"; |
| var wgVersion = "1.12.0"; |
| var wgEnableAPI = true; |
| var wgEnableWriteAPI = false; |
| /*]]>*/</script> |
| |
| <script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script> |
| |
| <!-- Performance mods similar to those for bug 166401 --> |
| <script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&action=raw&gen=js&useskin=eclipsenova"><!-- site js --></script> |
| |
| <!-- Head Scripts --> |
| <script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script> |
| <style type="text/css">/*<![CDATA[*/ |
| .source-xml {line-height: normal; font-size: medium;} |
| .source-xml li {line-height: normal;} |
| /** |
| * GeSHi Dynamically Generated Stylesheet |
| * -------------------------------------- |
| * Dynamically generated stylesheet for xml |
| * CSS class: source-xml, CSS id: |
| * GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter) |
| */ |
| .source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;} |
| .source-xml {} |
| .source-xml .head {} |
| .source-xml .foot {} |
| .source-xml .imp {font-weight: bold; color: red;} |
| .source-xml .ln-xtra {color: #cc0; background-color: #ffc;} |
| .source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;} |
| .source-xml li.li2 {font-weight: bold;} |
| .source-xml .coMULTI {color: #808080; font-style: italic;} |
| .source-xml .es0 {color: #000099; font-weight: bold;} |
| .source-xml .br0 {color: #66cc66;} |
| .source-xml .st0 {color: #ff0000;} |
| .source-xml .nu0 {color: #cc66cc;} |
| .source-xml .sc0 {color: #00bbdd;} |
| .source-xml .sc1 {color: #ddbb00;} |
| .source-xml .sc2 {color: #339933;} |
| .source-xml .sc3 {color: #009900;} |
| .source-xml .re0 {color: #000066;} |
| .source-xml .re1 {font-weight: bold; color: black;} |
| .source-xml .re2 {font-weight: bold; color: black;} |
| |
| /*]]>*/ |
| </style> |
| <style type="text/css">/*<![CDATA[*/ |
| @import "/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000"; |
| /*]]>*/ |
| </style><link rel="stylesheet" type="text/css" href="Filesystem_Crawler.html" /> </head> |
| <body class="mediawiki ns-0 ltr page-SMILA_Documentation_Filesystem_Crawler"> |
| <div id="globalWrapper"> |
| |
| |
| <div id="column-one"> |
| <!-- Eclipse Additions for the Top Nav start here M. Ward--> |
| |
| <div id="header"> |
| <div id="header-graphic"> |
| <img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" /> |
| </div> |
| <!-- Pulled 101409 Mward --> |
| |
| <div class="portlet" id="p-personal"> |
| <div class="pBody"> |
| <ul> |
| <li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&returnto=SMILA/Documentation/Filesystem_Crawler">Log in</a></li> |
| </ul> |
| </div> |
| </div> |
| |
| <div id="header-icons"> |
| <div id="sites"> |
| <ul id="sitesUL"> |
| <li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li> |
| <li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li> |
| <li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li> |
| <li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li> |
| <li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li> |
| <li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| <!-- NEW HEADER STUFF HERE --> |
| <div id="header-menu"> |
| <div id="header-nav"> |
| <ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li> |
| <li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li> |
| <li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li> |
| <li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li> |
| <li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li> |
| <li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li> |
| <li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li> |
| </ul> |
| </div> |
| <div id="header-utils"> |
| <!-- moved the search window here --> |
| <form action="http://wiki.eclipse.org/Special:Search" > |
| <input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" /> |
| <input type='submit' onclick="this.submit();" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" /> |
| <input type='submit' onclick="this.submit();" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" /> |
| </form> |
| </div> |
| </div> |
| |
| |
| <!-- Eclipse Additions for the Header stop here --> |
| <!-- Additions and mods for leftside nav Start here --> |
| |
| <!--Started nav rip here--> |
| <!-- these are the nav controls main page, changes etc --> |
| <div id="novaContent" class="faux"> |
| <div id="leftcol"> |
| <ul id="leftnav"> |
| <!-- these are the page controls, edit history etc --> |
| <li class="separator"><a class="separator">Navigation   </a></li> |
| <li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li> |
| <li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li> |
| <li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li> |
| <li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li> |
| <li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li> |
| <li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li> |
| <li class="separator"><a class="separator">Toolbox   </a></li> |
| |
| <li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Filesystem_Crawler">What links here</a></li> |
| <li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Filesystem_Crawler">Related changes</a></li> |
| <!-- This is the toolbox section --> |
| <li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li> |
| <li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li> |
| <li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Filesystem_Crawler&printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Filesystem_Crawler&oldid=285986">Permanent link</a></li> </ul> |
| </div> |
| |
| |
| <!-- Additions and mods for leftside nav End here --> |
| |
| |
| <div id="column-content"> |
| <div id="content"> |
| <a name="top" id="top"></a> |
| |
| <div id="tabs"> |
| <ul class="primary"> |
| <li class="active"><a href="Filesystem_Crawler.html"><span class="tab">Page</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Filesystem_Crawler&action=edit"><span class="tab">Discussion</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Filesystem_Crawler&action=edit"><span class="tab">View source</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Filesystem_Crawler&action=history"><span class="tab">History</span></a></li> |
| <li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&returnto=SMILA/Documentation/Filesystem%20Crawler"><span class="tab">Edit</span></a></li> |
| </ul> |
| </div> |
| |
| |
| <script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script> |
| <h1 class="firstHeading">SMILA/Documentation/Filesystem Crawler</h1> |
| <div id="bodyContent"> |
| <h3 id="siteSub">From Eclipsepedia</h3> |
| <div id="contentSub"><span class="subpages">< <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div> |
| <div id="jump-to-nav">Jump to: <a href="Filesystem_Crawler.html#column-one">navigation</a>, <a href="Filesystem_Crawler.html#searchInput">search</a></div> <!-- start content --> |
| <div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;"> |
| <div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Note.png" class="image" title="Note.png"><img alt="" src="http://wiki.eclipse.org/images/c/cc/Note.png" width="35" height="35" border="0" /></a></div> |
| <div><b>This is deprecated for SMILA 1.0. The connectivity framework is still functional but is intended to be replaced by a scalable import based on SMILA's job management.</b><br /></div> |
| </div> |
| <table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div> |
| <ul> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#Overview"><span class="tocnumber">1</span> <span class="toctext">Overview</span></a></li> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#Crawling_configuration"><span class="tocnumber">2</span> <span class="toctext">Crawling configuration</span></a></li> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#Crawling_configuration_explanation"><span class="tocnumber">3</span> <span class="toctext">Crawling configuration explanation</span></a></li> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#Crawling_configuration_example"><span class="tocnumber">4</span> <span class="toctext">Crawling configuration example</span></a></li> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#Output_example_for_default_configuration"><span class="tocnumber">5</span> <span class="toctext">Output example for default configuration</span></a></li> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#Additional_performance_counters"><span class="tocnumber">6</span> <span class="toctext">Additional performance counters</span></a></li> |
| <li class="toclevel-1"><a href="Filesystem_Crawler.html#See_also"><span class="tocnumber">7</span> <span class="toctext">See also</span></a></li> |
| </ul> |
| </td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script> |
| <a name="Overview"></a><h2> <span class="mw-headline"> Overview </span></h2> |
| <p>The file system crawler recursively fetches all files from a given directory. Besides providing the content of files, it may also gather any file's metadata from the following list: |
| </p> |
| <ul><li> full path |
| </li><li> file name only |
| </li><li> file size |
| </li><li> last modified date |
| </li><li> file content |
| </li><li> file extension |
| </li></ul> |
| <a name="Crawling_configuration"></a><h2> <span class="mw-headline"> Crawling configuration </span></h2> |
| <p>The example configuration file is located at <tt>configuration/org.eclipse.smila.connectivity.framework/file.xml</tt>. |
| </p><p>Defining Schema: <tt>org.eclipse.smila.connectivity.framework.crawler.filesystem/schemas/FileSystemDataSourceConnectionConfigSchema.xsd</tt>. |
| </p> |
| <a name="Crawling_configuration_explanation"></a><h2> <span class="mw-headline"> Crawling configuration explanation </span></h2> |
| <p>See <a href="Crawler.html#Configuration" title="SMILA/Documentation/Crawler">SMILA/Documentation/Crawler#Configuration</a> for the generic parts of the configuration file. |
| </p><p>The root element of crawling configuration is <tt>DataSourceConnectionConfig</tt> and contains the following sub elements: |
| </p> |
| <ul><li> <tt>DataSourceID</tt> – the identification of a data source |
| </li><li> <tt>SchemaID</tt> – specifies the schema for a crawler job |
| </li><li> <tt>DataConnectionID</tt> – describes which agent or crawler should be used |
| <ul><li> <tt>Crawler</tt> – implementation class of a Crawler |
| </li><li> <tt>Agent</tt> – implementation class of an Agent |
| </li></ul> |
| </li><li> <tt>CompoundHandling</tt> – specifies whether packed data (like a ZIP containing files) should be unpacked and the files within should be crawled (YES or NO). |
| </li><li> <tt>Attributes</tt> – list all attributes which describe a file. |
| <ul><li> <tt>Attribute</tt> |
| <ul><li> attributes: |
| <ul><li> <tt>Type</tt> (required) – the data type (String, Integer or Date). |
| </li><li> <tt>Name</tt> (required) – the attribute's name. |
| </li><li> <tt>HashAttribute</tt> – specify if the attribute is used for the hash used for delta indexing (<i>true</i> or <i>false</i>). Must be true for at least one attribute which must always have a value. Usually the attribute containing the <i>LastModifiedDate</i> will be a good candidate to set this to <i>true</i> for. |
| </li><li> <tt>KeyAttribute</tt> – specify if the attribute is used for creating the record ID (<i>true</i> or <i>false</i>). Must be true for at least one attribute. All key attributes must identify the file uniquely, so usually you will set it <i>true</i> for the attribute containing <i>Path</i> FileAttribute. |
| </li><li> <tt>Attachment</tt> – specifies whether the attribute returns its data as an attachment of the record. |
| </li></ul> |
| </li><li> sub elements: |
| <ul><li> <tt>FileAttributes</tt> - specify the file attribute to write into the target attribute. The content of the element must be one of |
| <ul><li> <i>Name</i>: name of file, without the directory path |
| </li><li> <i>Path</i>: complete path including file name. |
| </li><li> <i>Size</i>: size in bytes. |
| </li><li> <i>LastModifiedDate</i>: Date of last modification |
| </li><li> <i>Content</i>: Content of file. Unconverted binary if written to an attachment. Else the crawler tries to detect the encoding and converts the content to a string (with fallbacks to UTF-8 or default encoding of the operating system). |
| </li><li> <i>FileExtension</i>: The part of the filename after the last "." character (without the dot). An empty string if the filename does not contain a dot. |
| </li></ul> |
| </li></ul> |
| </li></ul> |
| </li></ul> |
| </li><li> <tt>Process</tt> – contains parameters for gathering data. |
| <ul><li> <tt>BaseDir</tt> – the directory where the crawling process begins (if it is null, cannot be found or accessed, or is not a directory, a CrawlerCriticalException will be thrown). |
| <ul><li> <tt>Filter</tt> – select file type and crawling mode. |
| <ul><li> <tt>Recursive</tt> – (true or false). |
| </li><li> <tt>CaseSensitive</tt> – true or false |
| </li></ul> |
| </li><li> <tt>Include</tt> – file to crawl. |
| <ul><li> <tt>Name</tt> - String e.g. <tt>"*.txt"</tt> (crawl all text files). Everything that is not included is excluded automatically. You could use a star * as wildcard. |
| </li></ul> |
| </li><li> <tt>Exclude</tt> – files to leave out while crawling. |
| <ul><li> <tt>Name</tt> – String e.g. <tt>"*test*"</tt> (leave out all text files which have <tt>test</tt> in the filename). |
| </li></ul> |
| </li></ul> |
| </li></ul> |
| </li></ul> |
| <a name="Crawling_configuration_example"></a><h2> <span class="mw-headline"> Crawling configuration example </span></h2> |
| <div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1"><DataSourceConnectionConfig</span> |
| <span class="re0">xmlns:xsi</span>=<span class="st0">"http://www.w3.org/2001/XMLSchema-instance"</span> |
| <span class="re0">xsi:noNamespaceSchemaLocation</span>=<span class="st0">"../org.eclipse.smila.connectivity.framework.crawler.filesystem/schemas/FileSystemDataSourceConnectionConfigSchema.xsd"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><DataSourceID<span class="re2">></span></span></span>file<span class="sc3"><span class="re1"></DataSourceID<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><SchemaID<span class="re2">></span></span></span>org.eclipse.smila.connectivity.framework.crawler.filesystem<span class="sc3"><span class="re1"></SchemaID<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><DataConnectionID<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Crawler<span class="re2">></span></span></span>FileSystemCrawlerDS<span class="sc3"><span class="re1"></Crawler<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></DataConnectionID<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><CompoundHandling<span class="re2">></span></span></span>Yes<span class="sc3"><span class="re1"></CompoundHandling<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attribute</span> <span class="re0">Type</span>=<span class="st0">"Date"</span> <span class="re0">Name</span>=<span class="st0">"LastModifiedDate"</span> <span class="re0">HashAttribute</span>=<span class="st0">"true"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><FileAttributes<span class="re2">></span></span></span>LastModifiedDate<span class="sc3"><span class="re1"></FileAttributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attribute<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attribute</span> <span class="re0">Type</span>=<span class="st0">"String"</span> <span class="re0">Name</span>=<span class="st0">"Filename"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><FileAttributes<span class="re2">></span></span></span>Name<span class="sc3"><span class="re1"></FileAttributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attribute<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attribute</span> <span class="re0">Type</span>=<span class="st0">"String"</span> <span class="re0">Name</span>=<span class="st0">"Path"</span> <span class="re0">KeyAttribute</span>=<span class="st0">"true"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><FileAttributes<span class="re2">></span></span></span>Path<span class="sc3"><span class="re1"></FileAttributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attribute<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attribute</span> <span class="re0">Type</span>=<span class="st0">"String"</span> <span class="re0">Name</span>=<span class="st0">"Content"</span> <span class="re0">Attachment</span>=<span class="st0">"true"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><FileAttributes<span class="re2">></span></span></span>Content<span class="sc3"><span class="re1"></FileAttributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attribute<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attribute</span> <span class="re0">Type</span>=<span class="st0">"String"</span> <span class="re0">Name</span>=<span class="st0">"Extension"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><FileAttributes<span class="re2">></span></span></span>FileExtension<span class="sc3"><span class="re1"></FileAttributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attribute<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attribute</span> <span class="re0">Type</span>=<span class="st0">"String"</span> <span class="re0">Name</span>=<span class="st0">"Size"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><FileAttributes<span class="re2">></span></span></span>Size<span class="sc3"><span class="re1"></FileAttributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attribute<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Attributes<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Process<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><BaseDir<span class="re2">></span></span></span>c:\data<span class="sc3"><span class="re1"></BaseDir<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Filter</span> <span class="re0">Recursive</span>=<span class="st0">"true"</span> <span class="re0">CaseSensitive</span>=<span class="st0">"false"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><Include</span> <span class="re0">Name</span>=<span class="st0">"*.txt"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"><Include</span> <span class="re0">Name</span>=<span class="st0">"*.htm"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"><Include</span> <span class="re0">Name</span>=<span class="st0">"*.html"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"><Include</span> <span class="re0">Name</span>=<span class="st0">"*.xml"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"></Filter<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Process<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></DataSourceConnectionConfig<span class="re2">></span></span></span></pre></div> |
| <a name="Output_example_for_default_configuration"></a><h2> <span class="mw-headline"> Output example for default configuration </span></h2> |
| <p>For a text file named <tt>crawler.txt</tt> located in <tt>c:\data</tt> the crawler will create the following record: |
| </p> |
| <div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1"><Record</span> <span class="re0">xmlns</span>=<span class="st0">"http://www.eclipse.org/smila/record"</span> <span class="re0">version</span>=<span class="st0">"2.0"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"_recordid"</span><span class="re2">></span></span>file:<span class="sc1">&lt;</span>Path=c:\data\crawler.txt<span class="sc1">&gt;</span><span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"_source"</span><span class="re2">></span></span>file<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"LastModifiedDate"</span> <span class="re0">type</span>=<span class="st0">"datetime"</span><span class="re2">></span></span>2009-02-25T17:44:46+0100<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"Path"</span><span class="re2">></span></span>c:\data\crawler.txt<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"Filename"</span><span class="re2">></span></span>crawler.txt<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"Extension"</span><span class="re2">></span></span>txt<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"Size"</span> <span class="re0">type</span>=<span class="st0">"long"</span><span class="re2">></span></span>36<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Val</span> <span class="re0">key</span>=<span class="st0">"_HASH_TOKEN"</span><span class="re2">></span></span>66f373e6f13498a65c7f5f1cf185611e94ab45630c825cc2028dda38e8245c7<span class="sc3"><span class="re1"></Val<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><Attachment<span class="re2">></span></span></span>Content<span class="sc3"><span class="re1"></Attachment<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></Record<span class="re2">></span></span></span></pre></div> |
| <a name="Additional_performance_counters"></a><h2> <span class="mw-headline"> Additional performance counters </span></h2> |
| <p>The FileSystemCrawler adds some specific counters to the common counters: |
| </p> |
| <ul><li> files: number of files visited |
| </li><li> folders: number of directories visited |
| </li><li> producerExceptions: number of filesystem related errors |
| </li></ul> |
| <a name="See_also"></a><h2> <span class="mw-headline"> See also </span></h2> |
| <ul><li> <a href="Crawler.html" title="SMILA/Documentation/Crawler">Crawler</a> |
| </li><li> <a href="Web_Crawler.html" title="SMILA/Documentation/Web Crawler">Web Crawler</a> |
| </li><li> <a href="JDBC_Crawler.html" title="SMILA/Documentation/JDBC Crawler">JDBC Crawler</a> |
| </li></ul> |
| <p><br /> |
| </p> |
| <!-- |
| NewPP limit report |
| Preprocessor node count: 81/1000000 |
| Post-expand include size: 1045/2097152 bytes |
| Template argument size: 515/2097152 bytes |
| #ifexist count: 0/100 |
| --> |
| |
| <!-- Saved in parser cache with key wikidb:pcache:idhash:17587-0!1!0!!en!2!edit=0 and timestamp 20120202140624 --> |
| <div class="printfooter"> |
| Retrieved from "<a href="Filesystem_Crawler.html">http://wiki.eclipse.org/SMILA/Documentation/Filesystem_Crawler</a>"</div> |
| <div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content --> |
| <div class="visualClear"></div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| |
| <!-- Yoink of toolbox for phoenix moved up --> |
| |
| |
| </div> |
| </div> |
| <div id="clearFooter"></div> |
| <div id="footer" > |
| <ul id="footernav"> |
| <li class="first"><a href="http://www.eclipse.org/">Home</a></li> |
| <li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li> |
| <li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li> |
| <li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li> |
| <li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li> |
| <li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li> |
| </ul> |
| <span id="copyright">Copyright © 2012 The Eclipse Foundation. All Rights Reserved</span> |
| <p id="footercredit">This page was last modified 09:38, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Drazen.cindric.attensity.com" title="User:Drazen.cindric.attensity.com">Drazen Cindric</a>, <a href="http://wiki.eclipse.org/User:Igor.novakovic.empolis.com" title="User:Igor.novakovic.empolis.com">Igor Novakovic</a> and <a href="http://wiki.eclipse.org/User:Eliseyev.softaria.com" title="User:Eliseyev.softaria.com">Alexander Eliseyev</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Filesystem_Crawler&action=credits" title="SMILA/Documentation/Filesystem Crawler">others</a>.</p> |
| <p id="footerviews">This page has been accessed 3,364 times.</p> |
| </div> |
| |
| <script type="text/javascript"> |
| var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www."); |
| document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E")); |
| </script> |
| <script type="text/javascript"> |
| var pageTracker = _gat._getTracker("UA-910670-4"); |
| pageTracker._trackPageview(); |
| </script> |
| |
| |
| |
| |
| |
| |
| |
| <!-- <div class="visualClear"></div> --> |
| |
| <script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script> |
| </div> |
| |
| <!-- Served in 0.061 secs. --></body></html> |