| <!DOCTYPE html> |
| <html lang="en" dir="ltr" class="client-nojs"> |
| <head> |
| <meta charset="UTF-8" /> |
| <title>SMILA/Documentation/HowTo/How to implement a crawler - Eclipsepedia</title> |
| <meta http-equiv="X-UA-Compatible" content="IE=EDGE" /> |
| <meta name="generator" content="MediaWiki 1.23.2" /> |
| <link rel="shortcut icon" href="http://wiki.eclipse.org/eclipse.org-common/themes/solstice/public/images/favicon.ico" /> |
| <link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (en)" /> |
| <link rel="EditURI" type="application/rsd+xml" href="http://wiki.eclipse.org/api.php?action=rsd" /> |
| <link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom feed" href="http://wiki.eclipse.org/index.php?title=Special:RecentChanges&feed=atom" /> |
| <link rel="canonical" href="../Documentation/HowTo/How_to_implement_a_crawler.html" /> |
| <link rel="stylesheet" href="http://wiki.eclipse.org/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.ui.button&only=styles&skin=solstice&*" /> |
| <link rel="stylesheet" href="http://wiki.eclipse.org/skins/solstice/public/stylesheets/styles.min.css?303" media="screen, print" /><meta name="ResourceLoaderDynamicStyles" content="" /> |
| <style>a:lang(ar),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none} |
| /* cache key: my_wiki:resourceloader:filter:minify-css:7:14ece53a42aa314864e5fd8c57f0d98f */</style> |
| <script src="http://wiki.eclipse.org/load.php?debug=false&lang=en&modules=startup&only=scripts&skin=solstice&*"></script> |
| <script>if(window.mw){ |
| mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"SMILA/Documentation/HowTo/How_to_implement_a_crawler","wgTitle":"SMILA/Documentation/HowTo/How to implement a crawler","wgCurRevisionId":286129,"wgRevisionId":286129,"wgArticleId":15203,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["SMILA"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRelevantPageName":"SMILA/Documentation/HowTo/How_to_implement_a_crawler","wgIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgRedirectedFrom":"SMILA/Development_Guidelines/How_to_implement_a_crawler","wgWikiEditorEnabledModules":{"toolbar":false,"dialogs":false,"hidesig":true,"preview":false,"previewDialog":false,"publish":false},"wgCategoryTreePageCategoryOptions":"{\"mode\":0,\"hideprefix\":20,\"showcount\":true,\"namespaces\":false}"}); |
| }</script><script>if(window.mw){ |
| mw.loader.implement("user.options",function($,jQuery){mw.user.options.set({"ccmeonemails":0,"cols":80,"date":"default","diffonly":0,"disablemail":0,"editfont":"default","editondblclick":0,"editsectiononrightclick":0,"enotifminoredits":0,"enotifrevealaddr":0,"enotifusertalkpages":1,"enotifwatchlistpages":1,"extendwatchlist":0,"fancysig":0,"forceeditsummary":0,"gender":"unknown","hideminor":0,"hidepatrolled":0,"imagesize":2,"math":1,"minordefault":0,"newpageshidepatrolled":0,"nickname":"","norollbackdiff":0,"numberheadings":0,"previewonfirst":0,"previewontop":1,"rcdays":7,"rclimit":50,"rows":25,"showhiddencats":0,"shownumberswatching":1,"showtoolbar":1,"skin":"solstice","stubthreshold":0,"thumbsize":2,"underline":2,"uselivepreview":0,"usenewrc":0,"watchcreations":1,"watchdefault":1,"watchdeletion":0,"watchlistdays":3,"watchlisthideanons":0,"watchlisthidebots":0,"watchlisthideliu":0,"watchlisthideminor":0,"watchlisthideown":0,"watchlisthidepatrolled":0,"watchmoves":0,"wllimit":250, |
| "useeditwarning":1,"prefershttps":1,"language":"en","variant-gan":"gan","variant-iu":"iu","variant-kk":"kk","variant-ku":"ku","variant-shi":"shi","variant-sr":"sr","variant-tg":"tg","variant-uz":"uz","variant-zh":"zh","searchNs0":true,"searchNs1":false,"searchNs2":false,"searchNs3":false,"searchNs4":false,"searchNs5":false,"searchNs6":false,"searchNs7":false,"searchNs8":false,"searchNs9":false,"searchNs10":false,"searchNs11":false,"searchNs12":false,"searchNs13":false,"searchNs14":false,"searchNs15":false,"variant":"en"});},{},{});mw.loader.implement("user.tokens",function($,jQuery){mw.user.tokens.set({"editToken":"+\\","patrolToken":false,"watchToken":false});},{},{}); |
| /* cache key: my_wiki:resourceloader:filter:minify-js:7:70d74423d3fc1e1c18fa9a1ff645a84a */ |
| }</script> |
| <script>if(window.mw){ |
| mw.loader.load(["mediawiki.page.startup","mediawiki.legacy.wikibits","mediawiki.legacy.ajax"]); |
| }</script> |
| <style type="text/css">/*<![CDATA[*/ |
| .source-text {line-height: normal;} |
| .source-text li, .source-text pre { |
| line-height: normal; border: 0px none white; |
| } |
| /** |
| * GeSHi Dynamically Generated Stylesheet |
| * -------------------------------------- |
| * Dynamically generated stylesheet for text |
| * CSS class: source-text, CSS id: |
| * GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann |
| * (http://qbnz.com/highlighter/ and http://geshi.org/) |
| * -------------------------------------- |
| */ |
| .text.source-text .de1, .text.source-text .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;} |
| .text.source-text {font-family:monospace;} |
| .text.source-text .imp {font-weight: bold; color: red;} |
| .text.source-text li, .text.source-text .li1 {font-weight: normal; vertical-align:top;} |
| .text.source-text .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;} |
| .text.source-text .li2 {font-weight: bold; vertical-align:top;} |
| .text.source-text .ln-xtra, .text.source-text li.ln-xtra, .text.source-text div.ln-xtra {background-color: #ffc;} |
| .text.source-text span.xtra { display:block; } |
| |
| /*]]>*/ |
| </style><style type="text/css">/*<![CDATA[*/ |
| .source-xml {line-height: normal;} |
| .source-xml li, .source-xml pre { |
| line-height: normal; border: 0px none white; |
| } |
| /** |
| * GeSHi Dynamically Generated Stylesheet |
| * -------------------------------------- |
| * Dynamically generated stylesheet for xml |
| * CSS class: source-xml, CSS id: |
| * GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann |
| * (http://qbnz.com/highlighter/ and http://geshi.org/) |
| * -------------------------------------- |
| */ |
| .xml.source-xml .de1, .xml.source-xml .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;} |
| .xml.source-xml {font-family:monospace;} |
| .xml.source-xml .imp {font-weight: bold; color: red;} |
| .xml.source-xml li, .xml.source-xml .li1 {font-weight: normal; vertical-align:top;} |
| .xml.source-xml .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;} |
| .xml.source-xml .li2 {font-weight: bold; vertical-align:top;} |
| .xml.source-xml .es0 {color: #000099; font-weight: bold;} |
| .xml.source-xml .br0 {color: #66cc66;} |
| .xml.source-xml .sy0 {color: #66cc66;} |
| .xml.source-xml .st0 {color: #ff0000;} |
| .xml.source-xml .nu0 {color: #cc66cc;} |
| .xml.source-xml .sc-1 {color: #808080; font-style: italic;} |
| .xml.source-xml .sc0 {color: #00bbdd;} |
| .xml.source-xml .sc1 {color: #ddbb00;} |
| .xml.source-xml .sc2 {color: #339933;} |
| .xml.source-xml .sc3 {color: #009900;} |
| .xml.source-xml .re0 {color: #000066;} |
| .xml.source-xml .re1 {color: #000000; font-weight: bold;} |
| .xml.source-xml .re2 {color: #000000; font-weight: bold;} |
| .xml.source-xml .ln-xtra, .xml.source-xml li.ln-xtra, .xml.source-xml div.ln-xtra {background-color: #ffc;} |
| .xml.source-xml span.xtra { display:block; } |
| |
| /*]]>*/ |
| </style><style type="text/css">/*<![CDATA[*/ |
| .source-java {line-height: normal;} |
| .source-java li, .source-java pre { |
| line-height: normal; border: 0px none white; |
| } |
| /** |
| * GeSHi Dynamically Generated Stylesheet |
| * -------------------------------------- |
| * Dynamically generated stylesheet for java |
| * CSS class: source-java, CSS id: |
| * GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann |
| * (http://qbnz.com/highlighter/ and http://geshi.org/) |
| * -------------------------------------- |
| */ |
| .java.source-java .de1, .java.source-java .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;} |
| .java.source-java {font-family:monospace;} |
| .java.source-java .imp {font-weight: bold; color: red;} |
| .java.source-java li, .java.source-java .li1 {font-weight: normal; vertical-align:top;} |
| .java.source-java .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;} |
| .java.source-java .li2 {font-weight: bold; vertical-align:top;} |
| .java.source-java .kw1 {color: #7F0055; font-weight: bold;} |
| .java.source-java .kw2 {color: #7F0055; font-weight: bold;} |
| .java.source-java .kw3 {color: #000000; font-weight: normal} |
| .java.source-java .kw4 {color: #7F0055; font-weight: bold;} |
| .java.source-java .co1 {color: #3F7F5F; font-style: italic;} |
| .java.source-java .co2 {color: #3F7F5F;} |
| .java.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;} |
| .java.source-java .coMULTI {color: #3F5FBF; font-style: italic;} |
| .java.source-java .es0 {color: #000000;} |
| .java.source-java .br0 {color: #000000;} |
| .java.source-java .sy0 {color: #000000;} |
| .java.source-java .st0 {color: #2A00ff;} |
| .java.source-java .nu0 {color: #000000;} |
| .java.source-java .me1 {color: #000000;} |
| .java.source-java .me2 {color: #000000;} |
| .java.source-java .ln-xtra, .java.source-java li.ln-xtra, .java.source-java div.ln-xtra {background-color: #ffc;} |
| .java.source-java span.xtra { display:block; } |
| |
| /*]]>*/ |
| </style><meta name="viewport" content="width=device-width, initial-scale=1.0"></head> |
| <body class="mediawiki ltr sitedir-ltr ns-0 ns-subject page-SMILA_Documentation_HowTo_How_to_implement_a_crawler skin-solstice action-view" id="solstice"> |
| <a class="sr-only" href="How_to_implement_a_crawler.html#content">Skip to main content</a> |
| <div class="thin-header"> |
| <header role="banner" class="hidden-print noprint"> |
| <div class="container-fluid"> |
| <div id="row-logo-search"> |
| <div id="header-left"> |
| <div class="row"> |
| <div class="hidden-xs col-sm-6 logo-container"> |
| <a href="https://www.eclipse.org/" ><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia"></a> |
| </div> |
| <div class="navbar col-sm-18 yamm" id="main-menu"> |
| <div class="navbar-collapse collapse" id="navbar-collapse-1"> |
| <ul class="nav navbar-nav"> |
| <li><a target="_self" href="https://eclipse.org/downloads/">Download</a></li> |
| <li><a target="_self" href="https://eclipse.org/users/">Getting Started </a></li> |
| <li><a target="_self" href="https://eclipse.org/membership/">Members</a></li> |
| <li><a target="_self" href="https://eclipse.org/projects/">Projects</a></li> |
| <li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="How_to_implement_a_crawler.html#">Community <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="How_to_implement_a_crawler.html#">Participate <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="How_to_implement_a_crawler.html#">Working Groups <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul></li><!-- More --> |
| <li class="dropdown hidden-xs"><a class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li> |
| <!-- Content container to add padding --> |
| <div class="yamm-content"> |
| <div class="row"> |
| <ul class="col-sm-8 list-unstyled"><li><p><strong>Community</strong></p></li><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Participate</strong></p></li><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Working Groups</strong></p></li><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul> </div> |
| </div> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| <div class="navbar-header"> |
| <button data-target="#navbar-collapse-1" data-toggle="collapse" class="navbar-toggle" type="button"> |
| <span class="sr-only">Toggle navigation</span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| </button> |
| <a href="https://www.eclipse.org/" class="visible-xs navbar-brand"><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia" width="174"></a> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| </header> |
| <section class="defaut-breadcrumbs hidden-print noprint hidden-print clearfix" id="breadcrumb"> |
| <div> |
| <ol class="breadcrumb"> |
| <li><a href="https://www.eclipse.org/">Home</a></li> |
| <li><a href="http://wiki.eclipse.org/Main_Page">Eclipse Wiki</a></li> |
| <li class="active">SMILA/Documentation/HowTo/How to implement a crawler</li></ol> |
| </div> |
| </section> |
| </div> |
| <div class="toolbar-menu breadcrumbs-offset noprint hidden-print margin-bottom-0 clearfix"> |
| <div class="col-md-24"> |
| <!-- Personal toolbar: single "Log in" entry, styled with the breadcrumb list classes. --> |
| <ol class="breadcrumb" role="navigation"> |
| <li id="pt-login"> |
| <a href="http://wiki.eclipse.org/index.php?title=Special:UserLogin&returnto=SMILA%2FDocumentation%2FHowTo%2FHow+to+implement+a+crawler"> |
| <i class="fa fa-sign-in fa-fw orange" aria-hidden="true"></i> Log in </a> |
| </li> |
| </ol> |
| </div> |
| </div> |
| <main role="main" class="background-grey"> |
| <div class="container-full padding-top-25"> |
| |
| |
| |
| <!-- content --> |
| <section id="content" class="mw-body container-full clearfix 0"> |
| <div id="mw-js-message" style="display:none;"></div> |
| |
| |
| <!-- bodyContent --> |
| <div id="bodyContent"> |
| |
| |
| <!-- jumpto --> |
| <div id="jump-to-nav" class="mw-jump"> |
| Jump to: <a href="How_to_implement_a_crawler.html#mw-head">navigation</a>, |
| <a href="How_to_implement_a_crawler.html#p-search">search</a> |
| </div> |
| <!-- /jumpto --> |
| |
| <!-- leftcol --> |
| <aside class="col-md-4 noprint hidden-print" id="leftcol"> |
| <!-- Site search: sends the "search" query parameter to the wiki's index.php via GET. --> |
| <form class="input-group" role="search" id="form-eclipse-search" action="http://wiki.eclipse.org/index.php" method="get"> |
| <!-- The placeholder is not a label; aria-label provides the accessible name. --> |
| <input id="searchInput" class="search-query form-control" type="search" accesskey="f" title="Special:Search" placeholder="Search" aria-label="Search" name="search" value=""> |
| <span class="input-group-btn"> |
| <!-- Icon-only submit button: aria-label names it, the icon itself is decorative. --> |
| <button value="search" id="mw-searchButton" type="submit" class="btn btn-default" title="Search the pages for this text" aria-label="Search" name="fulltext"> |
| <i class="fa fa-search" aria-hidden="true"></i> |
| </button> |
| </span> |
| </form> |
| <select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Navigation---</span></option><option value="/Main_Page">Main Page</option><option value="/Eclipsepedia:Community_portal">Community portal</option><option value="/Eclipsepedia:Current_events">Current events</option><option value="/Special:RecentChanges">Recent changes</option><option value="/Special:Random">Random page</option><option value="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents">Help</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Navigation</span></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Main_Page" id="n-mainpage" title="Visit the main page [z]" accesskey="z">Main Page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Community_portal" id="n-portal" title="About the project, what you can do, where to find things">Community portal</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Current_events" id="n-currentevents" title="Find background information on current events">Current events</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChanges" id="n-recentchanges" title="A list of recent changes in the wiki [r]" accesskey="r">Recent changes</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:Random" id="n-randompage" title="Load a random page [x]" accesskey="x">Random page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents" id="n-help" title="The place to find out">Help</a></li></ul> <select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Toolbox---</span></option><option value="/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&action=info">Page information</option><option value="/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&oldid=286129">Permanent link</option><option value="/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&printable=yes">Printable version</option><option value="/Special:SpecialPages">Special pages</option><option value="/Special:RecentChangesLinked/SMILA/Documentation/HowTo/How_to_implement_a_crawler">Related changes</option><option value="/Special:WhatLinksHere/SMILA/Documentation/HowTo/How_to_implement_a_crawler">What links here</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Toolbox</span></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&action=info" id="t-info">Page information</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&oldid=286129" id="t-permalink" title="Permanent link to this revision of the page">Permanent link</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&printable=yes" id="t-print" rel="alternate" title="Printable version of this page [p]" accesskey="p">Printable version</a></li><li><i class="fa 
fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:SpecialPages" id="t-specialpages" title="A list of all special pages [q]" accesskey="q">Special pages</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChangesLinked/SMILA/Documentation/HowTo/How_to_implement_a_crawler" id="t-recentchangeslinked" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:WhatLinksHere/SMILA/Documentation/HowTo/How_to_implement_a_crawler" id="t-whatlinkshere" title="A list of all wiki pages that link here [j]" accesskey="j">What links here</a></li></ul> </aside> |
| <!-- /leftcol --> |
| |
| <!-- mainContent --> |
| <div id="mainContent" class="col-md-20"> |
| <ul class="nav nav-tabs noprint hidden-print" role="tablist"> |
| <li id="ca-nstab-main" class="selected"><a href="../Documentation/HowTo/How_to_implement_a_crawler.html" title="View the content page [c]" accesskey="c" tabindex="-1">Page</a></li> |
| <li id="ca-talk"><a href="http://wiki.eclipse.org/Talk:SMILA/Documentation/HowTo/How_to_implement_a_crawler" title="Discussion about the content page [t]" accesskey="t" tabindex="-1">Discussion</a></li> |
| <li id="ca-viewsource"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&action=edit" title="This page is protected. You can view its source [e]" accesskey="e" tabindex="-1">View source</a></li> |
| <li id="ca-history" class="collapsible"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&action=history" title="Past revisions of this page [h]" accesskey="h" tabindex="-1">History</a></li> |
| </ul> <div class="tab-content background-white"> |
| <div id="tab-pane-main-page-content" class="tab-pane active"> |
| |
| |
| <h1 id="firstHeading" class="firstHeading page-header"> |
| <span dir="auto">SMILA/Documentation/HowTo/How to implement a crawler</span> |
| </h1> |
| <div id="main-page-content"> |
| <!-- subtitle --> |
| <div id="contentSub" class="alert alert-small alert-warning"><span class="subpages">< <a href="../../SMILA.html" title="SMILA">SMILA</a>‎ | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a>‎ | <a href="../Documentation/HowTo.html" title="SMILA/Documentation/HowTo">HowTo</a></span>(Redirected from <a href="http://wiki.eclipse.org/index.php?title=SMILA/Development_Guidelines/How_to_implement_a_crawler&redirect=no" title="SMILA/Development Guidelines/How to implement a crawler">SMILA/Development Guidelines/How to implement a crawler</a>)</div> |
| <!-- /subtitle --> |
| |
| |
| <div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr"><div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;"> |
| <div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/File:Note.png" class="image"><img alt="Note.png" src="http://wiki.eclipse.org/images/c/cc/Note.png" width="35" height="35" /></a></div> |
| <div><b>This is deprecated for SMILA 1.0. The connectivity framework is still functional but is intended to be replaced by scalable import based on SMILA's job management.</b><br /></div> |
| </div> |
| <p><br /> |
| Explains how to implement an <a href="../Glossary.html#C" title="SMILA/Glossary">Crawler</a> and <a href="../Howto_integrate_a_component_in_SMILA.html" title="SMILA/Howto integrate a component in SMILA" class="mw-redirect">add its functionality</a> to SMILA. |
| </p> |
| <div id="toc" class="toc"><div id="toctitle"><h2>Contents</h2></div> |
| <ul> |
| <li class="toclevel-1 tocsection-1"><a href="How_to_implement_a_crawler.html#Prepare_bundle_and_manifest"><span class="tocnumber">1</span> <span class="toctext">Prepare bundle and manifest</span></a></li> |
| <li class="toclevel-1 tocsection-2"><a href="How_to_implement_a_crawler.html#Prepare_DataSourceConnect_schema_and_classes"><span class="tocnumber">2</span> <span class="toctext">Prepare DataSourceConnect schema and classes</span></a></li> |
| <li class="toclevel-1 tocsection-3"><a href="How_to_implement_a_crawler.html#OSGi_and_Declarative_Service_requirements"><span class="tocnumber">3</span> <span class="toctext">OSGi and Declarative Service requirements</span></a></li> |
| <li class="toclevel-1 tocsection-4"><a href="How_to_implement_a_crawler.html#Implement_your_crwler"><span class="tocnumber">4</span> <span class="toctext">Implement your crawler</span></a></li> |
| <li class="toclevel-1 tocsection-5"><a href="How_to_implement_a_crawler.html#Activate_your_crawler"><span class="tocnumber">5</span> <span class="toctext">Activate your crawler</span></a> |
| <ul> |
| <li class="toclevel-2 tocsection-6"><a href="How_to_implement_a_crawler.html#Activation_SMILA_in_eclipse"><span class="tocnumber">5.1</span> <span class="toctext">Activation SMILA in eclipse</span></a></li> |
| <li class="toclevel-2 tocsection-7"><a href="How_to_implement_a_crawler.html#Activation_SMILA_application"><span class="tocnumber">5.2</span> <span class="toctext">Activation SMILA application</span></a></li> |
| </ul> |
| </li> |
| <li class="toclevel-1 tocsection-8"><a href="How_to_implement_a_crawler.html#Run_your_crawler"><span class="tocnumber">6</span> <span class="toctext">Run your crawler</span></a></li> |
| </ul> |
| </div> |
| |
| <h2><span class="mw-headline" id="Prepare_bundle_and_manifest">Prepare bundle and manifest</span></h2> |
| <ul> |
| <li>Create a new bundle that will contain your crawler. Follow the instructions on <a href="Create_a_bundle_(plug-in).html" title="SMILA/Development Guidelines/Create a bundle (plug-in)" class="mw-redirect">How to create a bundle</a>. In this sample we use the prefix <tt>myplugin.crawler.mock</tt> for the name of project. |
| </li> |
| <li>For the crawler's JAXB code generation, we need to import the SMILA.builder project into our workspace. |
| </li> |
| </ul> |
| <ul> |
| <li>Edit the manifest file and add at least the following packages to the <i>Import-Package</i> section. |
| <ul> |
| <li><tt>org.eclipse.smila.connectivity;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.connectivity.framework;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.connectivity.framework.performancecounters;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.connectivity.framework.schema;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.connectivity.framework.schema.config;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.connectivity.framework.schema.config.interfaces;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.connectivity.framework.util;version="1.0.0"</tt> |
| </li> |
| <li><tt>org.eclipse.smila.datamodel;version="1.0.0"</tt> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <ul> |
| <li>You will have to add additional packages to fill your crawler with business logic! |
| </li> |
| </ul> |
| <ul> |
| <li>Now your MANIFEST.MF file should look like this: |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="text source-text"><pre class="de1">Manifest-Version: 1.0 |
| Bundle-ManifestVersion: 2 |
| Bundle-Name: Mock Crawler |
| Bundle-SymbolicName: myplugin.crawler.mock |
| Bundle-Version: 1.0.0 |
| Bundle-RequiredExecutionEnvironment: JavaSE-1.6 |
| Import-Package: |
| org.eclipse.smila.connectivity;version="1.0.0", |
| org.eclipse.smila.connectivity.framework;version="1.0.0", |
| org.eclipse.smila.connectivity.framework.performancecounters;version="1.0.0", |
| org.eclipse.smila.connectivity.framework.schema;version="1.0.0", |
| org.eclipse.smila.connectivity.framework.schema.config;version="1.0.0", |
| org.eclipse.smila.connectivity.framework.schema.config.interfaces;version="1.0.0", |
| org.eclipse.smila.connectivity.framework.util;version="1.0.0", |
| org.eclipse.smila.datamodel;version="1.0.0"</pre></div></div> |
| <h2><span class="mw-headline" id="Prepare_DataSourceConnect_schema_and_classes">Prepare DataSourceConnect schema and classes</span></h2> |
| <ul> |
| <li>create an additional source folder <tt>code/gen</tt> to contain the generated schema sources |
| <ul> |
| <li>Right-click your bundle and click <i>New > Source Folder</i>. |
| </li> |
| <li>Enter "code/gen" as the folder name. |
| </li> |
| <li>edit build.properties and add folder <tt>code/gen</tt> to the source folders. |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="text source-text"><pre class="de1">source.. = code/src/,\ |
| code/gen/ |
| output.. = code/bin/</pre></div></div> |
| <p><br /> |
| </p> |
| <ul> |
| <li>create schema definition |
| <ul> |
| <li>create a folder <tt>schemas</tt> in your bundle |
| </li> |
| <li>create file <tt>schemas\MockCrawlerSchema.xsd</tt> to contain the XSD schema for the crawler configuration based on the abstract XSD schema "RootDataSourceConnectionConfigSchema" |
| </li> |
| <li>therein you have to provide definitions of "Process" and "Attribute" nodes for crawler specific information |
| </li> |
| <li>the following code snippet can be used as a template |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1"><?xml</span> <span class="re0">version</span>=<span class="st0">"1.0"</span> <span class="re0">encoding</span>=<span class="st0">"UTF-8"</span><span class="re2">?></span></span> |
| <span class="sc3"><span class="re1"><xs:schema</span> <span class="re0">elementFormDefault</span>=<span class="st0">"qualified"</span> <span class="re0">attributeFormDefault</span>=<span class="st0">"unqualified"</span> <span class="re0">xmlns:xs</span>=<span class="st0">"http://www.w3.org/2001/XMLSchema"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><xs:redefine</span> <span class="re0">schemaLocation</span>=<span class="st0">"../../org.eclipse.smila.connectivity.framework.schema/schemas/RootDataSourceConnectionConfigSchema.xsd"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><xs:complexType</span> <span class="re0">name</span>=<span class="st0">"Process"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><xs:annotation<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><xs:documentation<span class="re2">></span></span></span>Process Specification<span class="sc3"><span class="re1"></xs:documentation<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:annotation<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><xs:complexContent<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><xs:extension</span> <span class="re0">base</span>=<span class="st0">"Process"</span><span class="re2">></span></span> |
|   |
| <span class="sc3">&lt;!--define crawler specific process here --<span class="re2">></span></span> |
|   |
| <span class="sc3"><span class="re1"></xs:extension<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:complexContent<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:complexType<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><xs:complexType</span> <span class="re0">name</span>=<span class="st0">"Attribute"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><xs:complexContent<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><xs:extension</span> <span class="re0">base</span>=<span class="st0">"Attribute"</span><span class="re2">></span></span> |
|   |
| <span class="sc3">&lt;!--define crawler specific attributes here --<span class="re2">></span></span> |
|   |
| <span class="sc3"><span class="re1"></xs:extension<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:complexContent<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:complexType<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:redefine<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></xs:schema<span class="re2">></span></span></span></pre></div></div> |
| <ul> |
| <li>create JAXB mapping |
| <ul> |
| <li>create file <tt>schemas\MockCrawlerSchema.jxb</tt> to contain the JAXB mappings used for generating configuration classes. |
| </li> |
| <li>Here is an example for the <tt>MockCrawler</tt> JXB file you can use as a template, just rename the "schemaLocation" and "package name": |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1"><jxb:bindings</span> <span class="re0">version</span>=<span class="st0">"1.0"</span> </span> |
| <span class="sc3"> <span class="re0">xmlns:jxb</span>=<span class="st0">"http://java.sun.com/xml/ns/jaxb"</span> </span> |
| <span class="sc3"> <span class="re0">xmlns:xs</span>=<span class="st0">"http://www.w3.org/2001/XMLSchema"</span> </span> |
| <span class="sc3"><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><jxb:bindings</span> <span class="re0">schemaLocation</span>=<span class="st0">"MockCrawlerSchema.xsd"</span> <span class="re0">node</span>=<span class="st0">"/xs:schema"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><jxb:schemaBindings<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><jxb:package</span> <span class="re0">name</span>=<span class="st0">"mypackage.crawler.mock.messages"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"></jxb:schemaBindings<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><jxb:globalBindings<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><jxb:javaType</span> <span class="re0">name</span>=<span class="st0">"java.util.Date"</span> <span class="re0">xmlType</span>=<span class="st0">"xs:dateTime"</span> <span class="re0">printMethod</span>=<span class="st0">"org.eclipse.smila.connectivity.framework.schema.tools.SimpleDateFormatter.print"</span> <span class="re0">parseMethod</span>=<span class="st0">"org.eclipse.smila.connectivity.framework.schema.tools.SimpleDateFormatter.parse"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"><jxb:javaType</span> <span class="re0">name</span>=<span class="st0">"org.eclipse.smila.connectivity.framework.schema.config.MimeTypeAttributeType"</span> <span class="re0">xmlType</span>=<span class="st0">"MimeTypeAttributeType"</span> <span class="re0">parseMethod</span>=<span class="st0">"org.eclipse.smila.connectivity.framework.schema.config.MimeTypeAttributeType.fromValue"</span> <span class="re0">printMethod</span>=<span class="st0">"org.eclipse.smila.connectivity.framework.schema.config.MimeTypeAttributeType.toValue"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"><jxb:serializable</span> <span class="re0">uid</span>=<span class="st0">"1"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"></jxb:globalBindings<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></jxb:bindings<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></jxb:bindings<span class="re2">></span></span></span></pre></div></div> |
| <p><br /> |
| </p> |
| <ul> |
| <li>Add a schema location reference in the plug-in implementation |
| <ul> |
| <li>Create a new class (<tt>DataSourceConnectionConfigPluginImpl</tt>) which implements the interface <tt>DataSourceConnectionConfigPlugin</tt>. |
| </li> |
| <li>Use the method <tt>String getSchemaLocation()</tt> to return "schemas/MockCrawlerSchema.xsd". |
| </li> |
| <li>Use the method <tt>String getMessagesPackage()</tt> to return the package name "mypackage.crawler.mock.messages". |
| </li> |
| </ul> |
| </li> |
| </ul> |
| Here is an example implementation for the <tt>MockCrawler</tt> you can use as a template: <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="kw1">package</span> <span class="co2">mypackage.crawler.mock</span><span class="sy0">;</span> |
|   |
| <span class="kw1">import</span> <span class="co2">org.eclipse.smila.connectivity.framework.schema.DataSourceConnectionConfigPlugin</span><span class="sy0">;</span> |
|   |
| <span class="co3">/** |
| * The Class DataSourceConnectionConfigPluginImpl. |
| */</span> |
| <span class="kw1">public</span> <span class="kw1">class</span> DataSourceConnectionConfigPluginImpl <span class="kw1">implements</span> DataSourceConnectionConfigPlugin <span class="br0">{</span> |
|   |
| <span class="co3">/** |
| * {@inheritDoc} |
| * |
| * @see org.eclipse.smila.connectivity.framework.schema.DataSourceConnectionConfigPlugin#getSchemaLocation() |
| */</span> |
| <span class="kw1">public</span> <span class="kw3">String</span> getSchemaLocation<span class="br0">(</span><span class="br0">)</span> <span class="br0">{</span> |
| <span class="kw1">return</span> <span class="st0">"schemas/MockCrawlerSchema.xsd"</span><span class="sy0">;</span> |
| <span class="br0">}</span> |
|   |
| <span class="co3">/** |
| * {@inheritDoc} |
| * |
| * @see org.eclipse.smila.connectivity.framework.schema.DataSourceConnectionConfigPlugin#getMessagesPackage() |
| */</span> |
| <span class="kw1">public</span> <span class="kw3">String</span> getMessagesPackage<span class="br0">(</span><span class="br0">)</span> <span class="br0">{</span> |
| <span class="kw1">return</span> <span class="st0">"mypackage.crawler.mock.messages"</span><span class="sy0">;</span> |
| <span class="br0">}</span> |
|   |
| <span class="br0">}</span></pre></div></div> |
| <ul> |
| <li>create new file <tt>plugin.xml</tt> |
| <ul> |
| <li>define the extension for <tt>org.eclipse.smila.connectivity.framework.schema.extension</tt>, using the bundle name as ID and NAME. |
| </li> |
| <li>set the schema class to your implementation of interface <tt>DataSourceConnectionConfigPlugin</tt> |
| </li> |
| <li>Here is an example for the <tt>MockCrawler</tt> <tt>plugin.xml</tt> file you can use as a template: |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="sy0"><</span>plugin<span class="sy0">></span> |
| <span class="sy0"><</span>extension |
| id<span class="sy0">=</span><span class="st0">"myplugin.crawler.mock"</span> |
| name<span class="sy0">=</span><span class="st0">"myplugin.crawler.mock"</span> |
| point<span class="sy0">=</span><span class="st0">"org.eclipse.smila.connectivity.framework.schema.extension"</span><span class="sy0">></span> |
| <span class="sy0"><</span>schema |
| <span class="kw1">class</span><span class="sy0">=</span><span class="st0">"mypackage.crawler.mock.DataSourceConnectionConfigPluginImpl"</span><span class="sy0">></span> |
| <span class="sy0"></</span>schema<span class="sy0">></span> |
| <span class="sy0"></</span>extension<span class="sy0">></span> |
| <span class="sy0"></</span>plugin<span class="sy0">></span></pre></div></div> |
| <p><br /> |
| </p> |
| <ul> |
| <li>Compile schema into JAXB classes by using <tt>ant</tt> |
| <ul> |
| <li>See <a href="Setup_for_JAXB_code_generation.html" title="SMILA/Development Guidelines/Setup for JAXB code generation">SMILA/Development Guidelines/Setup for JAXB code generation</a> for instructions on how to set up the JAXB generation tools. It is advisable to keep the <tt>lib</tt> folder outside the workspace, for example in a lower level folder (e.g. <tt>-Dlib.dir=../../</tt>). |
| </li> |
| <li>create a new file <tt>build.xml</tt> to contain JXB build information. Use the following template as the content for file <tt>build.xml</tt> and rename the property value accordingly: |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1"><project</span> <span class="re0">name</span>=<span class="st0">"sub-build"</span> <span class="re0">default</span>=<span class="st0">"compile-schema-and-decorate"</span> <span class="re0">basedir</span>=<span class="st0">"."</span><span class="re2">></span></span> |
|   |
| <span class="sc3"><span class="re1"><property</span> <span class="re0">name</span>=<span class="st0">"schema.name"</span> <span class="re0">value</span>=<span class="st0">"MockCrawlerSchema"</span> <span class="re2">/></span></span> |
|   |
| <span class="sc3"><span class="re1"><import</span> <span class="re0">file</span>=<span class="st0">"../SMILA.builder/xjc/build.xml"</span> <span class="re2">/></span></span> |
|   |
| <span class="sc3"><span class="re1"></project<span class="re2">></span></span></span></pre></div></div> |
| <ul> |
| <li><ul> |
| <li>Launch <tt>ant -Dlib.dir=../lib</tt> from a cmd console to create the java files or to see any error messages. |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <p><br /> <b>Note:</b> If you rename the schema file name, make sure to update the following locations: |
| </p> |
| <ul> |
| <li>Plug-in implementation classes |
| </li> |
| <li><tt>MockCrawlerSchema.jxb</tt> (this file should also be renamed so that it has the same name as the schema) |
| </li> |
| <li><tt>build.xml</tt> |
| </li> |
| </ul> |
| <h2><span class="mw-headline" id="OSGi_and_Declarative_Service_requirements">OSGi and Declarative Service requirements</span></h2> |
| <ul> |
| <li>It is not required to implement a BundleActivator. |
| </li> |
| <li>Create the top level folder <tt>OSGI-INF</tt>. |
| </li> |
| <li>Create a Component Description file in <tt>OSGI-INF</tt>. You can name the file as you like, but it is good practice to name it like the crawler. Therein you have to provide a unique component name, it should be the same as the crawler's class name. Then you have to provide your implementation class and the service interface class, which is always <tt>org.eclipse.smila.connectivity.framework.Crawler</tt>. Here is an example for the <tt>MockCrawler</tt> component description file you can use as a template: |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1"><span class="sc3"><span class="re1"><component</span> <span class="re0">name</span>=<span class="st0">"MockCrawler"</span> <span class="re0">immediate</span>=<span class="st0">"false"</span> <span class="re0">factory</span>=<span class="st0">"CrawlerFactory"</span><span class="re2">></span></span> |
| <span class="sc3"><span class="re1"><implementation</span> <span class="re0">class</span>=<span class="st0">"mypackage.crawler.mock.MockCrawler"</span> <span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"><service<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"><provide</span> <span class="re0">interface</span>=<span class="st0">"org.eclipse.smila.connectivity.framework.Crawler"</span><span class="re2">/></span></span> |
| <span class="sc3"><span class="re1"></service<span class="re2">></span></span></span> |
| <span class="sc3"><span class="re1"></component<span class="re2">></span></span></span></pre></div></div> |
| <ul> |
| <li>Add a <i>Service-Component</i> entry to your manifest file, e.g.: |
| </li> |
| </ul> |
| <pre>Service-Component: OSGI-INF/mockcrawler.xml |
| </pre> |
| <ul> |
| <li>Open <tt>build.properties</tt> and change the binary build: Add the folders <tt>OSGI-INF</tt> and <tt>schemas</tt> as well as the file <tt>plugin.xml</tt>. |
| </li> |
| </ul> |
| <div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="xml source-xml"><pre class="de1">bin.includes = META-INF/,\ |
| .,\ |
| plugin.xml,\ |
| schemas/,\ |
| OSGI-INF/</pre></div></div> |
| <p><br /> |
| </p> |
| <h2><span class="mw-headline" id="Implement_your_crwler">Implement your crawler</span></h2> |
| <ul> |
| <li>Implement your crawler in a new class extending <tt>org.eclipse.smila.connectivity.framework.AbstractCrawler</tt>. |
| </li> |
| </ul> |
| <ul> |
| <li>Integrate your new crawler bundle into the build process: Refer to the page <a href="How_to_integrate_new_bundle_into_build_process.html" title="SMILA/Development Guidelines/How to integrate new bundle into build process" class="mw-redirect">How to integrate new bundle into build process</a> for further instructions. |
| </li> |
| </ul> |
| <ul> |
| <li> Follow the example of FileSystemCrawler |
| </li> |
| </ul> |
| <p>[optional] |
| </p> |
| <ul> |
| <li>Create a JUnit test bundle for this crawler e.g. <tt>myplugin.crawler.mock.test</tt>. |
| </li> |
| <li>Integrate your test bundle into the build process: Refer to the page <a href="How_to_integrate_test_bundle_into_build_process.html" title="SMILA/Development Guidelines/How to integrate test bundle into build process" class="mw-redirect">How to integrate test bundle into build process</a> for further instructions. |
| </li> |
| </ul> |
| <h2><span class="mw-headline" id="Activate_your_crawler">Activate your crawler</span></h2> |
| <h3><span class="mw-headline" id="Activation_SMILA_in_eclipse">Activation SMILA in eclipse</span></h3> |
| <ul> |
| <li>Open the <i>Run</i> dialog, switch to the configuration page of <i>Bundles</i>, select your bundle and set the parameter <i>Default Auto-Start</i> to <i>true</i>. |
| </li> |
| <li>Launch <tt>SMILA.launch</tt>. |
| </li> |
| </ul> |
| <h3><span class="mw-headline" id="Activation_SMILA_application">Activation SMILA application</span></h3> |
| <ul> |
| <li>Add your bundle, e.g. <tt>myplugin.crawler.mock@4:start</tt>, to the <tt>config.ini</tt> file. |
| </li> |
| <li>Launch SMILA by calling either <tt>SMILA.exe</tt> or <tt>eclipse.exe -console</tt> |
| </li> |
| </ul> |
| <h2><span class="mw-headline" id="Run_your_crawler">Run your crawler</span></h2> |
| <p>Information on how to start and run a crawler can be found in the <a href="../Documentation/CrawlerController.html" title="SMILA/Documentation/CrawlerController">CrawlerController</a> documentation. |
| </p> |
| <!-- |
| NewPP limit report |
| CPU time usage: 0.220 seconds |
| Real time usage: 0.231 seconds |
| Preprocessor visited node count: 156/1000000 |
| Preprocessor generated node count: 444/1000000 |
| Post‐expand include size: 1047/2097152 bytes |
| Template argument size: 515/2097152 bytes |
| Highest expansion depth: 5/40 |
| Expensive parser function count: 0/100 |
| --> |
| |
| <!-- Saved in parser cache with key my_wiki:pcache:idhash:15203-0!*!0!!en!2!* and timestamp 20150414084631 and revision id 286129 |
| --> |
| </div> |
| <!-- catlinks --> |
| <div id='catlinks' class='catlinks'><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <ul><li><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></li></ul></div></div> <!-- /catlinks --> |
| </div> |
| </div> |
| </div> |
| </div> |
| <!-- /maincontent --> |
| |
| <!-- printfooter --> |
| <div class="printfooter"> |
| Retrieved from "<a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&oldid=286129">http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&oldid=286129</a>" </div> |
| <!-- /printfooter --> |
| |
| |
| <!-- debughtml --> |
| <!-- /debughtml --> |
| |
| </div> |
| <!-- /bodyContent --> |
| </section> |
| <!-- /content --> |
| |
| <!-- footer --> |
| |
| </div> <section id="footer-contribution-info" style="border-top:1px solid #ccc;" class="footer-offset background-white margin-top-25"><div class="container text-center padding-top-10 padding-bottom-10"><p id="footercredit">This page was last modified 09:29, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&action=edit&redlink=1" class="new" title="User:Juergen.schumacher.attensity.com (page does not exist)">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Andreas.schank.attensity.com" title="User:Andreas.schank.attensity.com">A. Schank</a>, <a href="http://wiki.eclipse.org/index.php?title=User:Leccher.gmail.com&action=edit&redlink=1" class="new" title="User:Leccher.gmail.com (page does not exist)">Lorenzo </a> and <a href="http://wiki.eclipse.org/index.php?title=User:Daniel.stucky.attensity.com&action=edit&redlink=1" class="new" title="User:Daniel.stucky.attensity.com (page does not exist)">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/HowTo/How_to_implement_a_crawler&action=credits" title="SMILA/Documentation/HowTo/How to implement a crawler">others</a>.</p><p id="footerviews">This page has been accessed 10,665 times.</p></div></section> </main> <!-- /#main-content-container-row --> |
| <p id="back-to-top" class="noprint hidden-print"> |
| <a class="visible-xs" href="How_to_implement_a_crawler.html#top">Back to the top</a> |
| </p> |
| <footer role="contentinfo" class="noprint hidden-print"> |
| <div class="container"> |
| <div class="row"> |
| <section id="footer-eclipse-foundation" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0"> |
| <h2 class="section-title">Eclipse Foundation</h2> |
| <ul class="nav"> |
| <li><a href="https://eclipse.org/org/">About us</a></li> |
| <li><a href="https://eclipse.org/org/foundation/contact.php">Contact Us</a></li> |
| <li><a href="https://eclipse.org/donate">Donate</a></li> |
| <li><a href="https://eclipse.org/org/documents/">Governance</a></li> |
| <li><a href="https://eclipse.org/artwork/">Logo and Artwork</a></li> |
| <li><a href="https://eclipse.org/org/foundation/directors.php">Board of Directors</a></li> |
| </ul> |
| </section> |
| <section id="footer-legal" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0"> |
| <h2 class="section-title">Legal</h2> |
| <ul class="nav"> |
| <li><a href="https://eclipse.org/legal/privacy.php">Privacy Policy</a></li> |
| <li><a href="https://eclipse.org/legal/termsofuse.php">Terms of Use</a></li> |
| <li><a href="https://eclipse.org/legal/copyright.php">Copyright Agent</a></li> |
| <li><a href="https://eclipse.org/org/documents/epl-v10.php">Eclipse Public License </a></li> |
| <li><a href="https://eclipse.org/legal/">Legal Resources </a></li> |
| |
| </ul> |
| </section> |
| |
| <section id="footer-useful-links" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0"> |
| <h2 class="section-title">Useful Links</h2> |
| <ul class="nav"> |
| <li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li> |
| <li><a href="http://help.eclipse.org/">Documentation</a></li> |
| <li><a href="https://eclipse.org/contribute/">How to Contribute</a></li> |
| <li><a href="https://eclipse.org/mail/">Mailing Lists</a></li> |
| <li><a href="https://eclipse.org/forums/">Forums</a></li> |
| <li><a href="http://marketplace.eclipse.org/">Marketplace</a></li> |
| </ul> |
| </section> |
| |
| <section id="footer-other" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0"> |
| |
| <h2 class="section-title">Other</h2> |
| <ul class="nav"> |
| <li><a href="https://eclipse.org/ide/">IDE and Tools</a></li> |
| <li><a href="https://eclipse.org/projects">Community of Projects</a></li> |
| <li><a href="https://eclipse.org/org/workinggroups/">Working Groups</a></li> |
| </ul> |
| |
| <ul class="list-inline social-media"> |
| <li><a href="https://twitter.com/EclipseFdn"><i class="fa fa-twitter-square"></i></a></li> |
| <li><a href="https://plus.google.com/+Eclipse"><i class="fa fa-google-plus-square"></i></a></li> |
| <li><a href="https://www.facebook.com/eclipse.org"><i class="fa fa-facebook-square"></i> </a></li> |
| <li><a href="https://www.youtube.com/user/EclipseFdn"><i class="fa fa-youtube-square"></i></a></li> |
| </ul> |
| |
| </section> |
| <div id="copyright" class="col-xs-offset-1 col-sm-14 col-md-24 col-md-offset-0"> |
| <div> |
| <span><img src="http://eclipse.org/eclipse.org-common/themes/solstice/public/images/logo/eclipse-logo-bw-800x188.png" alt="Eclipse.org black and white logo" width="166" height="39" id="logo-eclipse-white"/></span> |
| <p id="copyright-text">Copyright © 2014 The Eclipse Foundation. All Rights Reserved.</p> |
| </div> |
| </div> |
| <a href="How_to_implement_a_crawler.html#" class="scrollup">Back to the top</a> |
| </div> |
| </div> |
| </footer> |
| |
| <script src="http://wiki.eclipse.org/skins/solstice/public/javascript/main.min.js"></script> |
| |
| <!-- Placed at the end of the document so the pages load faster --> |
| <script type="text/javascript"> |
| |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-910670-2']); |
| _gaq.push(['_trackPageview']); |
| |
| (function() { |
| var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; |
| ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; |
| var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); |
| })(); |
| |
| </script> <!-- /footer --> |
| <script>/*<![CDATA[*/window.jQuery && jQuery.ready();/*]]>*/</script><script>if(window.mw){ |
| mw.loader.state({"skins.solstice":"loading","site":"ready","user":"ready","user.groups":"ready"}); |
| }</script> |
| <script src="http://wiki.eclipse.org/load.php?debug=false&lang=en&modules=skins.solstice&only=scripts&skin=solstice&*"></script> |
| <script>if(window.mw){ |
| mw.loader.load(["mediawiki.action.view.postEdit","mediawiki.user","mediawiki.hidpi","mediawiki.page.ready","mediawiki.searchSuggest"],null,true); |
| }</script> |
| <script>if(window.mw){ |
| mw.config.set({"wgBackendResponseTime":452}); |
| }</script> </body> |
| </html> |