<!-- blob: b6c928ab0d2f9cf0ac654a63d843135f8ce59424 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en" dir="ltr" class="client-nojs">
<head>
<meta charset="UTF-8" />
<title>SMILA/Specifications/CrawlerAPIDiscussion09 - Eclipsepedia</title>
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="generator" content="MediaWiki 1.23.2" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/eclipse.org-common/themes/solstice/public/images/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (en)" />
<link rel="EditURI" type="application/rsd+xml" href="http://wiki.eclipse.org/api.php?action=rsd" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom feed" href="http://wiki.eclipse.org/index.php?title=Special:RecentChanges&amp;feed=atom" />
<link rel="stylesheet" href="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.ui.button&amp;only=styles&amp;skin=solstice&amp;*" />
<link rel="stylesheet" href="http://wiki.eclipse.org/skins/solstice/public/stylesheets/styles.min.css?303" media="screen, print" /><meta name="ResourceLoaderDynamicStyles" content="" />
<style>a:lang(ar),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none}
/* cache key: my_wiki:resourceloader:filter:minify-css:7:14ece53a42aa314864e5fd8c57f0d98f */</style>
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=startup&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"SMILA/Specifications/CrawlerAPIDiscussion09","wgTitle":"SMILA/Specifications/CrawlerAPIDiscussion09","wgCurRevisionId":115417,"wgRevisionId":115417,"wgArticleId":15331,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRelevantPageName":"SMILA/Specifications/CrawlerAPIDiscussion09","wgIsProbablyEditable":false,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgWikiEditorEnabledModules":{"toolbar":false,"dialogs":false,"hidesig":true,"preview":false,"previewDialog":false,"publish":false},"wgCategoryTreePageCategoryOptions":"{\"mode\":0,\"hideprefix\":20,\"showcount\":true,\"namespaces\":false}"});
}</script><script>if(window.mw){
mw.loader.implement("user.options",function($,jQuery){mw.user.options.set({"ccmeonemails":0,"cols":80,"date":"default","diffonly":0,"disablemail":0,"editfont":"default","editondblclick":0,"editsectiononrightclick":0,"enotifminoredits":0,"enotifrevealaddr":0,"enotifusertalkpages":1,"enotifwatchlistpages":1,"extendwatchlist":0,"fancysig":0,"forceeditsummary":0,"gender":"unknown","hideminor":0,"hidepatrolled":0,"imagesize":2,"math":1,"minordefault":0,"newpageshidepatrolled":0,"nickname":"","norollbackdiff":0,"numberheadings":0,"previewonfirst":0,"previewontop":1,"rcdays":7,"rclimit":50,"rows":25,"showhiddencats":0,"shownumberswatching":1,"showtoolbar":1,"skin":"solstice","stubthreshold":0,"thumbsize":2,"underline":2,"uselivepreview":0,"usenewrc":0,"watchcreations":1,"watchdefault":1,"watchdeletion":0,"watchlistdays":3,"watchlisthideanons":0,"watchlisthidebots":0,"watchlisthideliu":0,"watchlisthideminor":0,"watchlisthideown":0,"watchlisthidepatrolled":0,"watchmoves":0,"wllimit":250,
"useeditwarning":1,"prefershttps":1,"language":"en","variant-gan":"gan","variant-iu":"iu","variant-kk":"kk","variant-ku":"ku","variant-shi":"shi","variant-sr":"sr","variant-tg":"tg","variant-uz":"uz","variant-zh":"zh","searchNs0":true,"searchNs1":false,"searchNs2":false,"searchNs3":false,"searchNs4":false,"searchNs5":false,"searchNs6":false,"searchNs7":false,"searchNs8":false,"searchNs9":false,"searchNs10":false,"searchNs11":false,"searchNs12":false,"searchNs13":false,"searchNs14":false,"searchNs15":false,"variant":"en"});},{},{});mw.loader.implement("user.tokens",function($,jQuery){mw.user.tokens.set({"editToken":"+\\","patrolToken":false,"watchToken":false});},{},{});
/* cache key: my_wiki:resourceloader:filter:minify-js:7:70d74423d3fc1e1c18fa9a1ff645a84a */
}</script>
<script>if(window.mw){
mw.loader.load(["mediawiki.page.startup","mediawiki.legacy.wikibits","mediawiki.legacy.ajax"]);
}</script>
<style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal;}
.source-java li, .source-java pre {
line-height: normal; border: 0px none white;
}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie, 2007 - 2008 Benny Baumann
* (http://qbnz.com/highlighter/ and http://geshi.org/)
* --------------------------------------
*/
.java.source-java .de1, .java.source-java .de2 {font: normal normal 1em/1.2em monospace; margin:0; padding:0; background:none; vertical-align:top;}
.java.source-java {font-family:monospace;}
.java.source-java .imp {font-weight: bold; color: red;}
.java.source-java li, .java.source-java .li1 {font-weight: normal; vertical-align:top;}
.java.source-java .ln {width:1px;text-align:right;margin:0;padding:0 2px;vertical-align:top;}
.java.source-java .li2 {font-weight: bold; vertical-align:top;}
.java.source-java .kw1 {color: #7F0055; font-weight: bold;}
.java.source-java .kw2 {color: #7F0055; font-weight: bold;}
.java.source-java .kw3 {color: #000000; font-weight: normal}
.java.source-java .kw4 {color: #7F0055; font-weight: bold;}
.java.source-java .co1 {color: #3F7F5F; font-style: italic;}
.java.source-java .co2 {color: #3F7F5F;}
.java.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.java.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.java.source-java .es0 {color: #000000;}
.java.source-java .br0 {color: #000000;}
.java.source-java .sy0 {color: #000000;}
.java.source-java .st0 {color: #2A00ff;}
.java.source-java .nu0 {color: #000000;}
.java.source-java .me1 {color: #000000;}
.java.source-java .me2 {color: #000000;}
.java.source-java .ln-xtra, .java.source-java li.ln-xtra, .java.source-java div.ln-xtra {background-color: #ffc;}
.java.source-java span.xtra { display:block; }
/*]]>*/
</style><meta name="viewport" content="width=device-width, initial-scale=1.0"></head>
<body class="mediawiki ltr sitedir-ltr ns-0 ns-subject page-SMILA_Specifications_CrawlerAPIDiscussion09 skin-solstice action-view" id="solstice">
<a class="sr-only" href="CrawlerAPIDiscussion09.html#content">Skip to main content</a>
<div class="thin-header">
<header role="banner" class="hidden-print noprint">
<div class="container-fluid">
<div id="row-logo-search">
<div id="header-left">
<div class="row">
<div class="hidden-xs col-sm-6 logo-container">
<a href="https://www.eclipse.org/" ><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia"></a>
</div>
<div class="navbar col-sm-18 yamm" id="main-menu">
<div class="navbar-collapse collapse" id="navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a target="_self" href="https://eclipse.org/downloads/">Download</a></li>
<li><a target="_self" href="https://eclipse.org/users/">Getting Started </a></li>
<li><a target="_self" href="https://eclipse.org/membership/">Members</a></li>
<li><a target="_self" href="https://eclipse.org/projects/">Projects</a></li>
<li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="CrawlerAPIDiscussion09.html#">Community <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="CrawlerAPIDiscussion09.html#">Participate <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul></li><li class="dropdown visible-xs"><a class="dropdown-toggle" data-toggle="dropdown" href="CrawlerAPIDiscussion09.html#">Working Groups <b class="caret"></b></a><ul class="dropdown-menu"><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul></li><!-- More -->
<li class="dropdown hidden-xs"><a class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
<ul class="dropdown-menu">
<li>
<!-- Content container to add padding -->
<div class="yamm-content">
<div class="row">
<ul class="col-sm-8 list-unstyled"><li><p><strong>Community</strong></p></li><li><a href="http://marketplace.eclipse.org">Marketplace</a></li><li><a href="http://events.eclipse.org">Events</a></li><li><a href="http://www.planeteclipse.org/">Planet Eclipse</a></li><li><a href="https://eclipse.org/community/eclipse_newsletter/">Newsletter</a></li><li><a href="https://www.youtube.com/user/EclipseFdn">Videos</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Participate</strong></p></li><li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li><li><a href="https://eclipse.org/forums/">Forums</a></li><li><a href="https://eclipse.org/mail/">Mailing Lists</a></li><li><a href="https://wiki.eclipse.org/">Wiki</a></li><li><a href="https://wiki.eclipse.org/IRC">IRC</a></li><li><a href="https://eclipse.org/contribute/">How to Contribute</a></li></ul><ul class="col-sm-8 list-unstyled"><li><p><strong>Working Groups</strong></p></li><li><a href="http://wiki.eclipse.org/Auto_IWG">Automotive</a></li><li><a href="http://iot.eclipse.org">Internet of Things</a></li><li><a href="http://locationtech.org">LocationTech</a></li><li><a href="http://lts.eclipse.org">Long-Term Support</a></li><li><a href="http://polarsys.org">PolarSys</a></li><li><a href="http://science.eclipse.org">Science</a></li><li><a href="http://openmdm.org">OpenMDM</a></li></ul> </div>
</div>
</li>
</ul>
</li>
</ul>
</div>
<div class="navbar-header">
<button data-target="#navbar-collapse-1" data-toggle="collapse" class="navbar-toggle" type="button">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="https://www.eclipse.org/" class="visible-xs navbar-brand"><img class="logo-eclipse-default" src="http://wiki.eclipse.org/skins/solstice/public/images/logo/eclipse-800x188.png" alt="Eclipsepedia" width="174"></a>
</div>
</div>
</div>
</div>
</div>
</div>
</header>
<section class="defaut-breadcrumbs hidden-print noprint hidden-print clearfix" id="breadcrumb">
<div>
<ol class="breadcrumb">
<li><a href="https://www.eclipse.org/">Home</a></li>
<li><a href="http://wiki.eclipse.org/Main_Page">Eclipse Wiki</a></li>
<li class="active">SMILA/Specifications/CrawlerAPIDiscussion09</li></ol>
</div>
</section>
</div>
<div class="toolbar-menu breadcrumbs-offset noprint hidden-print margin-bottom-0 clearfix">
<div class="col-md-24">
<ol class="breadcrumb" role="navigation">
<li id="pt-login">
<a href="http://wiki.eclipse.org/index.php?title=Special:UserLogin&amp;returnto=SMILA%2FSpecifications%2FCrawlerAPIDiscussion09">
<i class="fa fa-sign-in fa-fw orange"></i> Log in </a>
</li>
</ol>
</div>
</div>
<main role="main" class="background-grey">
<div class="container-full padding-top-25">
<!-- content -->
<section id="content" class="mw-body container-full clearfix 0">
<div id="mw-js-message" style="display:none;"></div>
<!-- bodyContent -->
<div id="bodyContent">
<!-- jumpto -->
<div id="jump-to-nav" class="mw-jump">
Jump to: <a href="CrawlerAPIDiscussion09.html#mw-head">navigation</a>,
<a href="CrawlerAPIDiscussion09.html#p-search">search</a>
</div>
<!-- /jumpto -->
<!-- leftcol -->
<aside class="col-md-4 noprint hidden-print" id="leftcol">
<form class="input-group" role="form" id="form-eclipse-search" action="http://wiki.eclipse.org/index.php">
<input id="searchInput" class="search-query form-control" type="search" accesskey="f" title="Special:Search" placeholder="Search" name="search" value="">
<span class="input-group-btn">
<button value="search" id="mw-searchButton" type="submit" class="btn btn-default" title="Search the pages for this text" name="fulltext">
<i class="fa fa-search"></i>
</button>
</span>
</form>
<select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Navigation---</span></option><option value="/Main_Page">Main Page</option><option value="/Eclipsepedia:Community_portal">Community portal</option><option value="/Eclipsepedia:Current_events">Current events</option><option value="/Special:RecentChanges">Recent changes</option><option value="/Special:Random">Random page</option><option value="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents">Help</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Navigation</span></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Main_Page" id="n-mainpage" title="Visit the main page [z]" accesskey="z">Main Page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Community_portal" id="n-portal" title="About the project, what you can do, where to find things">Community portal</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Eclipsepedia:Current_events" id="n-currentevents" title="Find background information on current events">Current events</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChanges" id="n-recentchanges" title="A list of recent changes in the wiki [r]" accesskey="r">Recent changes</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:Random" id="n-randompage" title="Load a random page [x]" accesskey="x">Random page</a></li> <li class=""><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="https://www.mediawiki.org/wiki/Special:MyLanguage/Help:Contents" id="n-help" title="The place to find out">Help</a></li></ul> <select class="form-control margin-top-10 margin-bottom-10 visible-xs visible-sm" onchange="this.options[this.selectedIndex].value && (window.location = this.options[this.selectedIndex].value);"><option class="fw-700 "><span class="fw-700">---Toolbox---</span></option><option value="/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=info">Page information</option><option value="/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;oldid=115417">Permanent link</option><option value="/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;printable=yes">Printable version</option><option value="/Special:SpecialPages">Special pages</option><option value="/Special:RecentChangesLinked/SMILA/Specifications/CrawlerAPIDiscussion09">Related changes</option><option value="/Special:WhatLinksHere/SMILA/Specifications/CrawlerAPIDiscussion09">What links here</option></select><ul class="ul-left-nav fa-ul hidden-print leftnav hidden-xs hidden-sm"><li class="separator"><span class="separator">Toolbox</span></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=info" id="t-info">Page information</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;oldid=115417" id="t-permalink" title="Permanent link to this revision of the page">Permanent link</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;printable=yes" id="t-print" rel="alternate" title="Printable version of this page [p]" accesskey="p">Printable version</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a 
href="http://wiki.eclipse.org/Special:SpecialPages" id="t-specialpages" title="A list of all special pages [q]" accesskey="q">Special pages</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:RecentChangesLinked/SMILA/Specifications/CrawlerAPIDiscussion09" id="t-recentchangeslinked" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li><li><i class="fa fa-angle-double-right orange fa-fw"></i> <a href="http://wiki.eclipse.org/Special:WhatLinksHere/SMILA/Specifications/CrawlerAPIDiscussion09" id="t-whatlinkshere" title="A list of all wiki pages that link here [j]" accesskey="j">What links here</a></li></ul> </aside>
<!-- /leftcol -->
<!-- mainContent -->
<div id="mainContent" class="col-md-20">
<ul class="nav nav-tabs noprint hidden-print" role="tablist">
<li id="ca-nstab-main" class="active"><a href="CrawlerAPIDiscussion09.html" title="View the content page [c]" accesskey="c" tabindex="-1">Page</a></li>
<li id="ca-talk" class="new"><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=edit&amp;redlink=1" title="Discussion about the content page [t]" accesskey="t" tabindex="-1">Discussion</a></li>
<li id="ca-viewsource"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e" tabindex="-1">View source</a></li>
<li id="ca-history" class="collapsible"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=history" title="Past revisions of this page [h]" accesskey="h" tabindex="-1">History</a></li>
</ul> <div class="tab-content background-white">
<div id="tab-pane-main-page-content" class="tab-pane active">
<h1 id="firstHeading" class="firstHeading page-header">
<span dir="auto">SMILA/Specifications/CrawlerAPIDiscussion09</span>
</h1>
<div id="main-page-content">
<!-- subtitle -->
<div id="contentSub" class="alert alert-small alert-warning"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a>&lrm; | <a href="../Specifications.html" title="SMILA/Specifications">Specifications</a></span></div>
<!-- /subtitle -->
<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr"><div id="toc" class="toc"><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1 tocsection-1"><a href="CrawlerAPIDiscussion09.html#API-Problems"><span class="tocnumber">1</span> <span class="toctext">API-Problems</span></a>
<ul>
<li class="toclevel-2 tocsection-2"><a href="CrawlerAPIDiscussion09.html#Current_Implementation"><span class="tocnumber">1.1</span> <span class="toctext">Current Implementation</span></a></li>
<li class="toclevel-2 tocsection-3"><a href="CrawlerAPIDiscussion09.html#Current_Problems"><span class="tocnumber">1.2</span> <span class="toctext">Current Problems</span></a></li>
<li class="toclevel-2 tocsection-4"><a href="CrawlerAPIDiscussion09.html#Alternatives"><span class="tocnumber">1.3</span> <span class="toctext">Alternatives</span></a></li>
<li class="toclevel-2 tocsection-5"><a href="CrawlerAPIDiscussion09.html#Discussion"><span class="tocnumber">1.4</span> <span class="toctext">Discussion</span></a></li>
</ul>
</li>
<li class="toclevel-1 tocsection-6"><a href="CrawlerAPIDiscussion09.html#Separation_between_Crawler_Implementation_and_Communication_Implementation"><span class="tocnumber">2</span> <span class="toctext">Separation between Crawler Implementation and Communication Implementation</span></a>
<ul>
<li class="toclevel-2 tocsection-7"><a href="CrawlerAPIDiscussion09.html#How_can_we_separate_the_Communication_technology_from_the_Crawler_Implementation.3F_Goal_is_to_switch_simple_between_e.g._Tuscany_and_In-Process_Communication_without_changing_the_code_for_crawlers."><span class="tocnumber">2.1</span> <span class="toctext">How can we separate the Communication technology from the Crawler Implementation? Goal is to switch simple between e.g. Tuscany and In-Process Communication without changing the code for crawlers.</span></a></li>
<li class="toclevel-2 tocsection-8"><a href="CrawlerAPIDiscussion09.html#How_big_should_be_the_Crawler_Framework_.28classes_that_are_necessary_for_the_start_of_the_Crawler_Process.3F.29"><span class="tocnumber">2.2</span> <span class="toctext">How big should be the Crawler Framework (classes that are necessary for the start of the Crawler Process?)</span></a></li>
<li class="toclevel-2 tocsection-9"><a href="CrawlerAPIDiscussion09.html#Alternate_opinion"><span class="tocnumber">2.3</span> <span class="toctext">Alternate opinion</span></a></li>
</ul>
</li>
</ul>
</div>
<h1><span class="mw-headline" id="API-Problems">API-Problems</span></h1>
<h3><span class="mw-headline" id="Current_Implementation">Current Implementation</span></h3>
<pre> /**
* Returns an array of MObject objects. The size of the returned array may vary from call to call. The maximum size of
* the array is determined by configuration or by the implementation class.
*
* <b>@return</b> an array of MObject objects or null, if no more MObject exist
* <b>@throws</b> CrawlerException
* if any error occurs
*/
MObject[] getNextDeltaIndexingData() <b>throws</b> CrawlerException, CrawlerCriticalException;
</pre>
<p><br />
</p>
<pre> /**
* Returns a Record object. The parameter pos refers to the position of the MObject from the MObject[] returned by
* getNextDeltaIndexingData().
*
* <b>@param</b> pos
* the position refering to a MObject[]
* <b>@return</b> a Record object
* <b>@throws</b> CrawlerException
* if any error occurs
*/
Record getRecord(<b>int</b> pos) <b>throws</b> CrawlerException, CrawlerCriticalException;
</pre>
<p><br />
Workflow:
</p>
<ol>
<li> getNextDeltaIndexingData should return attributes that are needed to generate the ID and the HASH for the entry
<dl>
<dd> (they are flagged in the IndexOrderConfiguration)
</dd>
</dl>
</li>
<li> The CrawlerController then generates the ID and the HASH
</li>
<li> Communication with DeltaIndexingModule (ID and HASH needed)
</li>
<li> DeltaIndexingModule returns the Information if the entry has changed or not
</li>
<li> For changed entries the CrawlerController queries the Record from the Crawler
</li>
</ol>
<p>The Crawler always returns an array (size can be defined by the crawler).
Tests have shown that this workflow increases communication performance, but the crawler developer has to implement more code and the API is a little bit more complicated.
</p>
<h3><span class="mw-headline" id="Current_Problems">Current Problems</span></h3>
<p>Crawler developers have to handle frames for getNextDeltaIndexing and getRecords.
Attachments (Attributes that are flagged as Attachment in the IndexOrder) cannot be returned with the MObject (with GetNextDeltaIndexing), because MObject can contain only Literals and Literals are only simple Data-Types.
Crawler should usually not return Attachments for hashing, because it destroys the intended Workflow. "Expensive" (time-consuming) operations like getting the content of the Entry should only be executed with getRecord(). In the current Implementation the attachment (the content) is returned in the MObject as a string and then it is also returned as an attachment in the Record (probably it is also returned in the Record as MObject). That means the content is transferred three times.
Crawler Developer has to understand Record/MObject Structure.
Exception handling: How should an Exception be handled while calling getNextDeltaIndexing? At the moment it tries several times until stopping crawling.
</p><p><br />
</p>
<h3><span class="mw-headline" id="Alternatives">Alternatives</span></h3>
<ol>
<li> getNextDeltaIndexing returns a new Class (e.g. DIEntry)
<dl>
<dd>the Class contains Attributes with Name and Value, the Value is stored with the Object-Type. therefore every Attribute and Attachments can be returned
</dd>
<dd>getRecord returns only Object[]; it contains only attributes that were not previously transferred
</dd>
<dd>CrawlerController creates Records (based on the information in the IndexOrder)
</dd>
</dl>
</li>
<li> getNextDeltaIndexing returns Record (contains only the DI-Information Attributes and Attachments)
<dl>
<dd>getRecord returns also a Record, it contains only not previously transferred Information
</dd>
<dd>CrawlerController can "merge" both entries
</dd>
</dl>
</li>
<li> HASH/ID generation is executed in the Crawler Process.
<dl>
<dd>At the moment the Crawler is based on an abstract class that should implement the communication implementation (like Tuscany). Hash/ID creation classes can be moved into the Crawler Site Classes. Thus getNextDeltaIndexing will return prepared ID and Hash
</dd>
</dl>
</li>
</ol>
<p><br />
</p>
<h3><span class="mw-headline" id="Discussion">Discussion</span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit&amp;redlink=1" class="new" title="User:S.voigt.brox.de (page does not exist)">Sebastian Voigt</a>:
To minimize problems with the underlying communication technology and to simplify crawler development, I would prefer alternative 1).
Crawler Developers only have to understand the IndexOrderConfiguration and they can return the "Attributes" with simple Java data-types.
There is no advantage for us if the crawler developer has to implement Hashing/ID Components (this only increases development complexity) and has to fill Records and MObjects.
</p><p><a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
Personally I prefer to let the Crawler generate ID and HASH. It is beneficial for performance, as less data has to be transferred between Crawlers and CrawlerController. I don't see additional complexity. Not every Crawler has to implement its own methods to create ID/HASH. It only has to use them. Such methods can be made available by Utility classes or an abstract base class. If someone desperately wants to implement these things on his own - he's free to do it and has to bear the consequences.
Concerning the return types, I think that getNextDeltaIndexing() should return an array of a new data type DIInfo, that contains only the ID (Id) and the HASH (String). As there are 2 concrete data types (Id and String) there is no need to use MObjects or Records. It is still possible, though.
For the return type of getRecord() one could simply use a Map&lt;String,Object&gt; and create the Record objects on the CrawlerController. In this way a Crawler may provide data that is not convertible into a Record (at least not automatically/generically). On the other hand, we would have fewer dependencies towards other bundles. A Record object has more constraints and allows a Crawler to provide additional information to the data using annotations (sadly I currently don't have an example for a use case). Another issue could be semantics. At the moment it is totally unclear how semantics are added/associated to/with Records. Using the same objects throughout the system may make things easier.
I do agree that creation of Records, MObjects and Literals is cumbersome. So we should adapt those APIs or add utility methods to make creation easier, regardless of whether this is used in Crawlers or in the CrawlerController.
</p><p><br />
</p>
<h1><span class="mw-headline" id="Separation_between_Crawler_Implementation_and_Communication_Implementation">Separation between Crawler Implementation and Communication Implementation</span></h1>
<h3><span class="mw-headline" id="How_can_we_separate_the_Communication_technology_from_the_Crawler_Implementation.3F_Goal_is_to_switch_simple_between_e.g._Tuscany_and_In-Process_Communication_without_changing_the_code_for_crawlers.">How can we separate the Communication technology from the Crawler Implementation? Goal is to switch simple between e.g. Tuscany and In-Process Communication without changing the code for crawlers.</span></h3>
<p><a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
Actually Tuscany (SCA) is the technology that allows separation of communication technology and business logic. The wiring of components allows us for example to let the CrawlerController communicate with Crawlers in Process, via RMI, webservice, etc. by configuration. I think your question is "Is it possible to NOT use Tuscany for in process communication without changing code for crawlers?". There are several issues:
</p>
<ul>
<li> in process communication without Tuscany may be a valid request, as it leads to better performance. Even when using binding.sca Tuscany generates proxy objects that will slow down communication. Perhaps we should do some tests (see Performance Evaluation on page <a href="../Project_Concepts/IRM.html" title="SMILA/Project Concepts/IRM">SMILA/Project Concepts/IRM</a>)
</li>
<li> most of the Tuscany features do not need actual coding (e.g. implementation of interfaces) but are enabled by code annotations. These annotations do not interfere with the crawler code if Tuscany is not used at runtime (for compilation Tuscany annotation classes are needed of course)
</li>
<li> the concept was done with Tuscany/SCA functionality in mind. So there are several features that automatically come with Tuscany (like handling of conversations/sessions, using ComponentContext to determine CrawlerID). This allows a Crawler to crawl multiple DataSources in parallel by automatically providing multiple instances. If Tuscany is not used this feature has to be reimplemented by each Crawler. If it is reimplemented, then it makes no sense to use its Tuscany counterpart when using Tuscany. The ComponentContext is used to get the Crawler's ID from the Component description. It is used for Crawler detection by the CrawlerController
</li>
</ul>
<p>So what is the gain for a Crawler developer? I don't see any benefits regarding simplification. In contrast, the developer has to take care of multithreading and session handling.
If you see any problems with the technology in the Crawler area, then we should discuss if CrawlerController and Crawler should run in the same VM and NOT make use of Tuscany in any case. If Crawlers in non Java technologies are needed integration is done in traditional ways (e.g. JNI, Corba, etc.) using a Java Proxy. And is Tuscany a valid technology for distributing ConnectivityManager and BPEL Services, then?
</p>
<h3><span class="mw-headline" id="How_big_should_be_the_Crawler_Framework_.28classes_that_are_necessary_for_the_start_of_the_Crawler_Process.3F.29">How big should be the Crawler Framework (classes that are necessary for the start of the Crawler Process?)</span></h3>
<p><a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
I think we should try to keep the Crawler Framework as small as possible. So I guess we have to provide separate bundles for interfaces and implementations, as it is already done in org.eclipse.smila.connectivity and org.eclipse.smila.connectivity.impl. Also a restructuring of utility classes may be necessary.
</p>
<h3><span class="mw-headline" id="Alternate_opinion">Alternate opinion</span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:Churkin.ivan.gmail.com&amp;action=edit&amp;redlink=1" class="new" title="User:Churkin.ivan.gmail.com (page does not exist)">Ivan Churkin</a>: I have an alternative opinion to Daniel's. But before presenting it, I want to summarize.
</p><p>The main goal of the framework is to offer a convenient API for 3rd party crawler developers. To satisfy this goal, it has to possess the following characteristics, in my opinion.
</p>
<ul>
<li> Simplicity.
</li>
<li> Independence. ( from 3rd party technologies, like SCA )
</li>
<li> Effectiveness. ( ready crawler should interact with framework efficiently)
</li>
</ul>
<p>Unfortunately, the current crawler API does not possess even one characteristic from the list!
</p>
<ul>
<li> It's hard to implement.
</li>
<li> It is dependent on SCA.
</li>
<li> It interacts inefficiently with the framework, for example when the HASH should be calculated from the CONTENT, as for the web crawler. As a result the crawler sends the CONTENT as an additional Attribute to the Crawler Controller only for calculating the HASH. And, moreover, it's impossible to use the web crawler for downloading binary content, because DIInfo is based on string Literals.
</li>
</ul>
<p>In my opinion it's absolutely unacceptable.
</p><p>The problem is that this API was designed specifically for SCA. It's not user-friendly. Additionally, it has only one simplification of development: common HASH calculation on the crawler controller side. This simplification breaks effectiveness and creates additional issues like the "Content or binary based HASH" problem.
</p><p>I think the solution is to split crawler API and communication API. Crawler interface should be very simple. It should be something like the next interface:
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="kw1">interface</span> Crawler <span class="br0">&#123;</span>
<span class="kw4">void</span> start<span class="br0">&#40;</span>IndexOrderConfiguration config<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw4">boolean</span> next<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw3">Object</span> getAttribute<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw4">byte</span><span class="br0">&#91;</span><span class="br0">&#93;</span> getAttachment<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw4">void</span> finish<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span></pre></div></div>
<p>Or, maybe, even better:
</p>
<div dir="ltr" class="mw-geshi mw-code mw-content-ltr"><div class="java source-java"><pre class="de1"><span class="kw1">interface</span> DataSourceReference <span class="br0">&#123;</span>
<span class="kw3">Object</span> getAttribute<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw4">byte</span><span class="br0">&#91;</span><span class="br0">&#93;</span> getAttachment<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&#160;
<span class="kw1">interface</span> Crawler <span class="br0">&#123;</span>
<span class="kw4">void</span> start<span class="br0">&#40;</span>IndexOrderConfiguration config<span class="br0">&#41;</span><span class="sy0">;</span>
DataSourceReference next<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw4">void</span> finish<span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span></pre></div></div>
<p><br />
The communication interface will depend on the communication technology used. For SCA it will be similar to the currently used Crawler interface. The main benefit is that a reference implementation (RI) of the communication interface will be added to the framework. It will allow to ball a game. Mainly, crawler developers will implement a very simple interface and just use the ready communication RI. On the other hand, it will be possible to write and use one's own implementation of the communication interface if the RI does not fit (not sure that it's really required).
</p><p>I see many benefits.
</p>
<ul>
<li> All hard and unclear work will be moved into the write-once communication RI. All crawler developers will be happy&#160;;)
</li>
<li> It's more flexible regarding transport protocols. For example, if the transport is changed (from SCA to another), we have to change only one class in the framework. And we do not have to fix all (3rd party) crawlers; they will remain the same.
</li>
<li> Problems like "Content based HASH" disappear.
</li>
</ul>
<!--
NewPP limit report
CPU time usage: 0.128 seconds
Real time usage: 0.162 seconds
Preprocessor visited node count: 40/1000000
Preprocessor generated node count: 80/1000000
Post‐expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
Highest expansion depth: 2/40
Expensive parser function count: 0/100
-->
<!-- Saved in parser cache with key my_wiki:pcache:idhash:15331-0!*!0!!en!*!* and timestamp 20150414084647 and revision id 115417
-->
</div>
<!-- catlinks -->
<div id='catlinks' class='catlinks catlinks-allhidden'></div> <!-- /catlinks -->
</div>
</div>
</div>
</div>
<!-- /maincontent -->
<!-- printfooter -->
<div class="printfooter">
Retrieved from "<a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;oldid=115417">http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;oldid=115417</a>" </div>
<!-- /printfooter -->
<!-- debughtml -->
<!-- /debughtml -->
</div>
<!-- /bodyContent -->
</section>
<!-- /content -->
<!-- footer -->
</div> <section id="footer-contribution-info" style="border-top:1px solid #ccc;" class="footer-offset background-white margin-top-25"><div class="container text-center padding-top-10 padding-bottom-10"><p id="footercredit">This page was last modified 10:46, 26 August 2008 by <a href="http://wiki.eclipse.org/index.php?title=User:Churkin.ivan.gmail.com&amp;action=edit&amp;redlink=1" class="new" title="User:Churkin.ivan.gmail.com (page does not exist)">Ivan Churkin</a>. Based on work by <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Svoigt.brox.de&amp;action=edit&amp;redlink=1" class="new" title="User:Svoigt.brox.de (page does not exist)">Sebastian Voigt</a>.</p><p id="footerviews">This page has been accessed 3,813 times.</p></div></section> </main> <!-- /#main-content-container-row -->
<p id="back-to-top" class="noprint hidden-print">
<a class="visible-xs" href="CrawlerAPIDiscussion09.html#top">Back to the top</a>
</p>
<footer role="contentinfo" class="noprint hidden-print">
<div class="container">
<div class="row">
<section id="footer-eclipse-foundation" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Eclipse Foundation</h2>
<ul class="nav">
<li><a href="https://eclipse.org/org/">About us</a></li>
<li><a href="https://eclipse.org/org/foundation/contact.php">Contact Us</a></li>
<li><a href="https://eclipse.org/donate">Donate</a></li>
<li><a href="https://eclipse.org/org/documents/">Governance</a></li>
<li><a href="https://eclipse.org/artwork/">Logo and Artwork</a></li>
<li><a href="https://eclipse.org/org/foundation/directors.php">Board of Directors</a></li>
</ul>
</section>
<section id="footer-legal" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Legal</h2>
<ul class="nav">
<li><a href="https://eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="https://eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="https://eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="https://eclipse.org/org/documents/epl-v10.php">Eclipse Public License </a></li>
<li><a href="https://eclipse.org/legal/">Legal Resources </a></li>
</ul>
</section>
<section id="footer-useful-links" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Useful Links</h2>
<ul class="nav">
<li><a href="https://bugs.eclipse.org/bugs/">Report a Bug</a></li>
<li><a href="http://help.eclipse.org/">Documentation</a></li>
<li><a href="https://eclipse.org/contribute/">How to Contribute</a></li>
<li><a href="https://eclipse.org/mail/">Mailing Lists</a></li>
<li><a href="https://eclipse.org/forums/">Forums</a></li>
<li><a href="http://marketplace.eclipse.org/">Marketplace</a></li>
</ul>
</section>
<section id="footer-other" class="col-xs-offset-1 col-xs-11 col-sm-7 col-md-6 col-md-offset-0">
<h2 class="section-title">Other</h2>
<ul class="nav">
<li><a href="https://eclipse.org/ide/">IDE and Tools</a></li>
<li><a href="https://eclipse.org/projects">Community of Projects</a></li>
<li><a href="https://eclipse.org/org/workinggroups/">Working Groups</a></li>
</ul>
<ul class="list-inline social-media">
<li><a href="https://twitter.com/EclipseFdn"><i class="fa fa-twitter-square"></i></a></li>
<li><a href="https://plus.google.com/+Eclipse"><i class="fa fa-google-plus-square"></i></a></li>
<li><a href="https://www.facebook.com/eclipse.org"><i class="fa fa-facebook-square"></i> </a></li>
<li><a href="https://www.youtube.com/user/EclipseFdn"><i class="fa fa-youtube-square"></i></a></li>
</ul>
</section>
<div id="copyright" class="col-xs-offset-1 col-sm-14 col-md-24 col-md-offset-0">
<div>
<span><img src="http://eclipse.org/eclipse.org-common/themes/solstice/public/images/logo/eclipse-logo-bw-800x188.png" alt="Eclipse.org black and white logo" width="166" height="39" id="logo-eclipse-white"/></span>
<p id="copyright-text">Copyright &copy; 2014 The Eclipse Foundation. All Rights Reserved.</p>
</div>
</div>
<a href="CrawlerAPIDiscussion09.html#" class="scrollup">Back to the top</a>
</div>
</div>
</footer>
<script src="http://wiki.eclipse.org/skins/solstice/public/javascript/main.min.js"></script>
<!-- Placed at the end of the document so the pages load faster -->
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-910670-2']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script> <!-- /footer -->
<script>/*<![CDATA[*/window.jQuery && jQuery.ready();/*]]>*/</script><script>if(window.mw){
mw.loader.state({"skins.solstice":"loading","site":"ready","user":"ready","user.groups":"ready"});
}</script>
<script src="http://wiki.eclipse.org/load.php?debug=false&amp;lang=en&amp;modules=skins.solstice&amp;only=scripts&amp;skin=solstice&amp;*"></script>
<script>if(window.mw){
mw.loader.load(["mediawiki.action.view.postEdit","mediawiki.user","mediawiki.hidpi","mediawiki.page.ready","mediawiki.searchSuggest"],null,true);
}</script>
<script>if(window.mw){
mw.config.set({"wgBackendResponseTime":338});
}</script> </body>
</html>