blob: 6cf5ef47e379b52c908797b9b12c9c7c879dba17 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Specifications/CrawlerAPIDiscussion09,SMILA/Project Concepts/IRM,Daniel.stucky.empolis.com,Churkin.ivan.gmail.com,S.voigt.brox.de" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Specifications/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Specifications/CrawlerAPIDiscussion09 - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
// Page-level configuration exposed as global JavaScript variables: skin and
// path settings, the server URL, and metadata for this page (name, title,
// article/revision ids, action, user and language settings, API flags).
// NOTE(review): presumably consumed by wikibits.js and the site scripts
// loaded right after this block — confirm before renaming or removing any.
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Specifications/CrawlerAPIDiscussion09";
var wgTitle = "SMILA/Specifications/CrawlerAPIDiscussion09";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "15331";
var wgIsArticle = true;
// null user name/groups: no user is logged in for this rendering (the page
// header shows a "Log in" link).
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "115417";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal; font-size: medium;}
.source-java li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-java .de1, .source-java .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-java {}
.source-java .head {}
.source-java .foot {}
.source-java .imp {font-weight: bold; color: red;}
.source-java .ln-xtra {color: #cc0; background-color: #ffc;}
.source-java li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-java li.li2 {font-weight: bold;}
.source-java .kw1 {color: #7F0055; font-weight: bold;}
.source-java .kw2 {color: #7F0055; font-weight: bold;}
.source-java .kw3 {color: #000000; font-weight: normal}
.source-java .kw4 {color: #7F0055; font-weight: bold;}
.source-java .co1 {color: #3F7F5F; font-style: italic;}
.source-java .co2 {color: #3F7F5F;}
.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.source-java .es0 {color: #000000;}
.source-java .br0 {color: #000000;}
.source-java .st0 {color: #2A00ff;}
.source-java .nu0 {color: #000000;}
.source-java .me1 {color: #000000;}
.source-java .me2 {color: #000000;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="CrawlerAPIDiscussion09.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Specifications_CrawlerAPIDiscussion09">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Specifications/CrawlerAPIDiscussion09">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" />
<input type='submit' name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Specifications/CrawlerAPIDiscussion09">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Specifications/CrawlerAPIDiscussion09">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;oldid=115417">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="CrawlerAPIDiscussion09.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Specifications/CrawlerAPIDiscussion09&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Specifications/CrawlerAPIDiscussion09"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Specifications/CrawlerAPIDiscussion09</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Specifications.1.html" title="SMILA/Specifications">Specifications</a></span></div>
<div id="jump-to-nav">Jump to: <a href="CrawlerAPIDiscussion09.html#column-one">navigation</a>, <a href="CrawlerAPIDiscussion09.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="CrawlerAPIDiscussion09.html#API-Problems"><span class="tocnumber">1</span> <span class="toctext">API-Problems</span></a>
<ul>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#Current_Implementation"><span class="tocnumber">1.1</span> <span class="toctext">Current Implementation</span></a></li>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#Current_Problems"><span class="tocnumber">1.2</span> <span class="toctext">Current Problems</span></a></li>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#Alternatives"><span class="tocnumber">1.3</span> <span class="toctext">Alternatives</span></a></li>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#Discussion"><span class="tocnumber">1.4</span> <span class="toctext">Discussion</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="CrawlerAPIDiscussion09.html#Separation_between_Crawler_Implementation_and_Communication_Implementation"><span class="tocnumber">2</span> <span class="toctext">Separation between Crawler Implementation and Communication Implementation</span></a>
<ul>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#How_can_we_separate_the_Communication_technology_from_the_Crawler_Implementation.3F_Goal_is_to_switch_simple_between_e.g._Tuscany_and_In-Process_Communication_without_changing_the_code_for_crawlers."><span class="tocnumber">2.1</span> <span class="toctext">How can we separate the Communication technology from the Crawler Implementation? Goal is to switch simple between e.g. Tuscany and In-Process Communication without changing the code for crawlers.</span></a></li>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#How_big_should_be_the_Crawler_Framework_.28classes_that_are_necessary_for_the_start_of_the_Crawler_Process.3F.29"><span class="tocnumber">2.2</span> <span class="toctext">How big should be the Crawler Framework (classes that are necessary for the start of the Crawler Process?)</span></a></li>
<li class="toclevel-2"><a href="CrawlerAPIDiscussion09.html#Alternate_opinion"><span class="tocnumber">2.3</span> <span class="toctext">Alternate opinion</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="API-Problems"></a><h1> <span class="mw-headline">API-Problems</span></h1>
<a name="Current_Implementation"></a><h3> <span class="mw-headline">Current Implementation</span></h3>
<pre> /**
* Returns an array of MObject objects. The size of the returned array may vary from call to call. The maximum size of
* the array is determined by configuration or by the implementation class.
*
* <b>@return</b> an array of MObject objects or null, if no more MObject exist
* <b>@throws</b> CrawlerException
* if any error occurs
*/
MObject[] getNextDeltaIndexingData() <b>throws</b> CrawlerException, CrawlerCriticalException;
</pre>
<p><br />
</p>
<pre> /**
* Returns a Record object. The parameter pos refers to the position of the MObject from the MObject[] returned by
* getNextDeltaIndexingData().
*
* <b>@param</b> pos
* the position refering to a MObject[]
* <b>@return</b> a Record object
* <b>@throws</b> CrawlerException
* if any error occurs
*/
Record getRecord(<b>int</b> pos) <b>throws</b> CrawlerException, CrawlerCriticalException;
</pre>
<p><br />
Workflow:
</p>
<ol><li> getNextDeltaIndexingData should return attributes that are needed to generate the ID and the HASH for the entry
<dl><dd> (they are flagged in the IndexOrderConfiguration)
</dd></dl>
</li><li> The CrawlerController then generates the ID and the HASH
</li><li> Communication with DeltaIndexingModule (ID and HASH needed)
</li><li> DeltaIndexingModule returns the Information if the entry has changed or not
</li><li> For changed entries the CrawlerController queries the Record from the Crawler
</li></ol>
<p>The Crawler returns always an array (size can be defined by the crawler).
Tests have shown that this workflow increases communication performance, but the crawler developer has to implement more code and the API is a little bit more complicated.
</p>
<a name="Current_Problems"></a><h3> <span class="mw-headline">Current Problems</span></h3>
<p>Crawler developers have to handle frames for getNextDeltaIndexing and getRecords.
Attachments (Attributes that are flagged as Attachment in the IndexOrder) cannot be returned with the MObject (with getNextDeltaIndexing), because an MObject can contain only Literals, and Literals are only simple data types.
A Crawler should usually not return Attachments for hashing, because this destroys the intended workflow: "expensive" (time-consuming) operations like getting the content of the entry should only be executed with getRecord(). In the current implementation, an attachment (the content) is returned in the MObject as a string and then returned again as an attachment in the Record (probably it is also returned in the Record as an MObject). That means the content is transferred three times.
Crawler developers have to understand the Record/MObject structure.
Exception handling: how should an Exception be handled while calling getNextDeltaIndexing? At the moment it retries several times before stopping the crawl.
</p><p><br />
</p>
<a name="Alternatives"></a><h3> <span class="mw-headline">Alternatives</span></h3>
<ol><li> getNextDeltaIndexing returns a new Class (e.g. DIEntry)
<dl><dd>the Class contains Attributes with Name and Value, the Value is stored with the Object-Type. therefore every Attribute and Attachments can be returned
</dd><dd>getRecord returns only Object[]; it contains only attributes that were not previously transferred
</dd><dd>CrawlerController creates Records (based on the information in the IndexOrder)
</dd></dl>
</li><li> getNextDeltaIndexing returns Record (contains only the DI-Information Attributes and Attachments)
<dl><dd>getRecord returns also a Record, it contains only not previously transferred Information
</dd><dd>CrawlerController can "merge" both entries
</dd></dl>
</li><li> HASH/ID generation is executed in the Crawler Process.
<dl><dd>At the moment the Crawler is based on an abstract class that should implement the communication implementation (like Tuscany). Hash/ID creation classes can be moved into the Crawler Site Classes. Thus getNextDeltaIndexing will return a prepared ID and Hash
</dd></dl>
</li></ol>
<p><br />
</p>
<a name="Discussion"></a><h3> <span class="mw-headline">Discussion</span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>:
To minimize problems with the underlying communication technology and to simplify crawler development, I would prefer 1).
Crawler developers only have to understand the IndexOrderConfiguration, and they can return the "Attributes" with simple Java data types.
There is no advantage for us if the crawler developer has to implement Hashing/ID components (this only increases development complexity) and has to fill Records and MObjects.
</p><p><a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
Personally I prefer to let the Crawler generate ID and HASH. It is beneficial for performance, as less data has to be transferred between Crawlers and CrawlerController. I don't see additional complexity. Not every Crawler has to implement its own methods to create ID/HASH. It only has to use them. Such methods can be made available by Utility classes or an abstract base class. If someone desperately wants to implement these things on his own - he's free to do it and has to bear the consequences.
Concerning the return types, I think that getNextDeltaIndexing() should return an array of a new data type DIInfo, that contains only the ID (Id) and the HASH (String). As there are 2 concrete data types (Id and String) there is no need to use MObjects or Records. It is still possible, though.
For the return type of getRecord() one could simply use a Map&lt;String,Object&gt; and create the Record objects on the CrawlerController. In this way a Crawler may provide data that is not convertible into a Record (at least not automatically/generically). On the other hand, we would have fewer dependencies on other bundles. A Record object has more constraints and allows a Crawler to provide additional information to the data using annotations (sadly I currently don't have an example for a use case). Another issue could be semantics. At the moment it is totally unclear how semantics are added/associated to/with Records. Using the same objects throughout the system may make things easier.
I do agree that creation of Records, MObjects and Literals is cumbersome. So we should adopt those APIs or add utility methods to make creation easier, regardless of whether this is used in Crawlers or in the CrawlerController.
</p><p><br />
</p>
<a name="Separation_between_Crawler_Implementation_and_Communication_Implementation"></a><h1> <span class="mw-headline">Separation between Crawler Implementation and Communication Implementation</span></h1>
<a name="How_can_we_separate_the_Communication_technology_from_the_Crawler_Implementation.3F_Goal_is_to_switch_simple_between_e.g._Tuscany_and_In-Process_Communication_without_changing_the_code_for_crawlers."></a><h3> <span class="mw-headline">How can we separate the Communication technology from the Crawler Implementation? Goal is to switch simple between e.g. Tuscany and In-Process Communication without changing the code for crawlers.</span></h3>
<p><a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
Actually Tuscany (SCA) is the technology that allows separation of communication technology and business logic. The wiring of components allows us for example to let the CrawlerController communicate with Crawlers in Process, via RMI, webservice, etc. by configuration. I think your question is "Is it possible to NOT use Tuscany for in process communication without changing code for crawlers?". There are several issues:
</p>
<ul><li> in process communication without Tuscany may be a valid request, as it leads to better performance. Even when using binding.sca Tuscany generates proxy objects that will slow down communication. Perhaps we should do some tests (see Performance Evaluation on page <a href="../Project_Concepts/IRM.html" title="SMILA/Project Concepts/IRM">SMILA/Project Concepts/IRM</a>).
</li><li> most of the Tuscany features do not need actual coding (e.g. implementation of interfaces) but are enabled by code annotations. These annotations do not interfere with the crawler code if Tuscany is not used at runtime (for compilation Tuscany annotation classes are needed of course)
</li><li> the concept was done with Tuscany/SCA functionality in mind. So there are several features that automatically come with Tuscany (like handling of conversations/sessions, using ComponentContext to determine CrawlerID). This allows a Crawler to crawl multiple DataSources in parallel by automatically providing multiple instances. If Tuscany is not used this feature has to be reimplemented by each Crawler. If it is reimplemented, then it makes no sense to use its Tuscany counterpart when using Tuscany. The ComponentContext is used to get the Crawler's ID from the Component description. It is used for Crawler detection by the CrawlerController.
</li></ul>
<p>So what is the gain for a Crawler developer? I don't see any benefits regarding simplification. In contrast, the developer has to take care of multithreading and session handling.
If you see any problems with the technology in the Crawler area, then we should discuss if CrawlerController and Crawler should run in the same VM and make NOT use of Tuscany in any case. If Crawlers in non Java technologies are needed integration is done in traditional ways (e.g. JNI, Corba, etc.) using a Java Proxy. And is Tuscany a valid technology for distributing ConnectivityManager and BPEL Services, then&nbsp;?
</p>
<a name="How_big_should_be_the_Crawler_Framework_.28classes_that_are_necessary_for_the_start_of_the_Crawler_Process.3F.29"></a><h3> <span class="mw-headline">How big should be the Crawler Framework (classes that are necessary for the start of the Crawler Process?)</span></h3>
<p><a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
I think we should try to keep the Crawler Framework as small as possible. So I guess we have to provide separate bundles for interfaces and implementations, as it is already done in org.eclipse.smila.connectivity and org.eclipse.smila.connectivity.impl. Also a restructuring of utility classes may be necessary.
</p>
<a name="Alternate_opinion"></a><h3> <span class="mw-headline">Alternate opinion</span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:Churkin.ivan.gmail.com&amp;action=edit" class="new" title="User:Churkin.ivan.gmail.com">Ivan Churkin</a>: I have an opinion that differs from Daniel's. But before presenting it, I want to summarize.
</p><p>The main goal of the framework is to offer a convenient API for 3rd party crawler developers. To satisfy that goal, it has to possess the following characteristics, in my opinion.
</p>
<ul><li> Simplicity.
</li><li> Independence. ( from 3rd party technologies, like SCA )
</li><li> Effectiveness. ( ready crawler should interact with framework efficiently)
</li></ul>
<p>Unfortunately, the current crawler API does not possess even one characteristic from the list!
</p>
<ul><li> It's hard to implement.
</li><li> It depends on SCA.
</li><li> It interacts inefficiently with the framework, for example when the HASH should be calculated from the CONTENT, as for the web crawler. As a result the crawler sends the CONTENT as an additional Attribute to the Crawler Controller only for calculating the HASH. Moreover, it's impossible to use the web crawler for downloading binary content, because DIInfo is based on string Literals.
</li></ul>
<p>In my opinion it's absolutely unacceptable.
</p><p>The problem is that this API was designed specially for SCA. It's not user-friendly. Additionally, it has (only one) simplification of development: common HASH calculation on the crawler controller side. This simplification breaks effectiveness and creates additional issues like the "Content or binary based HASH" problem.
</p><p>I think the solution is to split crawler API and communication API. Crawler interface should be very simple. It should be something like the next interface:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> Crawler <span class="br0">&#123;</span>
<span class="kw4">void</span> start<span class="br0">&#40;</span>IndexOrderConfiguration config<span class="br0">&#41;</span>;
<span class="kw4">boolean</span> next<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="kw3">Object</span> getAttribute<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span>;
<span class="kw4">byte</span><span class="br0">&#91;</span><span class="br0">&#93;</span> getAttachment<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span>;
<span class="kw4">void</span> finish<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<p>Or, maybe, even better:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> DataSourceReference <span class="br0">&#123;</span>
<span class="kw3">Object</span> getAttribute<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span>;
<span class="kw4">byte</span><span class="br0">&#91;</span><span class="br0">&#93;</span> getAttachment<span class="br0">&#40;</span><span class="kw3">String</span> name<span class="br0">&#41;</span>;
<span class="br0">&#125;</span>
&nbsp;
<span class="kw1">interface</span> Crawler <span class="br0">&#123;</span>
<span class="kw4">void</span> start<span class="br0">&#40;</span>IndexOrderConfiguration config<span class="br0">&#41;</span>;
DataSourceReference next<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="kw4">void</span> finish<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<p><br />
The communication interface will depend on the communication technology used. For SCA it will be similar to the currently used Crawler interface. The main benefit is that a reference implementation (RI) of the communication interface will be added to the framework. It will allow to ball a game. Mainly, crawler developers will implement a very simple interface and only use the ready communication RI. On the other hand, it will be allowed to write and use own implementations of the communication interface if the RI does not fit (not sure that it's really required).
</p><p>I see many benefits.
</p>
<ul><li> All hard and unclear work will be moved to written once communication RI, All crawler developers will be happy&nbsp;;)
</li><li> It's more flexible regarding transport protocols. For example, if the transport is changed (from SCA to something else), we have to change only one class in the framework. And we do not have to fix all (3rd party) crawlers; they will remain the same.
</li><li> Problems like "Content based HASH" disappear.
</li></ul>
<!--
NewPP limit report
Preprocessor node count: 16/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:15331-0!1!0!!en!2!edit=0 and timestamp 20120710093550 -->
<div class="printfooter">
Retrieved from "<a href="CrawlerAPIDiscussion09.html">http://wiki.eclipse.org/SMILA/Specifications/CrawlerAPIDiscussion09</a>"</div>
<!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2012 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 14:46, 26 August 2008 by <a href="http://wiki.eclipse.org/index.php?title=User:Churkin.ivan.gmail.com&amp;action=edit" class="new" title="User:Churkin.ivan.gmail.com">Ivan Churkin</a>. Based on work by <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Svoigt.brox.de&amp;action=edit" class="new" title="User:Svoigt.brox.de">Sebastian Voigt</a>.</p>
<p id="footerviews">This page has been accessed 3,058 times.</p>
</div>
<script type="text/javascript">
// Choose the Google Analytics host prefix matching the page protocol:
// the "ssl." host for https pages, the "www." host otherwise.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
// Write a script tag for ga.js into the document while it is being parsed.
// The tag is stored percent-encoded and decoded with unescape() at runtime,
// so no literal script markup appears inside this inline script.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Report this page view to Google Analytics (property UA-910670-4).
// ga.js is injected by the preceding script block; when it is blocked or
// fails to load, the _gat global is never defined, and dereferencing it
// would throw a ReferenceError. Guard on its presence and skip tracking
// in that case. pageTracker stays a global because var is not block-scoped.
if (typeof _gat !== "undefined") {
  var pageTracker = _gat._getTracker("UA-910670-4");
  pageTracker._trackPageview();
}
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.049 secs. --></body></html>