blob: d7cc95114d9f05ddd961ac2ba471aa69c2e19c6a [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Project Concepts/IRMDiscussion,Daniel.stucky.empolis.com,G.schmidt.brox.de,S.voigt.brox.de,Igor.Novakovic.empolis.com,Index Order Configuration Schema,Issue" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Project_Concepts/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Project Concepts/IRMDiscussion - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
/* Page-level configuration globals for this MediaWiki page view.
   NOTE(review): presumably read by the skin scripts loaded after this block
   (wikibits.js, the generated site JS, ajax.js) - confirm before renaming or
   removing any of them. */
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
/* wgPageName uses underscores, wgTitle uses spaces, for the same page. */
var wgPageName = "SMILA/Project_Concepts/IRMDiscussion";
var wgTitle = "SMILA/Project Concepts/IRMDiscussion";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
/* Note: the article and revision ids are strings, not numbers. */
var wgArticleId = "15272";
var wgIsArticle = true;
/* null user name and groups: this snapshot appears to be of an anonymous
   (not logged in) view - the header offers a "Log in" link. */
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "286171";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-java {line-height: normal; font-size: medium;}
.source-java li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for java
* CSS class: source-java, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-java .de1, .source-java .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-java {}
.source-java .head {}
.source-java .foot {}
.source-java .imp {font-weight: bold; color: red;}
.source-java .ln-xtra {color: #cc0; background-color: #ffc;}
.source-java li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-java li.li2 {font-weight: bold;}
.source-java .kw1 {color: #7F0055; font-weight: bold;}
.source-java .kw2 {color: #7F0055; font-weight: bold;}
.source-java .kw3 {color: #000000; font-weight: normal}
.source-java .kw4 {color: #7F0055; font-weight: bold;}
.source-java .co1 {color: #3F7F5F; font-style: italic;}
.source-java .co2 {color: #3F7F5F;}
.source-java .co3 {color: #3F7F5F; font-style: italic; font-weight: bold;}
.source-java .coMULTI {color: #3F5FBF; font-style: italic;}
.source-java .es0 {color: #000000;}
.source-java .br0 {color: #000000;}
.source-java .st0 {color: #2A00ff;}
.source-java .nu0 {color: #000000;}
.source-java .me1 {color: #000000;}
.source-java .me2 {color: #000000;}
/*]]>*/
</style>
<!-- Site-wide GeSHi (syntax highlighting) overrides, imported after the page-specific .source-java rules above. -->
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style>
<!-- Removed a link rel="stylesheet" whose href was IRMDiscussion.html: it pointed at this HTML page itself, not at a style sheet. -->
</head>
<body class="mediawiki ns-0 ltr page-SMILA_Project_Concepts_IRMDiscussion">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<!-- Site logo. The img is self-closed because the document is declared as XHTML, where empty elements must be closed. -->
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Project_Concepts/IRMDiscussion">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<!-- Site search form: "Go" opens a page with the exact name entered, "Search" runs a full-text search (see the button titles). -->
<form action="http://wiki.eclipse.org/Special:Search" >
<!-- id="searchInput" is the target of the "Jump to: search" link in the page body. -->
<input class="input" id="searchInput" name="search" type="text" accesskey="f" value="" />
<!-- No onclick handlers on the buttons: a submit input has no submit() method, so the former onclick="this.submit();" only raised a TypeError. Native submit behaviour sends the form together with the clicked button's name. -->
<input type="submit" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type="submit" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Project_Concepts/IRMDiscussion">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Project_Concepts/IRMDiscussion">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/IRMDiscussion&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/IRMDiscussion&amp;oldid=286171">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="IRMDiscussion.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Project_Concepts/IRMDiscussion&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/IRMDiscussion&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Project_Concepts/IRMDiscussion&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Project&#32;Concepts/IRMDiscussion"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Project Concepts/IRMDiscussion</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Project_Concepts.1.html" title="SMILA/Project Concepts">Project Concepts</a></span></div>
<div id="jump-to-nav">Jump to: <a href="IRMDiscussion.html#column-one">navigation</a>, <a href="IRMDiscussion.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="IRMDiscussion.html#Discussion"><span class="tocnumber">1</span> <span class="toctext">Discussion</span></a>
<ul>
<li class="toclevel-2"><a href="IRMDiscussion.html#Benefit_of_SCA_for_Integration_Model_os_external_Systems"><span class="tocnumber">1.1</span> <span class="toctext">Benefit of SCA for Integration Model of external Systems</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Entry_Barrier_for_Integration_Developers"><span class="tocnumber">1.2</span> <span class="toctext">Entry Barrier for Integration Developers</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Naming_of_the_modules.2Fcomponents"><span class="tocnumber">1.3</span> <span class="toctext">Naming of the modules/components</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Agent.2FController_conflict_problem.3F"><span class="tocnumber">1.4</span> <span class="toctext">Agent/Controller conflict problem?</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Process_Component_Logic:"><span class="tocnumber">1.5</span> <span class="toctext">Process Component Logic:</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Definition_of_Interfaces_for_the_Components_.28Agent.2FCrawler_Controller_.26_Connectivity.29:"><span class="tocnumber">1.6</span> <span class="toctext">Definition of Interfaces for the Components (Agent/Crawler Controller &amp; Connectivity):</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Configuration_Management:_Information_retrieval"><span class="tocnumber">1.7</span> <span class="toctext">Configuration Management: Information retrieval</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Dealing.2FHandling_with_special_information_like_permissions:"><span class="tocnumber">1.8</span> <span class="toctext">Dealing/Handling with special information like permissions:</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="IRMDiscussion.html#Here_are_some_ideas.2Fdiscussions_about_interfaces"><span class="tocnumber">2</span> <span class="toctext">Here are some ideas/discussions about interfaces</span></a>
<ul>
<li class="toclevel-2"><a href="IRMDiscussion.html#Agent_Controller"><span class="tocnumber">2.1</span> <span class="toctext">Agent Controller</span></a></li>
<li class="toclevel-2"><a href="IRMDiscussion.html#Crawler"><span class="tocnumber">2.2</span> <span class="toctext">Crawler</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Discussion"></a><h2> <span class="mw-headline"> Discussion </span></h2>
<a name="Benefit_of_SCA_for_Integration_Model_os_external_Systems"></a><h3> <span class="mw-headline"> Benefit of SCA for Integration Model of external Systems </span></h3>
<p><a href="http://wiki.eclipse.org/User:G.schmidt.brox.de" title="User:G.schmidt.brox.de">Georg Schmidt</a>: Where is the direct advantage to use SCA for the integration of external systems? In which context should it be used and which advantages are we gaining from it?
</p>
<ul><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: As Agents/Crawlers may be implemented in different programming languages we need a technology to communicate between Agents/Crawlers and their Controllers. SCA provides this functionality, but it does not offer any advantages (except that we will make use of SCA in other parts of SMILA and so it would be more homogeneous). What other possibilities do we have for such a communication&nbsp;?
<ul><li> <a href="http://wiki.eclipse.org/User:G.schmidt.brox.de" title="User:G.schmidt.brox.de">Georg Schmidt</a>: Just plain in-process communication. E.g. calling one OSGi bundle from another.
</li></ul>
</li></ul>
<a name="Entry_Barrier_for_Integration_Developers"></a><h3> <span class="mw-headline"> Entry Barrier for Integration Developers </span></h3>
<p><a href="http://wiki.eclipse.org/User:G.schmidt.brox.de" title="User:G.schmidt.brox.de">Georg Schmidt</a>: Is there already an idea which entry barrier exists for integration developers? How do we handle build integration? How do we do unit tests? Which technologies must the developer know? Which tools could the developer use to perform the development? Which interfaces must the developer implement at minimum to get the easiest integration done? Is there a concept for managing configurations?
</p>
<ul><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: Developers should only have to implement Agents/Crawlers. Must have technologies would only be SCA. I think we should postpone these questions after the concept is stable.
<ul><li> <a href="http://wiki.eclipse.org/User:G.schmidt.brox.de" title="User:G.schmidt.brox.de">Georg Schmidt</a>: Ok. Let's postpone it.
</li></ul>
</li></ul>
<a name="Naming_of_the_modules.2Fcomponents"></a><h3> <span class="mw-headline"> Naming of the modules/components </span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>:
</p>
<ul><li> Modules/Component in the SCA Component View should have Names with Manager/Module/Component in the Name, e.g. Delta Indexing Manager sounds better than only "Incremental Import State"
</li><li> The Agent and Crawler components should be merged into a superior module with a name (something like IRM, but probably better: Connector?). But both components should still exist as components, so a developer can decide to implement both or only one of them for his "irm/Connector" module; this module has to be developed and the other components are delivered as a framework (see Discussion 2.2)
</li><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: I don't think we need a "superior" module in terms of code, but in terms of packaging to allow code reuse for an implementation supporting both the Agents and Crawler interfaces.
</li><li> <a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: That was my intention. We could create a package, that has to be developed and that contains the crawler and/or agent. We need a name for it. Atm there is no border between the IRM FrameWork/interface and the IRM itself (if we should call it IRM).
</li><li> <a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: Probably we could change the architecture overview (the figure) in that way that the components are more separated (task-based!). I see at the moment 3 packages:
<ul><li> Agent/crawler (that have to be developed by a new data source)
</li><li> THE (IRM) Interface: contains the Agent/Crawler Controller, the compound management and the Configuration manager (at this point it is the configuration only for the indexing job).
</li><li> Connectivity Module/Manager (how it is called at the moment?)
</li></ul>
</li></ul>
<p>These are the three main parts; perhaps we can make small (dashed border lines) boxes around them.
From the view of an IRM/agent/crawler developer the IRM Interface (agent/crawler controller/compound management/configuration manager) and the Connectivity Part is contained in ONE package/box. This is the connectivity "box".
From the point of view of nodes there is one node that contains the IRM-Interface and the Connectivity Module. The Agent/Crawler can probably run at the host where the data source runs (another node) and the queue runs distributed on more than one host/node.
Probably there is a misunderstanding and the IRM Interface things should run at the host where the agent/crawler is installed.
If not, I vote for a super package (super module) that contains the existing IRM interface and the connectivity part.
We have to create figures that fit into the super architecture overview figures. In the super architecture overview there are agents/crawlers and the connectivity Module. In our overview these "packages" should appear or should be reused to show the reader the hierarchy.
</p>
<a name="Agent.2FController_conflict_problem.3F"></a><h3> <span class="mw-headline"> Agent/Controller conflict problem? </span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: Agent and crawler should not be allowed to access (send it to Connectivity Manager) the same data/object/information at the same time
--&gt; mutual exclusion / synchronization is needed
</p>
<ul><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: this functionality has to be addressed more generally in the Connectivity Manager, as Agents/Crawlers know nothing about each other.
</li></ul>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: That is correct, synchronization issues have to be managed by the Connectivity Manager.
</p>
<a name="Process_Component_Logic:"></a><h3> <span class="mw-headline"> Process Component Logic: </span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: The Process Component Logic should/can be its own Module/Component, because it handles a lot of work. Thus it can be reused and improved more easily.
</p>
<ul><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: Yes, I agree. At design time I focused only on container objects (like zip), but there are other compounds that must be handled. This is called "Splitter" in the architecture overview and also contains page-based indexing of large files. So we could also need a separate framework here.
</li></ul>
<a name="Definition_of_Interfaces_for_the_Components_.28Agent.2FCrawler_Controller_.26_Connectivity.29:"></a><h3> <span class="mw-headline"> Definition of Interfaces for the Components (Agent/Crawler Controller &amp; Connectivity): </span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: The Data format for the Agent/Crawler Controller should be defined:
</p>
<ul><li> In which format should the retrieved information be returned (Agent/Crawler -&gt; Agent/Crawler Controller)?
</li></ul>
<p>Simple example for the definition (same way it is used with IRMs and the AFE-Engine):
An agent/crawler is defined with an XSD schema. It contains all information fields that the agent/controller can return as XML.
The Agent/Crawler controller can check the XML against the given XSD.
The mentioned "data unifier" can then be used to convert information fields like dates to a uniform format.
A Concept for it is described at <a href="http://wiki.eclipse.org/index.php?title=Index_Order_Configuration_Schema&amp;action=edit" class="new" title="Index Order Configuration Schema">Index Order Configuration Schema</a>.
</p><p>Probably the XML format from ECS-67 can be used for this.
</p>
<ul><li> For the connectivity module a definition of the data format is also needed (Agent/Crawler Controller --&gt; Connectivity).
</li><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: I like the idea of using XML schema. But I think we should not allow an IRM to introduce new data types. Is this possible&nbsp;?
</li><li> <a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: This confluence page <a href="http://wiki.eclipse.org/index.php?title=Index_Order_Configuration_Schema&amp;action=edit" class="new" title="Index Order Configuration Schema">Index Order Configuration Schema</a> contains a more detailed description for the irm configuration
</li></ul>
<a name="Configuration_Management:_Information_retrieval"></a><h3> <span class="mw-headline"> Configuration Management: Information retrieval </span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: An Agent or a Crawler needs to know which information should be retrieved. Thus the Agent/Crawler can retrieve only the necessary information (lazy initialization: long retrieval operations should only be executed if they are necessary). Should this information be stored within the config?
(BTW: is the binding of information to index fields done in another part of the framework? That means this configuration file should also be used in other parts of the framework. Should it also be used for the configuration of index fields and their parameters for the search configuration (AND/OR/wildcards... search-dependent parameters)?)
The same idea from 2.6 with the XML/XSD definition of a self-built IRM/Connector could be used. The configuration (config) has a special part where the information that should be retrieved is described with the same XML tags used for the information transport between agent/crawler and agent/crawler controller.
</p><p>These two parts of the config could also be verified by the agent/crawler controller with the XSD of the agent/crawler (IRM/Connector).
See <a href="http://wiki.eclipse.org/index.php?title=Index_Order_Configuration_Schema&amp;action=edit" class="new" title="Index Order Configuration Schema">Index Order Configuration Schema</a> for more information.
2.6 and 2.7 should be discussed, and if the idea is good we can create an exact definition for this (the definition is now here: <a href="http://wiki.eclipse.org/index.php?title=Index_Order_Configuration_Schema&amp;action=edit" class="new" title="Index Order Configuration Schema">Index Order Configuration Schema</a>).
</p>
<ul><li> <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>: We should separate the configuration of what information an agent/crawler provides from what information is indexed and how this information is searched. There may be simple cases, where the configuration is equal, but the processing of the crawled information usually leads to additional index fields. Also not every information may be used for indexing/searching, but may be stored in the XML Data Storage for other use cases (e.g. Mashup).
</li><li> <a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>: Ok.
</li></ul>
<a name="Dealing.2FHandling_with_special_information_like_permissions:"></a><h3> <span class="mw-headline"> Dealing/Handling with special information like permissions: </span></h3>
<p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>:
A lot of data sources use external security management. That means the assignment of a user to a security group or something else is not stored within the data source. An Example is the use of LDAP or Windows Domain for the security management. Data sources store only the permissions of users and groups of them. Therefore IRM has no knowledge while indexing which user is assigned to a group.
</p><p>Thus existing IRMs for the AF-Engine return permissions for an information object/entry unchanged and the search implementation uses a module that gathers the information about the assignment of users to groups. This module then translates the search query, in order that the search only returns entries that the search user is allowed to see.
Should we keep this workflow for SMILA?
<a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>:
</p>
<ul><li> I agree that we need functionality to resolve security information (members of a group, groups of a user).
</li><li> your approach is good as
<ul><li> less data is stored than when the Crawler would resolve the information (groups can have lots of members)
</li><li> changes in user-&gt;group assignment can be applied without need for reindexing
</li></ul>
</li><li> how does it affect the search performance&nbsp;? During indexing the time spent to resolve this information is not so critical.
</li><li> I suggest that we just provide the functionality in the framework but do not constitute on how security information is handled. There may be scenarios where the security information is only accessible via Agents/Crawlers. Your approach should be emphasized in "Best Practices".
</li></ul>
<p><a href="http://wiki.eclipse.org/index.php?title=User:Igor.Novakovic.empolis.com&amp;action=edit" class="new" title="User:Igor.Novakovic.empolis.com">Igor Novakovic</a>:
</p>
<ul><li> We used the same approach in our product called e:SLS. We stored only group information bound to a document (this binding has a rather static nature) in the index. Now, before the search query has been executed, the groups in which the user (who fired that search request) is a member would be resolved and used as a filter criteria for the search. Additionally, the retrieved documents were checked against the actual current access rights of that user in order to make sure, that he/she can really read those documents. (The group-access information, stored in the index, may be out of date if the document has not been reindexed after the access right on that document changed.)
</li></ul>
<p><br />
</p>
<a name="Here_are_some_ideas.2Fdiscussions_about_interfaces"></a><h2> <span class="mw-headline"> Here are some ideas/discussions about interfaces </span></h2>
<a name="Agent_Controller"></a><h3> <span class="mw-headline"> Agent Controller </span></h3>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> AgentController
<span class="br0">&#123;</span>
<span class="kw4">void</span> add<span class="br0">&#40;</span>Record<span class="br0">&#41;</span> <span class="co1">// triggers the add process</span>
<span class="kw4">void</span> update<span class="br0">&#40;</span>Record<span class="br0">&#41;</span> <span class="co1">// triggers the update process</span>
<span class="kw4">void</span> delete<span class="br0">&#40;</span>Record<span class="br0">&#41;</span> <span class="co1">// triggers the delete process, the Record most likely will only contain the ID and no data</span>
<span class="br0">&#125;</span></pre></div>
<a name="Crawler"></a><h3> <span class="mw-headline"> Crawler </span></h3>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> Crawler
<span class="br0">&#123;</span>
Iterator&lt;Record&gt; crawl<span class="br0">&#40;</span>Config<span class="br0">&#41;</span>
<span class="br0">&#125;</span></pre></div>
<p>Starts a crawl process (as separate thread(s)) with the given Configuration and returns an Iterator on the crawled data. In this case the Iterator has to be a Service that is created on demand. The Iterator's hasNext() method should not return a boolean, but an IncImportData object (e.g. a hash token) if it has a next element, or NULL if no more elements exist. The IncImportData (probably a hash) is needed in the CrawlerController to determine if this data needs to be processed. The Iterator also needs a method skip() to move the iterator to the next element without getting the current element.
{note:title=Technical Note}My idea was that the Crawler Controller initiates a new crawl process by calling method crawl() on the Crawler, which returns an Iterator on the data to the Crawler Controller.
Therefore I made some tests with Tuscany using Conversations to simulate this interaction. In general it works, but Tuscany seems to have a bug when returning ServiceReferences. Initiated Conversations are not reused. I created <a href="https://issues.apache.org/jira/browse/TUSCANY-2028" class="external free" title="https://issues.apache.org/jira/browse/TUSCANY-2028">https://issues.apache.org/jira/browse/TUSCANY-2028</a> in Tuscany JIRA to address this limitation. This bug is fixed in Tuscany 1.1
{note}
\\
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> <span class="kw3">Iterator</span>
<span class="br0">&#123;</span>
<span class="coMULTI">/**
Checks if more data objects are available.
@return a Record containing data for delta indexing (ID and hash) or null if no more data objects exist.
*/</span>
Record hasNext<span class="br0">&#40;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="coMULTI">/**
Moves the iterator to the next data object after accessing and returning the current data object as a Record
@return a Record containing the complete data
*/</span>
Record next<span class="br0">&#40;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="coMULTI">/**
Moves the iterator to the next element without accessing and returning the data object
*/</span>
<span class="kw4">void</span> skip<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<p>{info:title=Alternative Interface Design}
We could also provide the following interface. It seems to be more flexible than the initial one and distributes the implementation logic between the Crawler and Iterator. In the initial approach the main logic is provided by the iterator.
A second benefit is that it allows direct access to a selected Record, which may be needed in BPEL during search:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> Crawler
<span class="br0">&#123;</span>
<span class="coMULTI">/**
Returns an Iterator with Records on delta indexing information
*/</span>
Iterator&lt;Record&gt; crawl<span class="br0">&#40;</span>Config<span class="br0">&#41;</span>
&nbsp;
<span class="coMULTI">/**
Gets a Record with all data by ID
*/</span>
Record getRecord<span class="br0">&#40;</span>ID<span class="br0">&#41;</span>
<span class="br0">&#125;</span></pre></div>
<div dir="ltr" style="text-align: left;"><pre class="source-java"><span class="kw1">interface</span> <span class="kw3">Iterator</span>
<span class="br0">&#123;</span>
<span class="coMULTI">/**
Checks if more data exists and returns true if one or more data exists, false otherwise
*/</span>
<span class="kw4">boolean</span> hasNext<span class="br0">&#40;</span><span class="br0">&#41;</span>;
&nbsp;
<span class="coMULTI">/**
Returns one or more Records containing delta indexing information.
*/</span>
List&lt;Record&gt; next<span class="br0">&#40;</span><span class="br0">&#41;</span>;
<span class="br0">&#125;</span></pre></div>
<p>With this interface, the Iterator iterates only on delta indexing information. It does not access all of the object's data and does not return this data in any way. Access to the complete data is provided by the Crawler interface, using the Record ID.
Note that iteration and access of data are asynchronous. This may be difficult or even impossible to implement for certain data sources, or the size of the List may have to be reduced to one (compare empolis Exchange Connector).
Perhaps we should introduce a data type DeltaIndexingRecord to distinguish between Records with complete data and Records with delta indexing data.
</p><p><a href="http://wiki.eclipse.org/index.php?title=User:S.voigt.brox.de&amp;action=edit" class="new" title="User:S.voigt.brox.de">Sebastian Voigt</a>:
I would prefer the upper Crawler Interface.
The alternate interface doesn't fit the workflow for a crawler from my point of view. The crawler will crawl each item step by step, the getRecord mechanism forces the Crawler to cache the Record information for each entry to return them when the Controller asks for it, or it has to access the entry in the data source twice which means lower performance. The Record hasNext(), Record next() Iteration is better, it is an easier workflow and therefore easier for the Crawler developer.
{info}
</p>
<!--
NewPP limit report
Preprocessor node count: 28/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:15272-0!1!0!!en!2!edit=0 and timestamp 20130416061025 -->
<div class="printfooter">
Retrieved from "<a href="IRMDiscussion.html">http://wiki.eclipse.org/SMILA/Project_Concepts/IRMDiscussion</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"></div>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2013 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 14:23, 24 January 2012 by <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a>. Based on work by <a href="http://wiki.eclipse.org/User:Daniel.stucky.empolis.com" title="User:Daniel.stucky.empolis.com">Daniel Stucky</a>.</p>
<p id="footerviews">This page has been accessed 1,459 times.</p>
</div>
<script type="text/javascript">
// Pick the Google Analytics host prefix that matches the protocol this page was loaded over:
// "https://ssl." for https pages, "http://www." otherwise.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
// Write a script element for ga.js into the document while it is still being parsed.
// The tag is percent-encoded and unescaped at runtime - presumably so that the literal
// closing script tag does not terminate this inline script early.
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// Report this page view to Google Analytics under account "UA-910670-4".
// _gat is only defined once ga.js has been loaded; if that script was blocked or
// failed to load, referencing _gat throws. Analytics is best-effort, so the error
// is deliberately ignored instead of surfacing as an uncaught exception.
try {
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
} catch (err) {
// Intentionally empty: page-view tracking must never break the page.
}
</script>
<!-- <div class="visualClear"></div> -->
<!-- Calls runOnloadHook() only when the page has defined it on window; the hook itself is not defined in this part of the file. -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.053 secs. --></body></html>