blob: c356d2c2b49c28514b12a6de7cae97a87d0f5ee3 [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="keywords" content="SMILA/Documentation/Data Model and Serialization Formats,SMILA/Documentation/2011.Simplification/Search" />
<link rel="shortcut icon" href="http://wiki.eclipse.org/SMILA/Documentation/favicon.ico" />
<link rel="search" type="application/opensearchdescription+xml" href="http://wiki.eclipse.org/opensearch_desc.php" title="Eclipsepedia (English)" />
<link rel="alternate" type="application/rss+xml" title="Eclipsepedia RSS Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=rss" />
<link rel="alternate" type="application/atom+xml" title="Eclipsepedia Atom Feed" href="http://wiki.eclipse.org/index.php?title=Special:Recentchanges&amp;feed=atom" />
<title>SMILA/Documentation/Data Model and Serialization Formats - Eclipsepedia</title>
<style type="text/css" media="screen,projection">/*<![CDATA[*/ @import "http://wiki.eclipse.org/skins/eclipsenova/novaWide.css?116"; /*]]>*/</style>
<link rel="stylesheet" type="text/css" media="print" href="http://wiki.eclipse.org/skins/eclipsenova/eclipsenovaPrint.css?116" />
<link rel="stylesheet" type="text/css" media="handheld" href="http://wiki.eclipse.org/skins/eclipsenova/handheld.css?116" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/header.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/tabs.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/visual.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/layout.css" media="screen" />
<link rel="stylesheet" type="text/css" href="http://wiki.eclipse.org/skins/eclipsenova/Nova/css/footer.css" media="screen" />
<!--[if IE]><link rel="stylesheet" type="text/css" href="/skins/eclipsenova/IEpngfix.css" media="screen" /><![endif]-->
<!--[if lt IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE50Fixes.css?116";</style> <![endif]-->
<!--[if IE 5.5000]><style type="text/css">@import "/skins/eclipsenova/IE55Fixes.css?116";</style><![endif]-->
<!--[if IE 6]><style type="text/css">@import "/skins/eclipsenova/IE60Fixes.css?116";</style><![endif]-->
<!--[if IE 7]><style type="text/css">@import "/skins/eclipsenova/IE70Fixes.css?116";</style><![endif]-->
<!--[if lt IE 7]><script type="text/javascript" src="/skins/common/IEFixes.js?116"></script>
<meta http-equiv="imagetoolbar" content="no" /><![endif]-->
<script type= "text/javascript">/*<![CDATA[*/
var skin = "eclipsenova";
var stylepath = "/skins";
var wgArticlePath = "/$1";
var wgScriptPath = "";
var wgScript = "/index.php";
var wgServer = "http://wiki.eclipse.org";
var wgCanonicalNamespace = "";
var wgCanonicalSpecialPageName = false;
var wgNamespaceNumber = 0;
var wgPageName = "SMILA/Documentation/Data_Model_and_Serialization_Formats";
var wgTitle = "SMILA/Documentation/Data Model and Serialization Formats";
var wgAction = "view";
var wgRestrictionEdit = [];
var wgRestrictionMove = [];
var wgArticleId = "30891";
var wgIsArticle = true;
var wgUserName = null;
var wgUserGroups = null;
var wgUserLanguage = "en";
var wgContentLanguage = "en";
var wgBreakFrames = false;
var wgCurRevisionId = "333600";
var wgVersion = "1.12.0";
var wgEnableAPI = true;
var wgEnableWriteAPI = false;
/*]]>*/</script>
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/wikibits.js?116"><!-- wikibits js --></script>
<!-- Performance mods similar to those for bug 166401 -->
<script type="text/javascript" src="http://wiki.eclipse.org/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=eclipsenova"><!-- site js --></script>
<!-- Head Scripts -->
<script type="text/javascript" src="http://wiki.eclipse.org/skins/common/ajax.js?116"></script>
<style type="text/css">/*<![CDATA[*/
.source-xml {line-height: normal; font-size: medium;}
.source-xml li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for xml
* CSS class: source-xml, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-xml .de1, .source-xml .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-xml {}
.source-xml .head {}
.source-xml .foot {}
.source-xml .imp {font-weight: bold; color: red;}
.source-xml .ln-xtra {color: #cc0; background-color: #ffc;}
.source-xml li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-xml li.li2 {font-weight: bold;}
.source-xml .coMULTI {color: #808080; font-style: italic;}
.source-xml .es0 {color: #000099; font-weight: bold;}
.source-xml .br0 {color: #66cc66;}
.source-xml .st0 {color: #ff0000;}
.source-xml .nu0 {color: #cc66cc;}
.source-xml .sc0 {color: #00bbdd;}
.source-xml .sc1 {color: #ddbb00;}
.source-xml .sc2 {color: #339933;}
.source-xml .sc3 {color: #009900;}
.source-xml .re0 {color: #000066;}
.source-xml .re1 {font-weight: bold; color: black;}
.source-xml .re2 {font-weight: bold; color: black;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-javascript {line-height: normal; font-size: medium;}
.source-javascript li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for javascript
* CSS class: source-javascript, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-javascript .de1, .source-javascript .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-javascript {}
.source-javascript .head {}
.source-javascript .foot {}
.source-javascript .imp {font-weight: bold; color: red;}
.source-javascript .ln-xtra {color: #cc0; background-color: #ffc;}
.source-javascript li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-javascript li.li2 {font-weight: bold;}
.source-javascript .kw1 {color: #000066; font-weight: bold;}
.source-javascript .kw2 {color: #003366; font-weight: bold;}
.source-javascript .kw3 {color: #000066;}
.source-javascript .co1 {color: #009900; font-style: italic;}
.source-javascript .coMULTI {color: #009900; font-style: italic;}
.source-javascript .es0 {color: #000099; font-weight: bold;}
.source-javascript .br0 {color: #66cc66;}
.source-javascript .st0 {color: #3366CC;}
.source-javascript .nu0 {color: #CC0000;}
.source-javascript .me1 {color: #006600;}
.source-javascript .sc0 {}
.source-javascript .sc1 {}
.source-javascript .sc2 {}
.source-javascript .sc3 {}
.source-javascript .re0 {color: #0066FF;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><style type="text/css">/*<![CDATA[*/
.source-text {line-height: normal; font-size: medium;}
.source-text li {line-height: normal;}
/**
* GeSHi Dynamically Generated Stylesheet
* --------------------------------------
* Dynamically generated stylesheet for text
* CSS class: source-text, CSS id:
* GeSHi (C) 2004 - 2007 Nigel McNie (http://qbnz.com/highlighter)
*/
.source-text .de1, .source-text .de2 {font-family: 'Courier New', Courier, monospace; font-weight: normal;}
.source-text {}
.source-text .head {}
.source-text .foot {}
.source-text .imp {font-weight: bold; color: red;}
.source-text .ln-xtra {color: #cc0; background-color: #ffc;}
.source-text li {font-family: 'Courier New', Courier, monospace; color: black; font-weight: normal; font-style: normal;}
.source-text li.li2 {font-weight: bold;}
/*]]>*/
</style>
<style type="text/css">/*<![CDATA[*/
@import "http://wiki.eclipse.org/index.php?title=MediaWiki:Geshi.css&usemsgcache=yes&action=raw&ctype=text/css&smaxage=18000";
/*]]>*/
</style><link rel="stylesheet" type="text/css" href="Data_Model_and_Serialization_Formats.html" /> </head>
<body class="mediawiki ns-0 ltr page-SMILA_Documentation_Data_Model_and_Serialization_Formats">
<div id="globalWrapper">
<div id="column-one">
<!-- Eclipse Additions for the Top Nav start here M. Ward-->
<div id="header">
<div id="header-graphic">
<img src="http://wiki.eclipse.org/skins/eclipsenova/eclipse.png" alt="Eclipse Wiki" />
</div>
<!-- Pulled 101409 Mward -->
<div class="portlet" id="p-personal">
<div class="pBody">
<ul>
<li id="pt-login"><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Data_Model_and_Serialization_Formats">Log in</a></li>
</ul>
</div>
</div>
<div id="header-icons">
<div id="sites">
<ul id="sitesUL">
<li><a href="http://www.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/eclipseIcon.png" width="28" height="28" alt="Eclipse Foundation" title="Eclipse Foundation" /><div>Eclipse Foundation</div></a></li>
<li><a href="http://marketplace.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/marketplace.png" width="28" height="28" alt="Eclipse Marketplace" title="Eclipse Marketplace" /><div>Eclipse Marketplace</div></a></li>
<li><a href="https://bugs.eclipse.org/bugs"><img src="http://dev.eclipse.org/custom_icons/system-search-bw.png" width="28" height="28" alt="Bugzilla" title="Bugzilla" /><div>Bugzilla</div></a></li>
<li><a href="http://live.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/audio-input-microphone-bw.png" width="28" height="28" alt="Live" title="Live" /><div>Eclipse Live</div></a></li>
<li><a href="http://planeteclipse.org"><img src="http://dev.eclipse.org/large_icons/devices/audio-card.png" width="28" height="28" alt="PlanetEclipse" title="Planet" /><div>Planet Eclipse</div></a></li>
<li><a href="http://portal.eclipse.org"><img src="http://dev.eclipse.org/custom_icons/preferences-system-network-proxy-bw.png" width="28" height="28" alt="Portal" title="Portal" /><div>My Foundation Portal</div></a></li>
</ul>
</div>
</div>
</div>
<!-- NEW HEADER STUFF HERE -->
<div id="header-menu">
<div id="header-nav">
<ul> <li><a class="first_one" href="http://wiki.eclipse.org/" target="_self">Home</a></li> <li><a href="http://www.eclipse.org/downloads/" target="_self">Downloads</a></li>
<li><a href="http://www.eclipse.org/users/" target="_self">Users</a></li>
<li><a href="http://www.eclipse.org/membership/" target="_self">Members</a></li>
<li><a href="http://wiki.eclipse.org/index.php/Development_Resources" target="_self">Committers</a></li>
<li><a href="http://www.eclipse.org/resources/" target="_self">Resources</a></li>
<li><a href="http://www.eclipse.org/projects/" target="_self">Projects</a></li>
<li><a href="http://www.eclipse.org/org/" target="_self">About Us</a></li>
</ul>
</div>
<div id="header-utils">
<!-- moved the search window here -->
<form action="http://wiki.eclipse.org/Special:Search" >
<input class="input" name="search" type="text" accesskey="f" value="" />
<input type='submit' onclick="this.submit();" name="go" id="searchGoButton" class="button" title="Go to a page with this exact name if one exists" value="Go" />&nbsp;
<input type='submit' onclick="this.submit();" name="fulltext" class="button" id="mw-searchButton" title="Search Eclipsepedia for this text" value="Search" />
</form>
</div>
</div>
<!-- Eclipse Additions for the Header stop here -->
<!-- Additions and mods for leftside nav Start here -->
<!--Started nav rip here-->
<!-- these are the nav controls main page, changes etc -->
<div id="novaContent" class="faux">
<div id="leftcol">
<ul id="leftnav">
<!-- these are the page controls, edit history etc -->
<li class="separator"><a class="separator">Navigation &#160;&#160;</a></li>
<li id="n-mainpage"><a href="http://wiki.eclipse.org/Main_Page">Main Page</a></li>
<li id="n-portal"><a href="http://wiki.eclipse.org/Eclipsepedia:Community_Portal">Community portal</a></li>
<li id="n-currentevents"><a href="http://wiki.eclipse.org/Eclipsepedia:Current_events">Current events</a></li>
<li id="n-recentchanges"><a href="http://wiki.eclipse.org/Special:Recentchanges">Recent changes</a></li>
<li id="n-randompage"><a href="http://wiki.eclipse.org/Special:Random">Random page</a></li>
<li id="n-help"><a href="http://wiki.eclipse.org/Help:Contents">Help</a></li>
<li class="separator"><a class="separator">Toolbox &#160;&#160;</a></li>
<li id="t-whatlinkshere"><a href="http://wiki.eclipse.org/Special:Whatlinkshere/SMILA/Documentation/Data_Model_and_Serialization_Formats">What links here</a></li>
<li id="t-recentchangeslinked"><a href="http://wiki.eclipse.org/Special:Recentchangeslinked/SMILA/Documentation/Data_Model_and_Serialization_Formats">Related changes</a></li>
<!-- This is the toolbox section -->
<li id="t-upload"><a href="http://wiki.eclipse.org/Special:Upload">Upload file</a></li>
<li id="t-specialpages"><a href="http://wiki.eclipse.org/Special:Specialpages">Special pages</a></li>
<li id="t-print"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Data_Model_and_Serialization_Formats&amp;printable=yes">Printable version</a></li> <li id="t-permalink"><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Data_Model_and_Serialization_Formats&amp;oldid=333600">Permanent link</a></li> </ul>
</div>
<!-- Additions and mods for leftside nav End here -->
<div id="column-content">
<div id="content">
<a name="top" id="top"></a>
<div id="tabs">
<ul class="primary">
<li class="active"><a href="Data_Model_and_Serialization_Formats.html"><span class="tab">Page</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Talk:SMILA/Documentation/Data_Model_and_Serialization_Formats&amp;action=edit"><span class="tab">Discussion</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Data_Model_and_Serialization_Formats&amp;action=edit"><span class="tab">View source</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Data_Model_and_Serialization_Formats&amp;action=history"><span class="tab">History</span></a></li>
<li><a href="http://wiki.eclipse.org/index.php?title=Special:Userlogin&amp;returnto=SMILA/Documentation/Data&#32;Model&#32;and&#32;Serialization&#32;Formats"><span class="tab">Edit</span></a></li>
</ul>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<h1 class="firstHeading">SMILA/Documentation/Data Model and Serialization Formats</h1>
<div id="bodyContent">
<h3 id="siteSub">From Eclipsepedia</h3>
<div id="contentSub"><span class="subpages">&lt; <a href="../../SMILA.html" title="SMILA">SMILA</a> | <a href="../Documentation.1.html" title="SMILA/Documentation">Documentation</a></span></div>
<div id="jump-to-nav">Jump to: <a href="Data_Model_and_Serialization_Formats.html#column-one">navigation</a>, <a href="Data_Model_and_Serialization_Formats.html#searchInput">search</a></div> <!-- start content -->
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="Data_Model_and_Serialization_Formats.html#SMILA_Data_Model"><span class="tocnumber">1</span> <span class="toctext">SMILA Data Model</span></a>
<ul>
<li class="toclevel-2"><a href="Data_Model_and_Serialization_Formats.html#Concepts"><span class="tocnumber">1.1</span> <span class="toctext">Concepts</span></a>
<ul>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#System_attributes"><span class="tocnumber">1.1.1</span> <span class="toctext">System attributes</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Date_and_DateTime_formats"><span class="tocnumber">1.1.2</span> <span class="toctext">Date and DateTime formats</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="Data_Model_and_Serialization_Formats.html#XML_format"><span class="tocnumber">1.2</span> <span class="toctext">XML format</span></a></li>
<li class="toclevel-2"><a href="Data_Model_and_Serialization_Formats.html#JSON_format"><span class="tocnumber">1.3</span> <span class="toctext">JSON format</span></a></li>
<li class="toclevel-2"><a href="Data_Model_and_Serialization_Formats.html#BON_Binary_Object_Notation_Format"><span class="tocnumber">1.4</span> <span class="toctext">BON Binary Object Notation Format</span></a>
<ul>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Format_introduction"><span class="tocnumber">1.4.1</span> <span class="toctext">Format introduction</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Scalar_Types"><span class="tocnumber">1.4.2</span> <span class="toctext">Scalar Types</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Integer_compressing"><span class="tocnumber">1.4.3</span> <span class="toctext">Integer compressing</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Binary_Type"><span class="tocnumber">1.4.4</span> <span class="toctext">Binary Type</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Token_Bytes"><span class="tocnumber">1.4.5</span> <span class="toctext">Token Bytes</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Backward_compatible_extension_concept"><span class="tocnumber">1.4.6</span> <span class="toctext">Backward compatible extension concept</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Custom_Type"><span class="tocnumber">1.4.7</span> <span class="toctext">Custom Type</span></a></li>
<li class="toclevel-3"><a href="Data_Model_and_Serialization_Formats.html#Examples"><span class="tocnumber">1.4.8</span> <span class="toctext">Examples</span></a>
<ul>
<li class="toclevel-4"><a href="Data_Model_and_Serialization_Formats.html#Integer"><span class="tocnumber">1.4.8.1</span> <span class="toctext">Integer</span></a></li>
<li class="toclevel-4"><a href="Data_Model_and_Serialization_Formats.html#String"><span class="tocnumber">1.4.8.2</span> <span class="toctext">String</span></a></li>
<li class="toclevel-4"><a href="Data_Model_and_Serialization_Formats.html#Complex_example"><span class="tocnumber">1.4.8.3</span> <span class="toctext">Complex example</span></a></li>
<li class="toclevel-4"><a href="Data_Model_and_Serialization_Formats.html#Complex_example_with_Custom_Type"><span class="tocnumber">1.4.8.4</span> <span class="toctext">Complex example with Custom Type</span></a></li>
<li class="toclevel-4"><a href="Data_Model_and_Serialization_Formats.html#Complex_example_with_attachments"><span class="tocnumber">1.4.8.5</span> <span class="toctext">Complex example with attachments</span></a></li>
</ul>
</li>
</ul>
</li>
<li class="toclevel-2"><a href="Data_Model_and_Serialization_Formats.html#Record_Filters"><span class="tocnumber">1.5</span> <span class="toctext">Record Filters</span></a></li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="SMILA_Data_Model"></a><h2> <span class="mw-headline"> SMILA Data Model </span></h2>
<ul><li> Implementation bundle: <tt>org.eclipse.smila.datamodel</tt>
</li><li> Current Version: 1.0.0
</li></ul>
<a name="Concepts"></a><h3> <span class="mw-headline"> Concepts </span></h3>
<p>The data to be processed in SMILA is represented as <b>records</b>. For example, one record could correspond to one document or to any resource which should be indexed or found in a search. A record consists of <b>metadata</b> and optional <b>attachments</b>.
</p><p><a href="http://wiki.eclipse.org/Image:SMILA-datamodel-1.0.png" class="image" title="SMILA data model version 1.0"><img alt="SMILA data model version 1.0" src="http://wiki.eclipse.org/images/thumb/b/b2/SMILA-datamodel-1.0.png/800px-SMILA-datamodel-1.0.png" width="800" height="600" border="0" /></a>
</p>
<dl><dt>Metadata</dt><dd>
</dd></dl>
<p>Metadata contains typed <b>values</b> (literals) arranged in <b>maps</b> (key-anything associations) and <b>sequences</b> (lists of anything). Values can be strings, long integers, double precision floating point numbers, booleans, dates (year, month, day) or datetimes (date + time of day, down to seconds). Maps and sequences can be nested arbitrarily; map keys are always strings. All metadata of one record is arranged in a single Map.
</p>
<dl><dt>Attachments</dt><dd>
</dd></dl>
<p>Attachments can contain any binary content ("byte arrays"), possibly of larger size. Whether the content is kept in memory or read from a persistence service on demand depends on the implementation of the interface. Currently the size is limited to 2 GB (maximum size of a Java <tt>byte[]</tt>), but we are planning to extend this in the future.
</p><p>A single entry in a record's metadata map is called <b>Metadata element</b>.
According to the use case, metadata elements can be semantically interpreted as:
</p>
<dl><dt>Attributes</dt><dd> Usually, attributes are used when referring to the metadata of an object which is to be processed from a given data source or which is retrieved as the result of a search request. For example, typical attributes characterizing a web page to be indexed are its URL, the size in bytes, the MIME type, the title, and the plain-text content. These attributes are defined by the application domain.
</dd></dl>
<dl><dt>Parameters</dt><dd> Attributes may not be adequate or sufficient for all record types. For example, in search processing, a record represents not a single object from some data source but rather a search request object. In such a case, the record's metadata does not contain attributes from the application domain on top-level but rather <i>request parameters</i> that configure and influence the request execution. These parameters are defined by the pipelets which are used in the workflow that was triggered by the search request. Also, their names do not start with underscores. However, a request or result record may contain application-specific attributes on deeper nested levels. Find an example, hopefully illustrating the difference between attributes and parameters, in <a href="2011.Simplification/Search.html" class="mw-redirect" title="SMILA/Documentation/2011.Simplification/Search">Search API</a>.
</dd></dl>
<dl><dt>Annotations</dt><dd> An annotation can be used to add a data structure to the record which was generated as the result of some processing step. E.g., a named-entity-recognition pipelet could add an annotation describing at which character position some entity was found, meaning that the record was <i>annotated</i> with this additional information. If annotations appear in the same maps as attributes, their names should be chosen in such a way that they will not conflict with attribute names from the application, e.g. by prefixing them with an underscore "_".
</dd></dl>
<dl><dt>System attributes</dt><dd> These attributes are needed by SMILA in order to coordinate the processing of a record (see below). Their names start with an underscore "_", so that they will not conflict with names from the application domain.
</dd></dl>
<a name="System_attributes"></a><h4> <span class="mw-headline"> System attributes </span></h4>
<dl><dt>RecordID</dt><dd> Every record must contain a single-valued string attribute named "_recordid" which is required to identify the record. It must be unique for all processed records. This must be ensured by whoever created and submitted the record to the system (this would be crawlers or agents, usually). There is no predefined format of the record ID, hence it can contain any string. So, creating UUIDs or something similar would be entirely sufficient. Also, the producer must place any information needed to access the original data from which the record was produced into explicitly named attributes.
</dd><dt>Source</dt><dd> Every record should also contain a second system attribute named "_source" which contains the ID of the data source (e.g. crawler definition) that produced it. This is used by DeltaIndexing or RecordStorage to perform operations on all records from the same source.
</dd></dl>
<a name="Date_and_DateTime_formats"></a><h4> <span class="mw-headline"> Date and DateTime formats </span></h4>
<p>Internally, date and datetime values are represented as instances of <a href="http://download.oracle.com/javase/7/docs/api/java/util/Date.html" class="external text" title="http://download.oracle.com/javase/7/docs/api/java/util/Date.html" rel="nofollow"><code>java.util.Date</code></a>, which means that they are stored as the number of milliseconds since January 1, 1970, 00:00:00 GMT. For the string serialization used in XML, JSON or BON (see below) the following rules apply:
</p>
<ul><li> The format of date values is "yyyy-MM-dd" (see <a href="http://download.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html" class="external text" title="http://download.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html" rel="nofollow">SimpleDateFormat</a> for the meaning of the format string). The year must have exactly 4 digits, the month and day must have 2 digits.
</li><li> The format of datetime values is either "yyyy-MM-dd'T'HH:mm:ss&lt;TZ&gt;" or "yyyy-MM-dd'T'HH:mm:ss.SSS&lt;TZ&gt;", with "&lt;TZ&gt;" being either "X", "XX" or "XXX".
<ul><li> For the date part the date value rules apply.
</li><li> Milliseconds are optional when parsing datetime values from strings, but if given, they must have exactly 3 digits.
</li><li> The time zone information must be included and must conform to <a href="http://en.wikipedia.org/wiki/ISO_8601#Time_zone_designators" class="external text" title="http://en.wikipedia.org/wiki/ISO_8601#Time_zone_designators" rel="nofollow">ISO 8601 time zone designators</a>, which may be either "Z" for GMT/UTC/ZuluTime, or one of the forms "[+-]hh", "[+-]hhmm", "[+-]hh:mm", denoting the offset from UTC. Examples would be "+0100" for Central European Time (CET, MEZ) or "-0500" for Eastern Standard Time (EST). Of course, using "+00", "+0000" or "+00:00" for GMT/UTC/ZuluTime is fine, too. An exception is the negative sign with a zero offset, which is not a valid time zone designator in ISO 8601 (like "-00").
</li><li> The default time zone designator format in SMILA is of the form "XX" which is represented by "Sign TwoDigitHours Minutes" (like "-0830").
</li><li> When a datetime value is created by parsing from a string (e.g. by parsing XML, JSON or BON, or using the <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/datamodel/DataFactory.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/datamodel/DataFactory.html" rel="nofollow"><code>DataFactory.parseFromString</code></a> methods), it will be printed in exactly the same way when serialized again to XML, JSON or BON (see <a href="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/datamodel/ValueFormatHelper.html" class="external text" title="http://build.eclipse.org/rt/smila/javadoc/current/org/eclipse/smila/datamodel/ValueFormatHelper.html" rel="nofollow"><code>ValueFormatHelper.getDefaultDateTimeFormat</code></a>).
</li><li> When a datetime value was created in Java from an instance of <code>java.util.Date</code> immediately, it will be serialized using the default timezone of the creating JVM. The milliseconds will be included, too, even if they are just 000.
</li></ul>
</li></ul>
<a name="XML_format"></a><h3> <span class="mw-headline"> XML format </span></h3>
<p>The XML format of a record is designed to be quite compact:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;Record</span> <span class="re0">xmlns</span>=<span class="st0">&quot;http://www.eclipse.org/smila/record&quot;</span> <span class="re0">version</span>=<span class="st0">&quot;2.0&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;_recordid&quot;</span><span class="re2">&gt;</span></span>web:http://example.org/something<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;_source&quot;</span><span class="re2">&gt;</span></span>web<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;url&quot;</span><span class="re2">&gt;</span></span>http://example.org/something<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;filesize&quot;</span> <span class="re0">type</span>=<span class="st0">&quot;long&quot;</span><span class="re2">&gt;</span></span>1234<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;sizeInKb&quot;</span> <span class="re0">type</span>=<span class="st0">&quot;double&quot;</span><span class="re2">&gt;</span></span>1.2<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;checked&quot;</span> <span class="re0">type</span>=<span class="st0">&quot;boolean&quot;</span><span class="re2">&gt;</span></span>true<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;created&quot;</span> <span class="re0">type</span>=<span class="st0">&quot;date&quot;</span><span class="re2">&gt;</span></span>2010-12-02<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;lastModified&quot;</span> <span class="re0">type</span>=<span class="st0">&quot;datetime&quot;</span><span class="re2">&gt;</span></span>2010-12-02T16:20:54.123+0100<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;trustee&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>group1<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val<span class="re2">&gt;</span></span></span>group2<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;author&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;firstname&quot;</span><span class="re2">&gt;</span></span>John<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;lastname&quot;</span><span class="re2">&gt;</span></span>Doe<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;firstname&quot;</span><span class="re2">&gt;</span></span>Lisa<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;lastname&quot;</span><span class="re2">&gt;</span></span>Müller<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Seq<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Map</span> <span class="re0">key</span>=<span class="st0">&quot;contact&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;email&quot;</span><span class="re2">&gt;</span></span>Homer.Simpson@powerplant.com<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Map</span> <span class="re0">key</span>=<span class="st0">&quot;address&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;street&quot;</span><span class="re2">&gt;</span></span>742 Evergreen Terrace<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Val</span> <span class="re0">key</span>=<span class="st0">&quot;city&quot;</span><span class="re2">&gt;</span></span>Springfield<span class="sc3"><span class="re1">&lt;/Val<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Map<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Seq</span> <span class="re0">key</span>=<span class="st0">&quot;emptylist&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Map</span> <span class="re0">key</span>=<span class="st0">&quot;emptymap&quot;</span> <span class="re2">/&gt;</span></span>
&nbsp;
<span class="sc3"><span class="re1">&lt;Attachment<span class="re2">&gt;</span></span></span>content<span class="sc3"><span class="re1">&lt;/Attachment<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Attachment<span class="re2">&gt;</span></span></span>fulltext<span class="sc3"><span class="re1">&lt;/Attachment<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/Record<span class="re2">&gt;</span></span></span></pre></div>
<p><b>Notes:</b>
</p>
<ul><li> The Any objects are represented by <tt>&lt;Val&gt;</tt>, <tt>&lt;Map&gt;</tt>, and <tt>&lt;Seq&gt;</tt> elements.
</li><li> An object that is part of a map must have an additional <i>key</i> attribute. Elements of sequences must not have the <i>key</i> attribute.
</li><li> The type of a value is defined by an optional <i>type</i> attribute; the default is "string".
</li><li> See above for description of date and datetime formats.
</li><li> The top-level <tt>&lt;Map&gt;</tt> element of a record is omitted from the XML.
</li><li> In XML, the record does not contain the attachment values, but only their names so that a reader knows that there are attachments to be processed.
</li><li> Attachments are not supported in the XML format: only the names of attachments are preserved; the attachments themselves (the bytes) are lost.
</li></ul>
<p>See package <tt>org.eclipse.smila.datamodel.xml</tt> for serialization helper classes.
</p>
<div class="messagebox" style="background-color: #def3fe; border: 1px solid #c5d7e0; color: black; padding: 5px; margin: 1ex 0; min-height: 35px; padding-left: 45px;">
<div style="float: left; margin-left: -40px;"><a href="http://wiki.eclipse.org/Image:Bug.png" class="image" title="Bug.png"><img alt="" src="http://wiki.eclipse.org/images/7/7d/Bug.png" width="35" height="30" border="0" /></a></div>
<div><a href="https://bugs.eclipse.org/351704" class="external text" title="https://bugs.eclipse.org/351704" rel="nofollow"><b>351704</b></a><br />Due to a bug in the JDK's default implementation of XMLStreamReader, you should only use XML version 1.0. When deserializing, either do not specify the XML declaration at all or use <div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;?xml</span> <span class="re0">version</span>=<span class="st0">&quot;1.0&quot;</span> <span class="re0">encoding</span>=<span class="st0">&quot;utf-8&quot;</span><span class="re2">?&gt;</span></span></pre></div></div>
</div>
<p><br />
</p>
<a name="JSON_format"></a><h3> <span class="mw-headline"> JSON format </span></h3>
<p>The JSON format of a record looks like this:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-javascript"><span class="br0">&#123;</span>
<span class="st0">&quot;_recordid&quot;</span>&nbsp;: <span class="st0">&quot;web:http://example.org/something&quot;</span>,
<span class="st0">&quot;_source&quot;</span>&nbsp;: <span class="st0">&quot;web&quot;</span>,
<span class="st0">&quot;url&quot;</span>&nbsp;: <span class="st0">&quot;web:http://example.org/something&quot;</span>,
<span class="st0">&quot;filesize&quot;</span>&nbsp;: <span class="nu0">1234</span>,
<span class="st0">&quot;sizeInKb&quot;</span>&nbsp;: <span class="nu0">1.2</span>,
<span class="st0">&quot;checked&quot;</span>&nbsp;: <span class="kw2">true</span>,
<span class="st0">&quot;created&quot;</span>&nbsp;: <span class="st0">&quot;2010-12-02&quot;</span>,
<span class="st0">&quot;lastModified&quot;</span>&nbsp;: <span class="st0">&quot;2010-12-02T16:20:54.123+0100&quot;</span>,
<span class="st0">&quot;trustee&quot;</span>&nbsp;: <span class="br0">&#91;</span> <span class="st0">&quot;group1&quot;</span>, <span class="st0">&quot;group2&quot;</span> <span class="br0">&#93;</span>,
<span class="st0">&quot;author&quot;</span>&nbsp;:
<span class="br0">&#91;</span> <span class="br0">&#123;</span>
<span class="st0">&quot;firstname&quot;</span>&nbsp;: <span class="st0">&quot;John&quot;</span>,
<span class="st0">&quot;lastname&quot;</span>&nbsp;: <span class="st0">&quot;Doe&quot;</span>
<span class="br0">&#125;</span>,
<span class="br0">&#123;</span>
<span class="st0">&quot;firstname&quot;</span>&nbsp;: <span class="st0">&quot;Lisa&quot;</span>,
<span class="st0">&quot;lastname&quot;</span>&nbsp;: <span class="st0">&quot;Müller&quot;</span>
<span class="br0">&#125;</span> <span class="br0">&#93;</span>,
<span class="st0">&quot;contact&quot;</span>&nbsp;:
<span class="br0">&#123;</span>
<span class="st0">&quot;email&quot;</span>&nbsp;: <span class="st0">&quot;Homer.Simpson@powerplant.com&quot;</span>,
<span class="st0">&quot;address&quot;</span>&nbsp;:
<span class="br0">&#123;</span>
<span class="st0">&quot;street&quot;</span>&nbsp;: <span class="st0">&quot;742 Evergreen Terrace&quot;</span>,
<span class="st0">&quot;city&quot;</span>&nbsp;: <span class="st0">&quot;Springfield&quot;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>,
<span class="st0">&quot;_attachments&quot;</span>: <span class="br0">&#91;</span><span class="st0">&quot;content&quot;</span>, <span class="st0">&quot;fulltext&quot;</span><span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div>
<p><b>Notes:</b>
</p>
<ul><li> Number value types are determined implicitly when parsing JSON:
<ul><li> If a number value can be parsed as a long integer, a long value will be created, else it will become a double value.
</li></ul>
</li><li> Date and DateTime are not supported by JSON natively, therefore date and datetime values are printed to JSON as simple strings using the format rules described above. On the other hand, when the JSON parser finds a string value that has a correct date or datetime format, it creates a date or datetime value. The original string is preserved, so when accessing the value "as a string" the client will get the original string. Also, when the object is written to JSON (or BON or XML) again, the original string will be used. So this autodetection should not cause problems even if some string value has the correct format, but is not meant to be a date or datetime.
</li><li> Map keys are always strings and must be enclosed in quotes.
</li><li> Attachments are not supported in the JSON format: only the names of attachments are preserved; the attachments themselves (the bytes) are lost.
</li></ul>
<p>See package <tt>org.eclipse.smila.datamodel.ipc</tt> for serialization helper classes.
</p>
<a name="BON_Binary_Object_Notation_Format"></a><h3> <span class="mw-headline"> BON Binary Object Notation Format </span></h3>
<a name="Format_introduction"></a><h4> <span class="mw-headline"> Format introduction </span></h4>
<p>The format consists of a sequence of tokens and data with two different types of tokens:
</p>
<ul><li> Event tokens are single bytes that describe an event (e.g. OBJECT-START, SEQUENCE-START).
</li><li> Data tokens are the first part of an entity.
</li></ul>
<p>Every entity consists of up to three parts. The first part is a one-byte token that describes the type of the data that follows. In the case of a string type, this token is followed by the data length information (second part). The last part is the information itself (except for the boolean type, whose value is stored within the token).
</p><p>Integer values are stored in a compressed format. The sign and the integer length (number of bytes) are stored in the token byte. Strings are generally stored in UTF-8 format.
</p><p>The handling of date and datetime values is exactly as in JSON. See above for details.
</p><p>Attachments are fully supported.
</p>
<a name="Scalar_Types"></a><h4> <span class="mw-headline"> Scalar Types </span></h4>
<p>The current release features the following scalar types:
</p>
<ul><li> Integer: <a href="http://en.wikipedia.org/wiki/Integer_(computer_science)" class="external text" title="http://en.wikipedia.org/wiki/Integer_(computer_science)" rel="nofollow">signed int64</a>
<ul><li> compressed, bytes are stored in <a href="http://en.wikipedia.org/wiki/Network_byte_order#Endianness_in_networking" class="external text" title="http://en.wikipedia.org/wiki/Network_byte_order#Endianness_in_networking" rel="nofollow">network byte order (big endian)</a>
</li><li> −9,223,372,036,854,775,808 to +9,223,372,036,854,775,807
</li></ul>
</li><li> Floating point values:
<ul><li> double (8 bytes in network byte order IEEE format (java default))
</li></ul>
</li><li> Bool
</li><li> String:
<ul><li> UTF-8 coded Text Strings, max 2^31-1 bytes
</li></ul>
</li></ul>
<a name="Integer_compressing"></a><h4> <span class="mw-headline"> Integer compressing </span></h4>
<p>The token bytes 0..15 define the sign of the number (0-7 positive, 8-15 negative) and the number of bytes necessary to store the number. The bytes are stored in network byte order.
</p>
<table border="1">
<caption>Examples for integer compression
</caption><tr>
<th> value </th><th> token </th><th> data
</th></tr>
<tr>
<td>17 </td><td> 0 (positive 1 byte) </td><td> 17 (0x 11)
</td></tr>
<tr>
<td>17985 </td><td> 1 (positive 2 bytes) </td><td> 0x 46 41
</td></tr>
</table>
<a name="Binary_Type"></a><h4> <span class="mw-headline"> Binary Type </span></h4>
<p>The Binary type is used for arbitrary binary content in attachments. A single binary is currently limited to a size of max 2^31-1 bytes.
</p>
<a name="Token_Bytes"></a><h4> <span class="mw-headline"> Token Bytes </span></h4>
<p>There are two different types of tokens. Here is a complete list of all tokens which are supported by the current release of the format:
</p>
<table border="1">
<caption> List of event tokens
</caption><tr>
<th> token </th><th> description </th><th> byte
</th></tr>
<tr>
<td>OBJECT-START</td><td>No version string</td><td>25
</td></tr>
<tr>
<td>OBJECT-START</td><td>Followed by version (reserved, not implemented)</td><td>26
</td></tr>
<tr>
<td>OBJECT-END</td><td> </td><td>28
</td></tr>
<tr>
<td>SEQUENCE-START</td><td> </td><td>29
</td></tr>
<tr>
<td>SEQUENCE-END</td><td> </td><td>30
</td></tr>
<tr>
<td>MAPPING-START</td><td> </td><td>31
</td></tr>
<tr>
<td>MAPPING-END</td><td> </td><td>32
</td></tr>
<tr>
<td>ATTACHMENTS-START</td><td> </td><td>33
</td></tr>
<tr>
<td>ATTACHMENTS-END</td><td> </td><td>34
</td></tr>
<tr>
<td>CUSTOM-TYPE</td><td> </td><td>43
</td></tr></table>
<table border="1">
<caption>List of data tokens
</caption><tr>
<th> token </th><th> description </th><th> byte
</th></tr>
<tr>
<td>SCALAR-INT</td><td>positive length 1</td><td>0
</td></tr>
<tr>
<td> </td><td>positive length 2</td><td>1
</td></tr>
<tr>
<td> </td><td>...</td><td>...
</td></tr>
<tr>
<td> </td><td>positive length 8</td><td>7
</td></tr>
<tr>
<td> </td><td>negative length 1</td><td>8
</td></tr>
<tr>
<td> </td><td>negative length 2</td><td>9
</td></tr>
<tr>
<td> </td><td>...</td><td>...
</td></tr>
<tr>
<td> </td><td>negative length 8</td><td>15
</td></tr>
<tr>
<td>SCALAR-BOOL</td><td>true</td><td>16
</td></tr>
<tr>
<td> </td><td>false</td><td>17
</td></tr>
<tr>
<td>SCALAR-FLOAT</td><td>float (32 bit)</td><td>18 (reserved, not implemented)
</td></tr>
<tr>
<td>SCALAR-FLOAT</td><td>double (64 bit)</td><td>19
</td></tr>
<tr>
<td>SCALAR-FLOAT</td><td>long double (80 bit)</td><td>20 (reserved, not implemented)
</td></tr>
<tr>
<td>SCALAR-STRING</td><td>1 length byte</td><td>21
</td></tr>
<tr>
<td> </td><td>2 length bytes</td><td>22
</td></tr>
<tr>
<td> </td><td>3 length bytes</td><td>23
</td></tr>
<tr>
<td> </td><td>4 length bytes</td><td>24
</td></tr>
<tr>
<td>BINARY</td><td>length 1</td><td>35
</td></tr>
<tr>
<td> </td><td>length 2</td><td>36
</td></tr>
<tr>
<td> </td><td>length 3</td><td>37
</td></tr>
<tr>
<td> </td><td>length 4</td><td>38
</td></tr>
<tr>
<td> </td><td>length 5</td><td>39 (reserved, not implemented)
</td></tr>
<tr>
<td> </td><td>...</td><td>...
</td></tr>
<tr>
<td> </td><td>length 8</td><td>42 (reserved, not implemented)
</td></tr></table>
<a name="Backward_compatible_extension_concept"></a><h4> <span class="mw-headline"> Backward compatible extension concept </span></h4>
<p>If we need a BON format extension (= a new token), we pick an unused token number. Tokens 26 and 27 are reserved to store additional version information, but this is currently not implemented.
</p>
<a name="Custom_Type"></a><h4> <span class="mw-headline"> Custom Type </span></h4>
<p>The token CUSTOM-TYPE (43), followed by a type identifier (token SCALAR-STRING/21 + string), marks the following map or sequence as a special custom type. Appropriate parser code can create corresponding user objects.
</p><p>There is no mapping of this BON format to JSON. The representation in JSON is just the same as if the CUSTOM-TYPE token and the following string were skipped.
</p><p>So a parser that does not know the appropriate custom type can simply skip this extra information and continue.
</p>
<a name="Examples"></a><h4> <span class="mw-headline"> Examples </span></h4>
<a name="Integer"></a><h5> <span class="mw-headline"> Integer </span></h5>
<p>Sample integer value: <b>-36364</b>
</p><p>The BON representation:
</p>
<table border="1">
<tr>
<th>Value (decimal)</th><th>Info</th><th>Comment
</th></tr>
<tr>
<td>9</td><td>SCALAR-INT</td><td>negative int value with 2 bytes length
</td></tr>
<tr>
<td>36364</td><td>int value without sign</td><td>
</td></tr></table>
<p>and the hex representation:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-text">09 8E 0C</pre></div>
<a name="String"></a><h5> <span class="mw-headline"> String </span></h5>
<p>Sample text: <b>ähnlich</b>
</p><p>The BON representation:
</p>
<table border="1">
<tr>
<th>Value (decimal)</th><th>Info</th><th>Comment
</th></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>08</td><td>length info</td><td>the string follows
</td></tr>
<tr>
<td>ähnlich</td><td>the string content</td><td>
</td></tr></table>
<p>and the hex representation:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-text">15 08 c3 a4 68 6e 6c 69 63 68</pre></div>
<a name="Complex_example"></a><h5> <span class="mw-headline"> Complex example </span></h5>
<p>A complex example: This could be some text annotation or highlighting structure. The JSON representation is:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-javascript"><span class="br0">&#123;</span>
<span class="st0">&quot;title&quot;</span>: <span class="br0">&#91;</span>
<span class="br0">&#91;</span><span class="st0">&quot;STEM&quot;</span>,<span class="st0">&quot;the&quot;</span>,<span class="nu0">0</span>,<span class="nu0">2</span><span class="br0">&#93;</span>,
<span class="br0">&#91;</span><span class="st0">&quot;STEM&quot;</span>,<span class="st0">&quot;title&quot;</span>,<span class="nu0">4</span>,<span class="nu0">8</span><span class="br0">&#93;</span>
<span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div>
<table border="1">
<tr>
<th>Value (decimal)</th><th>Info</th><th>Comment
</th></tr>
<tr>
<td>25</td><td>OBJECT-START</td><td> "---" (here: without Type:version)
</td></tr>
<tr>
<td>31</td><td>MAPPING-START</td><td>
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>5</td><td> </td><td> length info of the string
</td></tr>
<tr>
<td>title</td><td> </td><td> the string content
</td></tr>
<tr>
<td>29</td><td>SEQUENCE-START</td><td>start of the sequence "STEM,the,0,2"
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>4</td><td> </td><td> length info for "STEM"
</td></tr>
<tr>
<td>STEM</td><td> </td><td>the string content
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>3</td><td> </td><td> length info for "the"
</td></tr>
<tr>
<td>the</td><td> </td><td> the string content
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>0</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>2</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>30</td><td>SEQUENCE-END</td><td>end of the sequence "STEM,the,0,2"
</td></tr>
<tr>
<td>29</td><td>SEQUENCE-START</td><td>start of the sequence "STEM,title,4,8"
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>4</td><td> </td><td> length info for "STEM"
</td></tr>
<tr>
<td>STEM</td><td> </td><td> the string content
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>5</td><td> </td><td> length info for "title"
</td></tr>
<tr>
<td>title</td><td> </td><td> the string content
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>4</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>8</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>30</td><td>SEQUENCE-END</td><td>end of the sequence "STEM,title,4,8"
</td></tr>
<tr>
<td>32</td><td>MAPPING-END</td><td>
</td></tr>
<tr>
<td>28</td><td>OBJECT-END</td><td>
</td></tr></table>
<p><br />
</p>
<a name="Complex_example_with_Custom_Type"></a><h5> <span class="mw-headline"> Complex example with Custom Type </span></h5>
<p>If there is a custom type "text":
</p>
<table border="1">
<tr>
<th>Value (decimal)</th><th>Info</th><th>Comment
</th></tr>
<tr>
<td>25</td><td>OBJECT-START</td><td> "---" (here: without Type:version)
</td></tr>
<tr>
<td>31</td><td>MAPPING-START</td><td>
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>5</td><td> </td><td> length info of the string
</td></tr>
<tr>
<td>title</td><td> </td><td> the string content
</td></tr>
<tr>
<td>43</td><td>CUSTOM-TYPE</td><td> a custom type follows
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>4</td><td> </td><td> length info for "text"
</td></tr>
<tr>
<td>text</td><td> </td><td>the string content
</td></tr>
<tr>
<td>29</td><td>SEQUENCE-START</td><td>start of the sequence "STEM,the,0,2"
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>4</td><td> </td><td> length info for "STEM"
</td></tr>
<tr>
<td>STEM</td><td> </td><td>the string content
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>3</td><td> </td><td> length info for "the"
</td></tr>
<tr>
<td>the</td><td> </td><td> the string content
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>0</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>2</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>30</td><td>SEQUENCE-END</td><td>end of the sequence "STEM,the,0,2"
</td></tr>
<tr>
<td>29</td><td>SEQUENCE-START</td><td>start of the sequence "STEM,title,4,8"
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>4</td><td> </td><td> length info for "STEM"
</td></tr>
<tr>
<td>STEM</td><td> </td><td> the string content
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>5</td><td> </td><td> length info for "title"
</td></tr>
<tr>
<td>title</td><td> </td><td> the string content
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>4</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>0</td><td>SCALAR-INT (positive)</td><td>with one byte length
</td></tr>
<tr>
<td>8</td><td> </td><td> the INT value
</td></tr>
<tr>
<td>30</td><td>SEQUENCE-END</td><td>end of the sequence "STEM,title,4,8"
</td></tr>
<tr>
<td>32</td><td>MAPPING-END</td><td>
</td></tr>
<tr>
<td>28</td><td>OBJECT-END</td><td>
</td></tr></table>
<p><br />
</p>
<a name="Complex_example_with_attachments"></a><h5> <span class="mw-headline"> Complex example with attachments </span></h5>
<p>Another example with attachments: This could be some input record generated by a crawler (e.g. a mail crawler). The JSON representation is:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-javascript"><span class="br0">&#123;</span>
<span class="st0">&quot;subject&quot;</span>: <span class="st0">&quot;a test mail&quot;</span>,
<span class="st0">&quot;_attachments&quot;</span>&nbsp;: <span class="br0">&#91;</span><span class="st0">&quot;pdfFile&quot;</span>, <span class="st0">&quot;zipFile&quot;</span><span class="br0">&#93;</span>
<span class="br0">&#125;</span></pre></div>
<p>Note that "_attachments" is not a regular metadata field but contains the names of the attachments. Also note that the JSON representation does not contain the attachments themselves. The JSON is shown here only for documentation purposes.
</p><p><br />
</p>
<table border="1">
<tr>
<th>Value (decimal)</th><th>Info</th><th>Comment
</th></tr>
<tr>
<td>25</td><td>OBJECT-START</td><td> "---" (here: without Type:version)
</td></tr>
<tr>
<td>31</td><td>MAPPING-START</td><td>
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>7</td><td> </td><td> length info for the string
</td></tr>
<tr>
<td>subject</td><td> </td><td> the string content
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>11</td><td> </td><td> length info for "a test mail"
</td></tr>
<tr>
<td>a test mail</td><td> </td><td> the string content
</td></tr>
<tr>
<td>32</td><td>MAPPING-END</td><td>
</td></tr>
<tr>
<td>33</td><td>ATTACHMENTS-START</td><td>
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>7</td><td> </td><td> length info for the string
</td></tr>
<tr>
<td>pdfFile</td><td> </td><td> the string content
</td></tr>
<tr>
<td>36</td><td>BINARY</td><td> binary with 2 byte length info
</td></tr>
<tr>
<td>12345</td><td> </td><td> length info for the binary content
</td></tr>
<tr>
<td> 03x0815 .... </td><td> </td><td> the binary content
</td></tr>
<tr>
<td>21</td><td>SCALAR-STRING</td><td>string with one byte length info
</td></tr>
<tr>
<td>7</td><td> </td><td> length info for the string
</td></tr>
<tr>
<td>zipFile</td><td> </td><td> the string content
</td></tr>
<tr>
<td>37</td><td>BINARY</td><td> binary with 3 byte length info
</td></tr>
<tr>
<td>98765</td><td> </td><td> length info for the binary content
</td></tr>
<tr>
<td> 08x4711 .... </td><td> </td><td> the binary content
</td></tr>
<tr>
<td>34</td><td>ATTACHMENTS-END</td><td>
</td></tr>
<tr>
<td>28</td><td>OBJECT-END</td><td>
</td></tr></table>
<a name="Record_Filters"></a><h3> <span class="mw-headline"> Record Filters </span></h3>
<p><b>Record filters</b> produce reduced copies of a record: A record filter has a name and contains a list of metadata element names. When applied to a record, it produces a copy of the record that contains only the elements of the list.
</p><p>Record filters are described in a simple XML format:
</p>
<div dir="ltr" style="text-align: left;"><pre class="source-xml"><span class="sc3"><span class="re1">&lt;RecordFilters<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">name</span>=<span class="st0">&quot;filter0&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">name</span>=<span class="st0">&quot;filter1&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Element</span> <span class="re0">name</span>=<span class="st0">&quot;attribute&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filter<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">name</span>=<span class="st0">&quot;filter3&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Element</span> <span class="re0">name</span>=<span class="st0">&quot;attribute1&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Element</span> <span class="re0">name</span>=<span class="st0">&quot;attribute2&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Element</span> <span class="re0">name</span>=<span class="st0">&quot;attribute3&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filter<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;Filter</span> <span class="re0">name</span>=<span class="st0">&quot;filter-all&quot;</span><span class="re2">&gt;</span></span>
<span class="sc3"><span class="re1">&lt;Element</span> <span class="re0">name</span>=<span class="st0">&quot;*&quot;</span> <span class="re2">/&gt;</span></span>
<span class="sc3"><span class="re1">&lt;/Filter<span class="re2">&gt;</span></span></span>
<span class="sc3"><span class="re1">&lt;/RecordFilters<span class="re2">&gt;</span></span></span></pre></div>
<p><b>Notes:</b>
</p>
<ul><li> A filter always copies the system elements "_recordid" and "_source". Therefore, the apparently empty "filter0" in this definition produces records that still contain these system elements.
</li><li> A filter may contain an arbitrary number of element names. If an element does not appear in the record to copy, it is simply ignored.
</li><li> A filter always removes attachments: The "filter-all" in this definition produces a copy of the record with all metadata elements, but not attachments.
</li></ul>
<p>Filters are usually applied by asking the blackboard for a filtered copy of the record's metadata. See Blackboard service API for details. To work with filters directly, see package <tt>org.eclipse.smila.datamodel.filter</tt> for utility classes.
</p>
<!--
NewPP limit report
Preprocessor node count: 98/1000000
Post-expand include size: 1393/2097152 bytes
Template argument size: 800/2097152 bytes
#ifexist count: 0/100
-->
<!-- Saved in parser cache with key wikidb:pcache:idhash:30891-0!1!0!!en!2!edit=0 and timestamp 20130416072455 -->
<div class="printfooter">
Retrieved from "<a href="Data_Model_and_Serialization_Formats.html">http://wiki.eclipse.org/SMILA/Documentation/Data_Model_and_Serialization_Formats</a>"</div>
<div id="catlinks"><p class='catlinks'><a href="http://wiki.eclipse.org/Special:Categories" title="Special:Categories">Category</a>: <span dir='ltr'><a href="http://wiki.eclipse.org/Category:SMILA" title="Category:SMILA">SMILA</a></span></p></div> <!-- end content -->
<div class="visualClear"></div>
</div>
</div>
</div>
<!-- Yoink of toolbox for phoenix moved up -->
</div>
</div>
<div id="clearFooter"/>
<div id="footer" >
<ul id="footernav">
<li class="first"><a href="http://www.eclipse.org/">Home</a></li>
<li><a href="http://www.eclipse.org/legal/privacy.php">Privacy Policy</a></li>
<li><a href="http://www.eclipse.org/legal/termsofuse.php">Terms of Use</a></li>
<li><a href="http://www.eclipse.org/legal/copyright.php">Copyright Agent</a></li>
<li><a href="http://www.eclipse.org/org/foundation/contact.php">Contact</a></li>
<li><a href="http://wiki.eclipse.org/Eclipsepedia:About" title="Eclipsepedia:About">About Eclipsepedia</a></li>
</ul>
<span id="copyright">Copyright &copy; 2013 The Eclipse Foundation. All Rights Reserved</span>
<p id="footercredit">This page was last modified 12:23, 11 April 2013 by <a href="http://wiki.eclipse.org/index.php?title=User:Marco.strack.empolis.com&amp;action=edit" class="new" title="User:Marco.strack.empolis.com">Marco Strack</a>. Based on work by <a href="http://wiki.eclipse.org/User:Andreas.weber.empolis.com" title="User:Andreas.weber.empolis.com">Andreas Weber</a>, <a href="http://wiki.eclipse.org/index.php?title=User:Armin.pies.empolis.com&amp;action=edit" class="new" title="User:Armin.pies.empolis.com">Armin Pies</a> and <a href="http://wiki.eclipse.org/index.php?title=User:Juergen.schumacher.attensity.com&amp;action=edit" class="new" title="User:Juergen.schumacher.attensity.com">Juergen Schumacher</a> and <a href="http://wiki.eclipse.org/index.php?title=SMILA/Documentation/Data_Model_and_Serialization_Formats&amp;action=credits" title="SMILA/Documentation/Data Model and Serialization Formats">others</a>.</p>
<p id="footerviews">This page has been accessed 2,998 times.</p>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-910670-4");
pageTracker._trackPageview();
</script>
<!-- <div class="visualClear"></div> -->
<script type="text/javascript">if (window.runOnloadHook) runOnloadHook();</script>
</div>
<!-- Served in 0.056 secs. --></body></html>