// blob: 068962ffcab7a762d7fbd0ac0f02251a3dfd5284 [file] [log] [blame]
/*
 * Shared MimeTypeIdentifyPipelet instance, used by processRecord() for records
 * that do not carry a "MimeType" attribute yet.
 * Configuration maps the pipelet's properties to the record attributes
 * "Extension", "MetaData" and "MimeType".
 */
var detectMimeTypePipelet = pipelets.create("org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet", {
  "FileExtensionAttribute" : "Extension",
  // Fixed spelling: this key used to read "MetaDataAttrbute", which does not
  // match the pipelet's documented "MetaDataAttribute" property, so the
  // "MetaData" setting was presumably never picked up.
  "MetaDataAttribute" : "MetaData",
  "MimeTypeAttribute" : "MimeType"
});
/*
 * Shared XmlDocumentSplitterPipelet instance, applied by processRecord() to
 * records whose MimeType is "text/xml" or "application/xml".
 * Input is the attribute "Path", output is the attribute "Content", and
 * "document" is used as both begin and end tag name.
 */
var splitXMLPipelet = pipelets.create(
  "org.eclipse.smila.processing.pipelets.xmlprocessing.XmlDocumentSplitterPipelet",
  {
    inputType: "ATTRIBUTE",
    outputType: "ATTRIBUTE",
    inputName: "Path",
    outputName: "Content",
    beginTagName: "document",
    endTagName: "document"
  }
);
/*
 * Shared SolrIndexPipelet instance, used as the final step of processRecord().
 * Runs in execution mode "ADD" against the core "DefaultCore"; "CoreFields"
 * holds one {"FieldName": ...} entry per name listed below, in this order.
 */
var solrIndexPipelet = (function () {
  var fieldNames = [
    "_source",
    "Path",
    "Url",
    "Filename",
    "MimeType",
    "Size",
    "LastModifiedDate",
    "Content",
    "Extension",
    "Title",
    "Author"
  ];

  // Expand the plain names into the {"FieldName": name} entries of the config.
  var coreFields = [];
  for (var i = 0; i < fieldNames.length; i++) {
    coreFields.push({ "FieldName": fieldNames[i] });
  }

  return pipelets.create("org.eclipse.smila.solr.index.SolrIndexPipelet", {
    "ExecutionMode": "ADD",
    "CoreName": "DefaultCore",
    "CoreFields": coreFields
  });
})();
/**
 * Creates an XPathExtractorPipelet configured for attribute input and
 * attribute output, and runs it on the given record.
 *
 * Note that a new pipelet is created on every call.
 *
 * @param record     the record handed to the pipelet's process() method
 * @param inputName  value passed as the pipelet's "inputName" setting
 * @param outputName value passed as the pipelet's "outputName" setting
 * @param xpath      value passed as the pipelet's "xpath" setting
 * @returns whatever the pipelet's process() call returns
 */
function xPathExtract(record, inputName, outputName, xpath) {
  var extractorConfig = {
    inputType: "ATTRIBUTE",
    outputType: "ATTRIBUTE",
    inputName: inputName,
    outputName: outputName,
    xpath: xpath
  };
  var extractor = pipelets.create(
    "org.eclipse.smila.processing.pipelets.xmlprocessing.XPathExtractorPipelet",
    extractorConfig
  );
  return extractor.process(record);
}
/**
 * Called by the worker to initialize for a task.
 * This script needs no per-task setup, so the hook does nothing and the
 * parameters argument is ignored.
 *
 * @param parameters task parameters passed in by the worker (unused)
 */
function prepare(parameters) {
  // Intentionally empty.
}
/**
 * Called by the worker to process a single record from a bulk.
 *
 * Steps, in order:
 *   1. run MIME type detection if the record has no "MimeType" entry,
 *   2. run the XML splitter if MimeType is "text/xml" or "application/xml",
 *      continuing with the splitter's result,
 *   3. extract "document/title" from "Content" into "Title",
 *   4. extract "document/text" from "Content" back into "Content",
 *   5. hand the record to the Solr index pipelet.
 *
 * @param record the record to process
 * @returns the result of the Solr index pipelet's process() call
 */
function processRecord(record) {
  var current = record;

  // 1. detectMimeType (only when no MimeType is present yet)
  var hasMimeType = "MimeType" in current;
  if (!hasMimeType) {
    detectMimeTypePipelet.process(current);
  }

  // 2. split xml
  // NOTE(review): loose equality is kept on purpose; the value may not be a
  // primitive JS string in the host runtime, so confirm before tightening.
  var mimeType = current.MimeType;
  var isXml = mimeType == "text/xml" || mimeType == "application/xml";
  if (isXml) {
    current = splitXMLPipelet.process(current);
  }

  // 3. extractTitle, 4. extractText (return values are not used)
  xPathExtract(current, "Content", "Title", "document/title");
  xPathExtract(current, "Content", "Content", "document/text");

  // 5. index
  return solrIndexPipelet.process(current);
}