/*
 * Pipelet instance that processRecord runs on records which do not yet carry
 * a "MimeType" attribute. The configuration maps the pipelet's parameters to
 * record attribute names ("Extension", "MetaData", "MimeType").
 *
 * NOTE(review): the second key used to be spelled "MetaDataAttrbute", so the
 * "MetaData" mapping was presumably never picked up by the pipelet. It is now
 * spelled "MetaDataAttribute" - confirm against the MimeTypeIdentifyPipelet
 * parameter list.
 */
var detectMimeTypePipelet = pipelets.create("org.eclipse.smila.processing.pipelets.MimeTypeIdentifyPipelet", {
  "FileExtensionAttribute" : "Extension",
  "MetaDataAttribute" : "MetaData",
  "MimeTypeAttribute" : "MimeType"
});
/*
 * Pipelet instance that processRecord applies to records whose MIME type is
 * "text/xml" or "application/xml". It is configured to take its input from
 * the "Path" attribute and to write its output to the "Content" attribute,
 * with "document" as both the begin and the end tag name.
 *
 * NOTE(review): presumably every <document> element of the input becomes a
 * record of its own - confirm against the XmlDocumentSplitterPipelet docs.
 */
var splitXMLPipelet = pipelets.create(
  "org.eclipse.smila.processing.pipelets.xmlprocessing.XmlDocumentSplitterPipelet",
  {
    "inputType": "ATTRIBUTE",
    "outputType": "ATTRIBUTE",
    "inputName": "Path",
    "outputName": "Content",
    "beginTagName": "document",
    "endTagName": "document"
  }
);
/*
 * Pipelet instance that processRecord uses as its final step. It is
 * configured with execution mode "ADD", the core "DefaultCore" and one
 * {"FieldName": <name>} entry per record attribute listed below.
 */
var solrIndexPipelet = pipelets.create("org.eclipse.smila.solr.index.SolrIndexPipelet", {
  "ExecutionMode": "ADD",
  "CoreName": "DefaultCore",
  // Expand the plain list of names into the {"FieldName": ...} entries.
  "CoreFields": [
    "_source",
    "Path",
    "Url",
    "Filename",
    "MimeType",
    "Size",
    "LastModifiedDate",
    "Content",
    "Extension",
    "Title",
    "Author"
  ].map(function (fieldName) {
    return { "FieldName": fieldName };
  })
});
/* XPathExtractorPipelet instances created so far, keyed by their configuration. */
var xPathExtractorCache = {};

/*
 * Runs an XPathExtractorPipelet over a record.
 *
 * The pipelet is configured with ATTRIBUTE input and output, the given
 * attribute names and the given XPath expression. An instance is created the
 * first time a particular (inputName, outputName, xpath) combination is
 * requested and reused afterwards, so per-record calls with the same
 * arguments no longer create a new pipelet each time.
 *
 * record     - the record handed to the pipelet's process().
 * inputName  - value for the pipelet's "inputName" setting.
 * outputName - value for the pipelet's "outputName" setting.
 * xpath      - value for the pipelet's "xpath" setting.
 *
 * Returns whatever the pipelet's process(record) returns.
 */
function xPathExtract(record, inputName, outputName, xpath) {
  // A JSON array keeps the three parts unambiguous and always starts with
  // "[", so a key can never clash with a property inherited from Object.
  var cacheKey = JSON.stringify([String(inputName), String(outputName), String(xpath)]);
  var extractor = xPathExtractorCache[cacheKey];
  if (!extractor) {
    extractor = pipelets.create("org.eclipse.smila.processing.pipelets.xmlprocessing.XPathExtractorPipelet", {
      "inputType" : "ATTRIBUTE",
      "outputType" : "ATTRIBUTE",
      "inputName" : inputName,
      "outputName" : outputName,
      "xpath" : xpath
    });
    // Only cached after a successful create, so a failed creation is retried.
    xPathExtractorCache[cacheKey] = extractor;
  }
  return extractor.process(record);
}
/*
 * Called by worker: initialize for task.
 *
 * This script needs no per-task setup, so the hook does nothing and the
 * `parameters` argument is ignored.
 */
function prepare(parameters) {
  // Intentionally empty.
}
/*
 * Called by worker: process single record from bulk.
 *
 * Steps, in order: detect the MIME type if the record has none, split XML
 * input, extract "Title" and "Content" via XPath, then hand the result to the
 * Solr index pipelet. Returns whatever solrIndexPipelet.process() returns.
 */
function processRecord(record) {
  // 1. detectMimeType - only when the record does not already have one.
  var hasMimeType = "MimeType" in record;
  if (!hasMimeType) {
    detectMimeTypePipelet.process(record);
  }

  // 2. split xml
  // Loose equality is kept on purpose.
  // NOTE(review): attribute values may be host-provided string objects for
  // which a strict comparison could fail - confirm before tightening.
  var mimeType = record.MimeType;
  var isXml = mimeType == "text/xml" || mimeType == "application/xml";
  var current = record;
  if (isXml) {
    current = splitXMLPipelet.process(record);
  }

  // 3. extractTitle - has to run before step 4, which overwrites the
  // "Content" attribute that this step reads.
  xPathExtract(current, "Content", "Title", "document/title");

  // 4. extractText
  xPathExtract(current, "Content", "Content", "document/text");

  // 5. index
  var indexed = solrIndexPipelet.process(current);
  return indexed;
}