/**
 * HtmlToTextPipelet instance, created once when the script is loaded.
 *
 * It is configured to read its input from the record attribute "Content" and
 * to write its output back into the same attribute, so the attribute value is
 * replaced in place. processRecord() applies it to text/xml and text/html
 * records only.
 */
var htmlToTextPipelet = pipelets.create("org.eclipse.smila.processing.pipelets.HtmlToTextPipelet", {
  "inputType" : "ATTRIBUTE",
  "outputType" : "ATTRIBUTE",
  "inputName" : "Content",
  "outputName" : "Content"
});
/**
 * SolrUpdatePipelet instance, created once when the script is loaded.
 *
 * It is configured with index name "collection1" and update operation "ADD".
 * processRecord() passes every record through it and returns its result.
 */
var solrIndexPipelet = pipelets.create("org.eclipse.smila.solr.update.SolrUpdatePipelet", {
  "indexname" : "collection1",
  "update" : {
    "operation" : "ADD",
    // 600000 ms = 10 minutes. The value is passed as a string, not a number.
    "commitWithinMs" : "600000",
    // Record attributes handed to the index.
    // NOTE(review): an empty string presumably means "use the attribute name
    // as the Solr field name" - confirm against the pipelet documentation.
    "mapping" : {
      "_source" : "",
      "Path" : "",
      "Url" : "",
      "Filename" : "",
      "MimeType" : "",
      "Size" : "",
      "LastModifiedDate" : "",
      "Content" : "",
      "Extension" : "",
      "Title" : "",
      "Author" : ""
    }
  }
});
/**
 * Runs a SubAttributeExtractorPipelet on a single record.
 *
 * A new pipelet instance is created on every call, configured with the given
 * paths and mode, and then applied to the record.
 *
 * @param record record to process; handed to the pipelet unchanged.
 * @param iPath  value for the pipelet's "inputPath" setting, e.g. "Contents/Type".
 * @param oPath  value for the pipelet's "outputPath" setting, e.g. "MimeType".
 * @param mode   value for the pipelet's "mode" setting; this script uses
 *               "FIRST", "ALL_AS_ONE" and "ALL_AS_LIST".
 * @returns whatever the pipelet's process() call returns.
 */
function subAttributeExtract(record, iPath, oPath, mode) {
  var extractor = pipelets.create("org.eclipse.smila.processing.pipelets.SubAttributeExtractorPipelet", {
    "inputPath" : iPath,
    "outputPath" : oPath,
    "mode" : mode
  });
  return extractor.process(record);
}
/**
 * Called by the worker to initialize the script for a task.
 *
 * Currently does nothing: the pipelets this script needs are created by the
 * top-level statements of the file, and `parameters` is not read.
 *
 * @param parameters task parameters supplied by the worker (unused).
 */
function prepare(parameters) {
}
/**
 * Called by the worker to process a single record from a bulk.
 *
 * Steps:
 *  1. Copy nested sub-attributes to top-level attributes via
 *     subAttributeExtract().
 *  2. If the record's MimeType is "text/xml" or "text/html", run the
 *     HTML-to-text pipelet on it.
 *  3. Send the record to the Solr update pipelet.
 *
 * @param record the record to process.
 * @returns the result of solrIndexPipelet.process(record).
 */
function processRecord(record) {
  subAttributeExtract(record, "Contents/Type", "MimeType", "FIRST");
  subAttributeExtract(record, "Contents/Value", "Content", "ALL_AS_ONE");
  // NOTE(review): this call targets the same output attribute ("Content") as
  // the previous one. Whether it replaces, appends to, or leaves the earlier
  // value alone depends on the pipelet - confirm the intended precedence.
  subAttributeExtract(record, "Description/Value", "Content", "ALL_AS_ONE");
  subAttributeExtract(record, "Links/Href", "Url", "FIRST");
  subAttributeExtract(record, "Authors/Name", "Author", "ALL_AS_LIST");

  // Loose equality (==) is kept deliberately.
  // NOTE(review): the attribute value may be a host-wrapped string rather than
  // a JavaScript primitive, in which case === would never match - confirm
  // before tightening the comparison.
  var isMarkup = ("MimeType" in record)
      && (record.MimeType == "text/xml" || record.MimeType == "text/html");
  if (isMarkup) {
    htmlToTextPipelet.process(record);
  }

  return solrIndexPipelet.process(record);
}