{
  "class": "org.eclipse.smila.processing.pipelets.boilerpipe.BoilerpipePipelet",
  "description": "Removes all boilerplate and HTML tags from a web page.",
  "parameters": [
    {
      "name": "inputType",
      "type": "string",
      "values": ["ATTACHMENT", "ATTRIBUTE"],
      "description": "Defines whether the HTML input is found in an attachment or in an attribute of the record."
    },
    {
      "name": "outputType",
      "type": "string",
      "values": ["ATTACHMENT", "ATTRIBUTE"],
      "description": "Defines whether the plain text should be stored in an attachment or in an attribute of the record."
    },
    {
      "name": "inputName",
      "type": "string",
      "description": "Name of the attachment or attribute that contains the HTML input."
    },
    {
      "name": "outputName",
      "type": "string",
      "description": "Name of the attachment or attribute for the plain text output."
    },
    {
      "name": "encodingAttribute",
      "type": "string",
      "optional": true,
      "description": "Optional name of the attribute with the encoding of the input attachment."
    },
    {
      "name": "defaultEncoding",
      "type": "string",
      "optional": true,
      "description": "Fallback encoding, if everything else fails."
    },
    {
      "name": "filter",
      "type": "string",
      "optional": true,
      "multi": true,
      "description": "A list of the Boilerpipe filters to use; each entry may be a class name or a reference to a static method or static variable (defaults to de.l3s.boilerpipe.extractors.ArticleExtractor.INSTANCE)."
    }
  ]
}