/******************************************************************************* | |
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the | |
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this | |
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html | |
* | |
* Contributors: Juergen Schumacher (empolis GmbH) - initial API and implementation Drazen Cindric (Attensity Europe | |
* GmbH) - data model improvements | |
*******************************************************************************/ | |
package org.eclipse.smila.processing.pipelets; | |
import java.io.InputStream; | |
import java.io.StringReader; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Collection; | |
import java.util.HashMap; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
import org.apache.commons.collections.map.MultiValueMap; | |
import org.apache.commons.logging.Log; | |
import org.apache.commons.logging.LogFactory; | |
import org.apache.xerces.xni.Augmentations; | |
import org.apache.xerces.xni.QName; | |
import org.apache.xerces.xni.XMLAttributes; | |
import org.apache.xerces.xni.XMLString; | |
import org.apache.xerces.xni.parser.XMLDocumentFilter; | |
import org.apache.xerces.xni.parser.XMLInputSource; | |
import org.apache.xerces.xni.parser.XMLParserConfiguration; | |
import org.cyberneko.html.HTMLConfiguration; | |
import org.cyberneko.html.filters.DefaultFilter; | |
import org.cyberneko.html.filters.ElementRemover; | |
import org.eclipse.smila.blackboard.Blackboard; | |
import org.eclipse.smila.blackboard.BlackboardAccessException; | |
import org.eclipse.smila.datamodel.Any; | |
import org.eclipse.smila.datamodel.AnyMap; | |
import org.eclipse.smila.datamodel.AnySeq; | |
import org.eclipse.smila.datamodel.Value; | |
import org.eclipse.smila.processing.ProcessingException; | |
/** | |
* Simple HTML-to-Text extractor pipelet using NekoHTML parser. | |
* | |
* @author jschumacher | |
* | |
*/ | |
public class HtmlToTextPipelet extends ATransformationPipelet { | |
/** | |
* By default the content of these tags is also removed from the result. This can be overwritten using the | |
* "removeContentTags" property. | |
*/ | |
private static final String[] DEFAULT_REMOVE_CONTENT_TAGS = { "applet", "frame", "object", "script", "style" }; | |
/** | |
* property to configure the tags for which the complete content is removed. | |
*/ | |
private static final String PROP_REMOVE_CONTENT_TAGS = "removeContentTags"; | |
/** | |
* property to configure the default encoding of HTML documents to convert. | |
*/ | |
private static final String PROP_DEFAULT_ENCODING = "defaultEncoding"; | |
/** | |
* property to configure attribute targets for HTML meta tag contents. | |
*/ | |
private static final String PROP_META = "meta:"; | |
/** | |
* property to configure attribute targets for HTML title tag content. | |
*/ | |
private static final String PROP_TITLE = "tag:title"; | |
/** | |
* tag names for which the complete content is removed from result. | |
*/ | |
private String[] _removeContentTags = DEFAULT_REMOVE_CONTENT_TAGS; | |
/** | |
* mapping of META tags to attribute names. | |
*/ | |
private final Map<String, String> _metaAttributeMapping = new HashMap<String, String>(); | |
/** | |
* default encoding parameter. | |
*/ | |
private String _defaultEncoding; | |
/** The log. */ | |
private final Log _log = LogFactory.getLog(getClass()); | |
/** | |
* {@inheritDoc} | |
* | |
*/ | |
@Override | |
public void configure(final AnyMap configuration) throws ProcessingException { | |
super.configure(configuration); | |
_defaultEncoding = configuration.getStringValue(PROP_DEFAULT_ENCODING); | |
final Any removeContentTagValue = configuration.get(PROP_REMOVE_CONTENT_TAGS); | |
if (removeContentTagValue != null && removeContentTagValue.isString()) { | |
final String removeContentTagList = ((Value) removeContentTagValue).asString().trim(); | |
if (removeContentTagList.length() > 0) { | |
_removeContentTags = removeContentTagList.split(","); | |
_log.info("Removing complete content of these tags: " + Arrays.toString(_removeContentTags)); | |
} | |
} | |
for (final Entry<String, Any> entry : configuration.entrySet()) { | |
final String key = entry.getKey(); | |
final String value = ((Value) entry.getValue()).asString(); | |
if (key.startsWith(PROP_META)) { | |
final String metaName = key.substring(PROP_META.length()); | |
final String attributeName = value; | |
_metaAttributeMapping.put(metaName, attributeName); | |
} else if (key.equalsIgnoreCase(PROP_TITLE)) { | |
final String attributeName = value; | |
_metaAttributeMapping.put(PROP_TITLE, attributeName); | |
} | |
} | |
} | |
/** | |
* {@inheritDoc} | |
* | |
*/ | |
@Override | |
public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException { | |
if (recordIds != null) { | |
for (final String id : recordIds) { | |
try { | |
final MultiValueMap metadata = new MultiValueMap(); | |
final List<String> results = new ArrayList<String>(); | |
if (isReadFromAttribute()) { | |
processAttributeValues(blackboard, id, results, metadata); | |
} else { | |
final InputStream stream = blackboard.getAttachmentAsStream(id, _inputName); | |
if (stream != null) { | |
results.add(extractText(id, stream, metadata)); | |
} | |
} | |
storeResults(blackboard, id, results); | |
storeMetadata(blackboard, id, metadata); | |
} catch (final Exception ex) { | |
_log.error("Error processing ID " + id, ex); | |
} | |
} | |
} | |
return recordIds; | |
} | |
/** | |
* read HTML strings from an attribute and add plain text strings and metadata to the results. | |
* | |
* @param blackboard | |
* blackboard service to use. | |
* @param id | |
* record ID to process | |
* @param results | |
* list of plain text strings. | |
* @param metadata | |
* metadata map | |
* @throws BlackboardAccessException | |
* record is not on blackboard | |
* @throws ProcessingException | |
* error parsing the HTML. | |
*/ | |
private void processAttributeValues(final Blackboard blackboard, final String id, final List<String> results, | |
final MultiValueMap metadata) throws BlackboardAccessException, ProcessingException { | |
final AnyMap anyMap = blackboard.getMetadata(id); | |
final Any any = anyMap.get(_inputName); | |
if (any != null) { | |
for (final Any value : any) { | |
if (value.isValue()) { | |
final String content = ((Value) value).asString(); | |
if (content != null) { | |
results.add(extractText(id, content, metadata)); | |
} | |
} else if (any.isSeq()) { | |
final AnySeq sequence = (AnySeq) any; | |
for (final Any element : sequence) { | |
if (element.isString()) { | |
final String content = ((Value) element).asString(); | |
if (content != null) { | |
results.add(extractText(id, content, metadata)); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
/** | |
* write metadata to attributes on blackboard. | |
* | |
* @param blackboard | |
* blackboard | |
* @param id | |
* record id | |
* @param metadata | |
* metadata to store | |
* @throws BlackboardAccessException | |
* error writing values. | |
*/ | |
private void storeMetadata(final Blackboard blackboard, final String id, final MultiValueMap metadata) | |
throws BlackboardAccessException { | |
if (!metadata.isEmpty()) { | |
for (final Iterator<?> attributeNames = metadata.keySet().iterator(); attributeNames.hasNext();) { | |
final String attributeName = (String) attributeNames.next(); | |
final Collection<?> values = metadata.getCollection(attributeName); | |
final AnyMap anyMap = blackboard.getMetadata(id); | |
if (!values.isEmpty()) { | |
anyMap.remove(attributeName); | |
final AnySeq sequence = blackboard.getDataFactory().createAnySeq(); | |
for (final Iterator<?> iter = values.iterator(); iter.hasNext();) { | |
final String value = (String) iter.next(); | |
sequence.add(value); | |
} | |
anyMap.put(attributeName, sequence); | |
} | |
} | |
} | |
} | |
/** | |
* extract the plain text from the HTML document given by the stream. | |
* | |
* @param id | |
* record ID (for logging purposes) | |
* @param stream | |
* HTML stream * | |
* @param metadata | |
* put extracted metadata in this map. | |
* @return plain text | |
* @throws ProcessingException | |
* error in parsing | |
*/ | |
private String extractText(final String id, final InputStream stream, final MultiValueMap metadata) | |
throws ProcessingException { | |
final StringBuilder result = new StringBuilder(); | |
final XMLParserConfiguration parser = createParser(result, metadata); | |
try { | |
parser.parse(new XMLInputSource(null, id, null, stream, null)); | |
} catch (final Exception e) { | |
_log.error("error parsing HTML document in record " + id, e); | |
throw new ProcessingException("error parsing HTML document in record " + id + ": " + e.toString(), e); | |
} | |
return result.toString(); | |
} | |
/** | |
* extract the plain text from the HTML document given by the string. | |
* | |
* @param id | |
* record ID (for logging purposes) | |
* @param content | |
* HTML string | |
* @param metadata | |
* put extracted metadata in this map. | |
* @return plain text | |
* @throws ProcessingException | |
* error in parsing | |
*/ | |
private String extractText(final String id, final String content, final MultiValueMap metadata) | |
throws ProcessingException { | |
final StringBuilder result = new StringBuilder(); | |
final XMLParserConfiguration parser = createParser(result, metadata); | |
try { | |
parser.parse(new XMLInputSource(null, id, null, new StringReader(content), null)); | |
} catch (final Exception e) { | |
_log.error("error parsing HTML document in record " + id, e); | |
throw new ProcessingException("error parsing HTML document in record " + id + ": " + e.toString(), e); | |
} | |
return result.toString(); | |
} | |
/** | |
* create the HTML Parser/Filter to extract plain text. | |
* | |
* @param result | |
* string builder to take the result. | |
* @param metadata | |
* put extracted metadata in this map. | |
* @return HTML parser/filter. | |
*/ | |
private XMLParserConfiguration createParser(final StringBuilder result, final MultiValueMap metadata) { | |
final ElementRemover elementRemover = new ElementRemover(); | |
for (final String tag : _removeContentTags) { | |
elementRemover.removeElement(tag); | |
} | |
final CommentRemover commentRemover = new CommentRemover(); | |
final MetadataExtractor metadataExtractor = new MetadataExtractor(metadata); | |
final PlainTextWriter writer = new PlainTextWriter(result); | |
final XMLDocumentFilter[] filters = { commentRemover, metadataExtractor, elementRemover, writer }; | |
final XMLParserConfiguration parser = new HTMLConfiguration(); | |
parser.setProperty("http://cyberneko.org/html/properties/filters", filters); | |
if (_defaultEncoding != null) { | |
parser.setProperty("http://cyberneko.org/html/properties/default-encoding", _defaultEncoding); | |
} | |
return parser; | |
} | |
/** | |
* Append plain text from document to a string builder. | |
* | |
* @author jschumacher | |
* | |
*/ | |
public class PlainTextWriter extends DefaultFilter { | |
/** | |
* target StringBuilder. | |
*/ | |
private final StringBuilder _target; | |
/** | |
* @param target | |
* the StringBuilder to write to | |
*/ | |
public PlainTextWriter(final StringBuilder target) { | |
super(); | |
_target = target; | |
} | |
/** | |
* {@inheritDoc} | |
* | |
* @see org.cyberneko.html.filters.DefaultFilter#characters(org.apache.xerces.xni.XMLString, | |
* org.apache.xerces.xni.Augmentations) | |
*/ | |
@Override | |
public void characters(final XMLString text, final Augmentations augs) { | |
_target.append(text.ch, text.offset, text.length); | |
super.characters(text, augs); | |
} | |
} | |
/** | |
* removes comments from HTML files. | |
* | |
* @author jschumacher | |
* | |
*/ | |
public class CommentRemover extends DefaultFilter { | |
/** | |
* {@inheritDoc} | |
* | |
* @see org.cyberneko.html.filters.DefaultFilter#comment(org.apache.xerces.xni.XMLString, | |
* org.apache.xerces.xni.Augmentations) | |
*/ | |
@Override | |
public void comment(final XMLString text, final Augmentations augs) { | |
// do nothing | |
} | |
} | |
/** | |
* extract metadata from META tags. | |
* | |
* @author jschumacher | |
* | |
*/ | |
public class MetadataExtractor extends DefaultFilter { | |
/** | |
* attribute to value map extracted from document. | |
*/ | |
private final MultiValueMap _metadata; | |
/** | |
* boolean flag if the parser is in the title start tag. | |
*/ | |
private boolean _inTitleTag; | |
/** | |
* Buffer to store the content of the title tag. | |
*/ | |
private StringBuffer _titleBuffer = new StringBuffer(); | |
/** | |
* @param metadata | |
* map to use as target for storing the attribute-value lists. | |
*/ | |
public MetadataExtractor(final MultiValueMap metadata) { | |
super(); | |
_metadata = metadata; | |
} | |
/** | |
* {@inheritDoc} | |
* | |
* @see org.cyberneko.html.filters.DefaultFilter#startElement(org.apache.xerces.xni.QName, | |
* org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations) | |
*/ | |
@Override | |
public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs) { | |
super.startElement(element, attributes, augs); | |
if ("meta".equalsIgnoreCase(element.localpart)) { | |
extractMetadata(element, attributes); | |
} else if ("title".equalsIgnoreCase(element.localpart)) { | |
_inTitleTag = true; | |
_titleBuffer = new StringBuffer(); | |
} | |
} | |
/** | |
* {@inheritDoc} | |
* | |
* @see org.cyberneko.html.filters.DefaultFilter#emptyElement(org.apache.xerces.xni.QName, | |
* org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations) | |
*/ | |
@Override | |
public void emptyElement(final QName element, final XMLAttributes attributes, final Augmentations augs) { | |
super.emptyElement(element, attributes, augs); | |
if ("meta".equalsIgnoreCase(element.localpart)) { | |
extractMetadata(element, attributes); | |
} | |
} | |
/** | |
* {@inheritDoc} | |
* | |
* @see org.cyberneko.html.filters.DefaultFilter#endElement(org.apache.xerces.xni.QName, | |
* org.apache.xerces.xni.Augmentations) | |
*/ | |
@Override | |
public void endElement(final QName element, final Augmentations augs) { | |
super.endElement(element, augs); | |
if ("title".equalsIgnoreCase(element.localpart)) { | |
_inTitleTag = false; | |
setTitle(); | |
} | |
} | |
/** | |
* {@inheritDoc} | |
* | |
* @see org.cyberneko.html.filters.DefaultFilter#endElement(XMLString text, org.apache.xerces.xni.Augmentations) | |
*/ | |
@Override | |
public void characters(final XMLString text, final Augmentations augs) { | |
super.characters(text, augs); | |
if (_inTitleTag) { | |
_titleBuffer.append(text.toString()); | |
} | |
} | |
/** | |
* check for META tag. | |
* | |
* @param element | |
* current element. | |
* @param attributes | |
* attributes of tag. | |
*/ | |
private void extractMetadata(final QName element, final XMLAttributes attributes) { | |
String metaName = null; | |
String metaValue = null; | |
for (int i = 0; i < attributes.getLength(); i++) { | |
final String attributeName = attributes.getLocalName(i); | |
if ("name".equalsIgnoreCase(attributeName)) { | |
metaName = attributes.getValue(i).trim().toLowerCase(); | |
} else if ("content".equalsIgnoreCase(attributeName)) { | |
metaValue = attributes.getValue(i); | |
} | |
} | |
if (metaName != null && metaValue != null) { | |
final String attributeName = _metaAttributeMapping.get(metaName); | |
if (attributeName != null) { | |
_metadata.put(attributeName, metaValue); | |
} | |
} | |
} | |
/** | |
* Adds the content of the _titleBuffer as an attribute value to the _metadata map. | |
*/ | |
private void setTitle() { | |
final String attributeName = _metaAttributeMapping.get(PROP_TITLE); | |
if (attributeName != null && _titleBuffer != null) { | |
final String title = _titleBuffer.toString().trim(); | |
if (title.length() > 0) { | |
_metadata.put(attributeName, title); | |
} | |
} | |
} | |
} | |
} |