blob: 5609122289b4ef0d936249da714a4f430fde7124 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Juergen Schumacher (empolis GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.processing.pipelets;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.collections.map.MultiValueMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.filters.DefaultFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.blackboard.BlackboardService;
import org.eclipse.smila.blackboard.path.Path;
import org.eclipse.smila.datamodel.id.Id;
import org.eclipse.smila.datamodel.record.Literal;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.configuration.PipeletConfiguration;
import org.eclipse.smila.processing.configuration.PipeletConfiguration.Property;
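/**
 * Simple HTML-to-text extractor pipelet based on the NekoHTML parser. Besides the plain text, the content of
 * configured HTML META tags can be copied to record attributes.
 *
 * @author jschumacher
 *
 */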
public class HtmlToTextPipelet extends ATransformationPipelet {
/**
 * Tags whose complete content (not only the markup) is dropped from the result by default. The default can be
 * overridden with the "removeContentTags" property, which is read as a comma-separated list in
 * {@link #configure(PipeletConfiguration)}.
 */
private static final String[] DEFAULT_REMOVE_CONTENT_TAGS = { "applet", "frame", "object", "script", "style" };
/**
 * Name of the property that configures the tags for which the complete content is removed.
 */
private static final String PROP_REMOVE_CONTENT_TAGS = "removeContentTags";
/**
 * Name prefix of the properties that map HTML META tags to attributes: the part of the property name after the
 * prefix is the META tag name, the property value is the name of the target attribute.
 */
private static final String PROP_META = "meta:";
/**
 * Tag names for which the complete content is removed from the result. Initially the default list, replaced in
 * {@link #configure(PipeletConfiguration)} if the property is set to a non-empty value.
 */
private String[] _removeContentTags = DEFAULT_REMOVE_CONTENT_TAGS;
/**
 * Mapping of META tag names to target attribute names. The extractor looks entries up using the trimmed,
 * lower-cased value of the META tag's "name" attribute.
 */
private final Map<String, String> _metaAttributeMapping = new HashMap<String, String>();
/** The log, created for the concrete runtime class of this instance. */
private final Log _log = LogFactory.getLog(getClass());
/**
 * Reads the pipelet configuration after delegating to the superclass.
 * <ul>
 * <li>"removeContentTags": optional comma-separated list of tag names whose complete content is removed. Each
 * entry is trimmed and blank entries are skipped, so lists like "script, style" work as expected. If the
 * property is missing or blank, the current list (initially the default list) is kept.</li>
 * <li>"meta:&lt;name&gt;": maps the HTML META tag &lt;name&gt; to the attribute given as property value. The
 * name is trimmed and lower-cased, because the extractor looks up META names in lower case; without this a
 * mixed-case configuration key could never match.</li>
 * </ul>
 *
 * @param configuration
 *          the pipelet configuration to read.
 * @throws ProcessingException
 *           declared by the overridden method; this method does not throw it itself.
 *
 * @see org.eclipse.smila.processing.pipelets.ATransformationPipelet
 *      #configure(org.eclipse.smila.processing.configuration.PipeletConfiguration)
 */
@Override
public void configure(PipeletConfiguration configuration) throws ProcessingException {
  super.configure(configuration);
  final Object removeContentTagValue = configuration.getPropertyFirstValue(PROP_REMOVE_CONTENT_TAGS);
  if (removeContentTagValue != null) {
    final String removeContentTagList = removeContentTagValue.toString().trim();
    if (removeContentTagList.length() > 0) {
      final List<String> tags = new ArrayList<String>();
      for (String tag : removeContentTagList.split(",")) {
        // whitespace around a tag name would prevent it from ever matching an element name.
        final String trimmedTag = tag.trim();
        if (trimmedTag.length() > 0) {
          tags.add(trimmedTag);
        }
      }
      _removeContentTags = tags.toArray(new String[tags.size()]);
      _log.info("Removing complete content of these tags: " + Arrays.toString(_removeContentTags));
    }
  }
  for (Property property : configuration.getProperties()) {
    final String propertyName = property.getName();
    if (propertyName != null && propertyName.startsWith(PROP_META)) {
      // normalize exactly like the lookup key: trimmed and lower-cased, independent of the default locale.
      final String metaName = propertyName.substring(PROP_META.length()).trim().toLowerCase(Locale.ENGLISH);
      final Object attributeName = property.getValue();
      if (attributeName == null) {
        _log.warn("Ignoring property " + propertyName + ": no target attribute name given.");
        continue;
      }
      _metaAttributeMapping.put(metaName, attributeName.toString());
    }
  }
}
/**
 * Converts the HTML input of each record to plain text and stores the text and the extracted META data.
 * Depending on {@code isReadFromAttribute()} the HTML is read from the input attribute or from the input
 * attachment. A {@link BlackboardAccessException} for one record is logged and processing continues with the
 * next record; a {@link ProcessingException} (e.g. a parse error) is propagated to the caller.
 *
 * @param blackboard
 *          blackboard service holding the records.
 * @param recordIds
 *          IDs of the records to process, may be null.
 * @return the given record IDs, unchanged.
 * @throws ProcessingException
 *           error parsing an HTML document.
 *
 * @see org.eclipse.smila.processing.SimplePipelet#process(org.eclipse.smila.blackboard.BlackboardService,
 *      org.eclipse.smila.datamodel.id.Id[])
 */
public Id[] process(BlackboardService blackboard, Id[] recordIds) throws ProcessingException {
  if (recordIds != null) {
    for (Id id : recordIds) {
      try {
        final MultiValueMap metadata = new MultiValueMap();
        final List<String> results = new ArrayList<String>();
        if (isReadFromAttribute()) {
          processAttributeValues(blackboard, id, results, metadata);
        } else {
          processAttachment(blackboard, id, results, metadata);
        }
        storeResults(blackboard, id, results);
        storeMetadata(blackboard, id, metadata);
      } catch (BlackboardAccessException ex) {
        _log.error("Error processing ID " + id, ex);
      }
    }
  }
  return recordIds;
}

/**
 * read the HTML document from the input attachment and add the plain text and metadata to the results. The
 * attachment stream is always closed here, also if parsing fails, instead of relying on the parser to do it.
 *
 * @param blackboard
 *          blackboard service to use.
 * @param id
 *          record ID to process
 * @param results
 *          list of plain text strings.
 * @param metadata
 *          metadata map
 * @throws BlackboardAccessException
 *           error accessing the attachment.
 * @throws ProcessingException
 *           error parsing the HTML.
 */
private void processAttachment(BlackboardService blackboard, Id id, final List<String> results,
  final MultiValueMap metadata) throws BlackboardAccessException, ProcessingException {
  final InputStream stream = blackboard.getAttachmentAsStream(id, _inputName);
  if (stream == null) {
    return;
  }
  try {
    results.add(extractText(id, stream, metadata));
  } finally {
    try {
      stream.close();
    } catch (IOException ex) {
      // a failure to close must not hide the extraction result or the original exception.
      _log.warn("Error closing attachment stream of ID " + id, ex);
    }
  }
}
/**
 * Converts every string literal of the input attribute from HTML to plain text. Each converted text is
 * appended to the result list; META data found on the way is collected in the metadata map. Literals without a
 * string value are skipped.
 *
 * @param blackboard
 *          blackboard service to read the literals from.
 * @param id
 *          ID of the record to process.
 * @param results
 *          receives one plain text string per converted literal.
 * @param metadata
 *          receives the extracted META data.
 * @throws BlackboardAccessException
 *           the literals could not be read from the blackboard.
 * @throws ProcessingException
 *           error parsing the HTML.
 */
private void processAttributeValues(BlackboardService blackboard, Id id, final List<String> results,
  final MultiValueMap metadata) throws BlackboardAccessException, ProcessingException {
  for (final Literal htmlLiteral : blackboard.getLiterals(id, _inputPath)) {
    final String html = htmlLiteral.getStringValue();
    if (html == null) {
      continue;
    }
    results.add(extractText(id, html, metadata));
  }
}
/**
 * Writes the collected META data to the record. For every attribute name in the map the existing literals of
 * that attribute are removed and one new string literal per collected value is added.
 *
 * @param blackboard
 *          blackboard to write to.
 * @param id
 *          ID of the record to update.
 * @param metadata
 *          attribute name to value collection map.
 * @throws BlackboardAccessException
 *           error writing the values.
 */
@SuppressWarnings("unchecked")
private void storeMetadata(BlackboardService blackboard, Id id, MultiValueMap metadata)
  throws BlackboardAccessException {
  if (metadata.isEmpty()) {
    return;
  }
  for (final Object key : metadata.keySet()) {
    final String targetAttribute = (String) key;
    final Collection attributeValues = metadata.getCollection(targetAttribute);
    final Path targetPath = new Path(targetAttribute);
    if (attributeValues.isEmpty()) {
      continue;
    }
    // replace, not append: old literals of the attribute are dropped before the new ones are added.
    blackboard.removeLiterals(id, targetPath);
    for (final Object entry : attributeValues) {
      final String text = (String) entry;
      final Literal literal = blackboard.createLiteral(id);
      literal.setStringValue(text);
      blackboard.addLiteral(id, targetPath, literal);
    }
  }
}
/**
 * Parses the HTML document read from the given byte stream and returns its plain text. The record ID's string
 * form is passed to the parser as system ID; no encoding is specified for the stream.
 *
 * @param id
 *          record ID (used as system ID and for logging)
 * @param stream
 *          HTML byte stream
 * @param metadata
 *          receives the extracted META data.
 * @return plain text of the document
 * @throws ProcessingException
 *           any error during parsing, wrapped with the record ID in the message.
 */
private String extractText(Id id, InputStream stream, MultiValueMap metadata) throws ProcessingException {
  final StringBuilder plainText = new StringBuilder();
  final XMLParserConfiguration htmlParser = createParser(plainText, metadata);
  try {
    final XMLInputSource source = new XMLInputSource(null, id.toString(), null, stream, null);
    htmlParser.parse(source);
  } catch (Exception ex) {
    final String message = "error parsing HTML document in record " + id;
    _log.error(message, ex);
    throw new ProcessingException(message + ": " + ex.toString(), ex);
  }
  return plainText.toString();
}
/**
 * Parses the HTML document given as a string and returns its plain text. The record's ID hash is passed to the
 * parser as system ID.
 * <p>
 * NOTE(review): the stream variant passes {@code id.toString()} as system ID while this one uses
 * {@code id.getIdHash()} - confirm whether the difference is intentional.
 *
 * @param id
 *          record ID (used for the system ID and for logging)
 * @param content
 *          HTML string
 * @param metadata
 *          receives the extracted META data.
 * @return plain text of the document
 * @throws ProcessingException
 *           any error during parsing, wrapped with the record ID in the message.
 */
private String extractText(Id id, String content, MultiValueMap metadata) throws ProcessingException {
  final StringBuilder plainText = new StringBuilder();
  final XMLParserConfiguration htmlParser = createParser(plainText, metadata);
  try {
    final XMLInputSource source = new XMLInputSource(null, id.getIdHash(), null, new StringReader(content), null);
    htmlParser.parse(source);
  } catch (Exception ex) {
    final String message = "error parsing HTML document in record " + id;
    _log.error(message, ex);
    throw new ProcessingException(message + ": " + ex.toString(), ex);
  }
  return plainText.toString();
}
/**
 * Builds a NekoHTML parser configuration with the filter chain used for text extraction. The filters are
 * registered in this order: comment removal, META data extraction, element removal for the configured
 * content tags, plain text collection.
 *
 * @param result
 *          string builder that receives the plain text.
 * @param metadata
 *          map that receives the extracted META data.
 * @return parser configuration with the filters installed.
 */
private XMLParserConfiguration createParser(final StringBuilder result, final MultiValueMap metadata) {
  final ElementRemover contentStripper = new ElementRemover();
  for (int i = 0; i < _removeContentTags.length; i++) {
    contentStripper.removeElement(_removeContentTags[i]);
  }
  final XMLDocumentFilter[] filterChain =
    new XMLDocumentFilter[] { new CommentRemover(), new MetadataExtractor(metadata), contentStripper,
      new PlainTextWriter(result) };
  final XMLParserConfiguration htmlParser = new HTMLConfiguration();
  htmlParser.setProperty("http://cyberneko.org/html/properties/filters", filterChain);
  return htmlParser;
}
/**
 * Filter that collects all character data of the document in a string builder.
 *
 * @author jschumacher
 *
 */
public class PlainTextWriter extends DefaultFilter {
  /**
   * receives the character data.
   */
  private final StringBuilder _target;

  /**
   * @param target
   *          the StringBuilder the text is appended to
   */
  public PlainTextWriter(final StringBuilder target) {
    this._target = target;
  }

  /**
   * Appends the reported characters to the target and then forwards the event to the superclass.
   *
   * @see org.cyberneko.html.filters.DefaultFilter#characters(org.apache.xerces.xni.XMLString,
   *      org.apache.xerces.xni.Augmentations)
   */
  @Override
  public void characters(final XMLString text, final Augmentations augs) {
    this._target.append(text.ch, text.offset, text.length);
    super.characters(text, augs);
  }
}
/**
 * Filter that drops HTML comments: comment events are not forwarded to the superclass.
 *
 * @author jschumacher
 *
 */
public class CommentRemover extends DefaultFilter {
  /**
   * Ignores the comment.
   *
   * @see org.cyberneko.html.filters.DefaultFilter#comment(org.apache.xerces.xni.XMLString,
   *      org.apache.xerces.xni.Augmentations)
   */
  @Override
  public void comment(final XMLString text, final Augmentations augs) {
    // intentionally empty: not calling super swallows the comment event.
  }
}
/**
 * Filter that extracts metadata from META tags. For each META element with both a "name" and a "content"
 * attribute, the content is stored under the attribute name configured for that META name, if there is one.
 * All events are forwarded unchanged to the superclass.
 *
 * @author jschumacher
 *
 */
public class MetadataExtractor extends DefaultFilter {
  /**
   * attribute to value map extracted from document.
   */
  private final MultiValueMap _metadata;

  /**
   * @param metadata
   *          map to use as target for storing the attribute-value lists.
   */
  public MetadataExtractor(MultiValueMap metadata) {
    super();
    _metadata = metadata;
  }

  /**
   * {@inheritDoc}
   *
   * @see org.cyberneko.html.filters.DefaultFilter#startElement(org.apache.xerces.xni.QName,
   *      org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations)
   */
  @Override
  public void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
    super.startElement(element, attributes, augs);
    extractMetadata(element, attributes);
  }

  /**
   * {@inheritDoc}
   *
   * @see org.cyberneko.html.filters.DefaultFilter#emptyElement(org.apache.xerces.xni.QName,
   *      org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations)
   */
  @Override
  public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) {
    super.emptyElement(element, attributes, augs);
    extractMetadata(element, attributes);
  }

  /**
   * check for META tag and, if its name is mapped to an attribute, store its content in the metadata map.
   *
   * @param element
   *          current element.
   * @param attributes
   *          attributes of tag.
   */
  private void extractMetadata(QName element, XMLAttributes attributes) {
    if (!"meta".equalsIgnoreCase(element.localpart)) {
      return;
    }
    String metaName = null;
    String metaValue = null;
    for (int i = 0; i < attributes.getLength(); i++) {
      final String attributeName = attributes.getLocalName(i);
      if ("name".equalsIgnoreCase(attributeName)) {
        final String rawName = attributes.getValue(i);
        // lower-case with a fixed locale: the default-locale variant would e.g. turn "TITLE" into "tıtle"
        // under a Turkish locale, so the lookup below would fail.
        metaName = rawName == null ? null : rawName.trim().toLowerCase(Locale.ENGLISH);
      } else if ("content".equalsIgnoreCase(attributeName)) {
        metaValue = attributes.getValue(i);
      }
    }
    if (metaName != null && metaValue != null) {
      final String attributeName = _metaAttributeMapping.get(metaName);
      if (attributeName != null) {
        _metadata.put(attributeName, metaValue);
      }
    }
  }
}
}