org.eclipse.smila.processing.pipelets/code/src/org/eclipse/smila/processing/pipelets/HtmlToTextPipelet.java - smila/org.eclipse.smila.core - Git at Google

 /*******************************************************************************
  * Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors:
  *    Juergen Schumacher (empolis GmbH) - initial API and implementation
  *******************************************************************************/

 package org.eclipse.smila.processing.pipelets;

 import java.io.InputStream;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;

 import org.apache.commons.collections.map.MultiValueMap;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.xerces.xni.Augmentations;
 import org.apache.xerces.xni.QName;
 import org.apache.xerces.xni.XMLAttributes;
 import org.apache.xerces.xni.XMLString;
 import org.apache.xerces.xni.parser.XMLDocumentFilter;
 import org.apache.xerces.xni.parser.XMLInputSource;
 import org.apache.xerces.xni.parser.XMLParserConfiguration;
 import org.cyberneko.html.HTMLConfiguration;
 import org.cyberneko.html.filters.DefaultFilter;
 import org.cyberneko.html.filters.ElementRemover;
 import org.eclipse.smila.blackboard.BlackboardAccessException;
 import org.eclipse.smila.blackboard.BlackboardService;
 import org.eclipse.smila.blackboard.path.Path;
 import org.eclipse.smila.datamodel.id.Id;
 import org.eclipse.smila.datamodel.record.Literal;
 import org.eclipse.smila.processing.ProcessingException;
 import org.eclipse.smila.processing.configuration.PipeletConfiguration;
 import org.eclipse.smila.processing.configuration.PipeletConfiguration.Property;

 /**
  * Simple HTML-to-Text extractor pipelet using NekoHTML parser.
  *
  * @author jschumacher
  *
  */
 public class HtmlToTextPipelet extends ATransformationPipelet {

   /**
    * Tags whose complete content (not only the markup) is removed from the result by default. Used unless the
    * "removeContentTags" property is set in the pipelet configuration.
    */
   private static final String[] DEFAULT_REMOVE_CONTENT_TAGS = { "applet", "frame", "object", "script", "style" };

   /**
    * Name of the configuration property holding the comma-separated list of tags for which the complete
    * content is removed.
    */
   private static final String PROP_REMOVE_CONTENT_TAGS = "removeContentTags";

   /**
    * Prefix of the configuration properties that map HTML META tags to attributes: the part of the property
    * name after the prefix is the META name, the property value is the name of the target attribute.
    */
   private static final String PROP_META = "meta:";

   /**
    * Tag names for which the complete content is removed from the result. Starts as the default list and may be
    * replaced in {@link #configure(PipeletConfiguration)}.
    */
   private String[] _removeContentTags = DEFAULT_REMOVE_CONTENT_TAGS;

   /**
    * Mapping of META tag names to the names of the attributes that receive their content. Filled in
    * {@link #configure(PipeletConfiguration)}.
    */
   private final Map<String, String> _metaAttributeMapping = new HashMap<String, String>();

   /** Logger of this pipelet instance. */
   private final Log _log = LogFactory.getLog(getClass());

   /**
    * {@inheritDoc}
    * <p>
    * In addition to the settings read by the superclass, this evaluates:
    * <ul>
    * <li>"removeContentTags": comma-separated list of tag names for which the complete content is removed.
    * Whitespace around the names and empty entries are ignored. A missing or blank property keeps the
    * default list.</li>
    * <li>"meta:&lt;name&gt;": name of the attribute that receives the content of the HTML META tag
    * &lt;name&gt;. The META name is stored in lower case because META names found in documents are
    * lower-cased before they are looked up.</li>
    * </ul>
    *
    * @param configuration
    *          the pipelet configuration to read.
    * @throws ProcessingException
    *           error reported by the superclass configuration.
    * @see org.eclipse.smila.processing.pipelets.ATransformationPipelet
    *      #configure(org.eclipse.smila.processing.configuration.PipeletConfiguration)
    */
   @Override
   public void configure(PipeletConfiguration configuration) throws ProcessingException {
     super.configure(configuration);
     final Object removeContentTagValue = configuration.getPropertyFirstValue(PROP_REMOVE_CONTENT_TAGS);
     if (removeContentTagValue != null) {
       final String removeContentTagList = removeContentTagValue.toString().trim();
       if (removeContentTagList.length() > 0) {
         // trim every entry: "script, style" must yield "style", not " style" which matches no tag
         final List<String> tags = new ArrayList<String>();
         for (String tag : removeContentTagList.split(",")) {
           final String trimmedTag = tag.trim();
           if (trimmedTag.length() > 0) {
             tags.add(trimmedTag);
           }
         }
         _removeContentTags = tags.toArray(new String[tags.size()]);
         _log.info("Removing complete content of these tags: " + Arrays.toString(_removeContentTags));
       }
     }
     for (Property property : configuration.getProperties()) {
       final String propertyName = property.getName();
       if (propertyName.startsWith(PROP_META)) {
         final Object propertyValue = property.getValue();
         if (propertyValue == null) {
           // a mapping without a target attribute cannot be used, skip it instead of failing with an NPE
           _log.warn("Ignoring property " + propertyName + ": no attribute name given");
           continue;
         }
         // Locale.ENGLISH keeps the key independent of the platform default locale
         final String metaName = propertyName.substring(PROP_META.length()).toLowerCase(java.util.Locale.ENGLISH);
         _metaAttributeMapping.put(metaName, propertyValue.toString());
       }
     }
   }

   /**
    * Convert the HTML content of each given record to plain text.
    * <p>
    * The HTML is read either from an attribute or from an attachment, depending on
    * {@code isReadFromAttribute()}. The extracted text is passed to {@code storeResults}, and the content of
    * mapped META tags is written to attributes. A record whose blackboard access fails is logged and skipped;
    * the remaining records are still processed.
    *
    * @param blackboard
    *          blackboard service to use.
    * @param recordIds
    *          IDs of the records to process, may be null.
    * @return the given record IDs, unchanged.
    * @throws ProcessingException
    *           error parsing the HTML of a record.
    * @see org.eclipse.smila.processing.SimplePipelet#process(org.eclipse.smila.blackboard.BlackboardService,
    *      org.eclipse.smila.datamodel.id.Id[])
    */
   public Id[] process(BlackboardService blackboard, Id[] recordIds) throws ProcessingException {
     if (recordIds != null) {
       for (Id id : recordIds) {
         try {
           final MultiValueMap metadata = new MultiValueMap();
           final List<String> results = new ArrayList<String>();
           if (isReadFromAttribute()) {
             processAttributeValues(blackboard, id, results, metadata);
           } else {
             final InputStream stream = blackboard.getAttachmentAsStream(id, _inputName);
             if (stream != null) {
               try {
                 results.add(extractText(id, stream, metadata));
               } finally {
                 // this method obtained the stream, so it also releases it, even if parsing failed
                 try {
                   stream.close();
                 } catch (java.io.IOException ex) {
                   _log.warn("Error closing attachment stream of record " + id, ex);
                 }
               }
             }
           }
           storeResults(blackboard, id, results);
           storeMetadata(blackboard, id, metadata);
         } catch (BlackboardAccessException ex) {
           // best effort: one inaccessible record must not stop the processing of the others
           _log.error("Error processing ID " + id, ex);
         }
       }
     }
     return recordIds;
   }

   /**
    * Convert every HTML string found in the input attribute of a record to plain text.
    * <p>
    * Literals without a string value are skipped. Each converted text is appended to the result list, and
    * META data found while parsing is collected in the metadata map.
    *
    * @param blackboard
    *          blackboard service to read the literals from.
    * @param id
    *          ID of the record to process.
    * @param results
    *          receives one plain text string per converted literal.
    * @param metadata
    *          receives the extracted metadata.
    * @throws BlackboardAccessException
    *           the literals could not be read from the blackboard.
    * @throws ProcessingException
    *           error parsing the HTML.
    */
   private void processAttributeValues(BlackboardService blackboard, Id id, final List<String> results,
     final MultiValueMap metadata) throws BlackboardAccessException, ProcessingException {
     for (final Literal htmlLiteral : blackboard.getLiterals(id, _inputPath)) {
       final String html = htmlLiteral.getStringValue();
       if (html == null) {
         continue;
       }
       results.add(extractText(id, html, metadata));
     }
   }

   /**
    * Write the collected metadata to attributes of a record.
    * <p>
    * For every attribute name in the map the existing literals of that attribute are removed and one new
    * literal is added per collected value.
    *
    * @param blackboard
    *          blackboard service to write to.
    * @param id
    *          ID of the record to update.
    * @param metadata
    *          attribute names mapped to the string values to store.
    * @throws BlackboardAccessException
    *           error writing the values.
    */
   @SuppressWarnings("unchecked")
   private void storeMetadata(BlackboardService blackboard, Id id, MultiValueMap metadata)
     throws BlackboardAccessException {
     for (final Object key : metadata.keySet()) {
       final String attributeName = (String) key;
       final Collection<String> values = metadata.getCollection(attributeName);
       final Path path = new Path(attributeName);
       if (values.isEmpty()) {
         continue;
       }
       // replace, not extend: drop what the attribute held before
       blackboard.removeLiterals(id, path);
       for (final String value : values) {
         final Literal literal = blackboard.createLiteral(id);
         literal.setStringValue(value);
         blackboard.addLiteral(id, path, literal);
       }
     }
   }

   /**
    * Extract the plain text from an HTML document that is read from a stream.
    *
    * @param id
    *          record ID, used as system ID of the parser input and in log and error messages.
    * @param stream
    *          stream to read the HTML document from.
    * @param metadata
    *          receives the extracted metadata.
    * @return the plain text of the document.
    * @throws ProcessingException
    *           any error raised while parsing, wrapped with the record ID.
    */
   private String extractText(Id id, InputStream stream, MultiValueMap metadata) throws ProcessingException {
     final StringBuilder plainText = new StringBuilder();
     final XMLParserConfiguration htmlParser = createParser(plainText, metadata);
     try {
       final XMLInputSource htmlSource = new XMLInputSource(null, id.toString(), null, stream, null);
       htmlParser.parse(htmlSource);
     } catch (Exception ex) {
       final String message = "error parsing HTML document in record " + id;
       _log.error(message, ex);
       throw new ProcessingException(message + ": " + ex.toString(), ex);
     }
     return plainText.toString();
   }

   /**
    * Extract the plain text from an HTML document that is given as a string.
    *
    * @param id
    *          record ID; its hash is used as system ID of the parser input, the ID itself in log and error
    *          messages.
    * @param content
    *          the HTML document.
    * @param metadata
    *          receives the extracted metadata.
    * @return the plain text of the document.
    * @throws ProcessingException
    *           any error raised while parsing, wrapped with the record ID.
    */
   private String extractText(Id id, String content, MultiValueMap metadata) throws ProcessingException {
     final StringBuilder plainText = new StringBuilder();
     final XMLParserConfiguration htmlParser = createParser(plainText, metadata);
     try {
       final XMLInputSource htmlSource =
         new XMLInputSource(null, id.getIdHash(), null, new StringReader(content), null);
       htmlParser.parse(htmlSource);
     } catch (Exception ex) {
       final String message = "error parsing HTML document in record " + id;
       _log.error(message, ex);
       throw new ProcessingException(message + ": " + ex.toString(), ex);
     }
     return plainText.toString();
   }

   /**
    * Build the NekoHTML parser configuration used to extract plain text.
    * <p>
    * The parser gets a filter chain in this order: comment remover, metadata extractor, element remover
    * (set up with the configured "remove content" tags) and plain text writer.
    *
    * @param result
    *          string builder that receives the plain text.
    * @param metadata
    *          receives the extracted metadata.
    * @return parser configuration with the filter chain installed.
    */
   private XMLParserConfiguration createParser(final StringBuilder result, final MultiValueMap metadata) {
     final ElementRemover contentStripper = new ElementRemover();
     for (int i = 0; i < _removeContentTags.length; i++) {
       contentStripper.removeElement(_removeContentTags[i]);
     }
     final XMLDocumentFilter[] filterChain =
       { new CommentRemover(), new MetadataExtractor(metadata), contentStripper, new PlainTextWriter(result) };
     final XMLParserConfiguration htmlParser = new HTMLConfiguration();
     htmlParser.setProperty("http://cyberneko.org/html/properties/filters", filterChain);
     return htmlParser;
   }

   /**
    * Filter that copies all character data passing through it into a string builder.
    *
    * @author jschumacher
    *
    */
   public class PlainTextWriter extends DefaultFilter {
     /**
      * Receives the character data.
      */
     private final StringBuilder _target;

     /**
      * @param target
      *          the StringBuilder the character data is appended to.
      */
     public PlainTextWriter(StringBuilder target) {
       super();
       this._target = target;
     }

     /**
      * Append the characters to the target, then pass the event on to the next filter.
      *
      * @param text
      *          the character data.
      * @param augs
      *          augmentations of the event.
      * @see org.cyberneko.html.filters.DefaultFilter#characters(org.apache.xerces.xni.XMLString,
      *      org.apache.xerces.xni.Augmentations)
      */
     @Override
     public void characters(XMLString text, Augmentations augs) {
       this._target.append(text.ch, text.offset, text.length);
       super.characters(text, augs);
     }
   }

   /**
    * Filter that drops HTML comments instead of passing them on.
    *
    * @author jschumacher
    *
    */
   public class CommentRemover extends DefaultFilter {
     /**
      * Swallow the comment event.
      *
      * @param text
      *          the comment text, ignored.
      * @param augs
      *          augmentations of the event, ignored.
      * @see org.cyberneko.html.filters.DefaultFilter#comment(org.apache.xerces.xni.XMLString,
      *      org.apache.xerces.xni.Augmentations)
      */
     @Override
     public void comment(XMLString text, Augmentations augs) {
       // intentionally empty: the superclass implementation is not called, so the comment goes no further
     }
   }

   /**
    * Filter that extracts metadata from META tags.
    * <p>
    * For each META tag with a "name" and a "content" attribute, the name is looked up in the META-to-attribute
    * mapping of the enclosing pipelet. If a mapping exists, the content is added to the metadata map under
    * the mapped attribute name. All events are passed on unchanged.
    *
    * @author jschumacher
    *
    */
   public class MetadataExtractor extends DefaultFilter {
     /**
      * attribute to value map extracted from the document.
      */
     private final MultiValueMap _metadata;

     /**
      * @param metadata
      *          map to use as target for storing the attribute-value lists.
      */
     public MetadataExtractor(MultiValueMap metadata) {
       super();
       _metadata = metadata;
     }

     /**
      * {@inheritDoc}
      *
      * @see org.cyberneko.html.filters.DefaultFilter#startElement(org.apache.xerces.xni.QName,
      *      org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations)
      */
     @Override
     public void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
       super.startElement(element, attributes, augs);
       extractMetadata(element, attributes);
     }

     /**
      * {@inheritDoc}
      *
      * @see org.cyberneko.html.filters.DefaultFilter#emptyElement(org.apache.xerces.xni.QName,
      *      org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations)
      */
     @Override
     public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) {
       super.emptyElement(element, attributes, augs);
       extractMetadata(element, attributes);
     }

     /**
      * Check whether the element is a META tag and, if its name is mapped, record its content.
      *
      * @param element
      *          current element.
      * @param attributes
      *          attributes of the tag.
      */
     private void extractMetadata(QName element, XMLAttributes attributes) {
       if (!"meta".equalsIgnoreCase(element.localpart)) {
         return;
       }
       String metaName = null;
       String metaValue = null;
       for (int i = 0; i < attributes.getLength(); i++) {
         final String attributeValue = attributes.getValue(i);
         if (attributeValue == null) {
           continue;
         }
         final String attributeName = attributes.getLocalName(i);
         if ("name".equalsIgnoreCase(attributeName)) {
           // fixed locale: with the default locale "TITLE" becomes "t\u0131tle" on Turkish systems and the
           // lookup below would fail
           metaName = attributeValue.trim().toLowerCase(java.util.Locale.ENGLISH);
         } else if ("content".equalsIgnoreCase(attributeName)) {
           metaValue = attributeValue;
         }
       }
       if (metaName == null || metaValue == null) {
         return;
       }
       final String targetAttribute = _metaAttributeMapping.get(metaName);
       if (targetAttribute != null) {
         _metadata.put(targetAttribute, metaValue);
       }
     }

   }
 }
	/*******************************************************************************
	* Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors:
	* Juergen Schumacher (empolis GmbH) - initial API and implementation
	*******************************************************************************/

	package org.eclipse.smila.processing.pipelets;

	import java.io.InputStream;
	import java.io.StringReader;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.HashMap;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Map;

	import org.apache.commons.collections.map.MultiValueMap;
	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;
	import org.apache.xerces.xni.Augmentations;
	import org.apache.xerces.xni.QName;
	import org.apache.xerces.xni.XMLAttributes;
	import org.apache.xerces.xni.XMLString;
	import org.apache.xerces.xni.parser.XMLDocumentFilter;
	import org.apache.xerces.xni.parser.XMLInputSource;
	import org.apache.xerces.xni.parser.XMLParserConfiguration;
	import org.cyberneko.html.HTMLConfiguration;
	import org.cyberneko.html.filters.DefaultFilter;
	import org.cyberneko.html.filters.ElementRemover;
	import org.eclipse.smila.blackboard.BlackboardAccessException;
	import org.eclipse.smila.blackboard.BlackboardService;
	import org.eclipse.smila.blackboard.path.Path;
	import org.eclipse.smila.datamodel.id.Id;
	import org.eclipse.smila.datamodel.record.Literal;
	import org.eclipse.smila.processing.ProcessingException;
	import org.eclipse.smila.processing.configuration.PipeletConfiguration;
	import org.eclipse.smila.processing.configuration.PipeletConfiguration.Property;

	/**
	* Simple HTML-to-Text extractor pipelet using NekoHTML parser.
	*
	* @author jschumacher
	*
	*/
	public class HtmlToTextPipelet extends ATransformationPipelet {

	/**
	 * Tags whose complete content (not only the markup) is removed from the result by default. Used unless the
	 * "removeContentTags" property is set in the pipelet configuration.
	 */
	private static final String[] DEFAULT_REMOVE_CONTENT_TAGS = { "applet", "frame", "object", "script", "style" };

	/**
	 * Name of the configuration property holding the comma-separated list of tags for which the complete
	 * content is removed.
	 */
	private static final String PROP_REMOVE_CONTENT_TAGS = "removeContentTags";

	/**
	 * Prefix of the configuration properties that map HTML META tags to attributes: the part of the property
	 * name after the prefix is the META name, the property value is the name of the target attribute.
	 */
	private static final String PROP_META = "meta:";

	/**
	 * Tag names for which the complete content is removed from the result. Starts as the default list and may be
	 * replaced in {@link #configure(PipeletConfiguration)}.
	 */
	private String[] _removeContentTags = DEFAULT_REMOVE_CONTENT_TAGS;

	/**
	 * Mapping of META tag names to the names of the attributes that receive their content. Filled in
	 * {@link #configure(PipeletConfiguration)}.
	 */
	private final Map<String, String> _metaAttributeMapping = new HashMap<String, String>();

	/** Logger of this pipelet instance. */
	private final Log _log = LogFactory.getLog(getClass());

	/**
	 * {@inheritDoc}
	 * <p>
	 * In addition to the settings read by the superclass, this evaluates:
	 * <ul>
	 * <li>"removeContentTags": comma-separated list of tag names for which the complete content is removed.
	 * Whitespace around the names and empty entries are ignored. A missing or blank property keeps the
	 * default list.</li>
	 * <li>"meta:&lt;name&gt;": name of the attribute that receives the content of the HTML META tag
	 * &lt;name&gt;. The META name is stored in lower case because META names found in documents are
	 * lower-cased before they are looked up.</li>
	 * </ul>
	 *
	 * @param configuration
	 *          the pipelet configuration to read.
	 * @throws ProcessingException
	 *           error reported by the superclass configuration.
	 * @see org.eclipse.smila.processing.pipelets.ATransformationPipelet
	 *      #configure(org.eclipse.smila.processing.configuration.PipeletConfiguration)
	 */
	@Override
	public void configure(PipeletConfiguration configuration) throws ProcessingException {
	  super.configure(configuration);
	  final Object removeContentTagValue = configuration.getPropertyFirstValue(PROP_REMOVE_CONTENT_TAGS);
	  if (removeContentTagValue != null) {
	    final String removeContentTagList = removeContentTagValue.toString().trim();
	    if (removeContentTagList.length() > 0) {
	      // trim every entry: "script, style" must yield "style", not " style" which matches no tag
	      final List<String> tags = new ArrayList<String>();
	      for (String tag : removeContentTagList.split(",")) {
	        final String trimmedTag = tag.trim();
	        if (trimmedTag.length() > 0) {
	          tags.add(trimmedTag);
	        }
	      }
	      _removeContentTags = tags.toArray(new String[tags.size()]);
	      _log.info("Removing complete content of these tags: " + Arrays.toString(_removeContentTags));
	    }
	  }
	  for (Property property : configuration.getProperties()) {
	    final String propertyName = property.getName();
	    if (propertyName.startsWith(PROP_META)) {
	      final Object propertyValue = property.getValue();
	      if (propertyValue == null) {
	        // a mapping without a target attribute cannot be used, skip it instead of failing with an NPE
	        _log.warn("Ignoring property " + propertyName + ": no attribute name given");
	        continue;
	      }
	      // Locale.ENGLISH keeps the key independent of the platform default locale
	      final String metaName = propertyName.substring(PROP_META.length()).toLowerCase(java.util.Locale.ENGLISH);
	      _metaAttributeMapping.put(metaName, propertyValue.toString());
	    }
	  }
	}

	/**
	 * Convert the HTML content of each given record to plain text.
	 * <p>
	 * The HTML is read either from an attribute or from an attachment, depending on
	 * {@code isReadFromAttribute()}. The extracted text is passed to {@code storeResults}, and the content of
	 * mapped META tags is written to attributes. A record whose blackboard access fails is logged and skipped;
	 * the remaining records are still processed.
	 *
	 * @param blackboard
	 *          blackboard service to use.
	 * @param recordIds
	 *          IDs of the records to process, may be null.
	 * @return the given record IDs, unchanged.
	 * @throws ProcessingException
	 *           error parsing the HTML of a record.
	 * @see org.eclipse.smila.processing.SimplePipelet#process(org.eclipse.smila.blackboard.BlackboardService,
	 *      org.eclipse.smila.datamodel.id.Id[])
	 */
	public Id[] process(BlackboardService blackboard, Id[] recordIds) throws ProcessingException {
	  if (recordIds != null) {
	    for (Id id : recordIds) {
	      try {
	        final MultiValueMap metadata = new MultiValueMap();
	        final List<String> results = new ArrayList<String>();
	        if (isReadFromAttribute()) {
	          processAttributeValues(blackboard, id, results, metadata);
	        } else {
	          final InputStream stream = blackboard.getAttachmentAsStream(id, _inputName);
	          if (stream != null) {
	            try {
	              results.add(extractText(id, stream, metadata));
	            } finally {
	              // this method obtained the stream, so it also releases it, even if parsing failed
	              try {
	                stream.close();
	              } catch (java.io.IOException ex) {
	                _log.warn("Error closing attachment stream of record " + id, ex);
	              }
	            }
	          }
	        }
	        storeResults(blackboard, id, results);
	        storeMetadata(blackboard, id, metadata);
	      } catch (BlackboardAccessException ex) {
	        // best effort: one inaccessible record must not stop the processing of the others
	        _log.error("Error processing ID " + id, ex);
	      }
	    }
	  }
	  return recordIds;
	}

	/**
	 * Convert every HTML string found in the input attribute of a record to plain text.
	 * <p>
	 * Literals without a string value are skipped. Each converted text is appended to the result list, and
	 * META data found while parsing is collected in the metadata map.
	 *
	 * @param blackboard
	 *          blackboard service to read the literals from.
	 * @param id
	 *          ID of the record to process.
	 * @param results
	 *          receives one plain text string per converted literal.
	 * @param metadata
	 *          receives the extracted metadata.
	 * @throws BlackboardAccessException
	 *           the literals could not be read from the blackboard.
	 * @throws ProcessingException
	 *           error parsing the HTML.
	 */
	private void processAttributeValues(BlackboardService blackboard, Id id, final List<String> results,
	  final MultiValueMap metadata) throws BlackboardAccessException, ProcessingException {
	  for (final Literal htmlLiteral : blackboard.getLiterals(id, _inputPath)) {
	    final String html = htmlLiteral.getStringValue();
	    if (html == null) {
	      continue;
	    }
	    results.add(extractText(id, html, metadata));
	  }
	}

	/**
	 * Write the collected metadata to attributes of a record.
	 * <p>
	 * For every attribute name in the map the existing literals of that attribute are removed and one new
	 * literal is added per collected value.
	 *
	 * @param blackboard
	 *          blackboard service to write to.
	 * @param id
	 *          ID of the record to update.
	 * @param metadata
	 *          attribute names mapped to the string values to store.
	 * @throws BlackboardAccessException
	 *           error writing the values.
	 */
	@SuppressWarnings("unchecked")
	private void storeMetadata(BlackboardService blackboard, Id id, MultiValueMap metadata)
	  throws BlackboardAccessException {
	  for (final Object key : metadata.keySet()) {
	    final String attributeName = (String) key;
	    final Collection<String> values = metadata.getCollection(attributeName);
	    final Path path = new Path(attributeName);
	    if (values.isEmpty()) {
	      continue;
	    }
	    // replace, not extend: drop what the attribute held before
	    blackboard.removeLiterals(id, path);
	    for (final String value : values) {
	      final Literal literal = blackboard.createLiteral(id);
	      literal.setStringValue(value);
	      blackboard.addLiteral(id, path, literal);
	    }
	  }
	}

	/**
	 * Extract the plain text from an HTML document that is read from a stream.
	 *
	 * @param id
	 *          record ID, used as system ID of the parser input and in log and error messages.
	 * @param stream
	 *          stream to read the HTML document from.
	 * @param metadata
	 *          receives the extracted metadata.
	 * @return the plain text of the document.
	 * @throws ProcessingException
	 *           any error raised while parsing, wrapped with the record ID.
	 */
	private String extractText(Id id, InputStream stream, MultiValueMap metadata) throws ProcessingException {
	  final StringBuilder plainText = new StringBuilder();
	  final XMLParserConfiguration htmlParser = createParser(plainText, metadata);
	  try {
	    final XMLInputSource htmlSource = new XMLInputSource(null, id.toString(), null, stream, null);
	    htmlParser.parse(htmlSource);
	  } catch (Exception ex) {
	    final String message = "error parsing HTML document in record " + id;
	    _log.error(message, ex);
	    throw new ProcessingException(message + ": " + ex.toString(), ex);
	  }
	  return plainText.toString();
	}

	/**
	 * Extract the plain text from an HTML document that is given as a string.
	 *
	 * @param id
	 *          record ID; its hash is used as system ID of the parser input, the ID itself in log and error
	 *          messages.
	 * @param content
	 *          the HTML document.
	 * @param metadata
	 *          receives the extracted metadata.
	 * @return the plain text of the document.
	 * @throws ProcessingException
	 *           any error raised while parsing, wrapped with the record ID.
	 */
	private String extractText(Id id, String content, MultiValueMap metadata) throws ProcessingException {
	  final StringBuilder plainText = new StringBuilder();
	  final XMLParserConfiguration htmlParser = createParser(plainText, metadata);
	  try {
	    final XMLInputSource htmlSource =
	      new XMLInputSource(null, id.getIdHash(), null, new StringReader(content), null);
	    htmlParser.parse(htmlSource);
	  } catch (Exception ex) {
	    final String message = "error parsing HTML document in record " + id;
	    _log.error(message, ex);
	    throw new ProcessingException(message + ": " + ex.toString(), ex);
	  }
	  return plainText.toString();
	}

	/**
	 * Build the NekoHTML parser configuration used to extract plain text.
	 * <p>
	 * The parser gets a filter chain in this order: comment remover, metadata extractor, element remover
	 * (set up with the configured "remove content" tags) and plain text writer.
	 *
	 * @param result
	 *          string builder that receives the plain text.
	 * @param metadata
	 *          receives the extracted metadata.
	 * @return parser configuration with the filter chain installed.
	 */
	private XMLParserConfiguration createParser(final StringBuilder result, final MultiValueMap metadata) {
	  final ElementRemover contentStripper = new ElementRemover();
	  for (int i = 0; i < _removeContentTags.length; i++) {
	    contentStripper.removeElement(_removeContentTags[i]);
	  }
	  final XMLDocumentFilter[] filterChain =
	    { new CommentRemover(), new MetadataExtractor(metadata), contentStripper, new PlainTextWriter(result) };
	  final XMLParserConfiguration htmlParser = new HTMLConfiguration();
	  htmlParser.setProperty("http://cyberneko.org/html/properties/filters", filterChain);
	  return htmlParser;
	}

	/**
	 * Filter that copies all character data passing through it into a string builder.
	 *
	 * @author jschumacher
	 *
	 */
	public class PlainTextWriter extends DefaultFilter {
	  /**
	   * Receives the character data.
	   */
	  private final StringBuilder _target;

	  /**
	   * @param target
	   *          the StringBuilder the character data is appended to.
	   */
	  public PlainTextWriter(StringBuilder target) {
	    super();
	    this._target = target;
	  }

	  /**
	   * Append the characters to the target, then pass the event on to the next filter.
	   *
	   * @param text
	   *          the character data.
	   * @param augs
	   *          augmentations of the event.
	   * @see org.cyberneko.html.filters.DefaultFilter#characters(org.apache.xerces.xni.XMLString,
	   *      org.apache.xerces.xni.Augmentations)
	   */
	  @Override
	  public void characters(XMLString text, Augmentations augs) {
	    this._target.append(text.ch, text.offset, text.length);
	    super.characters(text, augs);
	  }
	}

	/**
	 * Filter that drops HTML comments instead of passing them on.
	 *
	 * @author jschumacher
	 *
	 */
	public class CommentRemover extends DefaultFilter {
	  /**
	   * Swallow the comment event.
	   *
	   * @param text
	   *          the comment text, ignored.
	   * @param augs
	   *          augmentations of the event, ignored.
	   * @see org.cyberneko.html.filters.DefaultFilter#comment(org.apache.xerces.xni.XMLString,
	   *      org.apache.xerces.xni.Augmentations)
	   */
	  @Override
	  public void comment(XMLString text, Augmentations augs) {
	    // intentionally empty: the superclass implementation is not called, so the comment goes no further
	  }
	}

	/**
	 * Filter that extracts metadata from META tags.
	 * <p>
	 * For each META tag with a "name" and a "content" attribute, the name is looked up in the META-to-attribute
	 * mapping of the enclosing pipelet. If a mapping exists, the content is added to the metadata map under
	 * the mapped attribute name. All events are passed on unchanged.
	 *
	 * @author jschumacher
	 *
	 */
	public class MetadataExtractor extends DefaultFilter {
	  /**
	   * attribute to value map extracted from the document.
	   */
	  private final MultiValueMap _metadata;

	  /**
	   * @param metadata
	   *          map to use as target for storing the attribute-value lists.
	   */
	  public MetadataExtractor(MultiValueMap metadata) {
	    super();
	    _metadata = metadata;
	  }

	  /**
	   * {@inheritDoc}
	   *
	   * @see org.cyberneko.html.filters.DefaultFilter#startElement(org.apache.xerces.xni.QName,
	   *      org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations)
	   */
	  @Override
	  public void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
	    super.startElement(element, attributes, augs);
	    extractMetadata(element, attributes);
	  }

	  /**
	   * {@inheritDoc}
	   *
	   * @see org.cyberneko.html.filters.DefaultFilter#emptyElement(org.apache.xerces.xni.QName,
	   *      org.apache.xerces.xni.XMLAttributes, org.apache.xerces.xni.Augmentations)
	   */
	  @Override
	  public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) {
	    super.emptyElement(element, attributes, augs);
	    extractMetadata(element, attributes);
	  }

	  /**
	   * Check whether the element is a META tag and, if its name is mapped, record its content.
	   *
	   * @param element
	   *          current element.
	   * @param attributes
	   *          attributes of the tag.
	   */
	  private void extractMetadata(QName element, XMLAttributes attributes) {
	    if (!"meta".equalsIgnoreCase(element.localpart)) {
	      return;
	    }
	    String metaName = null;
	    String metaValue = null;
	    for (int i = 0; i < attributes.getLength(); i++) {
	      final String attributeValue = attributes.getValue(i);
	      if (attributeValue == null) {
	        continue;
	      }
	      final String attributeName = attributes.getLocalName(i);
	      if ("name".equalsIgnoreCase(attributeName)) {
	        // fixed locale: with the default locale "TITLE" becomes "t\u0131tle" on Turkish systems and the
	        // lookup below would fail
	        metaName = attributeValue.trim().toLowerCase(java.util.Locale.ENGLISH);
	      } else if ("content".equalsIgnoreCase(attributeName)) {
	        metaValue = attributeValue;
	      }
	    }
	    if (metaName == null || metaValue == null) {
	      return;
	    }
	    final String targetAttribute = _metaAttributeMapping.get(metaName);
	    if (targetAttribute != null) {
	      _metadata.put(targetAttribute, metaValue);
	    }
	  }

	}
	}