// core/org.eclipse.smila.processing.pipelets.xmlprocessing/code/src/org/eclipse/smila/processing/pipelets/xmlprocessing/XmlDocumentSplitterPipelet.java - gerrit/smila/org.eclipse.smila.core - Git at Google

 /***********************************************************************************************************************
  * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
  * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
  * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors: Juergen Schumacher (Empolis Information Management GmbH) - initial implementation, based on
  * XmlSplitterPipelet
  **********************************************************************************************************************/

 package org.eclipse.smila.processing.pipelets.xmlprocessing;

 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;

 import javax.xml.namespace.QName;

 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.eclipse.smila.blackboard.Blackboard;
 import org.eclipse.smila.blackboard.Blackboard.Get;
 import org.eclipse.smila.blackboard.BlackboardAccessException;
 import org.eclipse.smila.datamodel.AnyMap;
 import org.eclipse.smila.datamodel.DataFactory;
 import org.eclipse.smila.datamodel.Record;
 import org.eclipse.smila.processing.ProcessingException;
 import org.eclipse.smila.processing.parameters.MissingParameterException;
 import org.eclipse.smila.processing.parameters.ParameterAccessor;
 import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
 import org.eclipse.smila.processing.pipelets.DocumentSplitterPipelet;
 import org.eclipse.smila.processing.util.ProcessingConstants;
 import org.eclipse.smila.processing.util.ResultCollector;
 import org.eclipse.smila.utils.xml.stax.XmlSnippetHandler;
 import org.eclipse.smila.utils.xml.stax.XmlSnippetSplitter;

 /**
  * The possible properties are:
  * <ul>
  * <li>beginTagName: the name of the tag to start the xml snippet</li>
  * <li>beginTagNamespace: the (optional) namespace of the tag to start the xml snippet</li>
  * <li>endTagName: the name of the tag to end the xml snippet</li>
  * <li>endTagNamespace: the (optional) namespace of the tag to end the xml snippet</li>
  * <li>inputName: name of the Attribute/Attachment to read the XML Document from.</li>
  * <li>outputName: name of the Attribute/Attachment to store the extracted value in</li>
  * <li>inputType: the type (Attribute or Attachment of the inputName). An input Attribute is not interpreted as content
  * but as a file path or an URL to the XML document</li>
  * <li>outputType: the type (Attribute or Attachment of the outputName)</li>
  * </ul>
  */
 public class XmlDocumentSplitterPipelet extends ATransformationPipelet {

   /** Constant for the property beginTagName. */
   public static final String PROP_BEGIN_TAG_NAME = "beginTagName";

   /** Constant for the property beginTagNamespace. */
   public static final String PROP_BEGIN_TAG_NAMESPACE = "beginTagNamespace";

   /** Constant for the property endTagName. */
   public static final String PROP_END_TAG_NAME = "endTagName";

   /** Constant for the property endTagNamespace. */
   public static final String PROP_END_TAG_NAMESPACE = "endTagNamespace";

   /**
    * {@inheritDoc}
    */
   @Override
   public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException {
     if (recordIds == null) {
       return recordIds;
     }
     final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _config);
     final ResultCollector resultCollector =
       new ResultCollector(paramAccessor, _log, ProcessingConstants.DROP_ON_ERROR_DEFAULT);
     for (final String id : recordIds) {
       try {
         paramAccessor.setCurrentRecord(id);
         final String beginTagName = paramAccessor.getRequiredParameter(PROP_BEGIN_TAG_NAME);
         final String beginTagNamespace = paramAccessor.getParameter(PROP_BEGIN_TAG_NAMESPACE, "");
         final String endTagName = paramAccessor.getParameter(PROP_END_TAG_NAME, beginTagName);
         final String endTagNamespace = paramAccessor.getParameter(PROP_END_TAG_NAMESPACE, beginTagNamespace);
         final QName beginTag = new QName(beginTagNamespace, beginTagName);
         final QName endTag = new QName(endTagNamespace, endTagName);
         final String outputName = getOutputName(paramAccessor);
         final boolean storeInAttribute = isStoreInAttribute(getOutputType(paramAccessor));
         final InternalHandler snippetHandler =
           new InternalHandler(blackboard, id, outputName, storeInAttribute, resultCollector);
         final XmlSnippetSplitter splitter = new XmlSnippetSplitter(snippetHandler, beginTag, endTag);
         final InputStream inputStream = getXmlInputStream(blackboard, id, paramAccessor);
         splitter.read(inputStream);
         if (_log.isInfoEnabled()) {
           _log.info("Created " + snippetHandler.getRecordCount() + " records from processing record " + id);
         }
       } catch (final Exception e) {
         resultCollector.addFailedResult(id, e);
       }
     }
     return resultCollector.getResultIds();
   }

   /** get XML input Stream. */
   private InputStream getXmlInputStream(final Blackboard blackboard, final String id,
     final ParameterAccessor paramAccessor) throws IOException, BlackboardAccessException, MissingParameterException {
     InputStream inputStream = null;
     if (isReadFromAttribute(getInputType(paramAccessor))) {
       inputStream = loadExternalInputStream(readStringInput(blackboard, id, paramAccessor));
     } else {
       inputStream = blackboard.getAttachmentAsStream(id, getInputName(paramAccessor));
     }
     return inputStream;
   }

   /**
    * Get the external InputStream to the given url or file path.
    *
    * @param attrtibuteValue
    *          the attrtibuteValue denoting an URL or file path
    * @return a InputStream or null
    * @throws IOException
    *           if any error occurs
    */
   private InputStream loadExternalInputStream(final String attrtibuteValue) throws IOException {
     InputStream stream = null;
     if (attrtibuteValue != null && attrtibuteValue.trim().length() > 0) {
       if (attrtibuteValue.startsWith("file")) {
         final URL url = new URL(attrtibuteValue);
         stream = new FileInputStream(url.getAuthority() + url.getPath());
       } else if (attrtibuteValue.startsWith("http")) {
         final URL url = new URL(attrtibuteValue);
         final HttpClient httpClient = new HttpClient();
         final GetMethod getMethod = new GetMethod(url.toString());
         httpClient.executeMethod(getMethod);
         stream = getMethod.getResponseBodyAsStream();
       } else {
         stream = new FileInputStream(attrtibuteValue);
       }
     } // if
     return stream;
   }

   class InternalHandler implements XmlSnippetHandler {

     private final Blackboard _blackboard;

     private final String _currentId;

     private final ResultCollector _resultCollector;

     private final String _outputName;

     private final boolean _storeInAttribute;

     private final AnyMap _cloneMetadata = DataFactory.DEFAULT.createAnyMap();

     private int _recordCount;

     private InternalHandler(final Blackboard blackboard, final String currentId, final String outputName,
       final boolean storeInAttribute, final ResultCollector resultCollector) throws BlackboardAccessException {
       _blackboard = blackboard;
       _currentId = currentId;
       _outputName = outputName;
       _storeInAttribute = storeInAttribute;
       _resultCollector = resultCollector;
       _cloneMetadata.putAll(_blackboard.getMetadata(_currentId));
       _cloneMetadata.remove(Record.RECORD_ID);
     }

     public int getRecordCount() {
       return _recordCount;
     }

     /**
      * {@inheritDoc}
      */
     @Override
     public void handleSnippet(final byte[] snippet) {
       final String snippetId = _currentId + DocumentSplitterPipelet.SPLIT_ID_SEPARATOR + _recordCount++;
       try {
         final Record snippetRecord = _blackboard.getRecord(snippetId, Get.NEW);
         snippetRecord.getMetadata().put(DocumentSplitterPipelet.DOCUMENT_ID, _currentId);
         snippetRecord.getMetadata().putAll(_cloneMetadata);
         if (_storeInAttribute) {
           snippetRecord.getMetadata().put(_outputName, new String(snippet, ENCODING_CHARSET));
         } else {
           _blackboard.setAttachment(snippetId, _outputName, snippet);
         }
         _resultCollector.addResult(snippetId);
       } catch (final Exception ex) {
         _log.warn("Error creating XML-snippet record", ex);
       }
     }
   }
 }
	/***********************************************************************************************************************
	* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
	* program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
	* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors: Juergen Schumacher (Empolis Information Management GmbH) - initial implementation, based on
	* XmlSplitterPipelet
	**********************************************************************************************************************/

	package org.eclipse.smila.processing.pipelets.xmlprocessing;

	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URL;

	import javax.xml.namespace.QName;

	import org.apache.commons.httpclient.HttpClient;
	import org.apache.commons.httpclient.methods.GetMethod;
	import org.eclipse.smila.blackboard.Blackboard;
	import org.eclipse.smila.blackboard.Blackboard.Get;
	import org.eclipse.smila.blackboard.BlackboardAccessException;
	import org.eclipse.smila.datamodel.AnyMap;
	import org.eclipse.smila.datamodel.DataFactory;
	import org.eclipse.smila.datamodel.Record;
	import org.eclipse.smila.processing.ProcessingException;
	import org.eclipse.smila.processing.parameters.MissingParameterException;
	import org.eclipse.smila.processing.parameters.ParameterAccessor;
	import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
	import org.eclipse.smila.processing.pipelets.DocumentSplitterPipelet;
	import org.eclipse.smila.processing.util.ProcessingConstants;
	import org.eclipse.smila.processing.util.ResultCollector;
	import org.eclipse.smila.utils.xml.stax.XmlSnippetHandler;
	import org.eclipse.smila.utils.xml.stax.XmlSnippetSplitter;

	/**
	* The possible properties are:
	* <ul>
	* <li>beginTagName: the name of the tag to start the xml snippet</li>
	* <li>beginTagNamespace: the (optional) namespace of the tag to start the xml snippet</li>
	* <li>endTagName: the name of the tag to end the xml snippet</li>
	* <li>endTagNamespace: the (optional) namespace of the tag to end the xml snippet</li>
	* <li>inputName: name of the Attribute/Attachment to read the XML Document from.</li>
	* <li>outputName: name of the Attribute/Attachment to store the extracted value in</li>
	* <li>inputType: the type (Attribute or Attachment of the inputName). An input Attribute is not interpreted as content
	* but as a file path or an URL to the XML document</li>
	* <li>outputType: the type (Attribute or Attachment of the outputName)</li>
	* </ul>
	*/
	public class XmlDocumentSplitterPipelet extends ATransformationPipelet {

	/** Constant for the property beginTagName. */
	public static final String PROP_BEGIN_TAG_NAME = "beginTagName";

	/** Constant for the property beginTagNamespace. */
	public static final String PROP_BEGIN_TAG_NAMESPACE = "beginTagNamespace";

	/** Constant for the property endTagName. */
	public static final String PROP_END_TAG_NAME = "endTagName";

	/** Constant for the property endTagNamespace. */
	public static final String PROP_END_TAG_NAMESPACE = "endTagNamespace";

	/**
	* {@inheritDoc}
	*/
	@Override
	public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException {
	if (recordIds == null) {
	return recordIds;
	}
	final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _config);
	final ResultCollector resultCollector =
	new ResultCollector(paramAccessor, _log, ProcessingConstants.DROP_ON_ERROR_DEFAULT);
	for (final String id : recordIds) {
	try {
	paramAccessor.setCurrentRecord(id);
	final String beginTagName = paramAccessor.getRequiredParameter(PROP_BEGIN_TAG_NAME);
	final String beginTagNamespace = paramAccessor.getParameter(PROP_BEGIN_TAG_NAMESPACE, "");
	final String endTagName = paramAccessor.getParameter(PROP_END_TAG_NAME, beginTagName);
	final String endTagNamespace = paramAccessor.getParameter(PROP_END_TAG_NAMESPACE, beginTagNamespace);
	final QName beginTag = new QName(beginTagNamespace, beginTagName);
	final QName endTag = new QName(endTagNamespace, endTagName);
	final String outputName = getOutputName(paramAccessor);
	final boolean storeInAttribute = isStoreInAttribute(getOutputType(paramAccessor));
	final InternalHandler snippetHandler =
	new InternalHandler(blackboard, id, outputName, storeInAttribute, resultCollector);
	final XmlSnippetSplitter splitter = new XmlSnippetSplitter(snippetHandler, beginTag, endTag);
	final InputStream inputStream = getXmlInputStream(blackboard, id, paramAccessor);
	splitter.read(inputStream);
	if (_log.isInfoEnabled()) {
	_log.info("Created " + snippetHandler.getRecordCount() + " records from processing record " + id);
	}
	} catch (final Exception e) {
	resultCollector.addFailedResult(id, e);
	}
	}
	return resultCollector.getResultIds();
	}

	/** get XML input Stream. */
	private InputStream getXmlInputStream(final Blackboard blackboard, final String id,
	final ParameterAccessor paramAccessor) throws IOException, BlackboardAccessException, MissingParameterException {
	InputStream inputStream = null;
	if (isReadFromAttribute(getInputType(paramAccessor))) {
	inputStream = loadExternalInputStream(readStringInput(blackboard, id, paramAccessor));
	} else {
	inputStream = blackboard.getAttachmentAsStream(id, getInputName(paramAccessor));
	}
	return inputStream;
	}

	/**
	* Get the external InputStream to the given url or file path.
	*
	* @param attrtibuteValue
	* the attrtibuteValue denoting an URL or file path
	* @return a InputStream or null
	* @throws IOException
	* if any error occurs
	*/
	private InputStream loadExternalInputStream(final String attrtibuteValue) throws IOException {
	InputStream stream = null;
	if (attrtibuteValue != null && attrtibuteValue.trim().length() > 0) {
	if (attrtibuteValue.startsWith("file")) {
	final URL url = new URL(attrtibuteValue);
	stream = new FileInputStream(url.getAuthority() + url.getPath());
	} else if (attrtibuteValue.startsWith("http")) {
	final URL url = new URL(attrtibuteValue);
	final HttpClient httpClient = new HttpClient();
	final GetMethod getMethod = new GetMethod(url.toString());
	httpClient.executeMethod(getMethod);
	stream = getMethod.getResponseBodyAsStream();
	} else {
	stream = new FileInputStream(attrtibuteValue);
	}
	} // if
	return stream;
	}

	class InternalHandler implements XmlSnippetHandler {

	private final Blackboard _blackboard;

	private final String _currentId;

	private final ResultCollector _resultCollector;

	private final String _outputName;

	private final boolean _storeInAttribute;

	private final AnyMap _cloneMetadata = DataFactory.DEFAULT.createAnyMap();

	private int _recordCount;

	private InternalHandler(final Blackboard blackboard, final String currentId, final String outputName,
	final boolean storeInAttribute, final ResultCollector resultCollector) throws BlackboardAccessException {
	_blackboard = blackboard;
	_currentId = currentId;
	_outputName = outputName;
	_storeInAttribute = storeInAttribute;
	_resultCollector = resultCollector;
	_cloneMetadata.putAll(_blackboard.getMetadata(_currentId));
	_cloneMetadata.remove(Record.RECORD_ID);
	}

	public int getRecordCount() {
	return _recordCount;
	}

	/**
	* {@inheritDoc}
	*/
	@Override
	public void handleSnippet(final byte[] snippet) {
	final String snippetId = _currentId + DocumentSplitterPipelet.SPLIT_ID_SEPARATOR + _recordCount++;
	try {
	final Record snippetRecord = _blackboard.getRecord(snippetId, Get.NEW);
	snippetRecord.getMetadata().put(DocumentSplitterPipelet.DOCUMENT_ID, _currentId);
	snippetRecord.getMetadata().putAll(_cloneMetadata);
	if (_storeInAttribute) {
	snippetRecord.getMetadata().put(_outputName, new String(snippet, ENCODING_CHARSET));
	} else {
	_blackboard.setAttachment(snippetId, _outputName, snippet);
	}
	_resultCollector.addResult(snippetId);
	} catch (final Exception ex) {
	_log.warn("Error creating XML-snippet record", ex);
	}
	}
	}
	}