blob: 468c90cbd268c2e8aea9f94e921cdecbe511ef57 [file] [log] [blame]
/***********************************************************************************************************************
* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
* program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Empolis Information Management GmbH) - initial implementation, based on
* XmlSplitterPipelet
**********************************************************************************************************************/
package org.eclipse.smila.processing.pipelets.xmlprocessing;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import javax.xml.namespace.QName;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.Blackboard.Get;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.parameters.MissingParameterException;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
import org.eclipse.smila.processing.pipelets.DocumentSplitterPipelet;
import org.eclipse.smila.processing.util.ProcessingConstants;
import org.eclipse.smila.processing.util.ResultCollector;
import org.eclipse.smila.utils.xml.stax.XmlSnippetHandler;
import org.eclipse.smila.utils.xml.stax.XmlSnippetSplitter;
/**
* Pipelet that splits an XML document into snippets and creates a new record for each snippet. A snippet starts at the
* configured begin tag and ends at the configured end tag.
* <p>
* The possible properties are:
* <ul>
* <li>beginTagName: the name of the tag that starts an XML snippet (required)</li>
* <li>beginTagNamespace: the (optional) namespace of the tag that starts an XML snippet</li>
* <li>endTagName: the name of the tag that ends an XML snippet; defaults to beginTagName</li>
* <li>endTagNamespace: the (optional) namespace of the tag that ends an XML snippet; defaults to beginTagNamespace</li>
* <li>inputName: name of the Attribute/Attachment to read the XML document from</li>
* <li>outputName: name of the Attribute/Attachment to store each extracted snippet in</li>
* <li>inputType: the type of inputName (Attribute or Attachment). An input Attribute is not interpreted as content
* but as a file path or a URL to the XML document</li>
* <li>outputType: the type of outputName (Attribute or Attachment)</li>
* </ul>
*/
public class XmlDocumentSplitterPipelet extends ATransformationPipelet {
/** Constant for the property beginTagName. */
public static final String PROP_BEGIN_TAG_NAME = "beginTagName";
/** Constant for the property beginTagNamespace. */
public static final String PROP_BEGIN_TAG_NAMESPACE = "beginTagNamespace";
/** Constant for the property endTagName. */
public static final String PROP_END_TAG_NAME = "endTagName";
/** Constant for the property endTagNamespace. */
public static final String PROP_END_TAG_NAMESPACE = "endTagNamespace";
/**
* {@inheritDoc}
*/
@Override
public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException {
if (recordIds == null) {
return recordIds;
}
final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _config);
final ResultCollector resultCollector =
new ResultCollector(paramAccessor, _log, ProcessingConstants.DROP_ON_ERROR_DEFAULT);
for (final String id : recordIds) {
try {
paramAccessor.setCurrentRecord(id);
final String beginTagName = paramAccessor.getRequiredParameter(PROP_BEGIN_TAG_NAME);
final String beginTagNamespace = paramAccessor.getParameter(PROP_BEGIN_TAG_NAMESPACE, "");
final String endTagName = paramAccessor.getParameter(PROP_END_TAG_NAME, beginTagName);
final String endTagNamespace = paramAccessor.getParameter(PROP_END_TAG_NAMESPACE, beginTagNamespace);
final QName beginTag = new QName(beginTagNamespace, beginTagName);
final QName endTag = new QName(endTagNamespace, endTagName);
final String outputName = getOutputName(paramAccessor);
final boolean storeInAttribute = isStoreInAttribute(getOutputType(paramAccessor));
final InternalHandler snippetHandler =
new InternalHandler(blackboard, id, outputName, storeInAttribute, resultCollector);
final XmlSnippetSplitter splitter = new XmlSnippetSplitter(snippetHandler, beginTag, endTag);
final InputStream inputStream = getXmlInputStream(blackboard, id, paramAccessor);
splitter.read(inputStream);
if (_log.isInfoEnabled()) {
_log.info("Created " + snippetHandler.getRecordCount() + " records from processing record " + id);
}
} catch (final Exception e) {
resultCollector.addFailedResult(id, e);
}
}
return resultCollector.getResultIds();
}
/** get XML input Stream. */
private InputStream getXmlInputStream(final Blackboard blackboard, final String id,
final ParameterAccessor paramAccessor) throws IOException, BlackboardAccessException, MissingParameterException {
InputStream inputStream = null;
if (isReadFromAttribute(getInputType(paramAccessor))) {
inputStream = loadExternalInputStream(readStringInput(blackboard, id, paramAccessor));
} else {
inputStream = blackboard.getAttachmentAsStream(id, getInputName(paramAccessor));
}
return inputStream;
}
/**
* Get the external InputStream to the given url or file path.
*
* @param attrtibuteValue
* the attrtibuteValue denoting an URL or file path
* @return a InputStream or null
* @throws IOException
* if any error occurs
*/
private InputStream loadExternalInputStream(final String attrtibuteValue) throws IOException {
InputStream stream = null;
if (attrtibuteValue != null && attrtibuteValue.trim().length() > 0) {
if (attrtibuteValue.startsWith("file")) {
final URL url = new URL(attrtibuteValue);
stream = new FileInputStream(url.getAuthority() + url.getPath());
} else if (attrtibuteValue.startsWith("http")) {
final URL url = new URL(attrtibuteValue);
final HttpClient httpClient = new HttpClient();
final GetMethod getMethod = new GetMethod(url.toString());
httpClient.executeMethod(getMethod);
stream = getMethod.getResponseBodyAsStream();
} else {
stream = new FileInputStream(attrtibuteValue);
}
} // if
return stream;
}
class InternalHandler implements XmlSnippetHandler {
private final Blackboard _blackboard;
private final String _currentId;
private final ResultCollector _resultCollector;
private final String _outputName;
private final boolean _storeInAttribute;
private final AnyMap _cloneMetadata = DataFactory.DEFAULT.createAnyMap();
private int _recordCount;
private InternalHandler(final Blackboard blackboard, final String currentId, final String outputName,
final boolean storeInAttribute, final ResultCollector resultCollector) throws BlackboardAccessException {
_blackboard = blackboard;
_currentId = currentId;
_outputName = outputName;
_storeInAttribute = storeInAttribute;
_resultCollector = resultCollector;
_cloneMetadata.putAll(_blackboard.getMetadata(_currentId));
_cloneMetadata.remove(Record.RECORD_ID);
}
public int getRecordCount() {
return _recordCount;
}
/**
* {@inheritDoc}
*/
@Override
public void handleSnippet(final byte[] snippet) {
final String snippetId = _currentId + DocumentSplitterPipelet.SPLIT_ID_SEPARATOR + _recordCount++;
try {
final Record snippetRecord = _blackboard.getRecord(snippetId, Get.NEW);
snippetRecord.getMetadata().put(DocumentSplitterPipelet.DOCUMENT_ID, _currentId);
snippetRecord.getMetadata().putAll(_cloneMetadata);
if (_storeInAttribute) {
snippetRecord.getMetadata().put(_outputName, new String(snippet, ENCODING_CHARSET));
} else {
_blackboard.setAttachment(snippetId, _outputName, snippet);
}
_resultCollector.addResult(snippetId);
} catch (final Exception ex) {
_log.warn("Error creating XML-snippet record", ex);
}
}
}
}