// blob: c5f818cffad3a3a646caeff4debb5d9d9aacdf82 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2013, Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This program
* and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Daniel Stucky (Empolis Information Management GmbH) - initial implementation
*******************************************************************************/
package org.eclipse.smila.processing.pipelets;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.common.language.Language;
import org.eclipse.smila.common.language.LanguageIdentifyService;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.processing.Pipelet;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.util.ProcessingConstants;
import org.eclipse.smila.processing.util.ResultCollector;
import org.eclipse.smila.utils.service.ServiceUtils;
/**
 * Language Identify Pipelet.
 *
 * <p>
 * For each record the pipelet reads the text stored in the configured content attribute, passes it to a
 * {@link LanguageIdentifyService} and writes the identified ISO language and/or its alternative name into the
 * configured target attributes of the record metadata. If no language can be identified (or the identified language
 * is not certain while {@link #USE_CERTAIN_LANGUAGES_ONLY} is set), the configured default values are written
 * instead.
 * </p>
 *
 * @author Daniel Stucky
 */
public class LanguageIdentifyPipelet implements Pipelet {

  /**
   * Constant for the configuration property ContentAttribute.
   */
  public static final String CONTENT_ATTRIBUTE = "ContentAttribute";

  /**
   * Constant for the configuration property LanguageAttribute.
   */
  public static final String LANGUAGE_ATTRIBUTE = "LanguageAttribute";

  /**
   * Constant for the configuration property DefaultLanguage.
   */
  public static final String DEFAULT_LANGUAGE = "DefaultLanguage";

  /**
   * Constant for the configuration property AlternativeNameAttribute.
   */
  public static final String ALTERNATIVE_NAME_ATTRIBUTE = "AlternativeNameAttribute";

  /**
   * Constant for the configuration property DefaultAlternativeName.
   */
  public static final String DEFAULT_ALTERNATIVE_NAME = "DefaultAlternativeName";

  /**
   * Constant for the configuration property UseCertainLanguagesOnly.
   */
  public static final String USE_CERTAIN_LANGUAGES_ONLY = "UseCertainLanguagesOnly";

  /**
   * Immutable snapshot of the pipelet parameters as resolved for the current record of a
   * {@link ParameterAccessor}.
   */
  private static class Parameters {

    /** name of the attribute holding the text to analyze (required). */
    private final String _contentAttributeName;

    /** name of the attribute receiving the ISO language, may be null. */
    private final String _languageAttributeName;

    /** ISO language to store if identification yields no usable result, may be null. */
    private final String _defaultLanguage;

    /** name of the attribute receiving the alternative language name, may be null. */
    private final String _alternativeNameAttributeName;

    /** alternative name to store if identification yields no usable result, may be null. */
    private final String _defaultAlternativeName;

    /** if true, identified languages that are not certain are discarded in favor of the defaults. */
    private final boolean _useCertainLanguagesOnly;

    /**
     * Reads all parameters from the accessor.
     *
     * @param paramAccessor
     *          accessor positioned on the record to process
     * @throws ProcessingException
     *           if neither {@link #LANGUAGE_ATTRIBUTE} nor {@link #ALTERNATIVE_NAME_ATTRIBUTE} is configured
     *           (a missing {@link #CONTENT_ATTRIBUTE} is presumably rejected by the accessor itself)
     */
    Parameters(final ParameterAccessor paramAccessor) throws ProcessingException {
      _contentAttributeName = paramAccessor.getRequiredParameter(CONTENT_ATTRIBUTE);
      _languageAttributeName = paramAccessor.getParameter(LANGUAGE_ATTRIBUTE, null);
      _defaultLanguage = paramAccessor.getParameter(DEFAULT_LANGUAGE, null);
      _alternativeNameAttributeName = paramAccessor.getParameter(ALTERNATIVE_NAME_ATTRIBUTE, null);
      _defaultAlternativeName = paramAccessor.getParameter(DEFAULT_ALTERNATIVE_NAME, null);
      _useCertainLanguagesOnly = paramAccessor.getBooleanParameter(USE_CERTAIN_LANGUAGES_ONLY, false);
      // at least one target attribute is needed, otherwise there is nowhere to store a result.
      if (_languageAttributeName == null && _alternativeNameAttributeName == null) {
        throw new ProcessingException("One of the config properties " + LANGUAGE_ATTRIBUTE + " or "
          + ALTERNATIVE_NAME_ATTRIBUTE + " have to be specified!");
      }
    }
  }

  /**
   * local logger.
   */
  private final Log _log = LogFactory.getLog(getClass());

  /**
   * The configuration.
   */
  private AnyMap _configuration;

  /**
   * Language Identifier Service, looked up on first use.
   */
  private LanguageIdentifyService _languageIdentifier;

  /**
   * Stores the configuration; the individual parameters are resolved per record in
   * {@link #process(Blackboard, String[])}.
   *
   * {@inheritDoc}
   */
  @Override
  public void configure(final AnyMap configuration) throws ProcessingException {
    _configuration = configuration;
  }

  /**
   * Identifies the language of each given record. Any exception raised while handling a single record is reported
   * to the result collector for that record instead of aborting the loop.
   *
   * {@inheritDoc}
   */
  @Override
  public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException {
    final LanguageIdentifyService identifier = getLanguageIdentifier();
    final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _configuration);
    final ResultCollector resultCollector =
      new ResultCollector(paramAccessor, _log, ProcessingConstants.DROP_ON_ERROR_DEFAULT);
    for (final String id : recordIds) {
      try {
        // parameters are re-read for every record after positioning the accessor on it.
        paramAccessor.setCurrentRecord(id);
        final Parameters parameters = new Parameters(paramAccessor);
        identifyAndStore(blackboard, id, identifier, parameters);
        resultCollector.addResult(id);
      } catch (final Exception ex) {
        resultCollector.addFailedResult(id, ex);
      }
    }
    return resultCollector.getResultIds();
  }

  /**
   * Returns the language identify service, looking it up via {@link ServiceUtils} on first use and caching it
   * afterwards.
   *
   * @return a LanguageIdentifier service.
   *
   * @throws ProcessingException
   *           could not find a service
   */
  private synchronized LanguageIdentifyService getLanguageIdentifier() throws ProcessingException {
    if (_languageIdentifier == null) {
      try {
        _languageIdentifier = ServiceUtils.getService(LanguageIdentifyService.class);
      } catch (final Exception ex) {
        // lookup failure is only logged here; the null check below turns it into a ProcessingException.
        _log.warn("Error while waiting for LanguageIdentifier service to come up.", ex);
      }
      if (_languageIdentifier == null) {
        throw new ProcessingException("No LanguageIdentifier service available, giving up");
      }
    }
    return _languageIdentifier;
  }

  /**
   * Identifies the Language and stores it in the BlackboardService. Falls back to the configured defaults if the
   * record has no content attribute, if no language could be identified, or if the identified language is not
   * certain while only certain languages are accepted.
   *
   * @param blackboard
   *          the BlackboardService
   * @param id
   *          the Id
   * @param identifier
   *          LanguageIdentifier service to use.
   * @param p
   *          parameters resolved for the record
   * @throws BlackboardAccessException
   *           if any error occurs
   */
  private void identifyAndStore(final Blackboard blackboard, final String id,
    final LanguageIdentifyService identifier, final Parameters p) throws BlackboardAccessException {
    final AnyMap metaData = blackboard.getMetadata(id);

    if (!metaData.containsKey(p._contentAttributeName)) {
      if (_log.isWarnEnabled()) {
        _log.warn("Unable to identify Language for Id '" + id + "'. No input value found for '" + CONTENT_ATTRIBUTE
          + "'");
      }
      setDefaultLanguage(id, metaData, p);
      return;
    }

    final String text = metaData.getStringValue(p._contentAttributeName);
    final Language identifiedLanguage = identifier.identify(text);
    if (identifiedLanguage == null) {
      if (_log.isWarnEnabled()) {
        _log.warn("Unable to identify Language for Id '" + id + "'.");
      }
      setDefaultLanguage(id, metaData, p);
      return;
    }

    if (identifiedLanguage.isCertain() || !p._useCertainLanguagesOnly) {
      storeIdentifiedLanguage(id, identifiedLanguage, metaData, p);
      return;
    }

    // language was identified but is not certain and uncertain results are not accepted.
    if (_log.isInfoEnabled()) {
      _log.info("Detected language '" + identifiedLanguage.getIsoLanguage() + "' for Id '" + id
        + "' is not certain. Detected language is not used.");
    }
    setDefaultLanguage(id, metaData, p);
  }

  /**
   * set language attributes from identifiedLanguage. Only attributes whose name is configured are written; the
   * alternative name is additionally skipped if the identified language does not provide one.
   *
   * @param id
   *          record Id, used for logging only
   * @param identifiedLanguage
   *          the identified language, must not be null
   * @param metaData
   *          record metadata
   * @param p
   *          parameters resolved for the record
   */
  private void storeIdentifiedLanguage(final String id, final Language identifiedLanguage, final AnyMap metaData,
    final Parameters p) {
    final String language = identifiedLanguage.getIsoLanguage();
    final String alternativeName = identifiedLanguage.getAlternativeName();
    if (_log.isTraceEnabled()) {
      _log.trace("Detected language '" + language + "' for Id '" + id + "'.");
    }
    if (p._languageAttributeName != null) {
      metaData.put(p._languageAttributeName, language);
    }
    if (alternativeName != null && p._alternativeNameAttributeName != null) {
      metaData.put(p._alternativeNameAttributeName, alternativeName);
    }
  }

  /**
   * Set default language. A default is only written if both the default value and the name of its target attribute
   * are configured: only one of the two target attributes is mandatory, so the other attribute name may be null and
   * must not be used as a metadata key.
   *
   * @param id
   *          record Id
   * @param metaData
   *          record metadata
   * @param p
   *          parameters resolved for the record
   */
  private void setDefaultLanguage(final String id, final AnyMap metaData, final Parameters p) {
    if (p._defaultLanguage != null && p._languageAttributeName != null) {
      if (_log.isInfoEnabled()) {
        _log.info("Using default language '" + p._defaultLanguage + "' for Id '" + id + "'.");
      }
      metaData.put(p._languageAttributeName, p._defaultLanguage);
    }
    if (p._defaultAlternativeName != null && p._alternativeNameAttributeName != null) {
      if (_log.isInfoEnabled()) {
        _log.info("Using default alternativeName '" + p._defaultAlternativeName + "' for Id '" + id + "'.");
      }
      metaData.put(p._alternativeNameAttributeName, p._defaultAlternativeName);
    }
  }
}