| /******************************************************************************* |
| * Copyright (c) 2013, Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This program |
| * and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which |
| * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Daniel Stucky (Empolis Information Management GmbH) - initial implementation |
| *******************************************************************************/ |
| package org.eclipse.smila.processing.pipelets; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.eclipse.smila.blackboard.Blackboard; |
| import org.eclipse.smila.blackboard.BlackboardAccessException; |
| import org.eclipse.smila.common.language.Language; |
| import org.eclipse.smila.common.language.LanguageIdentifyService; |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.processing.Pipelet; |
| import org.eclipse.smila.processing.ProcessingException; |
| import org.eclipse.smila.processing.parameters.ParameterAccessor; |
| import org.eclipse.smila.processing.util.ProcessingConstants; |
| import org.eclipse.smila.processing.util.ResultCollector; |
| import org.eclipse.smila.utils.service.ServiceUtils; |
| |
| /** |
| * Language Identify Pipelet. |
| * |
| * @author Daniel Stucky |
| */ |
| public class LanguageIdentifyPipelet implements Pipelet { |
| |
| /** |
| * Constant for the configuration property ContentAttribute. |
| */ |
| public static final String CONTENT_ATTRIBUTE = "ContentAttribute"; |
| |
| /** |
| * Constant for the configuration property LanguageAttribute. |
| */ |
| public static final String LANGUAGE_ATTRIBUTE = "LanguageAttribute"; |
| |
| /** |
| * Constant for the configuration property DefaultLanguage. |
| */ |
| public static final String DEFAULT_LANGUAGE = "DefaultLanguage"; |
| |
| /** |
| * Constant for the configuration property AlternativeNameAttribute. |
| */ |
| public static final String ALTERNATIVE_NAME_ATTRIBUTE = "AlternativeNameAttribute"; |
| |
| /** |
| * Constant for the configuration property DefaultAlternativeName. |
| */ |
| public static final String DEFAULT_ALTERNATIVE_NAME = "DefaultAlternativeName"; |
| |
| /** |
| * Constant for the configuration property UseCertainLanguagesOnly. |
| */ |
| public static final String USE_CERTAIN_LANGUAGES_ONLY = "UseCertainLanguagesOnly"; |
| |
| private static class Parameters { |
| private final String _contentAttributeName; |
| |
| private final String _languageAttributeName; |
| |
| private final String _defaultLanguage; |
| |
| private final String _alternativeNameAttributeName; |
| |
| private final String _defaultAlternativeName; |
| |
| private final boolean _useCertainLanguagesOnly; |
| |
| Parameters(final ParameterAccessor paramAccessor) throws ProcessingException { |
| _contentAttributeName = paramAccessor.getRequiredParameter(CONTENT_ATTRIBUTE); |
| _languageAttributeName = paramAccessor.getParameter(LANGUAGE_ATTRIBUTE, null); |
| _defaultLanguage = paramAccessor.getParameter(DEFAULT_LANGUAGE, null); |
| _alternativeNameAttributeName = paramAccessor.getParameter(ALTERNATIVE_NAME_ATTRIBUTE, null); |
| _defaultAlternativeName = paramAccessor.getParameter(DEFAULT_ALTERNATIVE_NAME, null); |
| _useCertainLanguagesOnly = paramAccessor.getBooleanParameter(USE_CERTAIN_LANGUAGES_ONLY, false); |
| if (_languageAttributeName == null && _alternativeNameAttributeName == null) { |
| throw new ProcessingException("One of the config properties " + LANGUAGE_ATTRIBUTE + " or " |
| + ALTERNATIVE_NAME_ATTRIBUTE + " have to be specified!"); |
| } |
| } |
| } |
| |
| /** |
| * local logger. |
| */ |
| private final Log _log = LogFactory.getLog(getClass()); |
| |
| /** |
| * The configuration. |
| */ |
| private AnyMap _configuration; |
| |
| /** |
| * Language Identifier Service. |
| */ |
| private LanguageIdentifyService _languageIdentifier; |
| |
| /** |
| * read configuration parameters. |
| * |
| * {@inheritDoc} |
| */ |
| @Override |
| public void configure(final AnyMap configuration) throws ProcessingException { |
| _configuration = configuration; |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException { |
| // process records |
| final LanguageIdentifyService identifier = getLanguageIdentifier(); |
| final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _configuration); |
| final ResultCollector resultCollector = |
| new ResultCollector(paramAccessor, _log, ProcessingConstants.DROP_ON_ERROR_DEFAULT); |
| for (final String id : recordIds) { |
| try { |
| paramAccessor.setCurrentRecord(id); |
| final Parameters parameters = new Parameters(paramAccessor); |
| identifyAndStore(blackboard, id, identifier, parameters); |
| resultCollector.addResult(id); |
| } catch (final Exception ex) { |
| resultCollector.addFailedResult(id, ex); |
| } |
| } // for |
| return resultCollector.getResultIds(); |
| } |
| |
| /** |
| * @return a LanguageIdentifier service. |
| * |
| * @throws ProcessingException |
| * could not find a service |
| */ |
| private synchronized LanguageIdentifyService getLanguageIdentifier() throws ProcessingException { |
| if (_languageIdentifier == null) { |
| try { |
| _languageIdentifier = ServiceUtils.getService(LanguageIdentifyService.class); |
| } catch (final Exception ex) { |
| _log.warn("Error while waiting for LanguageIdentifier service to come up.", ex); |
| } |
| if (_languageIdentifier == null) { |
| throw new ProcessingException("No LanguageIdentifier service available, giving up"); |
| } |
| } |
| return _languageIdentifier; |
| } |
| |
| /** |
| * Identifies the Language and stores it in the BlackboardService. |
| * |
| * @param blackboard |
| * the BlackboardService |
| * @param id |
| * the Id |
| * @param identifier |
| * LanguageIdentifier service to use. |
| * @throws BlackboardAccessException |
| * if any error occurs |
| */ |
| private void identifyAndStore(final Blackboard blackboard, final String id, |
| final LanguageIdentifyService identifier, final Parameters p) throws BlackboardAccessException { |
| final AnyMap metaData = blackboard.getMetadata(id); |
| if (metaData.containsKey(p._contentAttributeName)) { |
| final String text = metaData.getStringValue(p._contentAttributeName); |
| final Language identifiedLanguage = identifier.identify(text); |
| if (identifiedLanguage != null) { |
| if (identifiedLanguage.isCertain() || !p._useCertainLanguagesOnly) { |
| storeIdentifiedLanguage(id, identifiedLanguage, metaData, p); |
| } else { |
| if (_log.isInfoEnabled()) { |
| _log.info("Detected language '" + identifiedLanguage.getIsoLanguage() + "' for Id '" + id |
| + "' is not certain. Detected language is not used."); |
| } |
| setDefaultLanguage(id, metaData, p); |
| } |
| } else { |
| if (_log.isWarnEnabled()) { |
| _log.warn("Unable to identify Language for Id '" + id + "'."); |
| } |
| setDefaultLanguage(id, metaData, p); |
| } |
| } else { |
| if (_log.isWarnEnabled()) { |
| _log.warn("Unable to identify Language for Id '" + id + "'. No input value found for '" + CONTENT_ATTRIBUTE |
| + "'"); |
| } |
| setDefaultLanguage(id, metaData, p); |
| } |
| } |
| |
| /** |
| * set language attributes from identifiedLanguage. |
| */ |
| private void storeIdentifiedLanguage(final String id, final Language identifiedLanguage, final AnyMap metaData, |
| final Parameters p) { |
| final String language = identifiedLanguage.getIsoLanguage(); |
| final String alternativeName = identifiedLanguage.getAlternativeName(); |
| if (_log.isTraceEnabled()) { |
| _log.trace("Detected language '" + language + "' for Id '" + id + "'."); |
| } |
| |
| if (p._languageAttributeName != null) { |
| metaData.put(p._languageAttributeName, language); |
| } |
| if (alternativeName != null && p._alternativeNameAttributeName != null) { |
| metaData.put(p._alternativeNameAttributeName, alternativeName); |
| } |
| } |
| |
| /** |
| * Set default language. |
| * |
| * @param id |
| * record Id |
| * @param metaData |
| * record metadata |
| */ |
| private void setDefaultLanguage(final String id, final AnyMap metaData, final Parameters p) { |
| if (p._defaultLanguage != null) { |
| if (_log.isInfoEnabled()) { |
| _log.info("Using default language '" + p._defaultLanguage + "' for Id '" + id + "'."); |
| } |
| metaData.put(p._languageAttributeName, p._defaultLanguage); |
| } |
| if (p._defaultAlternativeName != null) { |
| if (_log.isInfoEnabled()) { |
| _log.info("Using default alternativeName '" + p._defaultAlternativeName + "' for Id '" + id + "'."); |
| } |
| metaData.put(p._alternativeNameAttributeName, p._defaultAlternativeName); |
| } |
| } |
| } |