| /********************************************************************************************************************* |
| * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| **********************************************************************************************************************/ |
| package org.eclipse.smila.importing.crawler.web; |
| |
| import java.io.InputStream; |
| import java.util.Iterator; |
| |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.Record; |
| import org.eclipse.smila.importing.ContentFetcher; |
| import org.eclipse.smila.importing.ImportingConstants; |
| import org.eclipse.smila.importing.compounds.CompoundExtractor; |
| import org.eclipse.smila.importing.compounds.CompoundExtractorException; |
| import org.eclipse.smila.importing.compounds.ExtractorWorkerBase; |
| import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration; |
| import org.eclipse.smila.importing.crawler.web.utils.DeltaHash; |
| import org.eclipse.smila.importing.util.PropertyNameMapper; |
| import org.eclipse.smila.taskworker.TaskContext; |
| |
| /** |
| * Worker that runs compound extraction inside web crawling workflows and converts the extracted entries into |
| * records that use the web crawler's attribute and attachment names. |
| */ |
| public class WebExtractorWorker extends ExtractorWorkerBase { |
| /** worker name, as returned by {@link #getName()}. */ |
| public static final String NAME = "webExtractor"; |
| |
| /** Fetcher service; set and cleared through the DS bind/unbind methods at the end of this class. */ |
| private Fetcher _fetcher; |
| |
| /** @return the constant {@link #NAME}. */ |
| @Override |
| public String getName() { |
| return NAME; |
| } |
| |
| /** |
| * Reads URL and mime type from the compound record and hands them, together with the content stream, to the |
| * extractor. |
| * |
| * @param extractor |
| * extractor to invoke. |
| * @param compoundRecord |
| * record describing the compound; its metadata is read using the mapped attribute names. |
| * @param compoundContent |
| * stream on the compound's content. |
| * @param taskContext |
| * task context from which the property name mapper is created. |
| * @return the iterator returned by the extractor. |
| * @throws CompoundExtractorException |
| * propagated from the extractor. |
| */ |
| @Override |
| protected Iterator<Record> invokeExtractor(final CompoundExtractor extractor, final Record compoundRecord, |
| final InputStream compoundContent, final TaskContext taskContext) throws CompoundExtractorException { |
| final PropertyNameMapper nameMapper = PropertyNameMapper.createFrom(taskContext); |
| final AnyMap compoundMetadata = compoundRecord.getMetadata(); |
| // The crawler already applied the mapping to this record, so the values are stored under the mapped |
| // names; the first mapped name of each property is used for the lookup. |
| final String mappedUrlName = nameMapper.get(WebCrawlerConstants.ATTRIBUTE_URL).get(0); |
| final String url = compoundMetadata.getStringValue(mappedUrlName); |
| final String mappedMimeTypeName = nameMapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE).get(0); |
| final String mimeType = compoundMetadata.getStringValue(mappedMimeTypeName); |
| return extractor.extract(compoundContent, url, mimeType, WebCrawlerConstants.ATTACHMENT_CONTENT); |
| } |
| |
| /** |
| * Converts a record produced by the extractor into a record using the web crawler's attribute names. |
| * |
| * @param compoundRecord |
| * the record of the compound that was extracted. |
| * @param extractedRecord |
| * a record delivered by the extractor. |
| * @param taskContext |
| * task context (not used by this implementation). |
| * @return the compound record itself if the extracted record carries the root-compound marker, otherwise a newly |
| * created record; in both cases content, size, last-modified and delta hash have been applied to it. |
| */ |
| @Override |
| protected Record convertRecord(final Record compoundRecord, final Record extractedRecord, |
| final TaskContext taskContext) { |
| final String dataSource = compoundRecord.getSource(); |
| final boolean representsCompoundItself = |
| extractedRecord.getMetadata().containsKey(CompoundExtractor.KEY_IS_ROOT_COMPOUND_RECORD); |
| // the marked record stands for the compound itself, so the existing compound record is reused |
| final Record result = |
| representsCompoundItself ? compoundRecord : newWebRecordForEntry(dataSource, extractedRecord); |
| |
| copyAttachment(extractedRecord, result, WebCrawlerConstants.ATTACHMENT_CONTENT); |
| copyAttribute(extractedRecord, CompoundExtractor.KEY_SIZE, result, WebCrawlerConstants.ATTRIBUTE_SIZE); |
| |
| // Last-modified is written in two steps: the compound's value goes in first as a fallback, then the |
| // extracted record's own time is copied onto the same attribute. |
| // NOTE(review): this relies on copyAttribute leaving the target untouched when the source has no value. |
| copyAttribute(compoundRecord, WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, result, |
| WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED); |
| copyAttribute(extractedRecord, CompoundExtractor.KEY_TIME, result, WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED); |
| |
| DeltaHash.calculate(result); |
| return result; |
| } |
| |
| /** |
| * Creates the record for an entry extracted from a compound. The record ID is the data source, a colon and the |
| * extracted record's ID; the URL attribute is built from the compounds value and the file name, using "/" as |
| * separator. |
| * |
| * @param dataSource |
| * source of the compound record, used as ID prefix and as source of the new record. |
| * @param extractedRecord |
| * record delivered by the extractor. |
| * @return the new record with its URL attribute set up. |
| */ |
| private Record newWebRecordForEntry(final String dataSource, final Record extractedRecord) { |
| final String entryId = dataSource + ":" + extractedRecord.getId(); |
| final Record entryRecord = extractedRecord.getFactory().createRecord(entryId, dataSource); |
| // the compounds come first and act as prefix of the entry's URL ... |
| copySetToStringAttribute(extractedRecord, CompoundExtractor.KEY_COMPOUNDS, entryRecord, |
| WebCrawlerConstants.ATTRIBUTE_URL, "/"); |
| // ... then the file name is appended to it |
| concatAttributeValues(extractedRecord, CompoundExtractor.KEY_FILE_NAME, entryRecord, |
| WebCrawlerConstants.ATTRIBUTE_URL, "/"); |
| return entryRecord; |
| } |
| |
| /** |
| * Decides whether a converted record is kept. Filters applied: |
| * <ul> |
| * <li>urlPatterns, matched against the record's URL attribute.</li> |
| * </ul> |
| * |
| * @param record |
| * record to check. |
| * @param taskContext |
| * task context whose task parameters may contain the filter definition. |
| * @return false only if filters are configured, the record has a URL attribute and that URL is not matched by |
| * the URL pattern matcher; true in all other cases. |
| */ |
| @Override |
| protected boolean filterRecord(final Record record, final TaskContext taskContext) { |
| final AnyMap filters = taskContext.getTaskParameters().getMap(ImportingConstants.TASK_PARAM_FILTERS); |
| if (filters == null) { |
| // no filters configured: every record passes |
| return true; |
| } |
| final FilterConfiguration configuration = new FilterConfiguration(filters); |
| final AnyMap metadata = record.getMetadata(); |
| if (!metadata.containsKey(WebCrawlerConstants.ATTRIBUTE_URL)) { |
| // without a URL there is nothing to match against |
| return true; |
| } |
| return configuration.getUrlPatternMatcher().matches( |
| metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_URL)); |
| } |
| |
| /** |
| * Applies the property name mapping created from the task context to the web crawler property names of the |
| * record. |
| * |
| * @param record |
| * record to map. |
| * @param taskContext |
| * task context from which the mapper is created. |
| */ |
| @Override |
| protected void mapRecord(final Record record, final TaskContext taskContext) { |
| PropertyNameMapper.createFrom(taskContext).mapNames(record, WebCrawlerConstants.PROPERTY_NAMES); |
| } |
| |
| /** @return the currently bound Fetcher service, or null if none is bound. */ |
| @Override |
| protected ContentFetcher getContentFetcher() { |
| return _fetcher; |
| } |
| |
| /** DS bind method: stores the given Fetcher service reference. */ |
| public void setFetcher(final Fetcher fetcher) { |
| _fetcher = fetcher; |
| } |
| |
| /** DS unbind method: clears the stored reference, but only if it is the instance being removed. */ |
| public void unsetFetcher(final Fetcher fetcher) { |
| if (_fetcher != fetcher) { |
| return; |
| } |
| _fetcher = null; |
| } |
| } |