// blob: 08292abd82c7681c5922c4b02900fbf4774f2e06 (source-viewer residue, not part of the original file)
/*********************************************************************************************************************
* Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
**********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web;
import java.io.InputStream;
import java.util.Iterator;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ContentFetcher;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.compounds.CompoundExtractorException;
import org.eclipse.smila.importing.compounds.ExtractorWorkerBase;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.DeltaHash;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.taskworker.TaskContext;
/** Worker that performs compound extraction within web crawling workflows. */
public class WebExtractorWorker extends ExtractorWorkerBase {

  /** Name of this worker, as returned by {@link #getName()}. */
  public static final String NAME = "webExtractor";

  /** Fetcher service reference; bound and unbound through the DS methods at the end of this class. */
  private Fetcher _fetcher;

  /** {@inheritDoc} */
  @Override
  public String getName() {
    return NAME;
  }

  /**
   * Runs the extractor on the compound content, using the URL and mime type stored in the compound record.
   *
   * @param extractor
   *          extractor to invoke.
   * @param compoundRecord
   *          record describing the compound; its attribute names are expected to be mapped already.
   * @param compoundContent
   *          stream providing the compound's content.
   * @param taskContext
   *          task context from which the property name mapping is created.
   * @return the records produced by the extractor.
   * @throws CompoundExtractorException
   *           propagated from the extractor.
   */
  @Override
  protected Iterator<Record> invokeExtractor(final CompoundExtractor extractor, final Record compoundRecord,
    final InputStream compoundContent, final TaskContext taskContext) throws CompoundExtractorException {
    final PropertyNameMapper nameMapper = PropertyNameMapper.createFrom(taskContext);
    // The web crawler mapped this record before it got here, hence the values have to be looked up
    // under the mapped attribute names (the first mapped name is used for each attribute).
    final AnyMap compoundMetadata = compoundRecord.getMetadata();
    final String mappedUrlName = nameMapper.get(WebCrawlerConstants.ATTRIBUTE_URL).get(0);
    final String compoundUrl = compoundMetadata.getStringValue(mappedUrlName);
    final String mappedMimeTypeName = nameMapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE).get(0);
    final String compoundMimeType = compoundMetadata.getStringValue(mappedMimeTypeName);
    return extractor.extract(compoundContent, compoundUrl, compoundMimeType,
      WebCrawlerConstants.ATTACHMENT_CONTENT);
  }

  /**
   * Converts a record delivered by the extractor into a web crawler record.
   * <p>
   * If the extracted record is flagged as the root compound record, the compound record itself is reused; otherwise
   * a new record is created for the extracted entry. In both cases content attachment, size and last-modified
   * information are copied into the result and the delta hash is calculated for it.
   *
   * @param compoundRecord
   *          record of the compound that has been extracted.
   * @param extractedRecord
   *          record produced by the extractor.
   * @param taskContext
   *          task context (not used here).
   * @return the converted record.
   */
  @Override
  protected Record convertRecord(final Record compoundRecord, final Record extractedRecord,
    final TaskContext taskContext) {
    final String dataSource = compoundRecord.getSource();
    final boolean isRootCompound =
      extractedRecord.getMetadata().containsKey(CompoundExtractor.KEY_IS_ROOT_COMPOUND_RECORD);
    // root compound: keep working on the compound's own record; anything else gets a record of its own.
    final Record result = isRootCompound ? compoundRecord : createRecordForExtractedEntry(extractedRecord, dataSource);

    copyAttachment(extractedRecord, result, WebCrawlerConstants.ATTACHMENT_CONTENT);
    copyAttribute(extractedRecord, CompoundExtractor.KEY_SIZE, result, WebCrawlerConstants.ATTRIBUTE_SIZE);

    // Last modified: start with the compound record's date as fallback, then apply the extracted
    // record's own time for the case that it provides one.
    copyAttribute(compoundRecord, WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, result,
      WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED);
    copyAttribute(extractedRecord, CompoundExtractor.KEY_TIME, result, WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED);

    DeltaHash.calculate(result);
    return result;
  }

  /**
   * Creates the record for an entry extracted from a compound. The record ID is built from data source and
   * extracted record ID, the URL attribute from the compounds path followed by the file name.
   *
   * @param extractedRecord
   *          record produced by the extractor for the entry.
   * @param dataSource
   *          source of the compound record.
   * @return the newly created record.
   */
  private Record createRecordForExtractedEntry(final Record extractedRecord, final String dataSource) {
    final Record entryRecord =
      extractedRecord.getFactory().createRecord(dataSource + ":" + extractedRecord.getId(), dataSource);
    // the compounds form the prefix of the entry's URL ...
    copySetToStringAttribute(extractedRecord, CompoundExtractor.KEY_COMPOUNDS, entryRecord,
      WebCrawlerConstants.ATTRIBUTE_URL, "/");
    // ... and the file name is appended to it.
    concatAttributeValues(extractedRecord, CompoundExtractor.KEY_FILE_NAME, entryRecord,
      WebCrawlerConstants.ATTRIBUTE_URL, "/");
    return entryRecord;
  }

  /**
   * Applies the configured filters to a record. Currently supported:
   * <ul>
   * <li>urlPatterns (matched against the record's URL attribute).</li>
   * </ul>
   *
   * @param record
   *          record to check.
   * @param taskContext
   *          task context providing the filter parameters.
   * @return false if the record has a URL attribute that the URL pattern matcher does not match, true otherwise
   *         (including the cases that no filters are configured or the record has no URL attribute).
   */
  @Override
  protected boolean filterRecord(final Record record, final TaskContext taskContext) {
    final AnyMap filters = taskContext.getTaskParameters().getMap(ImportingConstants.TASK_PARAM_FILTERS);
    if (filters == null) {
      return true;
    }
    final FilterConfiguration configuration = new FilterConfiguration(filters);
    if (!record.getMetadata().containsKey(WebCrawlerConstants.ATTRIBUTE_URL)) {
      return true;
    }
    return configuration.getUrlPatternMatcher().matches(
      record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
  }

  /**
   * Maps the web crawler property names of the record using the mapping created from the task context.
   *
   * @param record
   *          record whose property names are mapped.
   * @param taskContext
   *          task context from which the property name mapping is created.
   */
  @Override
  protected void mapRecord(final Record record, final TaskContext taskContext) {
    PropertyNameMapper.createFrom(taskContext).mapNames(record, WebCrawlerConstants.PROPERTY_NAMES);
  }

  /** {@inheritDoc} */
  @Override
  protected ContentFetcher getContentFetcher() {
    return _fetcher;
  }

  /**
   * DS service reference injection method.
   *
   * @param fetcher
   *          the Fetcher service to use.
   */
  public void setFetcher(final Fetcher fetcher) {
    _fetcher = fetcher;
  }

  /**
   * DS service reference removal method. The reference is only cleared if the given service is the one currently
   * set.
   *
   * @param fetcher
   *          the Fetcher service being removed.
   */
  public void unsetFetcher(final Fetcher fetcher) {
    if (_fetcher != fetcher) {
      return;
    }
    _fetcher = null;
  }
}