/*******************************************************************************
* Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt;
import org.eclipse.smila.importing.util.CrawlingContext;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.jobmanager.exceptions.JobManagerException;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.TaskLog;
/**
* Context holding information needed throughout most of the web crawling process of one task, like the mapper, the
* filter configuration, the visited and extracted URLs, and the cached robots.txt evaluators.
*/
public class WebCrawlingContext extends CrawlingContext {
/** the filter configuration for the current task. */
private final FilterConfiguration _filterConfiguration;
/** the id of the current input bulk. */
private String _currentInputBulkId;
/** the visited URLs. */
private final Set<String> _visitedUrls = new HashSet<>();
/** the extracted URLs. */
private final Set<String> _extractedUrls = new HashSet<>();
/** robots.txt evaluators for hosts visited in this task. */
private final Map<String, RobotsTxt> _robotsTxts = new HashMap<>();
/** number of links per output bulk. */
private int _linksPerBulk = WebCrawlerConstants.DEFAULT_LINKS_PER_BULK;
/** error handling on recoverable fetch errors. */
private ErrorHandling _errorHandling = ErrorHandling.DROP;
/** creates a crawling context from the taskContext. */
public WebCrawlingContext(final TaskContext taskContext) {
super(taskContext, false);
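// use the configured filters if given, otherwise fall back to an empty filter configuration.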
if (getTaskParameters().containsKey(ImportingConstants.TASK_PARAM_FILTERS)) {
_filterConfiguration =
new FilterConfiguration(getTaskParameters().getMap(ImportingConstants.TASK_PARAM_FILTERS));
} else {
_filterConfiguration = new FilterConfiguration(DataFactory.DEFAULT.createAnyMap());
}
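// optionally override the default number of links per output bulk.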
if (taskContext.getTaskParameters().containsKey(WebCrawlerConstants.TASK_PARAM_LINKS_PER_BULK)) {
_linksPerBulk =
taskContext.getTaskParameters().getLongValue(WebCrawlerConstants.TASK_PARAM_LINKS_PER_BULK).intValue();
}
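// optionally override the handling of recoverable fetch errors (default: DROP).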
if (taskContext.getTaskParameters().containsKey(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING)) {
_errorHandling =
ErrorHandling.valueOf(taskContext.getTaskParameters()
.getStringValue(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING).toUpperCase());
}
}
/**
* @return the filter configuration for the current task
*/
public FilterConfiguration getFilterConfiguration() {
return _filterConfiguration;
}
/**
* @return the task log of the current task
*/
public TaskLog getTaskLog() {
return getTaskContext().getLog();
}
/**
* @return the task parameters of the current task
*/
public AnyMap getTaskParameters() {
return getTaskContext().getTaskParameters();
}
/**
* @return the id of the current input bulk
*/
public String getCurrentInputBulkId() {
return _currentInputBulkId;
}
/**
* @param inputBulkId the id of the current input bulk to set
*/
public void setCurrentInputBulkId(final String inputBulkId) {
_currentInputBulkId = inputBulkId;
}
/**
* @return the URLs visited in this task
*/
public Set<String> getVisitedUrls() {
return _visitedUrls;
}
/** @return the number of links per output bulk. */
public int getLinksPerBulk() {
return _linksPerBulk;
}
/** @return how to handle recoverable fetch errors. */
public ErrorHandling getErrorHandling() {
return _errorHandling;
}
/**
* @return the set collecting the URLs that have already been written to linksToCrawl in this task, used for
* duplicate prevention
*/
public Set<String> getExtractedUrls() {
return _extractedUrls;
}
/** @return the robots.txt evaluator for the given host, taken from the context cache or loaded from job run data; null if not available. */
public RobotsTxt getRobotsTxt(final String hostAndPort, final JobRunDataProvider jobRunDataProvider) {
RobotsTxt robotsTxt = _robotsTxts.get(hostAndPort);
if (robotsTxt == null) {
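// not cached in this context yet: try to load a robots.txt stored earlier in the job run data.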
final byte[] binaryRobotsTxt = getStoredRobotsTxt(hostAndPort, jobRunDataProvider);
if (binaryRobotsTxt != null) {
robotsTxt = new RobotsTxt(binaryRobotsTxt);
_robotsTxts.put(hostAndPort, robotsTxt);
}
}
return robotsTxt;
}
/** get serialized robots.txt from job run data. */
private byte[] getStoredRobotsTxt(final String hostAndPort, final JobRunDataProvider jobRunDataProvider) {
if (jobRunDataProvider != null && getJobName() != null && getJobRunId() != null) {
try {
return jobRunDataProvider.getCustomData(getJobName(), getJobRunId(), WebCrawlerWorker.NAME, "robots.txt",
hostAndPort);
} catch (final JobManagerException ex) {
getTaskLog().warn("Failed to read robots.txt for " + hostAndPort + " from job run data: " + ex);
}
}
return null;
}
/** store robots.txt in context and job run data, if available. */
public void putRobotsTxt(final String hostAndPort, final RobotsTxt robotsTxt,
final JobRunDataProvider jobRunDataProvider) {
_robotsTxts.put(hostAndPort, robotsTxt);
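// also store the serialized robots.txt in the job run data, so that getStoredRobotsTxt() can read it back later.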
if (jobRunDataProvider != null && getJobName() != null && getJobRunId() != null) {
try {
jobRunDataProvider.setCustomData(getJobName(), getJobRunId(), WebCrawlerWorker.NAME, robotsTxt.asBinary(),
"robots.txt", hostAndPort);
} catch (final JobManagerException ex) {
getTaskLog().warn("Failed to store robots.txt for " + hostAndPort + " in job run data: " + ex);
}
}
}
}