| /******************************************************************************* |
| * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.smila.importing.crawler.web; |
| |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.DataFactory; |
| import org.eclipse.smila.importing.ImportingConstants; |
| import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling; |
| import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration; |
| import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt; |
| import org.eclipse.smila.importing.util.CrawlingContext; |
| import org.eclipse.smila.jobmanager.JobRunDataProvider; |
| import org.eclipse.smila.jobmanager.exceptions.JobManagerException; |
| import org.eclipse.smila.taskworker.TaskContext; |
| import org.eclipse.smila.taskworker.TaskLog; |
| |
| /** |
| * Context holding information needed throughout most of the web crawling process of one task, e.g. the mapper, the |
| * filter configuration and the robots.txt evaluators. |
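| * <p> |
| * A minimal usage sketch (assuming a {@link TaskContext} supplied by the task worker framework; variable names are |
| * illustrative): |
| * |
| * <pre> |
| * final WebCrawlingContext context = new WebCrawlingContext(taskContext); |
| * final FilterConfiguration filters = context.getFilterConfiguration(); |
| * final int linksPerBulk = context.getLinksPerBulk(); |
| * final ErrorHandling errorHandling = context.getErrorHandling(); |
| * </pre> |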
| */ |
| public class WebCrawlingContext extends CrawlingContext { |
| |
| /** the filter configuration for the current task. */ |
| private final FilterConfiguration _filterConfiguration; |
| |
| /** the id of the current input bulk. */ |
| private String _currentInputBulkId; |
| |
| /** the visited URLs. */ |
| private final Set<String> _visitedUrls = new HashSet<>(); |
| |
| /** the extracted URLs. */ |
| private final Set<String> _extractedUrls = new HashSet<>(); |
| |
| /** robots.txt evaluators for hosts visited in this task. */ |
| private final Map<String, RobotsTxt> _robotsTxts = new HashMap<>(); |
| |
| /** number of links per output bulk. */ |
| private int _linksPerBulk = WebCrawlerConstants.DEFAULT_LINKS_PER_BULK; |
| |
| /** error handling on recoverable fetch errors. */ |
| private ErrorHandling _errorHandling = ErrorHandling.DROP; |
| |
| /** creates a crawling context from the taskContext. */ |
| public WebCrawlingContext(final TaskContext taskContext) { |
| super(taskContext, false); |
| if (getTaskParameters().containsKey(ImportingConstants.TASK_PARAM_FILTERS)) { |
| _filterConfiguration = |
| new FilterConfiguration(getTaskParameters().getMap(ImportingConstants.TASK_PARAM_FILTERS)); |
| } else { |
| _filterConfiguration = new FilterConfiguration(DataFactory.DEFAULT.createAnyMap()); |
| } |
| if (taskContext.getTaskParameters().containsKey(WebCrawlerConstants.TASK_PARAM_LINKS_PER_BULK)) { |
| _linksPerBulk = |
| taskContext.getTaskParameters().getLongValue(WebCrawlerConstants.TASK_PARAM_LINKS_PER_BULK).intValue(); |
| } |
| if (taskContext.getTaskParameters().containsKey(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING)) { |
| _errorHandling = |
| ErrorHandling.valueOf(taskContext.getTaskParameters() |
| .getStringValue(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING).toUpperCase()); |
| } |
| } |
| |
| /** |
| * @return the filter configuration for the current task |
| */ |
| public FilterConfiguration getFilterConfiguration() { |
| return _filterConfiguration; |
| } |
| |
| /** |
| * @return the task log of the current task context |
| */ |
| public TaskLog getTaskLog() { |
| return getTaskContext().getLog(); |
| } |
| |
| /** |
| * @return the task parameters of the current task context |
| */ |
| public AnyMap getTaskParameters() { |
| return getTaskContext().getTaskParameters(); |
| } |
| |
| /** |
| * @return the id of the current input bulk |
| */ |
| public String getCurrentInputBulkId() { |
| return _currentInputBulkId; |
| } |
| |
| /** |
| * @param inputBulkId the id of the current input bulk |
| */ |
| public void setCurrentInputBulkId(final String inputBulkId) { |
| _currentInputBulkId = inputBulkId; |
| } |
| |
| /** |
| * @return the URLs visited in this task |
| */ |
| public Set<String> getVisitedUrls() { |
| return _visitedUrls; |
| } |
| |
| /** @return the number of links per output bulk. */ |
| public int getLinksPerBulk() { |
| return _linksPerBulk; |
| } |
| |
| /** @return how to handle recoverable fetch errors. */ |
| public ErrorHandling getErrorHandling() { |
| return _errorHandling; |
| } |
| |
| /** |
| * @return the set of URLs that have already been written to linksToCrawl in this task, used for duplicate |
| * prevention. |
| */ |
| public Set<String> getExtractedUrls() { |
| return _extractedUrls; |
| } |
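| // Duplicate-prevention sketch (hedged; writeLinkToCrawl() is a hypothetical caller-side helper): |
| // |
| // if (context.getExtractedUrls().add(url)) { |
| // writeLinkToCrawl(url); // Set.add() returns false for URLs already extracted earlier in this task |
| // } |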
| |
| /** @return robots.txt evaluator for the given host, from the local cache or the job run data; null if not yet fetched. */ |
| public RobotsTxt getRobotsTxt(final String hostAndPort, final JobRunDataProvider jobRunDataProvider) { |
| RobotsTxt robotsTxt = _robotsTxts.get(hostAndPort); |
| if (robotsTxt == null) { |
| final byte[] binaryRobotsTxt = getStoredRobotsTxt(hostAndPort, jobRunDataProvider); |
| if (binaryRobotsTxt != null) { |
| robotsTxt = new RobotsTxt(binaryRobotsTxt); |
| _robotsTxts.put(hostAndPort, robotsTxt); |
| } |
| } |
| return robotsTxt; |
| } |
| |
| /** get serialized robots.txt from job run data. */ |
| private byte[] getStoredRobotsTxt(final String hostAndPort, final JobRunDataProvider jobRunDataProvider) { |
| if (jobRunDataProvider != null && getJobName() != null && getJobRunId() != null) { |
| try { |
| return jobRunDataProvider.getCustomData(getJobName(), getJobRunId(), WebCrawlerWorker.NAME, "robots.txt", |
| hostAndPort); |
| } catch (final JobManagerException ex) { |
| getTaskLog().warn("Failed to read robots.txt for " + hostAndPort + " from job run data: " + ex); |
| } |
| } |
| return null; |
| } |
| |
| /** store robots.txt in context and job run data, if available. */ |
| public void putRobotsTxt(final String hostAndPort, final RobotsTxt robotsTxt, |
| final JobRunDataProvider jobRunDataProvider) { |
| _robotsTxts.put(hostAndPort, robotsTxt); |
| if (jobRunDataProvider != null && getJobName() != null && getJobRunId() != null) { |
| try { |
| jobRunDataProvider.setCustomData(getJobName(), getJobRunId(), WebCrawlerWorker.NAME, robotsTxt.asBinary(), |
| "robots.txt", hostAndPort); |
| } catch (final JobManagerException ex) { |
| getTaskLog().warn("Failed to store robots.txt for " + hostAndPort + " in job run data: " + ex); |
| } |
| } |
| } |
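| // Typical caller pattern for the robots.txt cache (a hedged sketch; fetchRobotsTxtBytes() is a hypothetical |
| // helper, not part of this class): |
| // |
| // RobotsTxt robots = context.getRobotsTxt(hostAndPort, jobRunDataProvider); |
| // if (robots == null) { |
| // robots = new RobotsTxt(fetchRobotsTxtBytes(hostAndPort)); // fetch and parse once per host |
| // context.putRobotsTxt(hostAndPort, robots, jobRunDataProvider); // cache locally and in job run data |
| // } |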
| |
| } |