| /******************************************************************************* |
| * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.smila.importing.crawler.web; |
| |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.DataFactory; |
| import org.eclipse.smila.importing.ImportingConstants; |
| import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling; |
| import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration; |
| import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt; |
| import org.eclipse.smila.importing.util.CrawlingContext; |
| import org.eclipse.smila.jobmanager.JobRunDataProvider; |
| import org.eclipse.smila.jobmanager.exceptions.JobManagerException; |
| import org.eclipse.smila.taskworker.TaskContext; |
| import org.eclipse.smila.taskworker.TaskLog; |
| |
| /** |
| * Context holding information needed throughout most of the web crawling process of one task, e.g. the mapper, the |
| * filter configuration and the robots.txt evaluators. |
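| * <p> |
| * A minimal usage sketch (assuming a {@link TaskContext} supplied by the task worker framework; variable names are |
| * illustrative): |
| * |
| * <pre> |
| * final WebCrawlingContext context = new WebCrawlingContext(taskContext); |
| * final FilterConfiguration filters = context.getFilterConfiguration(); |
| * final int linksPerBulk = context.getLinksPerBulk(); |
| * final ErrorHandling errorHandling = context.getErrorHandling(); |
| * </pre> |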
| */ |
| public class WebCrawlingContext extends CrawlingContext { |
| |
| /** the filter configuration for the current task. */ |
| private final FilterConfiguration _filterConfiguration; |
| |
| /** the id of the current input bulk. */ |
| private String _currentInputBulkId; |
| |
| /** the visited URLs. */ |
| private final Set<String> _visitedUrls = new HashSet<>(); |
| |
| /** the extracted URLs. */ |
| private final Set<String> _extractedUrls = new HashSet<>(); |
| |
| /** robots.txt evaluators for hosts visited in this task. */ |
| private final Map<String, RobotsTxt> _robotsTxts = new HashMap<>(); |
| |
| /** number of links per output bulk. */ |
| private int _linksPerBulk = WebCrawlerConstants.DEFAULT_LINKS_PER_BULK; |
| |
| /** error handling on recoverable fetch errors. */ |
| private ErrorHandling _errorHandling = ErrorHandling.DROP; |
| |
| /** creates a crawling context from the taskContext. */ |
| public WebCrawlingContext(final TaskContext taskContext) { |
| super(taskContext, false); |
| if (getTaskParameters().containsKey(ImportingConstants.TASK_PARAM_FILTERS)) { |
| _filterConfiguration = |
| new FilterConfiguration(getTaskParameters().getMap(ImportingConstants.TASK_PARAM_FILTERS)); |
| } else { |
| _filterConfiguration = new FilterConfiguration(DataFactory.DEFAULT.createAnyMap()); |
| } |
| if (taskContext.getTaskParameters().containsKey(WebCrawlerConstants.TASK_PARAM_LINKS_PER_BULK)) { |
| _linksPerBulk = |
| taskContext.getTaskParameters().getLongValue(WebCrawlerConstants.TASK_PARAM_LINKS_PER_BULK).intValue(); |
| } |
| if (taskContext.getTaskParameters().containsKey(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING)) { |
| _errorHandling = |
| ErrorHandling.valueOf(taskContext.getTaskParameters() |
| .getStringValue(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING).toUpperCase()); |
| } |
| } |
| |
| /** |
| * @return the filter configuration for the current task |
| */ |
| public FilterConfiguration getFilterConfiguration() { |
| return _filterConfiguration; |
| } |
| |
| /** |
| * @return the task log of the current task context |
| */ |
| public TaskLog getTaskLog() { |
| return getTaskContext().getLog(); |
| } |
| |
| /** |
| * @return the task parameters of the current task context |
| */ |
| public AnyMap getTaskParameters() { |
| return getTaskContext().getTaskParameters(); |
| } |
| |
| /** |
| * @return the id of the current input bulk |
| */ |
| public String getCurrentInputBulkId() { |
| return _currentInputBulkId; |
| } |
| |
| /** |
| * @param inputBulkId the id of the current input bulk |
| */ |
| public void setCurrentInputBulkId(final String inputBulkId) { |
| _currentInputBulkId = inputBulkId; |
| } |
| |
| /** |
| * @return the URLs visited in this task |
| */ |
| public Set<String> getVisitedUrls() { |
| return _visitedUrls; |
| } |
| |
| /** @return the number of links per output bulk. */ |
| public int getLinksPerBulk() { |
| return _linksPerBulk; |
| } |
| |
| /** @return how to handle recoverable fetch errors. */ |
| public ErrorHandling getErrorHandling() { |
| return _errorHandling; |
| } |
| |
| /** |
| * @return the set of URLs that have already been written to linksToCrawl in this task, used for duplicate |
| * prevention. |
| */ |
| public Set<String> getExtractedUrls() { |
| return _extractedUrls; |
| } |
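| // Duplicate-prevention sketch (hedged; writeLinkToCrawl() is a hypothetical caller-side helper): |
| // |
| // if (context.getExtractedUrls().add(url)) { |
| // writeLinkToCrawl(url); // Set.add() returns false for URLs already extracted earlier in this task |
| // } |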
| |
| /** @return robots.txt evaluator for the given host, from the local cache or the job run data; null if not yet fetched. */ |
| public RobotsTxt getRobotsTxt(final String hostAndPort, final JobRunDataProvider jobRunDataProvider) { |
| RobotsTxt robotsTxt = _robotsTxts.get(hostAndPort); |
| if (robotsTxt == null) { |
| final byte[] binaryRobotsTxt = getStoredRobotsTxt(hostAndPort, jobRunDataProvider); |
| if (binaryRobotsTxt != null) { |
| robotsTxt = new RobotsTxt(binaryRobotsTxt); |
| _robotsTxts.put(hostAndPort, robotsTxt); |
| } |
| } |
| return robotsTxt; |
| } |
| |
| /** get serialized robots.txt from job run data. */ |
| private byte[] getStoredRobotsTxt(final String hostAndPort, final JobRunDataProvider jobRunDataProvider) { |
| if (jobRunDataProvider != null && getJobName() != null && getJobRunId() != null) { |
| try { |
| return jobRunDataProvider.getCustomData(getJobName(), getJobRunId(), WebCrawlerWorker.NAME, "robots.txt", |
| hostAndPort); |
| } catch (final JobManagerException ex) { |
| getTaskLog().warn("Failed to read robots.txt for " + hostAndPort + " from job run data: " + ex); |
| } |
| } |
| return null; |
| } |
| |
| /** store robots.txt in context and job run data, if available. */ |
| public void putRobotsTxt(final String hostAndPort, final RobotsTxt robotsTxt, |
| final JobRunDataProvider jobRunDataProvider) { |
| _robotsTxts.put(hostAndPort, robotsTxt); |
| if (jobRunDataProvider != null && getJobName() != null && getJobRunId() != null) { |
| try { |
| jobRunDataProvider.setCustomData(getJobName(), getJobRunId(), WebCrawlerWorker.NAME, robotsTxt.asBinary(), |
| "robots.txt", hostAndPort); |
| } catch (final JobManagerException ex) { |
| getTaskLog().warn("Failed to store robots.txt for " + hostAndPort + " in job run data: " + ex); |
| } |
| } |
| } |
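| // Typical caller pattern for the robots.txt cache (a hedged sketch; fetchRobotsTxtBytes() is a hypothetical |
| // helper, not part of this class): |
| // |
| // RobotsTxt robots = context.getRobotsTxt(hostAndPort, jobRunDataProvider); |
| // if (robots == null) { |
| // robots = new RobotsTxt(fetchRobotsTxtBytes(hostAndPort)); // fetch and parse once per host |
| // context.putRobotsTxt(hostAndPort, robots, jobRunDataProvider); // cache locally and in job run data |
| // } |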
| |
| } |