core/org.eclipse.smila.importing.crawler.web/code/src/org/eclipse/smila/importing/crawler/web/WebFetcherWorker.java - gerrit/smila/org.eclipse.smila.core - Git at Google

 /***********************************************************************************************************************
  * Copyright (c) 2008,2011 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
  * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
  * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
  **********************************************************************************************************************/
 package org.eclipse.smila.importing.crawler.web;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.eclipse.smila.datamodel.Record;
 import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling;
 import org.eclipse.smila.importing.util.PropertyNameMapper;
 import org.eclipse.smila.taskworker.TaskContext;
 import org.eclipse.smila.taskworker.Worker;
 import org.eclipse.smila.taskworker.input.RecordInput;
 import org.eclipse.smila.taskworker.output.RecordOutput;

 /** Fetches binary content from URL and stores the content as record attachment. */
 public class WebFetcherWorker implements Worker {

   /** worker's name. */
   private static final String WORKER_NAME = "webFetcher";

   /** input slot name. */
   private static final String INPUT_SLOT = "linksToFetch";

   /** output slot name. */
   private static final String OUTPUT_SLOT = "fetchedLinks";

   /** local logger. */
   private final Log _log = LogFactory.getLog(getClass());

   /** reference to Fetcher service. */
   private Fetcher _fetcher;

   @Override
   public String getName() {
     return WORKER_NAME;
   }

   @Override
   public void perform(final TaskContext taskContext) throws Exception {
     final RecordInput recordInput = taskContext.getInputs().getAsRecordInput(INPUT_SLOT);
     final RecordOutput recordOutput = taskContext.getOutputs().getAsRecordOutput(OUTPUT_SLOT);
     final Long sleepTime =
       taskContext.getTaskParameters().getLongValue(WebCrawlerConstants.TASK_PARAM_WAIT_BETWEEN_REQUESTS);
     final WebCrawlingContext context = new WebCrawlingContext(taskContext);
     context.setCurrentInputBulkId(recordInput.getObjectName());
     final PropertyNameMapper mapper = context.getMapper();
     Record record;
     do {
       record = recordInput.getRecord();
       boolean writeRecord = true;
       if (record != null) {
         if (!record.hasAttachment(mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT).get(0))) {
           // we have no content attachment yet, so we fetch the content here...
           waitBetweenRequests(sleepTime);
           writeRecord = fetchContent(record, context);
         }
         if (writeRecord) {
           // if any mapping had been forgotten...
           mapper.mapNames(record, WebCrawlerConstants.PROPERTY_NAMES);
           recordOutput.writeRecord(record);
           if (_log.isDebugEnabled()) {
             _log.debug("added record " + record.getId());
           }
         }
       }
     } while (record != null && !taskContext.isCanceled());
   }

   /**
    * fetch content and measure time as "duration...fetchContent".
    *
    * @return true if the record should be written to output, else it should be dropped.
    */
   private boolean fetchContent(final Record record, final WebCrawlingContext context) throws WebCrawlerException {
     final long time = context.getTaskContext().getTimestamp();
     try {
       if (_log.isDebugEnabled()) {
         _log.debug("fetching content for record " + record.getId());
       }
       // this record has already been mapped, so we have to look up the URL using mapping info.
       _fetcher.fetch(
         record.getMetadata().getStringValue(context.getMapper().get(WebCrawlerConstants.ATTRIBUTE_URL).get(0)),
         record, context);
       return true;
     } catch (final WebCrawlerException e) {
       final StringBuilder message = new StringBuilder("Failed to fetch link for record ").append(record.getId());
       final boolean dropRecord = e.isRecoverable() && context.getErrorHandling() == ErrorHandling.DROP;
       final boolean ignoreFetchError = e.isRecoverable() && context.getErrorHandling() == ErrorHandling.IGNORE;
       if (dropRecord && !ignoreFetchError) {
         message.append(", record will be dropped.");
       }
       context.getTaskLog().warn(message.toString(), e);
       if (e.isRecoverable() && context.getErrorHandling() == ErrorHandling.RETRY) {
         throw e; // repeat the whole task for recoverable errors, otherwise continue processing
       }
       return !dropRecord || ignoreFetchError;
     } finally {
       context.getTaskContext().measureTime("fetchContent", time);
     }
   }

   /** check if we should sleep between two consecutive requests and sleep, if so. */
   private void waitBetweenRequests(final Long sleepTime) {
     if (sleepTime != null && sleepTime > 0) {
       try {
         Thread.sleep(sleepTime);
       } catch (final InterruptedException e) {
         ; // ignore
       }
     }
   }

   /** DS service reference injection method. */
   public void setFetcher(final Fetcher fetcher) {
     _fetcher = fetcher;
   }

   /** DS service reference removal method. */
   public void unsetFetcher(final Fetcher fetcher) {
     if (_fetcher == fetcher) {
       _fetcher = null;
     }
   }

 }
	/***********************************************************************************************************************
	* Copyright (c) 2008,2011 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
	* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
	* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
	**********************************************************************************************************************/
	package org.eclipse.smila.importing.crawler.web;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;
	import org.eclipse.smila.datamodel.Record;
	import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling;
	import org.eclipse.smila.importing.util.PropertyNameMapper;
	import org.eclipse.smila.taskworker.TaskContext;
	import org.eclipse.smila.taskworker.Worker;
	import org.eclipse.smila.taskworker.input.RecordInput;
	import org.eclipse.smila.taskworker.output.RecordOutput;

	/** Fetches binary content from URL and stores the content as record attachment. */
	public class WebFetcherWorker implements Worker {

	/** worker's name. */
	private static final String WORKER_NAME = "webFetcher";

	/** input slot name. */
	private static final String INPUT_SLOT = "linksToFetch";

	/** output slot name. */
	private static final String OUTPUT_SLOT = "fetchedLinks";

	/** local logger. */
	private final Log _log = LogFactory.getLog(getClass());

	/** reference to Fetcher service. */
	private Fetcher _fetcher;

	@Override
	public String getName() {
	return WORKER_NAME;
	}

	@Override
	public void perform(final TaskContext taskContext) throws Exception {
	final RecordInput recordInput = taskContext.getInputs().getAsRecordInput(INPUT_SLOT);
	final RecordOutput recordOutput = taskContext.getOutputs().getAsRecordOutput(OUTPUT_SLOT);
	final Long sleepTime =
	taskContext.getTaskParameters().getLongValue(WebCrawlerConstants.TASK_PARAM_WAIT_BETWEEN_REQUESTS);
	final WebCrawlingContext context = new WebCrawlingContext(taskContext);
	context.setCurrentInputBulkId(recordInput.getObjectName());
	final PropertyNameMapper mapper = context.getMapper();
	Record record;
	do {
	record = recordInput.getRecord();
	boolean writeRecord = true;
	if (record != null) {
	if (!record.hasAttachment(mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT).get(0))) {
	// we have no content attachment yet, so we fetch the content here...
	waitBetweenRequests(sleepTime);
	writeRecord = fetchContent(record, context);
	}
	if (writeRecord) {
	// if any mapping had been forgotten...
	mapper.mapNames(record, WebCrawlerConstants.PROPERTY_NAMES);
	recordOutput.writeRecord(record);
	if (_log.isDebugEnabled()) {
	_log.debug("added record " + record.getId());
	}
	}
	}
	} while (record != null && !taskContext.isCanceled());
	}

	/**
	* fetch content and measure time as "duration...fetchContent".
	*
	* @return true if the record should be written to output, else it should be dropped.
	*/
	private boolean fetchContent(final Record record, final WebCrawlingContext context) throws WebCrawlerException {
	final long time = context.getTaskContext().getTimestamp();
	try {
	if (_log.isDebugEnabled()) {
	_log.debug("fetching content for record " + record.getId());
	}
	// this record has already been mapped, so we have to look up the URL using mapping info.
	_fetcher.fetch(
	record.getMetadata().getStringValue(context.getMapper().get(WebCrawlerConstants.ATTRIBUTE_URL).get(0)),
	record, context);
	return true;
	} catch (final WebCrawlerException e) {
	final StringBuilder message = new StringBuilder("Failed to fetch link for record ").append(record.getId());
	final boolean dropRecord = e.isRecoverable() && context.getErrorHandling() == ErrorHandling.DROP;
	final boolean ignoreFetchError = e.isRecoverable() && context.getErrorHandling() == ErrorHandling.IGNORE;
	if (dropRecord && !ignoreFetchError) {
	message.append(", record will be dropped.");
	}
	context.getTaskLog().warn(message.toString(), e);
	if (e.isRecoverable() && context.getErrorHandling() == ErrorHandling.RETRY) {
	throw e; // repeat the whole task for recoverable errors, otherwise continue processing
	}
	return !dropRecord \|\| ignoreFetchError;
	} finally {
	context.getTaskContext().measureTime("fetchContent", time);
	}
	}

	/** check if we should sleep between two consecutive requests and sleep, if so. */
	private void waitBetweenRequests(final Long sleepTime) {
	if (sleepTime != null && sleepTime > 0) {
	try {
	Thread.sleep(sleepTime);
	} catch (final InterruptedException e) {
	; // ignore
	}
	}
	}

	/** DS service reference injection method. */
	public void setFetcher(final Fetcher fetcher) {
	_fetcher = fetcher;
	}

	/** DS service reference removal method. */
	public void unsetFetcher(final Fetcher fetcher) {
	if (_fetcher == fetcher) {
	_fetcher = null;
	}
	}

	}