/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.RedirectException;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.Inputs;
import org.eclipse.smila.taskworker.input.RecordInput;
import org.eclipse.smila.taskworker.output.Outputs;
import org.eclipse.smila.taskworker.output.RecordOutput;
import org.eclipse.smila.utils.MaybeRecoverableException;
/** Worker for Web crawling. */
public class WebCrawlerWorker implements Worker {
/** Name of the worker, used in worker description and workflows. */
public static final String NAME = "webCrawler";
/** name of input slot containing the links to crawl. */
public static final String INPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";
/** name of output slot containing the links to crawl. */
public static final String OUTPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";
/** name of output slot containing the crawled records. */
public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";
/**
* maximum number of records to write into a single output bulk. Usually the worker produces one record output bulk
* per link input bulk anyway, because each input link yields at most one output record.
*/
private static final int RECORDS_PER_BULK = 100;
/** dummy input bulk Id used in initial crawl task for marking links as visited. */
private static final String BULK_ID_FOR_INITIAL_TASK = "initial";
/** reference to VisitedLinks service. */
private VisitedLinksService _visitedLinks;
/** reference to Fetcher service. */
private Fetcher _fetcher;
/** reference to LinkExtractor service. */
private LinkExtractor _linkExtractor;
/** reference to LinkFilter service. */
private LinkFilter _linkFilter;
/** reference to RecordProducer service. */
private RecordProducer _recordProducer;
/** local logger. */
private final Log _log = LogFactory.getLog(getClass());
/** the compound extractor. */
private CompoundExtractor _compoundExtractor;
/**
* holds the current output bulks and creates new output bulks when the specified number of links or records has been
* written.
*/
private static final class RecordOutputHandler {
/** task context outputs manager. */
private final Outputs _outputs;
/** maximum number of links to write to a single bulk. */
private final int _linksPerBulk;
/** maximum number of records to write to a single bulk. */
private final int _recordsPerBulk;
/** current linksToCrawl bulk. */
private RecordOutput _linksToCrawl;
/** index of current linksToCrawl bulk. */
private int _linksToCrawlBulkIndex;
/** current crawledRecords bulk. */
private RecordOutput _crawledRecords;
/** index of current crawledRecords bulk. */
private int _crawledRecordsBulkIndex;
/** initialize instance for processing of one task. */
private RecordOutputHandler(final Outputs outputs, final int linksPerBulk, final int recordsPerBulk) {
_outputs = outputs;
_linksPerBulk = linksPerBulk;
_recordsPerBulk = recordsPerBulk;
}
/** write a record to the linksToCrawl bulk, start a new bulk if necessary. */
private void addLinkToCrawl(final Record record) throws WebCrawlerException {
try {
if (_linksToCrawl == null) {
_linksToCrawl = _outputs.getAsRecordOutput(OUTPUT_SLOT_LINKS_TO_CRAWL);
} else if (_linksToCrawl.getRecordCount() >= _linksPerBulk) {
_linksToCrawl.commit();
_linksToCrawlBulkIndex++;
_linksToCrawl = _outputs.getAsRecordOutput(OUTPUT_SLOT_LINKS_TO_CRAWL, _linksToCrawlBulkIndex);
}
_linksToCrawl.writeRecord(record);
} catch (final Exception ex) {
throw new WebCrawlerException("Error writing to linksToCrawl bulk", ex);
}
}
/** apply mapping and write the mapped record to the crawledRecords bulk, start a new bulk if necessary. */
private void mapAndAddCrawledRecord(final Record record, final WebCrawlingContext context)
throws WebCrawlerException {
try {
if (_crawledRecords == null) {
_crawledRecords = _outputs.getAsRecordOutput(OUTPUT_SLOT_CRAWLED_RECORDS);
} else if (_crawledRecords.getRecordCount() >= _recordsPerBulk) {
_crawledRecords.commit();
_crawledRecordsBulkIndex++;
_crawledRecords = _outputs.getAsRecordOutput(OUTPUT_SLOT_CRAWLED_RECORDS, _crawledRecordsBulkIndex);
}
// map attribute names only now: earlier steps (fetching, link extraction) rely on the original attribute names.
context.getMapper().mapNames(record, WebCrawlerConstants.PROPERTY_NAMES);
_crawledRecords.writeRecord(record);
} catch (final Exception ex) {
throw new WebCrawlerException("Error writing to crawledRecords bulk", ex);
}
}
}
/** get MIME Type from record. */
public static String getMimeType(final Record record) {
return record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
}
@Override
public String getName() {
return NAME;
}
@Override
public void perform(final TaskContext taskContext) throws Exception {
final Inputs inputs = taskContext.getInputs();
final WebCrawlingContext webCrawlingContext = new WebCrawlingContext(taskContext);
if (inputs.getDataObjectCount(INPUT_SLOT_LINKS_TO_CRAWL) == 0) {
initiateCrawling(webCrawlingContext);
} else {
final RecordInput linksToCrawl = inputs.getAsRecordInput(INPUT_SLOT_LINKS_TO_CRAWL);
crawlLinkRecords(linksToCrawl, webCrawlingContext);
}
}
/** start crawling from task parameters. */
private void initiateCrawling(final WebCrawlingContext webCrawlingContext) throws MaybeRecoverableException {
// put each link into its own bulk to improve scaling.
final RecordOutputHandler outputBulks =
new RecordOutputHandler(webCrawlingContext.getTaskContext().getOutputs(), 1, RECORDS_PER_BULK);
final Record initialLinkRecord = DataFactory.DEFAULT.createRecord();
final String startUrl =
webCrawlingContext.getTaskParameters().getStringValue(WebCrawlerConstants.TASK_PARAM_START_URL);
setUrl(initialLinkRecord, startUrl);
initCrawlDepth(initialLinkRecord, webCrawlingContext);
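// reset the visited-links state for this data source so the fresh crawl starts without leftover entries.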
_visitedLinks.clearSource(webCrawlingContext.getDataSource());
webCrawlingContext.setCurrentInputBulkId(BULK_ID_FOR_INITIAL_TASK);
crawlLinkRecord(initialLinkRecord, outputBulks, webCrawlingContext);
}
/** crawl links from input records. */
private void crawlLinkRecords(final RecordInput linksToCrawl, final WebCrawlingContext context)
throws ObjectStoreException, IOException, MaybeRecoverableException {
final Long sleepTime =
context.getTaskParameters().getLongValue(WebCrawlerConstants.TASK_PARAM_WAIT_BETWEEN_REQUESTS);
final RecordOutputHandler outputBulks =
new RecordOutputHandler(context.getTaskContext().getOutputs(), context.getLinksPerBulk(), RECORDS_PER_BULK);
final String inputBulkId = linksToCrawl.getObjectName();
context.setCurrentInputBulkId(inputBulkId);
Record record = linksToCrawl.getRecord();
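// process link records until the input bulk is exhausted or the task is canceled.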
while (record != null && !context.getTaskContext().isCanceled()) {
initCrawlDepth(record, context); // make sure that crawl depth is set in record
normalizeUrl(record, context);
if (hasNotBeenVisited(record, context)) {
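// honor the configured wait time between requests before fetching the next resource.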
waitBetweenRequests(sleepTime);
crawlLinkRecord(record, outputBulks, context);
context.getVisitedUrls().add(getUrl(record));
}
record = linksToCrawl.getRecord();
}
}
/** ensure that URL in record is normalized. */
private void normalizeUrl(final Record record, final WebCrawlingContext context) throws MaybeRecoverableException {
try {
setUrl(record, getUrl(record)); // ensure that URL is normalized
} catch (final MaybeRecoverableException ex) {
handleCrawlException(record, context, ex);
}
}
/**
* check if the URL in a record has already been visited: either in this task, by checking the task-local list of
* visited URLs, or by a worker processing a different crawl task, by checking the global {@link VisitedLinksService}
* instance.
*
* @return 'true' if the URL has not been visited yet, 'false' otherwise.
*/
private boolean hasNotBeenVisited(final Record record, final WebCrawlingContext context)
throws VisitedLinksException {
final String url = getUrl(record);
debugLogUrl("Check if visited", record);
boolean notVisited = false;
if (!context.getVisitedUrls().contains(url)) {
// not yet visited in this task
notVisited = !checkAndMarkVisitedTimed(record, context);
debugLogUrl("Will visit: " + notVisited, record);
} else {
debugLogUrl("Duplicate URL in task", record);
}
return notVisited;
}
/**
* Check if a URL in a record is marked as visited in the {@link VisitedLinksService} instance and mark it if not.
* Measures time as "duration...checkVisitedLinks".
*
* @return 'true' if the URL was already visited, 'false' otherwise.
*/
private boolean checkAndMarkVisitedTimed(final Record record, final WebCrawlingContext context)
throws VisitedLinksException {
final String url = getUrl(record);
debugLogUrl("Check if visited", record);
final long time = context.getTaskContext().getTimestamp();
try {
return _visitedLinks.checkAndMarkVisited(context.getDataSource(), url, context.getJobRunId(),
context.getCurrentInputBulkId());
} finally {
context.getTaskContext().measureTime("checkVisitedLinks", time);
}
}
/** crawl a link represented by one record: fetch metadata and content, extract links, produce record. */
private void crawlLinkRecord(final Record linkRecord, final RecordOutputHandler outputBulks,
final WebCrawlingContext context) throws MaybeRecoverableException {
try {
invokeFetcherTimed(linkRecord, context);
// Check again after fetching to prevent duplicates when a URL is processed by two workers at the same time.
// Additionally, this marks the URL as visited in the initial crawl task.
if (!checkAndMarkVisitedTimed(linkRecord, context)) {
extractAndFilterLinks(linkRecord, outputBulks, context);
produceAndWriteRecords(linkRecord, outputBulks, context);
}
} catch (final MaybeRecoverableException ex) {
handleCrawlException(linkRecord, context, ex);
} catch (final RuntimeException ex) {
logNonRecoverableError(linkRecord, ex, context);
}
}
/** log or rethrow exception. */
private void handleCrawlException(final Record linkRecord, final WebCrawlingContext context,
final MaybeRecoverableException ex) throws MaybeRecoverableException {
if (ex.isRecoverable() && context.getErrorHandling() == ErrorHandling.RETRY) {
throw ex; // schedule retry for task
}
if (BULK_ID_FOR_INITIAL_TASK.equals(context.getCurrentInputBulkId())) {
if (ex.isRecoverable() && context.getErrorHandling() != ErrorHandling.RETRY) {
// we do not want to retry, so wrap exception such that job run will fail.
throw new WebCrawlerException(ex.getMessage(), ex, false);
}
// exception is not recoverable so we can just rethrow it.
throw ex;
}
if (ex.getCause() instanceof RedirectException) {
logRedirectErrors(linkRecord, context, ex);
} else {
logNonRecoverableError(linkRecord, ex, context);
}
}
private void logRedirectErrors(final Record linkRecord, final WebCrawlingContext context,
final MaybeRecoverableException ex) {
final FilterConfiguration filterConfig = context.getFilterConfiguration();
if (filterConfig == null || !filterConfig.followRedirects()) {
_log.info(ex.getLocalizedMessage()); // ignore redirect errors if follow redirects is not configured
} else {
logNonRecoverableError(linkRecord, ex, context);
}
}
/**
* produce the records to be processed by SMILA from the crawled link record and write them to the crawledRecords output bulk.
*/
private void produceAndWriteRecords(final Record linkRecord, final RecordOutputHandler outputBulks,
final WebCrawlingContext context) throws WebCrawlerException {
final Collection<Record> crawledRecords = produceRecordsTimed(linkRecord, context);
for (final Record crawledRecord : crawledRecords) {
if (isCompoundRecord(crawledRecord, context)) {
setIsCompound(crawledRecord);
}
outputBulks.mapAndAddCrawledRecord(crawledRecord, context);
}
}
/** checks if the crawled record is a compound record. */
private boolean isCompoundRecord(final Record record, final WebCrawlingContext context) {
return _compoundExtractor.canExtract(getUrl(record), getMimeType(record));
}
/** extract and filter links from content of the fetched web resource and write them to the links output bulk. */
private void extractAndFilterLinks(final Record linkRecord, final RecordOutputHandler outputBulks,
final WebCrawlingContext context) throws WebCrawlerException, VisitedLinksException {
if (linkRecord.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT)) {
final Collection<Record> extractedLinks = extractLinksTimed(linkRecord, context);
final Collection<Record> filteredLinks = filterLinksTimed(extractedLinks, linkRecord, context);
for (final Record outgoingLink : filteredLinks) {
// skip links already extracted in this task, then check the global VisitedLinksService before adding them
if (context.getExtractedUrls().add(getUrl(outgoingLink))) {
if (isNotVisitedTimed(outgoingLink, context)) {
outputBulks.addLinkToCrawl(outgoingLink);
}
}
}
}
}
/** invoke fetcher and measure time as "duration...fetchResource". */
private void invokeFetcherTimed(final Record linkRecord, final WebCrawlingContext context)
throws WebCrawlerException {
debugLogUrl("Call fetcher for ", linkRecord);
final long time = context.getTaskContext().getTimestamp();
try {
// this record has not been mapped, so use the original URL attribute, ignore mapping rules!
_fetcher.crawl(getUrl(linkRecord), linkRecord, context);
} finally {
context.getTaskContext().measureTime("fetchResource", time);
}
}
/** invoke link extractor and measure time as "duration...extractLinks". */
private Collection<Record> extractLinksTimed(final Record linkRecord, final WebCrawlingContext context)
throws WebCrawlerException {
debugLogUrl("Extract links from ", linkRecord);
final long time = context.getTaskContext().getTimestamp();
try {
return _linkExtractor.extractLinks(linkRecord, context);
} finally {
context.getTaskContext().measureTime("extractLinks", time);
}
}
/** invoke link filter and measure time as "duration...filterLink". */
private Collection<Record> filterLinksTimed(final Collection<Record> extractedLinks, final Record sourceLink,
final WebCrawlingContext context) throws WebCrawlerException {
if (_log.isDebugEnabled()) {
_log.debug("Filter links " + extractedLinks + " extracted from " + getUrl(sourceLink));
}
final long time = context.getTaskContext().getTimestamp();
try {
// apply filter for max crawl depth (and set new value to extracted records)
final long crawlDepth = getCrawlDepth(sourceLink);
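// a crawl depth of 0 means the maximum depth has been reached: do not follow any extracted links.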
if (crawlDepth == 0) {
return Collections.emptyList();
}
final Iterator<Record> it = extractedLinks.iterator();
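// decrement the remaining depth; a value of -1 is kept at -1 so it never reaches 0 (no depth limit).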
final long nextCrawlDepth = Math.max(-1, crawlDepth - 1);
while (it.hasNext()) {
final Record rec = it.next();
setCrawlDepth(rec, nextCrawlDepth);
}
// apply filter for URL patterns
final Collection<Record> filteredLinks =
_linkFilter.filterExtractedLinks(extractedLinks, getUrl(sourceLink), context);
if (_log.isDebugEnabled()) {
_log.debug("Remaining links: " + filteredLinks);
}
return filteredLinks;
} finally {
context.getTaskContext().measureTime("filterLink", time);
}
}
/**
* Check if an extracted link has not yet been visited by another worker. Measures time as
* "duration...checkVisitedLinks".
*
* @return 'true' if the URL has not been visited yet, 'false' otherwise.
*/
private boolean isNotVisitedTimed(final Record record, final WebCrawlingContext context)
throws VisitedLinksException {
final String url = getUrl(record);
debugLogUrl("Check if extracted link is visited", record);
final long time = context.getTaskContext().getTimestamp();
try {
return !_visitedLinks.isVisited(context.getDataSource(), url, context.getJobRunId());
} finally {
context.getTaskContext().measureTime("checkVisitedLinks", time);
}
}
/** invoke record producer and measure time as "duration...produceRecords". */
private Collection<Record> produceRecordsTimed(final Record crawledRecord, final WebCrawlingContext context)
throws WebCrawlerException {
debugLogUrl("Produce record for ", crawledRecord);
final long time = context.getTaskContext().getTimestamp();
try {
return _recordProducer.produceRecords(crawledRecord, context);
} finally {
context.getTaskContext().measureTime("produceRecords", time);
}
}
/** log URL of record to debug log. */
private void debugLogUrl(final String message, final Record link) {
if (_log.isDebugEnabled()) {
_log.debug(message + " " + getUrl(link));
}
}
/**
* log a non-recoverable error as a warning to the tasklog instead of throwing it, which would abort the task and
* cancel the crawl job run.
*/
private void logNonRecoverableError(final Record linkRecord, final Exception ex, final WebCrawlingContext context) {
context.getTaskLog().warn(
"Error crawling link " + getUrl(linkRecord) + " in source " + context.getDataSource() + ", skipping.", ex);
}
/** @return URL from record. */
private String getUrl(final Record record) {
return record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
}
/**
* normalize URL and set in record.
*/
private void setUrl(final Record record, final String url) throws MaybeRecoverableException {
try {
record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, UriHelper.normalizeUrl(null, url));
} catch (final URISyntaxException ex) {
throw new MaybeRecoverableException(ex, false);
}
}
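/** ensure that a crawl depth is set in the record; if missing, default to the configured maximum crawl depth. */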
private void initCrawlDepth(final Record linkRecord, final WebCrawlingContext webCrawlingContext) {
final Any linkCrawlDepth = linkRecord.getMetadata().get(WebCrawlerConstants.ATTRIBUTE_CRAWL_DEPTH);
if (linkCrawlDepth == null || !linkCrawlDepth.isLong()) {
final long maxCrawlDepth = webCrawlingContext.getFilterConfiguration().getMaxCrawlDepth();
linkRecord.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_CRAWL_DEPTH, maxCrawlDepth);
}
}
/** set crawl depth in record. */
private void setCrawlDepth(final Record record, final long crawlDepth) {
record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_CRAWL_DEPTH, crawlDepth);
}
/** @return crawl depth of given record. */
private long getCrawlDepth(final Record record) {
if (record.getMetadata().containsKey(WebCrawlerConstants.ATTRIBUTE_CRAWL_DEPTH)) {
return record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_CRAWL_DEPTH);
}
return -1;
}
/** mark record as compound. */
private void setIsCompound(final Record record) {
record.getMetadata().put(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG, true);
}
/** sleep between two consecutive requests if a positive wait time is configured. */
private void waitBetweenRequests(final Long sleepTime) {
if (sleepTime != null && sleepTime > 0) {
try {
Thread.sleep(sleepTime);
} catch (final InterruptedException e) {
// interrupted: stop waiting and continue
}
}
}
/** DS service reference injection method. */
public void setVisitedLinks(final VisitedLinksService visitedLinks) {
_visitedLinks = visitedLinks;
}
/** DS service reference removal method. */
public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
if (_visitedLinks == visitedLinks) {
_visitedLinks = null;
}
}
/** DS service reference injection method. */
public void setFetcher(final Fetcher fetcher) {
_fetcher = fetcher;
}
/** DS service reference removal method. */
public void unsetFetcher(final Fetcher fetcher) {
if (_fetcher == fetcher) {
_fetcher = null;
}
}
/** DS service reference injection method. */
public void setLinkExtractor(final LinkExtractor linkExtractor) {
_linkExtractor = linkExtractor;
}
/** DS service reference removal method. */
public void unsetLinkExtractor(final LinkExtractor linkExtractor) {
if (_linkExtractor == linkExtractor) {
_linkExtractor = null;
}
}
/** DS service reference injection method. */
public void setLinkFilter(final LinkFilter linkFilter) {
_linkFilter = linkFilter;
}
/** DS service reference removal method. */
public void unsetLinkFilter(final LinkFilter linkFilter) {
if (_linkFilter == linkFilter) {
_linkFilter = null;
}
}
/** DS service reference injection method. */
public void setRecordProducer(final RecordProducer recordProducer) {
_recordProducer = recordProducer;
}
/** DS service reference removal method. */
public void unsetRecordProducer(final RecordProducer recordProducer) {
if (_recordProducer == recordProducer) {
_recordProducer = null;
}
}
/** DS service reference injection method. */
public void setCompoundExtractor(final CompoundExtractor compoundExtractor) {
_compoundExtractor = compoundExtractor;
}
/** DS service reference removal method. */
public void unsetCompoundExtractor(final CompoundExtractor compoundExtractor) {
if (_compoundExtractor == compoundExtractor) {
_compoundExtractor = null;
}
}
}