/***********************************************************************************************************************
* Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying
* materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution,
* and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Dmitry Hazin (brox IT Solutions GmbH) - initial creator; Sebastian Voigt (brox IT Solutions GmbH)
*
* This file is based on Fetcher.java from Nutch 0.8.1 (see the license below). The original file was modified by the
* SMILA team.
**********************************************************************************************************************/
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package org.eclipse.smila.connectivity.framework.crawler.web.fetcher;
import java.net.MalformedURLException;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.crawler.web.WebCrawler;
import org.eclipse.smila.connectivity.framework.crawler.web.WebCrawlerPerformanceAgent;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configured;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.FetcherProperties;
import org.eclipse.smila.connectivity.framework.crawler.web.crawl.CrawlMode;
import org.eclipse.smila.connectivity.framework.crawler.web.filter.FilterProcessor;
import org.eclipse.smila.connectivity.framework.crawler.web.http.Http;
import org.eclipse.smila.connectivity.framework.crawler.web.http.HttpOutput;
import org.eclipse.smila.connectivity.framework.crawler.web.http.HttpStatus;
import org.eclipse.smila.connectivity.framework.crawler.web.http.SitemapParser;
import org.eclipse.smila.connectivity.framework.crawler.web.metadata.Metadata;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.BinaryParser;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Content;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Outlink;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Parse;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParseImpl;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParseStatus;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Parser;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParserManager;
import org.eclipse.smila.connectivity.framework.performancecounters.CrawlerPerformanceCounterHelper;
/**
* Fetches pages for given links and updates the crawler status.
*/
public class Fetcher extends Configured {
/**
* Number of milliseconds in one second.
*/
private static final int MILLISECS_IN_SEC = 1000;
/**
* The Log.
*/
private final Log _log = LogFactory.getLog(Fetcher.class);
/**
* Total bytes fetched.
*/
private long _bytes;
/**
* Total pages fetched.
*/
private int _pages;
/**
* Total errors occurred so far.
*/
private int _errors;
/**
* The maximum number of redirects to follow.
*/
private final int _maxRedirect;
/**
* The maximum number of fetch retries.
*/
private final int _maxRetries;
/**
* The wait time between retries, in milliseconds.
*/
private final int _waitRetry;
/**
* The fetcher output produced by the most recent fetch.
*/
private FetcherOutput _output;
/**
* Whether sitemap support is enabled.
*/
private final boolean _sitemapsEnabled;
/**
* The sitemap parser.
*/
private SitemapParser _sitemap;
/**
* The performance counters.
*/
private final CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> _performanceCounters;
/**
* The web crawler's parser manager.
*/
private final ParserManager _parserManager;
/** fallback parser. */
private final Parser _defaultParser = new BinaryParser();
/**
* Creates a Fetcher with the given configuration.
*
* @param configuration
* the configuration
* @param parserManager
* the parser manager
* @param performanceCounters
* the performance counters
*/
public Fetcher(final Configuration configuration, final ParserManager parserManager,
final CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> performanceCounters) {
super(configuration);
_performanceCounters = performanceCounters;
_maxRedirect = configuration.getInt(FetcherProperties.MAX_REDIRECTS);
_maxRetries = configuration.getInt(FetcherProperties.MAX_RETRIES);
_waitRetry = configuration.getInt(FetcherProperties.WAIT_RETRY) * Configuration.MILLIS_PER_SECOND;
_sitemapsEnabled = configuration.getBoolean(FetcherProperties.USE_SITEMAPS);
if (_sitemapsEnabled) {
_sitemap = new SitemapParser(configuration);
}
_parserManager = parserManager;
}
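// Usage sketch (hypothetical, not part of the original source): a crawler would typically
// create one Fetcher per crawl configuration and call fetch() once per link, roughly:
//
// final Fetcher fetcher = new Fetcher(configuration, parserManager, performanceCounters);
// final FetcherOutput output = fetcher.fetch(startLink, filterProcessor, linksDone);
//
// startLink, filterProcessor and linksDone are assumed to be supplied by the crawl loop.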
/**
* Updates the fetch statistics and the corresponding performance counters.
*
* @param bytesInPage
* the number of bytes in the fetched page
*/
private synchronized void updateStatus(final int bytesInPage) {
_pages++;
_performanceCounters.increment(WebCrawler.POC_PAGES);
_bytes += bytesInPage;
_performanceCounters.incrementBy(WebCrawler.POC_BYTES, bytesInPage);
}
/**
* Fetches the page for the given link and parses its content.
*
* @param link
* the link to fetch
* @param filterProcessor
* the filters to apply
* @param linksDone
* the set of already crawled links
*
* @return the FetcherOutput
*/
public FetcherOutput fetch(Outlink link, final FilterProcessor filterProcessor, final Set<Outlink> linksDone) {
try {
boolean redirecting;
boolean retrying;
int redirectCount = 0;
int retriesCount = 0;
boolean continueFetching;
final String url = link.toString();
final Http http = new Http();
http.setConf(getConf());
Outlink[] sitemapLinks = new Outlink[0];
if (_sitemapsEnabled) {
if (_log.isDebugEnabled()) {
_log.debug("Trying to download sitemap.xml");
}
sitemapLinks = _sitemap.getSitemapLinks(http, link.getUrl());
}
if (_log.isDebugEnabled()) {
_log.debug("processing link: " + url);
}
final long start = System.currentTimeMillis();
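// Fetch loop: repeats while the last response was a redirect (up to _maxRedirect times)
// or a retriable failure (up to _maxRetries times, sleeping _waitRetry ms between attempts).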
do {
if (_log.isDebugEnabled()) {
_log.debug("redirectCount=" + redirectCount);
_log.debug("retriesCount=" + retriesCount);
}
redirecting = false;
retrying = false;
final HttpOutput output = http.getHttpOutput(link, filterProcessor);
final HttpStatus status = output.getStatus();
final Content content = output.getContent();
ParseStatus pstatus = null;
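// Dispatch on the HTTP status: successful pages are parsed, redirects are followed,
// transient failures are retried, permanent failures are recorded as GONE and
// filtered links as NOTFETCHING.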
switch (status.getCode()) {
case HttpStatus.SUCCESS: // got a page
pstatus = output(url, content, HttpStatus.SUCCESS, sitemapLinks);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
link = getRedirectLink(link, pstatus.getMessage(), filterProcessor, linksDone);
if (link == null) {
output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
} else {
redirecting = true;
redirectCount++;
}
}
break;
case HttpStatus.MOVED: // redirect
case HttpStatus.TEMP_MOVED:
link = getRedirectLink(link, status.getMessage(), filterProcessor, linksDone);
if (link == null) {
output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
} else {
redirecting = true;
redirectCount++;
}
break;
// failures - increase the retry counter
case HttpStatus.EXCEPTION:
logError(url, status.getMessage());
// Fallthrough
case HttpStatus.RETRY:
Thread.sleep(_waitRetry);
retriesCount++;
retrying = true;
break;
// permanent failures
case HttpStatus.GONE: // gone
case HttpStatus.NOTFOUND:
case HttpStatus.ACCESS_DENIED:
case HttpStatus.ROBOTS_DENIED:
case HttpStatus.NOTMODIFIED:
output(url, null, HttpStatus.GONE, sitemapLinks);
break;
case HttpStatus.NOTFETCHING:
if (_log.isDebugEnabled()) {
_log.debug("Won't fetch url " + url);
}
output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
break;
default:
if (_log.isWarnEnabled()) {
_log.warn("Unknown HttpStatus: " + status.getCode());
}
output(url, null, HttpStatus.GONE, sitemapLinks);
}
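// Give up with status GONE once the configured redirect or retry limit is reached.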
if (redirecting && redirectCount >= _maxRedirect) {
if (_log.isInfoEnabled()) {
_log.info(" - redirect count exceeded " + url);
}
output(url, null, HttpStatus.GONE, sitemapLinks);
}
if (retrying && retriesCount >= _maxRetries) {
if (_log.isInfoEnabled()) {
_log.info(" - retries count exceeded " + url);
}
output(url, null, HttpStatus.GONE, sitemapLinks);
}
continueFetching = (redirecting && redirectCount < _maxRedirect) || (retrying && retriesCount < _maxRetries);
} while (continueFetching);
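// Record the elapsed time of this fetch, in seconds, in the average-fetch-time counter.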
_performanceCounters.incrementBy(WebCrawler.POC_AVEREGE_TIME_TO_FETCH, (System.currentTimeMillis() - start)
/ MILLISECS_IN_SEC);
} catch (final InterruptedException exception) {
logError(link.getUrlString(), exception.getMessage());
}
// return the output produced by the most recent output() call
return _output;
}
/**
* Logs a fetch error and counts it.
*
* @param url
* the URL
* @param message
* the error message
*/
private void logError(final String url, final String message) {
_log.error("fetch of " + url + " failed with " + message);
// record failure
_errors++;
}
/**
* Creates the FetcherOutput for the given URL and content, parsing the content if it was fetched successfully.
*
* @param url
* the URL
* @param content
* the content
* @param status
* the HTTP status
* @param sitemapLinks
* the sitemap links
*
* @return the parse status, or null if the content was not parsed
*/
private ParseStatus output(final String url, Content content, final int status, final Outlink[] sitemapLinks) {
if (content == null) {
content = new Content(url, url, new byte[0], "", new Metadata());
}
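// Fall back to empty content so a FetcherOutput can be produced even when nothing was fetched.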
Parse parse = null;
if (status == HttpStatus.SUCCESS) {
ParseStatus parseStatus = null;
if (_parserManager != null) {
final Parser parser = _parserManager.getParser(content.getContentType());
if (parser != null) {
if (_log.isDebugEnabled()) {
_log.debug("Using webcrawler parser: " + parser.getClass().getName() + " for content-type "
+ content.getContentType());
}
parser.setConf(getConf());
parse = parser.getParse(content);
parseStatus = parse.getData().getStatus();
if (!parseStatus.isSuccess()) {
_log.error("Error parsing: " + url + ": " + parseStatus);
parse = parseStatus.getEmptyParse(getConf());
}
} else {
if (_log.isWarnEnabled()) {
_log.warn("Parser for content-type: " + content.getContentType()
+ " not found. Treating as binary content.");
}
_defaultParser.setConf(getConf());
parse = _defaultParser.getParse(content);
}
} else {
if (_log.isErrorEnabled()) {
_log.error("Parser manager is not set! Unable to get any parsers.");
}
}
}
if (parse == null) {
_output = new FetcherOutput(content, null, sitemapLinks);
return null;
}
_output = new FetcherOutput(content, new ParseImpl(parse), sitemapLinks);
return parse.getData().getStatus();
}
/**
* Builds the redirect target link and checks whether it should be followed.
*
* @param fromLink
* the link the redirect originates from
* @param toUrlString
* the URL the redirect points to
* @param filterProcessor
* the filter processor
* @param linksDone
* list of already crawled links
*
* @return the redirect link, or null if the redirect should not be followed
*/
private Outlink getRedirectLink(final Outlink fromLink, final String toUrlString,
final FilterProcessor filterProcessor, final Set<Outlink> linksDone) {
Outlink newLink = null;
try {
newLink = new Outlink(toUrlString, fromLink.getAnchor(), getConf());
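// Evaluate the configured URL filters to decide whether the redirect target may be crawled.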
final CrawlMode crawlMode = filterProcessor.evaluateUrlFilters(newLink);
if (!crawlMode.equals(CrawlMode.Skip) && !newLink.equals(fromLink) && !linksDone.contains(newLink)) {
if (_log.isDebugEnabled()) {
_log.debug("redirect to " + newLink.getUrlString());
}
} else {
if (_log.isDebugEnabled()) {
if (crawlMode.equals(CrawlMode.Skip)) {
_log.debug("Won't redirect: CrawlMode=Skip, url = " + newLink.getUrlString());
} else if (linksDone.contains(newLink)) {
_log.debug("Won't redirect: url already crawled, url = " + newLink.getUrlString());
} else if (newLink.equals(fromLink)) {
_log.debug("Won't redirect: redirect to the same url, url = " + newLink.getUrlString());
}
}
return null;
}
} catch (final MalformedURLException exception) {
_log.error("Malformed redirect url: " + toUrlString);
}
return newLink;
}
/**
* Returns the number of bytes fetched so far.
*
* @return long
*/
public long getBytes() {
return _bytes;
}
/**
* Sets the number of bytes fetched.
*
* @param bytes
* the number of bytes
*/
public void setBytes(final long bytes) {
_bytes = bytes;
}
/**
* Returns the number of fetch errors that occurred so far.
*
* @return errors
*/
public int getErrors() {
return _errors;
}
/**
* Sets the number of fetch errors.
*
* @param errors
* errors
*/
public void setErrors(final int errors) {
_errors = errors;
}
/**
* Returns the number of pages fetched so far.
*
* @return pages
*/
public int getPages() {
return _pages;
}
/**
* Sets the number of pages fetched.
*
* @param pages
* pages
*/
public void setPages(final int pages) {
_pages = pages;
}
}