core/org.eclipse.smila.importing.crawler.web/code/src/org/eclipse/smila/importing/crawler/web/fetcher/DefaultFetcher.java - smila/org.eclipse.smila.core - Git at Google

 /*******************************************************************************
  * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
  * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
  * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
  *******************************************************************************/

 package org.eclipse.smila.importing.crawler.web.fetcher;

 import java.io.IOException;
 import java.io.InputStream;
 import java.net.ProxySelector;
 import java.net.URISyntaxException;
 import java.util.Date;

 import org.apache.commons.io.IOUtils;
 import org.apache.http.Header;
 import org.apache.http.HeaderElement;
 import org.apache.http.HttpEntity;
 import org.apache.http.HttpHost;
 import org.apache.http.HttpResponse;
 import org.apache.http.HttpStatus;
 import org.apache.http.NameValuePair;
 import org.apache.http.client.HttpClient;
 import org.apache.http.client.RedirectException;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.client.params.HttpClientParams;
 import org.apache.http.conn.ClientConnectionManager;
 import org.apache.http.conn.params.ConnRoutePNames;
 import org.apache.http.impl.client.AbstractHttpClient;
 import org.apache.http.impl.client.DefaultHttpClient;
 import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
 import org.apache.http.impl.cookie.DateParseException;
 import org.apache.http.impl.cookie.DateUtils;
 import org.apache.http.params.CoreConnectionPNames;
 import org.eclipse.smila.datamodel.AnyMap;
 import org.eclipse.smila.datamodel.Record;
 import org.eclipse.smila.http.client.util.HttpClientUtil;
 import org.eclipse.smila.importing.ImportingException;
 import org.eclipse.smila.importing.VisitedLinksException;
 import org.eclipse.smila.importing.VisitedLinksService;
 import org.eclipse.smila.importing.crawler.web.Fetcher;
 import org.eclipse.smila.importing.crawler.web.LinkFilter;
 import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
 import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
 import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
 import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
 import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
 import org.eclipse.smila.importing.crawler.web.utils.WebCrawlerConfiguration;
 import org.eclipse.smila.importing.util.PropertyNameMapper;
 import org.eclipse.smila.taskworker.TaskContext;

 /**
  * Example implementation of a Fetcher service. It uses the GET method to access the resource.
  * <ul>
  * <li>During crawling it reads metadata for content-length, content-type and last-modified from the HTTP header to
  * attributes and attaches the content of resources that are reported as mime type "text/html".
  * <li>During fetching it just attaches the content of any resource.
  * </ul>
  * Automatic redirect handling of the HTTP client is switched off. Redirects are followed by this class itself, and
  * only if the filter configuration of the crawling context enables it. It does not (yet) support authentication. It is
  * based on Apache HttpClient 4.1.
  *
  */
 public class DefaultFetcher implements Fetcher {

   /** name of HTTP header for last-modified date. */
   private static final String HEADER_LASTMODIFIED = "Last-Modified";

   /** name of HTTP header for content-type and charset. */
   private static final String HEADER_CONTENTTYPE = "Content-Type";

   /** name of HTTP header containing the target of a redirect. */
   private static final String HEADER_LOCATION = "location";

   /** name of Content-type header parameter for charset. */
   private static final String HEADER_PARAM_CHARSET = "charset";

   /** mime type of resources whose content is attached during crawling. */
   private static final String MIMETYPE_HTML = "text/html";

   /** default settings for the connection manager: maximum connections per host. */
   private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32;

   /** default settings for the connection manager: maximum connections in total. */
   private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128;

   /** reference to VisitedLinks service. */
   private VisitedLinksService _visitedLinks;

   /** reference to LinkFilter service. */
   private LinkFilter _linkFilter;

   /** client for all http operations. */
   private final HttpClient _httpClient;

   /** initialize HttpClient with disabled redirects. */
   public DefaultFetcher() {
     _httpClient = createAndConfigureClient();
   }

   /**
    * Requests the resource, updates the URL attribute if a redirect was followed, copies metadata from the response
    * headers to the record and attaches the content if the response is reported as HTML. The response is always closed
    * before returning.
    *
    * @param url
    *          URL of the resource to request.
    * @param linkRecord
    *          record to write metadata and content to.
    * @param context
    *          crawling context providing the property name mapper and the filter configuration.
    * @throws WebCrawlerException
    *           if the resource could not be read. Redirect problems are reported with flag {@code false}, visited-links
    *           and IO problems with flag {@code true} (presumably the "recoverable" marker, to be confirmed against
    *           WebCrawlerException).
    */
   @Override
   public void crawl(final String url, final Record linkRecord, final WebCrawlingContext context)
     throws WebCrawlerException {
     HttpResponseInputStream response = null;
     try {
       response = getResource(url, context);
       resetUrlAttributeOnRedirect(linkRecord, response, context.getMapper());
       readMetadata(linkRecord, response);
       readHtmlContent(linkRecord, response);
     } catch (final RedirectException ex) {
       throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
         + ex.getMessage(), ex, false);
     } catch (final VisitedLinksException ex) {
       throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
         + ex.getMessage(), ex, true);
     } catch (final IOException ex) {
       throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
     } finally {
       IOUtils.closeQuietly(response);
     }
   }

   /**
    * Requests the resource, updates the URL attribute if a redirect was followed and attaches the content to the record,
    * regardless of its mime type. The response is always closed before returning.
    *
    * @param url
    *          URL of the resource to request.
    * @param crawledRecord
    *          record to attach the content to.
    * @param context
    *          crawling context providing the property name mapper and the filter configuration.
    * @throws WebCrawlerException
    *           if the resource could not be read, using the same flags as {@code crawl}.
    */
   @Override
   public void fetch(final String url, final Record crawledRecord, final WebCrawlingContext context)
     throws WebCrawlerException {
     HttpResponseInputStream response = null;
     try {
       response = getResource(url, context);
       resetUrlAttributeOnRedirect(crawledRecord, response, context.getMapper());
       readContent(crawledRecord, response);
     } catch (final RedirectException ex) {
       throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
         + ex.getMessage(), ex, false);
     } catch (final VisitedLinksException ex) {
       throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
         + ex.getMessage(), ex, true);
     } catch (final IOException ex) {
       throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
     } finally {
       IOUtils.closeQuietly(response);
     }
   }

   /**
    * {@inheritDoc}
    *
    * <p>
    * <b>Please note: a mapped record (at least URL must be mapped) is expected here!</b>
    * </p>
    * <p>
    * The URL is read from the mapped URL attribute, falling back to the unmapped attribute name. The returned stream is
    * the open HTTP response and is not closed by this method.
    * </p>
    */
   @Override
   public InputStream getContent(final Record crawledRecord, final TaskContext taskContext)
     throws ImportingException {
     final PropertyNameMapper mapper = PropertyNameMapper.createFrom(taskContext);
     String url = crawledRecord.getMetadata().getStringValue(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL));
     if (url == null) {
       url = crawledRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
     }
     try {
       final HttpResponseInputStream response = getResource(url, new WebCrawlingContext(taskContext));
       try {
         resetUrlAttributeOnRedirect(crawledRecord, response, mapper);
       } catch (final RuntimeException ex) {
         // the stream is not handed to the caller in this case, so it must be released here.
         IOUtils.closeQuietly(response);
         throw ex;
       }
       return response;
     } catch (final RedirectException ex) {
       throw new ImportingException("Error while handling redirects for web resource " + url + ": "
         + ex.getMessage(), ex, false);
     } catch (final VisitedLinksException ex) {
       throw new ImportingException("Error while handling redirects for web resource " + url + ": "
         + ex.getMessage(), ex, true);
     } catch (final IOException ex) {
       throw new ImportingException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
     } catch (final Exception ex) {
       throw new ImportingException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex,
         false);
     }
   }

   /** request the resource, starting at redirect level 0. */
   private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context)
     throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
     return getResource(url, context, 0);
   }

   /**
    * create GET request to given resource, and return it if the response code was 200 (OK). Redirect responses are
    * followed if the filter configuration enables it. For every other outcome the response is closed and an exception
    * is thrown.
    *
    * @param redirectLevel
    *          number of redirects already followed to reach this URL. Values above 0 mark the result as redirected.
    */
   private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context,
     final int redirectLevel) throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
     final FilterConfiguration filterConfig = context.getFilterConfiguration();
     final HttpGet request;
     try {
       request = new HttpGet(url);
     } catch (final IllegalArgumentException ex) {
       // HttpGet rejects syntactically invalid URLs with an unchecked exception: report it like other fetch errors
       // instead of letting it escape from crawl() and fetch().
       throw new WebCrawlerException("Invalid URL " + url + ": " + ex.getMessage(), ex, false);
     }
     final HttpResponse response = _httpClient.execute(request);
     final HttpResponseInputStream responseStream = new HttpResponseInputStream(url, response, redirectLevel > 0);
     final int responseCode = response.getStatusLine().getStatusCode();
     if (responseCode == HttpStatus.SC_OK) {
       return responseStream;
     }
     if (!isRedirect(responseCode)) {
       IOUtils.closeQuietly(responseStream);
       throw new WebCrawlerException("GET " + url + ": server responded with " + responseCode + ".");
     }
     if (filterConfig == null || !filterConfig.followRedirects()) {
       IOUtils.closeQuietly(responseStream);
       throw new RedirectException("Follow redirects not configured, skipping link " + url);
     }
     return handleRedirects(responseStream, context, redirectLevel);
   }

   /** extract metadata (size, content type, mime type, charset, last-modified date) from HTTP response. */
   private void readMetadata(final Record record, final HttpResponseInputStream response) {
     final AnyMap metadata = record.getMetadata();
     final HttpEntity entity = response.getResponseEntity();
     if (entity != null) {
       metadata.put(WebCrawlerConstants.ATTRIBUTE_SIZE, entity.getContentLength());
     }
     final Header contentType = response.getResponse().getFirstHeader(HEADER_CONTENTTYPE);
     if (contentType != null) {
       metadata.put(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE, contentType.getValue());
       final HeaderElement[] elements = contentType.getElements();
       if (elements.length > 0) {
         // the first header element carries the mime type as its name and the charset as a parameter.
         final HeaderElement firstElement = elements[0];
         final String mimetype = firstElement.getName();
         if (mimetype != null) {
           metadata.put(WebCrawlerConstants.ATTRIBUTE_MIMETYPE, mimetype);
         }
         final NameValuePair charset = firstElement.getParameterByName(HEADER_PARAM_CHARSET);
         if (charset != null) {
           metadata.put(WebCrawlerConstants.ATTRIBUTE_CHARSET, charset.getValue());
         }
       }
     }
     final Header date = response.getResponse().getFirstHeader(HEADER_LASTMODIFIED);
     if (date != null) {
       try {
         final Date parsedDate = DateUtils.parseDate(date.getValue());
         metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED,
           metadata.getFactory().createDateTimeValue(parsedDate));
       } catch (final DateParseException ex) {
         // keep the unparsable header value as plain string instead of dropping it.
         metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, date.getValue());
       }
     }
   }

   /** get content from response, if it is HTML. The mime type is compared case-insensitively. */
   private void readHtmlContent(final Record record, final InputStream contentStream) throws IOException {
     final String mimetype = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
     if (MIMETYPE_HTML.equalsIgnoreCase(mimetype)) {
       readContent(record, contentStream);
     }
   }

   /**
    * get content from response, regardless of mimetype. If content could be read and size attribute is not set or
    * negative, adapt it to the actual size of the content.
    */
   private void readContent(final Record record, final InputStream contentStream) throws IOException {
     final byte[] content = IOUtils.toByteArray(contentStream);
     if (content != null) {
       record.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, content);
       final Long size = record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE);
       if (size == null || size < 0) {
         record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_SIZE, content.length);
       }
     }
   }

   /** @return true for the status codes 301, 302, 303 and 307, which are handled as redirects. */
   private boolean isRedirect(final int statusCode) {
     switch (statusCode) {
       case HttpStatus.SC_MOVED_PERMANENTLY:
       case HttpStatus.SC_MOVED_TEMPORARILY:
       case HttpStatus.SC_SEE_OTHER:
       case HttpStatus.SC_TEMPORARY_REDIRECT:
         return true;
       default:
         return false;
     }
   }

   /**
    * follow the redirect announced by the given response. The target is made absolute and normalized, must be accepted
    * by the link filter and must not have been visited before. It is marked as visited before it is requested. The
    * given redirect response is always closed.
    *
    * @throws RedirectException
    *           if the maximum number of redirects is reached, the location header is missing or invalid, or the target
    *           is filtered out or was already visited.
    */
   private HttpResponseInputStream handleRedirects(final HttpResponseInputStream responseStream,
     final WebCrawlingContext context, final int redirectLevel) throws WebCrawlerException, VisitedLinksException,
     IOException, RedirectException {
     try {
       if (redirectLevel >= context.getFilterConfiguration().getMaxRedirects()) {
         throw new RedirectException("Reached maximum number of redirects");
       }

       // get the location header to find out where to redirect to
       final HttpResponse response = responseStream.getResponse();
       final Header locationHeader = response.getFirstHeader(HEADER_LOCATION);
       if (locationHeader == null) {
         throw new RedirectException("Received redirect response " + response.getStatusLine()
           + " but no location header");
       }

       final String location = locationHeader.getValue();
       try {
         final String redirectUrl = UriHelper.makeAbsolute(responseStream.getUrl(), location);
         if (redirectUrl == null) {
           throw new RedirectException("Couldn't create absolute link from baseUri " + responseStream.getUrl()
             + " and link " + location);
         }
         final String normalizedRedirectUrl = UriHelper.normalizeUrl(redirectUrl);
         if (!_linkFilter.allowLink(normalizedRedirectUrl, context)) {
           throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl
             + "' is not allowed by filter configuration");
         }
         // check the local set first, the visited links service is only asked (and updated) for unknown URLs.
         if (context.getVisitedUrls().contains(normalizedRedirectUrl)
           || _visitedLinks.checkAndMarkVisited(context.getSource(), normalizedRedirectUrl, context.getJobRunId(),
             context.getCurrentInputBulkId())) {
           throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl
             + "' is not allowed. URL was already visited");
         }
         context.getVisitedUrls().add(normalizedRedirectUrl);
         return getResource(normalizedRedirectUrl, context, redirectLevel + 1);
       } catch (final URISyntaxException ex) {
         throw new RedirectException("Invalid Redirect location '" + location + "'", ex);
       }
     } finally {
       IOUtils.closeQuietly(responseStream);
     }
   }

   /**
    * if the response was reached via redirect, write its URL to the record: to the mapped URL attribute if the record
    * contains it, else to the unmapped attribute name.
    */
   private void resetUrlAttributeOnRedirect(final Record record, final HttpResponseInputStream response,
     final PropertyNameMapper mapper) {
     if (!response.isRedirect()) {
       return;
     }
     final AnyMap metadata = record.getMetadata();
     final String mappedUrlAttribute = mapper.get(WebCrawlerConstants.ATTRIBUTE_URL);
     if (metadata.containsKey(mappedUrlAttribute)) {
       metadata.put(mappedUrlAttribute, response.getUrl());
     } else {
       metadata.put(WebCrawlerConstants.ATTRIBUTE_URL, response.getUrl());
     }
   }

   /**
    * Creates the HttpClient with automatic redirects disabled. Uses the proxy host from the crawler configuration if one
    * is set, else a route planner based on the default ProxySelector. A socket timeout is only set if the configured
    * value is present and greater than 0.
    */
   private HttpClient createAndConfigureClient() {
     final ClientConnectionManager connectionManager =
       HttpClientUtil.createThreadSafeConnectionManager(DEFAULT_MAX_TOTAL_CONNECTIONS,
         DEFAULT_MAX_CONNECTIONS_PER_HOST);
     final HttpClient httpClient = new DefaultHttpClient(connectionManager);
     HttpClientParams.setRedirecting(httpClient.getParams(), false);
     final WebCrawlerConfiguration config = new WebCrawlerConfiguration();
     final HttpHost proxyHost = config.getProxyHost();
     if (proxyHost != null) {
       httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
     } else {
       ((AbstractHttpClient) httpClient).setRoutePlanner(new ProxySelectorRoutePlanner(httpClient
         .getConnectionManager().getSchemeRegistry(), ProxySelector.getDefault()));
     }
     final Integer socketTimeout = config.getSocketTimeout();
     // guard against a missing value before unboxing.
     if (socketTimeout != null && socketTimeout > 0) {
       httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, socketTimeout);
     }
     return httpClient;
   }

   /** DS service reference injection method. */
   public void setVisitedLinks(final VisitedLinksService visitedLinks) {
     _visitedLinks = visitedLinks;
   }

   /** DS service reference removal method. Only clears the reference if it is the currently set service. */
   public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
     if (_visitedLinks == visitedLinks) {
       _visitedLinks = null;
     }
   }

   /** DS service reference injection method. */
   public void setLinkFilter(final LinkFilter linkFilter) {
     _linkFilter = linkFilter;
   }

   /** DS service reference removal method. Only clears the reference if it is the currently set service. */
   public void unsetLinkFilter(final LinkFilter linkFilter) {
     if (_linkFilter == linkFilter) {
       _linkFilter = null;
     }
   }
 }
	/*******************************************************************************
	* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
	* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
	* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
	*******************************************************************************/

	package org.eclipse.smila.importing.crawler.web.fetcher;

	import java.io.IOException;
	import java.io.InputStream;
	import java.net.ProxySelector;
	import java.net.URISyntaxException;
	import java.util.Date;

	import org.apache.commons.io.IOUtils;
	import org.apache.http.Header;
	import org.apache.http.HeaderElement;
	import org.apache.http.HttpEntity;
	import org.apache.http.HttpHost;
	import org.apache.http.HttpResponse;
	import org.apache.http.HttpStatus;
	import org.apache.http.NameValuePair;
	import org.apache.http.client.HttpClient;
	import org.apache.http.client.RedirectException;
	import org.apache.http.client.methods.HttpGet;
	import org.apache.http.client.params.HttpClientParams;
	import org.apache.http.conn.ClientConnectionManager;
	import org.apache.http.conn.params.ConnRoutePNames;
	import org.apache.http.impl.client.AbstractHttpClient;
	import org.apache.http.impl.client.DefaultHttpClient;
	import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
	import org.apache.http.impl.cookie.DateParseException;
	import org.apache.http.impl.cookie.DateUtils;
	import org.apache.http.params.CoreConnectionPNames;
	import org.eclipse.smila.datamodel.AnyMap;
	import org.eclipse.smila.datamodel.Record;
	import org.eclipse.smila.http.client.util.HttpClientUtil;
	import org.eclipse.smila.importing.ImportingException;
	import org.eclipse.smila.importing.VisitedLinksException;
	import org.eclipse.smila.importing.VisitedLinksService;
	import org.eclipse.smila.importing.crawler.web.Fetcher;
	import org.eclipse.smila.importing.crawler.web.LinkFilter;
	import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
	import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
	import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
	import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
	import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
	import org.eclipse.smila.importing.crawler.web.utils.WebCrawlerConfiguration;
	import org.eclipse.smila.importing.util.PropertyNameMapper;
	import org.eclipse.smila.taskworker.TaskContext;

	/**
	* Example implementation of a Fetcher service. It uses GET method to access the resource.
	* <ul>
	* <li>During crawling it reads metadata for content-length, content-type and last-modified from the HTTP header to
	* attributes and attaches the content of resources that are reported as mime type "text/html".
	* <li>During fetching it just attaches the content of any resource.
	* </ul>
	* It does not (yet) support authentication. It is based on Apache HttpClient 4.1.
	*
	*/
	public class DefaultFetcher implements Fetcher {

	/** name of HTTP header for last-modified date. */
	private static final String HEADER_LASTMODIFIED = "Last-Modified";

	/** name of HTTP header for content-type and charset. */
	private static final String HEADER_CONTENTTYPE = "Content-Type";

	/** name of Content-type header parameter for charset. */
	private static final String HEADER_PARAM_CHARSET = "charset";

	/** default setttings for the connection manager. */
	private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32;

	/** default setttings for the connection manager. */
	private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128;

	/** reference to VisitedLinks service. */
	private VisitedLinksService _visitedLinks;

	/** reference to LinkFilter service. */
	private LinkFilter _linkFilter;

	/** client for all http operations. */
	private final HttpClient _httpClient;

	/** initialize HttpClient with disabled redirects. */
	public DefaultFetcher() {
	_httpClient = createAndonfigureClient();
	}

	@Override
	public void crawl(final String url, final Record linkRecord, final WebCrawlingContext context)
	throws WebCrawlerException {
	HttpResponseInputStream response = null;
	try {
	response = getResource(url, context);
	resetUrlAttributeOnRedirect(linkRecord, response, context.getMapper());
	readMetadata(linkRecord, response);
	readHtmlContent(linkRecord, response);
	} catch (final RedirectException ex) {
	throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
	+ ex.getMessage(), ex, false);
	} catch (final VisitedLinksException ex) {
	throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
	+ ex.getMessage(), ex, true);
	} catch (final IOException ex) {
	throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
	} finally {
	IOUtils.closeQuietly(response);
	}
	}

	@Override
	public void fetch(final String url, final Record crawledRecord, final WebCrawlingContext context)
	throws WebCrawlerException {
	HttpResponseInputStream response = null;
	try {
	response = getResource(url, context);
	resetUrlAttributeOnRedirect(crawledRecord, response, context.getMapper());
	readContent(crawledRecord, response);
	} catch (final RedirectException ex) {
	throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
	+ ex.getMessage(), ex, false);
	} catch (final VisitedLinksException ex) {
	throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
	+ ex.getMessage(), ex, true);
	} catch (final IOException ex) {
	throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
	} finally {
	IOUtils.closeQuietly(response);
	}
	}

	/**
	* {@inheritDoc}
	*
	* <p>
	* <b>Please note: a mapped record (at least URL must be mapped) is expected here!</b>
	* </p>
	*/
	@Override
	public InputStream getContent(final Record crawledRecord, final TaskContext taskContext)
	throws ImportingException {
	final PropertyNameMapper mapper = PropertyNameMapper.createFrom(taskContext);
	String url = crawledRecord.getMetadata().getStringValue(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL));
	if (url == null) {
	url = crawledRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
	}
	try {
	final HttpResponseInputStream response = getResource(url, new WebCrawlingContext(taskContext));
	resetUrlAttributeOnRedirect(crawledRecord, response, mapper);
	return response;
	} catch (final RedirectException ex) {
	throw new ImportingException("Error while handling redirects for web resource " + url + ": "
	+ ex.getMessage(), ex, false);
	} catch (final VisitedLinksException ex) {
	throw new ImportingException("Error while handling redirects for web resource " + url + ": "
	+ ex.getMessage(), ex, true);
	} catch (final IOException ex) {
	throw new ImportingException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
	} catch (final Exception ex) {
	throw new ImportingException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex,
	false);
	}
	}

	private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context)
	throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
	return getResource(url, context, 0);
	}

	/** create GET request to given resource, and return it if the response code was 200 (OK). */
	private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context,
	final int redirectLevel) throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
	final FilterConfiguration filterConfig = context.getFilterConfiguration();
	final HttpGet request = new HttpGet(url);
	final HttpResponse response = _httpClient.execute(request);
	final HttpResponseInputStream responseStream = new HttpResponseInputStream(url, response, redirectLevel > 0);
	final int responseCode = response.getStatusLine().getStatusCode();
	if (responseCode == HttpStatus.SC_OK) {
	return responseStream;
	} else if (isRedirect(responseCode)) {
	if (filterConfig != null && filterConfig.followRedirects()) {
	return handleRedirects(responseStream, context, redirectLevel);
	} else {
	IOUtils.closeQuietly(responseStream);
	throw new RedirectException("Follow redirects not configured, skipping link " + url);
	}
	} else {
	IOUtils.closeQuietly(responseStream);
	throw new WebCrawlerException("GET " + url + ": server responded with " + responseCode + ".");
	}
	}

	/** extract metadata from HTTP response. */
	private void readMetadata(final Record record, final HttpResponseInputStream response) {
	final AnyMap metadata = record.getMetadata();
	final HttpEntity entity = response.getResponseEntity();
	if (entity != null) {
	metadata.put(WebCrawlerConstants.ATTRIBUTE_SIZE, entity.getContentLength());
	}
	final Header contentType = response.getResponse().getFirstHeader(HEADER_CONTENTTYPE);
	if (contentType != null) {
	metadata.put(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE, contentType.getValue());
	final HeaderElement[] elements = contentType.getElements();
	if (elements.length > 0) {
	final String mimetype = elements[0].getName();
	if (mimetype != null) {
	metadata.put(WebCrawlerConstants.ATTRIBUTE_MIMETYPE, mimetype);
	}
	final NameValuePair charset = elements[0].getParameterByName(HEADER_PARAM_CHARSET);
	if (charset != null) {
	metadata.put(WebCrawlerConstants.ATTRIBUTE_CHARSET, charset.getValue());
	}
	}
	}
	final Header date = response.getResponse().getFirstHeader(HEADER_LASTMODIFIED);
	if (date != null) {
	try {
	final Date parsedDate = DateUtils.parseDate(date.getValue());
	metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED,
	metadata.getFactory().createDateTimeValue(parsedDate));
	} catch (final DateParseException ex) {
	metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, date.getValue());
	}
	}
	}

	/** get content from response, if it is HTML. */
	private void readHtmlContent(final Record record, final InputStream contentStream) throws IOException {
	final String mimetype = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
	if (mimetype != null && mimetype.equals("text/html")) {
	readContent(record, contentStream);
	}
	}

	/**
	* get content from response, regardless of mimetype. If content could be read and size attribute is not set or
	* negative, adapt it to the actual size of the content.
	*/
	private void readContent(final Record record, final InputStream contentStream) throws IOException {
	final byte[] content = IOUtils.toByteArray(contentStream);
	if (content != null) {
	record.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, content);
	final Long size = record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE);
	if (size == null \|\| size < 0) {
	record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_SIZE, content.length);
	}
	}
	}

	private boolean isRedirect(final int statusCode) {
	switch (statusCode) {
	case HttpStatus.SC_MOVED_PERMANENTLY:
	return true;
	case HttpStatus.SC_MOVED_TEMPORARILY:
	return true;
	case HttpStatus.SC_SEE_OTHER:
	return true;
	case HttpStatus.SC_TEMPORARY_REDIRECT:
	return true;
	default:
	return false;
	}
	}

	private HttpResponseInputStream handleRedirects(final HttpResponseInputStream responseStream,
	final WebCrawlingContext context, int redirectLevel) throws WebCrawlerException, VisitedLinksException,
	IOException, RedirectException {
	try {
	if (redirectLevel >= context.getFilterConfiguration().getMaxRedirects()) {
	throw new RedirectException("Reached maximum number of redirects");
	}

	// get the location header to find out where to redirect to
	final HttpResponse response = responseStream.getResponse();
	final Header locationHeader = response.getFirstHeader("location");
	if (locationHeader == null) {
	throw new RedirectException("Received redirect response " + response.getStatusLine()
	+ " but no location header");
	}

	final String location = locationHeader.getValue();
	try {
	final String redirectUrl = UriHelper.makeAbsolute(responseStream.getUrl(), location);
	if (redirectUrl == null) {
	throw new RedirectException("Couldn't create absolute link from baseUri " + responseStream.getUrl()
	+ " and link " + location);
	}
	final String normalizedRedirectUrl = UriHelper.normalizeUrl(redirectUrl);
	if (_linkFilter.allowLink(normalizedRedirectUrl, context)) {
	if (!context.getVisitedUrls().contains(normalizedRedirectUrl)
	&& !_visitedLinks.checkAndMarkVisited(context.getSource(), normalizedRedirectUrl,
	context.getJobRunId(), context.getCurrentInputBulkId())) {
	context.getVisitedUrls().add(normalizedRedirectUrl);
	return getResource(normalizedRedirectUrl, context, ++redirectLevel);
	} else {
	throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl
	+ "' is not allowed. URL was already visited");
	}
	} else {
	throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl
	+ "' is not allowed by filter configuration");
	}
	} catch (final URISyntaxException ex) {
	throw new RedirectException("Invalid Redirect location '" + location + "'", ex);
	}
	} finally {
	IOUtils.closeQuietly(responseStream);
	}
	}

	private void resetUrlAttributeOnRedirect(final Record record, final HttpResponseInputStream response,
	final PropertyNameMapper mapper) {
	if (response.isRedirect()) {
	if (record.getMetadata().containsKey(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL))) {
	record.getMetadata().put(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL), response.getUrl());
	} else {
	record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, response.getUrl());
	}
	}
	}

	/** Reads proxy configuration from config file and sets the proxy configuration accordingly. */
	private HttpClient createAndonfigureClient() {
	final ClientConnectionManager connectionManager =
	HttpClientUtil.createThreadSafeConnectionManager(DEFAULT_MAX_TOTAL_CONNECTIONS,
	DEFAULT_MAX_CONNECTIONS_PER_HOST);
	final HttpClient httpClient = new DefaultHttpClient(connectionManager);
	HttpClientParams.setRedirecting(httpClient.getParams(), false);
	final WebCrawlerConfiguration config = new WebCrawlerConfiguration();
	final HttpHost proxyHost = config.getProxyHost();
	if (proxyHost != null) {
	httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
	} else {
	((AbstractHttpClient) httpClient).setRoutePlanner(new ProxySelectorRoutePlanner(httpClient
	.getConnectionManager().getSchemeRegistry(), ProxySelector.getDefault()));
	}
	final Integer socketTimeout = config.getSocketTimeout();
	if (socketTimeout > 0) {
	httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, socketTimeout);
	}
	return httpClient;
	}

	/** DS service reference injection method. */
	public void setVisitedLinks(final VisitedLinksService visitedLinks) {
	_visitedLinks = visitedLinks;
	}

	/** DS service reference removal method. */
	public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
	if (_visitedLinks == visitedLinks) {
	_visitedLinks = null;
	}
	}

	/** DS service reference injection method. */
	public void setLinkFilter(final LinkFilter linkFilter) {
	_linkFilter = linkFilter;
	}

	/** DS service reference removal method. */
	public void unsetLinkFilter(final LinkFilter linkFilter) {
	if (_linkFilter == linkFilter) {
	_linkFilter = null;
	}
	}
	}