| /******************************************************************************* |
| * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation |
| *******************************************************************************/ |
| |
| package org.eclipse.smila.importing.crawler.web.fetcher; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.net.ProxySelector; |
| import java.net.URISyntaxException; |
| import java.util.Date; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.http.Header; |
| import org.apache.http.HeaderElement; |
| import org.apache.http.HttpEntity; |
| import org.apache.http.HttpHost; |
| import org.apache.http.HttpResponse; |
| import org.apache.http.HttpStatus; |
| import org.apache.http.NameValuePair; |
| import org.apache.http.client.HttpClient; |
| import org.apache.http.client.RedirectException; |
| import org.apache.http.client.methods.HttpGet; |
| import org.apache.http.client.params.HttpClientParams; |
| import org.apache.http.conn.ClientConnectionManager; |
| import org.apache.http.conn.params.ConnRoutePNames; |
| import org.apache.http.impl.client.AbstractHttpClient; |
| import org.apache.http.impl.client.DefaultHttpClient; |
| import org.apache.http.impl.conn.ProxySelectorRoutePlanner; |
| import org.apache.http.impl.cookie.DateParseException; |
| import org.apache.http.impl.cookie.DateUtils; |
| import org.apache.http.params.CoreConnectionPNames; |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.Record; |
| import org.eclipse.smila.http.client.util.HttpClientUtil; |
| import org.eclipse.smila.importing.ImportingException; |
| import org.eclipse.smila.importing.VisitedLinksException; |
| import org.eclipse.smila.importing.VisitedLinksService; |
| import org.eclipse.smila.importing.crawler.web.Fetcher; |
| import org.eclipse.smila.importing.crawler.web.LinkFilter; |
| import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants; |
| import org.eclipse.smila.importing.crawler.web.WebCrawlerException; |
| import org.eclipse.smila.importing.crawler.web.WebCrawlingContext; |
| import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration; |
| import org.eclipse.smila.importing.crawler.web.utils.UriHelper; |
| import org.eclipse.smila.importing.crawler.web.utils.WebCrawlerConfiguration; |
| import org.eclipse.smila.importing.util.PropertyNameMapper; |
| import org.eclipse.smila.taskworker.TaskContext; |
| |
| /** |
| * Example implementation of a Fetcher service. It uses GET method to access the resource. |
| * <ul> |
| * <li>During crawling it reads metadata for content-length, content-type and last-modified from the HTTP header to |
| * attributes and attaches the content of resources that are reported as mime type "text/html". |
| * <li>During fetching it just attaches the content of any resource. |
| * </ul> |
| * It does not (yet) support authentication. It is based on Apache HttpClient 4.1. |
| * |
| */ |
| public class DefaultFetcher implements Fetcher { |
| |
| /** name of HTTP header for last-modified date. */ |
| private static final String HEADER_LASTMODIFIED = "Last-Modified"; |
| |
| /** name of HTTP header for content-type and charset. */ |
| private static final String HEADER_CONTENTTYPE = "Content-Type"; |
| |
| /** name of Content-type header parameter for charset. */ |
| private static final String HEADER_PARAM_CHARSET = "charset"; |
| |
| /** default setttings for the connection manager. */ |
| private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32; |
| |
| /** default setttings for the connection manager. */ |
| private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128; |
| |
| /** reference to VisitedLinks service. */ |
| private VisitedLinksService _visitedLinks; |
| |
| /** reference to LinkFilter service. */ |
| private LinkFilter _linkFilter; |
| |
| /** client for all http operations. */ |
| private final HttpClient _httpClient; |
| |
| /** initialize HttpClient with disabled redirects. */ |
| public DefaultFetcher() { |
| _httpClient = createAndonfigureClient(); |
| } |
| |
| @Override |
| public void crawl(final String url, final Record linkRecord, final WebCrawlingContext context) |
| throws WebCrawlerException { |
| HttpResponseInputStream response = null; |
| try { |
| response = getResource(url, context); |
| resetUrlAttributeOnRedirect(linkRecord, response, context.getMapper()); |
| readMetadata(linkRecord, response); |
| readHtmlContent(linkRecord, response); |
| } catch (final RedirectException ex) { |
| throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": " |
| + ex.getMessage(), ex, false); |
| } catch (final VisitedLinksException ex) { |
| throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": " |
| + ex.getMessage(), ex, true); |
| } catch (final IOException ex) { |
| throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true); |
| } finally { |
| IOUtils.closeQuietly(response); |
| } |
| } |
| |
| @Override |
| public void fetch(final String url, final Record crawledRecord, final WebCrawlingContext context) |
| throws WebCrawlerException { |
| HttpResponseInputStream response = null; |
| try { |
| response = getResource(url, context); |
| resetUrlAttributeOnRedirect(crawledRecord, response, context.getMapper()); |
| readContent(crawledRecord, response); |
| } catch (final RedirectException ex) { |
| throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": " |
| + ex.getMessage(), ex, false); |
| } catch (final VisitedLinksException ex) { |
| throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": " |
| + ex.getMessage(), ex, true); |
| } catch (final IOException ex) { |
| throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true); |
| } finally { |
| IOUtils.closeQuietly(response); |
| } |
| } |
| |
| /** |
| * {@inheritDoc} |
| * |
| * <p> |
| * <b>Please note: a mapped record (at least URL must be mapped) is expected here!</b> |
| * </p> |
| */ |
| @Override |
| public InputStream getContent(final Record crawledRecord, final TaskContext taskContext) |
| throws ImportingException { |
| final PropertyNameMapper mapper = PropertyNameMapper.createFrom(taskContext); |
| String url = crawledRecord.getMetadata().getStringValue(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL)); |
| if (url == null) { |
| url = crawledRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL); |
| } |
| try { |
| final HttpResponseInputStream response = getResource(url, new WebCrawlingContext(taskContext)); |
| resetUrlAttributeOnRedirect(crawledRecord, response, mapper); |
| return response; |
| } catch (final RedirectException ex) { |
| throw new ImportingException("Error while handling redirects for web resource " + url + ": " |
| + ex.getMessage(), ex, false); |
| } catch (final VisitedLinksException ex) { |
| throw new ImportingException("Error while handling redirects for web resource " + url + ": " |
| + ex.getMessage(), ex, true); |
| } catch (final IOException ex) { |
| throw new ImportingException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true); |
| } catch (final Exception ex) { |
| throw new ImportingException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex, |
| false); |
| } |
| } |
| |
| private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context) |
| throws WebCrawlerException, VisitedLinksException, RedirectException, IOException { |
| return getResource(url, context, 0); |
| } |
| |
| /** create GET request to given resource, and return it if the response code was 200 (OK). */ |
| private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context, |
| final int redirectLevel) throws WebCrawlerException, VisitedLinksException, RedirectException, IOException { |
| final FilterConfiguration filterConfig = context.getFilterConfiguration(); |
| final HttpGet request = new HttpGet(url); |
| final HttpResponse response = _httpClient.execute(request); |
| final HttpResponseInputStream responseStream = new HttpResponseInputStream(url, response, redirectLevel > 0); |
| final int responseCode = response.getStatusLine().getStatusCode(); |
| if (responseCode == HttpStatus.SC_OK) { |
| return responseStream; |
| } else if (isRedirect(responseCode)) { |
| if (filterConfig != null && filterConfig.followRedirects()) { |
| return handleRedirects(responseStream, context, redirectLevel); |
| } else { |
| IOUtils.closeQuietly(responseStream); |
| throw new RedirectException("Follow redirects not configured, skipping link " + url); |
| } |
| } else { |
| IOUtils.closeQuietly(responseStream); |
| throw new WebCrawlerException("GET " + url + ": server responded with " + responseCode + "."); |
| } |
| } |
| |
| /** extract metadata from HTTP response. */ |
| private void readMetadata(final Record record, final HttpResponseInputStream response) { |
| final AnyMap metadata = record.getMetadata(); |
| final HttpEntity entity = response.getResponseEntity(); |
| if (entity != null) { |
| metadata.put(WebCrawlerConstants.ATTRIBUTE_SIZE, entity.getContentLength()); |
| } |
| final Header contentType = response.getResponse().getFirstHeader(HEADER_CONTENTTYPE); |
| if (contentType != null) { |
| metadata.put(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE, contentType.getValue()); |
| final HeaderElement[] elements = contentType.getElements(); |
| if (elements.length > 0) { |
| final String mimetype = elements[0].getName(); |
| if (mimetype != null) { |
| metadata.put(WebCrawlerConstants.ATTRIBUTE_MIMETYPE, mimetype); |
| } |
| final NameValuePair charset = elements[0].getParameterByName(HEADER_PARAM_CHARSET); |
| if (charset != null) { |
| metadata.put(WebCrawlerConstants.ATTRIBUTE_CHARSET, charset.getValue()); |
| } |
| } |
| } |
| final Header date = response.getResponse().getFirstHeader(HEADER_LASTMODIFIED); |
| if (date != null) { |
| try { |
| final Date parsedDate = DateUtils.parseDate(date.getValue()); |
| metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, |
| metadata.getFactory().createDateTimeValue(parsedDate)); |
| } catch (final DateParseException ex) { |
| metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, date.getValue()); |
| } |
| } |
| } |
| |
| /** get content from response, if it is HTML. */ |
| private void readHtmlContent(final Record record, final InputStream contentStream) throws IOException { |
| final String mimetype = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE); |
| if (mimetype != null && mimetype.equals("text/html")) { |
| readContent(record, contentStream); |
| } |
| } |
| |
| /** |
| * get content from response, regardless of mimetype. If content could be read and size attribute is not set or |
| * negative, adapt it to the actual size of the content. |
| */ |
| private void readContent(final Record record, final InputStream contentStream) throws IOException { |
| final byte[] content = IOUtils.toByteArray(contentStream); |
| if (content != null) { |
| record.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, content); |
| final Long size = record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE); |
| if (size == null || size < 0) { |
| record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_SIZE, content.length); |
| } |
| } |
| } |
| |
| private boolean isRedirect(final int statusCode) { |
| switch (statusCode) { |
| case HttpStatus.SC_MOVED_PERMANENTLY: |
| return true; |
| case HttpStatus.SC_MOVED_TEMPORARILY: |
| return true; |
| case HttpStatus.SC_SEE_OTHER: |
| return true; |
| case HttpStatus.SC_TEMPORARY_REDIRECT: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| private HttpResponseInputStream handleRedirects(final HttpResponseInputStream responseStream, |
| final WebCrawlingContext context, int redirectLevel) throws WebCrawlerException, VisitedLinksException, |
| IOException, RedirectException { |
| try { |
| if (redirectLevel >= context.getFilterConfiguration().getMaxRedirects()) { |
| throw new RedirectException("Reached maximum number of redirects"); |
| } |
| |
| // get the location header to find out where to redirect to |
| final HttpResponse response = responseStream.getResponse(); |
| final Header locationHeader = response.getFirstHeader("location"); |
| if (locationHeader == null) { |
| throw new RedirectException("Received redirect response " + response.getStatusLine() |
| + " but no location header"); |
| } |
| |
| final String location = locationHeader.getValue(); |
| try { |
| final String redirectUrl = UriHelper.makeAbsolute(responseStream.getUrl(), location); |
| if (redirectUrl == null) { |
| throw new RedirectException("Couldn't create absolute link from baseUri " + responseStream.getUrl() |
| + " and link " + location); |
| } |
| final String normalizedRedirectUrl = UriHelper.normalizeUrl(redirectUrl); |
| if (_linkFilter.allowLink(normalizedRedirectUrl, context)) { |
| if (!context.getVisitedUrls().contains(normalizedRedirectUrl) |
| && !_visitedLinks.checkAndMarkVisited(context.getSource(), normalizedRedirectUrl, |
| context.getJobRunId(), context.getCurrentInputBulkId())) { |
| context.getVisitedUrls().add(normalizedRedirectUrl); |
| return getResource(normalizedRedirectUrl, context, ++redirectLevel); |
| } else { |
| throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl |
| + "' is not allowed. URL was already visited"); |
| } |
| } else { |
| throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl |
| + "' is not allowed by filter configuration"); |
| } |
| } catch (final URISyntaxException ex) { |
| throw new RedirectException("Invalid Redirect location '" + location + "'", ex); |
| } |
| } finally { |
| IOUtils.closeQuietly(responseStream); |
| } |
| } |
| |
| private void resetUrlAttributeOnRedirect(final Record record, final HttpResponseInputStream response, |
| final PropertyNameMapper mapper) { |
| if (response.isRedirect()) { |
| if (record.getMetadata().containsKey(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL))) { |
| record.getMetadata().put(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL), response.getUrl()); |
| } else { |
| record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, response.getUrl()); |
| } |
| } |
| } |
| |
| /** Reads proxy configuration from config file and sets the proxy configuration accordingly. */ |
| private HttpClient createAndonfigureClient() { |
| final ClientConnectionManager connectionManager = |
| HttpClientUtil.createThreadSafeConnectionManager(DEFAULT_MAX_TOTAL_CONNECTIONS, |
| DEFAULT_MAX_CONNECTIONS_PER_HOST); |
| final HttpClient httpClient = new DefaultHttpClient(connectionManager); |
| HttpClientParams.setRedirecting(httpClient.getParams(), false); |
| final WebCrawlerConfiguration config = new WebCrawlerConfiguration(); |
| final HttpHost proxyHost = config.getProxyHost(); |
| if (proxyHost != null) { |
| httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost); |
| } else { |
| ((AbstractHttpClient) httpClient).setRoutePlanner(new ProxySelectorRoutePlanner(httpClient |
| .getConnectionManager().getSchemeRegistry(), ProxySelector.getDefault())); |
| } |
| final Integer socketTimeout = config.getSocketTimeout(); |
| if (socketTimeout > 0) { |
| httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, socketTimeout); |
| } |
| return httpClient; |
| } |
| |
| /** DS service reference injection method. */ |
| public void setVisitedLinks(final VisitedLinksService visitedLinks) { |
| _visitedLinks = visitedLinks; |
| } |
| |
| /** DS service reference removal method. */ |
| public void unsetVisitedLinks(final VisitedLinksService visitedLinks) { |
| if (_visitedLinks == visitedLinks) { |
| _visitedLinks = null; |
| } |
| } |
| |
| /** DS service reference injection method. */ |
| public void setLinkFilter(final LinkFilter linkFilter) { |
| _linkFilter = linkFilter; |
| } |
| |
| /** DS service reference removal method. */ |
| public void unsetLinkFilter(final LinkFilter linkFilter) { |
| if (_linkFilter == linkFilter) { |
| _linkFilter = null; |
| } |
| } |
| } |