blob: 40fbce54b9fed0dce4782c53c021aae87d54e553 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web.fetcher;
import java.io.IOException;
import java.io.InputStream;
import java.net.ProxySelector;
import java.net.URISyntaxException;
import java.util.Date;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
import org.apache.http.impl.cookie.DateParseException;
import org.apache.http.impl.cookie.DateUtils;
import org.apache.http.params.CoreConnectionPNames;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.http.client.util.HttpClientUtil;
import org.eclipse.smila.importing.ImportingException;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.importing.crawler.web.utils.WebCrawlerConfiguration;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.taskworker.TaskContext;
/**
 * Example implementation of a Fetcher service. It uses the GET method to access the resource.
 * <ul>
 * <li>During crawling it reads metadata for content-length, content-type and last-modified from the HTTP header to
 * attributes and attaches the content of resources that are reported as mime type "text/html".
 * <li>During fetching it just attaches the content of any resource.
 * </ul>
 * Automatic redirect handling of the HTTP client is switched off. Redirects are followed by this class itself so that
 * every redirect target can be checked against the link filter and the visited links before it is requested.
 * <p>
 * It does not (yet) support authentication. It is based on Apache HttpClient 4.1.
 * </p>
 */
public class DefaultFetcher implements Fetcher {
  /** name of HTTP header for last-modified date. */
  private static final String HEADER_LASTMODIFIED = "Last-Modified";

  /** name of HTTP header for content-type and charset. */
  private static final String HEADER_CONTENTTYPE = "Content-Type";

  /** name of HTTP header that contains the target of a redirect. */
  private static final String HEADER_LOCATION = "location";

  /** name of Content-type header parameter for charset. */
  private static final String HEADER_PARAM_CHARSET = "charset";

  /** mime type of the resources whose content is attached during crawling. */
  private static final String MIMETYPE_HTML = "text/html";

  /**
   * status code "308 Permanent Redirect". NOTE(review): declared locally as a literal; replace by the HttpStatus
   * constant if the HttpCore version in use provides one.
   */
  private static final int SC_PERMANENT_REDIRECT = 308;

  /** default settings for the connection manager: maximum number of connections per host. */
  private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32;

  /** default settings for the connection manager: maximum number of connections in total. */
  private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128;

  /** reference to VisitedLinks service. */
  private VisitedLinksService _visitedLinks;

  /** reference to LinkFilter service. */
  private LinkFilter _linkFilter;

  /** client for all http operations. */
  private final HttpClient _httpClient;

  /** initialize HttpClient with disabled redirects. */
  public DefaultFetcher() {
    _httpClient = createAndConfigureClient();
  }

  /**
   * Requests the resource, writes the HTTP header metadata to the record and attaches the content if the resource is
   * reported as HTML. The response is always closed before the method returns.
   *
   * @param url
   *          URL of the resource to crawl.
   * @param linkRecord
   *          record to enrich with metadata and (for HTML) content.
   * @param context
   *          crawling context providing filter configuration and attribute mapper.
   * @throws WebCrawlerException
   *           if the resource could not be requested or read.
   */
  @Override
  public void crawl(final String url, final Record linkRecord, final WebCrawlingContext context)
    throws WebCrawlerException {
    HttpResponseInputStream response = null;
    try {
      response = getResource(url, context);
      resetUrlAttributeOnRedirect(linkRecord, response, context.getMapper());
      readMetadata(linkRecord, response);
      readHtmlContent(linkRecord, response);
    } catch (final RedirectException ex) {
      throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, false);
    } catch (final VisitedLinksException ex) {
      throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, true);
    } catch (final IOException ex) {
      throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
    } finally {
      IOUtils.closeQuietly(response);
    }
  }

  /**
   * Requests the resource and attaches its content to the record, regardless of the mime type. The response is always
   * closed before the method returns.
   *
   * @param url
   *          URL of the resource to fetch.
   * @param crawledRecord
   *          record to attach the content to.
   * @param context
   *          crawling context providing filter configuration and attribute mapper.
   * @throws WebCrawlerException
   *           if the resource could not be requested or read.
   */
  @Override
  public void fetch(final String url, final Record crawledRecord, final WebCrawlingContext context)
    throws WebCrawlerException {
    HttpResponseInputStream response = null;
    try {
      response = getResource(url, context);
      resetUrlAttributeOnRedirect(crawledRecord, response, context.getMapper());
      readContent(crawledRecord, response);
    } catch (final RedirectException ex) {
      throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, false);
    } catch (final VisitedLinksException ex) {
      throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, true);
    } catch (final IOException ex) {
      throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
    } finally {
      IOUtils.closeQuietly(response);
    }
  }

  /**
   * {@inheritDoc}
   *
   * <p>
   * <b>Please note: a mapped record (at least URL must be mapped) is expected here!</b>
   * </p>
   * <p>
   * The URL is read from the mapped URL attribute first and from the unmapped attribute name as a fallback. On success
   * the returned stream is open and must be closed by the caller. If anything fails after the resource was requested,
   * the response is closed here, because the caller never gets a reference to it.
   * </p>
   */
  @Override
  public InputStream getContent(final Record crawledRecord, final TaskContext taskContext)
    throws ImportingException {
    final PropertyNameMapper mapper = PropertyNameMapper.createFrom(taskContext);
    String url = crawledRecord.getMetadata().getStringValue(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL));
    if (url == null) {
      url = crawledRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    }
    HttpResponseInputStream response = null;
    // becomes true only when the open stream is actually handed over to the caller.
    boolean handedOver = false;
    try {
      response = getResource(url, new WebCrawlingContext(taskContext));
      resetUrlAttributeOnRedirect(crawledRecord, response, mapper);
      handedOver = true;
      return response;
    } catch (final RedirectException ex) {
      throw new ImportingException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, false);
    } catch (final VisitedLinksException ex) {
      throw new ImportingException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, true);
    } catch (final IOException ex) {
      throw new ImportingException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
    } catch (final Exception ex) {
      throw new ImportingException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex,
        false);
    } finally {
      if (!handedOver) {
        // closeQuietly accepts null, so this is also safe if the request itself failed.
        IOUtils.closeQuietly(response);
      }
    }
  }

  /** request the given resource, starting at redirect level 0. */
  private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context)
    throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
    return getResource(url, context, 0);
  }

  /**
   * create GET request to given resource, and return it if the response code was 200 (OK). Redirect responses are
   * followed if the filter configuration allows it, any other response code is reported as an error.
   *
   * @param url
   *          URL to request.
   * @param context
   *          crawling context providing the filter configuration.
   * @param redirectLevel
   *          number of redirects that have already been followed to get to this URL.
   * @return the open response stream of the final (status 200) response.
   * @throws WebCrawlerException
   *           if the URL is not accepted by HttpGet or the server responded with an unexpected status code.
   * @throws RedirectException
   *           if a redirect was received that cannot or must not be followed.
   */
  private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context,
    final int redirectLevel) throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
    final FilterConfiguration filterConfig = context.getFilterConfiguration();
    final HttpGet request;
    try {
      request = new HttpGet(url);
    } catch (final IllegalArgumentException ex) {
      // HttpGet rejects malformed URLs with an unchecked exception. Report it like the other fetch errors instead of
      // letting it escape; the flag has the same value as used for redirect errors.
      throw new WebCrawlerException("Invalid URL '" + url + "': " + ex.getMessage(), ex, false);
    }
    final HttpResponse response = _httpClient.execute(request);
    final HttpResponseInputStream responseStream = new HttpResponseInputStream(url, response, redirectLevel > 0);
    final int responseCode = response.getStatusLine().getStatusCode();
    if (responseCode == HttpStatus.SC_OK) {
      return responseStream;
    }
    if (isRedirect(responseCode)) {
      if (filterConfig != null && filterConfig.followRedirects()) {
        // handleRedirects takes care of closing the redirect response.
        return handleRedirects(responseStream, context, redirectLevel);
      }
      IOUtils.closeQuietly(responseStream);
      throw new RedirectException("Follow redirects not configured, skipping link " + url);
    }
    IOUtils.closeQuietly(responseStream);
    throw new WebCrawlerException("GET " + url + ": server responded with " + responseCode + ".");
  }

  /**
   * extract metadata from HTTP response: size from the entity, content-type, mimetype and charset from the
   * Content-Type header, and last-modified date from the Last-Modified header.
   */
  private void readMetadata(final Record record, final HttpResponseInputStream response) {
    final AnyMap metadata = record.getMetadata();
    final HttpEntity entity = response.getResponseEntity();
    if (entity != null) {
      // may be negative if the length is unknown; readContent() corrects this once the content was read.
      metadata.put(WebCrawlerConstants.ATTRIBUTE_SIZE, entity.getContentLength());
    }
    final Header contentType = response.getResponse().getFirstHeader(HEADER_CONTENTTYPE);
    if (contentType != null) {
      metadata.put(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE, contentType.getValue());
      final HeaderElement[] elements = contentType.getElements();
      if (elements.length > 0) {
        final HeaderElement firstElement = elements[0];
        final String mimetype = firstElement.getName();
        if (mimetype != null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_MIMETYPE, mimetype);
        }
        final NameValuePair charset = firstElement.getParameterByName(HEADER_PARAM_CHARSET);
        if (charset != null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_CHARSET, charset.getValue());
        }
      }
    }
    final Header date = response.getResponse().getFirstHeader(HEADER_LASTMODIFIED);
    if (date != null) {
      try {
        final Date parsedDate = DateUtils.parseDate(date.getValue());
        metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED,
          metadata.getFactory().createDateTimeValue(parsedDate));
      } catch (final DateParseException ex) {
        // keep the unparsable header value as a plain string instead of dropping it.
        metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, date.getValue());
      }
    }
  }

  /**
   * get content from response, if it is HTML. The decision is based on the mimetype attribute written by
   * {@link #readMetadata(Record, HttpResponseInputStream)}. The comparison ignores case, because media type names are
   * case-insensitive.
   */
  private void readHtmlContent(final Record record, final InputStream contentStream) throws IOException {
    final String mimetype = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
    if (MIMETYPE_HTML.equalsIgnoreCase(mimetype)) {
      readContent(record, contentStream);
    }
  }

  /**
   * get content from response, regardless of mimetype. If the size attribute is not set or negative, adapt it to the
   * actual size of the content.
   */
  private void readContent(final Record record, final InputStream contentStream) throws IOException {
    final byte[] content = IOUtils.toByteArray(contentStream);
    record.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, content);
    final Long size = record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE);
    if (size == null || size < 0) {
      record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_SIZE, content.length);
    }
  }

  /** @return true if the status code is one of the redirect codes 301, 302, 303, 307 or 308. */
  private boolean isRedirect(final int statusCode) {
    switch (statusCode) {
      case HttpStatus.SC_MOVED_PERMANENTLY:
      case HttpStatus.SC_MOVED_TEMPORARILY:
      case HttpStatus.SC_SEE_OTHER:
      case HttpStatus.SC_TEMPORARY_REDIRECT:
      case SC_PERMANENT_REDIRECT:
        return true;
      default:
        return false;
    }
  }

  /**
   * Follows the redirect described by the given response. The redirect target is made absolute and normalized, and is
   * only requested if the link filter allows it and it was not visited before. The given redirect response is always
   * closed, whether the redirect could be followed or not.
   *
   * @param responseStream
   *          the redirect response.
   * @param context
   *          crawling context; its filter configuration must not be null here.
   * @param redirectLevel
   *          number of redirects followed before this one.
   * @return response stream of the redirect target.
   * @throws RedirectException
   *           if the redirect limit is reached, the location is missing or invalid, or the target is filtered out or
   *           already visited.
   */
  private HttpResponseInputStream handleRedirects(final HttpResponseInputStream responseStream,
    final WebCrawlingContext context, final int redirectLevel) throws WebCrawlerException, VisitedLinksException,
    IOException, RedirectException {
    try {
      if (redirectLevel >= context.getFilterConfiguration().getMaxRedirects()) {
        throw new RedirectException("Reached maximum number of redirects");
      }
      // get the location header to find out where to redirect to
      final HttpResponse response = responseStream.getResponse();
      final Header locationHeader = response.getFirstHeader(HEADER_LOCATION);
      if (locationHeader == null) {
        throw new RedirectException("Received redirect response " + response.getStatusLine()
          + " but no location header");
      }
      final String location = locationHeader.getValue();
      final String normalizedRedirectUrl;
      try {
        final String redirectUrl = UriHelper.makeAbsolute(responseStream.getUrl(), location);
        if (redirectUrl == null) {
          throw new RedirectException("Couldn't create absolute link from baseUri " + responseStream.getUrl()
            + " and link " + location);
        }
        normalizedRedirectUrl = UriHelper.normalizeUrl(redirectUrl);
      } catch (final URISyntaxException ex) {
        throw new RedirectException("Invalid Redirect location '" + location + "'", ex);
      }
      if (!_linkFilter.allowLink(normalizedRedirectUrl, context)) {
        throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl
          + "' is not allowed by filter configuration");
      }
      // check the URLs visited in this context first; the visited links service is only asked (and the URL marked
      // there) if that check did not already find the URL.
      final boolean alreadyVisited =
        context.getVisitedUrls().contains(normalizedRedirectUrl)
          || _visitedLinks.checkAndMarkVisited(context.getSource(), normalizedRedirectUrl, context.getJobRunId(),
            context.getCurrentInputBulkId());
      if (alreadyVisited) {
        throw new RedirectException("Redirect to URL '" + normalizedRedirectUrl
          + "' is not allowed. URL was already visited");
      }
      context.getVisitedUrls().add(normalizedRedirectUrl);
      return getResource(normalizedRedirectUrl, context, redirectLevel + 1);
    } finally {
      IOUtils.closeQuietly(responseStream);
    }
  }

  /**
   * If the response was reached via a redirect, replace the URL of the record by the URL of the response. The mapped
   * attribute name is used if the record contains it, else the unmapped attribute name.
   */
  private void resetUrlAttributeOnRedirect(final Record record, final HttpResponseInputStream response,
    final PropertyNameMapper mapper) {
    if (!response.isRedirect()) {
      return;
    }
    final AnyMap metadata = record.getMetadata();
    final String mappedUrlAttribute = mapper.get(WebCrawlerConstants.ATTRIBUTE_URL);
    if (metadata.containsKey(mappedUrlAttribute)) {
      metadata.put(mappedUrlAttribute, response.getUrl());
    } else {
      metadata.put(WebCrawlerConstants.ATTRIBUTE_URL, response.getUrl());
    }
  }

  /**
   * Creates the HTTP client with a thread-safe connection manager and disabled automatic redirects. Then reads the
   * crawler configuration: if it defines a proxy host, that proxy is used, else the route planner is based on the
   * default {@link ProxySelector}. A socket timeout is only set if a positive value is configured.
   */
  private HttpClient createAndConfigureClient() {
    final ClientConnectionManager connectionManager =
      HttpClientUtil.createThreadSafeConnectionManager(DEFAULT_MAX_TOTAL_CONNECTIONS,
        DEFAULT_MAX_CONNECTIONS_PER_HOST);
    final HttpClient httpClient = new DefaultHttpClient(connectionManager);
    HttpClientParams.setRedirecting(httpClient.getParams(), false);
    final WebCrawlerConfiguration config = new WebCrawlerConfiguration();
    final HttpHost proxyHost = config.getProxyHost();
    if (proxyHost != null) {
      httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    } else {
      ((AbstractHttpClient) httpClient).setRoutePlanner(new ProxySelectorRoutePlanner(httpClient
        .getConnectionManager().getSchemeRegistry(), ProxySelector.getDefault()));
    }
    final Integer socketTimeout = config.getSocketTimeout();
    // explicit null check: unboxing a missing value would otherwise fail with a NullPointerException.
    if (socketTimeout != null && socketTimeout > 0) {
      httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, socketTimeout);
    }
    return httpClient;
  }

  /** DS service reference injection method. */
  public void setVisitedLinks(final VisitedLinksService visitedLinks) {
    _visitedLinks = visitedLinks;
  }

  /** DS service reference removal method. Only clears the reference if it is the one currently set. */
  public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
    if (_visitedLinks == visitedLinks) {
      _visitedLinks = null;
    }
  }

  /** DS service reference injection method. */
  public void setLinkFilter(final LinkFilter linkFilter) {
    _linkFilter = linkFilter;
  }

  /** DS service reference removal method. Only clears the reference if it is the one currently set. */
  public void unsetLinkFilter(final LinkFilter linkFilter) {
    if (_linkFilter == linkFilter) {
      _linkFilter = null;
    }
  }
}