/***********************************************************************************************************************
* Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying
* materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution,
* and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Dmitry Hazin (brox IT Solutions GmbH) - initial creator; Sebastian Voigt (brox IT Solutions GmbH)
**********************************************************************************************************************/
package org.eclipse.smila.connectivity.framework.crawler.web;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.CrawlerCriticalException;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.CrawlProperties;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.FetcherProperties;
import org.eclipse.smila.connectivity.framework.crawler.web.crawl.CrawlMode;
import org.eclipse.smila.connectivity.framework.crawler.web.fetcher.Fetcher;
import org.eclipse.smila.connectivity.framework.crawler.web.fetcher.FetcherOutput;
import org.eclipse.smila.connectivity.framework.crawler.web.filter.FilterProcessor;
import org.eclipse.smila.connectivity.framework.crawler.web.filter.impl.FilterProcessorImpl;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.ModelType;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.WebSite;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Outlink;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParserManager;
import org.eclipse.smila.connectivity.framework.performancecounters.CrawlerPerformanceCounterHelper;
/**
* Iterates over the pages of a single web site and produces one {@link IndexDocument} for each page that is
* fetched, parsed and accepted by the configured filters.
*
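* <p>
* A minimal usage sketch (assumes a configured {@link WebSite}, a {@link ParserManager} and a performance counter
* helper are already available; the constructor may throw a {@link CrawlerCriticalException}):
* </p>
*
* <pre>
* final WebSiteIterator iterator = new WebSiteIterator(webSite, parserManager, performanceCounters);
* while (iterator.hasNext()) {
*   final IndexDocument document = iterator.next();
*   // hand the document over for indexing
* }
* </pre>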
*/
public class WebSiteIterator implements Iterator<IndexDocument> {
/**
* The Log.
*/
private final Log _log = LogFactory.getLog(WebSiteIterator.class);
/**
* Set of links that have already been crawled. A set is used to avoid duplicate entries.
*/
private final Set<Outlink> _linksDone = new HashSet<Outlink>();
/**
* Set of links that are queued for crawling at the current depth level. A set is used to avoid duplicate entries.
*/
private Set<Outlink> _linksToDo = new HashSet<Outlink>();
/**
* Set of links that are queued for crawling at the next depth level.
*/
private Set<Outlink> _linksToDoNextLevel = new HashSet<Outlink>();
/**
* The number of iterations done so far.
*/
private int _iterationsDone;
/**
* The current depth.
*/
private int _currentDepth;
/**
* The configuration.
*/
private Configuration _configuration;
/**
* The fetcher.
*/
private Fetcher _fetcher;
/**
* The wait time between page retrievals, in seconds.
*/
private int _wait;
/**
* Whether the wait time between page retrievals is randomized.
*/
private boolean _randomWait;
/**
* The filter processor.
*/
private FilterProcessor _filterProcessor;
/**
* The start time.
*/
private long _startTime;
/**
* Currently selected document in this iterator.
*/
private IndexDocument _currentIndexDocument;
/**
* The performance counters.
*/
@SuppressWarnings("unused")
private final CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> _performanceCounters;
/**
* Initialize crawling.
*
* @param webSite
* the web site crawling configuration
* @param parserManager
* the web crawler parser manager
* @param performanceCounters
* the performance counters
* @throws CrawlerCriticalException
* the crawler critical exception
*/
public WebSiteIterator(final WebSite webSite, final ParserManager parserManager,
final CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> performanceCounters)
throws CrawlerCriticalException {
try {
_performanceCounters = performanceCounters;
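// load the crawl, fetcher and filter settings from the given web site configuration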
_configuration = new Configuration();
_configuration.loadConfiguration(webSite);
_fetcher = new Fetcher(_configuration, parserManager, performanceCounters);
_wait = _configuration.getInt(CrawlProperties.WAIT);
_randomWait = _configuration.getBoolean(CrawlProperties.RANDOM_WAIT);
if (_log.isDebugEnabled()) {
_log.debug("Starting new project: " + _configuration.get(CrawlProperties.PROJECT_NAME));
}
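// the configured seed URLs form the first crawl level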
_linksToDo = _configuration.getSeeds();
_filterProcessor = new FilterProcessorImpl(_configuration);
_startTime = System.currentTimeMillis();
} catch (final IllegalAccessException exception) {
throw new CrawlerCriticalException("Error loading configuration", exception);
} catch (final InvocationTargetException exception) {
throw new CrawlerCriticalException("Error loading configuration", exception);
} catch (final IOException exception) {
throw new CrawlerCriticalException("Error loading configuration", exception);
}
}
/**
* Checks if this iterator can provide a further document for indexing.
*
* @return true if a further document is available, false otherwise
*/
@Override
public boolean hasNext() {
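// work through the links of the current level until a document has been produced, no links are left,
// or one of the configured limits is exceeded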
while (_linksToDo.size() > 0 && _currentIndexDocument == null && !limitsExceeded()) {
_iterationsDone++;
final Outlink link = _linksToDo.iterator().next();
_linksToDo.remove(link);
if (!_linksDone.contains(link)) {
_linksDone.add(link);
// check whether the URL matches the crawl scope and all configured filters
final CrawlMode crawlMode = _filterProcessor.evaluateUrlFilters(link);
if (!crawlMode.equals(CrawlMode.Skip)) {
try {
if (_log.isDebugEnabled()) {
_log.debug("Link = " + link.getUrlString() + " crawled");
}
_currentIndexDocument = indexDocs(link, _configuration, crawlMode);
} catch (final InterruptedException exception) {
_log.error("Error fetching link " + link.getUrlString());
}
} else {
if (_log.isDebugEnabled()) {
_log.debug("Link = " + link.getUrlString() + " not included (cause: SKIP, Filter)");
}
}
} else {
if (_log.isDebugEnabled()) {
_log.debug("Link = " + link.getUrlString() + " already crawled");
}
}
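// the current level is exhausted: continue with the links collected for the next level and increase the depth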
if (_linksToDo.size() == 0 && _linksToDoNextLevel.size() > 0) {
_log.debug("Number of next level links: " + _linksToDoNextLevel.size());
_linksToDo = _linksToDoNextLevel;
_linksToDoNextLevel = new HashSet<Outlink>();
_currentDepth++;
_log.debug("Current depth is: " + _currentDepth);
}
}
return _currentIndexDocument != null;
}
/**
* Gets the next index document.
*
* @return the next IndexDocument, or null if no further document is available
*/
@Override
public IndexDocument next() {
if (_currentIndexDocument == null) {
hasNext();
}
final IndexDocument result = _currentIndexDocument;
_currentIndexDocument = null;
return result;
}
/**
* Downloads the page and creates an index document.
*
* @param outlink
* Link to be fetched.
* @param conf
* Crawler configuration
* @param crawlMode
* One of Skip, Index or AnalyzeOnly
*
* @return IndexDocument
*
* @throws InterruptedException
* if the wait before retrieval is interrupted
*/
private IndexDocument indexDocs(final Outlink outlink, final Configuration conf, CrawlMode crawlMode)
throws InterruptedException {
IndexDocument document = null;
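// politeness delay in seconds: either the configured fixed wait or a random wait between 0 and twice that value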
int delay = 0;
if (_randomWait) {
delay = (int) (Math.random() * _wait * 2);
} else if (_wait > 0) {
delay = _wait;
}
_log.debug("Wait before next retrieval, seconds: " + delay);
Thread.sleep(delay * Configuration.MILLIS_PER_SECOND);
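// fetch and parse the page; the filter processor and the already crawled links are handed to the fetcher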
final FetcherOutput fetcherOutput = _fetcher.fetch(outlink, _filterProcessor, _linksDone);
// Check if fetching and parsing successfully finished
if (fetcherOutput.getParse() != null) {
if (crawlMode.equals(CrawlMode.Index)) {
crawlMode =
_filterProcessor.evaluateHtmlMetaTagFilters(fetcherOutput.getParse().getData().getHtmlMetaTags());
// if we still want to index let's do it now
if (crawlMode.equals(CrawlMode.Index)) {
document = createDocument(fetcherOutput);
}
}
if (!crawlMode.equals(CrawlMode.Skip)) {
updateTodoLinks(fetcherOutput);
}
}
return document;
}
/** Checks whether any of the configured size, count or time limits is exceeded. */
private boolean limitsExceeded() {
// check size limits
if (limitExceeded(_fetcher.getBytes(), FetcherProperties.MAX_BYTES_DOWNLOAD)) {
_log.info("Max bytes limit exceeded");
return true;
}
if (limitExceeded(_fetcher.getPages(), FetcherProperties.MAX_DOCUMENT_DOWNLOAD)) {
_log.info("Max pages limit exceeded");
return true;
}
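// check the elapsed crawl time against the configured maximum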
final float elapsedTime = (System.currentTimeMillis() - _startTime) / (float) Configuration.MILLIS_PER_SECOND;
if (limitExceeded((long) elapsedTime, CrawlProperties.MAX_TIME_SEC)) {
_log.info("Max time exceeded");
return true;
}
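// model-dependent limits: number of iterations for the MAX_ITERATIONS model, crawl depth for the MAX_DEPTH model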
if (ModelType.MAX_ITERATIONS.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
&& limitExceeded(_iterationsDone, CrawlProperties.CRAWLING_MODEL_VALUE)) {
_log.info("Maximum number of iterations exceeded");
return true;
}
if (ModelType.MAX_DEPTH.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
&& limitExceeded(_currentDepth, CrawlProperties.CRAWLING_MODEL_VALUE)) {
_log.info("Maximum depth exceeded!");
return true;
}
return false;
}
/**
* Checks whether the given value exceeds the limit configured for the given property.
*
* @param test
* the value to test
* @param propertyName
* the name of the configuration property holding the limit
*
* @return true if the limit is configured (greater than zero) and exceeded
*/
private boolean limitExceeded(final long test, final String propertyName) {
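// a configured limit of 0 or less means the limit is disabled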
final int limit = _configuration.getInt(propertyName);
return limit > 0 && test >= limit;
}
/** add outgoing links from fetched page to todo lists. */
private void updateTodoLinks(final FetcherOutput fetcherOutput) {
final Outlink[] outlinks = fetcherOutput.getParse().getData().getOutlinks();
if (outlinks != null && outlinks.length > 0) {
for (final Outlink link : outlinks) {
// links from the page are added to the next level
_linksToDoNextLevel.add(link);
if (_log.isDebugEnabled()) {
_log.debug("added new link to do:" + link.toString());
}
}
}
final Outlink[] sitemapOutlinks = fetcherOutput.getSitemapLinks();
if (sitemapOutlinks != null && sitemapOutlinks.length > 0) {
for (final Outlink link : sitemapOutlinks) {
// links from sitemap file are added to the same level
_linksToDo.add(link);
if (_log.isDebugEnabled()) {
_log.debug("added new link from sitemap file:" + link.toString());
}
}
}
}
/** convert fetcher output to IndexDocument. */
private IndexDocument createDocument(final FetcherOutput fetcherOutput) {
IndexDocument document;
final String url = fetcherOutput.getContent().getUrl();
final String title = fetcherOutput.getParse().getData().getTitle();
// String content = fetcherOutput.getParse().getText();
final byte[] content = fetcherOutput.getContent().getContent();
final List<String> responseHeaders = fetcherOutput.getParse().getData().getContentMeta().toArrayList();
final List<String> htmlMetaData = fetcherOutput.getParse().getData().getHtmlMetaTags().toArrayList();
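// combined meta data: the HTTP response headers followed by the HTML meta tags, used as meta data with
// response header fallback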
final List<String> metaDataWithResponseHeaderFallBack = new ArrayList<String>();
metaDataWithResponseHeaderFallBack.addAll(responseHeaders);
metaDataWithResponseHeaderFallBack.addAll(htmlMetaData);
document =
new IndexDocument(url, title, content, responseHeaders, htmlMetaData, metaDataWithResponseHeaderFallBack);
return document;
}
/**
* Empty implementation of {@link Iterator#remove()}. Removing documents is not supported.
*/
@Override
public void remove() {
// nothing to do - removing documents is not supported
}
}