| /*********************************************************************************************************************** |
| * Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying |
| * materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution, |
| * and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
 * Contributors: Dmitry Hazin (brox IT Solutions GmbH) - initial creator, Sebastian Voigt (brox IT Solutions GmbH)
| **********************************************************************************************************************/ |
| package org.eclipse.smila.connectivity.framework.crawler.web; |
| |
| import java.io.IOException; |
| import java.lang.reflect.InvocationTargetException; |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.eclipse.smila.connectivity.framework.CrawlerCriticalException; |
| import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration; |
| import org.eclipse.smila.connectivity.framework.crawler.web.configuration.CrawlProperties; |
| import org.eclipse.smila.connectivity.framework.crawler.web.configuration.FetcherProperties; |
| import org.eclipse.smila.connectivity.framework.crawler.web.crawl.CrawlMode; |
| import org.eclipse.smila.connectivity.framework.crawler.web.fetcher.Fetcher; |
| import org.eclipse.smila.connectivity.framework.crawler.web.fetcher.FetcherOutput; |
| import org.eclipse.smila.connectivity.framework.crawler.web.filter.FilterProcessor; |
| import org.eclipse.smila.connectivity.framework.crawler.web.filter.impl.FilterProcessorImpl; |
| import org.eclipse.smila.connectivity.framework.crawler.web.messages.ModelType; |
| import org.eclipse.smila.connectivity.framework.crawler.web.messages.WebSite; |
| import org.eclipse.smila.connectivity.framework.crawler.web.parse.Outlink; |
| import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParserManager; |
| import org.eclipse.smila.connectivity.framework.performancecounters.CrawlerPerformanceCounterHelper; |
| |
| /** |
 * Iterates over the pages of a web site and produces an {@link IndexDocument} for each fetched page that passes
 * the configured filters. The crawl proceeds breadth-first: outgoing links of fetched pages are queued for the
 * next depth level until the configured limits (bytes, pages, time, iterations or depth) are exceeded.
| * |
| */ |
| public class WebSiteIterator implements Iterator<IndexDocument> { |
| |
| /** |
| * The Log. |
| */ |
| private final Log _log = LogFactory.getLog(WebSiteIterator.class); |
| |
| /** |
| * Set of links which are already "crawled". A set is used to avoid double entries. |
| */ |
| private final Set<Outlink> _linksDone = new HashSet<Outlink>(); |
| |
| /** |
| * Set of links which are queued for "crawling". A set is used to avoid double entries. |
| */ |
| private Set<Outlink> _linksToDo = new HashSet<Outlink>(); |
| |
| /** |
   * Set of links collected for the next depth level. A set is used to avoid double entries.
| */ |
| private Set<Outlink> _linksToDoNextLevel = new HashSet<Outlink>(); |
| |
| /** |
   * Number of crawl iterations performed so far.
| */ |
| private int _iterationsDone; |
| |
| /** |
| * The current depth. |
| */ |
| private int _currentDepth; |
| |
| /** |
| * The configuration. |
| */ |
| private Configuration _configuration; |
| |
| /** |
| * The fetcher. |
| */ |
| private Fetcher _fetcher; |
| |
| /** |
   * The delay between two page retrievals, in seconds.
| */ |
| private int _wait; |
| |
| /** |
   * Whether to randomize the delay between retrievals (a random value between 0 and twice the configured wait).
| */ |
| private boolean _randomWait; |
| |
| /** |
| * The filter processor. |
| */ |
| private FilterProcessor _filterProcessor; |
| |
| /** |
   * The crawl start time in milliseconds, used to check the elapsed-time limit.
| */ |
| private long _startTime; |
| |
| /** |
| * Currently selected document in this iterator. |
| */ |
| private IndexDocument _currentIndexDocument; |
| |
| /** |
   * The performance counters.
| */ |
| @SuppressWarnings("unused") |
| private final CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> _performanceCounters; |
| |
| /** |
| * Initialize crawling. |
| * |
| * @param webSite |
| * web site crawling configuration |
| * @param performanceCounters |
| * the performance counters |
| * @param parserManager |
| * webcrawler parsers manager |
| * @throws CrawlerCriticalException |
| * the crawler critical exception |
| */ |
| public WebSiteIterator(final WebSite webSite, final ParserManager parserManager, |
| final CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> performanceCounters) |
| throws CrawlerCriticalException { |
| try { |
| _performanceCounters = performanceCounters; |
| _configuration = new Configuration(); |
| _configuration.loadConfiguration(webSite); |
| |
| _fetcher = new Fetcher(_configuration, parserManager, performanceCounters); |
| _wait = _configuration.getInt(CrawlProperties.WAIT); |
| _randomWait = _configuration.getBoolean(CrawlProperties.RANDOM_WAIT); |
| |
| if (_log.isDebugEnabled()) { |
| _log.debug("Starting new project: " + _configuration.get(CrawlProperties.PROJECT_NAME)); |
| } |
| |
| _linksToDo = _configuration.getSeeds(); |
| _filterProcessor = new FilterProcessorImpl(_configuration); |
| _startTime = System.currentTimeMillis(); |
| } catch (final IllegalAccessException exception) { |
| throw new CrawlerCriticalException("Error loading configuration", exception); |
| } catch (final InvocationTargetException exception) { |
| throw new CrawlerCriticalException("Error loading configuration", exception); |
| } catch (final IOException exception) { |
| throw new CrawlerCriticalException("Error loading configuration", exception); |
| } |
| |
| } |
| |
| /** |
| * Checks if this iterator has a next document for indexing. |
| * |
| * @return boolean |
| */ |
| @Override |
| public boolean hasNext() { |
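    // crawl links of the current level until a document that passes all filters has been fetched, the queues are
    // exhausted, or one of the configured limits is exceeded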
    while (!_linksToDo.isEmpty() && _currentIndexDocument == null && !limitsExceeded()) {
| _iterationsDone++; |
| final Outlink link = _linksToDo.iterator().next(); |
| _linksToDo.remove(link); |
      if (_linksDone.add(link)) {
        // check whether the URL matches the crawl scope and all filters
| final CrawlMode crawlMode = _filterProcessor.evaluateUrlFilters(link); |
| if (!crawlMode.equals(CrawlMode.Skip)) { |
| try { |
| if (_log.isDebugEnabled()) { |
| _log.debug("Link = " + link.getUrlString() + " crawled"); |
| } |
| _currentIndexDocument = indexDocs(link, _configuration, crawlMode); |
          } catch (final InterruptedException exception) {
            _log.error("Error fetching link " + link.getUrlString(), exception);
          }
| } else { |
| if (_log.isDebugEnabled()) { |
| _log.debug("Link = " + link.getUrlString() + " not included (cause: SKIP, Filter)"); |
| } |
| } |
| } else { |
| if (_log.isDebugEnabled()) { |
| _log.debug("Link = " + link.getUrlString() + " already crawled"); |
| } |
| } |
| |
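      // the current level is exhausted: continue with the links collected for the next depth level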
      if (_linksToDo.isEmpty() && !_linksToDoNextLevel.isEmpty()) {
        if (_log.isDebugEnabled()) {
          _log.debug("Number of next level links: " + _linksToDoNextLevel.size());
        }
        _linksToDo = _linksToDoNextLevel;
        _linksToDoNextLevel = new HashSet<Outlink>();
        _currentDepth++;
        if (_log.isDebugEnabled()) {
          _log.debug("Current depth is: " + _currentDepth);
        }
      }
| |
| } |
| |
| return _currentIndexDocument != null; |
| } |
| |
| /** |
| * Gets the next index document. |
| * |
| * @return IndexDocument |
| */ |
| @Override |
| public IndexDocument next() { |
| if (_currentIndexDocument == null) { |
| hasNext(); |
| } |
| final IndexDocument result = _currentIndexDocument; |
| _currentIndexDocument = null; |
| return result; |
| } |
| |
| /** |
| * Downloads the page and creates index document. |
| * |
| * @param outlink |
| * Link to be fetched. |
| * @param conf |
| * Crawler configuration |
| * @param crawlMode |
| * One of Skip, Index or AnalyzeOnly |
| * |
| * @return IndexDocument |
| * |
   * @throws InterruptedException
   *           if the delay before fetching the page is interrupted
| */ |
| private IndexDocument indexDocs(final Outlink outlink, final Configuration conf, CrawlMode crawlMode) |
| throws InterruptedException { |
| IndexDocument document = null; |
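    // politeness delay: either the fixed configured wait time or, if random wait is enabled, a random delay
    // between 0 and twice the configured wait time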
| int delay = 0; |
| if (_randomWait) { |
| delay = (int) (Math.random() * _wait * 2); |
| } else if (_wait > 0) { |
| delay = _wait; |
| } |
| _log.debug("Wait before next retrieval, seconds: " + delay); |
| Thread.sleep(delay * Configuration.MILLIS_PER_SECOND); |
| final FetcherOutput fetcherOutput = _fetcher.fetch(outlink, _filterProcessor, _linksDone); |
| // Check if fetching and parsing successfully finished |
| if (fetcherOutput.getParse() != null) { |
| if (crawlMode.equals(CrawlMode.Index)) { |
| crawlMode = |
| _filterProcessor.evaluateHtmlMetaTagFilters(fetcherOutput.getParse().getData().getHtmlMetaTags()); |
| // if we still want to index let's do it now |
| if (crawlMode.equals(CrawlMode.Index)) { |
| document = createDocument(fetcherOutput); |
| } |
| } |
| if (!crawlMode.equals(CrawlMode.Skip)) { |
| updateTodoLinks(fetcherOutput); |
| } |
| } |
| return document; |
| } |
| |
  /** Checks if any of the configured size, count, time, iteration or depth limits is exceeded. */
| private boolean limitsExceeded() { |
| // check size limits |
| if (limitExceeded(_fetcher.getBytes(), FetcherProperties.MAX_BYTES_DOWNLOAD)) { |
| _log.info("Max bytes limit exceeded"); |
| return true; |
| } |
| if (limitExceeded(_fetcher.getPages(), FetcherProperties.MAX_DOCUMENT_DOWNLOAD)) { |
| _log.info("Max pages limit exceeded"); |
| return true; |
| } |
| final float elapsedTime = (System.currentTimeMillis() - _startTime) / (float) Configuration.MILLIS_PER_SECOND; |
| if (limitExceeded((long) elapsedTime, CrawlProperties.MAX_TIME_SEC)) { |
| _log.info("Max time exceeded"); |
| return true; |
| } |
| if (ModelType.MAX_ITERATIONS.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE)) |
| && limitExceeded(_iterationsDone, CrawlProperties.CRAWLING_MODEL_VALUE)) { |
| _log.info("Maximum number of iterations exceeded"); |
| return true; |
| } |
| if (ModelType.MAX_DEPTH.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE)) |
| && limitExceeded(_currentDepth, CrawlProperties.CRAWLING_MODEL_VALUE)) { |
| _log.info("Maximum depth exceeded!"); |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Limit exceeded. |
| * |
| * @param test |
| * the test |
| * @param propertyName |
| * the property name |
| * |
| * @return true, if successful |
| */ |
| private boolean limitExceeded(final long test, final String propertyName) { |
    final int limit = _configuration.getInt(propertyName);
    return limit > 0 && test >= limit;
| } |
| |
  /** Adds the outgoing links of a fetched page to the to-do lists. */
| private void updateTodoLinks(final FetcherOutput fetcherOutput) { |
| final Outlink[] outlinks = fetcherOutput.getParse().getData().getOutlinks(); |
| if (outlinks != null && outlinks.length > 0) { |
| for (final Outlink link : outlinks) { |
| // links from the page are added to the next level |
| _linksToDoNextLevel.add(link); |
| if (_log.isDebugEnabled()) { |
| _log.debug("added new link to do:" + link.toString()); |
| } |
| } |
| } |
| final Outlink[] sitemapOutlinks = fetcherOutput.getSitemapLinks(); |
| if (sitemapOutlinks != null && sitemapOutlinks.length > 0) { |
| for (final Outlink link : sitemapOutlinks) { |
| // links from sitemap file are added to the same level |
| _linksToDo.add(link); |
| if (_log.isDebugEnabled()) { |
| _log.debug("added new link from sitemap file:" + link.toString()); |
| } |
| } |
| } |
| } |
| |
| /** convert fetcher output to IndexDocument. */ |
  private IndexDocument createDocument(final FetcherOutput fetcherOutput) {
    final String url = fetcherOutput.getContent().getUrl();
    final String title = fetcherOutput.getParse().getData().getTitle();
    final byte[] content = fetcherOutput.getContent().getContent();
| |
| final List<String> responseHeaders = fetcherOutput.getParse().getData().getContentMeta().toArrayList(); |
| final List<String> htmlMetaData = fetcherOutput.getParse().getData().getHtmlMetaTags().toArrayList(); |
| |
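    // combine HTTP response headers and HTML meta tags into a single metadata list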
| final List<String> metaDataWithResponseHeaderFallBack = new ArrayList<String>(); |
| metaDataWithResponseHeaderFallBack.addAll(responseHeaders); |
| metaDataWithResponseHeaderFallBack.addAll(htmlMetaData); |
| |
    return new IndexDocument(url, title, content, responseHeaders, htmlMetaData,
      metaDataWithResponseHeaderFallBack);
| } |
| |
| /** |
| * Empty implementation of the Iterator method. |
| */ |
| @Override |
| public void remove() { |
| ; |
| } |
| |
| } |