[361885] prevent duplicates on redirects to links that are filtered (e.g. to another site)

commit: 45ca2b40c19b0b4ed90d288c33cf55d17d257d12 [log] [tgz]
author: jschumacher <jschumacher@89c3eef5-5052-0410-b617-e4420d9a04fc> Tue Oct 25 08:03:52 2011 +0000
committer: jschumacher <jschumacher@89c3eef5-5052-0410-b617-e4420d9a04fc> Tue Oct 25 08:03:52 2011 +0000
tree: 9705164a522b826c1241ba5b3c5baef09eae60da
parent: 62211d221f5cfb32f9b0e25043488222a5dca8ec [diff]
diff --git a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java
index 0c1c2bd..d82b084 100644
--- a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java
+++ b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java

@@ -157,34 +157,8 @@
    */
   @Override
   public boolean hasNext() {
-    while (_linksToDo.size() > 0 && _currentIndexDocument == null) {
+    while (_linksToDo.size() > 0 && _currentIndexDocument == null && !limitsExceeded()) {
       _iterationsDone++;
-      // check size limits
-      if (limitExceeded(_fetcher.getBytes(), FetcherProperties.MAX_BYTES_DOWNLOAD)) {
-        _log.info("Max bytes limit exceeded");
-        return false;
-      }
-      if (limitExceeded(_fetcher.getPages(), FetcherProperties.MAX_DOCUMENT_DOWNLOAD)) {
-        _log.info("Max pages limit exceeded");
-        return false;
-      }
-      final float elapsedTime =
-        (System.currentTimeMillis() - _startTime) / (float) Configuration.MILLIS_PER_SECOND;
-      if (limitExceeded((long) elapsedTime, CrawlProperties.MAX_TIME_SEC)) {
-        _log.info("Max time exceeded");
-        return false;
-      }
-      if (ModelType.MAX_ITERATIONS.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
-        && limitExceeded(_iterationsDone, CrawlProperties.CRAWLING_MODEL_VALUE)) {
-        _log.info("Maximum number of iterations exceeded");
-        return false;
-      }
-      if (ModelType.MAX_DEPTH.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
-        && limitExceeded(_currentDepth, CrawlProperties.CRAWLING_MODEL_VALUE)) {
-        _log.info("Maximum depth exceeded!");
-        return false;
-      }
-
       final Outlink link = _linksToDo.iterator().next();
       _linksToDo.remove(link);
       if (!_linksDone.contains(link)) {
@@ -269,55 +243,49 @@
     // Check if fetching and parsing successfully finished
     if (fetcherOutput.getParse() != null) {
       if (crawlMode.equals(CrawlMode.Index)) {
-        // XXX: Temporary workaround that is needed to avoid indexing of non-text content.
-        // if (fetcherOutput.getContent().getContentType().toLowerCase().contains("text")) {
-        // run html metatags filters
         crawlMode =
           _filterProcessor.evaluateHtmlMetaTagFilters(fetcherOutput.getParse().getData().getHtmlMetaTags());
         // if we still want to index let's do it now
         if (crawlMode.equals(CrawlMode.Index)) {
-          final String url = fetcherOutput.getContent().getUrl();
-          final String title = fetcherOutput.getParse().getData().getTitle();
-          // String content = fetcherOutput.getParse().getText();
-          final byte[] content = fetcherOutput.getContent().getContent();
-
-          final List<String> responseHeaders = fetcherOutput.getParse().getData().getContentMeta().toArrayList();
-          final List<String> htmlMetaData = fetcherOutput.getParse().getData().getHtmlMetaTags().toArrayList();
-
-          final List<String> metaDataWithResponseHeaderFallBack = new ArrayList<String>();
-          metaDataWithResponseHeaderFallBack.addAll(responseHeaders);
-          metaDataWithResponseHeaderFallBack.addAll(htmlMetaData);
-
-          document =
-            new IndexDocument(url, title, content, responseHeaders, htmlMetaData,
-              metaDataWithResponseHeaderFallBack);
+          document = createDocument(fetcherOutput);
         }
-        // }
       }
       if (!crawlMode.equals(CrawlMode.Skip)) {
-        // update links to do (for further indexing)
-        final Outlink[] outlinks = fetcherOutput.getParse().getData().getOutlinks();
-        if (outlinks != null && outlinks.length > 0) {
-          for (final Outlink link : outlinks) {
-            // links from the page are added to the next level
-            _linksToDoNextLevel.add(link);
-            _log.debug("added new link to do:" + link.toString());
-          }
-        }
-        final Outlink[] sitemapOutlinks = fetcherOutput.getSitemapLinks();
-        if (sitemapOutlinks != null && sitemapOutlinks.length > 0) {
-          for (final Outlink link : sitemapOutlinks) {
-            // links from sitemap file are added to the same level
-            _linksToDo.add(link);
-            _log.debug("added new link from sitemap file:" + link.toString());
-          }
-        }
-
+        updateTodoLinks(fetcherOutput);
       }
     }
     return document;
   }
 
+  /** check if any of the configured size, count or time limits is exceeded. */
+  private boolean limitsExceeded() {
+    // check size limits
+    if (limitExceeded(_fetcher.getBytes(), FetcherProperties.MAX_BYTES_DOWNLOAD)) {
+      _log.info("Max bytes limit exceeded");
+      return true;
+    }
+    if (limitExceeded(_fetcher.getPages(), FetcherProperties.MAX_DOCUMENT_DOWNLOAD)) {
+      _log.info("Max pages limit exceeded");
+      return true;
+    }
+    final float elapsedTime = (System.currentTimeMillis() - _startTime) / (float) Configuration.MILLIS_PER_SECOND;
+    if (limitExceeded((long) elapsedTime, CrawlProperties.MAX_TIME_SEC)) {
+      _log.info("Max time exceeded");
+      return true;
+    }
+    if (ModelType.MAX_ITERATIONS.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
+      && limitExceeded(_iterationsDone, CrawlProperties.CRAWLING_MODEL_VALUE)) {
+      _log.info("Maximum number of iterations exceeded");
+      return true;
+    }
+    if (ModelType.MAX_DEPTH.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
+      && limitExceeded(_currentDepth, CrawlProperties.CRAWLING_MODEL_VALUE)) {
+      _log.info("Maximum depth exceeded!");
+      return true;
+    }
+    return false;
+  }
+
   /**
    * Limit exceeded.
    * 
@@ -335,6 +303,50 @@
     return false;
   }
 
+  /** add outgoing links from fetched page to todo lists. */
+  private void updateTodoLinks(final FetcherOutput fetcherOutput) {
+    final Outlink[] outlinks = fetcherOutput.getParse().getData().getOutlinks();
+    if (outlinks != null && outlinks.length > 0) {
+      for (final Outlink link : outlinks) {
+        // links from the page are added to the next level
+        _linksToDoNextLevel.add(link);
+        if (_log.isDebugEnabled()) {
+          _log.debug("added new link to do:" + link.toString());
+        }
+      }
+    }
+    final Outlink[] sitemapOutlinks = fetcherOutput.getSitemapLinks();
+    if (sitemapOutlinks != null && sitemapOutlinks.length > 0) {
+      for (final Outlink link : sitemapOutlinks) {
+        // links from sitemap file are added to the same level
+        _linksToDo.add(link);
+        if (_log.isDebugEnabled()) {
+          _log.debug("added new link from sitemap file:" + link.toString());
+        }
+      }
+    }
+  }
+
+  /** convert fetcher output to IndexDocument. */
+  private IndexDocument createDocument(final FetcherOutput fetcherOutput) {
+    IndexDocument document;
+    final String url = fetcherOutput.getContent().getUrl();
+    final String title = fetcherOutput.getParse().getData().getTitle();
+    // String content = fetcherOutput.getParse().getText();
+    final byte[] content = fetcherOutput.getContent().getContent();
+
+    final List<String> responseHeaders = fetcherOutput.getParse().getData().getContentMeta().toArrayList();
+    final List<String> htmlMetaData = fetcherOutput.getParse().getData().getHtmlMetaTags().toArrayList();
+
+    final List<String> metaDataWithResponseHeaderFallBack = new ArrayList<String>();
+    metaDataWithResponseHeaderFallBack.addAll(responseHeaders);
+    metaDataWithResponseHeaderFallBack.addAll(htmlMetaData);
+
+    document =
+      new IndexDocument(url, title, content, responseHeaders, htmlMetaData, metaDataWithResponseHeaderFallBack);
+    return document;
+  }
+
   /**
    * Empty implementation of the Iterator method.
    */

diff --git a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java
index af4ad89..1bb448f 100644
--- a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java
+++ b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java

@@ -160,7 +160,7 @@
   }
 
   /**
-   * Fetches and parses the link.
+   * Fetches the page and parses the link.
    * 
    * @param link
    *          link to fetch
@@ -173,7 +173,6 @@
    */
   public FetcherOutput fetch(Outlink link, final FilterProcessor filterProcessor, final Set<Outlink> linksDone) {
     try {
-      // fetch the page
       boolean redirecting;
       boolean retrying;
       int redirectCount = 0;
@@ -216,7 +215,9 @@
             updateStatus(content.getContent().length);
             if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
               link = getRedirectLink(link, pstatus.getMessage(), filterProcessor, linksDone);
-              if (link != null) {
+              if (link == null) {
+                output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
+              } else {
                 redirecting = true;
                 redirectCount++;
               }
@@ -226,7 +227,9 @@
           case HttpStatus.MOVED: // redirect
           case HttpStatus.TEMP_MOVED:
             link = getRedirectLink(link, status.getMessage(), filterProcessor, linksDone);
-            if (link != null) {
+            if (link == null) {
+              output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
+            } else {
               redirecting = true;
               redirectCount++;
             }
@@ -261,23 +264,19 @@
             }
             output(url, null, HttpStatus.GONE, sitemapLinks);
         }
-
         if (redirecting && redirectCount >= _maxRedirect) {
           if (_log.isInfoEnabled()) {
             _log.info(" - redirect count exceeded " + url);
           }
           output(url, null, HttpStatus.GONE, sitemapLinks);
         }
-
         if (retrying && retriesCount >= _maxRetries) {
           if (_log.isInfoEnabled()) {
             _log.info(" - retries count exceeded " + url);
           }
           output(url, null, HttpStatus.GONE, sitemapLinks);
         }
-
         continueFetching = redirecting && redirectCount < _maxRedirect || retrying && retriesCount < _maxRetries;
-
       } while (continueFetching);
       _performanceCounters.incrementBy(WebCrawler.POC_AVEREGE_TIME_TO_FETCH, (System.currentTimeMillis() - start)
         / MILLISECS_IN_SEC);
commit	45ca2b40c19b0b4ed90d288c33cf55d17d257d12	[log] [tgz]
author	jschumacher <jschumacher@89c3eef5-5052-0410-b617-e4420d9a04fc>	Tue Oct 25 08:03:52 2011 +0000
committer	jschumacher <jschumacher@89c3eef5-5052-0410-b617-e4420d9a04fc>	Tue Oct 25 08:03:52 2011 +0000
tree	9705164a522b826c1241ba5b3c5baef09eae60da
parent	62211d221f5cfb32f9b0e25043488222a5dca8ec [diff]