[361885] prevent duplicates on redirects to links that are filtered (e.g. to another site)
diff --git a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java
index 0c1c2bd..d82b084 100644
--- a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java
+++ b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/WebSiteIterator.java
@@ -157,34 +157,8 @@
*/
@Override
public boolean hasNext() {
- while (_linksToDo.size() > 0 && _currentIndexDocument == null) {
+ while (_linksToDo.size() > 0 && _currentIndexDocument == null && !limitsExceeded()) {
_iterationsDone++;
- // check size limits
- if (limitExceeded(_fetcher.getBytes(), FetcherProperties.MAX_BYTES_DOWNLOAD)) {
- _log.info("Max bytes limit exceeded");
- return false;
- }
- if (limitExceeded(_fetcher.getPages(), FetcherProperties.MAX_DOCUMENT_DOWNLOAD)) {
- _log.info("Max pages limit exceeded");
- return false;
- }
- final float elapsedTime =
- (System.currentTimeMillis() - _startTime) / (float) Configuration.MILLIS_PER_SECOND;
- if (limitExceeded((long) elapsedTime, CrawlProperties.MAX_TIME_SEC)) {
- _log.info("Max time exceeded");
- return false;
- }
- if (ModelType.MAX_ITERATIONS.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
- && limitExceeded(_iterationsDone, CrawlProperties.CRAWLING_MODEL_VALUE)) {
- _log.info("Maximum number of iterations exceeded");
- return false;
- }
- if (ModelType.MAX_DEPTH.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
- && limitExceeded(_currentDepth, CrawlProperties.CRAWLING_MODEL_VALUE)) {
- _log.info("Maximum depth exceeded!");
- return false;
- }
-
final Outlink link = _linksToDo.iterator().next();
_linksToDo.remove(link);
if (!_linksDone.contains(link)) {
@@ -269,55 +243,49 @@
// Check if fetching and parsing successfully finished
if (fetcherOutput.getParse() != null) {
if (crawlMode.equals(CrawlMode.Index)) {
- // XXX: Temporary workaround that is needed to avoid indexing of non-text content.
- // if (fetcherOutput.getContent().getContentType().toLowerCase().contains("text")) {
- // run html metatags filters
crawlMode =
_filterProcessor.evaluateHtmlMetaTagFilters(fetcherOutput.getParse().getData().getHtmlMetaTags());
// if we still want to index let's do it now
if (crawlMode.equals(CrawlMode.Index)) {
- final String url = fetcherOutput.getContent().getUrl();
- final String title = fetcherOutput.getParse().getData().getTitle();
- // String content = fetcherOutput.getParse().getText();
- final byte[] content = fetcherOutput.getContent().getContent();
-
- final List<String> responseHeaders = fetcherOutput.getParse().getData().getContentMeta().toArrayList();
- final List<String> htmlMetaData = fetcherOutput.getParse().getData().getHtmlMetaTags().toArrayList();
-
- final List<String> metaDataWithResponseHeaderFallBack = new ArrayList<String>();
- metaDataWithResponseHeaderFallBack.addAll(responseHeaders);
- metaDataWithResponseHeaderFallBack.addAll(htmlMetaData);
-
- document =
- new IndexDocument(url, title, content, responseHeaders, htmlMetaData,
- metaDataWithResponseHeaderFallBack);
+ document = createDocument(fetcherOutput);
}
- // }
}
if (!crawlMode.equals(CrawlMode.Skip)) {
- // update links to do (for further indexing)
- final Outlink[] outlinks = fetcherOutput.getParse().getData().getOutlinks();
- if (outlinks != null && outlinks.length > 0) {
- for (final Outlink link : outlinks) {
- // links from the page are added to the next level
- _linksToDoNextLevel.add(link);
- _log.debug("added new link to do:" + link.toString());
- }
- }
- final Outlink[] sitemapOutlinks = fetcherOutput.getSitemapLinks();
- if (sitemapOutlinks != null && sitemapOutlinks.length > 0) {
- for (final Outlink link : sitemapOutlinks) {
- // links from sitemap file are added to the same level
- _linksToDo.add(link);
- _log.debug("added new link from sitemap file:" + link.toString());
- }
- }
-
+ updateTodoLinks(fetcherOutput);
}
}
return document;
}
+ /** Checks the crawl limits in order (bytes, pages, time, iterations, depth), logs the first one exceeded; true if any is. */
+ private boolean limitsExceeded() {
+ // download volume limits: byte count and page count as reported by the fetcher
+ if (limitExceeded(_fetcher.getBytes(), FetcherProperties.MAX_BYTES_DOWNLOAD)) {
+ _log.info("Max bytes limit exceeded");
+ return true;
+ }
+ if (limitExceeded(_fetcher.getPages(), FetcherProperties.MAX_DOCUMENT_DOWNLOAD)) {
+ _log.info("Max pages limit exceeded");
+ return true;
+ }
+ final float elapsedTime = (System.currentTimeMillis() - _startTime) / (float) Configuration.MILLIS_PER_SECOND;
+ if (limitExceeded((long) elapsedTime, CrawlProperties.MAX_TIME_SEC)) { // the (long) cast drops the fractional part
+ _log.info("Max time exceeded");
+ return true;
+ }
+ if (ModelType.MAX_ITERATIONS.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
+ && limitExceeded(_iterationsDone, CrawlProperties.CRAWLING_MODEL_VALUE)) { // only checked in MAX_ITERATIONS mode
+ _log.info("Maximum number of iterations exceeded");
+ return true;
+ }
+ if (ModelType.MAX_DEPTH.value().equals(_configuration.get(CrawlProperties.CRAWLING_MODEL_TYPE))
+ && limitExceeded(_currentDepth, CrawlProperties.CRAWLING_MODEL_VALUE)) { // only checked in MAX_DEPTH mode
+ _log.info("Maximum depth exceeded!");
+ return true;
+ }
+ return false; // none of the checks above fired
+ }
+
/**
* Limit exceeded.
*
@@ -335,6 +303,50 @@
return false;
}
+ /** Queues the links of a fetcher output: page outlinks go to _linksToDoNextLevel, sitemap links go to _linksToDo. */
+ private void updateTodoLinks(final FetcherOutput fetcherOutput) {
+ final Outlink[] outlinks = fetcherOutput.getParse().getData().getOutlinks();
+ if (outlinks != null && outlinks.length > 0) {
+ for (final Outlink link : outlinks) {
+ // links found on the page itself are queued for the next level
+ _linksToDoNextLevel.add(link);
+ if (_log.isDebugEnabled()) {
+ _log.debug("added new link to do:" + link.toString());
+ }
+ }
+ }
+ final Outlink[] sitemapOutlinks = fetcherOutput.getSitemapLinks();
+ if (sitemapOutlinks != null && sitemapOutlinks.length > 0) {
+ for (final Outlink link : sitemapOutlinks) {
+ // links taken from a sitemap file are queued on the same level that is currently being worked on
+ _linksToDo.add(link);
+ if (_log.isDebugEnabled()) {
+ _log.debug("added new link from sitemap file:" + link.toString());
+ }
+ }
+ }
+ }
+
+ /** Builds an IndexDocument from a fetcher output: URL, title, content bytes, response headers, HTML meta tags, and both lists combined. */
+ private IndexDocument createDocument(final FetcherOutput fetcherOutput) {
+ IndexDocument document;
+ final String url = fetcherOutput.getContent().getUrl();
+ final String title = fetcherOutput.getParse().getData().getTitle();
+ // the content is taken as bytes from getContent(), not as the parsed text from getParse().getText()
+ final byte[] content = fetcherOutput.getContent().getContent();
+
+ final List<String> responseHeaders = fetcherOutput.getParse().getData().getContentMeta().toArrayList();
+ final List<String> htmlMetaData = fetcherOutput.getParse().getData().getHtmlMetaTags().toArrayList();
+
+ final List<String> metaDataWithResponseHeaderFallBack = new ArrayList<String>(); // headers first, then HTML meta tags
+ metaDataWithResponseHeaderFallBack.addAll(responseHeaders);
+ metaDataWithResponseHeaderFallBack.addAll(htmlMetaData);
+
+ document =
+ new IndexDocument(url, title, content, responseHeaders, htmlMetaData, metaDataWithResponseHeaderFallBack);
+ return document;
+ }
+
/**
* Empty implementation of the Iterator method.
*/
diff --git a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java
index af4ad89..1bb448f 100644
--- a/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java
+++ b/core/org.eclipse.smila.connectivity.framework.crawler.web/code/src/org/eclipse/smila/connectivity/framework/crawler/web/fetcher/Fetcher.java
@@ -160,7 +160,7 @@
}
/**
- * Fetches and parses the link.
+ * Fetches the page that the link points to and parses it.
*
* @param link
* link to fetch
@@ -173,7 +173,6 @@
*/
public FetcherOutput fetch(Outlink link, final FilterProcessor filterProcessor, final Set<Outlink> linksDone) {
try {
- // fetch the page
boolean redirecting;
boolean retrying;
int redirectCount = 0;
@@ -216,7 +215,9 @@
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
link = getRedirectLink(link, pstatus.getMessage(), filterProcessor, linksDone);
- if (link != null) {
+ if (link == null) {
+ output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
+ } else {
redirecting = true;
redirectCount++;
}
@@ -226,7 +227,9 @@
case HttpStatus.MOVED: // redirect
case HttpStatus.TEMP_MOVED:
link = getRedirectLink(link, status.getMessage(), filterProcessor, linksDone);
- if (link != null) {
+ if (link == null) {
+ output(url, null, HttpStatus.NOTFETCHING, sitemapLinks);
+ } else {
redirecting = true;
redirectCount++;
}
@@ -261,23 +264,19 @@
}
output(url, null, HttpStatus.GONE, sitemapLinks);
}
-
if (redirecting && redirectCount >= _maxRedirect) {
if (_log.isInfoEnabled()) {
_log.info(" - redirect count exceeded " + url);
}
output(url, null, HttpStatus.GONE, sitemapLinks);
}
-
if (retrying && retriesCount >= _maxRetries) {
if (_log.isInfoEnabled()) {
_log.info(" - retries count exceeded " + url);
}
output(url, null, HttpStatus.GONE, sitemapLinks);
}
-
continueFetching = redirecting && redirectCount < _maxRedirect || retrying && retriesCount < _maxRetries;
-
} while (continueFetching);
_performanceCounters.incrementBy(WebCrawler.POC_AVEREGE_TIME_TO_FETCH, (System.currentTimeMillis() - start)
/ MILLISECS_IN_SEC);