core/org.eclipse.smila.importing.crawler.web/code/src/org/eclipse/smila/importing/crawler/web/Fetcher.java - gerrit/smila/org.eclipse.smila.core - Git at Google

 /*******************************************************************************
  * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
  * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
  * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
  *******************************************************************************/

 package org.eclipse.smila.importing.crawler.web;

 import org.eclipse.smila.datamodel.Record;
 import org.eclipse.smila.importing.ContentFetcher;

 /**
  * Interface for the Fetcher service of the WebCrawlerWorker and WebFetcherWorker. The fetcher is responsible for
  * getting the metadata and the content of web resources. It extends {@link ContentFetcher}.
  *
  * @author scum36
  *
  */
 public interface Fetcher extends ContentFetcher {
   /**
    * Invoked by WebCrawlerWorker to resolve the URL in an input record. Must write metadata from the HTTP header to
    * attributes, and attach the content of resources that can be used for link extraction.
    *
    * @param url
    *          the url to crawl
    * @param linkRecord
    *          record containing the URL and maybe additional information necessary to access the web resource.
    * @param context
    *          the crawling context passed along with this call. NOTE(review): what it carries is not visible in this
    *          file; confirm against WebCrawlingContext.
    * @throws WebCrawlerException
    *           if resource cannot be crawled. If recoverable the request should be retried later, else the record should
    *           be skipped by the crawler worker.
    */
   void crawl(String url, Record linkRecord, WebCrawlingContext context) throws WebCrawlerException;

   /**
    * Invoked by WebFetcherWorker to get the content of a resource for which the crawler did not already attach the
    * content.
    *
    * <p>
    * <b>Please note: the crawledRecord will already have been mapped.</b>
    * </p>
    *
    * @param url
    *          the url to fetch into the record
    * @param crawledRecord
    *          the (already mapped) record into which the content of the resource is fetched.
    * @param context
    *          the crawling context passed along with this call. NOTE(review): what it carries is not visible in this
    *          file; confirm against WebCrawlingContext.
    * @throws WebCrawlerException
    *           if resource cannot be fetched. If recoverable the request should be retried later, else the record should
    *           be skipped by the crawler worker.
    */
   void fetch(String url, Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException;

 }
	/*******************************************************************************
	* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
	* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
	* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
	*******************************************************************************/

	package org.eclipse.smila.importing.crawler.web;

	import org.eclipse.smila.datamodel.Record;
	import org.eclipse.smila.importing.ContentFetcher;

	/**
	* Interface for the Fetcher service of the WebCrawlerWorker and WebFetcherWorker. The fetcher is responsible for
	* getting the metadata and the content of web resources. It extends {@link ContentFetcher}.
	*
	* @author scum36
	*
	*/
	public interface Fetcher extends ContentFetcher {
	/**
	* Invoked by WebCrawlerWorker to resolve the URL in an input record. Must write metadata from the HTTP header to
	* attributes, and attach the content of resources that can be used for link extraction.
	*
	* @param url
	* the url to crawl
	* @param linkRecord
	* record containing the URL and maybe additional information necessary to access the web resource.
	* @param context
	* the crawling context passed along with this call. NOTE(review): what it carries is not visible in this
	* file; confirm against WebCrawlingContext.
	* @throws WebCrawlerException
	* if resource cannot be crawled. If recoverable the request should be retried later, else the record should
	* be skipped by the crawler worker.
	*/
	void crawl(String url, Record linkRecord, WebCrawlingContext context) throws WebCrawlerException;

	/**
	* Invoked by WebFetcherWorker to get the content of a resource for which the crawler did not already attach the
	* content.
	*
	* <p>
	* <b>Please note: the crawledRecord will already have been mapped.</b>
	* </p>
	*
	* @param url
	* the url to fetch into the record
	* @param crawledRecord
	* the (already mapped) record into which the content of the resource is fetched.
	* @param context
	* the crawling context passed along with this call. NOTE(review): what it carries is not visible in this
	* file; confirm against WebCrawlingContext.
	* @throws WebCrawlerException
	* if resource cannot be fetched. If recoverable the request should be retried later, else the record should
	* be skipped by the crawler worker.
	*/
	void fetch(String url, Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException;

	}