/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web;

import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ContentFetcher;

/**
 * Interface for the Fetcher service used by the WebCrawlerWorker and the WebFetcherWorker. The fetcher is
 * responsible for getting the metadata and content of web resources.
*
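 * <p>
 * A minimal usage sketch from a worker's point of view (illustrative only: the surrounding worker logic and the
 * error handling are assumptions, not actual worker code):
 * </p>
 *
 * <pre>
 * {@code
 * try {
 *   fetcher.crawl(url, linkRecord, context);
 * } catch (WebCrawlerException ex) {
 *   // if the error is recoverable, schedule the record for a later retry,
 *   // otherwise skip it (see the method documentation below)
 * }
 * }
 * </pre>
 *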
* @author scum36
*
*/
public interface Fetcher extends ContentFetcher {
/**
 * Invoked by the WebCrawlerWorker to resolve the URL in an input record. Must write metadata from the HTTP headers
 * to record attributes and attach the content of resources that can be used for link extraction.
*
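 * <p>
 * A possible implementation outline (the helper type {@code HttpResponse}, the helpers {@code httpGet()},
 * {@code copyHeadersToAttributes()} and {@code isLinkExtractable()}, and the attachment name are hypothetical,
 * not part of this API):
 * </p>
 *
 * <pre>
 * {@code
 * public void crawl(String url, Record linkRecord, WebCrawlingContext context) throws WebCrawlerException {
 *   HttpResponse response = httpGet(url, context); // hypothetical HTTP helper
 *   copyHeadersToAttributes(response, linkRecord); // e.g. Content-Type, Last-Modified
 *   if (isLinkExtractable(response)) { // e.g. HTML pages that may contain further links
 *     linkRecord.setAttachment("httpContent", response.getBody());
 *   }
 * }
 * }
 * </pre>
 *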
 * @param url
 *          the URL to crawl
 * @param linkRecord
 *          record containing the URL and possibly additional information necessary to access the web resource
 * @param context
 *          the context of the current crawling process
 * @throws WebCrawlerException
 *           if the resource cannot be crawled. If the error is recoverable, the request should be retried later;
 *           otherwise the record should be skipped by the crawler worker.
*/
void crawl(String url, Record linkRecord, WebCrawlingContext context) throws WebCrawlerException;
/**
 * Invoked by the WebFetcherWorker to get the content of a resource for which the crawler did not already attach
 * the content.
*
* <p>
* <b>Please note: the crawledRecord will already have been mapped.</b>
* </p>
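 * <p>
 * A possible implementation outline (helper names are hypothetical, as in the {@link #crawl} sketch):
 * </p>
 *
 * <pre>
 * {@code
 * public void fetch(String url, Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException {
 *   HttpResponse response = httpGet(url, context); // hypothetical HTTP helper
 *   crawledRecord.setAttachment("httpContent", response.getBody()); // attach the raw content
 * }
 * }
 * </pre>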
*
 * @param url
 *          the URL whose content is to be fetched into the record
 * @param crawledRecord
 *          the crawled record to which the fetched content is attached
 * @param context
 *          the context of the current crawling process
 * @throws WebCrawlerException
 *           if the resource cannot be fetched. If the error is recoverable, the request should be retried later;
 *           otherwise the record should be skipped by the fetcher worker.
*/
void fetch(String url, Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException;
}