| /******************************************************************************* |
| * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation |
| *******************************************************************************/ |
| |
| package org.eclipse.smila.importing.crawler.web; |
| |
| import org.eclipse.smila.datamodel.Record; |
| import org.eclipse.smila.importing.ContentFetcher; |
| |
| /** |
| * Interface for the Fetcher service of the WebCrawlerWorker and WebFetcherWorker. The fetcher is responsible for |
| * getting metadata and content of web resources. |
| * |
| * <p> |
| * {@link #crawl} is the entry point used by the WebCrawlerWorker, {@link #fetch} the one used by the |
| * WebFetcherWorker. Both report failures as {@link WebCrawlerException}. |
| * </p> |
| * |
| * @author scum36 |
| * |
| */ |
| public interface Fetcher extends ContentFetcher { |
| /** |
| * Invoked by WebCrawlerWorker to resolve the URL in an input record. Must write metadata from the HTTP header to |
| * attributes, and attach the content of resources that can be used for link extraction. |
| * |
| * @param url |
| * the url to crawl |
| * @param linkRecord |
| * record containing the URL and maybe additional information necessary to access the web resource. |
| * @param context |
| * the crawling context supplied by the calling worker. NOTE(review): what it carries is defined by |
| * {@link WebCrawlingContext} and is not visible from this interface - confirm before relying on it. |
| * @throws WebCrawlerException |
| * if resource cannot be crawled. If recoverable the request should be retried later, else the record should |
| * be skipped by the crawler worker. |
| */ |
| void crawl(String url, Record linkRecord, WebCrawlingContext context) throws WebCrawlerException; |
| |
| /** |
| * Invoked by WebFetcherWorker to get the content of a resource for which the crawler did not already attach the |
| * content. |
| * |
| * <p> |
| * <b>Please note: the crawledRecord will already have been mapped.</b> |
| * </p> |
| * |
| * @param url |
| * the url to fetch into the record |
| * @param crawledRecord |
| * the (already mapped) record into which the content of the resource is fetched. |
| * @param context |
| * the crawling context supplied by the calling worker. NOTE(review): what it carries is defined by |
| * {@link WebCrawlingContext} and is not visible from this interface - confirm before relying on it. |
| * @throws WebCrawlerException |
| * if resource cannot be fetched. If recoverable the request should be retried later, else the record should |
| * be skipped by the crawler worker. |
| */ |
| void fetch(String url, Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException; |
| |
| } |