blob: 56f8b14fe2c8e8677977f31157eebf2960cc6175 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web;
import java.util.Collection;
import org.eclipse.smila.datamodel.Record;
/**
 * Contract for LinkFilter services. A LinkFilter is applied to the output of the {@link LinkExtractor} and keeps only
 * those links that should actually be followed in follow-up tasks.
 */
public interface LinkFilter {

  /**
   * Filters the links that were extracted from the given source URL.
   *
   * @param extractedLinks
   *          the result produced by the {@link LinkExtractor} service.
   * @param originalUrl
   *          the source URL the links were extracted from.
   * @param context
   *          the {@link WebCrawlingContext}.
   * @return the links to follow in follow-up tasks.
   * @throws WebCrawlerException
   *           declared by this contract; the interface itself does not specify the failure conditions.
   */
  Collection<Record> filterExtractedLinks(
    Collection<Record> extractedLinks,
    String originalUrl,
    WebCrawlingContext context) throws WebCrawlerException;

  /**
   * Checks whether a given redirect link is allowed to be followed.
   *
   * @param link
   *          a String containing the redirect link to check.
   * @param originalUrl
   *          the original URL that was redirected.
   * @param context
   *          the {@link WebCrawlingContext}.
   * @return {@code true} if the link is allowed to be followed, {@code false} otherwise.
   * @throws WebCrawlerException
   *           declared by this contract; the interface itself does not specify the failure conditions.
   */
  boolean allowRedirectLink(
    String link,
    String originalUrl,
    WebCrawlingContext context) throws WebCrawlerException;
}