| /******************************************************************************* |
| * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation |
| *******************************************************************************/ |
| |
| package org.eclipse.smila.importing.crawler.web; |
| |
| import java.util.Collection; |
| |
| import org.eclipse.smila.datamodel.Record; |
| |
| /** |
| * interface for LinkFilter services. The LinkFilter is called on the result of the {@link LinkExtractor} to select only |
| * those links that should really be followed in follow-up tasks. |
| */ |
| public interface LinkFilter { |
| /** |
| * filter links extracted from given source URL. |
| * |
| * @param extractedLinks |
| * result from {@link LinkExtractor} service. |
| * @param originalUrl |
| * the source URL from which the links were extracted |
| * @param context |
| * the {@link WebCrawlingContext}. |
| * @return links to follow in follow-up tasks |
| */ |
| Collection<Record> filterExtractedLinks(Collection<Record> extractedLinks, String originalUrl, |
| WebCrawlingContext context) throws WebCrawlerException; |
| |
| /** |
| * Check if it is allowed to follow a given redirect link. |
| * |
| * @param link |
| * a String containing the link to be checked |
| * @param originalUrl |
| * the original URL that was redirected. |
| * @param context |
| * the {@link WebCrawlingContext}. |
| * @return true if the link is allowed to be followed, false otherwise |
| */ |
| boolean allowRedirectLink(String link, String originalUrl, WebCrawlingContext context) throws WebCrawlerException; |
| } |