package org.eclipse.smila.importing.crawler.web.filter; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import java.util.HashSet; | |
import java.util.Set; | |
import org.eclipse.smila.datamodel.Record; | |
import org.eclipse.smila.importing.crawler.web.LinkFilter; | |
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants; | |
import org.eclipse.smila.importing.crawler.web.WebCrawlerException; | |
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext; | |
import org.eclipse.smila.taskworker.TaskLog; | |
/** Default @ LinkFilter} implementation. */ | |
public class DefaultLinkFilter implements LinkFilter { | |
@Override | |
public Collection<Record> filterLinks(final Collection<Record> extractedLinks, final WebCrawlingContext context) | |
throws WebCrawlerException { | |
final Set<String> links = new HashSet<String>(); | |
final Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size()); | |
for (final Record link : extractedLinks) { | |
final String url = link.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL); | |
if (isLinkAllowed(context.getFilterConfiguration(), url, links, context.getTaskLog())) { | |
filteredLinks.add(link); | |
} | |
} | |
return filteredLinks; | |
} | |
@Override | |
public boolean allowLink(final String url, final WebCrawlingContext context) throws WebCrawlerException { | |
if (url != null) { | |
final FilterConfiguration filterConfig = context.getFilterConfiguration(); | |
if (filterConfig != null && filterConfig.getUrlPatternMatcher().matches(url)) { | |
return true; | |
} | |
} | |
return false; | |
} | |
/** check if URL is no duplicate (not already contained in <tt>links</tt>) and is not filtered out by url patterns. */ | |
private boolean isLinkAllowed(final FilterConfiguration filterConfig, final String url, final Set<String> links, | |
final TaskLog log) { | |
if (!links.add(url)) { | |
// filter out duplicates | |
return false; | |
} | |
if (filterConfig != null && !filterConfig.getUrlPatternMatcher().matches(url)) { | |
// filter out non-matching URLs | |
return false; | |
} | |
return true; | |
} | |
} |