// blob: 95d61d986d68b3932bf2daf620a7eadaa9075954 [file] [log] [blame]
package org.eclipse.smila.importing.crawler.web.filter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.taskworker.TaskLog;
/**
 * Default {@link LinkFilter} implementation.
 *
 * <p>
 * Links are filtered by the URL pattern matcher of the crawling context's filter configuration. When filtering a
 * collection of extracted links, duplicate URLs within that collection are removed as well.
 * </p>
 */
public class DefaultLinkFilter implements LinkFilter {

  /**
   * Filters the extracted link records, keeping the first record for each distinct URL that passes the filter
   * configuration of the given context. The order of the remaining records is preserved.
   *
   * <p>
   * Records without a URL attribute value are dropped. If the context has no filter configuration, no pattern
   * matching is applied and only URL-less records and duplicates are removed.
   * </p>
   *
   * @param extractedLinks
   *          link records to filter; the URL is read from {@link WebCrawlerConstants#ATTRIBUTE_URL}.
   * @param context
   *          crawling context providing the filter configuration and the task log.
   * @return a new collection containing the accepted link records.
   * @throws WebCrawlerException
   *           declared by the {@link LinkFilter} contract.
   */
  @Override
  public Collection<Record> filterLinks(final Collection<Record> extractedLinks, final WebCrawlingContext context)
    throws WebCrawlerException {
    // URLs seen so far in this call, used to drop duplicates
    final Set<String> links = new HashSet<String>();
    final Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size());
    for (final Record link : extractedLinks) {
      final String url = link.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
      if (isLinkAllowed(context.getFilterConfiguration(), url, links, context.getTaskLog())) {
        filteredLinks.add(link);
      }
    }
    return filteredLinks;
  }

  /**
   * Checks a single URL against the filter configuration of the given context.
   *
   * <p>
   * Note that, unlike {@link #filterLinks(Collection, WebCrawlingContext)}, this method rejects the URL if the
   * context has no filter configuration.
   * </p>
   *
   * @param url
   *          URL to check, may be <tt>null</tt>.
   * @param context
   *          crawling context providing the filter configuration.
   * @return <tt>true</tt> only if the URL is not <tt>null</tt>, a filter configuration exists and its URL pattern
   *         matcher matches the URL.
   * @throws WebCrawlerException
   *           declared by the {@link LinkFilter} contract.
   */
  @Override
  public boolean allowLink(final String url, final WebCrawlingContext context) throws WebCrawlerException {
    if (url == null) {
      return false;
    }
    final FilterConfiguration filterConfig = context.getFilterConfiguration();
    return filterConfig != null && filterConfig.getUrlPatternMatcher().matches(url);
  }

  /**
   * Checks that the URL is present, is no duplicate (not already contained in <tt>links</tt>) and is not filtered
   * out by the URL patterns. An accepted or pattern-rejected URL is added to <tt>links</tt> as a side effect.
   *
   * @param filterConfig
   *          filter configuration, may be <tt>null</tt>, in which case no pattern matching is done.
   * @param url
   *          URL to check, may be <tt>null</tt>.
   * @param links
   *          URLs already seen; updated by this method.
   * @param log
   *          task log of the current task; not used by the current checks.
   * @return <tt>true</tt> if the link should be kept.
   */
  private boolean isLinkAllowed(final FilterConfiguration filterConfig, final String url, final Set<String> links,
    final TaskLog log) {
    if (url == null) {
      // a link without URL cannot be followed; also keeps null away from the set and the pattern matcher,
      // consistent with allowLink()
      return false;
    }
    if (!links.add(url)) {
      // filter out duplicates
      return false;
    }
    if (filterConfig != null && !filterConfig.getUrlPatternMatcher().matches(url)) {
      // filter out non-matching URLs
      return false;
    }
    return true;
  }
}