package org.eclipse.smila.importing.crawler.web.test; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import org.eclipse.smila.datamodel.AnyMap; | |
import org.eclipse.smila.datamodel.AnySeq; | |
import org.eclipse.smila.datamodel.DataFactory; | |
import org.eclipse.smila.datamodel.Record; | |
import org.eclipse.smila.importing.ImportingConstants; | |
import org.eclipse.smila.importing.crawler.web.LinkFilter; | |
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants; | |
import org.eclipse.smila.importing.crawler.web.filter.DefaultLinkFilter; | |
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration; | |
/** Tests for @ DefaultLinkFilter} class. */ | |
public class TestDefaultLinkFilter extends WebExtractorTestBase { | |
/** base URL for testing. */ | |
private static final String START_URL = "http://www.example.com/"; | |
private DefaultLinkFilter _filter; | |
@Override | |
protected void setUp() throws Exception { | |
_filter = (DefaultLinkFilter) getService(LinkFilter.class); | |
} | |
/** check with empty link list. */ | |
public void testEmptyLinks() throws Exception { | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertEquals(extractedLinks, filteredLinks); | |
} | |
/** check without filter configuration. */ | |
public void testWithoutConfig() throws Exception { | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertEquals(extractedLinks, filteredLinks); | |
} | |
/** check with matching url include pattern. */ | |
public void testIncludeMatching() throws Exception { | |
// create config | |
final AnyMap configAny = DataFactory.DEFAULT.createAnyMap(); | |
final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap(); | |
configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns); | |
final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq(); | |
urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns); | |
includePatterns.add("http:.*"); | |
final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap(); | |
taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny); | |
initWebCrawlingContext(taskParameters); | |
// test | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertEquals(extractedLinks, filteredLinks); | |
} | |
/** check with non-matching url include pattern. */ | |
public void testIncludeNonMatching() throws Exception { | |
// create config | |
final AnyMap configAny = DataFactory.DEFAULT.createAnyMap(); | |
final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap(); | |
configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns); | |
final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq(); | |
urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns); | |
includePatterns.add("ftp:.*"); | |
final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap(); | |
taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny); | |
initWebCrawlingContext(taskParameters); | |
// test | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertTrue(filteredLinks.isEmpty()); | |
} | |
/** check with matching url exclude pattern. */ | |
public void testExcludeMatching() throws Exception { | |
// create config | |
final AnyMap configAny = DataFactory.DEFAULT.createAnyMap(); | |
final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap(); | |
configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns); | |
final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq(); | |
urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns); | |
excludePatterns.add("http:.*"); | |
final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap(); | |
taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny); | |
initWebCrawlingContext(taskParameters); | |
// test | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertTrue(filteredLinks.isEmpty()); | |
} | |
/** check with non-matching url exclude pattern. */ | |
public void testExcludeNonMatching() throws Exception { | |
// create config | |
final AnyMap configAny = DataFactory.DEFAULT.createAnyMap(); | |
final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap(); | |
configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns); | |
final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq(); | |
urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns); | |
excludePatterns.add("ftp:.*"); | |
final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap(); | |
taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny); | |
initWebCrawlingContext(taskParameters); | |
// test | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertEquals(extractedLinks, filteredLinks); | |
} | |
/** check with matching url include and exclude patterns. */ | |
public void testInAndExcludeMatching() throws Exception { | |
// create config | |
final AnyMap configAny = DataFactory.DEFAULT.createAnyMap(); | |
final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap(); | |
configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns); | |
final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq(); | |
final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq(); | |
urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns); | |
urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns); | |
includePatterns.add("http:.*"); | |
excludePatterns.add("ftp:.*"); | |
final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap(); | |
taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny); | |
initWebCrawlingContext(taskParameters); | |
// test | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertEquals(extractedLinks, filteredLinks); | |
} | |
/** check with non-matching url include and exclude patterns. */ | |
public void testInAndExcludeNonMatching() throws Exception { | |
// create config | |
final AnyMap configAny = DataFactory.DEFAULT.createAnyMap(); | |
final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap(); | |
configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns); | |
final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq(); | |
final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq(); | |
urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns); | |
urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns); | |
includePatterns.add("ftp:.*"); | |
excludePatterns.add("http:.*"); | |
final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap(); | |
taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny); | |
initWebCrawlingContext(taskParameters); | |
// test | |
final Collection<Record> extractedLinks = new ArrayList<Record>(); | |
extractedLinks.add(createLinkRecord(START_URL + "index.html")); | |
final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext); | |
assertTrue(filteredLinks.isEmpty()); | |
} | |
/** create a link record. */ | |
private Record createLinkRecord(final String url) { | |
final Record record = | |
DataFactory.DEFAULT.createRecord("TestSimpleLinkFilter", Long.toString(System.nanoTime())); | |
record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, url); | |
return record; | |
} | |
} |