package org.eclipse.smila.importing.crawler.web.test;

import java.util.ArrayList;
import java.util.Collection;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.filter.DefaultLinkFilter;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;

/** Tests for {@link DefaultLinkFilter} class. */
public class TestDefaultLinkFilter extends WebExtractorTestBase {

  /** base URL for testing. */
  private static final String START_URL = "http://www.example.com/";

  /** the link filter service under test. */
  private DefaultLinkFilter _filter;

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _filter = (DefaultLinkFilter) getService(LinkFilter.class);
  }

  /** check with empty link list. */
  public void testEmptyLinks() throws Exception {
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check without filter configuration. */
  public void testWithoutConfig() throws Exception {
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check with matching url include pattern. */
  public void testIncludeMatching() throws Exception {
    // create config
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
    includePatterns.add("http:.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check with non-matching url include pattern. */
  public void testIncludeNonMatching() throws Exception {
    // create config
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
    includePatterns.add("ftp:.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertTrue(filteredLinks.isEmpty());
  }

  /** check with matching url exclude pattern. */
  public void testExcludeMatching() throws Exception {
    // create config
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    excludePatterns.add("http:.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertTrue(filteredLinks.isEmpty());
  }

  /** check with non-matching url exclude pattern. */
  public void testExcludeNonMatching() throws Exception {
    // create config
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    excludePatterns.add("ftp:.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check with matching url include and non-matching exclude patterns. */
  public void testInAndExcludeMatching() throws Exception {
    // create config
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    includePatterns.add("http:.*");
    excludePatterns.add("ftp:.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check with non-matching url include and matching exclude patterns. */
  public void testInAndExcludeNonMatching() throws Exception {
    // create config
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    includePatterns.add("ftp:.*");
    excludePatterns.add("http:.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    assertTrue(filteredLinks.isEmpty());
  }

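  /**
   * check with url matching both include and exclude patterns. This test is an added sketch, not
   * part of the original suite: it assumes that exclude patterns take precedence over include
   * patterns, so a link matching both is dropped. Adjust the expectation if
   * {@link DefaultLinkFilter} resolves the overlap differently.
   */
  public void testInAndExcludeBothMatching() throws Exception {
    // create config: include and exclude patterns that both match the test URL
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    includePatterns.add("http:.*");
    excludePatterns.add(".*index.*");
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, configAny);
    initWebCrawlingContext(taskParameters);
    // test
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, _webCrawlingContext);
    // assumption: the exclude pattern wins, so no link passes the filter
    assertTrue(filteredLinks.isEmpty());
  }
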
  /** create a link record with the given URL and a unique record id. */
  private Record createLinkRecord(final String url) {
    final Record record =
        DataFactory.DEFAULT.createRecord("TestDefaultLinkFilter", Long.toString(System.nanoTime()));
    record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, url);
    return record;
  }
}