/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web.test;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.http.server.HttpService;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerWorker;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.jobmanager.JobRunEngine;
import org.eclipse.smila.jobmanager.JobState;
import org.eclipse.smila.jobmanager.definitions.DefinitionPersistence;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobManagerConstants;
import org.eclipse.smila.jobmanager.definitions.JobRunMode;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.objectstore.ObjectStoreService;
import org.eclipse.smila.objectstore.StoreObject;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.utils.service.ServiceUtils;

import org.osgi.framework.ServiceReference;

/** test class for WebCrawlerWorker. */
public class TestWebCrawlerWorker extends WebExtractorTestBase {

  private static final String JOBNAME_CRAWLWEB = "crawlWeb";
  private static final String STORENAME = "buckets";
  private static final String BUCKET_LINKS = "outgoingLinks/";
  private static final String BUCKET_RECORDS = "crawledRecords/";
  private static final String BASEURL = "http://localhost:8765/files/";

  private JobRunEngine _jobRunEngine;
  private JobRunDataProvider _jobRunDataProvider;
  private DefinitionPersistence _defPersistence;
  private ObjectStoreService _objectStore;
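
  /**
   * mapper from internal crawler attribute names to the configured record property names. Note: this field
   * initializer assumes that _webCrawlingContext is already set up by WebExtractorTestBase when subclass fields are
   * initialized.
   */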
  private final PropertyNameMapper _mapper = _webCrawlingContext.getMapper();

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _jobRunEngine = getService(JobRunEngine.class);
    _jobRunDataProvider = getService(JobRunDataProvider.class);
    _defPersistence = getService(DefinitionPersistence.class);
    _objectStore = getService(ObjectStoreService.class);
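    // recreate the store so that every test starts with empty buckets and deterministic bulk counts.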
    _objectStore.removeStore(STORENAME);
    _objectStore.ensureStore(STORENAME);
    getService(HttpService.class); // wait for the system to be started.
  }

  /** assert that we are testing the right implementation. */
  @SuppressWarnings("rawtypes")
  public void testService() throws Exception {
    final ServiceReference[] services = ServiceUtils.getServiceReferences(Worker.class);
    assertTrue("no worker services started.", services.length > 0);
    for (final ServiceReference service : services) {
      final Worker worker = ServiceUtils.getService(service, Worker.class);
      if (worker instanceof WebCrawlerWorker) {
        assertEquals("webCrawler", worker.getName());
        return; // worker found, test ok.
      }
    }
    fail("WebCrawlerWorker not found");
  }

  /** crawl a single page with one link. */
  public void testCrawlPageWithOneLink() throws Exception {
    runWebCrawlerJob("links1.html");
    checkOutgoingLinks(BASEURL + "index.html");
    checkCrawledRecords(1, BASEURL + "links1.html");
  }

  /** test with a filter include pattern that filters out an external link. */
  public void testCrawlPageWithFilteringExternalLink() throws Exception {
    final AnyMap filterParams = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    filterParams.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
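    // only URLs below the local test server match, so links to external hosts are dropped.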
    includePatterns.add(BASEURL + ".*");
    runWebCrawlerJob("links2.html", filterParams);
    checkOutgoingLinks(BASEURL + "index.html");
    checkCrawledRecords(1, BASEURL + "links2.html");
  }

  /** crawl a single page with links of several types: with fragment, non-html, with parameters (filtered out). */
  public void testCrawlPageWithMixedLinks() throws Exception {
    final AnyMap filterParams = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    filterParams.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
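    // exclude every URL that contains a query part ('?').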
    excludePatterns.add(".*\\?.*");
    runWebCrawlerJob("links3.html", filterParams);
    checkOutgoingLinks(BASEURL + "index.html", BASEURL + "plain.txt", BASEURL + "icon.png");
    checkCrawledRecords(1, BASEURL + "links3.html");
  }

  /** crawl with many links. */
  public void testCrawlPageWithManyLinks() throws Exception {
    runWebCrawlerJob("links11.html");
    checkOutgoingLinks(BASEURL + "page00.html", BASEURL + "page01.html", BASEURL + "page10.html",
        BASEURL + "page02.html", BASEURL + "page03.html", BASEURL + "page04.html", BASEURL + "page05.html",
        BASEURL + "page06.html", BASEURL + "page07.html", BASEURL + "page08.html", BASEURL + "page09.html");
    final List<Record> records = checkCrawledRecords(1, BASEURL + "links11.html");
    for (final Record record : records) {
      assertFalse(record.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
    }
  }

  /** crawl a plain-text resource. */
  public void testCrawlTextResource() throws Exception {
    runWebCrawlerJob("plain.txt");
    checkOutgoingLinks();
    final List<Record> records = checkCrawledRecords(1, BASEURL + "plain.txt");
    assertFalse(records.get(0).getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
  }

  /** crawl a binary resource. */
  public void testCrawlBinaryResource() throws Exception {
    runWebCrawlerJob("icon.png");
    checkOutgoingLinks();
    final List<Record> records = checkCrawledRecords(1, BASEURL + "icon.png");
    assertFalse(records.get(0).getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
  }

  /** crawl a compound (zip) resource: it must be flagged as compound and must not have a content attachment. */
  public void testCrawlCompoundWithOneLink() throws Exception {
    runWebCrawlerJob("document_in_compound.zip");
    checkOutgoingLinks();
    final List<Record> records = checkCrawledRecords(1, BASEURL + "document_in_compound.zip");
    for (final Record record : records) {
      assertTrue(record.getMetadata().getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
      assertFalse(record.hasAttachments());
      assertEquals("application/zip",
          record.getMetadata().getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE)));
      assertTrue(record.getMetadata().containsKey(_mapper.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED)));
    }
  }

  /** start a webcrawler job from template in config and wait until it is finished. */
  private String runWebCrawlerJob(final String startFile) throws Exception {
    return runWebCrawlerJob(startFile, null);
  }

  /** start a webcrawler job from template in config and wait until it is finished. */
  private String runWebCrawlerJob(final String startFile, final AnyMap filterParams) throws Exception {
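    // clone the job template, give the clone a unique name and set the start URL (and optional filters).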
    final JobDefinition jobTemplate = _defPersistence.getJob(JOBNAME_CRAWLWEB + "Template");
    final String jobName = JOBNAME_CRAWLWEB + System.nanoTime();
    final AnyMap jobAny = jobTemplate.toAny(false);
    jobAny.put("name", jobName);
    jobAny.getMap("parameters").put("startUrl", BASEURL + startFile);
    if (filterParams != null) {
      jobAny.getMap("parameters").put("filters", filterParams);
    }
    final JobDefinition job = new JobDefinition(jobAny);
    _defPersistence.addJob(job);
    final String jobRunId = _jobRunEngine.startJob(jobName, JobRunMode.RUNONCE);
    waitForJobRunSucceeded(jobName, jobRunId, 10000);
    return jobRunId;
  }

  /** waits for a job run to complete and asserts that it succeeded. */
  private void waitForJobRunSucceeded(final String jobName, final String jobId, final long maxWaitTime)
      throws Exception {
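    // poll the completed job runs until the given run id shows up or maxWaitTime is exceeded.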
    final long sleepTime = 500L;
    final long millisStarted = System.currentTimeMillis();
    while (true) {
      final Collection<String> completedIds = _jobRunDataProvider.getCompletedJobRunIds(jobName);
      if (completedIds.contains(jobId)) {
        final AnyMap runData = _jobRunDataProvider.getJobRunData(jobName, jobId);
        final String jobRunState = runData.getStringValue(JobManagerConstants.DATA_JOB_STATE);
        assertEquals(JobState.SUCCEEDED, JobState.valueOf(jobRunState));
        return;
      }
      assertTrue("Waited too long for job to complete", System.currentTimeMillis() - millisStarted <= maxWaitTime);
      Thread.sleep(sleepTime);
    }
  }

  /**
   * check number and content of linksToCrawl bulk. Expects each link in a separate bulk. Outgoing link records are
   * NOT mapped!
   */
  private void checkOutgoingLinks(final String... urls) throws Exception {
    final int expectedNumberOfBulks = urls.length;
    final List<StoreObject> bulks = getSortedBulks(BUCKET_LINKS, expectedNumberOfBulks);
    if (expectedNumberOfBulks > 0) {
      int recordCount = 0;
      for (final StoreObject bulk : bulks) {
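        // each bulk is a stream of binary-encoded records, read back via BinaryObjectStreamIterator.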
        final BinaryObjectStreamIterator bulkReader =
            new BinaryObjectStreamIterator(_objectStore.readObject(STORENAME, bulk.getId()));
        try {
          while (bulkReader.hasNext()) {
            final Record linkRecord = bulkReader.next();
            System.out.println("Outgoing Link: " + linkRecord);
            assertTrue("Too many links", recordCount < urls.length);
            assertEquals(urls[recordCount],
                linkRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
            recordCount++;
          }
        } finally {
          bulkReader.close();
        }
      }
      assertEquals("Too few links.", urls.length, recordCount);
    }
  }

  /** check number and content of crawledRecords bulk. */
  private List<Record> checkCrawledRecords(final int expectedNumberOfBulks, final String... urls) throws Exception {
    final List<StoreObject> bulks = getSortedBulks(BUCKET_RECORDS, expectedNumberOfBulks);
    final List<Record> records = new ArrayList<Record>();
    if (expectedNumberOfBulks > 0) {
      int recordCount = 0;
      for (final StoreObject bulk : bulks) {
        final BinaryObjectStreamIterator bulkReader =
            new BinaryObjectStreamIterator(_objectStore.readObject(STORENAME, bulk.getId()));
        try {
          while (bulkReader.hasNext()) {
            final Record crawledRecord = bulkReader.next();
            System.out.println("Crawled Record: " + crawledRecord);
            assertTrue("Too many records", recordCount < urls.length);
            assertEquals("web", crawledRecord.getSource());
            final AnyMap metadata = crawledRecord.getMetadata();
            assertEquals(urls[recordCount], metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_URL)));
            assertTrue(metadata.get(_mapper.get(WebCrawlerConstants.ATTRIBUTE_SIZE)).isLong());
            assertTrue(metadata.get(_mapper.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED)).isDateTime());
            assertTrue(metadata.containsKey(ImportingConstants.ATTRIBUTE_DELTA_HASH));
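            // the content-type attribute may carry additional parameters (e.g. a charset), so it only has to
            // start with the mime type.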
            final String mimetype = metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE));
            assertTrue(metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE))
                .startsWith(mimetype));
            if ("text/html".equals(mimetype)) {
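              // HTML pages must have their content attached and the attachment length must match the SIZE
              // attribute; all other resources are expected to have no content attachment here.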
              assertTrue(crawledRecord.hasAttachment(_mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)));
              assertEquals(
                  crawledRecord.getAttachmentAsBytes(_mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)).length,
                  metadata.getLongValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_SIZE)).intValue());
            } else {
              assertFalse(crawledRecord.hasAttachment(_mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)));
            }
            recordCount++;
            records.add(crawledRecord);
          }
        } finally {
          bulkReader.close();
        }
      }
      assertEquals("Too few records.", urls.length, recordCount);
    }
    return records;
  }

  /** get bulks from bucket sorted by IDs. assert that the number is as expected. */
  private List<StoreObject> getSortedBulks(final String bucket, final int expectedNumberOfBulks)
      throws ObjectStoreException {
    final List<StoreObject> bulks = new ArrayList<StoreObject>(_objectStore.getStoreObjectInfos(STORENAME, bucket));
    assertEquals(expectedNumberOfBulks, bulks.size());
    Collections.sort(bulks, new Comparator<StoreObject>() {
      @Override
      public int compare(final StoreObject o1, final StoreObject o2) {
        return o1.getId().compareTo(o2.getId());
      }
    });
    return bulks;
  }
}