/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.test;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.http.server.HttpService;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerWorker;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.jobmanager.JobRunEngine;
import org.eclipse.smila.jobmanager.JobState;
import org.eclipse.smila.jobmanager.definitions.DefinitionPersistence;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobManagerConstants;
import org.eclipse.smila.jobmanager.definitions.JobRunMode;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.objectstore.ObjectStoreService;
import org.eclipse.smila.objectstore.StoreObject;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.utils.service.ServiceUtils;
import org.osgi.framework.ServiceReference;

/** test class for WebCrawlerWorker. */
public class TestWebCrawlerWorker extends WebExtractorTestBase {
  private static final String JOBNAME_CRAWLWEB = "crawlWeb";

  private static final String STORENAME = "buckets";

  private static final String BUCKET_LINKS = "outgoingLinks/";

  private static final String BUCKET_RECORDS = "crawledRecords/";

  private static final String BASEURL = "http://localhost:8765/files/";

  private JobRunEngine _jobRunEngine;

  private JobRunDataProvider _jobRunDataProvider;

  private DefinitionPersistence _defPersistence;

  private ObjectStoreService _objectStore;

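  /** property name mapper from the base class' web crawling context, used to resolve the mapped record attribute names. */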
  private final PropertyNameMapper _mapper = _webCrawlingContext.getMapper();

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _jobRunEngine = getService(JobRunEngine.class);
    _jobRunDataProvider = getService(JobRunDataProvider.class);
    _defPersistence = getService(DefinitionPersistence.class);
    _objectStore = getService(ObjectStoreService.class);
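    // start each test with a fresh, empty bucket store.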
    _objectStore.removeStore(STORENAME);
    _objectStore.ensureStore(STORENAME);
    getService(HttpService.class); // wait until the system has started.
  }

  /** assert that we are testing the right implementation. */
  @SuppressWarnings("rawtypes")
  public void testService() throws Exception {
    final ServiceReference[] services = ServiceUtils.getServiceReferences(Worker.class);
    assertTrue("no worker services started.", services.length > 0);
    for (final ServiceReference service : services) {
      final Worker worker = ServiceUtils.getService(service, Worker.class);
      if (worker instanceof WebCrawlerWorker) {
        assertEquals("webCrawler", worker.getName());
        return; // worker found, test ok.
      }
    }
    fail("WebCrawlerWorker not found");
  }

  /** crawl a single page with one link. */
  public void testCrawlPageWithOneLink() throws Exception {
    runWebCrawlerJob("links1.html");
    checkOutgoingLinks(BASEURL + "index.html");
    checkCrawledRecords(1, BASEURL + "links1.html");
  }

  /** test that a filter include pattern filters out an external link. */
  public void testCrawlPageWithFilteringExternalLink() throws Exception {
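    // build a filter configuration whose include pattern accepts only URLs below the test server's base URL,
    // so that the external link on the page is filtered out.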
    final AnyMap filterParams = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    filterParams.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq includePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.INCLUDE_PATTERNS, includePatterns);
    includePatterns.add(BASEURL + ".*");

    runWebCrawlerJob("links2.html", filterParams);
    checkOutgoingLinks(BASEURL + "index.html");
    checkCrawledRecords(1, BASEURL + "links2.html");
  }

  /** crawl a single page with links of several types: with fragment, non-html, with parameters (filtered out). */
  public void testCrawlPageWithMixedLinks() throws Exception {
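    // build a filter configuration that excludes all URLs containing query parameters.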
    final AnyMap filterParams = DataFactory.DEFAULT.createAnyMap();
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    filterParams.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    excludePatterns.add(".*" + "\\?" + ".*");

    runWebCrawlerJob("links3.html", filterParams);
    checkOutgoingLinks(BASEURL + "index.html", BASEURL + "plain.txt", BASEURL + "icon.png");
    checkCrawledRecords(1, BASEURL + "links3.html");
  }

  /** crawl a page with many links. */
  public void testCrawlPageWithManyLinks() throws Exception {
    runWebCrawlerJob("links11.html");
    checkOutgoingLinks(BASEURL + "page00.html", BASEURL + "page01.html", BASEURL + "page10.html",
        BASEURL + "page02.html", BASEURL + "page03.html", BASEURL + "page04.html", BASEURL + "page05.html",
        BASEURL + "page06.html", BASEURL + "page07.html", BASEURL + "page08.html", BASEURL + "page09.html");
    final List<Record> records = checkCrawledRecords(1, BASEURL + "links11.html");
    for (final Record record : records) {
      assertFalse(record.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
    }
  }

  /** crawl a plain-text resource. */
  public void testCrawlTextResource() throws Exception {
    runWebCrawlerJob("plain.txt");
    checkOutgoingLinks();
    final List<Record> records = checkCrawledRecords(1, BASEURL + "plain.txt");
    assertFalse(records.get(0).getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
  }

  /** crawl a binary resource. */
  public void testCrawlBinaryResource() throws Exception {
    runWebCrawlerJob("icon.png");
    checkOutgoingLinks();
    final List<Record> records = checkCrawledRecords(1, BASEURL + "icon.png");
    assertFalse(records.get(0).getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
  }

  /** crawl a compound (zip) resource: the record must be flagged as a compound and must not have attachments. */
  public void testCrawlCompoundWithOneLink() throws Exception {
    runWebCrawlerJob("document_in_compound.zip");
    checkOutgoingLinks();
    final List<Record> records = checkCrawledRecords(1, BASEURL + "document_in_compound.zip");
    for (final Record record : records) {
      assertTrue(record.getMetadata().getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
      assertFalse(record.hasAttachments());
      assertEquals("application/zip",
          record.getMetadata().getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE)));
      assertTrue(record.getMetadata().containsKey(_mapper.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED)));
    }
  }

  /** start a webcrawler job from template in config and wait until it is finished. */
  private String runWebCrawlerJob(final String startFile) throws Exception {
    return runWebCrawlerJob(startFile, null);
  }

  /** start a webcrawler job from template in config and wait until it is finished. */
  private String runWebCrawlerJob(final String startFile, final AnyMap filterParams) throws Exception {
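    // derive a new job definition with a unique name from the "crawlWeb" template and set the start URL
    // (and, if given, the filter parameters) for this test run.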
    final JobDefinition jobTemplate = _defPersistence.getJob(JOBNAME_CRAWLWEB + "Template");
    final String jobName = JOBNAME_CRAWLWEB + System.nanoTime();
    final AnyMap jobAny = jobTemplate.toAny(false);
    jobAny.put("name", jobName);
    jobAny.getMap("parameters").put("startUrl", BASEURL + startFile);
    if (filterParams != null) {
      jobAny.getMap("parameters").put("filters", filterParams);
    }
    final JobDefinition job = new JobDefinition(jobAny);
    _defPersistence.addJob(job);
    final String jobRunId = _jobRunEngine.startJob(jobName, JobRunMode.RUNONCE);
    waitForJobRunSucceeded(jobName, jobRunId, 10000);
    return jobRunId;
  }

  /** Waits for a job run to complete and asserts that it succeeded. */
  private void waitForJobRunSucceeded(final String jobName, final String jobId, final long maxWaitTime)
      throws Exception {
    final long sleepTime = 500L;
    final long millisStarted = System.currentTimeMillis();
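    // poll the completed job runs until the given run id shows up or maxWaitTime is exceeded.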
    while (true) {
      final Collection<String> completedIds = _jobRunDataProvider.getCompletedJobRunIds(jobName);
      if (completedIds.contains(jobId)) {
        final AnyMap runData = _jobRunDataProvider.getJobRunData(jobName, jobId);
        final String jobRunState = runData.getStringValue(JobManagerConstants.DATA_JOB_STATE);
        assertEquals(JobState.SUCCEEDED, JobState.valueOf(jobRunState));
        return;
      }
      assertTrue("Waited too long for job to complete", System.currentTimeMillis() - millisStarted <= maxWaitTime);
      Thread.sleep(sleepTime);
    }
  }

  /**
   * check number and content of linksToCrawl bulks. Expects each link in a separate bulk. Outgoing link records are
   * NOT mapped!
   */
  private void checkOutgoingLinks(final String... urls) throws Exception {
    final int expectedNumberOfBulks = urls.length;
    final List<StoreObject> bulks = getSortedBulks(BUCKET_LINKS, expectedNumberOfBulks);
    if (expectedNumberOfBulks > 0) {
      int recordCount = 0;
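      // read the link records from each bulk (one link per bulk, sorted by bulk id) and compare the URLs
      // with the expected ones.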
      for (final StoreObject bulk : bulks) {
        final BinaryObjectStreamIterator bulkReader =
            new BinaryObjectStreamIterator(_objectStore.readObject(STORENAME, bulk.getId()));
        try {
          while (bulkReader.hasNext()) {
            final Record linkRecord = bulkReader.next();
            System.out.println("Outgoing Link: " + linkRecord);
            assertTrue("Too many links", recordCount < urls.length);
            assertEquals(urls[recordCount],
                linkRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
            recordCount++;
          }
        } finally {
          bulkReader.close();
        }
      }
      assertEquals("Too few links.", urls.length, recordCount);
    }
  }

  /** check number and content of crawledRecords bulk. */
  private List<Record> checkCrawledRecords(final int expectedNumberOfBulks, final String... urls) throws Exception {
    final List<StoreObject> bulks = getSortedBulks(BUCKET_RECORDS, expectedNumberOfBulks);
    final List<Record> records = new ArrayList<Record>();
    if (expectedNumberOfBulks > 0) {
      int recordCount = 0;
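      // check source, URL, size, last-modified date, delta hash and mimetype of every crawled record and
      // collect the records so that callers can perform additional checks.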
      for (final StoreObject bulk : bulks) {
        final BinaryObjectStreamIterator bulkReader =
            new BinaryObjectStreamIterator(_objectStore.readObject(STORENAME, bulk.getId()));
        try {
          while (bulkReader.hasNext()) {
            final Record crawledRecord = bulkReader.next();
            System.out.println("Crawled Record: " + crawledRecord);
            assertTrue("Too many records", recordCount < urls.length);
            assertEquals("web", crawledRecord.getSource());
            final AnyMap metadata = crawledRecord.getMetadata();
            assertEquals(urls[recordCount], metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_URL)));
            assertTrue(metadata.get(_mapper.get(WebCrawlerConstants.ATTRIBUTE_SIZE)).isLong());
            assertTrue(metadata.get(_mapper.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED)).isDateTime());
            assertTrue(metadata.containsKey(ImportingConstants.ATTRIBUTE_DELTA_HASH));
            final String mimetype = metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE));
            assertTrue(metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE)).startsWith(
                mimetype));
            if ("text/html".equals(mimetype)) {
              assertTrue(crawledRecord.hasAttachment(_mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)));
              assertEquals(
                  crawledRecord.getAttachmentAsBytes(_mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)).length,
                  metadata.getLongValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_SIZE)).intValue());
            } else {
              assertFalse(crawledRecord.hasAttachment(_mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)));
            }
            recordCount++;
            records.add(crawledRecord);
          }
        } finally {
          bulkReader.close();
        }
      }
      assertEquals("Too few records.", urls.length, recordCount);
    }
    return records;
  }

  /** get bulks from bucket sorted by IDs. assert that the number is as expected. */
  private List<StoreObject> getSortedBulks(final String bucket, final int expectedNumberOfBulks)
      throws ObjectStoreException {
    final List<StoreObject> bulks = new ArrayList<StoreObject>(_objectStore.getStoreObjectInfos(STORENAME, bucket));
    assertEquals(expectedNumberOfBulks, bulks.size());
    Collections.sort(bulks, new Comparator<StoreObject>() {
      @Override
      public int compare(final StoreObject o1, final StoreObject o2) {
        return o1.getId().compareTo(o2.getId());
      }
    });
    return bulks;
  }

}