blob: 5d33a846a545485dd39641b0efffaeedc585699d [file] [log] [blame]
/***********************************************************************************************************************
* Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying
* materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution,
* and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Ivan Churkin (brox IT Solutions GmbH) - initial creator Sebastian Voigt (brox IT Solutions GmbH)
**********************************************************************************************************************/
package org.eclipse.smila.connectivity.framework.crawler.web.test;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.crawler.web.WebCrawler;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.FieldAttributeType;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.MetaType;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Parser;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParserManager;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParserManagerImpl;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.html.HtmlParser;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.js.JavascriptParserImpl;
import org.eclipse.smila.connectivity.framework.schema.ConfigurationLoader;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.datamodel.record.Record;
import org.eclipse.smila.test.DeclarativeServiceTestCase;
/**
 * Integration test for the {@link WebCrawler}: initializes the crawler with the configuration loaded from
 * {@code ConfigExample.xml}, pulls all record batches and checks that every record carries the expected metadata
 * attributes and the content attachment.
 */
public class TestCrawler extends DeclarativeServiceTestCase {

  /**
   * The crawler under test.
   */
  private WebCrawler _crawler;

  /**
   * Parser manager handed to the crawler in {@link #setUp()}.
   */
  private ParserManager _parserManager;

  /**
   * HTML parser registered with the parser manager.
   */
  private Parser _htmlParser;

  /**
   * Javascript parser registered with the parser manager.
   */
  private Parser _javascriptParser;

  /**
   * Logger used to report crawl duration and record count.
   */
  private final Log _log = LogFactory.getLog(getClass());

  /**
   * Registers the crawler, the parser manager and both parsers as services, then wires the parsers into the parser
   * manager and the parser manager into the crawler.
   *
   * @throws Exception
   *           if registering one of the services fails
   * @see junit.framework.TestCase#setUp()
   */
  @Override
  protected void setUp() throws Exception {
    // register the services explicitly, because their configuration uses immediate="false"
    _crawler = registerService(new WebCrawler(), null, WebCrawler.class, 0);
    _parserManager = registerService(new ParserManagerImpl(), null, ParserManagerImpl.class, 0);
    _htmlParser = registerService(new HtmlParser(), null, HtmlParser.class, 0);
    _javascriptParser = registerService(new JavascriptParserImpl(), null, JavascriptParserImpl.class, 0);

    assertNotNull(_crawler);
    assertNotNull(_parserManager);
    assertNotNull(_htmlParser);
    assertNotNull(_javascriptParser);

    _parserManager.addParser(_htmlParser);
    _parserManager.addParser(_javascriptParser);
    _crawler.setParserManager(_parserManager);
  }

  /**
   * Crawls with the {@code ConfigExample.xml} configuration and asserts that each returned record has the URL, title,
   * meta data and response header attributes as well as the content attachment. The crawler is closed even when an
   * assertion fails or the crawl throws.
   *
   * @throws Exception
   *           if loading the configuration or crawling fails
   */
  public void testCrawler() throws Exception {
    final DataSourceConnectionConfig config =
      ConfigurationLoader.unmarshall(TestConfigLoad.class.getResourceAsStream("ConfigExample.xml"));
    assertNotNull(config);
    _crawler.initialize(config);

    final long start = System.currentTimeMillis();
    int counter = 0;
    try {
      DataReference[] diObjects;
      // the crawl is finished as soon as getNext() returns null or an empty batch
      while ((diObjects = _crawler.getNext()) != null && diObjects.length > 0) {
        for (final DataReference diObject : diObjects) {
          final Record record = diObject.getRecord();
          assertNotNull(record);
          assertNotNull(record.getMetadata().getAttribute(FieldAttributeType.URL.value()));
          assertNotNull(record.getMetadata().getAttribute(FieldAttributeType.TITLE.value()));
          assertNotNull(record.getAttachment(FieldAttributeType.CONTENT.value()));
          assertNotNull(record.getMetadata().getAttribute(MetaType.META_DATA.value()));
          assertNotNull(record.getMetadata().getAttribute(MetaType.RESPONSE_HEADER.value()));
          assertNotNull(record.getMetadata().getAttribute(MetaType.META_DATA_WITH_RESPONSE_HEADER_FALL_BACK.value()));
          counter++;
        }
      }
    } finally {
      // always release the crawler, otherwise a failed assertion would leave it open
      _crawler.close();
    }

    _log.info("Total time:" + (System.currentTimeMillis() - start));
    _log.info("Total records: " + counter);
  }
}