// blob: aadfdb8515751c5b2537d1e2c2e247d0b175fee9 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web.test;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.extractor.DefaultLinkExtractor;
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlNeko;
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlSoup;
/**
 * Test for {@link DefaultLinkExtractor} class.
 *
 * <p>
 * Every test builds an HTML snippet, wraps it into an input record whose URL is
 * {@link #BASE_URI_WITH_PATH_AND_FILE}, runs the extractor and compares the URLs of the returned records with the
 * expected absolute links. Most tests run with both HTML parsers (nekohtml and tagsoup); a few run with tagsoup only.
 */
public class TestDefaultLinkExtractor extends WebExtractorTestBase {

  private static final String BASE_URI = "http://www.attensity.com";

  private static final String BASE_URI_WITH_PATH = BASE_URI + "/p";

  private static final String BASE_URI_WITH_PATH_AND_FILE = BASE_URI_WITH_PATH + "/test.html";

  /** Fixed encoding for the HTML attachment, so that the test input does not depend on the platform default charset. */
  private static final java.nio.charset.Charset HTML_CHARSET = java.nio.charset.Charset.forName("UTF-8");

  private DefaultLinkExtractor _extractor;

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _extractor = (DefaultLinkExtractor) getService(LinkExtractor.class);
  }

  /** test for extracting absolute link. */
  public void testSimpleHref() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link = BASE_URI_WITH_PATH + "/link.html";
    testData.put(link, link);
    final String htmlString = "<html> <a href=\"" + link + "\"/> </html>";
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /** test for extracting absolute link with incomplete html (the anchor tag and the document are never closed). */
  public void testSimpleHrefIncompleteHtml() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link = BASE_URI_WITH_PATH + "/link.htm";
    testData.put(link, link);
    final String htmlString = "<html> <a href=\"" + link + "\"";
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /** test for extracting relative links, which are expected to be resolved against the URL of the input record. */
  public void testRelativeLinks() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    // key: test link, value: expected absolute link
    testData.put("/link1.html", BASE_URI + "/link1.html");
    testData.put("link2.html", BASE_URI_WITH_PATH + "/link2.html");
    testData.put("../link3.htm", BASE_URI + "/link3.htm");
    testData.put("./link4.htm", BASE_URI_WITH_PATH + "/link4.htm");
    final String htmlString = createHtmlString(testData);
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /**
   * tests that links can be extracted even if HTML is malformed. This test succeeds for tagsoup but fails for nekohtml.
   */
  public void testMalformedHtml() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/link1.html";
    final String link2 = BASE_URI_WITH_PATH + "/link2.html";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(link2, link2);
    final String htmlString = "<p> <title> </p> " //
      + "<a href=\"" + link1 + "\"> </A>" //
      + "<p> </title> </p> </p>" //
      + "<A HrEF=\"" + link2 + "\"> </a>"; //
    final Record inputRecord = createInputRecord(htmlString);
    // this only works for tagsoup!
    doTestTagsoupOnly(inputRecord, testData);
  }

  /** test for extracting links from 'FRAME' element. */
  public void testFrameLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/navigation_Left.htm";
    final String link2 = BASE_URI_WITH_PATH + "/introduction.htm";
    testData.put(link1, link1);
    testData.put(link2, link2);
    // the frames reference their sources relatively, the expected links above are the absolute forms
    final String htmlString = "<frameset> " //
      + "<frame name=\"Navigation_Frame\" src=\"navigation_Left.htm\" marginheight=\"0\" marginwidth=\"0\"/>" //
      + "<FRAME name=\"ContentFrame\" SRC=\"introduction.htm\" marginheight=\"0\" marginwidth=\"0\"/>" //
      + "</frameset>";
    final Record inputRecord = createInputRecord(htmlString);
    // this only works for tagsoup!
    doTestTagsoupOnly(inputRecord, testData);
  }

  /** test for extracting links from 'IMG' element. */
  public void testImageLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/icon.gif";
    final String link2 = BASE_URI_WITH_PATH + "/images/picture.jpg";
    testData.put(link1, link1);
    testData.put(link2, link2);
    final String htmlString = "<html><body><img src=\"" + link1 + "\"> <img src=\"" + link2 + "\"></body></html>";
    final Record inputRecord = createInputRecord(htmlString);
    // this only works for tagsoup!
    doTestTagsoupOnly(inputRecord, testData);
  }

  /** test with unescaped link. Hint: We can't extract unescaped relative links at the moment */
  public void testUnescapedLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/test1.html";
    final String linkUnescapedRel = "this link is not escaped"; // this can not be extracted!
    final String linkUnescapedAbs = "http://www.attensity.com/this link is not escaped/?query=a b";
    final String link2 = BASE_URI_WITH_PATH + "/test2.html";
    testData.put(link1, link1);
    testData.put(linkUnescapedAbs, "http://www.attensity.com/this%20link%20is%20not%20escaped/?query=a%20b");
    testData.put(link2, link2);
    // linkUnescapedRel is part of the HTML but deliberately not part of the expected links
    final String htmlString = "<title>" //
      + "<a href=\"" + link1 + "\"> </a>" //
      + "<a href=\"" + linkUnescapedRel + "\"> </a>" //
      + "<a href=\"" + linkUnescapedAbs + "\"> </a>" //
      + "<a href=\"" + link2 + "\"> </a>"; //
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /** test with escaped link. */
  public void testEscapedLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/test1.html";
    final String linkEscapedAbs = "http://www.attensity.com/p/this%20link%20is%20escaped/?query=a%20b";
    final String linkEscapedRel = "this%20link%20is%20escaped/?query=a%20b";
    final String link2 = BASE_URI_WITH_PATH + "/test2.html";
    testData.put(link1, link1);
    testData.put(linkEscapedAbs, linkEscapedAbs);
    // the relative form is expected to yield the same absolute link, so that link is expected twice
    testData.put(linkEscapedRel, linkEscapedAbs);
    testData.put(link2, link2);
    final String htmlString = "<title>" //
      + "<a href=\"" + link1 + "\"> </a>" //
      + "<a href=\"" + linkEscapedAbs + "\"> </a>" //
      + "<a href=\"" + linkEscapedRel + "\"> </a>" //
      + "<a href=\"" + link2 + "\"> </a>"; //
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /** test link normalization. */
  public void testLinkNormalization() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = "HTtp://WWW.Attensity.com:8080/Test1.html"; // -> scheme and host are converted to lower case
    final String link2 = "http://www.attensity.com#fragment"; // -> fragment parts are removed
    final String link3 = "http://www.attensity.com/?Query=q&query2=q"; // -> query parts remain
    final String link4 = "http://www.attensity.com:80/port"; // -> default port 80 is removed
    final String link5 = "http://www.attensity.com/test unescaped"; // -> link will be escaped
    final String link6 = "http://www.attensity.com/test%20escaped"; // -> escaped link remains
    final String link7 = "http://www.attensity.com/path/../path2"; // -> path is normalized
    final String link8 = "javascript:void(0);"; // invalid
    final String link9 = "mailto:andreas.weber@empolis.com"; // invalid
    testData.put(link1, "http://www.attensity.com:8080/Test1.html");
    testData.put(link2, "http://www.attensity.com");
    testData.put(link3, link3);
    testData.put(link4, "http://www.attensity.com/port");
    testData.put(link5, "http://www.attensity.com/test%20unescaped");
    testData.put(link6, link6);
    testData.put(link7, "http://www.attensity.com/path2");
    // link8 and link9 are part of the HTML but not of the expected links
    final String htmlString = "<title>" //
      + "<a href=\"" + link1 + "\"> </a>" //
      + "<a href=\"" + link2 + "\"> </a>" //
      + "<a href=\"" + link3 + "\"> </a>" //
      + "<a href=\"" + link4 + "\"> </a>" //
      + "<a href=\"" + link5 + "\"> </a>" //
      + "<a href=\"" + link6 + "\"> </a>" //
      + "<a href=\"" + link7 + "\"> </a>" //
      + "<a href=\"" + link8 + "\"> </a>" //
      + "<a href=\"" + link9 + "\"> </a>"; //
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /**
   * do testing with nekohtml and tagsoup html parser.
   *
   * @param inputRecord
   *          record containing the HTML to extract links from
   * @param testData
   *          key: link as written in the HTML, value: expected absolute link
   */
  private void doTest(final Record inputRecord, final Map<String, String> testData) throws Exception {
    // neko
    _extractor.setLinkExtractorHtml(new LinkExtractorHtmlNeko());
    final Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext);
    checkResults(new ArrayList<String>(testData.values()), result);
    // tagsoup
    doTestTagsoupOnly(inputRecord, testData);
  }

  /**
   * do testing with the tagsoup html parser only.
   *
   * @param inputRecord
   *          record containing the HTML to extract links from
   * @param testData
   *          key: link as written in the HTML, value: expected absolute link
   */
  private void doTestTagsoupOnly(final Record inputRecord, final Map<String, String> testData) throws Exception {
    _extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup());
    final Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext);
    checkResults(new ArrayList<String>(testData.values()), result);
  }

  /**
   * helper method to create html input from map with URIs. Only the keys of the map are used; each one becomes the
   * href of an anchor, followed by some junk markup. The document is intentionally not closed.
   */
  private String createHtmlString(final Map<String, String> uriMap) {
    final StringBuilder html = new StringBuilder("<html>");
    for (final String uri : uriMap.keySet()) {
      html.append("<a href=\"").append(uri).append("\"/>");
      // German filler text, roughly "some html junk that sits in between"
      html.append("<irgendein> html <schrott> der dazwischen <steht>");
    }
    return html.toString();
  }

  /**
   * helper method to create input record from html. The HTML is stored as content attachment and the record URL is
   * set to {@link #BASE_URI_WITH_PATH_AND_FILE}.
   */
  private Record createInputRecord(final String htmlContent) {
    final Record r = DataFactory.DEFAULT.createRecord();
    // encode explicitly: getBytes() without argument would use the platform default charset
    final byte[] htmlBytes = htmlContent.getBytes(HTML_CHARSET);
    r.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, htmlBytes);
    r.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASE_URI_WITH_PATH_AND_FILE);
    return r;
  }

  /**
   * helper method to check the extracted links. Null entries in the expected links are ignored, order does not matter.
   *
   * @param expectedLinks
   *          expected absolute links, not modified by this method
   * @param actualRecords
   *          records returned by the extractor, their URL attribute is compared with the expected links
   */
  private void checkResults(final List<String> expectedLinks, final Collection<Record> actualRecords) {
    // work on a copy so that the caller's list is neither filtered nor reordered
    final List<String> expected = new ArrayList<String>(expectedLinks);
    expected.removeAll(Collections.singleton(null));
    final List<String> actual = new ArrayList<String>(actualRecords.size());
    for (final Record r : actualRecords) {
      actual.add(r.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
    }
    Collections.sort(expected);
    Collections.sort(actual);
    // comparing the sorted lists also covers the number of links and reports the differing links on failure
    assertEquals(expected, actual);
  }
}