/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.test; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import java.util.Collections; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import org.eclipse.smila.datamodel.DataFactory; | |
import org.eclipse.smila.datamodel.Record; | |
import org.eclipse.smila.importing.crawler.web.LinkExtractor; | |
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants; | |
import org.eclipse.smila.importing.crawler.web.extractor.DefaultLinkExtractor; | |
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlNeko; | |
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlSoup; | |
/** Tests for the {@link DefaultLinkExtractor} class. */
public class TestDefaultLinkExtractor extends WebExtractorTestBase { | |
private static final String BASE_URI = "http://www.attensity.com"; | |
private static final String BASE_URI_WITH_PATH = BASE_URI + "/p"; | |
private static final String BASE_URI_WITH_PATH_AND_FILE = BASE_URI_WITH_PATH + "/test.html"; | |
private DefaultLinkExtractor _extractor; | |
@Override | |
protected void setUp() throws Exception { | |
super.setUp(); | |
_extractor = (DefaultLinkExtractor) getService(LinkExtractor.class); | |
} | |
/** test for extracting absolute link. */ | |
public void testSimpleHref() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link = BASE_URI_WITH_PATH + "/link.html"; | |
testData.put(link, link); | |
final String htmlString = "<html> <a href=\"" + link + "\"/> </html>"; | |
final Record inputRecord = createInputRecord(htmlString); | |
doTest(inputRecord, testData); | |
} | |
/** test for extracting absolute link with incomplete html. */ | |
public void testSimpleHrefIncompleteHtml() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link = BASE_URI_WITH_PATH + "/link.htm"; | |
testData.put(link, link); | |
final String htmlString = "<html> <a href=\"" + link + "\""; | |
final Record inputRecord = createInputRecord(htmlString); | |
doTest(inputRecord, testData); | |
} | |
/** test for extracting relative links. */ | |
public void testRelativeLinks() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
// key: test link, value: expected absolute link | |
testData.put("/link1.html", BASE_URI + "/link1.html"); | |
testData.put("link2.html", BASE_URI_WITH_PATH + "/link2.html"); | |
testData.put("../link3.htm", BASE_URI + "/link3.htm"); | |
testData.put("./link4.htm", BASE_URI_WITH_PATH + "/link4.htm"); | |
final String htmlString = createHtmlString(testData); | |
final Record inputRecord = createInputRecord(htmlString); | |
doTest(inputRecord, testData); | |
} | |
/** | |
* tests that links can be extracted even if HTML is malformed. This test succeeds for tagsoup but fails for nekohtml. | |
*/ | |
public void testMalformedHtml() throws Exception { | |
final String link1 = BASE_URI_WITH_PATH + "/link1.html"; | |
final String link2 = BASE_URI_WITH_PATH + "/link2.html"; | |
final Map<String, String> testData = new HashMap<String, String>(); | |
testData.put(link1, link1); | |
testData.put(link2, link2); | |
final String htmlString = "<p> <title> </p> " // | |
+ "<a href=\"" + link1 + "\"> </A>" // | |
+ "<p> </title> </p> </p>" // | |
+ "<A HrEF=\"" + link2 + "\"> </a>"; // | |
final Record inputRecord = createInputRecord(htmlString); | |
// this only works for tagsoup! | |
_extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup()); | |
final Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext); | |
checkResults(new ArrayList<String>(testData.values()), result); | |
} | |
/** test for extracting links from 'FRAME' element. */ | |
public void testFrameLink() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link1 = BASE_URI_WITH_PATH + "/navigation_Left.htm"; | |
final String link2 = BASE_URI_WITH_PATH + "/introduction.htm"; | |
testData.put(link1, link1); | |
testData.put(link2, link2); | |
final String htmlString = "<frameset> " // | |
+ "<frame name=\"Navigation_Frame\" src=\"navigation_Left.htm\" marginheight=\"0\" marginwidth=\"0\"/>" // | |
+ "<FRAME name=\"ContentFrame\" SRC=\"introduction.htm\" marginheight=\"0\" marginwidth=\"0\"/>" // | |
+ "</frameset>"; | |
final Record inputRecord = createInputRecord(htmlString); | |
// this only works for tagsoup! | |
_extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup()); | |
final Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext); | |
checkResults(new ArrayList<String>(testData.values()), result); | |
} | |
/** test for extracting links from 'IMG' element. */ | |
public void testImageLink() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link1 = BASE_URI_WITH_PATH + "/icon.gif"; | |
final String link2 = BASE_URI_WITH_PATH + "/images/picture.jpg"; | |
testData.put(link1, link1); | |
testData.put(link2, link2); | |
final String htmlString = "<html><body><img src=\"" + link1 + "\"> <img src=\"" + link2 + "\"></body></html>"; | |
final Record inputRecord = createInputRecord(htmlString); | |
// this only works for tagsoup! | |
_extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup()); | |
final Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext); | |
checkResults(new ArrayList<String>(testData.values()), result); | |
} | |
/** test with unescaped link. Hint: We can't extract unescaped relative links at the moment */ | |
public void testUnescapedLink() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link1 = BASE_URI_WITH_PATH + "/test1.html"; | |
final String linkUnescapedRel = "this link is not escaped"; // this can not be extracted! | |
final String linkUnescapedAbs = "http://www.attensity.com/this link is not escaped/?query=a b"; | |
final String link2 = BASE_URI_WITH_PATH + "/test2.html"; | |
testData.put(link1, link1); | |
testData.put(linkUnescapedAbs, "http://www.attensity.com/this%20link%20is%20not%20escaped/?query=a%20b"); | |
testData.put(link2, link2); | |
final String htmlString = "<title>" // | |
+ "<a href=\"" + link1 + "\"> </a>" // | |
+ "<a href=\"" + linkUnescapedRel + "\"> </a>" // | |
+ "<a href=\"" + linkUnescapedAbs + "\"> </a>" // | |
+ "<a href=\"" + link2 + "\"> </a>"; // | |
final Record inputRecord = createInputRecord(htmlString); | |
doTest(inputRecord, testData); | |
} | |
/** test with escaped link. */ | |
public void testEscapedLink() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link1 = BASE_URI_WITH_PATH + "/test1.html"; | |
final String linkEscapedAbs = "http://www.attensity.com/p/this%20link%20is%20escaped/?query=a%20b"; | |
final String linkEscapedRel = "this%20link%20is%20escaped/?query=a%20b"; | |
final String link2 = BASE_URI_WITH_PATH + "/test2.html"; | |
testData.put(link1, link1); | |
testData.put(linkEscapedAbs, linkEscapedAbs); | |
testData.put(linkEscapedRel, linkEscapedAbs); | |
testData.put(link2, link2); | |
final String htmlString = "<title>" // | |
+ "<a href=\"" + link1 + "\"> </a>" // | |
+ "<a href=\"" + linkEscapedAbs + "\"> </a>" // | |
+ "<a href=\"" + linkEscapedRel + "\"> </a>" // | |
+ "<a href=\"" + link2 + "\"> </a>"; // | |
final Record inputRecord = createInputRecord(htmlString); | |
doTest(inputRecord, testData); | |
} | |
/** test link normlization. */ | |
public void testLinkNormalization() throws Exception { | |
final Map<String, String> testData = new HashMap<String, String>(); | |
final String link1 = "HTtp://WWW.Attensity.com:8080/Test1.html"; // -> scheme and host are converted to lower case | |
final String link2 = "http://www.attensity.com#fragment"; // -> fragment parts are removed | |
final String link3 = "http://www.attensity.com/?Query=q&query2=q"; // -> query parts remain | |
final String link4 = "http://www.attensity.com:80/port"; // -> default port 80 is removed | |
final String link5 = "http://www.attensity.com/test unescaped"; // -> link will be escaped | |
final String link6 = "http://www.attensity.com/test%20escaped"; // -> escaped link remains | |
final String link7 = "http://www.attensity.com/path/../path2"; // -> path is normalized | |
final String link8 = "javascript:void(0);"; // invalid | |
final String link9 = "mailto:andreas.weber@empolis.com"; // invalid | |
testData.put(link1, "http://www.attensity.com:8080/Test1.html"); | |
testData.put(link2, "http://www.attensity.com"); | |
testData.put(link3, link3); | |
testData.put(link4, "http://www.attensity.com/port"); | |
testData.put(link5, "http://www.attensity.com/test%20unescaped"); | |
testData.put(link6, link6); | |
testData.put(link7, "http://www.attensity.com/path2"); | |
final String htmlString = "<title>" // | |
+ "<a href=\"" + link1 + "\"> </a>" // | |
+ "<a href=\"" + link2 + "\"> </a>" // | |
+ "<a href=\"" + link3 + "\"> </a>" // | |
+ "<a href=\"" + link4 + "\"> </a>" // | |
+ "<a href=\"" + link5 + "\"> </a>" // | |
+ "<a href=\"" + link6 + "\"> </a>" // | |
+ "<a href=\"" + link7 + "\"> </a>" // | |
+ "<a href=\"" + link8 + "\"> </a>" // | |
+ "<a href=\"" + link9 + "\"> </a>"; // | |
final Record inputRecord = createInputRecord(htmlString); | |
doTest(inputRecord, testData); | |
} | |
/** do testing with nekohtml and tagsoup html parser. */ | |
private void doTest(final Record inputRecord, final Map<String, String> testData) throws Exception { | |
// neko | |
_extractor.setLinkExtractorHtml(new LinkExtractorHtmlNeko()); | |
Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext); | |
checkResults(new ArrayList<String>(testData.values()), result); | |
// tagsoup | |
_extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup()); | |
result = _extractor.extractLinks(inputRecord, _webCrawlingContext); | |
checkResults(new ArrayList<String>(testData.values()), result); | |
} | |
/** helper method to create html input from map with URIs. */ | |
private String createHtmlString(final Map<String, String> uriMap) { | |
String s = "<html>"; | |
for (final String uri : uriMap.keySet()) { | |
s = s + "<a href=\"" + uri + "\"/>"; | |
s = s + "<irgendein> html <schrott> der dazwischen <steht>"; | |
} | |
return s; | |
} | |
/** helper method to create input record from html. */ | |
private Record createInputRecord(final String htmlContent) { | |
final Record r = DataFactory.DEFAULT.createRecord(); | |
final byte[] htmlBytes = htmlContent.getBytes(); | |
r.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, htmlBytes); | |
r.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASE_URI_WITH_PATH_AND_FILE); | |
return r; | |
} | |
/** helper method to check the extracted links. */ | |
private void checkResults(final List<String> expectedLinks, final Collection<Record> actualRecords) { | |
while (expectedLinks.contains(null)) { | |
expectedLinks.remove(null); | |
} | |
assertEquals(expectedLinks.size(), actualRecords.size()); | |
final List<String> actualLinks = new ArrayList<String>(); | |
for (final Record r : actualRecords) { | |
actualLinks.add(r.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL)); | |
} | |
Collections.sort(expectedLinks); | |
Collections.sort(actualLinks); | |
assertEquals(expectedLinks, actualLinks); | |
} | |
} |