blob: 9e724896c8602020af63dd375a07806c41201520 [file] [log] [blame]
//------------------------------------------------------------------------------
// Copyright (c) 2005, 2006 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.common.html;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Properties;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
/**
 * Extracts the title, meta tags and text from a HTML file or source.
 * <p>
 * The HTML is parsed into a DOM tree with {@link Tidy}; the tree is then walked
 * to collect the document title, the <code>&lt;meta&gt;</code> entries and the
 * text of every text node outside <code>&lt;script&gt;</code> elements.
 * <p>
 * The results of the most recent parse are stored in instance fields and
 * returned by the getters. The fields are mutable and unsynchronized, so an
 * instance should not be shared between threads without external locking.
 *
 * @author Kelvin Low
 * @since 1.0
 */
public class HTMLParser {

	private static final int BUFFER_SIZE = 4096;

	private static final String HTML_SCRIPT_TAG = "script"; //$NON-NLS-1$

	private static final String HTML_TITLE_TAG = "title"; //$NON-NLS-1$

	private static final String HTML_META_TAG = "meta"; //$NON-NLS-1$

	private static final String HTML_NAME_ATTR = "name"; //$NON-NLS-1$

	private static final String HTML_HTTP_EQUIV_ATTR = "http-equiv"; //$NON-NLS-1$

	private static final String HTML_CONTENT_ATTR = "content"; //$NON-NLS-1$

	// Null when the Tidy parser could not be created/configured.
	private Tidy tidy;

	private String title;

	private String summary;

	private String text;

	private Properties metaTags;

	// Accumulates the text nodes visited by extract().
	private StringBuffer htmlText;

	// True once a non-empty <title> element has supplied the title; from then
	// on "title" attributes on other elements no longer overwrite it.
	private boolean titleElementFound;

	/**
	 * Creates a new instance.
	 * <p>
	 * If the Tidy parser cannot be created or configured, the failure is
	 * swallowed and the instance is left without a parser; parsing then yields
	 * empty results instead of throwing.
	 */
	public HTMLParser() {
		reset();
		try {
			tidy = new Tidy();
			tidy.setXHTML(true);
			tidy.setDropEmptyParas(true);
			tidy.setDropFontTags(true);
			tidy.setQuiet(true);
			tidy.setShowWarnings(false);
			tidy.setSmartIndent(false);
			tidy.setTidyMark(false);
			tidy.setWraplen(132);
			tidy.setIndentAttributes(false);
			tidy.setIndentContent(false);
			tidy.setSpaces(2);
			tidy.setInputEncoding("UTF-8"); //$NON-NLS-1$
			tidy.setOutputEncoding("UTF-8"); //$NON-NLS-1$
		} catch (Exception e) {
			// Deliberate best effort: run without a parser rather than fail
			// construction.
			tidy = null;
		}
	}

	/**
	 * Parses the given HTML file, which is read as UTF-8.
	 * <p>
	 * Any results of a previous parse are discarded first. If no parser is
	 * available, or the file is <code>null</code>, does not exist or is not
	 * readable, the results stay empty and the method returns silently.
	 *
	 * @param file
	 *            the HTML file to parse
	 * @throws Exception
	 *             if the file cannot be read or the HTML cannot be parsed
	 */
	public void parse(File file) throws Exception {
		// Reset up front so a skipped file never exposes the previous
		// document's title/text/meta tags through the getters.
		reset();
		if (tidy == null || file == null || !file.exists() || !file.canRead()) {
			return;
		}

		StringBuffer textBuffer = new StringBuffer(BUFFER_SIZE);
		FileInputStream fis = new FileInputStream(file);
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(fis,
					"UTF-8")); //$NON-NLS-1$
			char[] buffer = new char[BUFFER_SIZE];
			int charsRead;
			while ((charsRead = br.read(buffer, 0, BUFFER_SIZE)) > 0) {
				textBuffer.append(buffer, 0, charsRead);
			}
		} finally {
			// Always release the file handle, including when reading fails.
			// The readers only wrap this stream, so closing it is sufficient.
			try {
				fis.close();
			} catch (IOException e) {
				// Nothing useful can be done if close fails; the content (if
				// any) has already been read.
			}
		}

		parse(textBuffer.toString());
	}

	/**
	 * Parses the given HTML source.
	 * <p>
	 * Any results of a previous parse are discarded first. If the source is
	 * <code>null</code> or empty, or no parser is available, the results stay
	 * empty.
	 *
	 * @param htmlSource
	 *            the HTML source text
	 * @throws Exception
	 *             if the HTML cannot be parsed
	 */
	protected void parse(String htmlSource) throws Exception {
		reset();
		Document doc = getDocument(htmlSource);
		if (doc != null) {
			extract(doc.getChildNodes());
			text = htmlText.toString();
		}
	}

	/**
	 * Returns the title text.
	 *
	 * @return the text of the document's <code>&lt;title&gt;</code> element
	 *         if it has one with non-blank text; otherwise the value of the
	 *         last <code>title</code> attribute encountered, or an empty
	 *         string
	 */
	public String getTitle() {
		return title;
	}

	/**
	 * Returns the HTML meta tags.
	 *
	 * @return the meta entries of the last parsed document, keyed by the
	 *         <code>name</code> (or, failing that, <code>http-equiv</code>)
	 *         attribute of each <code>&lt;meta&gt;</code> element and mapped to
	 *         its <code>content</code> attribute; empty if there are none
	 */
	public Properties getMetaTags() {
		return metaTags;
	}

	/**
	 * Returns the summary.
	 *
	 * @return the summary; nothing in this class assigns one, so this is
	 *         always an empty string
	 */
	public String getSummary() {
		return summary;
	}

	/**
	 * Returns the body text.
	 *
	 * @return the values of all text nodes outside <code>&lt;script&gt;</code>
	 *         elements, each followed by a single space, in document order;
	 *         empty if nothing was parsed
	 */
	public String getText() {
		return text;
	}

	/**
	 * Returns the DOM document for the given HTML source.
	 *
	 * @param html
	 *            the HTML source text
	 * @return the parsed document, or <code>null</code> if the source is
	 *         <code>null</code> or empty or no parser is available
	 * @throws Exception
	 *             if the HTML cannot be parsed
	 */
	protected Document getDocument(String html) throws Exception {
		if (tidy == null || html == null || html.length() == 0) {
			return null;
		}
		ByteArrayInputStream input = new ByteArrayInputStream(html
				.getBytes("UTF-8")); //$NON-NLS-1$
		// Only the DOM is of interest: Tidy's diagnostics and its serialized
		// output are both sent to in-memory sinks that are discarded.
		ByteArrayOutputStream output = new ByteArrayOutputStream();
		tidy.setErrout(new PrintWriter(new StringWriter()));
		return tidy.parseDOM(input, output);
	}

	/**
	 * Extracts the title, meta tags and body text from the given nodes.
	 * <p>
	 * Element nodes are processed recursively (except
	 * <code>&lt;script&gt;</code> elements, which are skipped together with
	 * their content); the value of every text node is appended to the body
	 * text followed by a space. Other node types are ignored.
	 *
	 * @param nodes
	 *            the nodes to process; <code>null</code> is treated as empty
	 */
	protected void extract(NodeList nodes) {
		if (nodes == null) {
			return;
		}
		for (int i = 0; i < nodes.getLength(); i++) {
			Node node = nodes.item(i);
			if (node == null) {
				continue;
			}
			switch (node.getNodeType()) {
			case Node.ELEMENT_NODE:
				extractElement(node);
				break;
			case Node.TEXT_NODE:
				appendText(node.getNodeValue());
				break;
			default:
				break;
			}
		}
	}

	/**
	 * Clears the results of any previous parse.
	 */
	private void reset() {
		title = ""; //$NON-NLS-1$
		summary = ""; //$NON-NLS-1$
		text = ""; //$NON-NLS-1$
		metaTags = new Properties();
		htmlText = new StringBuffer(1024);
		titleElementFound = false;
	}

	/**
	 * Processes a single element: records title/meta information, then
	 * descends into its children.
	 */
	private void extractElement(Node element) {
		String elementName = element.getNodeName();
		if (HTML_SCRIPT_TAG.equals(elementName)) {
			// Script elements contribute neither attributes nor text.
			return;
		}

		if (HTML_TITLE_TAG.equals(elementName)) {
			// Only the first non-empty <title> element defines the title.
			if (!titleElementFound) {
				String titleText = getDirectText(element).trim();
				if (titleText.length() > 0) {
					title = titleText;
					titleElementFound = true;
				}
			}
		} else if (HTML_META_TAG.equals(elementName)) {
			extractMetaElement(element);
		}

		extractLegacyAttributes(element.getAttributes());

		// The children are still visited for <title>, so its text remains part
		// of the body text exactly as before.
		NodeList childNodes = element.getChildNodes();
		if (childNodes != null && childNodes.getLength() > 0) {
			extract(childNodes);
		}
	}

	/**
	 * Records a <code>&lt;meta name|http-equiv="..." content="..."&gt;</code>
	 * element in the meta tags; elements lacking a key or content are ignored.
	 */
	private void extractMetaElement(Node metaElement) {
		NamedNodeMap attrs = metaElement.getAttributes();
		if (attrs == null) {
			return;
		}
		String key = getAttributeValue(attrs, HTML_NAME_ATTR);
		if (key == null || key.length() == 0) {
			key = getAttributeValue(attrs, HTML_HTTP_EQUIV_ATTR);
		}
		String content = getAttributeValue(attrs, HTML_CONTENT_ATTR);
		if (key != null && key.length() > 0 && content != null) {
			metaTags.setProperty(key, content);
		}
	}

	/**
	 * Keeps the original attribute-based behaviour: an attribute named
	 * <code>title</code> supplies the title (unless a <code>&lt;title&gt;</code>
	 * element already did) and an attribute named <code>meta</code> is stored
	 * in the meta tags under that name.
	 */
	private void extractLegacyAttributes(NamedNodeMap attrs) {
		if (attrs == null) {
			return;
		}
		for (int j = 0; j < attrs.getLength(); j++) {
			Node attrNode = attrs.item(j);
			if (attrNode == null) {
				continue;
			}
			String attrNodeName = attrNode.getNodeName();
			String attrNodeValue = attrNode.getNodeValue();
			if (attrNodeValue == null) {
				// Properties rejects null values and a null title is useless.
				continue;
			}
			if (HTML_TITLE_TAG.equals(attrNodeName)) {
				if (!titleElementFound) {
					title = attrNodeValue;
				}
			} else if (HTML_META_TAG.equals(attrNodeName)) {
				metaTags.setProperty(attrNodeName, attrNodeValue);
			}
		}
	}

	/**
	 * Appends a text node value and a separating space to the body text;
	 * <code>null</code> values are skipped.
	 */
	private void appendText(String value) {
		if (value != null) {
			htmlText.append(value).append(' ');
		}
	}

	/**
	 * Returns the value of the named attribute, or <code>null</code> if the
	 * attribute is absent.
	 */
	private static String getAttributeValue(NamedNodeMap attrs, String name) {
		Node attr = attrs.getNamedItem(name);
		return attr == null ? null : attr.getNodeValue();
	}

	/**
	 * Returns the concatenated values of the text nodes that are direct
	 * children of the given node.
	 */
	private static String getDirectText(Node node) {
		StringBuffer buf = new StringBuffer();
		NodeList children = node.getChildNodes();
		if (children != null) {
			for (int i = 0; i < children.getLength(); i++) {
				Node child = children.item(i);
				if (child != null && child.getNodeType() == Node.TEXT_NODE
						&& child.getNodeValue() != null) {
					buf.append(child.getNodeValue());
				}
			}
		}
		return buf.toString();
	}
}