blob: 80e27caaea46e1082e4854e307e2e4121357f488 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2000, 2003 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Common Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/cpl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*******************************************************************************/
package org.eclipse.help.internal.search;
import java.io.*;
import java.net.*;
import java.util.*;
import org.apache.lucene.demo.html.*;
import org.eclipse.help.internal.*;
/**
* Parses HTML documents.
* Extracts document encoding from header,
* and delegates to lucene HTML parser for extraction
* of title, summary, and content.
*/
public class HTMLDocParser {
    /**
     * Maximum number of characters, counted from the beginning of the HTML
     * document, that are searched for the charset declaration.
     */
    private static final int MAX_OFFSET = 2048;

    // element, attribute and value constants (all compared case-insensitively)
    static final String ELEMENT_META = "META";
    static final String ELEMENT_BODY = "body";
    static final String ELEMENT_HEAD = "head";
    static final String ATTRIBUTE_HTTP = "http-equiv";
    static final String ATTRIBUTE_HTTP_VALUE = "content-type";
    static final String ATTRIBUTE_CONTENT = "content";
    // name of the media type parameter that carries the encoding
    private static final String PARAMETER_CHARSET = "charset";

    // states for parsing elements
    static final int STATE_ELEMENT_START = 0;
    static final int STATE_ELEMENT_AFTER_LT = 1;
    static final int STATE_ELEMENT_AFTER_LT_SLASH = 2;
    static final int STATE_ELEMENT_META = 3;

    // states for parsing HTTP-EQUIV attribute
    static final int STATE_HTTP_START = 0;
    static final int STATE_HTTP_AFTER_NAME = 1;
    static final int STATE_HTTP_AFTER_EQ = 2;
    static final int STATE_HTTP_DONE = 3;

    // states for parsing CONTENT attribute
    static final int STATE_CONTENT_START = 0;
    static final int STATE_CONTENT_AFTER_NAME = 1;
    static final int STATE_CONTENT_AFTER_EQ = 2;
    static final int STATE_CONTENT_DONE = 3;

    private HTMLParser htmlParser;
    private InputStream inputStream = null;

    /**
     * Progress of recognizing the two attributes of interest,
     * HTTP-EQUIV="content-type" and CONTENT="...", inside a single META tag.
     */
    private static final class MetaTagState {
        int http = STATE_HTTP_START;
        int content = STATE_CONTENT_START;
        // value of the CONTENT attribute, kept until the whole tag is understood
        String contentValue = null;

        /**
         * Consumes one token found between "&lt;META" and "&gt;".
         * @param token token type as returned by StreamTokenizer.nextToken()
         * @param sval text of the token for words and quoted strings
         */
        void accept(int token, String sval) {
            switch (token) {
                case StreamTokenizer.TT_WORD :
                    // a word can be an attribute name or an unquoted value
                    if (ATTRIBUTE_HTTP.equalsIgnoreCase(sval)) {
                        http = STATE_HTTP_AFTER_NAME;
                    } else if (ATTRIBUTE_CONTENT.equalsIgnoreCase(sval)) {
                        content = STATE_CONTENT_AFTER_NAME;
                    } else if (
                        http == STATE_HTTP_AFTER_EQ
                            && ATTRIBUTE_HTTP_VALUE.equalsIgnoreCase(sval)) {
                        // unquoted value: <META ... HTTP-EQUIV=content-type
                        http = STATE_HTTP_DONE;
                    } else {
                        resetUnfinished();
                    }
                    break;
                case '=' :
                    // separates an attribute name from its value
                    if (http == STATE_HTTP_AFTER_NAME) {
                        http = STATE_HTTP_AFTER_EQ;
                    } else if (content == STATE_CONTENT_AFTER_NAME) {
                        content = STATE_CONTENT_AFTER_EQ;
                    } else {
                        resetUnfinished();
                    }
                    break;
                case '\"' :
                    // double-quoted string, can be an attribute value
                    if (http == STATE_HTTP_AFTER_EQ) {
                        // A different HTTP-EQUIV value ends the attempt; staying in
                        // AFTER_EQ would let a later quoted string be mistaken
                        // for the HTTP-EQUIV value.
                        http =
                            ATTRIBUTE_HTTP_VALUE.equalsIgnoreCase(sval)
                                ? STATE_HTTP_DONE
                                : STATE_HTTP_START;
                    } else if (content == STATE_CONTENT_AFTER_EQ) {
                        content = STATE_CONTENT_DONE;
                        contentValue = sval;
                    } else {
                        resetUnfinished();
                    }
                    break;
                default :
                    // anything else cannot belong to the attributes we look for
                    resetUnfinished();
                    break;
            }
        }

        /**
         * Abandons a partially recognized attribute, keeping attributes
         * that have already been recognized completely.
         */
        void resetUnfinished() {
            if (http != STATE_HTTP_DONE) {
                http = STATE_HTTP_START;
            }
            if (content != STATE_CONTENT_DONE) {
                content = STATE_CONTENT_START;
            }
        }

        /**
         * @return true when both HTTP-EQUIV="content-type" and a CONTENT
         * value have been seen in this tag
         */
        boolean isContentTypeDeclaration() {
            return contentValue != null
                && http == STATE_HTTP_DONE
                && content == STATE_CONTENT_DONE;
        }
    }

    /**
     * Opens the document for parsing. The document is read twice: a first
     * pass looks for the encoding declared in its header, then a fresh stream
     * is handed to the HTML parser using that encoding. When no encoding is
     * declared, or the JVM does not support it, the default encoding is used.
     * A document left open by an earlier call is closed first.
     * @param url location of the HTML document
     * @throws IOException if the document cannot be opened
     */
    public void openDocument(URL url) throws IOException {
        // Forget the previous document up front, so that a failure below can
        // neither leak its stream nor leave its parser reachable.
        closeDocument();
        htmlParser = null;

        String encoding = detectEncoding(url);
        // the stream used for detection has been consumed, start over
        inputStream = url.openStream();
        htmlParser = new HTMLParser(createReader(url, encoding));
    }

    /**
     * Reads the beginning of the document and returns the declared encoding.
     * @return encoding name or null if the document does not declare one
     */
    private String detectEncoding(URL url) throws IOException {
        InputStream probe = url.openStream();
        try {
            return getCharsetFromHTML(probe);
        } finally {
            try {
                probe.close();
            } catch (IOException closeIOE) {
                // the stream was only read from, nothing is lost
            }
        }
    }

    /**
     * Wraps the currently open stream in a reader for the given encoding,
     * falling back to the default encoding when none is given or the JVM
     * does not support it.
     */
    private InputStreamReader createReader(URL url, String encoding) {
        if (encoding == null) {
            reportDefaultEncoding(" Encoding not found in document ", url);
            return new InputStreamReader(inputStream);
        }
        try {
            return new InputStreamReader(inputStream, encoding);
        } catch (UnsupportedEncodingException uee) {
            reportDefaultEncoding(
                " JVM does not support encoding "
                    + encoding
                    + " specified in document ",
                url);
            return new InputStreamReader(inputStream);
        }
    }

    /**
     * Prints a search debug message explaining why the default encoding is used.
     */
    private void reportDefaultEncoding(String reason, URL url) {
        if (HelpPlugin.DEBUG_SEARCH) {
            System.out.println(
                this.getClass().getName()
                    + reason
                    + url.getPath()
                    + ". Default encoding will be used during indexing.");
        }
    }

    /**
     * Releases resources (closes streams).
     * Does nothing when no document is open.
     */
    public void closeDocument() {
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException closeIOE) {
                // best effort, there is nothing the caller could do about it
            }
            inputStream = null;
        }
    }

    /**
     * @return title of the document, or an empty string if the thread
     * was interrupted while waiting for the parser
     * @throws IOException if reported by the HTML parser
     * @throws NullPointerException if no document has been opened
     */
    public String getTitle() throws IOException {
        if (htmlParser == null) {
            throw new NullPointerException("No document has been opened.");
        }
        try {
            return htmlParser.getTitle();
        } catch (InterruptedException ie) {
            // keep the interruption visible to the code that runs this thread
            Thread.currentThread().interrupt();
            return "";
        }
    }

    /**
     * @return summary of the document, or an empty string if the thread
     * was interrupted while waiting for the parser
     * @throws IOException if reported by the HTML parser
     * @throws NullPointerException if no document has been opened
     */
    public String getSummary() throws IOException {
        if (htmlParser == null) {
            throw new NullPointerException("No document has been opened.");
        }
        try {
            return htmlParser.getSummary();
        } catch (InterruptedException ie) {
            // keep the interruption visible to the code that runs this thread
            Thread.currentThread().interrupt();
            return "";
        }
    }

    /**
     * @return reader obtained from the HTML parser for the document content
     * @throws IOException if reported by the HTML parser
     * @throws NullPointerException if no document has been opened
     */
    public Reader getContentReader() throws IOException {
        if (htmlParser == null) {
            throw new NullPointerException("No document has been opened.");
        }
        return htmlParser.getReader();
    }

    /**
     * Parses HTML to extract document encoding specified in HTTP
     * equivalent META tag in the document header. Example of such META tag is
     * &lt;META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8"&gt;
     * The reader created on top of the stream is closed before returning.
     * @param is stream positioned at the beginning of the document
     * @return String or null if encoding not found
     */
    public String getCharsetFromHTML(InputStream is) {
        // Set up an ascii reader for the document (documents should not use
        // other characters before encoding is defined)
        Reader asciiReader = new ASCIIReader(is, MAX_OFFSET);
        try {
            StreamTokenizer tokenizer = new StreamTokenizer(asciiReader);
            // element and attribute names are compared ignoring case anyway
            tokenizer.lowerCaseMode(false);
            // By default ' starts a quoted string; turning that off keeps an
            // apostrophe in ordinary text from being read as one. As a
            // consequence only double-quoted attribute values are recognized.
            tokenizer.ordinaryChar('\'');
            // By default / starts a comment; it is needed to see closing tags.
            tokenizer.ordinaryChar('/');
            return getCharsetFromHTMLTokens(tokenizer);
        } finally {
            try {
                asciiReader.close();
            } catch (IOException ioe) {
                // the reader was only read from, nothing is lost
            }
        }
    }

    /**
     * Scans tokens of an HTML document for a META tag that has both
     * HTTP-EQUIV="content-type" and a CONTENT attribute, and extracts the
     * charset from the value of the latter. Scanning stops at the first such
     * tag, at the opening of the body element, at the closing of the head
     * element, or at the end of input, whichever comes first.
     * @param tokenizer tokenizer over the document; ' and / are expected to
     * be ordinary characters, as set up by getCharsetFromHTML(InputStream)
     * @return charset, or null if not found or if reading fails
     */
    public String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) {
        int stateElement = STATE_ELEMENT_START;
        MetaTagState meta = new MetaTagState();
        try {
            // in the worst case, process tokens until end of file
            for (int token = tokenizer.nextToken();
                token != StreamTokenizer.TT_EOF;
                token = tokenizer.nextToken()) {
                switch (stateElement) {
                    case STATE_ELEMENT_START :
                        if (token == '<') {
                            stateElement = STATE_ELEMENT_AFTER_LT;
                        } // else cannot be the beginning of a META tag
                        break;
                    case STATE_ELEMENT_AFTER_LT :
                        if (token == StreamTokenizer.TT_WORD) {
                            if (ELEMENT_META
                                .equalsIgnoreCase(tokenizer.sval)) {
                                // META element opened, start with clean attribute states
                                stateElement = STATE_ELEMENT_META;
                                meta = new MetaTagState();
                            } else if (
                                ELEMENT_BODY.equalsIgnoreCase(
                                    tokenizer.sval)) {
                                // body element opened, we are too far
                                return null;
                            } else {
                                // some other element opened
                                stateElement = STATE_ELEMENT_START;
                            }
                        } else if (token == '/') {
                            // can be the beginning of the head closing tag
                            stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
                        } else {
                            // not an element, could be a declaration etc.
                            stateElement = STATE_ELEMENT_START;
                        }
                        break;
                    case STATE_ELEMENT_AFTER_LT_SLASH :
                        if (token == StreamTokenizer.TT_WORD
                            && ELEMENT_HEAD.equalsIgnoreCase(tokenizer.sval)) {
                            // head element closed, we are too far
                            return null;
                        }
                        stateElement = STATE_ELEMENT_START;
                        break;
                    default : // STATE_ELEMENT_META
                        if (token == '>') {
                            // no longer inside META
                            stateElement = STATE_ELEMENT_START;
                        } else {
                            meta.accept(token, tokenizer.sval);
                            if (meta.isContentTypeDeclaration()) {
                                // <META HTTP-EQUIV="content-type" CONTENT="..."
                                return getCharsetFromHTTP(meta.contentValue);
                            }
                        }
                        break;
                }
            }
        } catch (IOException ioe) {
            return null;
        }
        // end of file
        return null;
    }

    /**
     * Parses HTTP1.1 Content-Type entity-header field
     * for example, Content-Type: text/html; charset=ISO-8859-4,
     * and extracts charset parameter value of the media sub type.
     * The parameter name is matched ignoring case, white space around
     * the equal sign is ignored, and quotes surrounding the value are removed.
     * @param contentValue media type, for example
     * Content-Type: text/html; charset=ISO-8859-4
     * @return value of charset parameter, for example ISO-8859-4
     * or null if parameter does not exist or contentValue is null
     */
    public String getCharsetFromHTTP(String contentValue) {
        if (contentValue == null) {
            return null;
        }
        StringTokenizer t = new StringTokenizer(contentValue, ";");
        while (t.hasMoreTokens()) {
            String parameter = t.nextToken().trim();
            int eq = parameter.indexOf('=');
            if (eq < 0) {
                // not a name=value pair, e.g. the media type itself
                continue;
            }
            String name = parameter.substring(0, eq).trim();
            if (!PARAMETER_CHARSET.equalsIgnoreCase(name)) {
                continue;
            }
            String charset = stripQuotes(parameter.substring(eq + 1).trim());
            if (charset.length() > 0) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Removes one pair of matching single or double quotes
     * surrounding the value, if present.
     */
    private static String stripQuotes(String value) {
        int length = value.length();
        if (length >= 2) {
            char first = value.charAt(0);
            if ((first == '\"' || first == '\'')
                && value.charAt(length - 1) == first) {
                return value.substring(1, length - 1).trim();
            }
        }
        return value;
    }
}