org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java - platform/eclipse.platform.ua - Git at Google

 /*******************************************************************************
  * Copyright (c) 2000, 2015 IBM Corporation and others.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors:
  *     IBM Corporation - initial API and implementation
  *******************************************************************************/
 package org.eclipse.help.internal.search;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StreamTokenizer;
 import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.util.Locale;
 import java.util.StringTokenizer;

 import org.apache.lucene.demo.html.HTMLParser;
 import org.eclipse.help.internal.base.util.ProxyUtil;

 /**
  * Parses HTML documents. Extracts the document encoding from the header, and
  * delegates to the Lucene HTML parser for extraction of title, summary, and
  * content.
  */
 public class HTMLDocParser {
 	// maximum number of characters that will be searched
 	// from the beginning of HTML document to charset declaration
 	public static final int MAX_OFFSET = 2048;

 	// element, attribute and value constants (always compared ignoring case)
 	final static String ELEMENT_META = "META"; //$NON-NLS-1$
 	final static String ELEMENT_BODY = "body"; //$NON-NLS-1$
 	final static String ELEMENT_HEAD = "head"; //$NON-NLS-1$
 	final static String ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$
 	final static String ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$
 	final static String ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$

 	// states for parsing elements
 	final static int STATE_ELEMENT_START = 0;
 	final static int STATE_ELEMENT_AFTER_LT = 1;
 	final static int STATE_ELEMENT_AFTER_LT_SLASH = 2;
 	final static int STATE_ELEMENT_META = 3;
 	// states for parsing HTTP-EQUIV attribute
 	final static int STATE_HTTP_START = 0;
 	final static int STATE_HTTP_AFTER_NAME = 1;
 	final static int STATE_HTTP_AFTER_EQ = 2;
 	final static int STATE_HTTP_DONE = 3;
 	// states for parsing CONTENT attribute
 	final static int STATE_CONTENT_START = 0;
 	final static int STATE_CONTENT_AFTER_NAME = 1;
 	final static int STATE_CONTENT_AFTER_EQ = 2;
 	final static int STATE_CONTENT_DONE = 3;

 	// parser of the currently opened document, null until openDocument()
 	private HTMLParser htmlParser;
 	// stream most recently obtained for the document, closed by closeDocument()
 	private InputStream inputStream = null;

 	/**
 	 * Opens the document and starts parsing it. The URL is read twice: the
 	 * first stream is only scanned for a charset declaration and is closed
 	 * again; the second stream is handed to the HTML parser, decoded with the
 	 * declared charset when one was found and is supported, and with the
 	 * platform default charset otherwise. The second stream stays open until
 	 * {@link #closeDocument()} is called.
 	 *
 	 * @param url
 	 *            location of the HTML document
 	 * @throws IOException
 	 *             propagated from obtaining the stream or from the parser
 	 */
 	public void openDocument(URL url) throws IOException {
 		// first pass: only look for the charset declaration
 		inputStream = ProxyUtil.getStream(url);
 		String encoding;
 		try {
 			encoding = getCharsetFromHTML(inputStream);
 		} finally {
 			// release the sniffing stream even when the scan fails
 			closeQuietly(inputStream);
 		}

 		// second pass: parse the document from its beginning
 		inputStream = ProxyUtil.getStream(url);
 		InputStreamReader reader = null;
 		if (encoding != null) {
 			try {
 				reader = new InputStreamReader(inputStream, encoding);
 			} catch (UnsupportedEncodingException uee) {
 				// declared charset is not supported here,
 				// fall back to the default charset below
 			}
 		}
 		if (reader == null) {
 			reader = new InputStreamReader(inputStream);
 		}
 		htmlParser = new HTMLParser(reader);
 		htmlParser.parse();
 	}

 	/**
 	 * Releases resources (closes streams). Errors while closing are ignored.
 	 */
 	public void closeDocument() {
 		closeQuietly(inputStream);
 	}

 	/**
 	 * Closes the stream if there is one; closing is best effort, so an
 	 * IOException raised by close() is deliberately ignored.
 	 */
 	private static void closeQuietly(InputStream stream) {
 		if (stream != null) {
 			try {
 				stream.close();
 			} catch (IOException closeIOE) {
 				// nothing useful can be done about a failed close
 			}
 		}
 	}

 	/**
 	 * Returns the title extracted by the HTML parser.
 	 *
 	 * @return the title, or an empty string if the calling thread was
 	 *         interrupted while the title was being obtained
 	 * @throws IOException
 	 *             propagated from the HTML parser
 	 * @throws NullPointerException
 	 *             if no document has been opened
 	 */
 	public String getTitle() throws IOException {
 		if (htmlParser == null) {
 			throw new NullPointerException();
 		}
 		try {
 			return htmlParser.getTitle();
 		} catch (InterruptedException ie) {
 			// keep the interrupt visible to the code that called us
 			Thread.currentThread().interrupt();
 			return ""; //$NON-NLS-1$
 		}
 	}

 	/**
 	 * Returns the summary extracted by the HTML parser.
 	 *
 	 * @param title
 	 *            not used by this implementation
 	 * @return the summary, or an empty string if the calling thread was
 	 *         interrupted while the summary was being obtained
 	 * @throws IOException
 	 *             propagated from the HTML parser
 	 * @throws NullPointerException
 	 *             if no document has been opened
 	 */
 	public String getSummary(String title) throws IOException {
 		// same explicit precondition as getTitle() and getContentReader()
 		if (htmlParser == null) {
 			throw new NullPointerException();
 		}
 		try {
 			return htmlParser.getSummary();
 		} catch (InterruptedException ie) {
 			// keep the interrupt visible to the code that called us
 			Thread.currentThread().interrupt();
 			return ""; //$NON-NLS-1$
 		}
 	}

 	/**
 	 * Returns the reader the HTML parser provides for the document content.
 	 *
 	 * @return reader obtained from the HTML parser
 	 * @throws IOException
 	 *             propagated from the HTML parser
 	 * @throws NullPointerException
 	 *             if no document has been opened
 	 */
 	public Reader getContentReader() throws IOException {
 		if (htmlParser == null) {
 			throw new NullPointerException();
 		}
 		return htmlParser.getReader();
 	}

 	/**
 	 * Parses HTML to extract document encoding specified in HTTP equivalent
 	 * META tag in the document header. Example of such META tag is
 	 * {@code <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">}
 	 * <p>
 	 * The reader wrapped around the stream is closed before returning.
 	 *
 	 * @param is
 	 *            stream positioned at the beginning of the HTML document
 	 * @return String or null if encoding not found
 	 */
 	public static String getCharsetFromHTML(InputStream is) {
 		// Set up an ascii reader for the document (documents should not use
 		// other characters before encoding is defined)
 		Reader asciiReader = new ASCIIReader(is, MAX_OFFSET);
 		try {
 			StreamTokenizer tokenizer = new StreamTokenizer(asciiReader);
 			// attribute names and values are compared ignoring case later,
 			// so tokens keep their original case
 			tokenizer.lowerCaseMode(false);
 			// double quote stays the (default) quote character, while the
 			// single quote, a quote character by default, and the slash, the
 			// comment character by default, are returned as plain characters
 			tokenizer.ordinaryChar('\'');
 			tokenizer.ordinaryChar('/');

 			return getCharsetFromHTMLTokens(tokenizer);
 		} finally {
 			try {
 				asciiReader.close();
 			} catch (IOException ioe) {
 				// closing is best effort, the result is already computed
 			}
 		}
 	}

 	/**
 	 * Scans tokens of an HTML document for a META element that carries both
 	 * HTTP-EQUIV with the value content-type and a quoted CONTENT attribute,
 	 * and extracts the charset from that CONTENT value. Scanning stops at the
 	 * first such META element, when a BODY element opens, when the HEAD
 	 * element closes, or at the end of input.
 	 *
 	 * @param tokenizer
 	 *            tokenizer over the document, configured as done in
 	 *            {@link #getCharsetFromHTML(InputStream)}
 	 * @return charset name, or null if none was found or reading failed
 	 */
 	public static String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) {
 		// keeps track of the CONTENT attribute value until parsing
 		// of the META tag is complete
 		String contentValue = null;

 		// initialize states
 		int stateContent = STATE_CONTENT_START;
 		int stateElement = STATE_ELEMENT_START;
 		int stateHttp = STATE_HTTP_START;

 		try {
 			// in the worst case, process tokens until end of file
 			for (int token = tokenizer.nextToken(); token != StreamTokenizer.TT_EOF; token = tokenizer
 					.nextToken()) {
 				// process input depending on current state
 				switch (stateElement) {
 					case STATE_ELEMENT_START :
 						if (token == '<') {
 							stateElement = STATE_ELEMENT_AFTER_LT;
 						} // else do nothing, cannot be beginning of META tag
 						break;
 					case STATE_ELEMENT_AFTER_LT :
 						if (token == StreamTokenizer.TT_WORD) {
 							// some element opened
 							if (ELEMENT_META.equalsIgnoreCase(tokenizer.sval)) {
 								// META element opened
 								stateElement = STATE_ELEMENT_META;
 								// initialize state of attributes
 								stateHttp = STATE_HTTP_START;
 								stateContent = STATE_CONTENT_START;
 								contentValue = null;
 							} else if (ELEMENT_BODY
 									.equalsIgnoreCase(tokenizer.sval)) {
 								// body element opened, we are too far, stop
 								// processing input
 								return null;
 							} else {
 								// some other element opened, start from initial
 								// state
 								stateElement = STATE_ELEMENT_START;
 							}
 						} else if (token == '/') {
 							// can be beginning of head closing
 							stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
 						} else {
 							// not an element opened, could be opening of a
 							// declaration etc.
 							stateElement = STATE_ELEMENT_START;
 						}
 						break;
 					case STATE_ELEMENT_AFTER_LT_SLASH :
 						if (token == StreamTokenizer.TT_WORD
 								&& ELEMENT_HEAD
 										.equalsIgnoreCase(tokenizer.sval)) {
 							// head element closed, we are too far, stop
 							// processing input
 							return null;
 						}
 						stateElement = STATE_ELEMENT_START;
 						break;
 					default : // STATE_ELEMENT_META
 						switch (token) {
 							case '>' :
 								// no longer inside META, start from initial
 								// state
 								stateElement = STATE_ELEMENT_START;
 								break;
 							case StreamTokenizer.TT_WORD :
 								// string inside META tag, can be attribute name
 								if (ATTRIBUTE_HTTP
 										.equalsIgnoreCase(tokenizer.sval)) {
 									// found HTTP-EQUIV attribute name
 									stateHttp = STATE_HTTP_AFTER_NAME;
 								} else if (ATTRIBUTE_CONTENT
 										.equalsIgnoreCase(tokenizer.sval)) {
 									// found CONTENT attribute name
 									stateContent = STATE_CONTENT_AFTER_NAME;
 								} else if (stateHttp == STATE_HTTP_AFTER_EQ
 										&& ATTRIBUTE_HTTP_VALUE
 												.equalsIgnoreCase(tokenizer.sval)) {
 									// value of HTTP-EQUIV attribute (unquoted)
 									// we found <META ...
 									// HTTP-EQUIV=content-type
 									stateHttp = STATE_HTTP_DONE;
 								} else {
 									// some other attribute name or string,
 									// reset states of sought attributes,
 									// unless successfully processed earlier
 									if (stateHttp != STATE_HTTP_DONE) {
 										stateHttp = STATE_HTTP_START;
 									}
 									if (stateContent != STATE_CONTENT_DONE) {
 										stateContent = STATE_CONTENT_START;
 									}
 								}
 								break;
 							case '=' :
 								// = inside META tag, can separate the
 								// attribute names we look for from values
 								if (stateHttp == STATE_HTTP_AFTER_NAME) {
 									// we have HTTP-EQUIV=
 									stateHttp = STATE_HTTP_AFTER_EQ;
 								} else if (stateContent == STATE_CONTENT_AFTER_NAME) {
 									// we have CONTENT=
 									stateContent = STATE_CONTENT_AFTER_EQ;
 								} else {
 									// equal sign after some other attribute
 									// name or string,
 									// reset states of sought attributes,
 									// unless successfully processed earlier
 									if (stateHttp != STATE_HTTP_DONE) {
 										stateHttp = STATE_HTTP_START;
 									}
 									if (stateContent != STATE_CONTENT_DONE) {
 										stateContent = STATE_CONTENT_START;
 									}
 								}
 								break;
 							case '\"' :
 								// quoted string inside META tag, can be
 								// attribute value
 								if (stateHttp == STATE_HTTP_AFTER_EQ) {
 									// value of HTTP-EQUIV attribute
 									if (ATTRIBUTE_HTTP_VALUE
 											.equalsIgnoreCase(tokenizer.sval)) {
 										// we found <META ...
 										// HTTP-EQUIV="content-type"
 										stateHttp = STATE_HTTP_DONE;
 									} else {
 										// HTTP-EQUIV has some other value; the
 										// value is consumed, so a later quoted
 										// string must not be taken for it
 										stateHttp = STATE_HTTP_START;
 									}
 								} else if (stateContent == STATE_CONTENT_AFTER_EQ) {
 									// value of CONTENT attribute
 									stateContent = STATE_CONTENT_DONE;
 									// save the value of the attribute; if
 									// HTTP-EQUIV="content-type" is found in
 									// the same META tag, this value might
 									// hold the Content-type entity header
 									contentValue = tokenizer.sval;
 								} else {
 									// value for the attribute is missing
 									// reset states of sought attributes
 									stateHttp = STATE_HTTP_START;
 									stateContent = STATE_CONTENT_START;
 								}
 								break;
 							default :
 								// other unexpected token inside META tag
 								// reset states of sought attributes,
 								// unless successfully processed earlier
 								if (stateHttp != STATE_HTTP_DONE) {
 									stateHttp = STATE_HTTP_START;
 								}
 								if (stateContent != STATE_CONTENT_DONE) {
 									stateContent = STATE_CONTENT_START;
 								}
 								break;
 						}
 						break;
 				}
 				if (contentValue != null && stateHttp == STATE_HTTP_DONE
 						&& stateContent == STATE_CONTENT_DONE) {
 					// <META HTTP-EQUIV="content-type" CONTENT="*******"
 					// parse value of content attribute to extract encoding
 					return getCharsetFromHTTP(contentValue);
 				}

 			}
 		} catch (IOException ioe) {
 			// input could not be read, treat as "no charset declared"
 			return null;
 		}
 		// end of file
 		return null;
 	}

 	/**
 	 * Parses HTTP1.1 Content-Type entity-header field for example,
 	 * Content-Type: text/html; charset=ISO-8859-4, and extracts charset
 	 * parameter value of the media sub type. The parameter name is matched
 	 * ignoring case, and whitespace around the equal sign is tolerated.
 	 *
 	 * @param contentValue
 	 *            value of the header field, may be null
 	 * @return value of charset parameter, for example ISO-8859-4 or null if
 	 *         parameter does not exist, is empty, or contentValue is null
 	 */
 	public static String getCharsetFromHTTP(String contentValue) {
 		if (contentValue == null) {
 			return null;
 		}
 		StringTokenizer t = new StringTokenizer(contentValue, ";"); //$NON-NLS-1$
 		while (t.hasMoreTokens()) {
 			String parameter = t.nextToken().trim();
 			int equalSign = parameter.indexOf('=');
 			if (equalSign < 0) {
 				// not a name=value parameter, e.g. the media type itself
 				continue;
 			}
 			String name = parameter.substring(0, equalSign).trim();
 			if ("charset".equalsIgnoreCase(name)) { //$NON-NLS-1$
 				String charset = parameter.substring(equalSign + 1).trim();
 				if (charset.length() > 0) {
 					return charset;
 				}
 			}
 		}
 		return null;
 	}

 	/**
 	 * Returns the exception reported by the HTML parser.
 	 *
 	 * @return value of the parser's getException(), or null if no document
 	 *         has been opened
 	 */
 	public Exception getException() {
 		if (htmlParser != null) {
 			return htmlParser.getException();
 		}
 		return null;
 	}
 }
	/*******************************************************************************
	* Copyright (c) 2000, 2015 IBM Corporation and others.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors:
	* IBM Corporation - initial API and implementation
	*******************************************************************************/
	package org.eclipse.help.internal.search;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.io.StreamTokenizer;
	import java.io.UnsupportedEncodingException;
	import java.net.URL;
	import java.util.Locale;
	import java.util.StringTokenizer;

	import org.apache.lucene.demo.html.HTMLParser;
	import org.eclipse.help.internal.base.util.ProxyUtil;

	/**
	 * Parses HTML documents. Extracts the document encoding from the header, and
	 * delegates to the Lucene HTML parser for extraction of title, summary, and
	 * content.
	 */
	public class HTMLDocParser {
		// maximum number of characters that will be searched
		// from the beginning of HTML document to charset declaration
		public static final int MAX_OFFSET = 2048;

		// element, attribute and value constants (always compared ignoring case)
		final static String ELEMENT_META = "META"; //$NON-NLS-1$
		final static String ELEMENT_BODY = "body"; //$NON-NLS-1$
		final static String ELEMENT_HEAD = "head"; //$NON-NLS-1$
		final static String ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$
		final static String ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$
		final static String ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$

		// states for parsing elements
		final static int STATE_ELEMENT_START = 0;
		final static int STATE_ELEMENT_AFTER_LT = 1;
		final static int STATE_ELEMENT_AFTER_LT_SLASH = 2;
		final static int STATE_ELEMENT_META = 3;
		// states for parsing HTTP-EQUIV attribute
		final static int STATE_HTTP_START = 0;
		final static int STATE_HTTP_AFTER_NAME = 1;
		final static int STATE_HTTP_AFTER_EQ = 2;
		final static int STATE_HTTP_DONE = 3;
		// states for parsing CONTENT attribute
		final static int STATE_CONTENT_START = 0;
		final static int STATE_CONTENT_AFTER_NAME = 1;
		final static int STATE_CONTENT_AFTER_EQ = 2;
		final static int STATE_CONTENT_DONE = 3;

		// parser of the currently opened document, null until openDocument()
		private HTMLParser htmlParser;
		// stream most recently obtained for the document, closed by closeDocument()
		private InputStream inputStream = null;

		/**
		 * Opens the document and starts parsing it. The URL is read twice: the
		 * first stream is only scanned for a charset declaration and is closed
		 * again; the second stream is handed to the HTML parser, decoded with the
		 * declared charset when one was found and is supported, and with the
		 * platform default charset otherwise. The second stream stays open until
		 * {@link #closeDocument()} is called.
		 *
		 * @param url
		 *            location of the HTML document
		 * @throws IOException
		 *             propagated from obtaining the stream or from the parser
		 */
		public void openDocument(URL url) throws IOException {
			// first pass: only look for the charset declaration
			inputStream = ProxyUtil.getStream(url);
			String encoding;
			try {
				encoding = getCharsetFromHTML(inputStream);
			} finally {
				// release the sniffing stream even when the scan fails
				closeQuietly(inputStream);
			}

			// second pass: parse the document from its beginning
			inputStream = ProxyUtil.getStream(url);
			InputStreamReader reader = null;
			if (encoding != null) {
				try {
					reader = new InputStreamReader(inputStream, encoding);
				} catch (UnsupportedEncodingException uee) {
					// declared charset is not supported here,
					// fall back to the default charset below
				}
			}
			if (reader == null) {
				reader = new InputStreamReader(inputStream);
			}
			htmlParser = new HTMLParser(reader);
			htmlParser.parse();
		}

		/**
		 * Releases resources (closes streams). Errors while closing are ignored.
		 */
		public void closeDocument() {
			closeQuietly(inputStream);
		}

		/**
		 * Closes the stream if there is one; closing is best effort, so an
		 * IOException raised by close() is deliberately ignored.
		 */
		private static void closeQuietly(InputStream stream) {
			if (stream != null) {
				try {
					stream.close();
				} catch (IOException closeIOE) {
					// nothing useful can be done about a failed close
				}
			}
		}

		/**
		 * Returns the title extracted by the HTML parser.
		 *
		 * @return the title, or an empty string if the calling thread was
		 *         interrupted while the title was being obtained
		 * @throws IOException
		 *             propagated from the HTML parser
		 * @throws NullPointerException
		 *             if no document has been opened
		 */
		public String getTitle() throws IOException {
			if (htmlParser == null) {
				throw new NullPointerException();
			}
			try {
				return htmlParser.getTitle();
			} catch (InterruptedException ie) {
				// keep the interrupt visible to the code that called us
				Thread.currentThread().interrupt();
				return ""; //$NON-NLS-1$
			}
		}

		/**
		 * Returns the summary extracted by the HTML parser.
		 *
		 * @param title
		 *            not used by this implementation
		 * @return the summary, or an empty string if the calling thread was
		 *         interrupted while the summary was being obtained
		 * @throws IOException
		 *             propagated from the HTML parser
		 * @throws NullPointerException
		 *             if no document has been opened
		 */
		public String getSummary(String title) throws IOException {
			// same explicit precondition as getTitle() and getContentReader()
			if (htmlParser == null) {
				throw new NullPointerException();
			}
			try {
				return htmlParser.getSummary();
			} catch (InterruptedException ie) {
				// keep the interrupt visible to the code that called us
				Thread.currentThread().interrupt();
				return ""; //$NON-NLS-1$
			}
		}

		/**
		 * Returns the reader the HTML parser provides for the document content.
		 *
		 * @return reader obtained from the HTML parser
		 * @throws IOException
		 *             propagated from the HTML parser
		 * @throws NullPointerException
		 *             if no document has been opened
		 */
		public Reader getContentReader() throws IOException {
			if (htmlParser == null) {
				throw new NullPointerException();
			}
			return htmlParser.getReader();
		}

		/**
		 * Parses HTML to extract document encoding specified in HTTP equivalent
		 * META tag in the document header. Example of such META tag is
		 * {@code <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">}
		 * <p>
		 * The reader wrapped around the stream is closed before returning.
		 *
		 * @param is
		 *            stream positioned at the beginning of the HTML document
		 * @return String or null if encoding not found
		 */
		public static String getCharsetFromHTML(InputStream is) {
			// Set up an ascii reader for the document (documents should not use
			// other characters before encoding is defined)
			Reader asciiReader = new ASCIIReader(is, MAX_OFFSET);
			try {
				StreamTokenizer tokenizer = new StreamTokenizer(asciiReader);
				// attribute names and values are compared ignoring case later,
				// so tokens keep their original case
				tokenizer.lowerCaseMode(false);
				// double quote stays the (default) quote character, while the
				// single quote, a quote character by default, and the slash, the
				// comment character by default, are returned as plain characters
				tokenizer.ordinaryChar('\'');
				tokenizer.ordinaryChar('/');

				return getCharsetFromHTMLTokens(tokenizer);
			} finally {
				try {
					asciiReader.close();
				} catch (IOException ioe) {
					// closing is best effort, the result is already computed
				}
			}
		}

		/**
		 * Scans tokens of an HTML document for a META element that carries both
		 * HTTP-EQUIV with the value content-type and a quoted CONTENT attribute,
		 * and extracts the charset from that CONTENT value. Scanning stops at the
		 * first such META element, when a BODY element opens, when the HEAD
		 * element closes, or at the end of input.
		 *
		 * @param tokenizer
		 *            tokenizer over the document, configured as done in
		 *            {@link #getCharsetFromHTML(InputStream)}
		 * @return charset name, or null if none was found or reading failed
		 */
		public static String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) {
			// keeps track of the CONTENT attribute value until parsing
			// of the META tag is complete
			String contentValue = null;

			// initialize states
			int stateContent = STATE_CONTENT_START;
			int stateElement = STATE_ELEMENT_START;
			int stateHttp = STATE_HTTP_START;

			try {
				// in the worst case, process tokens until end of file
				for (int token = tokenizer.nextToken(); token != StreamTokenizer.TT_EOF; token = tokenizer
						.nextToken()) {
					// process input depending on current state
					switch (stateElement) {
						case STATE_ELEMENT_START :
							if (token == '<') {
								stateElement = STATE_ELEMENT_AFTER_LT;
							} // else do nothing, cannot be beginning of META tag
							break;
						case STATE_ELEMENT_AFTER_LT :
							if (token == StreamTokenizer.TT_WORD) {
								// some element opened
								if (ELEMENT_META.equalsIgnoreCase(tokenizer.sval)) {
									// META element opened
									stateElement = STATE_ELEMENT_META;
									// initialize state of attributes
									stateHttp = STATE_HTTP_START;
									stateContent = STATE_CONTENT_START;
									contentValue = null;
								} else if (ELEMENT_BODY
										.equalsIgnoreCase(tokenizer.sval)) {
									// body element opened, we are too far, stop
									// processing input
									return null;
								} else {
									// some other element opened, start from initial
									// state
									stateElement = STATE_ELEMENT_START;
								}
							} else if (token == '/') {
								// can be beginning of head closing
								stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
							} else {
								// not an element opened, could be opening of a
								// declaration etc.
								stateElement = STATE_ELEMENT_START;
							}
							break;
						case STATE_ELEMENT_AFTER_LT_SLASH :
							if (token == StreamTokenizer.TT_WORD
									&& ELEMENT_HEAD
											.equalsIgnoreCase(tokenizer.sval)) {
								// head element closed, we are too far, stop
								// processing input
								return null;
							}
							stateElement = STATE_ELEMENT_START;
							break;
						default : // STATE_ELEMENT_META
							switch (token) {
								case '>' :
									// no longer inside META, start from initial
									// state
									stateElement = STATE_ELEMENT_START;
									break;
								case StreamTokenizer.TT_WORD :
									// string inside META tag, can be attribute name
									if (ATTRIBUTE_HTTP
											.equalsIgnoreCase(tokenizer.sval)) {
										// found HTTP-EQUIV attribute name
										stateHttp = STATE_HTTP_AFTER_NAME;
									} else if (ATTRIBUTE_CONTENT
											.equalsIgnoreCase(tokenizer.sval)) {
										// found CONTENT attribute name
										stateContent = STATE_CONTENT_AFTER_NAME;
									} else if (stateHttp == STATE_HTTP_AFTER_EQ
											&& ATTRIBUTE_HTTP_VALUE
													.equalsIgnoreCase(tokenizer.sval)) {
										// value of HTTP-EQUIV attribute (unquoted)
										// we found <META ...
										// HTTP-EQUIV=content-type
										stateHttp = STATE_HTTP_DONE;
									} else {
										// some other attribute name or string,
										// reset states of sought attributes,
										// unless successfully processed earlier
										if (stateHttp != STATE_HTTP_DONE) {
											stateHttp = STATE_HTTP_START;
										}
										if (stateContent != STATE_CONTENT_DONE) {
											stateContent = STATE_CONTENT_START;
										}
									}
									break;
								case '=' :
									// = inside META tag, can separate the
									// attribute names we look for from values
									if (stateHttp == STATE_HTTP_AFTER_NAME) {
										// we have HTTP-EQUIV=
										stateHttp = STATE_HTTP_AFTER_EQ;
									} else if (stateContent == STATE_CONTENT_AFTER_NAME) {
										// we have CONTENT=
										stateContent = STATE_CONTENT_AFTER_EQ;
									} else {
										// equal sign after some other attribute
										// name or string,
										// reset states of sought attributes,
										// unless successfully processed earlier
										if (stateHttp != STATE_HTTP_DONE) {
											stateHttp = STATE_HTTP_START;
										}
										if (stateContent != STATE_CONTENT_DONE) {
											stateContent = STATE_CONTENT_START;
										}
									}
									break;
								case '\"' :
									// quoted string inside META tag, can be
									// attribute value
									if (stateHttp == STATE_HTTP_AFTER_EQ) {
										// value of HTTP-EQUIV attribute
										if (ATTRIBUTE_HTTP_VALUE
												.equalsIgnoreCase(tokenizer.sval)) {
											// we found <META ...
											// HTTP-EQUIV="content-type"
											stateHttp = STATE_HTTP_DONE;
										} else {
											// HTTP-EQUIV has some other value; the
											// value is consumed, so a later quoted
											// string must not be taken for it
											stateHttp = STATE_HTTP_START;
										}
									} else if (stateContent == STATE_CONTENT_AFTER_EQ) {
										// value of CONTENT attribute
										stateContent = STATE_CONTENT_DONE;
										// save the value of the attribute; if
										// HTTP-EQUIV="content-type" is found in
										// the same META tag, this value might
										// hold the Content-type entity header
										contentValue = tokenizer.sval;
									} else {
										// value for the attribute is missing
										// reset states of sought attributes
										stateHttp = STATE_HTTP_START;
										stateContent = STATE_CONTENT_START;
									}
									break;
								default :
									// other unexpected token inside META tag
									// reset states of sought attributes,
									// unless successfully processed earlier
									if (stateHttp != STATE_HTTP_DONE) {
										stateHttp = STATE_HTTP_START;
									}
									if (stateContent != STATE_CONTENT_DONE) {
										stateContent = STATE_CONTENT_START;
									}
									break;
							}
							break;
					}
					if (contentValue != null && stateHttp == STATE_HTTP_DONE
							&& stateContent == STATE_CONTENT_DONE) {
						// <META HTTP-EQUIV="content-type" CONTENT="*******"
						// parse value of content attribute to extract encoding
						return getCharsetFromHTTP(contentValue);
					}

				}
			} catch (IOException ioe) {
				// input could not be read, treat as "no charset declared"
				return null;
			}
			// end of file
			return null;
		}

		/**
		 * Parses HTTP1.1 Content-Type entity-header field for example,
		 * Content-Type: text/html; charset=ISO-8859-4, and extracts charset
		 * parameter value of the media sub type. The parameter name is matched
		 * ignoring case, and whitespace around the equal sign is tolerated.
		 *
		 * @param contentValue
		 *            value of the header field, may be null
		 * @return value of charset parameter, for example ISO-8859-4 or null if
		 *         parameter does not exist, is empty, or contentValue is null
		 */
		public static String getCharsetFromHTTP(String contentValue) {
			if (contentValue == null) {
				return null;
			}
			StringTokenizer t = new StringTokenizer(contentValue, ";"); //$NON-NLS-1$
			while (t.hasMoreTokens()) {
				String parameter = t.nextToken().trim();
				int equalSign = parameter.indexOf('=');
				if (equalSign < 0) {
					// not a name=value parameter, e.g. the media type itself
					continue;
				}
				String name = parameter.substring(0, equalSign).trim();
				if ("charset".equalsIgnoreCase(name)) { //$NON-NLS-1$
					String charset = parameter.substring(equalSign + 1).trim();
					if (charset.length() > 0) {
						return charset;
					}
				}
			}
			return null;
		}

		/**
		 * Returns the exception reported by the HTML parser.
		 *
		 * @return value of the parser's getException(), or null if no document
		 *         has been opened
		 */
		public Exception getException() {
			if (htmlParser != null) {
				return htmlParser.getException();
			}
			return null;
		}
	}