| /******************************************************************************* |
| * Copyright (c) 2000, 2015 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.help.internal.search; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.io.StreamTokenizer; |
| import java.io.UnsupportedEncodingException; |
| import java.net.URL; |
| import java.util.Locale; |
| import java.util.StringTokenizer; |
| |
| import org.apache.lucene.demo.html.HTMLParser; |
| import org.eclipse.help.internal.base.util.ProxyUtil; |
| |
| /** |
| * Parser HTML documents. Extracts document encoding from header, and delegates |
| * to lucene HTML parser for extraction of title, summary, and content. |
| */ |
| public class HTMLDocParser { |
| // maximum number of characters that will be searched |
| // from the beginning of HTML document to charset declaration |
| public static final int MAX_OFFSET = 2048; |
| |
| // elements, atributes and values contstants |
| final static String ELEMENT_META = "META"; //$NON-NLS-1$ |
| final static String ELEMENT_BODY = "body"; //$NON-NLS-1$ |
| final static String ELEMENT_HEAD = "head"; //$NON-NLS-1$ |
| final static String ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$ |
| final static String ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$ |
| final static String ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$ |
| |
| // states for parsing elements |
| final static int STATE_ELEMENT_START = 0; |
| final static int STATE_ELEMENT_AFTER_LT = 1; |
| final static int STATE_ELEMENT_AFTER_LT_SLASH = 2; |
| final static int STATE_ELEMENT_META = 3; |
| // states for parsing HTTP-EQUIV attribute |
| final static int STATE_HTTP_START = 0; |
| final static int STATE_HTTP_AFTER_NAME = 1; |
| final static int STATE_HTTP_AFTER_EQ = 2; |
| final static int STATE_HTTP_DONE = 3; |
| // states for parsing CONTENT attribute |
| final static int STATE_CONTENT_START = 0; |
| final static int STATE_CONTENT_AFTER_NAME = 1; |
| final static int STATE_CONTENT_AFTER_EQ = 2; |
| final static int STATE_CONTENT_DONE = 3; |
| |
| private HTMLParser htmlParser; |
| private InputStream inputStream = null; |
| /** |
| * @param url |
| * @throws IOException |
| */ |
| public void openDocument(URL url) throws IOException { |
| inputStream = ProxyUtil.getStream(url); |
| |
| String encoding = getCharsetFromHTML(inputStream); |
| try { |
| inputStream.close(); |
| } catch (IOException closeIOE) { |
| } |
| inputStream = ProxyUtil.getStream(url); |
| if (encoding != null) { |
| try { |
| htmlParser = new HTMLParser(new InputStreamReader(inputStream, |
| encoding)); |
| |
| } |
| catch (UnsupportedEncodingException uee) { |
| htmlParser = new HTMLParser(new InputStreamReader(inputStream)); |
| } |
| } |
| else { |
| htmlParser = new HTMLParser(new InputStreamReader(inputStream)); |
| } |
| htmlParser.parse(); |
| } |
| /** |
| * Releases resources (closes streams) |
| */ |
| public void closeDocument() { |
| if (inputStream != null) { |
| try { |
| inputStream.close(); |
| } catch (IOException closeIOE) { |
| } |
| } |
| } |
| public String getTitle() throws IOException { |
| if (htmlParser == null) { |
| throw new NullPointerException(); |
| } |
| try { |
| return htmlParser.getTitle(); |
| } catch (InterruptedException ie) { |
| return ""; //$NON-NLS-1$ |
| } |
| } |
| public String getSummary(String title) throws IOException { |
| try { |
| return htmlParser.getSummary(); |
| } catch (InterruptedException ie) { |
| return ""; //$NON-NLS-1$ |
| } |
| } |
| |
| public Reader getContentReader() throws IOException { |
| if (htmlParser == null) { |
| throw new NullPointerException(); |
| } |
| return htmlParser.getReader(); |
| } |
| /** |
| * Private. Parses HTML to extract document encoding specified in HTTP |
| * equivalent META tag in the document header. Example of such META tag is |
| * <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8"> |
| * |
| * @return String or null if encoding not found |
| */ |
| public static String getCharsetFromHTML(InputStream is) { |
| // Set up an ascii reader for the document (documents should not use |
| // other characters before encoding is defined) |
| Reader asciiReader = new ASCIIReader(is, MAX_OFFSET); |
| StreamTokenizer tokenizer = new StreamTokenizer(asciiReader); |
| |
| // tokenizer.eolIsSignificant(false);// default false |
| // tokenizer.slashSlashComments(false); // default false |
| // tokenizer.slashStarComments(false);// default false |
| tokenizer.lowerCaseMode(false); |
| |
| // tokenizer.quoteChar('\"'); // default quote char |
| tokenizer.ordinaryChar('\''); // default quote char |
| tokenizer.ordinaryChar('/'); // default comment character |
| |
| String charset = getCharsetFromHTMLTokens(tokenizer); |
| if (asciiReader != null) { |
| try { |
| asciiReader.close(); |
| } catch (IOException ioe) { |
| } |
| } |
| return charset; |
| } |
| public static String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) { |
| // keeps track of content attribute attribute until parsing |
| // of the meta tag is complete |
| String contentValue = null; |
| |
| // initialize states |
| int stateContent = STATE_HTTP_START; |
| int stateElement = STATE_ELEMENT_START; |
| int stateHttp = STATE_HTTP_START; |
| |
| try { |
| // in the worst case, process tokens until end of file |
| for (int token = tokenizer.nextToken(); token != StreamTokenizer.TT_EOF; token = tokenizer |
| .nextToken()) { |
| // debug tokens |
| // if (token == StreamTokenizer.TT_WORD) { |
| // System.out.println("word =" + tokenizer.sval); |
| // } else if (token == StreamTokenizer.TT_NUMBER) { |
| // System.out.println("number =" + tokenizer.nval); |
| // } else if (token == StreamTokenizer.TT_EOL) { |
| // System.out.println("endofline="); |
| // } else if ((char) token == '\"') { |
| // System.out.println("\" =" + tokenizer.sval); |
| // |
| // } else { |
| // System.out.println("else =" + (char) token); |
| // } |
| |
| // process input based depending on current state |
| switch (stateElement) { |
| case STATE_ELEMENT_START : |
| if (token == '<') { |
| stateElement = STATE_ELEMENT_AFTER_LT; |
| } // else do nothing, cannot be beginning of META tag |
| break; |
| case STATE_ELEMENT_AFTER_LT : |
| if (token == StreamTokenizer.TT_WORD) { |
| // some element opened |
| if (ELEMENT_META.equalsIgnoreCase(tokenizer.sval)) { |
| // META element opened |
| stateElement = STATE_ELEMENT_META; |
| // initialize state of attributes |
| stateHttp = STATE_HTTP_START; |
| stateContent = STATE_CONTENT_START; |
| contentValue = null; |
| } else if (ELEMENT_BODY |
| .equalsIgnoreCase(tokenizer.sval)) { |
| // body element opened, we are too far, stop |
| // processing input |
| return null; |
| } else { |
| // some other element opened, start from initial |
| // state |
| stateElement = STATE_ELEMENT_START; |
| } |
| } else if (token == '/') { |
| // can be begging of head closing |
| stateElement = STATE_ELEMENT_AFTER_LT_SLASH; |
| } else { |
| // not an element opened, could be openning of |
| // declaration |
| // or element closing e.t.c. |
| stateElement = STATE_ELEMENT_START; |
| } |
| break; |
| case STATE_ELEMENT_AFTER_LT_SLASH : |
| if (token == StreamTokenizer.TT_WORD |
| && ELEMENT_HEAD |
| .equalsIgnoreCase(tokenizer.sval)) { |
| // head element closed, we are too far, stop |
| // processing input |
| return null; |
| } |
| stateElement = STATE_ELEMENT_START; |
| break; |
| default : // STATE_META_IN : |
| switch (token) { |
| case '>' : |
| // no longer inside META, start from initial |
| // state |
| stateElement = STATE_ELEMENT_START; |
| break; |
| case StreamTokenizer.TT_WORD : |
| // string inside META tag, can be attribute name |
| if (ATTRIBUTE_HTTP |
| .equalsIgnoreCase(tokenizer.sval)) { |
| // found HTTP-EQUIV attribute name |
| stateHttp = STATE_HTTP_AFTER_NAME; |
| } else if (ATTRIBUTE_CONTENT |
| .equalsIgnoreCase(tokenizer.sval)) { |
| // found CONTENT attribute name |
| stateContent = STATE_CONTENT_AFTER_NAME; |
| } else if (stateHttp == STATE_HTTP_AFTER_EQ |
| && ATTRIBUTE_HTTP_VALUE |
| .equalsIgnoreCase(tokenizer.sval)) { |
| // value of HTTP-EQUIV attribute (unquoted) |
| // we found <META ... |
| // HTTP-EQUIV=content-type |
| stateHttp = STATE_HTTP_DONE; |
| } else { |
| // some other attribute name or string, |
| // reset states of seeked attributes, |
| // unless successfully processed earlier |
| if (stateHttp != STATE_HTTP_DONE) { |
| stateHttp = STATE_HTTP_START; |
| } |
| if (stateContent != STATE_CONTENT_DONE) { |
| stateContent = STATE_CONTENT_START; |
| } |
| } |
| break; |
| case '=' : |
| // = inside META tag, can separate interesing us |
| // attribute names from values |
| if (stateHttp == STATE_HTTP_AFTER_NAME) { |
| // we have HTTP-EQUIV= |
| stateHttp = STATE_HTTP_AFTER_EQ; |
| } else if (stateContent == STATE_CONTENT_AFTER_NAME) { |
| // we have CONTENT= |
| stateContent = STATE_CONTENT_AFTER_EQ; |
| } else { |
| // equal sign after some other attribute |
| // name or string, |
| // reset states of seeked attributes, |
| // unless successfully processed earlier |
| if (stateHttp != STATE_HTTP_DONE) { |
| stateHttp = STATE_HTTP_START; |
| } |
| if (stateContent != STATE_CONTENT_DONE) { |
| stateContent = STATE_CONTENT_START; |
| } |
| } |
| break; |
| case '\"' : |
| // quoted string inside META tag, can be |
| // attribute value |
| if (stateHttp == STATE_HTTP_AFTER_EQ) { |
| // value of HTTP-EQUIV attribute |
| if (ATTRIBUTE_HTTP_VALUE |
| .equalsIgnoreCase(tokenizer.sval)) { |
| // we found <META ... |
| // HTTP-EQUIV="content-type" |
| stateHttp = STATE_HTTP_DONE; |
| } |
| } else if (stateContent == STATE_CONTENT_AFTER_EQ) { |
| // value of CONTENT attribute |
| stateContent = STATE_CONTENT_DONE; |
| // save the value of the attribute |
| // if attribue HTTP-EQUIV="content-type" is |
| // found |
| // in the same META tag, this value might |
| // have |
| // Content-type entity header |
| contentValue = tokenizer.sval; |
| } else { |
| // value for the attribute is missing |
| // reset states of seeked attributes |
| stateHttp = STATE_HTTP_START; |
| stateContent = STATE_CONTENT_START; |
| } |
| break; |
| default : |
| // other unexpected token inside META tag |
| // reset states of seeked attributes, |
| // unless successfully processed earlier |
| if (stateHttp != STATE_HTTP_DONE) { |
| stateHttp = STATE_HTTP_START; |
| } |
| if (stateContent != STATE_CONTENT_DONE) { |
| stateContent = STATE_CONTENT_START; |
| } |
| break; |
| } |
| break; |
| } |
| if (contentValue != null && stateHttp == STATE_HTTP_DONE |
| && stateContent == STATE_CONTENT_DONE) { |
| // <META HTTP-EQUIV="content-type" CONTENT="*******" |
| // parse vale of content attribute to extract encoding |
| return getCharsetFromHTTP(contentValue); |
| } |
| |
| } |
| } catch (IOException ioe) { |
| return null; |
| } |
| // end of file |
| return null; |
| } |
| /** |
| * Parses HTTP1.1 Content-Type entity-header field for example, |
| * Content-Type: text/html; charset=ISO-8859-4, and extracts charset |
| * parameter value of the media sub type. |
| * |
| * @return value of charset parameter, for example ISO-8859-4 or null if |
| * parameter does not exist |
| */ |
| public static String getCharsetFromHTTP(String contentValue) { |
| StringTokenizer t = new StringTokenizer(contentValue, ";"); //$NON-NLS-1$ |
| while (t.hasMoreTokens()) { |
| String parameter = t.nextToken().trim(); |
| if (parameter.toLowerCase(Locale.ENGLISH).startsWith("charset=")) { //$NON-NLS-1$ |
| String charset = parameter |
| .substring("charset=".length()).trim(); //$NON-NLS-1$ |
| if (charset.length() > 0) { |
| return charset; |
| } |
| } |
| } |
| return null; |
| } |
| |
| public Exception getException() { |
| if (htmlParser != null) { |
| return htmlParser.getException(); |
| } |
| return null; |
| } |
| } |