| /******************************************************************************* |
| * Copyright (c) 2004, 2011 Tasktop Technologies and others. |
| * |
| * This program and the accompanying materials are made available under the |
| * terms of the Eclipse Public License v. 2.0 which is available at |
| * https://www.eclipse.org/legal/epl-2.0 |
| * |
| * SPDX-License-Identifier: EPL-2.0 |
| * |
| * Tasktop Technologies - initial API and implementation |
| *******************************************************************************/ |
| |
| package org.eclipse.mylyn.commons.net; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.net.URL; |
| import java.text.ParseException; |
| import java.util.HashMap; |
| import java.util.Locale; |
| |
| import org.apache.commons.lang.StringEscapeUtils; |
| |
| /** |
| * Parses HTML into tokens. |
| * |
| * @author Shawn Minto |
| * @since 2.0 |
| * @deprecated use org.eclipse.mylyn.commons.core.HtmlStreamTokenizer instead. |
| */ |
| @Deprecated |
| public class HtmlStreamTokenizer { |
| |
/** Current state of the tokenizer's state machine; tokenizing starts in {@code TEXT}. */
private State state = State.TEXT;

/** Buffered wrapper around the reader that supplies the HTML to tokenize. */
private final BufferedReader in;

/** Base URL passed to every {@code HtmlTag} that is created, for resolving relative URLs. */
private final URL base;

/** Collects the text of the token that is currently being read. */
private final StringBuffer textBuffer = new StringBuffer();

/** Collects the whitespace that precedes the token that is currently being read. */
private final StringBuffer whitespaceBuffer = new StringBuffer();

/**
 * Token handed back through <code>pushback</code>; it is returned (and cleared) by the next
 * <code>nextToken</code> call.
 */
private Token pushbackToken;

/**
 * Character that was read but turned out not to belong to the current token; <code>0</code> means that no
 * character is pending.
 */
private int pushbackChar = 0;

/** Quote character (single or double) that opened the quoted section currently being read inside a tag. */
private int quoteChar;

/** Client-selectable flag passed to the tag parser to control escaping of attribute values; on by default. */
private boolean escapeTagValues = true;

/**
 * Creates a tokenizer that reads HTML from the given reader. The reader is wrapped in a
 * {@link BufferedReader}.
 *
 * @param in
 *            reader for the HTML document to tokenize
 * @param base
 *            URL for resolving relative URLs
 */
public HtmlStreamTokenizer(Reader in, URL base) {
    this.in = new BufferedReader(in);
    this.base = base;
}
| |
| public void escapeTagAttributes(boolean value) { |
| escapeTagValues = value; |
| } |
| |
| /** |
| * Returns the next token from the stream. |
| */ |
| public Token nextToken() throws IOException, ParseException { |
| if (pushbackToken != null) { |
| Token token = pushbackToken; |
| pushbackToken = null; |
| return token; |
| } |
| |
| int closingComment = 0; |
| |
| textBuffer.setLength(0); |
| whitespaceBuffer.setLength(0); |
| do { |
| int ch; |
| if (pushbackChar != 0) { |
| ch = pushbackChar; |
| pushbackChar = 0; |
| } else { |
| ch = in.read(); |
| } |
| if (ch < 0) { |
| State oldState = state; |
| state = State.EOF; |
| if (textBuffer.length() > 0 && oldState == State.TEXT) { |
| return new Token(textBuffer, whitespaceBuffer, false); |
| } else { |
| return new Token(); |
| } |
| } |
| if (state == State.TEXT) { |
| if (ch == '<') { |
| state = State.TAG; |
| if (textBuffer.length() > 0) { |
| return new Token(textBuffer, whitespaceBuffer, false); |
| } |
| } else if (Character.isWhitespace((char) ch)) { |
| pushbackChar = ch; |
| state = State.WS; |
| if (textBuffer.length() > 0) { |
| return new Token(textBuffer, whitespaceBuffer, false); |
| } |
| } else { |
| textBuffer.append((char) ch); |
| } |
| } else if (state == State.WS) { |
| if (!Character.isWhitespace((char) ch)) { |
| pushbackChar = ch; |
| state = State.TEXT; |
| } else { |
| whitespaceBuffer.append((char) ch); |
| } |
| } else if (state == State.TAG) { |
| if (ch == '>') { |
| state = State.TEXT; |
| HtmlTag tag = new HtmlTag(base); |
| parseTag(textBuffer.toString(), tag, escapeTagValues); |
| return new Token(tag, whitespaceBuffer); |
| } |
| if (ch == '<' && textBuffer.length() == 0) { |
| textBuffer.append("<<"); //$NON-NLS-1$ |
| state = State.TEXT; |
| } else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-' |
| && textBuffer.charAt(0) == '!') { |
| textBuffer.setLength(0); |
| state = State.COMMENT; |
| } else if (ch == '\'' || ch == '"') { |
| quoteChar = ch; |
| textBuffer.append((char) ch); |
| state = State.TAG_QUOTE; |
| } else { |
| textBuffer.append((char) ch); |
| } |
| } else if (state == State.TAG_QUOTE) { |
| if (ch == '>') { |
| pushbackChar = ch; |
| state = State.TAG; |
| } else { |
| textBuffer.append((char) ch); |
| if (ch == quoteChar) { |
| state = State.TAG; |
| } |
| } |
| } else if (state == State.COMMENT) { |
| if (ch == '>' && closingComment >= 2) { |
| textBuffer.setLength(textBuffer.length() - 2); |
| closingComment = 0; |
| state = State.TEXT; |
| return new Token(textBuffer, whitespaceBuffer, true); |
| } |
| if (ch == '-') { |
| closingComment++; |
| } else { |
| closingComment = 0; |
| } |
| textBuffer.append((char) ch); |
| } |
| } while (true); |
| } |
| |
| /** |
| * Pushes the token back into the queue, to be returned by the subsequent call to <code>nextToken</code> |
| */ |
| public void pushback(Token token) { |
| pushbackToken = token; |
| } |
| |
| /** |
| * Parses an HTML tag out of a string of characters. |
| */ |
| private static void parseTag(String s, HtmlTag tag, boolean escapeValues) throws ParseException { |
| |
| int i = 0; |
| for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) { |
| // just move forward |
| } |
| if (i == s.length()) { |
| throw new ParseException("parse empty tag", 0); //$NON-NLS-1$ |
| } |
| |
| int start = i; |
| for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) { |
| // just move forward |
| } |
| tag.setTagName(s.substring(start, i)); |
| |
| for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) { |
| // just move forward |
| } |
| if (i == s.length()) { |
| return; |
| } else { |
| parseAttributes(tag, s, i, escapeValues); |
| return; |
| } |
| } |
| |
| /** |
| * parses HTML tag attributes from a buffer and sets them in an HtmlTag |
| */ |
| private static void parseAttributes(HtmlTag tag, String s, int i, boolean escapeValues) throws ParseException { |
| while (i < s.length()) { |
| // skip whitespace |
| while (i < s.length() && Character.isWhitespace(s.charAt(i))) { |
| i++; |
| } |
| |
| if (i == s.length()) { |
| return; |
| } |
| |
| // read the attribute name -- the rule might be looser than the RFC |
| // specifies: |
| // everything up to a space or an equal sign is included |
| int start = i; |
| for (; i < s.length() && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '='; i++) { |
| // just move forward |
| } |
| String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH); |
| |
| if (attributeName.equals("/")) { //$NON-NLS-1$ |
| tag.setSelfTerminating(true); |
| continue; |
| } |
| |
| for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) { |
| // just move forward |
| } |
| if (i == s.length() || s.charAt(i) != '=') { |
| // no attribute value |
| tag.setAttribute(attributeName, ""); //$NON-NLS-1$ |
| continue; |
| } |
| |
| // skip whitespace to the start of attribute value |
| for (i = i + 1; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) { |
| // just move forward |
| } |
| if (i == s.length()) { |
| return; |
| } |
| |
| // read the attribute value -- the rule for unquoted attribute value |
| // is |
| // looser than the one in Conolly's W3C 1996 lexical analyzer draft: |
| // everything |
| // is included up to the next space |
| String attributeValue; |
| if (s.charAt(i) == '"') { |
| start = ++i; |
| for (; i < s.length() && s.charAt(i) != '"'; i++) { |
| // just move forward |
| } |
| if (i == s.length()) { |
| return; // shouldn't happen if input returned by nextToken |
| } |
| if (escapeValues) { |
| attributeValue = unescape(s.substring(start, i)); |
| } else { |
| attributeValue = s.substring(start, i); |
| } |
| i++; |
| } else if (s.charAt(i) == '\'') { |
| start = ++i; |
| for (; i < s.length() && s.charAt(i) != '\''; i++) { |
| // just move forward |
| } |
| if (i == s.length()) { |
| return; // shouldn't happen if input returned by nextToken |
| } |
| attributeValue = unescape(s.substring(start, i)); |
| i++; |
| } else { |
| start = i; |
| for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) { |
| // just move forward |
| } |
| attributeValue = s.substring(start, i); |
| } |
| tag.setAttribute(attributeName, attributeValue); |
| } |
| } |
| |
| /** |
| * Returns a string with HTML escapes changed into their corresponding characters. |
| * |
| * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead |
| */ |
| @Deprecated |
| public static String unescape(String s) { |
| if (s.indexOf('&') == -1) { |
| return s; |
| } else { |
| StringBuffer sb = new StringBuffer(s); |
| unescape(sb); |
| return sb.toString(); |
| } |
| } |
| |
| /** |
| * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters. |
| * |
| * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead |
| */ |
| @Deprecated |
| public static StringBuffer unescape(StringBuffer sb) { |
| int i = 0; // index into the unprocessed section of the buffer |
| int j = 0; // index into the processed section of the buffer |
| |
| while (i < sb.length()) { |
| char ch = sb.charAt(i); |
| if (ch == '&') { |
| int start = i; |
| String escape = null; |
| for (i = i + 1; i < sb.length(); i++) { |
| ch = sb.charAt(i); |
| if (!Character.isLetterOrDigit(ch) && !(ch == '#' && i == (start + 1))) { |
| escape = sb.substring(start + 1, i); |
| break; |
| } |
| } |
| if (i == sb.length() && i != (start + 1)) { |
| escape = sb.substring(start + 1); |
| } |
| if (escape != null) { |
| Character character = parseReference(escape); |
| if (character != null |
| && !((0x0A == character || 0x0D == character || 0x09 == ch) |
| || (character >= 0x20 && character <= 0xD7FF) |
| || (character >= 0xE000 && character <= 0xFFFD) || (character >= 0x10000 && character <= 0x10FFFF))) { |
| // Character is an invalid xml character |
| // http://www.w3.org/TR/REC-xml/#charsets |
| character = null; |
| } |
| if (character != null) { |
| ch = character.charValue(); |
| } else { |
| // not an HTML escape; rewind |
| i = start; |
| ch = '&'; |
| } |
| } |
| } |
| sb.setCharAt(j, ch); |
| i++; |
| j++; |
| } |
| |
| sb.setLength(j); |
| return sb; |
| } |
| |
| /** |
| * Parses HTML character and entity references and returns the corresponding character. |
| */ |
| private static Character parseReference(String s) { |
| if (s.length() == 0) { |
| return null; |
| } |
| |
| if (s.charAt(0) == '#') { |
| // character reference |
| if (s.length() == 1) { |
| return null; |
| } |
| |
| try { |
| int value; |
| if (s.charAt(1) == 'x') { |
| // Hex reference |
| value = Integer.parseInt(s.substring(2), 16); |
| } else { |
| // Decimal reference |
| value = Integer.parseInt(s.substring(1)); |
| } |
| return new Character((char) value); |
| } catch (NumberFormatException e) { |
| return null; |
| } |
| } else { |
| return entities.get(s); |
| } |
| } |
| |
| /** |
| * Class for current token. |
| */ |
| public static class Token { |
| public static final Type EOF = new Type(); |
| |
| public static final Type TEXT = new Type(); |
| |
| public static final Type TAG = new Type(); |
| |
| public static final Type COMMENT = new Type(); |
| |
| /** token's type */ |
| private Type type; |
| |
| /** token's value */ |
| private final Object value; |
| |
| /** whitespace preceding the token */ |
| private final StringBuffer whitespace; |
| |
| /** |
| * Constructor for the EOF token. |
| */ |
| protected Token() { |
| type = EOF; |
| value = null; |
| whitespace = null; |
| } |
| |
| /** |
| * Constructor for the HTML tag tokens. |
| */ |
| protected Token(HtmlTag tag, StringBuffer whitespace) { |
| type = TAG; |
| value = tag; |
| this.whitespace = whitespace; |
| } |
| |
| /** |
| * Constructor for regular text and comments. |
| */ |
| protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) { |
| if (comment) { |
| type = COMMENT; |
| } else { |
| type = TEXT; |
| } |
| this.value = text; |
| this.whitespace = whitespace; |
| } |
| |
| /** |
| * Returns the token's type. |
| */ |
| public Type getType() { |
| return type; |
| } |
| |
| /** |
| * Returns the whitespace preceding the token. |
| */ |
| public StringBuffer getWhitespace() { |
| return whitespace; |
| } |
| |
| /** |
| * Returns the token's value. This is an HtmlTag for tokens of type <code>TAG</code> and a StringBuffer for |
| * tokens of type <code>TEXT</code> and <code>COMMENT</code>. For tokens of type <code>EOF</code>, the value is |
| * <code>null</code>. |
| */ |
| public Object getValue() { |
| return value; |
| } |
| |
| /** |
| * Returns the string representation of the token, including the preceding whitespace. |
| */ |
| @Override |
| public String toString() { |
| StringBuffer sb = new StringBuffer(); |
| if (whitespace != null) { |
| sb.append(whitespace); |
| } |
| if (value != null) { |
| if (type == TAG) { |
| // sb.append('<'); |
| } else if (type == COMMENT) { |
| sb.append("<!--"); //$NON-NLS-1$ |
| } |
| sb.append(value); |
| if (type == TAG) { |
| // if(value instanceof HtmlTag) { |
| // HtmlTag htmlTag = (HtmlTag)value; |
| // if(htmlTag.getTagName().startsWith("?xml")) { |
| // sb.append("?>"); |
| // } |
| // } else { |
| // sb.append('>'); |
| |
| } else if (type == COMMENT) { |
| sb.append("-->"); //$NON-NLS-1$ |
| } |
| |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Private enum class for token type. |
| */ |
| private static class Type { |
| private Type() { |
| // don't need to do anything |
| } |
| } |
| } |
| |
| /** |
| * Enum class for parser state. |
| */ |
| private static class State { |
| static final State EOF = new State(); |
| |
| static final State COMMENT = new State(); |
| |
| static final State TEXT = new State(); |
| |
| static final State TAG = new State(); |
| |
| static final State WS = new State(); |
| |
| static final State TAG_QUOTE = new State(); |
| |
| private State() { |
| // don't need to do anything |
| } |
| } |
| |
| /** names and values of HTML entity references */ |
| private static HashMap<String, Character> entities; |
| |
| /* |
| * Based on ISO 8879. |
| * |
| * Portions (c) International Organization for Standardization 1986 |
| * Permission to copy in any form is granted for use with conforming SGML |
| * systems and applications as defined in ISO 8879, provided this notice is |
| * included in all copies. |
| * |
| */ |
| static { |
| entities = new HashMap<String, Character>(); |
| entities.put("nbsp", Character.valueOf('\240')); // no-break //$NON-NLS-1$ |
| // space = |
| // non-breaking |
| // space |
| entities.put("iexcl", Character.valueOf('\241')); // inverted //$NON-NLS-1$ |
| // exclamation |
| // mark |
| entities.put("cent", Character.valueOf('\242')); // cent sign //$NON-NLS-1$ |
| entities.put("pound", Character.valueOf('\243')); // pound //$NON-NLS-1$ |
| // sign |
| entities.put("curren", Character.valueOf('\244')); // currency //$NON-NLS-1$ |
| // sign |
| entities.put("yen", Character.valueOf('\245')); // yen sign = //$NON-NLS-1$ |
| // yuan sign |
| entities.put("brvbar", Character.valueOf('\246')); // broken //$NON-NLS-1$ |
| // bar = |
| // broken |
| // vertical |
| // bar |
| entities.put("sect", Character.valueOf('\247')); // section //$NON-NLS-1$ |
| // sign |
| entities.put("uml", Character.valueOf('\250')); // diaeresis = //$NON-NLS-1$ |
| // spacing |
| // diaeresis |
| entities.put("copy", Character.valueOf('\251')); // copyright //$NON-NLS-1$ |
| // sign |
| entities.put("ordf", Character.valueOf('\252')); // feminine //$NON-NLS-1$ |
| // ordinal |
| // indicator |
| entities.put("laquo", Character.valueOf('\253')); // left-pointing //$NON-NLS-1$ |
| // double |
| // angle |
| // quotation |
| // mark = |
| // left |
| // pointing |
| // guillemet |
| entities.put("not", Character.valueOf('\254')); // not sign //$NON-NLS-1$ |
| entities.put("shy", Character.valueOf('\255')); // soft hyphen = //$NON-NLS-1$ |
| // discretionary |
| // hyphen |
| entities.put("reg", Character.valueOf('\256')); // registered //$NON-NLS-1$ |
| // sign = |
| // registered |
| // trade mark |
| // sign |
| entities.put("macr", Character.valueOf('\257')); // macron = //$NON-NLS-1$ |
| // spacing |
| // macron = |
| // overline |
| // = APL |
| // overbar |
| entities.put("deg", Character.valueOf('\260')); // degree sign //$NON-NLS-1$ |
| entities.put("plusmn", Character.valueOf('\261')); // plus-minus //$NON-NLS-1$ |
| // sign = |
| // plus-or-minus |
| // sign |
| entities.put("sup2", Character.valueOf('\262')); // superscript //$NON-NLS-1$ |
| // two = |
| // superscript |
| // digit two |
| // = squared |
| entities.put("sup3", Character.valueOf('\263')); // superscript //$NON-NLS-1$ |
| // three = |
| // superscript |
| // digit |
| // three = |
| // cubed |
| entities.put("acute", Character.valueOf('\264')); // acute //$NON-NLS-1$ |
| // accent = |
| // spacing |
| // acute |
| entities.put("micro", Character.valueOf('\265')); // micro //$NON-NLS-1$ |
| // sign |
| entities.put("para", Character.valueOf('\266')); // pilcrow //$NON-NLS-1$ |
| // sign = |
| // paragraph |
| // sign |
| entities.put("middot", Character.valueOf('\267')); // middle //$NON-NLS-1$ |
| // dot = |
| // Georgian |
| // comma = |
| // Greek |
| // middle |
| // dot |
| entities.put("cedil", Character.valueOf('\270')); // cedilla = //$NON-NLS-1$ |
| // spacing |
| // cedilla |
| entities.put("sup1", Character.valueOf('\271')); // superscript //$NON-NLS-1$ |
| // one = |
| // superscript |
| // digit one |
| entities.put("ordm", Character.valueOf('\272')); // masculine //$NON-NLS-1$ |
| // ordinal |
| // indicator |
| entities.put("raquo", Character.valueOf('\273')); // right-pointing //$NON-NLS-1$ |
| // double |
| // angle |
| // quotation |
| // mark = |
| // right |
| // pointing |
| // guillemet |
| entities.put("frac14", Character.valueOf('\274')); // vulgar //$NON-NLS-1$ |
| // fraction |
| // one |
| // quarter = |
| // fraction |
| // one |
| // quarter |
| entities.put("frac12", Character.valueOf('\275')); // vulgar //$NON-NLS-1$ |
| // fraction |
| // one half |
| // = |
| // fraction |
| // one half |
| entities.put("frac34", Character.valueOf('\276')); // vulgar //$NON-NLS-1$ |
| // fraction |
| // three |
| // quarters |
| // = |
| // fraction |
| // three |
| // quarters |
| entities.put("iquest", Character.valueOf('\277')); // inverted //$NON-NLS-1$ |
| // question |
| // mark = |
| // turned |
| // question |
| // mark |
| entities.put("Agrave", Character.valueOf('\300')); // latin //$NON-NLS-1$ |
| // capital |
| // letter A |
| // with |
| // grave = |
| // latin |
| // capital |
| // letter A |
| // grave |
| entities.put("Aacute", Character.valueOf('\301')); // latin //$NON-NLS-1$ |
| // capital |
| // letter A |
| // with |
| // acute |
| entities.put("Acirc", Character.valueOf('\302')); // latin //$NON-NLS-1$ |
| // capital |
| // letter A |
| // with |
| // circumflex |
| entities.put("Atilde", Character.valueOf('\303')); // latin //$NON-NLS-1$ |
| // capital |
| // letter A |
| // with |
| // tilde |
| entities.put("Auml", Character.valueOf('\304')); // latin //$NON-NLS-1$ |
| // capital |
| // letter A |
| // with |
| // diaeresis |
| entities.put("Aring", Character.valueOf('\305')); // latin //$NON-NLS-1$ |
| // capital |
| // letter A |
| // with ring |
| // above = |
| // latin |
| // capital |
| // letter A |
| // ring |
| entities.put("AElig", Character.valueOf('\306')); // latin //$NON-NLS-1$ |
| // capital |
| // letter AE |
| // = latin |
| // capital |
| // ligature |
| // AE |
| entities.put("Ccedil", Character.valueOf('\307')); // latin //$NON-NLS-1$ |
| // capital |
| // letter C |
| // with |
| // cedilla |
| entities.put("Egrave", Character.valueOf('\310')); // latin //$NON-NLS-1$ |
| // capital |
| // letter E |
| // with |
| // grave |
| entities.put("Eacute", Character.valueOf('\311')); // latin //$NON-NLS-1$ |
| // capital |
| // letter E |
| // with |
| // acute |
| entities.put("Ecirc", Character.valueOf('\312')); // latin //$NON-NLS-1$ |
| // capital |
| // letter E |
| // with |
| // circumflex |
| entities.put("Euml", Character.valueOf('\313')); // latin //$NON-NLS-1$ |
| // capital |
| // letter E |
| // with |
| // diaeresis |
| entities.put("Igrave", Character.valueOf('\314')); // latin //$NON-NLS-1$ |
| // capital |
| // letter I |
| // with |
| // grave |
| entities.put("Iacute", Character.valueOf('\315')); // latin //$NON-NLS-1$ |
| // capital |
| // letter I |
| // with |
| // acute |
| entities.put("Icirc", Character.valueOf('\316')); // latin //$NON-NLS-1$ |
| // capital |
| // letter I |
| // with |
| // circumflex |
| entities.put("Iuml", Character.valueOf('\317')); // latin //$NON-NLS-1$ |
| // capital |
| // letter I |
| // with |
| // diaeresis |
| entities.put("ETH", Character.valueOf('\320')); // latin capital //$NON-NLS-1$ |
| // letter ETH |
| entities.put("Ntilde", Character.valueOf('\321')); // latin //$NON-NLS-1$ |
| // capital |
| // letter N |
| // with |
| // tilde |
| entities.put("Ograve", Character.valueOf('\322')); // latin //$NON-NLS-1$ |
| // capital |
| // letter O |
| // with |
| // grave |
| entities.put("Oacute", Character.valueOf('\323')); // latin //$NON-NLS-1$ |
| // capital |
| // letter O |
| // with |
| // acute |
| entities.put("Ocirc", Character.valueOf('\324')); // latin //$NON-NLS-1$ |
| // capital |
| // letter O |
| // with |
| // circumflex |
| entities.put("Otilde", Character.valueOf('\325')); // latin //$NON-NLS-1$ |
| // capital |
| // letter O |
| // with |
| // tilde |
| entities.put("Ouml", Character.valueOf('\326')); // latin //$NON-NLS-1$ |
| // capital |
| // letter O |
| // with |
| // diaeresis |
| entities.put("times", Character.valueOf('\327')); // multiplication //$NON-NLS-1$ |
| // sign |
| entities.put("Oslash", Character.valueOf('\330')); // latin //$NON-NLS-1$ |
| // capital |
| // letter O |
| // with |
| // stroke = |
| // latin |
| // capital |
| // letter O |
| // slash |
| entities.put("Ugrave", Character.valueOf('\331')); // latin //$NON-NLS-1$ |
| // capital |
| // letter U |
| // with |
| // grave |
| entities.put("Uacute", Character.valueOf('\332')); // latin //$NON-NLS-1$ |
| // capital |
| // letter U |
| // with |
| // acute |
| entities.put("Ucirc", Character.valueOf('\333')); // latin //$NON-NLS-1$ |
| // capital |
| // letter U |
| // with |
| // circumflex |
| entities.put("Uuml", Character.valueOf('\334')); // latin //$NON-NLS-1$ |
| // capital |
| // letter U |
| // with |
| // diaeresis |
| entities.put("Yacute", Character.valueOf('\335')); // latin //$NON-NLS-1$ |
| // capital |
| // letter Y |
| // with |
| // acute |
| entities.put("THORN", Character.valueOf('\336')); // latin //$NON-NLS-1$ |
| // capital |
| // letter |
| // THORN |
| entities.put("szlig", Character.valueOf('\337')); // latin //$NON-NLS-1$ |
| // small |
| // letter |
| // sharp s = |
| // ess-zed |
| entities.put("agrave", Character.valueOf('\340')); // latin //$NON-NLS-1$ |
| // small |
| // letter a |
| // with |
| // grave = |
| // latin |
| // small |
| // letter a |
| // grave |
| entities.put("aacute", Character.valueOf('\341')); // latin //$NON-NLS-1$ |
| // small |
| // letter a |
| // with |
| // acute |
| entities.put("acirc", Character.valueOf('\342')); // latin //$NON-NLS-1$ |
| // small |
| // letter a |
| // with |
| // circumflex |
| entities.put("atilde", Character.valueOf('\343')); // latin //$NON-NLS-1$ |
| // small |
| // letter a |
| // with |
| // tilde |
| entities.put("auml", Character.valueOf('\344')); // latin //$NON-NLS-1$ |
| // small |
| // letter a |
| // with |
| // diaeresis |
| entities.put("aring", Character.valueOf('\345')); // latin //$NON-NLS-1$ |
| // small |
| // letter a |
| // with ring |
| // above = |
| // latin |
| // small |
| // letter a |
| // ring |
| entities.put("aelig", Character.valueOf('\346')); // latin //$NON-NLS-1$ |
| // small |
| // letter ae |
| // = latin |
| // small |
| // ligature |
| // ae |
| entities.put("ccedil", Character.valueOf('\347')); // latin //$NON-NLS-1$ |
| // small |
| // letter c |
| // with |
| // cedilla |
| entities.put("egrave", Character.valueOf('\350')); // latin //$NON-NLS-1$ |
| // small |
| // letter e |
| // with |
| // grave |
| entities.put("eacute", Character.valueOf('\351')); // latin //$NON-NLS-1$ |
| // small |
| // letter e |
| // with |
| // acute |
| entities.put("ecirc", Character.valueOf('\352')); // latin //$NON-NLS-1$ |
| // small |
| // letter e |
| // with |
| // circumflex |
| entities.put("euml", Character.valueOf('\353')); // latin //$NON-NLS-1$ |
| // small |
| // letter e |
| // with |
| // diaeresis |
| entities.put("igrave", Character.valueOf('\354')); // latin //$NON-NLS-1$ |
| // small |
| // letter i |
| // with |
| // grave |
| entities.put("iacute", Character.valueOf('\355')); // latin //$NON-NLS-1$ |
| // small |
| // letter i |
| // with |
| // acute |
| entities.put("icirc", Character.valueOf('\356')); // latin //$NON-NLS-1$ |
| // small |
| // letter i |
| // with |
| // circumflex |
| entities.put("iuml", Character.valueOf('\357')); // latin //$NON-NLS-1$ |
| // small |
| // letter i |
| // with |
| // diaeresis |
| entities.put("eth", Character.valueOf('\360')); // latin small //$NON-NLS-1$ |
| // letter eth |
| entities.put("ntilde", Character.valueOf('\361')); // latin //$NON-NLS-1$ |
| // small |
| // letter n |
| // with |
| // tilde |
| entities.put("ograve", Character.valueOf('\362')); // latin //$NON-NLS-1$ |
| // small |
| // letter o |
| // with |
| // grave |
| entities.put("oacute", Character.valueOf('\363')); // latin //$NON-NLS-1$ |
| // small |
| // letter o |
| // with |
| // acute |
| entities.put("ocirc", Character.valueOf('\364')); // latin //$NON-NLS-1$ |
| // small |
| // letter o |
| // with |
| // circumflex |
| entities.put("otilde", Character.valueOf('\365')); // latin //$NON-NLS-1$ |
| // small |
| // letter o |
| // with |
| // tilde |
| entities.put("ouml", Character.valueOf('\366')); // latin //$NON-NLS-1$ |
| // small |
| // letter o |
| // with |
| // diaeresis |
| entities.put("divide", Character.valueOf('\367')); // division //$NON-NLS-1$ |
| // sign |
| entities.put("oslash", Character.valueOf('\370')); // latin //$NON-NLS-1$ |
| // small |
| // letter o |
| // with |
| // stroke = |
| // latin |
| // small |
| // letter o |
| // slash |
| entities.put("ugrave", Character.valueOf('\371')); // latin //$NON-NLS-1$ |
| // small |
| // letter u |
| // with |
| // grave |
| entities.put("uacute", Character.valueOf('\372')); // latin //$NON-NLS-1$ |
| // small |
| // letter u |
| // with |
| // acute |
| entities.put("ucirc", Character.valueOf('\373')); // latin //$NON-NLS-1$ |
| // small |
| // letter u |
| // with |
| // circumflex |
| entities.put("uuml", Character.valueOf('\374')); // latin //$NON-NLS-1$ |
| // small |
| // letter u |
| // with |
| // diaeresis |
| entities.put("yacute", Character.valueOf('\375')); // latin //$NON-NLS-1$ |
| // small |
| // letter y |
| // with |
| // acute |
| entities.put("thorn", Character.valueOf('\376')); // latin //$NON-NLS-1$ |
| // small |
| // letter |
| // thorn |
| entities.put("yuml", Character.valueOf('\377')); // latin //$NON-NLS-1$ |
| // small |
| // letter y |
| // with |
| // diaeresis |
| |
| // Special characters |
| entities.put("quot", Character.valueOf('\42')); // quotation //$NON-NLS-1$ |
| // mark = APL |
| // quote |
| entities.put("amp", Character.valueOf('\46')); // ampersand //$NON-NLS-1$ |
| entities.put("lt", Character.valueOf('\74')); // less-than //$NON-NLS-1$ |
| // sign |
| entities.put("gt", Character.valueOf('\76')); // greater-than //$NON-NLS-1$ |
| // sign |
| // Latin Extended-A |
| entities.put("OElig", Character.valueOf('\u0152')); // latin //$NON-NLS-1$ |
| // capital |
| // ligature |
| // OE |
| entities.put("oelig", Character.valueOf('\u0153')); // latin //$NON-NLS-1$ |
| // small |
| // ligature |
| // oe, |
| // ligature |
| // is a |
| // misnomer, |
| // this is a |
| // separate |
| // character |
| // in some |
| // languages |
| entities.put("Scaron", Character.valueOf('\u0160')); // latin //$NON-NLS-1$ |
| // capital |
| // letter |
| // S |
| // with |
| // caron |
| entities.put("scaron", Character.valueOf('\u0161')); // latin //$NON-NLS-1$ |
| // small |
| // letter |
| // s |
| // with |
| // caron |
| entities.put("Yuml", Character.valueOf('\u0178')); // latin //$NON-NLS-1$ |
| // capital |
| // letter Y |
| // with |
| // diaeresis |
| // Spacing Modifier Letters |
| entities.put("circ", Character.valueOf('\u02c6')); // modifier //$NON-NLS-1$ |
| // letter |
| // circumflex |
| // accent |
| entities.put("tilde", Character.valueOf('\u02dc')); // small //$NON-NLS-1$ |
| // tilde |
| // General punctuation |
| entities.put("ensp", Character.valueOf('\u2002')); // en space //$NON-NLS-1$ |
| entities.put("emsp", Character.valueOf('\u2003')); // em space //$NON-NLS-1$ |
| entities.put("thinsp", Character.valueOf('\u2009')); // thin //$NON-NLS-1$ |
| // space |
| entities.put("zwnj", Character.valueOf('\u200c')); // zero //$NON-NLS-1$ |
| // width |
| // non-joiner |
| entities.put("zwj", Character.valueOf('\u200d')); // zero //$NON-NLS-1$ |
| // width |
| // joiner |
| entities.put("lrm", Character.valueOf('\u200e')); // left-to-right //$NON-NLS-1$ |
| // mark |
| entities.put("rlm", Character.valueOf('\u200f')); // right-to-left //$NON-NLS-1$ |
| // mark |
| entities.put("ndash", Character.valueOf('\u2013')); // en dash //$NON-NLS-1$ |
| entities.put("mdash", Character.valueOf('\u2014')); // em dash //$NON-NLS-1$ |
| entities.put("lsquo", Character.valueOf('\u2018')); // left //$NON-NLS-1$ |
| // single |
| // quotation |
| // mark |
| entities.put("rsquo", Character.valueOf('\u2019')); // right //$NON-NLS-1$ |
| // single |
| // quotation |
| // mark |
| entities.put("sbquo", Character.valueOf('\u201a')); // single //$NON-NLS-1$ |
| // low-9 |
| // quotation |
| // mark |
| entities.put("ldquo", Character.valueOf('\u201c')); // left //$NON-NLS-1$ |
| // double |
| // quotation |
| // mark |
| entities.put("rdquo", Character.valueOf('\u201d')); // right //$NON-NLS-1$ |
| // double |
| // quotation |
| // mark |
| entities.put("bdquo", Character.valueOf('\u201e')); // double //$NON-NLS-1$ |
| // low-9 |
| // quotation |
| // mark |
| entities.put("dagger", Character.valueOf('\u2020')); // dagger //$NON-NLS-1$ |
| entities.put("Dagger", Character.valueOf('\u2021')); // double //$NON-NLS-1$ |
| // dagger |
| entities.put("permil", Character.valueOf('\u2030')); // per //$NON-NLS-1$ |
| // mille |
| // sign |
| entities.put("lsaquo", Character.valueOf('\u2039')); // single //$NON-NLS-1$ |
| // left-pointing |
| // angle |
| // quotation |
| // mark, |
| // not |
| // yet |
| // standardized |
| entities.put("rsaquo", Character.valueOf('\u203a')); // single //$NON-NLS-1$ |
| // right-pointing |
| // angle |
| // quotation |
| // mark, |
| // not |
| // yet |
| // standardized |
| entities.put("euro", Character.valueOf('\u20ac')); // euro sign //$NON-NLS-1$ |
| } |
| } |