blob: bde86af1384710442fd86e070e65ec4cbc9144f6 [file] [log] [blame]
* Copyright (c) 1998, 2008 IBM Corporation and Others
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* Contributors:
* Goh KONDOH - initial API and implementation
package org.eclipse.actf.model.internal.dom.sgml.impl;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import org.eclipse.actf.model.dom.html.IErrorHandler;
import org.eclipse.actf.model.dom.html.IErrorLogListener;
import org.eclipse.actf.model.dom.html.IParser;
import org.eclipse.actf.model.dom.html.IParserError;
import org.eclipse.actf.model.dom.html.ParseException;
import org.eclipse.actf.model.internal.dom.sgml.ISGMLConstants;
import org.eclipse.actf.model.internal.dom.sgml.ISGMLParser;
import org.eclipse.actf.model.internal.dom.sgml.errorhandler.AttributeValueErrorHandler;
import org.eclipse.actf.model.internal.dom.sgml.errorhandler.DefaultErrorHandler;
import org.eclipse.actf.model.internal.dom.sgml.errorhandler.ITokenErrorHandler;
import org.eclipse.actf.model.internal.dom.sgml.modelgroup.IModelGroup;
import org.eclipse.actf.model.internal.dom.sgml.modelgroup.AndModelGroup.AndContext;
import org.w3c.dom.Attr;
import org.w3c.dom.CDATASection;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.xml.sax.DocumentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
* Pure SGML parser. Base class of a parser which parses an SGML-derived markup
* language. Language-dependent customizations have to be provided by a subclass.
public class SGMLParser implements ISGMLConstants, ISGMLParser {
// Set by setDTD(): true when the DTD's string form contains "XHTML".
private boolean isXHTML = false;
// Read together with isEndWithSlash by node() to decide whether to synthesize an end tag.
private boolean isEmptyElement = true;
// Cleared by node() right before it synthesizes an EndTag for currentTagName.
private boolean isEndWithSlash = false;
// Tag name used by node() when it synthesizes an EndTag.
private String currentTagName = "";
// DOM implementation used by createDocument(); installed via setDOMImplementation().
private DOMImplementation domImpl;
// Parameter types used for the reflective lookup of "createDocument" in setDOMImplementation().
private static Class createDocumentMethodParamTypes[] = { String.class,
String.class, DocumentType.class };
* @return <code>null</code> if failed.
public DOMImplementation setDOMImplementation(DOMImplementation domImpl) {
Class domImpleInterface = DOMImplementation.class;
try {
java.lang.reflect.Method createDocumentMethod = domImpleInterface
.getMethod("createDocument", createDocumentMethodParamTypes);
if (createDocumentMethod != null) {
this.domImpl = domImpl;
doc = null;
return domImpl;
} catch (Exception e) {
return null;
* @return A DOMImplementation instace this parser users.
public DOMImplementation getDOMImplementation() {
return this.domImpl;
* Constructs this instance; a {@link DefaultErrorHandler} instance is
* added.
public SGMLParser() {
// Register the default node-level and token-level error handlers.
addErrorHandler(new DefaultErrorHandler());
addTokenErrorHandler(new AttributeValueErrorHandler());
// Without a DOM implementation, fall back to the parser's own document class.
// NOTE(review): setDocument is declared outside this view; confirm what it resets.
if (getDOMImplementation() == null) {
setDocument(new SGMLDocument());
// Definition used for elements unknown to the DTD; its content model accepts anything.
anonymousElementDef = new ElementDefinition("ANONYMOUS", any);
// Node-level error handlers; only the first errorHandlerNum slots are in use.
public IErrorHandler errorHandlers[] = new IErrorHandler[8];
// Token-level error handlers; only the first tokenErrorHandlerNum slots are in use.
private ITokenErrorHandler tokenErrorHandlers[] = new ITokenErrorHandler[8];
// Number of registered node-level error handlers.
public int errorHandlerNum = 0;
// Number of registered token-level error handlers.
private int tokenErrorHandlerNum = 0;
* Sets an error handler that recovers error.
* @param errorHandler
* @deprecated {@link #addErrorHandler(org.eclipse.actf.model.dom.html.IErrorHandler)}
public void setErrorHandler(IErrorHandler errorHandler) {
* Adds an ErrorHandler instance. An errorHandler added later is invoked
* earlier by this parser instance than errorHandlers added earlier. If one
* errorHandler handles error (eg. returns <code>
* true</code>), no more
* errorHandlers are invoked.
* @param errorHandler
* errorHandler instance to be added to this parser
public void addErrorHandler(IErrorHandler errorHandler) {
if (errorHandlerNum == errorHandlers.length) {
IErrorHandler newErrorHandlers[] = new IErrorHandler[errorHandlers.length * 2];
for (int i = 0; i < errorHandlerNum; i++) {
newErrorHandlers[i] = errorHandlers[i];
errorHandlers = newErrorHandlers;
this.errorHandlers[errorHandlerNum++] = errorHandler;
* Adds an ErrorHandler instance. An errorHandler added later is invoked
* earlier by this parser instance than errorHandlers added earlier. If one
* errorHandler handles error (eg. returns <code>
* true</code>), no more
* errorHandlers are invoked.
* @param errorHandler
* errorHandler instance to be added to this parser
public void addTokenErrorHandler(ITokenErrorHandler errorHandler) {
if (tokenErrorHandlerNum == tokenErrorHandlers.length) {
ITokenErrorHandler newErrorHandlers[] = new ITokenErrorHandler[errorHandlers.length * 2];
for (int i = 0; i < tokenErrorHandlerNum; i++) {
newErrorHandlers[i] = tokenErrorHandlers[i];
tokenErrorHandlers = newErrorHandlers;
this.tokenErrorHandlers[tokenErrorHandlerNum++] = errorHandler;
* Gets node-level error handlers included in this parser
* @return error handlers
public IErrorHandler[] getErrorHandlers() {
IErrorHandler ret[] = new IErrorHandler[errorHandlerNum];
for (int i = 0; i < errorHandlerNum; i++) {
ret[i] = errorHandlers[i];
return ret;
* Gets token-level error handlers included in this parser.
public ITokenErrorHandler[] getTokenErrorHandlers() {
ITokenErrorHandler ret[] = new ITokenErrorHandler[errorHandlerNum];
for (int i = 0; i < tokenErrorHandlerNum; i++) {
ret[i] = tokenErrorHandlers[i];
return ret;
* Removes a node-level error handler.
* @param errorHandler
* error handler to remove
public void removeErrorHandler(IErrorHandler errorHandler) {
for (int i = 0; i < errorHandlerNum; i++) {
if (errorHandlers[i] == errorHandler) {
errorHandlers[i] = null;
for (i++; i < errorHandlerNum; i++) {
errorHandlers[i - 1] = errorHandlers[i];
* Removes a token-level error handler.
* @param errorHandler
* error handler to remove
public void removeTokenErrorHandler(ITokenErrorHandler errorHandler) {
for (int i = 0; i < tokenErrorHandlerNum; i++) {
if (tokenErrorHandlers[i] == errorHandler) {
tokenErrorHandlers[i] = null;
for (i++; i < tokenErrorHandlerNum; i++) {
tokenErrorHandlers[i - 1] = tokenErrorHandlers[i];
* This variable is for debugging.
// Compile-time debug switch; guards the DEBUG_ENDTAG / DEBUG_STARTTAG traces in readInstances2().
public static final boolean _DEBUG = false;
// Whether number entities are extracted; see extractNumEntity().
private boolean extractNum = true;
// Whether character entities are extracted; see extractCharEntity().
private boolean extractChar = true;
* Determines if this parser extracts both character and number entities or
* not. Default value is <code>true</code>.
public void extractEntity(boolean b) {
this.extractChar = this.extractNum = b;
* Sets if it parses and extracts number entities or not. By default, it
* parses and number character entities.
* @param b
* if true, extracts number entities.
public void extractNumEntity(boolean b) {
this.extractNum = b;
* Checks if it parses and extracts number entities or not. By default, it
* parses and extracts number entities.
* @return true if extracts number entities. Otherwise false
public boolean extractNumEntity() {
return this.extractNum;
* Sets if it parses and extracts character entities or not. By default, it
* parses and extracts character entities.
* @param b
* if true, extracts character entities.
public void extractCharEntity(boolean b) {
this.extractChar = b;
* Checks if it parses and extracts character entities or not. By default,
* it parses and extracts character entities.
* @return true if extracts character entities. Otherwise false
public boolean extractCharEntity() {
return extractChar;
// Tokenizer for the stream being parsed; created in parse().
public InsTokenizer tokenizer;
// Document being built; reset to null by setDOMImplementation() and created lazily in parse().
private Document doc = null;
* Public entities are stored in this variable. Keys are id and values are
* file name. Both are instances of <code>java.lang.String</code>. For
* example, for the key "-//W3C//DTD HTML 4.0 Transitional//EN", the value
* is "loose.dtd"
public static Hashtable<String, String> pubEntityMap = new Hashtable<String, String>();
* Gets public entity map.
* @return Hashtable instance whose keys are public id and values are file
* name.
public static Hashtable<String, String> getPublicEntityMap() {
return pubEntityMap;
* Top element's ElementDefinition.
// Assigned in setupDTD() from the default top element; readDocType() rejects a second DOCTYPE once it is set.
public ElementDefinition lastDef = null;
static IModelGroup pcdata = new IModelGroup() {
public boolean match(ISGMLParser parser, Node parent, Node child) {
if (child instanceof Text && !(child instanceof CDATASection)) {
return true;
} else {
return false;
public boolean optional() {
return false;
public void refer(boolean infinite) {
public String toString() {
return "#PCDATA";
public boolean match(int number) {
return true;
public boolean[] rehash(int totalSize) {
boolean ret[] = new boolean[totalSize];
ret[totalSize - 2] = true;
return ret;
static IModelGroup cdata = new IModelGroup() {
public boolean match(ISGMLParser parser, Node parent, Node child) {
if (child instanceof CDATASection) {
return true;
} else {
return false;
public boolean optional() {
return false;
public void refer(boolean infinite) {
public String toString() {
return "CDATA";
public boolean match(int number) {
return false;
public boolean[] rehash(int totalSize) {
return null;
static IModelGroup empty = new IModelGroup() {
public boolean match(ISGMLParser parser, Node parent, Node child) {
return false;
public boolean optional() {
return true;
public void refer(boolean infinite) {
public String toString() {
return "EMPTY";
public boolean match(int number) {
return false;
public boolean[] rehash(int totalSize) {
return null;
static IModelGroup any = new IModelGroup() {
public boolean match(ISGMLParser parser, Node parent, Node child) {
return true;
public boolean optional() {
return true;
public void refer(boolean infinite) {
public boolean match(int number) {
return true;
public boolean[] rehash(int totalSize) {
return null;
public ElementDefinition anonymousElementDef;
* Gets a element definition for undefined elements.
public ElementDefinition getAnonymousElementDefinition() {
return anonymousElementDef;
public SGMLDocTypeDef dtd;
* Gets DTD that defines this Document's syntax.
* @return DTD that defines this Document's syntax.
public SGMLDocTypeDef getDTD() {
return this.dtd;
* Sets DTD that defines this Document's syntax.
* @param dtd
* DTD that defines this Document's syntax.
public final void setDTD(SGMLDocTypeDef dtd) {
this.dtd = dtd;
// Propagate the DTD to the document when it is the parser's own document class.
if (doc instanceof SGMLDocument) {
((SGMLDocument) doc).setDTD(dtd);
// XHTML is detected by looking for "XHTML" in the DTD's string form.
// NOTE(review): dtd is dereferenced here without a null check -- confirm callers never pass null.
isXHTML = (dtd.toString().indexOf("XHTML") > -1);
public Hashtable<Node, AndContext> andMap = new Hashtable<Node, AndContext>();
public Hashtable<Node, AndContext> getAndMap() {
return andMap;
public Hashtable<Node, Integer> seqMap = new Hashtable<Node, Integer>();
public Hashtable<Node, Integer> getSeqMap() {
return seqMap;
public Hashtable<Node, Node> plusMap = new Hashtable<Node, Node>();
public Hashtable<Node, Node> getPlusMap() {
return plusMap;
public void clearContextMap(Node parent) {
// Reads one attribute (name and optional "=value") from the tokenizer and
// returns it as a DOM Attr, optionally recording it in attrlist.
// NOTE(review): this view of the method has lost lines (closing braces, the
// head of an error(...) call, the tail of an addAttribute(...) call); the
// comments below describe only what is visible.
private Attr attribute(ElementDefinition ed, AttributeListImpl attrlist)
throws IOException, ParseException, SAXException {
Attr ret = null;
// An attribute must start with a name token; otherwise ask the error handlers.
if (tokenizer.nextToken() != NAME_CHAR) {
if (handleError(IParserError.TAG_NAME, tokenizer.sval)) {
// A handler recovered: retry reading an attribute.
return attribute(ed, attrlist);
} else {
String attName = changeAttrNameCase(tokenizer.sval);
// Look up the declared attribute; null when there is no element definition.
AttributeDefinition ad = ed != null ? ed.getAttributeDef(attName)
: null;
// A value-less attribute defaults to its own name as the value.
String attValue = attName;
if (tokenizer.nextToken() == EQ) {
attValue = tokenizer.readAttributeValue(ad, ed);
ret = doc.createAttribute(attName);
} else {
ret = doc.createAttribute(attName);
// Undeclared attribute: record it as CDATA, and report it unless the
// element itself is unknown/anonymous.
if (ad == null) {
if (attrlist != null) {
attrlist.addAttribute(attName, "CDATA", attValue);
if (ed != null && ed != anonymousElementDef) {
if (handleError(IParserError.ILLEGAL_ATTRIBUTE, ret)) {
ret = attribute(ed, attrlist);
} else {
"Illegal attribute '" + attName + "' for "
+ ed.getName());
} else if (attrlist != null) {
attrlist.addAttribute(attName, ad.getDeclaredTypeStr(),
return ret;
* Records an error. Does nothing by default. If it has ErrorLogListener
* instances, calls their {@link IErrorLogListener#errorLog(int,String)}
* method.
* @param code
* error code.
* @param msg
* message of the error.
public final void error(int code, String msg) {
for (int i = 0; i < errorLogListenerNum; i++) {
if (tokenizer != null) {
errorLogListeners[i].errorLog(code, tokenizer.getCurrentLine()
+ ": " + msg);
} else {
errorLogListeners[i].errorLog(code, msg);
// Registered error log listeners; only the first errorLogListenerNum slots are in use.
public IErrorLogListener errorLogListeners[] = new IErrorLogListener[8];
// Number of registered error log listeners.
public int errorLogListenerNum = 0;
* Adds an error log listerner. Listener is invoked when error is occured.
public void addErrorLogListener(IErrorLogListener listener) {
if (errorLogListenerNum == errorLogListeners.length) {
IErrorLogListener newListeners[] = new IErrorLogListener[errorLogListenerNum * 2];
for (int i = 0; i < errorLogListenerNum; i++) {
newListeners[i] = errorLogListeners[i];
errorLogListeners = newListeners;
errorLogListeners[errorLogListenerNum++] = listener;
* Removes an error log listener.
* @param lister
* error log listener to remove.
public void removeErrorLogListener(IErrorLogListener listener) {
for (int i = 0; i < errorLogListenerNum; i++) {
if (errorLogListeners[i] == listener) {
errorLogListeners[i] = null;
for (i++; i < errorLogListenerNum; i++) {
errorLogListeners[i - 1] = errorLogListeners[i];
// Reads an end tag after ETAGO and returns it as an EndTag, or null when it
// cannot be used.
// NOTE(review): this view has lost lines (closing braces and possibly
// statements); the comments describe only what is visible.
private EndTag etag() throws IOException, ParseException, SAXException {
String tagName;
if (tokenizer.nextToken() == NAME_CHAR) {
tagName = changeTagCase(tokenizer.sval);
// Skip everything up to the tag close (or end of input).
while (tokenizer.nextToken() != TAGC && tokenizer.ttype != EOF) {
EndTag et = new EndTag(tagName);
currentNode = et;
if (docHandler != null && !eHandleLogical) {
ElementDefinition ed = dtd.getElementDefinition(tagName);
if (ed != null) {
// Known element: remember its number for end-tag matching.
lastElementNumber = ed.number;
return et;
} else if (keepUnknowns) {
// Unknown element that is kept: treat it as the anonymous element.
lastElementDef = anonymousElementDef;
lastElementNumber = pcdataNumber + 1;
return et;
} else if (handleError(IParserError.UNKNOWN_ELEMENT, et)) {
return null;
return null;
* (non-Javadoc)
* @see org.eclipse.actf.model.dom.sgml.impl.ISGMLParser#getDocument()
public Document getDocument() {
return doc;
* push back buffer size
// Capacity of the push-back buffer; also sizes forwardPath, ancesters and ancesterElementDefs.
private static final int BUF_SIZ = 256;
// Number of nodes currently waiting in buf.
public int bufCount = 0;
// Push-back stack: pushBackNode() stores at bufCount, getNode() pops from the top.
public Node buf[] = new Node[BUF_SIZ];
* Gets a Node or {@link EndTag}from a currently reading stream as a result
* of tokenizing.
* @return {@link org.w3c.dom.Node Node}or {@link EndTag}
* @exception ParseException
* @exception IOException
* @see #pushBackNode(org.w3c.dom.Node)
public Node getNode() throws ParseException, IOException, SAXException {
if (bufCount == 0) {
return node();
} else {
return buf[--bufCount];
* Pushes back a node to this parser.
* @param node
* node to be pushed back.
* @see #getNode()
public void pushBackNode(Node node) {
buf[bufCount++] = node;
if (node instanceof Element) {
lastElementDef = dtd.getElementDefinition(node.getNodeName());
lastElementNumber = lastElementDef.number;
* Gets a resource reader for this parser. By default, this class has no
* resource. So if some resource is required, override this method in a
* subclass.
* @exception IOException
* always thrown.
protected Reader getResource(String resourceName) throws IOException {
throw new IOException("cannot find " + resourceName);
// Produces the next node (element, EndTag, text, comment, PI) from the
// tokenizer, or null at end of input.
// NOTE(review): this view has lost lines (closing braces, break statements,
// several case labels and the heads of two error(...) calls); the comments
// describe only what is visible.
public Node node() throws ParseException, IOException, SAXException {
Node ret;
// TODO emulate <tag /> by using EndTag
// A pending "/>" on a non-empty element is turned into a synthetic end tag.
if (isEndWithSlash && !isEmptyElement) {
// if(isXHTML && isEndWithSlash){
isEndWithSlash = false;
EndTag et = new EndTag(currentTagName);
currentNode = et;
if (docHandler != null && !eHandleLogical) {
ElementDefinition ed = dtd.getElementDefinition(currentTagName);
if (ed != null) {
lastElementNumber = ed.number;
return et;
} else if (keepUnknowns) {
// Unknown element that is kept: treat it as the anonymous element.
lastElementDef = anonymousElementDef;
lastElementNumber = pcdataNumber + 1;
return et;
} else if (handleError(IParserError.UNKNOWN_ELEMENT, et)) {
return null;
// Dispatch on the next token type.
switch (tokenizer.nextToken()) {
case STAGO:
ret = stag();
if (ret == null) { // Unknown Element. Skip it.
ret = getNode();
case ETAGO:
ret = etag();
if (ret == null) { // Unknown Element. Skip it.
ret = node();
// Text flagged as whitespace in element content (case label not visible in this view).
currentNode = ret = doc.createTextNode(tokenizer.sval);
if (ret instanceof SGMLText) {
((SGMLText) ret).setIsWhitespaceInElementContent(true);
if (docHandler != null) {
if (saxch != null) {
docHandler.characters(saxch, begin, len);
saxch = null;
case PCDATA:
currentNode = ret = doc.createTextNode(tokenizer.sval);
if (docHandler != null) {
if (saxch != null) {
docHandler.characters(saxch, begin, len);
saxch = null;
// Comment node (case label not visible in this view).
currentNode = ret = doc.createComment(tokenizer.sval);
if (lexHandler != null) {
lexHandler.comment(saxch, begin, len);
saxch = null;
case PI:
currentNode = ret = doc.createProcessingInstruction(null,
if (docHandler != null) {
docHandler.processingInstruction(null, tokenizer.sval);
case EOF:
ret = null;
case MDO:
"Illegal Declaration. Discarding to next '>'");
if (tokenizer.nextToken() != '>') {
// consume '>'
return node();
"Internal Parser Error: character encoding may be wrong.");
return node();
return ret;
* Set up syntax information described by DTD.
* @param publicID
* DTD's public id that specifies which to set up.
// Makes the DTD identified by publicID the active one, loading and caching
// it on first use, then refreshes the element bookkeeping that depends on it.
// NOTE(review): this view has lost lines (closing braces, the tail of the
// "ancesterElementDefs[i] = dtd" statement, possibly setDTD calls); the
// comments describe only what is visible.
public void setupDTD(String publicID) throws ParseException, IOException {
publicID = makeUnique(publicID);
// Fast path: the DTD has already been loaded and cached.
SGMLDocTypeDef ret = SGMLDocTypeDef.getPublic(publicID);
if (ret != null) {
lastDef = ret.getElementDefinition(getDefaultTopElement());
} else {
// NOTE(review): locks on a String; presumably makeUnique() returns a canonical instance -- confirm.
synchronized (publicID) {
// Re-check under the lock before loading.
ret = SGMLDocTypeDef.getPublic(publicID);
if (ret == null) {
ret = SGMLDocTypeDef.createPublic(publicID, this);
// NOTE(review): no close() of this reader is visible in this view.
Reader dr = getResource(pubEntityMap.get(publicID));
DTDTokenizer tok = new DTDTokenizer(dr);
new DTDParser(tok, ret).readDTD();
SGMLDocTypeDef.putPublic(publicID, ret);
} else {
lastDef = dtd.getElementDefinition(getDefaultTopElement());
// #PCDATA is numbered right after the DTD's elements.
pcdataNumber = dtd.getElementCount();
// Re-resolve the definitions of the currently open ancestors.
for (int i = depth - 1; i >= 0; i--) {
ancesterElementDefs[i] = dtd
if (ancesterElementDefs[i] == null) {
error(IParserError.UNKNOWN_ELEMENT, ancesters[i].getNodeName()
+ " is not defined in " + publicID);
ancesterElementDefs[i] = anonymousElementDef;
// The anonymous element is numbered right after #PCDATA.
anonymousElementDef.rehash(pcdataNumber + 2);
anonymousElementDef.number = pcdataNumber + 1;
// Element number for #PCDATA; set in setupDTD() to the DTD's element count.
public int pcdataNumber;
* Parses an SGML document and returns its top element. SGML documents
* consist of three parts: 1. SGML declaration. 2. Document type definition.
* 3. SGML instances. If a document is missing 1. and 2., this parser tries to
* read the default declaration specified by
* {@link #setDefaultDTD(java.lang.String)}.
* @param reader
* reader to parse from.
* @return document.
* @exception ParseException
* thrown if an unrecoverable syntax or token error occurred
* @exception IOException
// Entry point: reads the DOCTYPE, creates the document if needed, restores
// comments/PIs seen before the DOCTYPE, then parses the instances.
// NOTE(review): this view has lost lines (closing braces, SAX start/end
// document calls, the head of an error(...) call, the statement that removes
// the element from commentsBeforeDoctype); the comments describe only what
// is visible.
public Node parse(Reader reader) throws ParseException, IOException,
SAXException {
// Either a DOM implementation or a pre-set document is required.
if (domImpl == null && doc == null) {
throw new ParseException("No factory instance.");
this.tokenizer = new InsTokenizer(reader, this);
if (docHandler != null) {
DocumentType docType = readDocType();
if (docType == null) {
"<!DOCTYPE ...> is missing. Try to use \"" + defaultDTD
+ "\" as document type");
if (doc == null) {
doc = createDocument(docType);
if (doc instanceof SGMLDocument
&& ((SGMLDocument) doc).getDTD() == null) {
((SGMLDocument) doc).setDTD(this.dtd);
// Re-insert comments/PIs collected before the document existed, last one
// first, each at the front of the document.
while (!commentsBeforeDoctype.isEmpty()) {
CATB catb = commentsBeforeDoctype.lastElement();
Node node = catb.comment ? (Node) doc.createComment(catb.str)
: doc.createProcessingInstruction(null, catb.str);
doc.insertBefore(node, doc.getFirstChild());
// //041026 handle comments after doctype
// comment_loop: while (true) {
// switch (tokenizer.nextToken()) {
// case COMMENT:
// doc.appendChild(doc.createComment(tokenizer.sval));
// if (lexHandler != null) {
// lexHandler.comment(saxch, begin, len);
// }
// break;
// case PI:
// doc.appendChild(doc.createComment(tokenizer.sval));
// if (docHandler != null) {
// docHandler.processingInstruction(null, tokenizer.sval);
// }
// break;
// case MDO:
// break comment_loop;
// default:
// tokenizer.pushBack();
// break comment_loop;
// }
// }
// dummy
// Placeholder context element used until the real top element is set.
context = doc.createElement("dummy0");
seqArray = new IModelGroup[dtd.maxSeqLength];
Node ret = readInstances();
if (docHandler != null)
return ret;
/** Scratch array of model groups; allocated in parse() with the DTD's maxSeqLength. */
public IModelGroup seqArray[];

/**
 * @return the model group array allocated by the last parse
 */
public IModelGroup[] getSeqArray() {
    return seqArray;
}
// Creates a document through the installed DOM implementation using the
// placeholder names "dummy1".
// NOTE(review): the body of the if-block is not visible in this view;
// presumably it removes the placeholder document element -- confirm.
protected Document createDocument(DocumentType docType) {
Document ret = domImpl.createDocument("dummy1", "dummy1", docType);
if (ret.getDocumentElement() != null) {
return ret;
* Context element.
public Element context = null;
* Gets a current context element.
* @return context element
* @see #setContext(org.w3c.dom.Element)
public Element getContext() {
return this.context;
// Scratch path of elements collected by setContext() while walking down last children.
public Element forwardPath[] = new Element[BUF_SIZ];
// Element definitions of the open ancestors, parallel to ancesters[].
public ElementDefinition ancesterElementDefs[] = new ElementDefinition[BUF_SIZ];
// Stack of open ancestor elements; entries below depth are in use.
public Element ancesters[] = new Element[BUF_SIZ];
// Number of entries in use in ancesters[] / ancesterElementDefs[].
public int depth = 0;
// Shared empty attribute list passed to the SAX handler for elements without a parsed attribute list.
public AttributeListImpl nullAttributeList = createAttributeList();
// Moves the context down to element by following the chain of last children
// from the current context, recording every element on the way as an open
// ancestor. Throws a RuntimeException if element is not on that chain.
// NOTE(review): this view has lost lines (closing braces, depth updates,
// returns, the tail of "ancesterElementDefs[depth] = ed = dtd"); the
// comments describe only what is visible.
private void setContextForward(Element element) throws SAXException {
ElementDefinition ed;
// Logical SAX mode: same walk, but startElement is reported for the target.
if (eHandleLogical && docHandler != null) {
for (Node down = context.getLastChild(); down instanceof Element; down = down
.getLastChild()) {
if (down == element) {
ancesters[depth] = element;
ancesterElementDefs[depth] = lastElementDef;
docHandler.startElement(element.getNodeName(), attrlist);
attrlist = null;
this.context = element;
} else {
ancesters[depth] = (Element) down;
ancesterElementDefs[depth] = ed = dtd
// Elements unknown to the DTD use the anonymous definition.
if (ed == null) {
ancesterElementDefs[depth] = anonymousElementDef;
} else {
// Same walk without SAX notifications.
for (Node down = context.getLastChild(); down instanceof Element; down = down
.getLastChild()) {
if (down == element) {
ancesters[depth] = element;
ancesterElementDefs[depth] = lastElementDef;
attrlist = null;
this.context = element;
} else {
ancesters[depth] = (Element) down;
ancesterElementDefs[depth] = ed = dtd
if (ed == null) {
ancesterElementDefs[depth] = anonymousElementDef;
throw new RuntimeException("Internal Parser Error.");
// Pops the ancestor stack back to newDepth and makes the element at
// newDepth - 1 the context.
// NOTE(review): the loop body is not visible in this view; presumably it
// reports endElement for each popped ancestor -- confirm.
private void setContextBackward(int newDepth) throws SAXException {
if (eHandleLogical && docHandler != null) {
for (int i = depth - 1; i >= newDepth; i--) {
depth = newDepth;
this.context = ancesters[newDepth - 1];
* Sets current context element node.
* @param element
* new context.
* @see #getContext()
// Makes element the current context and rebuilds the ancestor bookkeeping.
// Three cases are visible: element is already an open ancestor; element is
// reachable from an open ancestor through last children; otherwise the
// ancestor chain is rebuilt from element's parents.
// NOTE(review): this view has lost many lines (closing braces, returns, SAX
// endElement/startElement call heads, counter updates, tails of the
// "... = dtd" statements); the comments describe only what is visible.
public final void setContext(Element element) throws SAXException {
for (int i = depth - 1; i >= 0; i--) {
Element up = ancesters[i];
// Case 1: element is an open ancestor -- truncate the stack to it.
if (up == element) {
if (eHandleLogical && docHandler != null) {
for (int j = depth - 1; j > i; j--) {
depth = i + 1;
this.context = element;
// Case 2: look for element below this ancestor along the last-child chain.
int forwardPathLen = 0;
for (Node down = up.getLastChild(); down instanceof Element; down = down
.getLastChild()) {
if (down == element) {
if (eHandleLogical && docHandler != null) {
for (int j = depth - 1; j > i; j--) {
for (int j = 0; j < forwardPathLen; j++) {
.getNodeName(), nullAttributeList);
ancesters[i + j + 1] = forwardPath[j];
ancesterElementDefs[i + j + 1] = dtd
if (ancesterElementDefs[i + j + 1] == null) {
ancesterElementDefs[i + j + 1] = anonymousElementDef;
docHandler.startElement(down.getNodeName(), attrlist);
} else {
for (int j = 0; j < forwardPathLen; j++) {
ancesters[i + j + 1] = forwardPath[j];
ancesterElementDefs[i + j + 1] = dtd
if (ancesterElementDefs[i + j + 1] == null) {
ancesterElementDefs[i + j + 1] = anonymousElementDef;
depth = i + forwardPathLen + 1;
ancesters[depth] = element;
ancesterElementDefs[depth] = lastElementDef;
attrlist = null;
this.context = element;
} else {
// Not there yet: remember this element as part of the path down.
forwardPath[forwardPathLen++] = (Element) down;
// creates backward context.
// Case 3: rebuild the ancestor chain from element's parents.
int newDepth = 0;
for (Node up = element; up instanceof Element; up = up.getParentNode()) {
Element newAncesters[] = new Element[newDepth];
int i = 1;
for (Node up = element; up instanceof Element; up = up.getParentNode()) {
newAncesters[newDepth - i] = (Element) up;
// Find the first position where the old and new chains differ.
for (i = 0; i < newDepth; i++) {
if (ancesters[i] != newAncesters[i]) {
if (eHandleLogical && docHandler != null) {
for (int j = depth - 1; j >= i; j--) {
for (int j = i; j < newDepth - 1; j++) {
docHandler.startElement(element.getNodeName(), attrlist);
while (i < newDepth - 1) {
ancesters[i] = newAncesters[i];
ancesterElementDefs[i] = dtd
ancesters[newDepth - 1] = element;
ancesterElementDefs[newDepth - 1] = lastElementDef;
depth = newDepth;
attrlist = null;
this.context = element;
public Vector<Element> nodesWithEndtag = new Vector<Element>();
* Checks if a specified element has its end tag or not.
* @param element
* element to be checked.
* @return <code>true</code> if <code>element</code> has its end tag.
* Otherwise, false.
public boolean hasEndTag(Element element) {
return nodesWithEndtag.contains(element);
* Determines that a specified element has its end tag.
public void setHasEndTag(Element element) {
// Installs element as the document's top element and as the first entry of
// the ancestor stack.
// NOTE(review): this view has lost lines (closing braces, the else-branch
// body, possibly a depth assignment); the comments describe only what is
// visible.
private void setTopElement(Element element) throws SAXException {
Element prev = doc.getDocumentElement();
// Replace an existing document element if there is one.
if (prev != null) {
doc.replaceChild(element, prev);
} else {
context = element;
// Logical SAX mode: report the start of the top element.
if (docHandler != null && eHandleLogical) {
AttributeListImpl al = attrlist != null ? attrlist
: nullAttributeList;
docHandler.startElement(element.getNodeName(), al);
attrlist = null;
ancesterElementDefs[0] = lastDef;
ancesters[0] = element;
// End tags readInstances2() had to supply for ancestors whose end tag is not omittable; passed to SUDDEN_ENDTAG handling.
private Vector<EndTag> missedEndtags = new Vector<EndTag>();
// Reads up to and including the document's top element, then hands over to
// readInstances2() for the rest of the instance.
// NOTE(review): this view has lost lines (closing braces, break statements,
// case labels, several call heads); the comments describe only what is
// visible.
private Node readInstances() throws ParseException, IOException,
SAXException {
Node node = getNode();
// Empty input: nothing to attach.
if (node == null)
return doc;
// Skip (or keep) leading comments and processing instructions.
while (node.getNodeType() == Node.COMMENT_NODE
|| node.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
if (keepComment)
node = getNode();
switch (node.getNodeType()) {
case ENDTAG:
// An end tag before any element: let handlers recover, otherwise ignore it.
if (!handleError(IParserError.FLOATING_ENDTAG, node)) {
if (eHandleLogical && docHandler != null) {
docHandler.ignorableWhitespace(saxch, begin, len);
error(IParserError.FLOATING_ENDTAG, "Illegal end tag: " + node
+ ". Ignore it.");
return readInstances();
// Element case: either it is the declared top element...
if (lastDef.instance(node)) {
setTopElement((Element) node);
} else {
// ...or the top element was omitted; attrlist is saved and restored around
// statements that are not visible in this view.
AttributeListImpl attrlisttmp = attrlist;
attrlist = null;
attrlist = attrlisttmp;
if (lastDef.getContentModel().match(this, context, node)) {
if (!lastDef.startTagOmittable()) {
error(IParserError.ILLEGAL_TOP_ELEMENT, node
+ " can't be a top element.");
setContextForward((Element) node);
} else if (!handleError(IParserError.ILLEGAL_CHILD, node)) {
error(IParserError.ILLEGAL_CHILD, node
+ " is not allowed as a child of " + context);
setContextForward((Element) node);
postElement((Element) node);
case Node.TEXT_NODE:
// Whitespace-only text before the top element is skipped.
if (preserveWhitespace && whitespaceText((Text) node)) {
return readInstances();
"#text can't be a top element");
throw new ParseException(tokenizer.getCurrentLine()
+ ": Internal Parser Error " + node);
return readInstances2();
// Main instance loop: consumes nodes until end of input (or until the top
// element is closed) and attaches each one under the proper ancestor
// according to the content models, inserting omitted end tags as needed.
// NOTE(review): this view has lost many lines (closing braces, break
// statements, case labels, setContext/appendChild-style calls, heads of some
// expressions); the comments describe only what is visible.
private Node readInstances2() throws ParseException, IOException,
SAXException {
ElementDefinition ed;
Node node;
outer: for (node = getNode(); node != null; node = getNode()) {
// System.out.println(node.getNodeType()+" : "+node);
// System.out.println(node.toString());
node_sel: switch (node.getNodeType()) {
if (keepComment)
case ENDTAG:
if (_DEBUG
&& node.getNodeName().equalsIgnoreCase(
System.getProperty("DEBUG_ENDTAG"))) {
System.err.println("DEBUG: " + node);
// Search the open ancestors, innermost first, for the element this end tag closes.
for (int i = depth - 1; i >= 0; i--) {
if (ancesterElementDefs[i].number == lastElementNumber
&& (lastElementNumber != pcdataNumber + 1 || ancesters[i]
node.getNodeName()))) {
// Ancestors skipped on the way had non-omittable end tags: report them.
if (!missedEndtags.isEmpty()) {
extraErrInfo = missedEndtags;
if (handleError(IParserError.SUDDEN_ENDTAG, node)) {
extraErrInfo = null;
break node_sel;
extraErrInfo = null;
error(IParserError.SUDDEN_ENDTAG, missedEndtags
+ " have been forced to be inserted by "
+ node);
* if (ancesterElementDefs[depth - 1].number ==
* lastElementNumber) {
* nodesWithEndtag.addElement(context); }
((EndTag) node).setElement(ancesters[i]);
if (i > 0) {
} else {
// The top element itself was closed: leave the main loop.
break outer;
break node_sel;
} else {
// Not a match: remember the end tag this ancestor would have needed.
if (!ancesterElementDefs[i].endTagOmittable()) {
missedEndtags.insertElementAt(new EndTag(
ancesters[i].getNodeName()), 0);
// No open ancestor matches this end tag.
if (!handleError(IParserError.FLOATING_ENDTAG, node)) {
if (eHandleLogical && docHandler != null) {
docHandler.ignorableWhitespace(saxch, begin, len);
error(IParserError.FLOATING_ENDTAG, "Illegal end tag: "
+ node + ". Ignore it");
// Element handling (case label not visible in this view).
Element element = (Element) node;
Element exParent = null;
if (_DEBUG
&& element.getTagName().equalsIgnoreCase(
System.getProperty("DEBUG_STARTTAG"))) {
System.err.println("DEBUG: " + element);
// System.out.println("element");
// Check exclusions/inclusions declared by the open ancestors, innermost first.
for (int i = depth - 1; i >= 0; i--) {
ed = ancesterElementDefs[i];
// Remember the innermost ancestor whose end tag cannot be omitted.
if (!ed.endTagOmittable() && exParent == null) {
exParent = ancesters[i];
// System.out.println(ed.toString());
if (ed.exclusion(lastElementNumber)) {
if (exParent != null) {
if (!handleError(IParserError.ILLEGAL_CHILD, node)) {
error(IParserError.ILLEGAL_CHILD, node
+ " is an exception uner "
+ ancesters[i]);
} else {
if (context != node && eHandleLogical
&& docHandler != null) {
break node_sel;
} else if (ancesters[i - 1] != null) {
// System.out.println("elenent: mid");
if (ed.inclusion(lastElementNumber)) {
break node_sel;
// Try the content model of the current context first.
ed = ancesterElementDefs[depth - 1];
IModelGroup contentModel = ed.getContentModel();
// TODO correct this
if (contentModel.match(lastElementNumber)
&& contentModel.match(this, context, node)) {
// System.out.println("model: fow");
} else if (ed.endTagOmittable()) {
// System.out.println("model: omit");
// The context may be closed implicitly: try outer ancestors until one
// accepts the element or one cannot be closed implicitly.
boolean found = false;
for (int i = depth - 2; i >= 0; i--) {
ed = ancesterElementDefs[i];
contentModel = ed.getContentModel();
if (contentModel.match(lastElementNumber)
&& (found = contentModel.match(this,
ancesters[i], node))
|| !ed.endTagOmittable()) {
// System.out.println("model: omit2 "+found);
if (found) {
// System.out.println("model: ok");
} else if (!handleError(IParserError.ILLEGAL_CHILD, node)) {
// System.out.println("model: child");
error(IParserError.ILLEGAL_CHILD, node
+ " is not allowed as a child of " + context);
} else if (element.getParentNode() != null) {
// unless the error handlers ignore the node
// System.out.println("model: post");
// System.out.println("model: omit3");
} else if (!handleError(IParserError.ILLEGAL_CHILD, node)) {
// System.out.println("model: error");
// TODO ???
error(IParserError.ILLEGAL_CHILD, node
+ " is not allowed as a child of " + context);
// System.out.println(node.getNodeName()+context);
// System.out.println("model: end");
if (context != node && eHandleLogical && docHandler != null) {
docHandler.startElement(node.getNodeName(), attrlist);
case Node.TEXT_NODE:
if (preserveWhitespace && whitespaceText((Text) node)) {
// Text is matched against content models using the #PCDATA number.
ed = ancesterElementDefs[depth - 1];
contentModel = ed.getContentModel();
if (contentModel.match(pcdataNumber)
&& contentModel.match(this, context, node)) {
} else if (ed.endTagOmittable()) {
for (int i = depth - 2; i >= 0; i--) {
ed = ancesterElementDefs[i];
contentModel = ed.getContentModel();
if (contentModel.match(pcdataNumber)
&& contentModel.match(this, ancesters[i], node)) {
break node_sel;
} else if (!ed.endTagOmittable()) {
if (handleError(IParserError.ILLEGAL_CHILD, node))
break node_sel;
error(IParserError.ILLEGAL_CHILD, "#text(" + node
+ ") is not allowed as a child of " + context);
throw new ParseException(tokenizer.getCurrentLine()
+ ": Internal parser error " + node);
// System.out.println("SGMLParser: a");
// After the loop: unwind to the top element and look at what follows it.
if (docHandler != null && eHandleLogical) {
Element top = doc.getDocumentElement();
if (getContext() != top) {
if (top != null)
if ((node = getNode()) != null) {
this.context = doc.getDocumentElement();
if (docHandler != null && eHandleLogical) {
// System.out.println("SGMLParser: pushback");
// System.out.println("SGMLParser: end");
return doc;
private boolean whitespaceText(Text text) {
if (text instanceof SGMLText) {
return ((SGMLText) text).getIsWhitespaceInElementContent();
char str[] = text.getData().toCharArray();
for (int i = str.length - 1; i >= 0; i--) {
if (!Character.isWhitespace(str[i]))
return false;
return true;
// Post-processing after an element's start tag: CDATA-content elements have
// their raw content read immediately, and EMPTY elements with an omittable
// end tag are closed right away.
// NOTE(review): this view has lost lines (closing braces, appendChild and
// SAX endElement calls); the comments describe only what is visible.
private void postElement(Element element) throws ParseException,
IOException, SAXException {
IModelGroup mg = lastElementDef.getContentModel();
// CDATA content model: read the raw text up to the matching end tag.
if (mg == cdata) {
String tagName = element.getNodeName();
// Note: this local shadows the static "cdata" model group from here on.
Node cdata = readCDATA(tagName);
currentNode = new EndTag(tagName);
((EndTag) currentNode).setElement(element);
if (docHandler != null && !eHandleLogical) {
// Leave the element if it had become the context.
if (context == element)
setContextBackward(depth - 1);
} else if (mg == empty && lastElementDef.endTagOmittable()) { // must
// be
// omitted
// System.out.println(element.toString()+lastElementDef.toString());
if (context == element) {
setContextBackward(depth - 1);
} else if (eHandleLogical && docHandler != null) {
} else if (isXHTML && isEndWithSlash) {
// System.out.println(element.toString()+lastElementDef.toString());
// Reads raw text from the tokenizer via rawText(arg) and wraps it in a
// CDATASection created by the current document. The new node is also stored
// in currentNode before it is returned.
// NOTE(review): the bodies of the two lexHandler checks are not visible in
// this excerpt; presumably they notify the LexicalHandler around the section
// - confirm against the full source.
private CDATASection readCDATA(String arg) throws ParseException,
IOException, SAXException {
if (lexHandler != null) {
String str = tokenizer.rawText(arg);
CDATASection ret = doc.createCDATASection(str);
currentNode = ret;
if (lexHandler != null) {
return ret;
// Buffer of CATB entries (comments / processing instructions).
// NOTE(review): presumably filled by readDocType() while no Document exists
// yet - the statement that adds entries is not visible in this excerpt.
private Vector<CATB> commentsBeforeDoctype = new Vector<CATB>();
// private Vector commentsAftereDoctype = new Vector();
* comment at the beginning
class CATB {
* If comment, true. Otherwise, false.
boolean comment = true;
String str;
// Reads the DOCTYPE declaration at the top of a document, selects the element
// definition of the root element (lastDef) and returns the DocumentType node
// created for it, or null when none could be created.
// NOTE(review): many lines of this method (case labels, closing braces, some
// calls) are not visible in this excerpt; comments describe the visible lines.
private DocumentType readDocType() throws ParseException, IOException,
SAXException {
// A DOCTYPE may be read only once per parse.
if (lastDef != null) {
throw new ParseException("Already read DOCTYPE declaration");
// Consume everything that precedes the declaration.
comment_loop: while (true) {
switch (tokenizer.nextToken()) {
// (case label not visible; this branch deals with a comment token)
// Without a Document yet, the text is kept in a CATB entry; otherwise a
// comment node is created and the LexicalHandler, if any, is notified.
if (doc == null) {
CATB catb = new CATB();
catb.str = tokenizer.sval;
} else {
currentNode = doc.createComment(tokenizer.sval);
if (lexHandler != null) {
lexHandler.comment(saxch, begin, len);
// Processing instruction: same scheme, flagged as a non-comment entry.
case PI:
if (doc == null) {
CATB catb = new CATB();
catb.comment = false;
catb.str = tokenizer.sval;
} else {
currentNode = doc.createProcessingInstruction(null,
if (docHandler != null) {
docHandler.processingInstruction(null, tokenizer.sval);
// Markup declaration open: the declaration itself starts here.
case MDO:
break comment_loop;
return null;
// NOTE(review): with '&&' this throws only when the token is not NAME_CHAR
// AND the text differs from "DOCTYPE"; a NAME_CHAR token with any other
// text is accepted. '||' looks intended - confirm.
if (tokenizer.nextToken() != NAME_CHAR
&& !tokenizer.sval.equals("DOCTYPE")) {
throw new ParseException("Unknown declaration at "
+ tokenizer.getCurrentLine());
* Only supports initially setted public entity. For example, <!DOCTYPE
* HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
// The name after DOCTYPE is the document type name; it also becomes the
// default top element.
if (tokenizer.nextToken() == NAME_CHAR) {
String docTypeName = tokenizer.sval;
this.defaultTopElement = docTypeName;
// Only the form: PUBLIC "<public id>" is handled here.
if (tokenizer.nextToken() == NAME_CHAR
&& tokenizer.sval.equalsIgnoreCase("PUBLIC")) {
if (tokenizer.nextToken() == '"') {
String publicID = tokenizer.eatUntil('"');
// An enforced doctype replaces whatever the document declares.
if (enforcedDoctype != null) {
publicID = enforcedDoctype;
if (lexHandler != null) {
lexHandler.startDTD(docTypeName, publicID, null);
// Look the public id up among the registered public entities.
String entityFileName = pubEntityMap.get(publicID);
if (entityFileName == null) {
// Unknown public id: report it and retry with the default DTD.
if (defaultDTD != null) {
error(IParserError.ILLEGAL_DOCTYPE, "Instead of \""
+ publicID + "\" use \"" + defaultDTD
+ "\" as a DTD.");
entityFileName = pubEntityMap.get(defaultDTD);
if (entityFileName == null) {
throw new ParseException(tokenizer.getCurrentLine()
+ ": this parser does not support "
+ publicID);
publicID = defaultDTD;
// Fall back to the implementation of the current document.
if (domImpl == null) {
domImpl = doc.getImplementation();
DocumentType ret = null;
if (domImpl != null) {
currentNode = ret = createDocType(domImpl, docTypeName,
// consume '>'
// The document type name must name an element of the DTD; otherwise the
// default top element is used instead and the problem is reported.
lastDef = dtd.getElementDefinition(docTypeName);
if (lastDef == null) {
String topElementName = getDefaultTopElement();
error(IParserError.ILLEGAL_DOCTYPE, docTypeName
+ " is not defined as a root element. Use "
+ topElementName + '.');
lastDef = dtd.getElementDefinition(topElementName);
if (lexHandler != null) {
return ret;
// Any other shape of declaration: report it, use the default top element's
// definition and return no DocumentType.
error(IParserError.ILLEGAL_DOCTYPE, "Invalid DOCTYPE declaration. Use "
+ defaultDTD);
lastDef = dtd.getElementDefinition(getDefaultTopElement());
return null;
private DocumentType createDocType(DOMImplementation domImpl,
String docTypeName, String publicID) {
* For compatibility to DOM level 1
Class domImplClass = domImpl.getClass();
Class stringClass = docTypeName.getClass();
Class parameterTypes[] = { stringClass, stringClass, stringClass };
java.lang.reflect.Method method;
try {
method = domImplClass.getMethod("createDocumentType",
} catch (NoSuchMethodException e) {
return null;
String args[] = { docTypeName, publicID, "" };
try {
return (DocumentType) method.invoke(domImpl, (Object[]) args);
} catch (IllegalAccessException e) {
return null;
} catch (java.lang.reflect.InvocationTargetException e) {
return null;
} catch (DOMException e) {
return null;
* For DOM level 2 try { return domImpl.createDocumentType(docTypeName,
* publicID, ""); } catch (DOMException e) { return null; }
// Nodes recorded as erroneous; only the first nodeWithIllegalChildNum slots
// are in use.
private Node nodesWithIllegalChildren[] = new Node[BUF_SIZ];
private int nodeWithIllegalChildNum = 0;
// Replaces the buffer by one of twice the current element count and copies
// the existing entries over.
// NOTE(review): the last argument of the arraycopy call is not visible in
// this excerpt.
private void expandNodesWithIllegalChildren() {
Node newNodes[] = new Node[nodeWithIllegalChildNum * 2];
System.arraycopy(nodesWithIllegalChildren, 0, newNodes, 0,
nodesWithIllegalChildren = newNodes;
// Records a node as erroneous by appending it to nodesWithIllegalChildren.
// NOTE(review): the statements guarded by the two 'if' lines below are not
// visible in this excerpt; presumably an already recorded node is skipped and
// a full buffer is expanded first - confirm against the full source.
private void addErrorNode(Node node) {
// Scan the recorded nodes (identity comparison).
for (int i = nodeWithIllegalChildNum - 1; i >= 0; i--) {
if (nodesWithIllegalChildren[i] == node)
// Buffer is full.
if (nodesWithIllegalChildren.length == nodeWithIllegalChildNum) {
nodesWithIllegalChildren[nodeWithIllegalChildNum++] = node;
* Checks if a specified node has an error or not.
* @param node
* node to be ckecked.
* @return <code>true</code> if <code>node</code> is an error node.
* Otherwise <code>false</code>
public boolean isErrorNode(Node node) {
for (int i = nodeWithIllegalChildNum - 1; i >= 0; i--) {
if (nodesWithIllegalChildren[i] == node)
return true;
return false;
* Initialized this parser.
// Resets the per-parse state: root definition, context, recorded error nodes
// and nesting depth.
// NOTE(review): the bodies of the if/else and of the loop are not visible in
// this excerpt. The loop re-reads doc.getFirstChild() on every pass, so
// presumably each pass removes that child - confirm.
protected void init() {
lastDef = null;
context = null;
nodeWithIllegalChildNum = 0;
depth = 0;
// commentsAftereDoctype.removeAllElements();
if (getDOMImplementation() != null) {
} else {
for (Node child = doc.getFirstChild(); child != null; child = doc
.getFirstChild()) {
// Resolves a named entity to a single character.
// Returns -1 when the DTD lookup throws ParseException, otherwise the
// replacement character of the entity declaration. When the declaration has
// no direct replacement character (-1), its replacement text is tokenized and
// must yield exactly one PCDATA character; anything else raises
// ParseException.
// NOTE(review): the remaining constructor argument(s) of InsTokenizer are not
// visible in this excerpt.
int getCharEntity(String entity) throws IOException, ParseException,
SAXException {
SGMLEntityReference er = null;
try {
er = dtd.getEntityReference(entity);
} catch (ParseException e) {
return -1;
SGMLEntityDeclaration ed = er.getEntityDeclaration();
int ch = ed.getReplacementChar();
if (ch != -1) {
return ch;
// No direct character: parse the replacement text instead.
InsTokenizer tokenizer2 = new InsTokenizer(ed.getReplacementReader(),
if (tokenizer2.nextToken() == PCDATA && tokenizer2.sval.length() == 1) {
char ret = tokenizer2.sval.charAt(0);
return ret;
} else {
throw new ParseException("Internal Parser Error: " + entity
+ " not defined.");
* Sets a <i>Document </i> instance that will be a factory of nodes in DOM
* tree. This parser can use any implementation of W3C's DOM. By default,
* Parsers use {@link SGMLDocument}. This method make the parser instance
* ignore DOMImplementation by side effects.
* @param doc
* new Document instance.
* @see #getDocument()
* @deprecated See {@link #setDOMImplementation(DOMImplementation)}
public void setDocument(Document doc) {
this.doc = doc;
domImpl = null;
// State describing the most recently read start tag: its SAX attribute list
// (created only when a DocumentHandler is set), and the number and definition
// of its element.
public AttributeListImpl attrlist = null;
public int lastElementNumber;
public ElementDefinition lastElementDef;
// Reads a start tag (the tokenizer is positioned after the tag open) and
// returns the element created for it, or null when no element results.
// NOTE(review): many lines of this method (error calls, closing braces, some
// statements) are not visible in this excerpt; comments describe the visible
// lines only.
private Node stag() throws IOException, ParseException, SAXException {
Element ret;
isEmptyElement = true;
isEndWithSlash = false;
// The tag name must come first.
if (tokenizer.nextToken() != NAME_CHAR) {
// A token-level handler that recovers lets the start tag be read again.
if (handleError(IParserError.TAG_NAME, tokenizer.sval)) {
return stag();
"Perhaps character encoding may not be correct.");
// Skip tokens until a name appears; give up at end of input or tag close.
while (tokenizer.nextToken() != NAME_CHAR) {
if (tokenizer.ttype == EOF || tokenizer.ttype == TAGC) {
return null;
// Look the element up in the DTD.
ElementDefinition ed = this.dtd.getElementDefinition(tokenizer.sval);
if (ed != null) {
lastElementNumber = ed.number;
lastElementDef = ed;
isEmptyElement = ed.getContentModel().toString().equalsIgnoreCase(
// Unknown element that is to be kept: use the anonymous definition.
} else if (keepUnknowns) {
lastElementNumber = pcdataNumber + 1;
ed = lastElementDef = anonymousElementDef;
// The element name follows the configured tag case.
currentNode = ret = doc.createElement(changeTagCase(tokenizer.sval));
* boolean syntaxError = false;
Attr attr = null;
if (docHandler != null)
attrlist = createAttributeList();
isEndWithSlash = false;
// Read attributes until the tag closes. isEndWithSlash ends up true only if
// '/' is the last token before the close.
while (tokenizer.nextToken() != TAGC) {
if (tokenizer.ttype == '/') {
isEndWithSlash = true;
} else {
isEndWithSlash = false;
// Another tag opens before this one was closed.
if (tokenizer.ttype == STAGO || tokenizer.ttype == ETAGO) {
if (handleError(IParserError.BEFORE_ATTRNAME, tokenizer.sval)) {
"requires an attribute in " + ret);
} else if (tokenizer.ttype == EOF) {
attr = attribute(ed, attrlist);
if (attr != null) {
* } else if (!syntaxError) { syntaxError = true;
* error(STARTTAG_SYNTAX_ERR, "requires an attribute after " +
* ret);
// Physical element handling: report the start tag as soon as it is read.
if (docHandler != null && !eHandleLogical) {
// if(dtd.toString().indexOf("XHTML") < 0 || !endWithSlash){
docHandler.startElement(ret.getNodeName(), attrlist);
// }
// Element unknown to the DTD and not kept: unless a handler deals with it,
// report it; the tag yields no node.
if (ed == null) {
if (!handleError(IParserError.UNKNOWN_ELEMENT, ret)) {
error(IParserError.UNKNOWN_ELEMENT, "Unknown Element: "
+ ret.getTagName() + ". Ignore it.");
return null;
// Unknown element that is kept: a handler may take it over, otherwise it is
// reported together with a suggested declaration.
} else if (ed == anonymousElementDef) {
if (handleError(IParserError.UNKNOWN_ELEMENT, ret)) {
return null;
} else {
error(IParserError.UNKNOWN_ELEMENT, "Unknown Element: "
+ ret.getTagName()
+ ". Define its definition as <!ELEMENT "
+ ret.getNodeName().toUpperCase() + " - - ANY>");
currentTagName = ret.getNodeName();
// if (isXHTML && isEndWithSlash) {
// System.out.println("slash end tag");
// if (docHandler != null && !eHandleLogical) {
// docHandler.endElement(ret.getNodeName());
// }
// System.out.println(docHandler);
// }
return ret;
public String makeUnique(String id) {
for (Enumeration<String> e = pubEntityMap.elements(); e
.hasMoreElements();) {
String ret = e.nextElement();
if (id.equals(ret))
return ret;
return id;
* Changes default dtd. If &lt;!DOCTYPE ... &gt; statement is missing in a
* top of a document, a parser reads it by specifed dtd.
* @param dtd
* dtd's public id for default like
* <code>"-//W3C//DTD HTML 4.0 Transitional//EN"</code>
public void setDefaultDTD(String dtd) {
defaultDTD = makeUnique(dtd);
* This instance variable holds a default DTD's public ID. If a parser meets
* a strange DOCTYPE declaration, it reads a document with this default DTD.
protected String defaultDTD = null;
* Gets top element's name. If you want the parser to read document without
* DOCTYPE declaration at the top, you must override this method to return
* some element's name. For HTML documents, it returns "HTML".
* @return top element's name
* @exception ParseException
* always thrown.
protected String getDefaultTopElement() throws ParseException {
if (defaultTopElement != null) {
return defaultTopElement;
throw new ParseException(
"doesn't know which element must be at the top.");
private String defaultTopElement = null;
public int defaultTagCase = IParser.UPPER_CASE;
* Specifies element names' case whose start tags are omitted. Default
* behavier makes them uppercased.
* @param tagCase
* this must be {@link IParser#UPPER_CASE}or
* {@link IParser#LOWER_CASE}. If otherwise, ignore.
public void setDefaultTagCase(int tagCase) {
if (tagCase == 0 || tagCase == IParser.LOWER_CASE) {
this.defaultTagCase = tagCase;
* Change a specified string to specified cased.
* @see #setDefaultTagCase(int)
public String changeDefaultTagCase(String tag) {
switch (this.defaultTagCase) {
case IParser.UPPER_CASE:
return tag.toUpperCase();
case IParser.LOWER_CASE:
return tag.toLowerCase();
throw new RuntimeException("Internal Parser Error");
public int tagCase = IParser.ORIGINAL_CASE;
* Specifies element names' case. Default behavier makes them original
* cased.
* @param tagCase
* this must be {@link IParser#UPPER_CASE},
* {@link IParser#LOWER_CASE}or {@link IParser#ORIGINAL_CASE}. If
* otherwise, ignore.
public void setTagCase(int tagCase) {
if (tagCase == IParser.UPPER_CASE || tagCase == IParser.LOWER_CASE
|| tagCase == IParser.ORIGINAL_CASE) {
this.tagCase = tagCase;
final String changeTagCase(String tag) {
switch (this.tagCase) {
case IParser.UPPER_CASE:
return tag.toUpperCase();
case IParser.LOWER_CASE:
return tag.toLowerCase();
return tag;
throw new RuntimeException("Internal Parser Error");
public int attrCase = IParser.ORIGINAL_CASE;
* Specifies attribute names' case. Default behavier makes them original
* cased.
* @param attrCase
* this must be {@link IParser#UPPER_CASE},
* {@link IParser#LOWER_CASE}or {@link IParser#ORIGINAL_CASE}. If
* otherwise, ignore.
public void setAttrNameCase(int attrCase) {
if (attrCase == IParser.UPPER_CASE || attrCase == IParser.LOWER_CASE
|| attrCase == IParser.ORIGINAL_CASE) {
this.attrCase = attrCase;
final String changeAttrNameCase(String attr) {
switch (this.attrCase) {
case IParser.UPPER_CASE:
return attr.toUpperCase();
case IParser.LOWER_CASE:
return attr.toLowerCase();
return attr;
throw new RuntimeException("Internal Parser Error");
private String enforcedDoctype = null;
* Ignores a declaration at the top of the document and enforces document
* type specified by <code>publicId</code>
public void enforceDoctype(String publicId) {
enforcedDoctype = makeUnique(publicId);
* Closes input stream.
// Drops the reference to the tokenizer.
// NOTE(review): only this assignment is visible in the excerpt; whether the
// underlying stream is closed first cannot be told from here.
public void close() throws IOException {
tokenizer = null;
public Object extraErrInfo = null;
* Gets extra error information. A parser passes two error information (e.g.
* error code and error node) to node-level error handlers. However, it is
* now enough for some kind of error handlers to recover the error. If
* {@link IParserError#SUDDEN_ENDTAG}error occurs, parser set missed end
* tags between error node and future context to extra error information.
public Object getExtraErrInfo() {
return extraErrInfo;
private boolean handleError(int code, Node node) throws ParseException,
IOException, SAXException {
for (int i = errorHandlerNum - 1; i >= 0; i--) {
if (errorHandlers[i].handleError(code, this, node)) {
return true;
return false;
boolean handleError(int code, String errorStr) throws ParseException,
IOException {
for (int i = tokenErrorHandlerNum - 1; i >= 0; i--) {
if (tokenErrorHandlers[i].handleError(code, this, errorStr)) {
return true;
return false;
void putCharNumEntity(Character C, String ent) {
Document doc = getDocument();
if (doc instanceof SGMLDocument) {
((SGMLDocument) doc).putCharNumEntity(C, ent);
public DocumentHandler docHandler = null;
* Gets a <i>DocumentHandler </i> instance included in this parser.
* @see #setDocumentHandler(org.xml.sax.DocumentHandler)
public DocumentHandler getDocumentHandler() {
return this.docHandler;
* Sets a <i>DocumentHandler </i> instace for this parser.
* @see #getDocumentHandler()
public void setDocumentHandler(DocumentHandler handler) {
this.docHandler = handler;
public boolean preserveWhitespace = false;
* Checks if parser preserve whitespaces or not.
* @return <code>true</code> if this parser preserve whitespaces. Otherwise
* <code>false</code>
public boolean getPreserveWhitespace() {
return preserveWhitespace;
* Determines if this parser preserve whitespaces or not. If preserve, the
* parser create text node for whitespace between tags and does not ignore
* carriage return and line feed after start tags and before end tags. By
* default a parser ignores whitespaces.
* @see #getPreserveWhitespace()
// Stores the whitespace-preservation flag.
// NOTE(review): the body of the tokenizer check is not visible in this
// excerpt; presumably an existing tokenizer receives the same setting -
// confirm against the full source.
public void setPreserveWhitespace(boolean preserv) {
if (tokenizer != null) {
this.preserveWhitespace = preserv;
* Determines which this parser invokes
* {@link org.xml.sax.DocumentHandler#startElement(java.lang.String, org.xml.sax.AttributeList)}
* and {@link org.xml.sax.DocumentHandler#endElement(java.lang.String)}
* <code>logically</code> or <code>physically</code>.<code>
* Logical</code> means
* that if a start or end tag of a element is omitted, a parser invokes each
* method. <code>Physical</code> means that parsers invokes each method if
* and only if their tag appearently exist. If <code>physical</code>, a
* parser does not care if the tag is illegal or not. Default is
* <code>physical</code>
* @param logical
* if true, deal with tags as logical. Otherwise, as physical
public void elementHandle(boolean logical) {
this.eHandleLogical = logical;
public boolean eHandleLogical = true;
public Vector<Element> autoGenerated = new Vector<Element>();
* Checks if the specified element is automatically generated by this parser
* or not. For example, <code>TBODY</code> under <code>TABLE</code> is
* automatically generated in following document.
* <PRE>
* &lt;TABLE&gt; &lt;TR&gt;&lt;TD&gt;&lt;TD&gt; &lt;/TABLE&gt;
* <PRE>
* &#064;param an
* element node to be checked
* &#064;return &lt;code&gt;true&lt;/code&gt; if &lt;code&gt;element&lt;/code&gt; is automatically
* generated by this. Otherwise false.
public boolean autoGenerated(Element element) {
return autoGenerated.contains(element);
// NOTE(review): the body is not visible in this excerpt; judging by the name
// it presumably records the element in the autoGenerated vector - confirm.
public final void addAutoGenerated(Element element) {
* Inserts a string to current position in read stream. This method is
* usually invoked by token-level error handlers.
* @param str
* String to insert
// NOTE(review): the body is not visible in this excerpt, so how the string is
// fed back into the read stream cannot be told from here.
public void insert(String str) throws IOException {
private AttributeListImpl createAttributeList() {
return new AttributeListImpl();
private boolean keepUnknowns;
* (non-Javadoc)
* @see
* org.eclipse.actf.model.dom.sgml.impl.ISGMLParser#keepUnknownElements(
* boolean)
public void keepUnknownElements(boolean keep) {
this.keepUnknowns = keep;
* Gets current line number of source document.
* @return line number
* @deprecated Use {@link org.xml.sax.Locator#getLineNumber()}
public int getCurrentLineNumber() {
return tokenizer.getCurrentLine();
* Gets current column number of source document.
* @return column number
* @deprecated Use {@link org.xml.sax.Locator#getColumnNumber()}
public int getCurrentColumnNumber() {
return tokenizer.getCurrentCol();
* Gets current reading node
public Node getCurrentNode() {
return this.currentNode;
public Node currentNode;
public void setCurrentNode(Node node) {
this.currentNode = node;
private boolean keepComment = true;
* Determines if this parser keeps comments and processing instructions in
* the tree or not. By default, it keeps.
* @param <code>true</code> if it keeps, Otherwise <code>false</code>
public void setKeepComment(boolean keep) {
this.keepComment = keep;
/** Character buffer of the most recent character data. */
private char saxch[];
/** Start offset and length of the data inside {@link #saxch}. */
private int begin, len;

/**
 * Records a character range. The array is stored by reference, not copied.
 *
 * @param ch
 *            character buffer
 * @param begin
 *            offset of the first character
 * @param len
 *            number of characters
 */
void setCharacter(char ch[], int begin, int len) {
	this.len = len;
	this.begin = begin;
	this.saxch = ch;
}
private LexicalHandler lexHandler;
* Note: does not support
* {@link org.xml.sax.ext.LexicalHandler#startEntity(java.lang.String)}and
* {@link org.xml.sax.ext.LexicalHandler#endEntity(java.lang.String)}.
public void setLexicalHandler(LexicalHandler lexHandler) {
this.lexHandler = lexHandler;
LexicalHandler getLexicalHandler() {
return this.lexHandler;
void incrementDepth(int i) {
this.depth += i;
* @param i
* depth
// Moves the context i levels deeper again: the depth is increased by i and
// the ancestor at the new depth becomes the context.
// NOTE(review): the statement guarded by the 'if' and the loop body are not
// visible in this excerpt. Because depth has already been increased by i, the
// loop below starts at the new depth and runs only while that is smaller
// than i - the bounds look suspicious, confirm against the full source.
public void reopenContext(int i) throws SAXException {
this.depth += i;
this.context = ancesters[depth - 1];
if (!eHandleLogical || docHandler == null)
for (int j = depth; j < i; j++) {
public Element[] getContextElements() {
Element ret[] = new Element[depth];
System.arraycopy(ancesters, 0, ret, 0, depth);
return ret;
* Parses a fragment under specified context.
// Parses a fragment read from 'reader' with 'parent' as the context element.
// Requires a DTD and a Document; otherwise a ParseException is thrown.
// NOTE(review): several lines (the rest of the getElementDefinition call, the
// actual parse call, the docHandler branch body) are not visible in this
// excerpt.
public void parseFragment(Element parent, Reader reader)
throws IOException, ParseException, SAXException {
if (dtd == null) {
throw new ParseException("Can't parse without DTD");
} else if (doc == null) {
throw new ParseException("Can't parse without a Document");
// The parent becomes level 0 of the ancestor stack; a parent without a
// definition in the DTD is given the anonymous definition.
this.ancesterElementDefs[0] = dtd.getElementDefinition(parent
if (this.ancesterElementDefs[0] == null) {
this.ancesterElementDefs[0] = anonymousElementDef;
this.ancesters[0] = parent;
this.context = parent;
depth = 1;
// A fresh tokenizer reads from the supplied reader.
this.tokenizer = new InsTokenizer(reader, this);
if (docHandler != null) {
* Parses a fragment. As a side effect, wastes a element node.
// Parses a fragment under a throw-away "dummy" element and returns a
// DocumentFragment created by the current document.
// NOTE(review): the loop body is not visible in this excerpt. The loop
// re-reads dummy.getFirstChild() on every pass, so presumably each pass moves
// that child into the fragment - confirm against the full source.
public DocumentFragment parseFragment(Reader reader) throws IOException,
ParseException, SAXException {
// With no Document, dummy is null; parseFragment(Element, Reader) then
// raises ParseException, provided a DTD is set.
Element dummy = doc != null ? doc.createElement("dummy") : null;
parseFragment(dummy, reader);
DocumentFragment ret = doc.createDocumentFragment();
for (Node child = dummy.getFirstChild(); child != null; child = dummy
.getFirstChild()) {
return ret;
public int getPushbackBufferSize() {
return BUF_SIZ;