blob: 69fc6d5843bebd34e36eced27c52b83d137b7b9b [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2001, 2004 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
* Jens Lukowski/Innoopract - initial renaming/restructuring
*
*******************************************************************************/
package org.eclipse.wst.xml.core.internal.parser;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.eclipse.jface.text.BadLocationException;
import org.eclipse.jface.text.IDocument;
import org.eclipse.wst.sse.core.internal.document.DocumentReader;
import org.eclipse.wst.sse.core.internal.ltk.parser.BlockMarker;
import org.eclipse.wst.sse.core.internal.ltk.parser.BlockTagParser;
import org.eclipse.wst.sse.core.internal.ltk.parser.BlockTokenizer;
import org.eclipse.wst.sse.core.internal.ltk.parser.RegionParser;
import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionHandler;
import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionParser;
import org.eclipse.wst.sse.core.internal.ltk.parser.StructuredDocumentRegionParserExtension;
import org.eclipse.wst.sse.core.internal.provisional.text.IStructuredDocumentRegion;
import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegion;
import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegionContainer;
import org.eclipse.wst.sse.core.internal.provisional.text.ITextRegionList;
import org.eclipse.wst.sse.core.internal.text.CharSequenceReader;
import org.eclipse.wst.sse.core.internal.text.IRegionComparible;
import org.eclipse.wst.sse.core.internal.util.Debug;
import org.eclipse.wst.xml.core.internal.Logger;
import org.eclipse.wst.xml.core.internal.regions.DOMRegionContext;
/**
 * Takes input from the tokenizer (an XMLTokenizer unless another one has
 * been set) and creates a linked list of structured document regions.
 */
public class XMLSourceParser implements RegionParser, BlockTagParser, StructuredDocumentRegionParser, IRegionComparible, StructuredDocumentRegionParserExtension {
// made public to aid access from inner classes in hierarchy.
// TODO: in future, figure out how to solve without exposing data.
// Character-sequence view of the current input; assigned in
// reset(Reader, int) and cleared by primReset().
public CharSequence fCharSequenceSource = null;
// Document input, used when the reader's document is not a CharSequence;
// assigned in reset(Reader, int) and cleared by primReset().
private IDocument fDocumentInput;
// Position given to reset(Reader, int); getText() adds it to the requested
// offset when reading from fCharSequenceSource.
protected int fOffset = 0;
// DMW: 2/12/03. Removed some state data, since not really needed,
// and since it added a lot to overhead (since so many regions are
// created).
// protected IStructuredDocumentRegion fCurrentNode = null;
// protected IStructuredDocumentRegion fNodes = null;
// protected List fRegions = null;
// protected Object fInput = null;
// Raw string input, set by the reset(String...) variants and cleared by
// primReset().
protected String fStringInput = null;
// StructuredDocumentRegionHandler instances notified by fireNodeParsed()
// and resetHandlers(); may be null if replaced through the setter.
protected List fStructuredDocumentRegionHandlers;
// Created lazily by getTokenizer() unless supplied through setTokenizer().
protected BlockTokenizer fTokenizer = null;
// Timestamps written by getDocumentRegions() only when Debug.perfTest is
// enabled.
protected long startTime;
protected long stopTime;
/**
 * Creates a parser with an empty list of structured document region
 * handlers. The tokenizer is not created here; getTokenizer() creates it
 * on first use.
 */
public XMLSourceParser() {
    fStructuredDocumentRegionHandlers = new ArrayList();
}
/**
 * Counts the regions in a chain by following getNext() from the given
 * region until null is reached. Used only for debug statements.
 *
 * @param nodes
 *            first region of the chain; may be null
 * @return number of regions reachable from nodes, 0 when nodes is null
 */
protected int _countNodes(IStructuredDocumentRegion nodes) {
    int count = 0;
    for (IStructuredDocumentRegion cursor = nodes; cursor != null; cursor = cursor.getNext()) {
        count++;
    }
    return count;
}
/**
 * Registers a block marker with this parser's tokenizer.
 *
 * @param marker
 *            the block marker handed to the tokenizer
 */
public void addBlockMarker(BlockMarker marker) {
    BlockTokenizer tokenizer = getTokenizer();
    tokenizer.addBlockMarker(marker);
}
/**
 * Appends a handler to the list notified by fireNodeParsed(). The list is
 * created first if it is currently null.
 *
 * @param handler
 *            the handler to append
 */
public void addStructuredDocumentRegionHandler(StructuredDocumentRegionHandler handler) {
    List handlers = fStructuredDocumentRegionHandlers;
    if (handlers == null) {
        handlers = new ArrayList();
        fStructuredDocumentRegionHandlers = handlers;
    }
    handlers.add(handler);
}
/**
 * Asks the tokenizer to begin a block tag scan for the given tag name.
 *
 * @param newTagName
 *            tag name passed through to the tokenizer
 */
public void beginBlockScan(String newTagName) {
    BlockTokenizer tokenizer = getTokenizer();
    tokenizer.beginBlockTagScan(newTagName);
}
/**
 * Creates an empty structured document region suited to the given region
 * type: an XML_BLOCK region for BLOCK_TEXT, a plain XML region otherwise.
 *
 * @param type
 *            region type; compared by identity against
 *            DOMRegionContext.BLOCK_TEXT, as elsewhere in this parser
 * @return the region produced by XMLStructuredRegionFactory
 */
protected IStructuredDocumentRegion createStructuredDocumentRegion(String type) {
    return (type == DOMRegionContext.BLOCK_TEXT)
        ? XMLStructuredRegionFactory.createRegion(XMLStructuredRegionFactory.XML_BLOCK)
        : XMLStructuredRegionFactory.createRegion(XMLStructuredRegionFactory.XML);
}
/**
 * Notifies every registered StructuredDocumentRegionHandler that a node
 * has been parsed. Nothing happens when the node or the handler list is
 * null.
 *
 * Exceptions thrown by handlers are never propagated: foreign code must
 * not interrupt parsing (otherwise "backwards typing" can result). The
 * exception itself is logged, not just its message, so the stack trace
 * is available and a null message does not hide the failure. Handlers
 * after the failing one are not notified for this node.
 *
 * @param fCurrentNode
 *            the node that has just been completed; may be null
 */
protected void fireNodeParsed(IStructuredDocumentRegion fCurrentNode) {
    if (fCurrentNode == null || fStructuredDocumentRegionHandlers == null) {
        return;
    }
    try {
        for (int i = 0; i < fStructuredDocumentRegionHandlers.size(); i++) {
            ((StructuredDocumentRegionHandler) fStructuredDocumentRegionHandlers.get(i)).nodeParsed(fCurrentNode);
        }
    }
    catch (Exception e) {
        // a handler failure is a program error somewhere else; record it
        // with its cause and carry on parsing
        Logger.logException(getClass().getName() + ": exception while notifying handlers of a parsed node", e); //$NON-NLS-1$
    }
}
/**
 * Looks up the first block marker registered with the tokenizer whose tag
 * name matches the given name. Case-sensitive markers are matched
 * exactly; all others are matched ignoring case.
 *
 * @param tagName
 *            the tag name to look for
 * @return the first matching marker, or null when none matches
 */
public BlockMarker getBlockMarker(String tagName) {
    List markers = getTokenizer().getBlockMarkers();
    for (int i = 0; i < markers.size(); i++) {
        BlockMarker candidate = (BlockMarker) markers.get(i);
        boolean matches = candidate.isCaseSensitive() ? candidate.getTagName().equals(tagName) : candidate.getTagName().equalsIgnoreCase(tagName);
        if (matches) {
            return candidate;
        }
    }
    return null;
}
/**
 * @return the list of block markers as reported by the tokenizer
 */
public List getBlockMarkers() {
    BlockTokenizer tokenizer = getTokenizer();
    return tokenizer.getBlockMarkers();
}
/**
 * Parses the current input and returns the head of the resulting chain of
 * structured document regions. When Debug.perfTest is enabled, the time
 * taken and the number of nodes are printed to System.out.
 *
 * @return the first IStructuredDocumentRegion, or null if parsing
 *         produced none
 */
public IStructuredDocumentRegion getDocumentRegions() {
    if (Debug.perfTest) {
        startTime = System.currentTimeMillis();
    }
    IStructuredDocumentRegion head = parseNodes();
    if (Debug.perfTest) {
        stopTime = System.currentTimeMillis();
        long elapsed = stopTime - startTime;
        System.out.println(" -- creating nodes of IStructuredDocument -- "); //$NON-NLS-1$
        System.out.println(" Time parse and init all regions: " + elapsed + " (msecs)"); //$NON-NLS-2$//$NON-NLS-1$
        System.out.println(" and " + _countNodes(head) + " Nodes"); //$NON-NLS-2$//$NON-NLS-1$
    }
    return head;
}
/**
 * Fetches the next token from the tokenizer.
 *
 * A StackOverflowError is logged and rethrown. Any other Exception is
 * logged and reported to the caller as null, which parseNodes() treats
 * the same as the end of input.
 *
 * @return the next region, or null when the tokenizer returns null or an
 *         exception occurred
 */
protected ITextRegion getNextRegion() {
    try {
        return getTokenizer().getNextToken();
    }
    catch (StackOverflowError e) {
        String message = getClass().getName() + ": input could not be parsed correctly at position " + getTokenizer().getOffset(); //$NON-NLS-1$
        Logger.logException(message, e);
        throw e;
    }
    catch (Exception e) {
        String message = getClass().getName() + ": input could not be parsed correctly at position " + getTokenizer().getOffset() + " (" + e.getLocalizedMessage() + ")"; //$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$
        Logger.logException(message, e);
        return null;
    }
}
/**
 * Returns the full, flat list of known text regions. If the tokenizer has
 * not yet reached EOF the input is parsed first; otherwise the list is
 * empty. Callers usually want getDocumentRegions() instead.
 *
 * For memory recovery it is assumed that whoever asks for all regions is
 * done with the parser, so the large input-holding objects are reset
 * before returning.
 *
 * @return a new list of the regions of all parsed nodes
 */
public List getRegions() {
    IStructuredDocumentRegion head = getTokenizer().isEOF() ? null : getDocumentRegions();
    List collected = getRegions(head);
    primReset();
    return collected;
}
/**
 * Flattens a chain of structured document regions into one list holding
 * the text regions of every node, in chain order.
 *
 * @param headNode
 *            first node of the chain; may be null
 * @return a new list of text regions, empty when headNode is null
 */
protected List getRegions(IStructuredDocumentRegion headNode) {
    List collected = new ArrayList();
    for (IStructuredDocumentRegion node = headNode; node != null; node = node.getNext()) {
        ITextRegionList regions = node.getRegions();
        int index = 0;
        while (index < regions.size()) {
            collected.add(regions.get(index));
            index++;
        }
    }
    return collected;
}
/**
 * Returns the live handler list, creating an empty one first if the list
 * is currently null.
 *
 * @return the handler list, never null
 */
public List getStructuredDocumentRegionHandlers() {
    if (fStructuredDocumentRegionHandlers != null) {
        return fStructuredDocumentRegionHandlers;
    }
    fStructuredDocumentRegionHandlers = new ArrayList(0);
    return fStructuredDocumentRegionHandlers;
}
/**
 * Returns text from the current input. The sources are tried in order:
 * the character sequence (offset is shifted by fOffset), the document,
 * then the raw string. Text is only available while an input is set;
 * parsing ends with primReset(), which clears all three sources.
 *
 * @param offset
 *            start of the requested text
 * @param length
 *            number of characters requested
 * @return the requested text; an empty string when the document rejects
 *         the location, or when only the string source applies and it is
 *         missing, empty, or does not contain the requested range
 */
public String getText(int offset, int length) {
    if (fCharSequenceSource != null) {
        int begin = fOffset + offset;
        return fCharSequenceSource.subSequence(begin, begin + length).toString();
    }
    if (fDocumentInput != null) {
        try {
            return fDocumentInput.get(offset, length);
        }
        catch (BadLocationException e) {
            return ""; //$NON-NLS-1$
        }
    }
    // offset is entirely valid during parsing as the parse numbers
    // haven't been adjusted.
    if (fStringInput != null && fStringInput.length() != 0 && offset + length <= fStringInput.length() && offset >= 0) {
        return fStringInput.substring(offset, offset + length);
    }
    return ""; //$NON-NLS-1$
}
/**
 * Returns the tokenizer, creating a default XMLTokenizer on first use if
 * none has been set.
 *
 * @return the tokenizer, never null
 */
protected BlockTokenizer getTokenizer() {
    if (fTokenizer != null) {
        return fTokenizer;
    }
    fTokenizer = new XMLTokenizer();
    return fTokenizer;
}
/**
 * Creates a fresh XMLSourceParser whose tokenizer is obtained from this
 * parser's tokenizer via newInstance(). Registered handlers and the
 * current input are not carried over.
 *
 * @return the new parser
 */
public RegionParser newInstance() {
    XMLSourceParser parser = new XMLSourceParser();
    BlockTokenizer tokenizerCopy = getTokenizer().newInstance();
    parser.setTokenizer(tokenizerCopy);
    return parser;
}
/**
 * Drains the tokenizer and groups the returned text regions into a
 * doubly linked chain of IStructuredDocumentRegions.
 *
 * Regions are pulled with getNextRegion() until it returns null. Each
 * region's type decides whether it opens a new node, is appended to the
 * current node, or closes the current node. Whenever a new node is about
 * to be opened, the previous one is reported through fireNodeParsed();
 * the last node is reported after the loop. Once the input is exhausted,
 * primReset() releases the input sources and resets the tokenizer.
 *
 * Region types are compared by identity (==) throughout; presumably the
 * tokenizer hands back the DOMRegionContext constants themselves -
 * verify before changing.
 *
 * @return the first node of the chain, or null if no region was read
 */
protected IStructuredDocumentRegion parseNodes() {
// regions are initially reported as complete offsets within the
// scanned input
// they are adjusted here to be indexes from the currentNode's start
// offset
IStructuredDocumentRegion headNode = null;
IStructuredDocumentRegion lastNode = null;
ITextRegion region = null;
IStructuredDocumentRegion currentNode = null;
String type = null;
while ((region = getNextRegion()) != null) {
type = region.getType();
// these types (might) demand a IStructuredDocumentRegion for each
// of them
if (type == DOMRegionContext.BLOCK_TEXT) {
if (currentNode != null && currentNode.getLastRegion().getType() == DOMRegionContext.BLOCK_TEXT) {
// multiple block texts indicated embedded containers; no
// new IStructuredDocumentRegion
currentNode.addRegion(region);
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
region.adjustStart(-currentNode.getStart());
// DW 4/16/2003 regions no longer have parents
// region.setParent(currentNode);
}
else {
// not continuing a IStructuredDocumentRegion
if (currentNode != null) {
// ensure that any existing node is at least
// terminated
if (!currentNode.isEnded()) {
currentNode.setLength(region.getStart() - currentNode.getStart());
// fCurrentNode.setTextLength(region.getStart() -
// fCurrentNode.getStart());
}
lastNode = currentNode;
}
// report the node being left (no-op when currentNode is null),
// then open a block node that is ended immediately
fireNodeParsed(currentNode);
currentNode = createStructuredDocumentRegion(type);
if (lastNode != null) {
lastNode.setNext(currentNode);
}
currentNode.setPrevious(lastNode);
currentNode.setStart(region.getStart());
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
currentNode.setEnded(true);
region.adjustStart(-currentNode.getStart());
currentNode.addRegion(region);
// DW 4/16/2003 regions no longer have parents
// region.setParent(currentNode);
}
}
// the following contexts OPEN new StructuredDocumentRegions
// (as does any region arriving after the current node has ended)
else if ((currentNode != null && currentNode.isEnded()) || (type == DOMRegionContext.XML_CONTENT) || (type == DOMRegionContext.XML_CHAR_REFERENCE) || (type == DOMRegionContext.XML_ENTITY_REFERENCE) || (type == DOMRegionContext.XML_PI_OPEN) || (type == DOMRegionContext.XML_TAG_OPEN) || (type == DOMRegionContext.XML_END_TAG_OPEN) || (type == DOMRegionContext.XML_COMMENT_OPEN) || (type == DOMRegionContext.XML_CDATA_OPEN) || (type == DOMRegionContext.XML_DECLARATION_OPEN)) {
if (currentNode != null) {
// ensure that any existing node is at least terminated
if (!currentNode.isEnded()) {
currentNode.setLength(region.getStart() - currentNode.getStart());
// fCurrentNode.setTextLength(region.getStart() -
// fCurrentNode.getStart());
}
lastNode = currentNode;
}
fireNodeParsed(currentNode);
currentNode = createStructuredDocumentRegion(type);
if (lastNode != null) {
lastNode.setNext(currentNode);
}
currentNode.setPrevious(lastNode);
currentNode.setStart(region.getStart());
currentNode.addRegion(region);
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
region.adjustStart(-currentNode.getStart());
// DW 4/16/2003 regions no longer have parents
// region.setParent(currentNode);
}
// the following contexts neither open nor close
// StructuredDocumentRegions; just add to them
// NOTE(review): currentNode is dereferenced without a null check here;
// this assumes the tokenizer never emits one of these types as the
// very first region - confirm.
else if ((type == DOMRegionContext.XML_TAG_NAME) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_NAME) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_EQUALS) || (type == DOMRegionContext.XML_TAG_ATTRIBUTE_VALUE) || (type == DOMRegionContext.XML_COMMENT_TEXT) || (type == DOMRegionContext.XML_PI_CONTENT) || (type == DOMRegionContext.XML_DOCTYPE_INTERNAL_SUBSET)) {
currentNode.addRegion(region);
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
region.adjustStart(-currentNode.getStart());
// DW 4/16/2003 regions no longer have parents
// region.setParent(currentNode);
}
// the following contexts close off StructuredDocumentRegions
// cleanly
// NOTE(review): same unchecked use of currentNode as above.
else if ((type == DOMRegionContext.XML_PI_CLOSE) || (type == DOMRegionContext.XML_TAG_CLOSE) || (type == DOMRegionContext.XML_EMPTY_TAG_CLOSE) || (type == DOMRegionContext.XML_COMMENT_CLOSE) || (type == DOMRegionContext.XML_DECLARATION_CLOSE) || (type == DOMRegionContext.XML_CDATA_CLOSE)) {
currentNode.setEnded(true);
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
currentNode.addRegion(region);
region.adjustStart(-currentNode.getStart());
// DW 4/16/2003 regions no longer have parents
// region.setParent(currentNode);
}
// this is extremely rare, but valid
// NOTE(review): same unchecked use of currentNode as above.
else if (type == DOMRegionContext.WHITE_SPACE) {
ITextRegion lastRegion = currentNode.getLastRegion();
// pack the embedded container with this region
if (lastRegion instanceof ITextRegionContainer) {
ITextRegionContainer container = (ITextRegionContainer) lastRegion;
container.getRegions().add(region);
// containers must have parent set ...
// setting for EACH subregion is redundant, but not sure
// where else to do, so will do here for now.
container.setParent(currentNode);
// DW 4/16/2003 regions no longer have parents
// region.setParent(container);
// make the region's start relative to the container; this uses the
// container's length before it is grown just below
region.adjustStart(container.getLength() - region.getStart());
}
// the white space is absorbed: both the last region and the node
// grow by its length, whether or not it went into a container
currentNode.getLastRegion().adjustLength(region.getLength());
currentNode.adjustLength(region.getLength());
}
else if (type == DOMRegionContext.UNDEFINED && currentNode != null) {
// skip on a very-first region situation as the default
// behavior is good enough
// combine with previous if also undefined
if (currentNode.getLastRegion() != null && currentNode.getLastRegion().getType() == DOMRegionContext.UNDEFINED) {
currentNode.getLastRegion().adjustLength(region.getLength());
currentNode.adjustLength(region.getLength());
}
// previous wasn't undefined
else {
currentNode.addRegion(region);
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
region.adjustStart(-currentNode.getStart());
}
}
else {
// if an unknown type is the first region in the document,
// ensure that a node exists
if (currentNode == null) {
currentNode = createStructuredDocumentRegion(type);
currentNode.setStart(region.getStart());
}
currentNode.addRegion(region);
currentNode.setLength(region.getStart() + region.getLength() - currentNode.getStart());
region.adjustStart(-currentNode.getStart());
// DW 4/16/2003 regions no longer have parents
// region.setParent(currentNode);
if (Debug.debugTokenizer)
System.out.println(getClass().getName() + " found region of not specifically handled type " + region.getType() + " @ " + region.getStart() + "[" + region.getLength() + "]"); //$NON-NLS-4$//$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$
//$NON-NLS-3$//$NON-NLS-2$//$NON-NLS-1$
}
// these regions also get their own node, so close them cleanly
// NOTE: these regions have new StructuredDocumentRegions created
// for them above; it may
// be more readable if that is handled here as well, but the
// current layout
// ensures that they open StructuredDocumentRegions the same way
if ((type == DOMRegionContext.XML_CONTENT) || (type == DOMRegionContext.XML_CHAR_REFERENCE) || (type == DOMRegionContext.XML_ENTITY_REFERENCE)) {
currentNode.setEnded(true);
}
// remember the first node ever created as the head of the chain
if (headNode == null && currentNode != null) {
headNode = currentNode;
}
}
// input exhausted: report the final node, which no later region
// triggered a fireNodeParsed() call for
if (currentNode != null) {
fireNodeParsed(currentNode);
currentNode.setPrevious(lastNode);
}
// fStringInput = null;
primReset();
return headNode;
}
/**
 * Drops every reference to the current input and resets the offset, then
 * resets the tokenizer onto an empty character array so that it does not
 * hold on to large arrays either.
 */
protected void primReset() {
    fCharSequenceSource = null;
    fDocumentInput = null;
    fStringInput = null;
    fOffset = 0;
    BlockTokenizer tokenizer = getTokenizer();
    tokenizer.reset(new char[0]);
}
/**
 * Tests whether the given range of the current input equals a string.
 * When the character sequence source is itself an IRegionComparible the
 * comparison is delegated to it; otherwise the range is extracted from
 * the character sequence or, failing that, from the string input and
 * compared with equals(). With neither source set the result is false.
 *
 * @see org.eclipse.wst.sse.core.internal.text.IRegionComparible#regionMatches(int,
 *      int, java.lang.String)
 */
public boolean regionMatches(int offset, int length, String stringToCompare) {
    // a null string never matches, by definition
    if (stringToCompare == null) {
        return false;
    }
    if (fCharSequenceSource instanceof IRegionComparible) {
        return ((IRegionComparible) fCharSequenceSource).regionMatches(offset, length, stringToCompare);
    }
    // fall back to extracting the text and comparing it directly
    String candidate = null;
    if (fCharSequenceSource != null) {
        candidate = fCharSequenceSource.subSequence(offset, offset + length).toString();
    }
    else if (fStringInput != null) {
        candidate = fStringInput.substring(offset, offset + length);
    }
    return stringToCompare.equals(candidate);
}
/**
 * Case-insensitive counterpart of regionMatches(). When the character
 * sequence source is itself an IRegionComparible the comparison is
 * delegated to it; otherwise the range is extracted from the character
 * sequence or, failing that, from the string input and compared with
 * equalsIgnoreCase(). With neither source set the result is false.
 *
 * @param offset
 *            start of the range to compare
 * @param length
 *            length of the range to compare
 * @param stringToCompare
 *            string to compare against; null never matches
 * @return true when the range matches the string, ignoring case
 */
public boolean regionMatchesIgnoreCase(int offset, int length, String stringToCompare) {
    // a null string never matches, by definition
    if (stringToCompare == null) {
        return false;
    }
    if (fCharSequenceSource instanceof IRegionComparible) {
        return ((IRegionComparible) fCharSequenceSource).regionMatchesIgnoreCase(offset, length, stringToCompare);
    }
    // fall back to extracting the text and comparing it directly
    String candidate = null;
    if (fCharSequenceSource != null) {
        candidate = fCharSequenceSource.subSequence(offset, offset + length).toString();
    }
    else if (fStringInput != null) {
        candidate = fStringInput.substring(offset, offset + length);
    }
    return stringToCompare.equalsIgnoreCase(candidate);
}
/**
 * Removes the given block marker from the tokenizer.
 *
 * @param marker
 *            the marker to remove
 */
public void removeBlockMarker(BlockMarker marker) {
    BlockTokenizer tokenizer = getTokenizer();
    tokenizer.removeBlockMarker(marker);
}
/**
 * Asks the tokenizer to remove the block marker registered for the given
 * tag name.
 *
 * @param tagName
 *            tag name identifying the marker to remove
 */
public void removeBlockMarker(String tagName) {
    BlockTokenizer tokenizer = getTokenizer();
    tokenizer.removeBlockMarker(tagName);
}
/**
 * Removes a handler from the notification list. Does nothing when the
 * list is null or does not contain the handler.
 *
 * @param handler
 *            the handler to remove
 */
public void removeStructuredDocumentRegionHandler(StructuredDocumentRegionHandler handler) {
    if (fStructuredDocumentRegionHandlers != null && fStructuredDocumentRegionHandlers.contains(handler)) {
        fStructuredDocumentRegionHandlers.remove(handler);
    }
}
/**
 * Resets the input to the given stream: clears any previous input state,
 * then points the tokenizer at the stream.
 *
 * @param instream
 *            the stream to tokenize
 */
public void reset(java.io.FileInputStream instream) {
    primReset();
    BlockTokenizer tokenizer = getTokenizer();
    tokenizer.reset(instream);
}
/**
 * Resets the input to the given reader, starting at position 0.
 *
 * @param reader
 *            the reader to tokenize
 */
public void reset(java.io.Reader reader) {
    final int startPosition = 0;
    reset(reader, startPosition);
}
/**
 * Resets the input to the given reader and position. Previous input
 * state is cleared, the position is recorded as fOffset, and the
 * tokenizer is reset onto the reader. The underlying source is then
 * remembered so that text can be retrieved later: a DocumentReader
 * contributes its document (as a CharSequence when it is one, otherwise
 * as a plain IDocument) and a CharSequenceReader contributes its
 * original source. Other readers leave no retrievable source.
 *
 * @param reader
 *            the reader to tokenize
 * @param position
 *            the position handed to the tokenizer and stored as fOffset
 */
public void reset(java.io.Reader reader, int position) {
    primReset();
    fOffset = position;
    getTokenizer().reset(reader, position);
    if (reader instanceof DocumentReader) {
        IDocument document = ((DocumentReader) reader).getDocument();
        if (document instanceof CharSequence) {
            fCharSequenceSource = (CharSequence) document;
        }
        else {
            // old fashioned IDocument
            fDocumentInput = document;
        }
        return;
    }
    if (reader instanceof CharSequenceReader) {
        fCharSequenceSource = ((CharSequenceReader) reader).getOriginalSource();
    }
}
/**
 * Resets the input to the given string. Use this version to allow text to
 * be retrieved <em>during</em> parsing, such as by the
 * StructuredDocumentRegionHandler.
 *
 * @param sourceString
 *            the text to parse
 */
public void reset(String sourceString) {
    StringReader reader = new StringReader(sourceString);
    reset(reader);
    // must come after the reader-based reset, which clears fStringInput
    fStringInput = sourceString;
}
/**
 * Resets the input to the given string and position. Use this version to
 * allow text to be retrieved <em>during</em> parsing, such as by the
 * StructuredDocumentRegionHandler.
 *
 * @param sourceString
 *            the text to parse
 * @param position
 *            the position passed on to reset(Reader, int)
 */
public void reset(String sourceString, int position) {
    reset(new StringReader(sourceString), position);
    // must come after the reader-based reset, which clears fStringInput
    fStringInput = sourceString;
}
/**
 * Calls resetNodes() on every registered handler. Does nothing when the
 * handler list is null.
 */
public void resetHandlers() {
    if (fStructuredDocumentRegionHandlers == null) {
        return;
    }
    int count = fStructuredDocumentRegionHandlers.size();
    int index = 0;
    while (index < count) {
        StructuredDocumentRegionHandler handler = (StructuredDocumentRegionHandler) fStructuredDocumentRegionHandlers.get(index);
        handler.resetNodes();
        index++;
    }
}
/**
 * Replaces the handler list. The given list is stored as-is, not copied.
 *
 * @param newStructuredDocumentRegionHandlers
 *            the list of StructuredDocumentRegionHandler instances to
 *            use from now on; may be null
 */
public void setStructuredDocumentRegionHandlers(List newStructuredDocumentRegionHandlers) {
    this.fStructuredDocumentRegionHandlers = newStructuredDocumentRegionHandlers;
}
/**
 * Installs the tokenizer this parser reads from. Protected rather than
 * private so that subclasses can use it when building their own
 * newInstance().
 *
 * @param newTokenizer
 *            the tokenizer to use; if null, getTokenizer() creates a
 *            default one on next access
 */
protected void setTokenizer(BlockTokenizer newTokenizer) {
    this.fTokenizer = newTokenizer;
}
}