"""ANTLR3 runtime package""" | |
# begin[licence] | |
# | |
# [The "BSD licence"] | |
# Copyright (c) 2005-2008 Terence Parr | |
# All rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without | |
# modification, are permitted provided that the following conditions | |
# are met: | |
# 1. Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# 2. Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# 3. The name of the author may not be used to endorse or promote products | |
# derived from this software without specific prior written permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | |
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | |
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | |
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, | |
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | |
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
# | |
# end[licence] | |
import codecs | |
from StringIO import StringIO | |
from antlr3.constants import DEFAULT_CHANNEL, EOF | |
from antlr3.tokens import Token, CommonToken | |
############################################################################ | |
# | |
# basic interfaces | |
# IntStream | |
# +- CharStream | |
# \- TokenStream | |
# | |
# subclasses must implement all methods
# | |
############################################################################ | |
class IntStream(object): | |
""" | |
@brief Base interface for streams of integer values. | |
A simple stream of integers used when all I care about is the char | |
or token type sequence (such as interpretation). | |
""" | |
def consume(self): | |
raise NotImplementedError | |
def LA(self, i): | |
"""Get int at current input pointer + i ahead where i=1 is next int. | |
Negative indexes are allowed. LA(-1) is previous token (token | |
just matched). LA(-i) where i is before first token should | |
yield -1, invalid char / EOF. | |
""" | |
raise NotImplementedError | |
def mark(self): | |
""" | |
Tell the stream to start buffering if it hasn't already. Return | |
current input position, index(), or some other marker so that | |
when passed to rewind() you get back to the same spot. | |
        rewind(mark()) should not affect the input cursor. The Lexer
        tracks line/col info as well as input index so its markers are
        not pure input indexes. Same for tree node streams.
""" | |
raise NotImplementedError | |
def index(self): | |
""" | |
Return the current input symbol index 0..n where n indicates the | |
last symbol has been read. The index is the symbol about to be | |
read not the most recently read symbol. | |
""" | |
raise NotImplementedError | |
def rewind(self, marker=None): | |
""" | |
Reset the stream so that next call to index would return marker. | |
The marker will usually be index() but it doesn't have to be. It's | |
just a marker to indicate what state the stream was in. This is | |
essentially calling release() and seek(). If there are markers | |
created after this marker argument, this routine must unroll them | |
like a stack. Assume the state the stream was in when this marker | |
was created. | |
If marker is None: | |
Rewind to the input position of the last marker. | |
Used currently only after a cyclic DFA and just | |
before starting a sem/syn predicate to get the | |
input position back to the start of the decision. | |
Do not "pop" the marker off the state. mark(i) | |
and rewind(i) should balance still. It is | |
like invoking rewind(last marker) but it should not "pop" | |
the marker off. It's like seek(last marker's input position). | |
""" | |
raise NotImplementedError | |
def release(self, marker=None): | |
""" | |
You may want to commit to a backtrack but don't want to force the | |
stream to keep bookkeeping objects around for a marker that is | |
no longer necessary. This will have the same behavior as | |
rewind() except it releases resources without the backward seek. | |
This must throw away resources for all markers back to the marker | |
argument. So if you're nested 5 levels of mark(), and then release(2) | |
you have to release resources for depths 2..5. | |
""" | |
raise NotImplementedError | |
def seek(self, index): | |
""" | |
Set the input cursor to the position indicated by index. This is | |
normally used to seek ahead in the input stream. No buffering is | |
required to do this unless you know your stream will use seek to | |
move backwards such as when backtracking. | |
This is different from rewind in its multi-directional | |
requirement and in that its argument is strictly an input cursor | |
(index). | |
For char streams, seeking forward must update the stream state such | |
as line number. For seeking backwards, you will be presumably | |
backtracking using the mark/rewind mechanism that restores state and | |
so this method does not need to update state when seeking backwards. | |
Currently, this method is only used for efficient backtracking using | |
memoization, but in the future it may be used for incremental parsing. | |
The index is 0..n-1. A seek to position i means that LA(1) will | |
return the ith symbol. So, seeking to 0 means LA(1) will return the | |
first element in the stream. | |
""" | |
raise NotImplementedError | |
def size(self): | |
""" | |
Only makes sense for streams that buffer everything up probably, but | |
might be useful to display the entire stream or for testing. This | |
value includes a single EOF. | |
""" | |
raise NotImplementedError | |
def getSourceName(self): | |
""" | |
Where are you getting symbols from? Normally, implementations will | |
pass the buck all the way to the lexer who can ask its input stream | |
for the file name or whatever. | |
""" | |
raise NotImplementedError | |
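# The mark/rewind/release contract above is easiest to see on a concrete
# stream. A minimal sketch (illustrative only), using the ANTLRStringStream
# class defined later in this module:
#
#   stream = ANTLRStringStream(u'ab\ncd')
#   marker = stream.mark()        # remember p, line, charPositionInLine
#   stream.consume()              # past 'a'
#   stream.consume()              # past 'b'
#   stream.rewind(marker)         # restore the marked state
#   assert stream.LA(1) == ord(u'a')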
class CharStream(IntStream): | |
""" | |
@brief A source of characters for an ANTLR lexer. | |
This is an abstract class that must be implemented by a subclass. | |
""" | |
# pylint does not realize that this is an interface, too | |
#pylint: disable-msg=W0223 | |
EOF = -1 | |
def substring(self, start, stop): | |
""" | |
For infinite streams, you don't need this; primarily I'm providing | |
a useful interface for action code. Just make sure actions don't | |
use this on streams that don't support it. | |
""" | |
raise NotImplementedError | |
def LT(self, i): | |
""" | |
        Get the ith character of lookahead. This is usually the same as
        LA(i). This will be used for labels in the generated
        lexer code. I'd prefer to return a char here type-wise, but it's
        probably better to be 32-bit clean and be consistent with LA.
""" | |
raise NotImplementedError | |
def getLine(self): | |
"""ANTLR tracks the line information automatically""" | |
raise NotImplementedError | |
def setLine(self, line): | |
""" | |
Because this stream can rewind, we need to be able to reset the line | |
""" | |
raise NotImplementedError | |
def getCharPositionInLine(self): | |
""" | |
The index of the character relative to the beginning of the line 0..n-1 | |
""" | |
raise NotImplementedError | |
def setCharPositionInLine(self, pos): | |
raise NotImplementedError | |
class TokenStream(IntStream): | |
""" | |
@brief A stream of tokens accessing tokens from a TokenSource | |
This is an abstract class that must be implemented by a subclass. | |
""" | |
# pylint does not realize that this is an interface, too | |
#pylint: disable-msg=W0223 | |
def LT(self, k): | |
""" | |
        Get Token at current input pointer + k ahead where k=1 is next Token.
        k<0 indicates tokens in the past. So -1 is previous token and -2 is
        two tokens ago. LT(0) is undefined. For k>=n, return the EOF Token.
        Return None for LT(0) and any index that results in an absolute
        address that is negative.
""" | |
raise NotImplementedError | |
def range(self): | |
""" | |
How far ahead has the stream been asked to look? The return | |
value is a valid index from 0..n-1. | |
""" | |
raise NotImplementedError | |
def get(self, i): | |
""" | |
Get a token at an absolute index i; 0..n-1. This is really only | |
needed for profiling and debugging and token stream rewriting. | |
If you don't want to buffer up tokens, then this method makes no | |
sense for you. Naturally you can't use the rewrite stream feature. | |
I believe DebugTokenStream can easily be altered to not use | |
this method, removing the dependency. | |
""" | |
raise NotImplementedError | |
def getTokenSource(self): | |
""" | |
Where is this stream pulling tokens from? This is not the name, but | |
the object that provides Token objects. | |
""" | |
raise NotImplementedError | |
def toString(self, start=None, stop=None): | |
""" | |
Return the text of all tokens from start to stop, inclusive. | |
        If the stream does not buffer all the tokens then it can just
        return '' or None; users should not access $ruleLabel.text in
        an action in that case.
Because the user is not required to use a token with an index stored | |
in it, we must provide a means for two token objects themselves to | |
indicate the start/end location. Most often this will just delegate | |
to the other toString(int,int). This is also parallel with | |
the TreeNodeStream.toString(Object,Object). | |
""" | |
raise NotImplementedError | |
############################################################################ | |
# | |
# character streams for use in lexers | |
# CharStream | |
# \- ANTLRStringStream | |
# | |
############################################################################ | |
class ANTLRStringStream(CharStream): | |
""" | |
    @brief CharStream that pulls data from a unicode string.
A pretty quick CharStream that pulls all data from an array | |
directly. Every method call counts in the lexer. | |
""" | |
def __init__(self, data): | |
""" | |
@param data This should be a unicode string holding the data you want | |
to parse. If you pass in a byte string, the Lexer will choke on | |
non-ascii data. | |
""" | |
CharStream.__init__(self) | |
# The data being scanned | |
self.strdata = unicode(data) | |
self.data = [ord(c) for c in self.strdata] | |
# How many characters are actually in the buffer | |
self.n = len(data) | |
# 0..n-1 index into string of next char | |
self.p = 0 | |
# line number 1..n within the input | |
self.line = 1 | |
# The index of the character relative to the beginning of the | |
# line 0..n-1 | |
self.charPositionInLine = 0 | |
# A list of CharStreamState objects that tracks the stream state | |
# values line, charPositionInLine, and p that can change as you | |
# move through the input stream. Indexed from 0..markDepth-1. | |
self._markers = [ ] | |
self.lastMarker = None | |
self.markDepth = 0 | |
# What is name or source of this char stream? | |
self.name = None | |
def reset(self): | |
""" | |
Reset the stream so that it's in the same state it was | |
when the object was created *except* the data array is not | |
touched. | |
""" | |
self.p = 0 | |
self.line = 1 | |
self.charPositionInLine = 0 | |
self._markers = [ ] | |
def consume(self): | |
try: | |
if self.data[self.p] == 10: # \n | |
self.line += 1 | |
self.charPositionInLine = 0 | |
else: | |
self.charPositionInLine += 1 | |
self.p += 1 | |
except IndexError: | |
            # happens when we've reached EOF and self.data[self.p] fails;
            # just do nothing
pass | |
def LA(self, i): | |
if i == 0: | |
return 0 # undefined | |
if i < 0: | |
i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1] | |
try: | |
return self.data[self.p+i-1] | |
except IndexError: | |
return EOF | |
def LT(self, i): | |
if i == 0: | |
return 0 # undefined | |
if i < 0: | |
i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1] | |
try: | |
return self.strdata[self.p+i-1] | |
except IndexError: | |
return EOF | |
def index(self): | |
""" | |
Return the current input symbol index 0..n where n indicates the | |
last symbol has been read. The index is the index of char to | |
be returned from LA(1). | |
""" | |
return self.p | |
def size(self): | |
return self.n | |
def mark(self): | |
state = (self.p, self.line, self.charPositionInLine) | |
try: | |
self._markers[self.markDepth] = state | |
except IndexError: | |
self._markers.append(state) | |
self.markDepth += 1 | |
self.lastMarker = self.markDepth | |
return self.lastMarker | |
def rewind(self, marker=None): | |
if marker is None: | |
marker = self.lastMarker | |
p, line, charPositionInLine = self._markers[marker-1] | |
self.seek(p) | |
self.line = line | |
self.charPositionInLine = charPositionInLine | |
self.release(marker) | |
def release(self, marker=None): | |
if marker is None: | |
marker = self.lastMarker | |
self.markDepth = marker-1 | |
def seek(self, index): | |
""" | |
consume() ahead until p==index; can't just set p=index as we must | |
update line and charPositionInLine. | |
""" | |
if index <= self.p: | |
self.p = index # just jump; don't update stream state (line, ...) | |
return | |
# seek forward, consume until p hits index | |
while self.p < index: | |
self.consume() | |
def substring(self, start, stop): | |
return self.strdata[start:stop+1] | |
def getLine(self): | |
"""Using setter/getter methods is deprecated. Use o.line instead.""" | |
return self.line | |
def getCharPositionInLine(self): | |
""" | |
Using setter/getter methods is deprecated. Use o.charPositionInLine | |
instead. | |
""" | |
return self.charPositionInLine | |
def setLine(self, line): | |
"""Using setter/getter methods is deprecated. Use o.line instead.""" | |
self.line = line | |
def setCharPositionInLine(self, pos): | |
""" | |
Using setter/getter methods is deprecated. Use o.charPositionInLine | |
instead. | |
""" | |
self.charPositionInLine = pos | |
def getSourceName(self): | |
return self.name | |
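# A short example (illustrative only) of the cursor bookkeeping above:
# LA() yields integer code points, LT() yields characters, and consume()
# keeps line/charPositionInLine in sync with the input:
#
#   stream = ANTLRStringStream(u'x\ny')
#   assert stream.LA(1) == ord(u'x') and stream.LT(1) == u'x'
#   stream.consume()              # past 'x'
#   stream.consume()              # past '\n'; line becomes 2
#   assert (stream.line, stream.charPositionInLine) == (2, 0)
#   assert stream.substring(0, 1) == u'x\n'   # stop index is inclusive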
class ANTLRFileStream(ANTLRStringStream): | |
""" | |
@brief CharStream that opens a file to read the data. | |
This is a char buffer stream that is loaded from a file | |
all at once when you construct the object. | |
""" | |
def __init__(self, fileName, encoding=None): | |
""" | |
@param fileName The path to the file to be opened. The file will be | |
opened with mode 'rb'. | |
@param encoding If you set the optional encoding argument, then the | |
data will be decoded on the fly. | |
""" | |
self.fileName = fileName | |
fp = codecs.open(fileName, 'rb', encoding) | |
try: | |
data = fp.read() | |
finally: | |
fp.close() | |
ANTLRStringStream.__init__(self, data) | |
def getSourceName(self): | |
"""Deprecated, access o.fileName directly.""" | |
return self.fileName | |
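# Example (illustrative; the path is hypothetical): load and decode a file
# up front, then hand it to a lexer like any other CharStream:
#
#   stream = ANTLRFileStream('input.txt', encoding='utf-8')
#   print stream.getSourceName()              # 'input.txt'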
class ANTLRInputStream(ANTLRStringStream): | |
""" | |
@brief CharStream that reads data from a file-like object. | |
This is a char buffer stream that is loaded from a file like object | |
all at once when you construct the object. | |
All input is consumed from the file, but it is not closed. | |
""" | |
def __init__(self, file, encoding=None): | |
""" | |
@param file A file-like object holding your input. Only the read() | |
method must be implemented. | |
@param encoding If you set the optional encoding argument, then the | |
data will be decoded on the fly. | |
""" | |
if encoding is not None: | |
# wrap input in a decoding reader | |
reader = codecs.lookup(encoding)[2] | |
file = reader(file) | |
data = file.read() | |
ANTLRStringStream.__init__(self, data) | |
# I guess the ANTLR prefix exists only to avoid a name clash with some Java | |
# mumbojumbo. A plain "StringStream" looks better to me and should be
# the preferred name in Python.
StringStream = ANTLRStringStream | |
FileStream = ANTLRFileStream | |
InputStream = ANTLRInputStream | |
############################################################################ | |
# | |
# Token streams | |
# TokenStream | |
# +- CommonTokenStream | |
# \- TokenRewriteStream | |
# | |
############################################################################ | |
class CommonTokenStream(TokenStream): | |
""" | |
@brief The most common stream of tokens | |
The most common stream of tokens is one where every token is buffered up | |
and tokens are prefiltered for a certain channel (the parser will only | |
see these tokens and cannot change the filter channel number during the | |
parse). | |
""" | |
def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL): | |
""" | |
@param tokenSource A TokenSource instance (usually a Lexer) to pull | |
the tokens from. | |
@param channel Skip tokens on any channel but this one; this is how we | |
skip whitespace... | |
""" | |
TokenStream.__init__(self) | |
self.tokenSource = tokenSource | |
# Record every single token pulled from the source so we can reproduce | |
# chunks of it later. | |
self.tokens = [] | |
# Map<tokentype, channel> to override some Tokens' channel numbers | |
self.channelOverrideMap = {} | |
# Set<tokentype>; discard any tokens with this type | |
self.discardSet = set() | |
# Skip tokens on any channel but this one; this is how we skip | |
# whitespace... | |
self.channel = channel | |
# By default, track all incoming tokens | |
self.discardOffChannelTokens = False | |
# The index into the tokens list of the current token (next token | |
# to consume). p==-1 indicates that the tokens list is empty | |
self.p = -1 | |
# Remember last marked position | |
self.lastMarker = None | |
# how deep have we gone? | |
self._range = -1 | |
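    # Usage sketch (illustrative; TLexer and TParser stand in for classes
    # generated by ANTLR from a grammar named T):
    #
    #   char_stream = ANTLRStringStream(u'...input...')
    #   lexer = TLexer(char_stream)
    #   tokens = CommonTokenStream(lexer)
    #   parser = TParser(tokens)
    #   parser.startRule()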
def makeEOFToken(self): | |
return self.tokenSource.makeEOFToken() | |
def setTokenSource(self, tokenSource): | |
"""Reset this token stream by setting its token source.""" | |
self.tokenSource = tokenSource | |
self.tokens = [] | |
self.p = -1 | |
self.channel = DEFAULT_CHANNEL | |
def reset(self): | |
self.p = 0 | |
self.lastMarker = None | |
def fillBuffer(self): | |
""" | |
        Load all tokens from the token source and put them in self.tokens.
This is done upon first LT request because you might want to | |
set some token type / channel overrides before filling buffer. | |
""" | |
index = 0 | |
t = self.tokenSource.nextToken() | |
while t is not None and t.type != EOF: | |
discard = False | |
if self.discardSet is not None and t.type in self.discardSet: | |
discard = True | |
elif self.discardOffChannelTokens and t.channel != self.channel: | |
discard = True | |
# is there a channel override for token type? | |
try: | |
overrideChannel = self.channelOverrideMap[t.type] | |
except KeyError: | |
# no override for this type | |
pass | |
else: | |
if overrideChannel == self.channel: | |
t.channel = overrideChannel | |
else: | |
discard = True | |
if not discard: | |
t.index = index | |
self.tokens.append(t) | |
index += 1 | |
t = self.tokenSource.nextToken() | |
# leave p pointing at first token on channel | |
self.p = 0 | |
self.p = self.skipOffTokenChannels(self.p) | |
def consume(self): | |
""" | |
Move the input pointer to the next incoming token. The stream | |
must become active with LT(1) available. consume() simply | |
moves the input pointer so that LT(1) points at the next | |
input symbol. Consume at least one token. | |
Walk past any token not on the channel the parser is listening to. | |
""" | |
if self.p < len(self.tokens): | |
self.p += 1 | |
self.p = self.skipOffTokenChannels(self.p) # leave p on valid token | |
def skipOffTokenChannels(self, i): | |
""" | |
Given a starting index, return the index of the first on-channel | |
token. | |
""" | |
try: | |
while self.tokens[i].channel != self.channel: | |
i += 1 | |
except IndexError: | |
# hit the end of token stream | |
pass | |
return i | |
def skipOffTokenChannelsReverse(self, i): | |
while i >= 0 and self.tokens[i].channel != self.channel: | |
i -= 1 | |
return i | |
def setTokenTypeChannel(self, ttype, channel): | |
""" | |
A simple filter mechanism whereby you can tell this token stream | |
to force all tokens of type ttype to be on channel. For example, | |
when interpreting, we cannot exec actions so we need to tell | |
the stream to force all WS and NEWLINE to be a different, ignored | |
channel. | |
""" | |
self.channelOverrideMap[ttype] = channel | |
def discardTokenType(self, ttype): | |
self.discardSet.add(ttype) | |
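    # Example (illustrative; WS and COMMENT are hypothetical token types from
    # a generated lexer). Overrides and discards must be registered before
    # the first LT() call triggers fillBuffer():
    #
    #   tokens = CommonTokenStream(lexer)
    #   tokens.setTokenTypeChannel(WS, 99)   # WS now off the parser's channel
    #   tokens.discardTokenType(COMMENT)     # drop COMMENT tokens entirely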
def getTokens(self, start=None, stop=None, types=None): | |
""" | |
Given a start and stop index, return a list of all tokens in | |
the token type set. Return None if no tokens were found. This | |
method looks at both on and off channel tokens. | |
""" | |
if self.p == -1: | |
self.fillBuffer() | |
if stop is None or stop > len(self.tokens): | |
stop = len(self.tokens) | |
        if start is None or start < 0:
start = 0 | |
if start > stop: | |
return None | |
if isinstance(types, (int, long)): | |
# called with a single type, wrap into set | |
types = set([types]) | |
filteredTokens = [ | |
token for token in self.tokens[start:stop] | |
if types is None or token.type in types | |
] | |
if len(filteredTokens) == 0: | |
return None | |
return filteredTokens | |
def LT(self, k): | |
""" | |
        Get the kth token from the current position 1..n where k=1 is the
        first symbol of lookahead.
""" | |
if self.p == -1: | |
self.fillBuffer() | |
if k == 0: | |
return None | |
if k < 0: | |
return self.LB(-k) | |
i = self.p | |
n = 1 | |
# find k good tokens | |
while n < k: | |
# skip off-channel tokens | |
            i = self.skipOffTokenChannels(i+1) # leave i on valid token
n += 1 | |
if i > self._range: | |
self._range = i | |
try: | |
return self.tokens[i] | |
except IndexError: | |
return self.makeEOFToken() | |
def LB(self, k): | |
"""Look backwards k tokens on-channel tokens""" | |
if self.p == -1: | |
self.fillBuffer() | |
if k == 0: | |
return None | |
if self.p - k < 0: | |
return None | |
i = self.p | |
n = 1 | |
# find k good tokens looking backwards | |
while n <= k: | |
# skip off-channel tokens | |
            i = self.skipOffTokenChannelsReverse(i-1) # leave i on valid token
n += 1 | |
if i < 0: | |
return None | |
return self.tokens[i] | |
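    # Example (illustrative): mid-parse, LT() looks over off-channel tokens
    # in both directions:
    #
    #   tokens.LT(1)     # next on-channel token
    #   tokens.LT(2)     # the one after that
    #   tokens.LT(-1)    # most recently consumed on-channel token (via LB)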
def get(self, i): | |
""" | |
Return absolute token i; ignore which channel the tokens are on; | |
that is, count all tokens not just on-channel tokens. | |
""" | |
return self.tokens[i] | |
def slice(self, start, stop): | |
if self.p == -1: | |
self.fillBuffer() | |
if start < 0 or stop < 0: | |
return None | |
return self.tokens[start:stop+1] | |
def LA(self, i): | |
return self.LT(i).type | |
def mark(self): | |
self.lastMarker = self.index() | |
return self.lastMarker | |
def release(self, marker=None): | |
# no resources to release | |
pass | |
def size(self): | |
return len(self.tokens) | |
def range(self): | |
return self._range | |
def index(self): | |
return self.p | |
def rewind(self, marker=None): | |
if marker is None: | |
marker = self.lastMarker | |
self.seek(marker) | |
def seek(self, index): | |
self.p = index | |
def getTokenSource(self): | |
return self.tokenSource | |
def getSourceName(self): | |
return self.tokenSource.getSourceName() | |
def toString(self, start=None, stop=None): | |
if self.p == -1: | |
self.fillBuffer() | |
if start is None: | |
start = 0 | |
elif not isinstance(start, int): | |
start = start.index | |
if stop is None: | |
stop = len(self.tokens) - 1 | |
elif not isinstance(stop, int): | |
stop = stop.index | |
if stop >= len(self.tokens): | |
stop = len(self.tokens) - 1 | |
return ''.join([t.text for t in self.tokens[start:stop+1]]) | |
class RewriteOperation(object): | |
"""@brief Internal helper class.""" | |
def __init__(self, stream, index, text): | |
self.stream = stream | |
# What index into rewrites List are we? | |
self.instructionIndex = None | |
# Token buffer index. | |
self.index = index | |
self.text = text | |
def execute(self, buf): | |
"""Execute the rewrite operation by possibly adding to the buffer. | |
Return the index of the next token to operate on. | |
""" | |
return self.index | |
def toString(self): | |
opName = self.__class__.__name__ | |
return '<%s@%d:"%s">' % ( | |
opName, self.index, self.text) | |
__str__ = toString | |
__repr__ = toString | |
class InsertBeforeOp(RewriteOperation): | |
"""@brief Internal helper class.""" | |
def execute(self, buf): | |
buf.write(self.text) | |
if self.stream.tokens[self.index].type != EOF: | |
buf.write(self.stream.tokens[self.index].text) | |
return self.index + 1 | |
class ReplaceOp(RewriteOperation): | |
""" | |
@brief Internal helper class. | |
I'm going to try replacing range from x..y with (y-x)+1 ReplaceOp | |
instructions. | |
""" | |
def __init__(self, stream, first, last, text): | |
RewriteOperation.__init__(self, stream, first, text) | |
self.lastIndex = last | |
def execute(self, buf): | |
if self.text is not None: | |
buf.write(self.text) | |
return self.lastIndex + 1 | |
def toString(self): | |
if self.text is None: | |
return '<DeleteOp@%d..%d>' % (self.index, self.lastIndex) | |
return '<ReplaceOp@%d..%d:"%s">' % ( | |
self.index, self.lastIndex, self.text) | |
__str__ = toString | |
__repr__ = toString | |
class TokenRewriteStream(CommonTokenStream): | |
"""@brief CommonTokenStream that can be modified. | |
Useful for dumping out the input stream after doing some | |
augmentation or other manipulations. | |
You can insert stuff, replace, and delete chunks. Note that the | |
operations are done lazily--only if you convert the buffer to a | |
String. This is very efficient because you are not moving data around | |
all the time. As the buffer of tokens is converted to strings, the | |
toString() method(s) check to see if there is an operation at the | |
current index. If so, the operation is done and then normal String | |
rendering continues on the buffer. This is like having multiple Turing | |
machine instruction streams (programs) operating on a single input tape. :) | |
Since the operations are done lazily at toString-time, operations do not | |
screw up the token index values. That is, an insert operation at token | |
index i does not change the index values for tokens i+1..n-1. | |
Because operations never actually alter the buffer, you may always get | |
the original token stream back without undoing anything. Since | |
the instructions are queued up, you can easily simulate transactions and | |
roll back any changes if there is an error just by removing instructions. | |
For example, | |
CharStream input = new ANTLRFileStream("input"); | |
TLexer lex = new TLexer(input); | |
TokenRewriteStream tokens = new TokenRewriteStream(lex); | |
T parser = new T(tokens); | |
parser.startRule(); | |
Then in the rules, you can execute | |
Token t,u; | |
... | |
        input.insertAfter(t, "text to put after t");
        input.insertAfter(u, "text after u");
System.out.println(tokens.toString()); | |
Actually, you have to cast the 'input' to a TokenRewriteStream. :( | |
You can also have multiple "instruction streams" and get multiple | |
rewrites from a single pass over the input. Just name the instruction | |
streams and use that name again when printing the buffer. This could be | |
useful for generating a C file and also its header file--all from the | |
same buffer: | |
tokens.insertAfter("pass1", t, "text to put after t");} | |
tokens.insertAfter("pass2", u, "text after u");} | |
System.out.println(tokens.toString("pass1")); | |
System.out.println(tokens.toString("pass2")); | |
If you don't use named rewrite streams, a "default" stream is used as | |
the first example shows. | |
""" | |
DEFAULT_PROGRAM_NAME = "default" | |
MIN_TOKEN_INDEX = 0 | |
def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL): | |
CommonTokenStream.__init__(self, tokenSource, channel) | |
# You may have multiple, named streams of rewrite operations. | |
# I'm calling these things "programs." | |
# Maps String (name) -> rewrite (List) | |
self.programs = {} | |
self.programs[self.DEFAULT_PROGRAM_NAME] = [] | |
# Map String (program name) -> Integer index | |
self.lastRewriteTokenIndexes = {} | |
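    # Usage sketch (illustrative; TLexer is a hypothetical generated lexer):
    # queue up edits, then render them lazily with toString():
    #
    #   tokens = TokenRewriteStream(TLexer(ANTLRStringStream(u'abc')))
    #   tokens.fillBuffer()
    #   tokens.insertBefore(0, u'<<')    # before token index 0
    #   tokens.replace(1, 1, u'X')       # replace token at index 1
    #   print tokens.toString()          # edits applied; buffer untouched
    #   print tokens.toOriginalString()  # original text still available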
def rollback(self, *args): | |
""" | |
Rollback the instruction stream for a program so that | |
the indicated instruction (via instructionIndex) is no | |
longer in the stream. UNTESTED! | |
""" | |
if len(args) == 2: | |
programName = args[0] | |
instructionIndex = args[1] | |
elif len(args) == 1: | |
programName = self.DEFAULT_PROGRAM_NAME | |
instructionIndex = args[0] | |
else: | |
raise TypeError("Invalid arguments") | |
p = self.programs.get(programName, None) | |
if p is not None: | |
self.programs[programName] = ( | |
p[self.MIN_TOKEN_INDEX:instructionIndex]) | |
def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME): | |
"""Reset the program so that no instructions exist""" | |
self.rollback(programName, self.MIN_TOKEN_INDEX) | |
def insertAfter(self, *args): | |
if len(args) == 2: | |
programName = self.DEFAULT_PROGRAM_NAME | |
index = args[0] | |
text = args[1] | |
elif len(args) == 3: | |
programName = args[0] | |
index = args[1] | |
text = args[2] | |
else: | |
raise TypeError("Invalid arguments") | |
if isinstance(index, Token): | |
            # index is a Token, grab the stream index from it
index = index.index | |
# to insert after, just insert before next index (even if past end) | |
self.insertBefore(programName, index+1, text) | |
def insertBefore(self, *args): | |
if len(args) == 2: | |
programName = self.DEFAULT_PROGRAM_NAME | |
index = args[0] | |
text = args[1] | |
elif len(args) == 3: | |
programName = args[0] | |
index = args[1] | |
text = args[2] | |
else: | |
raise TypeError("Invalid arguments") | |
if isinstance(index, Token): | |
            # index is a Token, grab the stream index from it
index = index.index | |
op = InsertBeforeOp(self, index, text) | |
rewrites = self.getProgram(programName) | |
op.instructionIndex = len(rewrites) | |
rewrites.append(op) | |
def replace(self, *args): | |
if len(args) == 2: | |
programName = self.DEFAULT_PROGRAM_NAME | |
first = args[0] | |
last = args[0] | |
text = args[1] | |
elif len(args) == 3: | |
programName = self.DEFAULT_PROGRAM_NAME | |
first = args[0] | |
last = args[1] | |
text = args[2] | |
elif len(args) == 4: | |
programName = args[0] | |
first = args[1] | |
last = args[2] | |
text = args[3] | |
else: | |
raise TypeError("Invalid arguments") | |
if isinstance(first, Token): | |
            # first is a Token, grab the stream index from it
first = first.index | |
if isinstance(last, Token): | |
            # last is a Token, grab the stream index from it
last = last.index | |
if first > last or first < 0 or last < 0 or last >= len(self.tokens): | |
raise ValueError( | |
"replace: range invalid: %d..%d (size=%d)" | |
% (first, last, len(self.tokens))) | |
op = ReplaceOp(self, first, last, text) | |
rewrites = self.getProgram(programName) | |
op.instructionIndex = len(rewrites) | |
rewrites.append(op) | |
def delete(self, *args): | |
self.replace(*(list(args) + [None])) | |
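    # Example (illustrative): replace() dispatches on argument count, and
    # delete() is replace() with None text; indexes may be ints or Tokens:
    #
    #   tokens.replace(5, u'new')             # single token at index 5
    #   tokens.replace(3, 7, u'new')          # token range 3..7
    #   tokens.replace('pass2', 3, 7, u'new') # same, in a named program
    #   tokens.delete(3, 7)                   # queue a deletion of 3..7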
def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME): | |
return self.lastRewriteTokenIndexes.get(programName, -1) | |
def setLastRewriteTokenIndex(self, programName, i): | |
self.lastRewriteTokenIndexes[programName] = i | |
def getProgram(self, name): | |
p = self.programs.get(name, None) | |
if p is None: | |
p = self.initializeProgram(name) | |
return p | |
def initializeProgram(self, name): | |
p = [] | |
self.programs[name] = p | |
return p | |
def toOriginalString(self, start=None, end=None): | |
if self.p == -1: | |
self.fillBuffer() | |
if start is None: | |
start = self.MIN_TOKEN_INDEX | |
if end is None: | |
end = self.size() - 1 | |
buf = StringIO() | |
i = start | |
while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens): | |
if self.get(i).type != EOF: | |
buf.write(self.get(i).text) | |
i += 1 | |
return buf.getvalue() | |
def toString(self, *args): | |
if self.p == -1: | |
self.fillBuffer() | |
if len(args) == 0: | |
programName = self.DEFAULT_PROGRAM_NAME | |
start = self.MIN_TOKEN_INDEX | |
end = self.size() - 1 | |
elif len(args) == 1: | |
programName = args[0] | |
start = self.MIN_TOKEN_INDEX | |
end = self.size() - 1 | |
elif len(args) == 2: | |
programName = self.DEFAULT_PROGRAM_NAME | |
start = args[0] | |
end = args[1] | |
if start is None: | |
start = self.MIN_TOKEN_INDEX | |
elif not isinstance(start, int): | |
start = start.index | |
if end is None: | |
end = len(self.tokens) - 1 | |
elif not isinstance(end, int): | |
end = end.index | |
# ensure start/end are in range | |
if end >= len(self.tokens): | |
end = len(self.tokens) - 1 | |
if start < 0: | |
start = 0 | |
rewrites = self.programs.get(programName) | |
if rewrites is None or len(rewrites) == 0: | |
# no instructions to execute | |
return self.toOriginalString(start, end) | |
buf = StringIO() | |
# First, optimize instruction stream | |
indexToOp = self.reduceToSingleOperationPerIndex(rewrites) | |
# Walk buffer, executing instructions and emitting tokens | |
i = start | |
while i <= end and i < len(self.tokens): | |
op = indexToOp.get(i) | |
# remove so any left have index size-1 | |
try: | |
del indexToOp[i] | |
except KeyError: | |
pass | |
t = self.tokens[i] | |
if op is None: | |
# no operation at that index, just dump token | |
if t.type != EOF: | |
buf.write(t.text) | |
i += 1 # move to next token | |
else: | |
i = op.execute(buf) # execute operation and skip | |
        # include stuff after end if it's the last index in the buffer.
        # So, if they did an insertAfter(lastValidIndex, "foo"), include
        # foo if end==lastValidIndex.
if end == len(self.tokens) - 1: | |
            # Scan any remaining operations after the last token; they
            # should be included (they will be inserts).
for i in sorted(indexToOp.keys()): | |
op = indexToOp[i] | |
if op.index >= len(self.tokens)-1: | |
buf.write(op.text) | |
return buf.getvalue() | |
__str__ = toString | |
def reduceToSingleOperationPerIndex(self, rewrites): | |
""" | |
        We need to combine operations and report invalid operations (like
        overlapping replaces that are not completely nested). Inserts to
        the same index need to be combined etc... Here are the cases:
        I.i.u I.j.v                          leave alone, nonoverlapping
        I.i.u I.i.v                          combine: Iivu

        R.i-j.u R.x-y.v | i-j in x-y         delete first R
        R.i-j.u R.i-j.v                      delete first R
        R.i-j.u R.x-y.v | x-y in i-j         ERROR
        R.i-j.u R.x-y.v | boundaries overlap ERROR

        Delete special case of replace (text==null):
        D.i-j.u D.x-y.v | boundaries overlap combine to
                                             max(min)..max(right)

        I.i.u R.x-y.v   | i in (x+1)-y       delete I (since insert before
                                             we're not deleting i)
        I.i.u R.x-y.v   | i not in (x+1)-y   leave alone, nonoverlapping
        R.x-y.v I.i.u   | i in x-y           ERROR
        R.x-y.v I.x.u                        R.x-y.uv (combine, delete I)
        R.x-y.v I.i.u   | i not in x-y       leave alone, nonoverlapping
I.i.u = insert u before op @ index i | |
R.x-y.u = replace x-y indexed tokens with u | |
First we need to examine replaces. For any replace op: | |
1. wipe out any insertions before op within that range. | |
2. Drop any replace op before that is contained completely within | |
that range. | |
3. Throw exception upon boundary overlap with any previous replace. | |
Then we can deal with inserts: | |
1. for any inserts to same index, combine even if not adjacent. | |
2. for any prior replace with same left boundary, combine this | |
insert with replace and delete this replace. | |
3. throw exception if index in same range as previous replace | |
Don't actually delete; make op null in list. Easier to walk list. | |
Later we can throw as we add to index -> op map. | |
Note that I.2 R.2-2 will wipe out I.2 even though, technically, the | |
inserted stuff would be before the replace range. But, if you | |
add tokens in front of a method body '{' and then delete the method | |
body, I think the stuff before the '{' you added should disappear too. | |
Return a map from token index to operation. | |
""" | |
# WALK REPLACES | |
for i, rop in enumerate(rewrites): | |
if rop is None: | |
continue | |
if not isinstance(rop, ReplaceOp): | |
continue | |
# Wipe prior inserts within range | |
for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i): | |
if iop.index == rop.index: | |
# E.g., insert before 2, delete 2..2; update replace | |
# text to include insert before, kill insert | |
rewrites[iop.instructionIndex] = None | |
rop.text = self.catOpText(iop.text, rop.text) | |
elif iop.index > rop.index and iop.index <= rop.lastIndex: | |
# delete insert as it's a no-op. | |
rewrites[j] = None | |
# Drop any prior replaces contained within | |
for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i): | |
if (prevRop.index >= rop.index | |
and prevRop.lastIndex <= rop.lastIndex): | |
# delete replace as it's a no-op. | |
rewrites[j] = None | |
continue | |
# throw exception unless disjoint or identical | |
disjoint = (prevRop.lastIndex < rop.index | |
or prevRop.index > rop.lastIndex) | |
same = (prevRop.index == rop.index | |
and prevRop.lastIndex == rop.lastIndex) | |
                # Delete special case of replace (text==null):
                # D.i-j.u D.x-y.v | boundaries overlap => combine to
                # max(min)..max(right)
if prevRop.text is None and rop.text is None and not disjoint: | |
# kill first delete | |
rewrites[prevRop.instructionIndex] = None | |
rop.index = min(prevRop.index, rop.index) | |
rop.lastIndex = max(prevRop.lastIndex, rop.lastIndex) | |
elif not disjoint and not same: | |
raise ValueError( | |
"replace op boundaries of %s overlap with previous %s" | |
% (rop, prevRop)) | |
# WALK INSERTS | |
for i, iop in enumerate(rewrites): | |
if iop is None: | |
continue | |
if not isinstance(iop, InsertBeforeOp): | |
continue | |
# combine current insert with prior if any at same index | |
for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i): | |
if prevIop.index == iop.index: # combine objects | |
# convert to strings...we're in process of toString'ing | |
# whole token buffer so no lazy eval issue with any | |
# templates | |
iop.text = self.catOpText(iop.text, prevIop.text) | |
# delete redundant prior insert | |
rewrites[j] = None | |
# look for replaces where iop.index is in range; error | |
for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i): | |
if iop.index == rop.index: | |
rop.text = self.catOpText(iop.text, rop.text) | |
# delete current insert | |
rewrites[i] = None | |
continue | |
if iop.index >= rop.index and iop.index <= rop.lastIndex: | |
raise ValueError( | |
"insert op %s within boundaries of previous %s" | |
% (iop, rop)) | |
m = {} | |
for i, op in enumerate(rewrites): | |
if op is None: | |
# ignore deleted ops | |
continue | |
assert op.index not in m, "should only be one op per index" | |
m[op.index] = op | |
return m | |
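    # Example (illustrative) of the combining rules above: two inserts at the
    # same token index reduce to a single InsertBeforeOp, with the later
    # instruction's text in front:
    #
    #   tokens.insertBefore(1, u'a')
    #   tokens.insertBefore(1, u'b')
    #   # at toString() time this acts like one insertBefore(1, u'ba')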
def catOpText(self, a, b): | |
x = "" | |
y = "" | |
if a is not None: | |
x = a | |
if b is not None: | |
y = b | |
return x + y | |
def getKindOfOps(self, rewrites, kind, before=None): | |
"""Get all operations before an index of a particular kind.""" | |
if before is None: | |
before = len(rewrites) | |
elif before > len(rewrites): | |
before = len(rewrites) | |
for i, op in enumerate(rewrites[:before]): | |
if op is None: | |
# ignore deleted | |
continue | |
if op.__class__ == kind: | |
yield i, op | |
def toDebugString(self, start=None, end=None): | |
if start is None: | |
start = self.MIN_TOKEN_INDEX | |
if end is None: | |
end = self.size() - 1 | |
buf = StringIO() | |
i = start | |
while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens): | |
            buf.write(str(self.get(i)))
i += 1 | |
return buf.getvalue() |