blob: 21f236bd1aa23fcc9ec1b79a71cb19e528d97ae7 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2007 Wind River Systems, Inc. and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* UIUC - Photran modifications
* Markus Schorn - initial API and implementation
*******************************************************************************/
/**
* Class edited by Matthew Michelotti.
*
* Overview of changes to Lexer:
* -Redirected the original IToken, Token, TokenWithImage, TokenForDigraph,
* and OffsetLimitReachedException class references to references in my
* package of edited versions of those classes.
* -Changed the fetchToken function to construct tokens which remember
* the white spaces that proceed them. This was done by making a new int
* named spacesStart which corresponds to the start index of the spaces,
* and passing this as an extra variable to all of the token-generating
* functions that fetchToken calls.
* -Updated the functions newToken (2 functions with this name),
* stringLiteral, charLiteral, identifier, headerName, number, and
* newDigraphToken to accept an additional parameter called spacesStart.
* These functions are all called by fetchToken to get a certain type
* of token. I changed the code so that it would also obtain the
* characters from spacesStart to offset, consider these to be white-space
* characters, and add them to the token using my edited constructors
* for Token, TokenWithImage, and TokenForDigraph.
*
* -Added a field called fParentToken, which is an IToken that will be
* set as the parent of each token that this lexer fetches.
* -Re-named fetchToken method as innerFetchToken, and made a new fetchToken
* method. The new fetchToken method will call innerFetchToken, and also
* adds the field fParentToken to the token. NOTE: this functionality should
* probably be moved to the original function in the future, to avoid
* a wrapper function
* -Added a constructor that allows fParentToken to be passed in as an
* argument.
*
* -Added a function getRawChars to get characters from fInput given
* an offset and endOffset.
*
* -Edited the new fetchToken function to remember trigraphs and "\r\n" 's
* as parents of the constructed token, by checking if the length of the
* token image matches the difference in start and end offset.
* -Added a token type tPRE_PHASE_3 which represents a token parent for
* a trigraph or "\r\n" ( which is changed in nextCharPhase3() )
*
*
* Note: These modifications give tokens information about the white
* spaces that come before them. This includes '\\' and '\n' characters in
* line-splices, and comments. No modifications were made to the
* nextDirective method.
*/
package org.eclipse.photran.internal.core.preprocessor.c;
import org.eclipse.cdt.core.parser.IGCCToken;
import org.eclipse.cdt.core.parser.IProblem;
/**
* In short this class converts line endings (to '\n') and trigraphs
* (to their corresponding character),
* removes line-splices, comments and whitespace other than newline.
* Returns preprocessor tokens.
* <p>
* In addition to the preprocessor tokens the following tokens may also be returned:
* {@link #tBEFORE_INPUT}, {@link IToken#tEND_OF_INPUT}, {@link IToken#tCOMPLETION}.
* <p>
* Number literals are split up into {@link IToken#tINTEGER} and {@link IToken#tFLOATINGPT}.
* No checks are done on the number literals.
* <p>
* UNCs are accepted, however characters from outside of the basic source character set are
* not converted to UNCs. Rather than that they are tested with
* {@link Character#isUnicodeIdentifierPart(char)} and may be accepted as part of an
* identifier.
* <p>
* The characters in string literals and char-literals are left as they are found, no conversion to
* an execution character-set is performed.
*/
final public class Lexer {
public static final int tBEFORE_INPUT = IToken.FIRST_RESERVED_SCANNER;
public static final int tNEWLINE = IToken.FIRST_RESERVED_SCANNER + 1;
public static final int tQUOTE_HEADER_NAME = IToken.FIRST_RESERVED_SCANNER + 2;
public static final int tSYSTEM_HEADER_NAME = IToken.FIRST_RESERVED_SCANNER + 3;
public static final int tOTHER_CHARACTER = IToken.FIRST_RESERVED_SCANNER + 4;
public static final int tPRE_PHASE_3 = IToken.FIRST_RESERVED_SCANNER + 5; //added by MM
private static final int END_OF_INPUT = -1;
private static final int ORIGIN_LEXER = OffsetLimitReachedException.ORIGIN_LEXER;
public final static class LexerOptions implements Cloneable {
/**is '$' an allowed character of an identifier*/
public boolean fSupportDollarInIdentifiers= true;
/**is '@' an allowed character of an identifier*/
public boolean fSupportAtSignInIdentifiers= true;
/**should "&lt;?" and ">?" be IGCCToken.tMIN/tMAX tokens*/
public boolean fSupportMinAndMax= true;
/**not used by Lexer class...*/
public boolean fCreateImageLocations= true;
/**is "/% ... %/" treated as a comment*/
public boolean fSupportSlashPercentComments= false;
@Override
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException e) {
return null;
}
}
}
// configuration
private final LexerOptions fOptions;
/**when true, handles END_OF_INPUT token differently*/
private boolean fSupportContentAssist= false;
private final ILexerLog fLog;
/**the source Object for all Tokens that this Lexer makes*/
private final Object fSource;
// the input to the lexer
/**array of characters to parse*/
private final char[] fInput;
/**fInput index of place to start parsing*/
private int fStart;
/**fInput index of place to stop parsing*/
private int fLimit;
// after phase 3 (newline, trigraph, line-splice)
/**offset of fCharPhase3 character, trigraph, or "\r\n"*/
private int fOffset;
/**end of offset of fCharPhase3 character, trigraph, or "\r\n"*/
private int fEndOffset;
/**A char or END_OF_INPUT. This is the current char represented by
* a char in the input, a trigraph, or a "\r\n".*/
private int fCharPhase3;
private boolean fInsideIncludeDirective= false;
private Token fToken;
private Token fLastToken;
// for the few cases where we have to lookahead more than one character
// ( these variables are only used by markPhase3() and restorePhase3() )
private int fMarkOffset;
private int fMarkEndOffset;
private int fMarkPrefetchedChar;
/**If applicable, this is the "#include" token that invoked this Lexer*/
private IToken fParentToken = null; //added by MM
public Lexer(char[] input, LexerOptions options, ILexerLog log, Object source) {
this(input, 0, input.length, options, log, source);
}
//function added by MM
public Lexer(char[] input, LexerOptions options, ILexerLog log, Object source, IToken parentToken) {
this(input, 0, input.length, options, log, source);
fParentToken = parentToken;
}
public Lexer(char[] input, int start, int end, LexerOptions options, ILexerLog log, Object source) {
fInput= input;
fStart= fOffset= fEndOffset= start;
fLimit= end;
fOptions= options;
fLog= log;
fSource= source;
fLastToken= fToken= new Token(tBEFORE_INPUT, source, start, start);
nextCharPhase3();
}
//added by MM
public char[] getRawChars(int offset, int endOffset) {
if(offset < fStart || endOffset > fLimit) return null;
char[] result = new char[endOffset-offset];
System.arraycopy(fInput, offset, result, 0, result.length);
return result;
}
//added by MM
public IToken getParentToken() {
return fParentToken;
}
/**
* Returns the source that is attached to the tokens generated by this lexer
*/
public Object getSource() {
return fSource;
}
/**
* Resets the lexer to the first char and prepares for content-assist mode.
*/
public void setContentAssistMode(int offset) {
fSupportContentAssist= true;
fLimit= Math.min(offset, fInput.length);
// re-initialize
fOffset= fEndOffset= fStart;
nextCharPhase3();
}
/**
* Call this before consuming the name-token in the include directive. It causes the header-file
* tokens to be created.
*/
public void setInsideIncludeDirective(boolean val) {
fInsideIncludeDirective= val;
}
/**
* Returns the current preprocessor token, does not advance.
*/
public Token currentToken() {
return fToken;
}
/**
* Returns the endoffset of the token before the current one.
*/
public int getLastEndOffset() {
return fLastToken.getEndOffset();
}
/**
* Advances to the next token, skipping whitespace other than newline.
* @throws OffsetLimitReachedException when completion is requested in a literal or a header-name.
*/
public Token nextToken() throws OffsetLimitReachedException {
fLastToken= fToken;
return fToken= fetchToken();
}
public boolean currentTokenIsFirstOnLine() {
final int type= fLastToken.getType();
return type == tNEWLINE || type == tBEFORE_INPUT;
}
/**
* Advances to the next newline or the end of input. The newline will not be consumed. If the
* current token is a newline no action is performed.
* Returns the end offset of the last token before the newline.
* @param origin parameter for the {@link OffsetLimitReachedException} when it has to be thrown.
* @since 5.0
*/
@SuppressWarnings("fallthrough")
public final int consumeLine(int origin) throws OffsetLimitReachedException {
Token t= fToken;
Token lt= null;
while(true) {
switch(t.getType()) {
case IToken.tCOMPLETION:
if (lt != null) {
fLastToken= lt;
}
fToken= t;
throw new OffsetLimitReachedException(origin, t);
case IToken.tEND_OF_INPUT:
if (fSupportContentAssist) {
t.setType(IToken.tCOMPLETION);
throw new OffsetLimitReachedException(origin, t);
}
// no break;
case Lexer.tNEWLINE:
fToken= t;
if (lt != null) {
fLastToken= lt;
}
return getLastEndOffset();
}
lt= t;
t= fetchToken();
}
}
/**
* Advances to the next pound token that starts a preprocessor directive.
* @return pound token of the directive or end-of-input.
* @throws OffsetLimitReachedException when completion is requested in a literal or an header-name.
*/
public Token nextDirective() throws OffsetLimitReachedException {
fInsideIncludeDirective= false;
final Token t= fToken;
boolean haveNL= t==null || t.getType() == tNEWLINE;
while(true) {
final boolean hadNL= haveNL;
haveNL= false;
final int start= fOffset;
final int c= fCharPhase3;
// optimization avoids calling nextCharPhase3
int d;
final int pos= fEndOffset;
if (pos+1 >= fLimit) {
d= nextCharPhase3();
}
else {
d= fInput[pos];
switch(d) {
case '\\':
d= nextCharPhase3();
break;
case '?':
if (fInput[pos+1] == '?') {
d= nextCharPhase3();
break;
}
fOffset= pos;
fCharPhase3= d;
fEndOffset= pos+1;
break;
default:
fOffset= pos;
fCharPhase3= d;
fEndOffset= pos+1;
break;
}
}
switch(c) {
case END_OF_INPUT:
fLastToken= fToken= newToken(IToken.tEND_OF_INPUT, start, start);
return fToken;
case '\n':
haveNL= true;
continue;
case ' ':
case '\t':
case 0xb: // vertical tab
case '\f':
case '\r':
haveNL= hadNL;
continue;
case '"':
stringLiteral(start, start, false);
continue;
case '\'':
charLiteral(start, start, false);
continue;
case '/':
switch (d) {
case '/':
nextCharPhase3();
lineComment(start);
continue;
case '*':
blockComment(start, '*');
continue;
case '%':
if (fOptions.fSupportSlashPercentComments) {
blockComment(start, '%');
}
continue;
}
continue;
case '%':
if (hadNL) {
if (d == ':') {
// found at least '#'
final int e= nextCharPhase3();
if (e == '%') {
markPhase3();
if (nextCharPhase3() == ':') {
// found '##'
nextCharPhase3();
continue;
}
restorePhase3();
}
fLastToken= new Token(tNEWLINE, fSource, 0, start); // offset not significant
fToken= newDigraphToken(IToken.tPOUND, start, start);
return fToken;
}
}
continue;
case '#':
if (hadNL && d != '#') {
fLastToken= new Token(tNEWLINE, fSource, 0, start); // offset not significant
fToken= newToken(IToken.tPOUND, start, start);
return fToken;
}
continue;
default:
continue;
}
}
}
//added by MM (re-named original fetchToken...)
private Token fetchToken() throws OffsetLimitReachedException {
Token t = innerFetchToken();
Token ancestor = t;
//remember a parent token for trigraphs and "\r\n"
int tOffset = t.getOffset();
int tEndOffset = t.getEndOffset();
int charImageLength;
if(t.getType() == tNEWLINE) charImageLength = 1;
else charImageLength = t.getCharImage().length;
if(tEndOffset - tOffset != charImageLength) {
Token invoker = new TokenWithImage(tPRE_PHASE_3, fSource, tOffset, tEndOffset, this.getRawChars(tOffset, tEndOffset), t.getCharPrecedingWhiteSpace());
ancestor.setParent(invoker);
ancestor = invoker;
}
//if applicable, make an "#include" directive be a parent of this token
if(fParentToken != null) ancestor.setParent(fParentToken);
return t;
}
/**
* Computes the next token.
*/
private Token innerFetchToken() throws OffsetLimitReachedException {
final int spacesStart = fOffset;
while(true) {
final int start= fOffset;
final int c= fCharPhase3;
final int d= nextCharPhase3();
switch(c) {
case END_OF_INPUT:
return newToken(IToken.tEND_OF_INPUT, spacesStart, start);
case '\n':
fInsideIncludeDirective= false;
return newToken(Lexer.tNEWLINE, spacesStart, start);
case ' ':
case '\t':
case 0xb: // vertical tab
case '\f':
case '\r':
continue;
case 'L':
switch(d) {
case '"':
nextCharPhase3();
return stringLiteral(spacesStart, start, true);
case '\'':
nextCharPhase3();
return charLiteral(spacesStart, start, true);
}
return identifier(spacesStart, start, 1);
case '"':
if (fInsideIncludeDirective) {
return headerName(spacesStart, start, true);
}
return stringLiteral(spacesStart, start, false);
case '\'':
return charLiteral(spacesStart, start, false);
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i':
case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I':
case 'J': case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
case '_':
return identifier(spacesStart, start, 1);
case '$':
if (fOptions.fSupportDollarInIdentifiers) {
return identifier(spacesStart, start, 1);
}
break;
case '@':
if (fOptions.fSupportAtSignInIdentifiers) {
return identifier(spacesStart, start, 1);
}
break;
case '\\':
switch(d) {
case 'u': case 'U':
nextCharPhase3();
return identifier(spacesStart, start, 2);
}
return newToken(tOTHER_CHARACTER, spacesStart, start, 1);
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return number(spacesStart, start, 1, false);
case '.':
switch(d) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
nextCharPhase3();
return number(spacesStart, start, 2, true);
case '.':
markPhase3();
if (nextCharPhase3() == '.') {
nextCharPhase3();
return newToken(IToken.tELLIPSIS, spacesStart, start);
}
restorePhase3();
break;
case '*':
nextCharPhase3();
return newToken(IToken.tDOTSTAR, spacesStart, start);
}
return newToken(IToken.tDOT, spacesStart, start);
case '#':
if (d == '#') {
nextCharPhase3();
return newToken(IToken.tPOUNDPOUND, spacesStart, start);
}
return newToken(IToken.tPOUND, spacesStart, start);
case '{':
return newToken(IToken.tLBRACE, spacesStart, start);
case '}':
return newToken(IToken.tRBRACE, spacesStart, start);
case '[':
return newToken(IToken.tLBRACKET, spacesStart, start);
case ']':
return newToken(IToken.tRBRACKET, spacesStart, start);
case '(':
return newToken(IToken.tLPAREN, spacesStart, start);
case ')':
return newToken(IToken.tRPAREN, spacesStart, start);
case ';':
return newToken(IToken.tSEMI, spacesStart, start);
case ':':
switch(d) {
case ':':
nextCharPhase3();
return newToken(IToken.tCOLONCOLON, spacesStart, start);
case '>':
nextCharPhase3();
return newDigraphToken(IToken.tRBRACKET, spacesStart, start);
}
return newToken(IToken.tCOLON, spacesStart, start);
case '?':
return newToken(IToken.tQUESTION, spacesStart, start);
case '+':
switch (d) {
case '+':
nextCharPhase3();
return newToken(IToken.tINCR, spacesStart, start);
case '=':
nextCharPhase3();
return newToken(IToken.tPLUSASSIGN, spacesStart, start);
}
return newToken(IToken.tPLUS, spacesStart, start);
case '-':
switch (d) {
case '>':
int e= nextCharPhase3();
if (e == '*') {
nextCharPhase3();
return newToken(IToken.tARROWSTAR, spacesStart, start);
}
return newToken(IToken.tARROW, spacesStart, start);
case '-':
nextCharPhase3();
return newToken(IToken.tDECR, spacesStart, start);
case '=':
nextCharPhase3();
return newToken(IToken.tMINUSASSIGN, spacesStart, start);
}
return newToken(IToken.tMINUS, spacesStart, start);
case '*':
if (d == '=') {
nextCharPhase3();
return newToken(IToken.tSTARASSIGN, spacesStart, start);
}
return newToken(IToken.tSTAR, spacesStart, start);
case '/':
switch (d) {
case '=':
nextCharPhase3();
return newToken(IToken.tDIVASSIGN, spacesStart, start);
case '/':
nextCharPhase3();
lineComment(start);
continue;
case '*':
blockComment(start, '*');
continue;
case '%':
if (fOptions.fSupportSlashPercentComments) {
blockComment(start, '%');
continue;
}
break;
}
return newToken(IToken.tDIV, spacesStart, start);
case '%':
switch (d) {
case '=':
nextCharPhase3();
return newToken(IToken.tMODASSIGN, spacesStart, start);
case '>':
nextCharPhase3();
return newDigraphToken(IToken.tRBRACE, spacesStart, start);
case ':':
final int e= nextCharPhase3();
if (e == '%') {
markPhase3();
if (nextCharPhase3() == ':') {
nextCharPhase3();
return newDigraphToken(IToken.tPOUNDPOUND, spacesStart, start);
}
restorePhase3();
}
return newDigraphToken(IToken.tPOUND, spacesStart, start);
}
return newToken(IToken.tMOD, spacesStart, start);
case '^':
if (d == '=') {
nextCharPhase3();
return newToken(IToken.tXORASSIGN, spacesStart, start);
}
return newToken(IToken.tXOR, spacesStart, start);
case '&':
switch (d) {
case '&':
nextCharPhase3();
return newToken(IToken.tAND, spacesStart, start);
case '=':
nextCharPhase3();
return newToken(IToken.tAMPERASSIGN, spacesStart, start);
}
return newToken(IToken.tAMPER, spacesStart, start);
case '|':
switch (d) {
case '|':
nextCharPhase3();
return newToken(IToken.tOR, spacesStart, start);
case '=':
nextCharPhase3();
return newToken(IToken.tBITORASSIGN, spacesStart, start);
}
return newToken(IToken.tBITOR, spacesStart, start);
case '~':
return newToken(IToken.tBITCOMPLEMENT, spacesStart, start);
case '!':
if (d == '=') {
nextCharPhase3();
return newToken(IToken.tNOTEQUAL, spacesStart, start);
}
return newToken(IToken.tNOT, spacesStart, start);
case '=':
if (d == '=') {
nextCharPhase3();
return newToken(IToken.tEQUAL, spacesStart, start);
}
return newToken(IToken.tASSIGN, spacesStart, start);
case '<':
if (fInsideIncludeDirective) {
return headerName(spacesStart, start, false);
}
switch(d) {
case '=':
nextCharPhase3();
return newToken(IToken.tLTEQUAL, spacesStart, start);
case '<':
final int e= nextCharPhase3();
if (e == '=') {
nextCharPhase3();
return newToken(IToken.tSHIFTLASSIGN, spacesStart, start);
}
return newToken(IToken.tSHIFTL, spacesStart, start);
case '?':
if (fOptions.fSupportMinAndMax) {
nextCharPhase3();
return newToken(IGCCToken.tMIN, spacesStart, start);
}
break;
case ':':
nextCharPhase3();
return newDigraphToken(IToken.tLBRACKET, spacesStart, start);
case '%':
nextCharPhase3();
return newDigraphToken(IToken.tLBRACE, spacesStart, start);
}
return newToken(IToken.tLT, spacesStart, start);
case '>':
switch(d) {
case '=':
nextCharPhase3();
return newToken(IToken.tGTEQUAL, spacesStart, start);
case '>':
final int e= nextCharPhase3();
if (e == '=') {
nextCharPhase3();
return newToken(IToken.tSHIFTRASSIGN, spacesStart, start);
}
return newToken(IToken.tSHIFTR, spacesStart, start);
case '?':
if (fOptions.fSupportMinAndMax) {
nextCharPhase3();
return newToken(IGCCToken.tMAX, spacesStart, start);
}
break;
}
return newToken(IToken.tGT, spacesStart, start);
case ',':
return newToken(IToken.tCOMMA, spacesStart, start);
default:
// in case we have some other letter to start an identifier
if (Character.isUnicodeIdentifierStart((char) c)) {
return identifier(spacesStart, start, 1);
}
break;
}
// handles for instance @
return newToken(tOTHER_CHARACTER, spacesStart, start, 1);
}
}
private Token newToken(int kind, int spacesStart, int offset) {
char[] spaces = new char[offset - spacesStart];
System.arraycopy(fInput, spacesStart, spaces, 0, spaces.length);
return new Token(kind, fSource, offset, fOffset, spaces);
}
private Token newDigraphToken(int kind, int spacesStart, int offset) {
char[] spaces = new char[offset - spacesStart];
System.arraycopy(fInput, spacesStart, spaces, 0, spaces.length);
return new TokenForDigraph(kind, fSource, offset, fOffset, spaces);
}
private Token newToken(final int kind, final int spacesStart, final int offset, final int imageLength) {
final int endOffset= fOffset;
final int sourceLen= endOffset-offset;
char[] image;
if (sourceLen != imageLength) {
image= getCharImage(offset, endOffset, imageLength);
}
else {
image= new char[imageLength];
System.arraycopy(fInput, offset, image, 0, imageLength);
}
char[] spaces = new char[offset - spacesStart];
System.arraycopy(fInput, spacesStart, spaces, 0, spaces.length);
return new TokenWithImage(kind, fSource, offset, endOffset, image, spaces);
}
private void handleProblem(int problemID, char[] arg, int offset) {
fLog.handleProblem(problemID, arg, offset, fOffset);
}
@SuppressWarnings("fallthrough")
private Token headerName(final int spacesStart, final int start, final boolean expectQuotes) throws OffsetLimitReachedException {
int length= 1;
boolean done = false;
int c= fCharPhase3;
loop: while (!done) {
switch (c) {
case END_OF_INPUT:
if (fSupportContentAssist) {
throw new OffsetLimitReachedException(ORIGIN_LEXER,
newToken((expectQuotes ? tQUOTE_HEADER_NAME : tSYSTEM_HEADER_NAME), spacesStart, start, length));
}
// no break;
case '\n':
handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start);
break loop;
case '"':
done= expectQuotes;
break;
case '>':
done= !expectQuotes;
break;
}
length++;
c= nextCharPhase3();
}
return newToken((expectQuotes ? tQUOTE_HEADER_NAME : tSYSTEM_HEADER_NAME), spacesStart, start, length);
}
private void blockComment(final int start, final char trigger) {
// we can ignore line-splices, trigraphs and windows newlines when searching for the '*'
int pos= fEndOffset;
while(pos < fLimit) {
if (fInput[pos++] == trigger) {
fEndOffset= pos;
if (nextCharPhase3() == '/') {
nextCharPhase3();
fLog.handleComment(true, start, fOffset);
return;
}
}
}
fCharPhase3= END_OF_INPUT;
fOffset= fEndOffset= pos;
fLog.handleComment(true, start, pos);
}
private void lineComment(final int start) {
int c= fCharPhase3;
while(true) {
switch (c) {
case END_OF_INPUT:
case '\n':
fLog.handleComment(false, start, fOffset);
return;
}
c= nextCharPhase3();
}
}
@SuppressWarnings("fallthrough")
private Token stringLiteral(final int spacesStart, final int start, final boolean wide) throws OffsetLimitReachedException {
boolean escaped = false;
boolean done = false;
int length= wide ? 2 : 1;
int c= fCharPhase3;
loop: while (!done) {
switch(c) {
case END_OF_INPUT:
if (fSupportContentAssist) {
throw new OffsetLimitReachedException(ORIGIN_LEXER,
newToken(wide ? IToken.tLSTRING : IToken.tSTRING, spacesStart, start, length));
}
// no break;
case '\n':
handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start);
break loop;
case '\\':
escaped= !escaped;
break;
case '"':
if (!escaped) {
done= true;
}
escaped= false;
break;
default:
escaped= false;
break;
}
length++;
c= nextCharPhase3();
}
return newToken(wide ? IToken.tLSTRING : IToken.tSTRING, spacesStart, start, length);
}
@SuppressWarnings("fallthrough")
private Token charLiteral(final int spacesStart, final int start, boolean wide) throws OffsetLimitReachedException {
boolean escaped = false;
boolean done = false;
int length= wide ? 2 : 1;
int c= fCharPhase3;
loop: while (!done) {
switch(c) {
case END_OF_INPUT:
if (fSupportContentAssist) {
throw new OffsetLimitReachedException(ORIGIN_LEXER,
newToken(wide ? IToken.tLCHAR : IToken.tCHAR, spacesStart, start, length));
}
// no break;
case '\n':
handleProblem(IProblem.SCANNER_BAD_CHARACTER, getInputChars(start, fOffset), start);
break loop;
case '\\':
escaped= !escaped;
break;
case '\'':
if (!escaped) {
done= true;
}
escaped= false;
break;
default:
escaped= false;
break;
}
length++;
c= nextCharPhase3();
}
return newToken(wide ? IToken.tLCHAR : IToken.tCHAR, spacesStart, start, length);
}
private Token identifier(int spacesStart, int start, int length) {
int tokenKind= IToken.tIDENTIFIER;
boolean isPartOfIdentifier= true;
int c= fCharPhase3;
while (true) {
switch(c) {
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i':
case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I':
case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
case '_':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
break;
case '\\': // universal character name
markPhase3();
switch(nextCharPhase3()) {
case 'u': case 'U':
length++;
break;
default:
restorePhase3();
isPartOfIdentifier= false;
break;
}
break;
case END_OF_INPUT:
if (fSupportContentAssist) {
tokenKind= IToken.tCOMPLETION;
}
isPartOfIdentifier= false;
break;
case ' ': case '\t': case 0xb: case '\f': case '\r': case '\n':
isPartOfIdentifier= false;
break;
case '$':
isPartOfIdentifier= fOptions.fSupportDollarInIdentifiers;
break;
case '@':
isPartOfIdentifier= fOptions.fSupportAtSignInIdentifiers;
break;
case '{': case '}': case '[': case ']': case '#': case '(': case ')': case '<': case '>':
case '%': case ':': case ';': case '.': case '?': case '*': case '+': case '-': case '/':
case '^': case '&': case '|': case '~': case '!': case '=': case ',': case '"': case '\'':
isPartOfIdentifier= false;
break;
default:
isPartOfIdentifier= Character.isUnicodeIdentifierPart((char) c);
break;
}
if (!isPartOfIdentifier) {
break;
}
length++;
c= nextCharPhase3();
}
return newToken(tokenKind, spacesStart, start, length);
}
private Token number(final int spacesStart, final int start, int length, boolean isFloat) throws OffsetLimitReachedException {
boolean isPartOfNumber= true;
int c= fCharPhase3;
while (true) {
switch(c) {
// non-digit
case 'a': case 'b': case 'c': case 'd': case 'f': case 'g': case 'h': case 'i':
case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'F': case 'G': case 'H': case 'I':
case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
case '_':
// digit
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
break;
// period
case '.':
isFloat= true;
break;
// sign
case 'p':
case 'P':
case 'e':
case 'E':
length++;
c= nextCharPhase3();
switch (c) {
case '+': case '-':
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
isFloat= true;
length++;
c= nextCharPhase3();
break;
}
continue;
// universal character name (non-digit)
case '\\':
markPhase3();
switch(nextCharPhase3()) {
case 'u': case 'U':
length++;
break;
default:
restorePhase3();
isPartOfNumber= false;
break;
}
break;
case END_OF_INPUT:
if (fSupportContentAssist) {
throw new OffsetLimitReachedException(ORIGIN_LEXER,
newToken((isFloat ? IToken.tFLOATINGPT : IToken.tINTEGER), spacesStart, start, length));
}
isPartOfNumber= false;
break;
default:
isPartOfNumber= false;
break;
}
if (!isPartOfNumber) {
break;
}
c= nextCharPhase3();
length++;
}
return newToken((isFloat ? IToken.tFLOATINGPT : IToken.tINTEGER), spacesStart, start, length);
}
/**
* Saves the current state of phase3, necessary for '...', '%:%:' and UNCs.
*/
private void markPhase3() {
fMarkOffset= fOffset;
fMarkEndOffset= fEndOffset;
fMarkPrefetchedChar= fCharPhase3;
}
/**
* Restores a previously saved state of phase3.
*/
private void restorePhase3() {
fOffset= fMarkOffset;
fEndOffset= fMarkEndOffset;
fCharPhase3= fMarkPrefetchedChar;
}
/**
* Perform phase 1-3: Replace \r\n with \n, handle trigraphs, detect line-splicing.
* Changes fOffset, fEndOffset and fCharPhase3, stateless otherwise.
*/
@SuppressWarnings("fallthrough")
private int nextCharPhase3() {
int pos= fEndOffset;
do {
if (pos+1 >= fLimit) {
if (pos >= fLimit) {
fOffset= fLimit;
fEndOffset= fLimit;
fCharPhase3= END_OF_INPUT;
return END_OF_INPUT;
}
fOffset= pos;
fEndOffset= pos+1;
fCharPhase3= fInput[pos];
return fCharPhase3;
}
final char c= fInput[pos];
fOffset= pos;
fEndOffset= ++pos;
fCharPhase3= c;
switch(c) {
// windows line-ending
case '\r':
if (fInput[pos] == '\n') {
fEndOffset= pos+1;
fCharPhase3= '\n';
return '\n';
}
return c;
// trigraph sequences
case '?':
if (fInput[pos] != '?' || pos+1 >= fLimit) {
return c;
}
final char trigraph= checkTrigraph(fInput[pos+1]);
if (trigraph == 0) {
return c;
}
if (trigraph != '\\') {
fEndOffset= pos+2;
fCharPhase3= trigraph;
return trigraph;
}
pos+= 2;
// no break, handle backslash
case '\\':
final int lsPos= findEndOfLineSpliceSequence(pos);
if (lsPos > pos) {
pos= lsPos;
continue;
}
fEndOffset= pos;
fCharPhase3= '\\';
return '\\'; // don't return c, it may be a '?'
default:
return c;
}
}
while(true);
}
/**
* Maps a trigraph to the character it encodes.
* @param c trigraph without leading question marks.
* @return the character encoded or 0.
*/
private char checkTrigraph(char c) {
switch(c) {
case '=': return '#';
case '\'':return '^';
case '(': return '[';
case ')': return ']';
case '!': return '|';
case '<': return '{';
case '>': return '}';
case '-': return '~';
case '/': return '\\';
}
return 0;
}
/**
* Returns the endoffset for a line-splice sequence, or -1 if there is none.
*/
@SuppressWarnings("fallthrough")
private int findEndOfLineSpliceSequence(int pos) {
boolean haveBackslash= true;
int result= -1;
loop: while(pos < fLimit) {
switch(fInput[pos++]) {
case '\n':
if (haveBackslash) {
result= pos;
haveBackslash= false;
continue loop;
}
return result;
case '\r': case ' ': case '\f': case '\t': case 0xb: // vertical tab
if (haveBackslash) {
continue loop;
}
return result;
case '?':
if (pos+1 >= fLimit || fInput[pos] != '?' || fInput[++pos] != '/') {
return result;
}
// fall through to backslash handling
case '\\':
if (!haveBackslash) {
haveBackslash= true;
continue loop;
}
return result;
default:
return result;
}
}
return result;
}
/**
* Returns the image from the input without any modification.
*/
public char[] getInputChars(int offset, int endOffset) {
final int length= endOffset-offset;
final char[] result= new char[length];
System.arraycopy(fInput, offset, result, 0, length);
return result;
}
char[] getInput() {
return fInput;
}
/**
* Returns the image with trigraphs replaced and line-splices removed.
*/
private char[] getCharImage(int offset, int endOffset, int imageLength) {
final char[] result= new char[imageLength];
markPhase3();
fEndOffset= offset;
for (int idx=0; idx<imageLength; idx++) {
result[idx]= (char) nextCharPhase3();
}
restorePhase3();
return result;
}
}