bundles/org.eclipse.jst.jsp.core/DevTimeSupport/HeadParsers/JSPHeadTokenizer.jFlex - sourceediting/webtools.sourceediting.tests - Git at Google

 /*******************************************************************************
  * Copyright (c) 2005, 2008 IBM Corporation and others.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors:
  *     IBM Corporation - initial API and implementation
  *******************************************************************************/
 /*nlsXXX*/
 package org.eclipse.jst.jsp.core.internal.contenttype;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;

 import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants;
 import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants;

 %%

 %{


         /** Character offset (yychar) beyond which scanning is abandoned. */
         private static final int MAX_TO_SCAN = 8000;

         /** Cleared by the BOM rules, the scan-limit rule and at end of input. */
         private boolean hasMore = true;
         /** Collects the characters of the attribute value currently being scanned. */
         StringBuffer string = new StringBuffer();
         /** Stack of lexical states, pushed before entering an attribute value. */
         private IntStack fStateStack = new IntStack();
         /** Text of the last completed attribute value; consumed by getNextToken(). */
         private String valueText;
         /** Set by the XHTML doctype / namespace rules. */
         private boolean isXHTML;
         /** Set by the WML doctype rule. */
         private boolean isWML;


         public JSPHeadTokenizer() {
                 super();
         }

           /**
            * Attaches a new input reader and returns all scanner state to its
            * starting values so the tokenizer can be reused.
            *
            * @param in the reader to scan from
            */
           public void reset (Reader in) {
                 // input device
                 zzReader = in;

                 // DFA state and lexical state both start over
                 zzState = 0;
                 zzLexicalState = YYINITIAL;

                 // blank out the buffer that holds the text behind yytext()
                 Arrays.fill(zzBuffer, (char)0);

                 // buffer positions: start of yytext(), last char read from input,
                 // current position, and position of the last accepting state
                 zzStartRead = 0;
                 zzEndRead = 0;
                 zzCurrentPos = 0;
                 zzMarkedPos = 0;
                 // zzPushbackPos, yyline and yycolumn are not reset here
                 // (their assignments are disabled in this scanner)

                 // number of characters up to the start of the matched text
                 yychar = 0;

                 // at beginning of line; EOF neither returned nor its user code run
                 zzAtBOL = true;
                 zzAtEOF = false;
                 zzEOFDone = false;

                 // tokenizer-specific state
                 fStateStack.clear();
                 hasMore = true;
                 isXHTML = false;
                 isWML = false;
         }


         /**
          * Scans the next token and wraps it in a HeadParserToken.
          * The token text is the pending attribute value when one was recorded
          * by a string rule, otherwise the raw matched text.
          *
          * @return the token, positioned at the current yychar offset
          * @throws IOException declared for the underlying scanning method
          * @throws Exception declared scan-error type of this scanner
          */
         public final HeadParserToken getNextToken() throws IOException, Exception {
                 final String context = primGetNextToken();
                 final String text;
                 if (valueText == null) {
                         text = yytext();
                 } else {
                         // hand the recorded value out once, then forget it
                         text = valueText;
                         valueText = null;
                 }
                 return createToken(context, yychar, text);
         }

         /**
          * @return true while the scanner has not been stopped and the current
          *         offset is still below MAX_TO_SCAN
          */
         public final boolean hasMoreTokens() {
                 if (!hasMore) {
                         return false;
                 }
                 return yychar < MAX_TO_SCAN;
         }
         /** Saves the current lexical state on the state stack. */
         private void pushCurrentState() {
                 final int current = yystate();
                 fStateStack.push(current);
         }

         /** Switches back to the lexical state most recently saved on the state stack. */
         private void popState() {
                 final int previous = fStateStack.pop();
                 yybegin(previous);
         }
         /**
          * Builds a token object.
          *
          * @param context the token type returned by the scanner
          * @param start   the offset passed through to the token
          * @param text    the token text
          * @return a new HeadParserToken built from the three arguments
          */
         private HeadParserToken createToken(String context, int start, String text) {
                 HeadParserToken token = new HeadParserToken(context, start, text);
                 return token;
         }

         /** @return whether one of the XHTML rules has matched since the last reset */
         public boolean isXHTML() {
             return this.isXHTML;
         }
         /** @return whether the WML doctype rule has matched since the last reset */
         public boolean isWML() {
             return this.isWML;
         }

 %}

 %eof{
         // end of input reached: report that no further tokens are available
         hasMore = false;
 %eof}

 // The generated scanner is the public class JSPHeadTokenizer.
 %public
 %class JSPHeadTokenizer
 // The scanning method is primGetNextToken() and returns a String (the token type).
 %function primGetNextToken
 %type String
 // Enables yychar, the character offset used by the rules and by hasMoreTokens().
 %char
 %unicode
 // Keywords such as "page" or "encoding" are matched regardless of case.
 %ignorecase
 //%debug
 %switch
 %buffer 8192
 // Scanner errors are raised as java.lang.Exception (see getNextToken's throws clause).
 %scanerror java.lang.Exception


 // Byte-order-mark sequences, written as one \x escape per BOM byte.
 UTF16BE = \xFE\xFF
 UTF16LE = \xFF\xFE
 UTF83ByteBOM = \xEF\xBB\xBF

 // SpaceChar = [\x20\x09]


 // [3] S ::= (0x20 | 0x9 | 0xD | 0xA)+
 // Note: the macro is a single whitespace character; rules add * or + themselves.
 S = [\x20\x09\x0D\x0A]

 // Optional whitespace, '=', optional whitespace: separates an attribute name from its value.
 BeginAttributeeValue = {S}* \= {S}*

 // A single carriage return or line feed.
 LineTerminator = \r|\n


 // Inside an XML declaration, after its opening tag and first whitespace.
 %state ST_XMLDecl
 // Inside a JSP page/tag directive (either syntax).
 %state ST_PAGE_DIRECTIVE
 // Just after "name =": decides whether the value is double-quoted, single-quoted or bare.
 %state QuotedAttributeValue
 // Inside a double-quoted value.
 %state DQ_STRING
 // Inside a single-quoted value.
 %state SQ_STRING
 // Inside a value that started without a quote.
 %state UnDelimitedString

 %%


 <YYINITIAL>
 {
          // Byte-order marks: must start at the beginning of a line (^) and of the file (yychar == 0).
          // When the yychar test fails the action returns nothing, so scanning simply continues.
         ^ {UTF16BE}           {if (yychar == 0 ) {hasMore = false; return EncodingParserConstants.UTF16BE;}}
         ^ {UTF16LE}           {if (yychar == 0 ) {hasMore = false; return EncodingParserConstants.UTF16LE;}}
         ^ {UTF83ByteBOM}       {if (yychar == 0 ) {hasMore = false; return EncodingParserConstants.UTF83ByteBOM;}}

         // The XML declaration must begin at offset 0, but preceding whitespace is allowed
         // (it is part of the match, so yychar is still 0).
         ^ {S}* "<\?xml" {S}+ {if (yychar == 0 ) {yybegin(ST_XMLDecl); return XMLHeadTokenizerConstants.XMLDeclStart;}}


         // The following are some simple rules to identify JSP content as "XHTML";
         // see http://www.rfc-editor.org/rfc/rfc3236.txt
         // They only set a flag and return no token. ".*" does not cross a line break.
         "<!DOCTYPE" {S}* "html" {S}* "PUBLIC" .* "//DTD XHTML"                {isXHTML = true;}
         "<html" {S}* "xmlns" {S}* "=" {S}* (\" | \') "http://www.w3.org/1999/xhtml"     {isXHTML = true;}
          // another case that's part of the "HTML family" is WML 1.0 (WML 2.0 is part of XHTML)
          "<!DOCTYPE" {S}* "wml" {S}* "PUBLIC" .* "//DTD WML"                   {isWML = true;}

         // Start of a page or tag directive, in JSP syntax and in XML syntax.
         "<%" {S}* "@" {S}* ("page"|"tag") {S}+   {yybegin(ST_PAGE_DIRECTIVE); return JSPHeadTokenizerConstants.PageDirectiveStart;}
         ("<jsp:directive.page"|"<jsp:directive.tag") {S}+           {yybegin(ST_PAGE_DIRECTIVE); return JSPHeadTokenizerConstants.PageDirectiveStart;}


 }

 <ST_XMLDecl>
 {
         // Attribute names of the XML declaration: remember this state, then scan the value.
         "version" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return XMLHeadTokenizerConstants.XMLDeclVersion;}
         "encoding" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return XMLHeadTokenizerConstants.XMLDelEncoding;}
         // Note on the "forced end" (via 'hasMore=false') at the end of the XML declaration:
         // non-ASCII characters may follow, and they may cause IOExceptions while the stream is still being
         // read with an incorrect encoding (such as when the platform encoding is in effect until the true
         // encoding is detected).
         // BUT, the hasMore=false was removed for this JSP case (probably still ok for pure XML) because
         // in a JSP, we must parse past the XML declaration to get at the JSP page directive.
         // We'll assume all chars in this area are "readable" as is.
         {S}* "\?>"    {yybegin(YYINITIAL); return XMLHeadTokenizerConstants.XMLDeclEnd;}
 }

 <ST_PAGE_DIRECTIVE>
 {
 //  'language' could be handled separately from encoding; it is recognized here for simple re-use.
         "language"     {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return JSPHeadTokenizerConstants.PageLanguage;}
         "contentType" {BeginAttributeeValue}  {pushCurrentState(); yybegin(QuotedAttributeValue); return JSPHeadTokenizerConstants.PageContentType;}
         "pageEncoding" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return JSPHeadTokenizerConstants.PageEncoding;}
         // Note on the "forced end" (via 'hasMore=false') at the end of a directive:
         // non-ASCII characters may follow and may cause IOExceptions which would not occur once the stream is
         // read in the correct encoding.

         // https://w3.opensource.ibm.com/bugzilla/show_bug.cgi?id=4205 demonstrates how we need to keep parsing
         // even after reaching the end of one page directive, so hasMore=false was removed from these rules.
         "%>"    { yybegin(YYINITIAL);  return JSPHeadTokenizerConstants.PageDirectiveEnd;}
         "\/>"    { yybegin(YYINITIAL); return JSPHeadTokenizerConstants.PageDirectiveEnd;}
 }


 <QuotedAttributeValue>
 {
         // An opening quote selects the matching string state and empties the value buffer.
         \"                      { yybegin(DQ_STRING); string.setLength(0); }
         \'                      { yybegin(SQ_STRING); string.setLength(0); }
         // in this state, anything other than a space character can start an undelimited string;
         // the last matched character is pushed back so it becomes part of the value
         {S}*.           { yypushback(1); yybegin(UnDelimitedString); string.setLength(0);}

 }


 <DQ_STRING>
 {

         // Closing quote: return to the saved state and report the collected value.
         \"                      { popState(); valueText = string.toString(); return EncodingParserConstants.StringValue; }
         // The remaining terminators end the value without a closing quote; the terminator is
         // pushed back so the saved state can process it.
         {LineTerminator}        { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         "\?>"                   { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // NOTE(review): JFlex string literals use double quotes, so the single quotes here are
         // presumably matched literally (the rule would need the three characters '<', not a lone <) -- confirm.
         '<'                     { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // Any other character belongs to the value.
         .                       { string.append( yytext() ); }

         // Listed after "." but still preferred for the input %> because it is the longer match.
         "%>"                    { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}


 }

 <SQ_STRING>
 {

         // Closing quote: return to the saved state and report the collected value.
         \'                      { popState(); valueText = string.toString(); return EncodingParserConstants.StringValue;}
         // The remaining terminators end the value without a closing quote; the terminator is
         // pushed back so the saved state can process it.
         {LineTerminator}        { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // End of an XML declaration inside an unterminated value. This mirrors DQ_STRING and
         // UnDelimitedString; a second, unreachable copy of the "%>" rule used to sit here instead.
         "\?>"                   { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // NOTE(review): JFlex string literals use double quotes, so the single quotes here are
         // presumably matched literally (the rule would need the three characters '<', not a lone <) -- confirm.
         '<'                     { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // Any other character belongs to the value.
         .                       { string.append( yytext() ); }
         // Listed after "." but still preferred for the input %> because it is the longer match.
         "%>"                    { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}


 }

 <UnDelimitedString>
 {


         // Whitespace ends an unquoted value normally.
         {S}                     { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.UnDelimitedStringValue; }
         // NOTE(review): {S} already contains \r and \n and is listed first, so for a single line-break
         // character the rule above appears to win and this one looks unreachable -- confirm.
         {LineTerminator}        { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         "\?>"                   { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // NOTE(review): the single quotes are presumably matched literally (three characters '<') -- confirm.
         '<'                     { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
         // these are a bit special, since we started an undelimited string, but found a quote ... probably indicates a missing beginning quote
         \'                      { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue;}
         \"                      { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue;}

         // Any other character belongs to the value.
         .                       { string.append( yytext() ); }
         // Listed after "." but still preferred for the input %> because it is the longer match.
         "%>"                    { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}

 }

 // The "match anything" rule should always be in effect except when looking for the end of a string
 // (that is, remember to update the state list if/when new states are added).
 <YYINITIAL, ST_XMLDecl, QuotedAttributeValue, ST_PAGE_DIRECTIVE>
 {
 // This is the fallback (match "anything") rule: unrecognized input is ignored and the position advanced.
 // Once the offset has passed MAX_TO_SCAN the scan is stopped and MAX_CHARS_REACHED is returned.
 .|\n              {if (yychar > MAX_TO_SCAN) {hasMore=false; return EncodingParserConstants.MAX_CHARS_REACHED;}}
 }

 // This rule is always in effect (it has no state list): at end of input, stop and return the EOF token type.
 <<EOF>>         {hasMore = false; return EncodingParserConstants.EOF;}
	/*******************************************************************************
	* Copyright (c) 2005, 2008 IBM Corporation and others.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors:
	* IBM Corporation - initial API and implementation
	*******************************************************************************/
	/*nlsXXX*/
	package org.eclipse.jst.jsp.core.internal.contenttype;
	import java.io.IOException;
	import java.io.Reader;
	import java.util.Arrays;

	import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants;
	import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants;

	%%

	%{




	/** Character offset (yychar) beyond which scanning is abandoned. */
	private static final int MAX_TO_SCAN = 8000;

	/** Cleared by the BOM rules, the scan-limit rule and at end of input. */
	private boolean hasMore = true;
	/** Collects the characters of the attribute value currently being scanned. */
	StringBuffer string = new StringBuffer();
	/** Stack of lexical states, pushed before entering an attribute value. */
	private IntStack fStateStack = new IntStack();
	/** Text of the last completed attribute value; consumed by getNextToken(). */
	private String valueText;
	/** Set by the XHTML doctype / namespace rules. */
	private boolean isXHTML;
	/** Set by the WML doctype rule. */
	private boolean isWML;


	public JSPHeadTokenizer() {
	super();
	}

	/**
	 * Attaches a new input reader and returns all scanner state to its
	 * starting values so the tokenizer can be reused.
	 *
	 * @param in the reader to scan from
	 */
	public void reset (Reader in) {
		// input device
		zzReader = in;

		// DFA state and lexical state both start over
		zzState = 0;
		zzLexicalState = YYINITIAL;

		// blank out the buffer that holds the text behind yytext()
		Arrays.fill(zzBuffer, (char)0);

		// buffer positions: start of yytext(), last char read from input,
		// current position, and position of the last accepting state
		zzStartRead = 0;
		zzEndRead = 0;
		zzCurrentPos = 0;
		zzMarkedPos = 0;
		// zzPushbackPos, yyline and yycolumn are not reset here
		// (their assignments are disabled in this scanner)

		// number of characters up to the start of the matched text
		yychar = 0;

		// at beginning of line; EOF neither returned nor its user code run
		zzAtBOL = true;
		zzAtEOF = false;
		zzEOFDone = false;

		// tokenizer-specific state
		fStateStack.clear();
		hasMore = true;
		isXHTML = false;
		isWML = false;
	}


	/**
	 * Scans the next token and wraps it in a HeadParserToken.
	 * The token text is the pending attribute value when one was recorded
	 * by a string rule, otherwise the raw matched text.
	 *
	 * @return the token, positioned at the current yychar offset
	 * @throws IOException declared for the underlying scanning method
	 * @throws Exception declared scan-error type of this scanner
	 */
	public final HeadParserToken getNextToken() throws IOException, Exception {
		final String context = primGetNextToken();
		final String text;
		if (valueText == null) {
			text = yytext();
		} else {
			// hand the recorded value out once, then forget it
			text = valueText;
			valueText = null;
		}
		return createToken(context, yychar, text);
	}

	/**
	 * @return true while the scanner has not been stopped and the current
	 *         offset is still below MAX_TO_SCAN
	 */
	public final boolean hasMoreTokens() {
		if (!hasMore) {
			return false;
		}
		return yychar < MAX_TO_SCAN;
	}
	/** Saves the current lexical state on the state stack. */
	private void pushCurrentState() {
		final int current = yystate();
		fStateStack.push(current);
	}

	/** Switches back to the lexical state most recently saved on the state stack. */
	private void popState() {
		final int previous = fStateStack.pop();
		yybegin(previous);
	}
	/**
	 * Builds a token object.
	 *
	 * @param context the token type returned by the scanner
	 * @param start   the offset passed through to the token
	 * @param text    the token text
	 * @return a new HeadParserToken built from the three arguments
	 */
	private HeadParserToken createToken(String context, int start, String text) {
		HeadParserToken token = new HeadParserToken(context, start, text);
		return token;
	}

	/** @return whether one of the XHTML rules has matched since the last reset */
	public boolean isXHTML() {
		return this.isXHTML;
	}
	/** @return whether the WML doctype rule has matched since the last reset */
	public boolean isWML() {
		return this.isWML;
	}

	%}

	%eof{
	// end of input reached: report that no further tokens are available
	hasMore = false;
	%eof}

	// The generated scanner is the public class JSPHeadTokenizer.
	%public
	%class JSPHeadTokenizer
	// The scanning method is primGetNextToken() and returns a String (the token type).
	%function primGetNextToken
	%type String
	// Enables yychar, the character offset used by the rules and by hasMoreTokens().
	%char
	%unicode
	// Keywords such as "page" or "encoding" are matched regardless of case.
	%ignorecase
	//%debug
	%switch
	%buffer 8192
	// Scanner errors are raised as java.lang.Exception (see getNextToken's throws clause).
	%scanerror java.lang.Exception


	// Byte-order-mark sequences, written as one \x escape per BOM byte.
	UTF16BE = \xFE\xFF
	UTF16LE = \xFF\xFE
	UTF83ByteBOM = \xEF\xBB\xBF

	// SpaceChar = [\x20\x09]


	// [3] S ::= (0x20 | 0x9 | 0xD | 0xA)+
	// Note: the macro is a single whitespace character; rules add * or + themselves.
	S = [\x20\x09\x0D\x0A]

	// Optional whitespace, '=', optional whitespace: separates an attribute name from its value.
	BeginAttributeeValue = {S}* \= {S}*

	// A single carriage return or line feed. The '|' must be unescaped: "\|" is a
	// literal pipe and would make the macro match CR, '|', LF as a three-character sequence.
	LineTerminator = \r|\n


	// Inside an XML declaration, after its opening tag and first whitespace.
	%state ST_XMLDecl
	// Inside a JSP page/tag directive (either syntax).
	%state ST_PAGE_DIRECTIVE
	// Just after "name =": decides whether the value is double-quoted, single-quoted or bare.
	%state QuotedAttributeValue
	// Inside a double-quoted value.
	%state DQ_STRING
	// Inside a single-quoted value.
	%state SQ_STRING
	// Inside a value that started without a quote.
	%state UnDelimitedString

	%%


	<YYINITIAL>
	{
	// Byte-order marks: must start at the beginning of a line (^) and of the file (yychar == 0).
	// When the yychar test fails the action returns nothing, so scanning simply continues.
	^ {UTF16BE} {if (yychar == 0 ) {hasMore = false; return EncodingParserConstants.UTF16BE;}}
	^ {UTF16LE} {if (yychar == 0 ) {hasMore = false; return EncodingParserConstants.UTF16LE;}}
	^ {UTF83ByteBOM} {if (yychar == 0 ) {hasMore = false; return EncodingParserConstants.UTF83ByteBOM;}}

	// The XML declaration must begin at offset 0, but preceding whitespace is allowed
	// (it is part of the match, so yychar is still 0).
	^ {S}* "<\?xml" {S}+ {if (yychar == 0 ) {yybegin(ST_XMLDecl); return XMLHeadTokenizerConstants.XMLDeclStart;}}


	// The following are some simple rules to identify JSP content as "XHTML";
	// see http://www.rfc-editor.org/rfc/rfc3236.txt
	// They only set a flag and return no token. ".*" does not cross a line break.
	"<!DOCTYPE" {S}* "html" {S}* "PUBLIC" .* "//DTD XHTML" {isXHTML = true;}
	// Either quote character may open the namespace value (unescaped '|' = alternation).
	"<html" {S}* "xmlns" {S}* "=" {S}* (\" | \') "http://www.w3.org/1999/xhtml" {isXHTML = true;}
	// another case that's part of the "HTML family" is WML 1.0 (WML 2.0 is part of XHTML)
	"<!DOCTYPE" {S}* "wml" {S}* "PUBLIC" .* "//DTD WML" {isWML = true;}

	// Start of a page or tag directive, in JSP syntax and in XML syntax.
	"<%" {S}* "@" {S}* ("page"|"tag") {S}+ {yybegin(ST_PAGE_DIRECTIVE); return JSPHeadTokenizerConstants.PageDirectiveStart;}
	("<jsp:directive.page"|"<jsp:directive.tag") {S}+ {yybegin(ST_PAGE_DIRECTIVE); return JSPHeadTokenizerConstants.PageDirectiveStart;}


	}

	<ST_XMLDecl>
	{
	// Attribute names of the XML declaration: remember this state, then scan the value.
	"version" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return XMLHeadTokenizerConstants.XMLDeclVersion;}
	"encoding" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return XMLHeadTokenizerConstants.XMLDelEncoding;}
	// Note on the "forced end" (via 'hasMore=false') at the end of the XML declaration:
	// non-ASCII characters may follow, and they may cause IOExceptions while the stream is still being
	// read with an incorrect encoding (such as when the platform encoding is in effect until the true
	// encoding is detected).
	// BUT, the hasMore=false was removed for this JSP case (probably still ok for pure XML) because
	// in a JSP, we must parse past the XML declaration to get at the JSP page directive.
	// We'll assume all chars in this area are "readable" as is.
	{S}* "\?>" {yybegin(YYINITIAL); return XMLHeadTokenizerConstants.XMLDeclEnd;}
	}

	<ST_PAGE_DIRECTIVE>
	{
	// 'language' could be handled separately from encoding; it is recognized here for simple re-use.
	"language" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return JSPHeadTokenizerConstants.PageLanguage;}
	"contentType" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return JSPHeadTokenizerConstants.PageContentType;}
	"pageEncoding" {BeginAttributeeValue} {pushCurrentState(); yybegin(QuotedAttributeValue); return JSPHeadTokenizerConstants.PageEncoding;}
	// Note on the "forced end" (via 'hasMore=false') at the end of a directive:
	// non-ASCII characters may follow and may cause IOExceptions which would not occur once the stream is
	// read in the correct encoding.

	// https://w3.opensource.ibm.com/bugzilla/show_bug.cgi?id=4205 demonstrates how we need to keep parsing
	// even after reaching the end of one page directive, so hasMore=false was removed from these rules.
	"%>" { yybegin(YYINITIAL); return JSPHeadTokenizerConstants.PageDirectiveEnd;}
	"\/>" { yybegin(YYINITIAL); return JSPHeadTokenizerConstants.PageDirectiveEnd;}
	}


	<QuotedAttributeValue>
	{
	// An opening quote selects the matching string state and empties the value buffer.
	\" { yybegin(DQ_STRING); string.setLength(0); }
	\' { yybegin(SQ_STRING); string.setLength(0); }
	// in this state, anything other than a space character can start an undelimited string;
	// the last matched character is pushed back so it becomes part of the value
	{S}*. { yypushback(1); yybegin(UnDelimitedString); string.setLength(0);}

	}


	<DQ_STRING>
	{

	// Closing quote: return to the saved state and report the collected value.
	\" { popState(); valueText = string.toString(); return EncodingParserConstants.StringValue; }
	// The remaining terminators end the value without a closing quote; the terminator is
	// pushed back so the saved state can process it.
	{LineTerminator} { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	"\?>" { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// NOTE(review): JFlex string literals use double quotes, so the single quotes here are
	// presumably matched literally (the rule would need the three characters '<', not a lone <) -- confirm.
	'<' { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// Any other character belongs to the value.
	. { string.append( yytext() ); }

	// Listed after "." but still preferred for the input %> because it is the longer match.
	"%>" { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}


	}

	<SQ_STRING>
	{

	// Closing quote: return to the saved state and report the collected value.
	\' { popState(); valueText = string.toString(); return EncodingParserConstants.StringValue;}
	// The remaining terminators end the value without a closing quote; the terminator is
	// pushed back so the saved state can process it.
	{LineTerminator} { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// End of an XML declaration inside an unterminated value. This mirrors DQ_STRING and
	// UnDelimitedString; a second, unreachable copy of the "%>" rule used to sit here instead.
	"\?>" { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// NOTE(review): JFlex string literals use double quotes, so the single quotes here are
	// presumably matched literally (the rule would need the three characters '<', not a lone <) -- confirm.
	'<' { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// Any other character belongs to the value.
	. { string.append( yytext() ); }
	// Listed after "." but still preferred for the input %> because it is the longer match.
	"%>" { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}


	}

	<UnDelimitedString>
	{


	// Whitespace ends an unquoted value normally.
	{S} { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.UnDelimitedStringValue; }
	// NOTE(review): {S} already contains \r and \n and is listed first, so for a single line-break
	// character the rule above appears to win and this one looks unreachable -- confirm.
	{LineTerminator} { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	"\?>" { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// NOTE(review): the single quotes are presumably matched literally (three characters '<') -- confirm.
	'<' { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}
	// these are a bit special, since we started an undelimited string, but found a quote ... probably indicates a missing beginning quote
	\' { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue;}
	\" { yypushback(1);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue;}

	// Any other character belongs to the value.
	. { string.append( yytext() ); }
	// Listed after "." but still preferred for the input %> because it is the longer match.
	"%>" { yypushback(2);popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue;}

	}

	// The "match anything" rule should always be in effect except when looking for the end of a string
	// (that is, remember to update the state list if/when new states are added).
	<YYINITIAL, ST_XMLDecl, QuotedAttributeValue, ST_PAGE_DIRECTIVE>
	{
	// This is the fallback (match "anything") rule: unrecognized input is ignored and the position advanced.
	// Once the offset has passed MAX_TO_SCAN the scan is stopped and MAX_CHARS_REACHED is returned.
	// The '|' must be unescaped so the rule means "any character OR a newline".
	.|\n {if (yychar > MAX_TO_SCAN) {hasMore=false; return EncodingParserConstants.MAX_CHARS_REACHED;}}
	}

	// This rule is always in effect (it has no state list): at end of input, stop and return the EOF token type.
	<<EOF>> {hasMore = false; return EncodingParserConstants.EOF;}