blob: 85c137300a078a3c785063173def5b942657a21d [file] [log] [blame]
/**
* Copyright (c) 2017 Eclipse contributors and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v2.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v20.html
*
* Contributors:
* See notice below.
*
* ---------------------------------------------------------------------
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ---------------------------------------------------------------------
*
* =========================================================================
* == NOTICE file corresponding to section 4(d) of the Apache License, ==
* == Version 2.0, in this case for the Apache Xerces Java distribution. ==
* =========================================================================
*
* Apache Xerces Java
* Copyright 1999-2010 The Apache Software Foundation
*
* This product includes software developed at
* The Apache Software Foundation (http://www.apache.org/).
*
* Portions of this software were originally based on the following:
* - software copyright (c) 1999, IBM Corporation., http://www.ibm.com.
* - software copyright (c) 1999, Sun Microsystems., http://www.sun.com.
* - voluntary contributions made by Paul Eng on behalf of the
* Apache Software Foundation that were originally developed at iClick, Inc.,
* software copyright (c) 1999.
*/
package org.eclipse.emf.ecore.xml.type.internal;
import java.text.CharacterIterator;
import java.util.Hashtable;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Stack;
import java.util.Vector;
import org.eclipse.emf.ecore.plugin.EcorePlugin;
/**
* NOTE: this class is for internal use only.
*/
@SuppressWarnings("all")
public final class RegEx
{
/**
* A simple integer based stack.
*
* moved to org.apache.xerces.util by neilg to support the
* XPathMatcher.
* @author Andy Clark, IBM
*
* @version $Id: IntStack.java 447241 2006-09-18 05:12:57Z mrglavas $
*/
final static class IntStack {
//
// Data
//
/** Stack depth. */
private int fDepth;
/** Stack data. */
private int[] fData;
//
// Public methods
//
/** Returns the size of the stack. */
public int size() {
return fDepth;
}
/** Pushes a value onto the stack. */
public void push(int value) {
ensureCapacity(fDepth + 1);
fData[fDepth++] = value;
}
/** Peeks at the top of the stack. */
public int peek() {
return fData[fDepth - 1];
}
/** Returns the element at the specified depth in the stack. */
public int elementAt(int depth) {
return fData[depth];
}
/** Pops a value off of the stack. */
public int pop() {
return fData[--fDepth];
}
/** Clears the stack. */
public void clear() {
fDepth = 0;
}
// debugging
/** Prints the stack. */
public void print() {
System.out.print('(');
System.out.print(fDepth);
System.out.print(") {");
for (int i = 0; i < fDepth; i++) {
if (i == 3) {
System.out.print(" ...");
break;
}
System.out.print(' ');
System.out.print(fData[i]);
if (i < fDepth - 1) {
System.out.print(',');
}
}
System.out.print(" }");
System.out.println();
}
//
// Private methods
//
/** Ensures capacity. */
private void ensureCapacity(int size) {
if (fData == null) {
fData = new int[32];
}
else if (fData.length <= size) {
int[] newdata = new int[fData.length * 2];
System.arraycopy(fData, 0, newdata, 0, fData.length);
fData = newdata;
}
}
} // class IntStack
/**
* Boyer-Moore searcher.
*
* @xerces.internal
*
* @version $Id: BMPattern.java 572108 2007-09-02 18:48:31Z mrglavas $
*/
static class BMPattern {
final char[] pattern;
final int[] shiftTable;
final boolean ignoreCase;
public BMPattern(String pat, boolean ignoreCase) {
this(pat, 256, ignoreCase);
}
public BMPattern(String pat, int tableSize, boolean ignoreCase) {
this.pattern = pat.toCharArray();
this.shiftTable = new int[tableSize];
this.ignoreCase = ignoreCase;
int length = pattern.length;
for (int i = 0; i < this.shiftTable.length; i ++)
this.shiftTable[i] = length;
for (int i = 0; i < length; i ++) {
char ch = this.pattern[i];
int diff = length-i-1;
int index = ch % this.shiftTable.length;
if (diff < this.shiftTable[index])
this.shiftTable[index] = diff;
if (this.ignoreCase) {
ch = Character.toUpperCase(ch);
index = ch % this.shiftTable.length;
if (diff < this.shiftTable[index])
this.shiftTable[index] = diff;
ch = Character.toLowerCase(ch);
index = ch % this.shiftTable.length;
if (diff < this.shiftTable[index])
this.shiftTable[index] = diff;
}
}
}
/**
*
* @return -1 if <var>iterator</var> does not contain this pattern.
*/
public int matches(CharacterIterator iterator, int start, int limit) {
if (this.ignoreCase) return this.matchesIgnoreCase(iterator, start, limit);
int plength = this.pattern.length;
if (plength == 0) return start;
int index = start+plength;
while (index <= limit) {
int pindex = plength;
int nindex = index+1;
char ch;
do {
if ((ch = iterator.setIndex(--index)) != this.pattern[--pindex])
break;
if (pindex == 0)
return index;
} while (pindex > 0);
index += this.shiftTable[ch % this.shiftTable.length]+1;
if (index < nindex) index = nindex;
}
return -1;
}
/**
*
* @return -1 if <var>str</var> does not contain this pattern.
*/
public int matches(String str, int start, int limit) {
if (this.ignoreCase) return this.matchesIgnoreCase(str, start, limit);
int plength = this.pattern.length;
if (plength == 0) return start;
int index = start+plength;
while (index <= limit) {
//System.err.println("Starts at "+index);
int pindex = plength;
int nindex = index+1;
char ch;
do {
if ((ch = str.charAt(--index)) != this.pattern[--pindex])
break;
if (pindex == 0)
return index;
} while (pindex > 0);
index += this.shiftTable[ch % this.shiftTable.length]+1;
if (index < nindex) index = nindex;
}
return -1;
}
/**
*
* @return -1 if <var>chars</char> does not contain this pattern.
*/
public int matches(char[] chars, int start, int limit) {
if (this.ignoreCase) return this.matchesIgnoreCase(chars, start, limit);
int plength = this.pattern.length;
if (plength == 0) return start;
int index = start+plength;
while (index <= limit) {
//System.err.println("Starts at "+index);
int pindex = plength;
int nindex = index+1;
char ch;
do {
if ((ch = chars[--index]) != this.pattern[--pindex])
break;
if (pindex == 0)
return index;
} while (pindex > 0);
index += this.shiftTable[ch % this.shiftTable.length]+1;
if (index < nindex) index = nindex;
}
return -1;
}
int matchesIgnoreCase(CharacterIterator iterator, int start, int limit) {
int plength = this.pattern.length;
if (plength == 0) return start;
int index = start+plength;
while (index <= limit) {
int pindex = plength;
int nindex = index+1;
char ch;
do {
char ch1 = ch = iterator.setIndex(--index);
char ch2 = this.pattern[--pindex];
if (ch1 != ch2) {
ch1 = Character.toUpperCase(ch1);
ch2 = Character.toUpperCase(ch2);
if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
break;
}
if (pindex == 0)
return index;
} while (pindex > 0);
index += this.shiftTable[ch % this.shiftTable.length]+1;
if (index < nindex) index = nindex;
}
return -1;
}
int matchesIgnoreCase(String text, int start, int limit) {
int plength = this.pattern.length;
if (plength == 0) return start;
int index = start+plength;
while (index <= limit) {
int pindex = plength;
int nindex = index+1;
char ch;
do {
char ch1 = ch = text.charAt(--index);
char ch2 = this.pattern[--pindex];
if (ch1 != ch2) {
ch1 = Character.toUpperCase(ch1);
ch2 = Character.toUpperCase(ch2);
if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
break;
}
if (pindex == 0)
return index;
} while (pindex > 0);
index += this.shiftTable[ch % this.shiftTable.length]+1;
if (index < nindex) index = nindex;
}
return -1;
}
int matchesIgnoreCase(char[] chars, int start, int limit) {
int plength = this.pattern.length;
if (plength == 0) return start;
int index = start+plength;
while (index <= limit) {
int pindex = plength;
int nindex = index+1;
char ch;
do {
char ch1 = ch = chars[--index];
char ch2 = this.pattern[--pindex];
if (ch1 != ch2) {
ch1 = Character.toUpperCase(ch1);
ch2 = Character.toUpperCase(ch2);
if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
break;
}
if (pindex == 0)
return index;
} while (pindex > 0);
index += this.shiftTable[ch % this.shiftTable.length]+1;
if (index < nindex) index = nindex;
}
return -1;
}
/*
public static void main(String[] argv) {
try {
int[] shiftTable = new int[256];
initializeBoyerMoore(argv[0], shiftTable, true);
int o = -1;
CharacterIterator ite = new java.text.StringCharacterIterator(argv[1]);
long start = System.currentTimeMillis();
//for (int i = 0; i < 10000; i ++)
o = searchIgnoreCasesWithBoyerMoore(ite, 0, argv[0], shiftTable);
start = System.currentTimeMillis()-start;
System.out.println("Result: "+o+", Elapsed: "+start);
} catch (Exception ex) {
ex.printStackTrace();
}
}*/
}
/**
* @xerces.internal
*
* @version $Id: CaseInsensitiveMap.java 834653 2009-11-10 20:32:39Z mrglavas $
*/
final static class CaseInsensitiveMap {
private static int CHUNK_SHIFT = 10; /* 2^10 = 1k */
private static int CHUNK_SIZE = (1<<CHUNK_SHIFT);
private static int CHUNK_MASK = (CHUNK_SIZE-1);
private static int INITIAL_CHUNK_COUNT = 64; /* up to 0xFFFF */
private static int[][][] caseInsensitiveMap;
private static int LOWER_CASE_MATCH = 1;
private static int UPPER_CASE_MATCH = 2;
static {
buildCaseInsensitiveMap();
}
/**
* Return a list of code point characters (not including the input value)
* that can be substituted in a case insensitive match
*/
static public int[] get(int codePoint) {
return (codePoint < 0x10000) ? getMapping(codePoint) : null;
}
private static int[] getMapping(int codePoint) {
int chunk = codePoint >>> CHUNK_SHIFT;
int offset = codePoint & CHUNK_MASK;
return caseInsensitiveMap[chunk][offset];
}
private static void buildCaseInsensitiveMap() {
caseInsensitiveMap = new int[INITIAL_CHUNK_COUNT][CHUNK_SIZE][];
int lc, uc;
for (int i=0; i<0x10000; i++) {
lc = Character.toLowerCase((char) i);
uc = Character.toUpperCase((char) i);
// lower/upper case value is not the same as code point
if (lc != uc || lc != i) {
int[] map = new int[2];
int index = 0;
if (lc != i) {
map[index++] = lc;
map[index++] = LOWER_CASE_MATCH;
int[] lcMap = getMapping(lc);
if (lcMap != null) {
map = updateMap(i, map, lc, lcMap, LOWER_CASE_MATCH);
}
}
if (uc != i) {
if (index == map.length) {
map = expandMap(map, 2);
}
map[index++] = uc;
map[index++] = UPPER_CASE_MATCH;
int[] ucMap = getMapping(uc);
if (ucMap != null) {
map = updateMap(i, map, uc, ucMap, UPPER_CASE_MATCH);
}
}
set(i, map);
}
}
}
private static int[] expandMap(int[] srcMap, int expandBy) {
final int oldLen = srcMap.length;
int[] newMap = new int[oldLen + expandBy];
System.arraycopy(srcMap, 0, newMap, 0, oldLen);
return newMap;
}
private static void set(int codePoint, int[] map) {
int chunk = codePoint >>> CHUNK_SHIFT;
int offset = codePoint & CHUNK_MASK;
caseInsensitiveMap[chunk][offset] = map;
}
private static int[] updateMap(int codePoint, int[] codePointMap,
int ciCodePoint, int[] ciCodePointMap, int matchType) {
for (int i=0; i<ciCodePointMap.length; i+=2) {
int c = ciCodePointMap[i];
int[] cMap = getMapping(c);
if (cMap != null) {
if (contains(cMap, ciCodePoint, matchType)) {
if (!contains(cMap, codePoint)) {
cMap = expandAndAdd(cMap, codePoint, matchType);
set(c, cMap);
}
if (!contains(codePointMap, c)) {
codePointMap = expandAndAdd(codePointMap, c,matchType);
}
}
}
}
if (!contains(ciCodePointMap, codePoint)) {
ciCodePointMap = expandAndAdd(ciCodePointMap, codePoint, matchType);
set(ciCodePoint, ciCodePointMap);
}
return codePointMap;
}
private static boolean contains(int[] map, int codePoint) {
for (int i=0; i<map.length; i += 2) {
if (map[i] == codePoint) {
return true;
}
}
return false;
}
private static boolean contains(int[] map, int codePoint, int matchType) {
for (int i=0; i<map.length; i += 2) {
if (map[i] == codePoint && map[i+1] == matchType) {
return true;
}
}
return false;
}
private static int[] expandAndAdd(int[] srcMap, int codePoint, int matchType) {
final int oldLen = srcMap.length;
int[] newMap = new int[oldLen + 2];
System.arraycopy(srcMap, 0, newMap, 0, oldLen);
newMap[oldLen] = codePoint;
newMap[oldLen+1] = matchType;
return newMap;
}
}
/**
* An instance of this class has ranges captured in matching.
*
* @xerces.internal
*
* @see RegularExpression#matches(char[], int, int, Match)
* @see RegularExpression#matches(char[], Match)
* @see RegularExpression#matches(java.text.CharacterIterator, Match)
* @see RegularExpression#matches(java.lang.String, int, int, Match)
* @see RegularExpression#matches(java.lang.String, Match)
* @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
* @version $Id: Match.java 446721 2006-09-15 20:35:34Z mrglavas $
*/
static class Match implements Cloneable {
int[] beginpos = null;
int[] endpos = null;
int nofgroups = 0;
CharacterIterator ciSource = null;
String strSource = null;
char[] charSource = null;
/**
* Creates an instance.
*/
public Match() {
}
/**
*
*/
public synchronized Object clone() {
Match ma = new Match();
if (this.nofgroups > 0) {
ma.setNumberOfGroups(this.nofgroups);
if (this.ciSource != null) ma.setSource(this.ciSource);
if (this.strSource != null) ma.setSource(this.strSource);
for (int i = 0; i < this.nofgroups; i ++) {
ma.setBeginning(i, this.getBeginning(i));
ma.setEnd(i, this.getEnd(i));
}
}
return ma;
}
/**
*
*/
protected void setNumberOfGroups(int n) {
int oldn = this.nofgroups;
this.nofgroups = n;
if (oldn <= 0
|| oldn < n || n*2 < oldn) {
this.beginpos = new int[n];
this.endpos = new int[n];
}
for (int i = 0; i < n; i ++) {
this.beginpos[i] = -1;
this.endpos[i] = -1;
}
}
/**
*
*/
protected void setSource(CharacterIterator ci) {
this.ciSource = ci;
this.strSource = null;
this.charSource = null;
}
/**
*
*/
protected void setSource(String str) {
this.ciSource = null;
this.strSource = str;
this.charSource = null;
}
/**
*
*/
protected void setSource(char[] chars) {
this.ciSource = null;
this.strSource = null;
this.charSource = chars;
}
/**
*
*/
protected void setBeginning(int index, int v) {
this.beginpos[index] = v;
}
/**
*
*/
protected void setEnd(int index, int v) {
this.endpos[index] = v;
}
/**
* Return the number of regular expression groups.
* This method returns 1 when the regular expression has no capturing-parenthesis.
*/
public int getNumberOfGroups() {
if (this.nofgroups <= 0)
throw new IllegalStateException("A result is not set.");
return this.nofgroups;
}
/**
* Return a start position in the target text matched to specified regular expression group.
*
* @param index Less than <code>getNumberOfGroups()</code>.
*/
public int getBeginning(int index) {
if (this.beginpos == null)
throw new IllegalStateException("A result is not set.");
if (index < 0 || this.nofgroups <= index)
throw new IllegalArgumentException("The parameter must be less than "
+this.nofgroups+": "+index);
return this.beginpos[index];
}
/**
* Return an end position in the target text matched to specified regular expression group.
*
* @param index Less than <code>getNumberOfGroups()</code>.
*/
public int getEnd(int index) {
if (this.endpos == null)
throw new IllegalStateException("A result is not set.");
if (index < 0 || this.nofgroups <= index)
throw new IllegalArgumentException("The parameter must be less than "
+this.nofgroups+": "+index);
return this.endpos[index];
}
/**
* Return an substring of the target text matched to specified regular expression group.
*
* @param index Less than <code>getNumberOfGroups()</code>.
*/
public String getCapturedText(int index) {
if (this.beginpos == null)
throw new IllegalStateException("match() has never been called.");
if (index < 0 || this.nofgroups <= index)
throw new IllegalArgumentException("The parameter must be less than "
+this.nofgroups+": "+index);
String ret;
int begin = this.beginpos[index], end = this.endpos[index];
if (begin < 0 || end < 0) return null;
if (this.ciSource != null) {
ret = REUtil.substring(this.ciSource, begin, end);
} else if (this.strSource != null) {
ret = this.strSource.substring(begin, end);
} else {
ret = new String(this.charSource, begin, end-begin);
}
return ret;
}
}
/**
* @xerces.internal
*
* @version $Id: Op.java 572108 2007-09-02 18:48:31Z mrglavas $
*/
static class Op {
static final int DOT = 0;
static final int CHAR = 1; // Single character
static final int RANGE = 3; // [a-zA-Z]
static final int NRANGE = 4; // [^a-zA-Z]
static final int ANCHOR = 5; // ^ $ ...
static final int STRING = 6; // literal String
static final int CLOSURE = 7; // X*
static final int NONGREEDYCLOSURE = 8; // X*?
static final int QUESTION = 9; // X?
static final int NONGREEDYQUESTION = 10; // X??
static final int UNION = 11; // X|Y
static final int CAPTURE = 15; // ( and )
static final int BACKREFERENCE = 16; // \1 \2 ...
static final int LOOKAHEAD = 20; // (?=...)
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
static final int LOOKBEHIND = 22; // (?<=...)
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
static final int INDEPENDENT = 24; // (?>...)
static final int MODIFIER = 25; // (?ims-ims:...)
static final int CONDITION = 26; // (?(..)yes|no)
static int nofinstances = 0;
static final boolean COUNT = false;
static Op createDot() {
if (Op.COUNT) Op.nofinstances ++;
return new Op(Op.DOT);
}
static CharOp createChar(int data) {
if (Op.COUNT) Op.nofinstances ++;
return new CharOp(Op.CHAR, data);
}
static CharOp createAnchor(int data) {
if (Op.COUNT) Op.nofinstances ++;
return new CharOp(Op.ANCHOR, data);
}
static CharOp createCapture(int number, Op next) {
if (Op.COUNT) Op.nofinstances ++;
CharOp op = new CharOp(Op.CAPTURE, number);
op.next = next;
return op;
}
static UnionOp createUnion(int size) {
if (Op.COUNT) Op.nofinstances ++;
//System.err.println("Creates UnionOp");
return new UnionOp(Op.UNION, size);
}
static ChildOp createClosure(int id) {
if (Op.COUNT) Op.nofinstances ++;
return new ModifierOp(Op.CLOSURE, id, -1);
}
static ChildOp createNonGreedyClosure() {
if (Op.COUNT) Op.nofinstances ++;
return new ChildOp(Op.NONGREEDYCLOSURE);
}
static ChildOp createQuestion(boolean nongreedy) {
if (Op.COUNT) Op.nofinstances ++;
return new ChildOp(nongreedy ? Op.NONGREEDYQUESTION : Op.QUESTION);
}
static RangeOp createRange(Token tok) {
if (Op.COUNT) Op.nofinstances ++;
return new RangeOp(Op.RANGE, tok);
}
static ChildOp createLook(int type, Op next, Op branch) {
if (Op.COUNT) Op.nofinstances ++;
ChildOp op = new ChildOp(type);
op.setChild(branch);
op.next = next;
return op;
}
static CharOp createBackReference(int refno) {
if (Op.COUNT) Op.nofinstances ++;
return new CharOp(Op.BACKREFERENCE, refno);
}
static StringOp createString(String literal) {
if (Op.COUNT) Op.nofinstances ++;
return new StringOp(Op.STRING, literal);
}
static ChildOp createIndependent(Op next, Op branch) {
if (Op.COUNT) Op.nofinstances ++;
ChildOp op = new ChildOp(Op.INDEPENDENT);
op.setChild(branch);
op.next = next;
return op;
}
static ModifierOp createModifier(Op next, Op branch, int add, int mask) {
if (Op.COUNT) Op.nofinstances ++;
ModifierOp op = new ModifierOp(Op.MODIFIER, add, mask);
op.setChild(branch);
op.next = next;
return op;
}
static ConditionOp createCondition(Op next, int ref, Op conditionflow, Op yesflow, Op noflow) {
if (Op.COUNT) Op.nofinstances ++;
ConditionOp op = new ConditionOp(Op.CONDITION, ref, conditionflow, yesflow, noflow);
op.next = next;
return op;
}
final int type;
Op next = null;
protected Op(int type) {
this.type = type;
}
int size() { // for UNION
return 0;
}
Op elementAt(int index) { // for UNIoN
throw new RuntimeException("Internal Error: type="+this.type);
}
Op getChild() { // for CLOSURE, QUESTION
throw new RuntimeException("Internal Error: type="+this.type);
}
// ModifierOp
int getData() { // CharOp for CHAR, BACKREFERENCE, CAPTURE, ANCHOR,
throw new RuntimeException("Internal Error: type="+this.type);
}
int getData2() { // ModifierOp
throw new RuntimeException("Internal Error: type="+this.type);
}
RangeToken getToken() { // RANGE, NRANGE
throw new RuntimeException("Internal Error: type="+this.type);
}
String getString() { // STRING
throw new RuntimeException("Internal Error: type="+this.type);
}
// ================================================================
static class CharOp extends Op {
final int charData;
CharOp(int type, int data) {
super(type);
this.charData = data;
}
int getData() {
return this.charData;
}
}
// ================================================================
static class UnionOp extends Op {
final Vector branches;
UnionOp(int type, int size) {
super(type);
this.branches = new Vector(size);
}
void addElement(Op op) {
this.branches.addElement(op);
}
int size() {
return this.branches.size();
}
Op elementAt(int index) {
return (Op)this.branches.elementAt(index);
}
}
// ================================================================
static class ChildOp extends Op {
Op child;
ChildOp(int type) {
super(type);
}
void setChild(Op child) {
this.child = child;
}
Op getChild() {
return this.child;
}
}
// ================================================================
static class ModifierOp extends ChildOp {
final int v1;
final int v2;
ModifierOp(int type, int v1, int v2) {
super(type);
this.v1 = v1;
this.v2 = v2;
}
int getData() {
return this.v1;
}
int getData2() {
return this.v2;
}
}
// ================================================================
static class RangeOp extends Op {
final Token tok;
RangeOp(int type, Token tok) {
super(type);
this.tok = tok;
}
RangeToken getToken() {
return (RangeToken)this.tok;
}
}
// ================================================================
static class StringOp extends Op {
final String string;
StringOp(int type, String literal) {
super(type);
this.string = literal;
}
String getString() {
return this.string;
}
}
// ================================================================
static class ConditionOp extends Op {
final int refNumber;
final Op condition;
final Op yes;
final Op no;
ConditionOp(int type, int refno, Op conditionflow, Op yesflow, Op noflow) {
super(type);
this.refNumber = refno;
this.condition = conditionflow;
this.yes = yesflow;
this.no = noflow;
}
}
}
/**
* @xerces.internal
*
* @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
* @version $Id: ParseException.java 572108 2007-09-02 18:48:31Z mrglavas $
*/
public static class ParseException extends RuntimeException {
/** Serialization version. */
static final long serialVersionUID = -7012400318097691370L;
final int location;
/*
public ParseException(String mes) {
this(mes, -1);
}
*/
/**
*
*/
public ParseException(String mes, int location) {
super(mes);
this.location = location;
}
/**
*
* @return -1 if location information is not available.
*/
public int getLocation() {
return this.location;
}
}
/**
* A regular expression parser for the XML Schema.
*
* @xerces.internal
*
* @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
* @version $Id: ParserForXMLSchema.java 831926 2009-11-02 15:38:53Z knoaman $
*/
static class ParserForXMLSchema extends RegexParser {
public ParserForXMLSchema() {
//this.setLocale(Locale.getDefault());
}
public ParserForXMLSchema(Locale locale) {
super(locale);
}
Token processCaret() throws ParseException {
this.next();
return Token.createChar('^');
}
Token processDollar() throws ParseException {
this.next();
return Token.createChar('$');
}
Token processLookahead() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processNegativelookahead() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processLookbehind() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processNegativelookbehind() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_A() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_Z() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_z() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_b() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_B() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_lt() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_gt() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processStar(Token tok) throws ParseException {
this.next();
return Token.createClosure(tok);
}
Token processPlus(Token tok) throws ParseException {
// X+ -> XX*
this.next();
return Token.createConcat(tok, Token.createClosure(tok));
}
Token processQuestion(Token tok) throws ParseException {
// X? -> X|
this.next();
Token par = Token.createUnion();
par.addChild(tok);
par.addChild(Token.createEmpty());
return par;
}
boolean checkQuestion(int off) {
return false;
}
Token processParen() throws ParseException {
this.next();
Token tok = Token.createParen(this.parseRegex(), 0);
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // Skips ')'
return tok;
}
Token processParen2() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processCondition() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processModifiers() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processIndependent() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_c() throws ParseException {
this.next();
return this.getTokenForShorthand('c');
}
Token processBacksolidus_C() throws ParseException {
this.next();
return this.getTokenForShorthand('C');
}
Token processBacksolidus_i() throws ParseException {
this.next();
return this.getTokenForShorthand('i');
}
Token processBacksolidus_I() throws ParseException {
this.next();
return this.getTokenForShorthand('I');
}
Token processBacksolidus_g() throws ParseException {
throw this.ex("parser.process.1", this.offset-2);
}
Token processBacksolidus_X() throws ParseException {
throw ex("parser.process.1", this.offset-2);
}
Token processBackreference() throws ParseException {
throw ex("parser.process.1", this.offset-4);
}
int processCIinCharacterClass(RangeToken tok, int c) {
tok.mergeRanges(this.getTokenForShorthand(c));
return -1;
}
/**
* Parses a character-class-expression, not a character-class-escape.
*
* c-c-expression ::= '[' c-group ']'
* c-group ::= positive-c-group | negative-c-group | c-c-subtraction
* positive-c-group ::= (c-range | c-c-escape)+
* negative-c-group ::= '^' positive-c-group
* c-c-subtraction ::= (positive-c-group | negative-c-group) subtraction
* subtraction ::= '-' c-c-expression
* c-range ::= single-range | from-to-range
* single-range ::= multi-c-escape | category-c-escape | block-c-escape | &lt;any XML char&gt;
* cc-normal-c ::= &lt;any character except [, ], \&gt;
* from-to-range ::= cc-normal-c '-' cc-normal-c
*
* @param useNrage Ignored.
* @return This returns no NrageToken.
*/
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
this.setContext(S_INBRACKETS);
this.next(); // '['
boolean nrange = false;
@SuppressWarnings("unused")
boolean wasDecoded = false; // used to detect if the last - was escaped.
RangeToken base = null;
RangeToken tok;
if (this.read() == T_CHAR && this.chardata == '^') {
nrange = true;
this.next(); // '^'
base = Token.createRange();
base.addRange(0, Token.UTF16_MAX);
tok = Token.createRange();
} else {
tok = Token.createRange();
}
int type;
boolean firstloop = true;
while ((type = this.read()) != T_EOF) { // Don't use 'cotinue' for this loop.
wasDecoded = false;
// single-range | from-to-range | subtraction
if (type == T_CHAR && this.chardata == ']' && !firstloop) {
if (nrange) {
base.subtractRanges(tok);
tok = base;
}
break;
}
int c = this.chardata;
boolean end = false;
if (type == T_BACKSOLIDUS) {
switch (c) {
case 'd': case 'D':
case 'w': case 'W':
case 's': case 'S':
tok.mergeRanges(this.getTokenForShorthand(c));
end = true;
break;
case 'i': case 'I':
case 'c': case 'C':
c = this.processCIinCharacterClass(tok, c);
if (c < 0) end = true;
break;
case 'p':
case 'P':
int pstart = this.offset;
RangeToken tok2 = this.processBacksolidus_pP(c);
if (tok2 == null) throw this.ex("parser.atom.5", pstart);
tok.mergeRanges(tok2);
end = true;
break;
case '-':
c = this.decodeEscaped();
wasDecoded = true;
break;
default:
c = this.decodeEscaped();
} // \ + c
} // backsolidus
else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
// Subraction
if (nrange) {
base.subtractRanges(tok);
tok = base;
}
RangeToken range2 = this.parseCharacterClass(false);
tok.subtractRanges(range2);
if (this.read() != T_CHAR || this.chardata != ']')
throw this.ex("parser.cc.5", this.offset);
break; // Exit this loop
}
this.next();
if (!end) { // if not shorthands...
if (type == T_CHAR) {
if (c == '[') throw this.ex("parser.cc.6", this.offset-2);
if (c == ']') throw this.ex("parser.cc.7", this.offset-2);
if (c == '-' && this.chardata != ']' && !firstloop) throw this.ex("parser.cc.8", this.offset-2); // if regex = '[-]' then invalid
}
if (this.read() != T_CHAR || this.chardata != '-' || c == '-' && firstloop) { // Here is no '-'.
if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
tok.addRange(c, c);
}
else {
addCaseInsensitiveChar(tok, c);
}
} else { // Found '-'
// Is this '-' is a from-to token??
this.next(); // Skips '-'
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
// c '-' ']' -> '-' is a single-range.
if(type == T_CHAR && this.chardata == ']') { // if - is at the last position of the group
if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
tok.addRange(c, c);
}
else {
addCaseInsensitiveChar(tok, c);
}
tok.addRange('-', '-');
}
else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
throw this.ex("parser.cc.8", this.offset-1);
} else {
int rangeend = this.chardata;
if (type == T_CHAR) {
if (rangeend == '[') throw this.ex("parser.cc.6", this.offset-1);
if (rangeend == ']') throw this.ex("parser.cc.7", this.offset-1);
if (rangeend == '-') throw this.ex("parser.cc.8", this.offset-2);
}
else if (type == T_BACKSOLIDUS)
rangeend = this.decodeEscaped();
this.next();
if (c > rangeend) throw this.ex("parser.ope.3", this.offset-1);
if (!this.isSet(RegularExpression.IGNORE_CASE) ||
(c > 0xffff && rangeend > 0xffff)) {
tok.addRange(c, rangeend);
}
else {
addCaseInsensitiveCharRange(tok, c, rangeend);
}
}
}
}
firstloop = false;
}
if (this.read() == T_EOF)
throw this.ex("parser.cc.2", this.offset);
tok.sortRanges();
tok.compactRanges();
//tok.dumpRanges();
this.setContext(S_NORMAL);
this.next(); // Skips ']'
return tok;
}
protected RangeToken parseSetOperations() throws ParseException {
throw this.ex("parser.process.1", this.offset);
}
Token getTokenForShorthand(int ch) {
switch (ch) {
case 'd':
return ParserForXMLSchema.getRange("xml:isDigit", true);
case 'D':
return ParserForXMLSchema.getRange("xml:isDigit", false);
case 'w':
return ParserForXMLSchema.getRange("xml:isWord", true);
case 'W':
return ParserForXMLSchema.getRange("xml:isWord", false);
case 's':
return ParserForXMLSchema.getRange("xml:isSpace", true);
case 'S':
return ParserForXMLSchema.getRange("xml:isSpace", false);
case 'c':
return ParserForXMLSchema.getRange("xml:isNameChar", true);
case 'C':
return ParserForXMLSchema.getRange("xml:isNameChar", false);
case 'i':
return ParserForXMLSchema.getRange("xml:isInitialNameChar", true);
case 'I':
return ParserForXMLSchema.getRange("xml:isInitialNameChar", false);
default:
throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
}
}
int decodeEscaped() throws ParseException {
if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
int c = this.chardata;
switch (c) {
case 'n': c = '\n'; break; // LINE FEED U+000A
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
case '\\':
case '|':
case '.':
case '^':
case '-':
case '?':
case '*':
case '+':
case '{':
case '}':
case '(':
case ')':
case '[':
case ']':
break; // return actucal char
default:
throw ex("parser.process.1", this.offset-2);
}
return c;
}
static private Hashtable ranges = null;
static private Hashtable ranges2 = null;
static synchronized protected RangeToken getRange(String name, boolean positive) {
if (ranges == null) {
ranges = new Hashtable();
ranges2 = new Hashtable();
Token tok = Token.createRange();
setupRange(tok, SPACES);
ranges.put("xml:isSpace", tok);
ranges2.put("xml:isSpace", Token.complementRanges(tok));
tok = Token.createRange();
setupRange(tok, DIGITS);
ranges.put("xml:isDigit", tok);
ranges2.put("xml:isDigit", Token.complementRanges(tok));
tok = Token.createRange();
setupRange(tok, DIGITS);
ranges.put("xml:isDigit", tok);
ranges2.put("xml:isDigit", Token.complementRanges(tok));
tok = Token.createRange();
setupRange(tok, LETTERS);
tok.mergeRanges((Token)ranges.get("xml:isDigit"));
ranges.put("xml:isWord", tok);
ranges2.put("xml:isWord", Token.complementRanges(tok));
tok = Token.createRange();
setupRange(tok, NAMECHARS);
ranges.put("xml:isNameChar", tok);
ranges2.put("xml:isNameChar", Token.complementRanges(tok));
tok = Token.createRange();
setupRange(tok, LETTERS);
tok.addRange('_', '_');
tok.addRange(':', ':');
ranges.put("xml:isInitialNameChar", tok);
ranges2.put("xml:isInitialNameChar", Token.complementRanges(tok));
}
RangeToken tok = positive ? (RangeToken)ranges.get(name)
: (RangeToken)ranges2.get(name);
return tok;
}
static void setupRange(Token range, String src) {
int len = src.length();
for (int i = 0; i < len; i += 2)
range.addRange(src.charAt(i), src.charAt(i+1));
}
private static final String SPACES = "\t\n\r\r ";
private static final String NAMECHARS =
"\u002d\u002e\u0030\u003a\u0041\u005a\u005f\u005f\u0061\u007a\u00b7\u00b7\u00c0\u00d6"
+"\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0"
+"\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u02d0\u02d1\u0300\u0345\u0360\u0361"
+"\u0386\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da\u03dc\u03dc"
+"\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481"
+"\u0483\u0486\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
+"\u0531\u0556\u0559\u0559\u0561\u0586\u0591\u05a1\u05a3\u05b9\u05bb\u05bd\u05bf\u05bf"
+"\u05c1\u05c2\u05c4\u05c4\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0640\u0652\u0660\u0669"
+"\u0670\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06e8\u06ea\u06ed\u06f0\u06f9"
+"\u0901\u0903\u0905\u0939\u093c\u094d\u0951\u0954\u0958\u0963\u0966\u096f\u0981\u0983"
+"\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2\u09b6\u09b9\u09bc\u09bc"
+"\u09be\u09c4\u09c7\u09c8\u09cb\u09cd\u09d7\u09d7\u09dc\u09dd\u09df\u09e3\u09e6\u09f1"
+"\u0a02\u0a02\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36"
+"\u0a38\u0a39\u0a3c\u0a3c\u0a3e\u0a42\u0a47\u0a48\u0a4b\u0a4d\u0a59\u0a5c\u0a5e\u0a5e"
+"\u0a66\u0a74\u0a81\u0a83\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0"
+"\u0ab2\u0ab3\u0ab5\u0ab9\u0abc\u0ac5\u0ac7\u0ac9\u0acb\u0acd\u0ae0\u0ae0\u0ae6\u0aef"
+"\u0b01\u0b03\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39"
+"\u0b3c\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b5c\u0b5d\u0b5f\u0b61\u0b66\u0b6f"
+"\u0b82\u0b83\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f"
+"\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd"
+"\u0bd7\u0bd7\u0be7\u0bef\u0c01\u0c03\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33"
+"\u0c35\u0c39\u0c3e\u0c44\u0c46\u0c48\u0c4a\u0c4d\u0c55\u0c56\u0c60\u0c61\u0c66\u0c6f"
+"\u0c82\u0c83\u0c85\u0c8c\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cbe\u0cc4"
+"\u0cc6\u0cc8\u0cca\u0ccd\u0cd5\u0cd6\u0cde\u0cde\u0ce0\u0ce1\u0ce6\u0cef\u0d02\u0d03"
+"\u0d05\u0d0c\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d"
+"\u0d57\u0d57\u0d60\u0d61\u0d66\u0d6f\u0e01\u0e2e\u0e30\u0e3a\u0e40\u0e4e\u0e50\u0e59"
+"\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97\u0e99\u0e9f"
+"\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb9\u0ebb\u0ebd"
+"\u0ec0\u0ec4\u0ec6\u0ec6\u0ec8\u0ecd\u0ed0\u0ed9\u0f18\u0f19\u0f20\u0f29\u0f35\u0f35"
+"\u0f37\u0f37\u0f39\u0f39\u0f3e\u0f47\u0f49\u0f69\u0f71\u0f84\u0f86\u0f8b\u0f90\u0f95"
+"\u0f97\u0f97\u0f99\u0fad\u0fb1\u0fb7\u0fb9\u0fb9\u10a0\u10c5\u10d0\u10f6\u1100\u1100"
+"\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c\u113e\u113e"
+"\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159\u115f\u1161"
+"\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173\u1175\u1175"
+"\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba\u11bc\u11c2"
+"\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d"
+"\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d\u1f5f\u1f7d"
+"\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb"
+"\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u20d0\u20dc\u20e1\u20e1\u2126\u2126\u212a\u212b"
+"\u212e\u212e\u2180\u2182\u3005\u3005\u3007\u3007\u3021\u302f\u3031\u3035\u3041\u3094"
+"\u3099\u309a\u309d\u309e\u30a1\u30fa\u30fc\u30fe\u3105\u312c\u4e00\u9fa5\uac00\ud7a3"
+"";
private static final String LETTERS =
"\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148"
+"\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1"
+"\u0386\u0386\u0388\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da"
+"\u03dc\u03dc\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c"
+"\u045e\u0481\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
+"\u0531\u0556\u0559\u0559\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a"
+"\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06d5\u06e5\u06e6\u0905\u0939"
+"\u093d\u093d\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2"
+"\u09b6\u09b9\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28"
+"\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a5e\u0a5e\u0a72\u0a74"
+"\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9"
+"\u0abd\u0abd\u0ae0\u0ae0\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33"
+"\u0b36\u0b39\u0b3d\u0b3d\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95"
+"\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9"
+"\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c"
+"\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cde\u0cde\u0ce0\u0ce1\u0d05\u0d0c"
+"\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e30\u0e30\u0e32\u0e33"
+"\u0e40\u0e45\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97"
+"\u0e99\u0e9f\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb0"
+"\u0eb2\u0eb3\u0ebd\u0ebd\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69\u10a0\u10c5\u10d0\u10f6"
+"\u1100\u1100\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c"
+"\u113e\u113e\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159"
+"\u115f\u1161\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173"
+"\u1175\u1175\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba"
+"\u11bc\u11c2\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15"
+"\u1f18\u1f1d\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d"
+"\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3"
+"\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u2126\u2126\u212a\u212b\u212e\u212e"
+"\u2180\u2182\u3007\u3007\u3021\u3029\u3041\u3094\u30a1\u30fa\u3105\u312c\u4e00\u9fa5"
+"\uac00\ud7a3";
private static final String DIGITS =
"\u0030\u0039\u0660\u0669\u06F0\u06F9\u0966\u096F\u09E6\u09EF\u0A66\u0A6F\u0AE6\u0AEF"
+"\u0B66\u0B6F\u0BE7\u0BEF\u0C66\u0C6F\u0CE6\u0CEF\u0D66\u0D6F\u0E50\u0E59\u0ED0\u0ED9"
+"\u0F20\u0F29";
}
/**
* This class represents a character class such as [a-z] or a period.
*
* @xerces.internal
*
* @version $Id: RangeToken.java 965250 2010-07-18 16:04:58Z mrglavas $
*/
final static class RangeToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = -553983121197679934L;
int[] ranges;
boolean sorted;
boolean compacted;
RangeToken icaseCache = null;
int[] map = null;
int nonMapIndex;
RangeToken(int type) {
super(type);
this.setSorted(false);
}
// for RANGE or NRANGE
protected void addRange(int start, int end) {
this.icaseCache = null;
//System.err.println("Token#addRange(): "+start+" "+end);
int r1, r2;
if (start <= end) {
r1 = start;
r2 = end;
} else {
r1 = end;
r2 = start;
}
int pos = 0;
if (this.ranges == null) {
this.ranges = new int[2];
this.ranges[0] = r1;
this.ranges[1] = r2;
this.setSorted(true);
} else {
pos = this.ranges.length;
if (this.ranges[pos-1]+1 == r1) {
this.ranges[pos-1] = r2;
return;
}
int[] temp = new int[pos+2];
System.arraycopy(this.ranges, 0, temp, 0, pos);
this.ranges = temp;
if (this.ranges[pos-1] >= r1)
this.setSorted(false);
this.ranges[pos++] = r1;
this.ranges[pos] = r2;
if (!this.sorted)
this.sortRanges();
}
}
private final boolean isSorted() {
return this.sorted;
}
private final void setSorted(boolean sort) {
this.sorted = sort;
if (!sort) this.compacted = false;
}
private final boolean isCompacted() {
return this.compacted;
}
private final void setCompacted() {
this.compacted = true;
}
protected void sortRanges() {
if (this.isSorted())
return;
if (this.ranges == null)
return;
//System.err.println("Do sorting: "+this.ranges.length);
// Bubble sort
// Why? -- In many cases,
// this.ranges has few elements.
for (int i = this.ranges.length-4; i >= 0; i -= 2) {
for (int j = 0; j <= i; j += 2) {
if (this.ranges[j] > this.ranges[j+2]
|| this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
int tmp;
tmp = this.ranges[j+2];
this.ranges[j+2] = this.ranges[j];
this.ranges[j] = tmp;
tmp = this.ranges[j+3];
this.ranges[j+3] = this.ranges[j+1];
this.ranges[j+1] = tmp;
}
}
}
this.setSorted(true);
}
/**
* this.ranges is sorted.
*/
protected void compactRanges() {
boolean DEBUG = false;
if (this.ranges == null || this.ranges.length <= 2)
return;
if (this.isCompacted())
return;
int base = 0; // Index of writing point
int target = 0; // Index of processing point
while (target < this.ranges.length) {
if (base != target) {
this.ranges[base] = this.ranges[target++];
this.ranges[base+1] = this.ranges[target++];
} else
target += 2;
int baseend = this.ranges[base+1];
while (target < this.ranges.length) {
if (baseend+1 < this.ranges[target])
break;
if (baseend+1 == this.ranges[target]) {
if (DEBUG)
System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
+", "+this.ranges[base+1]
+"], ["+this.ranges[target]
+", "+this.ranges[target+1]
+"] -> ["+this.ranges[base]
+", "+this.ranges[target+1]
+"]");
this.ranges[base+1] = this.ranges[target+1];
baseend = this.ranges[base+1];
target += 2;
} else if (baseend >= this.ranges[target+1]) {
if (DEBUG)
System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
+", "+this.ranges[base+1]
+"], ["+this.ranges[target]
+", "+this.ranges[target+1]
+"] -> ["+this.ranges[base]
+", "+this.ranges[base+1]
+"]");
target += 2;
} else if (baseend < this.ranges[target+1]) {
if (DEBUG)
System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
+", "+this.ranges[base+1]
+"], ["+this.ranges[target]
+", "+this.ranges[target+1]
+"] -> ["+this.ranges[base]
+", "+this.ranges[target+1]
+"]");
this.ranges[base+1] = this.ranges[target+1];
baseend = this.ranges[base+1];
target += 2;
} else {
throw new RuntimeException("Token#compactRanges(): Internel Error: ["
+this.ranges[base]
+","+this.ranges[base+1]
+"] ["+this.ranges[target]
+","+this.ranges[target+1]+"]");
}
} // while
base += 2;
}
if (base != this.ranges.length) {
int[] result = new int[base];
System.arraycopy(this.ranges, 0, result, 0, base);
this.ranges = result;
}
this.setCompacted();
}
protected void mergeRanges(Token token) {
RangeToken tok = (RangeToken)token;
this.sortRanges();
tok.sortRanges();
if (tok.ranges == null)
return;
this.icaseCache = null;
this.setSorted(true);
if (this.ranges == null) {
this.ranges = new int[tok.ranges.length];
System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
return;
}
int[] result = new int[this.ranges.length+tok.ranges.length];
for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
if (i >= this.ranges.length) {
result[k++] = tok.ranges[j++];
result[k++] = tok.ranges[j++];
} else if (j >= tok.ranges.length) {
result[k++] = this.ranges[i++];
result[k++] = this.ranges[i++];
} else if (tok.ranges[j] < this.ranges[i]
|| tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
result[k++] = tok.ranges[j++];
result[k++] = tok.ranges[j++];
} else {
result[k++] = this.ranges[i++];
result[k++] = this.ranges[i++];
}
}
this.ranges = result;
}
protected void subtractRanges(Token token) {
if (token.type == NRANGE) {
this.intersectRanges(token);
return;
}
RangeToken tok = (RangeToken)token;
if (tok.ranges == null || this.ranges == null)
return;
this.icaseCache = null;
this.sortRanges();
this.compactRanges();
tok.sortRanges();
tok.compactRanges();
//System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
int[] result = new int[this.ranges.length+tok.ranges.length];
int wp = 0, src = 0, sub = 0;
while (src < this.ranges.length && sub < tok.ranges.length) {
int srcbegin = this.ranges[src];
int srcend = this.ranges[src+1];
int subbegin = tok.ranges[sub];
int subend = tok.ranges[sub+1];
if (srcend < subbegin) { // Not overlapped
// src: o-----o
// sub: o-----o
// res: o-----o
// Reuse sub
result[wp++] = this.ranges[src++];
result[wp++] = this.ranges[src++];
} else if (srcend >= subbegin
&& srcbegin <= subend) { // Overlapped
// src: o--------o
// sub: o----o
// sub: o----o
// sub: o----o
// sub: o------------o
if (subbegin <= srcbegin && srcend <= subend) {
// src: o--------o
// sub: o------------o
// res: empty
// Reuse sub
src += 2;
} else if (subbegin <= srcbegin) {
// src: o--------o
// sub: o----o
// res: o-----o
// Reuse src(=res)
this.ranges[src] = subend+1;
sub += 2;
} else if (srcend <= subend) {
// src: o--------o
// sub: o----o
// res: o-----o
// Reuse sub
result[wp++] = srcbegin;
result[wp++] = subbegin-1;
src += 2;
} else {
// src: o--------o
// sub: o----o
// res: o-o o-o
// Reuse src(=right res)
result[wp++] = srcbegin;
result[wp++] = subbegin-1;
this.ranges[src] = subend+1;
sub += 2;
}
} else if (subend < srcbegin) {
// Not overlapped
// src: o-----o
// sub: o----o
sub += 2;
} else {
throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
+","+this.ranges[src+1]
+"] - ["+tok.ranges[sub]
+","+tok.ranges[sub+1]
+"]");
}
}
while (src < this.ranges.length) {
result[wp++] = this.ranges[src++];
result[wp++] = this.ranges[src++];
}
this.ranges = new int[wp];
System.arraycopy(result, 0, this.ranges, 0, wp);
// this.ranges is sorted and compacted.
}
/**
* @param tok Ignore whether it is NRANGE or not.
*/
protected void intersectRanges(Token token) {
RangeToken tok = (RangeToken)token;
if (tok.ranges == null || this.ranges == null)
return;
this.icaseCache = null;
this.sortRanges();
this.compactRanges();
tok.sortRanges();
tok.compactRanges();
int[] result = new int[this.ranges.length+tok.ranges.length];
int wp = 0, src1 = 0, src2 = 0;
while (src1 < this.ranges.length && src2 < tok.ranges.length) {
int src1begin = this.ranges[src1];
int src1end = this.ranges[src1+1];
int src2begin = tok.ranges[src2];
int src2end = tok.ranges[src2+1];
if (src1end < src2begin) { // Not overlapped
// src1: o-----o
// src2: o-----o
// res: empty
// Reuse src2
src1 += 2;
} else if (src1end >= src2begin
&& src1begin <= src2end) { // Overlapped
// src1: o--------o
// src2: o----o
// src2: o----o
// src2: o----o
// src2: o------------o
if (src2begin <= src1begin && src1end <= src2end) {
// src1: o--------o
// src2: o------------o
// res: o--------o
// Reuse src2
result[wp++] = src1begin;
result[wp++] = src1end;
src1 += 2;
} else if (src2begin <= src1begin) {
// src1: o--------o
// src2: o----o
// res: o--o
// Reuse the rest of src1
result[wp++] = src1begin;
result[wp++] = src2end;
this.ranges[src1] = src2end+1;
src2 += 2;
} else if (src1end <= src2end) {
// src1: o--------o
// src2: o----o
// res: o--o
// Reuse src2
result[wp++] = src2begin;
result[wp++] = src1end;
src1 += 2;
} else {
// src1: o--------o
// src2: o----o
// res: o----o
// Reuse the rest of src1
result[wp++] = src2begin;
result[wp++] = src2end;
this.ranges[src1] = src2end+1;
}
} else if (src2end < src1begin) {
// Not overlapped
// src1: o-----o
// src2: o----o
src2 += 2;
} else {
throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
+this.ranges[src1]
+","+this.ranges[src1+1]
+"] & ["+tok.ranges[src2]
+","+tok.ranges[src2+1]
+"]");
}
}
while (src1 < this.ranges.length) {
result[wp++] = this.ranges[src1++];
result[wp++] = this.ranges[src1++];
}
this.ranges = new int[wp];
System.arraycopy(result, 0, this.ranges, 0, wp);
// this.ranges is sorted and compacted.
}
/**
* for RANGE: Creates complement.
* for NRANGE: Creates the same meaning RANGE.
*/
static Token complementRanges(Token token) {
if (token.type != RANGE && token.type != NRANGE)
throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
RangeToken tok = (RangeToken)token;
tok.sortRanges();
tok.compactRanges();
int len = tok.ranges.length+2;
if (tok.ranges[0] == 0)
len -= 2;
int last = tok.ranges[tok.ranges.length-1];
if (last == UTF16_MAX)
len -= 2;
RangeToken ret = Token.createRange();
ret.ranges = new int[len];
int wp = 0;
if (tok.ranges[0] > 0) {
ret.ranges[wp++] = 0;
ret.ranges[wp++] = tok.ranges[0]-1;
}
for (int i = 1; i < tok.ranges.length-2; i += 2) {
ret.ranges[wp++] = tok.ranges[i]+1;
ret.ranges[wp++] = tok.ranges[i+1]-1;
}
if (last != UTF16_MAX) {
ret.ranges[wp++] = last+1;
ret.ranges[wp] = UTF16_MAX;
}
ret.setCompacted();
return ret;
}
synchronized RangeToken getCaseInsensitiveToken() {
if (this.icaseCache != null)
return this.icaseCache;
RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
for (int i = 0; i < this.ranges.length; i += 2) {
for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
if (ch > 0xffff)
uppers.addRange(ch, ch);
else {
char uch = Character.toUpperCase((char)ch);
uppers.addRange(uch, uch);
}
}
}
RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
for (int i = 0; i < uppers.ranges.length; i += 2) {
for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
if (ch > 0xffff)
lowers.addRange(ch, ch);
else {
char uch = Character.toLowerCase((char)ch);
lowers.addRange(uch, uch);
}
}
}
lowers.mergeRanges(uppers);
lowers.mergeRanges(this);
lowers.compactRanges();
this.icaseCache = lowers;
return lowers;
}
void dumpRanges() {
System.err.print("RANGE: ");
if (this.ranges == null) {
System.err.println(" NULL");
return;
}
for (int i = 0; i < this.ranges.length; i += 2) {
System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
}
System.err.println("");
}
boolean match(int ch) {
if (this.map == null) this.createMap();
boolean ret;
if (this.type == RANGE) {
if (ch < MAPSIZE)
return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
ret = false;
for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
return true;
}
} else {
if (ch < MAPSIZE)
return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
ret = true;
for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
return false;
}
}
return ret;
}
private static final int MAPSIZE = 256;
private void createMap() {
int asize = MAPSIZE/32; // 32 is the number of bits in `int'.
int [] map = new int[asize];
int nonMapIndex = this.ranges.length;
for (int i = 0; i < asize; ++i) {
map[i] = 0;
}
for (int i = 0; i < this.ranges.length; i += 2) {
int s = this.ranges[i];
int e = this.ranges[i+1];
if (s < MAPSIZE) {
for (int j = s; j <= e && j < MAPSIZE; j++) {
map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
}
}
else {
nonMapIndex = i;
break;
}
if (e >= MAPSIZE) {
nonMapIndex = i;
break;
}
}
this.map = map;
this.nonMapIndex = nonMapIndex;
//for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
}
public String toString(int options) {
String ret;
if (this.type == RANGE) {
if (this == Token.token_dot)
ret = ".";
else if (this == Token.token_0to9)
ret = "\\d";
else if (this == Token.token_wordchars)
ret = "\\w";
else if (this == Token.token_spaces)
ret = "\\s";
else {
StringBuilder sb = new StringBuilder();
sb.append('[');
for (int i = 0; i < this.ranges.length; i += 2) {
if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(',');
if (this.ranges[i] == this.ranges[i+1]) {
sb.append(escapeCharInCharClass(this.ranges[i]));
} else {
sb.append(escapeCharInCharClass(this.ranges[i]));
sb.append((char)'-');
sb.append(escapeCharInCharClass(this.ranges[i+1]));
}
}
sb.append(']');
ret = sb.toString();
}
} else {
if (this == Token.token_not_0to9)
ret = "\\D";
else if (this == Token.token_not_wordchars)
ret = "\\W";
else if (this == Token.token_not_spaces)
ret = "\\S";
else {
StringBuilder sb = new StringBuilder();
sb.append("[^");
for (int i = 0; i < this.ranges.length; i += 2) {
if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(',');
if (this.ranges[i] == this.ranges[i+1]) {
sb.append(escapeCharInCharClass(this.ranges[i]));
} else {
sb.append(escapeCharInCharClass(this.ranges[i]));
sb.append('-');
sb.append(escapeCharInCharClass(this.ranges[i+1]));
}
}
sb.append(']');
ret = sb.toString();
}
}
return ret;
}
private static String escapeCharInCharClass(int ch) {
String ret;
switch (ch) {
case '[': case ']': case '-': case '^':
case ',': case '\\':
ret = "\\"+(char)ch;
break;
case '\f': ret = "\\f"; break;
case '\n': ret = "\\n"; break;
case '\r': ret = "\\r"; break;
case '\t': ret = "\\t"; break;
case 0x1b: ret = "\\e"; break;
//case 0x0b: ret = "\\v"; break;
default:
if (ch < 0x20) {
String pre = "0"+Integer.toHexString(ch);
ret = "\\x"+pre.substring(pre.length()-2, pre.length());
} else if (ch >= 0x10000) {
String pre = "0"+Integer.toHexString(ch);
ret = "\\v"+pre.substring(pre.length()-6, pre.length());
} else
ret = ""+(char)ch;
}
return ret;
}
}
/**
* A Regular Expression Parser.
*
* @xerces.internal
*
* @version $Id: RegexParser.java 1033661 2010-11-10 19:31:44Z knoaman $
*/
static class RegexParser {
static final int T_CHAR = 0;
static final int T_EOF = 1;
static final int T_OR = 2; // '|'
static final int T_STAR = 3; // '*'
static final int T_PLUS = 4; // '+'
static final int T_QUESTION = 5; // '?'
static final int T_LPAREN = 6; // '('
static final int T_RPAREN = 7; // ')'
static final int T_DOT = 8; // '.'
static final int T_LBRACKET = 9; // '['
static final int T_BACKSOLIDUS = 10; // '\'
static final int T_CARET = 11; // '^'
static final int T_DOLLAR = 12; // '$'
static final int T_LPAREN2 = 13; // '(?:'
static final int T_LOOKAHEAD = 14; // '(?='
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
static final int T_LOOKBEHIND = 16; // '(?<='
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
static final int T_INDEPENDENT = 18; // '(?>'
static final int T_SET_OPERATIONS = 19; // '(?['
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
static final int T_COMMENT = 21; // '(?#'
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
static final int T_CONDITION = 23; // '(?('
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
static class ReferencePosition {
int refNumber;
int position;
ReferencePosition(int n, int pos) {
this.refNumber = n;
this.position = pos;
}
}
int offset;
String regex;
int regexlen;
int options;
ResourceBundle resources;
int chardata;
int nexttoken;
static protected final int S_NORMAL = 0;
static protected final int S_INBRACKETS = 1;
static protected final int S_INXBRACKETS = 2;
int context = S_NORMAL;
int parenOpened = 1;
int parennumber = 1;
boolean hasBackReferences;
Vector references = null;
public RegexParser() {
this.setLocale(Locale.getDefault());
}
public RegexParser(Locale locale) {
this.setLocale(locale);
}
public void setLocale(Locale locale) {
/*
try {
if (locale != null) {
this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
}
else {
this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message");
}
}
catch (MissingResourceException mre) {
throw new RuntimeException("Installation Problem??? Couldn't load messages: "
+ mre.getMessage());
}
*/
}
final ParseException ex(String key, int loc) {
return new ParseException(EcorePlugin.INSTANCE.getString(key), loc);
}
protected final boolean isSet(int flag) {
return (this.options & flag) == flag;
}
synchronized Token parse(String regex, int options) throws ParseException {
this.options = options;
this.offset = 0;
this.setContext(S_NORMAL);
this.parennumber = 1;
this.parenOpened = 1;
this.hasBackReferences = false;
this.regex = regex;
if (this.isSet(RegularExpression.EXTENDED_COMMENT))
this.regex = REUtil.stripExtendedComment(this.regex);
this.regexlen = this.regex.length();
this.next();
Token ret = this.parseRegex();
if (this.offset != this.regexlen)
throw ex("parser.parse.1", this.offset);
if (this.references != null) {
for (int i = 0; i < this.references.size(); i ++) {
ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
if (this.parennumber <= position.refNumber)
throw ex("parser.parse.2", position.position);
}
this.references.removeAllElements();
}
return ret;
}
/*
public RegularExpression createRegex(String regex, int options) throws ParseException {
Token tok = this.parse(regex, options);
return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
}
*/
protected final void setContext(int con) {
this.context = con;
}
final int read() {
return this.nexttoken;
}
final void next() {
if (this.offset >= this.regexlen) {
this.chardata = -1;
this.nexttoken = T_EOF;
return;
}
int ret;
int ch = this.regex.charAt(this.offset++);
this.chardata = ch;
if (this.context == S_INBRACKETS) {
// In a character class, this.chardata has one character, that is to say,
// a pair of surrogates is composed and stored to this.chardata.
switch (ch) {
case '\\':
ret = T_BACKSOLIDUS;
if (this.offset >= this.regexlen)
throw ex("parser.next.1", this.offset-1);
this.chardata = this.regex.charAt(this.offset++);
break;
case '-':
// Allow character class subtraction (regardless of whether we are in
// XML Schema mode or not)
if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
this.offset++;
ret = T_XMLSCHEMA_CC_SUBTRACTION;
} else
ret = T_CHAR;
break;
case '[':
if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
&& this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
this.offset++;
ret = T_POSIX_CHARCLASS_START;
break;
} // Through down
default:
if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
int low = this.regex.charAt(this.offset);
if (REUtil.isLowSurrogate(low)) {
this.chardata = REUtil.composeFromSurrogates(ch, low);
this.offset ++;
}
}
ret = T_CHAR;
}
this.nexttoken = ret;
return;
}
switch (ch) {
case '|': ret = T_OR; break;
case '*': ret = T_STAR; break;
case '+': ret = T_PLUS; break;
case '?': ret = T_QUESTION; break;
case ')': ret = T_RPAREN; break;
case '.': ret = T_DOT; break;
case '[': ret = T_LBRACKET; break;
case '^':
if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
ret = T_CHAR;
}
else {
ret = T_CARET;
}
break;
case '$':
if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
ret = T_CHAR;
}
else {
ret = T_DOLLAR;
}
break;
case '(':
ret = T_LPAREN;
if (this.offset >= this.regexlen)
break;
if (this.regex.charAt(this.offset) != '?')
break;
if (++this.offset >= this.regexlen)
throw ex("parser.next.2", this.offset-1);
ch = this.regex.charAt(this.offset++);
switch (ch) {
case ':': ret = T_LPAREN2; break;
case '=': ret = T_LOOKAHEAD; break;
case '!': ret = T_NEGATIVELOOKAHEAD; break;
case '[': ret = T_SET_OPERATIONS; break;
case '>': ret = T_INDEPENDENT; break;
case '<':
if (this.offset >= this.regexlen)
throw ex("parser.next.2", this.offset-3);
ch = this.regex.charAt(this.offset++);
if (ch == '=') {
ret = T_LOOKBEHIND;
} else if (ch == '!') {
ret = T_NEGATIVELOOKBEHIND;
} else
throw ex("parser.next.3", this.offset-3);
break;
case '#':
while (this.offset < this.regexlen) {
ch = this.regex.charAt(this.offset++);
if (ch == ')') break;
}
if (ch != ')')
throw ex("parser.next.4", this.offset-1);
ret = T_COMMENT;
break;
default:
if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
this.offset --;
ret = T_MODIFIERS;
break;
} else if (ch == '(') { // conditional
ret = T_CONDITION; // this.offsets points the next of '('.
break;
}
throw ex("parser.next.2", this.offset-2);
}
break;
case '\\':
ret = T_BACKSOLIDUS;
if (this.offset >= this.regexlen)
throw ex("parser.next.1", this.offset-1);
this.chardata = this.regex.charAt(this.offset++);
break;
default:
ret = T_CHAR;
}
this.nexttoken = ret;
}
/**
* regex ::= term (`|` term)*
* term ::= factor+
* factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
* | atom (('*' | '+' | '?' | minmax ) '?'? )?)
* | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
* atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
*/
Token parseRegex() throws ParseException {
Token tok = this.parseTerm();
Token parent = null;
while (this.read() == T_OR) {
this.next(); // '|'
if (parent == null) {
parent = Token.createUnion();
parent.addChild(tok);
tok = parent;
}
tok.addChild(this.parseTerm());
}
return tok;
}
/**
* term ::= factor+
*/
Token parseTerm() throws ParseException {
int ch = this.read();
if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
return Token.createEmpty();
} else {
Token tok = this.parseFactor();
Token concat = null;
while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
if (concat == null) {
concat = Token.createConcat();
concat.addChild(tok);
tok = concat;
}
concat.addChild(this.parseFactor());
//tok = Token.createConcat(tok, this.parseFactor());
}
return tok;
}
}
// ----------------------------------------------------------------
Token processCaret() throws ParseException {
this.next();
return Token.token_linebeginning;
}
Token processDollar() throws ParseException {
this.next();
return Token.token_lineend;
}
Token processLookahead() throws ParseException {
this.next();
Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // ')'
return tok;
}
Token processNegativelookahead() throws ParseException {
this.next();
Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // ')'
return tok;
}
Token processLookbehind() throws ParseException {
this.next();
Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // ')'
return tok;
}
Token processNegativelookbehind() throws ParseException {
this.next();
Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // ')'
return tok;
}
Token processBacksolidus_A() throws ParseException {
this.next();
return Token.token_stringbeginning;
}
Token processBacksolidus_Z() throws ParseException {
this.next();
return Token.token_stringend2;
}
Token processBacksolidus_z() throws ParseException {
this.next();
return Token.token_stringend;
}
Token processBacksolidus_b() throws ParseException {
this.next();
return Token.token_wordedge;
}
Token processBacksolidus_B() throws ParseException {
this.next();
return Token.token_not_wordedge;
}
Token processBacksolidus_lt() throws ParseException {
this.next();
return Token.token_wordbeginning;
}
Token processBacksolidus_gt() throws ParseException {
this.next();
return Token.token_wordend;
}
Token processStar(Token tok) throws ParseException {
this.next();
if (this.read() == T_QUESTION) {
this.next();
return Token.createNGClosure(tok);
} else
return Token.createClosure(tok);
}
Token processPlus(Token tok) throws ParseException {
// X+ -> XX*
this.next();
if (this.read() == T_QUESTION) {
this.next();
return Token.createConcat(tok, Token.createNGClosure(tok));
} else
return Token.createConcat(tok, Token.createClosure(tok));
}
Token processQuestion(Token tok) throws ParseException {
// X? -> X|
this.next();
Token par = Token.createUnion();
if (this.read() == T_QUESTION) {
this.next();
par.addChild(Token.createEmpty());
par.addChild(tok);
} else {
par.addChild(tok);
par.addChild(Token.createEmpty());
}
return par;
}
boolean checkQuestion(int off) {
return off < this.regexlen && this.regex.charAt(off) == '?';
}
Token processParen() throws ParseException {
this.next();
int p = this.parenOpened++;
Token tok = Token.createParen(this.parseRegex(), p);
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.parennumber++;
this.next(); // Skips ')'
return tok;
}
Token processParen2() throws ParseException {
this.next();
Token tok = Token.createParen(this.parseRegex(), 0);
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // Skips ')'
return tok;
}
Token processCondition() throws ParseException {
// this.offset points the next of '('
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
// Parses a condition.
int refno = -1;
Token condition = null;
int ch = this.regex.charAt(this.offset);
if ('1' <= ch && ch <= '9') {
refno = ch-'0';
int finalRefno = refno;
if (this.parennumber <= refno)
throw ex("parser.parse.2", this.offset);
while (this.offset + 1 < this.regexlen) {
ch = this.regex.charAt(this.offset + 1);
if ('0' <= ch && ch <= '9') {
refno = (refno * 10) + (ch - '0');
if (refno < this.parennumber) {
finalRefno= refno;
++this.offset;
}
else {
break;
}
}
else {
break;
}
}
this.hasBackReferences = true;
if (this.references == null) this.references = new Vector();
this.references.addElement(new ReferencePosition(finalRefno, this.offset));
this.offset ++;
if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
this.offset ++;
} else {
if (ch == '?') this.offset --; // Points '('.
this.next();
condition = this.parseFactor();
switch (condition.type) {
case Token.LOOKAHEAD:
case Token.NEGATIVELOOKAHEAD:
case Token.LOOKBEHIND:
case Token.NEGATIVELOOKBEHIND:
break;
case Token.ANCHOR:
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
break;
default:
throw ex("parser.factor.5", this.offset);
}
}
// Parses yes/no-patterns.
this.next();
Token yesPattern = this.parseRegex();
Token noPattern = null;
if (yesPattern.type == Token.UNION) {
if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
noPattern = yesPattern.getChild(1);
yesPattern = yesPattern.getChild(0);
}
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next();
return Token.createCondition(refno, condition, yesPattern, noPattern);
}
Token processModifiers() throws ParseException {
// this.offset points the next of '?'.
// modifiers ::= [imsw]* ('-' [imsw]*)? ':'
int add = 0, mask = 0, ch = -1;
while (this.offset < this.regexlen) {
ch = this.regex.charAt(this.offset);
int v = REUtil.getOptionValue(ch);
if (v == 0) break; // '-' or ':'?
add |= v;
this.offset ++;
}
if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
if (ch == '-') {
this.offset ++;
while (this.offset < this.regexlen) {
ch = this.regex.charAt(this.offset);
int v = REUtil.getOptionValue(ch);
if (v == 0) break; // ':'?
mask |= v;
this.offset ++;
}
if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
}
Token tok;
if (ch == ':') {
this.offset ++;
this.next();
tok = Token.createModifierGroup(this.parseRegex(), add, mask);
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next();
} else if (ch == ')') { // such as (?-i)
this.offset ++;
this.next();
tok = Token.createModifierGroup(this.parseRegex(), add, mask);
} else
throw ex("parser.factor.3", this.offset);
return tok;
}
Token processIndependent() throws ParseException {
this.next();
Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.next(); // Skips ')'
return tok;
}
Token processBacksolidus_c() throws ParseException {
int ch2; // Must be in 0x0040-0x005f
if (this.offset >= this.regexlen
|| ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
throw ex("parser.atom.1", this.offset-1);
this.next();
return Token.createChar(ch2-0x40);
}
Token processBacksolidus_C() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_i() throws ParseException {
Token tok = Token.createChar('i');
this.next();
return tok;
}
Token processBacksolidus_I() throws ParseException {
throw ex("parser.process.1", this.offset);
}
Token processBacksolidus_g() throws ParseException {
this.next();
return Token.getGraphemePattern();
}
Token processBacksolidus_X() throws ParseException {
this.next();
return Token.getCombiningCharacterSequence();
}
Token processBackreference() throws ParseException {
int refnum = this.chardata-'0';
int finalRefnum = refnum;
if (this.parennumber <= refnum)
throw ex("parser.parse.2", this.offset-2);
while (this.offset < this.regexlen) {
final int ch = this.regex.charAt(this.offset);
if ('0' <= ch && ch <= '9') {
refnum = (refnum * 10) + (ch - '0');
if (refnum < this.parennumber) {
++this.offset;
finalRefnum = refnum;
this.chardata = ch;
}
else {
break;
}
}
else {
break;
}
}
Token tok = Token.createBackReference(finalRefnum);
this.hasBackReferences = true;
if (this.references == null) this.references = new Vector();
this.references.addElement(new ReferencePosition(finalRefnum, this.offset-2));
this.next();
return tok;
}
// ----------------------------------------------------------------
/**
* factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
* | atom (('*' | '+' | '?' | minmax ) '?'? )?)
* | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
* | '(?#' [^)]* ')'
* minmax ::= '{' min (',' max?)? '}'
* min ::= [0-9]+
* max ::= [0-9]+
*/
Token parseFactor() throws ParseException {
int ch = this.read();
Token tok;
switch (ch) {
case T_CARET: return this.processCaret();
case T_DOLLAR: return this.processDollar();
case T_LOOKAHEAD: return this.processLookahead();
case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
case T_LOOKBEHIND: return this.processLookbehind();
case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
case T_COMMENT:
this.next();
return Token.createEmpty();
case T_BACKSOLIDUS:
switch (this.chardata) {
case 'A': return this.processBacksolidus_A();
case 'Z': return this.processBacksolidus_Z();
case 'z': return this.processBacksolidus_z();
case 'b': return this.processBacksolidus_b();
case 'B': return this.processBacksolidus_B();
case '<': return this.processBacksolidus_lt();
case '>': return this.processBacksolidus_gt();
}
// through down
}
tok = this.parseAtom();
ch = this.read();
switch (ch) {
case T_STAR: return this.processStar(tok);
case T_PLUS: return this.processPlus(tok);
case T_QUESTION: return this.processQuestion(tok);
case T_CHAR:
if (this.chardata == '{' && this.offset < this.regexlen) {
int off = this.offset; // this.offset -> next of '{'
int min = 0, max = -1;
if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
min = ch -'0';
while (off < this.regexlen
&& (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
min = min*10 +ch-'0';
if (min < 0)
throw ex("parser.quantifier.5", this.offset);
}
}
else {
throw ex("parser.quantifier.1", this.offset);
}
max = min;
if (ch == ',') {
if (off >= this.regexlen) {
throw ex("parser.quantifier.3", this.offset);
}
else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
max = ch -'0'; // {min,max}
while (off < this.regexlen
&& (ch = this.regex.charAt(off++)) >= '0'
&& ch <= '9') {
max = max*10 +ch-'0';
if (max < 0)
throw ex("parser.quantifier.5", this.offset);
}
if (min > max)
throw ex("parser.quantifier.4", this.offset);
}
else { // assume {min,}
max = -1;
}
}
if (ch != '}')
throw ex("parser.quantifier.2", this.offset);
if (this.checkQuestion(off)) { // off -> next of '}'
tok = Token.createNGClosure(tok);
this.offset = off+1;
} else {
tok = Token.createClosure(tok);
this.offset = off;
}
tok.setMin(min);
tok.setMax(max);
//System.err.println("CLOSURE: "+min+", "+max);
this.next();
}
}
return tok;
}
/**
* atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
* | '(?>' regex ')'
* char ::= '\\' | '\' [efnrt] | bmp-code | character-1
*/
Token parseAtom() throws ParseException {
int ch = this.read();
Token tok = null;
switch (ch) {
case T_LPAREN: return this.processParen();
case T_LPAREN2: return this.processParen2(); // '(?:'
case T_CONDITION: return this.processCondition(); // '(?('
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
case T_INDEPENDENT: return this.processIndependent();
case T_DOT:
this.next(); // Skips '.'
tok = Token.token_dot;
break;
/**
* char-class ::= '[' ( '^'? range ','?)+ ']'
* range ::= '\d' | '\w' | '\s' | category-block | range-char
* | range-char '-' range-char
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
* bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
*/
case T_LBRACKET: return this.parseCharacterClass(true);
case T_SET_OPERATIONS: return this.parseSetOperations();
case T_BACKSOLIDUS:
switch (this.chardata) {
case 'd': case 'D':
case 'w': case 'W':
case 's': case 'S':
tok = this.getTokenForShorthand(this.chardata);
this.next();
return tok;
case 'e': case 'f': case 'n': case 'r':
case 't': case 'u': case 'v': case 'x':
{
int ch2 = this.decodeEscaped();
if (ch2 < 0x10000) {
tok = Token.createChar(ch2);
} else {
tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
}
}
break;
case 'c': return this.processBacksolidus_c();
case 'C': return this.processBacksolidus_C();
case 'i': return this.processBacksolidus_i();
case 'I': return this.processBacksolidus_I();
case 'g': return this.processBacksolidus_g();
case 'X': return this.processBacksolidus_X();
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return this.processBackreference();
case 'P':
case 'p':
int pstart = this.offset;
tok = processBacksolidus_pP(this.chardata);
if (tok == null) throw this.ex("parser.atom.5", pstart);
break;
default:
tok = Token.createChar(this.chardata);
}
this.next();
break;
case T_CHAR:
if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
throw this.ex("parser.atom.4", this.offset-1);
tok = Token.createChar(this.chardata);
int high = this.chardata;
this.next();
if (REUtil.isHighSurrogate(high)
&& this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
char[] sur = new char[2];
sur[0] = (char)high;
sur[1] = (char)this.chardata;
tok = Token.createParen(Token.createString(new String(sur)), 0);
this.next();
}
break;
default:
throw this.ex("parser.atom.4", this.offset-1);
}
return tok;
}
protected RangeToken processBacksolidus_pP(int c) throws ParseException {
this.next();
if (this.read() != T_CHAR || this.chardata != '{')
throw this.ex("parser.atom.2", this.offset-1);
// handle category escape
boolean positive = c == 'p';
int namestart = this.offset;
int nameend = this.regex.indexOf('}', namestart);
if (nameend < 0)
throw this.ex("parser.atom.3", this.offset);
String pname = this.regex.substring(namestart, nameend);
this.offset = nameend+1;
return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
}
int processCIinCharacterClass(RangeToken tok, int c) {
return this.decodeEscaped();
}
/**
* char-class ::= '[' ( '^'? range ','?)+ ']'
* range ::= '\d' | '\w' | '\s' | category-block | range-char
* | range-char '-' range-char
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
* bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
*/
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
this.setContext(S_INBRACKETS);
this.next(); // '['
boolean nrange = false;
RangeToken base = null;
RangeToken tok;
if (this.read() == T_CHAR && this.chardata == '^') {
nrange = true;
this.next(); // '^'
if (useNrange) {
tok = Token.createNRange();
} else {
base = Token.createRange();
base.addRange(0, Token.UTF16_MAX);
tok = Token.createRange();
}
} else {
tok = Token.createRange();
}
int type;
boolean firstloop = true;
while ((type = this.read()) != T_EOF) {
if (type == T_CHAR && this.chardata == ']' && !firstloop)
break;
int c = this.chardata;
boolean end = false;
if (type == T_BACKSOLIDUS) {
switch (c) {
case 'd': case 'D':
case 'w': case 'W':
case 's': case 'S':
tok.mergeRanges(this.getTokenForShorthand(c));
end = true;
break;
case 'i': case 'I':
case 'c': case 'C':
c = this.processCIinCharacterClass(tok, c);
if (c < 0) end = true;
break;
case 'p':
case 'P':
int pstart = this.offset;
RangeToken tok2 = this.processBacksolidus_pP(c);
if (tok2 == null) throw this.ex("parser.atom.5", pstart);
tok.mergeRanges(tok2);
end = true;
break;
default:
c = this.decodeEscaped();
} // \ + c
} // backsolidus
// POSIX Character class such as [:alnum:]
else if (type == T_POSIX_CHARCLASS_START) {
int nameend = this.regex.indexOf(':', this.offset);
if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
boolean positive = true;
if (this.regex.charAt(this.offset) == '^') {
this.offset ++;
positive = false;
}
String name = this.regex.substring(this.offset, nameend);
RangeToken range = Token.getRange(name, positive,
this.isSet(RegularExpression.XMLSCHEMA_MODE));
if (range == null) throw this.ex("parser.cc.3", this.offset);
tok.mergeRanges(range);
end = true;
if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
throw this.ex("parser.cc.1", nameend);
this.offset = nameend+2;
}
else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
if (nrange) {
nrange = false;
if (useNrange) {
tok = (RangeToken) Token.complementRanges(tok);
}
else {
base.subtractRanges(tok);
tok = base;
}
}
RangeToken range2 = this.parseCharacterClass(false);
tok.subtractRanges(range2);
if (this.read() != T_CHAR || this.chardata != ']') {
throw this.ex("parser.cc.5", this.offset);
}
break; // Exit this loop
}
this.next();
if (!end) { // if not shorthands...
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
tok.addRange(c, c);
}
else {
addCaseInsensitiveChar(tok, c);
}
}
else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
throw this.ex("parser.cc.8", this.offset-1);
}
else {
this.next(); // Skips '-'
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
if (type == T_CHAR && this.chardata == ']') {
if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
tok.addRange(c, c);
}
else {
addCaseInsensitiveChar(tok, c);
}
tok.addRange('-', '-');
} else {
int rangeend = this.chardata;
if (type == T_BACKSOLIDUS) {
rangeend = this.decodeEscaped();
}
this.next();
if (c > rangeend) {
throw this.ex("parser.ope.3", this.offset-1);
}
if (!this.isSet(RegularExpression.IGNORE_CASE) ||
(c > 0xffff && rangeend > 0xffff)) {
tok.addRange(c, rangeend);
}
else {
addCaseInsensitiveCharRange(tok, c, rangeend);
}
}
}
}
if (this.isSet(RegularExpression.SPECIAL_COMMA)
&& this.read() == T_CHAR && this.chardata == ',') {
this.next();
}
firstloop = false;
}
if (this.read() == T_EOF) {
throw this.ex("parser.cc.2", this.offset);
}
if (!useNrange && nrange) {
base.subtractRanges(tok);
tok = base;
}
tok.sortRanges();
tok.compactRanges();
this.setContext(S_NORMAL);
this.next(); // Skips ']'
return tok;
}
/**
* '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
*/
protected RangeToken parseSetOperations() throws ParseException {
RangeToken tok = this.parseCharacterClass(false);
int type;
while ((type = this.read()) != T_RPAREN) {
int ch = this.chardata;
if (type == T_CHAR && (ch == '-' || ch == '&')
|| type == T_PLUS) {
this.next();
if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
RangeToken t2 = this.parseCharacterClass(false);
if (type == T_PLUS)
tok.mergeRanges(t2);
else if (ch == '-')
tok.subtractRanges(t2);
else if (ch == '&')
tok.intersectRanges(t2);
else
throw new RuntimeException("ASSERT");
} else {
throw ex("parser.ope.2", this.offset-1);
}
}
this.next();
return tok;
}
Token getTokenForShorthand(int ch) {
Token tok;
switch (ch) {
case 'd':
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
? Token.getRange("Nd", true) : Token.token_0to9;
break;
case 'D':
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
? Token.getRange("Nd", false) : Token.token_not_0to9;
break;
case 'w':
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
? Token.getRange("IsWord", true) : Token.token_wordchars;
break;
case 'W':
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
? Token.getRange("IsWord", false) : Token.token_not_wordchars;
break;
case 's':
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
? Token.getRange("IsSpace", true) : Token.token_spaces;
break;
case 'S':
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
? Token.getRange("IsSpace", false) : Token.token_not_spaces;
break;
default:
throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
}
return tok;
}
/**
*/
int decodeEscaped() throws ParseException {
if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
int c = this.chardata;
switch (c) {
case 'e': c = 0x1b; break; // ESCAPE U+001B
case 'f': c = '\f'; break; // FORM FEED U+000C
case 'n': c = '\n'; break; // LINE FEED U+000A
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
//case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
case 'x':
this.next();
if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
if (this.chardata == '{') {
int v1 = 0;
int uv = 0;
do {
this.next();
if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
if ((v1 = hexChar(this.chardata)) < 0)
break;
if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
uv = uv*16+v1;
} while (true);
if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
c = uv;
} else {
int v1 = 0;
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
int uv = v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
c = uv;
}
break;
case 'u':
int v1 = 0;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
int uv = v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
c = uv;
break;
case 'v':
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
this.next();
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
throw ex("parser.descape.1", this.offset-1);
uv = uv*16+v1;
if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
c = uv;
break;
case 'A':
case 'Z':
case 'z':
throw ex("parser.descape.5", this.offset-2);
default:
}
return c;
}
static private final int hexChar(int ch) {
if (ch < '0') return -1;
if (ch > 'f') return -1;
if (ch <= '9') return ch-'0';
if (ch < 'A') return -1;
if (ch <= 'F') return ch-'A'+10;
if (ch < 'a') return -1;
return ch-'a'+10;
}
static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
final int[] caseMap = CaseInsensitiveMap.get(c);
tok.addRange(c, c);
if (caseMap != null) {
for (int i=0; i<caseMap.length; i+=2) {
tok.addRange(caseMap[i], caseMap[i]);
}
}
}
static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
int[] caseMap;
int r1, r2;
if (start <= end) {
r1 = start;
r2 = end;
} else {
r1 = end;
r2 = start;
}
tok.addRange(r1, r2);
for (int ch = r1; ch <= r2; ch++) {
caseMap = CaseInsensitiveMap.get(ch);
if (caseMap != null) {
for (int i=0; i<caseMap.length; i+=2) {
tok.addRange(caseMap[i], caseMap[i]);
}
}
}
}
}
/**
* A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
* This engine does not conform to the POSIX regular expression.
*
* <hr width="50%">
* <h3>How to use</h3>
*
* <dl>
* <dt>A. Standard way
* <dd>
* <pre>
* RegularExpression re = new RegularExpression(<var>regex</var>);
* if (re.matches(text)) { ... }
* </pre>
*
* <dt>B. Capturing groups
* <dd>
* <pre>
* RegularExpression re = new RegularExpression(<var>regex</var>);
* Match match = new Match();
* if (re.matches(text, match)) {
* ... // You can refer captured texts with methods of the <code>Match</code> class.
* }
* </pre>
*
* </dl>
*
* <h4>Case-insensitive matching</h4>
* <pre>
* RegularExpression re = new RegularExpression(<var>regex</var>, "i");
* if (re.matches(text) >= 0) { ...}
* </pre>
*
* <h4>Options</h4>
* <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
* or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
* This <var>options</var> parameter consists of the following characters.
* </p>
* <dl>
* <dt><a name="I_OPTION"><code>"i"</code></a>
* <dd>This option indicates case-insensitive matching.
* <dt><a name="M_OPTION"><code>"m"</code></a>
* <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
* <dt><a name="S_OPTION"><code>"s"</code></a>
* <dd class="REGEX"><kbd>.</kbd> matches any one character.
* <dt><a name="U_OPTION"><code>"u"</code></a>
* <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
* <dt><a name="W_OPTION"><code>"w"</code></a>
* <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
* 'Unicode Regular Expression Guidelines' Revision 4.
* When "w" and "u" are specified at the same time,
* <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
* <dt><a name="COMMA_OPTION"><code>","</code></a>
* <dd>The parser treats a comma in a character class as a range separator.
* <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
* <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
*
* <dt><a name="X_OPTION"><code>"X"</code></a>
* <dd class="REGEX">
* By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
* The <code>match()</code> method does not do subsring matching
* but entire string matching.
*
* </dl>
*
* <hr width="50%">
* <h3>Syntax</h3>
* <table border="1" bgcolor="#ddeeff">
* <tr>
* <td>
* <h4>Differences from the Perl 5 regular expression</h4>
* <ul>
* <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
* <li>Supports subtraction, union, and intersection operations for character classes.
* <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
* <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
* <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
* <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
* <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
* </ul>
* </td>
* </tr>
* </table>
*
* <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
* <ul>
* <li>Character
* <dl>
* <dt class="REGEX"><kbd>.</kbd> (A period)
* <dd>Matches any one character except the following characters.
* <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
* PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
* <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
* <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
* it matches any character including the above four characters.
*
* <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
* <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
* CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
*
* <dt class="REGEX"><kbd>\c</kbd><var>C</var>
* <dd>Matches a control character.
* The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
* '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
* It matches a control character of which the character code is less than
* the character code of the <var>C</var> by 0x0040.
* <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
* and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
*
* <dt class="REGEX">a non-meta character
* <dd>Matches the character.
*
* <dt class="REGEX"><KBD>\</KBD> + a meta character
* <dd>Matches the meta character.
*
* <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
* <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
* You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
* variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
*
* <!--
* <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
* <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
* -->
*
* <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
* <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
*
* <dt class="REGEX"><kbd>\g</kbd>
* <dd>Matches a grapheme.
* <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
*
* <dt class="REGEX"><kbd>\X</kbd>
* <dd class="REGEX">Matches a combining character sequence.
* It is equivalent to <kbd>(?:\PM\pM*)</kbd>
* </dl>
* </li>
*
* <li>Character class
* <dl>
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
* <dd>Positive character class. It matches a character in ranges.
* <dd><var>R<sub>n</sub></var>:
* <ul>
* <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
* <p>This range matches the character.
* <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
* <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
+ * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
+ * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
* <p>...
* <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
* <p>These expressions specifies the same ranges as the following expressions.
* </ul>
* <p class="REGEX">Enumerated ranges are merged (union operation).
* <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
*
* <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
* <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
* <dd>Negative character class. It matches a character not in ranges.
*
* <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
* (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
* <dd>Subtraction or union or intersection for character classes.
* <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
* <dd>The result of this operations is a <u>positive character class</u>
* even if an expression includes any negative character classes.
* You have to take care on this in case-insensitive matching.
* For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
* which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
* But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
* it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
* though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
*
* <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
* <dd>Character class subtraction for the XML Schema.
* You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
*
* <dt class="REGEX"><kbd>\d</kbd>
* <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
*
* <dt class="REGEX"><kbd>\D</kbd>
* <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
*
* <dt class="REGEX"><kbd>\s</kbd>
* <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
*
* <dt class="REGEX"><kbd>\S</kbd>
* <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
*
* <dt class="REGEX"><kbd>\w</kbd>
* <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
*
* <dt class="REGEX"><kbd>\W</kbd>
* <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
*
* <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
* <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
* The following names are available:
* <dl>
* <dt>Unicode General Categories:
* <dd><kbd>
* L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
* Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
* </kbd>
* <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
* <dt>Unicode Blocks:
* <dd><kbd>
* Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
* IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
* Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
* Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
* Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
* Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
* Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
* Miscellaneous Technical, Control Pictures, Optical Character Recognition,
* Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
* Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
* Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
* Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
* Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
* Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
* Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
* Small Form Variants, Arabic Presentation Forms-B, Specials,
* Halfwidth and Fullwidth Forms
* </kbd>
* <dt>Others:
* <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
* <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
* <dd><kbd>UNASSGINED</kbd>
* (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
* </dl>
*
* <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
* <dd>Matches one character not in the specified General Category or the specified Block.
* </dl>
* </li>
*
* <li>Selection and Quantifier
* <dl>
* <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
* <dd>...
*
* <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
* <dd>Matches 0 or more <var>X</var>.
*
* <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
* <dd>Matches 1 or more <var>X</var>.
*
* <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
* <dd>Matches 0 or 1 <var>X</var>.
*
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
* <dd>Matches <var>number</var> times.
*
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
* <dd>...
*
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
* <dd>...
*
* <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
* <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
* <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
* <dd>Non-greedy matching.
* </dl>
* </li>
*
* <li>Grouping, Capturing, and Back-reference
* <dl>
* <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
* <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
* If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
* you have to write "<KBD>(?:foo)+</KBD>".
*
* <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
* <dd>Grouping with capturing.
* It make a group and applications can know
* where in target text a group matched with methods of a <code>Match</code> instance
* after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
* The 0th group means whole of this regular expression.
* The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
*
* <p>For instance, a regular expression is
* "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
* and target text is
* "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
* <ul>
* <li><code>Match.getCapturedText(0)</code>:
* "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
* <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
* <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
* </ul>
*
* <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
* <dd>
*
* <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
* <dd>Independent expression group. ................
*
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
* <dd>............................
* <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
* Note that it can not contain 'u'.
*
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
* <dd>......
* <dd>These expressions must be at the beginning of a group.
* </dl>
* </li>
*
* <li>Anchor
* <dl>
* <dt class="REGEX"><kbd>\A</kbd>
* <dd>Matches the beginnig of the text.
*
* <dt class="REGEX"><kbd>\Z</kbd>
* <dd>Matches the end of the text, or before an EOL character at the end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
*
* <dt class="REGEX"><kbd>\z</kbd>
* <dd>Matches the end of the text.
*
* <dt class="REGEX"><kbd>^</kbd>
* <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
* <dd>When <a href="#M_OPTION">a "m" option</a> is set,
* it matches the beginning of the text, or after one of EOL characters (
* LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
* PARAGRAPH SEPARATOR (U+2029).)
*
* <dt class="REGEX"><kbd>$</kbd>
* <dd>Matches the end of the text, or before an EOL character at the end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
* <dd>When <a href="#M_OPTION">a "m" option</a> is set,
* it matches the end of the text, or before an EOL character.
*
* <dt class="REGEX"><kbd>\b</kbd>
* <dd>Matches word boundary.
* (See <a href="#W_OPTION">a "w" option</a>)
*
* <dt class="REGEX"><kbd>\B</kbd>
* <dd>Matches non word boundary.
* (See <a href="#W_OPTION">a "w" option</a>)
*
* <dt class="REGEX"><kbd>\&lt;</kbd>
* <dd>Matches the beginning of a word.
* (See <a href="#W_OPTION">a "w" option</a>)
*
* <dt class="REGEX"><kbd>\&gt;</kbd>
* <dd>Matches the end of a word.
* (See <a href="#W_OPTION">a "w" option</a>)
* </dl>
* </li>
* <li>Lookahead and lookbehind
* <dl>
* <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
* <dd>Lookahead.
*
* <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
* <dd>Negative lookahead.
*
* <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
* <dd>Lookbehind.
* <dd>(Note for text capturing......)
*
* <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
* <dd>Negative lookbehind.
* </dl>
* </li>
*
* <li>Misc.
* <dl>
* <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
* <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
* <dd>......
* <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
* <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
* You can not write comments in character classes and before quantifiers.
* </dl>
* </li>
* </ul>
*
*
* <hr width="50%">
* <h3>BNF for the regular expression</h3>
* <pre>
* regex ::= ('(?' options ')')? term ('|' term)*
* term ::= factor+
* factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
* | '(?#' [^)]* ')'
* minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
* atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
* | '(?>' regex ')' | '(?' options ':' regex ')'
* | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
* options ::= [imsw]* ('-' [imsw]+)?
* anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
* looks ::= '(?=' regex ')' | '(?!' regex ')'
* | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
* char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
* category-block ::= '\' [pP] category-symbol-1
* | ('\p{' | '\P{') (category-symbol | block-name
* | other-properties) '}'
* category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
* category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
* | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
* | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
* | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
* | 'Sm' | 'Sc' | 'Sk' | 'So'
* block-name ::= (See above)
* other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
* character-1 ::= (any character except meta-characters)
*
* char-class ::= '[' ranges ']'
* | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
* ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
* range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
* | range-char | range-char '-' range-char
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
* code-point ::= '\x' hex-char hex-char
* | '\x{' hex-char+ '}'
* <!-- | '\u005c u' hex-char hex-char hex-char hex-char
* --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
* hex-char ::= [0-9a-fA-F]
* character-2 ::= (any character except \[]-,)
* </pre>
*
* <hr width="50%">
* <h3>TODO</h3>
* <ul>
* <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
* <ul>
* <li>2.4 Canonical Equivalents
* <li>Level 3
* </ul>
* <li>Parsing performance
* </ul>
*
* <hr width="50%">
*
* @xerces.internal
*
* @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
* @version $Id: RegularExpression.java 961928 2010-07-08 20:43:46Z knoaman $
*/
public static class RegularExpression implements java.io.Serializable {
private static final long serialVersionUID = 6242499334195006401L;
static final boolean DEBUG = false;
/**
* Compiles a token tree into an operation flow.
*/
private synchronized void compile(Token tok) {
if (this.operations != null)
return;
this.numberOfClosures = 0;
this.operations = this.compile(tok, null, false);
}
/**
* Converts a token to an operation.
*/
private Op compile(Token tok, Op next, boolean reverse) {
Op ret;
switch (tok.type) {
case Token.DOT:
ret = Op.createDot();
ret.next = next;
break;
case Token.CHAR:
ret = Op.createChar(tok.getChar());
ret.next = next;
break;
case Token.ANCHOR:
ret = Op.createAnchor(tok.getChar());
ret.next = next;
break;
case Token.RANGE:
case Token.NRANGE:
ret = Op.createRange(tok);
ret.next = next;
break;
case Token.CONCAT:
ret = next;
if (!reverse) {
for (int i = tok.size()-1; i >= 0; i --) {
ret = compile(tok.getChild(i), ret, false);
}
} else {
for (int i = 0; i < tok.size(); i ++) {
ret = compile(tok.getChild(i), ret, true);
}
}
break;
case Token.UNION:
Op.UnionOp uni = Op.createUnion(tok.size());
for (int i = 0; i < tok.size(); i ++) {
uni.addElement(compile(tok.getChild(i), next, reverse));
}
ret = uni; // ret.next is null.
break;
case Token.CLOSURE:
case Token.NONGREEDYCLOSURE:
Token child = tok.getChild(0);
int min = tok.getMin();
int max = tok.getMax();
if (min >= 0 && min == max) { // {n}
ret = next;
for (int i = 0; i < min; i ++) {
ret = compile(child, ret, reverse);
}
break;
}
if (min > 0 && max > 0)
max -= min;
if (max > 0) {
// X{2,6} -> XX(X(X(XX?)?)?)?
ret = next;
for (int i = 0; i < max; i ++) {
Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
q.next = next;
q.setChild(compile(child, ret, reverse));
ret = q;
}
} else {
Op.ChildOp op;
if (tok.type == Token.NONGREEDYCLOSURE) {
op = Op.createNonGreedyClosure();
} else { // Token.CLOSURE
op = Op.createClosure(this.numberOfClosures++);
}
op.next = next;
op.setChild(compile(child, op, reverse));
ret = op;
}
if (min > 0) {
for (int i = 0; i < min; i ++) {
ret = compile(child, ret, reverse);
}
}
break;
case Token.EMPTY:
ret = next;
break;
case Token.STRING:
ret = Op.createString(tok.getString());
ret.next = next;
break;
case Token.BACKREFERENCE:
ret = Op.createBackReference(tok.getReferenceNumber());
ret.next = next;
break;
case Token.PAREN:
if (tok.getParenNumber() == 0) {
ret = compile(tok.getChild(0), next, reverse);
} else if (reverse) {
next = Op.createCapture(tok.getParenNumber(), next);
next = compile(tok.getChild(0), next, reverse);
ret = Op.createCapture(-tok.getParenNumber(), next);
} else {
next = Op.createCapture(-tok.getParenNumber(), next);
next = compile(tok.getChild(0), next, reverse);
ret = Op.createCapture(tok.getParenNumber(), next);
}
break;
case Token.LOOKAHEAD:
ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
break;
case Token.NEGATIVELOOKAHEAD:
ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
break;
case Token.LOOKBEHIND:
ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
break;
case Token.NEGATIVELOOKBEHIND:
ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
break;
case Token.INDEPENDENT:
ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
break;
case Token.MODIFIERGROUP:
ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
((Token.ModifierToken)tok).getOptions(),
((Token.ModifierToken)tok).getOptionsMask());
break;
case Token.CONDITION:
Token.ConditionToken ctok = (Token.ConditionToken)tok;
int ref = ctok.refNumber;
Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
Op yes = compile(ctok.yes, next, reverse);
Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
ret = Op.createCondition(next, ref, condition, yes, no);
break;
default:
throw new RuntimeException("Unknown token type: "+tok.type);
} // switch (tok.type)
return ret;
}
//Public
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
*
* @return true if the target is matched to this regular expression.
*/
public boolean matches(char[] target) {
return this.matches(target, 0, target .length , (Match)null);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @return true if the target is matched to this regular expression.
*/
public boolean matches(char[] target, int start, int end) {
return this.matches(target, start, end, (Match)null);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
*
* @param match A Match instance for storing matching result.
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
*/
public boolean matches(char[] target, Match match) {
return this.matches(target, 0, target .length , match);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @param match A Match instance for storing matching result.
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
*/
public boolean matches(char[] target, int start, int end, Match match) {
synchronized (this) {
if (this.operations == null)
this.prepare();
if (this.context == null)
this.context = new Context();
}
Context con = null;
synchronized (this.context) {
con = this.context.inuse ? new Context() : this.context;
con.reset(target, start, end, this.numberOfClosures);
}
if (match != null) {
match.setNumberOfGroups(this.nofparen);
match.setSource(target);
} else if (this.hasBackReferences) {
match = new Match();
match.setNumberOfGroups(this.nofparen);
// Need not to call setSource() because
// a caller can not access this match instance.
}
con.match = match;
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
//System.err.println("DEBUG: matchEnd="+matchEnd);
if (matchEnd == con.limit) {
if (con.match != null) {
con.match.setBeginning(0, con.start);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
}
return false;
}
/*
* The pattern has only fixed string.
* The engine uses Boyer-Moore.
*/
if (this.fixedStringOnly) {
//System.err.println("DEBUG: fixed-only: "+this.fixedString);
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o >= 0) {
if (con.match != null) {
con.match.setBeginning(0, o);
con.match.setEnd(0, o+this.fixedString.length());
}
con.setInUse(false);
return true;
}
con.setInUse(false);
return false;
}
/*
* The pattern contains a fixed string.
* The engine checks with Boyer-Moore whether the text contains the fixed string or not.
* If not, it return with false.
*/
if (this.fixedString != null) {
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o < 0) {
//System.err.println("Non-match in fixed-string search.");
con.setInUse(false);
return false;
}
}
int limit = con.limit-this.minlength;
int matchStart;
int matchEnd = -1;
/*
* Checks whether the expression starts with ".*".
*/
if (this.operations != null
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this. match(con, this.operations, con.start, 1, this.options);
} else {
boolean previousIsEOL = true;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target [ matchStart ] ;
if (isEOLChar(ch)) {
previousIsEOL = true;
} else {
if (previousIsEOL) {
if (0 <= (matchEnd = this. match(con, this.operations,
matchStart, 1, this.options)))
break;
}
previousIsEOL = false;
}
}
}
}
/*
* Optimization against the first character.
*/
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
RangeToken range = this.firstChar;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target [matchStart] ;
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
}
if (!range.match(ch)) {
continue;
}
if (0 <= (matchEnd = this. match(con, this.operations,
matchStart, 1, this.options))) {
break;
}
}
}
/*
* Straightforward matching.
*/
else {
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
break;
}
}
if (matchEnd >= 0) {
if (con.match != null) {
con.match.setBeginning(0, matchStart);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
} else {
con.setInUse(false);
return false;
}
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
*
* @return true if the target is matched to this regular expression.
*/
public boolean matches(String target) {
return this.matches(target, 0, target .length() , (Match)null);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @return true if the target is matched to this regular expression.
*/
public boolean matches(String target, int start, int end) {
return this.matches(target, start, end, (Match)null);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
*
* @param match A Match instance for storing matching result.
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
*/
public boolean matches(String target, Match match) {
return this.matches(target, 0, target .length() , match);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @param match A Match instance for storing matching result.
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
*/
public boolean matches(String target, int start, int end, Match match) {
synchronized (this) {
if (this.operations == null)
this.prepare();
if (this.context == null)
this.context = new Context();
}
Context con = null;
synchronized (this.context) {
con = this.context.inuse ? new Context() : this.context;
con.reset(target, start, end, this.numberOfClosures);
}
if (match != null) {
match.setNumberOfGroups(this.nofparen);
match.setSource(target);
} else if (this.hasBackReferences) {
match = new Match();
match.setNumberOfGroups(this.nofparen);
// Need not to call setSource() because
// a caller can not access this match instance.
}
con.match = match;
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
if (DEBUG) {
System.err.println("target string="+target);
}
int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
if (DEBUG) {
System.err.println("matchEnd="+matchEnd);
System.err.println("con.limit="+con.limit);
}
if (matchEnd == con.limit) {
if (con.match != null) {
con.match.setBeginning(0, con.start);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
}
return false;
}
/*
* The pattern has only fixed string.
* The engine uses Boyer-Moore.
*/
if (this.fixedStringOnly) {
//System.err.println("DEBUG: fixed-only: "+this.fixedString);
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o >= 0) {
if (con.match != null) {
con.match.setBeginning(0, o);
con.match.setEnd(0, o+this.fixedString.length());
}
con.setInUse(false);
return true;
}
con.setInUse(false);
return false;
}
/*
* The pattern contains a fixed string.
* The engine checks with Boyer-Moore whether the text contains the fixed string or not.
* If not, it return with false.
*/
if (this.fixedString != null) {
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o < 0) {
//System.err.println("Non-match in fixed-string search.");
con.setInUse(false);
return false;
}
}
int limit = con.limit-this.minlength;
int matchStart;
int matchEnd = -1;
/*
* Checks whether the expression starts with ".*".
*/
if (this.operations != null
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this.match(con, this.operations, con.start, 1, this.options);
} else {
boolean previousIsEOL = true;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .charAt( matchStart ) ;
if (isEOLChar(ch)) {
previousIsEOL = true;
} else {
if (previousIsEOL) {
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options)))
break;
}
previousIsEOL = false;
}
}
}
}
/*
* Optimization against the first character.
*/
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
RangeToken range = this.firstChar;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .charAt( matchStart ) ;
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
}
if (!range.match(ch)) {
continue;
}
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options))) {
break;
}
}
}
/*
* Straightforward matching.
*/
else {
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options)))
break;
}
}
if (matchEnd >= 0) {
if (con.match != null) {
con.match.setBeginning(0, matchStart);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
} else {
con.setInUse(false);
return false;
}
}
/**
* @return -1 when not match; offset of the end of matched string when match.
*/
private int match(Context con, Op op, int offset, int dx, int opts) {
final ExpressionTarget target = con.target;
final Stack opStack = new Stack();
final IntStack dataStack = new IntStack();
final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
int retValue = -1;
boolean returned = false;
for (;;) {
if (op == null || offset > con.limit || offset < con.start) {
if (op == null) {
retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
}
else {
retValue = -1;
}
returned = true;
}
else {
retValue = -1;
// dx value is either 1 or -1
switch (op.type) {
case Op.CHAR:
{
final int o1 = (dx > 0) ? offset : offset -1;
if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
returned = true;
break;
}
offset += dx;
op = op.next;
}
break;
case Op.DOT:
{
int o1 = (dx > 0) ? offset : offset - 1;
if (o1 >= con.limit || o1 < 0) {
returned = true;
break;
}
if (isSet(opts, SINGLE_LINE)) {
if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
o1 += dx;
}
}
else {
int ch = target.charAt(o1);
if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
o1 += dx;
ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
}
if (isEOLChar(ch)) {
returned = true;
break;
}
}
offset = (dx > 0) ? o1 + 1 : o1;
op = op.next;
}
break;
case Op.RANGE:
case Op.NRANGE:
{
int o1 = (dx > 0) ? offset : offset -1;
if (o1 >= con.limit || o1 < 0) {
returned = true;
break;
}
int ch = target.charAt(offset);
if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
o1 += dx;
ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
}
final RangeToken tok = op.getToken();
if (!tok.match(ch)) {
returned = true;
break;
}
offset = (dx > 0) ? o1+1 : o1;
op = op.next;
}
break;
case Op.ANCHOR:
{
if (!matchAnchor(target, op, con, offset, opts)) {
returned = true;
break;
}
op = op.next;
}
break;
case Op.BACKREFERENCE:
{
int refno = op.getData();
if (refno <= 0 || refno >= this.nofparen) {
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
}
if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
returned = true;
break;
}
int o2 = con.match.getBeginning(refno);
int literallen = con.match.getEnd(refno)-o2;
if (dx > 0) {
if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
returned = true;
break;
}
offset += literallen;
}
else {
if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
returned = true;
break;
}
offset -= literallen;
}
op = op.next;
}
break;
case Op.STRING:
{
String literal = op.getString();
int literallen = literal.length();
if (dx > 0) {
if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
returned = true;
break;
}
offset += literallen;
}
else {
if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
returned = true;
break;
}
offset -= literallen;
}
op = op.next;
}
break;
case Op.CLOSURE:
{
// Saves current position to avoid zero-width repeats.
final int id = op.getData();
if (con.closureContexts[id].contains(offset)) {
returned = true;
break;
}
con.closureContexts[id].addOffset(offset);
}
// fall through
case Op.QUESTION:
{
opStack.push(op);
dataStack.push(offset);
op = op.getChild();
}
break;
case Op.NONGREEDYCLOSURE:
case Op.NONGREEDYQUESTION:
{
opStack.push(op);
dataStack.push(offset);
op = op.next;
}
break;
case Op.UNION:
if (op.size() == 0) {
returned = true;
}
else {
opStack.push(op);
dataStack.push(0);
dataStack.push(offset);
op = op.elementAt(0);
}
break;
case Op.CAPTURE:
{
final int refno = op.getData();
if (con.match != null) {
if (refno > 0) {
dataStack.push(con.match.getBeginning(refno));
con.match.setBeginning(refno, offset);
}
else {
final int index = -refno;
dataStack.push(con.match.getEnd(index));
con.match.setEnd(index, offset);
}
opStack.push(op);
dataStack.push(offset);
}
op = op.next;
}
break;
case Op.LOOKAHEAD:
case Op.NEGATIVELOOKAHEAD:
case Op.LOOKBEHIND:
case Op.NEGATIVELOOKBEHIND:
{
opStack.push(op);
dataStack.push(dx);
dataStack.push(offset);
dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
op = op.getChild();
}
break;
case Op.INDEPENDENT:
{
opStack.push(op);
dataStack.push(offset);
op = op.getChild();
}
break;
case Op.MODIFIER:
{
int localopts = opts;
localopts |= op.getData();
localopts &= ~op.getData2();
opStack.push(op);
dataStack.push(opts);
dataStack.push(offset);
opts = localopts;
op = op.getChild();
}
break;
case Op.CONDITION:
{
Op.ConditionOp cop = (Op.ConditionOp)op;
if (cop.refNumber > 0) {
if (cop.refNumber >= this.nofparen) {
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
}
if (con.match.getBeginning(cop.refNumber) >= 0
&& con.match.getEnd(cop.refNumber) >= 0) {
op = cop.yes;
}
else if (cop.no != null) {
op = cop.no;
}
else {
op = cop.next;
}
}
else {
opStack.push(op);
dataStack.push(offset);
op = cop.condition;
}
}
break;
default:
throw new RuntimeException("Unknown operation type: " + op.type);
}
}
// handle recursive operations
while (returned) {
// exhausted all the operations
if (opStack.isEmpty()) {
return retValue;
}
op = (Op) opStack.pop();
offset = dataStack.pop();
switch (op.type) {
case Op.CLOSURE:
case Op.QUESTION:
if (retValue < 0) {
op = op.next;
returned = false;
}
break;
case Op.NONGREEDYCLOSURE:
case Op.NONGREEDYQUESTION:
if (retValue < 0) {
op = op.getChild();
returned = false;
}
break;
case Op.UNION:
{
int unionIndex = dataStack.pop();
if (DEBUG) {
System.err.println("UNION: "+unionIndex+", ret="+retValue);
}
if (retValue < 0) {
if (++unionIndex < op.size()) {
opStack.push(op);
dataStack.push(unionIndex);
dataStack.push(offset);
op = op.elementAt(unionIndex);
returned = false;
}
else {
retValue = -1;
}
}
}
break;
case Op.CAPTURE:
final int refno = op.getData();
final int saved = dataStack.pop();
if (retValue < 0) {
if (refno > 0) {
con.match.setBeginning(refno, saved);
}
else {
con.match.setEnd(-refno, saved);
}
}
break;
case Op.LOOKAHEAD:
case Op.LOOKBEHIND:
{
dx = dataStack.pop();
if (0 <= retValue) {
op = op.next;
returned = false;
}
retValue = -1;
}
break;
case Op.NEGATIVELOOKAHEAD:
case Op.NEGATIVELOOKBEHIND:
{
dx = dataStack.pop();
if (0 > retValue) {
op = op.next;
returned = false;
}
retValue = -1;
}
break;
case Op.MODIFIER:
opts = dataStack.pop();
// fall through
case Op.INDEPENDENT:
if (retValue >= 0) {
offset = retValue;
op = op.next;
returned = false;
}
break;
case Op.CONDITION:
{
final Op.ConditionOp cop = (Op.ConditionOp)op;
if (0 <= retValue) {
op = cop.yes;
}
else if (cop.no != null) {
op = cop.no;
}
else {
op = cop.next;
}
}
returned = false;
break;
default:
break;
}
}
}
}
private boolean matchChar(int ch, int other, boolean ignoreCase) {
return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
}
boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
boolean go = false;
switch (op.getData()) {
case '^':
if (isSet(opts, MULTIPLE_LINES)) {
if (!(offset == con.start
|| offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1))))
return false;
} else {
if (offset != con.start)
return false;
}
break;
case '@': // Internal use only.
// The @ always matches line beginnings.
if (!(offset == con.start
|| offset > con.start && isEOLChar(target.charAt(offset-1))))
return false;
break;
case '$':
if (isSet(opts, MULTIPLE_LINES)) {
if (!(offset == con.limit
|| offset < con.limit && isEOLChar(target.charAt(offset))))
return false;
} else {
if (!(offset == con.limit
|| offset+1 == con.limit && isEOLChar(target.charAt(offset))
|| offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
&& target.charAt(offset+1) == LINE_FEED))
return false;
}
break;
case 'A':
if (offset != con.start) return false;
break;
case 'Z':
if (!(offset == con.limit
|| offset+1 == con.limit && isEOLChar(target.charAt(offset))
|| offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
&& target.charAt(offset+1) == LINE_FEED))
return false;
break;
case 'z':
if (offset != con.limit) return false;
break;
case 'b':
if (con.length == 0)
return false;
{
int after = getWordType(target, con.start, con.limit, offset, opts);
if (after == WT_IGNORE) return false;
int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
if (after == before) return false;
}
break;
case 'B':
if (con.length == 0)
go = true;
else {
int after = getWordType(target, con.start, con.limit, offset, opts);
go = after == WT_IGNORE
|| after == getPreviousWordType(target, con.start, con.limit, offset, opts);
}
if (!go) return false;
break;
case '<':
if (con.length == 0 || offset == con.limit) return false;
if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
return false;
break;
case '>':
if (con.length == 0 || offset == con.start) return false;
if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
return false;
break;
} // switch anchor type
return true;
}
private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
int offset, int opts) {
int ret = getWordType(target, begin, end, --offset, opts);
while (ret == WT_IGNORE)
ret = getWordType(target, begin, end, --offset, opts);
return ret;
}
private static final int getWordType(ExpressionTarget target, int begin, int end,
int offset, int opts) {
if (offset < begin || offset >= end) return WT_OTHER;
return getWordType0(target.charAt(offset) , opts);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
*
* @return true if the target is matched to this regular expression.
*/
public boolean matches(CharacterIterator target) {
return this.matches(target, (Match)null);
}
/**
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
*
* @param match A Match instance for storing matching result.
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
*/
public boolean matches(CharacterIterator target, Match match) {
int start = target.getBeginIndex();
int end = target.getEndIndex();
synchronized (this) {
if (this.operations == null)
this.prepare();
if (this.context == null)
this.context = new Context();
}
Context con = null;
synchronized (this.context) {
con = this.context.inuse ? new Context() : this.context;
con.reset(target, start, end, this.numberOfClosures);
}
if (match != null) {
match.setNumberOfGroups(this.nofparen);
match.setSource(target);
} else if (this.hasBackReferences) {
match = new Match();
match.setNumberOfGroups(this.nofparen);
// Need not to call setSource() because
// a caller can not access this match instance.
}
con.match = match;
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
//System.err.println("DEBUG: matchEnd="+matchEnd);
if (matchEnd == con.limit) {
if (con.match != null) {
con.match.setBeginning(0, con.start);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
}
return false;
}
/*
* The pattern has only fixed string.
* The engine uses Boyer-Moore.
*/
if (this.fixedStringOnly) {
//System.err.println("DEBUG: fixed-only: "+this.fixedString);
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o >= 0) {
if (con.match != null) {
con.match.setBeginning(0, o);
con.match.setEnd(0, o+this.fixedString.length());
}
con.setInUse(false);
return true;
}
con.setInUse(false);
return false;
}
/*
* The pattern contains a fixed string.
* The engine checks with Boyer-Moore whether the text contains the fixed string or not.
* If not, it return with false.
*/
if (this.fixedString != null) {
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o < 0) {
//System.err.println("Non-match in fixed-string search.");
con.setInUse(false);
return false;
}
}
int limit = con.limit-this.minlength;
int matchStart;
int matchEnd = -1;
/*
* Checks whether the expression starts with ".*".
*/
if (this.operations != null
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this.match(con, this.operations, con.start, 1, this.options);
} else {
boolean previousIsEOL = true;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .setIndex( matchStart ) ;
if (isEOLChar(ch)) {
previousIsEOL = true;
} else {
if (previousIsEOL) {
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options)))
break;
}
previousIsEOL = false;
}
}
}
}
/*
* Optimization against the first character.
*/
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
RangeToken range = this.firstChar;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .setIndex( matchStart ) ;
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
}
if (!range.match(ch)) {
continue;
}
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options))) {
break;
}
}
}
/*
* Straightforward matching.
*/
else {
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
break;
}
}
if (matchEnd >= 0) {
if (con.match != null) {
con.match.setBeginning(0, matchStart);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
} else {
con.setInUse(false);
return false;
}
}
// ================================================================
/**
* A regular expression.
* @serial
*/
String regex;
/**
* @serial
*/
int options;
/**
* The number of parenthesis in the regular expression.
* @serial
*/
int nofparen;
/**
* Internal representation of the regular expression.
* @serial
*/
Token tokentree;
boolean hasBackReferences = false;
transient int minlength;
transient Op operations = null;
transient int numberOfClosures;
transient Context context = null;
transient RangeToken firstChar = null;
transient String fixedString = null;
transient int fixedStringOptions;
transient BMPattern fixedStringTable = null;
transient boolean fixedStringOnly = false;
static abstract class ExpressionTarget {
abstract char charAt(int index);
abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
}
static final class StringTarget extends ExpressionTarget {
private String target;
StringTarget(String target) {
this.target = target;
}
final void resetTarget(String target) {
this.target = target;
}
final char charAt(int index) {
return target.charAt(index);
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
String part, int partlen) {
if (limit-offset < partlen) {
return false;
}
return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
int offset2, int partlen) {
if (limit-offset < partlen) {
return false;
}
return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
: target.regionMatches(offset, target, offset2, partlen);
}
}
static final class CharArrayTarget extends ExpressionTarget {
char[] target;
CharArrayTarget(char[] target) {
this.target = target;
}
final void resetTarget(char[] target) {
this.target = target;
}
char charAt(int index) {
return target[index];
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
String part, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
: regionMatches(offset, limit, part, partlen);
}
private final boolean regionMatches(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
if (target[offset++] != part.charAt(i++)) {
return false;
}
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
final char ch1 = target[offset++] ;
final char ch2 = part.charAt(i++);
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
: regionMatches(offset, limit, offset2, partlen);
}
private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
if ( target [ offset++ ] != target [ i++ ] )
return false;
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
final char ch1 = target[offset++] ;
final char ch2 = target[i++] ;
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
}
static final class CharacterIteratorTarget extends ExpressionTarget {
CharacterIterator target;
CharacterIteratorTarget(CharacterIterator target) {
this.target = target;
}
final void resetTarget(CharacterIterator target) {
this.target = target;
}
final char charAt(int index) {
return target.setIndex(index);
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
String part, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
: regionMatches(offset, limit, part, partlen);
}
private final boolean regionMatches(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
if (target.setIndex(offset++) != part.charAt(i++)) {
return false;
}
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
final char ch1 = target.setIndex(offset++) ;
final char ch2 = part.charAt(i++);
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
: regionMatches(offset, limit, offset2, partlen);
}
private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
if (target.setIndex(offset++) != target.setIndex(i++)) {
return false;
}
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
final char ch1 = target.setIndex(offset++) ;
final char ch2 = target.setIndex(i++) ;
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
}
static final class ClosureContext {
int[] offsets = new int[4];
int currentIndex = 0;
boolean contains(int offset) {
for (int i=0; i<currentIndex;++i) {
if (offsets[i] == offset) {
return true;
}
}
return false;
}
void reset() {
currentIndex = 0;
}
void addOffset(int offset) {
// We do not check for duplicates, caller is responsible for that
if (currentIndex == offsets.length) {
offsets = expandOffsets();
}
offsets[currentIndex++] = offset;
}
private int[] expandOffsets() {
final int len = offsets.length;
final int newLen = len << 1;
int[] newOffsets = new int[newLen];
System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
return newOffsets;
}
}
static final class Context {
int start;
int limit;
int length;
Match match;
boolean inuse = false;
ClosureContext[] closureContexts;
private StringTarget stringTarget;
private CharArrayTarget charArrayTarget;
private CharacterIteratorTarget characterIteratorTarget;
ExpressionTarget target;
Context() {
}
private void resetCommon(int nofclosures) {
this.length = this.limit-this.start;
setInUse(true);
this.match = null;
if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
this.closureContexts = new ClosureContext[nofclosures];
}
for (int i = 0; i < nofclosures; i ++) {
if (this.closureContexts[i] == null) {
this.closureContexts[i] = new ClosureContext();
}
else {
this.closureContexts[i].reset();
}
}
}
void reset(CharacterIterator target, int start, int limit, int nofclosures) {
if (characterIteratorTarget == null) {
characterIteratorTarget = new CharacterIteratorTarget(target);
}
else {
characterIteratorTarget.resetTarget(target);
}
this.target = characterIteratorTarget;
this.start = start;
this.limit = limit;
this.resetCommon(nofclosures);
}
void reset(String target, int start, int limit, int nofclosures) {
if (stringTarget == null) {
stringTarget = new StringTarget(target);
}
else {
stringTarget.resetTarget(target);
}
this.target = stringTarget;
this.start = start;
this.limit = limit;
this.resetCommon(nofclosures);
}
void reset(char[] target, int start, int limit, int nofclosures) {
if (charArrayTarget == null) {
charArrayTarget = new CharArrayTarget(target);
}
else {
charArrayTarget.resetTarget(target);
}
this.target = charArrayTarget;
this.start = start;
this.limit = limit;
this.resetCommon(nofclosures);
}
synchronized void setInUse(boolean inUse) {
this.inuse = inUse;
}
}
/**
* Prepares for matching. This method is called just before starting matching.
*/
void prepare() {
if (Op.COUNT) Op.nofinstances = 0;
this.compile(this.tokentree);
/*
if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
anchor.next = this.operations;
this.operations = anchor;
}
*/
if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
this.minlength = this.tokentree.getMinLength();
this.firstChar = null;
if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
&& !isSet(this.options, XMLSCHEMA_MODE)) {
RangeToken firstChar = Token.createRange();
int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
if (fresult == Token.FC_TERMINAL) {
firstChar.compactRanges();
this.firstChar = firstChar;
if (DEBUG)
System.err.println("DEBUG: Use the first character optimization: "+firstChar);
}
}
if (this.operations != null
&& (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
&& this.operations.next == null) {
if (DEBUG)
System.err.print(" *** Only fixed string! *** ");
this.fixedStringOnly = true;
if (this.operations.type == Op.STRING)
this.fixedString = this.operations.getString();
else if (this.operations.getData() >= 0x10000) { // Op.CHAR
this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
} else {
char[] ac = new char[1];
ac[0] = (char)this.operations.getData();
this.fixedString = new String(ac);
}
this.fixedStringOptions = this.options;
this.fixedStringTable = new BMPattern(this.fixedString, 256,
isSet(this.fixedStringOptions, IGNORE_CASE));
} else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
&& !isSet(this.options, XMLSCHEMA_MODE)) {
Token.FixedStringContainer container = new Token.FixedStringContainer();
this.tokentree.findFixedString(container, this.options);
this.fixedString = container.token == null ? null : container.token.getString();
this.fixedStringOptions = container.options;
if (this.fixedString != null && this.fixedString.length() < 2)
this.fixedString = null;
// This pattern has a fixed string of which length is more than one.
if (this.fixedString != null) {
this.fixedStringTable = new BMPattern(this.fixedString, 256,
isSet(this.fixedStringOptions, IGNORE_CASE));
if (DEBUG) {
System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
+"/" //+this.fixedString
+"/"+REUtil.createOptionString(this.fixedStringOptions));
System.err.print("String: ");
REUtil.dumpString(this.fixedString);
}
}
}
}
/**
* An option.
* If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
* captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
* does not capture.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
static final int MARK_PARENS = 1<<0;
*/
/**
* "i"
*/
static final int IGNORE_CASE = 1<<1;
/**
* "s"
*/
static final int SINGLE_LINE = 1<<2;
/**
* "m"
*/
static final int MULTIPLE_LINES = 1<<3;
/**
* "x"
*/
static final int EXTENDED_COMMENT = 1<<4;
/**
* This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
* @see #UNICODE_WORD_BOUNDARY
*/
static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
/**
* An option.
* This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
* <p>By default, the engine considers a position between a word character
* (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
* is a word boundary.
* <p>By this option, the engine checks word boundaries with the method of
* 'Unicode Regular Expression Guidelines' Revision 4.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
*/
static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
/**
* "H"
*/
static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
/**
* "F"
*/
static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
/**
* "X". XML Schema mode.
*/
static final int XMLSCHEMA_MODE = 1<<9;
/**
* ",".
*/
static final int SPECIAL_COMMA = 1<<10;
private static final boolean isSet(int options, int flag) {
return (options & flag) == flag;
}
/**
* Creates a new RegularExpression instance.
*
* @param regex A regular expression
* @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
*/
public RegularExpression(String regex) throws ParseException {
this(regex, null);
}
/**
* Creates a new RegularExpression instance with options.
*
* @param regex A regular expression
* @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
* @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
*/
public RegularExpression(String regex, String options) throws ParseException {
this.setPattern(regex, options);
}
/**
* Creates a new RegularExpression instance with options.
*
* @param regex A regular expression
* @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
* @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
*/
public RegularExpression(String regex, String options, Locale locale) throws ParseException {
this.setPattern(regex, options, locale);
}
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
this.regex = regex;
this.tokentree = tok;
this.nofparen = parens;
this.options = options;
this.hasBackReferences = hasBackReferences;
}
/**
*
*/
public void setPattern(String newPattern) throws ParseException {
this.setPattern(newPattern, Locale.getDefault());
}
public void setPattern(String newPattern, Locale locale) throws ParseException {
this.setPattern(newPattern, this.options, locale);
}
private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
this.regex = newPattern;
this.options = options;
RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
? new ParserForXMLSchema(locale) : new RegexParser(locale);
this.tokentree = rp.parse(this.regex, this.options);
this.nofparen = rp.parennumber;
this.hasBackReferences = rp.hasBackReferences;
this.operations = null;
this.context = null;
}
/**
*
*/
public void setPattern(String newPattern, String options) throws ParseException {
this.setPattern(newPattern, options, Locale.getDefault());
}
public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
this.setPattern(newPattern, REUtil.parseOptions(options), locale);
}
/**
*
*/
public String getPattern() {
return this.regex;
}
/**
* Represents this instence in String.
*/
public String toString() {
return this.tokentree.toString(this.options);
}
/**
* Returns a option string.
* The order of letters in it may be different from a string specified
* in a constructor or <code>setPattern()</code>.
*
* @see #RegularExpression(java.lang.String,java.lang.String)
* @see #setPattern(java.lang.String,java.lang.String)
*/
public String getOptions() {
return REUtil.createOptionString(this.options);
}
/**
* Return true if patterns are the same and the options are equivalent.
*/
public boolean equals(Object obj) {
if (obj == null) return false;
if (!(obj instanceof RegularExpression))
return false;
RegularExpression r = (RegularExpression)obj;
return this.regex.equals(r.regex) && this.options == r.options;
}
boolean equals(String pattern, int options) {
return this.regex.equals(pattern) && this.options == options;
}
/**
*
*/
public int hashCode() {
return (this.regex+"/"+this.getOptions()).hashCode();
}
/**
* Return the number of regular expression groups.
* This method returns 1 when the regular expression has no capturing-parenthesis.
*
*/
public int getNumberOfGroups() {
return this.nofparen;
}
// ================================================================
private static final int WT_IGNORE = 0;
private static final int WT_LETTER = 1;
private static final int WT_OTHER = 2;
private static final int getWordType0(char ch, int opts) {
if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
if (isSet(opts, USE_UNICODE_CATEGORY)) {
return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
}
return isWordChar(ch) ? WT_LETTER : WT_OTHER;
}
switch (Character.getType(ch)) {
case Character.UPPERCASE_LETTER: // L
case Character.LOWERCASE_LETTER: // L
case Character.TITLECASE_LETTER: // L
case Character.MODIFIER_LETTER: // L
case Character.OTHER_LETTER: // L
case Character.LETTER_NUMBER: // N
case Character.DECIMAL_DIGIT_NUMBER: // N
case Character.OTHER_NUMBER: // N
case Character.COMBINING_SPACING_MARK: // Mc
return WT_LETTER;
case Character.FORMAT: // Cf
case Character.NON_SPACING_MARK: // Mn
case Character.ENCLOSING_MARK: // Mc
return WT_IGNORE;
case Character.CONTROL: // Cc
switch (ch) {
case '\t':
case '\n':
case '\u000B':
case '\f':
case '\r':
return WT_OTHER;
default:
return WT_IGNORE;
}
default:
return WT_OTHER;
}
}
// ================================================================
static final int LINE_FEED = 0x000A;
static final int CARRIAGE_RETURN = 0x000D;
static final int LINE_SEPARATOR = 0x2028;
static final int PARAGRAPH_SEPARATOR = 0x2029;
private static final boolean isEOLChar(int ch) {
return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
|| ch == PARAGRAPH_SEPARATOR;
}
private static final boolean isWordChar(int ch) { // Legacy word characters
if (ch == '_') return true;
if (ch < '0') return false;
if (ch > 'z') return false;
if (ch <= '9') return true;
if (ch < 'A') return false;
if (ch <= 'Z') return true;
if (ch < 'a') return false;
return true;
}
private static final boolean matchIgnoreCase(int chardata, int ch) {
if (chardata == ch) return true;
if (chardata > 0xffff || ch > 0xffff) return false;
char uch1 = Character.toUpperCase((char)chardata);
char uch2 = Character.toUpperCase((char)ch);
if (uch1 == uch2) return true;
return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
}
}
/**
* @xerces.internal
*
* @version $Id: REUtil.java 828015 2009-10-21 13:56:13Z knoaman $
*/
final static class REUtil {
private REUtil() {
}
static final int composeFromSurrogates(int high, int low) {
return 0x10000 + ((high-0xd800)<<10) + low-0xdc00;
}
static final boolean isLowSurrogate(int ch) {
return (ch & 0xfc00) == 0xdc00;
}
static final boolean isHighSurrogate(int ch) {
return (ch & 0xfc00) == 0xd800;
}
static final String decomposeToSurrogates(int ch) {
char[] chs = new char[2];
ch -= 0x10000;
chs[0] = (char)((ch>>10)+0xd800);
chs[1] = (char)((ch&0x3ff)+0xdc00);
return new String(chs);
}
static final String substring(CharacterIterator iterator, int begin, int end) {
char[] src = new char[end-begin];
for (int i = 0; i < src.length; i ++)
src[i] = iterator.setIndex(i+begin);
return new String(src);
}
// ================================================================
static final int getOptionValue(int ch) {
int ret = 0;
switch (ch) {
case 'i':
ret = RegularExpression.IGNORE_CASE;
break;
case 'm':
ret = RegularExpression.MULTIPLE_LINES;
break;
case 's':
ret = RegularExpression.SINGLE_LINE;
break;
case 'x':
ret = RegularExpression.EXTENDED_COMMENT;
break;
case 'u':
ret = RegularExpression.USE_UNICODE_CATEGORY;
break;
case 'w':
ret = RegularExpression.UNICODE_WORD_BOUNDARY;
break;
case 'F':
ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION;
break;
case 'H':
ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
break;
case 'X':
ret = RegularExpression.XMLSCHEMA_MODE;
break;
case ',':
ret = RegularExpression.SPECIAL_COMMA;
break;
default:
}
return ret;
}
static final int parseOptions(String opts) throws ParseException {
if (opts == null) return 0;
int options = 0;
for (int i = 0; i < opts.length(); i ++) {
int v = getOptionValue(opts.charAt(i));
if (v == 0)
throw new ParseException("Unknown Option: "+opts.substring(i), -1);
options |= v;
}
return options;
}
static final String createOptionString(int options) {
StringBuilder sb = new StringBuilder(9);
if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0)
sb.append((char)'F');
if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0)
sb.append((char)'H');
if ((options & RegularExpression.XMLSCHEMA_MODE) != 0)
sb.append((char)'X');
if ((options & RegularExpression.IGNORE_CASE) != 0)
sb.append((char)'i');
if ((options & RegularExpression.MULTIPLE_LINES) != 0)
sb.append((char)'m');
if ((options & RegularExpression.SINGLE_LINE) != 0)
sb.append((char)'s');
if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0)
sb.append((char)'u');
if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0)
sb.append((char)'w');
if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
sb.append((char)'x');
if ((options & RegularExpression.SPECIAL_COMMA) != 0)
sb.append((char)',');
return sb.toString().intern();
}
// ================================================================
static String stripExtendedComment(String regex) {
int len = regex.length();
StringBuilder buffer = new StringBuilder(len);
int offset = 0;
int charClass = 0;
while (offset < len) {
int ch = regex.charAt(offset++);
// Skips a white space.
if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ') {
// if we are inside a character class, we keep the white space
if (charClass > 0) {
buffer.append((char)ch);
}
continue;
}
if (ch == '#') { // Skips chracters between '#' and a line end.
while (offset < len) {
ch = regex.charAt(offset++);
if (ch == '\r' || ch == '\n')
break;
}
continue;
}
int next; // Strips an escaped white space.
if (ch == '\\' && offset < len) {
if ((next = regex.charAt(offset)) == '#'
|| next == '\t' || next == '\n' || next == '\f'
|| next == '\r' || next == ' ') {
buffer.append((char)next);
offset ++;
} else { // Other escaped character.
buffer.append((char)'\\');
buffer.append((char)next);
offset ++;
}
}
else if (ch == '[') {
charClass++;
buffer.append((char)ch);
if (offset < len) {
next = regex.charAt(offset);
if (next == '[' || next ==']') {
buffer.append((char)next);
offset ++;
}
else if (next == '^' && offset + 1 < len) {
next = regex.charAt(offset + 1);
if (next == '[' || next ==']') {
buffer.append((char)'^');
buffer.append((char)next);
offset += 2;
}
}
}
}
else {
if (charClass > 0 && ch == ']') {
--charClass;
}
buffer.append((char)ch);
}
}
return buffer.toString();
}
// ================================================================
/**
* Sample entry.
* <div>Usage: <KBD>org.apache.xerces.utils.regex.REUtil &lt;regex&gt; &lt;string&gt;</KBD></div>
*/
public static void main(String[] argv) {
String pattern = null;
try {
String options = "";
String target = null;
if( argv.length == 0 ) {
System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" );
System.exit( 0 );
}
for (int i = 0; i < argv.length; i ++) {
if (argv[i].length() == 0 || argv[i].charAt(0) != '-') {
if (pattern == null)
pattern = argv[i];
else if (target == null)
target = argv[i];
else
System.err.println("Unnecessary: "+argv[i]);
} else if (argv[i].equals("-i")) {
options += "i";
} else if (argv[i].equals("-m")) {
options += "m";
} else if (argv[i].equals("-s")) {
options += "s";
} else if (argv[i].equals("-u")) {
options += "u";
} else if (argv[i].equals("-w")) {
options += "w";
} else if (argv[i].equals("-X")) {
options += "X";
} else {
System.err.println("Unknown option: "+argv[i]);
}
}
RegularExpression reg = new RegularExpression(pattern, options);
System.out.println("RegularExpression: "+reg);
Match match = new Match();
reg.matches(target, match);
for (int i = 0; i < match.getNumberOfGroups(); i ++) {
if (i == 0 ) System.out.print("Matched range for the whole pattern: ");
else System.out.print("["+i+"]: ");
if (match.getBeginning(i) < 0)
System.out.println("-1");
else {
System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", ");
System.out.println("\""+match.getCapturedText(i)+"\"");
}
}
} catch (ParseException pe) {
if (pattern == null) {
pe.printStackTrace();
} else {
System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage());
String indent = " ";
System.err.println(indent+pattern);
int loc = pe.getLocation();
if (loc >= 0) {
System.err.print(indent);
for (int i = 0; i < loc; i ++) System.err.print("-");
System.err.println("^");
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
static final int CACHESIZE = 20;
static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE];
/**
* Creates a RegularExpression instance.
* This method caches created instances.
*
* @see RegularExpression#RegularExpression(java.lang.String, java.lang.String)
*/
public static RegularExpression createRegex(String pattern, String options)
throws ParseException {
RegularExpression re = null;
int intOptions = REUtil.parseOptions(options);
synchronized (REUtil.regexCache) {
int i;
for (i = 0; i < REUtil.CACHESIZE; i ++) {
RegularExpression cached = REUtil.regexCache[i];
if (cached == null) {
i = -1;
break;
}
if (cached.equals(pattern, intOptions)) {
re = cached;
break;
}
}
if (re != null) {
if (i != 0) {
System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i);
REUtil.regexCache[0] = re;
}
} else {
re = new RegularExpression(pattern, options);
System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1);
REUtil.regexCache[0] = re;
}
}
return re;
}
/**
*
* @see RegularExpression#matches(java.lang.String)
*/
public static boolean matches(String regex, String target) throws ParseException {
return REUtil.createRegex(regex, null).matches(target);
}
/**
*
* @see RegularExpression#matches(java.lang.String)
*/
public static boolean matches(String regex, String options, String target) throws ParseException {
return REUtil.createRegex(regex, options).matches(target);
}
// ================================================================
/**
*
*/
public static String quoteMeta(String literal) {
int len = literal.length();
StringBuffer buffer = null;
for (int i = 0; i < len; i ++) {
int ch = literal.charAt(i);
if (".*+?{[()|\\^$".indexOf(ch) >= 0) {
if (buffer == null) {
buffer = new StringBuffer(i+(len-i)*2);
if (i > 0) buffer.append(literal.substring(0, i));
}
buffer.append((char)'\\');
buffer.append((char)ch);
} else if (buffer != null)
buffer.append((char)ch);
}
return buffer != null ? buffer.toString() : literal;
}
// ================================================================
static void dumpString(String v) {
for (int i = 0; i < v.length(); i ++) {
System.out.print(Integer.toHexString(v.charAt(i)));
System.out.print(" ");
}
System.out.println();
}
}
/**
* This class represents a node in parse tree.
*
* @xerces.internal
*
* @version $Id: Token.java 831926 2009-11-02 15:38:53Z knoaman $
*/
static class Token implements java.io.Serializable {
private static final long serialVersionUID = 8484976002585487481L;
static final boolean COUNTTOKENS = true;
static int tokens = 0;
static final int CHAR = 0; // Literal char
static final int DOT = 11; // .
static final int CONCAT = 1; // XY
static final int UNION = 2; // X|Y|Z
static final int CLOSURE = 3; // X*
static final int RANGE = 4; // [a-zA-Z] etc.
static final int NRANGE = 5; // [^a-zA-Z] etc.
static final int PAREN = 6; // (X) or (?:X)
static final int EMPTY = 7; //
static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
static final int NONGREEDYCLOSURE = 9; // *? +?
static final int STRING = 10; // strings
static final int BACKREFERENCE = 12; // back references
static final int LOOKAHEAD = 20; // (?=...)
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
static final int LOOKBEHIND = 22; // (?<=...)
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
static final int INDEPENDENT = 24; // (?>...)
static final int MODIFIERGROUP = 25; // (?ims-ims:...)
static final int CONDITION = 26; // (?(...)yes|no)
static final int UTF16_MAX = 0x10ffff;
final int type;
static Token token_dot;
static Token token_0to9;
static Token token_wordchars;
static Token token_not_0to9;
static Token token_not_wordchars;
static Token token_spaces;
static Token token_not_spaces;
static Token token_empty;
static Token token_linebeginning;
static Token token_linebeginning2;
static Token token_lineend;
static Token token_stringbeginning;
static Token token_stringend;
static Token token_stringend2;
static Token token_wordedge;
static Token token_not_wordedge;
static Token token_wordbeginning;
static Token token_wordend;
static {
Token.token_empty = new Token(Token.EMPTY);
Token.token_linebeginning = Token.createAnchor('^');
Token.token_linebeginning2 = Token.createAnchor('@');
Token.token_lineend = Token.createAnchor('$');
Token.token_stringbeginning = Token.createAnchor('A');
Token.token_stringend = Token.createAnchor('z');
Token.token_stringend2 = Token.createAnchor('Z');
Token.token_wordedge = Token.createAnchor('b');
Token.token_not_wordedge = Token.createAnchor('B');
Token.token_wordbeginning = Token.createAnchor('<');
Token.token_wordend = Token.createAnchor('>');
Token.token_dot = new Token(Token.DOT);
Token.token_0to9 = Token.createRange();
Token.token_0to9.addRange('0', '9');
Token.token_wordchars = Token.createRange();
Token.token_wordchars.addRange('0', '9');
Token.token_wordchars.addRange('A', 'Z');
Token.token_wordchars.addRange('_', '_');
Token.token_wordchars.addRange('a', 'z');
Token.token_spaces = Token.createRange();
Token.token_spaces.addRange('\t', '\t');
Token.token_spaces.addRange('\n', '\n');
Token.token_spaces.addRange('\f', '\f');
Token.token_spaces.addRange('\r', '\r');
Token.token_spaces.addRange(' ', ' ');
Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
}
static Token.ParenToken createLook(int type, Token child) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ParenToken(type, child, 0);
}
static Token.ParenToken createParen(Token child, int pnumber) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ParenToken(Token.PAREN, child, pnumber);
}
static Token.ClosureToken createClosure(Token tok) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ClosureToken(Token.CLOSURE, tok);
}
static Token.ClosureToken createNGClosure(Token tok) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
}
static Token.ConcatToken createConcat(Token tok1, Token tok2) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ConcatToken(tok1, tok2);
}
static Token.UnionToken createConcat() {
if (COUNTTOKENS) Token.tokens ++;
return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
}
static Token.UnionToken createUnion() {
if (COUNTTOKENS) Token.tokens ++;
return new Token.UnionToken(Token.UNION);
}
static Token createEmpty() {
return Token.token_empty;
}
static RangeToken createRange() {
if (COUNTTOKENS) Token.tokens ++;
return new RangeToken(Token.RANGE);
}
static RangeToken createNRange() {
if (COUNTTOKENS) Token.tokens ++;
return new RangeToken(Token.NRANGE);
}
static Token.CharToken createChar(int ch) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.CharToken(Token.CHAR, ch);
}
static private Token.CharToken createAnchor(int ch) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.CharToken(Token.ANCHOR, ch);
}
static Token.StringToken createBackReference(int refno) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.StringToken(Token.BACKREFERENCE, null, refno);
}
static Token.StringToken createString(String str) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.StringToken(Token.STRING, str, 0);
}
static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ModifierToken(child, add, mask);
}
static Token.ConditionToken createCondition(int refno, Token condition,
Token yespat, Token nopat) {
if (COUNTTOKENS) Token.tokens ++;
return new Token.ConditionToken(refno, condition, yespat, nopat);
}
protected Token(int type) {
this.type = type;
}
/**
* A number of children.
*/
int size() {
return 0;
}
Token getChild(int index) {
return null;
}
void addChild(Token tok) {
throw new RuntimeException("Not supported.");
}
// for RANGE or NRANGE
protected void addRange(int start, int end) {
throw new RuntimeException("Not supported.");
}
protected void sortRanges() {
throw new RuntimeException("Not supported.");
}
protected void compactRanges() {
throw new RuntimeException("Not supported.");
}
protected void mergeRanges(Token tok) {
throw new RuntimeException("Not supported.");
}
protected void subtractRanges(Token tok) {
throw new RuntimeException("Not supported.");
}
protected void intersectRanges(Token tok) {
throw new RuntimeException("Not supported.");
}
static Token complementRanges(Token tok) {
return RangeToken.complementRanges(tok);
}
void setMin(int min) { // for CLOSURE
}
void setMax(int max) { // for CLOSURE
}
int getMin() { // for CLOSURE
return -1;
}
int getMax() { // for CLOSURE
return -1;
}
int getReferenceNumber() { // for STRING
return 0;
}
String getString() { // for STRING
return null;
}
int getParenNumber() {
return 0;
}
int getChar() {
return -1;
}
public String toString() {
return this.toString(0);
}
public String toString(int options) {
return this.type == Token.DOT ? "." : "";
}
/**
* How many characters are needed?
*/
final int getMinLength() {
switch (this.type) {
case CONCAT:
int sum = 0;
for (int i = 0; i < this.size(); i ++)
sum += this.getChild(i).getMinLength();
return sum;
case CONDITION:
case UNION:
if (this.size() == 0)
return 0;
int ret = this.getChild(0).getMinLength();
for (int i = 1; i < this.size(); i ++) {
int min = this.getChild(i).getMinLength();
if (min < ret) ret = min;
}
return ret;
case CLOSURE:
case NONGREEDYCLOSURE:
if (this.getMin() >= 0)
return this.getMin() * this.getChild(0).getMinLength();
return 0;
case EMPTY:
case ANCHOR:
return 0;
case DOT:
case CHAR:
case RANGE:
case NRANGE:
return 1;
case INDEPENDENT:
case PAREN:
case MODIFIERGROUP:
return this.getChild(0).getMinLength();
case BACKREFERENCE:
return 0; // *******
case STRING:
return this.getString().length();
case LOOKAHEAD:
case NEGATIVELOOKAHEAD:
case LOOKBEHIND:
case NEGATIVELOOKBEHIND:
return 0; // ***** Really?
default:
throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
}
}
final int getMaxLength() {
switch (this.type) {
case CONCAT:
int sum = 0;
for (int i = 0; i < this.size(); i ++) {
int d = this.getChild(i).getMaxLength();
if (d < 0) return -1;
sum += d;
}
return sum;
case CONDITION:
case UNION:
if (this.size() == 0)
return 0;
int ret = this.getChild(0).getMaxLength();
for (int i = 1; ret >= 0 && i < this.size(); i ++) {
int max = this.getChild(i).getMaxLength();
if (max < 0) { // infinity
ret = -1;
break;
}
if (max > ret) ret = max;
}
return ret;
case CLOSURE:
case NONGREEDYCLOSURE:
if (this.getMax() >= 0)
// When this.child.getMaxLength() < 0,
// this returns minus value
return this.getMax() * this.getChild(0).getMaxLength();
return -1;
case EMPTY:
case ANCHOR:
return 0;
case CHAR:
return 1;
case DOT:
case RANGE:
case NRANGE:
return 2;
case INDEPENDENT:
case PAREN:
case MODIFIERGROUP:
return this.getChild(0).getMaxLength();
case BACKREFERENCE:
return -1; // ******
case STRING:
return this.getString().length();
case LOOKAHEAD:
case NEGATIVELOOKAHEAD:
case LOOKBEHIND:
case NEGATIVELOOKBEHIND:
return 0; // ***** Really?
default:
throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
}
}
static final int FC_CONTINUE = 0;
static final int FC_TERMINAL = 1;
static final int FC_ANY = 2;
private static final boolean isSet(int options, int flag) {
return (options & flag) == flag;
}
final int analyzeFirstCharacter(RangeToken result, int options) {
switch (this.type) {
case CONCAT:
int ret = FC_CONTINUE;
for (int i = 0; i < this.size(); i ++)
if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
break;
return ret;
case UNION:
if (this.size() == 0)
return FC_CONTINUE;
/*
* a|b|c -> FC_TERMINAL
* a|.|c -> FC_ANY
* a|b| -> FC_CONTINUE
*/
int ret2 = FC_CONTINUE;
boolean hasEmpty = false;
for (int i = 0; i < this.size(); i ++) {
ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
if (ret2 == FC_ANY)
break;
else if (ret2 == FC_CONTINUE)
hasEmpty = true;
}
return hasEmpty ? FC_CONTINUE : ret2;
case CONDITION:
int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
if (this.size() == 1) return FC_CONTINUE;
if (ret3 == FC_ANY) return ret3;
int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
if (ret4 == FC_ANY) return ret4;
return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
case CLOSURE:
case NONGREEDYCLOSURE:
this.getChild(0).analyzeFirstCharacter(result, options);
return FC_CONTINUE;
case EMPTY:
case ANCHOR:
return FC_CONTINUE;
case CHAR:
int ch = this.getChar();
result.addRange(ch, ch);
if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
ch = Character.toUpperCase((char)ch);
result.addRange(ch, ch);
ch = Character.toLowerCase((char)ch);
result.addRange(ch, ch);
}
return FC_TERMINAL;
case DOT:
return FC_ANY;
case RANGE:
result.mergeRanges(this);
return FC_TERMINAL;
case NRANGE: // ****
result.mergeRanges(Token.complementRanges(this));
return FC_TERMINAL;
case INDEPENDENT:
case PAREN:
return this.getChild(0).analyzeFirstCharacter(result, options);
case MODIFIERGROUP:
options |= ((ModifierToken)this).getOptions();
options &= ~((ModifierToken)this).getOptionsMask();
return this.getChild(0).analyzeFirstCharacter(result, options);
case BACKREFERENCE:
result.addRange(0, UTF16_MAX); // **** We can not optimize.
return FC_ANY;
case STRING:
int cha = this.getString().charAt(0);
int ch2;
if (REUtil.isHighSurrogate(cha)
&& this.getString().length() >= 2
&& REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
cha = REUtil.composeFromSurrogates(cha, ch2);
result.addRange(cha, cha);
if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
cha = Character.toUpperCase((char)cha);
result.addRange(cha, cha);
cha = Character.toLowerCase((char)cha);
result.addRange(cha, cha);
}
return FC_TERMINAL;
case LOOKAHEAD:
case NEGATIVELOOKAHEAD:
case LOOKBEHIND:
case NEGATIVELOOKBEHIND:
return FC_CONTINUE;
default:
throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
}
}
private final boolean isShorterThan(Token tok) {
if (tok == null) return false;
/*
int mylength;
if (this.type == STRING) mylength = this.getString().length();
else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
int otherlength;
if (tok.type == STRING) otherlength = tok.getString().length();
else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
*/
int mylength;
if (this.type == STRING) mylength = this.getString().length();
else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
int otherlength;
if (tok.type == STRING) otherlength = tok.getString().length();
else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
return mylength < otherlength;
}
static class FixedStringContainer {
Token token = null;
int options = 0;
FixedStringContainer() {
}
}
final void findFixedString(FixedStringContainer container, int options) {
switch (this.type) {
case CONCAT:
Token prevToken = null;
int prevOptions = 0;
for (int i = 0; i < this.size(); i ++) {
this.getChild(i).findFixedString(container, options);
if (prevToken == null || prevToken.isShorterThan(container.token)) {
prevToken = container.token;
prevOptions = container.options;
}
}
container.token = prevToken;
container.options = prevOptions;
return;
case UNION:
case CLOSURE:
case NONGREEDYCLOSURE:
case EMPTY:
case ANCHOR:
case RANGE:
case DOT:
case NRANGE:
case BACKREFERENCE:
case LOOKAHEAD:
case NEGATIVELOOKAHEAD:
case LOOKBEHIND:
case NEGATIVELOOKBEHIND:
case CONDITION:
container.token = null;
return;
case CHAR: // Ignore CHAR tokens.
container.token = null; // **
return; // **
case STRING:
container.token = this;
container.options = options;
return;
case INDEPENDENT:
case PAREN:
this.getChild(0).findFixedString(container, options);
return;
case MODIFIERGROUP:
options |= ((ModifierToken)this).getOptions();
options &= ~((ModifierToken)this).getOptionsMask();
this.getChild(0).findFixedString(container, options);
return;
default:
throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
}
}
boolean match(int ch) {
throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
}
// ------------------------------------------------------
private final static Hashtable categories = new Hashtable();
private final static Hashtable categories2 = new Hashtable();
private static final String[] categoryNames = {
"Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
"Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
"Pi", "Pf", // 29, 30
"L", "M", "N", "Z", "C", "P", "S", // 31-37
};
// Schema Rec. {Datatypes} - Punctuation
static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
static final int CHAR_LETTER = 31;
static final int CHAR_MARK = 32;
static final int CHAR_NUMBER = 33;
static final int CHAR_SEPARATOR = 34;
static final int CHAR_OTHER = 35;
static final int CHAR_PUNCTUATION = 36;
static final int CHAR_SYMBOL = 37;
//blockNames in UNICODE 3.1 that supported by XML Schema REC
private static final String[] blockNames = {
/*0000..007F;*/ "Basic Latin",
/*0080..00FF;*/ "Latin-1 Supplement",
/*0100..017F;*/ "Latin Extended-A",
/*0180..024F;*/ "Latin Extended-B",
/*0250..02AF;*/ "IPA Extensions",
/*02B0..02FF;*/ "Spacing Modifier Letters",
/*0300..036F;*/ "Combining Diacritical Marks",
/*0370..03FF;*/ "Greek",
/*0400..04FF;*/ "Cyrillic",
/*0530..058F;*/ "Armenian",
/*0590..05FF;*/ "Hebrew",
/*0600..06FF;*/ "Arabic",
/*0700..074F;*/ "Syriac",
/*0780..07BF;*/ "Thaana",
/*0900..097F;*/ "Devanagari",
/*0980..09FF;*/ "Bengali",
/*0A00..0A7F;*/ "Gurmukhi",
/*0A80..0AFF;*/ "Gujarati",
/*0B00..0B7F;*/ "Oriya",
/*0B80..0BFF;*/ "Tamil",
/*0C00..0C7F;*/ "Telugu",
/*0C80..0CFF;*/ "Kannada",
/*0D00..0D7F;*/ "Malayalam",
/*0D80..0DFF;*/ "Sinhala",
/*0E00..0E7F;*/ "Thai",
/*0E80..0EFF;*/ "Lao",
/*0F00..0FFF;*/ "Tibetan",
/*1000..109F;*/ "Myanmar",
/*10A0..10FF;*/ "Georgian",
/*1100..11FF;*/ "Hangul Jamo",
/*1200..137F;*/ "Ethiopic",
/*13A0..13FF;*/ "Cherokee",
/*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
/*1680..169F;*/ "Ogham",
/*16A0..16FF;*/ "Runic",
/*1780..17FF;*/ "Khmer",
/*1800..18AF;*/ "Mongolian",
/*1E00..1EFF;*/ "Latin Extended Additional",
/*1F00..1FFF;*/ "Greek Extended",
/*2000..206F;*/ "General Punctuation",
/*2070..209F;*/ "Superscripts and Subscripts",
/*20A0..20CF;*/ "Currency Symbols",
/*20D0..20FF;*/ "Combining Marks for Symbols",
/*2100..214F;*/ "Letterlike Symbols",
/*2150..218F;*/ "Number Forms",
/*2190..21FF;*/ "Arrows",
/*2200..22FF;*/ "Mathematical Operators",
/*2300..23FF;*/ "Miscellaneous Technical",
/*2400..243F;*/ "Control Pictures",
/*2440..245F;*/ "Optical Character Recognition",
/*2460..24FF;*/ "Enclosed Alphanumerics",
/*2500..257F;*/ "Box Drawing",
/*2580..259F;*/ "Block Elements",
/*25A0..25FF;*/ "Geometric Shapes",
/*2600..26FF;*/ "Miscellaneous Symbols",
/*2700..27BF;*/ "Dingbats",
/*2800..28FF;*/ "Braille Patterns",
/*2E80..2EFF;*/ "CJK Radicals Supplement",
/*2F00..2FDF;*/ "Kangxi Radicals",
/*2FF0..2FFF;*/ "Ideographic Description Characters",
/*3000..303F;*/ "CJK Symbols and Punctuation",
/*3040..309F;*/ "Hiragana",
/*30A0..30FF;*/ "Katakana",
/*3100..312F;*/ "Bopomofo",
/*3130..318F;*/ "Hangul Compatibility Jamo",
/*3190..319F;*/ "Kanbun",
/*31A0..31BF;*/ "Bopomofo Extended",
/*3200..32FF;*/ "Enclosed CJK Letters and Months",
/*3300..33FF;*/ "CJK Compatibility",
/*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
/*4E00..9FFF;*/ "CJK Unified Ideographs",
/*A000..A48F;*/ "Yi Syllables",
/*A490..A4CF;*/ "Yi Radicals",
/*AC00..D7A3;*/ "Hangul Syllables",
/*E000..F8FF;*/ "Private Use",
/*F900..FAFF;*/ "CJK Compatibility Ideographs",
/*FB00..FB4F;*/ "Alphabetic Presentation Forms",
/*FB50..FDFF;*/ "Arabic Presentation Forms-A",
/*FE20..FE2F;*/ "Combining Half Marks",
/*FE30..FE4F;*/ "CJK Compatibility Forms",
/*FE50..FE6F;*/ "Small Form Variants",
/*FE70..FEFE;*/ "Arabic Presentation Forms-B",
/*FEFF..FEFF;*/ "Specials",
/*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
//missing Specials add manually
/*10300..1032F;*/ "Old Italic", // 84
/*10330..1034F;*/ "Gothic",
/*10400..1044F;*/ "Deseret",
/*1D000..1D0FF;*/ "Byzantine Musical Symbols",
/*1D100..1D1FF;*/ "Musical Symbols",
/*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
/*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
/*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
/*E0000..E007F;*/ "Tags",
//missing 2 private use add manually
};
//ADD THOSE MANUALLY
//F0000..FFFFD; "Private Use",
//100000..10FFFD; "Private Use"
//FFF0..FFFD; "Specials",
static final String blockRanges =
"\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
+"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
+"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
+"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
+"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
+"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
+"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
+"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
+"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
+"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
+"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
static final int[] nonBMPBlockRanges = {
0x10300, 0x1032F, // 84
0x10330, 0x1034F,
0x10400, 0x1044F,
0x1D000, 0x1D0FF,
0x1D100, 0x1D1FF,
0x1D400, 0x1D7FF,
0x20000, 0x2A6D6,
0x2F800, 0x2FA1F,
0xE0000, 0xE007F
};
private static final int NONBMP_BLOCK_START = 84;
static protected RangeToken getRange(String name, boolean positive) {
if (Token.categories.isEmpty()) {
synchronized (Token.categories) {
Token[] ranges = new Token[Token.categoryNames.length];
for (int i = 0; i < ranges.length; i ++) {
ranges[i] = Token.createRange();
}
int type;
for (int i = 0; i < 0x10000; i ++) {
type = Character.getType((char)i);
if (type == Character.START_PUNCTUATION ||
type == Character.END_PUNCTUATION) {
//build table of Pi values
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
i == 0x201F || i == 0x2039) {
type = CHAR_INIT_QUOTE;
}
//build table of Pf values
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
type = CHAR_FINAL_QUOTE;
}
}
ranges[type].addRange(i, i);
switch (type) {
case Character.UPPERCASE_LETTER:
case Character.LOWERCASE_LETTER:
case Character.TITLECASE_LETTER:
case Character.MODIFIER_LETTER:
case Character.OTHER_LETTER:
type = CHAR_LETTER;
break;
case Character.NON_SPACING_MARK:
case Character.COMBINING_SPACING_MARK:
case Character.ENCLOSING_MARK:
type = CHAR_MARK;
break;
case Character.DECIMAL_DIGIT_NUMBER:
case Character.LETTER_NUMBER:
case Character.OTHER_NUMBER:
type = CHAR_NUMBER;
break;
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
type = CHAR_SEPARATOR;
break;
case Character.CONTROL:
case Character.FORMAT:
case Character.SURROGATE:
case Character.PRIVATE_USE:
case Character.UNASSIGNED:
type = CHAR_OTHER;
break;
case Character.CONNECTOR_PUNCTUATION:
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case CHAR_INIT_QUOTE:
case CHAR_FINAL_QUOTE:
case Character.OTHER_PUNCTUATION:
type = CHAR_PUNCTUATION;
break;
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
type = CHAR_SYMBOL;
break;
default:
throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
}
ranges[type].addRange(i, i);
} // for all characters
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
for (int i = 0; i < ranges.length; i ++) {
if (Token.categoryNames[i] != null) {
if (i == Character.UNASSIGNED) { // Unassigned
ranges[i].addRange(0x10000, Token.UTF16_MAX);
}
Token.categories.put(Token.categoryNames[i], ranges[i]);
Token.categories2.put(Token.categoryNames[i],
Token.complementRanges(ranges[i]));
}
}
//REVISIT: do we really need to support block names as in Unicode 3.1
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
//
StringBuilder buffer = new StringBuilder(50);
for (int i = 0; i < Token.blockNames.length; i ++) {
Token r1 = Token.createRange();
int location;
if (i < NONBMP_BLOCK_START) {
location = i*2;
int rstart = Token.blockRanges.charAt(location);
int rend = Token.blockRanges.charAt(location+1);
//DEBUGING
//System.out.println(n+" " +Integer.toHexString(rstart)
// +"-"+ Integer.toHexString(rend));
r1.addRange(rstart, rend);
} else {
location = (i - NONBMP_BLOCK_START) * 2;
r1.addRange(Token.nonBMPBlockRanges[location],
Token.nonBMPBlockRanges[location + 1]);
}
String n = Token.blockNames[i];
if (n.equals("Specials"))
r1.addRange(0xfff0, 0xfffd);
if (n.equals("Private Use")) {
r1.addRange(0xF0000,0xFFFFD);
r1.addRange(0x100000,0x10FFFD);
}
Token.categories.put(n, r1);
Token.categories2.put(n, Token.complementRanges(r1));
buffer.setLength(0);
buffer.append("Is");
if (n.indexOf(' ') >= 0) {
for (int ci = 0; ci < n.length(); ci ++)
if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
}
else {
buffer.append(n);
}
Token.setAlias(buffer.toString(), n, true);
}
// TR#18 1.2
Token.setAlias("ASSIGNED", "Cn", false);
Token.setAlias("UNASSIGNED", "Cn", true);
Token all = Token.createRange();
all.addRange(0, Token.UTF16_MAX);
Token.categories.put("ALL", all);
Token.categories2.put("ALL", Token.complementRanges(all));
Token.registerNonXS("ASSIGNED");
Token.registerNonXS("UNASSIGNED");
Token.registerNonXS("ALL");
Token isalpha = Token.createRange();
isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
Token.categories.put("IsAlpha", isalpha);
Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
Token.registerNonXS("IsAlpha");
Token isalnum = Token.createRange();
isalnum.mergeRanges(isalpha); // Lu Ll Lo
isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
Token.categories.put("IsAlnum", isalnum);
Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
Token.registerNonXS("IsAlnum");
Token isspace = Token.createRange();
isspace.mergeRanges(Token.token_spaces);
isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
Token.categories.put("IsSpace", isspace);
Token.categories2.put("IsSpace", Token.complementRanges(isspace));
Token.registerNonXS("IsSpace");
Token isword = Token.createRange();
isword.mergeRanges(isalnum); // Lu Ll Lo Nd
isword.addRange('_', '_');
Token.categories.put("IsWord", isword);
Token.categories2.put("IsWord", Token.complementRanges(isword));
Token.registerNonXS("IsWord");
Token isascii = Token.createRange();
isascii.addRange(0, 127);
Token.categories.put("IsASCII", isascii);
Token.categories2.put("IsASCII", Token.complementRanges(isascii));
Token.registerNonXS("IsASCII");
Token isnotgraph = Token.createRange();
isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
isnotgraph.addRange(' ', ' ');
Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
Token.categories2.put("IsGraph", isnotgraph);
Token.registerNonXS("IsGraph");
Token isxdigit = Token.createRange();
isxdigit.addRange('0', '9');
isxdigit.addRange('A', 'F');
isxdigit.addRange('a', 'f');
Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
Token.categories2.put("IsXDigit", isxdigit);
Token.registerNonXS("IsXDigit");
Token.setAlias("IsDigit", "Nd", true);
Token.setAlias("IsUpper", "Lu", true);
Token.setAlias("IsLower", "Ll", true);
Token.setAlias("IsCntrl", "C", true);
Token.setAlias("IsPrint", "C", false);
Token.setAlias("IsPunct", "P", true);
Token.registerNonXS("IsDigit");
Token.registerNonXS("IsUpper");
Token.registerNonXS("IsLower");
Token.registerNonXS("IsCntrl");
Token.registerNonXS("IsPrint");
Token.registerNonXS("IsPunct");
Token.setAlias("alpha", "IsAlpha", true);
Token.setAlias("alnum", "IsAlnum", true);
Token.setAlias("ascii", "IsASCII", true);
Token.setAlias("cntrl", "IsCntrl", true);
Token.setAlias("digit", "IsDigit", true);
Token.setAlias("graph", "IsGraph", true);
Token.setAlias("lower", "IsLower", true);
Token.setAlias("print", "IsPrint", true);
Token.setAlias("punct", "IsPunct", true);
Token.setAlias("space", "IsSpace", true);
Token.setAlias("upper", "IsUpper", true);
Token.setAlias("word", "IsWord", true); // Perl extension
Token.setAlias("xdigit", "IsXDigit", true);
Token.registerNonXS("alpha");
Token.registerNonXS("alnum");
Token.registerNonXS("ascii");
Token.registerNonXS("cntrl");
Token.registerNonXS("digit");
Token.registerNonXS("graph");
Token.registerNonXS("lower");
Token.registerNonXS("print");
Token.registerNonXS("punct");
Token.registerNonXS("space");
Token.registerNonXS("upper");
Token.registerNonXS("word");
Token.registerNonXS("xdigit");
} // synchronized
} // if null
RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
: (RangeToken)Token.categories2.get(name);
//if (tok == null) System.out.println(name);
return tok;
}
static protected RangeToken getRange(String name, boolean positive, boolean xs) {
RangeToken range = Token.getRange(name, positive);
if (xs && range != null && Token.isRegisterNonXS(name))
range = null;
return range;
}
static Hashtable nonxs = null;
/**
* This method is called by only getRange().
* So this method need not MT-safe.
*/
static protected void registerNonXS(String name) {
if (Token.nonxs == null)
Token.nonxs = new Hashtable();
Token.nonxs.put(name, name);
}
static protected boolean isRegisterNonXS(String name) {
if (Token.nonxs == null)
return false;
//DEBUG
//System.err.println("isRegisterNonXS: "+name);
return Token.nonxs.containsKey(name);
}
private static void setAlias(String newName, String name, boolean positive) {
Token t1 = (Token)Token.categories.get(name);
Token t2 = (Token)Token.categories2.get(name);
if (positive) {
Token.categories.put(newName, t1);
Token.categories2.put(newName, t2);
} else {
Token.categories2.put(newName, t1);
Token.categories.put(newName, t2);
}
}
// ------------------------------------------------------
static final String viramaString =
"\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
+"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
+"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
static private Token token_grapheme = null;
static synchronized Token getGraphemePattern() {
if (Token.token_grapheme != null)
return Token.token_grapheme;
Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
base_char.mergeRanges(Token.getRange("ASSIGNED", true));
base_char.subtractRanges(Token.getRange("M", true));
base_char.subtractRanges(Token.getRange("C", true));
Token virama = Token.createRange();
for (int i = 0; i < Token.viramaString.length(); i++) {
virama.addRange(i, i);
}
Token combiner_wo_virama = Token.createRange();
combiner_wo_virama.mergeRanges(Token.getRange("M", true));
combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
Token left = Token.createUnion(); // base_char?
left.addChild(base_char);
left.addChild(Token.token_empty);
Token foo = Token.createUnion();
foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
foo.addChild(combiner_wo_virama);
foo = Token.createClosure(foo);
foo = Token.createConcat(left, foo);
Token.token_grapheme = foo;
return Token.token_grapheme;
}
/**
* Combing Character Sequence in Perl 5.6.
*/
static private Token token_ccs = null;
static synchronized Token getCombiningCharacterSequence() {
if (Token.token_ccs != null)
return Token.token_ccs;
Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
Token.token_ccs = foo;
return Token.token_ccs;
}
// ------------------------------------------------------
// ------------------------------------------------------
/**
* This class represents a node in parse tree.
*/
static class StringToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = -4614366944218504172L;
String string;
final int refNumber;
StringToken(int type, String str, int n) {
super(type);
this.string = str;
this.refNumber = n;
}
int getReferenceNumber() { // for STRING
return this.refNumber;
}
String getString() { // for STRING
return this.string;
}
public String toString(int options) {
if (this.type == BACKREFERENCE)
return "\\"+this.refNumber;
else
return REUtil.quoteMeta(this.string);
}
}
/**
* This class represents a node in parse tree.
*/
static class ConcatToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = 8717321425541346381L;
final Token child;
final Token child2;
ConcatToken(Token t1, Token t2) {
super(Token.CONCAT);
this.child = t1;
this.child2 = t2;
}
int size() {
return 2;
}
Token getChild(int index) {
return index == 0 ? this.child : this.child2;
}
public String toString(int options) {
String ret;
if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
ret = this.child.toString(options)+"+";
} else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
ret = this.child.toString(options)+"+?";
} else
ret = this.child.toString(options)+this.child2.toString(options);
return ret;
}
}
/**
* This class represents a node in parse tree.
*/
static class CharToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = -4394272816279496989L;
final int chardata;
CharToken(int type, int ch) {
super(type);
this.chardata = ch;
}
int getChar() {
return this.chardata;
}
public String toString(int options) {
String ret;
switch (this.type) {
case CHAR:
switch (this.chardata) {
case '|': case '*': case '+': case '?':
case '(': case ')': case '.': case '[':
case '{': case '\\':
ret = "\\"+(char)this.chardata;
break;
case '\f': ret = "\\f"; break;
case '\n': ret = "\\n"; break;
case '\r': ret = "\\r"; break;
case '\t': ret = "\\t"; break;
case 0x1b: ret = "\\e"; break;
//case 0x0b: ret = "\\v"; break;
default:
if (this.chardata >= 0x10000) {
String pre = "0"+Integer.toHexString(this.chardata);
ret = "\\v"+pre.substring(pre.length()-6, pre.length());
} else
ret = ""+(char)this.chardata;
}
break;
case ANCHOR:
if (this == Token.token_linebeginning || this == Token.token_lineend)
ret = ""+(char)this.chardata;
else
ret = "\\"+(char)this.chardata;
break;
default:
ret = null;
}
return ret;
}
boolean match(int ch) {
if (this.type == CHAR) {
return ch == this.chardata;
} else
throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
}
}
/**
* This class represents a node in parse tree.
*/
static class ClosureToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = 1308971930673997452L;
int min;
int max;
final Token child;
ClosureToken(int type, Token tok) {
super(type);
this.child = tok;
this.setMin(-1);
this.setMax(-1);
}
int size() {
return 1;
}
Token getChild(int index) {
return this.child;
}
final void setMin(int min) {
this.min = min;
}
final void setMax(int max) {
this.max = max;
}
final int getMin() {
return this.min;
}
final int getMax() {
return this.max;
}
public String toString(int options) {
String ret;
if (this.type == CLOSURE) {
if (this.getMin() < 0 && this.getMax() < 0) {
ret = this.child.toString(options)+"*";
} else if (this.getMin() == this.getMax()) {
ret = this.child.toString(options)+"{"+this.getMin()+"}";
} else if (this.getMin() >= 0 && this.getMax() >= 0) {
ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
} else if (this.getMin() >= 0 && this.getMax() < 0) {
ret = this.child.toString(options)+"{"+this.getMin()+",}";
} else
throw new RuntimeException("Token#toString(): CLOSURE "
+this.getMin()+", "+this.getMax());
} else {
if (this.getMin() < 0 && this.getMax() < 0) {
ret = this.child.toString(options)+"*?";
} else if (this.getMin() == this.getMax()) {
ret = this.child.toString(options)+"{"+this.getMin()+"}?";
} else if (this.getMin() >= 0 && this.getMax() >= 0) {
ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
} else if (this.getMin() >= 0 && this.getMax() < 0) {
ret = this.child.toString(options)+"{"+this.getMin()+",}?";
} else
throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
+this.getMin()+", "+this.getMax());
}
return ret;
}
}
/**
* This class represents a node in parse tree.
*/
static class ParenToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = -5938014719827987704L;
final Token child;
final int parennumber;
ParenToken(int type, Token tok, int paren) {
super(type);
this.child = tok;
this.parennumber = paren;
}
int size() {
return 1;
}
Token getChild(int index) {
return this.child;
}
int getParenNumber() {
return this.parennumber;
}
public String toString(int options) {
String ret = null;
switch (this.type) {
case PAREN:
if (this.parennumber == 0) {
ret = "(?:"+this.child.toString(options)+")";
} else {
ret = "("+this.child.toString(options)+")";
}
break;
case LOOKAHEAD:
ret = "(?="+this.child.toString(options)+")";
break;
case NEGATIVELOOKAHEAD:
ret = "(?!"+this.child.toString(options)+")";
break;
case LOOKBEHIND:
ret = "(?<="+this.child.toString(options)+")";
break;
case NEGATIVELOOKBEHIND:
ret = "(?<!"+this.child.toString(options)+")";
break;
case INDEPENDENT:
ret = "(?>"+this.child.toString(options)+")";
break;
}
return ret;
}
}
/**
* (?(condition)yes-pattern|no-pattern)
*/
static class ConditionToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = 4353765277910594411L;
final int refNumber;
final Token condition;
final Token yes;
final Token no;
ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
super(Token.CONDITION);
this.refNumber = refno;
this.condition = cond;
this.yes = yespat;
this.no = nopat;
}
int size() {
return this.no == null ? 1 : 2;
}
Token getChild(int index) {
if (index == 0) return this.yes;
if (index == 1) return this.no;
throw new RuntimeException("Internal Error: "+index);
}
public String toString(int options) {
String ret;
if (refNumber > 0) {
ret = "(?("+refNumber+")";
} else if (this.condition.type == Token.ANCHOR) {
ret = "(?("+this.condition+")";
} else {
ret = "(?"+this.condition;
}
if (this.no == null) {
ret += this.yes+")";
} else {
ret += this.yes+"|"+this.no+")";
}
return ret;
}
}
/**
* (ims-ims: .... )
*/
static class ModifierToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = -9114536559696480356L;
final Token child;
final int add;
final int mask;
ModifierToken(Token tok, int add, int mask) {
super(Token.MODIFIERGROUP);
this.child = tok;
this.add = add;
this.mask = mask;
}
int size() {
return 1;
}
Token getChild(int index) {
return this.child;
}
int getOptions() {
return this.add;
}
int getOptionsMask() {
return this.mask;
}
public String toString(int options) {
return "(?"
+(this.add == 0 ? "" : REUtil.createOptionString(this.add))
+(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
+":"
+this.child.toString(options)
+")";
}
}
/**
* This class represents a node in parse tree.
* for UNION or CONCAT.
*/
static class UnionToken extends Token implements java.io.Serializable {
private static final long serialVersionUID = -2568843945989489861L;
Vector children;
UnionToken(int type) {
super(type);
}
void addChild(Token tok) {
if (tok == null) return;
if (this.children == null) this.children = new Vector();
if (this.type == UNION) {
this.children.addElement(tok);
return;
}
// This is CONCAT, and new child is CONCAT.
if (tok.type == CONCAT) {
for (int i = 0; i < tok.size(); i ++)
this.addChild(tok.getChild(i)); // Recursion
return;
}
int size = this.children.size();
if (size == 0) {
this.children.addElement(tok);
return;
}
Token previous = (Token)this.children.elementAt(size-1);
if (!((previous.type == CHAR || previous.type == STRING)
&& (tok.type == CHAR || tok.type == STRING))) {
this.children.addElement(tok);
return;
}
//System.err.println("Merge '"+previous+"' and '"+tok+"'.");
StringBuffer buffer;
int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
if (previous.type == CHAR) { // Replace previous token by STRING
buffer = new StringBuffer(2 + nextMaxLength);
int ch = previous.getChar();
if (ch >= 0x10000)
buffer.append(REUtil.decomposeToSurrogates(ch));
else
buffer.append((char)ch);
previous = Token.createString(null);
this.children.setElementAt(previous, size-1);
} else { // STRING
buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
buffer.append(previous.getString());
}
if (tok.type == CHAR) {
int ch = tok.getChar();
if (ch >= 0x10000)
buffer.append(REUtil.decomposeToSurrogates(ch));
else
buffer.append((char)ch);
} else {
buffer.append(tok.getString());
}
((StringToken)previous).string = new String(buffer);
}
int size() {
return this.children == null ? 0 : this.children.size();
}
Token getChild(int index) {
return (Token)this.children.elementAt(index);
}
public String toString(int options) {
String ret;
if (this.type == CONCAT) {
if (this.children.size() == 2) {
Token ch = this.getChild(0);
Token ch2 = this.getChild(1);
if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
ret = ch.toString(options)+"+";
} else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
ret = ch.toString(options)+"+?";
} else
ret = ch.toString(options)+ch2.toString(options);
} else {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < this.children.size(); i ++) {
sb.append(((Token)this.children.elementAt(i)).toString(options));
}
ret = new String(sb);
}
return ret;
}
if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
ret = this.getChild(0).toString(options)+"?";
} else if (this.children.size() == 2
&& this.getChild(0).type == EMPTY) {
ret = this.getChild(1).toString(options)+"??";
} else {
StringBuffer sb = new StringBuffer();
sb.append(((Token)this.children.elementAt(0)).toString(options));
for (int i = 1; i < this.children.size(); i ++) {
sb.append((char)'|');
sb.append(((Token)this.children.elementAt(i)).toString(options));
}
ret = new String(sb);
}
return ret;
}
}
}
}