org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java - epf/org.eclipse.epf.projects - Git at Google

 package org.eclipse.epf.web.search.analysis;

 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;

 import java.io.Reader;

 /**
  * Tokenizer for text that mixes ASCII words with non-ASCII letters
  * (for example CJK text).
  * <p>
  * Runs of ASCII letters, digits and the characters '_', '+' and '#' are
  * emitted as one lower-cased token of type "single" (split every
  * MAX_WORD_LEN characters). Runs of non-ASCII letters are emitted as
  * overlapping two-character tokens of type "double", e.g. "C1C2C3" yields
  * "C1C2" and "C2C3"; a non-ASCII letter without a neighbouring non-ASCII
  * letter is emitted on its own. Fullwidth ASCII variants (U+FF01..U+FF5E)
  * are folded to their BASIC_LATIN counterparts before classification.
  */
 public final class CJKTokenizer extends Tokenizer {
     //~ Static fields/initializers ---------------------------------------------

     /** Max word length */
     private static final int MAX_WORD_LEN = 255;

     /** I/O buffer size */
     private static final int IO_BUFFER_SIZE = 256;

     /** token type before anything has been tokenized */
     private static final String TYPE_WORD = "word";

     /** token type of ASCII tokens */
     private static final String TYPE_SINGLE = "single";

     /** token type of non-ASCII tokens */
     private static final String TYPE_DOUBLE = "double";

     /** first fullwidth character that has a BASIC_LATIN counterpart ('!') */
     private static final char FULLWIDTH_ASCII_FIRST = '\uFF01';

     /** last fullwidth character that has a BASIC_LATIN counterpart ('~') */
     private static final char FULLWIDTH_ASCII_LAST = '\uFF5E';

     /** distance between a fullwidth ASCII variant and its BASIC_LATIN form */
     private static final int FULLWIDTH_ASCII_SHIFT = 65248;

     //~ Instance fields --------------------------------------------------------

     /** number of input characters consumed so far; basis of token offsets */
     private int offset = 0;

     /** the index used only for ioBuffer */
     private int bufferIndex = 0;

     /** number of valid characters in ioBuffer, or -1 once the input is exhausted */
     private int dataLen = 0;

     /**
      * character buffer, stores the characters which are used to compose
      * the returned Token
      */
     private final char[] buffer = new char[MAX_WORD_LEN];

     /**
      * I/O buffer, used to store the content of the input (one of the
      * members of Tokenizer)
      */
     private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

     /** type of the token being built: single=>ASCII  double=>non-ASCII  word=>default */
     private String tokenType = TYPE_WORD;

     /**
      * tag: the first character read by the next scan was already returned as
      * the second half of the previous two-character token.
      * "C1C2C3C4" ----(set the C1 isTokened) C1C2 "C2C3C4"
      * ----(set the C2 isTokened) C1C2 C2C3 "C3C4"
      * ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
      */
     private boolean preIsTokened = false;

     //~ Constructors -----------------------------------------------------------

     /**
      * Construct a token stream processing the given input.
      *
      * @param in I/O reader
      */
     public CJKTokenizer(Reader in) {
         input = in;
     }

     //~ Methods ----------------------------------------------------------------

     /**
      * Returns the next token in the stream, or null at EOS. A token with
      * empty text is never returned.
      * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
      * for detail.
      *
      * @return the next non-empty Token, or null when the input is exhausted
      *
      * @throws java.io.IOException when reading from the input fails
      */
     public final Token next() throws java.io.IOException {
         // Each pass of the outer loop is one scan. A scan can end without a
         // token when the only buffered character was the already-returned
         // overlap character; in that case scanning simply resumes.
         while (true) {
             /** how many character(s) have been stored in buffer */
             int length = 0;

             /** the position used to create Token */
             int start = offset;

             while (true) {
                 if (bufferIndex >= dataLen) {
                     dataLen = input.read(ioBuffer);
                     bufferIndex = 0;
                 }

                 if (dataLen == -1) {
                     // End of input: a lone overlap character was already
                     // part of the previous token, so drop it.
                     if (preIsTokened) {
                         length = 0;
                         preIsTokened = false;
                     }

                     break;
                 }

                 // offset is advanced only for characters actually consumed,
                 // so it does not drift when the end of input is reached
                 char c = ioBuffer[bufferIndex++];
                 offset++;

                 Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);

                 if ((ub == Character.UnicodeBlock.BASIC_LATIN)
                         || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
                    ) {
                     c = foldFullwidth(c);

                     // if the current character is a letter, digit or "_" "+" "#"
                     if (Character.isLetterOrDigit(c)
                             || ((c == '_') || (c == '+') || (c == '#'))
                        ) {
                         if (length == 0) {
                             // "javaC1C2C3C4linux"
                             //  ^--: the current character starts an ASCII token
                             start = offset - 1;
                         } else if (TYPE_DOUBLE.equals(tokenType)) {
                             // "javaC1C2C3C4linux"
                             //              ^--: current character follows a
                             // non-ASCII one; push it back for the next scan.
                             // tokenType is left untouched so a pending
                             // non-ASCII token keeps its "double" type.
                             offset--;
                             bufferIndex--;

                             if (preIsTokened) {
                                 // only the overlap character is buffered
                                 length = 0;
                                 preIsTokened = false;
                             }

                             break;
                         }

                         // store the LowerCase(c) in the buffer
                         buffer[length++] = Character.toLowerCase(c);
                         tokenType = TYPE_SINGLE;

                         // stop before the buffer overflows
                         if (length == MAX_WORD_LEN) {
                             break;
                         }
                     } else if (length > 0) {
                         // delimiter after buffered characters
                         if (preIsTokened) {
                             length = 0;
                             preIsTokened = false;
                         } else {
                             break;
                         }
                     }
                 } else if (Character.isLetter(c)) {
                     // non-ASCII letter, eg."C1C2C3C4"
                     if (length == 0) {
                         start = offset - 1;
                         buffer[length++] = c;
                         tokenType = TYPE_DOUBLE;
                     } else if (TYPE_SINGLE.equals(tokenType)) {
                         // push the letter back and return the buffered
                         // ASCII characters first
                         offset--;
                         bufferIndex--;

                         break;
                     } else {
                         buffer[length++] = c;
                         tokenType = TYPE_DOUBLE;

                         if (length == 2) {
                             // re-read this letter as the first half of the
                             // next two-character token
                             offset--;
                             bufferIndex--;
                             preIsTokened = true;

                             break;
                         }
                     }
                 } else if (length > 0) {
                     // non-ASCII delimiter after buffered characters
                     if (preIsTokened) {
                         // empty the buffer
                         length = 0;
                         preIsTokened = false;
                     } else {
                         break;
                     }
                 }
             }

             if (length > 0) {
                 return new Token(new String(buffer, 0, length), start,
                                  start + length, tokenType
                                 );
             }

             if (dataLen == -1) {
                 return null;
             }
         }
     }

     /**
      * Maps a fullwidth ASCII variant (U+FF01..U+FF5E) to its BASIC_LATIN
      * counterpart; every other character is returned unchanged.
      *
      * @param c character to fold
      *
      * @return the folded character
      */
     private static char foldFullwidth(char c) {
         if ((c >= FULLWIDTH_ASCII_FIRST) && (c <= FULLWIDTH_ASCII_LAST)) {
             return (char) (c - FULLWIDTH_ASCII_SHIFT);
         }

         return c;
     }
 }
	package org.eclipse.epf.web.search.analysis;

	import org.apache.lucene.analysis.Token;
	import org.apache.lucene.analysis.Tokenizer;

	import java.io.Reader;

	public final class CJKTokenizer extends Tokenizer {
	//~ Static fields/initializers ---------------------------------------------

	/** Max word length */
	private static final int MAX_WORD_LEN = 255;

	/** buffer size: */
	private static final int IO_BUFFER_SIZE = 256;

	//~ Instance fields --------------------------------------------------------

	/** word offset, used to imply which character(in ) is parsed */
	private int offset = 0;

	/** the index used only for ioBuffer */
	private int bufferIndex = 0;

	/** data length */
	private int dataLen = 0;

	/**
	* character buffer, store the characters which are used to compose <br>
	* the returned Token
	*/
	private final char[] buffer = new char[MAX_WORD_LEN];

	/**
	* I/O buffer, used to store the content of the input(one of the <br>
	* members of Tokenizer)
	*/
	private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

	/** word type: single=>ASCII double=>non-ASCII word=>default */
	private String tokenType = "word";

	/**
	* tag: previous character is a cached double-byte character "C1C2C3C4"
	* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
	* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
	*/
	private boolean preIsTokened = false;

	//~ Constructors -----------------------------------------------------------

	/**
	* Construct a token stream processing the given input.
	*
	* @param in I/O reader
	*/
	public CJKTokenizer(Reader in) {
	input = in;
	}

	//~ Methods ----------------------------------------------------------------

	/**
	* Returns the next token in the stream, or null at EOS.
	* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
	* for detail.
	*
	* @return Token
	*
	* @throws java.io.IOException - throw IOException when read error <br>
	* hanppened in the InputStream
	*
	*/
	public final Token next() throws java.io.IOException {
	/** how many character(s) has been stored in buffer */
	int length = 0;

	/** the position used to create Token */
	int start = offset;

	while (true) {
	/** current charactor */
	char c;

	/** unicode block of current charactor for detail */
	Character.UnicodeBlock ub;

	offset++;

	if (bufferIndex >= dataLen) {
	dataLen = input.read(ioBuffer);
	bufferIndex = 0;
	}

	if (dataLen == -1) {
	if (length > 0) {
	if (preIsTokened == true) {
	length = 0;
	preIsTokened = false;
	}

	break;
	} else {
	return null;
	}
	} else {
	//get current character
	c = ioBuffer[bufferIndex++];

	//get the UnicodeBlock of the current character
	ub = Character.UnicodeBlock.of(c);
	}

	//if the current character is ASCII or Extend ASCII
	if ((ub == Character.UnicodeBlock.BASIC_LATIN)
	\|\| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
	) {
	if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
	/** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
	int i = (int) c;
	i = i - 65248;
	c = (char) i;
	}

	// if the current character is a letter or "_" "+" "#"
	if (Character.isLetterOrDigit(c)
	\|\| ((c == '_') \|\| (c == '+') \|\| (c == '#'))
	) {
	if (length == 0) {
	// "javaC1C2C3C4linux" <br>
	// ^--: the current character begin to token the ASCII
	// letter
	start = offset - 1;
	} else if (tokenType == "double") {
	// "javaC1C2C3C4linux" <br>
	// ^--: the previous non-ASCII
	// : the current character
	offset--;
	bufferIndex--;
	tokenType = "single";

	if (preIsTokened == true) {
	// there is only one non-ASCII has been stored
	length = 0;
	preIsTokened = false;

	break;
	} else {
	break;
	}
	}

	// store the LowerCase(c) in the buffer
	buffer[length++] = Character.toLowerCase(c);
	tokenType = "single";

	// break the procedure if buffer overflowed!
	if (length == MAX_WORD_LEN) {
	break;
	}
	} else if (length > 0) {
	if (preIsTokened == true) {
	length = 0;
	preIsTokened = false;
	} else {
	break;
	}
	}
	} else {
	// non-ASCII letter, eg."C1C2C3C4"
	if (Character.isLetter(c)) {
	if (length == 0) {
	start = offset - 1;
	buffer[length++] = c;
	tokenType = "double";
	} else {
	if (tokenType == "single") {
	offset--;
	bufferIndex--;

	//return the previous ASCII characters
	break;
	} else {
	buffer[length++] = c;
	tokenType = "double";

	if (length == 2) {
	offset--;
	bufferIndex--;
	preIsTokened = true;

	break;
	}
	}
	}
	} else if (length > 0) {
	if (preIsTokened == true) {
	// empty the buffer
	length = 0;
	preIsTokened = false;
	} else {
	break;
	}
	}
	}
	}

	return new Token(new String(buffer, 0, length), start, start + length,
	tokenType
	);
	}
	}