| //------------------------------------------------------------------------------ |
| //Copyright (c) 2004, 2005 IBM Corporation. All Rights Reserved. |
| //------------------------------------------------------------------------------ |
| package org.eclipse.epf.web.search.analysis; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.Tokenizer; |
| |
| import com.ibm.icu.text.BreakIterator; |
| |
| /** |
| * A Text Tokenizer that segments text into words using ICU4J. |
| * |
| * @author Kelvin Low |
| * @since 1.0 |
| */ |
| public final class TextTokenizer extends Tokenizer { |
| |
| private final static int BUFFER_SIZE = 4096; |
| |
| private String text; |
| |
| private BreakIterator iterator; |
| |
| /** |
| * Creates a new instance. |
| * |
| * @param reader |
| * The text source. |
| */ |
| public TextTokenizer(Reader reader) { |
| super(reader); |
| StringBuffer textBuffer = new StringBuffer(BUFFER_SIZE); |
| char[] buffer = new char[BUFFER_SIZE]; |
| int charsRead; |
| try { |
| while ((charsRead = reader.read(buffer, 0, BUFFER_SIZE)) > 0) { |
| textBuffer.append(buffer, 0, charsRead); |
| } |
| text = textBuffer.toString(); |
| iterator = BreakIterator.getWordInstance(); |
| iterator.setText(text); |
| } catch (IOException e) { |
| iterator = null; |
| } |
| } |
| |
| /** |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| public final Token next() throws IOException { |
| if (iterator != null) { |
| while (true) { |
| int start = iterator.current(); |
| int end = iterator.next(); |
| if (end != BreakIterator.DONE) { |
| String tokenText = text.substring(start, end).toLowerCase(); |
| if (!tokenText.equals(" ")) { //$NON-NLS-1$ |
| if (tokenText.endsWith("'s")) { //$NON-NLS-1$ |
| tokenText = tokenText.substring(0, tokenText |
| .length() - 2); |
| } |
| return new Token(tokenText, 0, tokenText.length()); |
| } |
| } else { |
| return null; |
| } |
| } |
| } |
| return null; |
| } |
| |
| } |