| //------------------------------------------------------------------------------ |
| // Copyright (c) 2005, 2006 IBM Corporation and others. |
| // All rights reserved. This program and the accompanying materials |
| // are made available under the terms of the Eclipse Public License v1.0 |
| // which accompanies this distribution, and is available at |
| // http://www.eclipse.org/legal/epl-v10.html |
| // |
| // Contributors: |
| // IBM Corporation - initial implementation |
| //------------------------------------------------------------------------------ |
| package org.eclipse.epf.search.analysis; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.Tokenizer; |
| |
| import com.ibm.icu.text.BreakIterator; |
| |
| /** |
| * A text tokenizer that uses ICU4J to segment text into words. |
| * |
| * @author Kelvin Low |
| * @since 1.0 |
| */ |
| public final class TextTokenizer extends Tokenizer { |
| |
| private final static int BUFFER_SIZE = 4096; |
| |
| private String text; |
| |
| private BreakIterator iterator; |
| |
| /** |
| * Creates a new instance. |
| * |
| * @param reader |
| * the text source |
| */ |
| public TextTokenizer(Reader reader) { |
| super(reader); |
| StringBuffer textBuffer = new StringBuffer(BUFFER_SIZE); |
| char[] buffer = new char[BUFFER_SIZE]; |
| int charsRead; |
| try { |
| while ((charsRead = reader.read(buffer, 0, BUFFER_SIZE)) > 0) { |
| textBuffer.append(buffer, 0, charsRead); |
| } |
| text = textBuffer.toString(); |
| iterator = BreakIterator.getWordInstance(); |
| iterator.setText(text); |
| } catch (IOException e) { |
| iterator = null; |
| } |
| } |
| |
| /** |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| public final Token next() throws IOException { |
| if (iterator != null) { |
| while (true) { |
| int start = iterator.current(); |
| int end = iterator.next(); |
| if (end != BreakIterator.DONE) { |
| String tokenText = text.substring(start, end).toLowerCase(); |
| if (!tokenText.equals(" ")) { //$NON-NLS-1$ |
| if (tokenText.endsWith("'s")) { //$NON-NLS-1$ |
| tokenText = tokenText.substring(0, tokenText |
| .length() - 2); |
| } |
| return new Token(tokenText, 0, tokenText.length()); |
| } |
| } else { |
| return null; |
| } |
| } |
| } |
| return null; |
| } |
| |
| } |