blob: a5f335be3c61b96fa733decb5a4fd296078c0ba2 [file] [log] [blame]
//------------------------------------------------------------------------------
// Copyright (c) 2005, 2006 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.search.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import com.ibm.icu.text.BreakIterator;
/**
 * A text tokenizer that uses ICU4J to segment text into words.
 * <p>
 * The constructor buffers the whole content of the reader in memory and
 * hands it to a word {@link BreakIterator}. Each call to {@link #next()}
 * then returns the next segment, lower-cased, skipping segments that consist
 * only of whitespace and removing a trailing <code>'s</code> from the term
 * text.
 *
 * @author Kelvin Low
 * @since 1.0
 */
public final class TextTokenizer extends Tokenizer {

    /** Size, in characters, of the chunks used to drain the reader. */
    private final static int BUFFER_SIZE = 4096;

    /** The complete text read from the reader; only set when reading succeeded. */
    private String text;

    /** Word-boundary iterator over {@link #text}, or <code>null</code> if reading failed. */
    private BreakIterator iterator;

    /**
     * Creates a new instance.
     * <p>
     * The reader is read to its end immediately. If an
     * <code>IOException</code> occurs while reading, it is not propagated;
     * the tokenizer then yields no tokens at all.
     *
     * @param reader
     *            the text source
     */
    public TextTokenizer(Reader reader) {
        super(reader);
        StringBuffer textBuffer = new StringBuffer(BUFFER_SIZE);
        char[] buffer = new char[BUFFER_SIZE];
        int charsRead;
        try {
            while ((charsRead = reader.read(buffer, 0, BUFFER_SIZE)) > 0) {
                textBuffer.append(buffer, 0, charsRead);
            }
            text = textBuffer.toString();
            iterator = BreakIterator.getWordInstance();
            iterator.setText(text);
        } catch (IOException e) {
            // Best effort: a source that cannot be read produces an empty
            // token stream (next() returns null when the iterator is null).
            iterator = null;
        }
    }

    /**
     * Returns the next token, or <code>null</code> when the text is
     * exhausted or could not be read.
     * <p>
     * The token's start and end offsets are the boundaries of the segment
     * in the buffered text, so they still span a stripped <code>'s</code>
     * suffix and may therefore be wider than the term text.
     *
     * @return the next token, or <code>null</code> if there are no more
     *         tokens
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    public final Token next() throws IOException {
        if (iterator == null) {
            return null;
        }
        while (true) {
            int start = iterator.current();
            int end = iterator.next();
            if (end == BreakIterator.DONE) {
                return null;
            }
            // NOTE(review): toLowerCase() uses the JVM default locale -
            // confirm this is intended for indexes shared across locales.
            String tokenText = text.substring(start, end).toLowerCase();
            if (isWhitespaceOnly(tokenText)) {
                // Separators (spaces, tabs, line breaks, ...) are not terms.
                continue;
            }
            if (tokenText.endsWith("'s")) { //$NON-NLS-1$
                tokenText = tokenText.substring(0, tokenText.length() - 2);
            }
            // Report where the segment sits in the source text rather than
            // 0..length, so consumers can map the term back to the text.
            return new Token(tokenText, start, end);
        }
    }

    /**
     * Tells whether a segment contains nothing but whitespace or space
     * characters (an empty string counts as whitespace-only).
     */
    private static boolean isWhitespaceOnly(String segment) {
        for (int i = 0; i < segment.length(); i++) {
            char c = segment.charAt(i);
            if (!Character.isWhitespace(c) && !Character.isSpaceChar(c)) {
                return false;
            }
        }
        return true;
    }
}