blob: 72d320312ae99563003736277f0ee0a21988f7c8 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2000, 2015 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*******************************************************************************/
package org.eclipse.help.internal.search;
import com.ibm.icu.text.BreakIterator;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * WordTokenStream obtains tokens containing words appropriate for use with
 * Lucene search engine.
 * <p>
 * Text is pulled from the reader in chunks of at least {@code BUF_LEN}
 * characters (when that many are available); each chunk is extended up to the
 * next white space character or the end of input. A locale-specific word
 * {@link BreakIterator} splits the chunk into segments, and every segment that
 * contains at least one letter or digit is emitted as a token. When a chunk is
 * used up the next one is read, until the reader reports end of input.
 */
public final class WordTokenStream extends Tokenizer {
    /** Number of characters requested from the reader for each chunk. */
    private static final int BUF_LEN = 4096;
    private final Reader reader;
    private final BreakIterator boundary;
    /** Chunk currently being tokenized; {@code null} when a new chunk must be read. */
    private StringBuilder strbuf;
    /** Start offset, within {@link #strbuf}, of the segment being examined. */
    private int start = 0;
    /** Set once the reader has reported end of input (the reader is closed at that point). */
    private boolean endOfInput = false;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * Creates a token stream over the given reader.
     *
     * @param fieldName
     *            name of the field being tokenized; not used by this class
     * @param reader
     *            source of the text to tokenize
     * @param locale
     *            locale used to obtain the word break iterator
     */
    public WordTokenStream(String fieldName, Reader reader, Locale locale) {
        this.reader = reader;
        boundary = BreakIterator.getWordInstance(locale);
    }

    /**
     * Advances to the next word and stores it in the term attribute.
     *
     * @return {@code true} if a word was stored, {@code false} when the input
     *         holds no further words
     * @throws IOException
     *             if reading from or closing the reader fails
     * @see TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        while (true) {
            if (strbuf == null) {
                if (!readChunk()) {
                    return false;
                }
                start = boundary.first();
            } else {
                // Resume exactly where the previous token ended. Calling next()
                // here would skip the segment that directly follows that token,
                // losing a word whenever two words are adjacent.
                start = boundary.current();
            }
            for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
                if (isWord(start, end)) {
                    int length = end - start;
                    char[] buffer = termAtt.buffer();
                    if (length >= buffer.length - 1) {
                        buffer = termAtt.resizeBuffer(2 + length);
                    }
                    strbuf.getChars(start, end, buffer, 0);
                    termAtt.setLength(length);
                    return true;
                }
            }
            // The current chunk holds no more words; drop it so that the next
            // pass reads further input instead of stopping after one chunk.
            strbuf = null;
        }
    }

    /**
     * Reads the next chunk of text into {@link #strbuf} and hands it to the
     * break iterator. The reader is closed as soon as it reports end of input.
     *
     * @return {@code true} if a non-empty chunk was read, {@code false} if the
     *         input is exhausted
     */
    private boolean readChunk() throws IOException {
        if (endOfInput) {
            // The reader was already closed; it must not be read again.
            return false;
        }
        char[] cbuf = new char[BUF_LEN];
        int available;
        while ((available = reader.read(cbuf)) <= 0) {
            if (available < 0) {
                endOfInput = true;
                reader.close();
                return false;
            }
        }
        strbuf = new StringBuilder(available + 80);
        strbuf.append(cbuf, 0, available);
        // read more until white space (or EOF) so that the chunk does not end
        // in the middle of a white-space-delimited word
        int c;
        while (0 <= (c = reader.read())) {
            strbuf.append((char) c);
            if (c == ' ' || c == '\r' || c == '\n' || c == '\t') {
                break;
            }
        }
        if (c < 0) {
            endOfInput = true;
            reader.close();
        }
        boundary.setText(strbuf.toString());
        return true;
    }

    /**
     * Tells whether the segment {@code [from, to)} of the current chunk is a
     * word: any letter or digit between the boundaries means it is a word.
     * Code points are tested rather than chars so that letters outside the
     * Basic Multilingual Plane (stored as surrogate pairs) are recognized.
     */
    private boolean isWord(int from, int to) {
        for (int i = from; i < to; i++) {
            if (Character.isLetterOrDigit(strbuf.codePointAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Resets the superclass state and clears the attributes. The reader is not
     * rewound and any buffered chunk is kept.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearAttributes();
    }

    /**
     * Closes the underlying reader, if there is one.
     */
    @Override
    public void close() throws IOException {
        // NOTE(review): the original comment said this is unlikely to be called
        // because the stream is reused - confirm against callers.
        // incrementToken() may already have closed the reader at end of input;
        // java.io.Reader#close() is specified to have no effect on a stream
        // that is already closed.
        if (this.reader != null) {
            this.reader.close();
        }
    }
}