| /******************************************************************************* |
| * Copyright (c) 2000, 2003 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Common Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/cpl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.help.internal.search; |
| |
| import java.io.*; |
| import java.text.*; |
| import java.util.*; |
| |
| import org.apache.lucene.analysis.*; |
| |
| /** |
| * WordTokenStream obtains tokens containing words |
| * appropriate for use with Lucene search engine. |
| */ |
| public final class WordTokenStream extends TokenStream { |
| private static final int BUF_LEN = 4096; |
| private static final int TOKENS_LEN = 512; |
| private final String fieldName; |
| private final Reader reader; |
| private final BreakIterator boundary; |
| private final ArrayList tokens; |
| private int token; |
| private int noTokens; |
| private final char[] cbuf; |
| /** |
| * Constructor |
| */ |
| public WordTokenStream(String fieldName, Reader reader, Locale locale) { |
| this.fieldName = fieldName; |
| this.reader = reader; |
| boundary = BreakIterator.getWordInstance(locale); |
| cbuf = new char[BUF_LEN]; |
| tokens = new ArrayList(TOKENS_LEN); |
| |
| } |
| /** |
| * @see TokenStream#next() |
| */ |
| public final Token next() throws IOException { |
| if (token >= noTokens) { |
| // read BUF_LEN of chars |
| int l; |
| while ((l = reader.read(cbuf)) <= 0) { |
| if (l < 0) { |
| // EOF |
| reader.close(); |
| return null; |
| } |
| } |
| StringBuffer strbuf = new StringBuffer(l + 80); |
| strbuf.append(cbuf, 0, l); |
| // read more until white space (or EOF) |
| int c; |
| while (0 <= (c = reader.read())) { |
| strbuf.append((char) c); |
| if (c == ' ' || c == '\r' || c == '\n' || c == '\t') { |
| break; |
| } |
| } |
| |
| String str = strbuf.toString(); |
| boundary.setText(str); |
| |
| int start = boundary.first(); |
| tokens.clear(); |
| wordsbreak : for ( |
| int end = boundary.next(); |
| end != BreakIterator.DONE; |
| start = end, end = boundary.next()) { |
| // determine if it is a word |
| // any letter or digit between boundaries means it is a word |
| for (int i = start; i < end; i++) { |
| if (Character.isLetterOrDigit(str.charAt(i))) { |
| // it is a word |
| tokens.add( |
| new Token(str.substring(start, end), start, end)); |
| continue wordsbreak; |
| } |
| } |
| } |
| |
| if (c < 0) { |
| reader.close(); |
| tokens.add((Token) null); |
| } |
| noTokens = tokens.size(); |
| token = 0; |
| } |
| |
| return (Token) tokens.get(token++); |
| |
| } |
| } |