org.eclipse.help.base/src/org/eclipse/help/internal/search/WordTokenStream.java - platform/eclipse.platform.ua - Git at Google

 /*******************************************************************************
  * Copyright (c) 2000, 2015 IBM Corporation and others.
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License v1.0
  * which accompanies this distribution, and is available at
  * http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors:
  *     IBM Corporation - initial API and implementation
  *******************************************************************************/
 package org.eclipse.help.internal.search;

 import com.ibm.icu.text.BreakIterator;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Locale;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * Tokenizer that splits the text supplied by a {@link Reader} into words
  * appropriate for use with the Lucene search engine.
  * <p>
  * The input is consumed in chunks of at least {@code BUF_LEN} characters
  * (fewer only when the reader delivers fewer), each extended up to the next
  * white space character so that a word is not cut at a chunk border. Every
  * chunk is segmented with a locale specific word {@link BreakIterator}; a
  * segment is emitted as a token when it contains at least one letter or digit.
  */
 public final class WordTokenStream extends Tokenizer {
 	private static final int BUF_LEN = 4096;
 	private final Reader reader;
 	private final BreakIterator boundary;
 	/** Chunk currently being tokenized, or null when a new chunk must be read. */
 	private StringBuilder strbuf;
 	/** True once the reader has reported end of input and has been closed. */
 	private boolean eof;

 	private int start = 0;
 	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

 	/**
 	 * Constructor
 	 *
 	 * @param fieldName
 	 *            name of the field being analyzed; not used by this class
 	 * @param reader
 	 *            source of the text to tokenize
 	 * @param locale
 	 *            locale used to obtain the word break iterator
 	 */
 	public WordTokenStream(String fieldName, Reader reader, Locale locale) {
 		this.reader = reader;
 		boundary = BreakIterator.getWordInstance(locale);
 	}

 	/**
 	 * Advances to the next word of the input and stores it in the term
 	 * attribute.
 	 *
 	 * @return true if a word was stored, false when the input is exhausted
 	 * @throws IOException
 	 *             if reading from or closing the reader fails
 	 * @see TokenStream#incrementToken()
 	 */
 	@Override
 	public boolean incrementToken() throws IOException {
 		clearAttributes();

 		while (true) {
 			if (strbuf == null) {
 				if (!fillBuffer()) {
 					return false;
 				}
 				start = boundary.first();
 			} else {
 				// Resume exactly where the previous token ended. Advancing the
 				// iterator here would skip one segment unexamined and lose
 				// words that directly follow each other without a separator.
 				start = boundary.current();
 			}

 			for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
 				if (containsLetterOrDigit(start, end)) {
 					int length = end - start;
 					char[] buffer = termAtt.buffer();
 					if (length >= buffer.length - 1) {
 						buffer = termAtt.resizeBuffer(2 + length);
 					}
 					termAtt.setLength(length);
 					strbuf.getChars(start, end, buffer, 0);
 					return true;
 				}
 			}

 			// The current chunk holds no further words: drop it so that the
 			// next pass of the loop reads the following chunk instead of
 			// ending the stream after the first one.
 			strbuf = null;
 		}
 	}

 	/**
 	 * Reads the next chunk of input into {@code strbuf} and hands it to the
 	 * break iterator. The reader is closed as soon as it reports end of input.
 	 *
 	 * @return true if a chunk was read, false if no input is left
 	 */
 	private boolean fillBuffer() throws IOException {
 		if (eof) {
 			// the reader has already been closed, do not touch it again
 			return false;
 		}
 		char[] cbuf = new char[BUF_LEN];
 		int available;
 		while ((available = reader.read(cbuf)) <= 0) {
 			if (available < 0) {
 				eof = true;
 				reader.close();
 				return false;
 			}
 		}
 		strbuf = new StringBuilder(available + 80);
 		strbuf.append(cbuf, 0, available);
 		// read more until white space (or EOF) so the chunk ends between words
 		int c;
 		while (0 <= (c = reader.read())) {
 			strbuf.append((char) c);
 			if (c == ' ' || c == '\r' || c == '\n' || c == '\t') {
 				break;
 			}
 		}
 		if (c < 0) {
 			eof = true;
 			reader.close();
 		}
 		boundary.setText(strbuf.toString());
 		return true;
 	}

 	/**
 	 * Determines whether the segment between two boundaries is a word: any
 	 * letter or digit between the boundaries means it is a word.
 	 */
 	private boolean containsLetterOrDigit(int from, int to) {
 		for (int i = from; i < to; i++) {
 			if (Character.isLetterOrDigit(strbuf.charAt(i))) {
 				return true;
 			}
 		}
 		return false;
 	}

 	/**
 	 * Delegates to the superclass and clears the attributes. The reader and
 	 * the chunk currently being tokenized are left untouched.
 	 */
 	@Override
 	public void reset() throws IOException {
 		super.reset();
 		clearAttributes();
 	}

 	/**
 	 * Closes the underlying reader, if there is one. The reader is normally
 	 * closed already when the end of the input is reached.
 	 */
 	@Override
 	public void close() throws IOException {
 		if (this.reader != null) {
 			this.reader.close();
 		}
 	}
 }
	/*******************************************************************************
	* Copyright (c) 2000, 2015 IBM Corporation and others.
	* All rights reserved. This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License v1.0
	* which accompanies this distribution, and is available at
	* http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors:
	* IBM Corporation - initial API and implementation
	*******************************************************************************/
	package org.eclipse.help.internal.search;

	import com.ibm.icu.text.BreakIterator;
	import java.io.IOException;
	import java.io.Reader;
	import java.util.Locale;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

	/**
	* WordTokenStream obtains tokens containing words appropriate for use with
	* Lucene search engine.
	*/
	public final class WordTokenStream extends Tokenizer {
	private static final int BUF_LEN = 4096;
	private final Reader reader;
	private final BreakIterator boundary;
	private StringBuffer strbuf;

	private int start = 0;
	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

	/**
	* Constructor
	*/
	public WordTokenStream(String fieldName, Reader reader, Locale locale) {
	this.reader = reader;
	boundary = BreakIterator.getWordInstance(locale);

	}
	/**
	* @see TokenStream#incrementToken()
	*/
	@Override
	public boolean incrementToken() throws IOException {
	clearAttributes();
	int length = 0;
	char[] buffer = termAtt.buffer();

	int end;
	if(strbuf == null) {
	int available;
	char[] cbuf = new char[BUF_LEN];
	while ((available = reader.read(cbuf)) <= 0) {
	if (available < 0) {
	reader.close();
	return false;
	}
	}
	strbuf = new StringBuffer(available + 80);
	strbuf.append(cbuf, 0, available);
	// read more until white space (or EOF)
	int c;
	while (0 <= (c = reader.read())) {
	strbuf.append((char) c);
	if (c == ' ' \|\| c == '\r' \|\| c == '\n' \|\| c == '\t') {
	break;
	}
	}

	if (c < 0) {
	reader.close();
	}

	boundary.setText(strbuf.toString());
	start = boundary.first();
	}
	else {
	start = boundary.next();
	}

	for (end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
	// determine if it is a word
	// any letter or digit between boundaries means it is a word
	for (int i = start; i < end; i++) {
	if (Character.isLetterOrDigit(strbuf.charAt(i))) {
	// it is a word
	length = end - start;
	if (length >= buffer.length-1)
	buffer = termAtt.resizeBuffer(2+length);
	termAtt.setLength(length);
	strbuf.getChars(start, end, buffer, 0);
	return true;
	}
	}
	}

	return false;
	}

	@Override
	public void reset() throws IOException {
	super.reset();
	clearAttributes();
	}

	@Override
	public void close() throws IOException {
	/// Unlikely to be called as this is a reused
	if (this.reader != null) {
	this.reader.close();
	}
	}
	}