| package org.eclipse.epf.web.search.analysis; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| |
| import java.io.Reader; |
| import java.util.Set; |
| |
| public class CJKAnalyzer extends Analyzer { |
| //~ Static fields/initializers --------------------------------------------- |
| |
| /** |
| * An array containing some common English words that are not usually |
| * useful for searching and some double-byte interpunctions. |
| */ |
| public final static String[] STOP_WORDS = { |
| "a", "and", "are", "as", "at", "be", |
| "but", "by", "for", "if", "in", |
| "into", "is", "it", "no", "not", |
| "of", "on", "or", "s", "such", "t", |
| "that", "the", "their", "then", |
| "there", "these", "they", "this", |
| "to", "was", "will", "with", "", |
| "www" |
| }; |
| |
| //~ Instance fields -------------------------------------------------------- |
| |
| /** |
| * stop word list |
| */ |
| private Set stopTable; |
| |
| //~ Constructors ----------------------------------------------------------- |
| |
| /** |
| * Builds an analyzer which removes words in {@link #STOP_WORDS}. |
| */ |
| public CJKAnalyzer() { |
| stopTable = StopFilter.makeStopSet(STOP_WORDS); |
| } |
| |
| /** |
| * Builds an analyzer which removes words in the provided array. |
| * |
| * @param stopWords stop word array |
| */ |
| public CJKAnalyzer(String[] stopWords) { |
| stopTable = StopFilter.makeStopSet(stopWords); |
| } |
| |
| //~ Methods ---------------------------------------------------------------- |
| |
| /** |
| * get token stream from input |
| * |
| * @param fieldName lucene field name |
| * @param reader input reader |
| * @return TokenStream |
| */ |
| public final TokenStream tokenStream(String fieldName, Reader reader) { |
| return new StopFilter(new CJKTokenizer(reader), stopTable); |
| } |
| } |