| //------------------------------------------------------------------------------ |
| // Copyright (c) 2004, 200 IBM Corporation. All Rights Reserved. |
| //------------------------------------------------------------------------------ |
| package org.eclipse.epf.web.search.analysis; |
| |
| import java.util.Hashtable; |
| import java.util.ResourceBundle; |
| import java.util.StringTokenizer; |
| |
| import org.apache.lucene.analysis.StopAnalyzer; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| |
| |
| public class ChineseFilter extends TokenFilter { |
| |
| private Hashtable _stopTable; |
| |
| public ChineseFilter(TokenStream in) |
| { |
| super(in); |
| |
| input = in; |
| |
| if (_stopTable == null) { |
| loadStopWords(); |
| } |
| } |
| |
| public final Token next() throws java.io.IOException { |
| |
| for (Token token = input.next(); token != null; token = input.next()) { |
| String text = token.termText(); |
| |
| if (_stopTable.get(text) == null) { |
| switch (Character.getType(text.charAt(0))) { |
| |
| // June 3, 2003 - fchong added support for digits for Rational |
| case Character.DECIMAL_DIGIT_NUMBER: |
| case Character.LETTER_NUMBER: |
| case Character.LOWERCASE_LETTER: |
| case Character.UPPERCASE_LETTER: |
| |
| // English word/token should larger than 1 character. |
| if (text.length()>1) { |
| return token; |
| } |
| break; |
| case Character.OTHER_LETTER: |
| |
| // One Chinese character as one Chinese word. |
| // Chinese word extraction to be added later here. |
| |
| return token; |
| } |
| |
| } |
| |
| } |
| return null; |
| } |
| |
| /** |
| * Loads the stop words defined in the StopWords.properties file. |
| */ |
| private void loadStopWords() { |
| String[] words = null; |
| try { |
| ResourceBundle bundle = ResourceBundle.getBundle(ChineseFilter.class |
| .getPackage().getName() |
| + ".StopWords"); //$NON-NLS-1$ |
| String property = bundle.getString("Search.stopWords"); //$NON-NLS-1$ |
| words = split(property, " ,", -1); //$NON-NLS-1$ |
| } catch (Exception e) { |
| words = StopAnalyzer.ENGLISH_STOP_WORDS; |
| } |
| _stopTable = StopFilter.makeStopTable(words); |
| } |
| |
| /** |
| * Splits a string into an array of string tokens. |
| * |
| * @param str |
| * A string. |
| * @param sep |
| * A string containing the string separators. |
| * @param count |
| * The desired number of string tokens. |
| * @return An array of string tokens. |
| */ |
| public static String[] split(String str, String sep, int count) { |
| if (str == null || count == 0 || count < -1) { |
| return null; |
| } |
| |
| StringTokenizer tokenizer = new StringTokenizer(str, sep, |
| count == -1 ? false : true); |
| |
| if (count == -1) { |
| count = tokenizer.countTokens(); |
| } |
| |
| String[] result = new String[count]; |
| int i = 0; |
| while (tokenizer.hasMoreTokens()) { |
| String t = tokenizer.nextToken(); |
| if (i < count) { |
| if ((t.length() == 1) && (sep.indexOf(t) != -1)) { |
| continue; |
| } |
| result[i++] = t; |
| } else { |
| result[count - 1] += t; |
| } |
| } |
| return result; |
| } |
| } |