blob: 3cf281fc751daa6816d823b9c46d5790b47683a4 [file] [log] [blame]
//------------------------------------------------------------------------------
// Copyright (c) 2004, 200 IBM Corporation. All Rights Reserved.
//------------------------------------------------------------------------------
package org.eclipse.epf.web.search.analysis;
import java.util.Hashtable;
import java.util.ResourceBundle;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
 * Token filter for streams that mix Chinese and Latin-script/numeric tokens.
 * <p>
 * A token is passed through when it is not a stop word and either
 * <ul>
 * <li>starts with a digit, a letter-number, or an upper/lower-case letter and
 * is longer than one character, or</li>
 * <li>starts with a character of type {@link Character#OTHER_LETTER} (the
 * category Chinese ideographs fall into), in which case any length is
 * accepted.</li>
 * </ul>
 * Every other token (stop words, empty terms, tokens starting with
 * punctuation, symbols, etc.) is dropped.
 */
public class ChineseFilter extends TokenFilter {

    /** Stop words to drop, in the form produced by StopFilter.makeStopTable. */
    private final Hashtable _stopTable;

    /**
     * Creates a filter over the given token stream and loads the stop words.
     *
     * @param in
     *            The token stream to filter.
     */
    public ChineseFilter(TokenStream in) {
        // super(in) already stores the stream in the inherited 'input' field.
        super(in);
        _stopTable = loadStopWords();
    }

    /**
     * Returns the next token from the underlying stream that passes the
     * filter.
     *
     * @return The next accepted token, or <code>null</code> when the
     *         underlying stream is exhausted.
     * @throws java.io.IOException
     *             if the underlying stream fails.
     */
    public final Token next() throws java.io.IOException {
        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            // An empty term has no first character to classify; skip it
            // instead of failing in charAt(0).
            if (text == null || text.length() == 0) {
                continue;
            }
            if (_stopTable.containsKey(text)) {
                continue;
            }

            // NOTE(review): only the first UTF-16 code unit is inspected, so a
            // term starting with a supplementary-plane character (surrogate
            // pair) is classified as SURROGATE and dropped - confirm whether
            // that matters for the indexed content.
            switch (Character.getType(text.charAt(0))) {
            // June 3, 2003 - fchong added support for digits for Rational
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LETTER_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                // An English word/token must be longer than 1 character.
                if (text.length() > 1) {
                    return token;
                }
                break;
            case Character.OTHER_LETTER:
                // One Chinese character as one Chinese word.
                // Chinese word extraction to be added later here.
                return token;
            }
            // Any other leading character type: drop the token.
        }
        return null;
    }

    /**
     * Loads the stop words defined under the <code>Search.stopWords</code>
     * key of the <code>StopWords</code> resource bundle located in this
     * class's package. The value is split on spaces and commas.
     * <p>
     * If the bundle or the key cannot be read, the Lucene English stop words
     * are used instead.
     *
     * @return The stop word table.
     */
    private static Hashtable loadStopWords() {
        String[] words = null;
        try {
            // Derive the package prefix from the class name;
            // Class.getPackage() may return null under some class loaders.
            String className = ChineseFilter.class.getName();
            int lastDot = className.lastIndexOf('.');
            String packagePrefix = lastDot < 0 ? "" //$NON-NLS-1$
                    : className.substring(0, lastDot + 1);
            ResourceBundle bundle = ResourceBundle.getBundle(packagePrefix
                    + "StopWords"); //$NON-NLS-1$
            String property = bundle.getString("Search.stopWords"); //$NON-NLS-1$
            words = split(property, " ,", -1); //$NON-NLS-1$
        } catch (Exception e) {
            // Deliberate best-effort: fall back to the default English list
            // rather than failing filter construction.
            words = StopAnalyzer.ENGLISH_STOP_WORDS;
        }
        return StopFilter.makeStopTable(words);
    }

    /**
     * Splits a string into an array of string tokens.
     * <p>
     * When <code>count</code> is <code>-1</code>, every token is returned and
     * the separators are discarded. When <code>count</code> is positive, at
     * most <code>count</code> entries are returned: once <code>count</code>
     * tokens have been collected, the remainder of the string (separators
     * included) is appended to the last entry. If the string holds fewer than
     * <code>count</code> tokens, the trailing entries are <code>null</code>.
     *
     * @param str
     *            A string.
     * @param sep
     *            A string containing the string separators.
     * @param count
     *            The desired number of string tokens, or <code>-1</code> for
     *            all tokens.
     * @return An array of string tokens, or <code>null</code> if
     *         <code>str</code> is <code>null</code>, <code>count</code> is
     *         <code>0</code>, or <code>count</code> is less than
     *         <code>-1</code>.
     */
    public static String[] split(String str, String sep, int count) {
        if (str == null || count == 0 || count < -1) {
            return null;
        }

        // With a fixed count the separators must be returned as tokens so
        // that the tail of the string can be re-assembled verbatim.
        boolean unlimited = (count == -1);
        StringTokenizer tokenizer = new StringTokenizer(str, sep, !unlimited);
        if (unlimited) {
            count = tokenizer.countTokens();
        }

        String[] result = new String[count];
        int i = 0;
        while (tokenizer.hasMoreTokens()) {
            String t = tokenizer.nextToken();
            if (i < count) {
                // Skip separator tokens while slots are still being filled.
                if ((t.length() == 1) && (sep.indexOf(t) != -1)) {
                    continue;
                }
                result[i++] = t;
            } else {
                // All slots are filled: append the rest to the last entry.
                result[count - 1] += t;
            }
        }
        return result;
    }
}