org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/ChineseFilter.java - epf/org.eclipse.epf.projects - Git at Google

 //------------------------------------------------------------------------------
 // Copyright (c) 2004, 200 IBM Corporation.  All Rights Reserved.
 //------------------------------------------------------------------------------
 package org.eclipse.epf.web.search.analysis;

 import java.util.Hashtable;
 import java.util.ResourceBundle;
 import java.util.StringTokenizer;

 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;


 /**
  * Token filter that drops stop words and keeps only tokens that start with a
  * letter or a digit.
  * <p>
  * Tokens starting with a digit, a letter-number, or a lower/upper case letter
  * are kept only when they are longer than one character. Tokens starting with
  * a character of type {@link Character#OTHER_LETTER} are always kept, so a
  * single such character can stand as a word. All other tokens are discarded.
  */
 public class ChineseFilter extends TokenFilter {

     /** Stop words, keyed by term text; filled in by {@link #loadStopWords()}. */
     private Hashtable _stopTable;

     /**
      * Creates a filter on top of the given token stream and loads the stop
      * word table.
      *
      * @param in
      *            The token stream to be filtered.
      */
     public ChineseFilter(TokenStream in)
     {
         super(in);

         input = in;

         // _stopTable is an instance field and therefore always unset at this
         // point, so the table is loaded unconditionally.
         loadStopWords();
     }

     /**
      * Returns the next token accepted by this filter.
      *
      * @return The next accepted token, or <code>null</code> when the
      *         underlying stream is exhausted.
      * @throws java.io.IOException
      *             if the underlying stream fails to deliver a token.
      */
     public final Token next() throws java.io.IOException {

         for (Token token = input.next(); token != null; token = input.next()) {
             String text = token.termText();

             // An empty term has no first character to classify; skip it
             // instead of failing in charAt(0).
             if (text == null || text.length() == 0) {
                 continue;
             }

             // Stop words are never returned.
             if (_stopTable.get(text) != null) {
                 continue;
             }

             switch (Character.getType(text.charAt(0))) {

             // June 3, 2003 - fchong added support for digits for Rational
             case Character.DECIMAL_DIGIT_NUMBER:
             case Character.LETTER_NUMBER:
             case Character.LOWERCASE_LETTER:
             case Character.UPPERCASE_LETTER:

                 // An English word/token must be longer than 1 character.
                 if (text.length() > 1) {
                     return token;
                 }
                 break;

             case Character.OTHER_LETTER:

                 // One Chinese character as one Chinese word.
                 // Chinese word extraction to be added later here.
                 return token;

             default:
                 // Any other leading character type: drop the token.
                 break;
             }
         }
         return null;
     }

     /**
      * Loads the stop words defined in the StopWords.properties file.
      * <p>
      * The words are read from the <code>Search.stopWords</code> key and are
      * separated by spaces and/or commas. If the bundle or the key cannot be
      * read, the Lucene English stop word list is used instead.
      */
     private void loadStopWords() {
         String[] words = null;
         try {
             ResourceBundle bundle = ResourceBundle.getBundle(ChineseFilter.class
                     .getPackage().getName()
                     + ".StopWords"); //$NON-NLS-1$
             String property = bundle.getString("Search.stopWords"); //$NON-NLS-1$
             words = split(property, " ,", -1); //$NON-NLS-1$
         } catch (Exception e) {
             // Deliberate best effort: fall back to the default English list.
             words = StopAnalyzer.ENGLISH_STOP_WORDS;
         }
         _stopTable = StopFilter.makeStopTable(words);
     }

     /**
      * Splits a string into an array of string tokens.
      * <p>
      * With a <code>count</code> of -1 every token is returned. With a positive
      * <code>count</code> the first <code>count - 1</code> tokens are returned
      * individually and the rest of the string, separators included, is
      * appended to the last element. If the string holds fewer than
      * <code>count</code> tokens, the unused trailing elements are
      * <code>null</code>.
      *
      * @param str
      *            A string.
      * @param sep
      *            A string containing the string separators.
      * @param count
      *            The desired number of string tokens, or -1 for all tokens.
      * @return An array of string tokens, or <code>null</code> if
      *         <code>str</code> is <code>null</code>, or <code>count</code> is
      *         0 or less than -1.
      */
     public static String[] split(String str, String sep, int count) {
         if (str == null || count == 0 || count < -1) {
             return null;
         }

         // Separators are only returned as tokens when a limit is given, so
         // that they can be preserved in the last element.
         StringTokenizer tokenizer = new StringTokenizer(str, sep, count != -1);

         if (count == -1) {
             count = tokenizer.countTokens();
         }

         String[] result = new String[count];
         int i = 0;
         while (tokenizer.hasMoreTokens()) {
             String t = tokenizer.nextToken();
             if (i < count) {
                 // Skip separator tokens while the array is being filled.
                 if ((t.length() == 1) && (sep.indexOf(t) != -1)) {
                     continue;
                 }
                 result[i++] = t;
             } else {
                 // Array is full: keep the remainder verbatim in the last slot.
                 result[count - 1] += t;
             }
         }
         return result;
     }
 }
	//------------------------------------------------------------------------------
	// Copyright (c) 2004, 200 IBM Corporation. All Rights Reserved.
	//------------------------------------------------------------------------------
	package org.eclipse.epf.web.search.analysis;

	import java.util.Hashtable;
	import java.util.ResourceBundle;
	import java.util.StringTokenizer;

	import org.apache.lucene.analysis.StopAnalyzer;
	import org.apache.lucene.analysis.StopFilter;
	import org.apache.lucene.analysis.Token;
	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;


	public class ChineseFilter extends TokenFilter {

	private Hashtable _stopTable;

	public ChineseFilter(TokenStream in)
	{
	super(in);

	input = in;

	if (_stopTable == null) {
	loadStopWords();
	}
	}

	public final Token next() throws java.io.IOException {

	for (Token token = input.next(); token != null; token = input.next()) {
	String text = token.termText();

	if (_stopTable.get(text) == null) {
	switch (Character.getType(text.charAt(0))) {

	// June 3, 2003 - fchong added support for digits for Rational
	case Character.DECIMAL_DIGIT_NUMBER:
	case Character.LETTER_NUMBER:
	case Character.LOWERCASE_LETTER:
	case Character.UPPERCASE_LETTER:

	// English word/token should larger than 1 character.
	if (text.length()>1) {
	return token;
	}
	break;
	case Character.OTHER_LETTER:

	// One Chinese character as one Chinese word.
	// Chinese word extraction to be added later here.

	return token;
	}

	}

	}
	return null;
	}

	/**
	* Loads the stop words defined in the StopWords.properties file.
	*/
	private void loadStopWords() {
	String[] words = null;
	try {
	ResourceBundle bundle = ResourceBundle.getBundle(ChineseFilter.class
	.getPackage().getName()
	+ ".StopWords"); //$NON-NLS-1$
	String property = bundle.getString("Search.stopWords"); //$NON-NLS-1$
	words = split(property, " ,", -1); //$NON-NLS-1$
	} catch (Exception e) {
	words = StopAnalyzer.ENGLISH_STOP_WORDS;
	}
	_stopTable = StopFilter.makeStopTable(words);
	}

	/**
	* Splits a string into an array of string tokens.
	*
	* @param str
	* A string.
	* @param sep
	* A string containing the string separators.
	* @param count
	* The desired number of string tokens.
	* @return An array of string tokens.
	*/
	public static String[] split(String str, String sep, int count) {
	if (str == null \|\| count == 0 \|\| count < -1) {
	return null;
	}

	StringTokenizer tokenizer = new StringTokenizer(str, sep,
	count == -1 ? false : true);

	if (count == -1) {
	count = tokenizer.countTokens();
	}

	String[] result = new String[count];
	int i = 0;
	while (tokenizer.hasMoreTokens()) {
	String t = tokenizer.nextToken();
	if (i < count) {
	if ((t.length() == 1) && (sep.indexOf(t) != -1)) {
	continue;
	}
	result[i++] = t;
	} else {
	result[count - 1] += t;
	}
	}
	return result;
	}
	}