wlu: add Japanese/Korean (CJK) analyzer support to search
diff --git a/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs b/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs
index bc4d61f..67747ba 100644
--- a/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs
+++ b/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs
@@ -1,12 +1,12 @@
-#Tue Aug 14 11:10:04 PDT 2007
+#Thu Aug 23 21:28:43 PDT 2007
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.4
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.2
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.4
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
-org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=warning
org.eclipse.jdt.core.compiler.problem.enumIdentifier=warning
-org.eclipse.jdt.core.compiler.source=1.4
+org.eclipse.jdt.core.compiler.source=1.3
diff --git a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java
index 137d647..aa558d5 100644
--- a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java
+++ b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java
@@ -1,8 +1,9 @@
package org.eclipse.epf.web.search;
+import java.util.Locale;
import java.util.StringTokenizer;
-import org.apache.lucene.document.Document;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
@@ -10,8 +11,8 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
import org.apache.lucene.store.RAMDirectory;
+import org.eclipse.epf.web.search.analysis.CJKAnalyzer;
import org.eclipse.epf.web.search.analysis.ChineseAnalyzer;
import org.eclipse.epf.web.search.analysis.TextAnalyzer;
@@ -60,8 +61,18 @@
try {
+ boolean jako = false;
+ Locale locale = Locale.getDefault();
+ String lang = locale.getLanguage();
+ if (lang.equals(Locale.JAPANESE.getLanguage()) ||
+ lang.equals(Locale.KOREA.getLanguage())) {
+ jako = true;
+ }
+ Analyzer analyzer = jako ? (Analyzer) new CJKAnalyzer() : (Analyzer) new TextAnalyzer();
+
+ //System.out.println("LD>analyzer: " + analyzer);
q = QueryParser.parse(queryString, searchField,
- new TextAnalyzer());
+ analyzer);
if ( q != null ) {
hits = searcher.search(q, (Sort)sort);
}
diff --git a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKAnalyzer.java b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKAnalyzer.java
new file mode 100644
index 0000000..a67a0ea
--- /dev/null
+++ b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKAnalyzer.java
@@ -0,0 +1,65 @@
+package org.eclipse.epf.web.search.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.util.Set;
+
+public class CJKAnalyzer extends Analyzer {
+    //~ Static fields/initializers ---------------------------------------------
+
+    /**
+     * An array containing some common English words that are not usually
+     * useful for searching, plus some double-byte punctuation marks.
+     */
+    public final static String[] STOP_WORDS = {
+        "a", "and", "are", "as", "at", "be",
+        "but", "by", "for", "if", "in",
+        "into", "is", "it", "no", "not",
+        "of", "on", "or", "s", "such", "t",
+        "that", "the", "their", "then",
+        "there", "these", "they", "this",
+        "to", "was", "will", "with", "",
+        "www"
+    };
+
+    //~ Instance fields --------------------------------------------------------
+
+    /**
+     * Stop words removed from every token stream produced by this analyzer.
+     */
+    private Set stopTable;
+
+    //~ Constructors -----------------------------------------------------------
+
+    /**
+     * Builds an analyzer which removes words in {@link #STOP_WORDS}.
+     */
+    public CJKAnalyzer() {
+        stopTable = StopFilter.makeStopSet(STOP_WORDS);
+    }
+
+    /**
+     * Builds an analyzer which removes words in the provided array.
+     *
+     * @param stopWords stop word array
+     */
+    public CJKAnalyzer(String[] stopWords) {
+        stopTable = StopFilter.makeStopSet(stopWords);
+    }
+
+    //~ Methods ----------------------------------------------------------------
+
+    /**
+     * Tokenizes the input with a CJKTokenizer and removes stop words.
+     *
+     * @param fieldName Lucene field name (unused by this analyzer)
+     * @param reader reader over the text to analyze
+     * @return a stop-word-filtered TokenStream
+     */
+    public final TokenStream tokenStream(String fieldName, Reader reader) {
+        return new StopFilter(new CJKTokenizer(reader), stopTable);
+    }
+}
diff --git a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java
new file mode 100644
index 0000000..7e32ded
--- /dev/null
+++ b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java
@@ -0,0 +1,213 @@
+package org.eclipse.epf.web.search.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.Reader;
+
+public final class CJKTokenizer extends Tokenizer {
+    //~ Static fields/initializers ---------------------------------------------
+
+    /** Maximum length of a single emitted token. */
+    private static final int MAX_WORD_LEN = 255;
+
+    /** Size of the raw I/O read buffer. */
+    private static final int IO_BUFFER_SIZE = 256;
+
+    //~ Instance fields --------------------------------------------------------
+
+    /** Character offset of the current read position within the input. */
+    private int offset = 0;
+
+    /** Read index into ioBuffer. */
+    private int bufferIndex = 0;
+
+    /** Number of valid characters in ioBuffer; -1 once end of input is reached. */
+    private int dataLen = 0;
+
+    /**
+     * Character buffer holding the characters that will compose
+     * the next returned Token.
+     */
+    private final char[] buffer = new char[MAX_WORD_LEN];
+
+    /**
+     * I/O buffer holding raw characters read from the input reader
+     * (the "input" field inherited from Tokenizer).
+     */
+    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+    /** Token type: "single"=>ASCII run, "double"=>non-ASCII bigram, "word"=>default. */
+    private String tokenType = "word";
+
+    /**
+     * True when the previous double-byte character was already emitted as part
+     * of the preceding bigram token. CJK text "C1C2C3C4" yields the overlapping
+     * bigrams "C1C2", "C2C3", "C3C4"; this flag drives that overlap.
+     */
+    private boolean preIsTokened = false;
+
+    //~ Constructors -----------------------------------------------------------
+
+    /**
+     * Construct a token stream processing the given input.
+     *
+     * @param in reader supplying the text to tokenize
+     */
+    public CJKTokenizer(Reader in) {
+        input = in;
+    }
+
+    //~ Methods ----------------------------------------------------------------
+
+    /**
+     * Returns the next token in the stream, or null at end of stream.
+     * See java.lang.Character.UnicodeBlock for the script classification
+     * used to separate ASCII runs from double-byte (CJK) characters.
+     *
+     * @return the next Token, or null when the input is exhausted
+     *
+     * @throws java.io.IOException when a read error
+     *         happens in the underlying input reader
+     *
+     */
+    public final Token next() throws java.io.IOException {
+        /** how many characters have been stored in buffer */
+        int length = 0;
+
+        /** the start offset used to create the Token */
+        int start = offset;
+
+        while (true) {
+            /** current character */
+            char c;
+
+            /** Unicode block of the current character */
+            Character.UnicodeBlock ub;
+
+            offset++;
+
+            if (bufferIndex >= dataLen) {
+                dataLen = input.read(ioBuffer);
+                bufferIndex = 0;
+            }
+
+            if (dataLen == -1) {
+                if (length > 0) {
+                    if (preIsTokened == true) {
+                        length = 0;
+                        preIsTokened = false;
+                    }
+
+                    break;
+                } else {
+                    return null;
+                }
+            } else {
+                //get current character
+                c = ioBuffer[bufferIndex++];
+
+                //get the UnicodeBlock of the current character
+                ub = Character.UnicodeBlock.of(c);
+            }
+
+            //if the current character is ASCII or Extended ASCII
+            if ((ub == Character.UnicodeBlock.BASIC_LATIN)
+                || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
+                ) {
+                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
+                    /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN by subtracting 0xFEE0 (65248) */
+                    int i = (int) c;
+                    i = i - 65248;
+                    c = (char) i;
+                }
+
+                // if the current character is a letter or "_" "+" "#"
+                if (Character.isLetterOrDigit(c)
+                    || ((c == '_') || (c == '+') || (c == '#'))
+                ) {
+                    if (length == 0) {
+                        // "javaC1C2C3C4linux" <br>
+                        // ^--: the current character begins a new ASCII
+                        // token; remember its start offset
+                        start = offset - 1;
+                    } else if (tokenType == "double") { // == on interned literals is intentional here
+                        // "javaC1C2C3C4linux" <br>
+                        // ^--: the previous character was non-ASCII,
+                        // the current one is ASCII: flush the pending token
+                        offset--;
+                        bufferIndex--;
+                        tokenType = "single";
+
+                        if (preIsTokened == true) {
+                            // the buffered non-ASCII character was already emitted
+                            length = 0;
+                            preIsTokened = false;
+
+                            break;
+                        } else {
+                            break;
+                        }
+                    }
+
+                    // store the lower-cased character in the buffer
+                    buffer[length++] = Character.toLowerCase(c);
+                    tokenType = "single";
+
+                    // stop accumulating once the token buffer is full
+                    if (length == MAX_WORD_LEN) {
+                        break;
+                    }
+                } else if (length > 0) {
+                    if (preIsTokened == true) {
+                        length = 0;
+                        preIsTokened = false;
+                    } else {
+                        break;
+                    }
+                }
+            } else {
+                // non-ASCII letter, e.g. "C1C2C3C4"
+                if (Character.isLetter(c)) {
+                    if (length == 0) {
+                        start = offset - 1;
+                        buffer[length++] = c;
+                        tokenType = "double";
+                    } else {
+                        if (tokenType == "single") {
+                            offset--;
+                            bufferIndex--;
+
+                            //return the previous ASCII characters
+                            break;
+                        } else {
+                            buffer[length++] = c;
+                            tokenType = "double";
+
+                            if (length == 2) {
+                                offset--;
+                                bufferIndex--;
+                                preIsTokened = true; // emit bigram; next call re-reads this char
+
+                                break;
+                            }
+                        }
+                    }
+                } else if (length > 0) {
+                    if (preIsTokened == true) {
+                        // empty the buffer
+                        length = 0;
+                        preIsTokened = false;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return new Token(new String(buffer, 0, length), start, start + length,
+            tokenType
+        );
+    }
+}
+