wlu: add Japanese/Korean (CJK) analyzer support to search
diff --git a/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs b/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs
index bc4d61f..67747ba 100644
--- a/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs
+++ b/org.eclipse.epf.web.search/.settings/org.eclipse.jdt.core.prefs
@@ -1,12 +1,12 @@
-#Tue Aug 14 11:10:04 PDT 2007
+#Thu Aug 23 21:28:43 PDT 2007
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.4
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.2
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.4
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
-org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=warning
org.eclipse.jdt.core.compiler.problem.enumIdentifier=warning
-org.eclipse.jdt.core.compiler.source=1.4
+org.eclipse.jdt.core.compiler.source=1.3
diff --git a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java
index 137d647..aa558d5 100644
--- a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java
+++ b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/IndexSearch.java
@@ -1,8 +1,9 @@
package org.eclipse.epf.web.search;
+import java.util.Locale;
import java.util.StringTokenizer;
-import org.apache.lucene.document.Document;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
@@ -10,8 +11,8 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
import org.apache.lucene.store.RAMDirectory;
+import org.eclipse.epf.web.search.analysis.CJKAnalyzer;
import org.eclipse.epf.web.search.analysis.ChineseAnalyzer;
import org.eclipse.epf.web.search.analysis.TextAnalyzer;
@@ -60,8 +61,18 @@
try {
+ boolean jako = false;
+ Locale locale = Locale.getDefault();
+ String lang = locale.getLanguage();
+ if (lang.equals(Locale.JAPANESE.getLanguage()) ||
+ lang.equals(Locale.KOREA.getLanguage())) {
+ jako = true;
+ }
+ Analyzer analyzer = jako ? (Analyzer) new CJKAnalyzer() : (Analyzer) new TextAnalyzer();
+
+ //System.out.println("LD>analyzer: " + analyzer);
q = QueryParser.parse(queryString, searchField,
- new TextAnalyzer());
+ analyzer);
if ( q != null ) {
hits = searcher.search(q, (Sort)sort);
}
diff --git a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKAnalyzer.java b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKAnalyzer.java
new file mode 100644
index 0000000..a67a0ea
--- /dev/null
+++ b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKAnalyzer.java
@@ -0,0 +1,65 @@
+package org.eclipse.epf.web.search.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.util.Set;
+
+public class CJKAnalyzer extends Analyzer {
+    //~ Static fields/initializers ---------------------------------------------
+
+    /**
+     * An array containing some common English words that are not usually
+     * useful for searching, plus some double-byte punctuation marks.
+     */
+    public final static String[] STOP_WORDS = {
+        "a", "and", "are", "as", "at", "be",
+        "but", "by", "for", "if", "in",
+        "into", "is", "it", "no", "not",
+        "of", "on", "or", "s", "such", "t",
+        "that", "the", "their", "then",
+        "there", "these", "they", "this",
+        "to", "was", "will", "with", "",
+        "www"
+    };
+
+    //~ Instance fields --------------------------------------------------------
+
+    /**
+     * Stop words removed from every token stream produced by this analyzer.
+     */
+    private Set stopTable;
+
+    //~ Constructors -----------------------------------------------------------
+
+    /**
+     * Builds an analyzer which removes words in {@link #STOP_WORDS}.
+     */
+    public CJKAnalyzer() {
+        stopTable = StopFilter.makeStopSet(STOP_WORDS);
+    }
+
+    /**
+     * Builds an analyzer which removes words in the provided array.
+     *
+     * @param stopWords stop word array
+     */
+    public CJKAnalyzer(String[] stopWords) {
+        stopTable = StopFilter.makeStopSet(stopWords);
+    }
+
+    //~ Methods ----------------------------------------------------------------
+
+    /**
+     * Tokenizes the input with a CJKTokenizer and removes stop words.
+     *
+     * @param fieldName Lucene field name (unused by this analyzer)
+     * @param reader reader over the text to analyze
+     * @return a stop-word-filtered TokenStream
+     */
+    public final TokenStream tokenStream(String fieldName, Reader reader) {
+        return new StopFilter(new CJKTokenizer(reader), stopTable);
+    }
+}
diff --git a/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java
new file mode 100644
index 0000000..7e32ded
--- /dev/null
+++ b/org.eclipse.epf.web.search/src/org/eclipse/epf/web/search/analysis/CJKTokenizer.java
@@ -0,0 +1,213 @@
+package org.eclipse.epf.web.search.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.Reader;
+
+public final class CJKTokenizer extends Tokenizer {
+    //~ Static fields/initializers ---------------------------------------------
+
+    /** Maximum length of a single emitted token. */
+    private static final int MAX_WORD_LEN = 255;
+
+    /** Size of the raw I/O read buffer. */
+    private static final int IO_BUFFER_SIZE = 256;
+
+    //~ Instance fields --------------------------------------------------------
+
+    /** Character offset of the current read position within the input. */
+    private int offset = 0;
+
+    /** Read index into ioBuffer. */
+    private int bufferIndex = 0;
+
+    /** Number of valid characters in ioBuffer; -1 once end of input is reached. */
+    private int dataLen = 0;
+
+    /**
+     * Character buffer holding the characters that will compose
+     * the next returned Token.
+     */
+    private final char[] buffer = new char[MAX_WORD_LEN];
+
+    /**
+     * I/O buffer holding raw characters read from the input reader
+     * (the "input" field inherited from Tokenizer).
+     */
+    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+    /** Token type: "single"=>ASCII run, "double"=>non-ASCII bigram, "word"=>default. */
+    private String tokenType = "word";
+
+    /**
+     * True when the previous double-byte character was already emitted as part
+     * of the preceding bigram token. CJK text "C1C2C3C4" yields the overlapping
+     * bigrams "C1C2", "C2C3", "C3C4"; this flag drives that overlap.
+     */
+    private boolean preIsTokened = false;
+
+    //~ Constructors -----------------------------------------------------------
+
+    /**
+     * Construct a token stream processing the given input.
+     *
+     * @param in reader supplying the text to tokenize
+     */
+    public CJKTokenizer(Reader in) {
+        input = in;
+    }
+
+    //~ Methods ----------------------------------------------------------------
+
+    /**
+     * Returns the next token in the stream, or null at end of stream.
+     * See java.lang.Character.UnicodeBlock for the script classification
+     * used to separate ASCII runs from double-byte (CJK) characters.
+     *
+     * @return the next Token, or null when the input is exhausted
+     *
+     * @throws java.io.IOException when a read error
+     *         happens in the underlying input reader
+     *
+     */
+    public final Token next() throws java.io.IOException {
+        /** how many characters have been stored in buffer */
+        int length = 0;
+
+        /** the start offset used to create the Token */
+        int start = offset;
+
+        while (true) {
+            /** current character */
+            char c;
+
+            /** Unicode block of the current character */
+            Character.UnicodeBlock ub;
+
+            offset++;
+
+            if (bufferIndex >= dataLen) {
+                dataLen = input.read(ioBuffer);
+                bufferIndex = 0;
+            }
+
+            if (dataLen == -1) {
+                if (length > 0) {
+                    if (preIsTokened == true) {
+                        length = 0;
+                        preIsTokened = false;
+                    }
+
+                    break;
+                } else {
+                    return null;
+                }
+            } else {
+                //get current character
+                c = ioBuffer[bufferIndex++];
+
+                //get the UnicodeBlock of the current character
+                ub = Character.UnicodeBlock.of(c);
+            }
+
+            //if the current character is ASCII or Extended ASCII
+            if ((ub == Character.UnicodeBlock.BASIC_LATIN)
+                || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
+                ) {
+                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
+                    /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN by subtracting 0xFEE0 (65248) */
+                    int i = (int) c;
+                    i = i - 65248;
+                    c = (char) i;
+                }
+
+                // if the current character is a letter or "_" "+" "#"
+                if (Character.isLetterOrDigit(c)
+                    || ((c == '_') || (c == '+') || (c == '#'))
+                ) {
+                    if (length == 0) {
+                        // "javaC1C2C3C4linux" <br>
+                        // ^--: the current character begins a new ASCII
+                        // token; remember its start offset
+                        start = offset - 1;
+                    } else if (tokenType == "double") { // == on interned literals is intentional here
+                        // "javaC1C2C3C4linux" <br>
+                        // ^--: the previous character was non-ASCII,
+                        // the current one is ASCII: flush the pending token
+                        offset--;
+                        bufferIndex--;
+                        tokenType = "single";
+
+                        if (preIsTokened == true) {
+                            // the buffered non-ASCII character was already emitted
+                            length = 0;
+                            preIsTokened = false;
+
+                            break;
+                        } else {
+                            break;
+                        }
+                    }
+
+                    // store the lower-cased character in the buffer
+                    buffer[length++] = Character.toLowerCase(c);
+                    tokenType = "single";
+
+                    // stop accumulating once the token buffer is full
+                    if (length == MAX_WORD_LEN) {
+                        break;
+                    }
+                } else if (length > 0) {
+                    if (preIsTokened == true) {
+                        length = 0;
+                        preIsTokened = false;
+                    } else {
+                        break;
+                    }
+                }
+            } else {
+                // non-ASCII letter, e.g. "C1C2C3C4"
+                if (Character.isLetter(c)) {
+                    if (length == 0) {
+                        start = offset - 1;
+                        buffer[length++] = c;
+                        tokenType = "double";
+                    } else {
+                        if (tokenType == "single") {
+                            offset--;
+                            bufferIndex--;
+
+                            //return the previous ASCII characters
+                            break;
+                        } else {
+                            buffer[length++] = c;
+                            tokenType = "double";
+
+                            if (length == 2) {
+                                offset--;
+                                bufferIndex--;
+                                preIsTokened = true; // emit bigram; next call re-reads this char
+
+                                break;
+                            }
+                        }
+                    }
+                } else if (length > 0) {
+                    if (preIsTokened == true) {
+                        // empty the buffer
+                        length = 0;
+                        preIsTokened = false;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+
+        return new Token(new String(buffer, 0, length), start, start + length,
+            tokenType
+        );
+    }
+}
+