social-binding: improved KeywordExtractor
diff --git a/moxy/social-binding/src/main/java/eclipselink/example/moxy/socialbinding/util/KeywordExtractor.java b/moxy/social-binding/src/main/java/eclipselink/example/moxy/socialbinding/util/KeywordExtractor.java
index a170cf2..f15df0d 100644
--- a/moxy/social-binding/src/main/java/eclipselink/example/moxy/socialbinding/util/KeywordExtractor.java
+++ b/moxy/social-binding/src/main/java/eclipselink/example/moxy/socialbinding/util/KeywordExtractor.java
@@ -10,6 +10,9 @@
******************************************************************************/
package eclipselink.example.moxy.socialbinding.util;
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@@ -24,29 +27,55 @@
*/
public class KeywordExtractor {
+ /**
+ * Return a single keyword for the given post title: the first capitalized
+ * word after sorting with StringLengthComparator (longest first, as that
+ * comparator was already used for), or, when the title has no capitalized
+ * candidate, the first word of any case.
+ *
+ * Words found in the exclude list (matched case-insensitively) are never
+ * returned.
+ *
+ * NOTE(review): the apostrophe is one of the tokenizer delimiters, so a
+ * contraction such as "don't" is split into "don" and "t" and the
+ * apostrophe entries of the exclude list can never match a token.
+ *
+ * @param postTitle the title to scan; must not be null
+ * @return the chosen keyword, or an empty string when the title contains
+ * no usable word (empty title, or every word is excluded)
+ */
public static String extractKeywords(String postTitle) {
- StringTokenizer tokenizer = new StringTokenizer(postTitle, ",.!?()[]'\" \t\n\r\f/");
+ ArrayList<String> allWords = new WordList();
+ ArrayList<String> upperCaseWords = new WordList();
- ArrayList<String> words = new ArrayList<String>();
+ ArrayList<String> excludeWords = buildExlucdeWordsList();
+ StringTokenizer tokenizer = new StringTokenizer(postTitle, ",.!?():;-[]'\" \t\n\r\f/");
while (tokenizer.hasMoreElements()) {
- String token = tokenizer.nextToken();
- if (token.length() > 3 || token.toUpperCase().equals(token)) {
- words.add(token);
+ String word = tokenizer.nextToken();
+ if (!excludeWords.contains(word)) {
+ allWords.add(word);
+ // StringTokenizer never yields an empty token, so charAt(0) is safe.
+ if (Character.isUpperCase(word.charAt(0))) {
+ upperCaseWords.add(word);
+ }
}
}
+
+ StringLengthComparator comparator = new StringLengthComparator();
+ Collections.sort(allWords, comparator);
+ Collections.sort(upperCaseWords, comparator);
- // Sort words, longest one first
- Collections.sort(words, new StringLengthComparator());
-
- String keywords = null;
- if (words.size() > 1) {
- keywords = words.get(0) + "," + words.get(1);
+ // A single capitalized word is enough to be preferred.
+ if (!upperCaseWords.isEmpty()) {
+ return upperCaseWords.get(0);
} else {
- keywords = words.get(0);
+ // Every word may have been excluded; do not index into an empty list.
+ return allWords.isEmpty() ? "" : allWords.get(0);
}
+ }
+
+ /**
+ * Load the words that must never be chosen as a keyword from the
+ * classpath resource META-INF/exclude-words.txt (one word per line),
+ * using the thread context class loader.
+ *
+ * Loading is best-effort: if the resource is missing or cannot be read,
+ * the problem is reported on standard error and the words read so far
+ * (possibly none) are returned.
+ *
+ * @return a WordList, whose contains() ignores case; never null
+ */
+ private static ArrayList<String> buildExlucdeWordsList() {
+ ArrayList<String> excludeWords = new WordList();
- return keywords;
+ InputStream is = null;
+ try {
+ ClassLoader cl = Thread.currentThread().getContextClassLoader();
+ is = cl.getResourceAsStream("META-INF/exclude-words.txt");
+ if (is == null) {
+ // getResourceAsStream signals "not found" with null, not an exception.
+ System.err.println("Resource not found: META-INF/exclude-words.txt");
+ } else {
+ // Explicit charset so the result does not depend on the platform default.
+ BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ String line;
+ while ((line = br.readLine()) != null) {
+ String word = line.trim();
+ // Skip blank lines and stray whitespace around a word.
+ if (word.length() > 0) {
+ excludeWords.add(word);
+ }
+ }
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ // Close on every path, including a failed read.
+ if (is != null) {
+ try {
+ is.close();
+ } catch (Exception ignored) {
+ // Nothing useful can be done if closing fails.
+ }
+ }
+ }
+
+ return excludeWords;
}
private static class StringLengthComparator implements Comparator<String> {
@@ -60,5 +89,18 @@
}
}
}
+
+ /**
+ * An ArrayList of strings whose contains() compares strings ignoring case.
+ * Only contains() is overridden; the other lookup methods inherited from
+ * ArrayList keep their normal, case-sensitive behaviour.
+ */
+ private static class WordList extends ArrayList<String> {
+ private static final long serialVersionUID = 4780991427891054829L;
+ /**
+ * @param o the value to look for
+ * @return true if o is a String equal, ignoring case, to some element;
+ * for null or a non-String argument, the result of ArrayList.contains
+ */
+ @Override
+ public boolean contains(Object o) {
+ // Do not cast blindly: null and non-String arguments follow the List contract.
+ if (!(o instanceof String)) {
+ return super.contains(o);
+ }
+ String s = (String) o;
+ for (String string : this) {
+ if (s.equalsIgnoreCase(string)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ }
+
}
\ No newline at end of file
diff --git a/moxy/social-binding/src/main/resources/META-INF/exclude-words.txt b/moxy/social-binding/src/main/resources/META-INF/exclude-words.txt
new file mode 100644
index 0000000..e2af3bf
--- /dev/null
+++ b/moxy/social-binding/src/main/resources/META-INF/exclude-words.txt
@@ -0,0 +1,184 @@
+a
+able
+about
+across
+after
+ain't
+all
+almost
+also
+am
+among
+an
+and
+any
+are
+aren't
+as
+at
+be
+because
+been
+but
+by
+can
+can't
+cannot
+could
+could've
+couldn't
+dear
+did
+didn't
+do
+does
+doesn't
+don't
+either
+else
+ever
+every
+for
+from
+get
+got
+had
+has
+hasn't
+have
+he
+he'd
+he'll
+he's
+her
+hers
+him
+his
+how
+how'd
+how'll
+how's
+however
+i
+i'd
+i'll
+i'm
+i've
+if
+in
+into
+is
+isn't
+it
+it's
+its
+just
+least
+let
+like
+likely
+may
+me
+might
+might've
+mightn't
+most
+must
+must've
+mustn't
+my
+neither
+no
+nor
+not
+of
+off
+often
+on
+only
+or
+other
+our
+own
+rather
+said
+say
+says
+shan't
+she
+she'd
+she'll
+she's
+should
+should've
+shouldn't
+since
+so
+some
+than
+that
+that'll
+that's
+the
+their
+them
+then
+there
+there's
+these
+they
+they'd
+they'll
+they're
+they've
+this
+tis
+to
+too
+twas
+us
+wants
+was
+wasn't
+we
+we'd
+we'll
+we're
+were
+weren't
+what
+what'd
+what's
+when
+when
+when'd
+when'll
+when's
+where
+where'd
+where'll
+where's
+which
+while
+who
+who'd
+who'll
+who's
+whom
+why
+why'd
+why'll
+why's
+will
+with
+won't
+would
+would've
+wouldn't
+yet
+you
+you'd
+you'll
+you're
+you've
+your
\ No newline at end of file