blob: f15df0d72a4b4cdf4e4f04efeaa3fb1ed8b11981 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved.
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License v1.0 and Eclipse Distribution License v. 1.0
* which accompanies this distribution.
*
* The Eclipse Public License is available at http://www.eclipse.org/legal/epl-v10.html
* and the Eclipse Distribution License is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
******************************************************************************/
package eclipselink.example.moxy.socialbinding.util;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.StringTokenizer;
/**
* Simple utility class to extract and prioritize a set of keywords based on the
* title provided.
*
* @author rbarkhous
* @since EclipseLink 2.4.2
*/
public class KeywordExtractor {
/**
* Return the longest word in the title (preferably a capitalized word).
*/
public static String extractKeywords(String postTitle) {
ArrayList<String> allWords = new WordList();
ArrayList<String> upperCaseWords = new WordList();
ArrayList<String> excludeWords = buildExlucdeWordsList();
StringTokenizer tokenizer = new StringTokenizer(postTitle, ",.!?():;-[]'\" \t\n\r\f/");
while (tokenizer.hasMoreElements()) {
String word = tokenizer.nextToken();
if (!excludeWords.contains(word)) {
allWords.add(word);
if (Character.isUpperCase(word.toCharArray()[0])) {
upperCaseWords.add(word);
}
}
}
StringLengthComparator comparator = new StringLengthComparator();
Collections.sort(allWords, comparator);
Collections.sort(upperCaseWords, comparator);
if (upperCaseWords.size() > 1) {
return upperCaseWords.get(0);
} else {
return allWords.get(0);
}
}
private static ArrayList<String> buildExlucdeWordsList() {
ArrayList<String> excludeWords = new WordList();
try {
ClassLoader cl = Thread.currentThread().getContextClassLoader();
InputStream is = cl.getResourceAsStream("META-INF/exclude-words.txt");
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
String line;
while ((line = br.readLine()) != null) {
excludeWords.add(line);
}
br.close();
} catch (Exception e) {
e.printStackTrace();
}
return excludeWords;
}
private static class StringLengthComparator implements Comparator<String> {
public int compare(String o1, String o2) {
if (o1.length() > o2.length()) {
return -1;
} else if (o1.length() < o2.length()) {
return 1;
} else {
return 0;
}
}
}
private static class WordList extends ArrayList<String> {
private static final long serialVersionUID = 4780991427891054829L;
@Override
public boolean contains(Object o) {
String s = (String) o;
for (String string : this) {
if (s.equalsIgnoreCase(string)) return true;
}
return false;
}
}
}