blob: b90bc3faed51ec09c0b70c5a6df332113b5c22e9 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2000, 2016 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* IBM Corporation - initial API and implementation
* Chris Torrence - patch for bug Bug 107648
* Sopot Cela - Bug 466829
*******************************************************************************/
package org.eclipse.help.internal.search;
import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.eclipse.help.internal.base.*;
/**
* Build query acceptable by the search engine.
*/
public class QueryBuilder {
// Maximum allowed number of terms
private static final int MAX_TERMS = 10;
// Maximum allowed number of ORs
private static final int MAX_UNIONS = 4;
// Maximum allowed number terms with wild cards
private static final int MAX_WILD_TERMS = 2;
// Query from user
private String searchWords;
// Descriptor of Analyzer to process the query words
private AnalyzerDescriptor analyzerDesc;
// Analyzer to process the query words
private Analyzer analyzer;
// List of QueryWordsToken
private List<QueryWordsToken> analyzedTokens;
// List of words to highlight
private List<String> highlightWords = new ArrayList<>();
private Locale locale;
/**
* Creates a query builder for the search word. The search word is processed
* by a lexical analyzer.
*/
public QueryBuilder(String searchWords, AnalyzerDescriptor analyzerDesc) {
this.searchWords = searchWords;
String language = analyzerDesc.getLang();
if (language.length() >= 5) {
this.locale = new Locale(language.substring(0, 2), language
.substring(3, 5));
} else {
this.locale = new Locale(language.substring(0, 2), ""); //$NON-NLS-1$
}
this.analyzerDesc = analyzerDesc;
this.analyzer = analyzerDesc.getAnalyzer();
}
/**
* Splits user query into tokens and returns a list of QueryWordsToken's.
*/
private List<QueryWordsToken> tokenizeUserQuery(String searchWords) {
List<QueryWordsToken> tokenList = new ArrayList<>();
//Divide along quotation marks
//StringTokenizer qTokenizer = new StringTokenizer(searchWords.trim(),
// "\"", true); //$NON-NLS-1$
boolean withinQuotation = false;
String quotedString = ""; //$NON-NLS-1$
int termCount = 0;// keep track of number of terms to disallow too many
int fromIndex = -1;
searchWords = searchWords.trim();
while((fromIndex = searchWords.indexOf("\"", fromIndex+1))!= -1){ //$NON-NLS-1$
withinQuotation = !withinQuotation;
}
if( withinQuotation ) {
searchWords = searchWords + "\""; //$NON-NLS-1$
withinQuotation = !withinQuotation;
}
StringTokenizer qTokenizer = new StringTokenizer(searchWords,"\"",true); //$NON-NLS-1$
int orCount = 0; // keep track of number of ORs to disallow too many
while (qTokenizer.hasMoreTokens()) {
String curToken = qTokenizer.nextToken();
if (curToken.equals("\"")) { //$NON-NLS-1$
if (withinQuotation) {
// check for too many terms
if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
&& ++termCount > MAX_TERMS) {
throw new QueryTooComplexException();
}
tokenList.add(QueryWordsToken.exactPhrase(quotedString));
} else {
quotedString = ""; //$NON-NLS-1$
}
withinQuotation = !withinQuotation;
continue;
} else if (withinQuotation) {
quotedString = curToken;
continue;
} else {
//divide unquoted strings along white space
StringTokenizer parser = new StringTokenizer(curToken.trim());
while (parser.hasMoreTokens()) {
String token = parser.nextToken();
if (token.equalsIgnoreCase(QueryWordsToken.AND().value)) {
tokenList.add(QueryWordsToken.AND());
} else if (token
.equalsIgnoreCase(QueryWordsToken.OR().value)) {
// Check for too many OR terms
if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
&& ++orCount > MAX_UNIONS) {
throw new QueryTooComplexException();
}
tokenList.add(QueryWordsToken.OR());
} else if (token
.equalsIgnoreCase(QueryWordsToken.NOT().value)) {
tokenList.add(QueryWordsToken.NOT());
} else {
// check for too many terms
if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
&& ++termCount > MAX_TERMS) {
throw new QueryTooComplexException();
}
tokenList.add(QueryWordsToken.word(token));
}
}
}
}
return tokenList;
}
/**
* Apply the Analyzer to the search tokens and return the list of processed
* QueryWordsToken's.
*/
private List<QueryWordsToken> analyzeTokens(List<QueryWordsToken> tokens) {
boolean isTokenAfterNot = false;
List<QueryWordsToken> newTokens = new ArrayList<>();
int wildCardTermCount = 0;
for (int i = 0; i < tokens.size(); i++) {
QueryWordsToken token = tokens.get(i);
if (token.type == QueryWordsToken.WORD) {
int questionMIndex = token.value.indexOf('?');
int starIndex = token.value.indexOf('*');
if (starIndex >= 0 || questionMIndex >= 0) {
if (BaseHelpSystem.getMode() == BaseHelpSystem.MODE_INFOCENTER
&& ++wildCardTermCount > MAX_WILD_TERMS) {
throw new QueryTooComplexException();
}
newTokens.add(QueryWordsToken.word(token.value
.toLowerCase(locale)));
// add word to the list of words to highlight
if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
highlightWords.add(token.value);
}
} else {
List<String> wordList = analyzeText(analyzer, "contents", //$NON-NLS-1$
token.value);
if (wordList.size() > 0) {
if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
// add original word to the list of words to
// highlight
highlightWords.add(token.value);
}
if (wordList.size() == 1) {
String word = wordList.get(0);
newTokens.add(QueryWordsToken.word(word));
// add analyzed word to the list of words to
// highlight
// this is required to highlight stemmed words
if (!isTokenAfterNot && !highlightWords.contains(word)) {
highlightWords.add(word);
}
} else {
QueryWordsPhrase phrase = QueryWordsToken.phrase();
for (Iterator<String> it = wordList.iterator(); it
.hasNext();) {
String word = it.next();
phrase.addWord(word);
// add each analyzed word to the list of words
// to highlight
// this is only required to highlight stemmed
// words.
// Adding words should not be done when
// DefaultAnalyzer is used,
// because it does not perform stemming and
// common words removal
// which would result in common characters
// highlighted all over (bug 30263)
if (!analyzerDesc.getId().startsWith(
HelpBasePlugin.PLUGIN_ID + "#")) { //$NON-NLS-1$
if (!isTokenAfterNot && !highlightWords.contains(word)) {
highlightWords.add(word);
}
}
}
newTokens.add(phrase);
}
}
}
} else if (// forget ANDs
/*
* token.type == SearchQueryToken.AND ||
*/
token.type == QueryWordsToken.OR
|| token.type == QueryWordsToken.NOT)
newTokens.add(token);
else if (token.type == QueryWordsToken.EXACT_PHRASE) {
List<String> wordList = analyzeText(analyzer, "exact_contents", //$NON-NLS-1$
token.value);
if (wordList.size() > 0) {
if (!isTokenAfterNot && !highlightWords.contains(token.value)) {
// add original word to the list of words to highlight
highlightWords.add(token.value);
}
}
QueryWordsExactPhrase phrase = QueryWordsToken.exactPhrase();
for (Iterator<String> it = wordList.iterator(); it.hasNext();) {
String word = it.next();
phrase.addWord(word);
// add analyzed word to the list of words to highlight
if (!highlightWords.contains(word))
highlightWords.add(word);
}
// add phrase only if not empty
if (phrase.getWords().size() > 0) {
newTokens.add(phrase);
}
}
isTokenAfterNot = (token.type == QueryWordsToken.NOT);
}
return newTokens;
}
/**
* Get a list of tokens corresponding to a search word or phrase
*
* @return List of String
*/
private List<String> analyzeText(Analyzer analyzer, String fieldName, String text) {
List<String> words = new ArrayList<>(1);
try (Reader reader = new StringReader(text); TokenStream tStream = analyzer.tokenStream(fieldName, reader)) {
tStream.reset();
CharTermAttribute termAttribute = tStream.getAttribute(CharTermAttribute.class);
while (tStream.incrementToken()) {
String term = termAttribute.toString();
words.add(term);
}
} catch (IOException ioe) {
}
return words;
}
/**
* Obtains Lucene Query from tokens
*
* @return Query or null if no query could be created
*/
private Query createLuceneQuery(List<QueryWordsToken> searchTokens, String[] fieldNames,
float[] boosts) {
// Get queries for parts separated by OR
List<Query> requiredQueries = getRequiredQueries(searchTokens, fieldNames,
boosts);
if (requiredQueries.size() == 0)
return null;
else if (requiredQueries.size() <= 1)
return requiredQueries.get(0);
else
/* if (requiredQueries.size() > 1) */
// OR queries
return (orQueries(requiredQueries));
}
/**
* Obtains Lucene queries for token sequences separated at OR.
*
* @return List of Query (could be empty)
*/
private List<Query> getRequiredQueries(List<QueryWordsToken> tokens, String[] fieldNames,
float[] boosts) {
List<Query> oredQueries = new ArrayList<>();
ArrayList<QueryWordsToken> requiredQueryTokens = new ArrayList<>();
for (int i = 0; i < tokens.size(); i++) {
QueryWordsToken token = tokens.get(i);
if (token.type != QueryWordsToken.OR) {
requiredQueryTokens.add(token);
} else {
Query reqQuery = getRequiredQuery(requiredQueryTokens,
fieldNames, boosts);
if (reqQuery != null)
oredQueries.add(reqQuery);
requiredQueryTokens = new ArrayList<>();
}
}
Query reqQuery = getRequiredQuery(requiredQueryTokens, fieldNames,
boosts);
if (reqQuery != null)
oredQueries.add(reqQuery);
return oredQueries;
}
private Query orQueries(Collection<Query> queries) {
Builder builder = new BooleanQuery.Builder();
for (Iterator<Query> it = queries.iterator(); it.hasNext();) {
Query q = it.next();
builder.add(q, BooleanClause.Occur.SHOULD);
}
return builder.build();
}
/**
* Obtains Lucene Query for tokens containing only AND and NOT operators.
*
* @return BooleanQuery or null if no query could be created from the tokens
*/
private Query getRequiredQuery(List<QueryWordsToken> requiredTokens, String[] fieldNames,
float[] boosts) {
Builder retQueryBuilder = new BooleanQuery.Builder();
boolean requiredTermExist = false;
// Parse tokens left to right
QueryWordsToken operator = null;
for (int i = 0; i < requiredTokens.size(); i++) {
QueryWordsToken token = requiredTokens.get(i);
if (token.type == QueryWordsToken.AND
|| token.type == QueryWordsToken.NOT) {
operator = token;
continue;
}
// Creates queries for all fields
Query qs[] = new Query[fieldNames.length];
for (int f = 0; f < fieldNames.length; f++) {
qs[f] = token.createLuceneQuery(fieldNames[f], boosts[f]);
}
// creates the boolean query of all fields
Query q = qs[0];
if (fieldNames.length > 1) {
Builder allFieldsQueryBuilder = new BooleanQuery.Builder();
for (int f = 0; f < fieldNames.length; f++)
allFieldsQueryBuilder.add(qs[f], BooleanClause.Occur.SHOULD);
q = allFieldsQueryBuilder.build();
}
if (operator != null && operator.type == QueryWordsToken.NOT) {
retQueryBuilder.add(q, BooleanClause.Occur.MUST_NOT); // prohibited
} else {
retQueryBuilder.add(q, BooleanClause.Occur.MUST); // required
requiredTermExist = true;
}
}
if (!requiredTermExist) {
return null; // cannot search for prohibited only
}
return retQueryBuilder.build();
}
private Query getLuceneQuery(String[] fieldNames, float[] boosts) {
Query luceneQuery = createLuceneQuery(analyzedTokens, fieldNames,
boosts);
return luceneQuery;
}
/**
* @param fieldNames -
* Collection of field names of type String (e.g. "h1"); the
* search will be performed on the given fields
* @param fieldSearchOnly -
* boolean indicating if field only search should be performed;
* if set to false, default field "contents" and all other fields
* will be searched
*/
public Query getLuceneQuery(Collection<String> fieldNames, boolean fieldSearchOnly)
throws QueryTooComplexException {
// split search query into tokens
List<QueryWordsToken> userTokens = tokenizeUserQuery(searchWords);
analyzedTokens = analyzeTokens(userTokens);
return buildLuceneQuery(fieldNames, fieldSearchOnly);
}
/**
* @param fieldNames -
* Collection of field names of type String (e.g. "h1"); the
* search will be performed on the given fields
* @param fieldSearchOnly -
* boolean indicating if field only search should be performed;
* if set to false, default field "contents" and all other fields
* will be searched
*/
private Query buildLuceneQuery(Collection<String> fieldNames,
boolean fieldSearchOnly) {
String[] fields;
float[] boosts;
if (fieldSearchOnly) {
fields = new String[fieldNames.size()];
boosts = new float[fieldNames.size()];
Iterator<String> fieldNamesIt = fieldNames.iterator();
for (int i = 0; i < fieldNames.size(); i++) {
fields[i] = fieldNamesIt.next();
boosts[i] = 5.0f;
}
} else {
fields = new String[fieldNames.size() + 2];
boosts = new float[fieldNames.size() + 2];
Iterator<String> fieldNamesIt = fieldNames.iterator();
for (int i = 0; i < fieldNames.size(); i++) {
fields[i] = fieldNamesIt.next();
boosts[i] = 5.0f;
}
fields[fieldNames.size()] = "contents"; //$NON-NLS-1$
boosts[fieldNames.size()] = 1.0f;
fields[fieldNames.size()+1] = "title"; //$NON-NLS-1$
boosts[fieldNames.size()+1] = 1.0f;
}
Query query = getLuceneQuery(fields, boosts);
query = improveRankingForUnqotedPhrase(query, fields, boosts);
return query;
}
/**
* If user query contained only words (no quotaions nor operators) extends
* query with term phrase representing entire user query i.e for user string
* a b, the query a AND b will be extended to "a b" OR a AND b
*/
private Query improveRankingForUnqotedPhrase(Query query, String[] fields,
float[] boosts) {
if (query == null)
return query;
// check if all tokens are words
for (int i = 0; i < analyzedTokens.size(); i++)
if (analyzedTokens.get(i).type != QueryWordsToken.WORD)
return query;
// Create phrase query for all tokens and OR with original query
Builder booleanQueryBuilder = new BooleanQuery.Builder();
booleanQueryBuilder.add(query, BooleanClause.Occur.SHOULD);
PhraseQuery.Builder[] phraseQueriesBuilders = new PhraseQuery.Builder[fields.length];
for (int f = 0; f < fields.length; f++) {
phraseQueriesBuilders[f] = new PhraseQuery.Builder();
for (int i = 0; i < analyzedTokens.size(); i++) {
Term t = new Term(fields[f], analyzedTokens
.get(i).value);
phraseQueriesBuilders[f].add(t);
}
Query boostQuery = new BoostQuery(phraseQueriesBuilders[f].build(), 10 * boosts[f]);
booleanQueryBuilder.add(boostQuery, BooleanClause.Occur.SHOULD);
}
return booleanQueryBuilder.build();
}
/**
* Obtains analyzed terms from query as one string. Words are double quoted,
* and separated by space. The analyzed words are needed for highlighting
* word roots.
*/
public String gethighlightTerms() {
StringBuilder buf = new StringBuilder();
for (Iterator<String> it = highlightWords.iterator(); it.hasNext();) {
buf.append('"');
buf.append(it.next());
buf.append("\" "); //$NON-NLS-1$
}
return buf.toString();
}
}