blob: 6948153ba0eb58f87b6c6674237d7323a4fdc187 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2000, 2019 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* IBM Corporation - initial API and implementation
* Alexander Kurtakov - Bug 460787
* Sopot Cela - Bug 466829
*******************************************************************************/
package org.eclipse.help.internal.search;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
/**
 * Lucene Analyzer for English.
 * CharAndDigitsTokenizer->LowerCaseFilter->StopFilter->PorterStemFilter
 * <p>
 * Lower-casing happens before stop-word removal on purpose: the stop-word set
 * contains only lower-case entries and is matched case-sensitively, so a token
 * such as "The" must be folded to "the" before the {@link StopFilter} sees it.
 */
public final class Analyzer_en extends Analyzer {
	/**
	 * Array of English stop words. Differs from StandardAnalyzer's default stop
	 * words by not having "for", "if", and "this" that are java keywords.
	 */
	private final static String[] STOP_WORDS = {"a", //$NON-NLS-1$
			"and", //$NON-NLS-1$
			"are", //$NON-NLS-1$
			"as", //$NON-NLS-1$
			"at", //$NON-NLS-1$
			"be", //$NON-NLS-1$
			"but", //$NON-NLS-1$
			"by", //$NON-NLS-1$
			"in", //$NON-NLS-1$
			"into", //$NON-NLS-1$
			"is", //$NON-NLS-1$
			"it", //$NON-NLS-1$
			"no", //$NON-NLS-1$
			"not", //$NON-NLS-1$
			"of", //$NON-NLS-1$
			"on", //$NON-NLS-1$
			"or", //$NON-NLS-1$
			"s", //$NON-NLS-1$
			"such", //$NON-NLS-1$
			"t", //$NON-NLS-1$
			"that", //$NON-NLS-1$
			"the", //$NON-NLS-1$
			"their", //$NON-NLS-1$
			"then", //$NON-NLS-1$
			"there", //$NON-NLS-1$
			"these", //$NON-NLS-1$
			"they", //$NON-NLS-1$
			"to", //$NON-NLS-1$
			"was", //$NON-NLS-1$
			"will", //$NON-NLS-1$
			"with"}; //$NON-NLS-1$

	/**
	 * Immutable, case-sensitive stop-word set shared by every token stream this
	 * analyzer creates. It is built exactly once during class initialisation
	 * (and must stay declared after {@link #STOP_WORDS}), so no thread can ever
	 * observe a partially populated set.
	 */
	private static final CharArraySet STOP_WORD_SET = createStopWordSet();

	/**
	 * Constructor for Analyzer_en.
	 */
	public Analyzer_en() {
		super();
	}

	/**
	 * Builds the analysis chain for a field: tokenize, lower-case, drop stop
	 * words, then apply Porter stemming.
	 * <p>
	 * Can't use try-with-resources because the Lucene internally reuses
	 * components. See {@link org.apache.lucene.analysis.Analyzer.ReuseStrategy}
	 *
	 * @param fieldName
	 *            name of the field being analyzed; not used, every field gets
	 *            the same chain
	 * @return the tokenizer together with the end of the filter chain
	 */
	@SuppressWarnings("resource")
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		final Tokenizer source = new CharAndDigitsTokenizer();
		// Fold case first: STOP_WORD_SET is lower-case only and case-sensitive,
		// so filtering before lower-casing would let "The", "AND", ... through.
		TokenStream result = new LowerCaseFilter(source);
		result = new StopFilter(result, STOP_WORD_SET);
		result = new PorterStemFilter(result);
		return new TokenStreamComponents(source, result);
	}

	/**
	 * Copies {@link #STOP_WORDS} into an unmodifiable Lucene set.
	 *
	 * @return read-only, case-sensitive set holding every stop word
	 */
	private static CharArraySet createStopWordSet() {
		final Set<String> words = new HashSet<>();
		for (String word : STOP_WORDS) {
			words.add(word);
		}
		return CharArraySet.unmodifiableSet(new CharArraySet(words, false));
	}
}