org.eclipse.help.base/src/org/eclipse/help/internal/search/Analyzer_en.java - platform/eclipse.platform.ua - Git at Google

 /*******************************************************************************
  * Copyright (c) 2000, 2019 IBM Corporation and others.
  *
  * This program and the accompanying materials
  * are made available under the terms of the Eclipse Public License 2.0
  * which accompanies this distribution, and is available at
  * https://www.eclipse.org/legal/epl-2.0/
  *
  * SPDX-License-Identifier: EPL-2.0
  *
  * Contributors:
  *     IBM Corporation - initial API and implementation
  *     Alexander Kurtakov - Bug 460787
  *     Sopot Cela - Bug 466829
  *******************************************************************************/
 package org.eclipse.help.internal.search;

 import java.util.HashSet;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;

 /**
  * Lucene Analyzer for English.
  * CharAndDigitsTokenizer->LowerCaseFilter->StopFilter->PorterStemFilter
  * <p>
  * Tokens are lower-cased <em>before</em> stop-word removal: the stop-word set
  * holds lower-case entries and is matched case-sensitively, so filtering first
  * would let capitalized stop words (for example "The") through.
  */
 public final class Analyzer_en extends Analyzer {
 	/**
 	 * Constructor for Analyzer_en.
 	 */
 	public Analyzer_en() {
 		super();
 	}

 	/*
 	 * Can't use try-with-resources because Lucene internally reuses the
 	 * components. See {@link org.apache.lucene.analysis.Analyzer.ReuseStrategy}
 	 */
 	/**
 	 * Builds the analysis chain for a field.
 	 *
 	 * @param fieldName
 	 *            name of the field being analyzed; not used, every field gets
 	 *            the same chain
 	 * @return the tokenizer together with the lower-casing, stop-word and
 	 *         stemming filters stacked on top of it
 	 */
 	@SuppressWarnings("resource")
 	@Override
 	protected TokenStreamComponents createComponents(String fieldName) {
 		final Tokenizer source = new CharAndDigitsTokenizer();
 		// Normalize case first so that the case-sensitive stop set also
 		// catches capitalized stop words.
 		TokenStream result = new LowerCaseFilter(source);
 		result = new StopFilter(result, STOP_WORD_SET);
 		result = new PorterStemFilter(result);
 		return new TokenStreamComponents(source, result);
 	}

 	/**
 	 * Copies {@link #STOP_WORDS} into an unmodifiable, case-sensitive
 	 * {@link CharArraySet}.
 	 *
 	 * @return the stop-word set handed to every {@link StopFilter}
 	 */
 	private static CharArraySet createStopWordSet() {
 		Set<String> words = new HashSet<>();
 		for (String word : STOP_WORDS) {
 			words.add(word);
 		}
 		return CharArraySet.unmodifiableSet(new CharArraySet(words, false));
 	}

 	/**
 	 * Array of English stop words. Differs from StandardAnalyzer's default stop
 	 * words by not having "for", "if", and "this" that are java keywords.
 	 */
 	private final static String[] STOP_WORDS = {"a", //$NON-NLS-1$
 			"and", //$NON-NLS-1$
 			"are", //$NON-NLS-1$
 			"as", //$NON-NLS-1$
 			"at", //$NON-NLS-1$
 			"be", //$NON-NLS-1$
 			"but", //$NON-NLS-1$
 			"by", //$NON-NLS-1$
 			"in", //$NON-NLS-1$
 			"into", //$NON-NLS-1$
 			"is", //$NON-NLS-1$
 			"it", //$NON-NLS-1$
 			"no", //$NON-NLS-1$
 			"not", //$NON-NLS-1$
 			"of", //$NON-NLS-1$
 			"on", //$NON-NLS-1$
 			"or", //$NON-NLS-1$
 			"s", //$NON-NLS-1$
 			"such", //$NON-NLS-1$
 			"t", //$NON-NLS-1$
 			"that", //$NON-NLS-1$
 			"the", //$NON-NLS-1$
 			"their", //$NON-NLS-1$
 			"then", //$NON-NLS-1$
 			"there", //$NON-NLS-1$
 			"these", //$NON-NLS-1$
 			"they", //$NON-NLS-1$
 			"to", //$NON-NLS-1$
 			"was", //$NON-NLS-1$
 			"will", //$NON-NLS-1$
 			"with"}; //$NON-NLS-1$

 	/**
 	 * Stop words as an immutable set, built once during class initialization
 	 * instead of lazily per instance, so no half-populated set can ever be
 	 * observed. Must be declared after {@link #STOP_WORDS} because static
 	 * initializers run in textual order.
 	 */
 	private static final CharArraySet STOP_WORD_SET = createStopWordSet();

 }
	/*******************************************************************************
	* Copyright (c) 2000, 2019 IBM Corporation and others.
	*
	* This program and the accompanying materials
	* are made available under the terms of the Eclipse Public License 2.0
	* which accompanies this distribution, and is available at
	* https://www.eclipse.org/legal/epl-2.0/
	*
	* SPDX-License-Identifier: EPL-2.0
	*
	* Contributors:
	* IBM Corporation - initial API and implementation
	* Alexander Kurtakov - Bug 460787
	* Sopot Cela - Bug 466829
	*******************************************************************************/
	package org.eclipse.help.internal.search;

	import java.util.HashSet;
	import java.util.Set;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.analysis.StopFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.core.LowerCaseFilter;
	import org.apache.lucene.analysis.en.PorterStemFilter;

	/**
	 * Lucene Analyzer for English.
	 * CharAndDigitsTokenizer->LowerCaseFilter->StopFilter->PorterStemFilter
	 * <p>
	 * Tokens are lower-cased <em>before</em> stop-word removal: the stop-word set
	 * holds lower-case entries and is matched case-sensitively, so filtering first
	 * would let capitalized stop words (for example "The") through.
	 */
	public final class Analyzer_en extends Analyzer {
		/**
		 * Constructor for Analyzer_en.
		 */
		public Analyzer_en() {
			super();
		}

		/*
		 * Can't use try-with-resources because Lucene internally reuses the
		 * components. See {@link org.apache.lucene.analysis.Analyzer.ReuseStrategy}
		 */
		/**
		 * Builds the analysis chain for a field.
		 *
		 * @param fieldName
		 *            name of the field being analyzed; not used, every field gets
		 *            the same chain
		 * @return the tokenizer together with the lower-casing, stop-word and
		 *         stemming filters stacked on top of it
		 */
		@SuppressWarnings("resource")
		@Override
		protected TokenStreamComponents createComponents(String fieldName) {
			final Tokenizer source = new CharAndDigitsTokenizer();
			// Normalize case first so that the case-sensitive stop set also
			// catches capitalized stop words.
			TokenStream result = new LowerCaseFilter(source);
			result = new StopFilter(result, STOP_WORD_SET);
			result = new PorterStemFilter(result);
			return new TokenStreamComponents(source, result);
		}

		/**
		 * Copies {@link #STOP_WORDS} into an unmodifiable, case-sensitive
		 * {@link CharArraySet}.
		 *
		 * @return the stop-word set handed to every {@link StopFilter}
		 */
		private static CharArraySet createStopWordSet() {
			Set<String> words = new HashSet<>();
			for (String word : STOP_WORDS) {
				words.add(word);
			}
			return CharArraySet.unmodifiableSet(new CharArraySet(words, false));
		}

		/**
		 * Array of English stop words. Differs from StandardAnalyzer's default stop
		 * words by not having "for", "if", and "this" that are java keywords.
		 */
		private final static String[] STOP_WORDS = {"a", //$NON-NLS-1$
				"and", //$NON-NLS-1$
				"are", //$NON-NLS-1$
				"as", //$NON-NLS-1$
				"at", //$NON-NLS-1$
				"be", //$NON-NLS-1$
				"but", //$NON-NLS-1$
				"by", //$NON-NLS-1$
				"in", //$NON-NLS-1$
				"into", //$NON-NLS-1$
				"is", //$NON-NLS-1$
				"it", //$NON-NLS-1$
				"no", //$NON-NLS-1$
				"not", //$NON-NLS-1$
				"of", //$NON-NLS-1$
				"on", //$NON-NLS-1$
				"or", //$NON-NLS-1$
				"s", //$NON-NLS-1$
				"such", //$NON-NLS-1$
				"t", //$NON-NLS-1$
				"that", //$NON-NLS-1$
				"the", //$NON-NLS-1$
				"their", //$NON-NLS-1$
				"then", //$NON-NLS-1$
				"there", //$NON-NLS-1$
				"these", //$NON-NLS-1$
				"they", //$NON-NLS-1$
				"to", //$NON-NLS-1$
				"was", //$NON-NLS-1$
				"will", //$NON-NLS-1$
				"with"}; //$NON-NLS-1$

		/**
		 * Stop words as an immutable set, built once during class initialization
		 * instead of lazily per instance, so no half-populated set can ever be
		 * observed. Must be declared after {@link #STOP_WORDS} because static
		 * initializers run in textual order.
		 */
		private static final CharArraySet STOP_WORD_SET = createStopWordSet();

	}