| package org.apache.lucene.search.similarities; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.index.AtomicReader; // javadoc |
| import org.apache.lucene.index.AtomicReaderContext; |
| import org.apache.lucene.index.FieldInvertState; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.CollectionStatistics; |
| import org.apache.lucene.search.Explanation; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.TermStatistics; |
| import org.apache.lucene.search.spans.SpanQuery; // javadoc |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.SmallFloat; // javadoc |
| |
| /** |
| * Similarity defines the components of Lucene scoring. |
| * <p> |
| * Expert: Scoring API. |
| * <p> |
| * This is a low-level API, you should only extend this API if you want to implement |
| * an information retrieval <i>model</i>. If you are instead looking for a convenient way |
| * to alter Lucene's scoring, consider extending a higher-level implementation |
| * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or |
| * just tweaking the default implementation: {@link DefaultSimilarity}. |
| * <p> |
| * Similarity determines how Lucene weights terms, and Lucene interacts with |
| * this class at both <a href="#indextime">index-time</a> and |
| * <a href="#querytime">query-time</a>. |
| * <p> |
| * <a name="indextime"/> |
| * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing |
| * the Similarity implementation to set a per-document value for the field that will |
| * be later accessible via {@link AtomicReader#getNormValues(String)}. Lucene makes no assumption |
| * about what is in this norm, but it is most useful for encoding length normalization |
| * information. |
| * <p> |
| * Implementations should carefully consider how the normalization is encoded: while |
| * Lucene's classical {@link TFIDFSimilarity} encodes a combination of index-time boost |
| * and length normalization information with {@link SmallFloat} into a single byte, this |
| * might not be suitable for all purposes. |
| * <p> |
| * Many formulas require the use of average document length, which can be computed via a |
| * combination of {@link CollectionStatistics#sumTotalTermFreq()} and |
| * {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()}, |
| * depending upon whether the average should reflect field sparsity. |
| * <p> |
| * Additional scoring factors can be stored in named |
| * <code>NumericDocValuesField</code>s and accessed |
| * at query-time with {@link AtomicReader#getNumericDocValues(String)}. |
| * <p> |
| * Finally, using index-time boosts (either via folding into the normalization byte or |
| * via DocValues) is an inefficient way to boost the scores of different fields if the |
| * boost will be the same for every document. Instead, the Similarity can simply take a constant |
| * boost parameter <i>C</i>, and {@link PerFieldSimilarityWrapper} can return different |
| * instances with different boosts depending upon field name. |
| * <p> |
| * <a name="querytime"/> |
| * At query-time, Queries interact with the Similarity via these steps: |
| * <ol> |
| * <li>The {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} method is called a single time, |
| * allowing the implementation to compute any statistics (such as IDF, average document length, etc) |
| * across <i>the entire collection</i>. The {@link TermStatistics} and {@link CollectionStatistics} passed in |
| * already contain all of the raw statistics involved, so a Similarity can freely use any combination |
| * of statistics without causing any additional I/O. Lucene makes no assumption about what is |
| * stored in the returned {@link Similarity.SimWeight} object. |
| * <li>The query normalization process occurs a single time: {@link Similarity.SimWeight#getValueForNormalization()} |
| * is called for each query leaf node, {@link Similarity#queryNorm(float)} is called for the top-level |
| * query, and finally {@link Similarity.SimWeight#normalize(float, float)} passes down the normalization value |
| * and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s). |
| * <li>For each segment in the index, the Query creates a {@link Similarity.SimScorer} via |
| * {@link #simScorer(SimWeight, AtomicReaderContext)}. The score() method is called for each matching document. |
| * </ol> |
| * <p> |
| * <a name="explaintime"/> |
| * When {@link IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the Similarity's |
| * {@link Similarity.SimScorer} for an explanation of how it computed its score. The query passes in the document id |
| * and an explanation of how the frequency was computed. |
| * |
| * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity) |
| * @see IndexSearcher#setSimilarity(Similarity) |
| * @lucene.experimental |
| */ |
| public abstract class Similarity { |
| |
| /** |
| * Sole constructor. (For invocation by subclass |
| * constructors, typically implicit.) |
| */ |
| public Similarity() {} |
| |
| /** Hook to integrate coordinate-level matching. |
| * <p> |
| * By default this is disabled (returns <code>1</code>), as with |
| * most modern models this will only skew performance, but some |
| * implementations such as {@link TFIDFSimilarity} override this. |
| * |
| * @param overlap the number of query terms matched in the document |
| * @param maxOverlap the total number of terms in the query |
| * @return a score factor based on term overlap with the query |
| */ |
| public float coord(int overlap, int maxOverlap) { |
| return 1f; |
| } |
| |
| /** Computes the normalization value for a query given the sum of the |
| * normalized weights {@link SimWeight#getValueForNormalization()} of |
| * each of the query terms. This value is passed back to the |
| * weight ({@link SimWeight#normalize(float, float)} of each query |
| * term, to provide a hook to attempt to make scores from different |
| * queries comparable. |
| * <p> |
| * By default this is disabled (returns <code>1</code>), but some |
| * implementations such as {@link TFIDFSimilarity} override this. |
| * |
| * @param valueForNormalization the sum of the term normalization values |
| * @return a normalization factor for query weights |
| */ |
| public float queryNorm(float valueForNormalization) { |
| return 1f; |
| } |
| |
| /** |
| * Computes the normalization value for a field, given the accumulated |
| * state of term processing for this field (see {@link FieldInvertState}). |
| * |
| * <p>Matches in longer fields are less precise, so implementations of this |
| * method usually set smaller values when <code>state.getLength()</code> is large, |
| * and larger values when <code>state.getLength()</code> is small. |
| * |
| * @lucene.experimental |
| * |
| * @param state current processing state for this field |
| * @return computed norm value |
| */ |
| public abstract long computeNorm(FieldInvertState state); |
| |
| /** |
| * Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring a query. |
| * |
| * @param queryBoost the query-time boost. |
| * @param collectionStats collection-level statistics, such as the number of tokens in the collection. |
| * @param termStats term-level statistics, such as the document frequency of a term across the collection. |
| * @return SimWeight object with the information this Similarity needs to score a query. |
| */ |
| public abstract SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats); |
| |
| /** |
| * Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index. |
| * @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} |
| * @param context segment of the inverted index to be scored. |
| * @return SloppySimScorer for scoring documents across <code>context</code> |
| * @throws IOException if there is a low-level I/O error |
| */ |
| public abstract SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException; |
| |
| /** |
| * API for scoring "sloppy" queries such as {@link TermQuery}, |
| * {@link SpanQuery}, and {@link PhraseQuery}. |
| * <p> |
| * Frequencies are floating-point values: an approximate |
| * within-document frequency adjusted for "sloppiness" by |
| * {@link SimScorer#computeSlopFactor(int)}. |
| */ |
| public static abstract class SimScorer { |
| |
| /** |
| * Sole constructor. (For invocation by subclass |
| * constructors, typically implicit.) |
| */ |
| public SimScorer() {} |
| |
| /** |
| * Score a single document |
| * @param doc document id within the inverted index segment |
| * @param freq sloppy term frequency |
| * @return document's score |
| */ |
| public abstract float score(int doc, float freq); |
| |
| /** Computes the amount of a sloppy phrase match, based on an edit distance. */ |
| public abstract float computeSlopFactor(int distance); |
| |
| /** Calculate a scoring factor based on the data in the payload. */ |
| public abstract float computePayloadFactor(int doc, int start, int end, BytesRef payload); |
| |
| /** |
| * Explain the score for a single document |
| * @param doc document id within the inverted index segment |
| * @param freq Explanation of how the sloppy term frequency was computed |
| * @return document's score |
| */ |
| public Explanation explain(int doc, Explanation freq) { |
| Explanation result = new Explanation(score(doc, freq.getValue()), |
| "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:"); |
| result.addDetail(freq); |
| return result; |
| } |
| } |
| |
| /** Stores the weight for a query across the indexed collection. This abstract |
| * implementation is empty; descendants of {@code Similarity} should |
| * subclass {@code SimWeight} and define the statistics they require in the |
| * subclass. Examples include idf, average field length, etc. |
| */ |
| public static abstract class SimWeight { |
| |
| /** |
| * Sole constructor. (For invocation by subclass |
| * constructors, typically implicit.) |
| */ |
| public SimWeight() {} |
| |
| /** The value for normalization of contained query clauses (e.g. sum of squared weights). |
| * <p> |
| * NOTE: a Similarity implementation might not use any query normalization at all, |
| * its not required. However, if it wants to participate in query normalization, |
| * it can return a value here. |
| */ |
| public abstract float getValueForNormalization(); |
| |
| /** Assigns the query normalization factor and boost from parent queries to this. |
| * <p> |
| * NOTE: a Similarity implementation might not use this normalized value at all, |
| * its not required. However, its usually a good idea to at least incorporate |
| * the topLevelBoost (e.g. from an outer BooleanQuery) into its score. |
| */ |
| public abstract void normalize(float queryNorm, float topLevelBoost); |
| } |
| } |