blob: aea50c6cadd695ec2bd7960ce9fceb13a3b7201a [file] [log] [blame]
package org.apache.lucene.search.spans;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.InPlaceMergeSorter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
/** A Spans that is formed from the ordered subspans of a SpanNearQuery
* where the subspans do not overlap and have a maximum slop between them.
* <p>
* The formed spans only contains minimum slop matches.<br>
* The matching slop is computed from the distance(s) between
* the non overlapping matching Spans.<br>
* Successive matches are always formed from the successive Spans
* of the SpanNearQuery.
* <p>
* The formed spans may contain overlaps when the slop is at least 1.
* For example, when querying using
* <pre>t1 t2 t3</pre>
* with slop at least 1, the fragment:
* <pre>t1 t2 t1 t3 t2 t3</pre>
* matches twice:
* <pre>t1 t2 .. t3 </pre>
* <pre> t1 .. t2 t3</pre>
*
*
* Expert:
* Only public for subclassing. Most implementations should not need this class
*/
public class NearSpansOrdered extends Spans {
private final int allowedSlop;
private boolean firstTime = true;
private boolean more = false;
/** The spans in the same order as the SpanNearQuery */
private final Spans[] subSpans;
/** Indicates that all subSpans have same doc() */
private boolean inSameDoc = false;
private int matchDoc = -1;
private int matchStart = -1;
private int matchEnd = -1;
private List<byte[]> matchPayload;
private final Spans[] subSpansByDoc;
// Even though the array is probably almost sorted, InPlaceMergeSorter will likely
// perform better since it has a lower overhead than TimSorter for small arrays
private final InPlaceMergeSorter sorter = new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
ArrayUtil.swap(subSpansByDoc, i, j);
}
@Override
protected int compare(int i, int j) {
return subSpansByDoc[i].doc() - subSpansByDoc[j].doc();
}
};
private SpanNearQuery query;
private boolean collectPayloads = true;
public NearSpansOrdered(SpanNearQuery spanNearQuery, AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
this(spanNearQuery, context, acceptDocs, termContexts, true);
}
public NearSpansOrdered(SpanNearQuery spanNearQuery, AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts, boolean collectPayloads)
throws IOException {
if (spanNearQuery.getClauses().length < 2) {
throw new IllegalArgumentException("Less than 2 clauses: "
+ spanNearQuery);
}
this.collectPayloads = collectPayloads;
allowedSlop = spanNearQuery.getSlop();
SpanQuery[] clauses = spanNearQuery.getClauses();
subSpans = new Spans[clauses.length];
matchPayload = new LinkedList<>();
subSpansByDoc = new Spans[clauses.length];
for (int i = 0; i < clauses.length; i++) {
subSpans[i] = clauses[i].getSpans(context, acceptDocs, termContexts);
subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
}
query = spanNearQuery; // kept for toString() only.
}
// inherit javadocs
@Override
public int doc() { return matchDoc; }
// inherit javadocs
@Override
public int start() { return matchStart; }
// inherit javadocs
@Override
public int end() { return matchEnd; }
public Spans[] getSubSpans() {
return subSpans;
}
// TODO: Remove warning after API has been finalized
// TODO: Would be nice to be able to lazy load payloads
@Override
public Collection<byte[]> getPayload() throws IOException {
return matchPayload;
}
// TODO: Remove warning after API has been finalized
@Override
public boolean isPayloadAvailable() {
return matchPayload.isEmpty() == false;
}
@Override
public long cost() {
long minCost = Long.MAX_VALUE;
for (int i = 0; i < subSpans.length; i++) {
minCost = Math.min(minCost, subSpans[i].cost());
}
return minCost;
}
// inherit javadocs
@Override
public boolean next() throws IOException {
if (firstTime) {
firstTime = false;
for (int i = 0; i < subSpans.length; i++) {
if (! subSpans[i].next()) {
more = false;
return false;
}
}
more = true;
}
if(collectPayloads) {
matchPayload.clear();
}
return advanceAfterOrdered();
}
// inherit javadocs
@Override
public boolean skipTo(int target) throws IOException {
if (firstTime) {
firstTime = false;
for (int i = 0; i < subSpans.length; i++) {
if (! subSpans[i].skipTo(target)) {
more = false;
return false;
}
}
more = true;
} else if (more && (subSpans[0].doc() < target)) {
if (subSpans[0].skipTo(target)) {
inSameDoc = false;
} else {
more = false;
return false;
}
}
if(collectPayloads) {
matchPayload.clear();
}
return advanceAfterOrdered();
}
/** Advances the subSpans to just after an ordered match with a minimum slop
* that is smaller than the slop allowed by the SpanNearQuery.
* @return true iff there is such a match.
*/
private boolean advanceAfterOrdered() throws IOException {
while (more && (inSameDoc || toSameDoc())) {
if (stretchToOrder() && shrinkToAfterShortestMatch()) {
return true;
}
}
return false; // no more matches
}
/** Advance the subSpans to the same document */
private boolean toSameDoc() throws IOException {
sorter.sort(0, subSpansByDoc.length);
int firstIndex = 0;
int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
while (subSpansByDoc[firstIndex].doc() != maxDoc) {
if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) {
more = false;
inSameDoc = false;
return false;
}
maxDoc = subSpansByDoc[firstIndex].doc();
if (++firstIndex == subSpansByDoc.length) {
firstIndex = 0;
}
}
for (int i = 0; i < subSpansByDoc.length; i++) {
assert (subSpansByDoc[i].doc() == maxDoc)
: " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0]
+ "\n at doc " + subSpansByDoc[i].doc()
+ ", but should be at " + maxDoc;
}
inSameDoc = true;
return true;
}
/** Check whether two Spans in the same document are ordered.
* @return true iff spans1 starts before spans2
* or the spans start at the same position,
* and spans1 ends before spans2.
*/
static final boolean docSpansOrdered(Spans spans1, Spans spans2) {
assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc();
int start1 = spans1.start();
int start2 = spans2.start();
/* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */
return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2);
}
/** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans
* starts and ends as parameters.
*/
private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) {
return (start1 == start2) ? (end1 < end2) : (start1 < start2);
}
/** Order the subSpans within the same document by advancing all later spans
* after the previous one.
*/
private boolean stretchToOrder() throws IOException {
matchDoc = subSpans[0].doc();
for (int i = 1; inSameDoc && (i < subSpans.length); i++) {
while (! docSpansOrdered(subSpans[i-1], subSpans[i])) {
if (! subSpans[i].next()) {
inSameDoc = false;
more = false;
break;
} else if (matchDoc != subSpans[i].doc()) {
inSameDoc = false;
break;
}
}
}
return inSameDoc;
}
/** The subSpans are ordered in the same doc, so there is a possible match.
* Compute the slop while making the match as short as possible by advancing
* all subSpans except the last one in reverse order.
*/
private boolean shrinkToAfterShortestMatch() throws IOException {
matchStart = subSpans[subSpans.length - 1].start();
matchEnd = subSpans[subSpans.length - 1].end();
Set<byte[]> possibleMatchPayloads = new HashSet<>();
if (subSpans[subSpans.length - 1].isPayloadAvailable()) {
possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload());
}
Collection<byte[]> possiblePayload = null;
int matchSlop = 0;
int lastStart = matchStart;
int lastEnd = matchEnd;
for (int i = subSpans.length - 2; i >= 0; i--) {
Spans prevSpans = subSpans[i];
if (collectPayloads && prevSpans.isPayloadAvailable()) {
Collection<byte[]> payload = prevSpans.getPayload();
possiblePayload = new ArrayList<>(payload.size());
possiblePayload.addAll(payload);
}
int prevStart = prevSpans.start();
int prevEnd = prevSpans.end();
while (true) { // Advance prevSpans until after (lastStart, lastEnd)
if (! prevSpans.next()) {
inSameDoc = false;
more = false;
break; // Check remaining subSpans for final match.
} else if (matchDoc != prevSpans.doc()) {
inSameDoc = false; // The last subSpans is not advanced here.
break; // Check remaining subSpans for last match in this document.
} else {
int ppStart = prevSpans.start();
int ppEnd = prevSpans.end(); // Cannot avoid invoking .end()
if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) {
break; // Check remaining subSpans.
} else { // prevSpans still before (lastStart, lastEnd)
prevStart = ppStart;
prevEnd = ppEnd;
if (collectPayloads && prevSpans.isPayloadAvailable()) {
Collection<byte[]> payload = prevSpans.getPayload();
possiblePayload = new ArrayList<>(payload.size());
possiblePayload.addAll(payload);
}
}
}
}
if (collectPayloads && possiblePayload != null) {
possibleMatchPayloads.addAll(possiblePayload);
}
assert prevStart <= matchStart;
if (matchStart > prevEnd) { // Only non overlapping spans add to slop.
matchSlop += (matchStart - prevEnd);
}
/* Do not break on (matchSlop > allowedSlop) here to make sure
* that subSpans[0] is advanced after the match, if any.
*/
matchStart = prevStart;
lastStart = prevStart;
lastEnd = prevEnd;
}
boolean match = matchSlop <= allowedSlop;
if(collectPayloads && match && possibleMatchPayloads.size() > 0) {
matchPayload.addAll(possibleMatchPayloads);
}
return match; // ordered and allowed slop
}
@Override
public String toString() {
return getClass().getName() + "("+query.toString()+")@"+
(firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END"));
}
}