| package org.apache.lucene.index; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.PriorityQueue; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.Bits; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.Comparator; |
| |
| /** |
| * Exposes {@link TermsEnum} API, merged from {@link TermsEnum} API of sub-segments. |
| * This does a merge sort, by term text, of the sub-readers. |
| * |
| * @lucene.experimental |
| */ |
| public final class MultiTermsEnum extends TermsEnum { |
| |
| private final TermMergeQueue queue; |
| private final TermsEnumWithSlice[] subs; // all of our subs (one per sub-reader) |
| private final TermsEnumWithSlice[] currentSubs; // current subs that have at least one term for this field |
| private final TermsEnumWithSlice[] top; |
| private final MultiDocsEnum.EnumWithSlice[] subDocs; |
| private final MultiDocsAndPositionsEnum.EnumWithSlice[] subDocsAndPositions; |
| |
| private BytesRef lastSeek; |
| private boolean lastSeekExact; |
| private final BytesRefBuilder lastSeekScratch = new BytesRefBuilder(); |
| |
| private int numTop; |
| private int numSubs; |
| private BytesRef current; |
| private Comparator<BytesRef> termComp; |
| |
  /** Pairs a sub-reader's {@link TermsEnum} with the index of the
   *  sub-reader it came from, so a term found in that enum can be
   *  routed back to the matching slot in {@code subs}. */
  static class TermsEnumIndex {
    public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0];
    // index of the owning sub-reader (position within subs/slices)
    final int subIndex;
    // the sub-reader's terms enum for the field being merged
    final TermsEnum termsEnum;

    public TermsEnumIndex(TermsEnum termsEnum, int subIndex) {
      this.termsEnum = termsEnum;
      this.subIndex = subIndex;
    }
  }
| |
  /** Returns how many sub-reader slices contain the current
   *  term (i.e. how many entries of {@link #getMatchArray} are valid). */
  public int getMatchCount() {
    return numTop;
  }
| |
  /** Returns sub-reader slices positioned to the current term.
   *  Only the first {@link #getMatchCount} entries are valid. */
  public TermsEnumWithSlice[] getMatchArray() {
    return top;
  }
| |
| /** Sole constructor. |
| * @param slices Which sub-reader slices we should |
| * merge. */ |
| public MultiTermsEnum(ReaderSlice[] slices) { |
| queue = new TermMergeQueue(slices.length); |
| top = new TermsEnumWithSlice[slices.length]; |
| subs = new TermsEnumWithSlice[slices.length]; |
| subDocs = new MultiDocsEnum.EnumWithSlice[slices.length]; |
| subDocsAndPositions = new MultiDocsAndPositionsEnum.EnumWithSlice[slices.length]; |
| for(int i=0;i<slices.length;i++) { |
| subs[i] = new TermsEnumWithSlice(i, slices[i]); |
| subDocs[i] = new MultiDocsEnum.EnumWithSlice(); |
| subDocs[i].slice = slices[i]; |
| subDocsAndPositions[i] = new MultiDocsAndPositionsEnum.EnumWithSlice(); |
| subDocsAndPositions[i].slice = slices[i]; |
| } |
| currentSubs = new TermsEnumWithSlice[slices.length]; |
| } |
| |
  // Returns the term this enum is currently positioned on, or null
  // if the enum is unpositioned/exhausted (maintained by seek/next).
  @Override
  public BytesRef term() {
    return current;
  }
| |
  // Returns the shared term comparator adopted from the first sub in
  // reset(); null until reset() has been called.
  @Override
  public Comparator<BytesRef> getComparator() {
    return termComp;
  }
| |
| /** The terms array must be newly created TermsEnum, ie |
| * {@link TermsEnum#next} has not yet been called. */ |
| public TermsEnum reset(TermsEnumIndex[] termsEnumsIndex) throws IOException { |
| assert termsEnumsIndex.length <= top.length; |
| numSubs = 0; |
| numTop = 0; |
| termComp = null; |
| queue.clear(); |
| for(int i=0;i<termsEnumsIndex.length;i++) { |
| |
| final TermsEnumIndex termsEnumIndex = termsEnumsIndex[i]; |
| assert termsEnumIndex != null; |
| |
| // init our term comp |
| if (termComp == null) { |
| queue.termComp = termComp = termsEnumIndex.termsEnum.getComparator(); |
| } else { |
| // We cannot merge sub-readers that have |
| // different TermComps |
| final Comparator<BytesRef> subTermComp = termsEnumIndex.termsEnum.getComparator(); |
| if (subTermComp != null && !subTermComp.equals(termComp)) { |
| throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge"); |
| } |
| } |
| |
| final BytesRef term = termsEnumIndex.termsEnum.next(); |
| if (term != null) { |
| final TermsEnumWithSlice entry = subs[termsEnumIndex.subIndex]; |
| entry.reset(termsEnumIndex.termsEnum, term); |
| queue.add(entry); |
| currentSubs[numSubs++] = entry; |
| } else { |
| // field has no terms |
| } |
| } |
| |
| if (queue.size() == 0) { |
| return TermsEnum.EMPTY; |
| } else { |
| return this; |
| } |
| } |
| |
  /** Positions every current sub exactly on {@code term}; returns true
   *  if at least one sub contains that exact term.  Matching subs are
   *  gathered into {@code top} (see {@link #getMatchArray}).  Note the
   *  queue is left cleared here; a subsequent {@link #next} restores it
   *  via seekCeil because {@code lastSeekExact} is set. */
  @Override
  public boolean seekExact(BytesRef term) throws IOException {
    queue.clear();
    numTop = 0;

    // seekOpt: the new target is at or after the previous seek target,
    // so subs already past it cannot match and need no re-seek.
    boolean seekOpt = false;
    if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) {
      seekOpt = true;
    }

    lastSeek = null;
    lastSeekExact = true;

    for(int i=0;i<numSubs;i++) {
      final boolean status;
      // LUCENE-2130: if we had just seek'd already, prior
      // to this seek, and the new seek term is after the
      // previous one, don't try to re-seek this sub if its
      // current term is already beyond this new seek term.
      // Doing so is a waste because this sub will simply
      // seek to the same spot.
      if (seekOpt) {
        final BytesRef curTerm = currentSubs[i].current;
        if (curTerm != null) {
          final int cmp = termComp.compare(term, curTerm);
          if (cmp == 0) {
            // sub already positioned exactly on the target term
            status = true;
          } else if (cmp < 0) {
            // sub is already beyond the target: no exact match possible
            status = false;
          } else {
            status = currentSubs[i].terms.seekExact(term);
          }
        } else {
          // sub had been exhausted (current == null): nothing to match
          status = false;
        }
      } else {
        status = currentSubs[i].terms.seekExact(term);
      }

      if (status) {
        // record the match and cache the sub's term
        top[numTop++] = currentSubs[i];
        current = currentSubs[i].current = currentSubs[i].terms.term();
        assert term.equals(currentSubs[i].current);
      }
    }

    // if at least one sub had exact match to the requested
    // term then we found match
    return numTop > 0;
  }
| |
  /** Positions each sub on the smallest term {@code >= term}.  Subs that
   *  match exactly go into {@code top}; subs that land on a later term
   *  go back into the queue; exhausted subs are marked with a null
   *  current.  Returns FOUND / NOT_FOUND / END accordingly. */
  @Override
  public SeekStatus seekCeil(BytesRef term) throws IOException {
    queue.clear();
    numTop = 0;
    lastSeekExact = false;

    // seekOpt: target is at or after the previous seek target, so subs
    // already at/past it need no re-seek (see LUCENE-2130 below).
    boolean seekOpt = false;
    if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) {
      seekOpt = true;
    }

    // copy the target into our scratch so lastSeek stays valid even if
    // the caller reuses/mutates its BytesRef
    lastSeekScratch.copyBytes(term);
    lastSeek = lastSeekScratch.get();

    for(int i=0;i<numSubs;i++) {
      final SeekStatus status;
      // LUCENE-2130: if we had just seek'd already, prior
      // to this seek, and the new seek term is after the
      // previous one, don't try to re-seek this sub if its
      // current term is already beyond this new seek term.
      // Doing so is a waste because this sub will simply
      // seek to the same spot.
      if (seekOpt) {
        final BytesRef curTerm = currentSubs[i].current;
        if (curTerm != null) {
          final int cmp = termComp.compare(term, curTerm);
          if (cmp == 0) {
            status = SeekStatus.FOUND;
          } else if (cmp < 0) {
            // sub already sits on a term after the target: that term is
            // the ceiling, no need to re-seek
            status = SeekStatus.NOT_FOUND;
          } else {
            status = currentSubs[i].terms.seekCeil(term);
          }
        } else {
          // sub was exhausted by a previous seek/next
          status = SeekStatus.END;
        }
      } else {
        status = currentSubs[i].terms.seekCeil(term);
      }

      if (status == SeekStatus.FOUND) {
        // exact match: collect into top for docs/positions access
        top[numTop++] = currentSubs[i];
        current = currentSubs[i].current = currentSubs[i].terms.term();
      } else {
        if (status == SeekStatus.NOT_FOUND) {
          // sub landed after the target: keep it queued for next()
          currentSubs[i].current = currentSubs[i].terms.term();
          assert currentSubs[i].current != null;
          queue.add(currentSubs[i]);
        } else {
          // enum exhausted
          currentSubs[i].current = null;
        }
      }
    }

    if (numTop > 0) {
      // at least one sub had exact match to the requested term
      return SeekStatus.FOUND;
    } else if (queue.size() > 0) {
      // no sub had exact match, but at least one sub found
      // a term after the requested term -- advance to that
      // next term:
      pullTop();
      return SeekStatus.NOT_FOUND;
    } else {
      return SeekStatus.END;
    }
  }
| |
  // Ord-based seeking is not supported: sub-enums have independent ord
  // spaces that cannot be merged into one global ordinal.
  @Override
  public void seekExact(long ord) {
    throw new UnsupportedOperationException();
  }
| |
  // See seekExact(long): no global ordinal exists across merged subs.
  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }
| |
| private void pullTop() { |
| // extract all subs from the queue that have the same |
| // top term |
| assert numTop == 0; |
| while(true) { |
| top[numTop++] = queue.pop(); |
| if (queue.size() == 0 || !(queue.top()).current.bytesEquals(top[0].current)) { |
| break; |
| } |
| } |
| current = top[0].current; |
| } |
| |
| private void pushTop() throws IOException { |
| // call next() on each top, and put back into queue |
| for(int i=0;i<numTop;i++) { |
| top[i].current = top[i].terms.next(); |
| if (top[i].current != null) { |
| queue.add(top[i]); |
| } else { |
| // no more fields in this reader |
| } |
| } |
| numTop = 0; |
| } |
| |
| @Override |
| public BytesRef next() throws IOException { |
| if (lastSeekExact) { |
| // Must seekCeil at this point, so those subs that |
| // didn't have the term can find the following term. |
| // NOTE: we could save some CPU by only seekCeil the |
| // subs that didn't match the last exact seek... but |
| // most impls short-circuit if you seekCeil to term |
| // they are already on. |
| final SeekStatus status = seekCeil(current); |
| assert status == SeekStatus.FOUND; |
| lastSeekExact = false; |
| } |
| lastSeek = null; |
| |
| // restore queue |
| pushTop(); |
| |
| // gather equal top fields |
| if (queue.size() > 0) { |
| pullTop(); |
| } else { |
| current = null; |
| } |
| |
| return current; |
| } |
| |
| @Override |
| public int docFreq() throws IOException { |
| int sum = 0; |
| for(int i=0;i<numTop;i++) { |
| sum += top[i].terms.docFreq(); |
| } |
| return sum; |
| } |
| |
| @Override |
| public long totalTermFreq() throws IOException { |
| long sum = 0; |
| for(int i=0;i<numTop;i++) { |
| final long v = top[i].terms.totalTermFreq(); |
| if (v == -1) { |
| return v; |
| } |
| sum += v; |
| } |
| return sum; |
| } |
| |
| @Override |
| public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { |
| MultiDocsEnum docsEnum; |
| // Can only reuse if incoming enum is also a MultiDocsEnum |
| if (reuse != null && reuse instanceof MultiDocsEnum) { |
| docsEnum = (MultiDocsEnum) reuse; |
| // ... and was previously created w/ this MultiTermsEnum: |
| if (!docsEnum.canReuse(this)) { |
| docsEnum = new MultiDocsEnum(this, subs.length); |
| } |
| } else { |
| docsEnum = new MultiDocsEnum(this, subs.length); |
| } |
| |
| final MultiBits multiLiveDocs; |
| if (liveDocs instanceof MultiBits) { |
| multiLiveDocs = (MultiBits) liveDocs; |
| } else { |
| multiLiveDocs = null; |
| } |
| |
| int upto = 0; |
| |
| for(int i=0;i<numTop;i++) { |
| |
| final TermsEnumWithSlice entry = top[i]; |
| |
| final Bits b; |
| |
| if (multiLiveDocs != null) { |
| // optimize for common case: requested skip docs is a |
| // congruent sub-slice of MultiBits: in this case, we |
| // just pull the liveDocs from the sub reader, rather |
| // than making the inefficient |
| // Slice(Multi(sub-readers)): |
| final MultiBits.SubResult sub = multiLiveDocs.getMatchingSub(entry.subSlice); |
| if (sub.matches) { |
| b = sub.result; |
| } else { |
| // custom case: requested skip docs is foreign: |
| // must slice it on every access |
| b = new BitsSlice(liveDocs, entry.subSlice); |
| } |
| } else if (liveDocs != null) { |
| b = new BitsSlice(liveDocs, entry.subSlice); |
| } else { |
| // no deletions |
| b = null; |
| } |
| |
| assert entry.index < docsEnum.subDocsEnum.length: entry.index + " vs " + docsEnum.subDocsEnum.length + "; " + subs.length; |
| final DocsEnum subDocsEnum = entry.terms.docs(b, docsEnum.subDocsEnum[entry.index], flags); |
| if (subDocsEnum != null) { |
| docsEnum.subDocsEnum[entry.index] = subDocsEnum; |
| subDocs[upto].docsEnum = subDocsEnum; |
| subDocs[upto].slice = entry.subSlice; |
| upto++; |
| } else { |
| // should this be an error? |
| assert false : "One of our subs cannot provide a docsenum"; |
| } |
| } |
| |
| if (upto == 0) { |
| return null; |
| } else { |
| return docsEnum.reset(subDocs, upto); |
| } |
| } |
| |
| @Override |
| public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { |
| MultiDocsAndPositionsEnum docsAndPositionsEnum; |
| // Can only reuse if incoming enum is also a MultiDocsAndPositionsEnum |
| if (reuse != null && reuse instanceof MultiDocsAndPositionsEnum) { |
| docsAndPositionsEnum = (MultiDocsAndPositionsEnum) reuse; |
| // ... and was previously created w/ this MultiTermsEnum: |
| if (!docsAndPositionsEnum.canReuse(this)) { |
| docsAndPositionsEnum = new MultiDocsAndPositionsEnum(this, subs.length); |
| } |
| } else { |
| docsAndPositionsEnum = new MultiDocsAndPositionsEnum(this, subs.length); |
| } |
| |
| final MultiBits multiLiveDocs; |
| if (liveDocs instanceof MultiBits) { |
| multiLiveDocs = (MultiBits) liveDocs; |
| } else { |
| multiLiveDocs = null; |
| } |
| |
| int upto = 0; |
| |
| for(int i=0;i<numTop;i++) { |
| |
| final TermsEnumWithSlice entry = top[i]; |
| |
| final Bits b; |
| |
| if (multiLiveDocs != null) { |
| // Optimize for common case: requested skip docs is a |
| // congruent sub-slice of MultiBits: in this case, we |
| // just pull the liveDocs from the sub reader, rather |
| // than making the inefficient |
| // Slice(Multi(sub-readers)): |
| final MultiBits.SubResult sub = multiLiveDocs.getMatchingSub(top[i].subSlice); |
| if (sub.matches) { |
| b = sub.result; |
| } else { |
| // custom case: requested skip docs is foreign: |
| // must slice it on every access (very |
| // inefficient) |
| b = new BitsSlice(liveDocs, top[i].subSlice); |
| } |
| } else if (liveDocs != null) { |
| b = new BitsSlice(liveDocs, top[i].subSlice); |
| } else { |
| // no deletions |
| b = null; |
| } |
| |
| assert entry.index < docsAndPositionsEnum.subDocsAndPositionsEnum.length: entry.index + " vs " + docsAndPositionsEnum.subDocsAndPositionsEnum.length + "; " + subs.length; |
| final DocsAndPositionsEnum subPostings = entry.terms.docsAndPositions(b, docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index], flags); |
| |
| if (subPostings != null) { |
| docsAndPositionsEnum.subDocsAndPositionsEnum[entry.index] = subPostings; |
| subDocsAndPositions[upto].docsAndPositionsEnum = subPostings; |
| subDocsAndPositions[upto].slice = entry.subSlice; |
| upto++; |
| } else { |
| if (entry.terms.docs(b, null, DocsEnum.FLAG_NONE) != null) { |
| // At least one of our subs does not store |
| // offsets or positions -- we can't correctly |
| // produce a MultiDocsAndPositions enum |
| return null; |
| } |
| } |
| } |
| |
| if (upto == 0) { |
| return null; |
| } else { |
| return docsAndPositionsEnum.reset(subDocsAndPositions, upto); |
| } |
| } |
| |
  /** A sub-reader's terms enum together with the doc-id slice it covers
   *  within the composite reader, plus the term it is currently
   *  positioned on. */
  final static class TermsEnumWithSlice {
    // doc-id range of this sub within the composite reader
    private final ReaderSlice subSlice;
    // the sub's terms enum; set by reset()
    TermsEnum terms;
    // term this sub is positioned on, or null when exhausted
    public BytesRef current;
    // index of this sub within MultiTermsEnum.subs
    final int index;

    public TermsEnumWithSlice(int index, ReaderSlice subSlice) {
      this.subSlice = subSlice;
      this.index = index;
      assert subSlice.length >= 0: "length=" + subSlice.length;
    }

    public void reset(TermsEnum terms, BytesRef term) {
      this.terms = terms;
      current = term;
    }

    @Override
    public String toString() {
      return subSlice.toString()+":"+terms;
    }
  }
| |
| private final static class TermMergeQueue extends PriorityQueue<TermsEnumWithSlice> { |
| Comparator<BytesRef> termComp; |
| TermMergeQueue(int size) { |
| super(size); |
| } |
| |
| @Override |
| protected boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) { |
| final int cmp = termComp.compare(termsA.current, termsB.current); |
| if (cmp != 0) { |
| return cmp < 0; |
| } else { |
| return termsA.subSlice.start < termsB.subSlice.start; |
| } |
| } |
| } |
| |
  // Debug string listing all subs (one per sub-reader slice).
  @Override
  public String toString() {
    return "MultiTermsEnum(" + Arrays.toString(subs) + ")";
  }
| } |