| package org.apache.lucene.codecs.lucene3x; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.List; |
| |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.MathUtil; |
| import org.apache.lucene.util.PagedBytes; |
| import org.apache.lucene.util.PagedBytes.PagedBytesDataInput; |
| import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.packed.GrowableWriter; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
|  * In-memory index over the monotonically increasing {@code <Term, TermInfo>} |
|  * pairs of an index segment. Entries can be looked up either by Term or by |
|  * ordinal position in the set. Each Term and TermInfo is serialized into a |
|  * paged byte buffer, and the offset of every entry within that buffer is kept |
|  * in a packed-ints structure. |
|  * @deprecated Only for reading existing 3.x indexes |
|  */ |
| @Deprecated |
| class TermInfosReaderIndex { |
| |
|   private static final int MAX_PAGE_BITS = 18; // 256 KB block |
|   private Term[] fields; |
|   private int totalIndexInterval; |
|   private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator(); |
|   private final PagedBytesDataInput dataInput; |
|   private final PackedInts.Reader indexToDataOffset; |
|   private final int indexSize; |
|   private final int skipInterval; |
|   private final long ramBytesUsed; |
| |
|   /** |
|    * Builds the in-memory index by draining the given term enum at segment |
|    * load time. |
|    * |
|    * @param indexEnum |
|    *          the term enum to read index entries from. |
|    * @param indexDivisor |
|    *          the index divisor; only every indexDivisor-th entry of the enum |
|    *          is kept. |
|    * @param tiiFileLength |
|    *          the size of the tii file, used to approximate the size of the |
|    *          buffer and the bits needed per offset. |
|    * @param totalIndexInterval |
|    *          the total index interval. |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException { |
|     this.totalIndexInterval = totalIndexInterval; |
|     this.skipInterval = indexEnum.skipInterval; |
|     this.indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor; |
| |
|     // this is only an initial size, it will be GCed once the build is complete |
|     final long estimatedBytes = (long) (tiiFileLength * 1.5) / indexDivisor; |
|     final PagedBytes pagedBytes = new PagedBytes(estimatePageBits(estimatedBytes)); |
|     final PagedBytesDataOutput out = pagedBytes.getDataOutput(); |
| |
|     final int bitsPerOffset = 1 + MathUtil.log(tiiFileLength, 2); |
|     final GrowableWriter offsets = new GrowableWriter(bitsPerOffset, indexSize, PackedInts.DEFAULT); |
| |
|     final List<String> fieldNames = new ArrayList<>(); |
|     String previousField = null; |
|     int ord = 0; |
|     while (indexEnum.next()) { |
|       final Term term = indexEnum.term(); |
|       final boolean sameField = previousField != null && previousField.equals(term.field()); |
|       if (!sameField) { |
|         // first term of a new field: register the field name |
|         previousField = term.field(); |
|         fieldNames.add(previousField); |
|       } |
|       final TermInfo info = indexEnum.termInfo(); |
| |
|       // remember where this entry starts, then serialize it; the read side |
|       // (seekEnum, getTerm, compareTo) consumes the values in this order |
|       offsets.set(ord, out.getPosition()); |
|       out.writeVInt(fieldNames.size() - 1); // id of the current field |
|       out.writeString(term.text()); |
|       out.writeVInt(info.docFreq); |
|       if (info.docFreq >= skipInterval) { |
|         // the skip offset is only stored when docFreq reaches skipInterval |
|         out.writeVInt(info.skipOffset); |
|       } |
|       out.writeVLong(info.freqPointer); |
|       out.writeVLong(info.proxPointer); |
|       out.writeVLong(indexEnum.indexPointer); |
| |
|       // advance past the entries dropped by the divisor, stopping early if |
|       // the enum is exhausted |
|       for (int skipped = 1; skipped < indexDivisor && indexEnum.next(); skipped++) { |
|         // nothing to do: the loop condition performs the advance |
|       } |
|       ord++; |
|     } |
| |
|     // one field-only Term per distinct field, indexed by field id |
|     final Term[] fieldTerms = new Term[fieldNames.size()]; |
|     int slot = 0; |
|     for (String name : fieldNames) { |
|       fieldTerms[slot++] = new Term(name); |
|     } |
|     fields = fieldTerms; |
| |
|     pagedBytes.freeze(true); |
|     dataInput = pagedBytes.getDataInput(); |
|     indexToDataOffset = offsets.getMutable(); |
| |
|     ramBytesUsed = RamUsageEstimator.shallowSizeOf(fields) |
|         + RamUsageEstimator.shallowSizeOf(dataInput) |
|         + fields.length * RamUsageEstimator.shallowSizeOfInstance(Term.class) |
|         + pagedBytes.ramBytesUsed() |
|         + indexToDataOffset.ramBytesUsed(); |
|   } |
| |
|   /** |
|    * Picks the page size, in bits, for the paged byte buffer: the number of |
|    * bits needed to represent the estimated size, clamped to the range |
|    * [4, MAX_PAGE_BITS]. |
|    * |
|    * @param estSize |
|    *          the estimated number of bytes to store. |
|    * @return the number of page bits to use. |
|    */ |
|   private static int estimatePageBits(long estSize) { |
|     final int bitsNeeded = Long.SIZE - Long.numberOfLeadingZeros(estSize); |
|     if (bitsNeeded > MAX_PAGE_BITS) { |
|       return MAX_PAGE_BITS; |
|     } |
|     if (bitsNeeded < 4) { |
|       return 4; |
|     } |
|     return bitsNeeded; |
|   } |
| |
|   /** |
|    * Decodes the index entry at the given position and seeks the enumerator |
|    * to it. |
|    * |
|    * @param enumerator |
|    *          the term enum to seek. |
|    * @param indexOffset |
|    *          the position of the entry in the index. |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { |
|     final PagedBytesDataInput in = dataInput.clone(); |
|     in.setPosition(indexToDataOffset.get(indexOffset)); |
| |
|     // read the term |
|     final Term term = readTerm(in); |
| |
|     // read the terminfo; a skip offset is only present when docFreq reaches |
|     // skipInterval |
|     final TermInfo info = new TermInfo(); |
|     info.docFreq = in.readVInt(); |
|     info.skipOffset = info.docFreq >= skipInterval ? in.readVInt() : 0; |
|     info.freqPointer = in.readVLong(); |
|     info.proxPointer = in.readVLong(); |
| |
|     final long pointer = in.readVLong(); |
|     final long position = ((long) indexOffset * totalIndexInterval) - 1; |
| |
|     // perform the seek |
|     enumerator.seek(pointer, position, term, info); |
|   } |
| |
|   /** |
|    * Binary search for the given term. |
|    * |
|    * @param term |
|    *          the term to locate. |
|    * @return the position of the matching index entry, or, when there is no |
|    *         exact match, the upper bound left by the search (one less than |
|    *         the position where the term would be inserted, so -1 when the |
|    *         term sorts before every entry). |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   int getIndexOffset(Term term) throws IOException { |
|     final PagedBytesDataInput in = dataInput.clone(); |
|     final BytesRefBuilder scratch = new BytesRefBuilder(); |
|     int lo = 0; |
|     int hi = indexSize - 1; |
|     while (lo <= hi) { |
|       final int mid = (lo + hi) >>> 1; |
|       final int cmp = compareTo(term, mid, in, scratch); |
|       if (cmp == 0) { |
|         return mid; |
|       } |
|       if (cmp < 0) { |
|         hi = mid - 1; |
|       } else { |
|         lo = mid + 1; |
|       } |
|     } |
|     return hi; |
|   } |
| |
|   /** |
|    * Gets the term at the given position. For testing. |
|    * |
|    * @param termIndex |
|    *          the position to read the term from the index. |
|    * @return the term. |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   Term getTerm(int termIndex) throws IOException { |
|     final PagedBytesDataInput in = dataInput.clone(); |
|     in.setPosition(indexToDataOffset.get(termIndex)); |
|     return readTerm(in); |
|   } |
| |
|   /** |
|    * Reads a serialized term (field id followed by the term text) from the |
|    * current position of the input. |
|    * |
|    * @param in |
|    *          the input, already positioned at the start of an entry. |
|    * @return the decoded term. |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   private Term readTerm(PagedBytesDataInput in) throws IOException { |
|     final int fieldId = in.readVInt(); |
|     return new Term(fields[fieldId].field(), in.readString()); |
|   } |
| |
|   /** |
|    * Returns the number of terms. |
|    * |
|    * @return int. |
|    */ |
|   int length() { |
|     return indexSize; |
|   } |
| |
|   /** |
|    * Compares the given term against the term in the index specified by the |
|    * term index, i.e. it returns a negative value when term is less than the |
|    * index term. |
|    * |
|    * @param term |
|    *          the given term. |
|    * @param termIndex |
|    *          the index of the term to compare. |
|    * @return int. |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   int compareTo(Term term, int termIndex) throws IOException { |
|     final PagedBytesDataInput in = dataInput.clone(); |
|     final BytesRefBuilder scratch = new BytesRefBuilder(); |
|     return compareTo(term, termIndex, in, scratch); |
|   } |
| |
|   /** |
|    * Compares the fields of the two terms first and returns that result when |
|    * they differ. Only when the fields are equal are the term bytes compared. |
|    * |
|    * @param term |
|    *          the term to compare. |
|    * @param termIndex |
|    *          the position of the term in the input to compare. |
|    * @param input |
|    *          the input buffer; it is repositioned by this call. |
|    * @param reuse |
|    *          scratch builder that receives the bytes of the indexed term. |
|    * @return int. |
|    * @throws IOException If there is a low-level I/O error. |
|    */ |
|   private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRefBuilder reuse) throws IOException { |
|     // jump to the entry and compare field names via the stored field id |
|     input.setPosition(indexToDataOffset.get(termIndex)); |
|     final int fieldCmp = term.field().compareTo(fields[input.readVInt()].field()); |
|     if (fieldCmp != 0) { |
|       return fieldCmp; |
|     } |
| |
|     // same field: load the indexed term's bytes (length prefix, then data) |
|     // and compare the term values |
|     final int termLength = input.readVInt(); |
|     reuse.setLength(termLength); |
|     reuse.grow(termLength); |
|     input.readBytes(reuse.bytes(), 0, termLength); |
|     return comparator.compare(term.bytes(), reuse.get()); |
|   } |
| |
|   /** |
|    * Returns the RAM usage computed once at construction time. |
|    * |
|    * @return the estimated number of bytes used. |
|    */ |
|   long ramBytesUsed() { |
|     return ramBytesUsed; |
|   } |
| |
| } |