core/org.eclipse.smila.solr/lib/source/org/apache/lucene/codecs/lucene3x/TermInfosReaderIndex.java - smila/org.eclipse.smila.core - Git at Google

 package org.apache.lucene.codecs.lucene3x;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;

 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.MathUtil;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
 import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.GrowableWriter;
 import org.apache.lucene.util.packed.PackedInts;

 /**
  * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
  * index segment. Pairs are accessed either by Term or by ordinal position the
  * set. The Terms and TermInfo are actually serialized and stored into a byte
  * array and pointers to the position of each are stored in a int array.
  * @deprecated Only for reading existing 3.x indexes
  */
 @Deprecated
 class TermInfosReaderIndex {

   private static final int MAX_PAGE_BITS = 18; // 256 KB block
   private Term[] fields;
   private int totalIndexInterval;
   private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
   private final PagedBytesDataInput dataInput;
   private final PackedInts.Reader indexToDataOffset;
   private final int indexSize;
   private final int skipInterval;
   private final long ramBytesUsed;

   /**
    * Loads the segment information at segment load time.
    *
    * @param indexEnum
    *          the term enum.
    * @param indexDivisor
    *          the index divisor.
    * @param tiiFileLength
    *          the size of the tii file, used to approximate the size of the
    *          buffer.
    * @param totalIndexInterval
    *          the total index interval.
    */
   TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
     this.totalIndexInterval = totalIndexInterval;
     indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
     skipInterval = indexEnum.skipInterval;
     // this is only an inital size, it will be GCed once the build is complete
     long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
     PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
     PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

     final int bitEstimate = 1+MathUtil.log(tiiFileLength, 2);
     GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInts.DEFAULT);

     String currentField = null;
     List<String> fieldStrs = new ArrayList<>();
     int fieldCounter = -1;
     for (int i = 0; indexEnum.next(); i++) {
       Term term = indexEnum.term();
       if (currentField == null || !currentField.equals(term.field())) {
         currentField = term.field();
         fieldStrs.add(currentField);
         fieldCounter++;
       }
       TermInfo termInfo = indexEnum.termInfo();
       indexToTerms.set(i, dataOutput.getPosition());
       dataOutput.writeVInt(fieldCounter);
       dataOutput.writeString(term.text());
       dataOutput.writeVInt(termInfo.docFreq);
       if (termInfo.docFreq >= skipInterval) {
         dataOutput.writeVInt(termInfo.skipOffset);
       }
       dataOutput.writeVLong(termInfo.freqPointer);
       dataOutput.writeVLong(termInfo.proxPointer);
       dataOutput.writeVLong(indexEnum.indexPointer);
       for (int j = 1; j < indexDivisor; j++) {
         if (!indexEnum.next()) {
           break;
         }
       }
     }

     fields = new Term[fieldStrs.size()];
     for (int i = 0; i < fields.length; i++) {
       fields[i] = new Term(fieldStrs.get(i));
     }

     dataPagedBytes.freeze(true);
     dataInput = dataPagedBytes.getDataInput();
     indexToDataOffset = indexToTerms.getMutable();

     long ramBytesUsed = RamUsageEstimator.shallowSizeOf(fields);
     ramBytesUsed += RamUsageEstimator.shallowSizeOf(dataInput);
     ramBytesUsed += fields.length * RamUsageEstimator.shallowSizeOfInstance(Term.class);
     ramBytesUsed += dataPagedBytes.ramBytesUsed();
     ramBytesUsed += indexToDataOffset.ramBytesUsed();
     this.ramBytesUsed = ramBytesUsed;
   }

   private static int estimatePageBits(long estSize) {
     return Math.max(Math.min(64 - Long.numberOfLeadingZeros(estSize), MAX_PAGE_BITS), 4);
   }

   void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
     PagedBytesDataInput input = dataInput.clone();

     input.setPosition(indexToDataOffset.get(indexOffset));

     // read the term
     int fieldId = input.readVInt();
     Term field = fields[fieldId];
     Term term = new Term(field.field(), input.readString());

     // read the terminfo
     TermInfo termInfo = new TermInfo();
     termInfo.docFreq = input.readVInt();
     if (termInfo.docFreq >= skipInterval) {
       termInfo.skipOffset = input.readVInt();
     } else {
       termInfo.skipOffset = 0;
     }
     termInfo.freqPointer = input.readVLong();
     termInfo.proxPointer = input.readVLong();

     long pointer = input.readVLong();

     // perform the seek
     enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
   }

   /**
    * Binary search for the given term.
    *
    * @param term
    *          the term to locate.
    * @throws IOException If there is a low-level I/O error.
    */
   int getIndexOffset(Term term) throws IOException {
     int lo = 0;
     int hi = indexSize - 1;
     PagedBytesDataInput input = dataInput.clone();
     BytesRefBuilder scratch = new BytesRefBuilder();
     while (hi >= lo) {
       int mid = (lo + hi) >>> 1;
       int delta = compareTo(term, mid, input, scratch);
       if (delta < 0)
         hi = mid - 1;
       else if (delta > 0)
         lo = mid + 1;
       else
         return mid;
     }
     return hi;
   }

   /**
    * Gets the term at the given position.  For testing.
    *
    * @param termIndex
    *          the position to read the term from the index.
    * @return the term.
    * @throws IOException If there is a low-level I/O error.
    */
   Term getTerm(int termIndex) throws IOException {
     PagedBytesDataInput input = dataInput.clone();
     input.setPosition(indexToDataOffset.get(termIndex));

     // read the term
     int fieldId = input.readVInt();
     Term field = fields[fieldId];
     return new Term(field.field(), input.readString());
   }

   /**
    * Returns the number of terms.
    *
    * @return int.
    */
   int length() {
     return indexSize;
   }

   /**
    * The compares the given term against the term in the index specified by the
    * term index. ie It returns negative N when term is less than index term;
    *
    * @param term
    *          the given term.
    * @param termIndex
    *          the index of the of term to compare.
    * @return int.
    * @throws IOException If there is a low-level I/O error.
    */
   int compareTo(Term term, int termIndex) throws IOException {
     return compareTo(term, termIndex, dataInput.clone(), new BytesRefBuilder());
   }

   /**
    * Compare the fields of the terms first, and if not equals return from
    * compare. If equal compare terms.
    *
    * @param term
    *          the term to compare.
    * @param termIndex
    *          the position of the term in the input to compare
    * @param input
    *          the input buffer.
    * @return int.
    * @throws IOException If there is a low-level I/O error.
    */
   private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRefBuilder reuse) throws IOException {
     // if term field does not equal mid's field index, then compare fields
     // else if they are equal, compare term's string values...
     int c = compareField(term, termIndex, input);
     if (c == 0) {
       reuse.setLength(input.readVInt());
       reuse.grow(reuse.length());
       input.readBytes(reuse.bytes(), 0, reuse.length());
       return comparator.compare(term.bytes(), reuse.get());
     }
     return c;
   }

   /**
    * Compares the fields before checking the text of the terms.
    *
    * @param term
    *          the given term.
    * @param termIndex
    *          the term that exists in the data block.
    * @param input
    *          the data block.
    * @return int.
    * @throws IOException If there is a low-level I/O error.
    */
   private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
     input.setPosition(indexToDataOffset.get(termIndex));
     return term.field().compareTo(fields[input.readVInt()].field());
   }

   long ramBytesUsed() {
     return ramBytesUsed;
   }

 }
	package org.apache.lucene.codecs.lucene3x;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Comparator;
	import java.util.List;

	import org.apache.lucene.index.Term;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.MathUtil;
	import org.apache.lucene.util.PagedBytes;
	import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
	import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
	import org.apache.lucene.util.RamUsageEstimator;
	import org.apache.lucene.util.packed.GrowableWriter;
	import org.apache.lucene.util.packed.PackedInts;

	/**
	* This stores a monotonically increasing set of <Term, TermInfo> pairs in an
	* index segment. Pairs are accessed either by Term or by ordinal position the
	* set. The Terms and TermInfo are actually serialized and stored into a byte
	* array and pointers to the position of each are stored in a int array.
	* @deprecated Only for reading existing 3.x indexes
	*/
	@Deprecated
	class TermInfosReaderIndex {

	private static final int MAX_PAGE_BITS = 18; // 256 KB block
	private Term[] fields;
	private int totalIndexInterval;
	private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
	private final PagedBytesDataInput dataInput;
	private final PackedInts.Reader indexToDataOffset;
	private final int indexSize;
	private final int skipInterval;
	private final long ramBytesUsed;

	/**
	* Loads the segment information at segment load time.
	*
	* @param indexEnum
	* the term enum.
	* @param indexDivisor
	* the index divisor.
	* @param tiiFileLength
	* the size of the tii file, used to approximate the size of the
	* buffer.
	* @param totalIndexInterval
	* the total index interval.
	*/
	TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
	this.totalIndexInterval = totalIndexInterval;
	indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
	skipInterval = indexEnum.skipInterval;
	// this is only an inital size, it will be GCed once the build is complete
	long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
	PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
	PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

	final int bitEstimate = 1+MathUtil.log(tiiFileLength, 2);
	GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInts.DEFAULT);

	String currentField = null;
	List<String> fieldStrs = new ArrayList<>();
	int fieldCounter = -1;
	for (int i = 0; indexEnum.next(); i++) {
	Term term = indexEnum.term();
	if (currentField == null \|\| !currentField.equals(term.field())) {
	currentField = term.field();
	fieldStrs.add(currentField);
	fieldCounter++;
	}
	TermInfo termInfo = indexEnum.termInfo();
	indexToTerms.set(i, dataOutput.getPosition());
	dataOutput.writeVInt(fieldCounter);
	dataOutput.writeString(term.text());
	dataOutput.writeVInt(termInfo.docFreq);
	if (termInfo.docFreq >= skipInterval) {
	dataOutput.writeVInt(termInfo.skipOffset);
	}
	dataOutput.writeVLong(termInfo.freqPointer);
	dataOutput.writeVLong(termInfo.proxPointer);
	dataOutput.writeVLong(indexEnum.indexPointer);
	for (int j = 1; j < indexDivisor; j++) {
	if (!indexEnum.next()) {
	break;
	}
	}
	}

	fields = new Term[fieldStrs.size()];
	for (int i = 0; i < fields.length; i++) {
	fields[i] = new Term(fieldStrs.get(i));
	}

	dataPagedBytes.freeze(true);
	dataInput = dataPagedBytes.getDataInput();
	indexToDataOffset = indexToTerms.getMutable();

	long ramBytesUsed = RamUsageEstimator.shallowSizeOf(fields);
	ramBytesUsed += RamUsageEstimator.shallowSizeOf(dataInput);
	ramBytesUsed += fields.length * RamUsageEstimator.shallowSizeOfInstance(Term.class);
	ramBytesUsed += dataPagedBytes.ramBytesUsed();
	ramBytesUsed += indexToDataOffset.ramBytesUsed();
	this.ramBytesUsed = ramBytesUsed;
	}

	private static int estimatePageBits(long estSize) {
	return Math.max(Math.min(64 - Long.numberOfLeadingZeros(estSize), MAX_PAGE_BITS), 4);
	}

	void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
	PagedBytesDataInput input = dataInput.clone();

	input.setPosition(indexToDataOffset.get(indexOffset));

	// read the term
	int fieldId = input.readVInt();
	Term field = fields[fieldId];
	Term term = new Term(field.field(), input.readString());

	// read the terminfo
	TermInfo termInfo = new TermInfo();
	termInfo.docFreq = input.readVInt();
	if (termInfo.docFreq >= skipInterval) {
	termInfo.skipOffset = input.readVInt();
	} else {
	termInfo.skipOffset = 0;
	}
	termInfo.freqPointer = input.readVLong();
	termInfo.proxPointer = input.readVLong();

	long pointer = input.readVLong();

	// perform the seek
	enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
	}

	/**
	* Binary search for the given term.
	*
	* @param term
	* the term to locate.
	* @throws IOException If there is a low-level I/O error.
	*/
	int getIndexOffset(Term term) throws IOException {
	int lo = 0;
	int hi = indexSize - 1;
	PagedBytesDataInput input = dataInput.clone();
	BytesRefBuilder scratch = new BytesRefBuilder();
	while (hi >= lo) {
	int mid = (lo + hi) >>> 1;
	int delta = compareTo(term, mid, input, scratch);
	if (delta < 0)
	hi = mid - 1;
	else if (delta > 0)
	lo = mid + 1;
	else
	return mid;
	}
	return hi;
	}

	/**
	* Gets the term at the given position. For testing.
	*
	* @param termIndex
	* the position to read the term from the index.
	* @return the term.
	* @throws IOException If there is a low-level I/O error.
	*/
	Term getTerm(int termIndex) throws IOException {
	PagedBytesDataInput input = dataInput.clone();
	input.setPosition(indexToDataOffset.get(termIndex));

	// read the term
	int fieldId = input.readVInt();
	Term field = fields[fieldId];
	return new Term(field.field(), input.readString());
	}

	/**
	* Returns the number of terms.
	*
	* @return int.
	*/
	int length() {
	return indexSize;
	}

	/**
	* The compares the given term against the term in the index specified by the
	* term index. ie It returns negative N when term is less than index term;
	*
	* @param term
	* the given term.
	* @param termIndex
	* the index of the of term to compare.
	* @return int.
	* @throws IOException If there is a low-level I/O error.
	*/
	int compareTo(Term term, int termIndex) throws IOException {
	return compareTo(term, termIndex, dataInput.clone(), new BytesRefBuilder());
	}

	/**
	* Compare the fields of the terms first, and if not equals return from
	* compare. If equal compare terms.
	*
	* @param term
	* the term to compare.
	* @param termIndex
	* the position of the term in the input to compare
	* @param input
	* the input buffer.
	* @return int.
	* @throws IOException If there is a low-level I/O error.
	*/
	private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRefBuilder reuse) throws IOException {
	// if term field does not equal mid's field index, then compare fields
	// else if they are equal, compare term's string values...
	int c = compareField(term, termIndex, input);
	if (c == 0) {
	reuse.setLength(input.readVInt());
	reuse.grow(reuse.length());
	input.readBytes(reuse.bytes(), 0, reuse.length());
	return comparator.compare(term.bytes(), reuse.get());
	}
	return c;
	}

	/**
	* Compares the fields before checking the text of the terms.
	*
	* @param term
	* the given term.
	* @param termIndex
	* the term that exists in the data block.
	* @param input
	* the data block.
	* @return int.
	* @throws IOException If there is a low-level I/O error.
	*/
	private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
	input.setPosition(indexToDataOffset.get(termIndex));
	return term.field().compareTo(fields[input.readVInt()].field());
	}

	long ramBytesUsed() {
	return ramBytesUsed;
	}

	}