| /* |
| * Copyright (C) 2010, Google Inc. and others |
| * |
| * This program and the accompanying materials are made available under the |
| * terms of the Eclipse Distribution License v. 1.0 which is available at |
| * https://www.eclipse.org/org/documents/edl-v10.php. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| |
| package org.eclipse.jgit.diff; |
| |
| import java.io.EOFException; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.Arrays; |
| |
| import org.eclipse.jgit.errors.MissingObjectException; |
| import org.eclipse.jgit.lib.ObjectLoader; |
| import org.eclipse.jgit.lib.ObjectStream; |
| |
| /** |
| * Index structure of lines/blocks in one file. |
| * <p> |
| * This structure can be used to compute an approximation of the similarity |
| * between two files. The index is used by |
| * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores |
| * between files. |
| * <p> |
| * To save space in memory, this index uses a space efficient encoding which |
| * will not exceed 1 MiB per instance. The index starts out at a smaller size |
| * (closer to 2 KiB), but may grow as more distinct blocks within the scanned |
| * file are discovered. |
| * |
| * @since 4.0 |
| */ |
public class SimilarityIndex {
	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
	public static final TableFullException
			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

	/**
	 * Shift to apply before storing a key.
	 * <p>
	 * Within the 64 bit table record space, we leave the highest bit unset so
	 * all values are positive. The lower 32 bits are used to count bytes.
	 */
	private static final int KEY_SHIFT = 32;

	/** Maximum value of the count field, also mask to extract the count. */
	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

	/**
	 * Total amount of bytes hashed into the structure, including \n. This is
	 * usually the size of the file minus number of CRLF encounters.
	 */
	private long hashedCnt;

	/** Number of non-zero entries in {@link #idHash}. */
	private int idSize;

	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
	private int idGrowAt;

	/**
	 * Pairings of content keys and counters.
	 * <p>
	 * Slots in the table are actually two ints wedged into a single long. The
	 * upper 32 bits stores the content key, and the remaining lower bits stores
	 * the number of bytes associated with that key. Empty slots are denoted by
	 * 0, which cannot occur because the count cannot be 0. Values can only be
	 * positive, which we enforce during key addition.
	 */
	private long[] idHash;

	/** {@code idHash.length == 1 << idHashBits}. */
	private int idHashBits;
| |
| /** |
| * Create a new similarity index for the given object |
| * |
| * @param obj |
| * the object to hash |
| * @return similarity index for this object |
| * @throws java.io.IOException |
| * file contents cannot be read from the repository. |
| * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException |
| * object hashing overflowed the storage capacity of the |
| * SimilarityIndex. |
| */ |
| public static SimilarityIndex create(ObjectLoader obj) throws IOException, |
| TableFullException { |
| SimilarityIndex idx = new SimilarityIndex(); |
| idx.hash(obj); |
| idx.sort(); |
| return idx; |
| } |
| |
| SimilarityIndex() { |
| idHashBits = 8; |
| idHash = new long[1 << idHashBits]; |
| idGrowAt = growAt(idHashBits); |
| } |
| |
| static boolean isBinary(ObjectLoader obj) throws IOException { |
| if (obj.isLarge()) { |
| try (ObjectStream in1 = obj.openStream()) { |
| return RawText.isBinary(in1); |
| } |
| } |
| byte[] raw = obj.getCachedBytes(); |
| return RawText.isBinary(raw, raw.length, true); |
| } |
| |
| void hash(ObjectLoader obj) throws MissingObjectException, IOException, |
| TableFullException { |
| if (obj.isLarge()) { |
| hashLargeObject(obj); |
| } else { |
| byte[] raw = obj.getCachedBytes(); |
| hash(raw, 0, raw.length); |
| } |
| } |
| |
| private void hashLargeObject(ObjectLoader obj) throws IOException, |
| TableFullException { |
| boolean text; |
| text = !isBinary(obj); |
| |
| try (ObjectStream in2 = obj.openStream()) { |
| hash(in2, in2.getSize(), text); |
| } |
| } |
| |
	/**
	 * Hash a region of a byte array into the table.
	 * <p>
	 * Content is split into scoring regions: one line (terminated by '\n') or
	 * at most 64 input bytes, whichever ends first. Each region is hashed with
	 * a djb2-style rolling hash (seed 5381, hash = hash * 33 + c) and its byte
	 * count is accumulated via {@link #add(int, int)}. Resets
	 * {@link #hashedCnt} before indexing.
	 *
	 * @param raw
	 *            buffer holding the content
	 * @param ptr
	 *            offset of the first byte to hash
	 * @param end
	 *            offset one past the last byte to hash
	 * @throws TableFullException
	 *             table overflowed; content cannot be indexed.
	 */
	void hash(byte[] raw, int ptr, int end) throws TableFullException {
		// CRLF normalization below applies only when the content is text.
		final boolean text = !RawText.isBinary(raw, raw.length, true);
		hashedCnt = 0;
		while (ptr < end) {
			int hash = 5381; // djb2 seed
			int blockHashedCnt = 0;
			int start = ptr;

			// Hash one line, or one block, whichever occurs first.
			do {
				int c = raw[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				// (the CR is consumed but neither counted nor mixed in).
				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n') // LF is counted but not mixed into the hash
					break;
				hash = (hash << 5) + hash + c;
			} while (ptr < end && ptr - start < 64);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
		}
	}
| |
	/**
	 * Hash {@code remaining} bytes of a stream into the table.
	 * <p>
	 * Uses the same region rules as {@link #hash(byte[], int, int)}: one line
	 * or at most 64 bytes per region, with CR dropped before LF when
	 * {@code text}. Unlike the array form, this does NOT reset
	 * {@link #hashedCnt}; counts accumulate.
	 * <p>
	 * NOTE(review): a CR that lands at the very end of the read buffer is not
	 * recognized as part of a CRLF pair ({@code ptr < cnt} guard) — confirm
	 * this slight asymmetry with the array form is acceptable.
	 *
	 * @param in
	 *            stream to read; consumed as a side effect
	 * @param remaining
	 *            exact number of bytes expected from the stream
	 * @param text
	 *            whether CRLF normalization should be applied
	 * @throws IOException
	 *             the stream could not be read, or ended early
	 * @throws TableFullException
	 *             table overflowed; content cannot be indexed.
	 */
	void hash(InputStream in, long remaining, boolean text) throws IOException,
			TableFullException {
		byte[] buf = new byte[4096];
		int ptr = 0; // next unread position within buf
		int cnt = 0; // number of valid bytes currently in buf

		while (0 < remaining) {
			int hash = 5381; // djb2 seed
			int blockHashedCnt = 0;

			// Hash one line, or one block, whichever occurs first.
			int n = 0; // input bytes consumed for this region
			do {
				if (ptr == cnt) {
					// Buffer exhausted; refill from the stream.
					ptr = 0;
					cnt = in.read(buf, 0, buf.length);
					if (cnt <= 0)
						throw new EOFException();
				}

				n++;
				int c = buf[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (n < 64 && n < remaining);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
			remaining -= n;
		}
	}
| |
	/**
	 * Sort the internal table so it can be used for efficient scoring.
	 * <p>
	 * Once sorted, additional lines/blocks cannot be added to the index:
	 * {@link #add(int, int)} relies on hash-slot placement, which sorting
	 * destroys.
	 */
	void sort() {
		// Sort the array. All of the empty space will wind up at the front,
		// because we forced all of the keys to always be positive. Later
		// we only work with the back half of the array.
		//
		Arrays.sort(idHash);
	}
| |
| /** |
| * Compute the similarity score between this index and another. |
| * <p> |
| * A region of a file is defined as a line in a text file or a fixed-size |
| * block in a binary file. To prepare an index, each region in the file is |
| * hashed; the values and counts of hashes are retained in a sorted table. |
| * Define the similarity fraction F as the count of matching regions |
| * between the two files divided between the maximum count of regions in |
| * either file. The similarity score is F multiplied by the maxScore |
| * constant, yielding a range [0, maxScore]. It is defined as maxScore for |
| * the degenerate case of two empty files. |
| * <p> |
| * The similarity score is symmetrical; i.e. a.score(b) == b.score(a). |
| * |
| * @param dst |
| * the other index |
| * @param maxScore |
| * the score representing a 100% match |
| * @return the similarity score |
| */ |
| public int score(SimilarityIndex dst, int maxScore) { |
| long max = Math.max(hashedCnt, dst.hashedCnt); |
| if (max == 0) |
| return maxScore; |
| return (int) ((common(dst) * maxScore) / max); |
| } |
| |
	/**
	 * Total bytes in regions this index has in common with {@code dst}.
	 *
	 * @param dst
	 *            the other index
	 * @return sum over matching keys of the smaller byte count
	 */
	long common(SimilarityIndex dst) {
		return common(this, dst);
	}
| |
| private static long common(SimilarityIndex src, SimilarityIndex dst) { |
| int srcIdx = src.packedIndex(0); |
| int dstIdx = dst.packedIndex(0); |
| long[] srcHash = src.idHash; |
| long[] dstHash = dst.idHash; |
| return common(srcHash, srcIdx, dstHash, dstIdx); |
| } |
| |
	/**
	 * Merge-join two sorted tables, crediting the overlap.
	 * <p>
	 * Both arrays are sorted with occupied records packed at the back (see
	 * {@link #sort()}), so walking forward visits keys in ascending order.
	 * For each key present in both tables the smaller byte count is added to
	 * the running total; keys present on only one side are skipped.
	 */
	private static long common(long[] srcHash, int srcIdx, //
			long[] dstHash, int dstIdx) {
		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
			return 0; // one side has no records; nothing can match

		long common = 0;
		int srcKey = keyOf(srcHash[srcIdx]);
		int dstKey = keyOf(dstHash[dstIdx]);

		for (;;) {
			if (srcKey == dstKey) {
				// Same region appears in both files; the overlap is
				// bounded by the smaller of the two byte counts.
				common += Math.min(countOf(srcHash[srcIdx]),
						countOf(dstHash[dstIdx]));

				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);

			} else if (srcKey < dstKey) {
				// Regions of src which do not appear in dst.
				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

			} else /* if (dstKey < srcKey) */{
				// Regions of dst which do not appear in src.
				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);
			}
		}

		return common;
	}
| |
	/** Number of distinct keys stored in the table. Testing only. */
	int size() {
		return idSize;
	}
| |
	/** Key stored at packed (sorted) position {@code idx}. Testing only. */
	int key(int idx) {
		return keyOf(idHash[packedIndex(idx)]);
	}
| |
	/** Byte count stored at packed (sorted) position {@code idx}. Testing only. */
	long count(int idx) {
		return countOf(idHash[packedIndex(idx)]);
	}
| |
| // Brute force approach only for testing. |
| int findIndex(int key) { |
| for (int i = 0; i < idSize; i++) |
| if (key(i) == key) |
| return i; |
| return -1; |
| } |
| |
	/**
	 * Map a logical index in [0, idSize) to its physical array position;
	 * after {@link #sort()} the occupied records sit at the end of the array.
	 */
	private int packedIndex(int idx) {
		return (idHash.length - idSize) + idx;
	}
| |
	/**
	 * Record {@code cnt} bytes under the region hash {@code key}.
	 * <p>
	 * Open-addressed insertion with linear probing; repeated keys accumulate
	 * their counts into the same record.
	 *
	 * @param key
	 *            raw region hash
	 * @param cnt
	 *            number of bytes in the region (must be positive)
	 * @throws TableFullException
	 *             the table cannot grow further, or a count overflowed.
	 */
	void add(int key, int cnt) throws TableFullException {
		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

		int j = slot(key);
		for (;;) {
			long v = idHash[j];
			if (v == 0) {
				// Empty slot in the table, store here.
				if (idGrowAt <= idSize) {
					grow();
					j = slot(key); // table was rebuilt; restart the probe
					continue;
				}
				idHash[j] = pair(key, cnt);
				idSize++;
				return;

			} else if (keyOf(v) == key) {
				// Same key, increment the counter. If it overflows, fail
				// indexing to prevent the key from being impacted.
				//
				idHash[j] = pair(key, countOf(v) + cnt);
				return;

			} else if (++j >= idHash.length) {
				j = 0; // wrap the linear probe back to the start
			}
		}
	}
| |
| private static long pair(int key, long cnt) throws TableFullException { |
| if (MAX_COUNT < cnt) |
| throw new TableFullException(); |
| return (((long) key) << KEY_SHIFT) | cnt; |
| } |
| |
	/** Home slot for a key: its top {@code idHashBits} significant bits. */
	private int slot(int key) {
		// We use 31 - idHashBits because the upper bit was already forced
		// to be 0 and we want the remaining high bits to be used as the
		// table slot.
		//
		return key >>> (31 - idHashBits);
	}
| |
	/**
	 * Occupancy threshold that triggers a resize of the table.
	 * <p>
	 * Equals capacity * (idHashBits - 3) / idHashBits; e.g. a 256 slot table
	 * (8 bits) grows once 160 slots (62.5%) are used, with the tolerated load
	 * factor rising as the table gets larger.
	 */
	private static int growAt(int idHashBits) {
		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
	}
| |
| @SuppressWarnings("UnusedException") |
| private void grow() throws TableFullException { |
| if (idHashBits == 30) |
| throw new TableFullException(); |
| |
| long[] oldHash = idHash; |
| int oldSize = idHash.length; |
| |
| idHashBits++; |
| idGrowAt = growAt(idHashBits); |
| |
| try { |
| idHash = new long[1 << idHashBits]; |
| } catch (OutOfMemoryError noMemory) { |
| throw TABLE_FULL_OUT_OF_MEMORY; |
| } |
| |
| for (int i = 0; i < oldSize; i++) { |
| long v = oldHash[i]; |
| if (v != 0) { |
| int j = slot(keyOf(v)); |
| while (idHash[j] != 0) |
| if (++j >= idHash.length) |
| j = 0; |
| idHash[j] = v; |
| } |
| } |
| } |
| |
	/** Extract the content key from the upper 32 bits of a table record. */
	private static int keyOf(long v) {
		return (int) (v >>> KEY_SHIFT);
	}
| |
	/** Extract the byte count from the lower 32 bits of a table record. */
	private static long countOf(long v) {
		return v & MAX_COUNT;
	}
| |
	/**
	 * Thrown by {@code create()} when file is too large: the table cannot
	 * grow, or an individual byte counter overflowed.
	 */
	public static class TableFullException extends Exception {
		private static final long serialVersionUID = 1L;
	}
| } |