| /* |
| * Copyright (C) 2010, Google Inc. and others |
| * |
| * This program and the accompanying materials are made available under the |
| * terms of the Eclipse Distribution License v. 1.0 which is available at |
| * https://www.eclipse.org/org/documents/edl-v10.php. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| |
| package org.eclipse.jgit.diff; |
| |
| import java.io.EOFException; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.Arrays; |
| |
| import org.eclipse.jgit.errors.MissingObjectException; |
| import org.eclipse.jgit.lib.ObjectLoader; |
| import org.eclipse.jgit.lib.ObjectStream; |
| |
| /** |
| * Index structure of lines/blocks in one file. |
| * <p> |
| * This structure can be used to compute an approximation of the similarity |
| * between two files. The index is used by |
| * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores |
| * between files. |
| * <p> |
| * To save space in memory, this index uses a space efficient encoding which |
| * will not exceed 1 MiB per instance. The index starts out at a smaller size |
| * (closer to 2 KiB), but may grow as more distinct blocks within the scanned |
| * file are discovered. |
| * |
| * @since 4.0 |
| */ |
public class SimilarityIndex {
	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
	public static final TableFullException
			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

	/**
	 * Shift to apply before storing a key.
	 * <p>
	 * Within the 64 bit table record space, we leave the highest bit unset so
	 * all values are positive. The lower 32 bits are used to count bytes.
	 */
	private static final int KEY_SHIFT = 32;

	/** Maximum value of the count field, also mask to extract the count. */
	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

	/**
	 * Total amount of bytes hashed into the structure, including \n. This is
	 * usually the size of the file minus number of CRLF encounters.
	 */
	private long hashedCnt;

	/** Number of non-zero entries in {@link #idHash}. */
	private int idSize;

	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
	private int idGrowAt;

	/**
	 * Pairings of content keys and counters.
	 * <p>
	 * Slots in the table are actually two ints wedged into a single long. The
	 * upper 32 bits stores the content key, and the remaining lower bits stores
	 * the number of bytes associated with that key. Empty slots are denoted by
	 * 0, which cannot occur because the count cannot be 0. Values can only be
	 * positive, which we enforce during key addition.
	 */
	private long[] idHash;

	/** {@code idHash.length == 1 << idHashBits}. */
	private int idHashBits;
| |
| /** |
| * Create a new similarity index for the given object |
| * |
| * @param obj |
| * the object to hash |
| * @return similarity index for this object |
| * @throws java.io.IOException |
| * file contents cannot be read from the repository. |
| * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException |
| * object hashing overflowed the storage capacity of the |
| * SimilarityIndex. |
| */ |
| public static SimilarityIndex create(ObjectLoader obj) throws IOException, |
| TableFullException { |
| SimilarityIndex idx = new SimilarityIndex(); |
| idx.hash(obj); |
| idx.sort(); |
| return idx; |
| } |
| |
| SimilarityIndex() { |
| idHashBits = 8; |
| idHash = new long[1 << idHashBits]; |
| idGrowAt = growAt(idHashBits); |
| } |
| |
| static boolean isBinary(ObjectLoader obj) throws IOException { |
| if (obj.isLarge()) { |
| try (ObjectStream in1 = obj.openStream()) { |
| return RawText.isBinary(in1); |
| } |
| } |
| byte[] raw = obj.getCachedBytes(); |
| return RawText.isBinary(raw, raw.length, true); |
| } |
| |
| void hash(ObjectLoader obj) throws MissingObjectException, IOException, |
| TableFullException { |
| if (obj.isLarge()) { |
| hashLargeObject(obj); |
| } else { |
| byte[] raw = obj.getCachedBytes(); |
| hash(raw, 0, raw.length); |
| } |
| } |
| |
| private void hashLargeObject(ObjectLoader obj) throws IOException, |
| TableFullException { |
| boolean text; |
| text = !isBinary(obj); |
| |
| try (ObjectStream in2 = obj.openStream()) { |
| hash(in2, in2.getSize(), text); |
| } |
| } |
| |
	/**
	 * Hash a region of a byte array into the table.
	 * <p>
	 * Content is split into scoring regions: one line (terminated by '\n') or
	 * at most 64 input bytes, whichever ends first. Each region is hashed with
	 * a djb2-style rolling hash (seed 5381, hash = hash * 33 + c) and its byte
	 * count is accumulated via {@link #add(int, int)}. Resets
	 * {@link #hashedCnt} before indexing.
	 *
	 * @param raw
	 *            buffer holding the content
	 * @param ptr
	 *            offset of the first byte to hash
	 * @param end
	 *            offset one past the last byte to hash
	 * @throws TableFullException
	 *             table overflowed; content cannot be indexed.
	 */
	void hash(byte[] raw, int ptr, int end) throws TableFullException {
		// CRLF normalization below applies only when the content is text.
		final boolean text = !RawText.isBinary(raw, raw.length, true);
		hashedCnt = 0;
		while (ptr < end) {
			int hash = 5381; // djb2 seed
			int blockHashedCnt = 0;
			int start = ptr;

			// Hash one line, or one block, whichever occurs first.
			do {
				int c = raw[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				// (the CR is consumed but neither counted nor mixed in).
				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n') // LF is counted but not mixed into the hash
					break;
				hash = (hash << 5) + hash + c;
			} while (ptr < end && ptr - start < 64);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
		}
	}
| |
	/**
	 * Hash {@code remaining} bytes of a stream into the table.
	 * <p>
	 * Uses the same region rules as {@link #hash(byte[], int, int)}: one line
	 * or at most 64 bytes per region, with CR dropped before LF when
	 * {@code text}. Unlike the array form, this does NOT reset
	 * {@link #hashedCnt}; counts accumulate.
	 * <p>
	 * NOTE(review): a CR that lands at the very end of the read buffer is not
	 * recognized as part of a CRLF pair ({@code ptr < cnt} guard) — confirm
	 * this slight asymmetry with the array form is acceptable.
	 *
	 * @param in
	 *            stream to read; consumed as a side effect
	 * @param remaining
	 *            exact number of bytes expected from the stream
	 * @param text
	 *            whether CRLF normalization should be applied
	 * @throws IOException
	 *             the stream could not be read, or ended early
	 * @throws TableFullException
	 *             table overflowed; content cannot be indexed.
	 */
	void hash(InputStream in, long remaining, boolean text) throws IOException,
			TableFullException {
		byte[] buf = new byte[4096];
		int ptr = 0; // next unread position within buf
		int cnt = 0; // number of valid bytes currently in buf

		while (0 < remaining) {
			int hash = 5381; // djb2 seed
			int blockHashedCnt = 0;

			// Hash one line, or one block, whichever occurs first.
			int n = 0; // input bytes consumed for this region
			do {
				if (ptr == cnt) {
					// Buffer exhausted; refill from the stream.
					ptr = 0;
					cnt = in.read(buf, 0, buf.length);
					if (cnt <= 0)
						throw new EOFException();
				}

				n++;
				int c = buf[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (n < 64 && n < remaining);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
			remaining -= n;
		}
	}
| |
	/**
	 * Sort the internal table so it can be used for efficient scoring.
	 * <p>
	 * Once sorted, additional lines/blocks cannot be added to the index:
	 * {@link #add(int, int)} relies on hash-slot placement, which sorting
	 * destroys.
	 */
	void sort() {
		// Sort the array. All of the empty space will wind up at the front,
		// because we forced all of the keys to always be positive. Later
		// we only work with the back half of the array.
		//
		Arrays.sort(idHash);
	}
| |
| /** |
| * Compute the similarity score between this index and another. |
| * <p> |
| * A region of a file is defined as a line in a text file or a fixed-size |
| * block in a binary file. To prepare an index, each region in the file is |
| * hashed; the values and counts of hashes are retained in a sorted table. |
| * Define the similarity fraction F as the count of matching regions |
| * between the two files divided between the maximum count of regions in |
| * either file. The similarity score is F multiplied by the maxScore |
| * constant, yielding a range [0, maxScore]. It is defined as maxScore for |
| * the degenerate case of two empty files. |
| * <p> |
| * The similarity score is symmetrical; i.e. a.score(b) == b.score(a). |
| * |
| * @param dst |
| * the other index |
| * @param maxScore |
| * the score representing a 100% match |
| * @return the similarity score |
| */ |
| public int score(SimilarityIndex dst, int maxScore) { |
| long max = Math.max(hashedCnt, dst.hashedCnt); |
| if (max == 0) |
| return maxScore; |
| return (int) ((common(dst) * maxScore) / max); |
| } |
| |
	/**
	 * Total bytes in regions this index has in common with {@code dst}.
	 *
	 * @param dst
	 *            the other index
	 * @return sum over matching keys of the smaller byte count
	 */
	long common(SimilarityIndex dst) {
		return common(this, dst);
	}
| |
| private static long common(SimilarityIndex src, SimilarityIndex dst) { |
| int srcIdx = src.packedIndex(0); |
| int dstIdx = dst.packedIndex(0); |
| long[] srcHash = src.idHash; |
| long[] dstHash = dst.idHash; |
| return common(srcHash, srcIdx, dstHash, dstIdx); |
| } |
| |
	/**
	 * Merge-join two sorted tables, crediting the overlap.
	 * <p>
	 * Both arrays are sorted with occupied records packed at the back (see
	 * {@link #sort()}), so walking forward visits keys in ascending order.
	 * For each key present in both tables the smaller byte count is added to
	 * the running total; keys present on only one side are skipped.
	 */
	private static long common(long[] srcHash, int srcIdx, //
			long[] dstHash, int dstIdx) {
		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
			return 0; // one side has no records; nothing can match

		long common = 0;
		int srcKey = keyOf(srcHash[srcIdx]);
		int dstKey = keyOf(dstHash[dstIdx]);

		for (;;) {
			if (srcKey == dstKey) {
				// Same region appears in both files; the overlap is
				// bounded by the smaller of the two byte counts.
				common += Math.min(countOf(srcHash[srcIdx]),
						countOf(dstHash[dstIdx]));

				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);

			} else if (srcKey < dstKey) {
				// Regions of src which do not appear in dst.
				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

			} else /* if (dstKey < srcKey) */{
				// Regions of dst which do not appear in src.
				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);
			}
		}

		return common;
	}
| |
	/** Number of distinct keys stored in the table. Testing only. */
	int size() {
		return idSize;
	}
| |
	/** Key stored at packed (sorted) position {@code idx}. Testing only. */
	int key(int idx) {
		return keyOf(idHash[packedIndex(idx)]);
	}
| |
	/** Byte count stored at packed (sorted) position {@code idx}. Testing only. */
	long count(int idx) {
		return countOf(idHash[packedIndex(idx)]);
	}
| |
| // Brute force approach only for testing. |
| int findIndex(int key) { |
| for (int i = 0; i < idSize; i++) |
| if (key(i) == key) |
| return i; |
| return -1; |
| } |
| |
	/**
	 * Map a logical index in [0, idSize) to its physical array position;
	 * after {@link #sort()} the occupied records sit at the end of the array.
	 */
	private int packedIndex(int idx) {
		return (idHash.length - idSize) + idx;
	}
| |
	/**
	 * Record {@code cnt} bytes under the region hash {@code key}.
	 * <p>
	 * Open-addressed insertion with linear probing; repeated keys accumulate
	 * their counts into the same record.
	 *
	 * @param key
	 *            raw region hash
	 * @param cnt
	 *            number of bytes in the region (must be positive)
	 * @throws TableFullException
	 *             the table cannot grow further, or a count overflowed.
	 */
	void add(int key, int cnt) throws TableFullException {
		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

		int j = slot(key);
		for (;;) {
			long v = idHash[j];
			if (v == 0) {
				// Empty slot in the table, store here.
				if (idGrowAt <= idSize) {
					grow();
					j = slot(key); // table was rebuilt; restart the probe
					continue;
				}
				idHash[j] = pair(key, cnt);
				idSize++;
				return;

			} else if (keyOf(v) == key) {
				// Same key, increment the counter. If it overflows, fail
				// indexing to prevent the key from being impacted.
				//
				idHash[j] = pair(key, countOf(v) + cnt);
				return;

			} else if (++j >= idHash.length) {
				j = 0; // wrap the linear probe back to the start
			}
		}
	}
| |
| private static long pair(int key, long cnt) throws TableFullException { |
| if (MAX_COUNT < cnt) |
| throw new TableFullException(); |
| return (((long) key) << KEY_SHIFT) | cnt; |
| } |
| |
	/** Home slot for a key: its top {@code idHashBits} significant bits. */
	private int slot(int key) {
		// We use 31 - idHashBits because the upper bit was already forced
		// to be 0 and we want the remaining high bits to be used as the
		// table slot.
		//
		return key >>> (31 - idHashBits);
	}
| |
	/**
	 * Occupancy threshold that triggers a resize of the table.
	 * <p>
	 * Equals capacity * (idHashBits - 3) / idHashBits; e.g. a 256 slot table
	 * (8 bits) grows once 160 slots (62.5%) are used, with the tolerated load
	 * factor rising as the table gets larger.
	 */
	private static int growAt(int idHashBits) {
		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
	}
| |
| @SuppressWarnings("UnusedException") |
| private void grow() throws TableFullException { |
| if (idHashBits == 30) |
| throw new TableFullException(); |
| |
| long[] oldHash = idHash; |
| int oldSize = idHash.length; |
| |
| idHashBits++; |
| idGrowAt = growAt(idHashBits); |
| |
| try { |
| idHash = new long[1 << idHashBits]; |
| } catch (OutOfMemoryError noMemory) { |
| throw TABLE_FULL_OUT_OF_MEMORY; |
| } |
| |
| for (int i = 0; i < oldSize; i++) { |
| long v = oldHash[i]; |
| if (v != 0) { |
| int j = slot(keyOf(v)); |
| while (idHash[j] != 0) |
| if (++j >= idHash.length) |
| j = 0; |
| idHash[j] = v; |
| } |
| } |
| } |
| |
	/** Extract the content key from the upper 32 bits of a table record. */
	private static int keyOf(long v) {
		return (int) (v >>> KEY_SHIFT);
	}
| |
	/** Extract the byte count from the lower 32 bits of a table record. */
	private static long countOf(long v) {
		return v & MAX_COUNT;
	}
| |
	/**
	 * Thrown by {@code create()} when file is too large: the table cannot
	 * grow, or an individual byte counter overflowed.
	 */
	public static class TableFullException extends Exception {
		private static final long serialVersionUID = 1L;
	}
| } |