Skip to content

Commit 4eff23a

Browse files
cxzl25 authored and dongjoon-hyun committed
ORC-1610: Reduce the number of hash computation in CuckooSetBytes
### What changes were proposed in this pull request?

Add boundary conditions on "length" with the min/max length stored in the hashes.

### Why are the changes needed?

https://issues.apache.org/jira/browse/HIVE-24205

> This would significantly reduce the number of hash computation that needs to happen.

```
main insert:00:00:00.689
main lookup:00:00:01.124
PR insert:00:00:00.628
PR lookup:00:00:01.055
```

```java
@Test
public void testLen() {
  int maxSize = 200000;
  Random gen = new Random();
  String[] strings = new String[maxSize];
  for (int i = 0; i < maxSize; i++) {
    strings[i] = RandomStringUtils.random(Math.abs(gen.nextInt(1000)));
  }
  byte[][] values = getByteArrays(strings);

  StopWatch mainSW = new StopWatch();
  // load set
  mainSW.start();
  CuckooSetBytes main = new CuckooSetBytes(strings.length);
  main.fastLookup = false;
  for (byte[] v : values) {
    main.insert(v);
  }
  mainSW.split();
  System.out.println("main insert:" + mainSW);
  // test that the values we added are there
  for (byte[] v : values) {
    assertTrue(main.lookup(v, 0, v.length));
  }
  mainSW.stop();
  System.out.println("main lookup:" + mainSW);

  StopWatch prSW = new StopWatch();
  prSW.start();
  CuckooSetBytes pr = new CuckooSetBytes(strings.length);
  pr.fastLookup = true;
  for (byte[] v : values) {
    pr.insert(v);
  }
  prSW.split();
  System.out.println("PR insert:" + prSW);
  for (byte[] v : values) {
    assertTrue(pr.lookup(v, 0, v.length));
  }
  prSW.stop();
  System.out.println("PR lookup:" + prSW);
}
```

### How was this patch tested?

GA

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #1785 from cxzl25/ORC-1610.

Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 4284340 commit 4eff23a

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

java/core/src/java/org/apache/orc/util/CuckooSetBytes.java

+7
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,8 @@ public class CuckooSetBytes {
4545
private int rehashCount = 0;
4646
private static final long INT_MASK = 0x00000000ffffffffL;
4747
private static final long BYTE_MASK = 0x00000000000000ffL;
48+
private int maxLen;
49+
private int minLen = Integer.MAX_VALUE;
4850
// some prime numbers spaced about at powers of 2 in magnitude
4951
static final int[] primes = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571, 1019, 2089,
5052
4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909, 4074073,
@@ -84,6 +86,9 @@ public CuckooSetBytes(int expectedSize) {
8486
* and ending at start+len is present in the set.
8587
*/
8688
public boolean lookup(byte[] b, int start, int len) {
89+
if (len < minLen || len > maxLen) {
90+
return false;
91+
}
8792

8893
return entryEqual(t1, h1(b, start, len), b, start, len) ||
8994
entryEqual(t2, h2(b, start, len), b, start, len);
@@ -98,6 +103,8 @@ public void insert(byte[] x) {
98103
if (lookup(x, 0, x.length)) {
99104
return;
100105
}
106+
minLen = Math.min(minLen, x.length);
107+
maxLen = Math.max(maxLen, x.length);
101108

102109
// Try to insert up to n times. Rehash if that fails.
103110
for(int i = 0; i != n; i++) {

0 commit comments

Comments
 (0)