diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cf49ca2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.classpath +.project +.settings/ +target/ \ No newline at end of file diff --git a/pom.xml b/pom.xml index eb50710..4e9d12f 100644 --- a/pom.xml +++ b/pom.xml @@ -75,6 +75,7 @@ org.apache.maven.plugins maven-gpg-plugin + 1.6 sign-artifacts @@ -148,37 +149,16 @@ - it.unimi.dsi - fastutil - ${fastutil-version} + com.carrotsearch + hppc + 0.7.1 - - org.easymock - easymock - ${easymock-version} - test - - - org.powermock - powermock-module-junit4 - ${powermock-version} - test - - - org.powermock - powermock-api-easymock - ${powermock-version} - test - - - - org.testng - testng - ${testng-version} - test - jdk15 + com.carrotsearch.randomizedtesting + randomizedtesting-runner + 2.1.14 + test @@ -190,7 +170,5 @@ 3.0 1.4.8 - 5.7 - 6.5.11 \ No newline at end of file diff --git a/src/main/java/net/agkn/hll/HLL.java b/src/main/java/net/agkn/hll/HLL.java index cdfd3ad..688213a 100644 --- a/src/main/java/net/agkn/hll/HLL.java +++ b/src/main/java/net/agkn/hll/HLL.java @@ -18,8 +18,11 @@ import java.util.Arrays; -import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import com.carrotsearch.hppc.IntByteHashMap; +import com.carrotsearch.hppc.LongHashSet; +import com.carrotsearch.hppc.cursors.IntByteCursor; +import com.carrotsearch.hppc.cursors.LongCursor; + import net.agkn.hll.serialization.HLLMetadata; import net.agkn.hll.serialization.IHLLMetadata; import net.agkn.hll.serialization.ISchemaVersion; @@ -79,11 +82,11 @@ public class HLL implements Cloneable { // ************************************************************************ // Storage // storage used when #type is EXPLICIT, null otherwise - private LongOpenHashSet explicitStorage; + LongHashSet explicitStorage; // storage used when #type is SPARSE, null otherwise - private Int2ByteOpenHashMap sparseProbabilisticStorage; + IntByteHashMap sparseProbabilisticStorage; // storage used when #type is FULL, null otherwise - private BitVector probabilisticStorage; + BitVector probabilisticStorage; // current type of this HLL instance, if this changes then so should the // storage used (see above) @@ -347,13 +350,13 @@ public void addRaw(final long rawValue) { if(explicitStorage.size() > explicitThreshold) { if(!sparseOff) { initializeStorage(HLLType.SPARSE); - for(final long value : explicitStorage) { - addRawSparseProbabilistic(value); + for (LongCursor c : explicitStorage) { + addRawSparseProbabilistic(c.value); } } else { initializeStorage(HLLType.FULL); - for(final long value : explicitStorage) { - addRawProbabilistic(value); + for (LongCursor c : explicitStorage) { + addRawProbabilistic(c.value); } } explicitStorage = null; @@ -366,8 +369,9 @@ public void addRaw(final long rawValue) { // promotion, if necessary if(sparseProbabilisticStorage.size() > sparseThreshold) { initializeStorage(HLLType.FULL); - for(final int registerIndex : sparseProbabilisticStorage.keySet()) { - final byte registerValue = sparseProbabilisticStorage.get(registerIndex); + for(IntByteCursor c : sparseProbabilisticStorage) { + final int registerIndex = c.key; + final byte registerValue = c.value; probabilisticStorage.setMaxRegister(registerIndex, registerValue); } sparseProbabilisticStorage = null; @@ -423,7 +427,7 @@ private void addRawSparseProbabilistic(final long rawValue) { // NOTE: no +1 as in paper since 0-based indexing final int j = (int)(rawValue & mBitsMask); - final byte currentValue = sparseProbabilisticStorage.get(j); + final byte currentValue = sparseProbabilisticStorage.getOrDefault(j, (byte) 0); if(p_w > currentValue) { sparseProbabilisticStorage.put(j, p_w); } @@ -488,10 +492,10 @@ private void initializeStorage(final HLLType type) { // nothing to be done break; case EXPLICIT: - this.explicitStorage = new LongOpenHashSet(); + this.explicitStorage = new LongHashSet(); break; case SPARSE: - this.sparseProbabilisticStorage = new Int2ByteOpenHashMap(); + this.sparseProbabilisticStorage = new IntByteHashMap(); break; case FULL: this.probabilisticStorage = new BitVector(regwidth, m); @@ -541,7 +545,7 @@ public long cardinality() { double sum = 0; int numberOfZeroes = 0/*"V" in the paper*/; for(int j=0; j currentRegisterValue) { - sparseProbabilisticStorage.put(registerIndex, registerValue); - } + for(IntByteCursor c : other.sparseProbabilisticStorage) { + final int registerIndex = c.key; + final byte registerValue = c.value; + final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex); + if(registerValue > currentRegisterValue) { + sparseProbabilisticStorage.put(registerIndex, registerValue); + } } // promotion, if necessary if(sparseProbabilisticStorage.size() > sparseThreshold) { initializeStorage(HLLType.FULL); - for(final int registerIndex : sparseProbabilisticStorage.keySet()) { - final byte registerValue = sparseProbabilisticStorage.get(registerIndex); - probabilisticStorage.setMaxRegister(registerIndex, registerValue); + for(IntByteCursor c : sparseProbabilisticStorage) { + final int registerIndex = c.key; + final byte registerValue = c.value; + probabilisticStorage.setMaxRegister(registerIndex, registerValue); } sparseProbabilisticStorage = null; } @@ -887,7 +896,7 @@ public byte[] toBytes(final ISchemaVersion schemaVersion) { final IWordSerializer serializer = schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size()); - final long[] values = explicitStorage.toLongArray(); + final long[] values = explicitStorage.toArray(); Arrays.sort(values); for(final long value : values) { serializer.writeWord(value); @@ -900,9 +909,10 @@ public byte[] toBytes(final ISchemaVersion schemaVersion) { final IWordSerializer serializer = schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size()); - final int[] indices = sparseProbabilisticStorage.keySet().toIntArray(); + final int[] indices = sparseProbabilisticStorage.keys().toArray(); Arrays.sort(indices); for(final int registerIndex : indices) { + assert sparseProbabilisticStorage.containsKey(registerIndex); final long registerValue = sparseProbabilisticStorage.get(registerIndex); // pack index and value into "short word" final long shortWord = ((registerIndex << regwidth) | registerValue); diff --git a/src/main/java/net/agkn/hll/util/HLLUtil.java b/src/main/java/net/agkn/hll/util/HLLUtil.java index d3f97f2..a45155d 100644 --- a/src/main/java/net/agkn/hll/util/HLLUtil.java +++ b/src/main/java/net/agkn/hll/util/HLLUtil.java @@ -59,7 +59,7 @@ public final class HLLUtil { * * @see #largeEstimator(int, int, double) * @see #largeEstimatorCutoff(int, int) - * @see Blog post with section on 2^L + * @see "Blog post with section on 2^L" */ private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)]; @@ -178,7 +178,7 @@ public static double smallEstimator(final int m, final int numberOfZeroes) { * @param registerSizeInBits the size of the HLL registers, in bits. * @return the cutoff for the large range correction. * @see #largeEstimator(int, int, double) - * @see Blog post with section on 64 bit hashes and "large range correction" cutoff + * @see "Blog post with section on 64 bit hashes and 'large range correction' cutoff" */ public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) { return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0; @@ -193,7 +193,7 @@ public static double largeEstimatorCutoff(final int log2m, final int registerSiz * @param registerSizeInBits the size of the HLL registers, in bits. * @param estimator the original estimator ("E" in the paper). * @return a corrected cardinality estimate. - * @see Blog post with section on 64 bit hashes and "large range correction" + * @see "Blog post with section on 64 bit hashes and 'large range correction'" */ public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) { final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]; diff --git a/src/test/java/net/agkn/hll/ExplicitHLLTest.java b/src/test/java/net/agkn/hll/ExplicitHLLTest.java index 80cf526..c942809 100644 --- a/src/test/java/net/agkn/hll/ExplicitHLLTest.java +++ b/src/test/java/net/agkn/hll/ExplicitHLLTest.java @@ -16,24 +16,22 @@ * limitations under the License. */ -import static org.powermock.reflect.Whitebox.getInternalState; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import org.junit.Test; import java.util.HashSet; -import java.util.Random; import net.agkn.hll.serialization.ISchemaVersion; import net.agkn.hll.serialization.SerializationUtil; -import org.testng.annotations.Test; + +import com.carrotsearch.hppc.LongHashSet; +import com.carrotsearch.randomizedtesting.RandomizedTest; /** * Tests {@link HLL} of type {@link HLLType#EXPLICIT}. * * @author timon */ -public class ExplicitHLLTest { +public class ExplicitHLLTest extends RandomizedTest { /** * Tests basic set semantics of {@link HLL#addRaw(long)}. */ @@ -180,11 +178,9 @@ public void randomValuesTest() { final HashSet canonical = new HashSet(); final HLL hll = newHLL(explicitThreshold); - final long seed = 1L/*constant so results are reproducible*/; - final Random random = new Random(seed); for(int i=0;inull. * @return the populated HLL. This will never be null. */ - public static HLL generateRandomHLL(final Random random) { - final int randomTypeInt = random.nextInt(HLLType.values().length); + public static HLL generateRandomHLL() { + final int randomTypeInt = randomIntBetween(0, HLLType.values().length - 1); final HLLType type; switch(randomTypeInt) { case 0: @@ -595,10 +590,10 @@ public static HLL generateRandomHLL(final Random random) { final HLL hll = newHLL(HLLType.EMPTY); for(int i=0; i randoms = new ArrayList(randomCount){{ - for (int i=0; i randoms = new ArrayList(randomCount); + for (int i=0; i