Merge remote-tracking branch 'origin/topic/johanna/bit-1612'

Adding a new random seed for external tests.

I added a wrapper around the siphash() function to make calling it a
little bit safer at least.

BIT-1612 #merged

* origin/topic/johanna/bit-1612:
  HLL: Fix missing typecast in test case.
  Remove the -K/-J options for setting keys.
  Add test checking the quality of HLL by adding a lot of elements.
  Fix serializing probabilistic hashers.
  Baseline updates after hash function change.
  Also switch BloomFilters from H3 to siphash.
  Change Hashing from H3 to Siphash.
  HLL: Remove unnecessary comparison.
  Hyperloglog: change calculation of Rho
This commit is contained in:
Robin Sommer 2016-07-14 16:00:03 -07:00
commit 4d84ee82da
347 changed files with 26269 additions and 26053 deletions

View file

@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const
return answer;
}
void CardinalityCounter::Init(uint64 size)
void CardinalityCounter::Init(uint64_t size)
{
m = size;
buckets = new uint8_t[m];
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
@ -51,60 +50,83 @@ void CardinalityCounter::Init(uint64 size)
else
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
for ( uint64 i = 0; i < m; i++ )
buckets[i] = 0;
double calc_p = log2(m);
if ( trunc(calc_p) != calc_p )
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
p = calc_p;
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
assert(buckets.size() == m);
V = m;
}
/**
 * Copy constructor: replicates the complete estimator state of 'other'.
 *
 * NOTE(review): this text shows an old and a new revision of the body
 * interleaved (diff rendering without +/- markers).  Init()+Merge()
 * rebuild the full state on their own, while the init list plus the
 * direct member copies below do the same thing again — as written,
 * Init() would push m more entries into the already-copied bucket
 * vector and trip its size assertion.  Confirm against the merged tree
 * which of the two bodies survived.
 */
CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
	: buckets(other.buckets)
	{
	Init(other.GetM());
	Merge(&other);
	V = other.V;
	alpha_m = other.alpha_m;
	m = other.m;
	p = other.p;
	}
/**
 * Move constructor: copies the scalar state (V, alpha_m, m, p) and then
 * steals the bucket storage from 'o'.
 */
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o)
	{
	V = o.V;
	alpha_m = o.alpha_m;
	m = o.m;
	p = o.p;
	// Zero the source's bucket count before taking its vector so the
	// moved-from object reads as empty rather than claiming m buckets
	// it no longer owns.
	o.m = 0;
	buckets = std::move(o.buckets);
	}
/**
 * Constructor choosing the bucket count from a target error margin.
 *
 * @param error_margin desired standard error of the estimate
 * @param confidence confidence of the error
 */
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
	{
	int b = OptimalB(error_margin, confidence);
	// 2^b as an exact integer shift; the previous pow(2, b) computed the
	// power in floating point and truncated it back to an integer, which
	// is both slower and fragile for large b.
	Init(uint64_t(1) << b);
	// Init() derives p = log2(m); it must round-trip to the b we chose.
	assert(b == p);
	}
CardinalityCounter::CardinalityCounter(uint64 size)
CardinalityCounter::CardinalityCounter(uint64_t size)
{
Init(size);
}
CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
{
m = arg_size;
buckets = new uint8_t[m];
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
alpha_m = arg_alpha_m;
V = arg_V;
p = log2(m);
}
/**
 * Destructor.  'buckets' is a std::vector (see the member declaration),
 * so it releases its own storage; the old 'delete [] buckets' from the
 * raw-array implementation would not even compile against a vector.
 */
CardinalityCounter::~CardinalityCounter()
	{
	}
uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
{
uint8_t answer = 0;
hash_modified = (uint64)(hash_modified / m);
hash_modified *= 2;
do {
hash_modified = (uint64)(hash_modified / 2);
answer++;
} while ( hash_modified % 2 == 0);
hash_modified = hash_modified >> p;
int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
assert(answer > 0 && answer < 64);
return answer;
}
void CardinalityCounter::AddElement(uint64 hash)
void CardinalityCounter::AddElement(uint64_t hash)
{
uint64 index = hash % m;
uint64_t index = hash % m;
hash = hash-index;
if( buckets[index] == 0 )
@ -118,7 +140,7 @@ void CardinalityCounter::AddElement(uint64 hash)
/**
* Estimate the size by using the the "raw" HyperLogLog estimate. Then,
* check if it's too "large" or "small" because the raw estimate doesn't
* check if it's too "large" or "small" because the raw estimate doesn't
* do well in those cases.
* Thus, we correct for those errors as specified in the paper.
*
@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
if ( m != c->GetM() )
return false;
uint8_t* temp = c->GetBuckets();
const vector<uint8_t> temp = c->GetBuckets();
V = 0;
@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
return true;
}
uint8_t* CardinalityCounter::GetBuckets()
const vector<uint8_t> &CardinalityCounter::GetBuckets() const
{
return buckets;
}
uint64 CardinalityCounter::GetM() const
uint64_t CardinalityCounter::GetM() const
{
return m;
}
@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const
CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
{
uint64_t m;
uint64 V;
uint64_t V;
double alpha_m;
bool valid = true;
@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);
uint8_t* buckets = c->buckets;
vector<uint8_t>& buckets = c->buckets;
for ( unsigned int i = 0; i < m; i++ )
{
char c;
valid &= UNSERIALIZE(&c);
buckets[i] = (uint8)c;
buckets[i] = (uint8_t)c;
}
if ( ! valid )
@ -219,3 +241,51 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
return c;
}
/**
* The following function is copied from libc/string/flsll.c from the FreeBSD source
* tree. Original copyright message follows
*/
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Find Last Set bit
*/
/*
 * Find Last Set bit: 1-based index of the most significant set bit of
 * 'mask', or 0 when 'mask' is zero.  (flsll from FreeBSD; Linux libc
 * does not provide it.)
 */
int
CardinalityCounter::flsll(uint64_t mask)
	{
	if ( mask == 0 )
		return 0;

	int position = 1;

	while ( mask != 1 )
		{
		mask >>= 1;
		++position;
		}

	return position;
	}

View file

@ -28,13 +28,18 @@ public:
*
* @param confidence confidence of the error. Default: 0.95
*/
CardinalityCounter(double error_margin, double confidence = 0.95);
explicit CardinalityCounter(double error_margin, double confidence = 0.95);
/**
* Copy-Constructor
*/
CardinalityCounter(CardinalityCounter& other);
/**
* Move-Constructor
*/
CardinalityCounter(CardinalityCounter&& o);
/**
* Constructor for a known number of buckets.
*
@ -43,7 +48,7 @@ public:
*
* @param size number of buckets to create
*/
CardinalityCounter(uint64 size);
explicit CardinalityCounter(uint64_t size);
/**
* Destructor.
@ -58,7 +63,7 @@ public:
*
* @param hash 64-bit hash value of the element to be added
*/
void AddElement(uint64 hash);
void AddElement(uint64_t hash);
/**
* Get the current estimated number of elements in the data
@ -104,7 +109,7 @@ protected:
*
* @return Number of buckets
*/
uint64 GetM() const;
uint64_t GetM() const;
/**
* Returns the buckets array that holds all of the rough cardinality
@ -114,21 +119,21 @@ protected:
*
* @return Array containing cardinality estimates
*/
uint8_t* GetBuckets();
const std::vector<uint8_t>& GetBuckets() const;
private:
/**
* Constructor used when unserializing, i.e., all parameters are
* known.
*/
CardinalityCounter(uint64 size, uint64 V, double alpha_m);
explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m);
/**
* Helper function with code used jointly by multiple constructors.
*
* @param arg_size: number of buckets that need to be kept
*/
void Init(uint64 arg_size);
void Init(uint64_t arg_size);
/**
* This function calculates the smallest value of b that will
@ -150,22 +155,28 @@ private:
int OptimalB(double error, double confidence) const;
/**
* Determines at which index (counted from the back) the first one-bit
* Determines at which index (counted from the front) the first one-bit
* appears. The last b bits have to be 0 (the element has to be divisible
* by m), hence they are ignored.
* by m), hence they are ignored. Always adds 1 to the result. This is the
* rho function from the original algorithm.
*
* @param hash_modified hash value
*
* @returns index of first one-bit
*/
uint8_t Rank(uint64 hash_modified) const;
uint8_t Rank(uint64_t hash_modified) const;
/**
* flsll from FreeBSD; especially Linux does not have this.
*/
static int flsll(uint64_t mask);
/**
* This is the number of buckets that will be stored. The standard
* error is 1.04/sqrt(m), so the actual cardinality will be the
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
*/
uint64 m;
uint64_t m;
/**
* These are the actual buckets that are storing an estimate of the
@ -173,7 +184,7 @@ private:
* appears in the bitstring and that location is at most 65, so not
* that many bits are needed to store it.
*/
uint8_t* buckets;
std::vector<uint8_t> buckets;
/**
* There are some state constants that need to be kept track of to
@ -181,8 +192,9 @@ private:
* buckets that are 0 and this is used in the small error correction.
* alpha_m is a multiplicative constant used in the algorithm.
*/
uint64 V;
uint64_t V;
double alpha_m;
int p; // the log2 of m
};
}

View file

@ -5,18 +5,21 @@
#include "Hasher.h"
#include "NetVar.h"
#include "digest.h"
#include "Serializer.h"
#include "digest.h"
#include "siphash24.h"
using namespace probabilistic;
uint64 Hasher::MakeSeed(const void* data, size_t size)
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
{
u_char buf[SHA256_DIGEST_LENGTH];
uint64 tmpseed;
seed_t tmpseed;
SHA256_CTX ctx;
sha256_init(&ctx);
assert(sizeof(tmpseed) == 16);
if ( data )
sha256_update(&ctx, data, size);
@ -56,7 +59,10 @@ bool Hasher::DoSerialize(SerialInfo* info) const
if ( ! SERIALIZE(static_cast<uint16>(k)) )
return false;
return SERIALIZE(static_cast<uint64>(seed));
if ( ! SERIALIZE(static_cast<uint64>(seed.h1)) )
return false;
return SERIALIZE(static_cast<uint64>(seed.h2));
}
bool Hasher::DoUnserialize(UnserialInfo* info)
@ -70,8 +76,11 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
k = serial_k;
assert(k > 0);
uint64 serial_seed;
if ( ! UNSERIALIZE(&serial_seed) )
seed_t serial_seed;
if ( ! UNSERIALIZE(&serial_seed.h1) )
return false;
if ( ! UNSERIALIZE(&serial_seed.h2) )
return false;
seed = serial_seed;
@ -79,14 +88,18 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
return true;
}
Hasher::Hasher(size_t arg_k, size_t arg_seed)
Hasher::Hasher(size_t arg_k, seed_t arg_seed)
{
k = arg_k;
seed = arg_seed;
}
UHF::UHF(size_t arg_seed)
: h(arg_seed)
// Default constructor: start from an all-zero 128-bit seed.
UHF::UHF()
	{
	seed = Hasher::seed_t{};
	}
// Construct a hash function keyed with the given 128-bit seed.
UHF::UHF(Hasher::seed_t arg_seed)
	: seed(arg_seed)
	{
	}
@ -96,8 +109,14 @@ UHF::UHF(size_t arg_seed)
// times.
Hasher::digest UHF::hash(const void* x, size_t n) const
{
assert(sizeof(Hasher::seed_t) == SIPHASH_KEYLEN);
if ( n <= UHASH_KEY_SIZE )
return n == 0 ? 0 : h(x, n);
{
hash_t outdigest;
siphash(&outdigest, reinterpret_cast<const uint8_t*>(x), n, reinterpret_cast<const uint8_t*>(&seed));
return outdigest;
}
unsigned char d[16];
MD5(reinterpret_cast<const unsigned char*>(x), n, d);
@ -111,11 +130,15 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
return *reinterpret_cast<const Hasher::digest*>(d);
}
DefaultHasher::DefaultHasher(size_t k, size_t seed)
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
: Hasher(k, seed)
{
for ( size_t i = 1; i <= k; ++i )
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
{
seed_t s = Seed();
s.h1 += bro_prng(i);
hash_functions.push_back(UHF(s));
}
}
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
@ -158,12 +181,16 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
hash_functions.clear();
for ( size_t i = 0; i < K(); ++i )
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
{
Hasher::seed_t s = Seed();
s.h1 += bro_prng(i);
hash_functions.push_back(UHF(s));
}
return true;
}
DoubleHasher::DoubleHasher(size_t k, size_t seed)
DoubleHasher::DoubleHasher(size_t k, seed_t seed)
: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
{
}

View file

@ -4,7 +4,6 @@
#define PROBABILISTIC_HASHER_H
#include "Hash.h"
#include "H3.h"
#include "SerialObj.h"
namespace probabilistic {
@ -17,6 +16,15 @@ class Hasher : public SerialObj {
public:
typedef hash_t digest;
typedef std::vector<digest> digest_vector;
struct seed_t {
uint64_t h1;
uint64_t h2;
friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
lhs.h1 += rhs;
return lhs;
}
};
/**
* Creates a valid hasher seed from an arbitrary string.
@ -30,7 +38,7 @@ public:
*
* @return A seed suitable for hashers.
*/
static uint64 MakeSeed(const void* data, size_t size);
static seed_t MakeSeed(const void* data, size_t size);
/**
* Destructor.
@ -89,7 +97,7 @@ public:
/**
* Returns the seed used to construct the hasher.
*/
size_t Seed() const { return seed; }
seed_t Seed() const { return seed; }
bool Serialize(SerialInfo* info) const;
static Hasher* Unserialize(UnserialInfo* info);
@ -106,11 +114,11 @@ protected:
*
* @param arg_seed The seed for the hasher.
*/
Hasher(size_t arg_k, size_t arg_seed);
Hasher(size_t arg_k, seed_t arg_seed);
private:
size_t k;
size_t seed;
seed_t seed;
};
/**
@ -120,12 +128,17 @@ private:
class UHF {
public:
/**
* Constructs an H3 hash function seeded with a given seed and an
* Default constructor with zero seed.
*/
UHF();
/**
* Constructs an hash function seeded with a given seed and an
* optional extra seed to replace the initial Bro seed.
*
* @param arg_seed The seed to use for this instance.
*/
UHF(size_t arg_seed = 0);
UHF(Hasher::seed_t arg_seed);
template <typename T>
Hasher::digest operator()(const T& x) const
@ -159,7 +172,8 @@ public:
friend bool operator==(const UHF& x, const UHF& y)
{
return x.h == y.h;
return (x.seed.h1 == y.seed.h1) &&
(x.seed.h2 == y.seed.h2);
}
friend bool operator!=(const UHF& x, const UHF& y)
@ -168,10 +182,9 @@ public:
}
private:
static size_t compute_seed(size_t seed);
static size_t compute_seed(Hasher::seed_t seed);
H3<Hasher::digest, UHASH_KEY_SIZE> h;
size_t seed;
Hasher::seed_t seed;
};
@ -188,7 +201,7 @@ public:
*
* @param seed The seed for the hasher.
*/
DefaultHasher(size_t k, size_t seed);
DefaultHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final;
@ -216,7 +229,7 @@ public:
*
* @param seed The seed for the hasher.
*/
DoubleHasher(size_t k, size_t seed);
DoubleHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final;

View file

@ -42,7 +42,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
size_t cells = BasicBloomFilter::M(fp, capacity);
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DoubleHasher(optimal_k, seed);
@ -66,7 +66,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
function bloomfilter_basic_init2%(k: count, cells: count,
name: string &default=""%): opaque of bloomfilter
@ -82,7 +82,7 @@ function bloomfilter_basic_init2%(k: count, cells: count,
return 0;
}
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DoubleHasher(k, seed);
@ -121,7 +121,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
return 0;
}
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DefaultHasher(k, seed);