Also switch BloomFilters from H3 to siphash.

This removes all dependencies on H3 in our source tree.
2025-10-05 16:18:19 +00:00 · 2016-07-13 09:04:10 -07:00 · 2016-07-13 09:04:10 -07:00 · f1bae871e9
commit f1bae871e9
parent e1218cc7fa
8 changed files with 78 additions and 178 deletions
--- a/src/H3.h
+++ b/src/H3.h
@ -1,143 +0,0 @@
-// Copyright 2004, 2005
-// The Regents of the University of California
-// All Rights Reserved
-// 
-// Permission to use, copy, modify and distribute any part of this
-// h3.h file, without fee, and without a written agreement is hereby
-// granted, provided that the above copyright notice, this paragraph
-// and the following paragraphs appear in all copies.
-// 
-// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY
-// PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
-// DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-// 
-// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE
-// UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
-// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY
-// OF CALIFORNIA MAKES NO REPRESENTATIONS AND EXTENDS NO WARRANTIES
-// OF ANY KIND, EITHER IMPLIED OR EXPRESS, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
-// PARTICULAR PURPOSE, OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE
-// ANY PATENT, TRADEMARK OR OTHER RIGHTS.
-// 
-// The h3.h file is developed by the CoralReef development team at the
-// University of California, San Diego under the Cooperative Association
-// for Internet Data Analysis (CAIDA) Program.  Support for this effort was
-// provided by the CAIDA grant NCR-9711092, DARPA NGI Contract
-// N66001-98-2-8922, DARPA NMS Grant N66001-01-1-8909, NSF Grant ANI-013710
-// and by CAIDA members.
-// 
-// Report bugs and suggestions to coral-bugs@caida.org.
-
-// H3 hash function family
-// C++ template implementation by Ken Keys (kkeys@caida.org)
-//
-// Usage:
-//    #include <h3.h>
-//    const H3<T, N> h;
-//    T hashval = h(data, size [, offset]);
-// (T) is the type to be returned by the hash function; must be an integral
-//     type, e.g. uint32_t.
-// (N) is the size of the data in bytes (if data is a struct, beware of
-//     padding).
-// The hash function hashes the (size) bytes of the data pointed to by (data),
-//     starting at (offset).  Note: offset affects the hash value, so
-//     h(data, size, offset) is not the same as h(data+offset, size, 0).
-//     Typically (size) is N and (offset) is 0, but other values can be used to
-//     hash a substring of the data.  Hashes of substrings can be bitwise-XOR'ed
-//     together to get the same result as hashing the full string.
-// Any number of hash functions can be created by creating new instances of H3,
-//     with the same or different template parameters.  The hash function
-//     constructor takes a seed as argument which defaults to a call to
-//     bro_random().
-
-
-#ifndef H3_H
-#define H3_H
-
-#include <climits>
-#include <cstring>
-
-// The number of values representable by a byte.
-#define H3_BYTE_RANGE (UCHAR_MAX+1)
-
-template <typename T, int N>
-class H3 {
-public:
-	H3()
-		{
-		Init(false, 0);
-		}
-
-	H3(T seed)
-		{
-		Init(true, seed);
-		}
-
-	void Init(bool have_seed, T seed)
-		{
-		T bit_lookup[N * CHAR_BIT];
-
-		for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ )
-			{
-			bit_lookup[bit] = 0;
-			for ( size_t i = 0; i < sizeof(T)/2; i++ )
-				{
-				seed = have_seed ? bro_prng(seed) : bro_random();
-				// assume random() returns at least 16 random bits
-				bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF);
-				}
-			}
-
-		for ( size_t byte = 0; byte < N; byte++ )
-			{
-			for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ )
-				{
-				byte_lookup[byte][val] = 0;
-				for ( size_t bit = 0; bit < CHAR_BIT; bit++ )
-					// Does this mean byte_lookup[*][0] == 0? -RP
-					if (val & (1 << bit))
-						byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit];
-				}
-			}
-		}
-
-	T operator()(const void* data, size_t size, size_t offset = 0) const
-		{
-		const unsigned char *p = static_cast<const unsigned char*>(data);
-		T result = 0;
-
-		// loop optmized with Duff's Device
-		unsigned n = (size + 7) / 8;
-		switch ( size % 8 ) {
-		case 0: do { result ^= byte_lookup[offset++][*p++];
-		case 7:      result ^= byte_lookup[offset++][*p++];
-		case 6:      result ^= byte_lookup[offset++][*p++];
-		case 5:      result ^= byte_lookup[offset++][*p++];
-		case 4:      result ^= byte_lookup[offset++][*p++];
-		case 3:      result ^= byte_lookup[offset++][*p++];
-		case 2:      result ^= byte_lookup[offset++][*p++];
-		case 1:      result ^= byte_lookup[offset++][*p++];
-				} while ( --n > 0 );
-			}
-
-		return result;
-		}
-
-	friend bool operator==(const H3& x, const H3& y)
-		{
-		return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE);
-		}
-
-	friend bool operator!=(const H3& x, const H3& y)
-		{
-		return ! (x == y);
-		}
-
-private:
-	T byte_lookup[N][H3_BYTE_RANGE];
-};
-
-#endif //H3_H
--- a/src/probabilistic/CardinalityCounter.cc
+++ b/src/probabilistic/CardinalityCounter.cc
@ -241,7 +241,8 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
 	return c;
 	}

-/* The following function is copied from libc/string/flsll.c from the FreeBSD source
+/**
+ * The following function is copied from libc/string/flsll.c from the FreeBSD source
 * tree. Original copyright message follows
 */
 /*-
--- a/src/probabilistic/CardinalityCounter.h
+++ b/src/probabilistic/CardinalityCounter.h
@ -155,9 +155,10 @@ private:
 	int OptimalB(double error, double confidence) const;

 	/**
-	 * Determines at which index (counted from the back) the first one-bit
+	 * Determines at which index (counted from the front) the first one-bit
 	 * appears. The last b bits have to be 0 (the element has to be divisible
-	 * by m), hence they are ignored.
+	 * by m), hence they are ignored. Always adds 1 to the result. This is the
+	 * rho function from the original algorithm.
 	 *
 	 * @param hash_modified hash value
 	 *
@ -165,6 +166,9 @@ private:
 	 */
 	uint8_t Rank(uint64_t hash_modified) const;

+	/**
+	 * flsll() as implemented in FreeBSD's libc; provided here because some platforms (notably Linux) lack it.
+	 */
 	static int flsll(uint64_t mask);

 	/**
--- a/src/probabilistic/Hasher.cc
+++ b/src/probabilistic/Hasher.cc
@ -8,15 +8,21 @@
 #include "digest.h"
 #include "Serializer.h"

+extern "C" {
+extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k );
+}
+
 using namespace probabilistic;

-uint64 Hasher::MakeSeed(const void* data, size_t size)
+Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
 	{
 	u_char buf[SHA256_DIGEST_LENGTH];
-	uint64 tmpseed;
+	seed_t tmpseed;
 	SHA256_CTX ctx;
 	sha256_init(&ctx);

+	static_assert(sizeof(tmpseed) == 16, "seed_t must be exactly 16 bytes"); // compile-time check, still enforced in NDEBUG builds
+
 	if ( data )
 		sha256_update(&ctx, data, size);

@ -56,7 +62,8 @@ bool Hasher::DoSerialize(SerialInfo* info) const
 	if ( ! SERIALIZE(static_cast<uint16>(k)) )
 		return false;

-	return SERIALIZE(static_cast<uint64>(seed));
+	return SERIALIZE(static_cast<uint64>(seed.h1)) && // write both halves, in the order DoUnserialize reads them
+	       SERIALIZE(static_cast<uint64>(seed.h2));
 	}

 bool Hasher::DoUnserialize(UnserialInfo* info)
@ -70,8 +77,10 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
 	k = serial_k;
 	assert(k > 0);

-	uint64 serial_seed;
-	if ( ! UNSERIALIZE(&serial_seed) )
+	seed_t serial_seed;
+	if ( ! UNSERIALIZE(&serial_seed.h1) )
+		return false;
+	if ( ! UNSERIALIZE(&serial_seed.h2) )
 		return false;

 	seed = serial_seed;
@ -79,14 +88,18 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
 	return true;
 	}

-Hasher::Hasher(size_t arg_k, size_t arg_seed)
+Hasher::Hasher(size_t arg_k, seed_t arg_seed)
 	{
 	k = arg_k;
 	seed = arg_seed;
 	}

-UHF::UHF(size_t arg_seed)
-	: h(arg_seed)
+UHF::UHF()
+	{
+	memset(&seed, 0, sizeof(seed)); // default-constructed UHF uses an all-zero seed
+	}
+
+UHF::UHF(Hasher::seed_t arg_seed)
 	{
 	seed = arg_seed;
 	}
@ -96,8 +109,13 @@ UHF::UHF(size_t arg_seed)
 // times.
 Hasher::digest UHF::hash(const void* x, size_t n) const
 	{
+	static_assert(sizeof(Hasher::seed_t) == 16, "seed_t is passed to siphash as its 16-byte key"); // compile-time check, still enforced in NDEBUG builds
+	hash_t outdigest;
 	if ( n <= UHASH_KEY_SIZE )
-		return n == 0 ? 0 : h(x, n);
+		{
+		siphash(reinterpret_cast<uint8_t*>(&outdigest), reinterpret_cast<const uint8_t*>(x), n, reinterpret_cast<const uint8_t*>(&seed));
+		return outdigest;
+		}

 	unsigned char d[16];
 	MD5(reinterpret_cast<const unsigned char*>(x), n, d);
@ -111,11 +129,15 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
 	return *reinterpret_cast<const Hasher::digest*>(d);
 	}

-DefaultHasher::DefaultHasher(size_t k, size_t seed)
+DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
 	: Hasher(k, seed)
 	{
 	for ( size_t i = 1; i <= k; ++i )
-		hash_functions.push_back(UHF(Seed() + bro_prng(i)));
+		{
+		seed_t s = Seed();
+		s.h1 += bro_prng(i);
+		hash_functions.push_back(UHF(s));
+		}
 	}

 Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
@ -158,12 +180,16 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)

 	hash_functions.clear();
 	for ( size_t i = 0; i < K(); ++i )
-		hash_functions.push_back(UHF(Seed() + bro_prng(i)));
+		{
+		Hasher::seed_t s = Seed();
+		s.h1 += bro_prng(i);
+		hash_functions.push_back(UHF(s));
+		}

 	return true;
 	}

-DoubleHasher::DoubleHasher(size_t k, size_t seed)
+DoubleHasher::DoubleHasher(size_t k, seed_t seed)
 	: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
 	{
 	}
--- a/src/probabilistic/Hasher.h
+++ b/src/probabilistic/Hasher.h
@ -4,7 +4,6 @@
 #define PROBABILISTIC_HASHER_H

 #include "Hash.h"
-#include "H3.h"
 #include "SerialObj.h"

 namespace probabilistic {
@ -17,6 +16,15 @@ class Hasher : public SerialObj {
 public:
 	typedef hash_t digest;
 	typedef std::vector<digest> digest_vector;
+	struct seed_t { // 128-bit seed in two 64-bit halves; UHF::hash passes it to siphash as the key
+		uint64_t h1;
+		uint64_t h2;
+
+		friend seed_t operator+(seed_t lhs, const uint64_t rhs) { // adds rhs to h1 only; h2 is left unchanged
+			lhs.h1 += rhs;
+			return lhs;
+		}
+	};

 	/**
 	 * Creates a valid hasher seed from an arbitrary string.
@ -30,7 +38,7 @@ public:
 	 *
 	 * @return A seed suitable for hashers.
 	 */
-	static uint64 MakeSeed(const void* data, size_t size);
+	static seed_t MakeSeed(const void* data, size_t size);

 	/**
 	 * Destructor.
@ -89,7 +97,7 @@ public:
 	/**
 	 * Returns the seed used to construct the hasher.
 	 */
-	size_t Seed() const	{ return seed; }
+	seed_t Seed() const	{ return seed; }

 	bool Serialize(SerialInfo* info) const;
 	static Hasher* Unserialize(UnserialInfo* info);
@ -106,11 +114,11 @@ protected:
 	 *
 	 * @param arg_seed The seed for the hasher.
 	 */
-	Hasher(size_t arg_k, size_t arg_seed);
+	Hasher(size_t arg_k, seed_t arg_seed);

 private:
 	size_t k;
-	size_t seed;
+	seed_t seed;
 };

 /**
@ -120,12 +128,17 @@ private:
 class UHF {
 public:
 	/**
-	 * Constructs an H3 hash function seeded with a given seed and an
+	 * Default constructor with zero seed.
+	 */
+	UHF();
+
+	/**
+	 * Constructs a hash function seeded with a given seed and an
 	 * optional extra seed to replace the initial Bro seed.
 	 *
 	 * @param arg_seed The seed to use for this instance.
 	 */
-	UHF(size_t arg_seed = 0);
+	UHF(Hasher::seed_t arg_seed);

 	template <typename T>
 	Hasher::digest operator()(const T& x) const
@ -159,7 +172,8 @@ public:

 	friend bool operator==(const UHF& x, const UHF& y)
 		{
-		return x.h == y.h;
+		return (x.seed.h1 == y.seed.h1) &&
+		       (x.seed.h2 == y.seed.h2);
 		}

 	friend bool operator!=(const UHF& x, const UHF& y)
@ -168,10 +182,9 @@ public:
 		}

 private:
-	static size_t compute_seed(size_t seed);
+	static size_t compute_seed(Hasher::seed_t seed);

-	H3<Hasher::digest, UHASH_KEY_SIZE> h;
-	size_t seed;
+	Hasher::seed_t seed;
 };


@ -188,7 +201,7 @@ public:
 	 *
 	 * @param seed The seed for the hasher.
 	 */
-	DefaultHasher(size_t k, size_t seed);
+	DefaultHasher(size_t k, Hasher::seed_t seed);

 	// Overridden from Hasher.
 	virtual digest_vector Hash(const void* x, size_t n) const final;
@ -216,7 +229,7 @@ public:
 	 *
 	 * @param seed The seed for the hasher.
 	 */
-	DoubleHasher(size_t k, size_t seed);
+	DoubleHasher(size_t k, Hasher::seed_t seed);

 	// Overridden from Hasher.
 	virtual digest_vector Hash(const void* x, size_t n) const final;
--- a/src/probabilistic/bloom-filter.bif
+++ b/src/probabilistic/bloom-filter.bif
@ -42,7 +42,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,

 	size_t cells = BasicBloomFilter::M(fp, capacity);
 	size_t optimal_k = BasicBloomFilter::K(cells, capacity);
-	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+	Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
                                 name->Len());
 	const Hasher* h = new DoubleHasher(optimal_k, seed);

@ -82,7 +82,7 @@ function bloomfilter_basic_init2%(k: count, cells: count,
 		return 0;
 		}

-	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+	Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
 				       name->Len());
 	const Hasher* h = new DoubleHasher(k, seed);

@ -121,7 +121,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
 		return 0;
 		}

-	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+	Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
 				       name->Len());

 	const Hasher* h = new DefaultHasher(k, seed);
--- a/testing/btest/Baseline/bifs.bloomfilter/output
+++ b/testing/btest/Baseline/bifs.bloomfilter/output
@ -13,7 +13,6 @@ error: false-positive rate must take value between 0 and 1
 1
 1
 1, fp
-1, fp
 1
 1
 1
--- a/testing/btest/bifs/bloomfilter.bro
+++ b/testing/btest/bifs/bloomfilter.bro
@ -28,7 +28,7 @@ function test_basic_bloom_filter()
  bloomfilter_add(bf_str, "bar");
  print bloomfilter_lookup(bf_str, "foo");
  print bloomfilter_lookup(bf_str, "bar");
-  print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP
+  # print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP: this false positive no longer triggers after the hash function change
  print bloomfilter_lookup(bf_str, "quuux"), "fp"; # FP
  bloomfilter_add(bf_str, 0.5); # Type mismatch
  bloomfilter_add(bf_str, 100); # Type mismatch