From 8ca76dd4eea561f196b8ee39083a479121092337 Mon Sep 17 00:00:00 2001
From: Matthias Vallentin <vallentin@icir.org>
Date: Wed, 31 Jul 2013 17:59:08 +0200
Subject: [PATCH 1/2] Introduce global_hash_seed script variable.

This commit adds support for script-level specification of a seed to be used by
hashers. For example, if the given name of a Bloom filter is not empty, then
the seed used by the underlying hasher only depends on the Bloom filter name.
If the name is empty, we check whether the user defined a non-empty
global_hash_seed string variable at script level and use it instead. If that
script variable does not exist or is empty, we fall back to the initial seed
computed at Bro startup (which is ultimately affected by $BRO_SEED).

See Hasher::MakeSeed for details.
---
 src/NetVar.cc                      |  4 ++
 src/NetVar.h                       |  2 +
 src/probabilistic/Hasher.cc        | 85 ++++++++++++++----------------
 src/probabilistic/Hasher.h         | 45 +++++++++-------
 src/probabilistic/bloom-filter.bif |  9 +++-
 testing/btest/bifs/bloomfilter.bro |  4 +-
 6 files changed, 82 insertions(+), 67 deletions(-)
diff --git a/src/NetVar.cc b/src/NetVar.cc
index 388aa46f10..2fee46e2da 100644
--- a/src/NetVar.cc
+++ b/src/NetVar.cc
@@ -238,6 +238,8 @@ TableType* record_field_table;
 
 StringVal* cmd_line_bpf_filter;
 
+StringVal* global_hash_seed;
+
 OpaqueType* md5_type;
 OpaqueType* sha1_type;
 OpaqueType* sha256_type;
@@ -304,6 +306,8 @@ void init_general_global_var()
 	cmd_line_bpf_filter =
 		internal_val("cmd_line_bpf_filter")->AsStringVal();
 
+	global_hash_seed = opt_internal_string("global_hash_seed");
+
 	md5_type = new OpaqueType("md5");
 	sha1_type = new OpaqueType("sha1");
 	sha256_type = new OpaqueType("sha256");
diff --git a/src/NetVar.h b/src/NetVar.h
index 7ce33d1a1a..3615108f73 100644
--- a/src/NetVar.h
+++ b/src/NetVar.h
@@ -242,6 +242,8 @@ extern TableType* record_field_table;
 
 extern StringVal* cmd_line_bpf_filter;
 
+extern StringVal* global_hash_seed;
+
 class OpaqueType;
 extern OpaqueType* md5_type;
 extern OpaqueType* sha1_type;
diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc
index 17597b9a82..e24a207e6e 100644
--- a/src/probabilistic/Hasher.cc
+++ b/src/probabilistic/Hasher.cc
@@ -3,11 +3,34 @@
 #include <typeinfo>
 
 #include "Hasher.h"
+#include "NetVar.h"
 #include "digest.h"
 #include "Serializer.h"
 
 using namespace probabilistic;
 
+size_t Hasher::MakeSeed(const void* data, size_t size)
+	{
+	u_char buf[SHA256_DIGEST_LENGTH];
+	SHA256_CTX ctx;
+	sha256_init(&ctx);
+
+	if ( data )
+		sha256_update(&ctx, data, size);
+
+	else if ( global_hash_seed && global_hash_seed->Len() > 0 )
+		sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
+
+	else
+		{
+		unsigned int first_seed = initial_seed();
+		sha256_update(&ctx, &first_seed, sizeof(first_seed));
+		}
+
+	sha256_final(&ctx, buf);
+	return *reinterpret_cast<size_t*>(buf); // Seed = first sizeof(size_t) digest bytes, native byte order.
+	}
+
 bool Hasher::Serialize(SerialInfo* info) const
 	{
 	return SerialObj::Serialize(info);
@@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const
 	if ( ! SERIALIZE(static_cast<uint16>(k)) )
 		return false;
 
-	return SERIALIZE_STR(name.c_str(), name.size());
+	return SERIALIZE(static_cast<uint64>(seed));
 	}
 
 bool Hasher::DoUnserialize(UnserialInfo* info)
@@ -35,30 +58,26 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
 	uint16 serial_k;
 	if ( ! UNSERIALIZE(&serial_k) )
 		return false;
-
 	k = serial_k;
 	assert(k > 0);
 
-	const char* serial_name;
-	if ( ! UNSERIALIZE_STR(&serial_name, 0) )
+	uint64 serial_seed;
+	if ( ! UNSERIALIZE(&serial_seed) )
 		return false;
-
-	name = serial_name;
-	delete [] serial_name;
+	seed = serial_seed;
 
 	return true;
 	}
 
-Hasher::Hasher(size_t k, const std::string& arg_name)
-	: k(k)
+Hasher::Hasher(size_t arg_k, size_t arg_seed)
 	{
-	k = k;
-	name = arg_name;
+	k = arg_k;
+	seed = arg_seed;
 	}
 
 
-UHF::UHF(size_t seed, const std::string& extra)
-	: h(compute_seed(seed, extra))
+UHF::UHF(size_t seed)
+	: h(seed)
 	{
 	}
 
@@ -68,33 +87,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
 	return n == 0 ? 0 : h(x, n);
 	}
 
-size_t UHF::compute_seed(size_t seed, const std::string& extra)
+DefaultHasher::DefaultHasher(size_t k, size_t seed)
+	: Hasher(k, seed)
 	{
-	u_char buf[SHA256_DIGEST_LENGTH];
-	SHA256_CTX ctx;
-	sha256_init(&ctx);
-
-	if ( extra.empty() )
-		{
-		unsigned int first_seed = initial_seed();
-		sha256_update(&ctx, &first_seed, sizeof(first_seed));
-		}
-
-	else
-		sha256_update(&ctx, extra.c_str(), extra.size());
-
-	sha256_update(&ctx, &seed, sizeof(seed));
-	sha256_final(&ctx, buf);
-
-	// Take the first sizeof(size_t) bytes as seed.
-	return *reinterpret_cast<size_t*>(buf);
-	}
-
-DefaultHasher::DefaultHasher(size_t k, const std::string& name)
-	: Hasher(k, name)
-	{
-	for ( size_t i = 0; i < k; ++i )
-		hash_functions.push_back(UHF(i, name));
+	for ( size_t i = 1; i <= k; ++i )
+		hash_functions.push_back(UHF(Seed() + bro_prng(i)));
 	}
 
 Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
@@ -137,13 +134,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
 
 	hash_functions.clear();
-	for ( size_t i = 0; i < K(); ++i )
-		hash_functions.push_back(UHF(i, Name()));
+	for ( size_t i = 1; i <= K(); ++i ) // Same 1..k sequence as the constructor.
+		hash_functions.push_back(UHF(Seed() + bro_prng(i)));
 
 	return true;
 	}
 
-DoubleHasher::DoubleHasher(size_t k, const std::string& name)
-	: Hasher(k, name), h1(1, name), h2(2, name)
+DoubleHasher::DoubleHasher(size_t k, size_t seed)
+	: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
 	{
 	}
 
@@ -187,8 +184,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info)
 	{
 	DO_UNSERIALIZE(Hasher);
 
-	h1 = UHF(1, Name());
-	h2 = UHF(2, Name());
+	h1 = UHF(Seed() + bro_prng(1));
+	h2 = UHF(Seed() + bro_prng(2));
 
 	return true;
 	}
diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h
index 3acd5c5867..bd8f5ce5ff 100644
--- a/src/probabilistic/Hasher.h
+++ b/src/probabilistic/Hasher.h
@@ -18,6 +18,20 @@ public:
 	typedef hash_t digest;
 	typedef std::vector<digest> digest_vector;
 
+	/**
+	 * Creates a valid hasher seed from an arbitrary string.
+	 *
+	 * @param data A pointer to contiguous data that should be crunched into a
+	 * seed. If 0, the function tries to derive a seed from the global_hash_seed
+	 * script variable. If that variable does not exist or is empty, the
+	 * function uses the initial seed generated at Bro startup.
+	 *
+	 * @param size The number of bytes of *data*.
+	 *
+	 * @return A seed suitable for hashers.
+	 */
+	static size_t MakeSeed(const void* data, size_t size);
+
 	/**
 	 * Destructor.
 	 */
@@ -64,11 +78,9 @@ public:
 	size_t K() const	{ return k; }
 
 	/**
-	 * Returns the hasher's name. If not empty, the hasher uses this descriptor
-	 * to seed its *k* hash functions. Otherwise the hasher mixes in the initial
-	 * seed derived from the environment variable `$BRO_SEED`.
+	 * Returns the seed used to construct the hasher.
 	 */
-	const std::string& Name() const { return name; }
+	size_t Seed() const	{ return seed; }
 
 	bool Serialize(SerialInfo* info) const;
 	static Hasher* Unserialize(UnserialInfo* info);
@@ -81,16 +93,15 @@ protected:
 	/**
 	 * Constructor.
 	 *
-	 * @param k the number of hash functions.
+	 * @param arg_k the number of hash functions.
 	 *
-	 * @param name A name for the hasher. Hashers with the same name
-	 * should provide consistent results.
+	 * @param arg_seed The seed for the hasher.
 	 */
-	Hasher(size_t k, const std::string& name);
+	Hasher(size_t arg_k, size_t arg_seed);
 
 private:
 	size_t k;
-	std::string name;
+	size_t seed;
 };
 
 /**
@@ -104,12 +115,8 @@ public:
 	 * optional extra seed to replace the initial Bro seed.
 	 *
 	 * @param seed The seed to use for this instance.
-	 *
-	 * @param extra If not empty, this parameter replaces the initial
-	 * seed to compute the seed for t to compute the seed NUL-terminated
-	 * string as additional seed.
 	 */
-	UHF(size_t seed = 0, const std::string& extra = "");
+	UHF(size_t seed = 0);
 
 	template <typename T>
 	Hasher::digest operator()(const T& x) const
@@ -152,7 +159,7 @@ public:
 		}
 
 private:
-	static size_t compute_seed(size_t seed, const std::string& extra);
+	static size_t compute_seed(size_t seed);
 
 	H3<Hasher::digest, UHASH_KEY_SIZE> h;
 };
@@ -169,9 +176,9 @@ public:
 	 *
 	 * @param k The number of hash functions to use.
 	 *
-	 * @param name The name of the hasher.
+	 * @param seed The seed for the hasher.
 	 */
-	DefaultHasher(size_t k, const std::string& name = "");
+	DefaultHasher(size_t k, size_t seed);
 
 	// Overridden from Hasher.
 	virtual digest_vector Hash(const void* x, size_t n) const /* final */;
@@ -197,9 +204,9 @@ public:
 	 *
 	 * @param k The number of hash functions to use.
 	 *
-	 * @param name The name of the hasher.
+	 * @param seed The seed for the hasher.
 	 */
-	DoubleHasher(size_t k, const std::string& name = "");
+	DoubleHasher(size_t k, size_t seed);
 
 	// Overridden from Hasher.
 	virtual digest_vector Hash(const void* x, size_t n) const /* final */;
diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif
index a3567ad6f7..d936b77e3b 100644
--- a/src/probabilistic/bloom-filter.bif
+++ b/src/probabilistic/bloom-filter.bif
@@ -48,7 +48,9 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
 
 	size_t cells = BasicBloomFilter::M(fp, capacity);
 	size_t optimal_k = BasicBloomFilter::K(cells, capacity);
-	const Hasher* h = new DefaultHasher(optimal_k, name->CheckString());
+	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+                                 name->Len());
+	const Hasher* h = new DefaultHasher(optimal_k, seed);
 
 	return new BloomFilterVal(new BasicBloomFilter(h, cells));
 	%}
@@ -86,7 +88,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
 		return 0;
 		}
 
-	const Hasher* h = new DefaultHasher(k, name->CheckString());
+	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+                                 name->Len());
+
+	const Hasher* h = new DefaultHasher(k, seed);
 
 	uint16 width = 1;
 	while ( max >>= 1 )
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro
index 3b40f29553..e6091e25fa 100644
--- a/testing/btest/bifs/bloomfilter.bro
+++ b/testing/btest/bifs/bloomfilter.bro
@@ -21,8 +21,8 @@ function test_basic_bloom_filter()
   bloomfilter_add(bf_str, "bar");
   print bloomfilter_lookup(bf_str, "foo");
   print bloomfilter_lookup(bf_str, "bar");
-  print bloomfilter_lookup(bf_str, "b4z"); # FP
-  print bloomfilter_lookup(bf_str, "quux"); # FP
+  print bloomfilter_lookup(bf_str, "b4zzz"); # FP
+  print bloomfilter_lookup(bf_str, "quuux"); # FP
   bloomfilter_add(bf_str, 0.5); # Type mismatch
   bloomfilter_add(bf_str, 100); # Type mismatch
 

From d50b8a147d739e3fdce9cf235e47d7291adbe212 Mon Sep 17 00:00:00 2001
From: Matthias Vallentin <vallentin@icir.org>
Date: Wed, 31 Jul 2013 18:21:37 +0200
Subject: [PATCH 2/2] Add new BiF for low-level Bloom filter initialization.

For symmetry reasons, the new Bif bloomfilter_basic_init2 also allows users to
manually specify the memory bounds and number of hash functions to use.
---
 NEWS                                          |  1 +
 src/probabilistic/bloom-filter.bif            | 69 +++++++++++++++----
 .../btest/Baseline/bifs.bloomfilter/output    |  2 +
 testing/btest/bifs/bloomfilter.bro            |  7 ++
 4 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/NEWS b/NEWS
index c421e7d675..64058054d6 100644
--- a/NEWS
+++ b/NEWS
@@ -113,6 +113,7 @@ New Functionality
   the frequency of elements. The corresponding functions are:
 
     bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter
+    bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter
     bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter
     bloomfilter_add(bf: opaque of bloomfilter, x: any)
     bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count
diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif
index d936b77e3b..0c4a67ac6f 100644
--- a/src/probabilistic/bloom-filter.bif
+++ b/src/probabilistic/bloom-filter.bif
@@ -35,8 +35,8 @@ module GLOBAL;
 ##
 ## Returns: A Bloom filter handle.
 ##
-## .. bro:see:: bloomfilter_counting_init  bloomfilter_add bloomfilter_lookup
-##    bloomfilter_clear bloomfilter_merge
+## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add
+##    bloomfilter_lookup bloomfilter_clear bloomfilter_merge
 function bloomfilter_basic_init%(fp: double, capacity: count,
                                  name: string &default=""%): opaque of bloomfilter
 	%{
@@ -55,6 +55,47 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
 	return new BloomFilterVal(new BasicBloomFilter(h, cells));
 	%}
 
+## Creates a basic Bloom filter. This function serves as a low-level
+## alternative to bloomfilter_basic_init where the user has full control over
+## the number of hash functions and cells in the underlying bit vector.
+##
+## .. note:: A Bloom filter can have a name associated with it. In the future,
+##    Bloom filters with the same name will be compatible across independent Bro
+##    instances, i.e., it will be possible to merge them. Currently, however, that is
+##    not yet supported.
+##
+## k: The number of hash functions to use.
+##
+## cells: The number of cells of the underlying bit vector.
+##
+## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
+## the filter will remain tied to the current Bro process.
+##
+## Returns: A Bloom filter handle.
+##
+## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
+##    bloomfilter_lookup bloomfilter_clear bloomfilter_merge
+function bloomfilter_basic_init2%(k: count, cells: count,
+                                  name: string &default=""%): opaque of bloomfilter
+	%{
+	if ( k == 0 ) // count is unsigned, so only zero needs rejecting.
+		{
+		reporter->Error("number of hash functions must be positive");
+		return 0;
+		}
+	if ( cells == 0 )
+		{
+		reporter->Error("number of cells must be positive");
+		return 0;
+		}
+
+	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+                                 name->Len()); // Empty name => MakeSeed's fallback seed.
+	const Hasher* h = new DefaultHasher(k, seed);
+
+	return new BloomFilterVal(new BasicBloomFilter(h, cells));
+	%}
+
 ## Creates a counting Bloom filter.
 ##
 ## .. note:: A Bloom filter can have a name associated with it. In the future,
@@ -77,8 +118,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
 ##
 ## Returns: A Bloom filter handle.
 ##
-## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup
-##    bloomfilter_clear bloomfilter_merge
+## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add
+##    bloomfilter_lookup bloomfilter_clear bloomfilter_merge
 function bloomfilter_counting_init%(k: count, cells: count, max: count,
 				    name: string &default=""%): opaque of bloomfilter
 	%{
@@ -106,8 +147,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
 ##
 ## x: The element to add.
 ##
-## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup
-##    bloomfilter_clear bloomfilter_merge
+## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 
+##    bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear 
+##    bloomfilter_merge
 function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
 	%{
 	BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
@@ -132,8 +174,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
 ##
 ## Returns: the counter associated with *x* in *bf*.
 ##
-## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
-##    bloomfilter_add bloomfilter_clear bloomfilter_merge
+## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
+##    bloomfilter_counting_init bloomfilter_add bloomfilter_clear
+##    bloomfilter_merge
 function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
 	%{
 	const BloomFilterVal* bfv = static_cast<const BloomFilterVal*>(bf);
@@ -159,8 +202,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
 ##
 ## bf: The Bloom filter handle.
 ##
-## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
-##    bloomfilter_add bloomfilter_lookup bloomfilter_merge
+## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
+##    bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
+##    bloomfilter_merge
 function bloomfilter_clear%(bf: opaque of bloomfilter%): any
 	%{
 	BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
@@ -183,8 +227,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
 ##
 ## Returns: The union of *bf1* and *bf2*.
 ##
-## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
-##    bloomfilter_add bloomfilter_lookup bloomfilter_clear
+## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
+##    bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
+##    bloomfilter_clear
 function bloomfilter_merge%(bf1: opaque of bloomfilter,
 			    bf2: opaque of bloomfilter%): opaque of bloomfilter
 	%{
diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output
index 14e1f038c0..731b7c7ce9 100644
--- a/testing/btest/Baseline/bifs.bloomfilter/output
+++ b/testing/btest/Baseline/bifs.bloomfilter/output
@@ -17,6 +17,8 @@ error: false-positive rate must take value between 0 and 1
 1
 1
 1
+1
+1
 2
 3
 3
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro
index e6091e25fa..c2a1c47ca8 100644
--- a/testing/btest/bifs/bloomfilter.bro
+++ b/testing/btest/bifs/bloomfilter.bro
@@ -15,6 +15,13 @@ function test_basic_bloom_filter()
   bloomfilter_add(bf_cnt, 0.5); # Type mismatch
   bloomfilter_add(bf_cnt, "foo"); # Type mismatch
 
+  # Alternative constructor.
+  local bf_dbl = bloomfilter_basic_init2(4, 10);
+  bloomfilter_add(bf_dbl, 4.2);
+  bloomfilter_add(bf_dbl, 3.14);
+  print bloomfilter_lookup(bf_dbl, 4.2);
+  print bloomfilter_lookup(bf_dbl, 3.14);
+
   # Basic usage with strings.
   local bf_str = bloomfilter_basic_init(0.9, 10);
   bloomfilter_add(bf_str, "foo");