From 8ca76dd4eea561f196b8ee39083a479121092337 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 31 Jul 2013 17:59:08 +0200 Subject: [PATCH 1/5] Introduce global_hash_seed script variable. This commit adds support for script-level specification of a seed to be used by hashers. For example, if the given name of a Bloom filter is not empty, then the seed used by the underlying hasher only depends on the Bloom filter name. If the name is empty, we check whether the user defined a non-empty global_hash_seed string variable at script level and use it instead. If that script variable does not exist, then we fall back to the initial seed computed at Bro startup (which is affected ultimately by $BRO_SEED). See Hasher::MakeSeed for details. --- src/NetVar.cc | 4 ++ src/NetVar.h | 2 + src/probabilistic/Hasher.cc | 85 ++++++++++++++---------------- src/probabilistic/Hasher.h | 45 +++++++++------- src/probabilistic/bloom-filter.bif | 9 +++- testing/btest/bifs/bloomfilter.bro | 4 +- 6 files changed, 82 insertions(+), 67 deletions(-) diff --git a/src/NetVar.cc b/src/NetVar.cc index 388aa46f10..2fee46e2da 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -238,6 +238,8 @@ TableType* record_field_table; StringVal* cmd_line_bpf_filter; +StringVal* global_hash_seed; + OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; @@ -304,6 +306,8 @@ void init_general_global_var() cmd_line_bpf_filter = internal_val("cmd_line_bpf_filter")->AsStringVal(); + global_hash_seed = opt_internal_string("global_hash_seed"); + md5_type = new OpaqueType("md5"); sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); diff --git a/src/NetVar.h b/src/NetVar.h index 7ce33d1a1a..3615108f73 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -242,6 +242,8 @@ extern TableType* record_field_table; extern StringVal* cmd_line_bpf_filter; +extern StringVal* global_hash_seed; + class OpaqueType; extern OpaqueType* md5_type; extern OpaqueType* sha1_type; diff --git 
a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 17597b9a82..e24a207e6e 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -3,11 +3,34 @@ #include #include "Hasher.h" +#include "NetVar.h" #include "digest.h" #include "Serializer.h" using namespace probabilistic; +size_t Hasher::MakeSeed(const void* data, size_t size) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + if ( data ) + sha256_update(&ctx, data, size); + + else if ( global_hash_seed && global_hash_seed->Len() > 0 ) + sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len()); + + else + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(k)) ) return false; - return SERIALIZE_STR(name.c_str(), name.size()); + return SERIALIZE(static_cast(seed)); } bool Hasher::DoUnserialize(UnserialInfo* info) @@ -35,30 +58,26 @@ bool Hasher::DoUnserialize(UnserialInfo* info) uint16 serial_k; if ( ! UNSERIALIZE(&serial_k) ) return false; - k = serial_k; assert(k > 0); - const char* serial_name; - if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + uint64 serial_seed; + if ( ! UNSERIALIZE(&serial_seed) ) return false; - - name = serial_name; - delete [] serial_name; + seed = serial_seed; return true; } -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) +Hasher::Hasher(size_t arg_k, size_t arg_seed) { - k = k; - name = arg_name; + k = arg_k; + seed = arg_seed; } -UHF::UHF(size_t seed, const std::string& extra) - : h(compute_seed(seed, extra)) +UHF::UHF(size_t seed) + : h(seed) { } @@ -68,33 +87,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const return n == 0 ? 
0 : h(x, n); } -size_t UHF::compute_seed(size_t seed, const std::string& extra) +DefaultHasher::DefaultHasher(size_t k, size_t seed) + : Hasher(k, seed) { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - - else - sha256_update(&ctx, extra.c_str(), extra.size()); - - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - - // Take the first sizeof(size_t) bytes as seed. - return *reinterpret_cast(buf); - } - -DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions.push_back(UHF(i, name)); + for ( size_t i = 1; i <= k; ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -137,13 +134,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info) hash_functions.clear(); for ( size_t i = 0; i < K(); ++i ) - hash_functions.push_back(UHF(i, Name())); + hash_functions.push_back(UHF(Seed() + bro_prng(i))); return true; } -DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), h1(1, name), h2(2, name) +DoubleHasher::DoubleHasher(size_t k, size_t seed) + : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } @@ -187,8 +184,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(Hasher); - h1 = UHF(1, Name()); - h2 = UHF(2, Name()); + h1 = UHF(Seed() + bro_prng(1)); + h2 = UHF(Seed() + bro_prng(2)); return true; } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 3acd5c5867..bd8f5ce5ff 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -18,6 +18,20 @@ public: typedef hash_t digest; typedef std::vector digest_vector; + /** + * Creates a valid hasher seed from an arbitrary string. 
+ * + * @param data A pointer to contiguous data that should be crunched into a + * seed. If 0, the function tries to find a global_hash_seed script variable + * to derive a seed from. If this variable does not exist, the function uses + * the initial seed generated at Bro startup. + * + * @param size The number of bytes of *data*. + * + * @return A seed suitable for hashers. + */ + static size_t MakeSeed(const void* data, size_t size); + /** * Destructor. */ @@ -64,11 +78,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. If not empty, the hasher uses this descriptor - * to seed its *k* hash functions. Otherwise the hasher mixes in the initial - * seed derived from the environment variable `$BRO_SEED`. + * Returns the seed used to construct the hasher. */ - const std::string& Name() const { return name; } + size_t Seed() const { return seed; } bool Serialize(SerialInfo* info) const; static Hasher* Unserialize(UnserialInfo* info); @@ -81,16 +93,15 @@ protected: /** * Constructor. * - * @param k the number of hash functions. + * @param arg_k the number of hash functions. * - * @param name A name for the hasher. Hashers with the same name - * should provide consistent results. + * @param arg_seed The seed for the hasher. */ - Hasher(size_t k, const std::string& name); + Hasher(size_t arg_k, size_t arg_seed); private: size_t k; - std::string name; + size_t seed; }; /** @@ -104,12 +115,8 @@ public: * optional extra seed to replace the initial Bro seed. * * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial - * seed to compute the seed for t to compute the seed NUL-terminated - * string as additional seed. 
*/ - UHF(size_t seed = 0, const std::string& extra = ""); + UHF(size_t seed = 0); template Hasher::digest operator()(const T& x) const @@ -152,7 +159,7 @@ public: } private: - static size_t compute_seed(size_t seed, const std::string& extra); + static size_t compute_seed(size_t seed); H3 h; }; @@ -169,9 +176,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, const std::string& name = ""); + DefaultHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -197,9 +204,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DoubleHasher(size_t k, const std::string& name = ""); + DoubleHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index a3567ad6f7..d936b77e3b 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -48,7 +48,9 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(optimal_k, seed); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} @@ -86,7 +88,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = new DefaultHasher(k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? 
name->Bytes() : 0, + name->Len()); + + const Hasher* h = new DefaultHasher(k, seed); uint16 width = 1; while ( max >>= 1 ) diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3b40f29553..e6091e25fa 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -21,8 +21,8 @@ function test_basic_bloom_filter() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4z"); # FP - print bloomfilter_lookup(bf_str, "quux"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"); # FP + print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch From d50b8a147d739e3fdce9cf235e47d7291adbe212 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 31 Jul 2013 18:21:37 +0200 Subject: [PATCH 2/5] Add new BiF for low-level Bloom filter initialization. For symmetry reasons, the new Bif bloomfilter_basic_init2 also allows users to manually specify the memory bounds and number of hash functions to use. --- NEWS | 1 + src/probabilistic/bloom-filter.bif | 69 +++++++++++++++---- .../btest/Baseline/bifs.bloomfilter/output | 2 + testing/btest/bifs/bloomfilter.bro | 7 ++ 4 files changed, 67 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index c421e7d675..64058054d6 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,7 @@ New Functionality the frequency of elements. 
The corresponding functions are: bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter bloomfilter_add(bf: opaque of bloomfilter, x: any) bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index d936b77e3b..0c4a67ac6f 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -35,8 +35,8 @@ module GLOBAL; ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter %{ @@ -55,6 +55,47 @@ function bloomfilter_basic_init%(fp: double, capacity: count, return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} +## Creates a basic Bloom filter. This function serves as a low-level +## alternative to bloomfilter_basic_init where the user has full control over +## the number of hash functions and cells in the underlying bit vector. +## +## .. note:: A Bloom filter can have a name associated with it. In the future, +## Bloom filters with the same name will be compatible across indepedent Bro +## instances, i.e., it will be possible to merge them. Currently, however, that is +## not yet supported. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying bit vector. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will remain tied to the current Bro process. 
+## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge +function bloomfilter_basic_init2%(k: count, cells: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( k == 0 ) + { + reporter->Error("number of hash functions must be non-negative"); + return 0; + } + if ( cells == 0 ) + { + reporter->Error("number of cells must be non-negative"); + return 0; + } + + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(k, seed); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} + ## Creates a counting Bloom filter. ## ## .. note:: A Bloom filter can have a name associated with it. In the future, @@ -77,8 +118,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ @@ -106,8 +147,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear +## bloomfilter_merge function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -132,8 +174,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. ## -## .. 
bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_clear +## bloomfilter_merge function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); @@ -159,8 +202,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## bf: The Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_merge function bloomfilter_clear%(bf: opaque of bloomfilter%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -183,8 +227,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any ## ## Returns: The union of *bf1* and *bf2*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_clear +## .. 
bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear function bloomfilter_merge%(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter%): opaque of bloomfilter %{ diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 14e1f038c0..731b7c7ce9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -17,6 +17,8 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +1 +1 2 3 3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index e6091e25fa..c2a1c47ca8 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -15,6 +15,13 @@ function test_basic_bloom_filter() bloomfilter_add(bf_cnt, 0.5); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch + # Alternative constructor. + local bf_dbl = bloomfilter_basic_init2(4, 10); + bloomfilter_add(bf_dbl, 4.2); + bloomfilter_add(bf_dbl, 3.14); + print bloomfilter_lookup(bf_dbl, 4.2); + print bloomfilter_lookup(bf_dbl, 3.14); + # Basic usage with strings. local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); From 2a0790c2316380209b5a9d6f3abfffc94aa8120e Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Wed, 31 Jul 2013 17:14:02 -0700 Subject: [PATCH 3/5] Changing the Bloom filter hashing so that it's independent of CompositeHash. We do this by hashing values added to a BloomFilter one more time with a stable hash seeded only by either the filter's name or the global_hash_seed (or Bro's random() seed if neither is defined). I'm also adding a new bif bloomfilter_internal_state() that returns a string representation of a Bloom filter's current internal state. This is solely for writing tests that check that the filters end up consistent when seeded with the same value. 
--- scripts/base/init-bare.bro | 5 ++ src/OpaqueVal.cc | 9 ++- src/OpaqueVal.h | 1 + src/probabilistic/BitVector.cc | 10 ++++ src/probabilistic/BitVector.h | 7 +++ src/probabilistic/BloomFilter.cc | 28 ++++++++-- src/probabilistic/BloomFilter.h | 56 +++++++------------ src/probabilistic/CounterVector.cc | 5 ++ src/probabilistic/CounterVector.h | 7 +++ src/probabilistic/Hasher.cc | 6 +- src/probabilistic/Hasher.h | 9 +++ src/probabilistic/bloom-filter.bif | 25 ++++----- .../Baseline/bifs.bloomfilter-seed/output | 8 +++ testing/btest/bifs/bloomfilter-seed.bro | 40 +++++++++++++ 14 files changed, 157 insertions(+), 59 deletions(-) create mode 100644 testing/btest/Baseline/bifs.bloomfilter-seed/output create mode 100644 testing/btest/bifs/bloomfilter-seed.bro diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 9876ad03f7..e5300cdc9f 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -3042,6 +3042,11 @@ module GLOBAL; ## Number of bytes per packet to capture from live interfaces. const snaplen = 8192 &redef; +## Seed for hashes computed internally for probabilistic data structures. Using +## the same value here will make the hashes compatible between independent Bro +## instances. If left unset, Bro will use a temporary local seed. +const global_hash_seed: string = "" &redef; + # Load BiFs defined by plugins. 
@load base/bif/plugins diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 66b3c081e7..e79b4435b3 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -566,14 +566,14 @@ BroType* BloomFilterVal::Type() const void BloomFilterVal::Add(const Val* val) { HashKey* key = hash->ComputeHash(val, 1); - bloom_filter->Add(key->Hash()); + bloom_filter->Add(key); delete key; } size_t BloomFilterVal::Count(const Val* val) const { HashKey* key = hash->ComputeHash(val, 1); - size_t cnt = bloom_filter->Count(key->Hash()); + size_t cnt = bloom_filter->Count(key); delete key; return cnt; } @@ -588,6 +588,11 @@ bool BloomFilterVal::Empty() const return bloom_filter->Empty(); } +string BloomFilterVal::InternalState() const + { + return bloom_filter->InternalState(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 52c9583fc7..08a20b1a31 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -127,6 +127,7 @@ public: size_t Count(const Val* val) const; void Clear(); bool Empty() const; + string InternalState() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 6e642e62c1..f820e6df27 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -490,6 +490,16 @@ BitVector::size_type BitVector::FindNext(size_type i) const return block ? 
bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); } +size_t BitVector::Hash() const + { + size_t hash = 0; + + for ( size_type i = 0; i < Blocks(); ++i ) + hash += bits[i]; + + return hash; + } + BitVector::size_type BitVector::lowest_bit(block_type block) { block_type x = block - (block & (block - 1)); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index d9c55d53c6..8e24336345 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -276,6 +276,13 @@ public: */ size_type FindNext(size_type i) const; + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. * diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 23b812269c..bcab6c9b54 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -9,6 +9,8 @@ #include "CounterVector.h" #include "Serializer.h" +#include "../util.h" + using namespace probabilistic; BloomFilter::BloomFilter() @@ -107,6 +109,11 @@ BasicBloomFilter* BasicBloomFilter::Clone() const return copy; } +std::string BasicBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)bits->Hash()); + } + BasicBloomFilter::BasicBloomFilter() { bits = 0; @@ -133,14 +140,18 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return (bits != 0); } -void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) +void BasicBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) bits->Set(h[i] % bits->Size()); } -size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t BasicBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) { if ( ! 
(*bits)[h[i] % bits->Size()] ) @@ -206,6 +217,11 @@ CountingBloomFilter* CountingBloomFilter::Clone() const return copy; } +string CountingBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)cells->Hash()); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -222,14 +238,18 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) } // TODO: Use partitioning in add/count to allow for reusing CMS bounds. -void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) +void CountingBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) cells->Increment(h[i] % cells->Size()); } -size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t CountingBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 4865ae145c..65dda2396d 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -22,27 +22,20 @@ public: virtual ~BloomFilter(); /** - * Adds an element of type T to the Bloom filter. - * @param x The element to add + * Adds an element to the Bloom filter. + * + * @param key The key associated with the element to add. */ - template - void Add(const T& x) - { - AddImpl((*hasher)(x)); - } + virtual void Add(const HashKey* key) = 0; /** * Retrieves the associated count of a given value. * - * @param x The value of type `T` to check. + * @param key The key associated with the element to check. * - * @return The counter associated with *x*. + * @return The counter associated with *key*. 
*/ - template - size_t Count(const T& x) const - { - return CountImpl((*hasher)(x)); - } + virtual size_t Count(const HashKey* key) const = 0; /** * Checks whether the Bloom filter is empty. @@ -72,6 +65,12 @@ public: */ virtual BloomFilter* Clone() const = 0; + /** + * Returns a string with a representation of the Bloom filter's + * internal state. This is for debugging/testing purposes only. + */ + virtual string InternalState() const = 0; + /** * Serializes the Bloom filter. * @@ -106,25 +105,6 @@ protected: */ BloomFilter(const Hasher* hasher); - /** - * Abstract method for implementinng the *Add* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - */ - virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - - /** - * Abstract method for implementing the *Count* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - * @return Returns the counter associated with the hashed element. - */ - virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; - const Hasher* hasher; }; @@ -177,6 +157,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -187,8 +168,8 @@ protected: BasicBloomFilter(); // Overridden from BloomFilter. 
- virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: BitVector* bits; @@ -216,6 +197,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(CountingBloomFilter); @@ -226,8 +208,8 @@ protected: CountingBloomFilter(); // Overridden from BloomFilter. - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: CounterVector* cells; diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index d5635fc0f2..8a6feae5fd 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -153,6 +153,11 @@ CounterVector operator|(const CounterVector& x, const CounterVector& y) } +size_t CounterVector::Hash() const + { + return bits->Hash(); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index df6fc57ac2..9ce522d61c 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -126,6 +126,13 @@ public: */ CounterVector& operator|=(const CounterVector& other); + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. 
* diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 8b34aa5c77..b59274df7d 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -31,6 +31,11 @@ size_t Hasher::MakeSeed(const void* data, size_t size) return *reinterpret_cast(buf); // Use the first bytes as seed. } +Hasher::digest_vector Hasher::Hash(const HashKey* key) const + { + return Hash(key->Key(), key->Size()); + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -77,7 +82,6 @@ Hasher::Hasher(size_t arg_k, size_t arg_seed) seed = arg_seed; } - UHF::UHF(size_t seed) : h(seed) { diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index bd8f5ce5ff..6b75fa1bea 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -50,6 +50,15 @@ public: return Hash(&x, sizeof(T)); } + /** + * Computes hash values for an element. + * + * @param x The key of the value to hash. + * + * @return Vector of *k* hash values. + */ + digest_vector Hash(const HashKey* key) const; + /** * Computes the hashes for a set of bytes. * diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index c288171e5d..98c8dd59a8 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -20,11 +20,6 @@ module GLOBAL; ## Creates a basic Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive @@ -61,11 +56,6 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## alternative to bloomfilter_basic_init where the user has full control over ## the number of hash functions and cells in the underlying bit vector. 
## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## k: The number of hash functions to use. ## ## cells: The number of cells of the underlying bit vector. @@ -102,11 +92,6 @@ function bloomfilter_basic_init2%(k: count, cells: count, ## Creates a counting Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## k: The number of hash functions to use. ## ## cells: The number of cells of the underlying counter vector. As there's no @@ -250,3 +235,13 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, return BloomFilterVal::Merge(bfv1, bfv2); %} + +## Returns a string with a representation of a Bloom filter's internal +## state. This is for debugging/testing purposes only. +## +## bf: The Bloom filter handle. 
+function bloomfilter_internal_state%(bf: opaque of bloomfilter%): string + %{ + BloomFilterVal* bfv = static_cast(bf); + return new StringVal(bfv->InternalState()); + %} diff --git a/testing/btest/Baseline/bifs.bloomfilter-seed/output b/testing/btest/Baseline/bifs.bloomfilter-seed/output new file mode 100644 index 0000000000..53e0f583f2 --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter-seed/output @@ -0,0 +1,8 @@ +bf1, global_seed, 1 +bf2, global_seed, 5 +bf3, my_seed, 5 +bf4, my_seed, 6 +bf1, global_seed, 5 +bf2, global_seed, 6 +bf3, my_seed, 5 +bf4, my_seed, 6 diff --git a/testing/btest/bifs/bloomfilter-seed.bro b/testing/btest/bifs/bloomfilter-seed.bro new file mode 100644 index 0000000000..436638e2af --- /dev/null +++ b/testing/btest/bifs/bloomfilter-seed.bro @@ -0,0 +1,40 @@ +# @TEST-EXEC: bro -b %INPUT global_hash_seed="foo" >>output +# @TEST-EXEC: bro -b %INPUT global_hash_seed="my_seed" >>output +# @TEST-EXEC: btest-diff output + +type Foo: record + { + a: count; + b: string; + }; + +function test_bloom_filter() + { + local bf1 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf1, "foo"); + bloomfilter_add(bf1, "bar"); + + local bf2 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf2, Foo($a=1, $b="xx")); + bloomfilter_add(bf2, Foo($a=2, $b="yy")); + + local bf3 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf3, "foo"); + bloomfilter_add(bf3, "bar"); + + local bf4 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf4, Foo($a=1, $b="xx")); + bloomfilter_add(bf4, Foo($a=2, $b="yy")); + + print "bf1, global_seed", bloomfilter_internal_state(bf1); + print "bf2, global_seed", bloomfilter_internal_state(bf2); + print "bf3, my_seed", bloomfilter_internal_state(bf3); + print "bf4, my_seed", bloomfilter_internal_state(bf4); + + + } + +event bro_init() + { + test_bloom_filter(); + } From 34965b4e77b3091dd0d959873b21239f3da02ac4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 1 Aug 2013 19:15:28 +0200 
Subject: [PATCH 4/5] Support UHF hashing for >= UHASH_KEY_SIZE bytes. --- src/probabilistic/Hasher.cc | 23 +++++++++++++++++++---- src/probabilistic/Hasher.h | 5 +++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index b59274df7d..fe8eb66ad9 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -1,6 +1,7 @@ // See the file "COPYING" in the main distribution directory for copyright. #include +#include #include "Hasher.h" #include "NetVar.h" @@ -82,15 +83,29 @@ Hasher::Hasher(size_t arg_k, size_t arg_seed) seed = arg_seed; } -UHF::UHF(size_t seed) - : h(seed) +UHF::UHF(size_t arg_seed) + : h(arg_seed) { + seed = arg_seed; } +// This function is almost equivalent to HashKey::HashBytes except that it does +// not depend on global state and that we mix in the seed multiple times. Hasher::digest UHF::hash(const void* x, size_t n) const { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h(x, n); + if ( n <= UHASH_KEY_SIZE ) + return n == 0 ? 0 : h(x, n); + + unsigned char d[16]; + MD5(reinterpret_cast(x), n, d); + + const unsigned char* s = reinterpret_cast(&seed); + for ( size_t i = 0; i < 16; ++i ) + d[i] ^= s[i % sizeof(seed)]; + + MD5(d, 16, d); + + return d[0]; } DefaultHasher::DefaultHasher(size_t k, size_t seed) diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 6b75fa1bea..a3322f5e37 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -123,9 +123,9 @@ public: * Constructs an H3 hash function seeded with a given seed and an * optional extra seed to replace the initial Bro seed. * - * @param seed The seed to use for this instance. + * @param arg_seed The seed to use for this instance. 
*/ - UHF(size_t seed = 0); + UHF(size_t arg_seed = 0); template Hasher::digest operator()(const T& x) const @@ -171,6 +171,7 @@ private: static size_t compute_seed(size_t seed); H3 h; + size_t seed; }; From 7ab21706411bb1bb6c191cce7e86b16d2facae78 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 1 Aug 2013 10:46:05 -0700 Subject: [PATCH 5/5] Using a real hash function for hashing a BitVector's internal state. --- src/probabilistic/BitVector.cc | 17 ++++++++++++----- .../btest/Baseline/bifs.bloomfilter-seed/output | 16 ++++++++-------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index f820e6df27..e8c2b2f80e 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -1,10 +1,12 @@ // See the file "COPYING" in the main distribution directory for copyright. -#include "BitVector.h" - +#include #include #include + +#include "BitVector.h" #include "Serializer.h" +#include "digest.h" using namespace probabilistic; @@ -494,10 +496,15 @@ size_t BitVector::Hash() const { size_t hash = 0; - for ( size_type i = 0; i < Blocks(); ++i ) - hash += bits[i]; + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); - return hash; + for ( size_type i = 0; i < Blocks(); ++i ) + sha256_update(&ctx, &bits[i], sizeof(bits[i])); + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes of the digest as the hash value.
} BitVector::size_type BitVector::lowest_bit(block_type block) diff --git a/testing/btest/Baseline/bifs.bloomfilter-seed/output b/testing/btest/Baseline/bifs.bloomfilter-seed/output index 53e0f583f2..533085900f 100644 --- a/testing/btest/Baseline/bifs.bloomfilter-seed/output +++ b/testing/btest/Baseline/bifs.bloomfilter-seed/output @@ -1,8 +1,8 @@ -bf1, global_seed, 1 -bf2, global_seed, 5 -bf3, my_seed, 5 -bf4, my_seed, 6 -bf1, global_seed, 5 -bf2, global_seed, 6 -bf3, my_seed, 5 -bf4, my_seed, 6 +bf1, global_seed, 11979365913534242684 +bf2, global_seed, 12550100962110750449 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 +bf1, global_seed, 12550100962110750449 +bf2, global_seed, 945716460325754659 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659