diff --git a/NEWS b/NEWS index c421e7d675..64058054d6 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,7 @@ New Functionality the frequency of elements. The corresponding functions are: bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter bloomfilter_add(bf: opaque of bloomfilter, x: any) bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 9876ad03f7..e5300cdc9f 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -3042,6 +3042,11 @@ module GLOBAL; ## Number of bytes per packet to capture from live interfaces. const snaplen = 8192 &redef; +## Seed for hashes computed internally for probabilistic data structures. Using +## the same value here will make the hashes compatible between independent Bro +## instances. If left unset, Bro will use a temporary local seed. +const global_hash_seed: string = "" &redef; + # Load BiFs defined by plugins. @load base/bif/plugins diff --git a/src/NetVar.cc b/src/NetVar.cc index 388aa46f10..2fee46e2da 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -238,6 +238,8 @@ TableType* record_field_table; StringVal* cmd_line_bpf_filter; +StringVal* global_hash_seed; + OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; @@ -304,6 +306,8 @@ void init_general_global_var() cmd_line_bpf_filter = internal_val("cmd_line_bpf_filter")->AsStringVal(); + global_hash_seed = opt_internal_string("global_hash_seed"); + md5_type = new OpaqueType("md5"); sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); diff --git a/src/NetVar.h b/src/NetVar.h index 7ce33d1a1a..3615108f73 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -242,6 +242,8 @@ extern TableType* record_field_table; extern StringVal* cmd_line_bpf_filter; +extern StringVal* global_hash_seed; + class OpaqueType; extern OpaqueType* md5_type; extern OpaqueType* sha1_type; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 6939362ab8..0c1d9d509d 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -566,14 +566,14 @@ BroType* BloomFilterVal::Type() const void BloomFilterVal::Add(const Val* val) { HashKey* key = hash->ComputeHash(val, 1); - bloom_filter->Add(key->Hash()); + bloom_filter->Add(key); delete key; } size_t BloomFilterVal::Count(const Val* val) const { HashKey* key = hash->ComputeHash(val, 1); - size_t cnt = bloom_filter->Count(key->Hash()); + size_t cnt = bloom_filter->Count(key); delete key; return cnt; } @@ -588,6 +588,11 @@ bool BloomFilterVal::Empty() const return bloom_filter->Empty(); } +string BloomFilterVal::InternalState() const + { + return bloom_filter->InternalState(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 52c9583fc7..08a20b1a31 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -127,6 +127,7 @@ public: size_t Count(const Val* val) const; void Clear(); bool Empty() const; + string InternalState() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 6e642e62c1..e8c2b2f80e 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -1,10 +1,12 @@ // See the file "COPYING" in the main distribution directory for copyright. -#include "BitVector.h" - +#include #include #include + +#include "BitVector.h" #include "Serializer.h" +#include "digest.h" using namespace probabilistic; @@ -490,6 +492,21 @@ BitVector::size_type BitVector::FindNext(size_type i) const return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); } +size_t BitVector::Hash() const + { + size_t hash = 0; + + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + for ( size_type i = 0; i < Blocks(); ++i ) + sha256_update(&ctx, &bits[i], sizeof(bits[i])); + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. + } + BitVector::size_type BitVector::lowest_bit(block_type block) { block_type x = block - (block & (block - 1)); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index d9c55d53c6..8e24336345 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -276,6 +276,13 @@ public: */ size_type FindNext(size_type i) const; + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. * diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 23b812269c..bcab6c9b54 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -9,6 +9,8 @@ #include "CounterVector.h" #include "Serializer.h" +#include "../util.h" + using namespace probabilistic; BloomFilter::BloomFilter() @@ -107,6 +109,11 @@ BasicBloomFilter* BasicBloomFilter::Clone() const return copy; } +std::string BasicBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)bits->Hash()); + } + BasicBloomFilter::BasicBloomFilter() { bits = 0; @@ -133,14 +140,18 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return (bits != 0); } -void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) +void BasicBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) bits->Set(h[i] % bits->Size()); } -size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t BasicBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) { if ( ! (*bits)[h[i] % bits->Size()] ) @@ -206,6 +217,11 @@ CountingBloomFilter* CountingBloomFilter::Clone() const return copy; } +string CountingBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)cells->Hash()); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -222,14 +238,18 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) } // TODO: Use partitioning in add/count to allow for reusing CMS bounds. -void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) +void CountingBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) cells->Increment(h[i] % cells->Size()); } -size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t CountingBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 4865ae145c..65dda2396d 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -22,27 +22,20 @@ public: virtual ~BloomFilter(); /** - * Adds an element of type T to the Bloom filter. - * @param x The element to add + * Adds an element to the Bloom filter. + * + * @param key The key associated with the element to add. */ - template - void Add(const T& x) - { - AddImpl((*hasher)(x)); - } + virtual void Add(const HashKey* key) = 0; /** * Retrieves the associated count of a given value. * - * @param x The value of type `T` to check. + * @param key The key associated with the element to check. * - * @return The counter associated with *x*. + * @return The counter associated with *key*. */ - template - size_t Count(const T& x) const - { - return CountImpl((*hasher)(x)); - } + virtual size_t Count(const HashKey* key) const = 0; /** * Checks whether the Bloom filter is empty. @@ -72,6 +65,12 @@ public: */ virtual BloomFilter* Clone() const = 0; + /** + * Returns a string with a representation of the Bloom filter's + * internal state. This is for debugging/testing purposes only. + */ + virtual string InternalState() const = 0; + /** * Serializes the Bloom filter. * @@ -106,25 +105,6 @@ protected: */ BloomFilter(const Hasher* hasher); - /** - * Abstract method for implementinng the *Add* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - */ - virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - - /** - * Abstract method for implementing the *Count* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - * @return Returns the counter associated with the hashed element. - */ - virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; - const Hasher* hasher; }; @@ -177,6 +157,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -187,8 +168,8 @@ protected: BasicBloomFilter(); // Overridden from BloomFilter. - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: BitVector* bits; @@ -216,6 +197,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(CountingBloomFilter); @@ -226,8 +208,8 @@ protected: CountingBloomFilter(); // Overridden from BloomFilter. - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: CounterVector* cells; diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index d5635fc0f2..8a6feae5fd 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -153,6 +153,11 @@ CounterVector operator|(const CounterVector& x, const CounterVector& y) } +size_t CounterVector::Hash() const + { + return bits->Hash(); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index df6fc57ac2..9ce522d61c 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -126,6 +126,13 @@ public: */ CounterVector& operator|=(const CounterVector& other); + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. * diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 17597b9a82..f5b1f4f5f7 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -1,13 +1,42 @@ // See the file "COPYING" in the main distribution directory for copyright. #include +#include #include "Hasher.h" +#include "NetVar.h" #include "digest.h" #include "Serializer.h" using namespace probabilistic; +size_t Hasher::MakeSeed(const void* data, size_t size) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + if ( data ) + sha256_update(&ctx, data, size); + + else if ( global_hash_seed && global_hash_seed->Len() > 0 ) + sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len()); + + else + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. + } + +Hasher::digest_vector Hasher::Hash(const HashKey* key) const + { + return Hash(key->Key(), key->Size()); + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -25,7 +54,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(k)) ) return false; - return SERIALIZE_STR(name.c_str(), name.size()); + return SERIALIZE(static_cast(seed)); } bool Hasher::DoUnserialize(UnserialInfo* info) @@ -39,62 +68,52 @@ bool Hasher::DoUnserialize(UnserialInfo* info) k = serial_k; assert(k > 0); - const char* serial_name; - if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + uint64 serial_seed; + if ( ! UNSERIALIZE(&serial_seed) ) return false; - name = serial_name; - delete [] serial_name; + seed = serial_seed; return true; } -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) +Hasher::Hasher(size_t arg_k, size_t arg_seed) { - k = k; - name = arg_name; + k = arg_k; + seed = arg_seed; } - -UHF::UHF(size_t seed, const std::string& extra) - : h(compute_seed(seed, extra)) +UHF::UHF(size_t arg_seed) + : h(arg_seed) { + seed = arg_seed; } +// This function is almost equivalent to HashKey::HashBytes except that it +// does not depend on global state and that we mix in the seed multiple +// times. Hasher::digest UHF::hash(const void* x, size_t n) const { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h(x, n); + if ( n <= UHASH_KEY_SIZE ) + return n == 0 ? 0 : h(x, n); + + unsigned char d[16]; + MD5(reinterpret_cast(x), n, d); + + const unsigned char* s = reinterpret_cast(&seed); + for ( size_t i = 0; i < 16; ++i ) + d[i] ^= s[i % sizeof(seed)]; + + MD5(d, 16, d); + + return d[0]; } -size_t UHF::compute_seed(size_t seed, const std::string& extra) +DefaultHasher::DefaultHasher(size_t k, size_t seed) + : Hasher(k, seed) { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - - else - sha256_update(&ctx, extra.c_str(), extra.size()); - - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - - // Take the first sizeof(size_t) bytes as seed. - return *reinterpret_cast(buf); - } - -DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions.push_back(UHF(i, name)); + for ( size_t i = 1; i <= k; ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -137,13 +156,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info) hash_functions.clear(); for ( size_t i = 0; i < K(); ++i ) - hash_functions.push_back(UHF(i, Name())); + hash_functions.push_back(UHF(Seed() + bro_prng(i))); return true; } -DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), h1(1, name), h2(2, name) +DoubleHasher::DoubleHasher(size_t k, size_t seed) + : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } @@ -187,8 +206,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(Hasher); - h1 = UHF(1, Name()); - h2 = UHF(2, Name()); + h1 = UHF(Seed() + bro_prng(1)); + h2 = UHF(Seed() + bro_prng(2)); return true; } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 3acd5c5867..a3322f5e37 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -18,6 +18,20 @@ public: typedef hash_t digest; typedef std::vector digest_vector; + /** + * Creates a valid hasher seed from an arbitrary string. + * + * @param data A pointer to contiguous data that should be crunched into a + * seed. If 0, the function tries to find a global_hash_seed script variable + * to derive a seed from. If this variable does not exist, the function uses + * the initial seed generated at Bro startup. + * + * @param size The number of bytes of *data*. + * + * @return A seed suitable for hashers. + */ + static size_t MakeSeed(const void* data, size_t size); + /** * Destructor. */ @@ -36,6 +50,15 @@ public: return Hash(&x, sizeof(T)); } + /** + * Computes hash values for an element. + * + * @param x The key of the value to hash. + * + * @return Vector of *k* hash values. + */ + digest_vector Hash(const HashKey* key) const; + /** * Computes the hashes for a set of bytes. * @@ -64,11 +87,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. If not empty, the hasher uses this descriptor - * to seed its *k* hash functions. Otherwise the hasher mixes in the initial - * seed derived from the environment variable `$BRO_SEED`. + * Returns the seed used to construct the hasher. */ - const std::string& Name() const { return name; } + size_t Seed() const { return seed; } bool Serialize(SerialInfo* info) const; static Hasher* Unserialize(UnserialInfo* info); @@ -81,16 +102,15 @@ protected: /** * Constructor. * - * @param k the number of hash functions. + * @param arg_k the number of hash functions. * - * @param name A name for the hasher. Hashers with the same name - * should provide consistent results. + * @param arg_seed The seed for the hasher. */ - Hasher(size_t k, const std::string& name); + Hasher(size_t arg_k, size_t arg_seed); private: size_t k; - std::string name; + size_t seed; }; /** @@ -103,13 +123,9 @@ public: * Constructs an H3 hash function seeded with a given seed and an * optional extra seed to replace the initial Bro seed. * - * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial - * seed to compute the seed for t to compute the seed NUL-terminated - * string as additional seed. + * @param arg_seed The seed to use for this instance. */ - UHF(size_t seed = 0, const std::string& extra = ""); + UHF(size_t arg_seed = 0); template Hasher::digest operator()(const T& x) const @@ -152,9 +168,10 @@ public: } private: - static size_t compute_seed(size_t seed, const std::string& extra); + static size_t compute_seed(size_t seed); H3 h; + size_t seed; }; @@ -169,9 +186,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, const std::string& name = ""); + DefaultHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -197,9 +214,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DoubleHasher(size_t k, const std::string& name = ""); + DoubleHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 1797f941d2..6994f651dd 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -20,23 +20,20 @@ module GLOBAL; ## Creates a basic Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## ## name: A name that uniquely identifies and seeds the Bloom filter. If empty, -## the filter will remain tied to the current Bro process. +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter %{ @@ -48,18 +45,53 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(optimal_k, seed); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} + +## Creates a basic Bloom filter. This function serves as a low-level +## alternative to bloomfilter_basic_init where the user has full control over +## the number of hash functions and cells in the underlying bit vector. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying bit vector. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . +## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloom_filter_basic_init bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed +function bloomfilter_basic_init2%(k: count, cells: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( k == 0 ) + { + reporter->Error("number of hash functions must be non-negative"); + return 0; + } + if ( cells == 0 ) + { + reporter->Error("number of cells must be non-negative"); + return 0; + } + + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(k, seed); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} ## Creates a counting Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## k: The number of hash functions to use. ## ## cells: The number of cells of the underlying counter vector. As there's no @@ -71,12 +103,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## becomes a cell of size *w* bits. ## ## name: A name that uniquely identifies and seeds the Bloom filter. If empty, -## the filter will remain tied to the current Bro process. +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ @@ -86,7 +120,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = new DefaultHasher(k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + + const Hasher* h = new DefaultHasher(k, seed); uint16 width = 1; while ( max >>= 1 ) @@ -101,8 +138,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear +## bloomfilter_merge function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -127,8 +165,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_clear +## bloomfilter_merge function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); @@ -154,8 +193,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## bf: The Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_merge function bloomfilter_clear%(bf: opaque of bloomfilter%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -178,8 +218,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any ## ## Returns: The union of *bf1* and *bf2*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_clear +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear function bloomfilter_merge%(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter%): opaque of bloomfilter %{ @@ -196,3 +237,13 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, return BloomFilterVal::Merge(bfv1, bfv2); %} + +## Returns a string with a representation of a Bloom filter's internal +## state. This is for debugging/testing purposes only. +## +## bf: The Bloom filter handle. +function bloomfilter_internal_state%(bf: opaque of bloomfilter%): string + %{ + BloomFilterVal* bfv = static_cast(bf); + return new StringVal(bfv->InternalState()); + %} diff --git a/testing/btest/Baseline/bifs.bloomfilter-seed/output b/testing/btest/Baseline/bifs.bloomfilter-seed/output new file mode 100644 index 0000000000..533085900f --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter-seed/output @@ -0,0 +1,8 @@ +bf1, global_seed, 11979365913534242684 +bf2, global_seed, 12550100962110750449 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 +bf1, global_seed, 12550100962110750449 +bf2, global_seed, 945716460325754659 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 991308e10b..731b7c7ce9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -18,6 +18,7 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +1 2 3 3 diff --git a/testing/btest/bifs/bloomfilter-seed.bro b/testing/btest/bifs/bloomfilter-seed.bro new file mode 100644 index 0000000000..436638e2af --- /dev/null +++ b/testing/btest/bifs/bloomfilter-seed.bro @@ -0,0 +1,40 @@ +# @TEST-EXEC: bro -b %INPUT global_hash_seed="foo" >>output +# @TEST-EXEC: bro -b %INPUT global_hash_seed="my_seed" >>output +# @TEST-EXEC: btest-diff output + +type Foo: record + { + a: count; + b: string; + }; + +function test_bloom_filter() + { + local bf1 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf1, "foo"); + bloomfilter_add(bf1, "bar"); + + local bf2 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf2, Foo($a=1, $b="xx")); + bloomfilter_add(bf2, Foo($a=2, $b="yy")); + + local bf3 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf3, "foo"); + bloomfilter_add(bf3, "bar"); + + local bf4 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf4, Foo($a=1, $b="xx")); + bloomfilter_add(bf4, Foo($a=2, $b="yy")); + + print "bf1, global_seed", bloomfilter_internal_state(bf1); + print "bf2, global_seed", bloomfilter_internal_state(bf2); + print "bf3, my_seed", bloomfilter_internal_state(bf3); + print "bf4, my_seed", bloomfilter_internal_state(bf4); + + + } + +event bro_init() + { + test_bloom_filter(); + } diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index f2c882d175..3d3133fa84 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -15,14 +15,21 @@ function test_basic_bloom_filter() bloomfilter_add(bf_cnt, 0.5); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch + # Alternative constructor. + local bf_dbl = bloomfilter_basic_init2(4, 10); + bloomfilter_add(bf_dbl, 4.2); + bloomfilter_add(bf_dbl, 3.14); + print bloomfilter_lookup(bf_dbl, 4.2); + print bloomfilter_lookup(bf_dbl, 3.14); + # Basic usage with strings. local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4z"); # FP - print bloomfilter_lookup(bf_str, "quux"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"); # FP + print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch