diff --git a/NEWS b/NEWS index c421e7d675..64058054d6 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,7 @@ New Functionality the frequency of elements. The corresponding functions are: bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter bloomfilter_add(bf: opaque of bloomfilter, x: any) bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count diff --git a/src/NetVar.cc b/src/NetVar.cc index 388aa46f10..2fee46e2da 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -238,6 +238,8 @@ TableType* record_field_table; StringVal* cmd_line_bpf_filter; +StringVal* global_hash_seed; + OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; @@ -304,6 +306,8 @@ void init_general_global_var() cmd_line_bpf_filter = internal_val("cmd_line_bpf_filter")->AsStringVal(); + global_hash_seed = opt_internal_string("global_hash_seed"); + md5_type = new OpaqueType("md5"); sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); diff --git a/src/NetVar.h b/src/NetVar.h index 7ce33d1a1a..3615108f73 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -242,6 +242,8 @@ extern TableType* record_field_table; extern StringVal* cmd_line_bpf_filter; +extern StringVal* global_hash_seed; + class OpaqueType; extern OpaqueType* md5_type; extern OpaqueType* sha1_type; diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 17597b9a82..8b34aa5c77 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -3,11 +3,34 @@ #include #include "Hasher.h" +#include "NetVar.h" #include "digest.h" #include "Serializer.h" using namespace probabilistic; +size_t Hasher::MakeSeed(const void* data, size_t size) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + if ( data ) + sha256_update(&ctx, data, size); + + else if ( global_hash_seed && global_hash_seed->Len() > 0 ) + sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len()); + + else + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(k)) ) return false; - return SERIALIZE_STR(name.c_str(), name.size()); + return SERIALIZE(static_cast(seed)); } bool Hasher::DoUnserialize(UnserialInfo* info) @@ -39,26 +62,24 @@ bool Hasher::DoUnserialize(UnserialInfo* info) k = serial_k; assert(k > 0); - const char* serial_name; - if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + uint64 serial_seed; + if ( ! UNSERIALIZE(&serial_seed) ) return false; - name = serial_name; - delete [] serial_name; + seed = serial_seed; return true; } -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) +Hasher::Hasher(size_t arg_k, size_t arg_seed) { - k = k; - name = arg_name; + k = arg_k; + seed = arg_seed; } -UHF::UHF(size_t seed, const std::string& extra) - : h(compute_seed(seed, extra)) +UHF::UHF(size_t seed) + : h(seed) { } @@ -68,33 +89,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const return n == 0 ? 0 : h(x, n); } -size_t UHF::compute_seed(size_t seed, const std::string& extra) +DefaultHasher::DefaultHasher(size_t k, size_t seed) + : Hasher(k, seed) { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - - else - sha256_update(&ctx, extra.c_str(), extra.size()); - - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - - // Take the first sizeof(size_t) bytes as seed. - return *reinterpret_cast(buf); - } - -DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions.push_back(UHF(i, name)); + for ( size_t i = 1; i <= k; ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -137,13 +136,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info) hash_functions.clear(); for ( size_t i = 0; i < K(); ++i ) - hash_functions.push_back(UHF(i, Name())); + hash_functions.push_back(UHF(Seed() + bro_prng(i))); return true; } -DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), h1(1, name), h2(2, name) +DoubleHasher::DoubleHasher(size_t k, size_t seed) + : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } @@ -187,8 +186,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(Hasher); - h1 = UHF(1, Name()); - h2 = UHF(2, Name()); + h1 = UHF(Seed() + bro_prng(1)); + h2 = UHF(Seed() + bro_prng(2)); return true; } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 3acd5c5867..bd8f5ce5ff 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -18,6 +18,20 @@ public: typedef hash_t digest; typedef std::vector digest_vector; + /** + * Creates a valid hasher seed from an arbitrary string. + * + * @param data A pointer to contiguous data that should be crunched into a + * seed. If 0, the function tries to find a global_hash_seed script variable + * to derive a seed from. If this variable does not exist, the function uses + * the initial seed generated at Bro startup. + * + * @param size The number of bytes of *data*. + * + * @return A seed suitable for hashers. + */ + static size_t MakeSeed(const void* data, size_t size); + /** * Destructor. */ @@ -64,11 +78,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. If not empty, the hasher uses this descriptor - * to seed its *k* hash functions. Otherwise the hasher mixes in the initial - * seed derived from the environment variable `$BRO_SEED`. + * Returns the seed used to construct the hasher. */ - const std::string& Name() const { return name; } + size_t Seed() const { return seed; } bool Serialize(SerialInfo* info) const; static Hasher* Unserialize(UnserialInfo* info); @@ -81,16 +93,15 @@ protected: /** * Constructor. * - * @param k the number of hash functions. + * @param arg_k the number of hash functions. * - * @param name A name for the hasher. Hashers with the same name - * should provide consistent results. + * @param arg_seed The seed for the hasher. */ - Hasher(size_t k, const std::string& name); + Hasher(size_t arg_k, size_t arg_seed); private: size_t k; - std::string name; + size_t seed; }; /** @@ -104,12 +115,8 @@ public: * optional extra seed to replace the initial Bro seed. * * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial - * seed to compute the seed for t to compute the seed NUL-terminated - * string as additional seed. */ - UHF(size_t seed = 0, const std::string& extra = ""); + UHF(size_t seed = 0); template Hasher::digest operator()(const T& x) const @@ -152,7 +159,7 @@ public: } private: - static size_t compute_seed(size_t seed, const std::string& extra); + static size_t compute_seed(size_t seed); H3 h; }; @@ -169,9 +176,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, const std::string& name = ""); + DefaultHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -197,9 +204,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DoubleHasher(size_t k, const std::string& name = ""); + DoubleHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index a3567ad6f7..c288171e5d 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -31,12 +31,14 @@ module GLOBAL; ## rate of *fp*. ## ## name: A name that uniquely identifies and seeds the Bloom filter. If empty, -## the filter will remain tied to the current Bro process. +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter %{ @@ -48,7 +50,52 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(optimal_k, seed); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} + +## Creates a basic Bloom filter. This function serves as a low-level +## alternative to bloomfilter_basic_init where the user has full control over +## the number of hash functions and cells in the underlying bit vector. +## +## .. note:: A Bloom filter can have a name associated with it. In the future, +## Bloom filters with the same name will be compatible across indepedent Bro +## instances, i.e., it will be possible to merge them. Currently, however, that is +## not yet supported. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying bit vector. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . +## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloom_filter_basic_init bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed +function bloomfilter_basic_init2%(k: count, cells: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( k == 0 ) + { + reporter->Error("number of hash functions must be non-negative"); + return 0; + } + if ( cells == 0 ) + { + reporter->Error("number of cells must be non-negative"); + return 0; + } + + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(k, seed); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} @@ -71,12 +118,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## becomes a cell of size *w* bits. ## ## name: A name that uniquely identifies and seeds the Bloom filter. If empty, -## the filter will remain tied to the current Bro process. +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ @@ -86,7 +135,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = new DefaultHasher(k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + + const Hasher* h = new DefaultHasher(k, seed); uint16 width = 1; while ( max >>= 1 ) @@ -101,8 +153,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear +## bloomfilter_merge function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -127,8 +180,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_clear +## bloomfilter_merge function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); @@ -154,8 +208,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## bf: The Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_merge function bloomfilter_clear%(bf: opaque of bloomfilter%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -178,8 +233,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any ## ## Returns: The union of *bf1* and *bf2*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_clear +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear function bloomfilter_merge%(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter%): opaque of bloomfilter %{ diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 14e1f038c0..731b7c7ce9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -17,6 +17,8 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +1 +1 2 3 3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3b40f29553..c2a1c47ca8 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -15,14 +15,21 @@ function test_basic_bloom_filter() bloomfilter_add(bf_cnt, 0.5); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch + # Alternative constructor. + local bf_dbl = bloomfilter_basic_init2(4, 10); + bloomfilter_add(bf_dbl, 4.2); + bloomfilter_add(bf_dbl, 3.14); + print bloomfilter_lookup(bf_dbl, 4.2); + print bloomfilter_lookup(bf_dbl, 3.14); + # Basic usage with strings. local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4z"); # FP - print bloomfilter_lookup(bf_str, "quux"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"); # FP + print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch