diff --git a/src/NetVar.cc b/src/NetVar.cc index 388aa46f10..2fee46e2da 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -238,6 +238,8 @@ TableType* record_field_table; StringVal* cmd_line_bpf_filter; +StringVal* global_hash_seed; + OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; @@ -304,6 +306,8 @@ void init_general_global_var() cmd_line_bpf_filter = internal_val("cmd_line_bpf_filter")->AsStringVal(); + global_hash_seed = opt_internal_string("global_hash_seed"); + md5_type = new OpaqueType("md5"); sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); diff --git a/src/NetVar.h b/src/NetVar.h index 7ce33d1a1a..3615108f73 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -242,6 +242,8 @@ extern TableType* record_field_table; extern StringVal* cmd_line_bpf_filter; +extern StringVal* global_hash_seed; + class OpaqueType; extern OpaqueType* md5_type; extern OpaqueType* sha1_type; diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 17597b9a82..e24a207e6e 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -3,11 +3,34 @@ #include #include "Hasher.h" +#include "NetVar.h" #include "digest.h" #include "Serializer.h" using namespace probabilistic; +size_t Hasher::MakeSeed(const void* data, size_t size) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + if ( data ) + sha256_update(&ctx, data, size); + + else if ( global_hash_seed && global_hash_seed->Len() > 0 ) + sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len()); + + else + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + + sha256_final(&ctx, buf); + return *reinterpret_cast<size_t*>(buf); // Use the first bytes as seed. + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const if ( ! 
SERIALIZE(static_cast<uint16>(k)) ) return false; - return SERIALIZE_STR(name.c_str(), name.size()); + return SERIALIZE(static_cast<uint64>(seed)); } bool Hasher::DoUnserialize(UnserialInfo* info) @@ -35,30 +58,26 @@ bool Hasher::DoUnserialize(UnserialInfo* info) uint16 serial_k; if ( ! UNSERIALIZE(&serial_k) ) return false; - k = serial_k; assert(k > 0); - const char* serial_name; - if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + uint64 serial_seed; + if ( ! UNSERIALIZE(&serial_seed) ) return false; - - name = serial_name; - delete [] serial_name; + seed = serial_seed; return true; } -Hasher::Hasher(size_t k, const std::string& name) - : k(k) +Hasher::Hasher(size_t arg_k, size_t arg_seed) { - k = k; - name = arg_name; + k = arg_k; + seed = arg_seed; } -UHF::UHF(size_t seed, const std::string& extra) - : h(compute_seed(seed, extra)) +UHF::UHF(size_t seed) + : h(seed) { } @@ -68,33 +87,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const return n == 0 ? 0 : h(x, n); } -size_t UHF::compute_seed(size_t seed, const std::string& extra) +DefaultHasher::DefaultHasher(size_t k, size_t seed) + : Hasher(k, seed) { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - - else - sha256_update(&ctx, extra.c_str(), extra.size()); - - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - - // Take the first sizeof(size_t) bytes as seed. 
- return *reinterpret_cast<size_t*>(buf); - } - -DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions.push_back(UHF(i, name)); + for ( size_t i = 1; i <= k; ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -137,13 +134,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info) hash_functions.clear(); - for ( size_t i = 0; i < K(); ++i ) - hash_functions.push_back(UHF(i, Name())); + for ( size_t i = 1; i <= K(); ++i ) // Same 1..k indices as the constructor. + hash_functions.push_back(UHF(Seed() + bro_prng(i))); return true; } -DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), h1(1, name), h2(2, name) +DoubleHasher::DoubleHasher(size_t k, size_t seed) + : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } @@ -187,8 +184,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(Hasher); - h1 = UHF(1, Name()); - h2 = UHF(2, Name()); + h1 = UHF(Seed() + bro_prng(1)); + h2 = UHF(Seed() + bro_prng(2)); return true; } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 3acd5c5867..bd8f5ce5ff 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -18,6 +18,20 @@ public: typedef hash_t digest; typedef std::vector<digest> digest_vector; + /** + * Creates a valid hasher seed from an arbitrary string. + * + * @param data A pointer to contiguous data that should be crunched into a + * seed. If 0, the function tries to find a global_hash_seed script variable + * to derive a seed from. If this variable does not exist or is empty, the function uses + * the initial seed generated at Bro startup. + * + * @param size The number of bytes of *data*. + * + * @return A seed suitable for hashers. + */ + static size_t MakeSeed(const void* data, size_t size); + /** * Destructor. */ @@ -64,11 +78,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. 
If not empty, the hasher uses this descriptor - * to seed its *k* hash functions. Otherwise the hasher mixes in the initial - * seed derived from the environment variable `$BRO_SEED`. + * Returns the seed used to construct the hasher. */ - const std::string& Name() const { return name; } + size_t Seed() const { return seed; } bool Serialize(SerialInfo* info) const; static Hasher* Unserialize(UnserialInfo* info); @@ -81,16 +93,15 @@ protected: /** * Constructor. * - * @param k the number of hash functions. + * @param arg_k the number of hash functions. * - * @param name A name for the hasher. Hashers with the same name - * should provide consistent results. + * @param arg_seed The seed for the hasher. */ - Hasher(size_t k, const std::string& name); + Hasher(size_t arg_k, size_t arg_seed); private: size_t k; - std::string name; + size_t seed; }; /** @@ -104,12 +115,8 @@ public: * optional extra seed to replace the initial Bro seed. * * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial - * seed to compute the seed for t to compute the seed NUL-terminated - * string as additional seed. */ - UHF(size_t seed = 0, const std::string& extra = ""); + UHF(size_t seed = 0); template <typename T> Hasher::digest operator()(const T& x) const @@ -152,7 +159,6 @@ public: } private: - static size_t compute_seed(size_t seed, const std::string& extra); H3 h; }; @@ -169,9 +175,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, const std::string& name = ""); + DefaultHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -197,9 +203,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. 
*/ - DoubleHasher(size_t k, const std::string& name = ""); + DoubleHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index a3567ad6f7..d936b77e3b 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -48,7 +48,9 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(optimal_k, seed); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} @@ -86,7 +88,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = new DefaultHasher(k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + + const Hasher* h = new DefaultHasher(k, seed); uint16 width = 1; while ( max >>= 1 ) diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3b40f29553..e6091e25fa 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -21,8 +21,8 @@ function test_basic_bloom_filter() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4z"); # FP - print bloomfilter_lookup(bf_str, "quux"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"); # FP + print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch