mirror of
https://github.com/zeek/zeek.git
synced 2025-10-06 16:48:19 +00:00
Introduce global_hash_seed script variable.
This commit adds support for script-level specification of a seed to be used by hashers. For example, if the given name of a Bloom filter is not empty, then the seed used by the underlying hasher depends only on the Bloom filter name. If the name is empty, we check whether the user defined a non-empty global_hash_seed string variable at script level and use it instead. If that script variable does not exist, then we fall back to the initial seed computed at Bro startup (which is ultimately affected by $BRO_SEED). See Hasher::MakeSeed for details.
This commit is contained in:
parent
af9e181731
commit
8ca76dd4ee
6 changed files with 82 additions and 67 deletions
|
@ -238,6 +238,8 @@ TableType* record_field_table;
|
||||||
|
|
||||||
StringVal* cmd_line_bpf_filter;
|
StringVal* cmd_line_bpf_filter;
|
||||||
|
|
||||||
|
StringVal* global_hash_seed;
|
||||||
|
|
||||||
OpaqueType* md5_type;
|
OpaqueType* md5_type;
|
||||||
OpaqueType* sha1_type;
|
OpaqueType* sha1_type;
|
||||||
OpaqueType* sha256_type;
|
OpaqueType* sha256_type;
|
||||||
|
@ -304,6 +306,8 @@ void init_general_global_var()
|
||||||
cmd_line_bpf_filter =
|
cmd_line_bpf_filter =
|
||||||
internal_val("cmd_line_bpf_filter")->AsStringVal();
|
internal_val("cmd_line_bpf_filter")->AsStringVal();
|
||||||
|
|
||||||
|
global_hash_seed = opt_internal_string("global_hash_seed");
|
||||||
|
|
||||||
md5_type = new OpaqueType("md5");
|
md5_type = new OpaqueType("md5");
|
||||||
sha1_type = new OpaqueType("sha1");
|
sha1_type = new OpaqueType("sha1");
|
||||||
sha256_type = new OpaqueType("sha256");
|
sha256_type = new OpaqueType("sha256");
|
||||||
|
|
|
@ -242,6 +242,8 @@ extern TableType* record_field_table;
|
||||||
|
|
||||||
extern StringVal* cmd_line_bpf_filter;
|
extern StringVal* cmd_line_bpf_filter;
|
||||||
|
|
||||||
|
extern StringVal* global_hash_seed;
|
||||||
|
|
||||||
class OpaqueType;
|
class OpaqueType;
|
||||||
extern OpaqueType* md5_type;
|
extern OpaqueType* md5_type;
|
||||||
extern OpaqueType* sha1_type;
|
extern OpaqueType* sha1_type;
|
||||||
|
|
|
@ -3,11 +3,34 @@
|
||||||
#include <typeinfo>
|
#include <typeinfo>
|
||||||
|
|
||||||
#include "Hasher.h"
|
#include "Hasher.h"
|
||||||
|
#include "NetVar.h"
|
||||||
#include "digest.h"
|
#include "digest.h"
|
||||||
#include "Serializer.h"
|
#include "Serializer.h"
|
||||||
|
|
||||||
using namespace probabilistic;
|
using namespace probabilistic;
|
||||||
|
|
||||||
|
size_t Hasher::MakeSeed(const void* data, size_t size)
|
||||||
|
{
|
||||||
|
u_char buf[SHA256_DIGEST_LENGTH];
|
||||||
|
SHA256_CTX ctx;
|
||||||
|
sha256_init(&ctx);
|
||||||
|
|
||||||
|
if ( data )
|
||||||
|
sha256_update(&ctx, data, size);
|
||||||
|
|
||||||
|
else if ( global_hash_seed && global_hash_seed->Len() > 0 )
|
||||||
|
sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
|
||||||
|
|
||||||
|
else
|
||||||
|
{
|
||||||
|
unsigned int first_seed = initial_seed();
|
||||||
|
sha256_update(&ctx, &first_seed, sizeof(first_seed));
|
||||||
|
}
|
||||||
|
|
||||||
|
sha256_final(&ctx, buf);
|
||||||
|
return *reinterpret_cast<size_t*>(buf); // Use the first bytes as seed.
|
||||||
|
}
|
||||||
|
|
||||||
bool Hasher::Serialize(SerialInfo* info) const
|
bool Hasher::Serialize(SerialInfo* info) const
|
||||||
{
|
{
|
||||||
return SerialObj::Serialize(info);
|
return SerialObj::Serialize(info);
|
||||||
|
@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const
|
||||||
if ( ! SERIALIZE(static_cast<uint16>(k)) )
|
if ( ! SERIALIZE(static_cast<uint16>(k)) )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return SERIALIZE_STR(name.c_str(), name.size());
|
return SERIALIZE(static_cast<uint64>(seed));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Hasher::DoUnserialize(UnserialInfo* info)
|
bool Hasher::DoUnserialize(UnserialInfo* info)
|
||||||
|
@ -35,30 +58,26 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
|
||||||
uint16 serial_k;
|
uint16 serial_k;
|
||||||
if ( ! UNSERIALIZE(&serial_k) )
|
if ( ! UNSERIALIZE(&serial_k) )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
k = serial_k;
|
k = serial_k;
|
||||||
assert(k > 0);
|
assert(k > 0);
|
||||||
|
|
||||||
const char* serial_name;
|
uint64 serial_seed;
|
||||||
if ( ! UNSERIALIZE_STR(&serial_name, 0) )
|
if ( ! UNSERIALIZE(&serial_seed) )
|
||||||
return false;
|
return false;
|
||||||
|
seed = serial_seed;
|
||||||
name = serial_name;
|
|
||||||
delete [] serial_name;
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
Hasher::Hasher(size_t k, const std::string& arg_name)
|
Hasher::Hasher(size_t arg_k, size_t arg_seed)
|
||||||
: k(k)
|
|
||||||
{
|
{
|
||||||
k = k;
|
k = arg_k;
|
||||||
name = arg_name;
|
seed = arg_seed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
UHF::UHF(size_t seed, const std::string& extra)
|
UHF::UHF(size_t seed)
|
||||||
: h(compute_seed(seed, extra))
|
: h(seed)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -68,33 +87,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
|
||||||
return n == 0 ? 0 : h(x, n);
|
return n == 0 ? 0 : h(x, n);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t UHF::compute_seed(size_t seed, const std::string& extra)
|
DefaultHasher::DefaultHasher(size_t k, size_t seed)
|
||||||
|
: Hasher(k, seed)
|
||||||
{
|
{
|
||||||
u_char buf[SHA256_DIGEST_LENGTH];
|
for ( size_t i = 1; i <= k; ++i )
|
||||||
SHA256_CTX ctx;
|
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
|
||||||
sha256_init(&ctx);
|
|
||||||
|
|
||||||
if ( extra.empty() )
|
|
||||||
{
|
|
||||||
unsigned int first_seed = initial_seed();
|
|
||||||
sha256_update(&ctx, &first_seed, sizeof(first_seed));
|
|
||||||
}
|
|
||||||
|
|
||||||
else
|
|
||||||
sha256_update(&ctx, extra.c_str(), extra.size());
|
|
||||||
|
|
||||||
sha256_update(&ctx, &seed, sizeof(seed));
|
|
||||||
sha256_final(&ctx, buf);
|
|
||||||
|
|
||||||
// Take the first sizeof(size_t) bytes as seed.
|
|
||||||
return *reinterpret_cast<size_t*>(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
DefaultHasher::DefaultHasher(size_t k, const std::string& name)
|
|
||||||
: Hasher(k, name)
|
|
||||||
{
|
|
||||||
for ( size_t i = 0; i < k; ++i )
|
|
||||||
hash_functions.push_back(UHF(i, name));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
|
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
|
||||||
|
@ -137,13 +134,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
|
||||||
|
|
||||||
hash_functions.clear();
|
hash_functions.clear();
|
||||||
for ( size_t i = 0; i < K(); ++i )
|
for ( size_t i = 0; i < K(); ++i )
|
||||||
hash_functions.push_back(UHF(i, Name()));
|
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
DoubleHasher::DoubleHasher(size_t k, const std::string& name)
|
DoubleHasher::DoubleHasher(size_t k, size_t seed)
|
||||||
: Hasher(k, name), h1(1, name), h2(2, name)
|
: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -187,8 +184,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info)
|
||||||
{
|
{
|
||||||
DO_UNSERIALIZE(Hasher);
|
DO_UNSERIALIZE(Hasher);
|
||||||
|
|
||||||
h1 = UHF(1, Name());
|
h1 = UHF(Seed() + bro_prng(1));
|
||||||
h2 = UHF(2, Name());
|
h2 = UHF(Seed() + bro_prng(2));
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,20 @@ public:
|
||||||
typedef hash_t digest;
|
typedef hash_t digest;
|
||||||
typedef std::vector<digest> digest_vector;
|
typedef std::vector<digest> digest_vector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a valid hasher seed from an arbitrary string.
|
||||||
|
*
|
||||||
|
* @param data A pointer to contiguous data that should be crunched into a
|
||||||
|
* seed. If 0, the function tries to find a global_hash_seed script variable
|
||||||
|
* to derive a seed from. If this variable does not exist, the function uses
|
||||||
|
* the initial seed generated at Bro startup.
|
||||||
|
*
|
||||||
|
* @param size The number of bytes of *data*.
|
||||||
|
*
|
||||||
|
* @return A seed suitable for hashers.
|
||||||
|
*/
|
||||||
|
static size_t MakeSeed(const void* data, size_t size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Destructor.
|
* Destructor.
|
||||||
*/
|
*/
|
||||||
|
@ -64,11 +78,9 @@ public:
|
||||||
size_t K() const { return k; }
|
size_t K() const { return k; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the hasher's name. If not empty, the hasher uses this descriptor
|
* Returns the seed used to construct the hasher.
|
||||||
* to seed its *k* hash functions. Otherwise the hasher mixes in the initial
|
|
||||||
* seed derived from the environment variable `$BRO_SEED`.
|
|
||||||
*/
|
*/
|
||||||
const std::string& Name() const { return name; }
|
size_t Seed() const { return seed; }
|
||||||
|
|
||||||
bool Serialize(SerialInfo* info) const;
|
bool Serialize(SerialInfo* info) const;
|
||||||
static Hasher* Unserialize(UnserialInfo* info);
|
static Hasher* Unserialize(UnserialInfo* info);
|
||||||
|
@ -81,16 +93,15 @@ protected:
|
||||||
/**
|
/**
|
||||||
* Constructor.
|
* Constructor.
|
||||||
*
|
*
|
||||||
* @param k the number of hash functions.
|
* @param arg_k the number of hash functions.
|
||||||
*
|
*
|
||||||
* @param name A name for the hasher. Hashers with the same name
|
* @param arg_seed The seed for the hasher.
|
||||||
* should provide consistent results.
|
|
||||||
*/
|
*/
|
||||||
Hasher(size_t k, const std::string& name);
|
Hasher(size_t arg_k, size_t arg_seed);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
size_t k;
|
size_t k;
|
||||||
std::string name;
|
size_t seed;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -104,12 +115,8 @@ public:
|
||||||
* optional extra seed to replace the initial Bro seed.
|
* optional extra seed to replace the initial Bro seed.
|
||||||
*
|
*
|
||||||
* @param seed The seed to use for this instance.
|
* @param seed The seed to use for this instance.
|
||||||
*
|
|
||||||
* @param extra If not empty, this parameter replaces the initial
|
|
||||||
* seed to compute the seed for t to compute the seed NUL-terminated
|
|
||||||
* string as additional seed.
|
|
||||||
*/
|
*/
|
||||||
UHF(size_t seed = 0, const std::string& extra = "");
|
UHF(size_t seed = 0);
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Hasher::digest operator()(const T& x) const
|
Hasher::digest operator()(const T& x) const
|
||||||
|
@ -152,7 +159,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static size_t compute_seed(size_t seed, const std::string& extra);
|
static size_t compute_seed(size_t seed);
|
||||||
|
|
||||||
H3<Hasher::digest, UHASH_KEY_SIZE> h;
|
H3<Hasher::digest, UHASH_KEY_SIZE> h;
|
||||||
};
|
};
|
||||||
|
@ -169,9 +176,9 @@ public:
|
||||||
*
|
*
|
||||||
* @param k The number of hash functions to use.
|
* @param k The number of hash functions to use.
|
||||||
*
|
*
|
||||||
* @param name The name of the hasher.
|
* @param seed The seed for the hasher.
|
||||||
*/
|
*/
|
||||||
DefaultHasher(size_t k, const std::string& name = "");
|
DefaultHasher(size_t k, size_t seed);
|
||||||
|
|
||||||
// Overridden from Hasher.
|
// Overridden from Hasher.
|
||||||
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
||||||
|
@ -197,9 +204,9 @@ public:
|
||||||
*
|
*
|
||||||
* @param k The number of hash functions to use.
|
* @param k The number of hash functions to use.
|
||||||
*
|
*
|
||||||
* @param name The name of the hasher.
|
* @param seed The seed for the hasher.
|
||||||
*/
|
*/
|
||||||
DoubleHasher(size_t k, const std::string& name = "");
|
DoubleHasher(size_t k, size_t seed);
|
||||||
|
|
||||||
// Overridden from Hasher.
|
// Overridden from Hasher.
|
||||||
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
||||||
|
|
|
@ -48,7 +48,9 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
|
||||||
|
|
||||||
size_t cells = BasicBloomFilter::M(fp, capacity);
|
size_t cells = BasicBloomFilter::M(fp, capacity);
|
||||||
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
|
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
|
||||||
const Hasher* h = new DefaultHasher(optimal_k, name->CheckString());
|
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||||
|
name->Len());
|
||||||
|
const Hasher* h = new DefaultHasher(optimal_k, seed);
|
||||||
|
|
||||||
return new BloomFilterVal(new BasicBloomFilter(h, cells));
|
return new BloomFilterVal(new BasicBloomFilter(h, cells));
|
||||||
%}
|
%}
|
||||||
|
@ -86,7 +88,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const Hasher* h = new DefaultHasher(k, name->CheckString());
|
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||||
|
name->Len());
|
||||||
|
|
||||||
|
const Hasher* h = new DefaultHasher(k, seed);
|
||||||
|
|
||||||
uint16 width = 1;
|
uint16 width = 1;
|
||||||
while ( max >>= 1 )
|
while ( max >>= 1 )
|
||||||
|
|
|
@ -21,8 +21,8 @@ function test_basic_bloom_filter()
|
||||||
bloomfilter_add(bf_str, "bar");
|
bloomfilter_add(bf_str, "bar");
|
||||||
print bloomfilter_lookup(bf_str, "foo");
|
print bloomfilter_lookup(bf_str, "foo");
|
||||||
print bloomfilter_lookup(bf_str, "bar");
|
print bloomfilter_lookup(bf_str, "bar");
|
||||||
print bloomfilter_lookup(bf_str, "b4z"); # FP
|
print bloomfilter_lookup(bf_str, "b4zzz"); # FP
|
||||||
print bloomfilter_lookup(bf_str, "quux"); # FP
|
print bloomfilter_lookup(bf_str, "quuux"); # FP
|
||||||
bloomfilter_add(bf_str, 0.5); # Type mismatch
|
bloomfilter_add(bf_str, 0.5); # Type mismatch
|
||||||
bloomfilter_add(bf_str, 100); # Type mismatch
|
bloomfilter_add(bf_str, 100); # Type mismatch
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue