mirror of
https://github.com/zeek/zeek.git
synced 2025-10-06 16:48:19 +00:00
Merge remote-tracking branch 'origin/topic/matthias/bloom-filter'
* origin/topic/matthias/bloom-filter: Add new BiF for low-level Bloom filter initialization. Introduce global_hash_seed script variable.
This commit is contained in:
commit
6c197fbebf
8 changed files with 157 additions and 79 deletions
1
NEWS
1
NEWS
|
@ -113,6 +113,7 @@ New Functionality
|
|||
the frequency of elements. The corresponding functions are:
|
||||
|
||||
bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_add(bf: opaque of bloomfilter, x: any)
|
||||
bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count
|
||||
|
|
|
@ -238,6 +238,8 @@ TableType* record_field_table;
|
|||
|
||||
StringVal* cmd_line_bpf_filter;
|
||||
|
||||
StringVal* global_hash_seed;
|
||||
|
||||
OpaqueType* md5_type;
|
||||
OpaqueType* sha1_type;
|
||||
OpaqueType* sha256_type;
|
||||
|
@ -304,6 +306,8 @@ void init_general_global_var()
|
|||
cmd_line_bpf_filter =
|
||||
internal_val("cmd_line_bpf_filter")->AsStringVal();
|
||||
|
||||
global_hash_seed = opt_internal_string("global_hash_seed");
|
||||
|
||||
md5_type = new OpaqueType("md5");
|
||||
sha1_type = new OpaqueType("sha1");
|
||||
sha256_type = new OpaqueType("sha256");
|
||||
|
|
|
@ -242,6 +242,8 @@ extern TableType* record_field_table;
|
|||
|
||||
extern StringVal* cmd_line_bpf_filter;
|
||||
|
||||
extern StringVal* global_hash_seed;
|
||||
|
||||
class OpaqueType;
|
||||
extern OpaqueType* md5_type;
|
||||
extern OpaqueType* sha1_type;
|
||||
|
|
|
@ -3,11 +3,34 @@
|
|||
#include <typeinfo>
|
||||
|
||||
#include "Hasher.h"
|
||||
#include "NetVar.h"
|
||||
#include "digest.h"
|
||||
#include "Serializer.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
size_t Hasher::MakeSeed(const void* data, size_t size)
|
||||
{
|
||||
u_char buf[SHA256_DIGEST_LENGTH];
|
||||
SHA256_CTX ctx;
|
||||
sha256_init(&ctx);
|
||||
|
||||
if ( data )
|
||||
sha256_update(&ctx, data, size);
|
||||
|
||||
else if ( global_hash_seed && global_hash_seed->Len() > 0 )
|
||||
sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
|
||||
|
||||
else
|
||||
{
|
||||
unsigned int first_seed = initial_seed();
|
||||
sha256_update(&ctx, &first_seed, sizeof(first_seed));
|
||||
}
|
||||
|
||||
sha256_final(&ctx, buf);
|
||||
return *reinterpret_cast<size_t*>(buf); // Use the first bytes as seed.
|
||||
}
|
||||
|
||||
bool Hasher::Serialize(SerialInfo* info) const
|
||||
{
|
||||
return SerialObj::Serialize(info);
|
||||
|
@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const
|
|||
if ( ! SERIALIZE(static_cast<uint16>(k)) )
|
||||
return false;
|
||||
|
||||
return SERIALIZE_STR(name.c_str(), name.size());
|
||||
return SERIALIZE(static_cast<uint64>(seed));
|
||||
}
|
||||
|
||||
bool Hasher::DoUnserialize(UnserialInfo* info)
|
||||
|
@ -39,26 +62,24 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
|
|||
k = serial_k;
|
||||
assert(k > 0);
|
||||
|
||||
const char* serial_name;
|
||||
if ( ! UNSERIALIZE_STR(&serial_name, 0) )
|
||||
uint64 serial_seed;
|
||||
if ( ! UNSERIALIZE(&serial_seed) )
|
||||
return false;
|
||||
|
||||
name = serial_name;
|
||||
delete [] serial_name;
|
||||
seed = serial_seed;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
Hasher::Hasher(size_t k, const std::string& arg_name)
|
||||
: k(k)
|
||||
Hasher::Hasher(size_t arg_k, size_t arg_seed)
|
||||
{
|
||||
k = k;
|
||||
name = arg_name;
|
||||
k = arg_k;
|
||||
seed = arg_seed;
|
||||
}
|
||||
|
||||
|
||||
UHF::UHF(size_t seed, const std::string& extra)
|
||||
: h(compute_seed(seed, extra))
|
||||
UHF::UHF(size_t seed)
|
||||
: h(seed)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -68,33 +89,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
|
|||
return n == 0 ? 0 : h(x, n);
|
||||
}
|
||||
|
||||
size_t UHF::compute_seed(size_t seed, const std::string& extra)
|
||||
DefaultHasher::DefaultHasher(size_t k, size_t seed)
|
||||
: Hasher(k, seed)
|
||||
{
|
||||
u_char buf[SHA256_DIGEST_LENGTH];
|
||||
SHA256_CTX ctx;
|
||||
sha256_init(&ctx);
|
||||
|
||||
if ( extra.empty() )
|
||||
{
|
||||
unsigned int first_seed = initial_seed();
|
||||
sha256_update(&ctx, &first_seed, sizeof(first_seed));
|
||||
}
|
||||
|
||||
else
|
||||
sha256_update(&ctx, extra.c_str(), extra.size());
|
||||
|
||||
sha256_update(&ctx, &seed, sizeof(seed));
|
||||
sha256_final(&ctx, buf);
|
||||
|
||||
// Take the first sizeof(size_t) bytes as seed.
|
||||
return *reinterpret_cast<size_t*>(buf);
|
||||
}
|
||||
|
||||
DefaultHasher::DefaultHasher(size_t k, const std::string& name)
|
||||
: Hasher(k, name)
|
||||
{
|
||||
for ( size_t i = 0; i < k; ++i )
|
||||
hash_functions.push_back(UHF(i, name));
|
||||
for ( size_t i = 1; i <= k; ++i )
|
||||
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
|
||||
}
|
||||
|
||||
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
|
||||
|
@ -137,13 +136,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
|
|||
|
||||
hash_functions.clear();
|
||||
for ( size_t i = 0; i < K(); ++i )
|
||||
hash_functions.push_back(UHF(i, Name()));
|
||||
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
DoubleHasher::DoubleHasher(size_t k, const std::string& name)
|
||||
: Hasher(k, name), h1(1, name), h2(2, name)
|
||||
DoubleHasher::DoubleHasher(size_t k, size_t seed)
|
||||
: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -187,8 +186,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info)
|
|||
{
|
||||
DO_UNSERIALIZE(Hasher);
|
||||
|
||||
h1 = UHF(1, Name());
|
||||
h2 = UHF(2, Name());
|
||||
h1 = UHF(Seed() + bro_prng(1));
|
||||
h2 = UHF(Seed() + bro_prng(2));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,20 @@ public:
|
|||
typedef hash_t digest;
|
||||
typedef std::vector<digest> digest_vector;
|
||||
|
||||
/**
|
||||
* Creates a valid hasher seed from an arbitrary string.
|
||||
*
|
||||
* @param data A pointer to contiguous data that should be crunched into a
|
||||
* seed. If 0, the function tries to find a global_hash_seed script variable
|
||||
* to derive a seed from. If this variable does not exist, the function uses
|
||||
* the initial seed generated at Bro startup.
|
||||
*
|
||||
* @param size The number of bytes of *data*.
|
||||
*
|
||||
* @return A seed suitable for hashers.
|
||||
*/
|
||||
static size_t MakeSeed(const void* data, size_t size);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
|
@ -64,11 +78,9 @@ public:
|
|||
size_t K() const { return k; }
|
||||
|
||||
/**
|
||||
* Returns the hasher's name. If not empty, the hasher uses this descriptor
|
||||
* to seed its *k* hash functions. Otherwise the hasher mixes in the initial
|
||||
* seed derived from the environment variable `$BRO_SEED`.
|
||||
* Returns the seed used to construct the hasher.
|
||||
*/
|
||||
const std::string& Name() const { return name; }
|
||||
size_t Seed() const { return seed; }
|
||||
|
||||
bool Serialize(SerialInfo* info) const;
|
||||
static Hasher* Unserialize(UnserialInfo* info);
|
||||
|
@ -81,16 +93,15 @@ protected:
|
|||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param k the number of hash functions.
|
||||
* @param arg_k the number of hash functions.
|
||||
*
|
||||
* @param name A name for the hasher. Hashers with the same name
|
||||
* should provide consistent results.
|
||||
* @param arg_seed The seed for the hasher.
|
||||
*/
|
||||
Hasher(size_t k, const std::string& name);
|
||||
Hasher(size_t arg_k, size_t arg_seed);
|
||||
|
||||
private:
|
||||
size_t k;
|
||||
std::string name;
|
||||
size_t seed;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -104,12 +115,8 @@ public:
|
|||
* optional extra seed to replace the initial Bro seed.
|
||||
*
|
||||
* @param seed The seed to use for this instance.
|
||||
*
|
||||
* @param extra If not empty, this parameter replaces the initial
|
||||
* seed to compute the seed for t to compute the seed NUL-terminated
|
||||
* string as additional seed.
|
||||
*/
|
||||
UHF(size_t seed = 0, const std::string& extra = "");
|
||||
UHF(size_t seed = 0);
|
||||
|
||||
template <typename T>
|
||||
Hasher::digest operator()(const T& x) const
|
||||
|
@ -152,7 +159,7 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
static size_t compute_seed(size_t seed, const std::string& extra);
|
||||
static size_t compute_seed(size_t seed);
|
||||
|
||||
H3<Hasher::digest, UHASH_KEY_SIZE> h;
|
||||
};
|
||||
|
@ -169,9 +176,9 @@ public:
|
|||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param name The name of the hasher.
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DefaultHasher(size_t k, const std::string& name = "");
|
||||
DefaultHasher(size_t k, size_t seed);
|
||||
|
||||
// Overridden from Hasher.
|
||||
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
||||
|
@ -197,9 +204,9 @@ public:
|
|||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param name The name of the hasher.
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DoubleHasher(size_t k, const std::string& name = "");
|
||||
DoubleHasher(size_t k, size_t seed);
|
||||
|
||||
// Overridden from Hasher.
|
||||
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
||||
|
|
|
@ -31,12 +31,14 @@ module GLOBAL;
|
|||
## rate of *fp*.
|
||||
##
|
||||
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
|
||||
## the filter will remain tied to the current Bro process.
|
||||
## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use
|
||||
## a local seed tied to the current Bro process. Only filters with the same seed
|
||||
## can be merged with :bro:id:`bloomfilter_merge` .
|
||||
##
|
||||
## Returns: A Bloom filter handle.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
|
||||
## bloomfilter_clear bloomfilter_merge
|
||||
## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add
|
||||
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
|
||||
function bloomfilter_basic_init%(fp: double, capacity: count,
|
||||
name: string &default=""%): opaque of bloomfilter
|
||||
%{
|
||||
|
@ -48,7 +50,52 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
|
|||
|
||||
size_t cells = BasicBloomFilter::M(fp, capacity);
|
||||
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
|
||||
const Hasher* h = new DefaultHasher(optimal_k, name->CheckString());
|
||||
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
name->Len());
|
||||
const Hasher* h = new DefaultHasher(optimal_k, seed);
|
||||
|
||||
return new BloomFilterVal(new BasicBloomFilter(h, cells));
|
||||
%}
|
||||
|
||||
## Creates a basic Bloom filter. This function serves as a low-level
|
||||
## alternative to bloomfilter_basic_init where the user has full control over
|
||||
## the number of hash functions and cells in the underlying bit vector.
|
||||
##
|
||||
## .. note:: A Bloom filter can have a name associated with it. In the future,
|
||||
## Bloom filters with the same name will be compatible across independent Bro
|
||||
## instances, i.e., it will be possible to merge them. Currently, however, that is
|
||||
## not yet supported.
|
||||
##
|
||||
## k: The number of hash functions to use.
|
||||
##
|
||||
## cells: The number of cells of the underlying bit vector.
|
||||
##
|
||||
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
|
||||
## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use
|
||||
## a local seed tied to the current Bro process. Only filters with the same seed
|
||||
## can be merged with :bro:id:`bloomfilter_merge` .
|
||||
##
|
||||
## Returns: A Bloom filter handle.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
|
||||
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
|
||||
function bloomfilter_basic_init2%(k: count, cells: count,
|
||||
name: string &default=""%): opaque of bloomfilter
|
||||
%{
|
||||
if ( k == 0 )
|
||||
{
|
||||
reporter->Error("number of hash functions must be non-negative");
|
||||
return 0;
|
||||
}
|
||||
if ( cells == 0 )
|
||||
{
|
||||
reporter->Error("number of cells must be non-negative");
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
name->Len());
|
||||
const Hasher* h = new DefaultHasher(k, seed);
|
||||
|
||||
return new BloomFilterVal(new BasicBloomFilter(h, cells));
|
||||
%}
|
||||
|
@ -71,12 +118,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
|
|||
## becomes a cell of size *w* bits.
|
||||
##
|
||||
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
|
||||
## the filter will remain tied to the current Bro process.
|
||||
## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use
|
||||
## a local seed tied to the current Bro process. Only filters with the same seed
|
||||
## can be merged with :bro:id:`bloomfilter_merge` .
|
||||
##
|
||||
## Returns: A Bloom filter handle.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup
|
||||
## bloomfilter_clear bloomfilter_merge
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add
|
||||
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
|
||||
function bloomfilter_counting_init%(k: count, cells: count, max: count,
|
||||
name: string &default=""%): opaque of bloomfilter
|
||||
%{
|
||||
|
@ -86,7 +135,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
|
|||
return 0;
|
||||
}
|
||||
|
||||
const Hasher* h = new DefaultHasher(k, name->CheckString());
|
||||
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
name->Len());
|
||||
|
||||
const Hasher* h = new DefaultHasher(k, seed);
|
||||
|
||||
uint16 width = 1;
|
||||
while ( max >>= 1 )
|
||||
|
@ -101,8 +153,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
|
|||
##
|
||||
## x: The element to add.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup
|
||||
## bloomfilter_clear bloomfilter_merge
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
|
||||
## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear
|
||||
## bloomfilter_merge
|
||||
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
|
||||
%{
|
||||
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
|
||||
|
@ -127,8 +180,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
|
|||
##
|
||||
## Returns: the counter associated with *x* in *bf*.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
|
||||
## bloomfilter_add bloomfilter_clear bloomfilter_merge
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
|
||||
## bloomfilter_counting_init bloomfilter_add bloomfilter_clear
|
||||
## bloomfilter_merge
|
||||
function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
|
||||
%{
|
||||
const BloomFilterVal* bfv = static_cast<const BloomFilterVal*>(bf);
|
||||
|
@ -154,8 +208,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
|
|||
##
|
||||
## bf: The Bloom filter handle.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
|
||||
## bloomfilter_add bloomfilter_lookup bloomfilter_merge
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
|
||||
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
|
||||
## bloomfilter_merge
|
||||
function bloomfilter_clear%(bf: opaque of bloomfilter%): any
|
||||
%{
|
||||
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
|
||||
|
@ -178,8 +233,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
|
|||
##
|
||||
## Returns: The union of *bf1* and *bf2*.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
|
||||
## bloomfilter_add bloomfilter_lookup bloomfilter_clear
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
|
||||
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
|
||||
## bloomfilter_clear
|
||||
function bloomfilter_merge%(bf1: opaque of bloomfilter,
|
||||
bf2: opaque of bloomfilter%): opaque of bloomfilter
|
||||
%{
|
||||
|
|
|
@ -17,6 +17,8 @@ error: false-positive rate must take value between 0 and 1
|
|||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
2
|
||||
3
|
||||
3
|
||||
|
|
|
@ -15,14 +15,21 @@ function test_basic_bloom_filter()
|
|||
bloomfilter_add(bf_cnt, 0.5); # Type mismatch
|
||||
bloomfilter_add(bf_cnt, "foo"); # Type mismatch
|
||||
|
||||
# Alternative constructor.
|
||||
local bf_dbl = bloomfilter_basic_init2(4, 10);
|
||||
bloomfilter_add(bf_dbl, 4.2);
|
||||
bloomfilter_add(bf_dbl, 3.14);
|
||||
print bloomfilter_lookup(bf_dbl, 4.2);
|
||||
print bloomfilter_lookup(bf_dbl, 3.14);
|
||||
|
||||
# Basic usage with strings.
|
||||
local bf_str = bloomfilter_basic_init(0.9, 10);
|
||||
bloomfilter_add(bf_str, "foo");
|
||||
bloomfilter_add(bf_str, "bar");
|
||||
print bloomfilter_lookup(bf_str, "foo");
|
||||
print bloomfilter_lookup(bf_str, "bar");
|
||||
print bloomfilter_lookup(bf_str, "b4z"); # FP
|
||||
print bloomfilter_lookup(bf_str, "quux"); # FP
|
||||
print bloomfilter_lookup(bf_str, "b4zzz"); # FP
|
||||
print bloomfilter_lookup(bf_str, "quuux"); # FP
|
||||
bloomfilter_add(bf_str, 0.5); # Type mismatch
|
||||
bloomfilter_add(bf_str, 100); # Type mismatch
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue