mirror of
https://github.com/zeek/zeek.git
synced 2025-10-04 15:48:19 +00:00
Changing the Bloom filter hashing so that it's independent of
CompositeHash. We do this by hashing values added to a BloomFilter one more time with a stable hash seeded only by either the filter's name or the global_hash_seed (or Bro's random() seed if neither is defined). I'm also adding a new bif bloomfilter_internal_state() that returns a string representation of a Bloom filter's current internal state. This is solely for writing tests that check that the filters end up consistent when seeded with the same value.
This commit is contained in:
parent
6c197fbebf
commit
2a0790c231
14 changed files with 157 additions and 59 deletions
|
@ -3042,6 +3042,11 @@ module GLOBAL;
|
|||
## Number of bytes per packet to capture from live interfaces.
|
||||
const snaplen = 8192 &redef;
|
||||
|
||||
## Seed for hashes computed internally for probabilistic data structures. Using
|
||||
## the same value here will make the hashes compatible between independent Bro
|
||||
## instances. If left unset, Bro will use a temporary local seed.
|
||||
const global_hash_seed: string = "" &redef;
|
||||
|
||||
# Load BiFs defined by plugins.
|
||||
@load base/bif/plugins
|
||||
|
||||
|
|
|
@ -566,14 +566,14 @@ BroType* BloomFilterVal::Type() const
|
|||
void BloomFilterVal::Add(const Val* val)
|
||||
{
|
||||
HashKey* key = hash->ComputeHash(val, 1);
|
||||
bloom_filter->Add(key->Hash());
|
||||
bloom_filter->Add(key);
|
||||
delete key;
|
||||
}
|
||||
|
||||
size_t BloomFilterVal::Count(const Val* val) const
|
||||
{
|
||||
HashKey* key = hash->ComputeHash(val, 1);
|
||||
size_t cnt = bloom_filter->Count(key->Hash());
|
||||
size_t cnt = bloom_filter->Count(key);
|
||||
delete key;
|
||||
return cnt;
|
||||
}
|
||||
|
@ -588,6 +588,11 @@ bool BloomFilterVal::Empty() const
|
|||
return bloom_filter->Empty();
|
||||
}
|
||||
|
||||
string BloomFilterVal::InternalState() const
|
||||
{
|
||||
return bloom_filter->InternalState();
|
||||
}
|
||||
|
||||
BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x,
|
||||
const BloomFilterVal* y)
|
||||
{
|
||||
|
|
|
@ -127,6 +127,7 @@ public:
|
|||
size_t Count(const Val* val) const;
|
||||
void Clear();
|
||||
bool Empty() const;
|
||||
string InternalState() const;
|
||||
|
||||
static BloomFilterVal* Merge(const BloomFilterVal* x,
|
||||
const BloomFilterVal* y);
|
||||
|
|
|
@ -490,6 +490,16 @@ BitVector::size_type BitVector::FindNext(size_type i) const
|
|||
return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1);
|
||||
}
|
||||
|
||||
size_t BitVector::Hash() const
|
||||
{
|
||||
size_t hash = 0;
|
||||
|
||||
for ( size_type i = 0; i < Blocks(); ++i )
|
||||
hash += bits[i];
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::lowest_bit(block_type block)
|
||||
{
|
||||
block_type x = block - (block & (block - 1));
|
||||
|
|
|
@ -276,6 +276,13 @@ public:
|
|||
*/
|
||||
size_type FindNext(size_type i) const;
|
||||
|
||||
/** Computes a hash value of the internal representation.
|
||||
* This is mainly for debugging/testing purposes.
|
||||
*
|
||||
* @return The hash.
|
||||
*/
|
||||
size_t Hash() const;
|
||||
|
||||
/**
|
||||
* Serializes the bit vector.
|
||||
*
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
#include "CounterVector.h"
|
||||
#include "Serializer.h"
|
||||
|
||||
#include "../util.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
BloomFilter::BloomFilter()
|
||||
|
@ -107,6 +109,11 @@ BasicBloomFilter* BasicBloomFilter::Clone() const
|
|||
return copy;
|
||||
}
|
||||
|
||||
std::string BasicBloomFilter::InternalState() const
|
||||
{
|
||||
return fmt("%" PRIu64, (uint64_t)bits->Hash());
|
||||
}
|
||||
|
||||
BasicBloomFilter::BasicBloomFilter()
|
||||
{
|
||||
bits = 0;
|
||||
|
@ -133,14 +140,18 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
|
|||
return (bits != 0);
|
||||
}
|
||||
|
||||
void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h)
|
||||
void BasicBloomFilter::Add(const HashKey* key)
|
||||
{
|
||||
Hasher::digest_vector h = hasher->Hash(key);
|
||||
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
bits->Set(h[i] % bits->Size());
|
||||
}
|
||||
|
||||
size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const
|
||||
size_t BasicBloomFilter::Count(const HashKey* key) const
|
||||
{
|
||||
Hasher::digest_vector h = hasher->Hash(key);
|
||||
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
{
|
||||
if ( ! (*bits)[h[i] % bits->Size()] )
|
||||
|
@ -206,6 +217,11 @@ CountingBloomFilter* CountingBloomFilter::Clone() const
|
|||
return copy;
|
||||
}
|
||||
|
||||
string CountingBloomFilter::InternalState() const
|
||||
{
|
||||
return fmt("%" PRIu64, (uint64_t)cells->Hash());
|
||||
}
|
||||
|
||||
IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER)
|
||||
|
||||
bool CountingBloomFilter::DoSerialize(SerialInfo* info) const
|
||||
|
@ -222,14 +238,18 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info)
|
|||
}
|
||||
|
||||
// TODO: Use partitioning in add/count to allow for reusing CMS bounds.
|
||||
void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h)
|
||||
void CountingBloomFilter::Add(const HashKey* key)
|
||||
{
|
||||
Hasher::digest_vector h = hasher->Hash(key);
|
||||
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
cells->Increment(h[i] % cells->Size());
|
||||
}
|
||||
|
||||
size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const
|
||||
size_t CountingBloomFilter::Count(const HashKey* key) const
|
||||
{
|
||||
Hasher::digest_vector h = hasher->Hash(key);
|
||||
|
||||
CounterVector::size_type min =
|
||||
std::numeric_limits<CounterVector::size_type>::max();
|
||||
|
||||
|
|
|
@ -22,27 +22,20 @@ public:
|
|||
virtual ~BloomFilter();
|
||||
|
||||
/**
|
||||
* Adds an element of type T to the Bloom filter.
|
||||
* @param x The element to add
|
||||
* Adds an element to the Bloom filter.
|
||||
*
|
||||
* @param key The key associated with the element to add.
|
||||
*/
|
||||
template <typename T>
|
||||
void Add(const T& x)
|
||||
{
|
||||
AddImpl((*hasher)(x));
|
||||
}
|
||||
virtual void Add(const HashKey* key) = 0;
|
||||
|
||||
/**
|
||||
* Retrieves the associated count of a given value.
|
||||
*
|
||||
* @param x The value of type `T` to check.
|
||||
* @param key The key associated with the element to check.
|
||||
*
|
||||
* @return The counter associated with *x*.
|
||||
* @return The counter associated with *key*.
|
||||
*/
|
||||
template <typename T>
|
||||
size_t Count(const T& x) const
|
||||
{
|
||||
return CountImpl((*hasher)(x));
|
||||
}
|
||||
virtual size_t Count(const HashKey* key) const = 0;
|
||||
|
||||
/**
|
||||
* Checks whether the Bloom filter is empty.
|
||||
|
@ -72,6 +65,12 @@ public:
|
|||
*/
|
||||
virtual BloomFilter* Clone() const = 0;
|
||||
|
||||
/**
|
||||
* Returns a string with a representation of the Bloom filter's
|
||||
* internal state. This is for debugging/testing purposes only.
|
||||
*/
|
||||
virtual string InternalState() const = 0;
|
||||
|
||||
/**
|
||||
* Serializes the Bloom filter.
|
||||
*
|
||||
|
@ -106,25 +105,6 @@ protected:
|
|||
*/
|
||||
BloomFilter(const Hasher* hasher);
|
||||
|
||||
/**
|
||||
* Abstract method for implementing the *Add* operation.
|
||||
*
|
||||
* @param hashes A set of *k* hashes for the item to add, computed by
|
||||
* the internal hasher object.
|
||||
*
|
||||
*/
|
||||
virtual void AddImpl(const Hasher::digest_vector& hashes) = 0;
|
||||
|
||||
/**
|
||||
* Abstract method for implementing the *Count* operation.
|
||||
*
|
||||
* @param hashes A set of *k* hashes for the item to add, computed by
|
||||
* the internal hasher object.
|
||||
*
|
||||
* @return Returns the counter associated with the hashed element.
|
||||
*/
|
||||
virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0;
|
||||
|
||||
const Hasher* hasher;
|
||||
};
|
||||
|
||||
|
@ -177,6 +157,7 @@ public:
|
|||
virtual void Clear();
|
||||
virtual bool Merge(const BloomFilter* other);
|
||||
virtual BasicBloomFilter* Clone() const;
|
||||
virtual string InternalState() const;
|
||||
|
||||
protected:
|
||||
DECLARE_SERIAL(BasicBloomFilter);
|
||||
|
@ -187,8 +168,8 @@ protected:
|
|||
BasicBloomFilter();
|
||||
|
||||
// Overridden from BloomFilter.
|
||||
virtual void AddImpl(const Hasher::digest_vector& h);
|
||||
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
|
||||
virtual void Add(const HashKey* key);
|
||||
virtual size_t Count(const HashKey* key) const;
|
||||
|
||||
private:
|
||||
BitVector* bits;
|
||||
|
@ -216,6 +197,7 @@ public:
|
|||
virtual void Clear();
|
||||
virtual bool Merge(const BloomFilter* other);
|
||||
virtual CountingBloomFilter* Clone() const;
|
||||
virtual string InternalState() const;
|
||||
|
||||
protected:
|
||||
DECLARE_SERIAL(CountingBloomFilter);
|
||||
|
@ -226,8 +208,8 @@ protected:
|
|||
CountingBloomFilter();
|
||||
|
||||
// Overridden from BloomFilter.
|
||||
virtual void AddImpl(const Hasher::digest_vector& h);
|
||||
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
|
||||
virtual void Add(const HashKey* key);
|
||||
virtual size_t Count(const HashKey* key) const;
|
||||
|
||||
private:
|
||||
CounterVector* cells;
|
||||
|
|
|
@ -153,6 +153,11 @@ CounterVector operator|(const CounterVector& x, const CounterVector& y)
|
|||
|
||||
}
|
||||
|
||||
size_t CounterVector::Hash() const
|
||||
{
|
||||
return bits->Hash();
|
||||
}
|
||||
|
||||
bool CounterVector::Serialize(SerialInfo* info) const
|
||||
{
|
||||
return SerialObj::Serialize(info);
|
||||
|
|
|
@ -126,6 +126,13 @@ public:
|
|||
*/
|
||||
CounterVector& operator|=(const CounterVector& other);
|
||||
|
||||
/** Computes a hash value of the internal representation.
|
||||
* This is mainly for debugging/testing purposes.
|
||||
*
|
||||
* @return The hash.
|
||||
*/
|
||||
size_t Hash() const;
|
||||
|
||||
/**
|
||||
* Serializes the bit vector.
|
||||
*
|
||||
|
|
|
@ -31,6 +31,11 @@ size_t Hasher::MakeSeed(const void* data, size_t size)
|
|||
return *reinterpret_cast<size_t*>(buf); // Use the first bytes as seed.
|
||||
}
|
||||
|
||||
Hasher::digest_vector Hasher::Hash(const HashKey* key) const
|
||||
{
|
||||
return Hash(key->Key(), key->Size());
|
||||
}
|
||||
|
||||
bool Hasher::Serialize(SerialInfo* info) const
|
||||
{
|
||||
return SerialObj::Serialize(info);
|
||||
|
@ -77,7 +82,6 @@ Hasher::Hasher(size_t arg_k, size_t arg_seed)
|
|||
seed = arg_seed;
|
||||
}
|
||||
|
||||
|
||||
UHF::UHF(size_t seed)
|
||||
: h(seed)
|
||||
{
|
||||
|
|
|
@ -50,6 +50,15 @@ public:
|
|||
return Hash(&x, sizeof(T));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param key The key of the value to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
digest_vector Hash(const HashKey* key) const;
|
||||
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
|
|
|
@ -20,11 +20,6 @@ module GLOBAL;
|
|||
|
||||
## Creates a basic Bloom filter.
|
||||
##
|
||||
## .. note:: A Bloom filter can have a name associated with it. In the future,
|
||||
## Bloom filters with the same name will be compatible across independent Bro
|
||||
## instances, i.e., it will be possible to merge them. Currently, however, that is
|
||||
## not yet supported.
|
||||
##
|
||||
## fp: The desired false-positive rate.
|
||||
##
|
||||
## capacity: the maximum number of elements that guarantees a false-positive
|
||||
|
@ -61,11 +56,6 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
|
|||
## alternative to bloomfilter_basic_init where the user has full control over
|
||||
## the number of hash functions and cells in the underlying bit vector.
|
||||
##
|
||||
## .. note:: A Bloom filter can have a name associated with it. In the future,
|
||||
## Bloom filters with the same name will be compatible across independent Bro
|
||||
## instances, i.e., it will be possible to merge them. Currently, however, that is
|
||||
## not yet supported.
|
||||
##
|
||||
## k: The number of hash functions to use.
|
||||
##
|
||||
## cells: The number of cells of the underlying bit vector.
|
||||
|
@ -102,11 +92,6 @@ function bloomfilter_basic_init2%(k: count, cells: count,
|
|||
|
||||
## Creates a counting Bloom filter.
|
||||
##
|
||||
## .. note:: A Bloom filter can have a name associated with it. In the future,
|
||||
## Bloom filters with the same name will be compatible across independent Bro
|
||||
## instances, i.e., it will be possible to merge them. Currently, however, that is
|
||||
## not yet supported.
|
||||
##
|
||||
## k: The number of hash functions to use.
|
||||
##
|
||||
## cells: The number of cells of the underlying counter vector. As there's no
|
||||
|
@ -250,3 +235,13 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter,
|
|||
|
||||
return BloomFilterVal::Merge(bfv1, bfv2);
|
||||
%}
|
||||
|
||||
## Returns a string with a representation of a Bloom filter's internal
|
||||
## state. This is for debugging/testing purposes only.
|
||||
##
|
||||
## bf: The Bloom filter handle.
|
||||
function bloomfilter_internal_state%(bf: opaque of bloomfilter%): string
|
||||
%{
|
||||
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
|
||||
return new StringVal(bfv->InternalState());
|
||||
%}
|
||||
|
|
8
testing/btest/Baseline/bifs.bloomfilter-seed/output
Normal file
8
testing/btest/Baseline/bifs.bloomfilter-seed/output
Normal file
|
@ -0,0 +1,8 @@
|
|||
bf1, global_seed, 1
|
||||
bf2, global_seed, 5
|
||||
bf3, my_seed, 5
|
||||
bf4, my_seed, 6
|
||||
bf1, global_seed, 5
|
||||
bf2, global_seed, 6
|
||||
bf3, my_seed, 5
|
||||
bf4, my_seed, 6
|
40
testing/btest/bifs/bloomfilter-seed.bro
Normal file
40
testing/btest/bifs/bloomfilter-seed.bro
Normal file
|
@ -0,0 +1,40 @@
|
|||
# @TEST-EXEC: bro -b %INPUT global_hash_seed="foo" >>output
|
||||
# @TEST-EXEC: bro -b %INPUT global_hash_seed="my_seed" >>output
|
||||
# @TEST-EXEC: btest-diff output
|
||||
|
||||
type Foo: record
|
||||
{
|
||||
a: count;
|
||||
b: string;
|
||||
};
|
||||
|
||||
function test_bloom_filter()
|
||||
{
|
||||
local bf1 = bloomfilter_basic_init(0.9, 10);
|
||||
bloomfilter_add(bf1, "foo");
|
||||
bloomfilter_add(bf1, "bar");
|
||||
|
||||
local bf2 = bloomfilter_basic_init(0.9, 10);
|
||||
bloomfilter_add(bf2, Foo($a=1, $b="xx"));
|
||||
bloomfilter_add(bf2, Foo($a=2, $b="yy"));
|
||||
|
||||
local bf3 = bloomfilter_basic_init(0.9, 10, "my_seed");
|
||||
bloomfilter_add(bf3, "foo");
|
||||
bloomfilter_add(bf3, "bar");
|
||||
|
||||
local bf4 = bloomfilter_basic_init(0.9, 10, "my_seed");
|
||||
bloomfilter_add(bf4, Foo($a=1, $b="xx"));
|
||||
bloomfilter_add(bf4, Foo($a=2, $b="yy"));
|
||||
|
||||
print "bf1, global_seed", bloomfilter_internal_state(bf1);
|
||||
print "bf2, global_seed", bloomfilter_internal_state(bf2);
|
||||
print "bf3, my_seed", bloomfilter_internal_state(bf3);
|
||||
print "bf4, my_seed", bloomfilter_internal_state(bf4);
|
||||
|
||||
|
||||
}
|
||||
|
||||
event bro_init()
|
||||
{
|
||||
test_bloom_filter();
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue