Merge branch 'topic/robin/bloom-filter-merge'

* topic/robin/bloom-filter-merge:
  Using a real hash function for hashing a BitVector's internal state.
  Support UHF hashing for >= UHASH_KEY_SIZE bytes.
  Changing the Bloom filter hashing so that it's independent of CompositeHash.
  Add new BiF for low-level Bloom filter initialization.
  Introduce global_hash_seed script variable.

Conflicts:
	testing/btest/Baseline/bifs.bloomfilter/output
This commit is contained in:
Robin Sommer 2013-08-01 10:52:08 -07:00
commit 32a403cdaf
19 changed files with 337 additions and 138 deletions

View file

@ -1,10 +1,12 @@
// See the file "COPYING" in the main distribution directory for copyright.
#include "BitVector.h"
#include <openssl/sha.h>
#include <cassert>
#include <limits>
#include "BitVector.h"
#include "Serializer.h"
#include "digest.h"
using namespace probabilistic;
@ -490,6 +492,21 @@ BitVector::size_type BitVector::FindNext(size_type i) const
return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1);
}
size_t BitVector::Hash() const
{
size_t hash = 0;
u_char buf[SHA256_DIGEST_LENGTH];
SHA256_CTX ctx;
sha256_init(&ctx);
for ( size_type i = 0; i < Blocks(); ++i )
sha256_update(&ctx, &bits[i], sizeof(bits[i]));
sha256_final(&ctx, buf);
return *reinterpret_cast<size_t*>(buf); // Use the first bytes as seed.
}
BitVector::size_type BitVector::lowest_bit(block_type block)
{
block_type x = block - (block & (block - 1));

View file

@ -276,6 +276,13 @@ public:
*/
size_type FindNext(size_type i) const;
/** Computes a hash value of the internal representation.
* This is mainly for debugging/testing purposes.
*
* @return The hash.
*/
size_t Hash() const;
/**
* Serializes the bit vector.
*

View file

@ -9,6 +9,8 @@
#include "CounterVector.h"
#include "Serializer.h"
#include "../util.h"
using namespace probabilistic;
BloomFilter::BloomFilter()
@ -107,6 +109,11 @@ BasicBloomFilter* BasicBloomFilter::Clone() const
return copy;
}
// Returns the hash of the underlying bit vector, formatted as an unsigned
// 64-bit number. For debugging/testing only.
std::string BasicBloomFilter::InternalState() const
	{
	const uint64_t digest = static_cast<uint64_t>(bits->Hash());
	return fmt("%" PRIu64, digest);
	}
BasicBloomFilter::BasicBloomFilter()
{
bits = 0;
@ -133,14 +140,18 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
return (bits != 0);
}
void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h)
void BasicBloomFilter::Add(const HashKey* key)
{
Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
bits->Set(h[i] % bits->Size());
}
size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const
size_t BasicBloomFilter::Count(const HashKey* key) const
{
Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
{
if ( ! (*bits)[h[i] % bits->Size()] )
@ -206,6 +217,11 @@ CountingBloomFilter* CountingBloomFilter::Clone() const
return copy;
}
// Returns the hash of the underlying counter vector, formatted as an
// unsigned 64-bit number. For debugging/testing only.
string CountingBloomFilter::InternalState() const
	{
	const uint64_t digest = static_cast<uint64_t>(cells->Hash());
	return fmt("%" PRIu64, digest);
	}
IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER)
bool CountingBloomFilter::DoSerialize(SerialInfo* info) const
@ -222,14 +238,18 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info)
}
// TODO: Use partitioning in add/count to allow for reusing CMS bounds.
void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h)
void CountingBloomFilter::Add(const HashKey* key)
{
Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
cells->Increment(h[i] % cells->Size());
}
size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const
size_t CountingBloomFilter::Count(const HashKey* key) const
{
Hasher::digest_vector h = hasher->Hash(key);
CounterVector::size_type min =
std::numeric_limits<CounterVector::size_type>::max();

View file

@ -22,27 +22,20 @@ public:
virtual ~BloomFilter();
/**
* Adds an element of type T to the Bloom filter.
* @param x The element to add
* Adds an element to the Bloom filter.
*
* @param key The key associated with the element to add.
*/
template <typename T>
void Add(const T& x)
{
AddImpl((*hasher)(x));
}
virtual void Add(const HashKey* key) = 0;
/**
* Retrieves the associated count of a given value.
*
* @param x The value of type `T` to check.
* @param key The key associated with the element to check.
*
* @return The counter associated with *x*.
* @return The counter associated with *key*.
*/
template <typename T>
size_t Count(const T& x) const
{
return CountImpl((*hasher)(x));
}
virtual size_t Count(const HashKey* key) const = 0;
/**
* Checks whether the Bloom filter is empty.
@ -72,6 +65,12 @@ public:
*/
virtual BloomFilter* Clone() const = 0;
/**
* Returns a string with a representation of the Bloom filter's
* internal state. This is for debugging/testing purposes only.
*/
virtual string InternalState() const = 0;
/**
* Serializes the Bloom filter.
*
@ -106,25 +105,6 @@ protected:
*/
BloomFilter(const Hasher* hasher);
/**
* Abstract method for implementinng the *Add* operation.
*
* @param hashes A set of *k* hashes for the item to add, computed by
* the internal hasher object.
*
*/
virtual void AddImpl(const Hasher::digest_vector& hashes) = 0;
/**
* Abstract method for implementing the *Count* operation.
*
* @param hashes A set of *k* hashes for the item to add, computed by
* the internal hasher object.
*
* @return Returns the counter associated with the hashed element.
*/
virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0;
const Hasher* hasher;
};
@ -177,6 +157,7 @@ public:
virtual void Clear();
virtual bool Merge(const BloomFilter* other);
virtual BasicBloomFilter* Clone() const;
virtual string InternalState() const;
protected:
DECLARE_SERIAL(BasicBloomFilter);
@ -187,8 +168,8 @@ protected:
BasicBloomFilter();
// Overridden from BloomFilter.
virtual void AddImpl(const Hasher::digest_vector& h);
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
virtual void Add(const HashKey* key);
virtual size_t Count(const HashKey* key) const;
private:
BitVector* bits;
@ -216,6 +197,7 @@ public:
virtual void Clear();
virtual bool Merge(const BloomFilter* other);
virtual CountingBloomFilter* Clone() const;
virtual string InternalState() const;
protected:
DECLARE_SERIAL(CountingBloomFilter);
@ -226,8 +208,8 @@ protected:
CountingBloomFilter();
// Overridden from BloomFilter.
virtual void AddImpl(const Hasher::digest_vector& h);
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
virtual void Add(const HashKey* key);
virtual size_t Count(const HashKey* key) const;
private:
CounterVector* cells;

View file

@ -153,6 +153,11 @@ CounterVector operator|(const CounterVector& x, const CounterVector& y)
}
// Hashes the counter vector by delegating to the hash of the bit vector
// that backs it.
size_t CounterVector::Hash() const
	{
	const size_t digest = bits->Hash();
	return digest;
	}
bool CounterVector::Serialize(SerialInfo* info) const
{
return SerialObj::Serialize(info);

View file

@ -126,6 +126,13 @@ public:
*/
CounterVector& operator|=(const CounterVector& other);
/** Computes a hash value of the internal representation.
* This is mainly for debugging/testing purposes.
*
* @return The hash.
*/
size_t Hash() const;
/**
* Serializes the counter vector.
*

View file

@ -1,13 +1,42 @@
// See the file "COPYING" in the main distribution directory for copyright.
#include <typeinfo>
#include <openssl/md5.h>
#include "Hasher.h"
#include "NetVar.h"
#include "digest.h"
#include "Serializer.h"
using namespace probabilistic;
size_t Hasher::MakeSeed(const void* data, size_t size)
{
u_char buf[SHA256_DIGEST_LENGTH];
SHA256_CTX ctx;
sha256_init(&ctx);
if ( data )
sha256_update(&ctx, data, size);
else if ( global_hash_seed && global_hash_seed->Len() > 0 )
sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
else
{
unsigned int first_seed = initial_seed();
sha256_update(&ctx, &first_seed, sizeof(first_seed));
}
sha256_final(&ctx, buf);
return *reinterpret_cast<size_t*>(buf); // Use the first bytes as seed.
}
// Convenience overload: hashes the bytes that a HashKey refers to by
// forwarding its key pointer and size to the byte-oriented Hash().
Hasher::digest_vector Hasher::Hash(const HashKey* key) const
	{
	const void* bytes = key->Key();
	return Hash(bytes, key->Size());
	}
bool Hasher::Serialize(SerialInfo* info) const
{
return SerialObj::Serialize(info);
@ -25,7 +54,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const
if ( ! SERIALIZE(static_cast<uint16>(k)) )
return false;
return SERIALIZE_STR(name.c_str(), name.size());
return SERIALIZE(static_cast<uint64>(seed));
}
bool Hasher::DoUnserialize(UnserialInfo* info)
@ -39,62 +68,52 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
k = serial_k;
assert(k > 0);
const char* serial_name;
if ( ! UNSERIALIZE_STR(&serial_name, 0) )
uint64 serial_seed;
if ( ! UNSERIALIZE(&serial_seed) )
return false;
name = serial_name;
delete [] serial_name;
seed = serial_seed;
return true;
}
Hasher::Hasher(size_t k, const std::string& arg_name)
: k(k)
Hasher::Hasher(size_t arg_k, size_t arg_seed)
{
k = k;
name = arg_name;
k = arg_k;
seed = arg_seed;
}
UHF::UHF(size_t seed, const std::string& extra)
: h(compute_seed(seed, extra))
UHF::UHF(size_t arg_seed)
: h(arg_seed)
{
seed = arg_seed;
}
// Hashes the *n* bytes starting at *x*.
//
// Inputs of at most UHASH_KEY_SIZE bytes are passed to the H3 function *h*
// (an empty input hashes to 0). Longer inputs are digested with MD5, XORed
// with the bytes of the seed, and digested once more. The long-input path
// is almost equivalent to HashKey::HashBytes, except that it does not
// depend on global state and mixes in the instance's own seed.
Hasher::digest UHF::hash(const void* x, size_t n) const
	{
	if ( n <= UHASH_KEY_SIZE )
		return n == 0 ? 0 : h(x, n);

	unsigned char d[16];
	MD5(reinterpret_cast<const unsigned char*>(x), n, d);

	// Fold the seed into the intermediate digest, repeating its bytes as
	// often as needed to cover all 16 digest bytes.
	const unsigned char* s = reinterpret_cast<const unsigned char*>(&seed);

	for ( size_t i = 0; i < 16; ++i )
		d[i] ^= s[i % sizeof(seed)];

	MD5(d, 16, d);

	// Assemble the result from as many digest bytes as the digest type
	// holds. Returning d[0] alone would yield just a single byte, i.e.,
	// at most 256 distinct hash values for all long inputs.
	Hasher::digest result = 0;
	unsigned char* out = reinterpret_cast<unsigned char*>(&result);

	for ( size_t i = 0; i < sizeof(result) && i < sizeof(d); ++i )
		out[i] = d[i];

	return result;
	}
size_t UHF::compute_seed(size_t seed, const std::string& extra)
DefaultHasher::DefaultHasher(size_t k, size_t seed)
: Hasher(k, seed)
{
u_char buf[SHA256_DIGEST_LENGTH];
SHA256_CTX ctx;
sha256_init(&ctx);
if ( extra.empty() )
{
unsigned int first_seed = initial_seed();
sha256_update(&ctx, &first_seed, sizeof(first_seed));
}
else
sha256_update(&ctx, extra.c_str(), extra.size());
sha256_update(&ctx, &seed, sizeof(seed));
sha256_final(&ctx, buf);
// Take the first sizeof(size_t) bytes as seed.
return *reinterpret_cast<size_t*>(buf);
}
DefaultHasher::DefaultHasher(size_t k, const std::string& name)
: Hasher(k, name)
{
for ( size_t i = 0; i < k; ++i )
hash_functions.push_back(UHF(i, name));
for ( size_t i = 1; i <= k; ++i )
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
}
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
@ -137,13 +156,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
hash_functions.clear();
for ( size_t i = 0; i < K(); ++i )
hash_functions.push_back(UHF(i, Name()));
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
return true;
}
DoubleHasher::DoubleHasher(size_t k, const std::string& name)
: Hasher(k, name), h1(1, name), h2(2, name)
DoubleHasher::DoubleHasher(size_t k, size_t seed)
: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
{
}
@ -187,8 +206,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(Hasher);
h1 = UHF(1, Name());
h2 = UHF(2, Name());
h1 = UHF(Seed() + bro_prng(1));
h2 = UHF(Seed() + bro_prng(2));
return true;
}

View file

@ -18,6 +18,20 @@ public:
typedef hash_t digest;
typedef std::vector<digest> digest_vector;
/**
* Creates a valid hasher seed from an arbitrary string.
*
* @param data A pointer to contiguous data that should be crunched into a
* seed. If 0, the function tries to find a global_hash_seed script variable
* to derive a seed from. If this variable does not exist, the function uses
* the initial seed generated at Bro startup.
*
* @param size The number of bytes of *data*.
*
* @return A seed suitable for hashers.
*/
static size_t MakeSeed(const void* data, size_t size);
/**
* Destructor.
*/
@ -36,6 +50,15 @@ public:
return Hash(&x, sizeof(T));
}
/**
* Computes hash values for an element.
*
* @param key The key of the value to hash.
*
* @return Vector of *k* hash values.
*/
digest_vector Hash(const HashKey* key) const;
/**
* Computes the hashes for a set of bytes.
*
@ -64,11 +87,9 @@ public:
size_t K() const { return k; }
/**
* Returns the hasher's name. If not empty, the hasher uses this descriptor
* to seed its *k* hash functions. Otherwise the hasher mixes in the initial
* seed derived from the environment variable `$BRO_SEED`.
* Returns the seed used to construct the hasher.
*/
const std::string& Name() const { return name; }
size_t Seed() const { return seed; }
bool Serialize(SerialInfo* info) const;
static Hasher* Unserialize(UnserialInfo* info);
@ -81,16 +102,15 @@ protected:
/**
* Constructor.
*
* @param k the number of hash functions.
* @param arg_k the number of hash functions.
*
* @param name A name for the hasher. Hashers with the same name
* should provide consistent results.
* @param arg_seed The seed for the hasher.
*/
Hasher(size_t k, const std::string& name);
Hasher(size_t arg_k, size_t arg_seed);
private:
size_t k;
std::string name;
size_t seed;
};
/**
@ -103,13 +123,9 @@ public:
* Constructs an H3 hash function seeded with a given seed.
*
* @param seed The seed to use for this instance.
*
* @param extra If not empty, this parameter replaces the initial
* seed to compute the seed for t to compute the seed NUL-terminated
* string as additional seed.
* @param arg_seed The seed to use for this instance.
*/
UHF(size_t seed = 0, const std::string& extra = "");
UHF(size_t arg_seed = 0);
template <typename T>
Hasher::digest operator()(const T& x) const
@ -152,9 +168,10 @@ public:
}
private:
static size_t compute_seed(size_t seed, const std::string& extra);
static size_t compute_seed(size_t seed);
H3<Hasher::digest, UHASH_KEY_SIZE> h;
size_t seed;
};
@ -169,9 +186,9 @@ public:
*
* @param k The number of hash functions to use.
*
* @param name The name of the hasher.
* @param seed The seed for the hasher.
*/
DefaultHasher(size_t k, const std::string& name = "");
DefaultHasher(size_t k, size_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
@ -197,9 +214,9 @@ public:
*
* @param k The number of hash functions to use.
*
* @param name The name of the hasher.
* @param seed The seed for the hasher.
*/
DoubleHasher(size_t k, const std::string& name = "");
DoubleHasher(size_t k, size_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const /* final */;

View file

@ -20,23 +20,20 @@ module GLOBAL;
## Creates a basic Bloom filter.
##
## .. note:: A Bloom filter can have a name associated with it. In the future,
## Bloom filters with the same name will be compatible across indepedent Bro
## instances, i.e., it will be possible to merge them. Currently, however, that is
## not yet supported.
##
## fp: The desired false-positive rate.
##
## capacity: the maximum number of elements that guarantees a false-positive
## rate of *fp*.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
## the filter will remain tied to the current Bro process.
## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use
## a local seed tied to the current Bro process. Only filters with the same seed
## can be merged with :bro:id:`bloomfilter_merge` .
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear bloomfilter_merge
## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
function bloomfilter_basic_init%(fp: double, capacity: count,
name: string &default=""%): opaque of bloomfilter
%{
@ -48,18 +45,53 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
size_t cells = BasicBloomFilter::M(fp, capacity);
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
const Hasher* h = new DefaultHasher(optimal_k, name->CheckString());
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DefaultHasher(optimal_k, seed);
return new BloomFilterVal(new BasicBloomFilter(h, cells));
%}
## Creates a basic Bloom filter. This function serves as a low-level
## alternative to :bro:id:`bloomfilter_basic_init` where the user has full
## control over the number of hash functions and cells in the underlying bit
## vector.
##
## k: The number of hash functions to use. Must be greater than zero.
##
## cells: The number of cells of the underlying bit vector. Must be greater
##        than zero.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
##       the filter will use :bro:id:`global_hash_seed` if that's set, and
##       otherwise use a local seed tied to the current Bro process. Only
##       filters with the same seed can be merged with
##       :bro:id:`bloomfilter_merge` .
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
##    bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
function bloomfilter_basic_init2%(k: count, cells: count,
				   name: string &default=""%): opaque of bloomfilter
	%{
	// Both parameters are unsigned counts, so zero is the only invalid value.
	if ( k == 0 )
		{
		reporter->Error("number of hash functions must be positive");
		return 0;
		}

	if ( cells == 0 )
		{
		reporter->Error("number of cells must be positive");
		return 0;
		}

	// An empty name passes a null pointer, for which MakeSeed() picks the
	// seed material itself.
	size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
				       name->Len());
	const Hasher* h = new DefaultHasher(k, seed);

	return new BloomFilterVal(new BasicBloomFilter(h, cells));
	%}
## Creates a counting Bloom filter.
##
## .. note:: A Bloom filter can have a name associated with it. In the future,
## Bloom filters with the same name will be compatible across indepedent Bro
## instances, i.e., it will be possible to merge them. Currently, however, that is
## not yet supported.
##
## k: The number of hash functions to use.
##
## cells: The number of cells of the underlying counter vector. As there's no
@ -71,12 +103,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
## becomes a cell of size *w* bits.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
## the filter will remain tied to the current Bro process.
## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use
## a local seed tied to the current Bro process. Only filters with the same seed
## can be merged with :bro:id:`bloomfilter_merge` .
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear bloomfilter_merge
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
function bloomfilter_counting_init%(k: count, cells: count, max: count,
name: string &default=""%): opaque of bloomfilter
%{
@ -86,7 +120,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
return 0;
}
const Hasher* h = new DefaultHasher(k, name->CheckString());
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DefaultHasher(k, seed);
uint16 width = 1;
while ( max >>= 1 )
@ -101,8 +138,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
##
## x: The element to add.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup
## bloomfilter_clear bloomfilter_merge
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear
## bloomfilter_merge
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
%{
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
@ -127,8 +165,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
##
## Returns: the counter associated with *x* in *bf*.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
## bloomfilter_add bloomfilter_clear bloomfilter_merge
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_clear
## bloomfilter_merge
function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
%{
const BloomFilterVal* bfv = static_cast<const BloomFilterVal*>(bf);
@ -154,8 +193,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
##
## bf: The Bloom filter handle.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
## bloomfilter_add bloomfilter_lookup bloomfilter_merge
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_merge
function bloomfilter_clear%(bf: opaque of bloomfilter%): any
%{
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
@ -178,8 +218,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
##
## Returns: The union of *bf1* and *bf2*.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
## bloomfilter_add bloomfilter_lookup bloomfilter_clear
## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear
function bloomfilter_merge%(bf1: opaque of bloomfilter,
bf2: opaque of bloomfilter%): opaque of bloomfilter
%{
@ -196,3 +237,13 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter,
return BloomFilterVal::Merge(bfv1, bfv2);
%}
## Produces a string describing the internal state of a Bloom filter.
## Intended solely as an aid for debugging and testing.
##
## bf: The Bloom filter handle.
function bloomfilter_internal_state%(bf: opaque of bloomfilter%): string
	%{
	return new StringVal(static_cast<BloomFilterVal*>(bf)->InternalState());
	%}