Factor implementation and change interface.

When constructing a Bloom filter, one now has to pass a HashPolicy instance to
it. This separates more clearly the concerns of hashing and Bloom filter
management.

This commit also changes the interface for initializing Bloom filters: there
are now two initialization functions, one for each type:

  (1) bloomfilter_basic_init(fp: double,
                             capacity: count,
                             name: string &default=""): opaque of bloomfilter

  (2) bloomfilter_counting_init(k: count,
                                cells: count,
                                max: count,
                                name: string &default=""): opaque of bloomfilter

The BiFs for adding elements and performing lookups remain the same. This
essentially gives us "BiF polymorphism" at script land, where the
initialization BiF constructs the most derived type while subsequent BiFs
adhere to the same interface.

The reason why we split up the constructor in this case is that we have not yet
derived the math that computes the optimal number of hash functions for
counting Bloom filters---users have to explicitly parameterize them for now.
This commit is contained in:
Matthias Vallentin 2013-06-17 16:06:02 -07:00
parent 9f74064289
commit 532fbfb4d2
11 changed files with 409 additions and 319 deletions

View file

@ -1,117 +1,16 @@
#include "BloomFilter.h" #include "BloomFilter.h"
#include <cmath> #include <cmath>
#include "CounterVector.h"
#include "Serializer.h" #include "Serializer.h"
CounterVector::CounterVector(size_t width, size_t cells)
: bits_(new BitVector(width * cells)), width_(width)
{
}
CounterVector::~CounterVector()
{
delete bits_;
}
bool CounterVector::Increment(size_type cell, count_type value)
{
// TODO
assert(! "not yet implemented");
return false;
}
bool CounterVector::Decrement(size_type cell, count_type value)
{
// TODO
assert(! "not yet implemented");
return false;
}
CounterVector::count_type CounterVector::Count(size_type cell) const
{
// TODO
assert(! "not yet implemented");
return 0;
}
CounterVector::size_type CounterVector::Size() const
{
return bits_->Blocks() / width_;
}
bool CounterVector::Serialize(SerialInfo* info) const
{
return SerialObj::Serialize(info);
}
CounterVector* CounterVector::Unserialize(UnserialInfo* info)
{
return reinterpret_cast<CounterVector*>(
SerialObj::Unserialize(info, SER_COUNTERVECTOR));
}
IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR)
bool CounterVector::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj);
if ( ! bits_->Serialize(info) )
return false;
return SERIALIZE(static_cast<uint64>(width_));
}
bool CounterVector::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(SerialObj);
bits_ = BitVector::Unserialize(info);
if ( ! bits_ )
return false;
uint64 width;
if ( ! UNSERIALIZE(&width) )
return false;
width_ = static_cast<size_t>(width);
return true;
}
HashPolicy::Hasher::Hasher(size_t seed)
: h3_(seed)
{
}
HashPolicy::HashType
HashPolicy::Hasher::operator()(const void* x, size_t n) const
{
return n == 0 ? 0 : h3_(x, n);
}
HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const
{
HashVector h(K(), 0);
for ( size_t i = 0; i < h.size(); ++i )
h[i] = hashers_[i](x, n);
return h;
}
HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const
{
HashType h1 = hasher1_(x, n);
HashType h2 = hasher2_(x, n);
HashVector h(K(), 0);
for ( size_t i = 0; i < h.size(); ++i )
h[i] = h1 + i * h2;
return h;
}
BloomFilter::BloomFilter() BloomFilter::BloomFilter()
: hash_(NULL) : hash_(NULL)
{ {
} }
BloomFilter::BloomFilter(size_t k) BloomFilter::BloomFilter(const HashPolicy* hash_policy)
: hash_(new hash_policy(k)) : hash_(hash_policy)
{ {
} }
@ -135,7 +34,11 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info)
bool BloomFilter::DoSerialize(SerialInfo* info) const bool BloomFilter::DoSerialize(SerialInfo* info) const
{ {
DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); DO_SERIALIZE(SER_BLOOMFILTER, SerialObj);
return SERIALIZE(static_cast<uint16>(hash_->K())); // FIXME: Since we have a fixed hashing policy, we just serialize the
// information needed to reconstruct it.
if ( ! SERIALIZE(static_cast<uint16>(hash_->K())) )
return false;
return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size());
} }
bool BloomFilter::DoUnserialize(UnserialInfo* info) bool BloomFilter::DoUnserialize(UnserialInfo* info)
@ -144,10 +47,15 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info)
uint16 k; uint16 k;
if ( ! UNSERIALIZE(&k) ) if ( ! UNSERIALIZE(&k) )
return false; return false;
hash_ = new hash_policy(static_cast<size_t>(k)); const char* name;
if ( ! UNSERIALIZE_STR(&name, 0) )
return false;
// FIXME: for now Bloom filters always use the default hashing policy.
hash_ = new DefaultHashing(k, name);
return true; return true;
} }
size_t BasicBloomFilter::M(double fp, size_t capacity) size_t BasicBloomFilter::M(double fp, size_t capacity)
{ {
double ln2 = std::log(2); double ln2 = std::log(2);
@ -163,11 +71,9 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity)
BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x,
const BasicBloomFilter* y) const BasicBloomFilter* y)
{ {
// TODO: Ensure that x and y use the same HashPolicy before proceeding.
BasicBloomFilter* result = new BasicBloomFilter(); BasicBloomFilter* result = new BasicBloomFilter();
result->bits_ = new BitVector(*x->bits_ | *y->bits_); result->bits_ = new BitVector(*x->bits_ | *y->bits_);
// TODO: implement the hasher pool and make sure the new result gets the same
// number of (equal) hash functions.
//assert(x->hash_ == y->hash_);
return result; return result;
} }
@ -176,16 +82,10 @@ BasicBloomFilter::BasicBloomFilter()
{ {
} }
BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells)
: BloomFilter(K(M(fp, capacity), capacity)) : BloomFilter(hash_policy),
bits_(new BitVector(cells))
{ {
bits_ = new BitVector(M(fp, capacity));
}
BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity)
: BloomFilter(K(cells, capacity))
{
bits_ = new BitVector(cells);
} }
IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER)
@ -203,13 +103,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
return bits_ != NULL; return bits_ != NULL;
} }
void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h)
{ {
for ( size_t i = 0; i < h.size(); ++i ) for ( size_t i = 0; i < h.size(); ++i )
bits_->Set(h[i] % bits_->Size()); bits_->Set(h[i] % bits_->Size());
} }
size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const
{ {
for ( size_t i = 0; i < h.size(); ++i ) for ( size_t i = 0; i < h.size(); ++i )
if ( ! (*bits_)[h[i] % bits_->Size()] ) if ( ! (*bits_)[h[i] % bits_->Size()] )
@ -230,17 +130,9 @@ CountingBloomFilter::CountingBloomFilter()
{ {
} }
CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy,
size_t width) size_t cells, size_t width)
: BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), : BloomFilter(hash_policy)
capacity))
{
cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity));
}
CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity,
size_t width)
: BloomFilter(BasicBloomFilter::K(cells, capacity))
{ {
cells_ = new CounterVector(width, cells); cells_ = new CounterVector(width, cells);
} }
@ -261,18 +153,19 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info)
return cells_ != NULL; return cells_ != NULL;
} }
void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h)
{ {
for ( size_t i = 0; i < h.size(); ++i ) for ( size_t i = 0; i < h.size(); ++i )
cells_->Increment(h[i] % cells_->Size(), 1); cells_->Increment(h[i] % cells_->Size(), 1);
} }
size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const
{ {
CounterVector::size_type min = CounterVector::size_type min =
std::numeric_limits<CounterVector::size_type>::max(); std::numeric_limits<CounterVector::size_type>::max();
for ( size_t i = 0; i < h.size(); ++i ) for ( size_t i = 0; i < h.size(); ++i )
{ {
// TODO: Use partitioning.
CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size());
if ( cnt < min ) if ( cnt < min )
min = cnt; min = cnt;

View file

@ -3,141 +3,9 @@
#include <vector> #include <vector>
#include "BitVector.h" #include "BitVector.h"
#include "Hash.h" #include "HashPolicy.h"
#include "H3.h"
/** class CounterVector;
* A vector of counters, each of which have a fixed number of bits.
*/
class CounterVector : public SerialObj {
public:
typedef size_t size_type;
typedef uint64 count_type;
/**
* Constructs a counter vector having cells of a given width.
*
* @param width The number of bits that each cell occupies.
*
* @param cells The number of cells in the bitvector.
*/
CounterVector(size_t width, size_t cells = 1024);
~CounterVector();
/**
* Increments a given cell.
*
* @param cell The cell to increment.
*
* @param value The value to add to the current counter in *cell*.
*
* @return `true` if adding *value* to the counter in *cell* succeeded.
*/
bool Increment(size_type cell, count_type value);
/**
* Decrements a given cell.
*
* @param cell The cell to decrement.
*
* @param value The value to subtract from the current counter in *cell*.
*
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
*/
bool Decrement(size_type cell, count_type value);
/**
* Retrieves the counter of a given cell.
*
* @param cell The cell index to retrieve the count for.
*
* @return The counter associated with *cell*.
*/
count_type Count(size_type cell) const;
/**
* Retrieves the number of cells in the storage.
*
* @return The number of cells.
*/
size_type Size() const;
bool Serialize(SerialInfo* info) const;
static CounterVector* Unserialize(UnserialInfo* info);
protected:
DECLARE_SERIAL(CounterVector);
CounterVector() { }
private:
BitVector* bits_;
size_t width_;
};
/**
* The abstract base class for hash policies that hash elements *k* times.
* @tparam Codomain An integral type.
*/
class HashPolicy {
public:
typedef hash_t HashType;
typedef std::vector<HashType> HashVector;
virtual ~HashPolicy() { }
size_t K() const { return k_; }
virtual HashVector Hash(const void* x, size_t n) const = 0;
protected:
/**
* A functor that computes a universal hash function.
* @tparam Codomain An integral type.
*/
class Hasher {
public:
Hasher(size_t seed);
HashType operator()(const void* x, size_t n) const;
private:
// FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in
// Hash.h. I do not know how this value impacts the hash function behavior
// so I'll just copy it verbatim. (Matthias)
H3<HashType, 36> h3_;
};
HashPolicy(size_t k) : k_(k) { }
private:
const size_t k_;
};
/**
* The *default* hashing policy. Performs *k* hash function computations.
*/
class DefaultHashing : public HashPolicy {
public:
DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { }
virtual HashVector Hash(const void* x, size_t n) const;
private:
std::vector<Hasher> hashers_;
};
/**
* The *double-hashing* policy. Uses a linear combination of two hash functions.
*/
class DoubleHashing : public HashPolicy {
public:
DoubleHashing(size_t k) : HashPolicy(k) { }
virtual HashVector Hash(const void* x, size_t n) const;
private:
Hasher hasher1_;
Hasher hasher2_;
};
/** /**
* The abstract base class for Bloom filters. * The abstract base class for Bloom filters.
@ -146,8 +14,6 @@ class BloomFilter : public SerialObj {
public: public:
// At this point we won't let the user choose the hash policy, but we might // At this point we won't let the user choose the hash policy, but we might
// open up the interface in the future. // open up the interface in the future.
typedef DoubleHashing hash_policy;
virtual ~BloomFilter(); virtual ~BloomFilter();
/** /**
@ -180,13 +46,19 @@ protected:
DECLARE_ABSTRACT_SERIAL(BloomFilter); DECLARE_ABSTRACT_SERIAL(BloomFilter);
BloomFilter(); BloomFilter();
BloomFilter(size_t k);
virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; /**
virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; * Constructs a Bloom filter.
*
* @param hash_policy The hash policy to use for this Bloom filter.
*/
BloomFilter(const HashPolicy* hash_policy);
virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0;
virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0;
private: private:
HashPolicy* hash_; const HashPolicy* hash_;
}; };
/** /**
@ -223,24 +95,18 @@ public:
static BasicBloomFilter* Merge(const BasicBloomFilter* x, static BasicBloomFilter* Merge(const BasicBloomFilter* x,
const BasicBloomFilter* y); const BasicBloomFilter* y);
/**
* Constructs a basic Bloom filter with a given false-positive rate and
* capacity.
*/
BasicBloomFilter(double fp, size_t capacity);
/** /**
* Constructs a basic Bloom filter with a given number of cells and capacity. * Constructs a basic Bloom filter with a given number of cells and capacity.
*/ */
BasicBloomFilter(size_t cells, size_t capacity); BasicBloomFilter(const HashPolicy* hash_policy, size_t cells);
protected: protected:
DECLARE_SERIAL(BasicBloomFilter); DECLARE_SERIAL(BasicBloomFilter);
BasicBloomFilter(); BasicBloomFilter();
virtual void AddImpl(const HashPolicy::HashVector& h); virtual void AddImpl(const HashPolicy::hash_vector& h);
virtual size_t CountImpl(const HashPolicy::HashVector& h) const; virtual size_t CountImpl(const HashPolicy::hash_vector& h) const;
private: private:
BitVector* bits_; BitVector* bits_;
@ -254,16 +120,16 @@ public:
static CountingBloomFilter* Merge(const CountingBloomFilter* x, static CountingBloomFilter* Merge(const CountingBloomFilter* x,
const CountingBloomFilter* y); const CountingBloomFilter* y);
CountingBloomFilter(double fp, size_t capacity, size_t width); CountingBloomFilter(const HashPolicy* hash_policy, size_t cells,
CountingBloomFilter(size_t cells, size_t capacity, size_t width); size_t width);
protected: protected:
DECLARE_SERIAL(CountingBloomFilter); DECLARE_SERIAL(CountingBloomFilter);
CountingBloomFilter(); CountingBloomFilter();
virtual void AddImpl(const HashPolicy::HashVector& h); virtual void AddImpl(const HashPolicy::hash_vector& h);
virtual size_t CountImpl(const HashPolicy::HashVector& h) const; virtual size_t CountImpl(const HashPolicy::hash_vector& h) const;
private: private:
CounterVector* cells_; CounterVector* cells_;

View file

@ -255,6 +255,7 @@ set(bro_SRCS
ChunkedIO.cc ChunkedIO.cc
CompHash.cc CompHash.cc
Conn.cc Conn.cc
CounterVector.cc
DFA.cc DFA.cc
DbgBreakpoint.cc DbgBreakpoint.cc
DbgHelp.cc DbgHelp.cc
@ -278,6 +279,7 @@ set(bro_SRCS
Frame.cc Frame.cc
Func.cc Func.cc
Hash.cc Hash.cc
HashPolicy.cc
ID.cc ID.cc
IntSet.cc IntSet.cc
IOSource.cc IOSource.cc

75
src/CounterVector.cc Normal file
View file

@ -0,0 +1,75 @@
#include "CounterVector.h"
#include "BitVector.h"
#include "Serializer.h"
// Builds the backing bit vector with width * cells entries and remembers the
// per-cell width.
CounterVector::CounterVector(size_t width, size_t cells)
  : bits_(new BitVector(width * cells)),
    width_(width)
{
}
// Releases the bit vector held in bits_.
CounterVector::~CounterVector()
{
  delete bits_;
}
// Placeholder: incrementing a cell is not implemented yet. The assertion
// fires in debug builds; otherwise the call reports failure.
bool CounterVector::Increment(size_type cell, count_type value)
{
  // TODO
  (void) cell;
  (void) value;
  assert(! "not yet implemented");
  return false;
}
// Placeholder: decrementing a cell is not implemented yet. The assertion
// fires in debug builds; otherwise the call reports failure.
bool CounterVector::Decrement(size_type cell, count_type value)
{
  // TODO
  (void) cell;
  (void) value;
  assert(! "not yet implemented");
  return false;
}
// Placeholder: reading a cell is not implemented yet. The assertion fires in
// debug builds; otherwise the call yields 0.
CounterVector::count_type CounterVector::Count(size_type cell) const
{
  // TODO
  (void) cell;
  assert(! "not yet implemented");
  return 0;
}
// Returns the number of cells in the vector.
//
// The constructor creates the bit vector with width * cells entries, so the
// cell count is the total number of bits divided by the per-cell width.
// Dividing the number of storage blocks (Blocks()) by the width, as this
// function did before, does not yield the cell count.
CounterVector::size_type CounterVector::Size() const
{
  return bits_->Size() / width_;
}
// Public serialization entry point; simply forwards to SerialObj::Serialize
// and hands back its result.
bool CounterVector::Serialize(SerialInfo* info) const
{
  const bool success = SerialObj::Serialize(info);
  return success;
}
// Reconstructs a counter vector by asking SerialObj::Unserialize for an
// object of type SER_COUNTERVECTOR and converting the result to the derived
// type.
CounterVector* CounterVector::Unserialize(UnserialInfo* info)
{
  // CounterVector derives from SerialObj, so the downcast is expressed with
  // static_cast, which performs a real base-to-derived conversion instead of
  // merely reinterpreting the pointer value.
  return static_cast<CounterVector*>(
    SerialObj::Unserialize(info, SER_COUNTERVECTOR));
}
IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR)
// Writes the object state: first the bit vector, then the cell width as a
// 64-bit unsigned value. Stops at the first failing step.
bool CounterVector::DoSerialize(SerialInfo* info) const
{
  DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj);

  // && short-circuits, so the width is only written after the bits
  // serialized successfully.
  return bits_->Serialize(info) && SERIALIZE(static_cast<uint64>(width_));
}
// Reads the object state in the order DoSerialize wrote it: the bit vector
// first, then the 64-bit cell width. Returns false as soon as a step fails.
bool CounterVector::DoUnserialize(UnserialInfo* info)
{
  DO_UNSERIALIZE(SerialObj);

  bits_ = BitVector::Unserialize(info);

  uint64 width;

  // The width is only read when the bit vector could be restored.
  if ( ! bits_ || ! UNSERIALIZE(&width) )
    return false;

  width_ = static_cast<size_t>(width);
  return true;
}

78
src/CounterVector.h Normal file
View file

@ -0,0 +1,78 @@
#ifndef CounterVector_h
#define CounterVector_h
#include "SerialObj.h"
class BitVector;
/**
* A vector of counters, each of which have a fixed number of bits.
*/
class CounterVector : public SerialObj {
public:
typedef size_t size_type;
typedef uint64 count_type;
/**
* Constructs a counter vector having cells of a given width.
*
* @param width The number of bits that each cell occupies.
*
* @param cells The number of cells in the bitvector.
*/
CounterVector(size_t width, size_t cells = 1024);
~CounterVector();
/**
* Increments a given cell.
*
* @param cell The cell to increment.
*
* @param value The value to add to the current counter in *cell*.
*
* @return `true` if adding *value* to the counter in *cell* succeeded.
*/
bool Increment(size_type cell, count_type value);
/**
* Decrements a given cell.
*
* @param cell The cell to decrement.
*
* @param value The value to subtract from the current counter in *cell*.
*
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
*/
bool Decrement(size_type cell, count_type value);
/**
* Retrieves the counter of a given cell.
*
* @param cell The cell index to retrieve the count for.
*
* @return The counter associated with *cell*.
*/
count_type Count(size_type cell) const;
/**
* Retrieves the number of cells in the storage.
*
* @return The number of cells.
*/
size_type Size() const;
bool Serialize(SerialInfo* info) const;
static CounterVector* Unserialize(UnserialInfo* info);
protected:
DECLARE_SERIAL(CounterVector);
CounterVector() { }
private:
BitVector* bits_;
size_t width_;
};
#endif

72
src/HashPolicy.cc Normal file
View file

@ -0,0 +1,72 @@
#include "HashPolicy.h"

#include <cstring>

#include "digest.h"
// Seeds the underlying hash function with the value that compute_seed derives
// from the given seed and extra string.
Hasher::Hasher(size_t seed, const std::string& extra)
  : h_(compute_seed(seed, extra))
{
}
// Hashes the n bytes starting at x. An empty sequence always hashes to 0 and
// never reaches the underlying hash function.
Hasher::hash_type Hasher::operator()(const void* x, size_t n) const
{
  if ( n == 0 )
    return 0;

  return h_(x, n);
}
// Derives the seed for the underlying hash function.
//
// A SHA-256 digest is computed over either the initial seed (when *extra* is
// empty) or the bytes of *extra*, followed by the bytes of *seed*. The leading
// sizeof(size_t) bytes of that digest form the result.
size_t Hasher::compute_seed(size_t seed, const std::string& extra)
{
  u_char digest[SHA256_DIGEST_LENGTH];
  SHA256_CTX ctx;
  sha256_init(&ctx);

  if ( extra.empty() )
    {
    unsigned int first_seed = initial_seed();
    sha256_update(&ctx, &first_seed, sizeof(first_seed));
    }
  else
    sha256_update(&ctx, extra.c_str(), extra.size());

  sha256_update(&ctx, &seed, sizeof(seed));
  sha256_final(&ctx, digest);

  // Copy the bytes out instead of dereferencing a size_t* that points into a
  // u_char array: the array carries no alignment guarantee for size_t, and
  // reading it through such a pointer violates the aliasing rules.
  size_t result;
  std::memcpy(&result, digest, sizeof(result));
  return result;
}
// Stores the number of hash functions and the policy name.
HashPolicy::HashPolicy(size_t k, const std::string& name)
  : k_(k),
    name_(name)
{
}
// Creates k hashers. Hasher i is seeded with the index i together with the
// policy name.
DefaultHashing::DefaultHashing(size_t k, const std::string& name)
  : HashPolicy(k, name)
{
  // The final size is known up front, so allocate once instead of letting the
  // vector grow (and copy its hashers) repeatedly.
  hashers_.reserve(k);

  for ( size_t i = 0; i < k; ++i )
    hashers_.push_back(Hasher(i, name));
}
// Produces K() digests of the n bytes at x; digest i comes from hasher i.
HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const
{
  const size_t rounds = K();

  hash_vector digests;
  digests.reserve(rounds);

  for ( size_t idx = 0; idx != rounds; ++idx )
    digests.push_back(hashers_[idx](x, n));

  return digests;
}
// Sets up the two hashers that all digests are combined from; they are seeded
// with 1 and 2 respectively, plus the policy name.
DoubleHashing::DoubleHashing(size_t k, const std::string& name)
  : HashPolicy(k, name),
    hasher1_(1, name),
    hasher2_(2, name)
{
}
// Produces K() digests of the n bytes at x from just two hash computations:
// digest i is first + i * second.
HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const
{
  const hash_type first = hasher1_(x, n);
  const hash_type second = hasher2_(x, n);
  const size_t rounds = K();

  hash_vector digests;
  digests.reserve(rounds);

  for ( size_t round = 0; round < rounds; ++round )
    digests.push_back(first + round * second);

  return digests;
}

90
src/HashPolicy.h Normal file
View file

@ -0,0 +1,90 @@
#ifndef HashPolicy_h
#define HashPolicy_h
#include "Hash.h"
#include "H3.h"
/**
 * A function object that hashes byte sequences with an H3 hash function
 * seeded at construction time.
 */
class Hasher {
public:
  typedef hash_t hash_type;

  /**
   * Constructs a hasher from a seed and an optional extra descriptor.
   *
   * @param seed The seed to use.
   *
   * @param extra If non-empty, this string is mixed into the seed in place of
   * the initial seed.
   */
  Hasher(size_t seed, const std::string& extra = "");

  /**
   * Computes the hash digest of contiguous data.
   *
   * @param x A pointer to the beginning of the byte sequence to hash.
   *
   * @param n The length of the sequence pointed to by *x*.
   *
   * @return The digest of the sequence; 0 if *n* is 0.
   */
  hash_type operator()(const void* x, size_t n) const;

private:
  // Derives the value that seeds h_ from *seed* and *extra*.
  static size_t compute_seed(size_t seed, const std::string& extra);

  H3<hash_type, UHASH_KEY_SIZE> h_;
};
/**
 * Abstract interface for policies that turn one element into *k* hash
 * digests.
 */
class HashPolicy {
public:
  typedef Hasher::hash_type hash_type;
  typedef std::vector<hash_type> hash_vector;

  virtual ~HashPolicy() { }

  /**
   * Hashes a byte sequence.
   *
   * @param x A pointer to the beginning of the byte sequence to hash.
   *
   * @param n The length of the sequence pointed to by *x*.
   *
   * @return The digests computed for the sequence.
   */
  virtual hash_vector Hash(const void* x, size_t n) const = 0;

  /** @return The number of hash functions, as given at construction. */
  size_t K() const { return k_; }

  /** @return The name given at construction. */
  const std::string& Name() const { return name_; }

protected:
  /**
   * Initializes the state shared by all policies.
   *
   * @param k The number of hash functions.
   *
   * @param name The name of the policy instance.
   */
  HashPolicy(size_t k, const std::string& name);

private:
  const size_t k_;
  std::string name_;
};
/**
 * The default hashing policy: every digest comes from its own hasher, so
 * hashing an element costs *k* hash function computations.
 */
class DefaultHashing : public HashPolicy {
public:
  /**
   * @param k The number of hash functions.
   *
   * @param name The name passed on to the base class and to each hasher.
   */
  DefaultHashing(size_t k, const std::string& name);

  virtual hash_vector Hash(const void* x, size_t n) const /* override */;

private:
  std::vector<Hasher> hashers_;
};
/**
 * The *double-hashing* policy: all digests are linear combinations of the
 * output of two hash functions.
 */
class DoubleHashing : public HashPolicy {
public:
  /**
   * @param k The number of digests to produce per element.
   *
   * @param name The name passed on to the base class and to both hashers.
   */
  DoubleHashing(size_t k, const std::string& name);

  virtual hash_vector Hash(const void* x, size_t n) const /* override */;

private:
  Hasher hasher1_;
  Hasher hasher2_;
};
#endif

View file

@ -605,6 +605,7 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL);
bool BloomFilterVal::DoSerialize(SerialInfo* info) const bool BloomFilterVal::DoSerialize(SerialInfo* info) const
{ {
DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal);
assert( type_ );
if ( ! type_->Serialize(info) ) if ( ! type_->Serialize(info) )
return false; return false;
return bloom_filter_->Serialize(info); return bloom_filter_->Serialize(info);

View file

@ -4986,42 +4986,55 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr
#include "BloomFilter.h" #include "BloomFilter.h"
%%} %%}
## Initializes a Bloom filter data structure. ## Creates a basic Bloom filter.
## ##
## fp: The desired false-positive rate. ## fp: The desired false-positive rate.
## ##
## capacity: the maximum number of elements that guarantees a false-positive ## capacity: the maximum number of elements that guarantees a false-positive
## rate of *fp*. ## rate of *fp*.
## ##
## max: The maximum counter value associated with each each element in the ## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
## Bloom filter. If greater than 1, each element in the set has a counter of ## the initialization will become dependent on the initial seed.
## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then
## becomes a cell of size *w* bits. Since the number number of cells is a
## function ## of *fp* and *capacity*, it is important to consider the effects
## on space when tuning this value.
## ##
## Returns: A Bloom filter handle. ## Returns: A Bloom filter handle.
function bloomfilter_init%(fp: double, capacity: count, function bloomfilter_basic_init%(fp: double, capacity: count,
max: count &default=1%): opaque of bloomfilter name: string &default=""%): opaque of bloomfilter
%{ %{
if ( fp < 0.0 || fp > 1.0 ) if ( fp < 0.0 || fp > 1.0 )
{ {
reporter->Error("false-positive rate must take value between 0 and 1"); reporter->Error("false-positive rate must take value between 0 and 1");
return NULL; return NULL;
} }
BloomFilter* bf;
if ( max == 1 ) size_t cells = BasicBloomFilter::M(fp, capacity);
{ size_t optimal_k = BasicBloomFilter::K(cells, capacity);
bf = new BasicBloomFilter(fp, capacity); const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString());
} fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells);
else return new BloomFilterVal(new BasicBloomFilter(hp, cells));
{ %}
uint16 width = 0;
while ( max >>= 1 ) ## Creates a counting Bloom filter.
++width; ##
bf = new CountingBloomFilter(fp, capacity, width); ## k: The number of hash functions to use.
} ##
return new BloomFilterVal(bf); ## cells: The number of cells of the underlying counter vector.
##
## max: The maximum counter value associated with each element, described
## by *w = ceil(log_2(max))* bits. Each bit in the underlying counter vector
## becomes a cell of size *w* bits.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
## the initialization will become dependent on the initial seed.
##
## Returns: A Bloom filter handle.
function bloomfilter_counting_init%(k: count, cells: count, max: count,
                                    name: string &default=""%): opaque of bloomfilter
	%{
	const HashPolicy* hp = new DefaultHashing(k, name->CheckString());

	// Number of bits a cell needs in order to hold counter values up to
	// *max*: floor(log2(max)) + 1. Starting the count at 0 would produce
	// zero-width cells for max == 1 and one bit too few for larger values.
	uint16 width = 1;
	while ( max >>= 1 )
		++width;

	return new BloomFilterVal(new CountingBloomFilter(hp, cells, width));
	%}
## Adds an element to a Bloom filter. ## Adds an element to a Bloom filter.

View file

@ -4,7 +4,7 @@
event bro_init() event bro_init()
{ {
# Basic usage with counts. # Basic usage with counts.
local bf_cnt = bloomfilter_init(0.1, 1000); local bf_cnt = bloomfilter_basic_init(0.1, 1000);
bloomfilter_add(bf_cnt, 42); bloomfilter_add(bf_cnt, 42);
bloomfilter_add(bf_cnt, 84); bloomfilter_add(bf_cnt, 84);
bloomfilter_add(bf_cnt, 168); bloomfilter_add(bf_cnt, 168);
@ -16,23 +16,23 @@ event bro_init()
bloomfilter_add(bf_cnt, "foo"); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch
# Basic usage with strings. # Basic usage with strings.
local bf_str = bloomfilter_init(0.9, 10); local bf_str = bloomfilter_basic_init(0.9, 10);
bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "foo");
bloomfilter_add(bf_str, "bar"); bloomfilter_add(bf_str, "bar");
print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "foo");
print bloomfilter_lookup(bf_str, "bar"); print bloomfilter_lookup(bf_str, "bar");
print bloomfilter_lookup(bf_str, "baz"); # FP print bloomfilter_lookup(bf_str, "b4z"); # FP
print bloomfilter_lookup(bf_str, "qux"); # FP print bloomfilter_lookup(bf_str, "quux"); # FP
bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 0.5); # Type mismatch
bloomfilter_add(bf_str, 100); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch
# Edge cases. # Edge cases.
local bf_edge0 = bloomfilter_init(0.000000000001, 1); local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1);
local bf_edge1 = bloomfilter_init(0.00000001, 100000000); local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000);
local bf_edge2 = bloomfilter_init(0.9999999, 1); local bf_edge2 = bloomfilter_basic_init(0.9999999, 1);
local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000);
# Invalid parameters. # Invalid parameters.
local bf_bug0 = bloomfilter_init(-0.5, 42); local bf_bug0 = bloomfilter_basic_init(-0.5, 42);
local bf_bug1 = bloomfilter_init(1.1, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42);
} }

View file

@ -82,7 +82,7 @@ event bro_init()
if ( ! entropy_test_add(entropy_handle, "f") ) if ( ! entropy_test_add(entropy_handle, "f") )
print out, "entropy_test_add() failed"; print out, "entropy_test_add() failed";
bloomfilter_handle = bloomfilter_init(0.1, 100); bloomfilter_handle = bloomfilter_basic_init(0.1, 100);
for ( e in bloomfilter_elements ) for ( e in bloomfilter_elements )
bloomfilter_add(bloomfilter_handle, e); bloomfilter_add(bloomfilter_handle, e);
} }