diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6a44defc6d..0be64c18de 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,117 +1,16 @@ #include "BloomFilter.h" #include +#include "CounterVector.h" #include "Serializer.h" -CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) - { - } - -CounterVector::~CounterVector() - { - delete bits_; - } - -bool CounterVector::Increment(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -bool CounterVector::Decrement(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -CounterVector::count_type CounterVector::Count(size_type cell) const - { - // TODO - assert(! "not yet implemented"); - return 0; - } - -CounterVector::size_type CounterVector::Size() const - { - return bits_->Blocks() / width_; - } - -bool CounterVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } - -CounterVector* CounterVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_COUNTERVECTOR)); - } - -IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) - -bool CounterVector::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! bits_->Serialize(info) ) - return false; - return SERIALIZE(static_cast(width_)); - } - -bool CounterVector::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(SerialObj); - bits_ = BitVector::Unserialize(info); - if ( ! bits_ ) - return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) - return false; - width_ = static_cast(width); - return true; - } - - -HashPolicy::Hasher::Hasher(size_t seed) - : h3_(seed) -{ -} - -HashPolicy::HashType -HashPolicy::Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 
0 : h3_(x, n); - } - -HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const - { - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - - -HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const - { - HashType h1 = hasher1_(x, n); - HashType h2 = hasher2_(x, n); - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - BloomFilter::BloomFilter() : hash_(NULL) { } -BloomFilter::BloomFilter(size_t k) - : hash_(new hash_policy(k)) +BloomFilter::BloomFilter(const HashPolicy* hash_policy) + : hash_(hash_policy) { } @@ -135,7 +34,11 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - return SERIALIZE(static_cast(hash_->K())); + // FIXME: Since we have a fixed hashing policy, we just serialize the + // information needed to reconstruct it. + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -144,10 +47,15 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) uint16 k; if ( ! UNSERIALIZE(&k) ) return false; - hash_ = new hash_policy(static_cast(k)); + const char* name; + if ( ! UNSERIALIZE_STR(&name, 0) ) + return false; + // FIXME: for now Bloom filters always use the default hashing policy. + hash_ = new DefaultHashing(k, name); return true; } + size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); @@ -163,11 +71,9 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { + // TODO: Ensure that x and y use the same HashPolicy before proceeding.
BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); - // TODO: implement the hasher pool and make sure the new result gets the same - // number of (equal) hash functions. - //assert(x->hash_ == y->hash_); return result; } @@ -176,16 +82,10 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) - : BloomFilter(K(M(fp, capacity), capacity)) +BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) + : BloomFilter(hash_policy), + bits_(new BitVector(cells)) { - bits_ = new BitVector(M(fp, capacity)); - } - -BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) - : BloomFilter(K(cells, capacity)) - { - bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -203,13 +103,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -230,17 +130,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), - capacity)) - { - cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); - } - -CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(cells, capacity)) +CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, + size_t cells, size_t width) + : BloomFilter(hash_policy) { cells_ = new CounterVector(width, cells); } @@ -261,18 +153,19 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { + // TODO: Use partitioning. CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 65133621f9..189f4920b7 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,141 +3,9 @@ #include #include "BitVector.h" -#include "Hash.h" -#include "H3.h" +#include "HashPolicy.h" -/** - * A vector of counters, each of which have a fixed number of bits. - */ -class CounterVector : public SerialObj { -public: - typedef size_t size_type; - typedef uint64 count_type; - - /** - * Constructs a counter vector having cells of a given width. - * - * @param width The number of bits that each cell occupies. 
- * - * @param cells The number of cells in the bitvector. - */ - CounterVector(size_t width, size_t cells = 1024); - - ~CounterVector(); - - /** - * Increments a given cell. - * - * @param cell The cell to increment. - * - * @param value The value to add to the current counter in *cell*. - * - * @return `true` if adding *value* to the counter in *cell* succeeded. - */ - bool Increment(size_type cell, count_type value); - - /** - * Decrements a given cell. - * - * @param cell The cell to decrement. - * - * @param value The value to subtract from the current counter in *cell*. - * - * @return `true` if subtracting *value* from the counter in *cell* succeeded. - */ - bool Decrement(size_type cell, count_type value); - - /** - * Retrieves the counter of a given cell. - * - * @param cell The cell index to retrieve the count for. - * - * @return The counter associated with *cell*. - */ - count_type Count(size_type cell) const; - - /** - * Retrieves the number of cells in the storage. - * - * @return The number of cells. - */ - size_type Size() const; - - bool Serialize(SerialInfo* info) const; - static CounterVector* Unserialize(UnserialInfo* info); - -protected: - DECLARE_SERIAL(CounterVector); - - CounterVector() { } - -private: - BitVector* bits_; - size_t width_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - * @tparam Codomain An integral type. - */ -class HashPolicy { -public: - typedef hash_t HashType; - typedef std::vector HashVector; - - virtual ~HashPolicy() { } - size_t K() const { return k_; } - virtual HashVector Hash(const void* x, size_t n) const = 0; - -protected: - /** - * A functor that computes a universal hash function. - * @tparam Codomain An integral type. - */ - class Hasher { - public: - Hasher(size_t seed); - - HashType operator()(const void* x, size_t n) const; - private: - // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in - // Hash.h. 
I do not know how this value impacts the hash function behavior - // so I'll just copy it verbatim. (Matthias) - H3 h3_; - }; - - HashPolicy(size_t k) : k_(k) { } - -private: - const size_t k_; -}; - -/** - * The *default* hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. - */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k) : HashPolicy(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; +class CounterVector; /** * The abstract base class for Bloom filters. @@ -146,8 +14,6 @@ class BloomFilter : public SerialObj { public: // At this point we won't let the user choose the hash policy, but we might // open up the interface in the future. - typedef DoubleHashing hash_policy; - virtual ~BloomFilter(); /** @@ -180,13 +46,19 @@ protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter(); - BloomFilter(size_t k); - virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; + /** + * Constructs a Bloom filter. + * + * @param hash_policy The hash policy to use for this Bloom filter. + */ + BloomFilter(const HashPolicy* hash_policy); + + virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; + virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; private: - HashPolicy* hash_; + const HashPolicy* hash_; }; /** @@ -223,24 +95,18 @@ public: static BasicBloomFilter* Merge(const BasicBloomFilter* x, const BasicBloomFilter* y); - /** - * Constructs a basic Bloom filter with a given false-positive rate and - * capacity. 
- */ - BasicBloomFilter(double fp, size_t capacity); - /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(size_t cells, size_t capacity); + BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: BitVector* bits_; @@ -254,16 +120,16 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(double fp, size_t capacity, size_t width); - CountingBloomFilter(size_t cells, size_t capacity, size_t width); + CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, + size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1537bb04b0..f2c7ce6bad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -255,6 +255,7 @@ set(bro_SRCS ChunkedIO.cc CompHash.cc Conn.cc + CounterVector.cc DFA.cc DbgBreakpoint.cc DbgHelp.cc @@ -278,6 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc + HashPolicy.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/CounterVector.cc b/src/CounterVector.cc new file mode 100644 index 0000000000..8ed4c30427 --- /dev/null +++ b/src/CounterVector.cc @@ -0,0 +1,75 @@ +#include "CounterVector.h" + +#include "BitVector.h" +#include "Serializer.h" + +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), 
width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! bits_->Serialize(info) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + diff --git a/src/CounterVector.h b/src/CounterVector.h new file mode 100644 index 0000000000..ecc8fe90e0 --- /dev/null +++ b/src/CounterVector.h @@ -0,0 +1,78 @@ +#ifndef CounterVector_h +#define CounterVector_h + +#include "SerialObj.h" + +class BitVector; + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : public SerialObj { +public: + typedef size_t size_type; + typedef uint64 count_type; + + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. 
+ * + * @param cells The number of cells in the bitvector. + */ + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector() { } + +private: + BitVector* bits_; + size_t width_; +}; + +#endif diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc new file mode 100644 index 0000000000..d6fb4f3da4 --- /dev/null +++ b/src/HashPolicy.cc @@ -0,0 +1,72 @@ +#include "HashPolicy.h" + +#include "digest.h" + +Hasher::Hasher(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::hash_type Hasher::operator()(const void* x, size_t n) const + { + return n == 0 ? 
0 : h_(x, n); + } + +size_t Hasher::compute_seed(size_t seed, const std::string& extra) + { + u_char digest[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, digest); + return *reinterpret_cast(digest); + } + + +HashPolicy::HashPolicy(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHashing::DefaultHashing(size_t k, const std::string& name) + : HashPolicy(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hashers_.push_back(Hasher(i, name)); + } + +HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const + { + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +DoubleHashing::DoubleHashing(size_t k, const std::string& name) + : HashPolicy(k, name), + hasher1_(1, name), + hasher2_(2, name) + { + } + +HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const + { + hash_type h1 = hasher1_(x, n); + hash_type h2 = hasher2_(x, n); + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/HashPolicy.h b/src/HashPolicy.h new file mode 100644 index 0000000000..4660bc0080 --- /dev/null +++ b/src/HashPolicy.h @@ -0,0 +1,90 @@ +#ifndef HashPolicy_h +#define HashPolicy_h + +#include "Hash.h" +#include "H3.h" + +/** + * A functor that computes a universal hash function. + */ +class Hasher { +public: + typedef hash_t hash_type; + + /** + * Constructs a hasher seeded by a given seed and optionally an extra + * descriptor. + * + * @param seed The seed to use. + * + * @param extra If not empty, the hasher will not mix in the initial seed + * but instead use this string as additional seed.
+ */ + Hasher(size_t seed, const std::string& extra = ""); + + /** + * Computes the hash digest of contiguous data. + * + * @param x A pointer to the beginning of the byte sequence to hash. + * + * @param n The length of the sequence pointed to by *x*. + */ + hash_type operator()(const void* x, size_t n) const; + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; +}; + +/** + * The abstract base class for hash policies that hash elements *k* times. + */ +class HashPolicy { +public: + typedef Hasher::hash_type hash_type; + typedef std::vector hash_vector; + + virtual ~HashPolicy() { } + + virtual hash_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + HashPolicy(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const /* override */; + +private: + std::vector hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const; + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +#endif diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 9dd5c7f980..8b82916689 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,6 +605,7 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + assert( type_ ); if ( ! 
type_->Serialize(info) ) return false; return bloom_filter_->Serialize(info); diff --git a/src/bro.bif b/src/bro.bif index 9b80c90dbf..a89b808888 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4986,42 +4986,55 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr #include "BloomFilter.h" %%} -## Initializes a Bloom filter data structure. +## Creates a basic Bloom filter. ## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## -## max: The maximum counter value associated with each each element in the -## Bloom filter. If greater than 1, each element in the set has a counter of -## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then -## becomes a cell of size *w* bits. Since the number number of cells is a -## function ## of *fp* and *capacity*, it is important to consider the effects -## on space when tuning this value. +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. ## ## Returns: A Bloom filter handle. 
-function bloomfilter_init%(fp: double, capacity: count, - max: count &default=1%): opaque of bloomfilter +function bloomfilter_basic_init%(fp: double, capacity: count, + name: string &default=""%): opaque of bloomfilter %{ if ( fp < 0.0 || fp > 1.0 ) { reporter->Error("false-positive rate must take value between 0 and 1"); return NULL; } - BloomFilter* bf; - if ( max == 1 ) - { - bf = new BasicBloomFilter(fp, capacity); - } - else - { - uint16 width = 0; - while ( max >>= 1 ) - ++width; - bf = new CountingBloomFilter(fp, capacity, width); - } - return new BloomFilterVal(bf); + + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); + return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + %} + +## Creates a counting Bloom filter. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying counter vector. +## +## max: The maximum counter value associated with each element described +## by *w = ceil(log_2(max))* bits. Each bit in the underlying counter vector +## becomes a cell of size *w* bits. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. +## +## Returns: A Bloom filter handle. +function bloomfilter_counting_init%(k: count, cells: count, max: count, + name: string &default=""%): opaque of bloomfilter + %{ + const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + uint16 width = 0; + while ( max >>= 1 ) + ++width; + return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); + %} ## Adds an element to a Bloom filter.
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 769cec1200..3ff6a6668e 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -4,7 +4,7 @@ event bro_init() { # Basic usage with counts. - local bf_cnt = bloomfilter_init(0.1, 1000); + local bf_cnt = bloomfilter_basic_init(0.1, 1000); bloomfilter_add(bf_cnt, 42); bloomfilter_add(bf_cnt, 84); bloomfilter_add(bf_cnt, 168); @@ -16,23 +16,23 @@ event bro_init() bloomfilter_add(bf_cnt, "foo"); # Type mismatch # Basic usage with strings. - local bf_str = bloomfilter_init(0.9, 10); + local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); # FP - print bloomfilter_lookup(bf_str, "qux"); # FP + print bloomfilter_lookup(bf_str, "b4z"); # FP + print bloomfilter_lookup(bf_str, "quux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch # Edge cases. - local bf_edge0 = bloomfilter_init(0.000000000001, 1); - local bf_edge1 = bloomfilter_init(0.00000001, 100000000); - local bf_edge2 = bloomfilter_init(0.9999999, 1); - local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_basic_init(0.9999999, 1); + local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000); # Invalid parameters. 
- local bf_bug0 = bloomfilter_init(-0.5, 42); - local bf_bug1 = bloomfilter_init(1.1, 42); + local bf_bug0 = bloomfilter_basic_init(-0.5, 42); + local bf_bug1 = bloomfilter_basic_init(1.1, 42); } diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index ac3b2c0874..b387f9d6bc 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -82,7 +82,7 @@ event bro_init() if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; - bloomfilter_handle = bloomfilter_init(0.1, 100); + bloomfilter_handle = bloomfilter_basic_init(0.1, 100); for ( e in bloomfilter_elements ) bloomfilter_add(bloomfilter_handle, e); }