diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 78048ee588..64f0e1c67b 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -46,12 +46,23 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast<CounterVector*>( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(&bits_) ) + if ( ! SERIALIZE(bits_) ) return false; return SERIALIZE(static_cast<uint64>(width_)); } @@ -60,9 +71,8 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - return false; - // TODO: Ask Robin how to unserialize non-pointer members. - //if ( ! UNSERIALIZE(&bits_) ) - // return false; + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; uint64 width; if ( ! UNSERIALIZE(&width) ) return false; @@ -90,6 +100,18 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const return h; } + +BloomFilter::BloomFilter(size_t k) + : hash_(new hash_policy(k)) + { + } + +BloomFilter::~BloomFilter() + { + if ( hash_ ) + delete hash_; + } + bool BloomFilter::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -101,24 +123,21 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) SerialObj::Unserialize(info, SER_BLOOMFILTER)); } -// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? -//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) - bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(hash_) ) - // return false; - return SERIALIZE(static_cast<uint64>(elements_)); + if ( ! 
SERIALIZE(static_cast<uint16>(hash_->K())) ) + return false; + return SERIALIZE(static_cast<uint64>(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! hash_ = HashPolicy::Unserialize(info) ) - // return false; + uint16 k; + if ( ! UNSERIALIZE(&k) ) + return false; + hash_ = new hash_policy(static_cast<size_t>(k)); uint64 elements; - if ( UNSERIALIZE(&elements) ) + if ( ! UNSERIALIZE(&elements) ) return false; @@ -126,7 +145,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return true; } -size_t BasicBloomFilter::Cells(double fp, size_t capacity) +size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); @@ -138,9 +157,16 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return round(frac * std::log(2)); } -BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) - : BloomFilter(hash), bits_(cells) +BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) + : BloomFilter(K(M(fp, capacity), capacity)) { + bits_ = new BitVector(M(fp, capacity)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) + : BloomFilter(K(cells, capacity)) + { + bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -148,38 +174,50 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(&bits_) ) - // return false; - return true; + return SERIALIZE(bits_); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - // TODO: Non-pointer member deserialization? 
- return true; + bits_ = BitVector::Unserialize(info); + return bits_ != NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); + bits_->Set(h[i] % h.size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) + if ( ! (*bits_)[h[i] % h.size()] ) return 0; return 1; } +IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) + +bool CountingBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); + return SERIALIZE(cells_); + } + +bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + cells_ = CounterVector::Unserialize(info); + return cells_ != NULL; + } + void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_.Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % h.size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -188,7 +226,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits<CounterVector::size_type>::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); if ( cnt < min ) min = cnt; } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index b4f82efee9..77c6bc4f56 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -151,9 +151,13 @@ private: /** * The abstract base class for Bloom filters. */ -class BloomFilter : SerialObj { +class BloomFilter : public SerialObj { public: - virtual ~BloomFilter() { delete hash_; } + // At this point we won't let the user choose the hash policy, but we might + // open up the interface in the future. 
+ typedef DoubleHashing hash_policy; + + virtual ~BloomFilter(); /** * Adds an element of type T to the Bloom filter. @@ -193,10 +197,10 @@ public: static BloomFilter* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(BloomFilter); + DECLARE_ABSTRACT_SERIAL(BloomFilter); - BloomFilter() { }; + BloomFilter() : hash_(NULL) { } - BloomFilter(HashPolicy* hash) : hash_(hash) { } + BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; @@ -211,10 +215,42 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - static size_t Cells(double fp, size_t capacity); + /** + * Computes the number of cells based on a given false-positive rate and + * capacity. In the literature, this parameter often has the name *M*. + * + * @param fp The false-positive rate. + * + * @param capacity The number of expected elements. + * + * Returns: The number of cells needed to support a false-positive rate of *fp* + * with at most *capacity* elements. + */ + static size_t M(double fp, size_t capacity); + + /** + * Computes the optimal number of hash functions based on the number of cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * Returns: the optimal number of hash functions for a filter with *cells* + * cells holding at most *capacity* elements. + */ static size_t K(size_t cells, size_t capacity); - BasicBloomFilter(size_t cells, HashPolicy* hash); + /** + * Constructs a basic Bloom filter with a given false-positive rate and + * capacity. + */ + BasicBloomFilter(double fp, size_t capacity); + + /** + * Constructs a basic Bloom filter with a given number of cells and capacity. 
+ */ + BasicBloomFilter(size_t cells, size_t capacity); protected: DECLARE_SERIAL(BasicBloomFilter); @@ -225,7 +261,7 @@ protected: virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - BitVector bits_; + BitVector* bits_; }; /** @@ -233,18 +269,18 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width, HashPolicy* hash); + CountingBloomFilter(unsigned width); protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter(); + CountingBloomFilter() { } virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - CounterVector cells_; + CounterVector* cells_; }; #endif diff --git a/src/NetVar.h b/src/NetVar.h index 1a20adcaf2..aa2a14ada5 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -249,6 +249,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* bloomfilter_type; // Initializes globals that don't pertain to network/event analysis. extern void init_general_global_var(); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index a5fb65f53b..b4f1290436 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,23 +518,31 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } +BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type), bloom_filter_(NULL) + { + } + -BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) +BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t), bloom_filter_(NULL) { } +BloomFilterVal::~BloomFilterVal() + { + if ( bloom_filter_ ) + delete bloom_filter_; + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - // TODO: implement. - return true; + return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - // TODO: implement. 
- return true; + bloom_filter_ = BloomFilter::Unserialize(info); + return bloom_filter_ != NULL; } - diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 1c9c0361cc..68b42a8a49 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -112,6 +112,7 @@ private: class BloomFilterVal : public OpaqueVal { public: BloomFilterVal(); + ~BloomFilterVal(); protected: friend class Val; diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 171113ab6a..859145f19f 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -53,6 +53,7 @@ SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0xa000) SERIAL_IS(BLOOMFILTER, 0xa100) SERIAL_IS(BASICBLOOMFILTER, 0xa200) +SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -211,5 +212,6 @@ SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) SERIAL_CONST2(BLOOMFILTER) SERIAL_CONST2(BASICBLOOMFILTER) +SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif