From 5383e8f75bae11bc5da30acf0b77493b90e5f71c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 11:21:10 +0200 Subject: [PATCH 1/3] Add bloomfilter_clear() BiF. --- src/OpaqueVal.cc | 5 +++++ src/OpaqueVal.h | 1 + src/probabilistic/BloomFilter.cc | 10 ++++++++++ src/probabilistic/BloomFilter.h | 11 +++++++++++ src/probabilistic/CounterVector.cc | 5 +++++ src/probabilistic/CounterVector.h | 5 +++++ src/probabilistic/bloom-filter.bif | 16 ++++++++++++++++ 7 files changed, 53 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index efdd890f70..19a372c005 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -578,6 +578,11 @@ size_t BloomFilterVal::Count(const Val* val) const return cnt; } +void BloomFilterVal::Clear() + { + bloom_filter->Clear(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index ea704cb70a..cfb184fc77 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -125,6 +125,7 @@ public: void Add(const Val* val); size_t Count(const Val* val) const; + void Clear(); static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 5613dcce05..c78cd4193d 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -74,6 +74,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +void BasicBloomFilter::Clear() + { + bits->Clear(); + } + BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { @@ -191,3 +196,8 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const return min; } + +void CountingBloomFilter::Clear() + { + cells->Clear(); + } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 4a6b01c484..55bc76fca7 100644 --- a/src/probabilistic/BloomFilter.h +++ 
b/src/probabilistic/BloomFilter.h @@ -47,6 +47,11 @@ public: return CountImpl((*hasher)(x)); } + /** + * Removes all elements, i.e., resets all bits in the underlying bit vector. + */ + virtual void Clear() = 0; + /** * Serializes the Bloom filter. * @@ -147,6 +152,9 @@ public: */ static size_t K(size_t cells, size_t capacity); + // Overridden from BloomFilter. + virtual void Clear(); + /** * Merges two basic Bloom filters. * @@ -188,6 +196,9 @@ public: */ CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); + // Overridden from BloomFilter. + virtual void Clear(); + /** * Merges two counting Bloom filters. * diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 570ed1f8ea..00fa7fb8c0 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -70,6 +70,11 @@ bool CounterVector::Decrement(size_type cell, count_type value) return carry; } +void CounterVector::Clear() + { + bits->Clear(); + } + CounterVector::count_type CounterVector::Count(size_type cell) const { assert(cell < Size()); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 178a68e8f2..896f98ef1e 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -77,6 +77,11 @@ public: */ count_type Count(size_type cell) const; + /** + * Sets all counters to 0. + */ + void Clear(); + /** * Retrieves the number of cells in the storage. * diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index cbbff85d7d..9df168be0e 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -121,6 +121,22 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count return new Val(0, TYPE_COUNT); %} +## Removes all elements from a Bloom filter. 
This function resets all bits +## in the underlying bitvector to 0 but does not change the parameterization of +## the Bloom filter, such as the element type and the hasher seed. +## +## bf: The Bloom filter handle. +function bloomfilter_clear%(bf: opaque of bloomfilter%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + + if ( bfv->Type() ) // Untyped Bloom filters are already empty. + bfv->Clear(); + + return 0; + %} + + ## Merges two Bloom filters. ## ## bf1: The first Bloom filter handle. From 5736aef440574389dda6555642ee7e938156dcf1 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 13:05:38 +0200 Subject: [PATCH 2/3] Refactor Bloom filter merging. --- src/OpaqueVal.cc | 31 ++++++++--- src/OpaqueVal.h | 22 -------- src/probabilistic/BloomFilter.cc | 92 +++++++++++++++++++++++--------- src/probabilistic/BloomFilter.h | 36 +++++++------ 4 files changed, 109 insertions(+), 72 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19a372c005..feff4f3cc0 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -584,21 +584,36 @@ void BloomFilterVal::Clear() } BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, - const BloomFilterVal* y) + const BloomFilterVal* y) { if ( ! same_type(x->Type(), y->Type()) ) + { reporter->InternalError("cannot merge Bloom filters with different types"); + return 0; + } - BloomFilterVal* result; + if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) + { + reporter->InternalError("cannot merge different Bloom filter types"); + return 0; + } - if ( (result = DoMerge(x, y)) ) - return result; + probabilistic::BloomFilter* copy = x->bloom_filter->Clone(); + bool success = copy->Merge(y->bloom_filter); + if ( ! success ) + { + reporter->InternalError("failed to merge Bloom filter"); + return 0; + } - else if ( (result = DoMerge(x, y)) ) - return result; + BloomFilterVal* merged = new BloomFilterVal(copy); + if ( ! 
merged->Typify(x->Type()) ) + { + reporter->InternalError("failed to set type on merged Bloom filter"); + return 0; + } - reporter->InternalError("failed to merge Bloom filters"); - return 0; + return merged; } BloomFilterVal::~BloomFilterVal() diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index cfb184fc77..360bb69803 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -142,28 +142,6 @@ private: BloomFilterVal(const BloomFilterVal&); BloomFilterVal& operator=(const BloomFilterVal&); - template - static BloomFilterVal* DoMerge(const BloomFilterVal* x, - const BloomFilterVal* y) - { - if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) - reporter->InternalError("cannot merge different Bloom filter types"); - - if ( typeid(T) != typeid(*x->bloom_filter) ) - return 0; - - const T* a = static_cast(x->bloom_filter); - const T* b = static_cast(y->bloom_filter); - - BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); - assert(merged); - - if ( ! merged->Typify(x->Type()) ) - reporter->InternalError("failed to set type on merged Bloom filter"); - - return merged; - } - BroType* type; CompositeHash* hash; probabilistic::BloomFilter* bloom_filter; diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index c78cd4193d..132cf376ec 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -79,17 +79,37 @@ void BasicBloomFilter::Clear() bits->Clear(); } -BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y) +bool BasicBloomFilter::Merge(const BloomFilter* other) { - if ( ! x->hasher->Equals(y->hasher) ) - reporter->InternalError("incompatible hashers during BasicBloomFilter merge"); + if ( typeid(*this) != typeid(*other) ) + return 0; - BasicBloomFilter* result = new BasicBloomFilter(); - result->hasher = x->hasher->Clone(); - result->bits = new BitVector(*x->bits | *y->bits); + const BasicBloomFilter* o = static_cast(other); - return result; + if ( ! 
hasher->Equals(o->hasher) ) + { + reporter->InternalError("incompatible hashers in BasicBloomFilter merge"); + return false; + } + else if ( bits->Size() != o->bits->Size() ) + { + reporter->InternalError("different bitvector size in BasicBloomFilter merge"); + return false; + } + + (*bits) |= *o->bits; + + return true; + } + +BasicBloomFilter* BasicBloomFilter::Clone() const + { + BasicBloomFilter* copy = new BasicBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->bits = new BitVector(*bits); + + return copy; } BasicBloomFilter::BasicBloomFilter() @@ -135,19 +155,6 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const return 1; } -CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y) - { - if ( ! x->hasher->Equals(y->hasher) ) - reporter->InternalError("incompatible hashers during CountingBloomFilter merge"); - - CountingBloomFilter* result = new CountingBloomFilter(); - result->hasher = x->hasher->Clone(); - result->cells = new CounterVector(*x->cells | *y->cells); - - return result; - } - CountingBloomFilter::CountingBloomFilter() { cells = 0; @@ -160,6 +167,44 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, cells = new CounterVector(width, arg_cells); } +void CountingBloomFilter::Clear() + { + cells->Clear(); + } + +bool CountingBloomFilter::Merge(const BloomFilter* other) + { + if ( typeid(*this) != typeid(*other) ) + return 0; + + const CountingBloomFilter* o = static_cast(other); + + if ( ! 
hasher->Equals(o->hasher) ) + { + reporter->InternalError("incompatible hashers in CountingBloomFilter merge"); + return false; + } + else if ( cells->Size() != o->cells->Size() ) + { + reporter->InternalError("different bitvector size in CountingBloomFilter merge"); + return false; + } + + (*cells) |= *o->cells; + + return true; + } + +CountingBloomFilter* CountingBloomFilter::Clone() const + { + CountingBloomFilter* copy = new CountingBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->cells = new CounterVector(*cells); + + return copy; + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -196,8 +241,3 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const return min; } - -void CountingBloomFilter::Clear() - { - cells->Clear(); - } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 55bc76fca7..2ab5b89941 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -52,6 +52,22 @@ public: */ virtual void Clear() = 0; + /** + * Merges another Bloom filter into this one. + * + * @param other The other Bloom filter. + * + * @return `true` on success. + */ + virtual bool Merge(const BloomFilter* other) = 0; + + /** + * Constructs a copy of this Bloom filter. + * + * @return A copy of `*this`. + */ + virtual BloomFilter* Clone() const = 0; + /** * Serializes the Bloom filter. * @@ -154,14 +170,8 @@ public: // Overridden from BloomFilter. virtual void Clear(); - - /** - * Merges two basic Bloom filters. - * - * @return The merged Bloom filter. - */ - static BasicBloomFilter* Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y); + virtual bool Merge(const BloomFilter* other); + virtual BasicBloomFilter* Clone() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -198,14 +208,8 @@ public: // Overridden from BloomFilter. 
virtual void Clear(); - - /** - * Merges two counting Bloom filters. - * - * @return The merged Bloom filter. - */ - static CountingBloomFilter* Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y); + virtual bool Merge(const BloomFilter* other); + virtual CountingBloomFilter* Clone() const; protected: DECLARE_SERIAL(CountingBloomFilter); From 5769c32f1eeb319e599996e05e0e63b30af34823 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 13:18:19 +0200 Subject: [PATCH 3/3] Support emptiness check on Bloom filters. --- src/OpaqueVal.cc | 5 +++++ src/OpaqueVal.h | 1 + src/probabilistic/BitVector.cc | 8 ++++++++ src/probabilistic/BitVector.h | 6 ++++++ src/probabilistic/BloomFilter.cc | 10 ++++++++++ src/probabilistic/BloomFilter.h | 9 +++++++++ src/probabilistic/CounterVector.cc | 5 +++++ src/probabilistic/CounterVector.h | 6 ++++++ src/probabilistic/bloom-filter.bif | 3 +++ 9 files changed, 53 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index feff4f3cc0..a42892e2b2 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -583,6 +583,11 @@ void BloomFilterVal::Clear() bloom_filter->Clear(); } +bool BloomFilterVal::Empty() const + { + return bloom_filter->Empty(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 360bb69803..52c9583fc7 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -126,6 +126,7 @@ public: void Add(const Val* val); size_t Count(const Val* val) const; void Clear(); + bool Empty() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 98f008b24b..13cd1aa3bb 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -463,6 +463,14 @@ bool BitVector::Empty() const return bits.empty(); } +bool BitVector::AllZero() const + { + for ( size_t i = 0; i < bits.size(); 
++i ) + if ( bits[i] ) + return false; + return true; + } + BitVector::size_type BitVector::FindFirst() const { return find_from(0); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index 9eefe1b633..d9c55d53c6 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -253,6 +253,12 @@ public: */ bool Empty() const; + /** + * Checks whether all bits are 0. + * @return `true` iff all bits in all blocks are 0. + */ + bool AllZero() const; + /** * Finds the bit position of of the first 1-bit. * @return The position of the first bit that equals to one or `npos` if no diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 132cf376ec..7f769cbf7c 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -74,6 +74,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +bool BasicBloomFilter::Empty() const + { + return bits->AllZero(); + } + void BasicBloomFilter::Clear() { bits->Clear(); @@ -167,6 +172,11 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, cells = new CounterVector(width, arg_cells); } +bool CountingBloomFilter::Empty() const + { + return cells->AllZero(); + } + void CountingBloomFilter::Clear() { cells->Clear(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 2ab5b89941..b6cf18672f 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -47,6 +47,13 @@ public: return CountImpl((*hasher)(x)); } + /** + * Checks whether the Bloom filter is empty. + * + * @return `true` if the Bloom filter contains no elements. + */ + virtual bool Empty() const = 0; + /** * Removes all elements, i.e., resets all bits in the underlying bit vector. */ @@ -169,6 +176,7 @@ public: static size_t K(size_t cells, size_t capacity); // Overridden from BloomFilter. 
+ virtual bool Empty() const; virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; @@ -207,6 +215,7 @@ public: CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); // Overridden from BloomFilter. + virtual bool Empty() const; virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 00fa7fb8c0..24c9ff3638 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -70,6 +70,11 @@ bool CounterVector::Decrement(size_type cell, count_type value) return carry; } +bool CounterVector::AllZero() const + { + return bits->AllZero(); + } + void CounterVector::Clear() { bits->Clear(); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 896f98ef1e..df6fc57ac2 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -77,6 +77,12 @@ public: */ count_type Count(size_type cell) const; + /** + * Checks whether all counters are 0. + * @return `true` iff all counters have the value 0. + */ + bool AllZero() const; + /** * Sets all counters to 0. */ diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 9df168be0e..dd21688fdd 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -109,6 +109,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); + if ( bfv->Empty() ) + return new Val(0, TYPE_COUNT); + if ( ! bfv->Type() ) reporter->Error("cannot perform lookup on untyped Bloom filter");