Add intersect operation for bloom filters

Intersecting two bloom filters yields a bloom filter that returns true
when an element was contained in both bloom filters. The false positive
rate is potentially a bit higher than in the original bloom filters.

This operation also works for counting bloom filters; however, the
counters are discarded and the result is converted to a basic bloom
filter. The reason is that the counters have no obvious meaning once
two bloom filters are intersected - beyond recording whether an element
was inserted at all.
This commit is contained in:
Johanna Amann 2022-01-20 13:34:07 +00:00 committed by Johanna Amann
parent aa58b6b37b
commit 796e18ecfc
9 changed files with 237 additions and 2 deletions

View file

@ -125,6 +125,31 @@ bool BasicBloomFilter::Merge(const BloomFilter* other)
return true;
}
/**
 * Builds the intersection of this filter and another basic Bloom filter.
 *
 * The result is a clone of this filter whose bit vector has been AND-ed with
 * the other filter's bit vector.
 *
 * @param other The filter to intersect with. Its dynamic type must be
 * identical to the dynamic type of this filter.
 *
 * @return The new filter, or nullptr if the dynamic types differ, the hashers
 * are not equal, or the bit vectors have different sizes. The latter two
 * cases are additionally reported through the reporter.
 */
BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const
	{
	// Filters of different dynamic types cannot be combined.
	if ( typeid(*this) != typeid(*other) )
		return nullptr;

	const auto* rhs = static_cast<const BasicBloomFilter*>(other);

	if ( ! hasher->Equals(rhs->hasher) )
		{
		reporter->Error("incompatible hashers in BasicBloomFilter intersect");
		return nullptr;
		}

	if ( bits->Size() != rhs->bits->Size() )
		{
		reporter->Error("different bitvector size in BasicBloomFilter intersect");
		return nullptr;
		}

	// Start from a copy of this filter and keep only the bits that are
	// set in both filters.
	BasicBloomFilter* result = Clone();
	*result->bits &= *rhs->bits;

	return result;
	}
BasicBloomFilter* BasicBloomFilter::Clone() const
{
BasicBloomFilter* copy = new BasicBloomFilter();
@ -249,6 +274,32 @@ bool CountingBloomFilter::Merge(const BloomFilter* other)
return true;
}
/**
 * Builds the intersection of this filter and another counting Bloom filter.
 *
 * The counters of both filters are reduced to bit vectors via
 * CounterVector::ToBitVector(), and the result is a new BasicBloomFilter
 * holding the AND of those two bit vectors.
 *
 * @param other The filter to intersect with. Its dynamic type must be
 * identical to the dynamic type of this filter.
 *
 * @return The new basic Bloom filter, or nullptr if the dynamic types differ,
 * the hashers are not equal, or the cell vectors have different sizes. The
 * latter two cases are additionally reported through the reporter.
 */
BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const
	{
	// Filters of different dynamic types cannot be combined.
	if ( typeid(*this) != typeid(*other) )
		return nullptr;

	const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

	if ( ! hasher->Equals(o->hasher) )
		{
		reporter->Error("incompatible hashers in CountingBloomFilter intersect");
		return nullptr;
		}

	else if ( cells->Size() != o->cells->Size() )
		{
		reporter->Error("different bitvector size in CountingBloomFilter intersect");
		return nullptr;
		}

	auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());

	// The fresh filter's bits are OR-ed with our own cells first and then
	// AND-ed with the other filter's cells, leaving only positions that are
	// set in both.
	*outbf->bits |= cells->ToBitVector();
	*outbf->bits &= o->cells->ToBitVector();

	return outbf;
	}
CountingBloomFilter* CountingBloomFilter::Clone() const
{
CountingBloomFilter* copy = new CountingBloomFilter();

View file

@ -82,7 +82,7 @@ public:
virtual void Clear() = 0;
/**
* Merges another Bloom filter into a copy of this one.
* Merges another Bloom filter into this one.
*
* @param other The other Bloom filter.
*
@ -90,6 +90,15 @@ public:
*/
virtual bool Merge(const BloomFilter* other) = 0;
/**
* Intersects another Bloom filter with a copy of this one and returns the
* copy. The method is const and takes the other filter as a pointer to
* const; the intersection is applied to the returned copy.
*
* @param other The other Bloom filter.
*
* @return The intersected Bloom filter on success, nullptr otherwise.
*/
virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;
/**
* Constructs a copy of this Bloom filter.
*
@ -126,11 +135,14 @@ protected:
const detail::Hasher* hasher;
};
class CountingBloomFilter;
/**
* A basic Bloom filter.
*/
class BasicBloomFilter : public BloomFilter
{
friend class CountingBloomFilter;
public:
/**
* Constructs a basic Bloom filter with a given number of cells. The
@ -181,6 +193,7 @@ public:
void Clear() override;
bool Merge(const BloomFilter* other) override;
BasicBloomFilter* Clone() const override;
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
std::string InternalState() const override;
protected:
@ -233,6 +246,21 @@ public:
CountingBloomFilter* Clone() const override;
std::string InternalState() const override;
/**
* Intersects another Bloom filter with this one and returns a new BasicBloomFilter.
*
* Please note that the intersection of two counting Bloom filters results in a
* basic Bloom filter. The reason for this is that the counters lose their meaning
* during the intersection process. The BasicBloomFilter will have bits set in cases
* where both counting Bloom filters have cell values greater than zero.
*
* @param other The other Bloom filter.
*
* @return Intersecting BloomFilter on success, nullptr otherwise.
*/
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
protected:
friend class BloomFilter;

View file

@ -142,6 +142,24 @@ CounterVector& CounterVector::Merge(const CounterVector& other)
return *this;
}
/**
 * Converts this counter vector into a BitVector with one bit per cell.
 *
 * @return A BitVector of Size() bits in which bit n is set if any of the
 * bits that make up counter cell n is set, i.e. if the counter is non-zero.
 */
BitVector CounterVector::ToBitVector() const
	{
	auto newbits = BitVector(Size());

	for ( size_t cell = 0; cell < Size(); ++cell )
		{
		// Index of the first of the `width` bits that belong to this cell.
		const size_t lsb = cell * width;
		bool set = false;

		// Examine every bit of the cell (lsb + i, not a fixed offset) and
		// stop as soon as one set bit is found.
		for ( size_t i = 0; i < width && ! set; ++i )
			set |= (*bits)[lsb + i];

		newbits[cell] = set;
		}

	return newbits;
	}
CounterVector& CounterVector::operator|=(const CounterVector& other)
{
return Merge(other);

View file

@ -132,6 +132,14 @@ public:
*/
CounterVector& Merge(const CounterVector& other);
/**
* Converts a counter vector into a BitVector. Each cell that has a value
* of 1 or more has its bit set in the BitVector; otherwise the bit remains unset.
*
* @return The newly created BitVector
*/
BitVector ToBitVector() const;
/**
* An alias for ::Merge.
*/

View file

@ -160,6 +160,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
## Decrements the counter for an element that was added to a counting bloom filter in the past.
##
## Note that decrement operations can lead to false negatives if used on a counting bloom-filter
## that exceeded the width of its counter.
##
## bf: The counting bloom filter handle.
##
## x: The element to decrement
@ -247,7 +250,7 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
##
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear
## bloomfilter_clear bloomfilter_merge
function bloomfilter_merge%(bf1: opaque of bloomfilter,
bf2: opaque of bloomfilter%): opaque of bloomfilter
%{
@ -265,6 +268,44 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter,
return BloomFilterVal::Merge(bfv1, bfv2);
%}
## Intersects two Bloom filters.
##
## The resulting Bloom filter returns true when queried for elements
## that were contained in both bloom filters. Note that intersected Bloom
## filters have a slightly higher probability of false positives than
## Bloom filters created from scratch.
##
## Please note that, while this function works with basic and with counting
## bloom filters, the result always is a basic bloom filter. So - intersecting
## two counting bloom filters will result in a basic bloom filter. The reason
## for this is that there is no reasonable definition of how to handle counters
## during intersection.
##
## bf1: The first Bloom filter handle.
##
## bf2: The second Bloom filter handle.
##
## Returns: The intersection of *bf1* and *bf2*.
##
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear bloomfilter_merge
function bloomfilter_intersect%(bf1: opaque of bloomfilter,
                               bf2: opaque of bloomfilter%): opaque of bloomfilter
	%{
	// View both opaque handles as Bloom filter values.
	const auto* lhs = static_cast<const BloomFilterVal*>(bf1);
	const auto* rhs = static_cast<const BloomFilterVal*>(bf2);

	// A missing type on either side is acceptable; the filters are only
	// incompatible when both have a type and those types differ.
	const bool both_typed = lhs->Type() && rhs->Type();

	if ( both_typed && ! same_type(lhs->Type(), rhs->Type()) )
		{
		reporter->Error("incompatible Bloom filter types");
		return nullptr;
		}

	return BloomFilterVal::Intersect(lhs, rhs);
	%}
## Returns a string with a representation of a Bloom filter's internal
## state. This is for debugging/testing purposes only.
##