Add intersect operation for bloom filters

Intersecting two bloom filters yields a bloom filter that returns true
when an element was contained in both bloom filters. The false positive
rate is potentially a bit higher than in the original bloom filters.

This operation also works for counting bloom filters; however, the
counters are discarded and the bloom filters are converted to basic bloom
filters. The reason is that the counters have no obvious meaning when two
bloom filters are intersected - beyond indicating whether an element
was inserted at all.
This commit is contained in:
Johanna Amann 2022-01-20 13:34:07 +00:00 committed by Johanna Amann
parent aa58b6b37b
commit 796e18ecfc
9 changed files with 237 additions and 2 deletions

View file

@ -701,6 +701,49 @@ BloomFilterValPtr BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilt
return merged;
}
/**
 * Intersects two Bloom filter values and returns the result as a new value.
 *
 * Neither operand is modified. The element types of the two values must
 * match unless at least one of them is still unset, and both values must
 * wrap the same concrete Bloom filter implementation.
 *
 * @param x The first Bloom filter value.
 *
 * @param y The second Bloom filter value.
 *
 * @return The intersected Bloom filter value, or nullptr after reporting an
 * error if the operands are incompatible or the intersection fails.
 */
BloomFilterValPtr BloomFilterVal::Intersect(const BloomFilterVal* x, const BloomFilterVal* y)
	{
	if ( x->Type() && // any one 0 is ok here
	     y->Type() && ! same_type(x->Type(), y->Type()) )
		{
		reporter->Error("cannot intersect Bloom filters with different types");
		return nullptr;
		}

	// Whichever operand already carries an element type determines the
	// type of the result; this stays null if neither has one.
	auto final_type = x->Type() ? x->Type() : y->Type();

	// Compare the dynamic types of the wrapped filters; the concrete
	// Intersect() implementations reject a mismatch as well.
	if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) )
		{
		reporter->Error("cannot intersect different Bloom filter types");
		return nullptr;
		}

	auto intersected_bf = x->bloom_filter->Intersect(y->bloom_filter);

	if ( ! intersected_bf )
		{
		reporter->Error("failed to intersect Bloom filter");
		return nullptr;
		}

	auto intersected = make_intrusive<BloomFilterVal>(intersected_bf);

	if ( final_type && ! intersected->Typify(final_type) )
		{
		reporter->Error("Failed to set type on intersected bloom filter");
		return nullptr;
		}

	return intersected;
	}
BloomFilterVal::~BloomFilterVal()
{
delete hash;

View file

@ -335,6 +335,7 @@ public:
std::string InternalState() const;
static BloomFilterValPtr Merge(const BloomFilterVal* x, const BloomFilterVal* y);
// Intersects x and y into a newly created value without modifying either
// operand; returns nullptr (after reporting an error) if they are incompatible.
static BloomFilterValPtr Intersect(const BloomFilterVal* x, const BloomFilterVal* y);
protected:
friend class Val;

View file

@ -125,6 +125,31 @@ bool BasicBloomFilter::Merge(const BloomFilter* other)
return true;
}
/**
 * Builds the intersection of this filter and another basic Bloom filter.
 *
 * The result is a clone of this filter whose bit vector has been AND-ed
 * with the other filter's bit vector; this filter itself is not changed.
 *
 * @param other The filter to intersect with. It must have the same dynamic
 * type, an equal hasher and a bit vector of the same size.
 *
 * @return The newly created filter, or nullptr if the filters are
 * incompatible.
 */
BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const
	{
	// Only filters of the very same dynamic type can be combined; this also
	// makes the downcast below safe.
	if ( typeid(*this) != typeid(*other) )
		return nullptr;

	const auto* rhs = static_cast<const BasicBloomFilter*>(other);

	if ( ! hasher->Equals(rhs->hasher) )
		{
		reporter->Error("incompatible hashers in BasicBloomFilter intersect");
		return nullptr;
		}

	if ( bits->Size() != rhs->bits->Size() )
		{
		reporter->Error("different bitvector size in BasicBloomFilter intersect");
		return nullptr;
		}

	// Start from a copy of this filter and keep only the bits that are
	// also set in the other filter.
	BasicBloomFilter* result = Clone();
	*result->bits &= *rhs->bits;

	return result;
	}
BasicBloomFilter* BasicBloomFilter::Clone() const
{
BasicBloomFilter* copy = new BasicBloomFilter();
@ -249,6 +274,32 @@ bool CountingBloomFilter::Merge(const BloomFilter* other)
return true;
}
/**
 * Builds the intersection of this filter and another counting Bloom filter.
 *
 * The counters are not carried over: both counter vectors are converted to
 * bit vectors and the result is a basic Bloom filter holding the bitwise
 * AND of the two. This filter itself is not changed.
 *
 * @param other The filter to intersect with. It must have the same dynamic
 * type, an equal hasher and the same number of cells.
 *
 * @return The newly created basic Bloom filter, or nullptr if the filters
 * are incompatible.
 */
BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const
	{
	// Only filters of the very same dynamic type can be combined; this also
	// makes the downcast below safe.
	if ( typeid(*this) != typeid(*other) )
		return nullptr;

	const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

	if ( ! hasher->Equals(o->hasher) )
		{
		reporter->Error("incompatible hashers in CountingBloomFilter intersect");
		return nullptr;
		}

	else if ( cells->Size() != o->cells->Size() )
		{
		reporter->Error("different bitvector size in CountingBloomFilter intersect");
		return nullptr;
		}

	// The new basic filter gets its own clone of the hasher and one bit per
	// counter cell.
	auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());

	// Load this filter's cells as bits first, then keep only those that are
	// also set in the other filter.
	*outbf->bits |= cells->ToBitVector();
	*outbf->bits &= o->cells->ToBitVector();

	return outbf;
	}
CountingBloomFilter* CountingBloomFilter::Clone() const
{
CountingBloomFilter* copy = new CountingBloomFilter();

View file

@ -82,7 +82,7 @@ public:
virtual void Clear() = 0;
/**
* Merges another Bloom filter into a copy of this one.
* Merges another Bloom filter into this one.
*
* @param other The other Bloom filter.
*
@ -90,6 +90,15 @@ public:
*/
virtual bool Merge(const BloomFilter* other) = 0;
/**
 * Intersects this Bloom filter with another one and returns the result as
 * a new Bloom filter. Neither this filter nor the other one is modified.
 *
 * @param other The other Bloom filter.
 *
 * @return The intersected Bloom filter on success, nullptr otherwise.
 * NOTE(review): the result is returned as a raw pointer; presumably the
 * caller takes ownership of it - confirm against the callers.
 */
virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;
/**
* Constructs a copy of this Bloom filter.
*
@ -126,11 +135,14 @@ protected:
const detail::Hasher* hasher;
};
class CountingBloomFilter;
/**
* A basic Bloom filter.
*/
class BasicBloomFilter : public BloomFilter
{
friend class CountingBloomFilter;
public:
/**
* Constructs a basic Bloom filter with a given number of cells. The
@ -181,6 +193,7 @@ public:
void Clear() override;
bool Merge(const BloomFilter* other) override;
BasicBloomFilter* Clone() const override;
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
std::string InternalState() const override;
protected:
@ -233,6 +246,21 @@ public:
CountingBloomFilter* Clone() const override;
std::string InternalState() const override;
/**
 * Intersects another Bloom filter with this one and returns a new BasicBloomFilter.
 *
 * Please note that the intersection of two counting Bloom filters results in a
 * basic Bloom filter. The reason for this is that the counters lose their meaning
 * during the intersection process. The BasicBloomFilter will have bits set in cases
 * where both counting Bloom filters have cell values greater than zero.
 *
 * @param other The other Bloom filter.
 *
 * @return The intersected Bloom filter on success, nullptr otherwise.
 */
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
protected:
friend class BloomFilter;

View file

@ -142,6 +142,24 @@ CounterVector& CounterVector::Merge(const CounterVector& other)
return *this;
}
/**
 * Converts the counter vector into a bit vector with one bit per cell.
 *
 * A bit is set if and only if at least one bit of the corresponding
 * counter is set, i.e., if the counter is non-zero.
 *
 * @return The newly created bit vector; its size equals Size().
 */
BitVector CounterVector::ToBitVector() const
	{
	auto newbits = BitVector(Size());

	for ( size_t cell = 0; cell < Size(); ++cell )
		{
		// Index of the first bit belonging to this cell's counter.
		const size_t lsb = cell * width;
		bool set = false;

		// Inspect every bit of the counter. Indexing with a fixed offset
		// here would miss counter values that leave that one bit unset
		// and, for narrow counters, read a neighbouring cell.
		for ( size_t i = 0; i < width; ++i )
			{
			if ( (*bits)[lsb + i] )
				{
				set = true;
				break;
				}
			}

		newbits[cell] = set;
		}

	return newbits;
	}
CounterVector& CounterVector::operator|=(const CounterVector& other)
{
return Merge(other);

View file

@ -132,6 +132,14 @@ public:
*/
CounterVector& Merge(const CounterVector& other);
/**
 * Converts a counter vector into a BitVector. Each cell with a counter value
 * of 1 or more has its bit set in the BitVector; otherwise the bit remains unset.
 *
 * @return The newly created BitVector.
 */
BitVector ToBitVector() const;
/**
* An alias for ::Merge.
*/

View file

@ -160,6 +160,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
## Decrements the counter for an element that was added to a counting bloom filter in the past.
##
## Note that decrement operations can lead to false negatives if used on a counting bloom-filter
## that exceeded the width of its counter.
##
## bf: The counting bloom filter handle.
##
## x: The element to decrement
@ -247,7 +250,7 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
##
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear
## bloomfilter_clear bloomfilter_merge
function bloomfilter_merge%(bf1: opaque of bloomfilter,
bf2: opaque of bloomfilter%): opaque of bloomfilter
%{
@ -265,6 +268,44 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter,
return BloomFilterVal::Merge(bfv1, bfv2);
%}
## Intersects two Bloom filters.
##
## The resulting Bloom filter returns true when queried for elements
## that were contained in both Bloom filters. Note that intersected Bloom
## filters have a slightly higher probability of false positives than
## Bloom filters created from scratch.
##
## Please note that, while this function works with basic and with counting
## Bloom filters, the result is always a basic Bloom filter. That is, intersecting
## two counting Bloom filters will result in a basic Bloom filter. The reason
## for this is that there is no reasonable definition of how to handle counters
## during intersection.
##
## bf1: The first Bloom filter handle.
##
## bf2: The second Bloom filter handle.
##
## Returns: The intersection of *bf1* and *bf2*.
##
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
##    bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
##    bloomfilter_clear bloomfilter_merge
function bloomfilter_intersect%(bf1: opaque of bloomfilter,
bf2: opaque of bloomfilter%): opaque of bloomfilter
%{
// Cast both opaque handles down to BloomFilterVal to reach their element types.
const auto* bfv1 = static_cast<const BloomFilterVal*>(bf1);
const auto* bfv2 = static_cast<const BloomFilterVal*>(bf2);
// Only reject the pair if both filters already carry an element type and
// those types differ; a filter without a type is compatible with any type.
if ( bfv1->Type() && // any one 0 is ok here
bfv2->Type() &&
! same_type(bfv1->Type(), bfv2->Type()) )
{
reporter->Error("incompatible Bloom filter types");
return nullptr;
}
// The remaining compatibility checks and the intersection itself are
// delegated to BloomFilterVal::Intersect.
return BloomFilterVal::Intersect(bfv1, bfv2);
%}
## Returns a string with a representation of a Bloom filter's internal
## state. This is for debugging/testing purposes only.
##

View file

@ -1,28 +1,46 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
basic
0
1
1
0
alternative constructor
1
1
basicstrings
1
1
1, fp
edgecases
merging
1
1
1
1
intersect
1
0
0
0
empty filter
0
1
counting
1
2
3
3
2
3
counting merge
3
3
2
counting intersect
1
0
0
counting decrement
2
1
T

View file

@ -6,6 +6,7 @@
function test_basic_bloom_filter()
{
# Basic usage with counts.
print "basic";
local bf_cnt = bloomfilter_basic_init(0.1, 1000);
bloomfilter_add(bf_cnt, 42);
bloomfilter_add(bf_cnt, 84);
@ -18,6 +19,7 @@ function test_basic_bloom_filter()
bloomfilter_add(bf_cnt, "foo"); # Type mismatch
# Alternative constructor.
print "alternative constructor";
local bf_dbl = bloomfilter_basic_init2(4, 10);
bloomfilter_add(bf_dbl, 4.2);
bloomfilter_add(bf_dbl, 3.14);
@ -25,6 +27,7 @@ function test_basic_bloom_filter()
print bloomfilter_lookup(bf_dbl, 3.14);
# Basic usage with strings.
print "basicstrings";
local bf_str = bloomfilter_basic_init(0.9, 10);
bloomfilter_add(bf_str, "foo");
bloomfilter_add(bf_str, "bar");
@ -36,12 +39,14 @@ function test_basic_bloom_filter()
bloomfilter_add(bf_str, 100); # Type mismatch
# Edge cases.
print "edgecases";
local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1);
local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000);
local bf_edge2 = bloomfilter_basic_init(0.9999999, 1);
local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000);
# Merging
print "merging";
local bf_cnt2 = bloomfilter_basic_init(0.1, 1000);
bloomfilter_add(bf_cnt2, 42);
bloomfilter_add(bf_cnt, 100);
@ -51,7 +56,16 @@ function test_basic_bloom_filter()
print bloomfilter_lookup(bf_merged, 100);
print bloomfilter_lookup(bf_merged, 168);
#Intersection
print "intersect";
local bf_intersected = bloomfilter_intersect(bf_cnt, bf_cnt2);
print bloomfilter_lookup(bf_intersected, 42);
print bloomfilter_lookup(bf_intersected, 84);
print bloomfilter_lookup(bf_intersected, 100);
print bloomfilter_lookup(bf_intersected, 168);
#empty filter tests
print "empty filter";
local bf_empty = bloomfilter_basic_init(0.1, 1000);
print bloomfilter_lookup(bf_empty, 42);
local bf_empty_merged = bloomfilter_merge(bf_merged, bf_empty);
@ -73,6 +87,7 @@ function test_bad_param2()
function test_counting_bloom_filter()
{
print "counting";
local bf = bloomfilter_counting_init(3, 32, 3);
bloomfilter_add(bf, "foo");
print bloomfilter_lookup(bf, "foo"); # 1
@ -90,6 +105,7 @@ function test_counting_bloom_filter()
print bloomfilter_lookup(bf, "foo"); # still 3
# Merging
print "counting merge";
local bf2 = bloomfilter_counting_init(3, 32, 3);
bloomfilter_add(bf2, "baz");
bloomfilter_add(bf2, "baz");
@ -99,6 +115,17 @@ function test_counting_bloom_filter()
print bloomfilter_lookup(bf_merged, "bar");
print bloomfilter_lookup(bf_merged, "baz");
# Intersect
print "counting intersect";
bloomfilter_add(bf2, "foo");
bloomfilter_add(bf2, "foo");
local bf_intersected = bloomfilter_intersect(bf, bf2);
print bloomfilter_lookup(bf_intersected, "foo");
print bloomfilter_lookup(bf_intersected, "bar");
print bloomfilter_lookup(bf_intersected, "baz");
# Decrement
print "counting decrement";
bloomfilter_decrement(bf, "foo");
print bloomfilter_lookup(bf, "foo"); # 2
bloomfilter_decrement(bf, "foo");