Add intersect operation for bloom filters

Intersecting two bloom filters yields a bloom filter that returns true
when an element was contained in both bloom filters. The false positive
rate is potentially a bit higher than in the original bloom filters.

This operation also works for counting bloom filters; however, the
counters are discarded and the bloom filters are converted to basic bloom
filters. The reason is that the counters have no obvious meaning once
two bloom filters are intersected - beyond indicating whether an element
was inserted at all.
This commit is contained in:
Johanna Amann 2022-01-20 13:34:07 +00:00 committed by Johanna Amann
parent aa58b6b37b
commit 796e18ecfc
9 changed files with 237 additions and 2 deletions

View file

@ -701,6 +701,49 @@ BloomFilterValPtr BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilt
return merged; return merged;
} }
/**
 * Intersects two Bloom filter values and returns the result as a new value.
 *
 * The element types of both operands must match unless at least one of them
 * is still untyped, and both must wrap the same concrete Bloom filter class.
 * The actual bit-level work is delegated to BloomFilter::Intersect().
 *
 * @param x The first Bloom filter value.
 *
 * @param y The second Bloom filter value.
 *
 * @return The intersected Bloom filter value, or nullptr if the operands
 * are incompatible or the intersection fails (an error is reported).
 */
BloomFilterValPtr BloomFilterVal::Intersect(const BloomFilterVal* x, const BloomFilterVal* y)
{
    if ( x->Type() && // any one 0 is ok here
         y->Type() && ! same_type(x->Type(), y->Type()) )
    {
        reporter->Error("cannot intersect Bloom filters with different types");
        return nullptr;
    }

    // Whichever operand already carries an element type determines the
    // type of the result; it stays untyped if neither has one.
    auto final_type = x->Type() ? x->Type() : y->Type();

    // Basic and counting filters cannot be mixed. This check was previously
    // duplicated; a single test is sufficient.
    if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) )
    {
        reporter->Error("cannot intersect different Bloom filter types");
        return nullptr;
    }

    auto intersected_bf = x->bloom_filter->Intersect(y->bloom_filter);

    if ( ! intersected_bf )
    {
        reporter->Error("failed to intersect Bloom filter");
        return nullptr;
    }

    auto intersected = make_intrusive<BloomFilterVal>(intersected_bf);

    if ( final_type && ! intersected->Typify(final_type) )
    {
        reporter->Error("Failed to set type on intersected bloom filter");
        return nullptr;
    }

    return intersected;
}
BloomFilterVal::~BloomFilterVal() BloomFilterVal::~BloomFilterVal()
{ {
delete hash; delete hash;

View file

@ -335,6 +335,7 @@ public:
std::string InternalState() const; std::string InternalState() const;
static BloomFilterValPtr Merge(const BloomFilterVal* x, const BloomFilterVal* y); static BloomFilterValPtr Merge(const BloomFilterVal* x, const BloomFilterVal* y);
static BloomFilterValPtr Intersect(const BloomFilterVal* x, const BloomFilterVal* y);
protected: protected:
friend class Val; friend class Val;

View file

@ -125,6 +125,31 @@ bool BasicBloomFilter::Merge(const BloomFilter* other)
return true; return true;
} }
/**
 * Builds a new basic Bloom filter holding the bitwise AND of this filter
 * and another basic Bloom filter. Neither operand is changed.
 *
 * @param other The filter to intersect with; it must be a BasicBloomFilter
 * using an equal hasher and the same bit vector size.
 *
 * @return A clone of this filter with its bits AND-ed with those of
 * *other*, or nullptr if the filters are not compatible.
 */
BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const
{
    if ( typeid(*this) != typeid(*other) )
        return nullptr;

    // Safe after the dynamic type check above.
    const auto* rhs = static_cast<const BasicBloomFilter*>(other);

    if ( ! hasher->Equals(rhs->hasher) )
    {
        reporter->Error("incompatible hashers in BasicBloomFilter intersect");
        return nullptr;
    }

    if ( bits->Size() != rhs->bits->Size() )
    {
        reporter->Error("different bitvector size in BasicBloomFilter intersect");
        return nullptr;
    }

    // Work on a copy so that this filter stays untouched.
    BasicBloomFilter* result = Clone();
    *result->bits &= *rhs->bits;

    return result;
}
BasicBloomFilter* BasicBloomFilter::Clone() const BasicBloomFilter* BasicBloomFilter::Clone() const
{ {
BasicBloomFilter* copy = new BasicBloomFilter(); BasicBloomFilter* copy = new BasicBloomFilter();
@ -249,6 +274,32 @@ bool CountingBloomFilter::Merge(const BloomFilter* other)
return true; return true;
} }
/**
 * Intersects this counting Bloom filter with another one. The counters are
 * discarded: the result is a basic Bloom filter built from the bit-vector
 * views of both counter vectors (see CounterVector::ToBitVector()).
 *
 * @param other The filter to intersect with; it must be a
 * CountingBloomFilter using an equal hasher and the same number of cells.
 *
 * @return A newly allocated BasicBloomFilter, or nullptr if the filters are
 * not compatible.
 */
BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const
{
    if ( typeid(*this) != typeid(*other) )
        return nullptr;

    // Safe after the dynamic type check above.
    const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

    if ( ! hasher->Equals(o->hasher) )
    {
        reporter->Error("incompatible hashers in CountingBloomFilter intersect");
        return nullptr;
    }
    else if ( cells->Size() != o->cells->Size() )
    {
        reporter->Error("different bitvector size in CountingBloomFilter intersect");
        return nullptr;
    }

    auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());

    // OR-ing into the fresh filter copies this filter's occupancy bits
    // (assumes a newly constructed BasicBloomFilter has all bits cleared -
    // TODO confirm); AND-ing then keeps only cells set in both operands.
    *outbf->bits |= cells->ToBitVector();
    *outbf->bits &= o->cells->ToBitVector();

    return outbf;
}
CountingBloomFilter* CountingBloomFilter::Clone() const CountingBloomFilter* CountingBloomFilter::Clone() const
{ {
CountingBloomFilter* copy = new CountingBloomFilter(); CountingBloomFilter* copy = new CountingBloomFilter();

View file

@ -82,7 +82,7 @@ public:
virtual void Clear() = 0; virtual void Clear() = 0;
/** /**
* Merges another Bloom filter into a copy of this one. * Merges another Bloom filter into this one.
* *
* @param other The other Bloom filter. * @param other The other Bloom filter.
* *
@ -90,6 +90,15 @@ public:
*/ */
virtual bool Merge(const BloomFilter* other) = 0; virtual bool Merge(const BloomFilter* other) = 0;
/**
 * Intersects another Bloom filter with a copy of this one and returns the copy.
 * The method is const and takes a const argument, so neither this filter nor
 * *other* is modified.
 *
 * @param other The other Bloom filter.
 *
 * @return Intersecting BloomFilter on success, nullptr otherwise.
 */
virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;
/** /**
* Constructs a copy of this Bloom filter. * Constructs a copy of this Bloom filter.
* *
@ -126,11 +135,14 @@ protected:
const detail::Hasher* hasher; const detail::Hasher* hasher;
}; };
class CountingBloomFilter;
/** /**
* A basic Bloom filter. * A basic Bloom filter.
*/ */
class BasicBloomFilter : public BloomFilter class BasicBloomFilter : public BloomFilter
{ {
friend class CountingBloomFilter;
public: public:
/** /**
* Constructs a basic Bloom filter with a given number of cells. The * Constructs a basic Bloom filter with a given number of cells. The
@ -181,6 +193,7 @@ public:
void Clear() override; void Clear() override;
bool Merge(const BloomFilter* other) override; bool Merge(const BloomFilter* other) override;
BasicBloomFilter* Clone() const override; BasicBloomFilter* Clone() const override;
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
std::string InternalState() const override; std::string InternalState() const override;
protected: protected:
@ -233,6 +246,21 @@ public:
CountingBloomFilter* Clone() const override; CountingBloomFilter* Clone() const override;
std::string InternalState() const override; std::string InternalState() const override;
/**
 * Intersects another Bloom filter with this one and returns a new BasicBloomFilter.
 *
 * Please note that the intersection of two counting Bloom filters results in a
 * basic Bloom filter. The reason for this is that the counters lose meaning during
 * the intersection process. The BasicBloomFilter will have bits set in cases where
 * both counting Bloom filters have cell values greater than zero.
 *
 * @param other The other Bloom filter.
 *
 * @return Intersecting BloomFilter on success, nullptr otherwise.
 */
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
protected: protected:
friend class BloomFilter; friend class BloomFilter;

View file

@ -142,6 +142,24 @@ CounterVector& CounterVector::Merge(const CounterVector& other)
return *this; return *this;
} }
/**
 * Collapses the counter vector into a plain bit vector with one bit per
 * cell. A bit is set if any of the `width` bits that make up the
 * corresponding counter is set, i.e. if the counter is non-zero.
 *
 * @return The newly created BitVector with Size() bits.
 */
BitVector CounterVector::ToBitVector() const
{
    auto newbits = BitVector(Size());

    for ( size_t cell = 0; cell < Size(); ++cell )
    {
        // Index of the first bit belonging to this cell's counter.
        const size_t lsb = cell * width;
        bool set = false;

        // Inspect every bit of the counter. The index must advance with i;
        // a constant offset would test only a single bit of each counter
        // and miss counter values that do not have that bit set.
        for ( size_t i = 0; i < width; ++i )
            set |= (*bits)[lsb + i];

        newbits[cell] = set;
    }

    return newbits;
}
CounterVector& CounterVector::operator|=(const CounterVector& other) CounterVector& CounterVector::operator|=(const CounterVector& other)
{ {
return Merge(other); return Merge(other);

View file

@ -132,6 +132,14 @@ public:
*/ */
CounterVector& Merge(const CounterVector& other); CounterVector& Merge(const CounterVector& other);
/**
 * Converts a counter vector into a BitVector. Each cell that holds a value
 * of 1 or more is set in the BitVector; otherwise the bit remains unset.
 *
 * @return The newly created BitVector.
 */
BitVector ToBitVector() const;
/** /**
* An alias for ::Merge. * An alias for ::Merge.
*/ */

View file

@ -160,6 +160,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
## Decrements the counter for an element that was added to a counting bloom filter in the past. ## Decrements the counter for an element that was added to a counting bloom filter in the past.
## ##
## Note that decrement operations can lead to false negatives if used on a counting bloom-filter
## that exceeded the width of its counter.
##
## bf: The counting bloom filter handle. ## bf: The counting bloom filter handle.
## ##
## x: The element to decrement ## x: The element to decrement
@ -247,7 +250,7 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
## ##
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2 ## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup ## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear ## bloomfilter_clear bloomfilter_merge
function bloomfilter_merge%(bf1: opaque of bloomfilter, function bloomfilter_merge%(bf1: opaque of bloomfilter,
bf2: opaque of bloomfilter%): opaque of bloomfilter bf2: opaque of bloomfilter%): opaque of bloomfilter
%{ %{
@ -265,6 +268,44 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter,
return BloomFilterVal::Merge(bfv1, bfv2); return BloomFilterVal::Merge(bfv1, bfv2);
%} %}
## Computes the intersection of two Bloom filters.
##
## Looking up an element in the resulting Bloom filter yields true if the
## element was contained in both input filters. Note that an intersected
## Bloom filter has a slightly higher probability of false positives than
## a Bloom filter created from scratch.
##
## This function accepts both basic and counting Bloom filters, but the
## result is always a basic Bloom filter. In particular, intersecting two
## counting Bloom filters yields a basic Bloom filter, because there is no
## reasonable definition of how counters should be handled during
## intersection.
##
## bf1: The first Bloom filter handle.
##
## bf2: The second Bloom filter handle.
##
## Returns: The intersection of *bf1* and *bf2*.
##
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
##    bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
##    bloomfilter_clear bloomfilter_merge
function bloomfilter_intersect%(bf1: opaque of bloomfilter,
                                bf2: opaque of bloomfilter%): opaque of bloomfilter
	%{
	const auto* lhs = static_cast<const BloomFilterVal*>(bf1);
	const auto* rhs = static_cast<const BloomFilterVal*>(bf2);

	// A filter without an element type is compatible with any other filter,
	// so types are only compared when both sides have one.
	const bool both_typed = lhs->Type() && rhs->Type();

	if ( both_typed && ! same_type(lhs->Type(), rhs->Type()) )
		{
		reporter->Error("incompatible Bloom filter types");
		return nullptr;
		}

	return BloomFilterVal::Intersect(lhs, rhs);
	%}
## Returns a string with a representation of a Bloom filter's internal ## Returns a string with a representation of a Bloom filter's internal
## state. This is for debugging/testing purposes only. ## state. This is for debugging/testing purposes only.
## ##

View file

@ -1,28 +1,46 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
basic
0 0
1 1
1 1
0 0
alternative constructor
1 1
1 1
basicstrings
1 1
1 1
1, fp 1, fp
edgecases
merging
1 1
1 1
1 1
1 1
intersect
1
0
0
0
empty filter
0 0
1 1
counting
1 1
2 2
3 3
3 3
2 2
3 3
counting merge
3 3
3 3
2 2
counting intersect
1
0
0
counting decrement
2 2
1 1
T T

View file

@ -6,6 +6,7 @@
function test_basic_bloom_filter() function test_basic_bloom_filter()
{ {
# Basic usage with counts. # Basic usage with counts.
print "basic";
local bf_cnt = bloomfilter_basic_init(0.1, 1000); local bf_cnt = bloomfilter_basic_init(0.1, 1000);
bloomfilter_add(bf_cnt, 42); bloomfilter_add(bf_cnt, 42);
bloomfilter_add(bf_cnt, 84); bloomfilter_add(bf_cnt, 84);
@ -18,6 +19,7 @@ function test_basic_bloom_filter()
bloomfilter_add(bf_cnt, "foo"); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch
# Alternative constructor. # Alternative constructor.
print "alternative constructor";
local bf_dbl = bloomfilter_basic_init2(4, 10); local bf_dbl = bloomfilter_basic_init2(4, 10);
bloomfilter_add(bf_dbl, 4.2); bloomfilter_add(bf_dbl, 4.2);
bloomfilter_add(bf_dbl, 3.14); bloomfilter_add(bf_dbl, 3.14);
@ -25,6 +27,7 @@ function test_basic_bloom_filter()
print bloomfilter_lookup(bf_dbl, 3.14); print bloomfilter_lookup(bf_dbl, 3.14);
# Basic usage with strings. # Basic usage with strings.
print "basicstrings";
local bf_str = bloomfilter_basic_init(0.9, 10); local bf_str = bloomfilter_basic_init(0.9, 10);
bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "foo");
bloomfilter_add(bf_str, "bar"); bloomfilter_add(bf_str, "bar");
@ -36,12 +39,14 @@ function test_basic_bloom_filter()
bloomfilter_add(bf_str, 100); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch
# Edge cases. # Edge cases.
print "edgecases";
local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1); local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1);
local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000); local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000);
local bf_edge2 = bloomfilter_basic_init(0.9999999, 1); local bf_edge2 = bloomfilter_basic_init(0.9999999, 1);
local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000); local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000);
# Merging # Merging
print "merging";
local bf_cnt2 = bloomfilter_basic_init(0.1, 1000); local bf_cnt2 = bloomfilter_basic_init(0.1, 1000);
bloomfilter_add(bf_cnt2, 42); bloomfilter_add(bf_cnt2, 42);
bloomfilter_add(bf_cnt, 100); bloomfilter_add(bf_cnt, 100);
@ -51,7 +56,16 @@ function test_basic_bloom_filter()
print bloomfilter_lookup(bf_merged, 100); print bloomfilter_lookup(bf_merged, 100);
print bloomfilter_lookup(bf_merged, 168); print bloomfilter_lookup(bf_merged, 168);
#Intersection
print "intersect";
local bf_intersected = bloomfilter_intersect(bf_cnt, bf_cnt2);
print bloomfilter_lookup(bf_intersected, 42);
print bloomfilter_lookup(bf_intersected, 84);
print bloomfilter_lookup(bf_intersected, 100);
print bloomfilter_lookup(bf_intersected, 168);
#empty filter tests #empty filter tests
print "empty filter";
local bf_empty = bloomfilter_basic_init(0.1, 1000); local bf_empty = bloomfilter_basic_init(0.1, 1000);
print bloomfilter_lookup(bf_empty, 42); print bloomfilter_lookup(bf_empty, 42);
local bf_empty_merged = bloomfilter_merge(bf_merged, bf_empty); local bf_empty_merged = bloomfilter_merge(bf_merged, bf_empty);
@ -73,6 +87,7 @@ function test_bad_param2()
function test_counting_bloom_filter() function test_counting_bloom_filter()
{ {
print "counting";
local bf = bloomfilter_counting_init(3, 32, 3); local bf = bloomfilter_counting_init(3, 32, 3);
bloomfilter_add(bf, "foo"); bloomfilter_add(bf, "foo");
print bloomfilter_lookup(bf, "foo"); # 1 print bloomfilter_lookup(bf, "foo"); # 1
@ -90,6 +105,7 @@ function test_counting_bloom_filter()
print bloomfilter_lookup(bf, "foo"); # still 3 print bloomfilter_lookup(bf, "foo"); # still 3
# Merging # Merging
print "counting merge";
local bf2 = bloomfilter_counting_init(3, 32, 3); local bf2 = bloomfilter_counting_init(3, 32, 3);
bloomfilter_add(bf2, "baz"); bloomfilter_add(bf2, "baz");
bloomfilter_add(bf2, "baz"); bloomfilter_add(bf2, "baz");
@ -99,6 +115,17 @@ function test_counting_bloom_filter()
print bloomfilter_lookup(bf_merged, "bar"); print bloomfilter_lookup(bf_merged, "bar");
print bloomfilter_lookup(bf_merged, "baz"); print bloomfilter_lookup(bf_merged, "baz");
# Intersect
print "counting intersect";
bloomfilter_add(bf2, "foo");
bloomfilter_add(bf2, "foo");
local bf_intersected = bloomfilter_intersect(bf, bf2);
print bloomfilter_lookup(bf_intersected, "foo");
print bloomfilter_lookup(bf_intersected, "bar");
print bloomfilter_lookup(bf_intersected, "baz");
# Decrement
print "counting decrement";
bloomfilter_decrement(bf, "foo"); bloomfilter_decrement(bf, "foo");
print bloomfilter_lookup(bf, "foo"); # 2 print bloomfilter_lookup(bf, "foo"); # 2
bloomfilter_decrement(bf, "foo"); bloomfilter_decrement(bf, "foo");