mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Add intersect operation for bloom filters
Intersecting two bloom filters yields a bloom filter that returns true when an element was contained in both bloom filters. The false positive rate is potentially a bit higher than in the original bloom filters. This operation also works for counting bloom filters; however, the counters are discarded and the bloom filters are converted to basic bloom filters. The reason is that the counters have no obvious meaning when two bloom filters are intersected, beyond indicating whether an element was inserted at all.
This commit is contained in:
parent
aa58b6b37b
commit
796e18ecfc
9 changed files with 237 additions and 2 deletions
|
@ -701,6 +701,49 @@ BloomFilterValPtr BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilt
|
|||
return merged;
|
||||
}
|
||||
|
||||
// Builds a new Bloom filter value holding the intersection of two filters.
//
// Neither input is modified (both are taken as pointers to const). If the two
// inputs both carry an element type, those types must match; if only one of
// them is typed, that type is applied to the result.
//
// x: The first Bloom filter value.
// y: The second Bloom filter value.
//
// Returns the intersected filter value, or nullptr after reporting an error
// when the element types differ, the underlying filter classes differ, the
// underlying intersection fails, or the result cannot be typified.
BloomFilterValPtr BloomFilterVal::Intersect(const BloomFilterVal* x, const BloomFilterVal* y)
	{
	if ( x->Type() && // any one 0 is ok here
	     y->Type() && ! same_type(x->Type(), y->Type()) )
		{
		reporter->Error("cannot intersect Bloom filters with different types");
		return nullptr;
		}

	// Prefer x's element type; fall back to y's when x is untyped.
	auto final_type = x->Type() ? x->Type() : y->Type();

	// The underlying filters must be of the same concrete class before they
	// are handed to the filter-level Intersect().
	if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) )
		{
		reporter->Error("cannot intersect different Bloom filter types");
		return nullptr;
		}

	auto intersected_bf = x->bloom_filter->Intersect(y->bloom_filter);

	if ( ! intersected_bf )
		{
		reporter->Error("failed to intersect Bloom filter");
		return nullptr;
		}

	// NOTE(review): the new value presumably takes ownership of
	// intersected_bf -- confirm against the BloomFilterVal constructor.
	auto intersected = make_intrusive<BloomFilterVal>(intersected_bf);

	if ( final_type && ! intersected->Typify(final_type) )
		{
		reporter->Error("Failed to set type on intersected bloom filter");
		return nullptr;
		}

	return intersected;
	}
|
||||
|
||||
|
||||
BloomFilterVal::~BloomFilterVal()
|
||||
{
|
||||
delete hash;
|
||||
|
|
|
@ -335,6 +335,7 @@ public:
|
|||
std::string InternalState() const;
|
||||
|
||||
static BloomFilterValPtr Merge(const BloomFilterVal* x, const BloomFilterVal* y);
|
||||
static BloomFilterValPtr Intersect(const BloomFilterVal* x, const BloomFilterVal* y);
|
||||
|
||||
protected:
|
||||
friend class Val;
|
||||
|
|
|
@ -125,6 +125,31 @@ bool BasicBloomFilter::Merge(const BloomFilter* other)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Intersects this filter with another basic Bloom filter without modifying
// either one.
//
// other: The filter to intersect with; must be of the same dynamic type as
// this one, use an equal hasher, and have the same bit vector size.
//
// Returns a clone of this filter whose bits have been ANDed with those of
// `other`, or nullptr if the two filters are not compatible.
BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const
	{
	if ( typeid(*this) != typeid(*other) )
		return nullptr;

	// Safe downcast: the dynamic types were just verified to be identical.
	const BasicBloomFilter& rhs = *static_cast<const BasicBloomFilter*>(other);

	if ( ! hasher->Equals(rhs.hasher) )
		{
		reporter->Error("incompatible hashers in BasicBloomFilter intersect");
		return nullptr;
		}

	if ( bits->Size() != rhs.bits->Size() )
		{
		reporter->Error("different bitvector size in BasicBloomFilter intersect");
		return nullptr;
		}

	// Work on a clone so that this filter stays untouched.
	BasicBloomFilter* result = Clone();
	*result->bits &= *rhs.bits;

	return result;
	}
|
||||
|
||||
BasicBloomFilter* BasicBloomFilter::Clone() const
|
||||
{
|
||||
BasicBloomFilter* copy = new BasicBloomFilter();
|
||||
|
@ -249,6 +274,32 @@ bool CountingBloomFilter::Merge(const BloomFilter* other)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Intersects this counting Bloom filter with another one and returns the
// result as a newly allocated BasicBloomFilter; the counter values themselves
// are not carried over.
//
// other: The filter to intersect with; must be of the same dynamic type as
// this one, use an equal hasher, and have the same number of cells.
//
// Returns a basic Bloom filter built from the bit vectors of both filters'
// cells, or nullptr if the two filters are not compatible.
BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const
	{
	if ( typeid(*this) != typeid(*other) )
		return nullptr;

	// Safe downcast: the dynamic types were just verified to be identical.
	const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

	if ( ! hasher->Equals(o->hasher) )
		{
		reporter->Error("incompatible hashers in CountingBloomFilter intersect");
		return nullptr;
		}

	else if ( cells->Size() != o->cells->Size() )
		{
		reporter->Error("different bitvector size in CountingBloomFilter intersect");
		return nullptr;
		}

	auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());

	// OR this filter's cell bits into the new filter (presumably all clear
	// right after construction -- confirm), then AND with the other filter's
	// cell bits so that only positions set in both remain.
	*outbf->bits |= cells->ToBitVector();
	*outbf->bits &= o->cells->ToBitVector();

	return outbf;
	}
|
||||
|
||||
CountingBloomFilter* CountingBloomFilter::Clone() const
|
||||
{
|
||||
CountingBloomFilter* copy = new CountingBloomFilter();
|
||||
|
|
|
@ -82,7 +82,7 @@ public:
|
|||
virtual void Clear() = 0;
|
||||
|
||||
/**
|
||||
* Merges another Bloom filter into a copy of this one.
|
||||
* Merges another Bloom filter into this one.
|
||||
*
|
||||
* @param other The other Bloom filter.
|
||||
*
|
||||
|
@ -90,6 +90,15 @@ public:
|
|||
*/
|
||||
virtual bool Merge(const BloomFilter* other) = 0;
|
||||
|
||||
/**
|
||||
* Intersects another Bloom filter with a copy of this one and returns the copy.
|
||||
*
|
||||
* @param other The other Bloom filter.
|
||||
*
|
||||
* @return Intersecting BloomFilter on success, nullptr otherwise.
|
||||
*/
|
||||
virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;
|
||||
|
||||
/**
|
||||
* Constructs a copy of this Bloom filter.
|
||||
*
|
||||
|
@ -126,11 +135,14 @@ protected:
|
|||
const detail::Hasher* hasher;
|
||||
};
|
||||
|
||||
class CountingBloomFilter;
|
||||
|
||||
/**
|
||||
* A basic Bloom filter.
|
||||
*/
|
||||
class BasicBloomFilter : public BloomFilter
|
||||
{
|
||||
friend class CountingBloomFilter;
|
||||
public:
|
||||
/**
|
||||
* Constructs a basic Bloom filter with a given number of cells. The
|
||||
|
@ -181,6 +193,7 @@ public:
|
|||
void Clear() override;
|
||||
bool Merge(const BloomFilter* other) override;
|
||||
BasicBloomFilter* Clone() const override;
|
||||
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
|
||||
std::string InternalState() const override;
|
||||
|
||||
protected:
|
||||
|
@ -233,6 +246,21 @@ public:
|
|||
CountingBloomFilter* Clone() const override;
|
||||
std::string InternalState() const override;
|
||||
|
||||
/**
|
||||
* Intersects another Bloom filter with this one and returns a new BasicBloomFilter.
|
||||
*
|
||||
* Please note that the Intersection of two Counting bloom filters results in a
|
||||
* basic bloom filter. The reason for this is that the counters lose meaning during
|
||||
* the intersection process. The BasicBloomFilter will have bits set in cases where
|
||||
* both Counting Bloom filters have cell values greater than zero.
|
||||
*
|
||||
* @param other The other Bloom filter.
|
||||
*
|
||||
* @return Intersecting BloomFilter on success, nullptr otherwise.
|
||||
*/
|
||||
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
|
||||
|
||||
|
||||
protected:
|
||||
friend class BloomFilter;
|
||||
|
||||
|
|
|
@ -142,6 +142,24 @@ CounterVector& CounterVector::Merge(const CounterVector& other)
|
|||
return *this;
|
||||
}
|
||||
|
||||
// Collapses the counter vector into a bit vector with one bit per cell.
//
// A cell occupies `width` consecutive bits starting at `cell * width`. The
// resulting bit for a cell is set if any of those bits is set, and unset
// otherwise.
//
// Returns the newly created BitVector, which has Size() entries.
BitVector CounterVector::ToBitVector() const
	{
	auto newbits = BitVector(Size());

	for ( size_t cell = 0; cell < Size(); ++cell )
		{
		size_t lsb = cell * width;
		bool set = false;

		// Every bit of the cell has to be inspected. Indexing with a fixed
		// offset instead of the loop variable would test a single bit only
		// and could read past the cell.
		for ( size_t i = 0; i < width; ++i )
			set |= (*bits)[lsb + i];

		newbits[cell] = set;
		}

	return newbits;
	}
|
||||
|
||||
CounterVector& CounterVector::operator|=(const CounterVector& other)
|
||||
{
|
||||
return Merge(other);
|
||||
|
|
|
@ -132,6 +132,14 @@ public:
|
|||
*/
|
||||
CounterVector& Merge(const CounterVector& other);
|
||||
|
||||
/**
|
||||
* Converts a counter vector into a BitVector. Each cell that has a value
|
||||
* of 1 or more is set in the BitVector; otherwise the bit remains unset.
|
||||
*
|
||||
* @return The newly created BitVector
|
||||
*/
|
||||
BitVector ToBitVector() const;
|
||||
|
||||
/**
|
||||
* An alias for ::Merge.
|
||||
*/
|
||||
|
|
|
@ -160,6 +160,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
|
|||
|
||||
## Decrements the counter for an element that was added to a counting bloom filter in the past.
|
||||
##
|
||||
## Note that decrement operations can lead to false negatives if used on a counting bloom-filter
|
||||
## that exceeded the width of its counter.
|
||||
##
|
||||
## bf: The counting bloom filter handle.
|
||||
##
|
||||
## x: The element to decrement
|
||||
|
@ -247,7 +250,7 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any
|
|||
##
|
||||
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
|
||||
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
|
||||
## bloomfilter_clear
|
||||
## bloomfilter_clear bloomfilter_merge
|
||||
function bloomfilter_merge%(bf1: opaque of bloomfilter,
|
||||
bf2: opaque of bloomfilter%): opaque of bloomfilter
|
||||
%{
|
||||
|
@ -265,6 +268,44 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter,
|
|||
return BloomFilterVal::Merge(bfv1, bfv2);
|
||||
%}
|
||||
|
||||
## Intersects two Bloom filters.
|
||||
##
|
||||
## The resulting Bloom filter returns true when queried for elements
|
||||
## that were contained in both bloom filters. Note that intersected Bloom
|
||||
## filters have a slightly higher probability of false positives than
|
||||
## Bloom filters created from scratch.
|
||||
##
|
||||
## Please note that, while this function works with basic and with counting
|
||||
## bloom filters, the result always is a basic bloom filter. So - intersecting
|
||||
## two counting bloom filters will result in a basic bloom filter. The reason
|
||||
## for this is that there is no reasonable definition of how to handle counters
|
||||
## during intersection.
|
||||
##
|
||||
## bf1: The first Bloom filter handle.
|
||||
##
|
||||
## bf2: The second Bloom filter handle.
|
||||
##
|
||||
## Returns: The intersection of *bf1* and *bf2*.
##
|
||||
## .. zeek:see:: bloomfilter_basic_init bloomfilter_basic_init2
|
||||
## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
|
||||
## bloomfilter_clear bloomfilter_merge
|
||||
function bloomfilter_intersect%(bf1: opaque of bloomfilter,
                                bf2: opaque of bloomfilter%): opaque of bloomfilter
	%{
	const auto* lhs = static_cast<const BloomFilterVal*>(bf1);
	const auto* rhs = static_cast<const BloomFilterVal*>(bf2);

	// A filter without an element type is compatible with any other
	// filter, so the types are compared only when both sides have one.
	const bool both_typed = lhs->Type() && rhs->Type();

	if ( both_typed && ! same_type(lhs->Type(), rhs->Type()) )
		{
		reporter->Error("incompatible Bloom filter types");
		return nullptr;
		}

	return BloomFilterVal::Intersect(lhs, rhs);
	%}
|
||||
|
||||
## Returns a string with a representation of a Bloom filter's internal
|
||||
## state. This is for debugging/testing purposes only.
|
||||
##
|
||||
|
|
|
@ -1,28 +1,46 @@
|
|||
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
|
||||
basic
|
||||
0
|
||||
1
|
||||
1
|
||||
0
|
||||
alternative constructor
|
||||
1
|
||||
1
|
||||
basicstrings
|
||||
1
|
||||
1
|
||||
1, fp
|
||||
edgecases
|
||||
merging
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
intersect
|
||||
1
|
||||
0
|
||||
0
|
||||
0
|
||||
empty filter
|
||||
0
|
||||
1
|
||||
counting
|
||||
1
|
||||
2
|
||||
3
|
||||
3
|
||||
2
|
||||
3
|
||||
counting merge
|
||||
3
|
||||
3
|
||||
2
|
||||
counting intersect
|
||||
1
|
||||
0
|
||||
0
|
||||
counting decrement
|
||||
2
|
||||
1
|
||||
T
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
function test_basic_bloom_filter()
|
||||
{
|
||||
# Basic usage with counts.
|
||||
print "basic";
|
||||
local bf_cnt = bloomfilter_basic_init(0.1, 1000);
|
||||
bloomfilter_add(bf_cnt, 42);
|
||||
bloomfilter_add(bf_cnt, 84);
|
||||
|
@ -18,6 +19,7 @@ function test_basic_bloom_filter()
|
|||
bloomfilter_add(bf_cnt, "foo"); # Type mismatch
|
||||
|
||||
# Alternative constructor.
|
||||
print "alternative constructor";
|
||||
local bf_dbl = bloomfilter_basic_init2(4, 10);
|
||||
bloomfilter_add(bf_dbl, 4.2);
|
||||
bloomfilter_add(bf_dbl, 3.14);
|
||||
|
@ -25,6 +27,7 @@ function test_basic_bloom_filter()
|
|||
print bloomfilter_lookup(bf_dbl, 3.14);
|
||||
|
||||
# Basic usage with strings.
|
||||
print "basicstrings";
|
||||
local bf_str = bloomfilter_basic_init(0.9, 10);
|
||||
bloomfilter_add(bf_str, "foo");
|
||||
bloomfilter_add(bf_str, "bar");
|
||||
|
@ -36,12 +39,14 @@ function test_basic_bloom_filter()
|
|||
bloomfilter_add(bf_str, 100); # Type mismatch
|
||||
|
||||
# Edge cases.
|
||||
print "edgecases";
|
||||
local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1);
|
||||
local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000);
|
||||
local bf_edge2 = bloomfilter_basic_init(0.9999999, 1);
|
||||
local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000);
|
||||
|
||||
# Merging
|
||||
print "merging";
|
||||
local bf_cnt2 = bloomfilter_basic_init(0.1, 1000);
|
||||
bloomfilter_add(bf_cnt2, 42);
|
||||
bloomfilter_add(bf_cnt, 100);
|
||||
|
@ -51,7 +56,16 @@ function test_basic_bloom_filter()
|
|||
print bloomfilter_lookup(bf_merged, 100);
|
||||
print bloomfilter_lookup(bf_merged, 168);
|
||||
|
||||
#Intersection
|
||||
print "intersect";
|
||||
local bf_intersected = bloomfilter_intersect(bf_cnt, bf_cnt2);
|
||||
print bloomfilter_lookup(bf_intersected, 42);
|
||||
print bloomfilter_lookup(bf_intersected, 84);
|
||||
print bloomfilter_lookup(bf_intersected, 100);
|
||||
print bloomfilter_lookup(bf_intersected, 168);
|
||||
|
||||
#empty filter tests
|
||||
print "empty filter";
|
||||
local bf_empty = bloomfilter_basic_init(0.1, 1000);
|
||||
print bloomfilter_lookup(bf_empty, 42);
|
||||
local bf_empty_merged = bloomfilter_merge(bf_merged, bf_empty);
|
||||
|
@ -73,6 +87,7 @@ function test_bad_param2()
|
|||
|
||||
function test_counting_bloom_filter()
|
||||
{
|
||||
print "counting";
|
||||
local bf = bloomfilter_counting_init(3, 32, 3);
|
||||
bloomfilter_add(bf, "foo");
|
||||
print bloomfilter_lookup(bf, "foo"); # 1
|
||||
|
@ -90,6 +105,7 @@ function test_counting_bloom_filter()
|
|||
print bloomfilter_lookup(bf, "foo"); # still 3
|
||||
|
||||
# Merging
|
||||
print "counting merge";
|
||||
local bf2 = bloomfilter_counting_init(3, 32, 3);
|
||||
bloomfilter_add(bf2, "baz");
|
||||
bloomfilter_add(bf2, "baz");
|
||||
|
@ -99,6 +115,17 @@ function test_counting_bloom_filter()
|
|||
print bloomfilter_lookup(bf_merged, "bar");
|
||||
print bloomfilter_lookup(bf_merged, "baz");
|
||||
|
||||
# Intersect
|
||||
print "counting intersect";
|
||||
bloomfilter_add(bf2, "foo");
|
||||
bloomfilter_add(bf2, "foo");
|
||||
local bf_intersected = bloomfilter_intersect(bf, bf2);
|
||||
print bloomfilter_lookup(bf_intersected, "foo");
|
||||
print bloomfilter_lookup(bf_intersected, "bar");
|
||||
print bloomfilter_lookup(bf_intersected, "baz");
|
||||
|
||||
# Decrement
|
||||
print "counting decrement";
|
||||
bloomfilter_decrement(bf, "foo");
|
||||
print bloomfilter_lookup(bf, "foo"); # 2
|
||||
bloomfilter_decrement(bf, "foo");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue