Merge remote-tracking branch 'origin/master' into topic/bernhard/hyperloglog

Conflicts:
	src/Func.cc
	src/probabilistic/CMakeLists.txt
This commit is contained in:
Bernhard Amann 2013-07-25 14:46:38 -07:00
commit 32c2885742
29 changed files with 10374 additions and 107 deletions

View file

@ -463,6 +463,17 @@ bool BitVector::Empty() const
return bits.empty();
}
// Reports whether every storage block of the vector is zero, i.e., whether
// no bit is set anywhere. A vector without any blocks yields true.
bool BitVector::AllZero() const
	{
	const size_t n = bits.size();
	size_t idx = 0;

	// Skip over zero blocks; stopping before the end means a non-zero
	// block was encountered.
	while ( idx < n && ! bits[idx] )
		++idx;

	return idx == n;
	}
BitVector::size_type BitVector::FindFirst() const
{
return find_from(0);
@ -557,11 +568,11 @@ bool BitVector::DoUnserialize(UnserialInfo* info)
bits[i] = static_cast<block_type>(block);
}
uint64 num_bits;
if ( ! UNSERIALIZE(&num_bits) )
uint64 n;
if ( ! UNSERIALIZE(&n) )
return false;
num_bits = static_cast<size_type>(num_bits);
num_bits = static_cast<size_type>(n);
return true;
}

View file

@ -253,6 +253,12 @@ public:
*/
bool Empty() const;
/**
* Checks whether all bits are 0.
* @return `true` iff all bits in all blocks are 0.
*/
bool AllZero() const;
/**
* Finds the bit position of the first 1-bit.
* @return The position of the first bit that equals to one or `npos` if no

View file

@ -1,9 +1,11 @@
// See the file "COPYING" in the main distribution directory for copyright.
#include <typeinfo>
#include <cmath>
#include <limits>
#include "BloomFilter.h"
#include <cmath>
#include <limits>
#include "CounterVector.h"
#include "Serializer.h"
@ -74,17 +76,48 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity)
return std::ceil(frac * std::log(2));
}
BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x,
const BasicBloomFilter* y)
bool BasicBloomFilter::Empty() const
{
if ( ! x->hasher->Equals(y->hasher) )
reporter->InternalError("incompatible hashers during BasicBloomFilter merge");
return bits->AllZero();
}
BasicBloomFilter* result = new BasicBloomFilter();
result->hasher = x->hasher->Clone();
result->bits = new BitVector(*x->bits | *y->bits);
// Removes all elements from the filter by delegating to Clear() on the
// underlying bit vector `bits`.
void BasicBloomFilter::Clear()
{
bits->Clear();
}
return result;
// Merges another Bloom filter into this one by applying operator|= of the
// underlying bit vector. The merge only takes place when `other` is
// non-null, has exactly the same dynamic type as this filter, uses an equal
// hasher, and has a bit vector of the same size.
//
// other: The filter to merge into this one.
//
// Returns: true if the merge was performed, false otherwise. Hasher and
// size mismatches are additionally reported via the reporter.
bool BasicBloomFilter::Merge(const BloomFilter* other)
	{
	// typeid applied to a dereferenced null pointer of polymorphic type
	// throws std::bad_typeid, so reject a missing operand up front.
	if ( ! other )
		return false;

	if ( typeid(*this) != typeid(*other) )
		return false;

	// The downcast is safe: the dynamic types were just checked to match.
	const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);

	if ( ! hasher->Equals(o->hasher) )
		{
		reporter->Error("incompatible hashers in BasicBloomFilter merge");
		return false;
		}

	if ( bits->Size() != o->bits->Size() )
		{
		reporter->Error("different bitvector size in BasicBloomFilter merge");
		return false;
		}

	(*bits) |= *o->bits;

	return true;
	}
// Builds a newly allocated duplicate of this filter. The duplicate gets its
// own clone of the hasher and its own copy-constructed bit vector.
BasicBloomFilter* BasicBloomFilter::Clone() const
	{
	BasicBloomFilter* dup = new BasicBloomFilter();

	dup->hasher = hasher->Clone();
	dup->bits = new BitVector(*bits);

	return dup;
	}
BasicBloomFilter::BasicBloomFilter()
@ -130,19 +163,6 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const
return 1;
}
CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x,
const CountingBloomFilter* y)
{
if ( ! x->hasher->Equals(y->hasher) )
reporter->InternalError("incompatible hashers during CountingBloomFilter merge");
CountingBloomFilter* result = new CountingBloomFilter();
result->hasher = x->hasher->Clone();
result->cells = new CounterVector(*x->cells | *y->cells);
return result;
}
CountingBloomFilter::CountingBloomFilter()
{
cells = 0;
@ -155,6 +175,50 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher,
cells = new CounterVector(width, arg_cells);
}
// Reports whether the filter holds no elements; the answer is whatever
// AllZero() of the underlying counter vector `cells` returns.
bool CountingBloomFilter::Empty() const
{
return cells->AllZero();
}
// Removes all elements from the filter by delegating to Clear() on the
// underlying counter vector `cells`.
void CountingBloomFilter::Clear()
{
cells->Clear();
}
// Merges another Bloom filter into this one by applying operator|= of the
// underlying counter vector. The merge only takes place when `other` is
// non-null, has exactly the same dynamic type as this filter, uses an equal
// hasher, and has a counter vector of the same size.
//
// other: The filter to merge into this one.
//
// Returns: true if the merge was performed, false otherwise. Hasher and
// size mismatches are additionally reported via the reporter.
bool CountingBloomFilter::Merge(const BloomFilter* other)
	{
	// typeid applied to a dereferenced null pointer of polymorphic type
	// throws std::bad_typeid, so reject a missing operand up front.
	if ( ! other )
		return false;

	if ( typeid(*this) != typeid(*other) )
		return false;

	// The downcast is safe: the dynamic types were just checked to match.
	const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

	if ( ! hasher->Equals(o->hasher) )
		{
		reporter->Error("incompatible hashers in CountingBloomFilter merge");
		return false;
		}

	if ( cells->Size() != o->cells->Size() )
		{
		reporter->Error("different bitvector size in CountingBloomFilter merge");
		return false;
		}

	(*cells) |= *o->cells;

	return true;
	}
// Builds a newly allocated duplicate of this filter. The duplicate gets its
// own clone of the hasher and its own copy-constructed counter vector.
CountingBloomFilter* CountingBloomFilter::Clone() const
	{
	CountingBloomFilter* dup = new CountingBloomFilter();

	dup->hasher = hasher->Clone();
	dup->cells = new CounterVector(*cells);

	return dup;
	}
IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER)
bool CountingBloomFilter::DoSerialize(SerialInfo* info) const

View file

@ -47,6 +47,34 @@ public:
return CountImpl((*hasher)(x));
}
/**
* Checks whether the Bloom filter is empty.
*
* @return `true` if the Bloom filter contains no elements.
*/
virtual bool Empty() const = 0;
/**
* Removes all elements, i.e., resets all bits in the underlying bit vector.
*/
virtual void Clear() = 0;
/**
* Merges another Bloom filter into this one.
*
* @param other The other Bloom filter.
*
* @return `true` on success.
*/
virtual bool Merge(const BloomFilter* other) = 0;
/**
* Constructs a copy of this Bloom filter.
*
* @return A copy of `*this`.
*/
virtual BloomFilter* Clone() const = 0;
/**
* Serializes the Bloom filter.
*
@ -147,13 +175,11 @@ public:
*/
static size_t K(size_t cells, size_t capacity);
/**
* Merges two basic Bloom filters.
*
* @return The merged Bloom filter.
*/
static BasicBloomFilter* Merge(const BasicBloomFilter* x,
const BasicBloomFilter* y);
// Overridden from BloomFilter.
virtual bool Empty() const;
virtual void Clear();
virtual bool Merge(const BloomFilter* other);
virtual BasicBloomFilter* Clone() const;
protected:
DECLARE_SERIAL(BasicBloomFilter);
@ -188,13 +214,11 @@ public:
*/
CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width);
/**
* Merges two counting Bloom filters.
*
* @return The merged Bloom filter.
*/
static CountingBloomFilter* Merge(const CountingBloomFilter* x,
const CountingBloomFilter* y);
// Overridden from BloomFilter.
virtual bool Empty() const;
virtual void Clear();
virtual bool Merge(const BloomFilter* other);
virtual CountingBloomFilter* Clone() const;
protected:
DECLARE_SERIAL(CountingBloomFilter);

View file

@ -14,8 +14,8 @@ set(probabilistic_SRCS
HyperLogLog.cc)
bif_target(bloom-filter.bif)
set(BIF_OUTPUT_CC_SAVE ${BIF_OUTPUT_CC})
bif_target(hyper-loglog.bif)
bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC_SAVE} ${BIF_OUTPUT_CC})
bro_add_subdir_library(probabilistic ${probabilistic_SRCS})
add_dependencies(bro_probabilistic generate_outputs)

View file

@ -70,6 +70,16 @@ bool CounterVector::Decrement(size_type cell, count_type value)
return carry;
}
// Reports whether all counters are zero by forwarding to AllZero() of the
// bit vector `bits` that backs the counters.
bool CounterVector::AllZero() const
{
return bits->AllZero();
}
// Sets all counters to zero by forwarding to Clear() of the bit vector
// `bits` that backs the counters.
void CounterVector::Clear()
{
bits->Clear();
}
CounterVector::count_type CounterVector::Count(size_type cell) const
{
assert(cell < Size());
@ -173,11 +183,11 @@ bool CounterVector::DoUnserialize(UnserialInfo* info)
if ( ! bits )
return false;
uint64 width;
if ( ! UNSERIALIZE(&width) )
uint64 w;
if ( ! UNSERIALIZE(&w) )
return false;
width = static_cast<size_t>(width);
width = static_cast<size_t>(w);
return true;
}

View file

@ -77,6 +77,17 @@ public:
*/
count_type Count(size_type cell) const;
/**
* Checks whether all counters are 0.
* @return `true` iff all counters have the value 0.
*/
bool AllZero() const;
/**
* Sets all counters to 0.
*/
void Clear();
/**
* Retrieves the number of cells in the storage.
*

View file

@ -74,16 +74,25 @@ public:
*
* @param k The number of hash functions to apply.
*
* @param name The hasher's name.
* @param name The hasher's name. Hashers with the same name should
* provide consistent results.
*
* @return Returns a new hasher instance.
*/
static Hasher* Create(size_t k, const std::string& name);
protected:
/**
* Constructor.
*
* @param k the number of hash functions.
*
* @param name A name for the hasher. Hashers with the same name
* should provide consistent results.
*/
Hasher(size_t k, const std::string& name);
private:
private:
const size_t k;
std::string name;
};

View file

@ -20,15 +20,23 @@ module GLOBAL;
## Creates a basic Bloom filter.
##
## .. note:: A Bloom filter can have a name associated with it. In the future,
## Bloom filters with the same name will be compatible across independent Bro
## instances, i.e., it will be possible to merge them. Currently, however, that is
## not yet supported.
##
## fp: The desired false-positive rate.
##
## capacity: the maximum number of elements that guarantees a false-positive
## rate of *fp*.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
## the initialization will become dependent on the initial seed.
## the filter will remain tied to the current Bro process.
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear bloomfilter_merge
function bloomfilter_basic_init%(fp: double, capacity: count,
name: string &default=""%): opaque of bloomfilter
%{
@ -47,18 +55,28 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
## Creates a counting Bloom filter.
##
## .. note:: A Bloom filter can have a name associated with it. In the future,
## Bloom filters with the same name will be compatible across independent Bro
## instances, i.e., it will be possible to merge them. Currently, however, that is
## not yet supported.
##
## k: The number of hash functions to use.
##
## cells: The number of cells of the underlying counter vector.
## cells: The number of cells of the underlying counter vector. As there's no
## single answer to what's the best parameterization for a counting Bloom filter,
## we refer to the Bloom filter literature here for choosing an appropriate value.
##
## max: The maximum counter value associated with each element described
## by *w = ceil(log_2(max))* bits. Each bit in the underlying counter vector
## becomes a cell of size *w* bits.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
## the initialization will become dependent on the initial seed.
## the filter will remain tied to the current Bro process.
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup
## bloomfilter_clear bloomfilter_merge
function bloomfilter_counting_init%(k: count, cells: count, max: count,
name: string &default=""%): opaque of bloomfilter
%{
@ -82,6 +100,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
## bf: The Bloom filter handle.
##
## x: The element to add.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init bloomfilter_lookup
## bloomfilter_clear bloomfilter_merge
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
%{
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
@ -105,10 +126,16 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
## x: The element to count.
##
## Returns: the counter associated with *x* in *bf*.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
## bloomfilter_add bloomfilter_clear bloomfilter_merge
function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
%{
const BloomFilterVal* bfv = static_cast<const BloomFilterVal*>(bf);
if ( bfv->Empty() )
return new Val(0, TYPE_COUNT);
if ( ! bfv->Type() )
reporter->Error("cannot perform lookup on untyped Bloom filter");
@ -121,13 +148,38 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
return new Val(0, TYPE_COUNT);
%}
## Removes all elements from a Bloom filter. This function resets all bits in the
## underlying bitvector back to 0 but does not change the parameterization of the
## Bloom filter, such as the element type and the hasher seed.
##
## bf: The Bloom filter handle.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
## bloomfilter_add bloomfilter_lookup bloomfilter_merge
function bloomfilter_clear%(bf: opaque of bloomfilter%): any
%{
// The opaque handle is accessed as a BloomFilterVal.
BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);
if ( bfv->Type() ) // Untyped Bloom filters are already empty.
bfv->Clear();
// No result value is produced for the caller; a null pointer is returned.
return 0;
%}
## Merges two Bloom filters.
##
## .. note:: Currently Bloom filters created by different Bro instances cannot
## be merged. In the future, this will be supported as long as both filters
## are created with the same name.
##
## bf1: The first Bloom filter handle.
##
## bf2: The second Bloom filter handle.
##
## Returns: The union of *bf1* and *bf2*.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
## bloomfilter_add bloomfilter_lookup bloomfilter_clear
function bloomfilter_merge%(bf1: opaque of bloomfilter,
bf2: opaque of bloomfilter%): opaque of bloomfilter
%{