Merge remote-tracking branch 'origin/topic/matthias/bloom-filter'

I'm moving the new files into a subdirectory probabilistic, and into a
corresponding namespace. We can later put code for the other
probabilistic data structures there as well.

* origin/topic/matthias/bloom-filter: (45 commits)
  Implement and test Bloom filter merging.
  Make hash functions equality comparable.
  Make counter vectors mergeable.
  Use half adder for bitwise addition and subtraction.
  Fix and test counting Bloom filter.
  Implement missing CounterVector functions.
  Tweak hasher interface.
  Add missing include for GCC.
  Fixing for unserializion error.
  Small fixes and style tweaks.
  Only serialize Bloom filter type if available.
  Create hash policies through factory.
  Remove lingering debug code.
  Factor implementation and change interface.
  Expose Bro's linear congruence PRNG as utility function.
  H3 does not check for zero length input.
  Support seeding for hashers.
  Add utility function to access first random seed.
  Update H3 documentation (and minor style nits.)
  Make H3 seed configurable.
  ...
This commit is contained in:
Robin Sommer 2013-07-23 16:40:56 -07:00
commit 21685d2529
26 changed files with 2279 additions and 67 deletions

View file

@ -0,0 +1,512 @@
#include "BitVector.h"
#include <cassert>
#include <limits>
#include "Serializer.h"
using namespace probabilistic;
// Sentinel meaning "no such position": the largest value of size_type.
BitVector::size_type BitVector::npos = static_cast<BitVector::size_type>(-1);
// Number of value bits held by a single storage block.
BitVector::block_type BitVector::bits_per_block =
std::numeric_limits<BitVector::block_type>::digits;
namespace {
// Population-count lookup table: count_table[b] is the number of 1-bits in
// the byte value b (0..255). Used by BitVector::Count() to process a block
// one byte at a time.
uint8_t count_table[] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2,
3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3,
3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3,
4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,
3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5,
6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4,
4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3,
4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6,
6, 7, 6, 7, 7, 8
};
} // namespace <anonymous>
// Binds the proxy to bit number i of the given storage block.
BitVector::Reference::Reference(block_type& block, block_type i)
    : block_(block),
      mask_(static_cast<block_type>(1) << i)
{
    // The bit index must address a bit inside one block.
    assert(i < bits_per_block);
}
// Toggles the referenced bit and returns this proxy.
BitVector::Reference& BitVector::Reference::Flip()
{
    block_ = block_ ^ mask_;
    return *this;
}
// Reads the referenced bit: true iff it is set.
BitVector::Reference::operator bool() const
{
    const block_type selected = block_ & mask_;
    return selected != 0;
}
// Returns the complement of the referenced bit without modifying it.
bool BitVector::Reference::operator~() const
{
    const block_type selected = block_ & mask_;
    return selected == 0;
}
// Assigns a boolean value to the referenced bit.
BitVector::Reference& BitVector::Reference::operator=(bool x)
{
    if (x)
        block_ |= mask_;
    else
        block_ &= ~mask_;

    return *this;
}
// Copies the *value* of another proxy's bit into the referenced bit; the
// proxy itself is not rebound.
BitVector::Reference& BitVector::Reference::operator=(Reference const& other)
{
    const bool bit = other;

    if (bit)
        block_ |= mask_;
    else
        block_ &= ~mask_;

    return *this;
}
// ORs a boolean into the referenced bit.
BitVector::Reference& BitVector::Reference::operator|=(bool x)
{
    if (! x)
        return *this;

    block_ |= mask_;
    return *this;
}
// ANDs a boolean into the referenced bit.
BitVector::Reference& BitVector::Reference::operator&=(bool x)
{
    // AND with true leaves the bit untouched.
    if (x)
        return *this;

    block_ &= ~mask_;
    return *this;
}
// XORs a boolean into the referenced bit.
BitVector::Reference& BitVector::Reference::operator^=(bool x)
{
    if (! x)
        return *this;

    block_ ^= mask_;
    return *this;
}
// Set difference on a single bit: clears the referenced bit when x is true.
BitVector::Reference& BitVector::Reference::operator-=(bool x)
{
    if (! x)
        return *this;

    block_ &= ~mask_;
    return *this;
}
// Creates an empty bit vector (no blocks, zero bits).
BitVector::BitVector()
    : bits_(),
      num_bits_(0)
{
}
// Creates a bit vector with `size` bits, each initialized to `value`.
BitVector::BitVector(size_type size, bool value)
    : bits_(bits_to_blocks(size), value ? ~block_type(0) : 0),
      num_bits_(size)
{
    // Filling whole blocks with ones also sets the unused high bits of the
    // last block. The rest of the class (Count(), operator==, FindNext(),
    // ...) relies on those bits being zero, so clear them here.
    zero_unused_bits();
}
// Copy-constructs a bit vector: duplicates both the storage blocks and the
// logical bit count of `other`.
BitVector::BitVector(BitVector const& other)
    : bits_(other.bits_), num_bits_(other.num_bits_)
{
}
// Returns a complemented copy of this bit vector; *this is not modified.
BitVector BitVector::operator~() const
{
    BitVector complement(*this);
    complement.Flip();
    return complement;
}
// Assigns another bit vector to this instance.
//
// Both the storage blocks and the logical size are copied. Previously only
// the blocks were copied, leaving num_bits_ stale, so Size(), bounds asserts
// and zero_unused_bits() disagreed with the actual contents.
BitVector& BitVector::operator=(BitVector const& other)
{
    bits_ = other.bits_;
    num_bits_ = other.num_bits_;
    return *this;
}
// Returns a copy of this bit vector shifted left by n bit positions.
BitVector BitVector::operator<<(size_type n) const
{
    BitVector shifted(*this);
    shifted <<= n;
    return shifted;
}
// Returns a copy of this bit vector shifted right by n bit positions.
BitVector BitVector::operator>>(size_type n) const
{
    BitVector shifted(*this);
    shifted >>= n;
    return shifted;
}
// Shifts all bits towards higher positions by n, in place. Vacated low
// positions become 0; bits shifted past Size() are discarded.
// Returns a reference to *this.
BitVector& BitVector::operator<<=(size_type n)
{
// Shifting by the full width or more clears everything.
if (n >= num_bits_)
return Reset();
if (n > 0)
{
// last: index of the final block; div: whole blocks to move by;
// r: remaining shift within a block.
size_type last = Blocks() - 1;
size_type div = n / bits_per_block;
block_type r = bit_index(n);
block_type* b = &bits_[0];
assert(Blocks() >= 1);
assert(div <= last);
if (r != 0)
{
// Walk from the highest block downwards so that source blocks are read
// before they are overwritten. Each destination block combines the
// shifted source block with the bits carried over from the block below.
for (size_type i = last - div; i > 0; --i)
b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r));
b[div] = b[0] << r;
}
else
{
// Block-aligned shift: plain block moves, again from the top down.
for (size_type i = last-div; i > 0; --i)
b[i + div] = b[i];
b[div] = b[0];
}
// Zero the div lowest blocks vacated by the shift.
std::fill_n(b, div, block_type(0));
// Bits may have been shifted into the unused part of the last block.
zero_unused_bits();
}
return *this;
}
// Shifts all bits towards lower positions by n, in place. Vacated high
// positions become 0; bits shifted below position 0 are discarded.
// Returns a reference to *this.
BitVector& BitVector::operator>>=(size_type n)
{
    // Shifting by the full width or more clears everything.
    if (n >= num_bits_)
        return Reset();

    if (n > 0)
    {
        // last: index of the final block; div: whole blocks to move by;
        // r: remaining shift within a block.
        size_type last = Blocks() - 1;
        size_type div = n / bits_per_block;
        block_type r = bit_index(n);

        assert(Blocks() >= 1);
        assert(div <= last);

        block_type* b = &bits_[0];

        if (r != 0)
        {
            // Walk from the lowest source block upwards so that every source
            // block is read before it is overwritten. The previous loop ran
            // downwards from last - div, which read b[last + 1] and
            // underflowed i - div whenever div > 0.
            for (size_type i = div; i < last; ++i)
                b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r));

            // The top block has no higher neighbour to borrow bits from.
            b[last - div] = b[last] >> r;
        }
        else
        {
            // Block-aligned shift: plain block moves.
            for (size_type i = div; i <= last; ++i)
                b[i - div] = b[i];
        }

        // Zero the div highest blocks vacated by the shift.
        std::fill_n(b + (Blocks() - div), div, block_type(0));
    }

    return *this;
}
// Bitwise AND with another bit vector, in place.
//
// Precondition: Size() >= other.Size(). When `other` is shorter it is
// treated as zero-extended, so the bits of *this beyond other's storage are
// cleared. This matches what already happens inside other's last block,
// whose unused bits are zero. The previous loop indexed other.bits_ up to
// this->Blocks() and read past the end of other's storage in that case.
BitVector& BitVector::operator&=(BitVector const& other)
{
    assert(Size() >= other.Size());

    const size_type mine = Blocks();
    const size_type shared = other.Blocks() < mine ? other.Blocks() : mine;

    for (size_type i = 0; i < shared; ++i)
        bits_[i] &= other.bits_[i];

    for (size_type i = shared; i < mine; ++i)
        bits_[i] = block_type(0);

    return *this;
}
// Bitwise OR with another bit vector, in place.
//
// Precondition: Size() >= other.Size(). Only the blocks that `other`
// actually has are combined; a shorter `other` behaves as if zero-extended,
// so the remaining blocks of *this stay unchanged. The previous loop indexed
// other.bits_ up to this->Blocks() and read out of bounds when `other` had
// fewer blocks.
BitVector& BitVector::operator|=(BitVector const& other)
{
    assert(Size() >= other.Size());

    const size_type mine = Blocks();
    const size_type shared = other.Blocks() < mine ? other.Blocks() : mine;

    for (size_type i = 0; i < shared; ++i)
        bits_[i] |= other.bits_[i];

    return *this;
}
// Bitwise XOR with another bit vector, in place.
//
// Precondition: Size() >= other.Size(). Only the blocks that `other`
// actually has are combined; a shorter `other` behaves as if zero-extended,
// so the remaining blocks of *this stay unchanged. The previous loop indexed
// other.bits_ up to this->Blocks() and read out of bounds when `other` had
// fewer blocks.
BitVector& BitVector::operator^=(BitVector const& other)
{
    assert(Size() >= other.Size());

    const size_type mine = Blocks();
    const size_type shared = other.Blocks() < mine ? other.Blocks() : mine;

    for (size_type i = 0; i < shared; ++i)
        bits_[i] ^= other.bits_[i];

    return *this;
}
// Set difference, in place: clears every bit of *this that is set in `other`.
//
// Precondition: Size() >= other.Size(). Only the blocks that `other`
// actually has are combined; a shorter `other` behaves as if zero-extended,
// so the remaining blocks of *this stay unchanged. The previous loop indexed
// other.bits_ up to this->Blocks() and read out of bounds when `other` had
// fewer blocks.
BitVector& BitVector::operator-=(BitVector const& other)
{
    assert(Size() >= other.Size());

    const size_type mine = Blocks();
    const size_type shared = other.Blocks() < mine ? other.Blocks() : mine;

    for (size_type i = 0; i < shared; ++i)
        bits_[i] &= ~other.bits_[i];

    return *this;
}
namespace probabilistic {

// Non-member bitwise operators. Each one copies the left operand and applies
// the corresponding compound assignment, so the preconditions of the
// compound operators apply.

BitVector operator&(BitVector const& x, BitVector const& y)
{
    BitVector result(x);
    result &= y;
    return result;
}

BitVector operator|(BitVector const& x, BitVector const& y)
{
    BitVector result(x);
    result |= y;
    return result;
}

BitVector operator^(BitVector const& x, BitVector const& y)
{
    BitVector result(x);
    result ^= y;
    return result;
}

BitVector operator-(BitVector const& x, BitVector const& y)
{
    BitVector result(x);
    result -= y;
    return result;
}

// Two bit vectors are equal iff they have the same length and identical
// storage blocks.
bool operator==(BitVector const& x, BitVector const& y)
{
    if (x.num_bits_ != y.num_bits_)
        return false;

    return x.bits_ == y.bits_;
}

bool operator!=(BitVector const& x, BitVector const& y)
{
    return x == y ? false : true;
}

// Orders two equally-sized bit vectors by comparing their blocks from the
// most significant (highest index) block downwards.
bool operator<(BitVector const& x, BitVector const& y)
{
    assert(x.Size() == y.Size());

    BitVector::size_type i = x.Blocks();

    while (i-- > 0)
    {
        // The first differing block decides the ordering.
        if (x.bits_[i] != y.bits_[i])
            return x.bits_[i] < y.bits_[i];
    }

    // All blocks are equal.
    return false;
}

}
// Changes the length of the bit vector to n bits. When the vector grows, the
// newly exposed bits take the given value; when it shrinks, excess bits are
// dropped.
void BitVector::Resize(size_type n, bool value)
{
    const size_type old_blocks = Blocks();
    const size_type new_blocks = bits_to_blocks(n);
    const block_type fill = value ? ~block_type(0) : block_type(0);

    if (new_blocks != old_blocks)
        bits_.resize(new_blocks, fill);

    if (value && n > num_bits_)
    {
        // num_bits_ still holds the old length here, so this is the number
        // of bits in use in the formerly last block. Growing with ones must
        // also set the previously unused high bits of that block.
        const block_type used = extra_bits();

        if (used)
            bits_[old_blocks - 1] |= (fill << used);
    }

    num_bits_ = n;
    zero_unused_bits();
}
// Removes all bits, leaving an empty bit vector.
void BitVector::Clear()
{
    num_bits_ = 0;
    bits_.clear();
}
void BitVector::PushBack(bool bit)
{
size_type s = Size();
Resize(s + 1);
Set(s, bit);
}
// Appends all bits_per_block bits of a block at the end of the bit vector.
void BitVector::Append(block_type block)
{
    const size_type used = extra_bits();

    if (used == 0)
    {
        // The vector ends on a block boundary: store the block as is.
        bits_.push_back(block);
    }
    else
    {
        assert(! Empty());

        // The high part of the block goes into a fresh last block, the low
        // part fills the free bits of the formerly last block.
        bits_.push_back(block >> (bits_per_block - used));
        bits_[Blocks() - 2] |= (block << used);
    }

    num_bits_ += bits_per_block;
}
// Sets the bit at position i to the given value and returns *this.
BitVector& BitVector::Set(size_type i, bool bit)
{
    assert(i < num_bits_);

    if (! bit)
        return Reset(i);

    bits_[block_index(i)] |= bit_mask(i);
    return *this;
}
// Sets every bit to 1 and returns *this.
BitVector& BitVector::Set()
{
    for (size_type i = 0; i < bits_.size(); ++i)
        bits_[i] = ~block_type(0);

    // Whole blocks were filled; clear the unused tail of the last one.
    zero_unused_bits();
    return *this;
}
// Clears the bit at position i and returns *this.
BitVector& BitVector::Reset(size_type i)
{
    assert(i < num_bits_);

    const block_type keep = ~bit_mask(i);
    bits_[block_index(i)] &= keep;
    return *this;
}
// Clears every bit and returns *this. The length is unchanged.
BitVector& BitVector::Reset()
{
    for (size_type i = 0; i < bits_.size(); ++i)
        bits_[i] = block_type(0);

    return *this;
}
// Toggles the bit at position i and returns *this.
BitVector& BitVector::Flip(size_type i)
{
    assert(i < num_bits_);

    const block_type mask = bit_mask(i);
    bits_[block_index(i)] ^= mask;
    return *this;
}
// Complements every bit in place and returns *this.
BitVector& BitVector::Flip()
{
    for (std::vector<block_type>::iterator it = bits_.begin();
         it != bits_.end(); ++it)
        *it = ~*it;

    // Complementing turned the unused tail of the last block into ones.
    zero_unused_bits();
    return *this;
}
// Read-only access: returns the value of the bit at position i.
bool BitVector::operator[](size_type i) const
{
    assert(i < num_bits_);

    const block_type block = bits_[block_index(i)];
    return (block & bit_mask(i)) != 0;
}
// Mutable access: returns a proxy through which the bit at position i can be
// read and written.
BitVector::Reference BitVector::operator[](size_type i)
{
    assert(i < num_bits_);

    block_type& block = bits_[block_index(i)];
    return Reference(block, bit_index(i));
}
// Counts the number of 1-bits by looking up each byte of each block in
// count_table.
BitVector::size_type BitVector::Count() const
{
    size_t total = 0;

    for (std::vector<block_type>::const_iterator it = bits_.begin();
         it != bits_.end(); ++it)
    {
        // Consume the block one byte at a time, stopping as soon as no set
        // bits remain.
        // TODO: use __popcnt if available.
        for (block_type rest = *it; rest != 0; rest >>= 8)
            total += count_table[rest & ((1u << 8) - 1)];
    }

    return total;
}
// Returns the number of storage blocks currently in use.
BitVector::size_type BitVector::Blocks() const
{
    const size_type blocks = bits_.size();
    return blocks;
}
// Returns the length of the bit vector in bits.
BitVector::size_type BitVector::Size() const
{
    const size_type bits = num_bits_;
    return bits;
}
// Returns true iff the bit vector has no storage blocks.
bool BitVector::Empty() const
{
    const bool no_blocks = bits_.empty();
    return no_blocks;
}
// Returns the position of the lowest 1-bit, or npos if there is none.
BitVector::size_type BitVector::FindFirst() const
{
    // Start the search at the first block.
    const size_type first_block = 0;
    return find_from(first_block);
}
// Returns the position of the first 1-bit strictly after position i, or npos
// if there is none.
BitVector::size_type BitVector::FindNext(size_type i) const
{
    // Nothing can follow the last position (or any position of an empty
    // vector).
    if (Size() == 0 || i >= Size() - 1)
        return npos;

    const size_type next = i + 1;
    const size_type bi = block_index(next);

    // Mask away the bits below `next` within its block.
    const block_type masked = bits_[bi] & (~block_type(0) << bit_index(next));

    if (masked)
        return bi * bits_per_block + lowest_bit(masked);

    // No hit in this block; continue with the following blocks.
    return find_from(bi + 1);
}
// Returns the index of the least significant 1-bit of a block.
BitVector::size_type BitVector::lowest_bit(block_type block)
{
    // block & (block - 1) clears the lowest set bit; subtracting that from
    // block leaves only the lowest set bit.
    block_type isolated = block - (block & (block - 1));

    // Count how many times the isolated bit can be shifted right.
    size_type pos = 0;

    for (isolated >>= 1; isolated != 0; isolated >>= 1)
        ++pos;

    return pos;
}
// Returns the number of bits in use in the last block when the length is not
// a multiple of bits_per_block, and 0 when it is.
BitVector::block_type BitVector::extra_bits() const
{
    const size_type length = Size();
    return bit_index(length);
}
void BitVector::zero_unused_bits()
{
if (extra_bits())
bits_.back() &= ~(~block_type(0) << extra_bits());
}
// Scans blocks starting at block index i and returns the bit position of the
// first 1-bit found, or npos if all remaining blocks are zero.
BitVector::size_type BitVector::find_from(size_type i) const
{
    for (; i < Blocks(); ++i)
    {
        if (bits_[i] != 0)
            return i * bits_per_block + lowest_bit(bits_[i]);
    }

    return npos;
}
// Serializes this bit vector by delegating to SerialObj::Serialize().
bool BitVector::Serialize(SerialInfo* info) const
{
    const bool ok = SerialObj::Serialize(info);
    return ok;
}
// Reads a SER_BITVECTOR object via SerialObj::Unserialize() and returns it
// as a BitVector pointer.
BitVector* BitVector::Unserialize(UnserialInfo* info)
{
    SerialObj* obj = SerialObj::Unserialize(info, SER_BITVECTOR);
    return reinterpret_cast<BitVector*>(obj);
}
IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR);
// Writes the bit vector in this order: the number of blocks, then each block
// widened to uint64, then the number of bits.
// Returns false as soon as any write fails.
bool BitVector::DoSerialize(SerialInfo* info) const
{
// NOTE(review): DO_SERIALIZE is a project macro; presumably it serializes
// the SerialObj base part first and returns on failure -- confirm.
DO_SERIALIZE(SER_BITVECTOR, SerialObj);
if ( ! SERIALIZE(static_cast<uint64>(bits_.size())) )
return false;
for ( size_t i = 0; i < bits_.size(); ++i )
if ( ! SERIALIZE(static_cast<uint64>(bits_[i])) )
return false;
return SERIALIZE(static_cast<uint64>(num_bits_));
}
// Reads a bit vector in the order written by DoSerialize(): the number of
// blocks, then each block, then the number of bits.
// Returns false as soon as any read fails.
bool BitVector::DoUnserialize(UnserialInfo* info)
{
// NOTE(review): DO_UNSERIALIZE is a project macro; presumably it restores
// the SerialObj base part first and returns on failure -- confirm.
DO_UNSERIALIZE(SerialObj);
uint64 size;
if ( ! UNSERIALIZE(&size) )
return false;
// NOTE(review): the block count comes straight from the input and is not
// bounded before allocating.
bits_.resize(static_cast<size_t>(size));
uint64 block;
for ( size_t i = 0; i < bits_.size(); ++i )
{
if ( ! UNSERIALIZE(&block) )
return false;
// Blocks travel as uint64 and are narrowed to the local block type.
bits_[i] = static_cast<block_type>(block);
}
uint64 num_bits;
if ( ! UNSERIALIZE(&num_bits) )
return false;
num_bits_ = static_cast<size_type>(num_bits);
return true;
}

View file

@ -0,0 +1,335 @@
#ifndef BitVector_h
#define BitVector_h
#include <iterator>
#include <vector>
#include "SerialObj.h"
namespace probabilistic {
/**
 * A vector of bits.
 *
 * Bits are packed into blocks of type block_type. The implementation keeps
 * the unused high bits of the last block at zero.
 */
class BitVector : public SerialObj {
public:
    typedef size_t block_type;
    typedef size_t size_type;

    /** Sentinel position returned when no matching bit exists. */
    static size_type npos;

    /** Number of bits stored in a single block. */
    static block_type bits_per_block;

public:
    /**
     * An lvalue proxy for single bits.
     */
    class Reference {
        friend class BitVector;
        Reference(block_type& block, block_type i);

    public:
        Reference& Flip();
        operator bool() const;
        bool operator~() const;
        Reference& operator=(bool x);
        Reference& operator=(Reference const& other);
        Reference& operator|=(bool x);
        Reference& operator&=(bool x);
        Reference& operator^=(bool x);
        Reference& operator-=(bool x);

    private:
        // Declared but not defined: taking the address of a proxy is
        // disallowed.
        void operator&();

        block_type& block_;
        block_type const mask_;
    };

    typedef bool const_reference;

    /**
     * Default-constructs an empty bit vector.
     */
    BitVector();

    /**
     * Constructs a bit vector of a given size.
     * @param size The number of bits.
     * @param value The value for each bit.
     */
    explicit BitVector(size_type size, bool value = false);

    /**
     * Constructs a bit vector from a sequence of blocks. Every block
     * contributes bits_per_block bits.
     * @param first An iterator pointing to the first block.
     * @param last An iterator pointing to one past the last block.
     */
    template <typename InputIterator>
    BitVector(InputIterator first, InputIterator last)
    {
        bits_.insert(bits_.end(), first, last);
        num_bits_ = bits_.size() * bits_per_block;
    }

    /**
     * Copy-constructs a bit vector.
     * @param other The bit vector to copy.
     */
    BitVector(const BitVector& other);

    /**
     * Assigns another bit vector to this instance.
     * @param other The RHS of the assignment.
     */
    BitVector& operator=(const BitVector& other);

    //
    // Bitwise operations
    //
    BitVector operator~() const;
    BitVector operator<<(size_type n) const;
    BitVector operator>>(size_type n) const;
    BitVector& operator<<=(size_type n);
    BitVector& operator>>=(size_type n);
    BitVector& operator&=(BitVector const& other);
    BitVector& operator|=(BitVector const& other);
    BitVector& operator^=(BitVector const& other);
    BitVector& operator-=(BitVector const& other);
    friend BitVector operator&(BitVector const& x, BitVector const& y);
    friend BitVector operator|(BitVector const& x, BitVector const& y);
    friend BitVector operator^(BitVector const& x, BitVector const& y);
    friend BitVector operator-(BitVector const& x, BitVector const& y);

    //
    // Relational operators
    //
    friend bool operator==(BitVector const& x, BitVector const& y);
    friend bool operator!=(BitVector const& x, BitVector const& y);
    friend bool operator<(BitVector const& x, BitVector const& y);

    //
    // Basic operations
    //

    /**
     * Appends the bits in a sequence of blocks. Every block contributes
     * bits_per_block bits.
     * @tparam ForwardIterator A forward iterator.
     * @param first An iterator pointing to the first element of the sequence.
     * @param last An iterator pointing to one past the last element of the
     * sequence.
     */
    template <typename ForwardIterator>
    void Append(ForwardIterator first, ForwardIterator last)
    {
        if (first == last)
            return;

        block_type excess = extra_bits();
        typename std::iterator_traits<ForwardIterator>::difference_type delta =
            std::distance(first, last);

        bits_.reserve(Blocks() + delta);

        if (excess == 0)
        {
            // The vector ends on a block boundary (or is empty): the new
            // blocks can be stored unmodified.
            bits_.insert(bits_.end(), first, last);
        }
        else
        {
            // The last block is only partially used. Every incoming block is
            // split: its low part fills the free bits of the current last
            // block and its high part starts the next block.
            bits_.back() |= (*first << excess);

            do
            {
                block_type b = *first++ >> (bits_per_block - excess);
                bits_.push_back(b | (first == last ? 0 : *first << excess));
            } while (first != last);
        }

        num_bits_ += bits_per_block * delta;
    }

    /**
     * Appends the bits in a given block.
     * @param block The block containing bits to append.
     */
    void Append(block_type block);

    /**
     * Appends a single bit to the end of the bit vector.
     * @param bit The value of the bit.
     */
    void PushBack(bool bit);

    /**
     * Removes all bits from the bit vector, leaving it empty.
     */
    void Clear();

    /**
     * Resizes the bit vector to a new number of bits.
     * @param n The new number of bits of the bit vector.
     * @param value The bit value of new values, if the vector expands.
     */
    void Resize(size_type n, bool value = false);

    /**
     * Sets a bit at a specific position to a given value.
     * @param i The bit position.
     * @param bit The value assigned to position *i*.
     * @return A reference to the bit vector instance.
     */
    BitVector& Set(size_type i, bool bit = true);

    /**
     * Sets all bits to 1.
     * @return A reference to the bit vector instance.
     */
    BitVector& Set();

    /**
     * Resets a bit at a specific position, i.e., sets it to 0.
     * @param i The bit position.
     * @return A reference to the bit vector instance.
     */
    BitVector& Reset(size_type i);

    /**
     * Sets all bits to 0.
     * @return A reference to the bit vector instance.
     */
    BitVector& Reset();

    /**
     * Toggles/flips a bit at a specific position.
     * @param i The bit position.
     * @return A reference to the bit vector instance.
     */
    BitVector& Flip(size_type i);

    /**
     * Computes the complement.
     * @return A reference to the bit vector instance.
     */
    BitVector& Flip();

    /**
     * Retrieves a single bit.
     * @param i The bit position.
     * @return A mutable reference to the bit at position *i*.
     */
    Reference operator[](size_type i);

    /**
     * Retrieves a single bit.
     * @param i The bit position.
     * @return The value of the bit at position *i*.
     */
    const_reference operator[](size_type i) const;

    /**
     * Counts the number of 1-bits in the bit vector. Also known as *population
     * count* or *Hamming weight*.
     * @return The number of bits set to 1.
     */
    size_type Count() const;

    /**
     * Retrieves the number of blocks of the underlying storage.
     * @return The number of blocks that represent `Size()` bits.
     */
    size_type Blocks() const;

    /**
     * Retrieves the number of bits the bit vector consists of.
     * @return The length of the bit vector in bits.
     */
    size_type Size() const;

    /**
     * Checks whether the bit vector is empty.
     * @return `true` iff the bit vector has zero length.
     */
    bool Empty() const;

    /**
     * Finds the bit position of the first 1-bit.
     * @return The position of the first bit that equals one or `npos` if no
     * such bit exists.
     */
    size_type FindFirst() const;

    /**
     * Finds the next 1-bit from a given starting position.
     *
     * @param i The index where to start looking.
     *
     * @return The position of the first bit that equals 1 after position
     * *i* or `npos` if no such bit exists.
     */
    size_type FindNext(size_type i) const;

    bool Serialize(SerialInfo* info) const;
    static BitVector* Unserialize(UnserialInfo* info);

protected:
    DECLARE_SERIAL(BitVector);

private:
    /**
     * Computes the block index for a given bit position.
     */
    static size_type block_index(size_type i)
    {
        return i / bits_per_block;
    }

    /**
     * Computes the bit index within a given block for a given bit position.
     */
    static block_type bit_index(size_type i)
    {
        return i % bits_per_block;
    }

    /**
     * Computes the block-sized bitmask that selects the bit at a given bit
     * position within its block.
     */
    static block_type bit_mask(size_type i)
    {
        return block_type(1) << bit_index(i);
    }

    /**
     * Computes the number of blocks needed to represent a given number of
     * bits.
     * @param bits the number of bits.
     * @return The number of blocks to represent *bits* number of bits.
     */
    static size_type bits_to_blocks(size_type bits)
    {
        return bits / bits_per_block
            + static_cast<size_type>(bits % bits_per_block != 0);
    }

    /**
     * Computes the bit position of the first 1-bit in a given block.
     * @param block The block to inspect.
     * @return The bit position where *block* has its first bit set to 1.
     */
    static size_type lowest_bit(block_type block);

    /**
     * Computes the number of bits in use in the last block when the length
     * is not a multiple of bits_per_block; 0 otherwise.
     */
    block_type extra_bits() const;

    /**
     * If the number of bits in the vector is not a multiple of
     * BitVector::bits_per_block, then the last block exhibits unused bits
     * which this function resets.
     */
    void zero_unused_bits();

    /**
     * Looks for the first 1-bit starting at a given block.
     * @param i The block index to start looking.
     * @return The bit position of the first 1-bit in block *i* or later, or
     * `BitVector::npos` if no 1-bit exists.
     */
    size_type find_from(size_type i) const;

    std::vector<block_type> bits_;
    size_type num_bits_;
};
}
#endif

View file

@ -0,0 +1,188 @@
#include "BloomFilter.h"
#include <cmath>
#include <limits>
#include "CounterVector.h"
#include "Serializer.h"
using namespace probabilistic;
// Default-constructs a Bloom filter without a hasher.
BloomFilter::BloomFilter() : hasher_(NULL)
{
}
// Constructs a Bloom filter that uses the given hasher. The destructor
// deletes the hasher.
BloomFilter::BloomFilter(const Hasher* hasher) : hasher_(hasher)
{
}
// Releases the hasher held by this filter.
BloomFilter::~BloomFilter()
{
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete hasher_;
}
// Serializes this Bloom filter by delegating to SerialObj::Serialize().
bool BloomFilter::Serialize(SerialInfo* info) const
{
    const bool ok = SerialObj::Serialize(info);
    return ok;
}
// Reads a SER_BLOOMFILTER object via SerialObj::Unserialize() and returns it
// as a BloomFilter pointer.
BloomFilter* BloomFilter::Unserialize(UnserialInfo* info)
{
    SerialObj* obj = SerialObj::Unserialize(info, SER_BLOOMFILTER);
    return reinterpret_cast<BloomFilter*>(obj);
}
// Writes the state shared by all Bloom filters: the hasher's K() value as
// uint16, followed by the hasher's name as a string.
// NOTE(review): hasher_ is dereferenced unconditionally, while the default
// constructor sets it to NULL -- confirm a hasher is always present when
// serializing.
bool BloomFilter::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_BLOOMFILTER, SerialObj);
if ( ! SERIALIZE(static_cast<uint16>(hasher_->K())) )
return false;
return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size());
}
// Reads the state written by DoSerialize() (k, then the hasher name) and
// creates the hasher from it via Hasher::Create().
// Returns false if either read fails.
bool BloomFilter::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(SerialObj);
uint16 k;
if ( ! UNSERIALIZE(&k) )
return false;
const char* name;
if ( ! UNSERIALIZE_STR(&name, 0) )
return false;
// NOTE(review): any hasher already held in hasher_ is overwritten without
// being deleted -- presumably this only runs on freshly constructed objects.
hasher_ = Hasher::Create(k, name);
// The string buffer is released here with delete[].
delete [] name;
return true;
}
// Computes the number of cells for a given false-positive rate and capacity:
// ceil(-(capacity * ln(fp)) / ln(2)^2).
size_t BasicBloomFilter::M(double fp, size_t capacity)
{
    const double ln2 = std::log(2);
    const double scaled = capacity * std::log(fp);
    return std::ceil(-(scaled / ln2 / ln2));
}
// Computes the number of hash functions for a given number of cells and
// capacity: ceil((cells / capacity) * ln(2)).
size_t BasicBloomFilter::K(size_t cells, size_t capacity)
{
    const double m = static_cast<double>(cells);
    const double n = static_cast<double>(capacity);
    const double frac = m / n;
    return std::ceil(frac * std::log(2));
}
// Merges two basic Bloom filters into a newly allocated one whose bits are
// the bitwise OR of both inputs. Both filters must use equal hashers;
// otherwise an internal error is reported and NULL is returned.
BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x,
                                          const BasicBloomFilter* y)
{
    const bool compatible = x->hasher_->Equals(y->hasher_);

    if ( ! compatible )
    {
        reporter->InternalError("incompatible hashers during Bloom filter merge");
        return NULL;
    }

    BasicBloomFilter* merged = new BasicBloomFilter();
    merged->hasher_ = x->hasher_->Clone();
    merged->bits_ = new BitVector(*x->bits_ | *y->bits_);
    return merged;
}
// Default-constructs a basic Bloom filter without bit storage.
BasicBloomFilter::BasicBloomFilter() : bits_(NULL)
{
}
// Constructs a basic Bloom filter with the given hasher and a freshly
// allocated bit vector of `cells` bits.
BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells)
    : BloomFilter(hasher), bits_(new BitVector(cells))
{
}
IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER)
// Serializes the BloomFilter base part (via DO_SERIALIZE) followed by the
// bit vector.
// NOTE(review): bits_ is dereferenced unconditionally, while the default
// constructor sets it to NULL -- confirm it is always set when serializing.
bool BasicBloomFilter::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter);
return bits_->Serialize(info);
}
// Restores the BloomFilter base part (via DO_UNSERIALIZE) followed by the
// bit vector. Returns false if the bit vector could not be read.
bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(BloomFilter);
// NOTE(review): a previously held bits_ would be overwritten without being
// deleted -- presumably this only runs on freshly constructed objects.
bits_ = BitVector::Unserialize(info);
return bits_ != NULL;
}
// Adds an element: sets the bit selected by each digest, taken modulo the
// number of bits.
void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h)
{
    size_t i = 0;

    while ( i < h.size() )
    {
        bits_->Set(h[i] % bits_->Size());
        ++i;
    }
}
// Membership test: returns 1 if every bit selected by the digests is set,
// and 0 as soon as one of them is clear.
size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const
{
    size_t i = 0;

    while ( i < h.size() )
    {
        const bool present = (*bits_)[h[i] % bits_->Size()];

        if ( ! present )
            return 0;

        ++i;
    }

    return 1;
}
// Merges two counting Bloom filters into a newly allocated one whose cells
// are combined with CounterVector's operator|. Both filters must use equal
// hashers; otherwise an internal error is reported and NULL is returned.
CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x,
                                                const CountingBloomFilter* y)
{
    const bool compatible = x->hasher_->Equals(y->hasher_);

    if ( ! compatible )
    {
        reporter->InternalError("incompatible hashers during Bloom filter merge");
        return NULL;
    }

    CountingBloomFilter* merged = new CountingBloomFilter();
    merged->hasher_ = x->hasher_->Clone();
    merged->cells_ = new CounterVector(*x->cells_ | *y->cells_);
    return merged;
}
// Default-constructs a counting Bloom filter without counter storage.
CountingBloomFilter::CountingBloomFilter() : cells_(NULL)
{
}
// Constructs a counting Bloom filter with the given hasher and a freshly
// allocated counter vector of `cells` counters, each `width` bits wide.
CountingBloomFilter::CountingBloomFilter(const Hasher* hasher,
                                         size_t cells, size_t width)
    : BloomFilter(hasher), cells_(new CounterVector(width, cells))
{
}
IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER)
// Serializes the BloomFilter base part (via DO_SERIALIZE) followed by the
// counter vector.
// NOTE(review): cells_ is dereferenced unconditionally, while the default
// constructor sets it to NULL -- confirm it is always set when serializing.
bool CountingBloomFilter::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter);
return cells_->Serialize(info);
}
// Restores the BloomFilter base part (via DO_UNSERIALIZE) followed by the
// counter vector. Returns false if the counter vector could not be read.
bool CountingBloomFilter::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(BloomFilter);
// NOTE(review): a previously held cells_ would be overwritten without being
// deleted -- presumably this only runs on freshly constructed objects.
cells_ = CounterVector::Unserialize(info);
return cells_ != NULL;
}
// TODO: Use partitioning in add/count to allow for reusing CMS bounds.

// Adds an element: increments the counter selected by each digest, taken
// modulo the number of cells.
void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h)
{
    size_t i = 0;

    while ( i < h.size() )
    {
        cells_->Increment(h[i] % cells_->Size());
        ++i;
    }
}
// Returns the smallest counter among the cells selected by the digests. With
// no digests at all, the result is the maximum value of
// CounterVector::size_type.
size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const
{
    CounterVector::size_type smallest =
        std::numeric_limits<CounterVector::size_type>::max();

    size_t i = 0;

    while ( i < h.size() )
    {
        const CounterVector::size_type current =
            cells_->Count(h[i] % cells_->Size());

        if ( current < smallest )
            smallest = current;

        ++i;
    }

    return smallest;
}

View file

@ -0,0 +1,140 @@
#ifndef BloomFilter_h
#define BloomFilter_h
#include <vector>
#include "BitVector.h"
#include "Hasher.h"
namespace probabilistic {
class CounterVector;
/**
* The abstract base class for Bloom filters.
*
* A filter holds a hasher that turns elements into a vector of digests;
* concrete subclasses decide how those digests are stored and counted.
*/
class BloomFilter : public SerialObj {
public:
// At this point we won't let the user choose the hasher, but we might
// open up the interface in the future.
/**
* Destroys the Bloom filter and deletes its hasher.
*/
virtual ~BloomFilter();
/**
* Adds an element of type T to the Bloom filter.
* @param x The element to add
*/
template <typename T>
void Add(const T& x)
{
AddImpl((*hasher_)(x));
}
/**
* Retrieves the associated count of a given value.
*
* @param x The value of type `T` to check.
*
* @return The counter associated with *x*.
*/
template <typename T>
size_t Count(const T& x) const
{
return CountImpl((*hasher_)(x));
}
/**
* Serializes the Bloom filter.
*
* @param info The serialization context.
*
* @return `true` on success.
*/
bool Serialize(SerialInfo* info) const;
/**
* Unserializes a Bloom filter.
*
* @param info The unserialization context.
*
* @return The unserialized filter; presumably NULL on failure (the
* subclasses check their unserialized members against NULL).
*/
static BloomFilter* Unserialize(UnserialInfo* info);
protected:
DECLARE_ABSTRACT_SERIAL(BloomFilter);
/**
* Default-constructs a Bloom filter without a hasher.
*/
BloomFilter();
/**
* Constructs a Bloom filter.
*
* @param hasher The hasher to use for this Bloom filter. The destructor
* deletes it.
*/
BloomFilter(const Hasher* hasher);
/**
* Adds an element given by its digests. Implemented by subclasses.
*
* @param hashes The digests of the element to add.
*/
virtual void AddImpl(const Hasher::digest_vector& hashes) = 0;
/**
* Retrieves the count of an element given by its digests. Implemented by
* subclasses.
*
* @param hashes The digests of the element to look up.
*
* @return The counter associated with the element.
*/
virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0;
// The hasher used to compute digests; deleted by the destructor.
const Hasher* hasher_;
};
/**
* A basic Bloom filter.
*/
class BasicBloomFilter : public BloomFilter {
public:
/**
* Computes the number of cells based a given false-positive rate and
* capacity. In the literature, this parameter often has the name *M*.
*
* @param fp The false-positive rate.
*
* @param capacity The number of exepected elements.
*
* Returns: The number cells needed to support a false-positive rate of *fp*
* with at most *capacity* elements.
*/
static size_t M(double fp, size_t capacity);
/**
* Computes the optimal number of hash functions based on the number cells
* and expected number of elements.
*
* @param cells The number of cells (*m*).
*
* @param capacity The maximum number of elements.
*
* Returns: the optimal number of hash functions for a false-positive rate of
* *fp* for at most *capacity* elements.
*/
static size_t K(size_t cells, size_t capacity);
static BasicBloomFilter* Merge(const BasicBloomFilter* x,
const BasicBloomFilter* y);
/**
* Constructs a basic Bloom filter with a given number of cells and capacity.
*/
BasicBloomFilter(const Hasher* hasher, size_t cells);
protected:
DECLARE_SERIAL(BasicBloomFilter);
BasicBloomFilter();
virtual void AddImpl(const Hasher::digest_vector& h);
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
private:
BitVector* bits_;
};
/**
* A counting Bloom filter.
*
* NOTE(review): the filter allocates its counter vector with `new` but this
* class declares no destructor, so `cells_` appears to be leaked -- confirm
* and release it in an out-of-line destructor (CounterVector is only
* forward-declared in this header).
*/
class CountingBloomFilter : public BloomFilter {
public:
/**
* Merges two counting Bloom filters into a newly allocated filter.
*
* @param x The first filter.
*
* @param y The second filter.
*
* @return The merged filter, or NULL if the hashers of *x* and *y* are not
* equal.
*/
static CountingBloomFilter* Merge(const CountingBloomFilter* x,
const CountingBloomFilter* y);
/**
* Constructs a counting Bloom filter.
*
* @param hasher The hasher to use for this Bloom filter.
*
* @param cells The number of cells (counters).
*
* @param width The number of bits per counter.
*/
CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width);
protected:
DECLARE_SERIAL(CountingBloomFilter);
CountingBloomFilter();
virtual void AddImpl(const Hasher::digest_vector& h);
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
private:
// Counter storage; NULL for a default-constructed filter.
CounterVector* cells_;
};
}
#endif

View file

@ -0,0 +1,18 @@
# Build rules for the probabilistic data structures library.
include(BroSubdir)
# Put this directory's source and build trees first on the include path.
include_directories(BEFORE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
# C++ sources that make up the library.
set(probabilistic_SRCS
BitVector.cc
BloomFilter.cc
CounterVector.cc
Hasher.cc)
# Presumably generates sources from the BIF file into BIF_OUTPUT_CC -- see
# the project's bif_target macro.
bif_target(bloom-filter.bif)
bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC})
# Make the library build wait for the generate_outputs target.
add_dependencies(bro_probabilistic generate_outputs)

View file

@ -0,0 +1,159 @@
#include "CounterVector.h"
#include <limits>
#include "BitVector.h"
#include "Serializer.h"
using namespace probabilistic;
// Constructs a counter vector of `cells` counters, each `width` bits wide,
// backed by a freshly allocated bit vector of width * cells bits.
CounterVector::CounterVector(size_t width, size_t cells)
    : bits_(new BitVector(width * cells)), width_(width)
{
}
// Copy-constructs a counter vector with its own copy of the other vector's
// bit storage.
CounterVector::CounterVector(const CounterVector& other)
    : bits_(new BitVector(*other.bits_)), width_(other.width_)
{
}
// Releases the bit storage held by this counter vector.
CounterVector::~CounterVector()
{
    delete bits_;
}
// Adds `value` to the counter in `cell` using a bitwise ripple-carry adder
// over the cell's width_ bits. On overflow the counter saturates: all of its
// bits are set and false is returned. Returns true otherwise.
bool CounterVector::Increment(size_type cell, count_type value)
{
    assert(cell < Size());
    assert(value != 0);

    const size_t value_bits = std::numeric_limits<count_type>::digits;
    size_t lsb = cell * width_;
    bool carry = false;

    for ( size_t i = 0; i < width_; ++i )
    {
        bool b1 = (*bits_)[lsb + i];

        // Extract bit i of value by shifting the 64-bit value itself. The
        // previous `value & (1 << i)` shifted a plain int, which is
        // undefined for i >= the width of int and dropped the high bits of
        // value. Positions beyond the width of count_type are zero.
        bool b2 = i < value_bits && ((value >> i) & 1) != 0;

        (*bits_)[lsb + i] = b1 ^ b2 ^ carry;
        carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) );
    }

    // A carry out of the top bit means overflow: saturate the counter.
    if ( carry )
        for ( size_t i = 0; i < width_; ++i )
            bits_->Set(lsb + i);

    return ! carry;
}
// Subtracts `value` from the counter in `cell` by adding its two's
// complement with a bitwise ripple-carry adder over the cell's width_ bits.
// Returns the carry out of the top bit.
bool CounterVector::Decrement(size_type cell, count_type value)
{
    assert(cell < Size());
    assert(value != 0);

    value = ~value + 1; // A - B := A + ~B + 1

    const size_t value_bits = std::numeric_limits<count_type>::digits;
    bool carry = false;
    size_t lsb = cell * width_;

    for ( size_t i = 0; i < width_; ++i )
    {
        bool b1 = (*bits_)[lsb + i];

        // Extract bit i of the negated value by shifting the 64-bit value
        // itself. The previous `value & (1 << i)` shifted a plain int, which
        // is undefined for i >= the width of int. Positions beyond the width
        // of count_type are sign-extended (the two's complement of a
        // non-zero value has all higher bits set).
        bool b2 = i < value_bits ? ((value >> i) & 1) != 0 : true;

        (*bits_)[lsb + i] = b1 ^ b2 ^ carry;
        carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) );
    }

    return carry;
}
// Reads the counter stored in `cell` by assembling its bits, least
// significant bit first.
CounterVector::count_type CounterVector::Count(size_type cell) const
{
    assert(cell < Size());

    size_t result = 0;
    size_t weight = 1;
    size_t pos = cell * width_;
    const size_t end = pos + width_;

    while ( pos < end )
    {
        if ( (*bits_)[pos] )
            result |= weight;

        ++pos;
        weight <<= 1;
    }

    return result;
}
// Returns the number of cells: total bits divided by the counter width.
CounterVector::size_type CounterVector::Size() const
{
    const size_type total_bits = bits_->Size();
    return total_bits / width_;
}
size_t CounterVector::Width() const
{
return width_;
}
// Returns the largest value a counter of width_ bits can hold, i.e. a size_t
// with only its width_ low bits set.
size_t CounterVector::Max() const
{
    const size_t all_ones = std::numeric_limits<size_t>::max();
    const size_t unused = std::numeric_limits<size_t>::digits - width_;
    return all_ones >> unused;
}
// Adds each counter of `other` to the corresponding counter of this vector
// with a bitwise ripple-carry adder. A counter that overflows saturates (all
// of its bits are set). Returns *this.
CounterVector& CounterVector::Merge(const CounterVector& other)
{
    assert(Size() == other.Size());
    assert(Width() == other.Width());

    for ( size_t cell = 0; cell < Size(); ++cell )
    {
        const size_t first = cell * width_;
        const size_t end = first + width_;
        bool carry = false;

        for ( size_t pos = first; pos < end; ++pos )
        {
            const bool mine = (*bits_)[pos];
            const bool theirs = (*other.bits_)[pos];

            // Sum bit is the XOR of both inputs and the incoming carry.
            (*bits_)[pos] = (mine != theirs) != carry;
            carry = ( mine && theirs ) || ( carry && ( mine != theirs ) );
        }

        if ( ! carry )
            continue;

        // Overflow: saturate this counter.
        for ( size_t pos = first; pos < end; ++pos )
            bits_->Set(pos);
    }

    return *this;
}
namespace probabilistic {

// In-place merge; forwards to Merge().
CounterVector& CounterVector::operator|=(const CounterVector& other)
{
    Merge(other);
    return *this;
}

// Returns a new counter vector holding the merge of x and y.
CounterVector operator|(const CounterVector& x, const CounterVector& y)
{
    CounterVector merged(x);
    merged |= y;
    return merged;
}

}
// Serializes this counter vector by delegating to SerialObj::Serialize().
bool CounterVector::Serialize(SerialInfo* info) const
{
    const bool ok = SerialObj::Serialize(info);
    return ok;
}
// Reads a SER_COUNTERVECTOR object via SerialObj::Unserialize() and returns
// it as a CounterVector pointer.
CounterVector* CounterVector::Unserialize(UnserialInfo* info)
{
    SerialObj* obj = SerialObj::Unserialize(info, SER_COUNTERVECTOR);
    return reinterpret_cast<CounterVector*>(obj);
}
IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR)
// Writes the counter vector in this order: the underlying bit vector, then
// the counter width as uint64. Returns false if either write fails.
bool CounterVector::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj);
if ( ! bits_->Serialize(info) )
return false;
return SERIALIZE(static_cast<uint64>(width_));
}
// Reads a counter vector in the order written by DoSerialize(): the bit
// vector, then the counter width. Returns false if either read fails.
bool CounterVector::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(SerialObj);
// NOTE(review): a previously held bits_ would be overwritten without being
// deleted -- presumably this only runs on freshly constructed objects.
bits_ = BitVector::Unserialize(info);
if ( ! bits_ )
return false;
uint64 width;
if ( ! UNSERIALIZE(&width) )
return false;
// The width travels as uint64 and is narrowed to size_t.
width_ = static_cast<size_t>(width);
return true;
}

View file

@ -0,0 +1,132 @@
#ifndef CounterVector_h
#define CounterVector_h
#include "SerialObj.h"
namespace probabilistic {
class BitVector;
/**
* A vector of counters, each of which have a fixed number of bits.
*/
class CounterVector : public SerialObj {
CounterVector& operator=(const CounterVector&);
public:
typedef size_t size_type;
typedef uint64 count_type;
/**
* Constructs a counter vector having cells of a given width.
*
* @param width The number of bits that each cell occupies.
*
* @param cells The number of cells in the bitvector.
*
* @pre `cells > 0 && width > 0`
*/
CounterVector(size_t width, size_t cells = 1024);
/**
* Copy-constructs a counter vector.
*
* @param other The counter vector to copy.
*/
CounterVector(const CounterVector& other);
~CounterVector();
/**
* Increments a given cell.
*
* @param cell The cell to increment.
*
* @param value The value to add to the current counter in *cell*.
*
* @return `true` if adding *value* to the counter in *cell* succeeded.
*
* @pre `cell < Size()`
*/
bool Increment(size_type cell, count_type value = 1);
/**
* Decrements a given cell.
*
* @param cell The cell to decrement.
*
* @param value The value to subtract from the current counter in *cell*.
*
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
*
* @pre `cell < Size()`
*/
bool Decrement(size_type cell, count_type value = 1);
/**
* Retrieves the counter of a given cell.
*
* @param cell The cell index to retrieve the count for.
*
* @return The counter associated with *cell*.
*
* @pre `cell < Size()`
*/
count_type Count(size_type cell) const;
/**
* Retrieves the number of cells in the storage.
*
* @return The number of cells.
*/
size_type Size() const;
/**
* Retrieves the counter width.
*
* @return The number of bits per counter.
*/
size_t Width() const;
/**
* Computes the maximum counter value.
*
* @return The maximum counter value based on the width.
*/
size_t Max() const;
/**
* Merges another counter vector into this instance by *adding* the counters
* of each cells.
*
* @param other The counter vector to merge into this instance.
*
* @return A reference to `*this`.
*
* @pre `Size() == other.Size() && Width() == other.Width()`
*/
CounterVector& Merge(const CounterVector& other);
/**
* An alias for ::Merge.
*/
CounterVector& operator|=(const CounterVector& other);
/**
* Combines two counter vectors into a new one, returned by value.
*
* NOTE(review): presumably equivalent to copying *x* and merging *y* into
* the copy, with the same preconditions as ::Merge — confirm against the
* implementation.
*
* @param x The first counter vector.
*
* @param y The second counter vector.
*
* @return The combined counter vector.
*/
friend CounterVector operator|(const CounterVector& x,
const CounterVector& y);
/**
* Serializes the counter vector.
*
* @param info The serialization context.
*
* @return NOTE(review): presumably `true` on success — confirm against the
* implementation.
*/
bool Serialize(SerialInfo* info) const;
/**
* Unserializes a counter vector.
*
* @param info The unserialization context.
*
* @return A pointer to the unserialized counter vector. NOTE(review):
* presumably newly allocated and owned by the caller, and null on failure —
* confirm against the implementation.
*/
static CounterVector* Unserialize(UnserialInfo* info);
protected:
DECLARE_SERIAL(CounterVector);
CounterVector() { }
private:
BitVector* bits_;
size_t width_;
};
}
#endif

109
src/probabilistic/Hasher.cc Normal file
View file

@ -0,0 +1,109 @@
#include <cassert>
#include <cstring>
#include <typeinfo>
#include "Hasher.h"
#include "digest.h"
using namespace probabilistic;
// Builds the underlying H3 function from the seed that compute_seed()
// derives from *seed* and *extra*.
Hasher::UHF::UHF(size_t seed, const std::string& extra)
	: h_(compute_seed(seed, extra)) { }
// Hashes the *n* bytes starting at *x*. Inputs longer than UHASH_KEY_SIZE
// trip the assertion; an empty input yields a digest of 0 without invoking
// the underlying hash function.
Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const
	{
	assert(n <= UHASH_KEY_SIZE);

	if ( n == 0 )
		return 0;

	return h_(x, n);
	}
// Derives the seed for a hash function instance.
//
// A SHA-256 digest is computed over (1) *extra* if it is non-empty, or the
// value returned by initial_seed() otherwise, followed by (2) the bytes of
// *seed*. The first sizeof(size_t) bytes of that digest form the result.
//
// seed: The per-instance seed mixed into the digest.
//
// extra: If non-empty, replaces the initial seed as the first digest input.
//
// Returns: The derived seed.
size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra)
	{
	u_char buf[SHA256_DIGEST_LENGTH];
	SHA256_CTX ctx;
	sha256_init(&ctx);

	if ( extra.empty() )
		{
		unsigned int first_seed = initial_seed();
		sha256_update(&ctx, &first_seed, sizeof(first_seed));
		}
	else
		sha256_update(&ctx, extra.c_str(), extra.size());

	sha256_update(&ctx, &seed, sizeof(seed));
	sha256_final(&ctx, buf);

	// Take the first sizeof(size_t) bytes as seed. Copy them instead of
	// dereferencing a casted pointer: the byte buffer carries no alignment
	// guarantee for size_t, and reading it through a size_t lvalue violates
	// the aliasing rules.
	size_t result;
	std::memcpy(&result, buf, sizeof(result));
	return result;
	}
// Factory: allocates a DefaultHasher with *k* hash functions and the given
// name, and hands it to the caller.
Hasher* Hasher::Create(size_t k, const std::string& name)
	{
	Hasher* hasher = new DefaultHasher(k, name);
	return hasher;
	}
// Records the number of hash functions and the hasher's name.
Hasher::Hasher(size_t k, const std::string& name) : k_(k), name_(name) { }
// Creates *k* hash functions, the i-th one seeded with *i* and using *name*
// as the extra seed.
DefaultHasher::DefaultHasher(size_t k, const std::string& name)
	: Hasher(k, name)
	{
	// The final size is known up front; reserving avoids repeatedly
	// reallocating the vector and copying the already-built hash functions.
	hash_functions_.reserve(k);

	for ( size_t i = 0; i < k; ++i )
		hash_functions_.push_back(UHF(i, name));
	}
// Applies each of the K() hash functions to the *n* bytes at *x* and
// returns the digests in function order.
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
	{
	digest_vector digests(K(), 0);

	for ( size_t idx = 0; idx != digests.size(); ++idx )
		digests[idx] = hash_functions_[idx](x, n);

	return digests;
	}
// Returns a heap-allocated copy of this hasher.
DefaultHasher* DefaultHasher::Clone() const
	{
	DefaultHasher* copy = new DefaultHasher(*this);
	return copy;
	}
bool DefaultHasher::Equals(const Hasher* other) const
{
if ( typeid(*this) != typeid(*other) )
return false;
const DefaultHasher* o = static_cast<const DefaultHasher*>(other);
return hash_functions_ == o->hash_functions_;
}
// Sets up the two base hash functions, seeded with 1 and 2 respectively and
// both using *name* as the extra seed.
DoubleHasher::DoubleHasher(size_t k, const std::string& name)
	: Hasher(k, name), h1_(1, name), h2_(2, name) { }
// Computes K() digests from only two hash evaluations: the i-th digest is
// the first hash plus i times the second hash.
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const
	{
	const digest base = h1_(x, n);
	const digest step = h2_(x, n);

	digest_vector digests(K(), 0);

	for ( size_t idx = 0; idx != digests.size(); ++idx )
		digests[idx] = base + idx * step;

	return digests;
	}
// Returns a heap-allocated copy of this hasher.
DoubleHasher* DoubleHasher::Clone() const
	{
	DoubleHasher* copy = new DoubleHasher(*this);
	return copy;
	}
bool DoubleHasher::Equals(const Hasher* other) const
{
if ( typeid(*this) != typeid(*other) )
return false;
const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
return h1_ == o->h1_ && h2_ == o->h2_;
}

131
src/probabilistic/Hasher.h Normal file
View file

@ -0,0 +1,131 @@
#ifndef Hasher_h
#define Hasher_h
#include "Hash.h"
#include "H3.h"
namespace probabilistic {
/**
* The abstract base class for hashers, i.e., constructs which hash elements
* *k* times.
*/
class Hasher {
public:
/** The type of a single hash value. */
typedef hash_t digest;
/** A sequence of hash values, one per hash function. */
typedef std::vector<digest> digest_vector;
/**
* Constructs the hasher used by the implementation.
*
* @param k The number of hash functions.
*
* @param name The name of the hasher; also used as extra seed for its hash
* functions.
*
* @return A newly allocated hasher.
*
* @todo This factory function exists because the Hasher class hierarchy is
* not yet serializable.
*/
static Hasher* Create(size_t k, const std::string& name);
virtual ~Hasher() { }
/**
* Hashes an object by passing its address and `sizeof(T)` to ::Hash, i.e.,
* the object's in-memory representation is hashed.
*
* @param x The object to hash.
*
* @return The digests computed by ::Hash.
*/
template <typename T>
digest_vector operator()(const T& x) const
{
return Hash(&x, sizeof(T));
}
/**
* Computes the digests for a sequence of bytes.
*
* @param x A pointer to the first byte to hash.
*
* @param n The number of bytes to hash.
*
* @return The computed digests.
*/
virtual digest_vector Hash(const void* x, size_t n) const = 0;
/**
* Returns a copy of the hasher.
*/
virtual Hasher* Clone() const = 0;
/**
* Tests whether this hasher is equal to another one.
*
* @param other The hasher to compare with.
*/
virtual bool Equals(const Hasher* other) const = 0;
/**
* Returns the number of hash functions, as passed to the constructor.
*/
size_t K() const { return k_; }
/**
* Returns the name of the hasher, as passed to the constructor.
*/
const std::string& Name() const { return name_; }
protected:
/**
* A universal hash function family.
*/
class UHF {
public:
/**
* Constructs an H3 hash function seeded with a given seed and an optional
* extra seed to replace the initial Bro seed.
*
* @param seed The seed to use for this instance.
*
* @param extra If not empty, this string replaces the initial Bro seed
* when computing the seed for this instance.
*/
UHF(size_t seed, const std::string& extra = "");
/**
* Hashes an object by passing its address and `sizeof(T)` to ::hash, i.e.,
* the object's in-memory representation is hashed.
*/
template <typename T>
digest operator()(const T& x) const
{
return hash(&x, sizeof(T));
}
/**
* Hashes the *n* bytes starting at *x*; forwards to ::hash.
*/
digest operator()(const void* x, size_t n) const
{
return hash(x, n);
}
/**
* Two instances are equal if their underlying H3 functions compare equal.
*/
friend bool operator==(const UHF& x, const UHF& y)
{
return x.h_ == y.h_;
}
friend bool operator!=(const UHF& x, const UHF& y)
{
return ! (x == y);
}
/**
* Hashes a sequence of bytes.
*
* @param x A pointer to the first byte to hash.
*
* @param n The number of bytes to hash.
*
* @return The digest of the input; 0 if *n* is 0.
*
* @pre `n <= UHASH_KEY_SIZE`
*/
digest hash(const void* x, size_t n) const;
private:
/**
* Derives the seed for the underlying hash function from a SHA-256 digest
* over *extra* (or the initial seed if *extra* is empty) followed by *seed*.
*/
static size_t compute_seed(size_t seed, const std::string& extra);
H3<digest, UHASH_KEY_SIZE> h_;
};
/**
* Constructs a hasher; only accessible to derived classes.
*
* @param k The number of hash functions.
*
* @param name The name of the hasher.
*/
Hasher(size_t k, const std::string& name);
private:
const size_t k_;
std::string name_;
};
/**
* The default hashing policy. Performs *k* hash function computations.
*/
class DefaultHasher : public Hasher {
public:
/**
* Constructs a hasher with *k* hash functions, the i-th one seeded with *i*
* and using *name* as extra seed.
*
* @param k The number of hash functions.
*
* @param name The name of the hasher.
*/
DefaultHasher(size_t k, const std::string& name);
/**
* Applies each of the *k* hash functions to the *n* bytes at *x*.
*
* @return A vector holding one digest per hash function.
*/
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
/**
* Returns a newly allocated copy of this hasher.
*/
virtual DefaultHasher* Clone() const /* final */;
/**
* Returns `true` if *other* has the same dynamic type and its hash
* functions compare equal to the ones of this instance.
*/
virtual bool Equals(const Hasher* other) const /* final */;
private:
std::vector<UHF> hash_functions_;
};
/**
* The *double-hashing* policy. Uses a linear combination of two hash functions.
*/
class DoubleHasher : public Hasher {
public:
/**
* Constructs a hasher yielding *k* digests from two hash functions, which
* are seeded with 1 and 2 and use *name* as extra seed.
*
* @param k The number of digests to produce.
*
* @param name The name of the hasher.
*/
DoubleHasher(size_t k, const std::string& name);
/**
* Hashes the *n* bytes at *x* with both hash functions and combines the
* results: the i-th digest is `h1 + i * h2`.
*
* @return A vector holding *k* digests.
*/
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
/**
* Returns a newly allocated copy of this hasher.
*/
virtual DoubleHasher* Clone() const /* final */;
/**
* Returns `true` if *other* has the same dynamic type and both of its hash
* functions compare equal to the ones of this instance.
*/
virtual bool Equals(const Hasher* other) const /* final */;
private:
UHF h1_;
UHF h2_;
};
}
#endif

View file

@ -0,0 +1,130 @@
# ===========================================================================
#
# Bloom Filter Functions
#
# ===========================================================================
%%{
// TODO: This is currently included from the top-level src directory, hence
// paths are relative to there. We need a better mechanism to pull in
// BiFs defined in subdirectories.
#include "probabilistic/BloomFilter.h"
#include "OpaqueVal.h"
using namespace probabilistic;
%%}
module GLOBAL;
## Creates a basic Bloom filter.
##
## fp: The desired false-positive rate; must lie between 0 and 1.
##
## capacity: The maximum number of elements that guarantees a false-positive
##           rate of *fp*.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
##       the initialization will become dependent on the initial seed.
##
## Returns: A Bloom filter handle.
function bloomfilter_basic_init%(fp: double, capacity: count,
                                 name: string &default=""%): opaque of bloomfilter
	%{
	const bool fp_out_of_range = fp < 0.0 || fp > 1.0;

	if ( fp_out_of_range )
		{
		reporter->Error("false-positive rate must take value between 0 and 1");
		return NULL;
		}

	// Derive the filter dimensions from the requested rate and capacity.
	size_t num_cells = BasicBloomFilter::M(fp, capacity);
	size_t num_hashes = BasicBloomFilter::K(num_cells, capacity);

	const Hasher* hasher = Hasher::Create(num_hashes, name->CheckString());
	BasicBloomFilter* filter = new BasicBloomFilter(hasher, num_cells);

	return new BloomFilterVal(filter);
	%}
## Creates a counting Bloom filter.
##
## k: The number of hash functions to use.
##
## cells: The number of cells of the underlying counter vector.
##
## max: The maximum counter value associated with each element, described
##      by *w = floor(log_2(max)) + 1* bits. Each bit in the underlying
##      counter vector becomes a cell of size *w* bits. Must be greater
##      than 0.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
##       the initialization will become dependent on the initial seed.
##
## Returns: A Bloom filter handle.
function bloomfilter_counting_init%(k: count, cells: count, max: count,
                                    name: string &default=""%): opaque of bloomfilter
	%{
	if ( max == 0 )
		{
		reporter->Error("max counter value must be greater than 0");
		return NULL;
		}

	const Hasher* hasher = Hasher::Create(k, name->CheckString());

	// Count the bits needed to represent *max*: one bit, plus one more for
	// every time the value can be halved before reaching zero.
	uint16 width = 1;
	for ( max >>= 1; max; max >>= 1 )
		++width;

	CountingBloomFilter* filter = new CountingBloomFilter(hasher, cells, width);

	return new BloomFilterVal(filter);
	%}
## Adds an element to a Bloom filter. The first element added determines the
## type of the filter; subsequent elements must have that same type.
##
## bf: The Bloom filter handle.
##
## x: The element to add.
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
	%{
	BloomFilterVal* filter = static_cast<BloomFilterVal*>(bf);

	// An untyped filter adopts the type of the element.
	if ( ! filter->Type() && ! filter->Typify(x->Type()) )
		{
		reporter->Error("failed to set Bloom filter type");
		return NULL;
		}

	if ( filter->Type() != x->Type() )
		{
		reporter->Error("incompatible Bloom filter types");
		return NULL;
		}

	filter->Add(x);
	return NULL;
	%}
## Retrieves the counter for a given element in a Bloom filter.
##
## bf: The Bloom filter handle.
##
## x: The element to count.
##
## Returns: The counter associated with *x* in *bf*, or 0 if the filter is
##          untyped or its type differs from the type of *x*.
function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
	%{
	const BloomFilterVal* filter = static_cast<const BloomFilterVal*>(bf);

	if ( ! filter->Type() )
		{
		reporter->Error("cannot perform lookup on untyped Bloom filter");
		return new Val(0, TYPE_COUNT);
		}

	if ( filter->Type() != x->Type() )
		{
		reporter->Error("incompatible Bloom filter types");
		return new Val(0, TYPE_COUNT);
		}

	return new Val(static_cast<uint64>(filter->Count(x)), TYPE_COUNT);
	%}
## Merges two Bloom filters. Both filters must have the same type.
##
## bf1: The first Bloom filter handle.
##
## bf2: The second Bloom filter handle.
##
## Returns: The union of *bf1* and *bf2*.
function bloomfilter_merge%(bf1: opaque of bloomfilter,
                            bf2: opaque of bloomfilter%): opaque of bloomfilter
	%{
	const BloomFilterVal* lhs = static_cast<const BloomFilterVal*>(bf1);
	const BloomFilterVal* rhs = static_cast<const BloomFilterVal*>(bf2);

	if ( lhs->Type() == rhs->Type() )
		return BloomFilterVal::Merge(lhs, rhs);

	reporter->Error("incompatible Bloom filter types");
	return NULL;
	%}