From 4d275522c7a87f8c69b1494126cc995a20b2d66b Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 23 May 2013 16:03:26 -0700 Subject: [PATCH 01/50] Add abstraction for vector of bits. A bitvector is a vector of bits with underlying block storage. Since C++ has no notion of lvalues in the context of bits, we use a small wrapper class Reference that masks the desired bit in the corresponding block. --- src/BitVector.cc | 455 +++++++++++++++++++++++++++++++++++++++++++++ src/BitVector.h | 324 ++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 3 files changed, 780 insertions(+) create mode 100644 src/BitVector.cc create mode 100644 src/BitVector.h diff --git a/src/BitVector.cc b/src/BitVector.cc new file mode 100644 index 0000000000..2f714a6c79 --- /dev/null +++ b/src/BitVector.cc @@ -0,0 +1,455 @@ +#include "BitVector.h" + +#include +#include + +BitVector::size_type BitVector::npos = static_cast(-1); +BitVector::block_type BitVector::bits_per_block = + std::numeric_limits::digits; + +namespace { + +uint8_t count_table[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, + 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, + 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, + 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, + 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, + 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, + 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, + 6, 7, 6, 7, 7, 8 +}; + +} // namespace + +BitVector::Reference::Reference(block_type& block, block_type i) + : block_(block), + mask_(block_type(1) << i) + { + assert(i < 
bits_per_block); + } + +BitVector::Reference& BitVector::Reference::flip() + { + block_ ^= mask_; + return *this; + } + +BitVector::Reference::operator bool() const + { + return (block_ & mask_) != 0; + } + +bool BitVector::Reference::operator~() const + { + return (block_ & mask_) == 0; + } + +BitVector::Reference& BitVector::Reference::operator=(bool x) + { + x ? block_ |= mask_ : block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator=(Reference const& other) + { + other ? block_ |= mask_ : block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator|=(bool x) + { + if (x) + block_ |= mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator&=(bool x) + { + if (! x) + block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator^=(bool x) + { + if (x) + block_ ^= mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator-=(bool x) + { + if (x) + block_ &= ~mask_; + return *this; + } + + +BitVector::BitVector() : num_bits_(0) { } + +BitVector::BitVector(size_type size, bool value) + : bits_(bits_to_blocks(size), value ? 
~block_type(0) : 0), + num_bits_(size) +{ } + +BitVector::BitVector(BitVector const& other) + : bits_(other.bits_), + num_bits_(other.num_bits_) +{ } + +BitVector BitVector::operator~() const + { + BitVector b(*this); + b.flip(); + return b; + } + +BitVector& BitVector::operator=(BitVector const& other) + { + bits_ = other.bits_; + return *this; + } + +BitVector BitVector::operator<<(size_type n) const + { + BitVector b(*this); + return b <<= n; + } + +BitVector BitVector::operator>>(size_type n) const + { + BitVector b(*this); + return b >>= n; + } + +BitVector& BitVector::operator<<=(size_type n) + { + if (n >= num_bits_) + return reset(); + + if (n > 0) + { + size_type last = blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits_[0]; + assert(blocks() >= 1); + assert(div <= last); + + if (r != 0) + { + for (size_type i = last - div; i > 0; --i) + b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); + b[div] = b[0] << r; + } + else + { + for (size_type i = last-div; i > 0; --i) + b[i + div] = b[i]; + b[div] = b[0]; + } + + std::fill_n(b, div, block_type(0)); + zero_unused_bits(); + } + + return *this; + } + +BitVector& BitVector::operator>>=(size_type n) + { + if (n >= num_bits_) + return reset(); + + if (n > 0) + { + size_type last = blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits_[0]; + assert(blocks() >= 1); + assert(div <= last); + + if (r != 0) + { + for (size_type i = last - div; i > 0; --i) + b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); + b[last - div] = b[last] >> r; + } + else + { + for (size_type i = div; i <= last; ++i) + b[i-div] = b[i]; + } + + std::fill_n(b + (blocks() - div), div, block_type(0)); + } + return *this; + } + +BitVector& BitVector::operator&=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] &= other.bits_[i]; + return *this; + } + 
+BitVector& BitVector::operator|=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] |= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator^=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] ^= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator-=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] &= ~other.bits_[i]; + return *this; + } + +BitVector operator&(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b &= y; + } + +BitVector operator|(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b |= y; + } + +BitVector operator^(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b ^= y; + } + +BitVector operator-(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b -= y; + } + +bool operator==(BitVector const& x, BitVector const& y) + { + return x.num_bits_ == y.num_bits_ && x.bits_ == y.bits_; + } + +bool operator!=(BitVector const& x, BitVector const& y) + { + return ! (x == y); + } + +bool operator<(BitVector const& x, BitVector const& y) + { + assert(x.size() == y.size()); + for (BitVector::size_type r = x.blocks(); r > 0; --r) + { + BitVector::size_type i = r - 1; + if (x.bits_[i] < y.bits_[i]) + return true; + else if (x.bits_[i] > y.bits_[i]) + return false; + } + return false; + } + +void BitVector::resize(size_type n, bool value) + { + size_type old = blocks(); + size_type required = bits_to_blocks(n); + block_type block_value = value ? 
~block_type(0) : block_type(0); + + if (required != old) + bits_.resize(required, block_value); + + if (value && (n > num_bits_) && extra_bits()) + bits_[old - 1] |= (block_value << extra_bits()); + + num_bits_ = n; + zero_unused_bits(); + } + +void BitVector::clear() + { + bits_.clear(); + num_bits_ = 0; + } + +void BitVector::push_back(bool bit) + { + size_type s = size(); + resize(s + 1); + set(s, bit); + } + +void BitVector::append(block_type block) + { + size_type excess = extra_bits(); + if (excess) + { + assert(! bits_.empty()); + bits_.push_back(block >> (bits_per_block - excess)); + bits_[bits_.size() - 2] |= (block << excess); + } + else + { + bits_.push_back(block); + } + num_bits_ += bits_per_block; + } + +BitVector& BitVector::set(size_type i, bool bit) + { + assert(i < num_bits_); + + if (bit) + bits_[block_index(i)] |= bit_mask(i); + else + reset(i); + + return *this; + } + +BitVector& BitVector::set() + { + std::fill(bits_.begin(), bits_.end(), ~block_type(0)); + zero_unused_bits(); + return *this; + } + +BitVector& BitVector::reset(size_type i) + { + assert(i < num_bits_); + bits_[block_index(i)] &= ~bit_mask(i); + return *this; + } + +BitVector& BitVector::reset() + { + std::fill(bits_.begin(), bits_.end(), block_type(0)); + return *this; + } + +BitVector& BitVector::flip(size_type i) + { + assert(i < num_bits_); + bits_[block_index(i)] ^= bit_mask(i); + return *this; + } + +BitVector& BitVector::flip() + { + for (size_type i = 0; i < blocks(); ++i) + bits_[i] = ~bits_[i]; + zero_unused_bits(); + return *this; + } + +bool BitVector::operator[](size_type i) const + { + assert(i < num_bits_); + return (bits_[block_index(i)] & bit_mask(i)) != 0; + } + +BitVector::Reference BitVector::operator[](size_type i) + { + assert(i < num_bits_); + return Reference(bits_[block_index(i)], bit_index(i)); + } + +BitVector::size_type BitVector::count() const + { + std::vector::const_iterator first = bits_.begin(); + size_t n = 0; + size_type length = blocks(); + 
while (length) + { + block_type block = *first; + while (block) + { + // TODO: use __popcnt if available. + n += count_table[block & ((1u << 8) - 1)]; + block >>= 8; + } + ++first; + --length; + } + return n; + } + +BitVector::size_type BitVector::blocks() const + { + return bits_.size(); + } + +BitVector::size_type BitVector::size() const + { + return num_bits_; + } + +bool BitVector::empty() const + { + return bits_.empty(); + } + +BitVector::size_type BitVector::find_first() const + { + return find_from(0); + } + +BitVector::size_type BitVector::find_next(size_type i) const + { + if (i >= (size() - 1) || size() == 0) + return npos; + ++i; + size_type bi = block_index(i); + block_type block = bits_[bi] & (~block_type(0) << bit_index(i)); + return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); + } + +BitVector::size_type BitVector::lowest_bit(block_type block) + { + block_type x = block - (block & (block - 1)); + size_type log = 0; + while (x >>= 1) + ++log; + return log; + } + +BitVector::block_type BitVector::extra_bits() const + { + return bit_index(size()); + } + +void BitVector::zero_unused_bits() + { + if (extra_bits()) + bits_.back() &= ~(~block_type(0) << extra_bits()); + } + +BitVector::size_type BitVector::find_from(size_type i) const + { + while (i < blocks() && bits_[i] == 0) + ++i; + if (i >= blocks()) + return npos; + return i * bits_per_block + lowest_bit(bits_[i]); + } diff --git a/src/BitVector.h b/src/BitVector.h new file mode 100644 index 0000000000..46d7e2df8f --- /dev/null +++ b/src/BitVector.h @@ -0,0 +1,324 @@ +#ifndef BitVector_h +#define BitVector_h + +#include +#include + +/** + * A vector of bits. + */ +class BitVector { +public: + typedef size_t block_type; + typedef size_t size_type; + static size_type npos; + static block_type bits_per_block; + +public: + /** + * An lvalue proxy for single bits. 
+ */ + class Reference { + friend class BitVector; + Reference(block_type& block, block_type i); + + public: + Reference& flip(); + operator bool() const; + bool operator~() const; + Reference& operator=(bool x); + Reference& operator=(Reference const& other); + Reference& operator|=(bool x); + Reference& operator&=(bool x); + Reference& operator^=(bool x); + Reference& operator-=(bool x); + + private: + void operator&(); + block_type& block_; + block_type const mask_; + }; + + typedef bool const_reference; + + /** + * Constructs an empty bit vector. + */ + BitVector(); + + /** + * Constructs a bit vector of a given size. + * @param size The number of bits. + * @param value The value for each bit. + */ + explicit BitVector(size_type size, bool value = false); + + /** + * Constructs a bit vector from a sequence of blocks. + */ + template + BitVector(InputIterator first, InputIterator last) + { + bits_.insert(bits_.end(), first, last); + num_bits_ = bits_.size() * bits_per_block; + } + + /** + * Copy-constructs a bit vector. + * @param other The bit vector to copy. + */ + BitVector(const BitVector& other); + + /** + * Assigns another bit vector to this instance. + * @param other The RHS of the assignment. 
+ */ + BitVector& operator=(const BitVector& other); + + // + // Bitwise operations + // + BitVector operator~() const; + BitVector operator<<(size_type n) const; + BitVector operator>>(size_type n) const; + BitVector& operator<<=(size_type n); + BitVector& operator>>=(size_type n); + BitVector& operator&=(BitVector const& other); + BitVector& operator|=(BitVector const& other); + BitVector& operator^=(BitVector const& other); + BitVector& operator-=(BitVector const& other); + friend BitVector operator&(BitVector const& x, BitVector const& y); + friend BitVector operator|(BitVector const& x, BitVector const& y); + friend BitVector operator^(BitVector const& x, BitVector const& y); + friend BitVector operator-(BitVector const& x, BitVector const& y); + + // + // Relational operators + // + friend bool operator==(BitVector const& x, BitVector const& y); + friend bool operator!=(BitVector const& x, BitVector const& y); + friend bool operator<(BitVector const& x, BitVector const& y); + + // + // Basic operations + // + /** Appends the bits in a sequence of values. + * @tparam Iterator A forward iterator. + * @param first An iterator pointing to the first element of the sequence. + * @param last An iterator pointing to one past the last element of the + * sequence. + */ + template + void append(ForwardIterator first, ForwardIterator last) + { + if (first == last) + return; + + block_type excess = extra_bits(); + typename std::iterator_traits::difference_type delta = + std::distance(first, last); + + bits_.reserve(blocks() + delta); + if (excess == 0) + { + bits_.back() |= (*first << excess); + do + { + block_type b = *first++ >> (bits_per_block - excess); + bits_.push_back(b | (first == last ? 0 : *first << excess)); + } while (first != last); + } + else + { + bits_.insert(bits_.end(), first, last); + } + num_bits_ += bits_per_block * delta; + } + + /** + * Appends the bits in a given block. + * @param block The block containing bits to append. 
+ */ + void append(block_type block); + + /** Appends a single bit to the end of the bit vector. + * @param bit The value of the bit. + */ + void push_back(bool bit); + + /** + * Clears all bits in the bitvector. + */ + void clear(); + + /** + * Resizes the bit vector to a new number of bits. + * @param n The new number of bits of the bit vector. + * @param value The bit value of new values, if the vector expands. + */ + void resize(size_type n, bool value = false); + + /** + * Sets a bit at a specific position to a given value. + * @param i The bit position. + * @param bit The value assigned to position *i*. + * @return A reference to the bit vector instance. + */ + BitVector& set(size_type i, bool bit = true); + + /** + * Sets all bits to 1. + * @return A reference to the bit vector instance. + */ + BitVector& set(); + + /** + * Resets a bit at a specific position, i.e., sets it to 0. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& reset(size_type i); + + /** + * Sets all bits to 0. + * @return A reference to the bit vector instance. + */ + BitVector& reset(); + + /** + * Toggles/flips a bit at a specific position. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& flip(size_type i); + + /** + * Computes the complement. + * @return A reference to the bit vector instance. + */ + BitVector& flip(); + + /** Retrieves a single bit. + * @param i The bit position. + * @return A mutable reference to the bit at position *i*. + */ + Reference operator[](size_type i); + + /** + * Retrieves a single bit. + * @param i The bit position. + * @return A const-reference to the bit at position *i*. + */ + const_reference operator[](size_type i) const; + + /** + * Counts the number of 1-bits in the bit vector. Also known as *population + * count* or *Hamming weight*. + * @return The number of bits set to 1. 
+ */ + size_type count() const; + + /** + * Retrieves the number of blocks of the underlying storage. + * @param The number of blocks that represent `size()` bits. + */ + size_type blocks() const; + + /** + * Retrieves the number of bits the bitvector consist of. + * @return The length of the bit vector in bits. + */ + size_type size() const; + + /** + * Checks whether the bit vector is empty. + * @return `true` iff the bitvector has zero length. + */ + bool empty() const; + + /** + * Finds the bit position of of the first 1-bit. + * @return The position of the first bit that equals to one or `npos` if no + * such bit exists. + */ + size_type find_first() const; + + /** + * Finds the next 1-bit from a given starting position. + * + * @param i The index where to start looking. + * + * @return The position of the first bit that equals to 1 after position + * *i* or `npos` if no such bit exists. + */ + size_type find_next(size_type i) const; + +private: + /** + * Computes the block index for a given bit position. + */ + static size_type block_index(size_type i) + { + return i / bits_per_block; + } + + /** + * Computes the bit index within a given block for a given bit position. + */ + static block_type bit_index(size_type i) + { + return i % bits_per_block; + } + + /** + * Computes the bitmask block to extract a bit a given bit position. + */ + static block_type bit_mask(size_type i) + { + return block_type(1) << bit_index(i); + } + + /** + * Computes the number of blocks needed to represent a given number of + * bits. + * @param bits the number of bits. + * @return The number of blocks to represent *bits* number of bits. + */ + static size_type bits_to_blocks(size_type bits) + { + return bits / bits_per_block + + static_cast(bits % bits_per_block != 0); + } + + /** + * Computes the bit position first 1-bit in a given block. + * @param block The block to inspect. + * @return The bit position where *block* has its first bit set to 1. 
+ */ + static size_type lowest_bit(block_type block); + + /** + * Computes the number of excess/unused bits in the bit vector. + */ + block_type extra_bits() const; + + /** + * If the number of bits in the vector are not not a multiple of + * bitvector::bits_per_block, then the last block exhibits unused bits which + * this function resets. + */ + void zero_unused_bits(); + + /** + * Looks for the first 1-bit starting at a given position. + * @param i The block index to start looking. + * @return The block index of the first 1-bit starting from *i* or + * `bitvector::npos` if no 1-bit exists. + */ + size_type find_from(size_type i) const; + + std::vector bits_; + size_type num_bits_; +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 447b7d9ec7..33aaab29c1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -303,6 +303,7 @@ set(bro_SRCS Base64.cc BitTorrent.cc BitTorrentTracker.cc + BitVector.cc BPF_Program.cc BroDoc.cc BroDocObj.cc From 9e32eaad6db992e60a3d669c4d8c7b5016cc8cbc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 28 May 2013 20:58:01 -0700 Subject: [PATCH 02/50] Make bitvectors serializable. 
--- src/BitVector.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++-- src/BitVector.h | 13 ++++++++--- src/SerialTypes.h | 2 ++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index 2f714a6c79..f57301d506 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -2,6 +2,7 @@ #include #include +#include "Serializer.h" BitVector::size_type BitVector::npos = static_cast(-1); BitVector::block_type BitVector::bits_per_block = @@ -62,7 +63,7 @@ BitVector::Reference& BitVector::Reference::operator=(Reference const& other) BitVector::Reference& BitVector::Reference::operator|=(bool x) { - if (x) + if (x) block_ |= mask_; return *this; } @@ -73,7 +74,7 @@ BitVector::Reference& BitVector::Reference::operator&=(bool x) block_ &= ~mask_; return *this; } - + BitVector::Reference& BitVector::Reference::operator^=(bool x) { if (x) @@ -453,3 +454,55 @@ BitVector::size_type BitVector::find_from(size_type i) const return npos; return i * bits_per_block + lowest_bit(bits_[i]); } + +bool BitVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BitVector* BitVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_BITVECTOR)); + } + +IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR); + +bool BitVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BITVECTOR, SerialObj); + + if ( ! SERIALIZE(static_cast(bits_.size())) ) + return false; + + for (size_t i = 0; i < bits_.size(); ++i) + if ( ! SERIALIZE(static_cast(bits_[i])) ) + return false; + + return SERIALIZE(static_cast(num_bits_)); + } + +bool BitVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint64 size; + if ( ! UNSERIALIZE(&size) ) + return false; + + bits_.resize(static_cast(size)); + uint64 block; + for ( size_t i = 0; i < bits_.size(); ++i ) + { + if ( ! 
UNSERIALIZE(&block) ) + return false; + bits_[i] = static_cast(block); + } + + uint64 num_bits; + if ( ! UNSERIALIZE(&num_bits) ) + return false; + num_bits_ = static_cast(num_bits); + + return true; + } diff --git a/src/BitVector.h b/src/BitVector.h index 46d7e2df8f..9900dd103e 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -3,11 +3,12 @@ #include #include +#include "SerialObj.h" /** * A vector of bits. */ -class BitVector { +class BitVector : SerialObj { public: typedef size_t block_type; typedef size_t size_type; @@ -42,7 +43,7 @@ public: typedef bool const_reference; /** - * Constructs an empty bit vector. + * Default-constructs an empty bit vector. */ BitVector(); @@ -253,6 +254,12 @@ public: */ size_type find_next(size_type i) const; + bool Serialize(SerialInfo* info) const; + static BitVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(BitVector); + private: /** * Computes the block index for a given bit position. @@ -286,7 +293,7 @@ private: */ static size_type bits_to_blocks(size_type bits) { - return bits / bits_per_block + return bits / bits_per_block + static_cast(bits % bits_per_block != 0); } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 723badab1e..c9c0c34a33 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -49,6 +49,7 @@ SERIAL_IS(STATE_ACCESS, 0x1100) SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) +SERIAL_IS(BITVECTOR, 0x1500) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -202,5 +203,6 @@ SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) +SERIAL_CONST2(BITVECTOR) #endif From d873db03cef3bb09d45e789d69607487e36b6093 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 31 May 2013 18:31:14 -0700 Subject: [PATCH 03/50] Add draft of Bloom filter type hierarchy. 
--- src/BloomFilter.h | 266 +++++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 2 files changed, 267 insertions(+) create mode 100644 src/BloomFilter.h diff --git a/src/BloomFilter.h b/src/BloomFilter.h new file mode 100644 index 0000000000..a767c6b8b8 --- /dev/null +++ b/src/BloomFilter.h @@ -0,0 +1,266 @@ +#ifndef BloomFilter_h +#define BloomFilter_h + +#include +#include "BitVector.h" +#include "Hash.h" +#include "H3.h" + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : SerialObj { +public: + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. + */ + explicit CounterVector(unsigned width); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector(); + +private: + BitVector bits_; + unsigned width_; +}; + +/** + * The abstract base class for hash policies. + * @tparam Codomain An integral type. 
+ */ +class HashPolicy { +public: + typedef hash_t hash_type; + virtual ~HashPolicy() { } + size_t k() const { return k; } + virtual std::vector Hash(const void* x, size_t n) const = 0; +protected: + /** + * A functor that computes a universal hash function. + * @tparam Codomain An integral type. + */ + template + class Hasher { + public: + template + Codomain operator()(const Domain& x) const + { + return h3_(&x, sizeof(x)); + } + Codomain operator()(const void* x, size_t n) const + { + return h3_(x, n); + } + private: + // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in + // Hash.h. I do not know how this value impacts the hash function behavior + // so I'll just copy it verbatim. (Matthias) + H3 h3_; + }; + + HashPolicy(size_t k) : k_(k) { } +private: + size_t k_; +}; + +/** + * The *default* hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } + virtual ~DoubleHashing() { } + + virtual std::vector Hash(const void* x, size_t n) const + { + std::vector h(k(), 0); + for (size_t i = 0; i < h.size(); ++i) + h[i] = hashers_[i](x, n); + return h; + } + +private: + std::vector< Hasher > hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of 2 hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k) : HashPolicy(k), hashers_(k) { } + virtual ~DoubleHashing() { } + + virtual std::vector Hash(const void* x, size_t n) const + { + Codomain h1 = hasher1_(x); + Codomain h2 = hasher2_(x); + std::vector h(k(), 0); + for (size_t i = 0; i < h.size(); ++i) + h[i] = h1 + i * h2; + return h; + } + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +/** + * The abstract base class for Bloom filters. + */ +class BloomFilter : SerialObj { +public: + virtual ~BloomFilter() { delete hash_; } + + /** + * Adds an element of type T to the Bloom filter. 
+ * @param x The element to add + */ + template + void Add(const T& x) + { + ++elements_; + AddImpl(hash_->Hash(x)); + } + + /** + * Retrieves the associated count of a given value. + * + * @param x The value of type `T` to check. + * + * @return The counter associated with *x*. + */ + template + size_t Count(const T& x) const + { + return CountImpl(hash_->Hash(x)); + } + + /** + * Retrieves the number of elements added to the Bloom filter. + * + * @return The number of elements in this Bloom filter. + */ + size_t Size() const + { + return elements_; + } + +protected: + typedef std::vector HashVector; + + /** + * Default-constructs a Bloom filter. + */ + BloomFilter(); + + /** + * Constructs a BloomFilter. + * @param hash The hashing policy. + */ + BloomFilter(HashPolicy* hash); + + virtual void AddImpl(const HashVector& hashes) = 0; + + virtual size_t CountImpl(const HashVector& hashes) const = 0; + + std::vector Hash(const T& x) const + { + return hash_->Hash(&x, sizeof(x)); + } + +private: + HashPolicy* hash_; // Owned by *this. + + size_t elements_; +}; + +/** + * A basic Bloom filter. + */ +class BasicBloomFilter : public BloomFilter { +public: + BasicBloomFilter(); + BasicBloomFilter(HashPolicy* hash); + +protected: + virtual void AddImpl(const HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits_.set(h[i] % h.size()); + } + + virtual size_t CountImpl(const HashVector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + if ( ! bits_[h[i] % h.size()] ) + return 0; + return 1; + } + +private: + BitVector bits_; +}; + +/** + * A counting Bloom filter. 
+ */ +class CountingBloomFilter : public BloomFilter { +public: + CountingBloomFilter(unsigned width); + CountingBloomFilter(HashPolicy* hash); + +protected: + CountingBloomFilter(); + +private: + CounterVector cells_; +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33aaab29c1..11de7772d7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -304,6 +304,7 @@ set(bro_SRCS BitTorrent.cc BitTorrentTracker.cc BitVector.cc + BloomFilter.cc BPF_Program.cc BroDoc.cc BroDocObj.cc From f529df33e0afa930e4babff66f4a5f590b5eb6d9 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 3 Jun 2013 14:00:28 -0700 Subject: [PATCH 04/50] Stabilize Bloom filter interface. --- src/BloomFilter.cc | 33 ++++++++++++++++++ src/BloomFilter.h | 85 +++++++++++++++++----------------------------- 2 files changed, 65 insertions(+), 53 deletions(-) create mode 100644 src/BloomFilter.cc diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc new file mode 100644 index 0000000000..6873815f69 --- /dev/null +++ b/src/BloomFilter.cc @@ -0,0 +1,33 @@ +#include "BloomFilter.h" + +HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const + { + HashVector h(k(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const + { + HashType h1 = hasher1_(x); + HashType h2 = hasher2_(x); + HashVector h(k(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + +void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits_.set(h[i] % h.size()); + } + +size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + if ( ! 
bits_[h[i] % h.size()] ) + return 0; + return 1; + } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index a767c6b8b8..dca4eff2bd 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -11,6 +11,9 @@ */ class CounterVector : SerialObj { public: + typedef size_t size_type; + typedef uint64 count_type; + /** * Constructs a counter vector having cells of a given width. * @@ -70,21 +73,24 @@ private: }; /** - * The abstract base class for hash policies. + * The abstract base class for hash policies that hash elements *k* times. * @tparam Codomain An integral type. */ class HashPolicy { public: - typedef hash_t hash_type; + typedef hash_t HashType; + typedef std::vector HashVector; + virtual ~HashPolicy() { } - size_t k() const { return k; } - virtual std::vector Hash(const void* x, size_t n) const = 0; + size_t k() const { return k_; } + virtual HashVector Hash(const void* x, size_t n) const = 0; + protected: /** * A functor that computes a universal hash function. * @tparam Codomain An integral type. 
*/ - template + template class Hasher { public: template @@ -104,8 +110,9 @@ protected: }; HashPolicy(size_t k) : k_(k) { } + private: - size_t k_; + const size_t k_; }; /** @@ -114,18 +121,12 @@ private: class DefaultHashing : public HashPolicy { public: DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - virtual ~DoubleHashing() { } + virtual ~DefaultHashing() { } - virtual std::vector Hash(const void* x, size_t n) const - { - std::vector h(k(), 0); - for (size_t i = 0; i < h.size(); ++i) - h[i] = hashers_[i](x, n); - return h; - } + virtual HashVector Hash(const void* x, size_t n) const; private: - std::vector< Hasher > hashers_; + std::vector< Hasher > hashers_; }; /** @@ -133,22 +134,14 @@ private: */ class DoubleHashing : public HashPolicy { public: - DoubleHashing(size_t k) : HashPolicy(k), hashers_(k) { } + DoubleHashing(size_t k) : HashPolicy(k) { } virtual ~DoubleHashing() { } - virtual std::vector Hash(const void* x, size_t n) const - { - Codomain h1 = hasher1_(x); - Codomain h2 = hasher2_(x); - std::vector h(k(), 0); - for (size_t i = 0; i < h.size(); ++i) - h[i] = h1 + i * h2; - return h; - } + virtual HashVector Hash(const void* x, size_t n) const; private: - Hasher hasher1_; - Hasher hasher2_; + Hasher hasher1_; + Hasher hasher2_; }; /** @@ -166,7 +159,7 @@ public: void Add(const T& x) { ++elements_; - AddImpl(hash_->Hash(x)); + AddImpl(hash_->Hash(&x, sizeof(x))); } /** @@ -179,7 +172,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(x)); + return CountImpl(hash_->Hash(&x, sizeof(x))); } /** @@ -193,8 +186,6 @@ public: } protected: - typedef std::vector HashVector; - /** * Default-constructs a Bloom filter. 
*/ @@ -206,17 +197,12 @@ protected: */ BloomFilter(HashPolicy* hash); - virtual void AddImpl(const HashVector& hashes) = 0; + virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashVector& hashes) const = 0; - - std::vector Hash(const T& x) const - { - return hash_->Hash(&x, sizeof(x)); - } + virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; private: - HashPolicy* hash_; // Owned by *this. + HashPolicy* hash_; // Owned by *this. size_t elements_; }; @@ -230,19 +216,9 @@ public: BasicBloomFilter(HashPolicy* hash); protected: - virtual void AddImpl(const HashVector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); - } + virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashVector& h) const - { - for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) - return 0; - return 1; - } + virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: BitVector bits_; @@ -253,12 +229,15 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width); - CountingBloomFilter(HashPolicy* hash); + CountingBloomFilter(unsigned width, HashPolicy* hash); protected: CountingBloomFilter(); + virtual void AddImpl(const HashPolicy::HashVector& h); + + virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + private: CounterVector cells_; }; From f708cd4a361ba02083380cfe0db2949e3e06cff7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 3 Jun 2013 22:55:21 -0700 Subject: [PATCH 05/50] Work on parameter estimation and serialization. 
--- src/BloomFilter.cc | 131 ++++++++++++++++++++++++++++++++++++++++++++- src/BloomFilter.h | 41 +++++++------- src/NetVar.cc | 2 + src/OpaqueVal.cc | 23 ++++++++ src/OpaqueVal.h | 16 ++++++ src/SerialTypes.h | 7 +++ 6 files changed, 198 insertions(+), 22 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6873815f69..4787bef0f0 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,23 +1,130 @@ #include "BloomFilter.h" +#include +#include "Serializer.h" + +// Backport C++11's std::round(). +namespace { +template +T round(double x) { return (x > 0.0) ? (x + 0.5) : (x - 0.5); } +} // namespace + + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! SERIALIZE(&bits_) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + return false; + // TODO: Ask Robin how to unserialize non-pointer members. + //if ( ! UNSERIALIZE(&bits_) ) + // return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + + HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const { - HashVector h(k(), 0); + HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = hashers_[i](x, n); return h; } + HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const { HashType h1 = hasher1_(x); HashType h2 = hasher2_(x); - HashVector h(k(), 0); + HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = h1 + i * h2; return h; } +bool BloomFilter::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_BLOOMFILTER)); + } + +// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? 
+//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) + +bool BloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); + // TODO: Make the hash policy serializable. + //if ( ! SERIALIZE(hash_) ) + // return false; + return SERIALIZE(static_cast(elements_)); + } + +bool BloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + // TODO: Make the hash policy serializable. + //if ( ! hash_ = HashPolicy::Unserialize(info) ) + // return false; + uint64 elements; + if ( UNSERIALIZE(&elements) ) + return false; + elements_ = static_cast(elements); + return true; + } + +size_t BasicBloomFilter::Cells(double fp, size_t capacity) + { + double ln2 = std::log(2); + return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); + } + +size_t BasicBloomFilter::K(size_t cells, size_t capacity) + { + double frac = static_cast(cells) / static_cast(capacity); + return round(frac * std::log(2)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) + : BloomFilter(hash), bits_(cells) + { + } + +IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) + +bool BasicBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + // TODO: Make the hash policy serializable. + //if ( ! SERIALIZE(&bits_) ) + // return false; + return true; + } + +bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + // TODO: Non-pointer member deserialization? 
+ return true; + } + void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) @@ -31,3 +138,23 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 0; return 1; } + + +void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + cells_.Increment(h[i] % h.size(), 1); + } + +size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const + { + CounterVector::size_type min = + std::numeric_limits::max(); + for ( size_t i = 0; i < h.size(); ++i ) + { + CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + if ( cnt < min ) + min = cnt; + } + return min; + } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index dca4eff2bd..82948f30ec 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -65,7 +65,7 @@ public: protected: DECLARE_SERIAL(CounterVector); - CounterVector(); + CounterVector() { } private: BitVector bits_; @@ -82,7 +82,7 @@ public: typedef std::vector HashVector; virtual ~HashPolicy() { } - size_t k() const { return k_; } + size_t K() const { return k_; } virtual HashVector Hash(const void* x, size_t n) const = 0; protected: @@ -130,7 +130,7 @@ private: }; /** - * The *double-hashing* policy. Uses a linear combination of 2 hash functions. + * The *double-hashing* policy. Uses a linear combination of two hash functions. */ class DoubleHashing : public HashPolicy { public: @@ -185,25 +185,20 @@ public: return elements_; } -protected: - /** - * Default-constructs a Bloom filter. - */ - BloomFilter(); + bool Serialize(SerialInfo* info) const; + static BloomFilter* Unserialize(UnserialInfo* info); - /** - * Constructs a BloomFilter. - * @param hash The hashing policy. 
- */ - BloomFilter(HashPolicy* hash); +protected: + DECLARE_SERIAL(BloomFilter); + + BloomFilter() { }; + BloomFilter(HashPolicy* hash) : hash_(hash) { } virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; private: - HashPolicy* hash_; // Owned by *this. - + HashPolicy* hash_; size_t elements_; }; @@ -212,12 +207,17 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - BasicBloomFilter(); - BasicBloomFilter(HashPolicy* hash); + static size_t Cells(double fp, size_t capacity); + static size_t K(size_t cells, size_t capacity); + + BasicBloomFilter(size_t cells, HashPolicy* hash); protected: - virtual void AddImpl(const HashPolicy::HashVector& h); + DECLARE_SERIAL(BasicBloomFilter); + BasicBloomFilter() { } + + virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: @@ -232,10 +232,11 @@ public: CountingBloomFilter(unsigned width, HashPolicy* hash); protected: + DECLARE_SERIAL(CountingBloomFilter); + CountingBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: diff --git a/src/NetVar.cc b/src/NetVar.cc index 3a23e4c9fa..d8c2192af7 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -244,6 +244,7 @@ OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* bloomfilter_type; #include "const.bif.netvar_def" #include "types.bif.netvar_def" @@ -310,6 +311,7 @@ void init_general_global_var() sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + bloomfilter_type = new OpaqueType("bloomfilter"); } void init_net_var() diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19346e52f2..a5fb65f53b 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,4 +1,6 @@ #include "OpaqueVal.h" + +#include 
"BloomFilter.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -515,3 +517,24 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } + +BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) + { + } + +IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); + +bool BloomFilterVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + // TODO: implement. + return true; + } + +bool BloomFilterVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + // TODO: implement. + return true; + } + diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 78fa5da5e9..1c9c0361cc 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -7,6 +7,8 @@ #include "Val.h" #include "digest.h" +class BloomFilter; + class HashVal : public OpaqueVal { public: virtual bool IsValid() const; @@ -107,4 +109,18 @@ private: RandTest state; }; +class BloomFilterVal : public OpaqueVal { +public: + BloomFilterVal(); + +protected: + friend class Val; + BloomFilterVal(OpaqueType* t); + + DECLARE_SERIAL(BloomFilterVal); + +private: + BloomFilter* bloom_filter_; +}; + #endif diff --git a/src/SerialTypes.h b/src/SerialTypes.h index c9c0c34a33..171113ab6a 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -50,6 +50,9 @@ SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) +SERIAL_IS(COUNTERVECTOR, 0xa000) +SERIAL_IS(BLOOMFILTER, 0xa100) +SERIAL_IS(BASICBLOOMFILTER, 0xa200) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -105,6 +108,7 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) +SERIAL_VAL(BLOOMFILTER_VAL, 20) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) @@ -204,5 +208,8 @@ SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(BITVECTOR) +SERIAL_CONST2(COUNTERVECTOR) +SERIAL_CONST2(BLOOMFILTER) +SERIAL_CONST2(BASICBLOOMFILTER) #endif From d3297dd6f3b6a50c07c90e9ad5f61c0ddf762460 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 13:32:26 -0700 Subject: [PATCH 06/50] Adhere to Bro coding style. --- src/BitVector.cc | 100 +++++++++++++++++++++++------------------------ src/BitVector.h | 40 +++++++++---------- 2 files changed, 69 insertions(+), 71 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index f57301d506..f029230609 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -33,7 +33,7 @@ BitVector::Reference::Reference(block_type& block, block_type i) assert(i < bits_per_block); } -BitVector::Reference& BitVector::Reference::flip() +BitVector::Reference& BitVector::Reference::Flip() { block_ ^= mask_; return *this; @@ -105,7 +105,7 @@ BitVector::BitVector(BitVector const& other) BitVector BitVector::operator~() const { BitVector b(*this); - b.flip(); + b.Flip(); return b; } @@ -130,15 +130,15 @@ BitVector BitVector::operator>>(size_type n) const BitVector& BitVector::operator<<=(size_type n) { if (n >= num_bits_) - return reset(); + return Reset(); if (n > 0) { - size_type last = blocks() - 1; + size_type last = Blocks() - 1; size_type div = n / bits_per_block; block_type r = bit_index(n); block_type* b = &bits_[0]; - assert(blocks() >= 1); + assert(Blocks() >= 1); assert(div <= last); if (r != 0) @@ -164,15 +164,15 @@ BitVector& BitVector::operator<<=(size_type n) BitVector& BitVector::operator>>=(size_type n) { if (n >= num_bits_) - return reset(); + return Reset(); if (n > 0) { - 
size_type last = blocks() - 1; + size_type last = Blocks() - 1; size_type div = n / bits_per_block; block_type r = bit_index(n); block_type* b = &bits_[0]; - assert(blocks() >= 1); + assert(Blocks() >= 1); assert(div <= last); if (r != 0) @@ -187,39 +187,39 @@ BitVector& BitVector::operator>>=(size_type n) b[i-div] = b[i]; } - std::fill_n(b + (blocks() - div), div, block_type(0)); + std::fill_n(b + (Blocks() - div), div, block_type(0)); } return *this; } BitVector& BitVector::operator&=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] &= other.bits_[i]; return *this; } BitVector& BitVector::operator|=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] |= other.bits_[i]; return *this; } BitVector& BitVector::operator^=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] ^= other.bits_[i]; return *this; } BitVector& BitVector::operator-=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] &= ~other.bits_[i]; return *this; } @@ -260,8 +260,8 @@ bool operator!=(BitVector const& x, BitVector const& y) bool operator<(BitVector const& x, BitVector const& y) { - assert(x.size() == y.size()); - for (BitVector::size_type r = x.blocks(); r > 0; --r) + assert(x.Size() == y.Size()); + for (BitVector::size_type r = x.Blocks(); r > 0; --r) { BitVector::size_type i = r - 1; if (x.bits_[i] < y.bits_[i]) @@ -272,9 +272,9 @@ bool operator<(BitVector const& x, BitVector const& y) return false; } -void BitVector::resize(size_type n, bool value) 
+void BitVector::Resize(size_type n, bool value) { - size_type old = blocks(); + size_type old = Blocks(); size_type required = bits_to_blocks(n); block_type block_value = value ? ~block_type(0) : block_type(0); @@ -288,27 +288,27 @@ void BitVector::resize(size_type n, bool value) zero_unused_bits(); } -void BitVector::clear() +void BitVector::Clear() { bits_.clear(); num_bits_ = 0; } -void BitVector::push_back(bool bit) +void BitVector::PushBack(bool bit) { - size_type s = size(); - resize(s + 1); - set(s, bit); + size_type s = Size(); + Resize(s + 1); + Set(s, bit); } -void BitVector::append(block_type block) +void BitVector::Append(block_type block) { size_type excess = extra_bits(); if (excess) { - assert(! bits_.empty()); + assert(! Empty()); bits_.push_back(block >> (bits_per_block - excess)); - bits_[bits_.size() - 2] |= (block << excess); + bits_[Blocks() - 2] |= (block << excess); } else { @@ -317,48 +317,46 @@ void BitVector::append(block_type block) num_bits_ += bits_per_block; } -BitVector& BitVector::set(size_type i, bool bit) +BitVector& BitVector::Set(size_type i, bool bit) { assert(i < num_bits_); - if (bit) - bits_[block_index(i)] |= bit_mask(i); + bits_[block_index(i)] |= bit_mask(i); else - reset(i); - + Reset(i); return *this; } -BitVector& BitVector::set() +BitVector& BitVector::Set() { std::fill(bits_.begin(), bits_.end(), ~block_type(0)); zero_unused_bits(); return *this; } -BitVector& BitVector::reset(size_type i) +BitVector& BitVector::Reset(size_type i) { assert(i < num_bits_); bits_[block_index(i)] &= ~bit_mask(i); return *this; } -BitVector& BitVector::reset() +BitVector& BitVector::Reset() { std::fill(bits_.begin(), bits_.end(), block_type(0)); return *this; } -BitVector& BitVector::flip(size_type i) +BitVector& BitVector::Flip(size_type i) { assert(i < num_bits_); bits_[block_index(i)] ^= bit_mask(i); return *this; } -BitVector& BitVector::flip() +BitVector& BitVector::Flip() { - for (size_type i = 0; i < blocks(); ++i) + for 
(size_type i = 0; i < Blocks(); ++i) bits_[i] = ~bits_[i]; zero_unused_bits(); return *this; @@ -376,11 +374,11 @@ BitVector::Reference BitVector::operator[](size_type i) return Reference(bits_[block_index(i)], bit_index(i)); } -BitVector::size_type BitVector::count() const +BitVector::size_type BitVector::Count() const { std::vector::const_iterator first = bits_.begin(); size_t n = 0; - size_type length = blocks(); + size_type length = Blocks(); while (length) { block_type block = *first; @@ -396,29 +394,29 @@ BitVector::size_type BitVector::count() const return n; } -BitVector::size_type BitVector::blocks() const +BitVector::size_type BitVector::Blocks() const { return bits_.size(); } -BitVector::size_type BitVector::size() const +BitVector::size_type BitVector::Size() const { return num_bits_; } -bool BitVector::empty() const +bool BitVector::Empty() const { return bits_.empty(); } -BitVector::size_type BitVector::find_first() const +BitVector::size_type BitVector::FindFirst() const { return find_from(0); } -BitVector::size_type BitVector::find_next(size_type i) const +BitVector::size_type BitVector::FindNext(size_type i) const { - if (i >= (size() - 1) || size() == 0) + if (i >= (Size() - 1) || Size() == 0) return npos; ++i; size_type bi = block_index(i); @@ -437,7 +435,7 @@ BitVector::size_type BitVector::lowest_bit(block_type block) BitVector::block_type BitVector::extra_bits() const { - return bit_index(size()); + return bit_index(Size()); } void BitVector::zero_unused_bits() @@ -448,9 +446,9 @@ void BitVector::zero_unused_bits() BitVector::size_type BitVector::find_from(size_type i) const { - while (i < blocks() && bits_[i] == 0) + while (i < Blocks() && bits_[i] == 0) ++i; - if (i >= blocks()) + if (i >= Blocks()) return npos; return i * bits_per_block + lowest_bit(bits_[i]); } diff --git a/src/BitVector.h b/src/BitVector.h index 9900dd103e..8315a151f0 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -24,7 +24,7 @@ public: Reference(block_type& block, 
block_type i); public: - Reference& flip(); + Reference& Flip(); operator bool() const; bool operator~() const; Reference& operator=(bool x); @@ -110,7 +110,7 @@ public: * sequence. */ template - void append(ForwardIterator first, ForwardIterator last) + void Append(ForwardIterator first, ForwardIterator last) { if (first == last) return; @@ -119,7 +119,7 @@ public: typename std::iterator_traits::difference_type delta = std::distance(first, last); - bits_.reserve(blocks() + delta); + bits_.reserve(Blocks() + delta); if (excess == 0) { bits_.back() |= (*first << excess); @@ -140,24 +140,24 @@ public: * Appends the bits in a given block. * @param block The block containing bits to append. */ - void append(block_type block); + void Append(block_type block); /** Appends a single bit to the end of the bit vector. * @param bit The value of the bit. */ - void push_back(bool bit); + void PushBack(bool bit); /** * Clears all bits in the bitvector. */ - void clear(); + void Clear(); /** * Resizes the bit vector to a new number of bits. * @param n The new number of bits of the bit vector. * @param value The bit value of new values, if the vector expands. */ - void resize(size_type n, bool value = false); + void Resize(size_type n, bool value = false); /** * Sets a bit at a specific position to a given value. @@ -165,39 +165,39 @@ public: * @param bit The value assigned to position *i*. * @return A reference to the bit vector instance. */ - BitVector& set(size_type i, bool bit = true); + BitVector& Set(size_type i, bool bit = true); /** * Sets all bits to 1. * @return A reference to the bit vector instance. */ - BitVector& set(); + BitVector& Set(); /** * Resets a bit at a specific position, i.e., sets it to 0. * @param i The bit position. * @return A reference to the bit vector instance. */ - BitVector& reset(size_type i); + BitVector& Reset(size_type i); /** * Sets all bits to 0. * @return A reference to the bit vector instance. 
*/ - BitVector& reset(); + BitVector& Reset(); /** * Toggles/flips a bit at a specific position. * @param i The bit position. * @return A reference to the bit vector instance. */ - BitVector& flip(size_type i); + BitVector& Flip(size_type i); /** * Computes the complement. * @return A reference to the bit vector instance. */ - BitVector& flip(); + BitVector& Flip(); /** Retrieves a single bit. * @param i The bit position. @@ -217,32 +217,32 @@ public: * count* or *Hamming weight*. * @return The number of bits set to 1. */ - size_type count() const; + size_type Count() const; /** * Retrieves the number of blocks of the underlying storage. - * @param The number of blocks that represent `size()` bits. + * @param The number of blocks that represent `Size()` bits. */ - size_type blocks() const; + size_type Blocks() const; /** * Retrieves the number of bits the bitvector consist of. * @return The length of the bit vector in bits. */ - size_type size() const; + size_type Size() const; /** * Checks whether the bit vector is empty. * @return `true` iff the bitvector has zero length. */ - bool empty() const; + bool Empty() const; /** * Finds the bit position of of the first 1-bit. * @return The position of the first bit that equals to one or `npos` if no * such bit exists. */ - size_type find_first() const; + size_type FindFirst() const; /** * Finds the next 1-bit from a given starting position. @@ -252,7 +252,7 @@ public: * @return The position of the first bit that equals to 1 after position * *i* or `npos` if no such bit exists. */ - size_type find_next(size_type i) const; + size_type FindNext(size_type i) const; bool Serialize(SerialInfo* info) const; static BitVector* Unserialize(UnserialInfo* info); From a5572dd66f10ca653855483e0941da327b8422e4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 14:31:39 -0700 Subject: [PATCH 07/50] Write CounterVector implementation scaffold. 
--- src/BloomFilter.cc | 36 ++++++++++++++++++++++++++++++++++++ src/BloomFilter.h | 10 +++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 4787bef0f0..78048ee588 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -10,6 +10,42 @@ T round(double x) { return (x > 0.0) ? (x + 0.5) : (x - 0.5); } } // namespace +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 82948f30ec..b4f82efee9 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -9,7 +9,7 @@ /** * A vector of counters, each of which have a fixed number of bits. */ -class CounterVector : SerialObj { +class CounterVector : public SerialObj { public: typedef size_t size_type; typedef uint64 count_type; @@ -18,8 +18,12 @@ public: * Constructs a counter vector having cells of a given width. * * @param width The number of bits that each cell occupies. + * + * @param cells The number of cells in the bitvector. */ - explicit CounterVector(unsigned width); + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); /** * Increments a given cell. 
@@ -68,7 +72,7 @@ protected: CounterVector() { } private: - BitVector bits_; + BitVector* bits_; unsigned width_; }; From 751cf612931f021ddf7b5ee51019f20d05e0c309 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 15:30:27 -0700 Subject: [PATCH 08/50] Add more serialization implementation. --- src/BloomFilter.cc | 93 ++++++++++++++++++++++++++++++++-------------- src/BloomFilter.h | 56 +++++++++++++++++++++++----- src/NetVar.h | 1 + src/OpaqueVal.cc | 18 ++++++--- src/OpaqueVal.h | 1 + src/SerialTypes.h | 2 + 6 files changed, 129 insertions(+), 42 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 78048ee588..64f0e1c67b 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -46,12 +46,23 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(&bits_) ) + if ( ! SERIALIZE(bits_) ) return false; return SERIALIZE(static_cast(width_)); } @@ -60,9 +71,9 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); return false; - // TODO: Ask Robin how to unserialize non-pointer members. - //if ( ! UNSERIALIZE(&bits_) ) - // return false; + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; uint64 width; if ( ! 
UNSERIALIZE(&width) ) return false; @@ -90,6 +101,18 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const return h; } + +BloomFilter::BloomFilter(size_t k) + : hash_(new hash_policy(k)) + { + } + +BloomFilter::~BloomFilter() + { + if ( hash_ ) + delete hash_; + } + bool BloomFilter::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -101,24 +124,21 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) SerialObj::Unserialize(info, SER_BLOOMFILTER)); } -// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? -//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) - bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(hash_) ) - // return false; - return SERIALIZE(static_cast(elements_)); + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE(static_cast(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! hash_ = HashPolicy::Unserialize(info) ) - // return false; + uint16 k; + if ( ! 
UNSERIALIZE(&k) ) + return false; + hash_ = new hash_policy(static_cast(k)); uint64 elements; if ( UNSERIALIZE(&elements) ) return false; @@ -126,7 +146,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return true; } -size_t BasicBloomFilter::Cells(double fp, size_t capacity) +size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); @@ -138,9 +158,16 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return round(frac * std::log(2)); } -BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) - : BloomFilter(hash), bits_(cells) +BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) + : BloomFilter(K(M(fp, capacity), capacity)) { + bits_ = new BitVector(M(fp, capacity)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) + : BloomFilter(K(cells, capacity)) + { + bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -148,38 +175,50 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(&bits_) ) - // return false; - return true; + return SERIALIZE(bits_); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - // TODO: Non-pointer member deserialization? - return true; + bits_ = BitVector::Unserialize(info); + return bits_ == NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); + bits_->Set(h[i] % h.size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) + if ( ! 
(*bits_)[h[i] % h.size()] ) return 0; return 1; } +IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) + +bool CountingBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + return SERIALIZE(cells_); + } + +bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + cells_ = CounterVector::Unserialize(info); + return cells_ == NULL; + } + void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_.Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % h.size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -188,7 +227,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); if ( cnt < min ) min = cnt; } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index b4f82efee9..77c6bc4f56 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -151,9 +151,13 @@ private: /** * The abstract base class for Bloom filters. */ -class BloomFilter : SerialObj { +class BloomFilter : public SerialObj { public: - virtual ~BloomFilter() { delete hash_; } + // At this point we won't let the user choose the hash policy, but we might + // open up the interface in the future. + typedef DoubleHashing hash_policy; + + virtual ~BloomFilter(); /** * Adds an element of type T to the Bloom filter. 
@@ -193,10 +197,10 @@ public: static BloomFilter* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(BloomFilter); + DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter() { }; - BloomFilter(HashPolicy* hash) : hash_(hash) { } + BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; @@ -211,10 +215,42 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - static size_t Cells(double fp, size_t capacity); + /** + * Computes the number of cells based a given false-positive rate and + * capacity. In the literature, this parameter often has the name *M*. + * + * @param fp The false-positive rate. + * + * @param capacity The number of exepected elements. + * + * Returns: The number cells needed to support a false-positive rate of *fp* + * with at most *capacity* elements. + */ + static size_t M(double fp, size_t capacity); + + /** + * Computes the optimal number of hash functions based on the number cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * Returns: the optimal number of hash functions for a false-positive rate of + * *fp* for at most *capacity* elements. + */ static size_t K(size_t cells, size_t capacity); - BasicBloomFilter(size_t cells, HashPolicy* hash); + /** + * Constructs a basic Bloom filter with a given false-positive rate and + * capacity. + */ + BasicBloomFilter(double fp, size_t capacity); + + /** + * Constructs a basic Bloom filter with a given number of cells and capacity. 
+ */ + BasicBloomFilter(size_t cells, size_t capacity); protected: DECLARE_SERIAL(BasicBloomFilter); @@ -225,7 +261,7 @@ protected: virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - BitVector bits_; + BitVector* bits_; }; /** @@ -233,18 +269,18 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width, HashPolicy* hash); + CountingBloomFilter(unsigned width); protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter(); + CountingBloomFilter() { } virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - CounterVector cells_; + CounterVector* cells_; }; #endif diff --git a/src/NetVar.h b/src/NetVar.h index 1a20adcaf2..aa2a14ada5 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -249,6 +249,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* bloomfilter_type; // Initializes globals that don't pertain to network/event analysis. extern void init_general_global_var(); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index a5fb65f53b..b4f1290436 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,23 +518,31 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } +BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type) + { + } + BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) { } +BloomFilterVal::~BloomFilterVal() + { + if ( bloom_filter_ ) + delete bloom_filter_; + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - // TODO: implement. - return true; + return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - // TODO: implement. 
- return true; + bloom_filter_ = BloomFilter::Unserialize(info); + return bloom_filter_ == NULL; } - diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 1c9c0361cc..68b42a8a49 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -112,6 +112,7 @@ private: class BloomFilterVal : public OpaqueVal { public: BloomFilterVal(); + ~BloomFilterVal(); protected: friend class Val; diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 171113ab6a..859145f19f 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -53,6 +53,7 @@ SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0xa000) SERIAL_IS(BLOOMFILTER, 0xa100) SERIAL_IS(BASICBLOOMFILTER, 0xa200) +SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -211,5 +212,6 @@ SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) SERIAL_CONST2(BLOOMFILTER) SERIAL_CONST2(BASICBLOOMFILTER) +SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif From 880d02f7204d21fc0e69f08ac78e963042df4f16 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 16:16:55 -0700 Subject: [PATCH 09/50] Associate a Comphash with a BloomFilterVal. We also keep track of the Bloom filter's element type inside each value. The first use of the BiF bloomfilter_add will "typify" the Bloom filter and lock the Bloom filter's type to the element type. 
--- src/BloomFilter.cc | 15 ++++++++++++ src/BloomFilter.h | 3 ++- src/OpaqueVal.cc | 60 ++++++++++++++++++++++++++++++++++++++++++++-- src/OpaqueVal.h | 18 ++++++++++++-- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 64f0e1c67b..74fa6fb255 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -199,6 +199,21 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } +CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, + size_t width) + : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), + capacity)) + { + cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); + } + +CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, + size_t width) + : BloomFilter(BasicBloomFilter::K(cells, capacity)) + { + cells_ = new CounterVector(width, cells); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 77c6bc4f56..14b0ac3281 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -269,7 +269,8 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width); + CountingBloomFilter(double fp, size_t capacity, size_t width); + CountingBloomFilter(size_t cells, size_t capacity, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index b4f1290436..abfd8f320f 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,31 +518,87 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } -BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type) +BloomFilterVal::BloomFilterVal(BloomFilter* bf) + : OpaqueVal(bloomfilter_type), bloom_filter_(bf) { } -BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) +BloomFilterVal::BloomFilterVal(OpaqueType* t) + : OpaqueVal(t) { } +bool BloomFilterVal::Typify(BroType* type) + { + if ( 
type_ ) + return false; + type_ = type; + TypeList* tl = new TypeList(type_); + tl->Append(type_); + hash_ = new CompositeHash(tl); + Unref(tl); + return true; + } + +BroType* BloomFilterVal::Type() const + { + return type_; + } + +void BloomFilterVal::Add(const Val* val) + { + HashKey* key = hash_->ComputeHash(val, 1); + bloom_filter_->Add(key->Hash()); + } + +size_t BloomFilterVal::Count(const Val* val) const + { + HashKey* key = hash_->ComputeHash(val, 1); + return bloom_filter_->Count(key->Hash()); + } + +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, + const BloomFilterVal* second) +{ + assert(! "not yet implemented"); + return NULL; + } + BloomFilterVal::~BloomFilterVal() { + if ( type_ ) + Unref(type_); + if ( hash_ ) + delete hash_; if ( bloom_filter_ ) delete bloom_filter_; } +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type) + { + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + if ( ! SERIALIZE(type_) ) + return false; return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); + type_ = BroType::Unserialize(info); + if ( ! 
type_ ) + return false; + TypeList* tl = new TypeList(type_); + tl->Append(type_); + hash_ = new CompositeHash(tl); + Unref(tl); bloom_filter_ = BloomFilter::Unserialize(info); return bloom_filter_ == NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 68b42a8a49..e97a530f3a 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -110,18 +110,32 @@ private: }; class BloomFilterVal : public OpaqueVal { + BloomFilterVal(const BloomFilterVal&); + BloomFilterVal& operator=(const BloomFilterVal&); public: - BloomFilterVal(); + static BloomFilterVal* Merge(const BloomFilterVal* first, + const BloomFilterVal* second); + + BloomFilterVal(BloomFilter* bf); ~BloomFilterVal(); + bool Typify(BroType* type); + BroType* Type() const; + + void Add(const Val* val); + size_t Count(const Val* val) const; + protected: friend class Val; + BloomFilterVal(); BloomFilterVal(OpaqueType* t); DECLARE_SERIAL(BloomFilterVal); private: - BloomFilter* bloom_filter_; + BroType* type_; + CompositeHash* hash_; + BloomFilter* bloom_filter_; }; #endif From 3d9764213191070a6b68375c0d0ae8c3193528e3 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 16:26:16 -0700 Subject: [PATCH 10/50] Add Bloom filter BiFs. --- src/bro.bif | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index d9558106a7..60fb985dda 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5730,3 +5730,92 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr } %} +# =========================================================================== +# +# Bloom Filter Functions +# +# =========================================================================== + +%%{ +#include "BloomFilter.h" +%%} + +## Initializes a Bloom filter data structure. +## +## fp: The desired false-positive rate. +## +## capacity: the maximum number of elements that guarantees a false-positive +## rate of *fp*. +## +## Returns: A Bloom filter handle. 
+function bloomfilter_init%(fp: double, capacity: count, + max: count &default=1%): opaque of bloomfilter + %{ + BloomFilter* bf; + if ( max == 1 ) + { + bf = new BasicBloomFilter(fp, capacity); + } + else + { + uint16 width = 0; + while ( max >>= 1 ) + ++width; + bf = new CountingBloomFilter(fp, capacity, width); + } + return new BloomFilterVal(bf); + %} + +## Adds an element to a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to add. +function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + if ( ! bfv->Type() || ! bfv->Typify(x->Type()) ) + reporter->Error("failed to set Bloom filter type"); + else if ( bfv->Type() != x->Type() ) + reporter->Error("incompatible Bloom filter types"); + bfv->Add(x); + return 0; + %} + +## Retrieves the counter for a given element in a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to count. +## +## Returns: the counter associated with *x* in *bf*. +function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count + %{ + BloomFilterVal* bfv = static_cast(bf); + if ( ! bfv->Type() ) + reporter->Error("cannot perform lookup on untyped Bloom filter"); + else if ( bfv->Type() != x->Type() ) + reporter->Error("incompatible Bloom filter types"); + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + %} + +## Merges two Bloom filters. +## +## bf1: The first Bloom filter handle. +## +## bf2: The second Bloom filter handle. +## +## Returns: The union of *bf1* and *bf2*. +function bloomfilter_merge%(bf1: opaque of bloomfilter, + bf2: opaque of bloomfilter%): opaque of bloomfilter + %{ + const BloomFilterVal* bfv1 = static_cast(bf1); + const BloomFilterVal* bfv2 = static_cast(bf2); + if ( ! bfv1->Type() ) + reporter->Error("The first Bloom filter has not yet been typed"); + if ( ! 
bfv2->Type() ) + reporter->Error("The second Bloom filter has not yet been typed"); + else if ( bfv1->Type() != bfv2->Type() ) + reporter->Error("incompatible Bloom filter types"); + return BloomFilterVal::Merge(bfv1, bfv2); + %} From d5126a13395f899fab12f081248336e687222ed9 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 17:45:10 -0700 Subject: [PATCH 11/50] Fix some BiF issues. --- src/bro.bif | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 60fb985dda..08b532eaea 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5774,12 +5774,18 @@ function bloomfilter_init%(fp: double, capacity: count, function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() || ! bfv->Typify(x->Type()) ) + if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) + { reporter->Error("failed to set Bloom filter type"); + return NULL; + } else if ( bfv->Type() != x->Type() ) + { reporter->Error("incompatible Bloom filter types"); + return NULL; + } bfv->Add(x); - return 0; + return NULL; %} ## Retrieves the counter for a given element in a Bloom filter. @@ -5812,9 +5818,9 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); if ( ! bfv1->Type() ) - reporter->Error("The first Bloom filter has not yet been typed"); + reporter->Error("first Bloom filter has not yet been typed"); if ( ! bfv2->Type() ) - reporter->Error("The second Bloom filter has not yet been typed"); + reporter->Error("second Bloom filter has not yet been typed"); else if ( bfv1->Type() != bfv2->Type() ) reporter->Error("incompatible Bloom filter types"); return BloomFilterVal::Merge(bfv1, bfv2); From 012e09c5c40bdf0acd29a34bf2271417ed36d770 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 12:56:46 -0700 Subject: [PATCH 12/50] Small fixes and simplifications. 
--- src/BloomFilter.cc | 2 +- src/BloomFilter.h | 17 +++++++---------- src/OpaqueVal.cc | 1 + 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 74fa6fb255..e549553bf4 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -140,7 +140,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return false; hash_ = new hash_policy(static_cast(k)); uint64 elements; - if ( UNSERIALIZE(&elements) ) + if ( ! UNSERIALIZE(&elements) ) return false; elements_ = static_cast(elements); return true; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 14b0ac3281..3e2bd5de90 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -94,15 +94,14 @@ protected: * A functor that computes a universal hash function. * @tparam Codomain An integral type. */ - template class Hasher { public: - template - Codomain operator()(const Domain& x) const + template + HashType operator()(const T& x) const { return h3_(&x, sizeof(x)); } - Codomain operator()(const void* x, size_t n) const + HashType operator()(const void* x, size_t n) const { return h3_(x, n); } @@ -110,7 +109,7 @@ protected: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior // so I'll just copy it verbatim. 
(Matthias) - H3 h3_; + H3 h3_; }; HashPolicy(size_t k) : k_(k) { } @@ -125,12 +124,11 @@ private: class DefaultHashing : public HashPolicy { public: DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - virtual ~DefaultHashing() { } virtual HashVector Hash(const void* x, size_t n) const; private: - std::vector< Hasher > hashers_; + std::vector hashers_; }; /** @@ -139,13 +137,12 @@ private: class DoubleHashing : public HashPolicy { public: DoubleHashing(size_t k) : HashPolicy(k) { } - virtual ~DoubleHashing() { } virtual HashVector Hash(const void* x, size_t n) const; private: - Hasher hasher1_; - Hasher hasher2_; + Hasher hasher1_; + Hasher hasher2_; }; /** diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index abfd8f320f..03a6e51ce8 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -533,6 +533,7 @@ bool BloomFilterVal::Typify(BroType* type) if ( type_ ) return false; type_ = type; + type_->Ref(); TypeList* tl = new TypeList(type_); tl->Append(type_); hash_ = new CompositeHash(tl); From f211b856c9ae35e68ea4af194e08157fdefef7e6 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:13:36 -0700 Subject: [PATCH 13/50] Catch invalid values of the false-positive rate. --- src/bro.bif | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 08b532eaea..74219dd2b7 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5751,6 +5751,11 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr function bloomfilter_init%(fp: double, capacity: count, max: count &default=1%): opaque of bloomfilter %{ + if ( fp < 0.0 || fp > 1.0 ) + { + reporter->Error("false-positive rate must take value between 0 and 1"); + return NULL; + } BloomFilter* bf; if ( max == 1 ) { From 7ce986e31f59b1f1000ec335a4efc1f0f5e0c011 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:21:27 -0700 Subject: [PATCH 14/50] Fix modding. 
--- src/BloomFilter.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index e549553bf4..7c347927c3 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -188,13 +188,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_->Set(h[i] % h.size()); + bits_->Set(h[i] % bits_->Size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! (*bits_)[h[i] % h.size()] ) + if ( ! (*bits_)[h[i] % bits_->Size()] ) return 0; return 1; } @@ -233,7 +233,7 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % cells_->Size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -242,7 +242,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; } From fcf1807fc8ac320a6c787360e8b78509b58b0a5a Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:39:00 -0700 Subject: [PATCH 15/50] Fix hasher usage and narrow interface. 
--- src/BloomFilter.cc | 4 ++-- src/BloomFilter.h | 10 +--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 7c347927c3..c684c82c0e 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -93,8 +93,8 @@ HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const { - HashType h1 = hasher1_(x); - HashType h2 = hasher2_(x); + HashType h1 = hasher1_(x, n); + HashType h2 = hasher2_(x, n); HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = h1 + i * h2; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 3e2bd5de90..fd1cb31d61 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -96,15 +96,7 @@ protected: */ class Hasher { public: - template - HashType operator()(const T& x) const - { - return h3_(&x, sizeof(x)); - } - HashType operator()(const void* x, size_t n) const - { - return h3_(x, n); - } + HashType operator()(const void* x, size_t n) const { return h3_(x, n); } private: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior From 0d299eca57ddab9dfb17c1f6c99139c481dccb49 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 14:54:25 -0700 Subject: [PATCH 16/50] Correct computation of k hash functions. --- src/BloomFilter.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c684c82c0e..f1db71ae1d 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -3,13 +3,6 @@ #include #include "Serializer.h" -// Backport C++11's std::round(). -namespace { -template -T round(double x) { return (x > 0.0) ? 
(x + 0.5) : (x - 0.5); } -} // namespace - - CounterVector::CounterVector(size_t width, size_t cells) : bits_(new BitVector(width * cells)), width_(width) { @@ -155,7 +148,7 @@ size_t BasicBloomFilter::M(double fp, size_t capacity) size_t BasicBloomFilter::K(size_t cells, size_t capacity) { double frac = static_cast(cells) / static_cast(capacity); - return round(frac * std::log(2)); + return std::ceil(frac * std::log(2)); } BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) From e15f03d980e8bb63d00969268056b2e9592b2f85 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:02:11 -0700 Subject: [PATCH 17/50] Cleanup BiFs. --- src/bro.bif | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 5c1280645e..8bd9575498 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5026,16 +5026,11 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) - { reporter->Error("failed to set Bloom filter type"); - return NULL; - } else if ( bfv->Type() != x->Type() ) - { reporter->Error("incompatible Bloom filter types"); - return NULL; - } - bfv->Add(x); + else + bfv->Add(x); return NULL; %} @@ -5048,12 +5043,14 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## Returns: the counter associated with *x* in *bf*. function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ - BloomFilterVal* bfv = static_cast(bf); + const BloomFilterVal* bfv = static_cast(bf); if ( ! bfv->Type() ) reporter->Error("cannot perform lookup on untyped Bloom filter"); else if ( bfv->Type() != x->Type() ) reporter->Error("incompatible Bloom filter types"); - return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + else + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + return new Val(0, TYPE_COUNT); %} ## Merges two Bloom filters. 
@@ -5068,11 +5065,9 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, %{ const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); - if ( ! bfv1->Type() ) - reporter->Error("first Bloom filter has not yet been typed"); - if ( ! bfv2->Type() ) - reporter->Error("second Bloom filter has not yet been typed"); - else if ( bfv1->Type() != bfv2->Type() ) + if ( bfv1->Type() != bfv2->Type() ) reporter->Error("incompatible Bloom filter types"); - return BloomFilterVal::Merge(bfv1, bfv2); + else + return BloomFilterVal::Merge(bfv1, bfv2); + return NULL; %} From 86becdd6e467fabc475eb81baea6d3586b2d74e7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:08:24 -0700 Subject: [PATCH 18/50] Add tests. --- testing/btest/bifs/bloomfilter.bro | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 testing/btest/bifs/bloomfilter.bro diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro new file mode 100644 index 0000000000..6abbdd69f7 --- /dev/null +++ b/testing/btest/bifs/bloomfilter.bro @@ -0,0 +1,38 @@ +# @TEST-EXEC: bro -b %INPUT >output +# @TEST-EXEC: btest-diff output + +event bro_init() + { + # Basic usage with counts. + local bf_cnt = bloomfilter_init(0.1, 1000); + bloomfilter_add(bf_cnt, 42); + bloomfilter_add(bf_cnt, 84); + bloomfilter_add(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 0); + print bloomfilter_lookup(bf_cnt, 42); + print bloomfilter_lookup(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 336); + bloomfilter_add(bf_cnt, 0.5); # Type mismatch + bloomfilter_add(bf_cnt, "foo"); # Type mismatch + + # Basic usage with strings. 
+ local bf_str = bloomfilter_init(0.9, 10); + bloomfilter_add(bf_str, "foo"); + bloomfilter_add(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "foo"); + print bloomfilter_lookup(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "baz"); + print bloomfilter_lookup(bf_str, "qux"); + bloomfilter_add(bf_str, 0.5); # Type mismatch + bloomfilter_add(bf_str, 100); # Type mismatch + + # Edge cases. + local bf_edge0 = bloomfilter_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_init(0.9999999, 1); + local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + + # Invalid parameters. + local bf_bug0 = bloomfilter_init(-0.5, 42); + local bf_bug1 = bloomfilter_init(1.1, 42); + } From f2d536d2da1118b1d5feb143f751d47dc344232b Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:22:04 -0700 Subject: [PATCH 19/50] Add missing initializations. --- src/BloomFilter.cc | 15 +++++++++++++++ src/BloomFilter.h | 6 +++--- src/OpaqueVal.cc | 25 +++++++++++++++++-------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index f1db71ae1d..40772fecb6 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -95,6 +95,11 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const } +BloomFilter::BloomFilter() + : hash_(NULL) + { + } + BloomFilter::BloomFilter(size_t k) : hash_(new hash_policy(k)) { @@ -151,6 +156,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +BasicBloomFilter::BasicBloomFilter() + : bits_(NULL) + { + } + BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) : BloomFilter(K(M(fp, capacity), capacity)) { @@ -192,6 +202,11 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } +CountingBloomFilter::CountingBloomFilter() + : cells_(NULL) + { + } + CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, 
size_t width) : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), diff --git a/src/BloomFilter.h b/src/BloomFilter.h index fd1cb31d61..c0101cadf8 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -188,7 +188,7 @@ public: protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); - BloomFilter() { }; + BloomFilter(); BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; @@ -244,7 +244,7 @@ public: protected: DECLARE_SERIAL(BasicBloomFilter); - BasicBloomFilter() { } + BasicBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; @@ -264,7 +264,7 @@ public: protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter() { } + CountingBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 03a6e51ce8..38ea93d000 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,13 +518,27 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } -BloomFilterVal::BloomFilterVal(BloomFilter* bf) - : OpaqueVal(bloomfilter_type), bloom_filter_(bf) +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type), + type_(NULL), + hash_(NULL), + bloom_filter_(NULL) { } BloomFilterVal::BloomFilterVal(OpaqueType* t) - : OpaqueVal(t) + : OpaqueVal(t), + type_(NULL), + hash_(NULL), + bloom_filter_(NULL) + { + } + +BloomFilterVal::BloomFilterVal(BloomFilter* bf) + : OpaqueVal(bloomfilter_type), + type_(NULL), + hash_(NULL), + bloom_filter_(bf) { } @@ -575,11 +589,6 @@ BloomFilterVal::~BloomFilterVal() delete bloom_filter_; } -BloomFilterVal::BloomFilterVal() - : OpaqueVal(bloomfilter_type) - { - } - IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const From c6381055380f889c4891efcf83da512597ae64d6 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: 
Mon, 10 Jun 2013 12:51:41 -0700 Subject: [PATCH 20/50] Document max parameter in bloomfilter_init. --- src/bro.bif | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 8bd9575498..9b80c90dbf 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4993,6 +4993,13 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## +## max: The maximum counter value associated with each each element in the +## Bloom filter. If greater than 1, each element in the set has a counter of +## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then +## becomes a cell of size *w* bits. Since the number number of cells is a +## function ## of *fp* and *capacity*, it is important to consider the effects +## on space when tuning this value. +## ## Returns: A Bloom filter handle. function bloomfilter_init%(fp: double, capacity: count, max: count &default=1%): opaque of bloomfilter From d25984ba45643be524788b73d7cebc1278a78810 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 12:55:03 -0700 Subject: [PATCH 21/50] Update baseline for unit tests. 
--- testing/btest/Baseline/bifs.bloomfilter/output | 8 ++++++++ testing/btest/bifs/bloomfilter.bro | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 testing/btest/Baseline/bifs.bloomfilter/output diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output new file mode 100644 index 0000000000..65aaa8b07c --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -0,0 +1,8 @@ +0 +1 +1 +0 +1 +1 +1 +1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 6abbdd69f7..769cec1200 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -21,8 +21,8 @@ event bro_init() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); - print bloomfilter_lookup(bf_str, "qux"); + print bloomfilter_lookup(bf_str, "baz"); # FP + print bloomfilter_lookup(bf_str, "qux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch From 4c21576c120a0dcc9725308549fd57a8bf9072a1 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 20:14:34 -0700 Subject: [PATCH 22/50] Add Bloomfilter serialization test code. 
--- testing/btest/istate/opaque.bro | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index 84818a5e70..ac3b2c0874 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -12,6 +12,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements: set[string] &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_done() { local out = open("output.log"); @@ -36,6 +39,9 @@ event bro_done() print out, entropy_test_finish(entropy_handle); else print out, "entropy_test_add() failed"; + + for ( e in bloomfilter_elements ) + print bloomfilter_lookup(bloomfilter_handle, e); } @TEST-END-FILE @@ -47,6 +53,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements = { "foo", "bar", "baz" } &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_init() { local out = open("expected.log"); @@ -72,6 +81,10 @@ event bro_init() entropy_handle = entropy_test_init(); if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; + + bloomfilter_handle = bloomfilter_init(0.1, 100); + for ( e in bloomfilter_elements ) + bloomfilter_add(bloomfilter_handle, e); } @TEST-END-FILE From 22afbe42dd91e668de8c72417b6a8ff8b544dd99 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 20:15:13 -0700 Subject: [PATCH 23/50] A number of tweaks of the serialization code. 
--- src/BitVector.h | 2 +- src/BloomFilter.cc | 17 ++++++++--------- src/BloomFilter.h | 2 +- src/OpaqueVal.cc | 10 ++++++---- src/SerialTypes.h | 8 ++++---- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/BitVector.h b/src/BitVector.h index 8315a151f0..83fec44a0d 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -8,7 +8,7 @@ /** * A vector of bits. */ -class BitVector : SerialObj { +class BitVector : public SerialObj { public: typedef size_t block_type; typedef size_t size_type; diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 40772fecb6..1d73734236 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -55,7 +55,7 @@ IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(bits_) ) + if ( ! bits_->Serialize(info) ) return false; return SERIALIZE(static_cast(width_)); } @@ -63,14 +63,13 @@ bool CounterVector::DoSerialize(SerialInfo* info) const bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - return false; bits_ = BitVector::Unserialize(info); if ( ! bits_ ) return false; uint64 width; if ( ! UNSERIALIZE(&width) ) return false; - width_ = static_cast(width); + width_ = static_cast(width); return true; } @@ -127,7 +126,7 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); if ( ! 
SERIALIZE(static_cast(hash_->K())) ) return false; - return SERIALIZE(static_cast(elements_)); + return SERIALIZE(static_cast(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -178,14 +177,14 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return SERIALIZE(bits_); + return bits_->Serialize(info); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); bits_ = BitVector::Unserialize(info); - return bits_ == NULL; + return bits_ != NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) @@ -227,15 +226,15 @@ IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const { - DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return SERIALIZE(cells_); + DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); + return cells_->Serialize(info); } bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); cells_ = CounterVector::Unserialize(info); - return cells_ == NULL; + return cells_ != NULL; } void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index c0101cadf8..4a83ba904b 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -73,7 +73,7 @@ protected: private: BitVector* bits_; - unsigned width_; + size_t width_; }; /** diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 38ea93d000..76936dfb78 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -574,7 +574,7 @@ size_t BloomFilterVal::Count(const Val* val) const BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, const BloomFilterVal* second) -{ + { assert(! 
"not yet implemented"); return NULL; } @@ -594,14 +594,15 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - if ( ! SERIALIZE(type_) ) + if ( ! type_->Serialize(info) ) return false; - return SERIALIZE(bloom_filter_); + return bloom_filter_->Serialize(info); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); + type_ = BroType::Unserialize(info); if ( ! type_ ) return false; @@ -609,6 +610,7 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) tl->Append(type_); hash_ = new CompositeHash(tl); Unref(tl); + bloom_filter_ = BloomFilter::Unserialize(info); - return bloom_filter_ == NULL; + return bloom_filter_ != NULL; } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 859145f19f..9e4aef5b3b 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -50,10 +50,10 @@ SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) -SERIAL_IS(COUNTERVECTOR, 0xa000) -SERIAL_IS(BLOOMFILTER, 0xa100) -SERIAL_IS(BASICBLOOMFILTER, 0xa200) -SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) +SERIAL_IS(COUNTERVECTOR, 0x1600) +SERIAL_IS(BLOOMFILTER, 0x1700) +SERIAL_IS(BASICBLOOMFILTER, 0x1800) +SERIAL_IS(COUNTINGBLOOMFILTER, 0x1900) // These are the externally visible types. const SerialType SER_NONE = 0; From 14a701a237dfdd745a842a11f363b93d01926505 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 22:24:23 -0700 Subject: [PATCH 24/50] Implement value merging. The actual BloomFilter merging still lacks, this is just the first step in the right direction from the user interface side. 
--- src/BloomFilter.cc | 27 ++++++++++++++++++++------- src/BloomFilter.h | 18 ++++++------------ src/OpaqueVal.cc | 17 ++++++++++++++--- src/OpaqueVal.h | 17 ++++++++++++++--- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 1d73734236..e55db71e46 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -124,9 +124,7 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) - return false; - return SERIALIZE(static_cast(elements_)); + return SERIALIZE(static_cast(hash_->K())); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -136,10 +134,6 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) if ( ! UNSERIALIZE(&k) ) return false; hash_ = new hash_policy(static_cast(k)); - uint64 elements; - if ( ! UNSERIALIZE(&elements) ) - return false; - elements_ = static_cast(elements); return true; } @@ -155,6 +149,17 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y) + { + BasicBloomFilter* result = new BasicBloomFilter(); + result->bits_ = new BitVector(*x->bits_ | *y->bits_); + // TODO: implement the hasher pool and make sure the new result gets the same + // number of (equal) hash functions. + //assert(x->hash_ == y->hash_); + return result; + } + BasicBloomFilter::BasicBloomFilter() : bits_(NULL) { @@ -201,6 +206,14 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } + +CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y) +{ + assert(! 
"not yet implemented"); + return NULL; +} + CountingBloomFilter::CountingBloomFilter() : cells_(NULL) { diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 4a83ba904b..3b5d9efa71 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -155,7 +155,6 @@ public: template void Add(const T& x) { - ++elements_; AddImpl(hash_->Hash(&x, sizeof(x))); } @@ -172,16 +171,6 @@ public: return CountImpl(hash_->Hash(&x, sizeof(x))); } - /** - * Retrieves the number of elements added to the Bloom filter. - * - * @return The number of elements in this Bloom filter. - */ - size_t Size() const - { - return elements_; - } - bool Serialize(SerialInfo* info) const; static BloomFilter* Unserialize(UnserialInfo* info); @@ -196,7 +185,6 @@ protected: private: HashPolicy* hash_; - size_t elements_; }; /** @@ -230,6 +218,9 @@ public: */ static size_t K(size_t cells, size_t capacity); + static BasicBloomFilter* Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y); + /** * Constructs a basic Bloom filter with a given false-positive rate and * capacity. @@ -258,6 +249,9 @@ private: */ class CountingBloomFilter : public BloomFilter { public: + static CountingBloomFilter* Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y); + CountingBloomFilter(double fp, size_t capacity, size_t width); CountingBloomFilter(size_t cells, size_t capacity, size_t width); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 76936dfb78..9dd5c7f980 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -572,10 +572,21 @@ size_t BloomFilterVal::Count(const Val* val) const return bloom_filter_->Count(key->Hash()); } -BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, - const BloomFilterVal* second) +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, + const BloomFilterVal* y) { - assert(! 
"not yet implemented"); + if ( x->Type() != y->Type() ) + { + reporter->InternalError("cannot merge Bloom filters with different types"); + return NULL; + } + + BloomFilterVal* result; + if ( (result = DoMerge(x, y)) ) + return result; + else if ( (result = DoMerge(x, y)) ) + return result; + return NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index e97a530f3a..4b45cad519 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -113,10 +113,10 @@ class BloomFilterVal : public OpaqueVal { BloomFilterVal(const BloomFilterVal&); BloomFilterVal& operator=(const BloomFilterVal&); public: - static BloomFilterVal* Merge(const BloomFilterVal* first, - const BloomFilterVal* second); + static BloomFilterVal* Merge(const BloomFilterVal* x, + const BloomFilterVal* y); - BloomFilterVal(BloomFilter* bf); + explicit BloomFilterVal(BloomFilter* bf); ~BloomFilterVal(); bool Typify(BroType* type); @@ -133,6 +133,17 @@ protected: DECLARE_SERIAL(BloomFilterVal); private: + template + static BloomFilterVal* DoMerge(const BloomFilterVal* x, + const BloomFilterVal* y) + { + const T* a = dynamic_cast(x->bloom_filter_); + const T* b = dynamic_cast(y->bloom_filter_); + if ( a && b ) + return new BloomFilterVal(T::Merge(a, b)); + return NULL; + } + BroType* type_; CompositeHash* hash_; BloomFilter* bloom_filter_; From 1f90b539a8574eeadd4b20ae9f379b0fe08999be Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:06:01 -0700 Subject: [PATCH 25/50] Make H3 class adhere to Bro coding style. 
--- src/H3.h | 89 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/src/H3.h b/src/H3.h index 72d81d519f..50afda5688 100644 --- a/src/H3.h +++ b/src/H3.h @@ -65,53 +65,52 @@ template class H3 { T byte_lookup[N][H3_BYTE_RANGE]; public: - H3(); - T operator()(const void* data, size_t size, size_t offset = 0) const - { - const unsigned char *p = static_cast(data); - T result = 0; + H3() + { + T bit_lookup[N * CHAR_BIT]; - // loop optmized with Duff's Device - register unsigned n = (size + 7) / 8; - switch (size % 8) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; - case 6: result ^= byte_lookup[offset++][*p++]; - case 5: result ^= byte_lookup[offset++][*p++]; - case 4: result ^= byte_lookup[offset++][*p++]; - case 3: result ^= byte_lookup[offset++][*p++]; - case 2: result ^= byte_lookup[offset++][*p++]; - case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); - } + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) + { + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); + } - return result; - } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0? 
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + + T operator()(const void* data, size_t size, size_t offset = 0) const + { + const unsigned char *p = static_cast(data); + T result = 0; + + // loop optmized with Duff's Device + register unsigned n = (size + 7) / 8; + switch (size % 8) { + case 0: do { result ^= byte_lookup[offset++][*p++]; + case 7: result ^= byte_lookup[offset++][*p++]; + case 6: result ^= byte_lookup[offset++][*p++]; + case 5: result ^= byte_lookup[offset++][*p++]; + case 4: result ^= byte_lookup[offset++][*p++]; + case 3: result ^= byte_lookup[offset++][*p++]; + case 2: result ^= byte_lookup[offset++][*p++]; + case 1: result ^= byte_lookup[offset++][*p++]; + } while (--n > 0); + } + + return result; + } }; -template -H3::H3() -{ - T bit_lookup[N * CHAR_BIT]; - - for (size_t bit = 0; bit < N * CHAR_BIT; bit++) { - bit_lookup[bit] = 0; - for (size_t i = 0; i < sizeof(T)/2; i++) { - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); - } - } - - for (size_t byte = 0; byte < N; byte++) { - for (unsigned val = 0; val < H3_BYTE_RANGE; val++) { - byte_lookup[byte][val] = 0; - for (size_t bit = 0; bit < CHAR_BIT; bit++) { - // Does this mean byte_lookup[*][0] == 0? -RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } - } -} - #endif //H3_H From 529d12037672d34fd4d1ba5f0d291fd6214f41d4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:07:31 -0700 Subject: [PATCH 26/50] Make H3 seed configurable. 
--- src/H3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/H3.h b/src/H3.h index 50afda5688..11b0cd79a5 100644 --- a/src/H3.h +++ b/src/H3.h @@ -65,7 +65,7 @@ template class H3 { T byte_lookup[N][H3_BYTE_RANGE]; public: - H3() + H3(T seed = bro_random()) { T bit_lookup[N * CHAR_BIT]; @@ -74,7 +74,7 @@ public: bit_lookup[bit] = 0; for ( size_t i = 0; i < sizeof(T)/2; i++ ) // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); } for ( size_t byte = 0; byte < N; byte++ ) From a6d7b7856e87c3a15ba7009ccfb7d6550d1dcfcc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:12:00 -0700 Subject: [PATCH 27/50] Update H3 documentation (and minor style nits.) --- src/H3.h | 60 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/src/H3.h b/src/H3.h index 11b0cd79a5..2eda14d276 100644 --- a/src/H3.h +++ b/src/H3.h @@ -49,9 +49,9 @@ // hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed // together to get the same result as hashing the full string. // Any number of hash functions can be created by creating new instances of H3, -// with the same or different template parameters. The hash function is -// randomly generated using bro_random(); you must call init_random_seed() -// before the H3 constructor if you wish to seed it. +// with the same or different template parameters. The hash function +// constructor takes a seed as argument which defaults to a call to +// bro_random(). #ifndef H3_H @@ -62,34 +62,34 @@ // The number of values representable by a byte. 
#define H3_BYTE_RANGE (UCHAR_MAX+1) -template class H3 { - T byte_lookup[N][H3_BYTE_RANGE]; +template +class H3 { public: - H3(T seed = bro_random()) + H3(T seed = bro_random()) + { + T bit_lookup[N * CHAR_BIT]; + + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { - T bit_lookup[N * CHAR_BIT]; - - for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) - { - bit_lookup[bit] = 0; - for ( size_t i = 0; i < sizeof(T)/2; i++ ) - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); - } - - for ( size_t byte = 0; byte < N; byte++ ) - { - for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) - { - byte_lookup[byte][val] = 0; - for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) - // Does this mean byte_lookup[*][0] == 0? -RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0? 
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + T operator()(const void* data, size_t size, size_t offset = 0) const { const unsigned char *p = static_cast(data); @@ -97,7 +97,7 @@ public: // loop optmized with Duff's Device register unsigned n = (size + 7) / 8; - switch (size % 8) { + switch ( size % 8 ) { case 0: do { result ^= byte_lookup[offset++][*p++]; case 7: result ^= byte_lookup[offset++][*p++]; case 6: result ^= byte_lookup[offset++][*p++]; @@ -106,11 +106,13 @@ public: case 3: result ^= byte_lookup[offset++][*p++]; case 2: result ^= byte_lookup[offset++][*p++]; case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); + } while ( --n > 0 ); } return result; } +private: + T byte_lookup[N][H3_BYTE_RANGE]; }; #endif //H3_H From d2d8aff81456413597b09b71557b0caabdb7af3d Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 09:22:48 -0700 Subject: [PATCH 28/50] Add utility function to access first random seed. --- src/util.cc | 13 +++++++++++++ src/util.h | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/src/util.cc b/src/util.cc index de9bd5b679..721ee10a7e 100644 --- a/src/util.cc +++ b/src/util.cc @@ -716,6 +716,8 @@ static bool write_random_seeds(const char* write_file, uint32 seed, static bool bro_rand_determistic = false; static unsigned int bro_rand_state = 0; +static bool first_seed_saved = false; +static unsigned int first_seed = 0; static void bro_srandom(unsigned int seed, bool deterministic) { @@ -800,6 +802,12 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file bro_srandom(seed, seeds_done); + if ( ! first_seed_saved ) + { + first_seed = seed; + first_seed_saved = true; + } + if ( ! 
hmac_key_set ) { MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key); @@ -811,6 +819,11 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file write_file); } +unsigned int initial_seed() + { + return first_seed; +} + bool have_random_seed() { return bro_rand_determistic; diff --git a/src/util.h b/src/util.h index 49bcbf318b..c3eebb04e3 100644 --- a/src/util.h +++ b/src/util.h @@ -165,6 +165,11 @@ extern void hmac_md5(size_t size, const unsigned char* bytes, extern void init_random_seed(uint32 seed, const char* load_file, const char* write_file); +// Retrieves the initial seed computed after the very first call to +// init_random_seed(). Repeated calls to init_random_seed() will not affect the +// return value of this function. +unsigned int initial_seed(); + // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); From 1576239f67ef2641135f95bdd331f3c1a54ee5ad Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 10:19:39 -0700 Subject: [PATCH 29/50] Support seeding for hashers. 
--- src/BloomFilter.cc | 11 +++++++++++ src/BloomFilter.h | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index e55db71e46..eff7eee733 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -74,6 +74,17 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) } +HashPolicy::Hasher::Hasher(size_t seed) + : h3_(seed) +{ +} + +HashPolicy::HashType +HashPolicy::Hasher::operator()(const void* x, size_t n) const + { + return h3_(x, n); + } + HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const { HashVector h(K(), 0); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 3b5d9efa71..65133621f9 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -96,7 +96,9 @@ protected: */ class Hasher { public: - HashType operator()(const void* x, size_t n) const { return h3_(x, n); } + Hasher(size_t seed); + + HashType operator()(const void* x, size_t n) const; private: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior From 79a6a26f9f70a937551a94a5dc83b2c5dafe1414 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 10:20:33 -0700 Subject: [PATCH 30/50] H3 does not check for zero length input. --- src/BloomFilter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index eff7eee733..6a44defc6d 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -82,7 +82,7 @@ HashPolicy::Hasher::Hasher(size_t seed) HashPolicy::HashType HashPolicy::Hasher::operator()(const void* x, size_t n) const { - return h3_(x, n); + return n == 0 ? 
0 : h3_(x, n); } HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const From 9f740642891664ee8f482285523969793d0063d0 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 14:02:14 -0700 Subject: [PATCH 31/50] Expose Bro's linear congruence PRNG as utility function. It was previously not possible to crank the wheel on the PRNG in a deterministic way without affecting the globally unique seed. The new extra utility function bro_prng takes a state in the form of a long int and returns the new PRNG state, now allowing arbitrary code parts to use the random number functionality. This commit also fixes a problem in the H3 constructor, which requires use of multiple seeds. The single seed passed in now serves as seed to crank out as many values as needed using bro_prng. --- src/H3.h | 1 + src/util.cc | 29 ++++++++++++++++++----------- src/util.h | 7 +++++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/H3.h b/src/H3.h index 2eda14d276..e2dc865147 100644 --- a/src/H3.h +++ b/src/H3.h @@ -72,6 +72,7 @@ public: for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { bit_lookup[bit] = 0; + seed = bro_prng(seed); for ( size_t i = 0; i < sizeof(T)/2; i++ ) // assume random() returns at least 16 random bits bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); diff --git a/src/util.cc b/src/util.cc index 721ee10a7e..cdd257d94f 100644 --- a/src/util.cc +++ b/src/util.cc @@ -829,22 +829,29 @@ bool have_random_seed() return bro_rand_determistic; } +long int bro_prng(long int state) + { + // Use our own simple linear congruence PRNG to make sure we are + // predictable across platforms. + static const long int m = 2147483647; + static const long int a = 16807; + const long int q = m / a; + const long int r = m % a; + + state = a * ( state % q ) - r * ( state / q ); + + if ( state <= 0 ) + state += m; + + return state; + } + long int bro_random() { if ( ! bro_rand_determistic ) return random(); // Use system PRNG. 
- // Use our own simple linear congruence PRNG to make sure we are - // predictable across platforms. - const long int m = 2147483647; - const long int a = 16807; - const long int q = m / a; - const long int r = m % a; - - bro_rand_state = a * ( bro_rand_state % q ) - r * ( bro_rand_state / q ); - - if ( bro_rand_state <= 0 ) - bro_rand_state += m; + bro_rand_state = bro_prng(bro_rand_state); return bro_rand_state; } diff --git a/src/util.h b/src/util.h index c3eebb04e3..0af401c668 100644 --- a/src/util.h +++ b/src/util.h @@ -173,9 +173,12 @@ unsigned int initial_seed(); // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); +// A simple linear congruence PRNG. It takes its state as argument and returns +// a new random value, which can serve as state for subsequent calls. +long int bro_prng(long int state); + // Replacement for the system random(), to which is normally falls back -// except when a seed has been given. In that case, we use our own -// predictable PRNG. +// except when a seed has been given. In that case, the function bro_prng. long int bro_random(); // Calls the system srandom() function with the given seed if not running From 532fbfb4d27ac9ee733dbcfebccbc91e652d4eb0 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:06:02 -0700 Subject: [PATCH 32/50] Factor implementation and change interface. When constructing a Bloom filter, one now has to pass a HashPolicy instance to it. This separates more clearly the concerns of hashing and Bloom filter management. This commit also changes the interface to initialize Bloom filters: there exist now two initialization functions, one for each type: (1) bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter (2) bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter The BiFs for adding elements and performing lookups remain the same. 
This essentially gives us "BiF polymorphism" at script land, where the initialization BiF constructs the most derived type while subsequent BiFs adhere to the same interface. The reason why we split up the constructor in this case is that we have not yet derived the math that computes the optimal number of hash functions for counting Bloom filters---users have to explicitly parameterize them for now. --- src/BloomFilter.cc | 159 +++++--------------------- src/BloomFilter.h | 172 ++++------------------------- src/CMakeLists.txt | 2 + src/CounterVector.cc | 75 +++++++++++++ src/CounterVector.h | 78 +++++++++++++ src/HashPolicy.cc | 72 ++++++++++++ src/HashPolicy.h | 90 +++++++++++++++ src/OpaqueVal.cc | 1 + src/bro.bif | 57 ++++++---- testing/btest/bifs/bloomfilter.bro | 20 ++-- testing/btest/istate/opaque.bro | 2 +- 11 files changed, 409 insertions(+), 319 deletions(-) create mode 100644 src/CounterVector.cc create mode 100644 src/CounterVector.h create mode 100644 src/HashPolicy.cc create mode 100644 src/HashPolicy.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6a44defc6d..0be64c18de 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,117 +1,16 @@ #include "BloomFilter.h" #include +#include "CounterVector.h" #include "Serializer.h" -CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) - { - } - -CounterVector::~CounterVector() - { - delete bits_; - } - -bool CounterVector::Increment(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -bool CounterVector::Decrement(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -CounterVector::count_type CounterVector::Count(size_type cell) const - { - // TODO - assert(! 
"not yet implemented"); - return 0; - } - -CounterVector::size_type CounterVector::Size() const - { - return bits_->Blocks() / width_; - } - -bool CounterVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } - -CounterVector* CounterVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_COUNTERVECTOR)); - } - -IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) - -bool CounterVector::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! bits_->Serialize(info) ) - return false; - return SERIALIZE(static_cast(width_)); - } - -bool CounterVector::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(SerialObj); - bits_ = BitVector::Unserialize(info); - if ( ! bits_ ) - return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) - return false; - width_ = static_cast(width); - return true; - } - - -HashPolicy::Hasher::Hasher(size_t seed) - : h3_(seed) -{ -} - -HashPolicy::HashType -HashPolicy::Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 
0 : h3_(x, n); - } - -HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const - { - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - - -HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const - { - HashType h1 = hasher1_(x, n); - HashType h2 = hasher2_(x, n); - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - BloomFilter::BloomFilter() : hash_(NULL) { } -BloomFilter::BloomFilter(size_t k) - : hash_(new hash_policy(k)) +BloomFilter::BloomFilter(const HashPolicy* hash_policy) + : hash_(hash_policy) { } @@ -135,7 +34,11 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - return SERIALIZE(static_cast(hash_->K())); + // FIXME: Since we have a fixed hashing policy, we just serialize the + // information needed to reconstruct it. + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -144,10 +47,15 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) uint16 k; if ( ! UNSERIALIZE(&k) ) return false; - hash_ = new hash_policy(static_cast(k)); + const char* name; + if ( ! UNSERIALIZE_STR(&name, 0) ) + return false; + // FIXME: for now Bloom filters always use double hashing. + hash_ = new DefaultHashing(k, name); return true; } + size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); @@ -163,11 +71,9 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { + // TODO: Ensure that x and y use the same HashPolicy before proceeding. 
BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); - // TODO: implement the hasher pool and make sure the new result gets the same - // number of (equal) hash functions. - //assert(x->hash_ == y->hash_); return result; } @@ -176,16 +82,10 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) - : BloomFilter(K(M(fp, capacity), capacity)) +BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) + : BloomFilter(hash_policy), + bits_(new BitVector(cells)) { - bits_ = new BitVector(M(fp, capacity)); - } - -BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) - : BloomFilter(K(cells, capacity)) - { - bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -203,13 +103,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -230,17 +130,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), - capacity)) - { - cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); - } - -CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(cells, capacity)) +CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, + size_t cells, size_t width) + : BloomFilter(hash_policy) { cells_ = new CounterVector(width, cells); } @@ -261,18 +153,19 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { + // TODO: Use partitioning. CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 65133621f9..189f4920b7 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,141 +3,9 @@ #include #include "BitVector.h" -#include "Hash.h" -#include "H3.h" +#include "HashPolicy.h" -/** - * A vector of counters, each of which have a fixed number of bits. - */ -class CounterVector : public SerialObj { -public: - typedef size_t size_type; - typedef uint64 count_type; - - /** - * Constructs a counter vector having cells of a given width. - * - * @param width The number of bits that each cell occupies. 
- * - * @param cells The number of cells in the bitvector. - */ - CounterVector(size_t width, size_t cells = 1024); - - ~CounterVector(); - - /** - * Increments a given cell. - * - * @param cell The cell to increment. - * - * @param value The value to add to the current counter in *cell*. - * - * @return `true` if adding *value* to the counter in *cell* succeeded. - */ - bool Increment(size_type cell, count_type value); - - /** - * Decrements a given cell. - * - * @param cell The cell to decrement. - * - * @param value The value to subtract from the current counter in *cell*. - * - * @return `true` if subtracting *value* from the counter in *cell* succeeded. - */ - bool Decrement(size_type cell, count_type value); - - /** - * Retrieves the counter of a given cell. - * - * @param cell The cell index to retrieve the count for. - * - * @return The counter associated with *cell*. - */ - count_type Count(size_type cell) const; - - /** - * Retrieves the number of cells in the storage. - * - * @return The number of cells. - */ - size_type Size() const; - - bool Serialize(SerialInfo* info) const; - static CounterVector* Unserialize(UnserialInfo* info); - -protected: - DECLARE_SERIAL(CounterVector); - - CounterVector() { } - -private: - BitVector* bits_; - size_t width_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - * @tparam Codomain An integral type. - */ -class HashPolicy { -public: - typedef hash_t HashType; - typedef std::vector HashVector; - - virtual ~HashPolicy() { } - size_t K() const { return k_; } - virtual HashVector Hash(const void* x, size_t n) const = 0; - -protected: - /** - * A functor that computes a universal hash function. - * @tparam Codomain An integral type. - */ - class Hasher { - public: - Hasher(size_t seed); - - HashType operator()(const void* x, size_t n) const; - private: - // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in - // Hash.h. 
I do not know how this value impacts the hash function behavior - // so I'll just copy it verbatim. (Matthias) - H3 h3_; - }; - - HashPolicy(size_t k) : k_(k) { } - -private: - const size_t k_; -}; - -/** - * The *default* hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. - */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k) : HashPolicy(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; +class CounterVector; /** * The abstract base class for Bloom filters. @@ -146,8 +14,6 @@ class BloomFilter : public SerialObj { public: // At this point we won't let the user choose the hash policy, but we might // open up the interface in the future. - typedef DoubleHashing hash_policy; - virtual ~BloomFilter(); /** @@ -180,13 +46,19 @@ protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter(); - BloomFilter(size_t k); - virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; + /** + * Constructs a Bloom filter. + * + * @param hash_policy The hash policy to use for this Bloom filter. + */ + BloomFilter(const HashPolicy* hash_policy); + + virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; + virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; private: - HashPolicy* hash_; + const HashPolicy* hash_; }; /** @@ -223,24 +95,18 @@ public: static BasicBloomFilter* Merge(const BasicBloomFilter* x, const BasicBloomFilter* y); - /** - * Constructs a basic Bloom filter with a given false-positive rate and - * capacity. 
- */ - BasicBloomFilter(double fp, size_t capacity); - /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(size_t cells, size_t capacity); + BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: BitVector* bits_; @@ -254,16 +120,16 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(double fp, size_t capacity, size_t width); - CountingBloomFilter(size_t cells, size_t capacity, size_t width); + CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, + size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1537bb04b0..f2c7ce6bad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -255,6 +255,7 @@ set(bro_SRCS ChunkedIO.cc CompHash.cc Conn.cc + CounterVector.cc DFA.cc DbgBreakpoint.cc DbgHelp.cc @@ -278,6 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc + HashPolicy.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/CounterVector.cc b/src/CounterVector.cc new file mode 100644 index 0000000000..8ed4c30427 --- /dev/null +++ b/src/CounterVector.cc @@ -0,0 +1,75 @@ +#include "CounterVector.h" + +#include "BitVector.h" +#include "Serializer.h" + +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), 
width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! bits_->Serialize(info) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + diff --git a/src/CounterVector.h b/src/CounterVector.h new file mode 100644 index 0000000000..ecc8fe90e0 --- /dev/null +++ b/src/CounterVector.h @@ -0,0 +1,78 @@ +#ifndef CounterVector_h +#define CounterVector_h + +#include "SerialObj.h" + +class BitVector; + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : public SerialObj { +public: + typedef size_t size_type; + typedef uint64 count_type; + + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. 
+ * + * @param cells The number of cells in the bitvector. + */ + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector() { } + +private: + BitVector* bits_; + size_t width_; +}; + +#endif diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc new file mode 100644 index 0000000000..d6fb4f3da4 --- /dev/null +++ b/src/HashPolicy.cc @@ -0,0 +1,72 @@ +#include "HashPolicy.h" + +#include "digest.h" + +Hasher::Hasher(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::hash_type Hasher::operator()(const void* x, size_t n) const + { + return n == 0 ? 
0 : h_(x, n); + } + +size_t Hasher::compute_seed(size_t seed, const std::string& extra) + { + u_char digest[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, digest); + return *reinterpret_cast(digest); + } + + +HashPolicy::HashPolicy(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHashing::DefaultHashing(size_t k, const std::string& name) + : HashPolicy(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hashers_.push_back(Hasher(i, name)); + } + +HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const + { + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +DoubleHashing::DoubleHashing(size_t k, const std::string& name) + : HashPolicy(k, name), + hasher1_(1, name), + hasher2_(2, name) + { + } + +HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const + { + hash_type h1 = hasher1_(x, n); + hash_type h2 = hasher2_(x, n); + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/HashPolicy.h b/src/HashPolicy.h new file mode 100644 index 0000000000..4660bc0080 --- /dev/null +++ b/src/HashPolicy.h @@ -0,0 +1,90 @@ +#ifndef HashPolicy_h +#define HashPolicy_h + +#include "Hash.h" +#include "H3.h" + +/** + * A functor that computes a universal hash function. + */ +class Hasher { +public: + typedef hash_t hash_type; + + /** + * Constructs a hasher seeded by a given seed and optionally an extra + * descriptor. + * + * @param seed The seed to use. + * + * @param extra If not `NULL`, the hasher will not mix in the initial seed + * but instead use this NUL-terminated string as additional seed. 
+ */ + Hasher(size_t seed, const std::string& extra = ""); + + /** + * Computes the hash digest of contiguous data. + * + * @param x A pointer to the beginning of the byte sequence to hash. + * + * @param n The length of the sequence pointed to by *x*. + */ + hash_type operator()(const void* x, size_t n) const; + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; +}; + +/** + * The abstract base class for hash policies that hash elements *k* times. + */ +class HashPolicy { +public: + typedef Hasher::hash_type hash_type; + typedef std::vector hash_vector; + + virtual ~HashPolicy() { } + + virtual hash_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + HashPolicy(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const /* override */; + +private: + std::vector hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const; + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +#endif diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 9dd5c7f980..8b82916689 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,6 +605,7 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + assert( type_ ); if ( ! 
type_->Serialize(info) ) return false; return bloom_filter_->Serialize(info); diff --git a/src/bro.bif b/src/bro.bif index 9b80c90dbf..a89b808888 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4986,42 +4986,55 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr #include "BloomFilter.h" %%} -## Initializes a Bloom filter data structure. +## Creates a basic Bloom filter. ## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## -## max: The maximum counter value associated with each each element in the -## Bloom filter. If greater than 1, each element in the set has a counter of -## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then -## becomes a cell of size *w* bits. Since the number number of cells is a -## function ## of *fp* and *capacity*, it is important to consider the effects -## on space when tuning this value. +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. ## ## Returns: A Bloom filter handle. 
-function bloomfilter_init%(fp: double, capacity: count, - max: count &default=1%): opaque of bloomfilter +function bloomfilter_basic_init%(fp: double, capacity: count, + name: string &default=""%): opaque of bloomfilter %{ if ( fp < 0.0 || fp > 1.0 ) { reporter->Error("false-positive rate must take value between 0 and 1"); return NULL; } - BloomFilter* bf; - if ( max == 1 ) - { - bf = new BasicBloomFilter(fp, capacity); - } - else - { - uint16 width = 0; - while ( max >>= 1 ) - ++width; - bf = new CountingBloomFilter(fp, capacity, width); - } - return new BloomFilterVal(bf); + + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); + return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + %} + +## Creates a counting Bloom filter. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying counter vector. +## +## max: The maximum counter value associated with each each element described +## by *w = ceil(log_2(max))* bits. Each bit in the underlying counter vector +## becomes a cell of size *w* bits. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. +## +## Returns: A Bloom filter handle. +function bloomfilter_counting_init%(k: count, cells: count, max: count, + name: string &default=""%): opaque of bloomfilter + %{ + const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + uint16 width = 0; + while ( max >>= 1 ) + ++width; + return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); %} ## Adds an element to a Bloom filter. 
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 769cec1200..3ff6a6668e 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -4,7 +4,7 @@ event bro_init() { # Basic usage with counts. - local bf_cnt = bloomfilter_init(0.1, 1000); + local bf_cnt = bloomfilter_basic_init(0.1, 1000); bloomfilter_add(bf_cnt, 42); bloomfilter_add(bf_cnt, 84); bloomfilter_add(bf_cnt, 168); @@ -16,23 +16,23 @@ event bro_init() bloomfilter_add(bf_cnt, "foo"); # Type mismatch # Basic usage with strings. - local bf_str = bloomfilter_init(0.9, 10); + local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); # FP - print bloomfilter_lookup(bf_str, "qux"); # FP + print bloomfilter_lookup(bf_str, "b4z"); # FP + print bloomfilter_lookup(bf_str, "quux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch # Edge cases. - local bf_edge0 = bloomfilter_init(0.000000000001, 1); - local bf_edge1 = bloomfilter_init(0.00000001, 100000000); - local bf_edge2 = bloomfilter_init(0.9999999, 1); - local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_basic_init(0.9999999, 1); + local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000); # Invalid parameters. 
- local bf_bug0 = bloomfilter_init(-0.5, 42); - local bf_bug1 = bloomfilter_init(1.1, 42); + local bf_bug0 = bloomfilter_basic_init(-0.5, 42); + local bf_bug1 = bloomfilter_basic_init(1.1, 42); } diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index ac3b2c0874..b387f9d6bc 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -82,7 +82,7 @@ event bro_init() if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; - bloomfilter_handle = bloomfilter_init(0.1, 100); + bloomfilter_handle = bloomfilter_basic_init(0.1, 100); for ( e in bloomfilter_elements ) bloomfilter_add(bloomfilter_handle, e); } From 85668e7054dd22bc783a620eaf88b04f2e4bb952 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:16:44 -0700 Subject: [PATCH 33/50] Remove lingering debug code. --- src/bro.bif | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bro.bif b/src/bro.bif index a89b808888..7c81966317 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5009,7 +5009,6 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); - fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); return new BloomFilterVal(new BasicBloomFilter(hp, cells)); %} From e6e5f4926f5a850c773af05b51d7004fc4899a7c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:26:35 -0700 Subject: [PATCH 34/50] Create hash policies through factory. 
--- src/BloomFilter.cc | 5 +---- src/HashPolicy.cc | 5 +++++ src/HashPolicy.h | 7 +++++++ src/bro.bif | 4 ++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 0be64c18de..59d411d8e2 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -34,8 +34,6 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // FIXME: Since we have a fixed hashing policy, we just serialize the - // information needed to reconstruct it. if ( ! SERIALIZE(static_cast(hash_->K())) ) return false; return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); @@ -50,8 +48,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! UNSERIALIZE_STR(&name, 0) ) return false; - // FIXME: for now Bloom filters always use double hashing. - hash_ = new DefaultHashing(k, name); + hash_ = HashPolicy::Create(k, name); return true; } diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc index d6fb4f3da4..7ce754be3c 100644 --- a/src/HashPolicy.cc +++ b/src/HashPolicy.cc @@ -32,6 +32,11 @@ size_t Hasher::compute_seed(size_t seed, const std::string& extra) } +HashPolicy* HashPolicy::Create(size_t k, const std::string& name) + { + return new DefaultHashing(k, name); + } + HashPolicy::HashPolicy(size_t k, const std::string& name) : k_(k), name_(name) { diff --git a/src/HashPolicy.h b/src/HashPolicy.h index 4660bc0080..7bdb968bfe 100644 --- a/src/HashPolicy.h +++ b/src/HashPolicy.h @@ -42,6 +42,13 @@ private: */ class HashPolicy { public: + /** + * Constructs the hashing policy used by the implementation. This factory + * function exists because the HashingPolicy class hierachy is not yet + * serializable. 
+ */ + static HashPolicy* Create(size_t k, const std::string& name); + typedef Hasher::hash_type hash_type; typedef std::vector hash_vector; diff --git a/src/bro.bif b/src/bro.bif index 7c81966317..d0ce066139 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,7 +5008,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); return new BloomFilterVal(new BasicBloomFilter(hp, cells)); %} @@ -5029,7 +5029,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; From 273629de366290f411f381fe5970fc672adf465f Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 18 Jun 2013 10:23:07 -0700 Subject: [PATCH 35/50] Only serialize Bloom filter type if available. --- src/OpaqueVal.cc | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 8b82916689..5a673c4a40 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,9 +605,13 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - assert( type_ ); - if ( ! type_->Serialize(info) ) + + bool is_typed = type_ != NULL; + if ( ! SERIALIZE(is_typed) ) return false; + if ( is_typed && ! 
type_->Serialize(info) ) + return false; + return bloom_filter_->Serialize(info); } @@ -615,13 +619,16 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - type_ = BroType::Unserialize(info); - if ( ! type_ ) + bool is_typed; + if ( ! UNSERIALIZE(&is_typed) ) return false; - TypeList* tl = new TypeList(type_); - tl->Append(type_); - hash_ = new CompositeHash(tl); - Unref(tl); + if ( is_typed ) + { + BroType* type = BroType::Unserialize(info); + if ( ! Typify(type) ) + return false; + Unref(type); + } bloom_filter_ = BloomFilter::Unserialize(info); return bloom_filter_ != NULL; From 5f70452a9ac816346c4e480d8de52b213630b5b7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 18 Jun 2013 10:40:00 -0700 Subject: [PATCH 36/50] Small fixes and style tweaks. --- src/BitVector.cc | 2 +- src/BloomFilter.cc | 1 + src/OpaqueVal.h | 4 +--- src/Type.cc | 6 +++--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index f029230609..64db32131f 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -473,7 +473,7 @@ bool BitVector::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(bits_.size())) ) return false; - for (size_t i = 0; i < bits_.size(); ++i) + for ( size_t i = 0; i < bits_.size(); ++i ) if ( ! SERIALIZE(static_cast(bits_[i])) ) return false; diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 59d411d8e2..a7727630f7 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -49,6 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; hash_ = HashPolicy::Create(k, name); + delete [] name; return true; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 4b45cad519..2362fdacfc 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -139,9 +139,7 @@ private: { const T* a = dynamic_cast(x->bloom_filter_); const T* b = dynamic_cast(y->bloom_filter_); - if ( a && b ) - return new BloomFilterVal(T::Merge(a, b)); - return NULL; + return a && b ? new BloomFilterVal(T::Merge(a, b)) : NULL; } BroType* type_; diff --git a/src/Type.cc b/src/Type.cc index 6461bf2560..f19de461cd 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -1311,19 +1311,19 @@ IMPLEMENT_SERIAL(OpaqueType, SER_OPAQUE_TYPE); bool OpaqueType::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_OPAQUE_TYPE, BroType); - return SERIALIZE(name); + return SERIALIZE_STR(name.c_str(), name.size()); } bool OpaqueType::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BroType); - char const* n; + const char* n; if ( ! UNSERIALIZE_STR(&n, 0) ) return false; - name = n; delete [] n; + return true; } From 40201a180e54a560711003f2e65e14be87a7b8e9 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 9 Jul 2013 21:00:53 -0700 Subject: [PATCH 37/50] Fixing for unserializion error. Because BloomFilter is a base class, with other classes derived from it, it needs special treatment. --- src/SerialTypes.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 9e4aef5b3b..85aed10bda 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -52,8 +52,6 @@ SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(BLOOMFILTER, 0x1700) -SERIAL_IS(BASICBLOOMFILTER, 0x1800) -SERIAL_IS(COUNTINGBLOOMFILTER, 0x1900) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -203,6 +201,11 @@ SERIAL_FUNC(BRO_FUNC, 2) SERIAL_FUNC(DEBUG_FUNC, 3) SERIAL_FUNC(BUILTIN_FUNC, 4) +#define SERIAL_BLOOMFILTER(name, val) SERIAL_CONST(name, val, BLOOMFILTER) +SERIAL_BLOOMFILTER(BLOOMFILTER, 1) +SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) +SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) @@ -210,8 +213,5 @@ SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) -SERIAL_CONST2(BLOOMFILTER) -SERIAL_CONST2(BASICBLOOMFILTER) -SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif From 446344ae998e8eef30a0f45a05dcea29efe4f032 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 10 Jul 2013 01:32:59 -0700 Subject: [PATCH 38/50] Add missing include for GCC. --- src/BloomFilter.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index a7727630f7..c59092b1e4 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,6 +1,7 @@ #include "BloomFilter.h" #include +#include #include "CounterVector.h" #include "Serializer.h" From fd2e155d1af26086d40e12d38f564b7954f4597e Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 17:34:25 +0200 Subject: [PATCH 39/50] Tweak hasher interface. 
--- src/BloomFilter.cc | 34 +++++++------- src/BloomFilter.h | 31 +++++++------ src/CMakeLists.txt | 2 +- src/HashPolicy.cc | 77 -------------------------------- src/HashPolicy.h | 97 ---------------------------------------- src/Hasher.cc | 79 ++++++++++++++++++++++++++++++++ src/Hasher.h | 109 +++++++++++++++++++++++++++++++++++++++++++++ src/bro.bif | 8 ++-- 8 files changed, 225 insertions(+), 212 deletions(-) delete mode 100644 src/HashPolicy.cc delete mode 100644 src/HashPolicy.h create mode 100644 src/Hasher.cc create mode 100644 src/Hasher.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c59092b1e4..f399bddeca 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -6,19 +6,19 @@ #include "Serializer.h" BloomFilter::BloomFilter() - : hash_(NULL) + : hasher_(NULL) { } -BloomFilter::BloomFilter(const HashPolicy* hash_policy) - : hash_(hash_policy) +BloomFilter::BloomFilter(const Hasher* hasher) + : hasher_(hasher) { } BloomFilter::~BloomFilter() { - if ( hash_ ) - delete hash_; + if ( hasher_ ) + delete hasher_; } bool BloomFilter::Serialize(SerialInfo* info) const @@ -35,9 +35,9 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) + if ( ! SERIALIZE(static_cast(hasher_->K())) ) return false; - return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); + return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -49,7 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; - hash_ = HashPolicy::Create(k, name); + hasher_ = Hasher::Create(k, name); delete [] name; return true; } @@ -70,7 +70,7 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same HashPolicy before proceeding. + // TODO: Ensure that x and y use the same Hasher before proceeding. BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; @@ -81,8 +81,8 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) - : BloomFilter(hash_policy), +BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) + : BloomFilter(hasher), bits_(new BitVector(cells)) { } @@ -102,13 +102,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -129,9 +129,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, +CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hash_policy) + : BloomFilter(hasher) { cells_ = new CounterVector(width, cells); } @@ -152,13 +152,13 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 189f4920b7..92f15c6070 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,7 +3,7 @@ #include #include "BitVector.h" -#include "HashPolicy.h" +#include "Hasher.h" class CounterVector; @@ -12,7 +12,7 @@ class CounterVector; */ class BloomFilter : public SerialObj { public: - // At this point we won't let the user choose the hash policy, but we might + // At this point we won't let the user choose the hasher, but we might // open up the interface in the future. virtual ~BloomFilter(); @@ -23,7 +23,7 @@ public: template void Add(const T& x) { - AddImpl(hash_->Hash(&x, sizeof(x))); + AddImpl((*hasher_)(x)); } /** @@ -36,7 +36,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(&x, sizeof(x))); + return CountImpl((*hasher_)(x)); } bool Serialize(SerialInfo* info) const; @@ -50,15 +50,15 @@ protected: /** * Constructs a Bloom filter. * - * @param hash_policy The hash policy to use for this Bloom filter. + * @param hasher The hasher to use for this Bloom filter. 
*/ - BloomFilter(const HashPolicy* hash_policy); + BloomFilter(const Hasher* hasher); - virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; private: - const HashPolicy* hash_; + const Hasher* hasher_; }; /** @@ -98,15 +98,15 @@ public: /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); + BasicBloomFilter(const Hasher* hasher, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: BitVector* bits_; @@ -120,16 +120,15 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, - size_t width); + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f2c7ce6bad..87a3db3b62 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -279,7 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc - HashPolicy.cc + Hasher.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc deleted file mode 100644 index 
7ce754be3c..0000000000 --- a/src/HashPolicy.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "HashPolicy.h" - -#include "digest.h" - -Hasher::Hasher(size_t seed, const std::string& extra) - : h_(compute_seed(seed, extra)) - { - } - -Hasher::hash_type Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 0 : h_(x, n); - } - -size_t Hasher::compute_seed(size_t seed, const std::string& extra) - { - u_char digest[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - else - { - sha256_update(&ctx, extra.c_str(), extra.size()); - } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, digest); - return *reinterpret_cast(digest); - } - - -HashPolicy* HashPolicy::Create(size_t k, const std::string& name) - { - return new DefaultHashing(k, name); - } - -HashPolicy::HashPolicy(size_t k, const std::string& name) - : k_(k), name_(name) - { - } - -DefaultHashing::DefaultHashing(size_t k, const std::string& name) - : HashPolicy(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hashers_.push_back(Hasher(i, name)); - } - -HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const - { - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - -DoubleHashing::DoubleHashing(size_t k, const std::string& name) - : HashPolicy(k, name), - hasher1_(1, name), - hasher2_(2, name) - { - } - -HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const - { - hash_type h1 = hasher1_(x, n); - hash_type h2 = hasher2_(x, n); - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - diff --git a/src/HashPolicy.h b/src/HashPolicy.h deleted file mode 100644 index 7bdb968bfe..0000000000 --- a/src/HashPolicy.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef HashPolicy_h -#define HashPolicy_h - -#include "Hash.h" 
-#include "H3.h" - -/** - * A functor that computes a universal hash function. - */ -class Hasher { -public: - typedef hash_t hash_type; - - /** - * Constructs a hasher seeded by a given seed and optionally an extra - * descriptor. - * - * @param seed The seed to use. - * - * @param extra If not `NULL`, the hasher will not mix in the initial seed - * but instead use this NUL-terminated string as additional seed. - */ - Hasher(size_t seed, const std::string& extra = ""); - - /** - * Computes the hash digest of contiguous data. - * - * @param x A pointer to the beginning of the byte sequence to hash. - * - * @param n The length of the sequence pointed to by *x*. - */ - hash_type operator()(const void* x, size_t n) const; - -private: - static size_t compute_seed(size_t seed, const std::string& extra); - - H3 h_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - */ -class HashPolicy { -public: - /** - * Constructs the hashing policy used by the implementation. This factory - * function exists because the HashingPolicy class hierachy is not yet - * serializable. - */ - static HashPolicy* Create(size_t k, const std::string& name); - - typedef Hasher::hash_type hash_type; - typedef std::vector hash_vector; - - virtual ~HashPolicy() { } - - virtual hash_vector Hash(const void* x, size_t n) const = 0; - - size_t K() const { return k_; } - const std::string& Name() const { return name_; } - -protected: - HashPolicy(size_t k, const std::string& name); - -private: - const size_t k_; - std::string name_; -}; - -/** - * The default hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const /* override */; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. 
- */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; - -#endif diff --git a/src/Hasher.cc b/src/Hasher.cc new file mode 100644 index 0000000000..045adcd174 --- /dev/null +++ b/src/Hasher.cc @@ -0,0 +1,79 @@ +#include "Hasher.h" + +#include "digest.h" + +Hasher::UHF::UHF(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } + +size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. 
+ return *reinterpret_cast(buf); + } + + +Hasher* Hasher::Create(size_t k, const std::string& name) + { + return new DefaultHasher(k, name); + } + +Hasher::Hasher(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHasher::DefaultHasher(size_t k, const std::string& name) + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } + +Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DoubleHasher::DoubleHasher(size_t k, const std::string& name) + : Hasher(k, name), + h1_(1, name), + h2_(2, name) + { + } + +Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/Hasher.h b/src/Hasher.h new file mode 100644 index 0000000000..8d0af6b03f --- /dev/null +++ b/src/Hasher.h @@ -0,0 +1,109 @@ +#ifndef Hasher_h +#define Hasher_h + +#include "Hash.h" +#include "H3.h" + +/** + * The abstract base class for hashers, i.e., constructs which hash elements + * *k* times. + */ +class Hasher { +public: + typedef hash_t digest; + typedef std::vector digest_vector; + + /** + * Constructs the hasher used by the implementation. + * + * @todo This factory function exists because the Hasher class + * hierarchy is not yet serializable. + */ + static Hasher* Create(size_t k, const std::string& name); + + virtual ~Hasher() { } + + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + /** + * A universal hash function family.
+ */ + class UHF { + public: + /** + * Constructs an H3 hash function seeded with a given seed and an optional + * extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this parameter replaces the initial seed to + * compute the seed for this instance, i.e., the string is + * mixed into the seed computation + * in place of the initial Bro seed. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + digest hash(const void* x, size_t n) const; + + private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; + }; + + Hasher(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHasher : public Hasher { +public: + DefaultHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + std::vector hash_functions_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions.
+ */ +class DoubleHasher : public Hasher { +public: + DoubleHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + UHF h1_; + UHF h2_; +}; + +#endif diff --git a/src/bro.bif b/src/bro.bif index d0ce066139..71f8c0716f 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,8 +5008,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); - return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} ## Creates a counting Bloom filter. @@ -5029,11 +5029,11 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); + const Hasher* h = Hasher::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; - return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); %} ## Adds an element to a Bloom filter. From 79a2e4b5d5c28076a8db1857d3ea6a8891e1ef7c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 22:41:48 +0200 Subject: [PATCH 40/50] Implement missing CounterVector functions. 
--- src/CounterVector.cc | 66 ++++++++++++++++++++++++++++++++++++++------ src/CounterVector.h | 15 ++++++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 8ed4c30427..a661492313 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -1,5 +1,6 @@ #include "CounterVector.h" +#include #include "BitVector.h" #include "Serializer.h" @@ -15,23 +16,66 @@ CounterVector::~CounterVector() bool CounterVector::Increment(size_type cell, count_type value) { - // TODO - assert(! "not yet implemented"); + assert(cell < Size()); + assert(value != 0); + size_t lsb = cell * width_; + if (value >= Max()) + { + bool r = false; + for (size_t i = 0; i < width_; ++i) + if (! (*bits_)[lsb + i]) + { + bits_->Set(lsb + i); + if (! r) + r = true; + } + return r; + } + bool carry = false; + for (size_t i = 0; i < width_; ++i) + { + bool b1 = (*bits_)[lsb + i]; + bool b2 = value & (1 << i); + (*bits_)[lsb + i] ^= b2 != carry; // bit1 ^ bit2 ^ carry + carry = carry ? b1 || b2 : b1 && b2; + } + if (! carry) + return true; + for (size_t i = 0; i < width_; ++i) + bits_->Set(lsb + i); return false; } bool CounterVector::Decrement(size_type cell, count_type value) { - // TODO - assert(! "not yet implemented"); - return false; + assert(cell < Size()); + size_t lsb = cell * width_; + bool success; + while (value --> 0) + { + success = false; + for (size_t i = lsb; i < lsb + width_; ++i) + if ((*bits_)[i]) + { + bits_->Reset(i); + while (i && i > lsb) + bits_->Set(--i); + success = true; + break; + } + } + return success; } CounterVector::count_type CounterVector::Count(size_type cell) const { - // TODO - assert(! 
"not yet implemented"); - return 0; + assert(cell < Size()); + size_t cnt = 0, order = 1; + size_t lsb = cell * width_; + for (size_t i = lsb; i < lsb + width_; ++i, order <<= 1) + if ((*bits_)[i]) + cnt |= order; + return cnt; } CounterVector::size_type CounterVector::Size() const @@ -39,6 +83,12 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +size_t CounterVector::Max() const + { + return std::numeric_limits::max() + >> (std::numeric_limits::digits - width_); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/CounterVector.h b/src/CounterVector.h index ecc8fe90e0..868beaca9b 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -19,6 +19,8 @@ public: * @param width The number of bits that each cell occupies. * * @param cells The number of cells in the bitvector. + * + * @pre `cells > 0 && width > 0` */ CounterVector(size_t width, size_t cells = 1024); @@ -32,6 +34,8 @@ public: * @param value The value to add to the current counter in *cell*. * * @return `true` if adding *value* to the counter in *cell* succeeded. + * + * @pre `cell < Size()` */ bool Increment(size_type cell, count_type value); @@ -43,6 +47,8 @@ public: * @param value The value to subtract from the current counter in *cell*. * * @return `true` if subtracting *value* from the counter in *cell* succeeded. + * + * @pre `cell < Size()` */ bool Decrement(size_type cell, count_type value); @@ -52,6 +58,8 @@ public: * @param cell The cell index to retrieve the count for. * * @return The counter associated with *cell*. + * + * @pre `cell < Size()` */ count_type Count(size_type cell) const; @@ -62,6 +70,13 @@ public: */ size_type Size() const; + /** + * Computes the maximum counter value. + * + * @return The maximum counter value based on the width. 
+ */ + size_t Max() const; + bool Serialize(SerialInfo* info) const; static CounterVector* Unserialize(UnserialInfo* info); From 7a0240694ec69506b0789029ba48bb56ae703206 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 14:07:47 +0200 Subject: [PATCH 41/50] Fix and test counting Bloom filter. --- src/BloomFilter.cc | 9 ++++--- src/CounterVector.cc | 5 ++-- src/CounterVector.h | 4 +-- src/bro.bif | 8 +++++- .../btest/Baseline/bifs.bloomfilter/output | 6 +++++ testing/btest/bifs/bloomfilter.bro | 26 ++++++++++++++++++- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index f399bddeca..3c7bac80f1 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -131,9 +131,9 @@ CountingBloomFilter::CountingBloomFilter() CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hasher) + : BloomFilter(hasher), + cells_(new CounterVector(width, cells)) { - cells_ = new CounterVector(width, cells); } @@ -152,10 +152,12 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } +// TODO: Use partitioning in add/count to allow for reusing CMS bounds. + void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % cells_->Size(), 1); + cells_->Increment(h[i] % cells_->Size()); } size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const @@ -164,7 +166,6 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - // TODO: Use partitioning. 
CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/CounterVector.cc b/src/CounterVector.cc index a661492313..831b95386f 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -5,7 +5,8 @@ #include "Serializer.h" CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) + : bits_(new BitVector(width * cells)), + width_(width) { } @@ -80,7 +81,7 @@ CounterVector::count_type CounterVector::Count(size_type cell) const CounterVector::size_type CounterVector::Size() const { - return bits_->Blocks() / width_; + return bits_->Size() / width_; } size_t CounterVector::Max() const diff --git a/src/CounterVector.h b/src/CounterVector.h index 868beaca9b..2d99bb44d8 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -37,7 +37,7 @@ public: * * @pre `cell < Size()` */ - bool Increment(size_type cell, count_type value); + bool Increment(size_type cell, count_type value = 1); /** * Decrements a given cell. @@ -50,7 +50,7 @@ public: * * @pre `cell < Size()` */ - bool Decrement(size_type cell, count_type value); + bool Decrement(size_type cell, count_type value = 1); /** * Retrieves the counter of a given cell. 
diff --git a/src/bro.bif b/src/bro.bif index 71f8c0716f..a33a2248dd 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5029,8 +5029,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ + if ( max == 0 ) + { + reporter->Error("max counter value must be greater than 0"); + return NULL; + } + const Hasher* h = Hasher::Create(k, name->CheckString()); - uint16 width = 0; + uint16 width = 1; while ( max >>= 1 ) ++width; return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 65aaa8b07c..80847a81b9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -6,3 +6,9 @@ 1 1 1 +1 +2 +3 +3 +2 +3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3ff6a6668e..ab0bf86c22 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -1,7 +1,7 @@ # @TEST-EXEC: bro -b %INPUT >output # @TEST-EXEC: btest-diff output -event bro_init() +function test_basic_bloom_filter() { # Basic usage with counts. 
local bf_cnt = bloomfilter_basic_init(0.1, 1000); @@ -36,3 +36,27 @@ event bro_init() local bf_bug0 = bloomfilter_basic_init(-0.5, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42); } + +function test_counting_bloom_filter() + { + local bf = bloomfilter_counting_init(3, 16, 3); + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 1 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 2 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 3 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # still 3 + + bloomfilter_add(bf, "bar"); + bloomfilter_add(bf, "bar"); + print bloomfilter_lookup(bf, "bar"); # 2 + print bloomfilter_lookup(bf, "foo"); # still 3 + } + +event bro_init() + { + test_basic_bloom_filter(); + test_counting_bloom_filter(); + } From a3c61fe7eb6c43622de17df0e818def20cab7e90 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 15:39:13 +0200 Subject: [PATCH 42/50] Use half adder for bitwise addition and subtraction. --- src/CounterVector.cc | 53 +++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 831b95386f..f46fae1b98 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -20,52 +20,35 @@ bool CounterVector::Increment(size_type cell, count_type value) assert(cell < Size()); assert(value != 0); size_t lsb = cell * width_; - if (value >= Max()) - { - bool r = false; - for (size_t i = 0; i < width_; ++i) - if (! (*bits_)[lsb + i]) - { - bits_->Set(lsb + i); - if (! r) - r = true; - } - return r; - } bool carry = false; - for (size_t i = 0; i < width_; ++i) - { + for ( size_t i = 0; i < width_; ++i ) + { bool b1 = (*bits_)[lsb + i]; bool b2 = value & (1 << i); - (*bits_)[lsb + i] ^= b2 != carry; // bit1 ^ bit2 ^ carry - carry = carry ? b1 || b2 : b1 && b2; - } - if (! 
carry) - return true; - for (size_t i = 0; i < width_; ++i) - bits_->Set(lsb + i); - return false; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + if ( carry ) + for ( size_t i = 0; i < width_; ++i ) + bits_->Set(lsb + i); + return ! carry; } bool CounterVector::Decrement(size_type cell, count_type value) { assert(cell < Size()); + assert(value != 0); + value = ~value + 1; // A - B := A + ~B + 1 + bool carry = false; size_t lsb = cell * width_; - bool success; - while (value --> 0) + for ( size_t i = 0; i < width_; ++i ) { - success = false; - for (size_t i = lsb; i < lsb + width_; ++i) - if ((*bits_)[i]) - { - bits_->Reset(i); - while (i && i > lsb) - bits_->Set(--i); - success = true; - break; - } + bool b1 = bits_[lsb + i]; + bool b2 = value & (1 << i); + bits_[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); } - return success; + return carry; } CounterVector::count_type CounterVector::Count(size_type cell) const From 9c2f57a9d9d5667d05e43efd3c8541ff9d33382a Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 16:36:54 +0200 Subject: [PATCH 43/50] Make counter vectors mergeable. 
--- src/CounterVector.cc | 42 ++++++++++++++++++++++++++++++++++++++++-- src/CounterVector.h | 27 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index f46fae1b98..75c62b208a 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -43,9 +43,9 @@ bool CounterVector::Decrement(size_type cell, count_type value) size_t lsb = cell * width_; for ( size_t i = 0; i < width_; ++i ) { - bool b1 = bits_[lsb + i]; + bool b1 = (*bits_)[lsb + i]; bool b2 = value & (1 << i); - bits_[lsb + i] = b1 ^ b2 ^ carry; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); } return carry; @@ -67,12 +67,50 @@ CounterVector::size_type CounterVector::Size() const return bits_->Size() / width_; } +size_t CounterVector::Width() const + { + return width_; + } + size_t CounterVector::Max() const { return std::numeric_limits::max() >> (std::numeric_limits::digits - width_); } +CounterVector& CounterVector::Merge(const CounterVector& other) + { + assert(Size() == other.Size()); + assert(Width() == other.Width()); + for ( size_t cell = 0; cell < Size(); ++cell ) + { + size_t lsb = cell * width_; + bool carry = false; + for ( size_t i = 0; i < width_; ++i ) + { + bool b1 = (*bits_)[lsb + i]; + bool b2 = (*other.bits_)[lsb + i]; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + if ( carry ) + for ( size_t i = 0; i < width_; ++i ) + bits_->Set(lsb + i); + } + return *this; + } + +CounterVector& CounterVector::operator|=(const CounterVector& other) +{ + return Merge(other); +} + +CounterVector operator|(const CounterVector& x, const CounterVector& y) +{ + CounterVector cv(x); + return cv |= y; +} + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/CounterVector.h b/src/CounterVector.h index 2d99bb44d8..4ab221ff6b 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h 
@@ -70,6 +70,13 @@ public: */ size_type Size() const; + /** + * Retrieves the counter width. + * + * @return The number of bits per counter. + */ + size_t Width() const; + /** * Computes the maximum counter value. * @@ -77,6 +84,26 @@ public: */ size_t Max() const; + /** + * Merges another counter vector into this instance by *adding* the counters + * of each cell. + * + * @param other The counter vector to merge into this instance. + * + * @return A reference to `*this`. + * + * @pre `Size() == other.Size() && Width() == other.Width()` + */ + CounterVector& Merge(const CounterVector& other); + + /** + * An alias for ::Merge. + */ + CounterVector& operator|=(const CounterVector& other); + + friend CounterVector operator|(const CounterVector& x, + const CounterVector& y); + bool Serialize(SerialInfo* info) const; static CounterVector* Unserialize(UnserialInfo* info); From eb64f5f9616e84295bc17537e8db57ae4f089c41 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 18:03:55 +0200 Subject: [PATCH 44/50] Make hash functions equality comparable. --- src/H3.h | 12 ++++++ src/Hasher.cc | 101 +++++++++++++++++++++++++++++++------------------- src/Hasher.h | 18 +++++++++ 3 files changed, 93 insertions(+), 38 deletions(-) diff --git a/src/H3.h b/src/H3.h index e2dc865147..123dd6f374 100644 --- a/src/H3.h +++ b/src/H3.h @@ -58,6 +58,7 @@ #define H3_H #include +#include // The number of values representable by a byte. #define H3_BYTE_RANGE (UCHAR_MAX+1) @@ -112,6 +113,17 @@ public: return result; } + + friend bool operator==(const H3& x, const H3& y) + { + return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE); + } + + friend bool operator!=(const H3& x, const H3& y) + { + return !
(x == y); + } + private: T byte_lookup[N][H3_BYTE_RANGE]; }; diff --git a/src/Hasher.cc b/src/Hasher.cc index 045adcd174..7a8d9a67e0 100644 --- a/src/Hasher.cc +++ b/src/Hasher.cc @@ -8,56 +8,69 @@ Hasher::UHF::UHF(size_t seed, const std::string& extra) } Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const - { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h_(x, n); - } + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) - { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) { unsigned int first_seed = initial_seed(); sha256_update(&ctx, &first_seed, sizeof(first_seed)); } else { - sha256_update(&ctx, extra.c_str(), extra.size()); + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. + return *reinterpret_cast(buf); } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - // Take the first sizeof(size_t) bytes as seed. 
- return *reinterpret_cast(buf); - } Hasher* Hasher::Create(size_t k, const std::string& name) - { - return new DefaultHasher(k, name); - } + { + return new DefaultHasher(k, name); + } Hasher::Hasher(size_t k, const std::string& name) - : k_(k), name_(name) + : k_(k), name_(name) { } DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions_.push_back(UHF(i, name)); - } + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const - { - digest_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hash_functions_[i](x, n); - return h; - } + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DefaultHasher* DefaultHasher::Clone() const + { + return new DefaultHasher(*this); + } + +bool DefaultHasher::Equals(const Hasher* other) const /* final */ + { + if ( typeid(*this) != typeid(*other) ) + return false; + const DefaultHasher* o = static_cast(other); + return hash_functions_ == o->hash_functions_; + } DoubleHasher::DoubleHasher(size_t k, const std::string& name) : Hasher(k, name), @@ -67,13 +80,25 @@ DoubleHasher::DoubleHasher(size_t k, const std::string& name) } Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const - { - digest h1 = h1_(x, n); - digest h2 = h2_(x, n); - digest_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } +DoubleHasher* DoubleHasher::Clone() const + { + return new DoubleHasher(*this); + } + +bool DoubleHasher::Equals(const Hasher* other) const /* final */ + { + if ( typeid(*this) != typeid(*other) ) + return false; + const DoubleHasher* o = 
static_cast(other); + return h1_ == o->h1_ && h2_ == o->h2_; + } diff --git a/src/Hasher.h b/src/Hasher.h index 8d0af6b03f..12393e7217 100644 --- a/src/Hasher.h +++ b/src/Hasher.h @@ -31,6 +31,10 @@ public: virtual digest_vector Hash(const void* x, size_t n) const = 0; + virtual Hasher* Clone() const = 0; + + virtual bool Equals(const Hasher* other) const = 0; + size_t K() const { return k_; } const std::string& Name() const { return name_; } @@ -64,6 +68,16 @@ protected: return hash(x, n); } + friend bool operator==(const UHF& x, const UHF& y) + { + return x.h_ == y.h_; + } + + friend bool operator!=(const UHF& x, const UHF& y) + { + return ! (x == y); + } + digest hash(const void* x, size_t n) const; private: @@ -87,6 +101,8 @@ public: DefaultHasher(size_t k, const std::string& name); virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DefaultHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: std::vector hash_functions_; @@ -100,6 +116,8 @@ public: DoubleHasher(size_t k, const std::string& name); virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DoubleHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: UHF h1_; From a39f980cd493e64a6bb4016c47923e8754b059dc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 18:11:12 +0200 Subject: [PATCH 45/50] Implement and test Bloom filter merging. 
--- src/BloomFilter.cc | 22 ++++++++++++++---- src/BloomFilter.h | 1 - src/CounterVector.cc | 6 +++++ src/CounterVector.h | 8 +++++++ src/Hasher.cc | 4 ++-- src/OpaqueVal.cc | 2 +- src/OpaqueVal.h | 21 ++++++++++++++--- .../btest/Baseline/bifs.bloomfilter/output | 7 ++++++ testing/btest/bifs/bloomfilter.bro | 23 ++++++++++++++++++- 9 files changed, 81 insertions(+), 13 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 3c7bac80f1..889c7bafe1 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -70,8 +70,13 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same Hasher before proceeding. + if ( ! x->hasher_->Equals(y->hasher_) ) + { + reporter->InternalError("incompatible hashers during Bloom filter merge"); + return NULL; + } BasicBloomFilter* result = new BasicBloomFilter(); + result->hasher_ = x->hasher_->Clone(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; } @@ -119,10 +124,17 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, const CountingBloomFilter* y) -{ - assert(! "not yet implemented"); - return NULL; -} + { + if ( ! 
x->hasher_->Equals(y->hasher_) ) + { + reporter->InternalError("incompatible hashers during Bloom filter merge"); + return NULL; + } + CountingBloomFilter* result = new CountingBloomFilter(); + result->hasher_ = x->hasher_->Clone(); + result->cells_ = new CounterVector(*x->cells_ | *y->cells_); + return result; + } CountingBloomFilter::CountingBloomFilter() : cells_(NULL) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 92f15c6070..070aa2dc25 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -57,7 +57,6 @@ protected: virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; -private: const Hasher* hasher_; }; diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 75c62b208a..cf3083de9e 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -10,6 +10,12 @@ CounterVector::CounterVector(size_t width, size_t cells) { } +CounterVector::CounterVector(const CounterVector& other) + : bits_(new BitVector(*other.bits_)), + width_(other.width_) + { + } + CounterVector::~CounterVector() { delete bits_; diff --git a/src/CounterVector.h b/src/CounterVector.h index 4ab221ff6b..eced5956d4 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -9,6 +9,7 @@ class BitVector; * A vector of counters, each of which have a fixed number of bits. */ class CounterVector : public SerialObj { + CounterVector& operator=(const CounterVector&); public: typedef size_t size_type; typedef uint64 count_type; @@ -24,6 +25,13 @@ public: */ CounterVector(size_t width, size_t cells = 1024); + /** + * Copy-constructs a counter vector. + * + * @param other The counter vector to copy. 
+ */ + CounterVector(const CounterVector& other); + ~CounterVector(); /** diff --git a/src/Hasher.cc b/src/Hasher.cc index 7a8d9a67e0..2a889c7e09 100644 --- a/src/Hasher.cc +++ b/src/Hasher.cc @@ -64,7 +64,7 @@ DefaultHasher* DefaultHasher::Clone() const return new DefaultHasher(*this); } -bool DefaultHasher::Equals(const Hasher* other) const /* final */ +bool DefaultHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; @@ -94,7 +94,7 @@ DoubleHasher* DoubleHasher::Clone() const return new DoubleHasher(*this); } -bool DoubleHasher::Equals(const Hasher* other) const /* final */ +bool DoubleHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 5a673c4a40..36038d679a 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,6 +1,5 @@ #include "OpaqueVal.h" -#include "BloomFilter.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -587,6 +586,7 @@ BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, else if ( (result = DoMerge(x, y)) ) return result; + reporter->InternalError("failed to merge Bloom filters"); return NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 2362fdacfc..22c3dbfade 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -3,6 +3,7 @@ #ifndef OPAQUEVAL_H #define OPAQUEVAL_H +#include "BloomFilter.h" #include "RandTest.h" #include "Val.h" #include "digest.h" @@ -137,9 +138,23 @@ private: static BloomFilterVal* DoMerge(const BloomFilterVal* x, const BloomFilterVal* y) { - const T* a = dynamic_cast(x->bloom_filter_); - const T* b = dynamic_cast(y->bloom_filter_); - return a && b ? 
new BloomFilterVal(T::Merge(a, b)) : NULL; + if ( typeid(*x->bloom_filter_) != typeid(*y->bloom_filter_) ) + { + reporter->InternalError("cannot merge different Bloom filter types"); + return NULL; + } + if ( typeid(T) != typeid(*x->bloom_filter_) ) + return NULL; + const T* a = static_cast(x->bloom_filter_); + const T* b = static_cast(y->bloom_filter_); + BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); + assert(merged); + if ( ! merged->Typify(x->Type()) ) + { + reporter->InternalError("failed to set type on merged Bloom filter"); + return NULL; + } + return merged; } BroType* type_; diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 80847a81b9..4fe2ae1ecc 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -7,8 +7,15 @@ 1 1 1 +1 +1 +1 +1 2 3 3 2 3 +3 +3 +2 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index ab0bf86c22..f69ddbda0c 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -35,11 +35,21 @@ function test_basic_bloom_filter() # Invalid parameters. 
local bf_bug0 = bloomfilter_basic_init(-0.5, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42); + + # Merging + local bf_cnt2 = bloomfilter_basic_init(0.1, 1000); + bloomfilter_add(bf_cnt2, 42); + bloomfilter_add(bf_cnt, 100); + local bf_merged = bloomfilter_merge(bf_cnt, bf_cnt2); + print bloomfilter_lookup(bf_merged, 42); + print bloomfilter_lookup(bf_merged, 84); + print bloomfilter_lookup(bf_merged, 100); + print bloomfilter_lookup(bf_merged, 168); } function test_counting_bloom_filter() { - local bf = bloomfilter_counting_init(3, 16, 3); + local bf = bloomfilter_counting_init(3, 32, 3); bloomfilter_add(bf, "foo"); print bloomfilter_lookup(bf, "foo"); # 1 bloomfilter_add(bf, "foo"); @@ -49,10 +59,21 @@ function test_counting_bloom_filter() bloomfilter_add(bf, "foo"); print bloomfilter_lookup(bf, "foo"); # still 3 + bloomfilter_add(bf, "bar"); bloomfilter_add(bf, "bar"); print bloomfilter_lookup(bf, "bar"); # 2 print bloomfilter_lookup(bf, "foo"); # still 3 + + # Merging + local bf2 = bloomfilter_counting_init(3, 32, 3); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "bar"); + local bf_merged = bloomfilter_merge(bf, bf2); + print bloomfilter_lookup(bf_merged, "foo"); + print bloomfilter_lookup(bf_merged, "bar"); + print bloomfilter_lookup(bf_merged, "baz"); } event bro_init() From 474107fe40c22dec977d4e9ee3dad0edcbc02344 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 23 Jul 2013 17:16:57 -0700 Subject: [PATCH 46/50] Broifying the code. Also extending API documentation a bit more and fixing a memory leak. 
--- src/Func.cc | 4 +- src/H3.h | 4 +- src/OpaqueVal.cc | 159 ++-- src/OpaqueVal.h | 67 +- src/Type.cc | 1 + src/probabilistic/BitVector.cc | 777 ++++++++++-------- src/probabilistic/BitVector.h | 575 +++++++------ src/probabilistic/BloomFilter.cc | 229 +++--- src/probabilistic/BloomFilter.h | 229 ++++-- src/probabilistic/CounterVector.cc | 244 +++--- src/probabilistic/CounterVector.h | 208 ++--- src/probabilistic/Hasher.cc | 63 +- src/probabilistic/Hasher.h | 262 +++--- src/probabilistic/bloom-filter.bif | 122 +-- src/util.cc | 20 +- src/util.h | 8 +- .../btest/Baseline/bifs.bloomfilter/output | 6 + testing/btest/bifs/bloomfilter.bro | 2 +- 18 files changed, 1651 insertions(+), 1329 deletions(-) diff --git a/src/Func.cc b/src/Func.cc index a0d2299933..483699668f 100644 --- a/src/Func.cc +++ b/src/Func.cc @@ -560,7 +560,7 @@ void builtin_error(const char* msg, BroObj* arg) #include "reporter.bif.func_def" #include "strings.bif.func_def" -// TODO: Add a nicer mechanism to pull subdirectory bifs automatically. +// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. #include "probabilistic/bloom-filter.bif.h" void init_builtin_funcs() @@ -577,7 +577,7 @@ void init_builtin_funcs() #include "reporter.bif.func_init" #include "strings.bif.func_init" -// TODO: Add a nicer mechanism to pull subdirectory bifs automatically. +// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. 
#include "probabilistic/bloom-filter.bif.init.cc" did_builtin_init = true; diff --git a/src/H3.h b/src/H3.h index 123dd6f374..8ea5848816 100644 --- a/src/H3.h +++ b/src/H3.h @@ -100,8 +100,8 @@ public: // loop optmized with Duff's Device register unsigned n = (size + 7) / 8; switch ( size % 8 ) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; + case 0: do { result ^= byte_lookup[offset++][*p++]; + case 7: result ^= byte_lookup[offset++][*p++]; case 6: result ^= byte_lookup[offset++][*p++]; case 5: result ^= byte_lookup[offset++][*p++]; case 4: result ^= byte_lookup[offset++][*p++]; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 04032b2cfc..efdd890f70 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,5 +1,6 @@ -#include "OpaqueVal.h" +// See the file "COPYING" in the main distribution directory for copyright. +#include "OpaqueVal.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -518,87 +519,89 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) } BloomFilterVal::BloomFilterVal() - : OpaqueVal(bloomfilter_type), - type_(NULL), - hash_(NULL), - bloom_filter_(NULL) + : OpaqueVal(bloomfilter_type) { + type = 0; + hash = 0; + bloom_filter = 0; } BloomFilterVal::BloomFilterVal(OpaqueType* t) - : OpaqueVal(t), - type_(NULL), - hash_(NULL), - bloom_filter_(NULL) + : OpaqueVal(t) { + type = 0; + hash = 0; + bloom_filter = 0; } BloomFilterVal::BloomFilterVal(probabilistic::BloomFilter* bf) - : OpaqueVal(bloomfilter_type), - type_(NULL), - hash_(NULL), - bloom_filter_(bf) + : OpaqueVal(bloomfilter_type) { + type = 0; + hash = 0; + bloom_filter = bf; } -bool BloomFilterVal::Typify(BroType* type) - { - if ( type_ ) - return false; - type_ = type; - type_->Ref(); - TypeList* tl = new TypeList(type_); - tl->Append(type_); - hash_ = new CompositeHash(tl); - Unref(tl); - return true; - } +bool BloomFilterVal::Typify(BroType* arg_type) + { + if ( type ) + return false; + + type = 
arg_type; + type->Ref(); + + TypeList* tl = new TypeList(type); + tl->Append(type); + hash = new CompositeHash(tl); + Unref(tl); + + return true; + } BroType* BloomFilterVal::Type() const - { - return type_; - } + { + return type; + } void BloomFilterVal::Add(const Val* val) - { - HashKey* key = hash_->ComputeHash(val, 1); - bloom_filter_->Add(key->Hash()); - } + { + HashKey* key = hash->ComputeHash(val, 1); + bloom_filter->Add(key->Hash()); + delete key; + } size_t BloomFilterVal::Count(const Val* val) const - { - HashKey* key = hash_->ComputeHash(val, 1); - return bloom_filter_->Count(key->Hash()); - } + { + HashKey* key = hash->ComputeHash(val, 1); + size_t cnt = bloom_filter->Count(key->Hash()); + delete key; + return cnt; + } BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) - { - if ( x->Type() != y->Type() ) - { - reporter->InternalError("cannot merge Bloom filters with different types"); - return NULL; - } + { + if ( ! same_type(x->Type(), y->Type()) ) + reporter->InternalError("cannot merge Bloom filters with different types"); - BloomFilterVal* result; - if ( (result = DoMerge(x, y)) ) - return result; - else if ( (result = DoMerge(x, y)) ) - return result; + BloomFilterVal* result; - reporter->InternalError("failed to merge Bloom filters"); - return NULL; - } + if ( (result = DoMerge(x, y)) ) + return result; + + else if ( (result = DoMerge(x, y)) ) + return result; + + reporter->InternalError("failed to merge Bloom filters"); + return 0; + } BloomFilterVal::~BloomFilterVal() - { - if ( type_ ) - Unref(type_); - if ( hash_ ) - delete hash_; - if ( bloom_filter_ ) - delete bloom_filter_; - } + { + Unref(type); + delete hash; + delete bloom_filter; + } IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); @@ -606,14 +609,16 @@ bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - bool is_typed = type_ != NULL; - if ( ! 
SERIALIZE(is_typed) ) - return false; - if ( is_typed && ! type_->Serialize(info) ) - return false; + bool is_typed = (type != 0); - return bloom_filter_->Serialize(info); - } + if ( ! SERIALIZE(is_typed) ) + return false; + + if ( is_typed && ! type->Serialize(info) ) + return false; + + return bloom_filter->Serialize(info); + } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { @@ -621,15 +626,17 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) bool is_typed; if ( ! UNSERIALIZE(&is_typed) ) - return false; - if ( is_typed ) - { - BroType* type = BroType::Unserialize(info); - if ( ! Typify(type) ) - return false; - Unref(type); - } + return false; - bloom_filter_ = probabilistic::BloomFilter::Unserialize(info); - return bloom_filter_ != NULL; - } + if ( is_typed ) + { + BroType* type = BroType::Unserialize(info); + if ( ! Typify(type) ) + return false; + + Unref(type); + } + + bloom_filter = probabilistic::BloomFilter::Unserialize(info); + return bloom_filter != 0; + } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 5ccf73e11f..ea704cb70a 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -116,21 +116,19 @@ private: }; class BloomFilterVal : public OpaqueVal { - BloomFilterVal(const BloomFilterVal&); - BloomFilterVal& operator=(const BloomFilterVal&); public: - static BloomFilterVal* Merge(const BloomFilterVal* x, - const BloomFilterVal* y); - explicit BloomFilterVal(probabilistic::BloomFilter* bf); - ~BloomFilterVal(); + virtual ~BloomFilterVal(); - bool Typify(BroType* type); BroType* Type() const; + bool Typify(BroType* type); void Add(const Val* val); size_t Count(const Val* val) const; + static BloomFilterVal* Merge(const BloomFilterVal* x, + const BloomFilterVal* y); + protected: friend class Val; BloomFilterVal(); @@ -139,32 +137,35 @@ protected: DECLARE_SERIAL(BloomFilterVal); private: - template - static BloomFilterVal* DoMerge(const BloomFilterVal* x, - const BloomFilterVal* y) - { - if ( typeid(*x->bloom_filter_) != 
typeid(*y->bloom_filter_) ) - { - reporter->InternalError("cannot merge different Bloom filter types"); - return NULL; - } - if ( typeid(T) != typeid(*x->bloom_filter_) ) - return NULL; - const T* a = static_cast(x->bloom_filter_); - const T* b = static_cast(y->bloom_filter_); - BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); - assert(merged); - if ( ! merged->Typify(x->Type()) ) - { - reporter->InternalError("failed to set type on merged Bloom filter"); - return NULL; - } - return merged; - } + // Disable. + BloomFilterVal(const BloomFilterVal&); + BloomFilterVal& operator=(const BloomFilterVal&); - BroType* type_; - CompositeHash* hash_; - probabilistic::BloomFilter* bloom_filter_; -}; + template + static BloomFilterVal* DoMerge(const BloomFilterVal* x, + const BloomFilterVal* y) + { + if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) + reporter->InternalError("cannot merge different Bloom filter types"); + + if ( typeid(T) != typeid(*x->bloom_filter) ) + return 0; + + const T* a = static_cast(x->bloom_filter); + const T* b = static_cast(y->bloom_filter); + + BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); + assert(merged); + + if ( ! merged->Typify(x->Type()) ) + reporter->InternalError("failed to set type on merged Bloom filter"); + + return merged; + } + + BroType* type; + CompositeHash* hash; + probabilistic::BloomFilter* bloom_filter; + }; #endif diff --git a/src/Type.cc b/src/Type.cc index 57d9d0e6e5..563bc5afbd 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -1321,6 +1321,7 @@ bool OpaqueType::DoUnserialize(UnserialInfo* info) const char* n; if ( ! UNSERIALIZE_STR(&n, 0) ) return false; + name = n; delete [] n; diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 67714fe7d0..98f008b24b 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ #include "BitVector.h" #include @@ -8,505 +10,558 @@ using namespace probabilistic; BitVector::size_type BitVector::npos = static_cast(-1); BitVector::block_type BitVector::bits_per_block = - std::numeric_limits::digits; + std::numeric_limits::digits; namespace { uint8_t count_table[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, - 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, - 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, - 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, - 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, - 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, - 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, - 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, - 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, - 6, 7, 6, 7, 7, 8 + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, + 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, + 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, + 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, + 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, + 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, + 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, + 6, 7, 6, 7, 7, 8 }; } // namespace BitVector::Reference::Reference(block_type& block, block_type i) - : block_(block), - mask_(block_type(1) << i) 
- { - assert(i < bits_per_block); - } + : block(block), mask((block_type(1) << i)) + { + assert(i < bits_per_block); + } BitVector::Reference& BitVector::Reference::Flip() - { - block_ ^= mask_; - return *this; - } + { + block ^= mask; + return *this; + } BitVector::Reference::operator bool() const - { - return (block_ & mask_) != 0; - } + { + return (block & mask) != 0; + } bool BitVector::Reference::operator~() const - { - return (block_ & mask_) == 0; - } + { + return (block & mask) == 0; + } BitVector::Reference& BitVector::Reference::operator=(bool x) - { - x ? block_ |= mask_ : block_ &= ~mask_; - return *this; - } + { + if ( x ) + block |= mask; + else + block &= ~mask; -BitVector::Reference& BitVector::Reference::operator=(Reference const& other) - { - other ? block_ |= mask_ : block_ &= ~mask_; - return *this; - } + return *this; + } + +BitVector::Reference& BitVector::Reference::operator=(const Reference& other) + { + if ( other ) + block |= mask; + else + block &= ~mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator|=(bool x) - { - if (x) - block_ |= mask_; - return *this; - } + { + if ( x ) + block |= mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator&=(bool x) - { - if (! x) - block_ &= ~mask_; - return *this; - } + { + if ( ! x ) + block &= ~mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator^=(bool x) - { - if (x) - block_ ^= mask_; - return *this; - } + { + if ( x ) + block ^= mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator-=(bool x) - { - if (x) - block_ &= ~mask_; - return *this; - } + { + if ( x ) + block &= ~mask; + return *this; + } -BitVector::BitVector() : num_bits_(0) { } +BitVector::BitVector() + { + num_bits = 0; + } BitVector::BitVector(size_type size, bool value) - : bits_(bits_to_blocks(size), value ? ~block_type(0) : 0), - num_bits_(size) -{ } + : bits(bits_to_blocks(size), value ? 
~block_type(0) : 0) + { + num_bits = size; + } BitVector::BitVector(BitVector const& other) - : bits_(other.bits_), - num_bits_(other.num_bits_) -{ } + : bits(other.bits) + { + num_bits = other.num_bits; + } BitVector BitVector::operator~() const - { - BitVector b(*this); - b.Flip(); - return b; - } + { + BitVector b(*this); + b.Flip(); + return b; + } BitVector& BitVector::operator=(BitVector const& other) - { - bits_ = other.bits_; - return *this; - } + { + bits = other.bits; + return *this; + } BitVector BitVector::operator<<(size_type n) const - { - BitVector b(*this); - return b <<= n; - } + { + BitVector b(*this); + return b <<= n; + } BitVector BitVector::operator>>(size_type n) const - { - BitVector b(*this); - return b >>= n; - } + { + BitVector b(*this); + return b >>= n; + } BitVector& BitVector::operator<<=(size_type n) - { - if (n >= num_bits_) - return Reset(); + { + if ( n >= num_bits ) + return Reset(); - if (n > 0) - { - size_type last = Blocks() - 1; - size_type div = n / bits_per_block; - block_type r = bit_index(n); - block_type* b = &bits_[0]; - assert(Blocks() >= 1); - assert(div <= last); + if ( n > 0 ) + { + size_type last = Blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits[0]; - if (r != 0) - { - for (size_type i = last - div; i > 0; --i) - b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); - b[div] = b[0] << r; - } - else - { - for (size_type i = last-div; i > 0; --i) - b[i + div] = b[i]; - b[div] = b[0]; - } + assert(Blocks() >= 1); + assert(div <= last); - std::fill_n(b, div, block_type(0)); - zero_unused_bits(); - } + if ( r != 0 ) + { + for ( size_type i = last - div; i > 0; --i ) + b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); - return *this; - } + b[div] = b[0] << r; + } + + else + { + for (size_type i = last-div; i > 0; --i) + b[i + div] = b[i]; + + b[div] = b[0]; + } + + std::fill_n(b, div, block_type(0)); + zero_unused_bits(); + } + + return 
*this; + } BitVector& BitVector::operator>>=(size_type n) - { - if (n >= num_bits_) - return Reset(); + { + if ( n >= num_bits ) + return Reset(); - if (n > 0) - { - size_type last = Blocks() - 1; - size_type div = n / bits_per_block; - block_type r = bit_index(n); - block_type* b = &bits_[0]; - assert(Blocks() >= 1); - assert(div <= last); + if ( n > 0 ) + { + size_type last = Blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits[0]; - if (r != 0) - { - for (size_type i = last - div; i > 0; --i) - b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); - b[last - div] = b[last] >> r; - } - else - { - for (size_type i = div; i <= last; ++i) - b[i-div] = b[i]; - } + assert(Blocks() >= 1); + assert(div <= last); - std::fill_n(b + (Blocks() - div), div, block_type(0)); - } - return *this; - } + if ( r != 0 ) + { + for (size_type i = last - div; i > 0; --i) + b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); + + b[last - div] = b[last] >> r; + } + + else + { + for (size_type i = div; i <= last; ++i) + b[i-div] = b[i]; + } + + std::fill_n(b + (Blocks() - div), div, block_type(0)); + } + + return *this; + } BitVector& BitVector::operator&=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] &= other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] &= other.bits[i]; + + return *this; + } BitVector& BitVector::operator|=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] |= other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] |= other.bits[i]; + + return *this; + } BitVector& BitVector::operator^=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] ^= 
other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] ^= other.bits[i]; + + return *this; + } BitVector& BitVector::operator-=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] &= ~other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] &= ~other.bits[i]; + + return *this; + } namespace probabilistic { BitVector operator&(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b &= y; - } + { + BitVector b(x); + return b &= y; + } BitVector operator|(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b |= y; - } + { + BitVector b(x); + return b |= y; + } BitVector operator^(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b ^= y; - } + { + BitVector b(x); + return b ^= y; + } BitVector operator-(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b -= y; - } + { + BitVector b(x); + return b -= y; + } bool operator==(BitVector const& x, BitVector const& y) - { - return x.num_bits_ == y.num_bits_ && x.bits_ == y.bits_; - } + { + return x.num_bits == y.num_bits && x.bits == y.bits; + } bool operator!=(BitVector const& x, BitVector const& y) - { - return ! (x == y); - } + { + return ! 
(x == y); + } bool operator<(BitVector const& x, BitVector const& y) - { - assert(x.Size() == y.Size()); - for (BitVector::size_type r = x.Blocks(); r > 0; --r) - { - BitVector::size_type i = r - 1; - if (x.bits_[i] < y.bits_[i]) - return true; - else if (x.bits_[i] > y.bits_[i]) - return false; - } - return false; - } + { + assert(x.Size() == y.Size()); + + for ( BitVector::size_type r = x.Blocks(); r > 0; --r ) + { + BitVector::size_type i = r - 1; + + if ( x.bits[i] < y.bits[i] ) + return true; + + else if ( x.bits[i] > y.bits[i] ) + return false; + + } + + return false; + } } void BitVector::Resize(size_type n, bool value) - { - size_type old = Blocks(); - size_type required = bits_to_blocks(n); - block_type block_value = value ? ~block_type(0) : block_type(0); + { + size_type old = Blocks(); + size_type required = bits_to_blocks(n); + block_type block_value = value ? ~block_type(0) : block_type(0); - if (required != old) - bits_.resize(required, block_value); + if ( required != old ) + bits.resize(required, block_value); - if (value && (n > num_bits_) && extra_bits()) - bits_[old - 1] |= (block_value << extra_bits()); + if ( value && (n > num_bits) && extra_bits() ) + bits[old - 1] |= (block_value << extra_bits()); - num_bits_ = n; - zero_unused_bits(); - } + num_bits = n; + zero_unused_bits(); + } void BitVector::Clear() - { - bits_.clear(); - num_bits_ = 0; - } + { + bits.clear(); + num_bits = 0; + } void BitVector::PushBack(bool bit) - { - size_type s = Size(); - Resize(s + 1); - Set(s, bit); - } + { + size_type s = Size(); + Resize(s + 1); + Set(s, bit); + } void BitVector::Append(block_type block) - { - size_type excess = extra_bits(); - if (excess) - { - assert(! Empty()); - bits_.push_back(block >> (bits_per_block - excess)); - bits_[Blocks() - 2] |= (block << excess); - } - else - { - bits_.push_back(block); - } - num_bits_ += bits_per_block; - } + { + size_type excess = extra_bits(); + + if ( excess ) + { + assert(! 
Empty()); + bits.push_back(block >> (bits_per_block - excess)); + bits[Blocks() - 2] |= (block << excess); + } + + else + { + bits.push_back(block); + } + + num_bits += bits_per_block; + } BitVector& BitVector::Set(size_type i, bool bit) - { - assert(i < num_bits_); - if (bit) - bits_[block_index(i)] |= bit_mask(i); - else - Reset(i); - return *this; - } + { + assert(i < num_bits); + + if ( bit ) + bits[block_index(i)] |= bit_mask(i); + else + Reset(i); + + return *this; + } BitVector& BitVector::Set() - { - std::fill(bits_.begin(), bits_.end(), ~block_type(0)); - zero_unused_bits(); - return *this; - } + { + std::fill(bits.begin(), bits.end(), ~block_type(0)); + zero_unused_bits(); + return *this; + } BitVector& BitVector::Reset(size_type i) - { - assert(i < num_bits_); - bits_[block_index(i)] &= ~bit_mask(i); - return *this; - } + { + assert(i < num_bits); + bits[block_index(i)] &= ~bit_mask(i); + return *this; + } BitVector& BitVector::Reset() - { - std::fill(bits_.begin(), bits_.end(), block_type(0)); - return *this; - } + { + std::fill(bits.begin(), bits.end(), block_type(0)); + return *this; + } BitVector& BitVector::Flip(size_type i) - { - assert(i < num_bits_); - bits_[block_index(i)] ^= bit_mask(i); - return *this; - } + { + assert(i < num_bits); + bits[block_index(i)] ^= bit_mask(i); + return *this; + } BitVector& BitVector::Flip() - { - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] = ~bits_[i]; - zero_unused_bits(); - return *this; - } + { + for (size_type i = 0; i < Blocks(); ++i) + bits[i] = ~bits[i]; + + zero_unused_bits(); + return *this; + } bool BitVector::operator[](size_type i) const - { - assert(i < num_bits_); - return (bits_[block_index(i)] & bit_mask(i)) != 0; - } + { + assert(i < num_bits); + return (bits[block_index(i)] & bit_mask(i)) != 0; + } BitVector::Reference BitVector::operator[](size_type i) - { - assert(i < num_bits_); - return Reference(bits_[block_index(i)], bit_index(i)); - } + { + assert(i < num_bits); + return 
Reference(bits[block_index(i)], bit_index(i)); + } BitVector::size_type BitVector::Count() const - { - std::vector::const_iterator first = bits_.begin(); - size_t n = 0; - size_type length = Blocks(); - while (length) - { - block_type block = *first; - while (block) - { - // TODO: use __popcnt if available. - n += count_table[block & ((1u << 8) - 1)]; - block >>= 8; - } - ++first; - --length; - } - return n; - } + { + std::vector::const_iterator first = bits.begin(); + size_t n = 0; + size_type length = Blocks(); + + while ( length ) + { + block_type block = *first; + + while ( block ) + { + // TODO: use _popcnt if available. + n += count_table[block & ((1u << 8) - 1)]; + block >>= 8; + } + + ++first; + --length; + } + + return n; + } BitVector::size_type BitVector::Blocks() const - { - return bits_.size(); - } + { + return bits.size(); + } BitVector::size_type BitVector::Size() const - { - return num_bits_; - } + { + return num_bits; + } bool BitVector::Empty() const - { - return bits_.empty(); - } + { + return bits.empty(); + } BitVector::size_type BitVector::FindFirst() const - { - return find_from(0); - } + { + return find_from(0); + } BitVector::size_type BitVector::FindNext(size_type i) const - { - if (i >= (Size() - 1) || Size() == 0) - return npos; - ++i; - size_type bi = block_index(i); - block_type block = bits_[bi] & (~block_type(0) << bit_index(i)); - return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); - } + { + if ( i >= (Size() - 1) || Size() == 0 ) + return npos; + + ++i; + size_type bi = block_index(i); + block_type block = bits[bi] & (~block_type(0) << bit_index(i)); + return block ? 
bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); + } BitVector::size_type BitVector::lowest_bit(block_type block) - { - block_type x = block - (block & (block - 1)); - size_type log = 0; - while (x >>= 1) - ++log; - return log; - } + { + block_type x = block - (block & (block - 1)); + size_type log = 0; + + while (x >>= 1) + ++log; + + return log; + } BitVector::block_type BitVector::extra_bits() const - { - return bit_index(Size()); - } + { + return bit_index(Size()); + } void BitVector::zero_unused_bits() - { - if (extra_bits()) - bits_.back() &= ~(~block_type(0) << extra_bits()); - } + { + if ( extra_bits() ) + bits.back() &= ~(~block_type(0) << extra_bits()); + } BitVector::size_type BitVector::find_from(size_type i) const - { - while (i < Blocks() && bits_[i] == 0) - ++i; - if (i >= Blocks()) - return npos; - return i * bits_per_block + lowest_bit(bits_[i]); - } + { + while (i < Blocks() && bits[i] == 0) + ++i; + + if ( i >= Blocks() ) + return npos; + + return i * bits_per_block + lowest_bit(bits[i]); + } bool BitVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } + { + return SerialObj::Serialize(info); + } BitVector* BitVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_BITVECTOR)); - } + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_BITVECTOR)); + } IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR); bool BitVector::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_BITVECTOR, SerialObj); + { + DO_SERIALIZE(SER_BITVECTOR, SerialObj); - if ( ! SERIALIZE(static_cast(bits_.size())) ) - return false; + if ( ! SERIALIZE(static_cast(bits.size())) ) + return false; - for ( size_t i = 0; i < bits_.size(); ++i ) - if ( ! SERIALIZE(static_cast(bits_[i])) ) - return false; + for ( size_t i = 0; i < bits.size(); ++i ) + if ( ! 
SERIALIZE(static_cast(bits[i])) ) + return false; - return SERIALIZE(static_cast(num_bits_)); - } + return SERIALIZE(static_cast(num_bits)); + } bool BitVector::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(SerialObj); + { + DO_UNSERIALIZE(SerialObj); - uint64 size; - if ( ! UNSERIALIZE(&size) ) - return false; + uint64 size; + if ( ! UNSERIALIZE(&size) ) + return false; - bits_.resize(static_cast(size)); - uint64 block; - for ( size_t i = 0; i < bits_.size(); ++i ) - { - if ( ! UNSERIALIZE(&block) ) - return false; - bits_[i] = static_cast(block); - } + bits.resize(static_cast(size)); - uint64 num_bits; - if ( ! UNSERIALIZE(&num_bits) ) - return false; - num_bits_ = static_cast(num_bits); + for ( size_t i = 0; i < bits.size(); ++i ) + { + uint64 block; + if ( ! UNSERIALIZE(&block) ) + return false; - return true; - } + bits[i] = static_cast(block); + } + + uint64 num_bits; + if ( ! UNSERIALIZE(&num_bits) ) + return false; + + num_bits = static_cast(num_bits); + + return true; + } diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index 8832c24cbe..9eefe1b633 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -1,8 +1,11 @@ -#ifndef BitVector_h -#define BitVector_h +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_BITVECTOR_H +#define PROBABILISTIC_BITVECTOR_H #include #include + #include "SerialObj.h" namespace probabilistic { @@ -12,322 +15,348 @@ namespace probabilistic { */ class BitVector : public SerialObj { public: - typedef size_t block_type; - typedef size_t size_type; - static size_type npos; - static block_type bits_per_block; + typedef size_t block_type; + typedef size_t size_type; + typedef bool const_reference; -public: - /** - * An lvalue proxy for single bits. 
- */ - class Reference { - friend class BitVector; - Reference(block_type& block, block_type i); + static size_type npos; + static block_type bits_per_block; - public: - Reference& Flip(); - operator bool() const; - bool operator~() const; - Reference& operator=(bool x); - Reference& operator=(Reference const& other); - Reference& operator|=(bool x); - Reference& operator&=(bool x); - Reference& operator^=(bool x); - Reference& operator-=(bool x); + /** + * An lvalue proxy for individual bits. + */ + class Reference { + public: + /** + * Inverts the bits' values. + */ + Reference& Flip(); - private: - void operator&(); - block_type& block_; - block_type const mask_; - }; + operator bool() const; + bool operator~() const; + Reference& operator=(bool x); + Reference& operator=(const Reference& other); + Reference& operator|=(bool x); + Reference& operator&=(bool x); + Reference& operator^=(bool x); + Reference& operator-=(bool x); - typedef bool const_reference; + private: + friend class BitVector; - /** - * Default-constructs an empty bit vector. - */ - BitVector(); + Reference(block_type& block, block_type i); + void operator&(); - /** - * Constructs a bit vector of a given size. - * @param size The number of bits. - * @param value The value for each bit. - */ - explicit BitVector(size_type size, bool value = false); + block_type& block; + const block_type mask; + }; - /** - * Constructs a bit vector from a sequence of blocks. - */ - template - BitVector(InputIterator first, InputIterator last) - { - bits_.insert(bits_.end(), first, last); - num_bits_ = bits_.size() * bits_per_block; - } + /** + * Default-constructs an empty bit vector. + */ + BitVector(); - /** - * Copy-constructs a bit vector. - * @param other The bit vector to copy. - */ - BitVector(const BitVector& other); + /** + * Constructs a bit vector of a given size. + * @param size The number of bits. + * @param value The value for each bit. 
+ */ + explicit BitVector(size_type size, bool value = false); - /** - * Assigns another bit vector to this instance. - * @param other The RHS of the assignment. - */ - BitVector& operator=(const BitVector& other); + /** + * Constructs a bit vector from a sequence of blocks. + * + * @param first Start of range + * @param last End of range. + * + */ + template + BitVector(InputIterator first, InputIterator last) + { + bits.insert(bits.end(), first, last); + num_bits = bits.size() * bits_per_block; + } - // - // Bitwise operations - // - BitVector operator~() const; - BitVector operator<<(size_type n) const; - BitVector operator>>(size_type n) const; - BitVector& operator<<=(size_type n); - BitVector& operator>>=(size_type n); - BitVector& operator&=(BitVector const& other); - BitVector& operator|=(BitVector const& other); - BitVector& operator^=(BitVector const& other); - BitVector& operator-=(BitVector const& other); - friend BitVector operator&(BitVector const& x, BitVector const& y); - friend BitVector operator|(BitVector const& x, BitVector const& y); - friend BitVector operator^(BitVector const& x, BitVector const& y); - friend BitVector operator-(BitVector const& x, BitVector const& y); + /** + * Copy-constructs a bit vector. + * @param other The bit vector to copy. + */ + BitVector(const BitVector& other); - // - // Relational operators - // - friend bool operator==(BitVector const& x, BitVector const& y); - friend bool operator!=(BitVector const& x, BitVector const& y); - friend bool operator<(BitVector const& x, BitVector const& y); + /** + * Assigns another bit vector to this instance. + * @param other The RHS of the assignment. + */ + BitVector& operator=(const BitVector& other); - // - // Basic operations - // - /** Appends the bits in a sequence of values. - * @tparam Iterator A forward iterator. - * @param first An iterator pointing to the first element of the sequence. 
- * @param last An iterator pointing to one past the last element of the - * sequence. - */ - template - void Append(ForwardIterator first, ForwardIterator last) - { - if (first == last) - return; + // + // Bitwise operations. + // + BitVector operator~() const; + BitVector operator<<(size_type n) const; + BitVector operator>>(size_type n) const; + BitVector& operator<<=(size_type n); + BitVector& operator>>=(size_type n); + BitVector& operator&=(BitVector const& other); + BitVector& operator|=(BitVector const& other); + BitVector& operator^=(BitVector const& other); + BitVector& operator-=(BitVector const& other); + friend BitVector operator&(BitVector const& x, BitVector const& y); + friend BitVector operator|(BitVector const& x, BitVector const& y); + friend BitVector operator^(BitVector const& x, BitVector const& y); + friend BitVector operator-(BitVector const& x, BitVector const& y); - block_type excess = extra_bits(); - typename std::iterator_traits::difference_type delta = - std::distance(first, last); + // + // Relational operators + // + friend bool operator==(BitVector const& x, BitVector const& y); + friend bool operator!=(BitVector const& x, BitVector const& y); + friend bool operator<(BitVector const& x, BitVector const& y); - bits_.reserve(Blocks() + delta); - if (excess == 0) - { - bits_.back() |= (*first << excess); - do - { - block_type b = *first++ >> (bits_per_block - excess); - bits_.push_back(b | (first == last ? 0 : *first << excess)); - } while (first != last); - } - else - { - bits_.insert(bits_.end(), first, last); - } - num_bits_ += bits_per_block * delta; - } + // + // Basic operations + // - /** - * Appends the bits in a given block. - * @param block The block containing bits to append. - */ - void Append(block_type block); + /** Appends the bits in a sequence of values. + * @tparam Iterator A forward iterator. + * @param first An iterator pointing to the first element of the sequence. 
+ * @param last An iterator pointing to one past the last element of the + * sequence. + */ + template + void Append(ForwardIterator first, ForwardIterator last) + { + if ( first == last ) + return; - /** Appends a single bit to the end of the bit vector. - * @param bit The value of the bit. - */ - void PushBack(bool bit); + block_type excess = extra_bits(); + typename std::iterator_traits::difference_type delta = + std::distance(first, last); - /** - * Clears all bits in the bitvector. - */ - void Clear(); + bits.reserve(Blocks() + delta); - /** - * Resizes the bit vector to a new number of bits. - * @param n The new number of bits of the bit vector. - * @param value The bit value of new values, if the vector expands. - */ - void Resize(size_type n, bool value = false); + if ( excess == 0 ) + { + bits.back() |= (*first << excess); - /** - * Sets a bit at a specific position to a given value. - * @param i The bit position. - * @param bit The value assigned to position *i*. - * @return A reference to the bit vector instance. - */ - BitVector& Set(size_type i, bool bit = true); + do { + block_type b = *first++ >> (bits_per_block - excess); + bits.push_back(b | (first == last ? 0 : *first << excess)); + } while (first != last); - /** - * Sets all bits to 1. - * @return A reference to the bit vector instance. - */ - BitVector& Set(); + } - /** - * Resets a bit at a specific position, i.e., sets it to 0. - * @param i The bit position. - * @return A reference to the bit vector instance. - */ - BitVector& Reset(size_type i); + else + bits.insert(bits.end(), first, last); - /** - * Sets all bits to 0. - * @return A reference to the bit vector instance. - */ - BitVector& Reset(); + num_bits += bits_per_block * delta; + } - /** - * Toggles/flips a bit at a specific position. - * @param i The bit position. - * @return A reference to the bit vector instance. - */ - BitVector& Flip(size_type i); + /** + * Appends the bits in a given block. 
+ * @param block The block containing bits to append. + */ + void Append(block_type block); - /** - * Computes the complement. - * @return A reference to the bit vector instance. - */ - BitVector& Flip(); + /** Appends a single bit to the end of the bit vector. + * @param bit The value of the bit. + */ + void PushBack(bool bit); - /** Retrieves a single bit. - * @param i The bit position. - * @return A mutable reference to the bit at position *i*. - */ - Reference operator[](size_type i); + /** + * Clears all bits in the bitvector. + */ + void Clear(); - /** - * Retrieves a single bit. - * @param i The bit position. - * @return A const-reference to the bit at position *i*. - */ - const_reference operator[](size_type i) const; + /** + * Resizes the bit vector to a new number of bits. + * @param n The new number of bits of the bit vector. + * @param value The bit value of new values, if the vector expands. + */ + void Resize(size_type n, bool value = false); - /** - * Counts the number of 1-bits in the bit vector. Also known as *population - * count* or *Hamming weight*. - * @return The number of bits set to 1. - */ - size_type Count() const; + /** + * Sets a bit at a specific position to a given value. + * @param i The bit position. + * @param bit The value assigned to position *i*. + * @return A reference to the bit vector instance. + */ + BitVector& Set(size_type i, bool bit = true); - /** - * Retrieves the number of blocks of the underlying storage. - * @param The number of blocks that represent `Size()` bits. - */ - size_type Blocks() const; + /** + * Sets all bits to 1. + * @return A reference to the bit vector instance. + */ + BitVector& Set(); - /** - * Retrieves the number of bits the bitvector consist of. - * @return The length of the bit vector in bits. - */ - size_type Size() const; + /** + * Resets a bit at a specific position, i.e., sets it to 0. + * @param i The bit position. + * @return A reference to the bit vector instance. 
+ */ + BitVector& Reset(size_type i); - /** - * Checks whether the bit vector is empty. - * @return `true` iff the bitvector has zero length. - */ - bool Empty() const; + /** + * Sets all bits to 0. + * @return A reference to the bit vector instance. + */ + BitVector& Reset(); - /** - * Finds the bit position of of the first 1-bit. - * @return The position of the first bit that equals to one or `npos` if no - * such bit exists. - */ - size_type FindFirst() const; + /** + * Toggles/flips a bit at a specific position. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& Flip(size_type i); - /** - * Finds the next 1-bit from a given starting position. - * - * @param i The index where to start looking. - * - * @return The position of the first bit that equals to 1 after position - * *i* or `npos` if no such bit exists. - */ - size_type FindNext(size_type i) const; + /** + * Computes the complement. + * @return A reference to the bit vector instance. + */ + BitVector& Flip(); - bool Serialize(SerialInfo* info) const; - static BitVector* Unserialize(UnserialInfo* info); + /** Retrieves a single bit. + * @param i The bit position. + * @return A mutable reference to the bit at position *i*. + */ + Reference operator[](size_type i); + + /** + * Retrieves a single bit. + * @param i The bit position. + * @return A const-reference to the bit at position *i*. + */ + const_reference operator[](size_type i) const; + + /** + * Counts the number of 1-bits in the bit vector. Also known as *population + * count* or *Hamming weight*. + * @return The number of bits set to 1. + */ + size_type Count() const; + + /** + * Retrieves the number of blocks of the underlying storage. + * @param The number of blocks that represent `Size()` bits. + */ + size_type Blocks() const; + + /** + * Retrieves the number of bits the bitvector consist of. + * @return The length of the bit vector in bits. 
+ */ + size_type Size() const; + + /** + * Checks whether the bit vector is empty. + * @return `true` iff the bitvector has zero length. + */ + bool Empty() const; + + /** + * Finds the bit position of the first 1-bit. + * @return The position of the first bit that equals to one or `npos` if no + * such bit exists. + */ + size_type FindFirst() const; + + /** + * Finds the next 1-bit from a given starting position. + * + * @param i The index where to start looking. + * + * @return The position of the first bit that equals to 1 after position + * *i* or `npos` if no such bit exists. + */ + size_type FindNext(size_type i) const; + + /** + * Serializes the bit vector. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserialize the bit vector. + * + * @param info The serialization information to use. + * + * @return The unserialized bit vector, or null if an error occurred. + */ + static BitVector* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(BitVector); + DECLARE_SERIAL(BitVector); private: - /** - * Computes the block index for a given bit position. - */ - static size_type block_index(size_type i) - { - return i / bits_per_block; - } + /** + * Computes the number of excess/unused bits in the bit vector. + */ + block_type extra_bits() const; - /** - * Computes the bit index within a given block for a given bit position. - */ - static block_type bit_index(size_type i) - { - return i % bits_per_block; - } + /** + * If the number of bits in the vector is not a multiple of + * bitvector::bits_per_block, then the last block exhibits unused bits which + * this function resets. + */ + void zero_unused_bits(); - /** - * Computes the bitmask block to extract a bit a given bit position. - */ - static block_type bit_mask(size_type i) - { - return block_type(1) << bit_index(i); - } + /** + * Looks for the first 1-bit starting at a given position.
+ * @param i The block index to start looking. + * @return The block index of the first 1-bit starting from *i* or + * `bitvector::npos` if no 1-bit exists. + */ + size_type find_from(size_type i) const; - /** - * Computes the number of blocks needed to represent a given number of - * bits. - * @param bits the number of bits. - * @return The number of blocks to represent *bits* number of bits. - */ - static size_type bits_to_blocks(size_type bits) - { - return bits / bits_per_block - + static_cast(bits % bits_per_block != 0); - } + /** + * Computes the block index for a given bit position. + */ + static size_type block_index(size_type i) + { + return i / bits_per_block; + } - /** - * Computes the bit position first 1-bit in a given block. - * @param block The block to inspect. - * @return The bit position where *block* has its first bit set to 1. - */ - static size_type lowest_bit(block_type block); + /** + * Computes the bit index within a given block for a given bit position. + */ + static block_type bit_index(size_type i) + { + return i % bits_per_block; + } - /** - * Computes the number of excess/unused bits in the bit vector. - */ - block_type extra_bits() const; + /** + * Computes the bitmask block to extract a bit a given bit position. + */ + static block_type bit_mask(size_type i) + { + return block_type(1) << bit_index(i); + } - /** - * If the number of bits in the vector are not not a multiple of - * bitvector::bits_per_block, then the last block exhibits unused bits which - * this function resets. - */ - void zero_unused_bits(); + /** + * Computes the number of blocks needed to represent a given number of + * bits. + * @param bits the number of bits. + * @return The number of blocks to represent *bits* number of bits. + */ + static size_type bits_to_blocks(size_type bits) + { + return bits / bits_per_block + + static_cast(bits % bits_per_block != 0); + } - /** - * Looks for the first 1-bit starting at a given position. 
- * @param i The block index to start looking. - * @return The block index of the first 1-bit starting from *i* or - * `bitvector::npos` if no 1-bit exists. - */ - size_type find_from(size_type i) const; + /** + * Computes the bit position first 1-bit in a given block. + * @param block The block to inspect. + * @return The bit position where *block* has its first bit set to 1. + */ + static size_type lowest_bit(block_type block); - std::vector bits_; - size_type num_bits_; + std::vector bits; + size_type num_bits; }; } diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 1b86ea1441..5613dcce05 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. + #include "BloomFilter.h" #include @@ -8,181 +10,184 @@ using namespace probabilistic; BloomFilter::BloomFilter() - : hasher_(NULL) - { - } + { + hasher = 0; + } -BloomFilter::BloomFilter(const Hasher* hasher) - : hasher_(hasher) - { - } +BloomFilter::BloomFilter(const Hasher* arg_hasher) + { + hasher = arg_hasher; + } BloomFilter::~BloomFilter() - { - if ( hasher_ ) - delete hasher_; - } + { + delete hasher; + } bool BloomFilter::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } + { + return SerialObj::Serialize(info); + } BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_BLOOMFILTER)); - } + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_BLOOMFILTER)); + } bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hasher_->K())) ) - return false; - return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size()); - } + + if ( ! 
SERIALIZE(static_cast(hasher->K())) ) + return false; + + return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size()); + } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); + uint16 k; if ( ! UNSERIALIZE(&k) ) - return false; - const char* name; - if ( ! UNSERIALIZE_STR(&name, 0) ) - return false; - hasher_ = Hasher::Create(k, name); + return false; + + const char* name; + if ( ! UNSERIALIZE_STR(&name, 0) ) + return false; + + hasher = Hasher::Create(k, name); + delete [] name; return true; - } - + } size_t BasicBloomFilter::M(double fp, size_t capacity) - { - double ln2 = std::log(2); - return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); - } + { + double ln2 = std::log(2); + return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); + } size_t BasicBloomFilter::K(size_t cells, size_t capacity) - { - double frac = static_cast(cells) / static_cast(capacity); - return std::ceil(frac * std::log(2)); - } + { + double frac = static_cast(cells) / static_cast(capacity); + return std::ceil(frac * std::log(2)); + } BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) - { - if ( ! x->hasher_->Equals(y->hasher_) ) - { - reporter->InternalError("incompatible hashers during Bloom filter merge"); - return NULL; - } - BasicBloomFilter* result = new BasicBloomFilter(); - result->hasher_ = x->hasher_->Clone(); - result->bits_ = new BitVector(*x->bits_ | *y->bits_); - return result; - } + { + if ( ! 
x->hasher->Equals(y->hasher) ) + reporter->InternalError("incompatible hashers during BasicBloomFilter merge"); + + BasicBloomFilter* result = new BasicBloomFilter(); + result->hasher = x->hasher->Clone(); + result->bits = new BitVector(*x->bits | *y->bits); + + return result; + } BasicBloomFilter::BasicBloomFilter() - : bits_(NULL) - { - } + { + bits = 0; + } BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) - : BloomFilter(hasher), - bits_(new BitVector(cells)) - { - } + : BloomFilter(hasher) + { + bits = new BitVector(cells); + } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return bits_->Serialize(info); - } + return bits->Serialize(info); + } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - bits_ = BitVector::Unserialize(info); - return bits_ != NULL; - } + bits = BitVector::Unserialize(info); + return (bits != 0); + } void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - bits_->Set(h[i] % bits_->Size()); - } + { + for ( size_t i = 0; i < h.size(); ++i ) + bits->Set(h[i] % bits->Size()); + } size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const - { - for ( size_t i = 0; i < h.size(); ++i ) - if ( ! (*bits_)[h[i] % bits_->Size()] ) - return 0; - return 1; - } + { + for ( size_t i = 0; i < h.size(); ++i ) + { + if ( ! (*bits)[h[i] % bits->Size()] ) + return 0; + } + return 1; + } CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y) - { - if ( ! 
x->hasher_->Equals(y->hasher_) ) - { - reporter->InternalError("incompatible hashers during Bloom filter merge"); - return NULL; - } - CountingBloomFilter* result = new CountingBloomFilter(); - result->hasher_ = x->hasher_->Clone(); - result->cells_ = new CounterVector(*x->cells_ | *y->cells_); - return result; - } + const CountingBloomFilter* y) + { + if ( ! x->hasher->Equals(y->hasher) ) + reporter->InternalError("incompatible hashers during CountingBloomFilter merge"); + + CountingBloomFilter* result = new CountingBloomFilter(); + result->hasher = x->hasher->Clone(); + result->cells = new CounterVector(*x->cells | *y->cells); + + return result; + } CountingBloomFilter::CountingBloomFilter() - : cells_(NULL) - { - } + { + cells = 0; + } CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, - size_t cells, size_t width) - : BloomFilter(hasher), - cells_(new CounterVector(width, cells)) - { - } - + size_t arg_cells, size_t width) + : BloomFilter(hasher) + { + cells = new CounterVector(width, arg_cells); + } IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); - return cells_->Serialize(info); - } + return cells->Serialize(info); + } bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - cells_ = CounterVector::Unserialize(info); - return cells_ != NULL; - } + cells = CounterVector::Unserialize(info); + return (cells != 0); + } // TODO: Use partitioning in add/count to allow for reusing CMS bounds. 
- void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % cells_->Size()); - } + { + for ( size_t i = 0; i < h.size(); ++i ) + cells->Increment(h[i] % cells->Size()); + } size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const - { - CounterVector::size_type min = - std::numeric_limits::max(); - for ( size_t i = 0; i < h.size(); ++i ) - { - CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); - if ( cnt < min ) - min = cnt; - } - return min; - } + { + CounterVector::size_type min = + std::numeric_limits::max(); + + for ( size_t i = 0; i < h.size(); ++i ) + { + CounterVector::size_type cnt = cells->Count(h[i] % cells->Size()); + if ( cnt < min ) + min = cnt; + } + + return min; + } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 2fa849505d..4a6b01c484 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -1,5 +1,7 @@ -#ifndef BloomFilter_h -#define BloomFilter_h +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_BLOOMFILTER_H +#define PROBABILISTIC_BLOOMFILTER_H #include #include "BitVector.h" @@ -11,42 +13,65 @@ class CounterVector; /** * The abstract base class for Bloom filters. + * + * At this point we won't let the user choose the hasher, but we might open + * up the interface in the future. */ class BloomFilter : public SerialObj { public: - // At this point we won't let the user choose the hasher, but we might - // open up the interface in the future. - virtual ~BloomFilter(); + /** + * Destructor. + */ + virtual ~BloomFilter(); - /** - * Adds an element of type T to the Bloom filter. - * @param x The element to add - */ - template - void Add(const T& x) - { - AddImpl((*hasher_)(x)); - } + /** + * Adds an element of type T to the Bloom filter. 
+ * @param x The element to add + */ + template + void Add(const T& x) + { + AddImpl((*hasher)(x)); + } - /** - * Retrieves the associated count of a given value. - * - * @param x The value of type `T` to check. - * - * @return The counter associated with *x*. - */ - template - size_t Count(const T& x) const - { - return CountImpl((*hasher_)(x)); - } + /** + * Retrieves the associated count of a given value. + * + * @param x The value of type `T` to check. + * + * @return The counter associated with *x*. + */ + template + size_t Count(const T& x) const + { + return CountImpl((*hasher)(x)); + } - bool Serialize(SerialInfo* info) const; - static BloomFilter* Unserialize(UnserialInfo* info); + /** + * Serializes the Bloom filter. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserializes a Bloom filter. + * + * @param info The serialization information to use. + * + * @return The unserialized Bloom filter, or null if an error + * occurred. + */ + static BloomFilter* Unserialize(UnserialInfo* info); protected: - DECLARE_ABSTRACT_SERIAL(BloomFilter); + DECLARE_ABSTRACT_SERIAL(BloomFilter); + /** + * Default constructor. + */ BloomFilter(); /** @@ -54,12 +79,28 @@ protected: * * @param hasher The hasher to use for this Bloom filter. */ - BloomFilter(const Hasher* hasher); + BloomFilter(const Hasher* hasher); - virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; + /** + * Abstract method for implementing the *Add* operation. + * + * @param hashes A set of *k* hashes for the item to add, computed by + * the internal hasher object. + * + */ + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - const Hasher* hasher_; + /** + * Abstract method for implementing the *Count* operation.
+ * + * @param hashes A set of *k* hashes for the item to add, computed by + * the internal hasher object. + * + * @return Returns the counter associated with the hashed element. + */ + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; + + const Hasher* hasher; }; /** @@ -67,50 +108,67 @@ protected: */ class BasicBloomFilter : public BloomFilter { public: - /** - * Computes the number of cells based a given false-positive rate and - * capacity. In the literature, this parameter often has the name *M*. - * - * @param fp The false-positive rate. - * - * @param capacity The number of exepected elements. - * - * Returns: The number cells needed to support a false-positive rate of *fp* - * with at most *capacity* elements. - */ - static size_t M(double fp, size_t capacity); + /** + * Constructs a basic Bloom filter with a given number of cells. The + * ideal number of cells can be computed with *M*. + * + * @param hasher The hasher to use. The ideal number of hash + * functions can be computed with *K*. + * + * @param cells The number of cells. + */ + BasicBloomFilter(const Hasher* hasher, size_t cells); - /** - * Computes the optimal number of hash functions based on the number cells - * and expected number of elements. - * - * @param cells The number of cells (*m*). - * - * @param capacity The maximum number of elements. - * - * Returns: the optimal number of hash functions for a false-positive rate of - * *fp* for at most *capacity* elements. - */ - static size_t K(size_t cells, size_t capacity); + /** + * Computes the number of cells based on a given false positive rate + * and capacity. In the literature, this parameter often has the name + * *M*. + * + * @param fp The false positive rate. + * + * @param capacity The expected number of elements that will be + * stored. + * + * Returns: The number cells needed to support a false positive rate + * of *fp* with at most *capacity* elements. 
+ */ + static size_t M(double fp, size_t capacity); - static BasicBloomFilter* Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y); + /** + * Computes the optimal number of hash functions based on the number cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * Returns: the optimal number of hash functions for a false-positive + * rate of *fp* for at most *capacity* elements. + */ + static size_t K(size_t cells, size_t capacity); - /** - * Constructs a basic Bloom filter with a given number of cells and capacity. - */ - BasicBloomFilter(const Hasher* hasher, size_t cells); + /** + * Merges two basic Bloom filters. + * + * @return The merged Bloom filter. + */ + static BasicBloomFilter* Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y); protected: - DECLARE_SERIAL(BasicBloomFilter); + DECLARE_SERIAL(BasicBloomFilter); - BasicBloomFilter(); + /** + * Default constructor. + */ + BasicBloomFilter(); - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + // Overridden from BloomFilter. + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: - BitVector* bits_; + BitVector* bits; }; /** @@ -118,21 +176,40 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - static CountingBloomFilter* Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y); + /** + * Constructs a counting Bloom filter. + * + * @param hasher The hasher to use. The ideal number of hash + * functions can be computed with *K*. + * + * @param cells The number of cells to use. + * + * @param width The maximal bit-width of counter values. 
+ */ + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); - CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); + /** + * Merges two counting Bloom filters. + * + * @return The merged Bloom filter. + */ + static CountingBloomFilter* Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y); protected: - DECLARE_SERIAL(CountingBloomFilter); + DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter(); + /** + * Default constructor. + */ + CountingBloomFilter(); - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + // Overridden from BloomFilter. + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: - CounterVector* cells_; + CounterVector* cells; }; } diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 943749ad46..570ed1f8ea 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ #include "CounterVector.h" #include @@ -6,154 +8,176 @@ using namespace probabilistic; -CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), - width_(width) - { - } +CounterVector::CounterVector(size_t arg_width, size_t cells) + { + bits = new BitVector(arg_width * cells); + width = arg_width; + } CounterVector::CounterVector(const CounterVector& other) - : bits_(new BitVector(*other.bits_)), - width_(other.width_) - { - } + { + bits = new BitVector(*other.bits); + width = other.width; + } CounterVector::~CounterVector() - { - delete bits_; - } + { + delete bits; + } bool CounterVector::Increment(size_type cell, count_type value) - { - assert(cell < Size()); - assert(value != 0); - size_t lsb = cell * width_; - bool carry = false; - for ( size_t i = 0; i < width_; ++i ) - { - bool b1 = (*bits_)[lsb + i]; - bool b2 = value & (1 << i); - (*bits_)[lsb + i] = b1 ^ b2 ^ carry; - carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); - } - if ( carry ) - for ( size_t i = 0; i < width_; ++i ) - bits_->Set(lsb + i); - return ! carry; - } + { + assert(cell < Size()); + assert(value != 0); + + size_t lsb = cell * width; + bool carry = false; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = value & (1 << i); + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + if ( carry ) + { + for ( size_t i = 0; i < width; ++i ) + bits->Set(lsb + i); + } + + return ! 
carry; + } bool CounterVector::Decrement(size_type cell, count_type value) - { - assert(cell < Size()); - assert(value != 0); - value = ~value + 1; // A - B := A + ~B + 1 - bool carry = false; - size_t lsb = cell * width_; - for ( size_t i = 0; i < width_; ++i ) - { - bool b1 = (*bits_)[lsb + i]; - bool b2 = value & (1 << i); - (*bits_)[lsb + i] = b1 ^ b2 ^ carry; - carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); - } - return carry; - } + { + assert(cell < Size()); + assert(value != 0); + + value = ~value + 1; // A - B := A + ~B + 1 + bool carry = false; + size_t lsb = cell * width; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = value & (1 << i); + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + return carry; + } CounterVector::count_type CounterVector::Count(size_type cell) const - { - assert(cell < Size()); - size_t cnt = 0, order = 1; - size_t lsb = cell * width_; - for (size_t i = lsb; i < lsb + width_; ++i, order <<= 1) - if ((*bits_)[i]) - cnt |= order; - return cnt; - } + { + assert(cell < Size()); + + size_t cnt = 0, order = 1; + size_t lsb = cell * width; + + for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 ) + if ( (*bits)[i] ) + cnt |= order; + + return cnt; + } CounterVector::size_type CounterVector::Size() const - { - return bits_->Size() / width_; - } + { + return bits->Size() / width; + } size_t CounterVector::Width() const - { - return width_; - } + { + return width; + } size_t CounterVector::Max() const - { - return std::numeric_limits::max() - >> (std::numeric_limits::digits - width_); - } + { + return std::numeric_limits::max() + >> (std::numeric_limits::digits - width); + } CounterVector& CounterVector::Merge(const CounterVector& other) - { - assert(Size() == other.Size()); - assert(Width() == other.Width()); - for ( size_t cell = 0; cell < Size(); ++cell ) - { - size_t lsb = cell * width_; - bool carry = false; - for ( size_t i = 0; i < width_; 
++i ) - { - bool b1 = (*bits_)[lsb + i]; - bool b2 = (*other.bits_)[lsb + i]; - (*bits_)[lsb + i] = b1 ^ b2 ^ carry; - carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); - } - if ( carry ) - for ( size_t i = 0; i < width_; ++i ) - bits_->Set(lsb + i); - } - return *this; - } + { + assert(Size() == other.Size()); + assert(Width() == other.Width()); + + for ( size_t cell = 0; cell < Size(); ++cell ) + { + size_t lsb = cell * width; + bool carry = false; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = (*other.bits)[lsb + i]; + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + if ( carry ) + { + for ( size_t i = 0; i < width; ++i ) + bits->Set(lsb + i); + } + } + + return *this; + } namespace probabilistic { CounterVector& CounterVector::operator|=(const CounterVector& other) -{ - return Merge(other); -} + { + return Merge(other); + } CounterVector operator|(const CounterVector& x, const CounterVector& y) -{ - CounterVector cv(x); - return cv |= y; -} + { + CounterVector cv(x); + return cv |= y; + } } bool CounterVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } + { + return SerialObj::Serialize(info); + } CounterVector* CounterVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_COUNTERVECTOR)); - } + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! bits_->Serialize(info) ) - return false; - return SERIALIZE(static_cast(width_)); - } + + if ( ! bits->Serialize(info) ) + return false; + + return SERIALIZE(static_cast(width)); + } bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - bits_ = BitVector::Unserialize(info); - if ( ! 
bits_ ) - return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) - return false; - width_ = static_cast(width); - return true; - } + bits = BitVector::Unserialize(info); + if ( ! bits ) + return false; + + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + + width = static_cast(width); + + return true; + } diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 63445ec12d..178a68e8f2 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -1,5 +1,7 @@ -#ifndef CounterVector_h -#define CounterVector_h +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_COUNTERVECTOR_H +#define PROBABILISTIC_COUNTERVECTOR_H #include "SerialObj.h" @@ -8,123 +10,143 @@ namespace probabilistic { class BitVector; /** - * A vector of counters, each of which have a fixed number of bits. + * A vector of counters, each of which has a fixed number of bits. */ class CounterVector : public SerialObj { - CounterVector& operator=(const CounterVector&); public: - typedef size_t size_type; - typedef uint64 count_type; + typedef size_t size_type; + typedef uint64 count_type; - /** - * Constructs a counter vector having cells of a given width. - * - * @param width The number of bits that each cell occupies. - * - * @param cells The number of cells in the bitvector. - * - * @pre `cells > 0 && width > 0` - */ - CounterVector(size_t width, size_t cells = 1024); + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. + * + * @param cells The number of cells in the bitvector. + * + * @pre `cells > 0 && width > 0` + */ + CounterVector(size_t width, size_t cells = 1024); /** * Copy-constructs a counter vector. * * @param other The counter vector to copy. */ - CounterVector(const CounterVector& other); + CounterVector(const CounterVector& other); - ~CounterVector(); + /** + * Destructor. 
+ */ + ~CounterVector(); - /** - * Increments a given cell. - * - * @param cell The cell to increment. - * - * @param value The value to add to the current counter in *cell*. - * - * @return `true` if adding *value* to the counter in *cell* succeeded. - * - * @pre `cell < Size()` - */ - bool Increment(size_type cell, count_type value = 1); + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + * + * @pre `cell < Size()` + */ + bool Increment(size_type cell, count_type value = 1); - /** - * Decrements a given cell. - * - * @param cell The cell to decrement. - * - * @param value The value to subtract from the current counter in *cell*. - * - * @return `true` if subtracting *value* from the counter in *cell* succeeded. - * - * @pre `cell < Size()` - */ - bool Decrement(size_type cell, count_type value = 1); + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + * + * @pre `cell < Size()` + */ + bool Decrement(size_type cell, count_type value = 1); - /** - * Retrieves the counter of a given cell. - * - * @param cell The cell index to retrieve the count for. - * - * @return The counter associated with *cell*. - * - * @pre `cell < Size()` - */ - count_type Count(size_type cell) const; + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + * + * @pre `cell < Size()` + */ + count_type Count(size_type cell) const; - /** - * Retrieves the number of cells in the storage. - * - * @return The number of cells. - */ - size_type Size() const; + /** + * Retrieves the number of cells in the storage. 
+ * + * @return The number of cells. + */ + size_type Size() const; - /** - * Retrieves the counter width. - * - * @return The number of bits per counter. - */ - size_t Width() const; + /** + * Retrieves the counter width. + * + * @return The number of bits per counter. + */ + size_t Width() const; - /** - * Computes the maximum counter value. - * - * @return The maximum counter value based on the width. - */ - size_t Max() const; + /** + * Computes the maximum counter value. + * + * @return The maximum counter value based on the width. + */ + size_t Max() const; - /** - * Merges another counter vector into this instance by *adding* the counters - * of each cells. - * - * @param other The counter vector to merge into this instance. - * - * @return A reference to `*this`. - * - * @pre `Size() == other.Size() && Width() == other.Width()` - */ - CounterVector& Merge(const CounterVector& other); + /** + * Merges another counter vector into this instance by *adding* the + * counters of each cell. + * + * @param other The counter vector to merge into this instance. + * + * @return A reference to `*this`. + * + * @pre `Size() == other.Size() && Width() == other.Width()` + */ + CounterVector& Merge(const CounterVector& other); - /** - * An alias for ::Merge. - */ - CounterVector& operator|=(const CounterVector& other); + /** + * An alias for ::Merge. + */ + CounterVector& operator|=(const CounterVector& other); - friend CounterVector operator|(const CounterVector& x, - const CounterVector& y); + /** + * Serializes the counter vector. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; - bool Serialize(SerialInfo* info) const; - static CounterVector* Unserialize(UnserialInfo* info); + /** + * Unserialize the counter vector. + * + * @param info The serialization information to use. + * + * @return The unserialized counter vector, or null if an error + * occurred.
+ */ + static CounterVector* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(CounterVector); + friend CounterVector operator|(const CounterVector& x, + const CounterVector& y); - CounterVector() { } + CounterVector() { } + + DECLARE_SERIAL(CounterVector); private: - BitVector* bits_; - size_t width_; + CounterVector& operator=(const CounterVector&); // Disable. + + BitVector* bits; + size_t width; }; } diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index c2f1110ecd..f9ce7bdd6b 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -1,66 +1,70 @@ +// See the file "COPYING" in the main distribution directory for copyright. #include #include "Hasher.h" - #include "digest.h" using namespace probabilistic; -Hasher::UHF::UHF(size_t seed, const std::string& extra) - : h_(compute_seed(seed, extra)) +UHF::UHF(size_t seed, const std::string& extra) + : h(compute_seed(seed, extra)) { } -Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const +Hasher::digest UHF::hash(const void* x, size_t n) const { assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h_(x, n); + return n == 0 ? 0 : h(x, n); } -size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) +size_t UHF::compute_seed(size_t seed, const std::string& extra) { u_char buf[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; sha256_init(&ctx); + if ( extra.empty() ) { unsigned int first_seed = initial_seed(); sha256_update(&ctx, &first_seed, sizeof(first_seed)); } - else - { - sha256_update(&ctx, extra.c_str(), extra.size()); - } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - // Take the first sizeof(size_t) bytes as seed. - return *reinterpret_cast(buf); - } + else + sha256_update(&ctx, extra.c_str(), extra.size()); + + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + + // Take the first sizeof(size_t) bytes as seed. 
+ return *reinterpret_cast(buf); + } Hasher* Hasher::Create(size_t k, const std::string& name) { return new DefaultHasher(k, name); } -Hasher::Hasher(size_t k, const std::string& name) - : k_(k), name_(name) +Hasher::Hasher(size_t k, const std::string& arg_name) + : k(k) { + name = arg_name; } DefaultHasher::DefaultHasher(size_t k, const std::string& name) : Hasher(k, name) { for ( size_t i = 0; i < k; ++i ) - hash_functions_.push_back(UHF(i, name)); + hash_functions.push_back(UHF(i, name)); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const { digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hash_functions_[i](x, n); + h[i] = hash_functions[i](x, n); + return h; } @@ -73,24 +77,25 @@ bool DefaultHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; + const DefaultHasher* o = static_cast(other); - return hash_functions_ == o->hash_functions_; + return hash_functions == o->hash_functions; } DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), - h1_(1, name), - h2_(2, name) + : Hasher(k, name), h1(1, name), h2(2, name) { } Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const { - digest h1 = h1_(x, n); - digest h2 = h2_(x, n); + digest d1 = h1(x, n); + digest d2 = h2(x, n); digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; + h[i] = d1 + i * d2; + return h; } @@ -103,7 +108,7 @@ bool DoubleHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; - const DoubleHasher* o = static_cast(other); - return h1_ == o->h1_ && h2_ == o->h2_; - } + const DoubleHasher* o = static_cast(other); + return h1 == o->h1 && h2 == o->h2; + } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 0231343dcd..62c5d58d1f 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -1,5 +1,7 @@ -#ifndef Hasher_h -#define Hasher_h +// See the 
file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_HASHER_H +#define PROBABILISTIC_HASHER_H #include "Hash.h" #include "H3.h" @@ -7,123 +9,197 @@ namespace probabilistic { /** - * The abstract base class for hashers, i.e., constructs which hash elements - * *k* times. + * Abstract base class for hashers. A hasher creates a family of hash + * functions to hash an element *k* times. */ class Hasher { public: - typedef hash_t digest; - typedef std::vector digest_vector; + typedef hash_t digest; + typedef std::vector digest_vector; - /** - * Constructs the hashing policy used by the implementation. - * - * @todo This factory function exists because the HashingPolicy class - * hierachy is not yet serializable. - */ + /** + * Destructor. + */ + virtual ~Hasher() { } + + /** + * Computes hash values for an element. + * + * @param x The element to hash. + * + * @return Vector of *k* hash values. + */ + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + /** + * Computes the hashes for a set of bytes. + * + * @param x Pointer to first byte to hash. + * + * @param n Number of bytes to hash. + * + * @return Vector of *k* hash values. + * + */ + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + /** + * Returns a deep copy of the hasher. + */ + virtual Hasher* Clone() const = 0; + + /** + * Returns true if two hashers are identical. + */ + virtual bool Equals(const Hasher* other) const = 0; + + /** + * Returns the number *k* of hash functions the hasher applies. + */ + size_t K() const { return k; } + + /** + * Returns the hasher's name. TODO: What's this? + */ + const std::string& Name() const { return name; } + + /** + * Constructs the hasher used by the implementation. This hardcodes a + * specific hashing policy. It exists only because the HashingPolicy + * class hierarchy is not yet serializable. + * + * @param k The number of hash functions to apply.
+ * + * @param name The hasher's name. + * + * @return Returns a new hasher instance. + */ static Hasher* Create(size_t k, const std::string& name); - virtual ~Hasher() { } - - template - digest_vector operator()(const T& x) const - { - return Hash(&x, sizeof(T)); - } - - virtual digest_vector Hash(const void* x, size_t n) const = 0; - - virtual Hasher* Clone() const = 0; - - virtual bool Equals(const Hasher* other) const = 0; - - size_t K() const { return k_; } - const std::string& Name() const { return name_; } - protected: - /** - * A universal hash function family. - */ - class UHF { - public: - /** - * Constructs an H3 hash function seeded with a given seed and an optional - * extra seed to replace the initial Bro seed. - * - * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial seed to - * compute the seed for t to compute the - * seed - * NUL-terminated string as additional seed. - */ - UHF(size_t seed, const std::string& extra = ""); + Hasher(size_t k, const std::string& name); - template - digest operator()(const T& x) const - { - return hash(&x, sizeof(T)); - } - - digest operator()(const void* x, size_t n) const - { - return hash(x, n); - } - - friend bool operator==(const UHF& x, const UHF& y) - { - return x.h_ == y.h_; - } - - friend bool operator!=(const UHF& x, const UHF& y) - { - return ! (x == y); - } - - digest hash(const void* x, size_t n) const; - - private: - static size_t compute_seed(size_t seed, const std::string& extra); - - H3 h_; - }; - - Hasher(size_t k, const std::string& name); - -private: - const size_t k_; - std::string name_; + private: + const size_t k; + std::string name; }; /** - * The default hashing policy. Performs *k* hash function computations. + * A universal hash function family. This is a helper class that Hasher + * implementations can use in their implementation. 
+ */ +class UHF { +public: + /** + * Constructs an H3 hash function seeded with a given seed and an + * optional extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this parameter replaces the initial + * Bro seed as the starting point when computing the seed for this + * instance. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + Hasher::digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + /** + * Computes hash values for an element. + * + * @param x The element to hash. + * + * @return The hash value. + */ + Hasher::digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + /** + * Computes the hashes for a set of bytes. + * + * @param x Pointer to first byte to hash. + * + * @param n Number of bytes to hash. + * + * @return The hash value. + * + */ + Hasher::digest hash(const void* x, size_t n) const; + + friend bool operator==(const UHF& x, const UHF& y) + { + return x.h == y.h; + } + + friend bool operator!=(const UHF& x, const UHF& y) + { + return ! (x == y); + } + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h; +}; + + +/** + * A hasher implementing the default hashing policy. Uses *k* separate hash + * functions internally. */ class DefaultHasher : public Hasher { public: - DefaultHasher(size_t k, const std::string& name); + /** + * Constructor for a hasher with *k* hash functions. + * + * @param k The number of hash functions to use. + * + * @param name The name of the hasher. + */ + DefaultHasher(size_t k, const std::string& name); - virtual digest_vector Hash(const void* x, size_t n) const /* final */; - virtual DefaultHasher* Clone() const /* final */; - virtual bool Equals(const Hasher* other) const /* final */; + // Overridden from Hasher.
+ virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DefaultHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: - std::vector hash_functions_; + std::vector hash_functions; }; /** - * The *double-hashing* policy. Uses a linear combination of two hash functions. + * The *double-hashing* policy. Uses a linear combination of two hash + * functions. */ class DoubleHasher : public Hasher { public: - DoubleHasher(size_t k, const std::string& name); + /** + * Constructor for a double hasher with *k* hash functions. + * + * @param k The number of hash functions to use. + * + * @param name The name of the hasher. + */ + DoubleHasher(size_t k, const std::string& name); - virtual digest_vector Hash(const void* x, size_t n) const /* final */; - virtual DoubleHasher* Clone() const /* final */; - virtual bool Equals(const Hasher* other) const /* final */; + // Overridden from Hasher. + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DoubleHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: - UHF h1_; - UHF h2_; + UHF h1; + UHF h2; }; } diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 3c409b1b0f..cbbff85d7d 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -31,18 +31,19 @@ module GLOBAL; ## Returns: A Bloom filter handle. 
function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter - %{ - if ( fp < 0.0 || fp > 1.0 ) - { - reporter->Error("false-positive rate must take value between 0 and 1"); - return NULL; - } + %{ + if ( fp < 0.0 || fp > 1.0 ) + { + reporter->Error("false-positive rate must take value between 0 and 1"); + return 0; + } - size_t cells = BasicBloomFilter::M(fp, capacity); - size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); - return new BloomFilterVal(new BasicBloomFilter(h, cells)); - %} + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} ## Creates a counting Bloom filter. ## @@ -59,20 +60,22 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## ## Returns: A Bloom filter handle. function bloomfilter_counting_init%(k: count, cells: count, max: count, - name: string &default=""%): opaque of bloomfilter - %{ - if ( max == 0 ) - { - reporter->Error("max counter value must be greater than 0"); - return NULL; - } + name: string &default=""%): opaque of bloomfilter + %{ + if ( max == 0 ) + { + reporter->Error("max counter value must be greater than 0"); + return 0; + } - const Hasher* h = Hasher::Create(k, name->CheckString()); - uint16 width = 1; - while ( max >>= 1 ) - ++width; - return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); - %} + const Hasher* h = Hasher::Create(k, name->CheckString()); + + uint16 width = 1; + while ( max >>= 1 ) + ++width; + + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); + %} ## Adds an element to a Bloom filter. ## @@ -80,16 +83,20 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. 
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any - %{ - BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) - reporter->Error("failed to set Bloom filter type"); - else if ( bfv->Type() != x->Type() ) - reporter->Error("incompatible Bloom filter types"); - else - bfv->Add(x); - return NULL; - %} + %{ + BloomFilterVal* bfv = static_cast(bf); + + if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) + reporter->Error("failed to set Bloom filter type"); + + else if ( ! same_type(bfv->Type(), x->Type()) ) + reporter->Error("incompatible Bloom filter types"); + + else + bfv->Add(x); + + return 0; + %} ## Retrieves the counter for a given element in a Bloom filter. ## @@ -99,16 +106,20 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count - %{ - const BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() ) - reporter->Error("cannot perform lookup on untyped Bloom filter"); - else if ( bfv->Type() != x->Type() ) - reporter->Error("incompatible Bloom filter types"); - else - return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); - return new Val(0, TYPE_COUNT); - %} + %{ + const BloomFilterVal* bfv = static_cast(bf); + + if ( ! bfv->Type() ) + reporter->Error("cannot perform lookup on untyped Bloom filter"); + + else if ( ! same_type(bfv->Type(), x->Type()) ) + reporter->Error("incompatible Bloom filter types"); + + else + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + + return new Val(0, TYPE_COUNT); + %} ## Merges two Bloom filters. ## @@ -118,13 +129,16 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## Returns: The union of *bf1* and *bf2*. 
function bloomfilter_merge%(bf1: opaque of bloomfilter, - bf2: opaque of bloomfilter%): opaque of bloomfilter - %{ - const BloomFilterVal* bfv1 = static_cast(bf1); - const BloomFilterVal* bfv2 = static_cast(bf2); - if ( bfv1->Type() != bfv2->Type() ) - reporter->Error("incompatible Bloom filter types"); - else - return BloomFilterVal::Merge(bfv1, bfv2); - return NULL; - %} + bf2: opaque of bloomfilter%): opaque of bloomfilter + %{ + const BloomFilterVal* bfv1 = static_cast(bf1); + const BloomFilterVal* bfv2 = static_cast(bf2); + + if ( ! same_type(bfv1->Type(), bfv2->Type()) ) + { + reporter->Error("incompatible Bloom filter types"); + return 0; + } + + return BloomFilterVal::Merge(bfv1, bfv2); + %} diff --git a/src/util.cc b/src/util.cc index 81ec135f98..6bea2eb7f1 100644 --- a/src/util.cc +++ b/src/util.cc @@ -803,10 +803,10 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file bro_srandom(seed, seeds_done); if ( ! first_seed_saved ) - { - first_seed = seed; - first_seed_saved = true; - } + { + first_seed = seed; + first_seed_saved = true; + } if ( ! hmac_key_set ) { @@ -820,9 +820,9 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file } unsigned int initial_seed() - { - return first_seed; -} + { + return first_seed; + } bool have_random_seed() { @@ -830,7 +830,7 @@ bool have_random_seed() } long int bro_prng(long int state) - { + { // Use our own simple linear congruence PRNG to make sure we are // predictable across platforms. static const long int m = 2147483647; @@ -844,14 +844,14 @@ long int bro_prng(long int state) state += m; return state; - } + } long int bro_random() { if ( ! bro_rand_determistic ) return random(); // Use system PRNG. 
- bro_rand_state = bro_prng(bro_rand_state); + bro_rand_state = bro_prng(bro_rand_state); return bro_rand_state; } diff --git a/src/util.h b/src/util.h index 5689253d95..aaad2d9403 100644 --- a/src/util.h +++ b/src/util.h @@ -166,15 +166,15 @@ extern void init_random_seed(uint32 seed, const char* load_file, const char* write_file); // Retrieves the initial seed computed after the very first call to -// init_random_seed(). Repeated calls to init_random_seed() will not affect the -// return value of this function. +// init_random_seed(). Repeated calls to init_random_seed() will not affect +// the return value of this function. unsigned int initial_seed(); // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); -// A simple linear congruence PRNG. It takes its state as argument and returns -// a new random value, which can serve as state for subsequent calls. +// A simple linear congruence PRNG. It takes its state as argument and +// returns a new random value, which can serve as state for subsequent calls. 
long int bro_prng(long int state); // Replacement for the system random(), to which is normally falls back diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 4fe2ae1ecc..14e1f038c0 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -1,3 +1,9 @@ +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: false-positive rate must take value between 0 and 1 +error: false-positive rate must take value between 0 and 1 0 1 1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index f69ddbda0c..3b40f29553 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -1,4 +1,4 @@ -# @TEST-EXEC: bro -b %INPUT >output +# @TEST-EXEC: bro -b %INPUT >output 2>&1 # @TEST-EXEC: btest-diff output function test_basic_bloom_filter() From c89f61917b8b7a6ab8014fad211c879681c3ad5f Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 23 Jul 2013 18:44:22 -0700 Subject: [PATCH 47/50] Updating NEWS. --- NEWS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/NEWS b/NEWS index 1fce6b1d9d..b1a5adc12b 100644 --- a/NEWS +++ b/NEWS @@ -108,6 +108,18 @@ New Functionality shunting, and sampling; plus plugin support to customize filters dynamically. +- Bro now provides Bloom filters of two kinds: basic Bloom filters + supporting membership tests, and counting Bloom filters that track + the frequency of elements. 
The corresponding functions are: + + bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter + bloomfilter_add(bf: opaque of bloomfilter, x: any) + bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count + bloomfilter_merge(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter): opaque of bloomfilter + + See TODO for full documentation. + Changed Functionality ~~~~~~~~~~~~~~~~~~~~~ From 5383e8f75bae11bc5da30acf0b77493b90e5f71c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 11:21:10 +0200 Subject: [PATCH 48/50] Add bloomfilter_clear() BiF. --- src/OpaqueVal.cc | 5 +++++ src/OpaqueVal.h | 1 + src/probabilistic/BloomFilter.cc | 10 ++++++++++ src/probabilistic/BloomFilter.h | 11 +++++++++++ src/probabilistic/CounterVector.cc | 5 +++++ src/probabilistic/CounterVector.h | 5 +++++ src/probabilistic/bloom-filter.bif | 16 ++++++++++++++++ 7 files changed, 53 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index efdd890f70..19a372c005 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -578,6 +578,11 @@ size_t BloomFilterVal::Count(const Val* val) const return cnt; } +void BloomFilterVal::Clear() + { + bloom_filter->Clear(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index ea704cb70a..cfb184fc77 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -125,6 +125,7 @@ public: void Add(const Val* val); size_t Count(const Val* val) const; + void Clear(); static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 5613dcce05..c78cd4193d 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -74,6 +74,11 @@ size_t BasicBloomFilter::K(size_t 
cells, size_t capacity) return std::ceil(frac * std::log(2)); } +void BasicBloomFilter::Clear() + { + bits->Clear(); + } + BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { @@ -191,3 +196,8 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const return min; } + +void CountingBloomFilter::Clear() + { + cells->Clear(); + } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 4a6b01c484..55bc76fca7 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -47,6 +47,11 @@ public: return CountImpl((*hasher)(x)); } + /** + * Removes all elements, i.e., resets all bits in the underlying bit vector. + */ + virtual void Clear() = 0; + /** * Serializes the Bloom filter. * @@ -147,6 +152,9 @@ public: */ static size_t K(size_t cells, size_t capacity); + // Overridden from BloomFilter. + virtual void Clear(); + /** * Merges two basic Bloom filters. * @@ -188,6 +196,9 @@ public: */ CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); + // Overridden from BloomFilter. + virtual void Clear(); + /** * Merges two counting Bloom filters. * diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 570ed1f8ea..00fa7fb8c0 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -70,6 +70,11 @@ bool CounterVector::Decrement(size_type cell, count_type value) return carry; } +void CounterVector::Clear() + { + bits->Clear(); + } + CounterVector::count_type CounterVector::Count(size_type cell) const { assert(cell < Size()); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 178a68e8f2..896f98ef1e 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -77,6 +77,11 @@ public: */ count_type Count(size_type cell) const; + /** + * Sets all counters to 0. 
+ */ + void Clear(); + /** * Retrieves the number of cells in the storage. * diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index cbbff85d7d..9df168be0e 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -121,6 +121,22 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count return new Val(0, TYPE_COUNT); %} +## Removes all elements from a Bloom filter. This function resets all bits +## in the underlying bitvector to 0 but does not change the parameterization of +## the Bloom filter, such as the element type and the hasher seed. +## +## bf: The Bloom filter handle. +function bloomfilter_clear%(bf: opaque of bloomfilter%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + + if ( bfv->Type() ) // Untyped Bloom filters are already empty. + bfv->Clear(); + + return 0; + %} + + ## Merges two Bloom filters. ## ## bf1: The first Bloom filter handle. From 5736aef440574389dda6555642ee7e938156dcf1 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 13:05:38 +0200 Subject: [PATCH 49/50] Refactor Bloom filter merging. --- src/OpaqueVal.cc | 31 ++++++++--- src/OpaqueVal.h | 22 -------- src/probabilistic/BloomFilter.cc | 92 +++++++++++++++++++++++--------- src/probabilistic/BloomFilter.h | 36 +++++++------ 4 files changed, 109 insertions(+), 72 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19a372c005..feff4f3cc0 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -584,21 +584,36 @@ void BloomFilterVal::Clear() } BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, - const BloomFilterVal* y) + const BloomFilterVal* y) { if ( ! 
same_type(x->Type(), y->Type()) ) + { reporter->InternalError("cannot merge Bloom filters with different types"); + return 0; + } - BloomFilterVal* result; + if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) + { + reporter->InternalError("cannot merge different Bloom filter types"); + return 0; + } - if ( (result = DoMerge(x, y)) ) - return result; + probabilistic::BloomFilter* copy = x->bloom_filter->Clone(); + bool success = copy->Merge(y->bloom_filter); + if ( ! success ) + { + reporter->InternalError("failed to merge Bloom filter"); + return 0; + } - else if ( (result = DoMerge(x, y)) ) - return result; + BloomFilterVal* merged = new BloomFilterVal(copy); + if ( ! merged->Typify(x->Type()) ) + { + reporter->InternalError("failed to set type on merged Bloom filter"); + return 0; + } - reporter->InternalError("failed to merge Bloom filters"); - return 0; + return merged; } BloomFilterVal::~BloomFilterVal() diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index cfb184fc77..360bb69803 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -142,28 +142,6 @@ private: BloomFilterVal(const BloomFilterVal&); BloomFilterVal& operator=(const BloomFilterVal&); - template - static BloomFilterVal* DoMerge(const BloomFilterVal* x, - const BloomFilterVal* y) - { - if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) - reporter->InternalError("cannot merge different Bloom filter types"); - - if ( typeid(T) != typeid(*x->bloom_filter) ) - return 0; - - const T* a = static_cast(x->bloom_filter); - const T* b = static_cast(y->bloom_filter); - - BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); - assert(merged); - - if ( ! 
merged->Typify(x->Type()) ) - reporter->InternalError("failed to set type on merged Bloom filter"); - - return merged; - } - BroType* type; CompositeHash* hash; probabilistic::BloomFilter* bloom_filter; diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index c78cd4193d..132cf376ec 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -79,17 +79,37 @@ void BasicBloomFilter::Clear() bits->Clear(); } -BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y) +bool BasicBloomFilter::Merge(const BloomFilter* other) { - if ( ! x->hasher->Equals(y->hasher) ) - reporter->InternalError("incompatible hashers during BasicBloomFilter merge"); + if ( typeid(*this) != typeid(*other) ) + return 0; - BasicBloomFilter* result = new BasicBloomFilter(); - result->hasher = x->hasher->Clone(); - result->bits = new BitVector(*x->bits | *y->bits); + const BasicBloomFilter* o = static_cast(other); - return result; + if ( ! hasher->Equals(o->hasher) ) + { + reporter->InternalError("incompatible hashers in BasicBloomFilter merge"); + return false; + } + else if ( bits->Size() != o->bits->Size() ) + { + reporter->InternalError("different bitvector size in BasicBloomFilter merge"); + return false; + } + + (*bits) |= *o->bits; + + return true; + } + +BasicBloomFilter* BasicBloomFilter::Clone() const + { + BasicBloomFilter* copy = new BasicBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->bits = new BitVector(*bits); + + return copy; } BasicBloomFilter::BasicBloomFilter() @@ -135,19 +155,6 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const return 1; } -CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y) - { - if ( ! 
x->hasher->Equals(y->hasher) ) - reporter->InternalError("incompatible hashers during CountingBloomFilter merge"); - - CountingBloomFilter* result = new CountingBloomFilter(); - result->hasher = x->hasher->Clone(); - result->cells = new CounterVector(*x->cells | *y->cells); - - return result; - } - CountingBloomFilter::CountingBloomFilter() { cells = 0; @@ -160,6 +167,44 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, cells = new CounterVector(width, arg_cells); } +void CountingBloomFilter::Clear() + { + cells->Clear(); + } + +bool CountingBloomFilter::Merge(const BloomFilter* other) + { + if ( typeid(*this) != typeid(*other) ) + return 0; + + const CountingBloomFilter* o = static_cast(other); + + if ( ! hasher->Equals(o->hasher) ) + { + reporter->InternalError("incompatible hashers in CountingBloomFilter merge"); + return false; + } + else if ( cells->Size() != o->cells->Size() ) + { + reporter->InternalError("different bitvector size in CountingBloomFilter merge"); + return false; + } + + (*cells) |= *o->cells; + + return true; + } + +CountingBloomFilter* CountingBloomFilter::Clone() const + { + CountingBloomFilter* copy = new CountingBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->cells = new CounterVector(*cells); + + return copy; + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -196,8 +241,3 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const return min; } - -void CountingBloomFilter::Clear() - { - cells->Clear(); - } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 55bc76fca7..2ab5b89941 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -52,6 +52,22 @@ public: */ virtual void Clear() = 0; + /** + * Merges another Bloom filter into this one. + * + * @param other The other Bloom filter. + * + * @return `true` on success. 
+ */ + virtual bool Merge(const BloomFilter* other) = 0; + + /** + * Constructs a copy of this Bloom filter. + * + * @return A copy of `*this`. + */ + virtual BloomFilter* Clone() const = 0; + /** * Serializes the Bloom filter. * @@ -154,14 +170,8 @@ public: // Overridden from BloomFilter. virtual void Clear(); - - /** - * Merges two basic Bloom filters. - * - * @return The merged Bloom filter. - */ - static BasicBloomFilter* Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y); + virtual bool Merge(const BloomFilter* other); + virtual BasicBloomFilter* Clone() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -198,14 +208,8 @@ public: // Overridden from BloomFilter. virtual void Clear(); - - /** - * Merges two counting Bloom filters. - * - * @return The merged Bloom filter. - */ - static CountingBloomFilter* Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y); + virtual bool Merge(const BloomFilter* other); + virtual CountingBloomFilter* Clone() const; protected: DECLARE_SERIAL(CountingBloomFilter); From 5769c32f1eeb319e599996e05e0e63b30af34823 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 13:18:19 +0200 Subject: [PATCH 50/50] Support emptiness check on Bloom filters. 
--- src/OpaqueVal.cc | 5 +++++ src/OpaqueVal.h | 1 + src/probabilistic/BitVector.cc | 8 ++++++++ src/probabilistic/BitVector.h | 6 ++++++ src/probabilistic/BloomFilter.cc | 10 ++++++++++ src/probabilistic/BloomFilter.h | 9 +++++++++ src/probabilistic/CounterVector.cc | 5 +++++ src/probabilistic/CounterVector.h | 6 ++++++ src/probabilistic/bloom-filter.bif | 3 +++ 9 files changed, 53 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index feff4f3cc0..a42892e2b2 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -583,6 +583,11 @@ void BloomFilterVal::Clear() bloom_filter->Clear(); } +bool BloomFilterVal::Empty() const + { + return bloom_filter->Empty(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 360bb69803..52c9583fc7 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -126,6 +126,7 @@ public: void Add(const Val* val); size_t Count(const Val* val) const; void Clear(); + bool Empty() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 98f008b24b..13cd1aa3bb 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -463,6 +463,14 @@ bool BitVector::Empty() const return bits.empty(); } +bool BitVector::AllZero() const + { + for ( size_t i = 0; i < bits.size(); ++i ) + if ( bits[i] ) + return false; + return true; + } + BitVector::size_type BitVector::FindFirst() const { return find_from(0); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index 9eefe1b633..d9c55d53c6 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -253,6 +253,12 @@ public: */ bool Empty() const; + /** + * Checks whether all bits are 0. + * @return `true` iff all bits in all blocks are 0. + */ + bool AllZero() const; + /** * Finds the bit position of of the first 1-bit. 
* @return The position of the first bit that equals to one or `npos` if no diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 132cf376ec..7f769cbf7c 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -74,6 +74,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +bool BasicBloomFilter::Empty() const + { + return bits->AllZero(); + } + void BasicBloomFilter::Clear() { bits->Clear(); @@ -167,6 +172,11 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, cells = new CounterVector(width, arg_cells); } +bool CountingBloomFilter::Empty() const + { + return cells->AllZero(); + } + void CountingBloomFilter::Clear() { cells->Clear(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 2ab5b89941..b6cf18672f 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -47,6 +47,13 @@ public: return CountImpl((*hasher)(x)); } + /** + * Checks whether the Bloom filter is empty. + * + * @return `true` if the Bloom filter contains no elements. + */ + virtual bool Empty() const = 0; + /** * Removes all elements, i.e., resets all bits in the underlying bit vector. */ @@ -169,6 +176,7 @@ public: static size_t K(size_t cells, size_t capacity); // Overridden from BloomFilter. + virtual bool Empty() const; virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; @@ -207,6 +215,7 @@ public: CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); // Overridden from BloomFilter. 
+ virtual bool Empty() const; virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 00fa7fb8c0..24c9ff3638 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -70,6 +70,11 @@ bool CounterVector::Decrement(size_type cell, count_type value) return carry; } +bool CounterVector::AllZero() const + { + return bits->AllZero(); + } + void CounterVector::Clear() { bits->Clear(); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 896f98ef1e..df6fc57ac2 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -77,6 +77,12 @@ public: */ count_type Count(size_type cell) const; + /** + * Checks whether all counters are 0. + * @return `true` iff all counters have the value 0. + */ + bool AllZero() const; + /** * Sets all counters to 0. */ diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 9df168be0e..dd21688fdd 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -109,6 +109,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); + if ( bfv->Empty() ) + return new Val(0, TYPE_COUNT); + if ( ! bfv->Type() ) reporter->Error("cannot perform lookup on untyped Bloom filter");