From 4d275522c7a87f8c69b1494126cc995a20b2d66b Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 23 May 2013 16:03:26 -0700 Subject: [PATCH 01/45] Add abstraction for vector of bits. A bitvector is a vector of bits with underlying block storage. Since C++ has no notion of lvalues in the context of bits, we use a small wrapper class Reference that masks the desired bit in the corresponding block. --- src/BitVector.cc | 455 +++++++++++++++++++++++++++++++++++++++++++++ src/BitVector.h | 324 ++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 3 files changed, 780 insertions(+) create mode 100644 src/BitVector.cc create mode 100644 src/BitVector.h diff --git a/src/BitVector.cc b/src/BitVector.cc new file mode 100644 index 0000000000..2f714a6c79 --- /dev/null +++ b/src/BitVector.cc @@ -0,0 +1,455 @@ +#include "BitVector.h" + +#include +#include + +BitVector::size_type BitVector::npos = static_cast(-1); +BitVector::block_type BitVector::bits_per_block = + std::numeric_limits::digits; + +namespace { + +uint8_t count_table[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, + 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, + 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, + 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, + 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, + 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, + 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, + 6, 7, 6, 7, 7, 8 +}; + +} // namespace + +BitVector::Reference::Reference(block_type& block, block_type i) + : block_(block), + mask_(block_type(1) << i) + { + assert(i < 
bits_per_block); + } + +BitVector::Reference& BitVector::Reference::flip() + { + block_ ^= mask_; + return *this; + } + +BitVector::Reference::operator bool() const + { + return (block_ & mask_) != 0; + } + +bool BitVector::Reference::operator~() const + { + return (block_ & mask_) == 0; + } + +BitVector::Reference& BitVector::Reference::operator=(bool x) + { + x ? block_ |= mask_ : block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator=(Reference const& other) + { + other ? block_ |= mask_ : block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator|=(bool x) + { + if (x) + block_ |= mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator&=(bool x) + { + if (! x) + block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator^=(bool x) + { + if (x) + block_ ^= mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator-=(bool x) + { + if (x) + block_ &= ~mask_; + return *this; + } + + +BitVector::BitVector() : num_bits_(0) { } + +BitVector::BitVector(size_type size, bool value) + : bits_(bits_to_blocks(size), value ? 
~block_type(0) : 0), + num_bits_(size) +{ } + +BitVector::BitVector(BitVector const& other) + : bits_(other.bits_), + num_bits_(other.num_bits_) +{ } + +BitVector BitVector::operator~() const + { + BitVector b(*this); + b.flip(); + return b; + } + +BitVector& BitVector::operator=(BitVector const& other) + { + bits_ = other.bits_; + return *this; + } + +BitVector BitVector::operator<<(size_type n) const + { + BitVector b(*this); + return b <<= n; + } + +BitVector BitVector::operator>>(size_type n) const + { + BitVector b(*this); + return b >>= n; + } + +BitVector& BitVector::operator<<=(size_type n) + { + if (n >= num_bits_) + return reset(); + + if (n > 0) + { + size_type last = blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits_[0]; + assert(blocks() >= 1); + assert(div <= last); + + if (r != 0) + { + for (size_type i = last - div; i > 0; --i) + b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); + b[div] = b[0] << r; + } + else + { + for (size_type i = last-div; i > 0; --i) + b[i + div] = b[i]; + b[div] = b[0]; + } + + std::fill_n(b, div, block_type(0)); + zero_unused_bits(); + } + + return *this; + } + +BitVector& BitVector::operator>>=(size_type n) + { + if (n >= num_bits_) + return reset(); + + if (n > 0) + { + size_type last = blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits_[0]; + assert(blocks() >= 1); + assert(div <= last); + + if (r != 0) + { + for (size_type i = last - div; i > 0; --i) + b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); + b[last - div] = b[last] >> r; + } + else + { + for (size_type i = div; i <= last; ++i) + b[i-div] = b[i]; + } + + std::fill_n(b + (blocks() - div), div, block_type(0)); + } + return *this; + } + +BitVector& BitVector::operator&=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] &= other.bits_[i]; + return *this; + } + 
+BitVector& BitVector::operator|=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] |= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator^=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] ^= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator-=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] &= ~other.bits_[i]; + return *this; + } + +BitVector operator&(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b &= y; + } + +BitVector operator|(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b |= y; + } + +BitVector operator^(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b ^= y; + } + +BitVector operator-(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b -= y; + } + +bool operator==(BitVector const& x, BitVector const& y) + { + return x.num_bits_ == y.num_bits_ && x.bits_ == y.bits_; + } + +bool operator!=(BitVector const& x, BitVector const& y) + { + return ! (x == y); + } + +bool operator<(BitVector const& x, BitVector const& y) + { + assert(x.size() == y.size()); + for (BitVector::size_type r = x.blocks(); r > 0; --r) + { + BitVector::size_type i = r - 1; + if (x.bits_[i] < y.bits_[i]) + return true; + else if (x.bits_[i] > y.bits_[i]) + return false; + } + return false; + } + +void BitVector::resize(size_type n, bool value) + { + size_type old = blocks(); + size_type required = bits_to_blocks(n); + block_type block_value = value ? 
~block_type(0) : block_type(0); + + if (required != old) + bits_.resize(required, block_value); + + if (value && (n > num_bits_) && extra_bits()) + bits_[old - 1] |= (block_value << extra_bits()); + + num_bits_ = n; + zero_unused_bits(); + } + +void BitVector::clear() + { + bits_.clear(); + num_bits_ = 0; + } + +void BitVector::push_back(bool bit) + { + size_type s = size(); + resize(s + 1); + set(s, bit); + } + +void BitVector::append(block_type block) + { + size_type excess = extra_bits(); + if (excess) + { + assert(! bits_.empty()); + bits_.push_back(block >> (bits_per_block - excess)); + bits_[bits_.size() - 2] |= (block << excess); + } + else + { + bits_.push_back(block); + } + num_bits_ += bits_per_block; + } + +BitVector& BitVector::set(size_type i, bool bit) + { + assert(i < num_bits_); + + if (bit) + bits_[block_index(i)] |= bit_mask(i); + else + reset(i); + + return *this; + } + +BitVector& BitVector::set() + { + std::fill(bits_.begin(), bits_.end(), ~block_type(0)); + zero_unused_bits(); + return *this; + } + +BitVector& BitVector::reset(size_type i) + { + assert(i < num_bits_); + bits_[block_index(i)] &= ~bit_mask(i); + return *this; + } + +BitVector& BitVector::reset() + { + std::fill(bits_.begin(), bits_.end(), block_type(0)); + return *this; + } + +BitVector& BitVector::flip(size_type i) + { + assert(i < num_bits_); + bits_[block_index(i)] ^= bit_mask(i); + return *this; + } + +BitVector& BitVector::flip() + { + for (size_type i = 0; i < blocks(); ++i) + bits_[i] = ~bits_[i]; + zero_unused_bits(); + return *this; + } + +bool BitVector::operator[](size_type i) const + { + assert(i < num_bits_); + return (bits_[block_index(i)] & bit_mask(i)) != 0; + } + +BitVector::Reference BitVector::operator[](size_type i) + { + assert(i < num_bits_); + return Reference(bits_[block_index(i)], bit_index(i)); + } + +BitVector::size_type BitVector::count() const + { + std::vector::const_iterator first = bits_.begin(); + size_t n = 0; + size_type length = blocks(); + 
while (length) + { + block_type block = *first; + while (block) + { + // TODO: use __popcnt if available. + n += count_table[block & ((1u << 8) - 1)]; + block >>= 8; + } + ++first; + --length; + } + return n; + } + +BitVector::size_type BitVector::blocks() const + { + return bits_.size(); + } + +BitVector::size_type BitVector::size() const + { + return num_bits_; + } + +bool BitVector::empty() const + { + return bits_.empty(); + } + +BitVector::size_type BitVector::find_first() const + { + return find_from(0); + } + +BitVector::size_type BitVector::find_next(size_type i) const + { + if (i >= (size() - 1) || size() == 0) + return npos; + ++i; + size_type bi = block_index(i); + block_type block = bits_[bi] & (~block_type(0) << bit_index(i)); + return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); + } + +BitVector::size_type BitVector::lowest_bit(block_type block) + { + block_type x = block - (block & (block - 1)); + size_type log = 0; + while (x >>= 1) + ++log; + return log; + } + +BitVector::block_type BitVector::extra_bits() const + { + return bit_index(size()); + } + +void BitVector::zero_unused_bits() + { + if (extra_bits()) + bits_.back() &= ~(~block_type(0) << extra_bits()); + } + +BitVector::size_type BitVector::find_from(size_type i) const + { + while (i < blocks() && bits_[i] == 0) + ++i; + if (i >= blocks()) + return npos; + return i * bits_per_block + lowest_bit(bits_[i]); + } diff --git a/src/BitVector.h b/src/BitVector.h new file mode 100644 index 0000000000..46d7e2df8f --- /dev/null +++ b/src/BitVector.h @@ -0,0 +1,324 @@ +#ifndef BitVector_h +#define BitVector_h + +#include +#include + +/** + * A vector of bits. + */ +class BitVector { +public: + typedef size_t block_type; + typedef size_t size_type; + static size_type npos; + static block_type bits_per_block; + +public: + /** + * An lvalue proxy for single bits. 
+ */ + class Reference { + friend class BitVector; + Reference(block_type& block, block_type i); + + public: + Reference& flip(); + operator bool() const; + bool operator~() const; + Reference& operator=(bool x); + Reference& operator=(Reference const& other); + Reference& operator|=(bool x); + Reference& operator&=(bool x); + Reference& operator^=(bool x); + Reference& operator-=(bool x); + + private: + void operator&(); + block_type& block_; + block_type const mask_; + }; + + typedef bool const_reference; + + /** + * Constructs an empty bit vector. + */ + BitVector(); + + /** + * Constructs a bit vector of a given size. + * @param size The number of bits. + * @param value The value for each bit. + */ + explicit BitVector(size_type size, bool value = false); + + /** + * Constructs a bit vector from a sequence of blocks. + */ + template + BitVector(InputIterator first, InputIterator last) + { + bits_.insert(bits_.end(), first, last); + num_bits_ = bits_.size() * bits_per_block; + } + + /** + * Copy-constructs a bit vector. + * @param other The bit vector to copy. + */ + BitVector(const BitVector& other); + + /** + * Assigns another bit vector to this instance. + * @param other The RHS of the assignment. 
+ */ + BitVector& operator=(const BitVector& other); + + // + // Bitwise operations + // + BitVector operator~() const; + BitVector operator<<(size_type n) const; + BitVector operator>>(size_type n) const; + BitVector& operator<<=(size_type n); + BitVector& operator>>=(size_type n); + BitVector& operator&=(BitVector const& other); + BitVector& operator|=(BitVector const& other); + BitVector& operator^=(BitVector const& other); + BitVector& operator-=(BitVector const& other); + friend BitVector operator&(BitVector const& x, BitVector const& y); + friend BitVector operator|(BitVector const& x, BitVector const& y); + friend BitVector operator^(BitVector const& x, BitVector const& y); + friend BitVector operator-(BitVector const& x, BitVector const& y); + + // + // Relational operators + // + friend bool operator==(BitVector const& x, BitVector const& y); + friend bool operator!=(BitVector const& x, BitVector const& y); + friend bool operator<(BitVector const& x, BitVector const& y); + + // + // Basic operations + // + /** Appends the bits in a sequence of values. + * @tparam Iterator A forward iterator. + * @param first An iterator pointing to the first element of the sequence. + * @param last An iterator pointing to one past the last element of the + * sequence. + */ + template + void append(ForwardIterator first, ForwardIterator last) + { + if (first == last) + return; + + block_type excess = extra_bits(); + typename std::iterator_traits::difference_type delta = + std::distance(first, last); + + bits_.reserve(blocks() + delta); + if (excess == 0) + { + bits_.back() |= (*first << excess); + do + { + block_type b = *first++ >> (bits_per_block - excess); + bits_.push_back(b | (first == last ? 0 : *first << excess)); + } while (first != last); + } + else + { + bits_.insert(bits_.end(), first, last); + } + num_bits_ += bits_per_block * delta; + } + + /** + * Appends the bits in a given block. + * @param block The block containing bits to append. 
+ */ + void append(block_type block); + + /** Appends a single bit to the end of the bit vector. + * @param bit The value of the bit. + */ + void push_back(bool bit); + + /** + * Clears all bits in the bitvector. + */ + void clear(); + + /** + * Resizes the bit vector to a new number of bits. + * @param n The new number of bits of the bit vector. + * @param value The bit value of new values, if the vector expands. + */ + void resize(size_type n, bool value = false); + + /** + * Sets a bit at a specific position to a given value. + * @param i The bit position. + * @param bit The value assigned to position *i*. + * @return A reference to the bit vector instance. + */ + BitVector& set(size_type i, bool bit = true); + + /** + * Sets all bits to 1. + * @return A reference to the bit vector instance. + */ + BitVector& set(); + + /** + * Resets a bit at a specific position, i.e., sets it to 0. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& reset(size_type i); + + /** + * Sets all bits to 0. + * @return A reference to the bit vector instance. + */ + BitVector& reset(); + + /** + * Toggles/flips a bit at a specific position. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& flip(size_type i); + + /** + * Computes the complement. + * @return A reference to the bit vector instance. + */ + BitVector& flip(); + + /** Retrieves a single bit. + * @param i The bit position. + * @return A mutable reference to the bit at position *i*. + */ + Reference operator[](size_type i); + + /** + * Retrieves a single bit. + * @param i The bit position. + * @return A const-reference to the bit at position *i*. + */ + const_reference operator[](size_type i) const; + + /** + * Counts the number of 1-bits in the bit vector. Also known as *population + * count* or *Hamming weight*. + * @return The number of bits set to 1. 
+ */ + size_type count() const; + + /** + * Retrieves the number of blocks of the underlying storage. + * @return The number of blocks that represent `size()` bits. + */ + size_type blocks() const; + + /** + * Retrieves the number of bits the bitvector consists of. + * @return The length of the bit vector in bits. + */ + size_type size() const; + + /** + * Checks whether the bit vector is empty. + * @return `true` iff the bitvector has zero length. + */ + bool empty() const; + + /** + * Finds the bit position of the first 1-bit. + * @return The position of the first bit that equals to one or `npos` if no + * such bit exists. + */ + size_type find_first() const; + + /** + * Finds the next 1-bit from a given starting position. + * + * @param i The index where to start looking. + * + * @return The position of the first bit that equals to 1 after position + * *i* or `npos` if no such bit exists. + */ + size_type find_next(size_type i) const; + +private: + /** + * Computes the block index for a given bit position. + */ + static size_type block_index(size_type i) + { + return i / bits_per_block; + } + + /** + * Computes the bit index within a given block for a given bit position. + */ + static block_type bit_index(size_type i) + { + return i % bits_per_block; + } + + /** + * Computes the bitmask block to extract a bit at a given bit position. + */ + static block_type bit_mask(size_type i) + { + return block_type(1) << bit_index(i); + } + + /** + * Computes the number of blocks needed to represent a given number of + * bits. + * @param bits the number of bits. + * @return The number of blocks to represent *bits* number of bits. + */ + static size_type bits_to_blocks(size_type bits) + { + return bits / bits_per_block + + static_cast(bits % bits_per_block != 0); + } + + /** + * Computes the bit position of the first 1-bit in a given block. + * @param block The block to inspect. + * @return The bit position where *block* has its first bit set to 1. 
+ */ + static size_type lowest_bit(block_type block); + + /** + * Computes the number of excess/unused bits in the bit vector. + */ + block_type extra_bits() const; + + /** + * If the number of bits in the vector is not a multiple of + * BitVector::bits_per_block, then the last block exhibits unused bits which + * this function resets. + */ + void zero_unused_bits(); + + /** + * Looks for the first 1-bit starting at a given position. + * @param i The block index to start looking. + * @return The bit position of the first 1-bit in block *i* or later, or + * `BitVector::npos` if no 1-bit exists. + */ + size_type find_from(size_type i) const; + + std::vector bits_; + size_type num_bits_; +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 447b7d9ec7..33aaab29c1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -303,6 +303,7 @@ set(bro_SRCS Base64.cc BitTorrent.cc BitTorrentTracker.cc + BitVector.cc BPF_Program.cc BroDoc.cc BroDocObj.cc From 9e32eaad6db992e60a3d669c4d8c7b5016cc8cbc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 28 May 2013 20:58:01 -0700 Subject: [PATCH 02/45] Make bitvectors serializable. 
--- src/BitVector.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++-- src/BitVector.h | 13 ++++++++--- src/SerialTypes.h | 2 ++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index 2f714a6c79..f57301d506 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -2,6 +2,7 @@ #include #include +#include "Serializer.h" BitVector::size_type BitVector::npos = static_cast(-1); BitVector::block_type BitVector::bits_per_block = @@ -62,7 +63,7 @@ BitVector::Reference& BitVector::Reference::operator=(Reference const& other) BitVector::Reference& BitVector::Reference::operator|=(bool x) { - if (x) + if (x) block_ |= mask_; return *this; } @@ -73,7 +74,7 @@ BitVector::Reference& BitVector::Reference::operator&=(bool x) block_ &= ~mask_; return *this; } - + BitVector::Reference& BitVector::Reference::operator^=(bool x) { if (x) @@ -453,3 +454,55 @@ BitVector::size_type BitVector::find_from(size_type i) const return npos; return i * bits_per_block + lowest_bit(bits_[i]); } + +bool BitVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BitVector* BitVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_BITVECTOR)); + } + +IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR); + +bool BitVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BITVECTOR, SerialObj); + + if ( ! SERIALIZE(static_cast(bits_.size())) ) + return false; + + for (size_t i = 0; i < bits_.size(); ++i) + if ( ! SERIALIZE(static_cast(bits_[i])) ) + return false; + + return SERIALIZE(static_cast(num_bits_)); + } + +bool BitVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint64 size; + if ( ! UNSERIALIZE(&size) ) + return false; + + bits_.resize(static_cast(size)); + uint64 block; + for ( size_t i = 0; i < bits_.size(); ++i ) + { + if ( ! 
UNSERIALIZE(&block) ) + return false; + bits_[i] = static_cast(block); + } + + uint64 num_bits; + if ( ! UNSERIALIZE(&num_bits) ) + return false; + num_bits_ = static_cast(num_bits); + + return true; + } diff --git a/src/BitVector.h b/src/BitVector.h index 46d7e2df8f..9900dd103e 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -3,11 +3,12 @@ #include #include +#include "SerialObj.h" /** * A vector of bits. */ -class BitVector { +class BitVector : SerialObj { public: typedef size_t block_type; typedef size_t size_type; @@ -42,7 +43,7 @@ public: typedef bool const_reference; /** - * Constructs an empty bit vector. + * Default-constructs an empty bit vector. */ BitVector(); @@ -253,6 +254,12 @@ public: */ size_type find_next(size_type i) const; + bool Serialize(SerialInfo* info) const; + static BitVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(BitVector); + private: /** * Computes the block index for a given bit position. @@ -286,7 +293,7 @@ private: */ static size_type bits_to_blocks(size_type bits) { - return bits / bits_per_block + return bits / bits_per_block + static_cast(bits % bits_per_block != 0); } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 723badab1e..c9c0c34a33 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -49,6 +49,7 @@ SERIAL_IS(STATE_ACCESS, 0x1100) SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) +SERIAL_IS(BITVECTOR, 0x1500) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -202,5 +203,6 @@ SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) +SERIAL_CONST2(BITVECTOR) #endif From d873db03cef3bb09d45e789d69607487e36b6093 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 31 May 2013 18:31:14 -0700 Subject: [PATCH 03/45] Add draft of Bloom filter type hierarchy. 
--- src/BloomFilter.h | 266 +++++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 2 files changed, 267 insertions(+) create mode 100644 src/BloomFilter.h diff --git a/src/BloomFilter.h b/src/BloomFilter.h new file mode 100644 index 0000000000..a767c6b8b8 --- /dev/null +++ b/src/BloomFilter.h @@ -0,0 +1,266 @@ +#ifndef BloomFilter_h +#define BloomFilter_h + +#include +#include "BitVector.h" +#include "Hash.h" +#include "H3.h" + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : SerialObj { +public: + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. + */ + explicit CounterVector(unsigned width); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector(); + +private: + BitVector bits_; + unsigned width_; +}; + +/** + * The abstract base class for hash policies. + * @tparam Codomain An integral type. 
+ */ +class HashPolicy { +public: + typedef hash_t hash_type; + virtual ~HashPolicy() { } + size_t k() const { return k; } + virtual std::vector Hash(const void* x, size_t n) const = 0; +protected: + /** + * A functor that computes a universal hash function. + * @tparam Codomain An integral type. + */ + template + class Hasher { + public: + template + Codomain operator()(const Domain& x) const + { + return h3_(&x, sizeof(x)); + } + Codomain operator()(const void* x, size_t n) const + { + return h3_(x, n); + } + private: + // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in + // Hash.h. I do not know how this value impacts the hash function behavior + // so I'll just copy it verbatim. (Matthias) + H3 h3_; + }; + + HashPolicy(size_t k) : k_(k) { } +private: + size_t k_; +}; + +/** + * The *default* hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } + virtual ~DoubleHashing() { } + + virtual std::vector Hash(const void* x, size_t n) const + { + std::vector h(k(), 0); + for (size_t i = 0; i < h.size(); ++i) + h[i] = hashers_[i](x, n); + return h; + } + +private: + std::vector< Hasher > hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of 2 hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k) : HashPolicy(k), hashers_(k) { } + virtual ~DoubleHashing() { } + + virtual std::vector Hash(const void* x, size_t n) const + { + Codomain h1 = hasher1_(x); + Codomain h2 = hasher2_(x); + std::vector h(k(), 0); + for (size_t i = 0; i < h.size(); ++i) + h[i] = h1 + i * h2; + return h; + } + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +/** + * The abstract base class for Bloom filters. + */ +class BloomFilter : SerialObj { +public: + virtual ~BloomFilter() { delete hash_; } + + /** + * Adds an element of type T to the Bloom filter. 
+ * @param x The element to add + */ + template + void Add(const T& x) + { + ++elements_; + AddImpl(hash_->Hash(x)); + } + + /** + * Retrieves the associated count of a given value. + * + * @param x The value of type `T` to check. + * + * @return The counter associated with *x*. + */ + template + size_t Count(const T& x) const + { + return CountImpl(hash_->Hash(x)); + } + + /** + * Retrieves the number of elements added to the Bloom filter. + * + * @return The number of elements in this Bloom filter. + */ + size_t Size() const + { + return elements_; + } + +protected: + typedef std::vector HashVector; + + /** + * Default-constructs a Bloom filter. + */ + BloomFilter(); + + /** + * Constructs a BloomFilter. + * @param hash The hashing policy. + */ + BloomFilter(HashPolicy* hash); + + virtual void AddImpl(const HashVector& hashes) = 0; + + virtual size_t CountImpl(const HashVector& hashes) const = 0; + + std::vector Hash(const T& x) const + { + return hash_->Hash(&x, sizeof(x)); + } + +private: + HashPolicy* hash_; // Owned by *this. + + size_t elements_; +}; + +/** + * A basic Bloom filter. + */ +class BasicBloomFilter : public BloomFilter { +public: + BasicBloomFilter(); + BasicBloomFilter(HashPolicy* hash); + +protected: + virtual void AddImpl(const HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits_.set(h[i] % h.size()); + } + + virtual size_t CountImpl(const HashVector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + if ( ! bits_[h[i] % h.size()] ) + return 0; + return 1; + } + +private: + BitVector bits_; +}; + +/** + * A counting Bloom filter. 
+ */ +class CountingBloomFilter : public BloomFilter { +public: + CountingBloomFilter(unsigned width); + CountingBloomFilter(HashPolicy* hash); + +protected: + CountingBloomFilter(); + +private: + CounterVector cells_; +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33aaab29c1..11de7772d7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -304,6 +304,7 @@ set(bro_SRCS BitTorrent.cc BitTorrentTracker.cc BitVector.cc + BloomFilter.cc BPF_Program.cc BroDoc.cc BroDocObj.cc From f529df33e0afa930e4babff66f4a5f590b5eb6d9 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 3 Jun 2013 14:00:28 -0700 Subject: [PATCH 04/45] Stabilize Bloom filter interface. --- src/BloomFilter.cc | 33 ++++++++++++++++++ src/BloomFilter.h | 85 +++++++++++++++++----------------------------- 2 files changed, 65 insertions(+), 53 deletions(-) create mode 100644 src/BloomFilter.cc diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc new file mode 100644 index 0000000000..6873815f69 --- /dev/null +++ b/src/BloomFilter.cc @@ -0,0 +1,33 @@ +#include "BloomFilter.h" + +HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const + { + HashVector h(k(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const + { + HashType h1 = hasher1_(x); + HashType h2 = hasher2_(x); + HashVector h(k(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + +void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits_.set(h[i] % h.size()); + } + +size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + if ( ! 
bits_[h[i] % h.size()] ) + return 0; + return 1; + } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index a767c6b8b8..dca4eff2bd 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -11,6 +11,9 @@ */ class CounterVector : SerialObj { public: + typedef size_t size_type; + typedef uint64 count_type; + /** * Constructs a counter vector having cells of a given width. * @@ -70,21 +73,24 @@ private: }; /** - * The abstract base class for hash policies. + * The abstract base class for hash policies that hash elements *k* times. * @tparam Codomain An integral type. */ class HashPolicy { public: - typedef hash_t hash_type; + typedef hash_t HashType; + typedef std::vector HashVector; + virtual ~HashPolicy() { } - size_t k() const { return k; } - virtual std::vector Hash(const void* x, size_t n) const = 0; + size_t k() const { return k_; } + virtual HashVector Hash(const void* x, size_t n) const = 0; + protected: /** * A functor that computes a universal hash function. * @tparam Codomain An integral type. 
*/ - template + template class Hasher { public: template @@ -104,8 +110,9 @@ protected: }; HashPolicy(size_t k) : k_(k) { } + private: - size_t k_; + const size_t k_; }; /** @@ -114,18 +121,12 @@ private: class DefaultHashing : public HashPolicy { public: DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - virtual ~DoubleHashing() { } + virtual ~DefaultHashing() { } - virtual std::vector Hash(const void* x, size_t n) const - { - std::vector h(k(), 0); - for (size_t i = 0; i < h.size(); ++i) - h[i] = hashers_[i](x, n); - return h; - } + virtual HashVector Hash(const void* x, size_t n) const; private: - std::vector< Hasher > hashers_; + std::vector< Hasher > hashers_; }; /** @@ -133,22 +134,14 @@ private: */ class DoubleHashing : public HashPolicy { public: - DoubleHashing(size_t k) : HashPolicy(k), hashers_(k) { } + DoubleHashing(size_t k) : HashPolicy(k) { } virtual ~DoubleHashing() { } - virtual std::vector Hash(const void* x, size_t n) const - { - Codomain h1 = hasher1_(x); - Codomain h2 = hasher2_(x); - std::vector h(k(), 0); - for (size_t i = 0; i < h.size(); ++i) - h[i] = h1 + i * h2; - return h; - } + virtual HashVector Hash(const void* x, size_t n) const; private: - Hasher hasher1_; - Hasher hasher2_; + Hasher hasher1_; + Hasher hasher2_; }; /** @@ -166,7 +159,7 @@ public: void Add(const T& x) { ++elements_; - AddImpl(hash_->Hash(x)); + AddImpl(hash_->Hash(&x, sizeof(x))); } /** @@ -179,7 +172,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(x)); + return CountImpl(hash_->Hash(&x, sizeof(x))); } /** @@ -193,8 +186,6 @@ public: } protected: - typedef std::vector HashVector; - /** * Default-constructs a Bloom filter. 
*/ @@ -206,17 +197,12 @@ protected: */ BloomFilter(HashPolicy* hash); - virtual void AddImpl(const HashVector& hashes) = 0; + virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashVector& hashes) const = 0; - - std::vector Hash(const T& x) const - { - return hash_->Hash(&x, sizeof(x)); - } + virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; private: - HashPolicy* hash_; // Owned by *this. + HashPolicy* hash_; // Owned by *this. size_t elements_; }; @@ -230,19 +216,9 @@ public: BasicBloomFilter(HashPolicy* hash); protected: - virtual void AddImpl(const HashVector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); - } + virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashVector& h) const - { - for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) - return 0; - return 1; - } + virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: BitVector bits_; @@ -253,12 +229,15 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width); - CountingBloomFilter(HashPolicy* hash); + CountingBloomFilter(unsigned width, HashPolicy* hash); protected: CountingBloomFilter(); + virtual void AddImpl(const HashPolicy::HashVector& h); + + virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + private: CounterVector cells_; }; From f708cd4a361ba02083380cfe0db2949e3e06cff7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 3 Jun 2013 22:55:21 -0700 Subject: [PATCH 05/45] Work on parameter estimation and serialization. 
--- src/BloomFilter.cc | 131 ++++++++++++++++++++++++++++++++++++++++++++- src/BloomFilter.h | 41 +++++++------- src/NetVar.cc | 2 + src/OpaqueVal.cc | 23 ++++++++ src/OpaqueVal.h | 16 ++++++ src/SerialTypes.h | 7 +++ 6 files changed, 198 insertions(+), 22 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6873815f69..4787bef0f0 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,23 +1,130 @@ #include "BloomFilter.h" +#include +#include "Serializer.h" + +// Backport C++11's std::round(). +namespace { +template +T round(double x) { return (x > 0.0) ? (x + 0.5) : (x - 0.5); } +} // namespace + + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! SERIALIZE(&bits_) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + return false; + // TODO: Ask Robin how to unserialize non-pointer members. + //if ( ! UNSERIALIZE(&bits_) ) + // return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + + HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const { - HashVector h(k(), 0); + HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = hashers_[i](x, n); return h; } + HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const { HashType h1 = hasher1_(x); HashType h2 = hasher2_(x); - HashVector h(k(), 0); + HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = h1 + i * h2; return h; } +bool BloomFilter::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_BLOOMFILTER)); + } + +// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? 
+//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) + +bool BloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); + // TODO: Make the hash policy serializable. + //if ( ! SERIALIZE(hash_) ) + // return false; + return SERIALIZE(static_cast(elements_)); + } + +bool BloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + // TODO: Make the hash policy serializable. + //if ( ! hash_ = HashPolicy::Unserialize(info) ) + // return false; + uint64 elements; + if ( UNSERIALIZE(&elements) ) + return false; + elements_ = static_cast(elements); + return true; + } + +size_t BasicBloomFilter::Cells(double fp, size_t capacity) + { + double ln2 = std::log(2); + return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); + } + +size_t BasicBloomFilter::K(size_t cells, size_t capacity) + { + double frac = static_cast(cells) / static_cast(capacity); + return round(frac * std::log(2)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) + : BloomFilter(hash), bits_(cells) + { + } + +IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) + +bool BasicBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + // TODO: Make the hash policy serializable. + //if ( ! SERIALIZE(&bits_) ) + // return false; + return true; + } + +bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + // TODO: Non-pointer member deserialization? 
+ return true; + } + void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) @@ -31,3 +138,23 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 0; return 1; } + + +void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + cells_.Increment(h[i] % h.size(), 1); + } + +size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const + { + CounterVector::size_type min = + std::numeric_limits::max(); + for ( size_t i = 0; i < h.size(); ++i ) + { + CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + if ( cnt < min ) + min = cnt; + } + return min; + } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index dca4eff2bd..82948f30ec 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -65,7 +65,7 @@ public: protected: DECLARE_SERIAL(CounterVector); - CounterVector(); + CounterVector() { } private: BitVector bits_; @@ -82,7 +82,7 @@ public: typedef std::vector HashVector; virtual ~HashPolicy() { } - size_t k() const { return k_; } + size_t K() const { return k_; } virtual HashVector Hash(const void* x, size_t n) const = 0; protected: @@ -130,7 +130,7 @@ private: }; /** - * The *double-hashing* policy. Uses a linear combination of 2 hash functions. + * The *double-hashing* policy. Uses a linear combination of two hash functions. */ class DoubleHashing : public HashPolicy { public: @@ -185,25 +185,20 @@ public: return elements_; } -protected: - /** - * Default-constructs a Bloom filter. - */ - BloomFilter(); + bool Serialize(SerialInfo* info) const; + static BloomFilter* Unserialize(UnserialInfo* info); - /** - * Constructs a BloomFilter. - * @param hash The hashing policy. 
- */ - BloomFilter(HashPolicy* hash); +protected: + DECLARE_SERIAL(BloomFilter); + + BloomFilter() { }; + BloomFilter(HashPolicy* hash) : hash_(hash) { } virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; private: - HashPolicy* hash_; // Owned by *this. - + HashPolicy* hash_; size_t elements_; }; @@ -212,12 +207,17 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - BasicBloomFilter(); - BasicBloomFilter(HashPolicy* hash); + static size_t Cells(double fp, size_t capacity); + static size_t K(size_t cells, size_t capacity); + + BasicBloomFilter(size_t cells, HashPolicy* hash); protected: - virtual void AddImpl(const HashPolicy::HashVector& h); + DECLARE_SERIAL(BasicBloomFilter); + BasicBloomFilter() { } + + virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: @@ -232,10 +232,11 @@ public: CountingBloomFilter(unsigned width, HashPolicy* hash); protected: + DECLARE_SERIAL(CountingBloomFilter); + CountingBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: diff --git a/src/NetVar.cc b/src/NetVar.cc index 3a23e4c9fa..d8c2192af7 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -244,6 +244,7 @@ OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* bloomfilter_type; #include "const.bif.netvar_def" #include "types.bif.netvar_def" @@ -310,6 +311,7 @@ void init_general_global_var() sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + bloomfilter_type = new OpaqueType("bloomfilter"); } void init_net_var() diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19346e52f2..a5fb65f53b 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,4 +1,6 @@ #include "OpaqueVal.h" + +#include 
"BloomFilter.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -515,3 +517,24 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } + +BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) + { + } + +IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); + +bool BloomFilterVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + // TODO: implement. + return true; + } + +bool BloomFilterVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + // TODO: implement. + return true; + } + diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 78fa5da5e9..1c9c0361cc 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -7,6 +7,8 @@ #include "Val.h" #include "digest.h" +class BloomFilter; + class HashVal : public OpaqueVal { public: virtual bool IsValid() const; @@ -107,4 +109,18 @@ private: RandTest state; }; +class BloomFilterVal : public OpaqueVal { +public: + BloomFilterVal(); + +protected: + friend class Val; + BloomFilterVal(OpaqueType* t); + + DECLARE_SERIAL(BloomFilterVal); + +private: + BloomFilter* bloom_filter_; +}; + #endif diff --git a/src/SerialTypes.h b/src/SerialTypes.h index c9c0c34a33..171113ab6a 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -50,6 +50,9 @@ SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) +SERIAL_IS(COUNTERVECTOR, 0xa000) +SERIAL_IS(BLOOMFILTER, 0xa100) +SERIAL_IS(BASICBLOOMFILTER, 0xa200) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -105,6 +108,7 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) +SERIAL_VAL(BLOOMFILTER_VAL, 20) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) @@ -204,5 +208,8 @@ SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(BITVECTOR) +SERIAL_CONST2(COUNTERVECTOR) +SERIAL_CONST2(BLOOMFILTER) +SERIAL_CONST2(BASICBLOOMFILTER) #endif From d3297dd6f3b6a50c07c90e9ad5f61c0ddf762460 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 13:32:26 -0700 Subject: [PATCH 06/45] Adhere to Bro coding style. --- src/BitVector.cc | 100 +++++++++++++++++++++++------------------------ src/BitVector.h | 40 +++++++++---------- 2 files changed, 69 insertions(+), 71 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index f57301d506..f029230609 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -33,7 +33,7 @@ BitVector::Reference::Reference(block_type& block, block_type i) assert(i < bits_per_block); } -BitVector::Reference& BitVector::Reference::flip() +BitVector::Reference& BitVector::Reference::Flip() { block_ ^= mask_; return *this; @@ -105,7 +105,7 @@ BitVector::BitVector(BitVector const& other) BitVector BitVector::operator~() const { BitVector b(*this); - b.flip(); + b.Flip(); return b; } @@ -130,15 +130,15 @@ BitVector BitVector::operator>>(size_type n) const BitVector& BitVector::operator<<=(size_type n) { if (n >= num_bits_) - return reset(); + return Reset(); if (n > 0) { - size_type last = blocks() - 1; + size_type last = Blocks() - 1; size_type div = n / bits_per_block; block_type r = bit_index(n); block_type* b = &bits_[0]; - assert(blocks() >= 1); + assert(Blocks() >= 1); assert(div <= last); if (r != 0) @@ -164,15 +164,15 @@ BitVector& BitVector::operator<<=(size_type n) BitVector& BitVector::operator>>=(size_type n) { if (n >= num_bits_) - return reset(); + return Reset(); if (n > 0) { - 
size_type last = blocks() - 1; + size_type last = Blocks() - 1; size_type div = n / bits_per_block; block_type r = bit_index(n); block_type* b = &bits_[0]; - assert(blocks() >= 1); + assert(Blocks() >= 1); assert(div <= last); if (r != 0) @@ -187,39 +187,39 @@ BitVector& BitVector::operator>>=(size_type n) b[i-div] = b[i]; } - std::fill_n(b + (blocks() - div), div, block_type(0)); + std::fill_n(b + (Blocks() - div), div, block_type(0)); } return *this; } BitVector& BitVector::operator&=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] &= other.bits_[i]; return *this; } BitVector& BitVector::operator|=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] |= other.bits_[i]; return *this; } BitVector& BitVector::operator^=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] ^= other.bits_[i]; return *this; } BitVector& BitVector::operator-=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] &= ~other.bits_[i]; return *this; } @@ -260,8 +260,8 @@ bool operator!=(BitVector const& x, BitVector const& y) bool operator<(BitVector const& x, BitVector const& y) { - assert(x.size() == y.size()); - for (BitVector::size_type r = x.blocks(); r > 0; --r) + assert(x.Size() == y.Size()); + for (BitVector::size_type r = x.Blocks(); r > 0; --r) { BitVector::size_type i = r - 1; if (x.bits_[i] < y.bits_[i]) @@ -272,9 +272,9 @@ bool operator<(BitVector const& x, BitVector const& y) return false; } -void BitVector::resize(size_type n, bool value) 
+void BitVector::Resize(size_type n, bool value) { - size_type old = blocks(); + size_type old = Blocks(); size_type required = bits_to_blocks(n); block_type block_value = value ? ~block_type(0) : block_type(0); @@ -288,27 +288,27 @@ void BitVector::resize(size_type n, bool value) zero_unused_bits(); } -void BitVector::clear() +void BitVector::Clear() { bits_.clear(); num_bits_ = 0; } -void BitVector::push_back(bool bit) +void BitVector::PushBack(bool bit) { - size_type s = size(); - resize(s + 1); - set(s, bit); + size_type s = Size(); + Resize(s + 1); + Set(s, bit); } -void BitVector::append(block_type block) +void BitVector::Append(block_type block) { size_type excess = extra_bits(); if (excess) { - assert(! bits_.empty()); + assert(! Empty()); bits_.push_back(block >> (bits_per_block - excess)); - bits_[bits_.size() - 2] |= (block << excess); + bits_[Blocks() - 2] |= (block << excess); } else { @@ -317,48 +317,46 @@ void BitVector::append(block_type block) num_bits_ += bits_per_block; } -BitVector& BitVector::set(size_type i, bool bit) +BitVector& BitVector::Set(size_type i, bool bit) { assert(i < num_bits_); - if (bit) - bits_[block_index(i)] |= bit_mask(i); + bits_[block_index(i)] |= bit_mask(i); else - reset(i); - + Reset(i); return *this; } -BitVector& BitVector::set() +BitVector& BitVector::Set() { std::fill(bits_.begin(), bits_.end(), ~block_type(0)); zero_unused_bits(); return *this; } -BitVector& BitVector::reset(size_type i) +BitVector& BitVector::Reset(size_type i) { assert(i < num_bits_); bits_[block_index(i)] &= ~bit_mask(i); return *this; } -BitVector& BitVector::reset() +BitVector& BitVector::Reset() { std::fill(bits_.begin(), bits_.end(), block_type(0)); return *this; } -BitVector& BitVector::flip(size_type i) +BitVector& BitVector::Flip(size_type i) { assert(i < num_bits_); bits_[block_index(i)] ^= bit_mask(i); return *this; } -BitVector& BitVector::flip() +BitVector& BitVector::Flip() { - for (size_type i = 0; i < blocks(); ++i) + for 
(size_type i = 0; i < Blocks(); ++i) bits_[i] = ~bits_[i]; zero_unused_bits(); return *this; @@ -376,11 +374,11 @@ BitVector::Reference BitVector::operator[](size_type i) return Reference(bits_[block_index(i)], bit_index(i)); } -BitVector::size_type BitVector::count() const +BitVector::size_type BitVector::Count() const { std::vector::const_iterator first = bits_.begin(); size_t n = 0; - size_type length = blocks(); + size_type length = Blocks(); while (length) { block_type block = *first; @@ -396,29 +394,29 @@ BitVector::size_type BitVector::count() const return n; } -BitVector::size_type BitVector::blocks() const +BitVector::size_type BitVector::Blocks() const { return bits_.size(); } -BitVector::size_type BitVector::size() const +BitVector::size_type BitVector::Size() const { return num_bits_; } -bool BitVector::empty() const +bool BitVector::Empty() const { return bits_.empty(); } -BitVector::size_type BitVector::find_first() const +BitVector::size_type BitVector::FindFirst() const { return find_from(0); } -BitVector::size_type BitVector::find_next(size_type i) const +BitVector::size_type BitVector::FindNext(size_type i) const { - if (i >= (size() - 1) || size() == 0) + if (i >= (Size() - 1) || Size() == 0) return npos; ++i; size_type bi = block_index(i); @@ -437,7 +435,7 @@ BitVector::size_type BitVector::lowest_bit(block_type block) BitVector::block_type BitVector::extra_bits() const { - return bit_index(size()); + return bit_index(Size()); } void BitVector::zero_unused_bits() @@ -448,9 +446,9 @@ void BitVector::zero_unused_bits() BitVector::size_type BitVector::find_from(size_type i) const { - while (i < blocks() && bits_[i] == 0) + while (i < Blocks() && bits_[i] == 0) ++i; - if (i >= blocks()) + if (i >= Blocks()) return npos; return i * bits_per_block + lowest_bit(bits_[i]); } diff --git a/src/BitVector.h b/src/BitVector.h index 9900dd103e..8315a151f0 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -24,7 +24,7 @@ public: Reference(block_type& block, 
block_type i); public: - Reference& flip(); + Reference& Flip(); operator bool() const; bool operator~() const; Reference& operator=(bool x); @@ -110,7 +110,7 @@ public: * sequence. */ template - void append(ForwardIterator first, ForwardIterator last) + void Append(ForwardIterator first, ForwardIterator last) { if (first == last) return; @@ -119,7 +119,7 @@ public: typename std::iterator_traits::difference_type delta = std::distance(first, last); - bits_.reserve(blocks() + delta); + bits_.reserve(Blocks() + delta); if (excess == 0) { bits_.back() |= (*first << excess); @@ -140,24 +140,24 @@ public: * Appends the bits in a given block. * @param block The block containing bits to append. */ - void append(block_type block); + void Append(block_type block); /** Appends a single bit to the end of the bit vector. * @param bit The value of the bit. */ - void push_back(bool bit); + void PushBack(bool bit); /** * Clears all bits in the bitvector. */ - void clear(); + void Clear(); /** * Resizes the bit vector to a new number of bits. * @param n The new number of bits of the bit vector. * @param value The bit value of new values, if the vector expands. */ - void resize(size_type n, bool value = false); + void Resize(size_type n, bool value = false); /** * Sets a bit at a specific position to a given value. @@ -165,39 +165,39 @@ public: * @param bit The value assigned to position *i*. * @return A reference to the bit vector instance. */ - BitVector& set(size_type i, bool bit = true); + BitVector& Set(size_type i, bool bit = true); /** * Sets all bits to 1. * @return A reference to the bit vector instance. */ - BitVector& set(); + BitVector& Set(); /** * Resets a bit at a specific position, i.e., sets it to 0. * @param i The bit position. * @return A reference to the bit vector instance. */ - BitVector& reset(size_type i); + BitVector& Reset(size_type i); /** * Sets all bits to 0. * @return A reference to the bit vector instance. 
*/ - BitVector& reset(); + BitVector& Reset(); /** * Toggles/flips a bit at a specific position. * @param i The bit position. * @return A reference to the bit vector instance. */ - BitVector& flip(size_type i); + BitVector& Flip(size_type i); /** * Computes the complement. * @return A reference to the bit vector instance. */ - BitVector& flip(); + BitVector& Flip(); /** Retrieves a single bit. * @param i The bit position. @@ -217,32 +217,32 @@ public: * count* or *Hamming weight*. * @return The number of bits set to 1. */ - size_type count() const; + size_type Count() const; /** * Retrieves the number of blocks of the underlying storage. - * @param The number of blocks that represent `size()` bits. + * @param The number of blocks that represent `Size()` bits. */ - size_type blocks() const; + size_type Blocks() const; /** * Retrieves the number of bits the bitvector consist of. * @return The length of the bit vector in bits. */ - size_type size() const; + size_type Size() const; /** * Checks whether the bit vector is empty. * @return `true` iff the bitvector has zero length. */ - bool empty() const; + bool Empty() const; /** * Finds the bit position of of the first 1-bit. * @return The position of the first bit that equals to one or `npos` if no * such bit exists. */ - size_type find_first() const; + size_type FindFirst() const; /** * Finds the next 1-bit from a given starting position. @@ -252,7 +252,7 @@ public: * @return The position of the first bit that equals to 1 after position * *i* or `npos` if no such bit exists. */ - size_type find_next(size_type i) const; + size_type FindNext(size_type i) const; bool Serialize(SerialInfo* info) const; static BitVector* Unserialize(UnserialInfo* info); From a5572dd66f10ca653855483e0941da327b8422e4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 14:31:39 -0700 Subject: [PATCH 07/45] Write CounterVector implementation scaffold. 
--- src/BloomFilter.cc | 36 ++++++++++++++++++++++++++++++++++++ src/BloomFilter.h | 10 +++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 4787bef0f0..78048ee588 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -10,6 +10,42 @@ T round(double x) { return (x > 0.0) ? (x + 0.5) : (x - 0.5); } } // namespace +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 82948f30ec..b4f82efee9 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -9,7 +9,7 @@ /** * A vector of counters, each of which have a fixed number of bits. */ -class CounterVector : SerialObj { +class CounterVector : public SerialObj { public: typedef size_t size_type; typedef uint64 count_type; @@ -18,8 +18,12 @@ public: * Constructs a counter vector having cells of a given width. * * @param width The number of bits that each cell occupies. + * + * @param cells The number of cells in the bitvector. */ - explicit CounterVector(unsigned width); + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); /** * Increments a given cell. 
@@ -68,7 +72,7 @@ protected: CounterVector() { } private: - BitVector bits_; + BitVector* bits_; unsigned width_; }; From 751cf612931f021ddf7b5ee51019f20d05e0c309 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 15:30:27 -0700 Subject: [PATCH 08/45] Add more serialization implementation. --- src/BloomFilter.cc | 93 ++++++++++++++++++++++++++++++++-------------- src/BloomFilter.h | 56 +++++++++++++++++++++++----- src/NetVar.h | 1 + src/OpaqueVal.cc | 18 ++++++--- src/OpaqueVal.h | 1 + src/SerialTypes.h | 2 + 6 files changed, 129 insertions(+), 42 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 78048ee588..64f0e1c67b 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -46,12 +46,23 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(&bits_) ) + if ( ! SERIALIZE(bits_) ) return false; return SERIALIZE(static_cast(width_)); } @@ -60,9 +71,9 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); return false; - // TODO: Ask Robin how to unserialize non-pointer members. - //if ( ! UNSERIALIZE(&bits_) ) - // return false; + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; uint64 width; if ( ! 
UNSERIALIZE(&width) ) return false; @@ -90,6 +101,18 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const return h; } + +BloomFilter::BloomFilter(size_t k) + : hash_(new hash_policy(k)) + { + } + +BloomFilter::~BloomFilter() + { + if ( hash_ ) + delete hash_; + } + bool BloomFilter::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -101,24 +124,21 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) SerialObj::Unserialize(info, SER_BLOOMFILTER)); } -// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? -//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) - bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(hash_) ) - // return false; - return SERIALIZE(static_cast(elements_)); + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE(static_cast(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! hash_ = HashPolicy::Unserialize(info) ) - // return false; + uint16 k; + if ( ! 
UNSERIALIZE(&k) ) + return false; + hash_ = new hash_policy(static_cast(k)); uint64 elements; if ( UNSERIALIZE(&elements) ) return false; @@ -126,7 +146,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return true; } -size_t BasicBloomFilter::Cells(double fp, size_t capacity) +size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); @@ -138,9 +158,16 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return round(frac * std::log(2)); } -BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) - : BloomFilter(hash), bits_(cells) +BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) + : BloomFilter(K(M(fp, capacity), capacity)) { + bits_ = new BitVector(M(fp, capacity)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) + : BloomFilter(K(cells, capacity)) + { + bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -148,38 +175,50 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(&bits_) ) - // return false; - return true; + return SERIALIZE(bits_); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - // TODO: Non-pointer member deserialization? - return true; + bits_ = BitVector::Unserialize(info); + return bits_ == NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); + bits_->Set(h[i] % h.size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) + if ( ! 
(*bits_)[h[i] % h.size()] ) return 0; return 1; } +IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) + +bool CountingBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + return SERIALIZE(cells_); + } + +bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + cells_ = CounterVector::Unserialize(info); + return cells_ == NULL; + } + void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_.Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % h.size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -188,7 +227,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); if ( cnt < min ) min = cnt; } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index b4f82efee9..77c6bc4f56 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -151,9 +151,13 @@ private: /** * The abstract base class for Bloom filters. */ -class BloomFilter : SerialObj { +class BloomFilter : public SerialObj { public: - virtual ~BloomFilter() { delete hash_; } + // At this point we won't let the user choose the hash policy, but we might + // open up the interface in the future. + typedef DoubleHashing hash_policy; + + virtual ~BloomFilter(); /** * Adds an element of type T to the Bloom filter. 
@@ -193,10 +197,10 @@ public: static BloomFilter* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(BloomFilter); + DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter() { }; - BloomFilter(HashPolicy* hash) : hash_(hash) { } + BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; @@ -211,10 +215,42 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - static size_t Cells(double fp, size_t capacity); + /** + * Computes the number of cells based a given false-positive rate and + * capacity. In the literature, this parameter often has the name *M*. + * + * @param fp The false-positive rate. + * + * @param capacity The number of exepected elements. + * + * Returns: The number cells needed to support a false-positive rate of *fp* + * with at most *capacity* elements. + */ + static size_t M(double fp, size_t capacity); + + /** + * Computes the optimal number of hash functions based on the number cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * Returns: the optimal number of hash functions for a false-positive rate of + * *fp* for at most *capacity* elements. + */ static size_t K(size_t cells, size_t capacity); - BasicBloomFilter(size_t cells, HashPolicy* hash); + /** + * Constructs a basic Bloom filter with a given false-positive rate and + * capacity. + */ + BasicBloomFilter(double fp, size_t capacity); + + /** + * Constructs a basic Bloom filter with a given number of cells and capacity. 
+ */ + BasicBloomFilter(size_t cells, size_t capacity); protected: DECLARE_SERIAL(BasicBloomFilter); @@ -225,7 +261,7 @@ protected: virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - BitVector bits_; + BitVector* bits_; }; /** @@ -233,18 +269,18 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width, HashPolicy* hash); + CountingBloomFilter(unsigned width); protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter(); + CountingBloomFilter() { } virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - CounterVector cells_; + CounterVector* cells_; }; #endif diff --git a/src/NetVar.h b/src/NetVar.h index 1a20adcaf2..aa2a14ada5 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -249,6 +249,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* bloomfilter_type; // Initializes globals that don't pertain to network/event analysis. extern void init_general_global_var(); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index a5fb65f53b..b4f1290436 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,23 +518,31 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } +BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type) + { + } + BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) { } +BloomFilterVal::~BloomFilterVal() + { + if ( bloom_filter_ ) + delete bloom_filter_; + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - // TODO: implement. - return true; + return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - // TODO: implement. 
- return true; + bloom_filter_ = BloomFilter::Unserialize(info); + return bloom_filter_ == NULL; } - diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 1c9c0361cc..68b42a8a49 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -112,6 +112,7 @@ private: class BloomFilterVal : public OpaqueVal { public: BloomFilterVal(); + ~BloomFilterVal(); protected: friend class Val; diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 171113ab6a..859145f19f 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -53,6 +53,7 @@ SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0xa000) SERIAL_IS(BLOOMFILTER, 0xa100) SERIAL_IS(BASICBLOOMFILTER, 0xa200) +SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -211,5 +212,6 @@ SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) SERIAL_CONST2(BLOOMFILTER) SERIAL_CONST2(BASICBLOOMFILTER) +SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif From 880d02f7204d21fc0e69f08ac78e963042df4f16 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 16:16:55 -0700 Subject: [PATCH 09/45] Associate a Comphash with a BloomFilterVal. We also keep track of the Bloom filter's element type inside each value. The first use of the BiF bloomfilter_add will "typify" the Bloom filter and lock the Bloom filter's type to the element type. 
--- src/BloomFilter.cc | 15 ++++++++++++ src/BloomFilter.h | 3 ++- src/OpaqueVal.cc | 60 ++++++++++++++++++++++++++++++++++++++++++++-- src/OpaqueVal.h | 18 ++++++++++++-- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 64f0e1c67b..74fa6fb255 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -199,6 +199,21 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } +CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, + size_t width) + : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), + capacity)) + { + cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); + } + +CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, + size_t width) + : BloomFilter(BasicBloomFilter::K(cells, capacity)) + { + cells_ = new CounterVector(width, cells); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 77c6bc4f56..14b0ac3281 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -269,7 +269,8 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width); + CountingBloomFilter(double fp, size_t capacity, size_t width); + CountingBloomFilter(size_t cells, size_t capacity, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index b4f1290436..abfd8f320f 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,31 +518,87 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } -BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type) +BloomFilterVal::BloomFilterVal(BloomFilter* bf) + : OpaqueVal(bloomfilter_type), bloom_filter_(bf) { } -BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) +BloomFilterVal::BloomFilterVal(OpaqueType* t) + : OpaqueVal(t) { } +bool BloomFilterVal::Typify(BroType* type) + { + if ( 
type_ ) + return false; + type_ = type; + TypeList* tl = new TypeList(type_); + tl->Append(type_); + hash_ = new CompositeHash(tl); + Unref(tl); + return true; + } + +BroType* BloomFilterVal::Type() const + { + return type_; + } + +void BloomFilterVal::Add(const Val* val) + { + HashKey* key = hash_->ComputeHash(val, 1); + bloom_filter_->Add(key->Hash()); + } + +size_t BloomFilterVal::Count(const Val* val) const + { + HashKey* key = hash_->ComputeHash(val, 1); + return bloom_filter_->Count(key->Hash()); + } + +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, + const BloomFilterVal* second) +{ + assert(! "not yet implemented"); + return NULL; + } + BloomFilterVal::~BloomFilterVal() { + if ( type_ ) + Unref(type_); + if ( hash_ ) + delete hash_; if ( bloom_filter_ ) delete bloom_filter_; } +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type) + { + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + if ( ! SERIALIZE(type_) ) + return false; return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); + type_ = BroType::Unserialize(info); + if ( ! 
type_ ) + return false; + TypeList* tl = new TypeList(type_); + tl->Append(type_); + hash_ = new CompositeHash(tl); + Unref(tl); bloom_filter_ = BloomFilter::Unserialize(info); return bloom_filter_ == NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 68b42a8a49..e97a530f3a 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -110,18 +110,32 @@ private: }; class BloomFilterVal : public OpaqueVal { + BloomFilterVal(const BloomFilterVal&); + BloomFilterVal& operator=(const BloomFilterVal&); public: - BloomFilterVal(); + static BloomFilterVal* Merge(const BloomFilterVal* first, + const BloomFilterVal* second); + + BloomFilterVal(BloomFilter* bf); ~BloomFilterVal(); + bool Typify(BroType* type); + BroType* Type() const; + + void Add(const Val* val); + size_t Count(const Val* val) const; + protected: friend class Val; + BloomFilterVal(); BloomFilterVal(OpaqueType* t); DECLARE_SERIAL(BloomFilterVal); private: - BloomFilter* bloom_filter_; + BroType* type_; + CompositeHash* hash_; + BloomFilter* bloom_filter_; }; #endif From 3d9764213191070a6b68375c0d0ae8c3193528e3 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 16:26:16 -0700 Subject: [PATCH 10/45] Add Bloom filter BiFs. --- src/bro.bif | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index d9558106a7..60fb985dda 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5730,3 +5730,92 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr } %} +# =========================================================================== +# +# Bloom Filter Functions +# +# =========================================================================== + +%%{ +#include "BloomFilter.h" +%%} + +## Initializes a Bloom filter data structure. +## +## fp: The desired false-positive rate. +## +## capacity: the maximum number of elements that guarantees a false-positive +## rate of *fp*. +## +## Returns: A Bloom filter handle. 
+function bloomfilter_init%(fp: double, capacity: count, + max: count &default=1%): opaque of bloomfilter + %{ + BloomFilter* bf; + if ( max == 1 ) + { + bf = new BasicBloomFilter(fp, capacity); + } + else + { + uint16 width = 0; + while ( max >>= 1 ) + ++width; + bf = new CountingBloomFilter(fp, capacity, width); + } + return new BloomFilterVal(bf); + %} + +## Adds an element to a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to add. +function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + if ( ! bfv->Type() || ! bfv->Typify(x->Type()) ) + reporter->Error("failed to set Bloom filter type"); + else if ( bfv->Type() != x->Type() ) + reporter->Error("incompatible Bloom filter types"); + bfv->Add(x); + return 0; + %} + +## Retrieves the counter for a given element in a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to count. +## +## Returns: the counter associated with *x* in *bf*. +function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count + %{ + BloomFilterVal* bfv = static_cast(bf); + if ( ! bfv->Type() ) + reporter->Error("cannot perform lookup on untyped Bloom filter"); + else if ( bfv->Type() != x->Type() ) + reporter->Error("incompatible Bloom filter types"); + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + %} + +## Merges two Bloom filters. +## +## bf1: The first Bloom filter handle. +## +## bf2: The second Bloom filter handle. +## +## Returns: The union of *bf1* and *bf2*. +function bloomfilter_merge%(bf1: opaque of bloomfilter, + bf2: opaque of bloomfilter%): opaque of bloomfilter + %{ + const BloomFilterVal* bfv1 = static_cast(bf1); + const BloomFilterVal* bfv2 = static_cast(bf2); + if ( ! bfv1->Type() ) + reporter->Error("The first Bloom filter has not yet been typed"); + if ( ! 
bfv2->Type() ) + reporter->Error("The second Bloom filter has not yet been typed"); + else if ( bfv1->Type() != bfv2->Type() ) + reporter->Error("incompatible Bloom filter types"); + return BloomFilterVal::Merge(bfv1, bfv2); + %} From d5126a13395f899fab12f081248336e687222ed9 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 17:45:10 -0700 Subject: [PATCH 11/45] Fix some BiF issues. --- src/bro.bif | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 60fb985dda..08b532eaea 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5774,12 +5774,18 @@ function bloomfilter_init%(fp: double, capacity: count, function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() || ! bfv->Typify(x->Type()) ) + if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) + { reporter->Error("failed to set Bloom filter type"); + return NULL; + } else if ( bfv->Type() != x->Type() ) + { reporter->Error("incompatible Bloom filter types"); + return NULL; + } bfv->Add(x); - return 0; + return NULL; %} ## Retrieves the counter for a given element in a Bloom filter. @@ -5812,9 +5818,9 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); if ( ! bfv1->Type() ) - reporter->Error("The first Bloom filter has not yet been typed"); + reporter->Error("first Bloom filter has not yet been typed"); if ( ! bfv2->Type() ) - reporter->Error("The second Bloom filter has not yet been typed"); + reporter->Error("second Bloom filter has not yet been typed"); else if ( bfv1->Type() != bfv2->Type() ) reporter->Error("incompatible Bloom filter types"); return BloomFilterVal::Merge(bfv1, bfv2); From 012e09c5c40bdf0acd29a34bf2271417ed36d770 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 12:56:46 -0700 Subject: [PATCH 12/45] Small fixes and simplifications. 
--- src/BloomFilter.cc | 2 +- src/BloomFilter.h | 17 +++++++---------- src/OpaqueVal.cc | 1 + 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 74fa6fb255..e549553bf4 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -140,7 +140,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return false; hash_ = new hash_policy(static_cast(k)); uint64 elements; - if ( UNSERIALIZE(&elements) ) + if ( ! UNSERIALIZE(&elements) ) return false; elements_ = static_cast(elements); return true; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 14b0ac3281..3e2bd5de90 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -94,15 +94,14 @@ protected: * A functor that computes a universal hash function. * @tparam Codomain An integral type. */ - template class Hasher { public: - template - Codomain operator()(const Domain& x) const + template + HashType operator()(const T& x) const { return h3_(&x, sizeof(x)); } - Codomain operator()(const void* x, size_t n) const + HashType operator()(const void* x, size_t n) const { return h3_(x, n); } @@ -110,7 +109,7 @@ protected: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior // so I'll just copy it verbatim. 
(Matthias) - H3 h3_; + H3 h3_; }; HashPolicy(size_t k) : k_(k) { } @@ -125,12 +124,11 @@ private: class DefaultHashing : public HashPolicy { public: DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - virtual ~DefaultHashing() { } virtual HashVector Hash(const void* x, size_t n) const; private: - std::vector< Hasher > hashers_; + std::vector hashers_; }; /** @@ -139,13 +137,12 @@ private: class DoubleHashing : public HashPolicy { public: DoubleHashing(size_t k) : HashPolicy(k) { } - virtual ~DoubleHashing() { } virtual HashVector Hash(const void* x, size_t n) const; private: - Hasher hasher1_; - Hasher hasher2_; + Hasher hasher1_; + Hasher hasher2_; }; /** diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index abfd8f320f..03a6e51ce8 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -533,6 +533,7 @@ bool BloomFilterVal::Typify(BroType* type) if ( type_ ) return false; type_ = type; + type_->Ref(); TypeList* tl = new TypeList(type_); tl->Append(type_); hash_ = new CompositeHash(tl); From f211b856c9ae35e68ea4af194e08157fdefef7e6 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:13:36 -0700 Subject: [PATCH 13/45] Catch invalid values of the false-positive rate. --- src/bro.bif | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 08b532eaea..74219dd2b7 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5751,6 +5751,11 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr function bloomfilter_init%(fp: double, capacity: count, max: count &default=1%): opaque of bloomfilter %{ + if ( fp < 0.0 || fp > 1.0 ) + { + reporter->Error("false-positive rate must take value between 0 and 1"); + return NULL; + } BloomFilter* bf; if ( max == 1 ) { From 7ce986e31f59b1f1000ec335a4efc1f0f5e0c011 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:21:27 -0700 Subject: [PATCH 14/45] Fix modding. 
--- src/BloomFilter.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index e549553bf4..7c347927c3 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -188,13 +188,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_->Set(h[i] % h.size()); + bits_->Set(h[i] % bits_->Size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! (*bits_)[h[i] % h.size()] ) + if ( ! (*bits_)[h[i] % bits_->Size()] ) return 0; return 1; } @@ -233,7 +233,7 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % cells_->Size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -242,7 +242,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; } From fcf1807fc8ac320a6c787360e8b78509b58b0a5a Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:39:00 -0700 Subject: [PATCH 15/45] Fix hasher usage and narrow interface. 
--- src/BloomFilter.cc | 4 ++-- src/BloomFilter.h | 10 +--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 7c347927c3..c684c82c0e 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -93,8 +93,8 @@ HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const { - HashType h1 = hasher1_(x); - HashType h2 = hasher2_(x); + HashType h1 = hasher1_(x, n); + HashType h2 = hasher2_(x, n); HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = h1 + i * h2; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 3e2bd5de90..fd1cb31d61 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -96,15 +96,7 @@ protected: */ class Hasher { public: - template - HashType operator()(const T& x) const - { - return h3_(&x, sizeof(x)); - } - HashType operator()(const void* x, size_t n) const - { - return h3_(x, n); - } + HashType operator()(const void* x, size_t n) const { return h3_(x, n); } private: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior From 0d299eca57ddab9dfb17c1f6c99139c481dccb49 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 14:54:25 -0700 Subject: [PATCH 16/45] Correct computation of k hash functions. --- src/BloomFilter.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c684c82c0e..f1db71ae1d 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -3,13 +3,6 @@ #include #include "Serializer.h" -// Backport C++11's std::round(). -namespace { -template -T round(double x) { return (x > 0.0) ? 
(x + 0.5) : (x - 0.5); } -} // namespace - - CounterVector::CounterVector(size_t width, size_t cells) : bits_(new BitVector(width * cells)), width_(width) { @@ -155,7 +148,7 @@ size_t BasicBloomFilter::M(double fp, size_t capacity) size_t BasicBloomFilter::K(size_t cells, size_t capacity) { double frac = static_cast(cells) / static_cast(capacity); - return round(frac * std::log(2)); + return std::ceil(frac * std::log(2)); } BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) From e15f03d980e8bb63d00969268056b2e9592b2f85 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:02:11 -0700 Subject: [PATCH 17/45] Cleanup BiFs. --- src/bro.bif | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 5c1280645e..8bd9575498 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5026,16 +5026,11 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) - { reporter->Error("failed to set Bloom filter type"); - return NULL; - } else if ( bfv->Type() != x->Type() ) - { reporter->Error("incompatible Bloom filter types"); - return NULL; - } - bfv->Add(x); + else + bfv->Add(x); return NULL; %} @@ -5048,12 +5043,14 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## Returns: the counter associated with *x* in *bf*. function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ - BloomFilterVal* bfv = static_cast(bf); + const BloomFilterVal* bfv = static_cast(bf); if ( ! bfv->Type() ) reporter->Error("cannot perform lookup on untyped Bloom filter"); else if ( bfv->Type() != x->Type() ) reporter->Error("incompatible Bloom filter types"); - return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + else + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + return new Val(0, TYPE_COUNT); %} ## Merges two Bloom filters. 
@@ -5068,11 +5065,9 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, %{ const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); - if ( ! bfv1->Type() ) - reporter->Error("first Bloom filter has not yet been typed"); - if ( ! bfv2->Type() ) - reporter->Error("second Bloom filter has not yet been typed"); - else if ( bfv1->Type() != bfv2->Type() ) + if ( bfv1->Type() != bfv2->Type() ) reporter->Error("incompatible Bloom filter types"); - return BloomFilterVal::Merge(bfv1, bfv2); + else + return BloomFilterVal::Merge(bfv1, bfv2); + return NULL; %} From 86becdd6e467fabc475eb81baea6d3586b2d74e7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:08:24 -0700 Subject: [PATCH 18/45] Add tests. --- testing/btest/bifs/bloomfilter.bro | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 testing/btest/bifs/bloomfilter.bro diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro new file mode 100644 index 0000000000..6abbdd69f7 --- /dev/null +++ b/testing/btest/bifs/bloomfilter.bro @@ -0,0 +1,38 @@ +# @TEST-EXEC: bro -b %INPUT >output +# @TEST-EXEC: btest-diff output + +event bro_init() + { + # Basic usage with counts. + local bf_cnt = bloomfilter_init(0.1, 1000); + bloomfilter_add(bf_cnt, 42); + bloomfilter_add(bf_cnt, 84); + bloomfilter_add(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 0); + print bloomfilter_lookup(bf_cnt, 42); + print bloomfilter_lookup(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 336); + bloomfilter_add(bf_cnt, 0.5); # Type mismatch + bloomfilter_add(bf_cnt, "foo"); # Type mismatch + + # Basic usage with strings. 
+ local bf_str = bloomfilter_init(0.9, 10); + bloomfilter_add(bf_str, "foo"); + bloomfilter_add(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "foo"); + print bloomfilter_lookup(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "baz"); + print bloomfilter_lookup(bf_str, "qux"); + bloomfilter_add(bf_str, 0.5); # Type mismatch + bloomfilter_add(bf_str, 100); # Type mismatch + + # Edge cases. + local bf_edge0 = bloomfilter_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_init(0.9999999, 1); + local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + + # Invalid parameters. + local bf_bug0 = bloomfilter_init(-0.5, 42); + local bf_bug1 = bloomfilter_init(1.1, 42); + } From f2d536d2da1118b1d5feb143f751d47dc344232b Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:22:04 -0700 Subject: [PATCH 19/45] Add missing initializations. --- src/BloomFilter.cc | 15 +++++++++++++++ src/BloomFilter.h | 6 +++--- src/OpaqueVal.cc | 25 +++++++++++++++++-------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index f1db71ae1d..40772fecb6 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -95,6 +95,11 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const } +BloomFilter::BloomFilter() + : hash_(NULL) + { + } + BloomFilter::BloomFilter(size_t k) : hash_(new hash_policy(k)) { @@ -151,6 +156,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +BasicBloomFilter::BasicBloomFilter() + : bits_(NULL) + { + } + BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) : BloomFilter(K(M(fp, capacity), capacity)) { @@ -192,6 +202,11 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } +CountingBloomFilter::CountingBloomFilter() + : cells_(NULL) + { + } + CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, 
size_t width) : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), diff --git a/src/BloomFilter.h b/src/BloomFilter.h index fd1cb31d61..c0101cadf8 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -188,7 +188,7 @@ public: protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); - BloomFilter() { }; + BloomFilter(); BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; @@ -244,7 +244,7 @@ public: protected: DECLARE_SERIAL(BasicBloomFilter); - BasicBloomFilter() { } + BasicBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; @@ -264,7 +264,7 @@ public: protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter() { } + CountingBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 03a6e51ce8..38ea93d000 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,13 +518,27 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } -BloomFilterVal::BloomFilterVal(BloomFilter* bf) - : OpaqueVal(bloomfilter_type), bloom_filter_(bf) +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type), + type_(NULL), + hash_(NULL), + bloom_filter_(NULL) { } BloomFilterVal::BloomFilterVal(OpaqueType* t) - : OpaqueVal(t) + : OpaqueVal(t), + type_(NULL), + hash_(NULL), + bloom_filter_(NULL) + { + } + +BloomFilterVal::BloomFilterVal(BloomFilter* bf) + : OpaqueVal(bloomfilter_type), + type_(NULL), + hash_(NULL), + bloom_filter_(bf) { } @@ -575,11 +589,6 @@ BloomFilterVal::~BloomFilterVal() delete bloom_filter_; } -BloomFilterVal::BloomFilterVal() - : OpaqueVal(bloomfilter_type) - { - } - IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const From c6381055380f889c4891efcf83da512597ae64d6 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: 
Mon, 10 Jun 2013 12:51:41 -0700 Subject: [PATCH 20/45] Document max parameter in bloomfilter_init. --- src/bro.bif | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 8bd9575498..9b80c90dbf 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4993,6 +4993,13 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## +## max: The maximum counter value associated with each each element in the +## Bloom filter. If greater than 1, each element in the set has a counter of +## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then +## becomes a cell of size *w* bits. Since the number number of cells is a +## function ## of *fp* and *capacity*, it is important to consider the effects +## on space when tuning this value. +## ## Returns: A Bloom filter handle. function bloomfilter_init%(fp: double, capacity: count, max: count &default=1%): opaque of bloomfilter From d25984ba45643be524788b73d7cebc1278a78810 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 12:55:03 -0700 Subject: [PATCH 21/45] Update baseline for unit tests. 
--- testing/btest/Baseline/bifs.bloomfilter/output | 8 ++++++++ testing/btest/bifs/bloomfilter.bro | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 testing/btest/Baseline/bifs.bloomfilter/output diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output new file mode 100644 index 0000000000..65aaa8b07c --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -0,0 +1,8 @@ +0 +1 +1 +0 +1 +1 +1 +1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 6abbdd69f7..769cec1200 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -21,8 +21,8 @@ event bro_init() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); - print bloomfilter_lookup(bf_str, "qux"); + print bloomfilter_lookup(bf_str, "baz"); # FP + print bloomfilter_lookup(bf_str, "qux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch From 4c21576c120a0dcc9725308549fd57a8bf9072a1 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 20:14:34 -0700 Subject: [PATCH 22/45] Add Bloomfilter serialization test code. 
--- testing/btest/istate/opaque.bro | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index 84818a5e70..ac3b2c0874 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -12,6 +12,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements: set[string] &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_done() { local out = open("output.log"); @@ -36,6 +39,9 @@ event bro_done() print out, entropy_test_finish(entropy_handle); else print out, "entropy_test_add() failed"; + + for ( e in bloomfilter_elements ) + print bloomfilter_lookup(bloomfilter_handle, e); } @TEST-END-FILE @@ -47,6 +53,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements = { "foo", "bar", "baz" } &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_init() { local out = open("expected.log"); @@ -72,6 +81,10 @@ event bro_init() entropy_handle = entropy_test_init(); if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; + + bloomfilter_handle = bloomfilter_init(0.1, 100); + for ( e in bloomfilter_elements ) + bloomfilter_add(bloomfilter_handle, e); } @TEST-END-FILE From 22afbe42dd91e668de8c72417b6a8ff8b544dd99 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 20:15:13 -0700 Subject: [PATCH 23/45] A number of tweaks of the serialization code. 
--- src/BitVector.h | 2 +- src/BloomFilter.cc | 17 ++++++++--------- src/BloomFilter.h | 2 +- src/OpaqueVal.cc | 10 ++++++---- src/SerialTypes.h | 8 ++++---- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/BitVector.h b/src/BitVector.h index 8315a151f0..83fec44a0d 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -8,7 +8,7 @@ /** * A vector of bits. */ -class BitVector : SerialObj { +class BitVector : public SerialObj { public: typedef size_t block_type; typedef size_t size_type; diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 40772fecb6..1d73734236 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -55,7 +55,7 @@ IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(bits_) ) + if ( ! bits_->Serialize(info) ) return false; return SERIALIZE(static_cast(width_)); } @@ -63,14 +63,13 @@ bool CounterVector::DoSerialize(SerialInfo* info) const bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - return false; bits_ = BitVector::Unserialize(info); if ( ! bits_ ) return false; uint64 width; if ( ! UNSERIALIZE(&width) ) return false; - width_ = static_cast(width); + width_ = static_cast(width); return true; } @@ -127,7 +126,7 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); if ( ! 
SERIALIZE(static_cast(hash_->K())) ) return false; - return SERIALIZE(static_cast(elements_)); + return SERIALIZE(static_cast(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -178,14 +177,14 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return SERIALIZE(bits_); + return bits_->Serialize(info); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); bits_ = BitVector::Unserialize(info); - return bits_ == NULL; + return bits_ != NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) @@ -227,15 +226,15 @@ IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const { - DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return SERIALIZE(cells_); + DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); + return cells_->Serialize(info); } bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); cells_ = CounterVector::Unserialize(info); - return cells_ == NULL; + return cells_ != NULL; } void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index c0101cadf8..4a83ba904b 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -73,7 +73,7 @@ protected: private: BitVector* bits_; - unsigned width_; + size_t width_; }; /** diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 38ea93d000..76936dfb78 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -574,7 +574,7 @@ size_t BloomFilterVal::Count(const Val* val) const BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, const BloomFilterVal* second) -{ + { assert(! 
"not yet implemented"); return NULL; } @@ -594,14 +594,15 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - if ( ! SERIALIZE(type_) ) + if ( ! type_->Serialize(info) ) return false; - return SERIALIZE(bloom_filter_); + return bloom_filter_->Serialize(info); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); + type_ = BroType::Unserialize(info); if ( ! type_ ) return false; @@ -609,6 +610,7 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) tl->Append(type_); hash_ = new CompositeHash(tl); Unref(tl); + bloom_filter_ = BloomFilter::Unserialize(info); - return bloom_filter_ == NULL; + return bloom_filter_ != NULL; } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 859145f19f..9e4aef5b3b 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -50,10 +50,10 @@ SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) -SERIAL_IS(COUNTERVECTOR, 0xa000) -SERIAL_IS(BLOOMFILTER, 0xa100) -SERIAL_IS(BASICBLOOMFILTER, 0xa200) -SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) +SERIAL_IS(COUNTERVECTOR, 0x1600) +SERIAL_IS(BLOOMFILTER, 0x1700) +SERIAL_IS(BASICBLOOMFILTER, 0x1800) +SERIAL_IS(COUNTINGBLOOMFILTER, 0x1900) // These are the externally visible types. const SerialType SER_NONE = 0; From 14a701a237dfdd745a842a11f363b93d01926505 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 22:24:23 -0700 Subject: [PATCH 24/45] Implement value merging. The actual BloomFilter merging still lacks, this is just the first step in the right direction from the user interface side. 
--- src/BloomFilter.cc | 27 ++++++++++++++++++++------- src/BloomFilter.h | 18 ++++++------------ src/OpaqueVal.cc | 17 ++++++++++++++--- src/OpaqueVal.h | 17 ++++++++++++++--- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 1d73734236..e55db71e46 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -124,9 +124,7 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) - return false; - return SERIALIZE(static_cast(elements_)); + return SERIALIZE(static_cast(hash_->K())); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -136,10 +134,6 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) if ( ! UNSERIALIZE(&k) ) return false; hash_ = new hash_policy(static_cast(k)); - uint64 elements; - if ( ! UNSERIALIZE(&elements) ) - return false; - elements_ = static_cast(elements); return true; } @@ -155,6 +149,17 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y) + { + BasicBloomFilter* result = new BasicBloomFilter(); + result->bits_ = new BitVector(*x->bits_ | *y->bits_); + // TODO: implement the hasher pool and make sure the new result gets the same + // number of (equal) hash functions. + //assert(x->hash_ == y->hash_); + return result; + } + BasicBloomFilter::BasicBloomFilter() : bits_(NULL) { @@ -201,6 +206,14 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } + +CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y) +{ + assert(! 
"not yet implemented"); + return NULL; +} + CountingBloomFilter::CountingBloomFilter() : cells_(NULL) { diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 4a83ba904b..3b5d9efa71 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -155,7 +155,6 @@ public: template void Add(const T& x) { - ++elements_; AddImpl(hash_->Hash(&x, sizeof(x))); } @@ -172,16 +171,6 @@ public: return CountImpl(hash_->Hash(&x, sizeof(x))); } - /** - * Retrieves the number of elements added to the Bloom filter. - * - * @return The number of elements in this Bloom filter. - */ - size_t Size() const - { - return elements_; - } - bool Serialize(SerialInfo* info) const; static BloomFilter* Unserialize(UnserialInfo* info); @@ -196,7 +185,6 @@ protected: private: HashPolicy* hash_; - size_t elements_; }; /** @@ -230,6 +218,9 @@ public: */ static size_t K(size_t cells, size_t capacity); + static BasicBloomFilter* Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y); + /** * Constructs a basic Bloom filter with a given false-positive rate and * capacity. @@ -258,6 +249,9 @@ private: */ class CountingBloomFilter : public BloomFilter { public: + static CountingBloomFilter* Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y); + CountingBloomFilter(double fp, size_t capacity, size_t width); CountingBloomFilter(size_t cells, size_t capacity, size_t width); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 76936dfb78..9dd5c7f980 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -572,10 +572,21 @@ size_t BloomFilterVal::Count(const Val* val) const return bloom_filter_->Count(key->Hash()); } -BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, - const BloomFilterVal* second) +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, + const BloomFilterVal* y) { - assert(! 
"not yet implemented"); + if ( x->Type() != y->Type() ) + { + reporter->InternalError("cannot merge Bloom filters with different types"); + return NULL; + } + + BloomFilterVal* result; + if ( (result = DoMerge(x, y)) ) + return result; + else if ( (result = DoMerge(x, y)) ) + return result; + return NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index e97a530f3a..4b45cad519 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -113,10 +113,10 @@ class BloomFilterVal : public OpaqueVal { BloomFilterVal(const BloomFilterVal&); BloomFilterVal& operator=(const BloomFilterVal&); public: - static BloomFilterVal* Merge(const BloomFilterVal* first, - const BloomFilterVal* second); + static BloomFilterVal* Merge(const BloomFilterVal* x, + const BloomFilterVal* y); - BloomFilterVal(BloomFilter* bf); + explicit BloomFilterVal(BloomFilter* bf); ~BloomFilterVal(); bool Typify(BroType* type); @@ -133,6 +133,17 @@ protected: DECLARE_SERIAL(BloomFilterVal); private: + template + static BloomFilterVal* DoMerge(const BloomFilterVal* x, + const BloomFilterVal* y) + { + const T* a = dynamic_cast(x->bloom_filter_); + const T* b = dynamic_cast(y->bloom_filter_); + if ( a && b ) + return new BloomFilterVal(T::Merge(a, b)); + return NULL; + } + BroType* type_; CompositeHash* hash_; BloomFilter* bloom_filter_; From 1f90b539a8574eeadd4b20ae9f379b0fe08999be Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:06:01 -0700 Subject: [PATCH 25/45] Make H3 class adhere to Bro coding style. 
--- src/H3.h | 89 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/src/H3.h b/src/H3.h index 72d81d519f..50afda5688 100644 --- a/src/H3.h +++ b/src/H3.h @@ -65,53 +65,52 @@ template class H3 { T byte_lookup[N][H3_BYTE_RANGE]; public: - H3(); - T operator()(const void* data, size_t size, size_t offset = 0) const - { - const unsigned char *p = static_cast(data); - T result = 0; + H3() + { + T bit_lookup[N * CHAR_BIT]; - // loop optmized with Duff's Device - register unsigned n = (size + 7) / 8; - switch (size % 8) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; - case 6: result ^= byte_lookup[offset++][*p++]; - case 5: result ^= byte_lookup[offset++][*p++]; - case 4: result ^= byte_lookup[offset++][*p++]; - case 3: result ^= byte_lookup[offset++][*p++]; - case 2: result ^= byte_lookup[offset++][*p++]; - case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); - } + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) + { + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); + } - return result; - } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0? 
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + + T operator()(const void* data, size_t size, size_t offset = 0) const + { + const unsigned char *p = static_cast(data); + T result = 0; + + // loop optmized with Duff's Device + register unsigned n = (size + 7) / 8; + switch (size % 8) { + case 0: do { result ^= byte_lookup[offset++][*p++]; + case 7: result ^= byte_lookup[offset++][*p++]; + case 6: result ^= byte_lookup[offset++][*p++]; + case 5: result ^= byte_lookup[offset++][*p++]; + case 4: result ^= byte_lookup[offset++][*p++]; + case 3: result ^= byte_lookup[offset++][*p++]; + case 2: result ^= byte_lookup[offset++][*p++]; + case 1: result ^= byte_lookup[offset++][*p++]; + } while (--n > 0); + } + + return result; + } }; -template -H3::H3() -{ - T bit_lookup[N * CHAR_BIT]; - - for (size_t bit = 0; bit < N * CHAR_BIT; bit++) { - bit_lookup[bit] = 0; - for (size_t i = 0; i < sizeof(T)/2; i++) { - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); - } - } - - for (size_t byte = 0; byte < N; byte++) { - for (unsigned val = 0; val < H3_BYTE_RANGE; val++) { - byte_lookup[byte][val] = 0; - for (size_t bit = 0; bit < CHAR_BIT; bit++) { - // Does this mean byte_lookup[*][0] == 0? -RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } - } -} - #endif //H3_H From 529d12037672d34fd4d1ba5f0d291fd6214f41d4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:07:31 -0700 Subject: [PATCH 26/45] Make H3 seed configurable. 
--- src/H3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/H3.h b/src/H3.h index 50afda5688..11b0cd79a5 100644 --- a/src/H3.h +++ b/src/H3.h @@ -65,7 +65,7 @@ template class H3 { T byte_lookup[N][H3_BYTE_RANGE]; public: - H3() + H3(T seed = bro_random()) { T bit_lookup[N * CHAR_BIT]; @@ -74,7 +74,7 @@ public: bit_lookup[bit] = 0; for ( size_t i = 0; i < sizeof(T)/2; i++ ) // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); } for ( size_t byte = 0; byte < N; byte++ ) From a6d7b7856e87c3a15ba7009ccfb7d6550d1dcfcc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:12:00 -0700 Subject: [PATCH 27/45] Update H3 documentation (and minor style nits.) --- src/H3.h | 60 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/src/H3.h b/src/H3.h index 11b0cd79a5..2eda14d276 100644 --- a/src/H3.h +++ b/src/H3.h @@ -49,9 +49,9 @@ // hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed // together to get the same result as hashing the full string. // Any number of hash functions can be created by creating new instances of H3, -// with the same or different template parameters. The hash function is -// randomly generated using bro_random(); you must call init_random_seed() -// before the H3 constructor if you wish to seed it. +// with the same or different template parameters. The hash function +// constructor takes a seed as argument which defaults to a call to +// bro_random(). #ifndef H3_H @@ -62,34 +62,34 @@ // The number of values representable by a byte. 
#define H3_BYTE_RANGE (UCHAR_MAX+1) -template class H3 { - T byte_lookup[N][H3_BYTE_RANGE]; +template +class H3 { public: - H3(T seed = bro_random()) + H3(T seed = bro_random()) + { + T bit_lookup[N * CHAR_BIT]; + + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { - T bit_lookup[N * CHAR_BIT]; - - for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) - { - bit_lookup[bit] = 0; - for ( size_t i = 0; i < sizeof(T)/2; i++ ) - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); - } - - for ( size_t byte = 0; byte < N; byte++ ) - { - for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) - { - byte_lookup[byte][val] = 0; - for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) - // Does this mean byte_lookup[*][0] == 0? -RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0? 
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + T operator()(const void* data, size_t size, size_t offset = 0) const { const unsigned char *p = static_cast(data); @@ -97,7 +97,7 @@ public: // loop optmized with Duff's Device register unsigned n = (size + 7) / 8; - switch (size % 8) { + switch ( size % 8 ) { case 0: do { result ^= byte_lookup[offset++][*p++]; case 7: result ^= byte_lookup[offset++][*p++]; case 6: result ^= byte_lookup[offset++][*p++]; @@ -106,11 +106,13 @@ public: case 3: result ^= byte_lookup[offset++][*p++]; case 2: result ^= byte_lookup[offset++][*p++]; case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); + } while ( --n > 0 ); } return result; } +private: + T byte_lookup[N][H3_BYTE_RANGE]; }; #endif //H3_H From d2d8aff81456413597b09b71557b0caabdb7af3d Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 09:22:48 -0700 Subject: [PATCH 28/45] Add utility function to access first random seed. --- src/util.cc | 13 +++++++++++++ src/util.h | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/src/util.cc b/src/util.cc index de9bd5b679..721ee10a7e 100644 --- a/src/util.cc +++ b/src/util.cc @@ -716,6 +716,8 @@ static bool write_random_seeds(const char* write_file, uint32 seed, static bool bro_rand_determistic = false; static unsigned int bro_rand_state = 0; +static bool first_seed_saved = false; +static unsigned int first_seed = 0; static void bro_srandom(unsigned int seed, bool deterministic) { @@ -800,6 +802,12 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file bro_srandom(seed, seeds_done); + if ( ! first_seed_saved ) + { + first_seed = seed; + first_seed_saved = true; + } + if ( ! 
hmac_key_set ) { MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key); @@ -811,6 +819,11 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file write_file); } +unsigned int initial_seed() + { + return first_seed; +} + bool have_random_seed() { return bro_rand_determistic; diff --git a/src/util.h b/src/util.h index 49bcbf318b..c3eebb04e3 100644 --- a/src/util.h +++ b/src/util.h @@ -165,6 +165,11 @@ extern void hmac_md5(size_t size, const unsigned char* bytes, extern void init_random_seed(uint32 seed, const char* load_file, const char* write_file); +// Retrieves the initial seed computed after the very first call to +// init_random_seed(). Repeated calls to init_random_seed() will not affect the +// return value of this function. +unsigned int initial_seed(); + // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); From 1576239f67ef2641135f95bdd331f3c1a54ee5ad Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 10:19:39 -0700 Subject: [PATCH 29/45] Support seeding for hashers. 
--- src/BloomFilter.cc | 11 +++++++++++ src/BloomFilter.h | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index e55db71e46..eff7eee733 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -74,6 +74,17 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) } +HashPolicy::Hasher::Hasher(size_t seed) + : h3_(seed) +{ +} + +HashPolicy::HashType +HashPolicy::Hasher::operator()(const void* x, size_t n) const + { + return h3_(x, n); + } + HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const { HashVector h(K(), 0); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 3b5d9efa71..65133621f9 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -96,7 +96,9 @@ protected: */ class Hasher { public: - HashType operator()(const void* x, size_t n) const { return h3_(x, n); } + Hasher(size_t seed); + + HashType operator()(const void* x, size_t n) const; private: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior From 79a6a26f9f70a937551a94a5dc83b2c5dafe1414 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 10:20:33 -0700 Subject: [PATCH 30/45] H3 does not check for zero length input. --- src/BloomFilter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index eff7eee733..6a44defc6d 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -82,7 +82,7 @@ HashPolicy::Hasher::Hasher(size_t seed) HashPolicy::HashType HashPolicy::Hasher::operator()(const void* x, size_t n) const { - return h3_(x, n); + return n == 0 ? 
0 : h3_(x, n); } HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const From 9f740642891664ee8f482285523969793d0063d0 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 14:02:14 -0700 Subject: [PATCH 31/45] Expose Bro's linear congruence PRNG as utility function. It was previously not possible to crank the wheel on the PRNG in a deterministic way without affecting the globally unique seed. The new extra utility function bro_prng takes a state in the form of a long int and returns the new PRNG state, now allowing arbitrary code parts to use the random number functionality. This commit also fixes a problem in the H3 constructor, which requires use of multiple seeds. The single seed passed in now serves as seed to crank out as many value needed using bro_prng. --- src/H3.h | 1 + src/util.cc | 29 ++++++++++++++++++----------- src/util.h | 7 +++++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/H3.h b/src/H3.h index 2eda14d276..e2dc865147 100644 --- a/src/H3.h +++ b/src/H3.h @@ -72,6 +72,7 @@ public: for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { bit_lookup[bit] = 0; + seed = bro_prng(seed); for ( size_t i = 0; i < sizeof(T)/2; i++ ) // assume random() returns at least 16 random bits bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); diff --git a/src/util.cc b/src/util.cc index 721ee10a7e..cdd257d94f 100644 --- a/src/util.cc +++ b/src/util.cc @@ -829,22 +829,29 @@ bool have_random_seed() return bro_rand_determistic; } +long int bro_prng(long int state) + { + // Use our own simple linear congruence PRNG to make sure we are + // predictable across platforms. + static const long int m = 2147483647; + static const long int a = 16807; + const long int q = m / a; + const long int r = m % a; + + state = a * ( state % q ) - r * ( state / q ); + + if ( state <= 0 ) + state += m; + + return state; + } + long int bro_random() { if ( ! bro_rand_determistic ) return random(); // Use system PRNG. 
- // Use our own simple linear congruence PRNG to make sure we are - // predictable across platforms. - const long int m = 2147483647; - const long int a = 16807; - const long int q = m / a; - const long int r = m % a; - - bro_rand_state = a * ( bro_rand_state % q ) - r * ( bro_rand_state / q ); - - if ( bro_rand_state <= 0 ) - bro_rand_state += m; + bro_rand_state = bro_prng(bro_rand_state); return bro_rand_state; } diff --git a/src/util.h b/src/util.h index c3eebb04e3..0af401c668 100644 --- a/src/util.h +++ b/src/util.h @@ -173,9 +173,12 @@ unsigned int initial_seed(); // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); +// A simple linear congruence PRNG. It takes its state as argument and returns +// a new random value, which can serve as state for subsequent calls. +long int bro_prng(long int state); + // Replacement for the system random(), to which is normally falls back -// except when a seed has been given. In that case, we use our own -// predictable PRNG. +// except when a seed has been given. In that case, the function bro_prng. long int bro_random(); // Calls the system srandom() function with the given seed if not running From 532fbfb4d27ac9ee733dbcfebccbc91e652d4eb0 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:06:02 -0700 Subject: [PATCH 32/45] Factor implementation and change interface. When constructing a Bloom filter, one now has to pass a HashPolicy instance to it. This separates more clearly the concerns of hashing and Bloom filter management. This commit also changes the interface to initialize Bloom filters: there exist now two initialization functions, one for each type: (1) bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter (2) bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter The BiFs for adding elements and performing lookups remain the same. 
This essentially gives us "BiF polymorphism" at script land, where the initialization BiF constructs the most derived type while subsequent BiFs adhere to the same interface. The reason why we split up the constructor in this case is that we have not yet derived the math that computes the optimal number of hash functions for counting Bloom filters---users have to explicitly parameterize them for now. --- src/BloomFilter.cc | 159 +++++--------------------- src/BloomFilter.h | 172 ++++------------------------- src/CMakeLists.txt | 2 + src/CounterVector.cc | 75 +++++++++++++ src/CounterVector.h | 78 +++++++++++++ src/HashPolicy.cc | 72 ++++++++++++ src/HashPolicy.h | 90 +++++++++++++++ src/OpaqueVal.cc | 1 + src/bro.bif | 57 ++++++---- testing/btest/bifs/bloomfilter.bro | 20 ++-- testing/btest/istate/opaque.bro | 2 +- 11 files changed, 409 insertions(+), 319 deletions(-) create mode 100644 src/CounterVector.cc create mode 100644 src/CounterVector.h create mode 100644 src/HashPolicy.cc create mode 100644 src/HashPolicy.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6a44defc6d..0be64c18de 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,117 +1,16 @@ #include "BloomFilter.h" #include +#include "CounterVector.h" #include "Serializer.h" -CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) - { - } - -CounterVector::~CounterVector() - { - delete bits_; - } - -bool CounterVector::Increment(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -bool CounterVector::Decrement(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -CounterVector::count_type CounterVector::Count(size_type cell) const - { - // TODO - assert(! 
"not yet implemented"); - return 0; - } - -CounterVector::size_type CounterVector::Size() const - { - return bits_->Blocks() / width_; - } - -bool CounterVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } - -CounterVector* CounterVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_COUNTERVECTOR)); - } - -IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) - -bool CounterVector::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! bits_->Serialize(info) ) - return false; - return SERIALIZE(static_cast(width_)); - } - -bool CounterVector::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(SerialObj); - bits_ = BitVector::Unserialize(info); - if ( ! bits_ ) - return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) - return false; - width_ = static_cast(width); - return true; - } - - -HashPolicy::Hasher::Hasher(size_t seed) - : h3_(seed) -{ -} - -HashPolicy::HashType -HashPolicy::Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 
0 : h3_(x, n); - } - -HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const - { - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - - -HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const - { - HashType h1 = hasher1_(x, n); - HashType h2 = hasher2_(x, n); - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - BloomFilter::BloomFilter() : hash_(NULL) { } -BloomFilter::BloomFilter(size_t k) - : hash_(new hash_policy(k)) +BloomFilter::BloomFilter(const HashPolicy* hash_policy) + : hash_(hash_policy) { } @@ -135,7 +34,11 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - return SERIALIZE(static_cast(hash_->K())); + // FIXME: Since we have a fixed hashing policy, we just serialize the + // information needed to reconstruct it. + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -144,10 +47,15 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) uint16 k; if ( ! UNSERIALIZE(&k) ) return false; - hash_ = new hash_policy(static_cast(k)); + const char* name; + if ( ! UNSERIALIZE_STR(&name, 0) ) + return false; + // FIXME: for now Bloom filters always use double hashing. + hash_ = new DefaultHashing(k, name); return true; } + size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); @@ -163,11 +71,9 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { + // TODO: Ensure that x and y use the same HashPolicy before proceeding. 
BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); - // TODO: implement the hasher pool and make sure the new result gets the same - // number of (equal) hash functions. - //assert(x->hash_ == y->hash_); return result; } @@ -176,16 +82,10 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) - : BloomFilter(K(M(fp, capacity), capacity)) +BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) + : BloomFilter(hash_policy), + bits_(new BitVector(cells)) { - bits_ = new BitVector(M(fp, capacity)); - } - -BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) - : BloomFilter(K(cells, capacity)) - { - bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -203,13 +103,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -230,17 +130,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), - capacity)) - { - cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); - } - -CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(cells, capacity)) +CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, + size_t cells, size_t width) + : BloomFilter(hash_policy) { cells_ = new CounterVector(width, cells); } @@ -261,18 +153,19 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { + // TODO: Use partitioning. CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 65133621f9..189f4920b7 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,141 +3,9 @@ #include #include "BitVector.h" -#include "Hash.h" -#include "H3.h" +#include "HashPolicy.h" -/** - * A vector of counters, each of which have a fixed number of bits. - */ -class CounterVector : public SerialObj { -public: - typedef size_t size_type; - typedef uint64 count_type; - - /** - * Constructs a counter vector having cells of a given width. - * - * @param width The number of bits that each cell occupies. 
- * - * @param cells The number of cells in the bitvector. - */ - CounterVector(size_t width, size_t cells = 1024); - - ~CounterVector(); - - /** - * Increments a given cell. - * - * @param cell The cell to increment. - * - * @param value The value to add to the current counter in *cell*. - * - * @return `true` if adding *value* to the counter in *cell* succeeded. - */ - bool Increment(size_type cell, count_type value); - - /** - * Decrements a given cell. - * - * @param cell The cell to decrement. - * - * @param value The value to subtract from the current counter in *cell*. - * - * @return `true` if subtracting *value* from the counter in *cell* succeeded. - */ - bool Decrement(size_type cell, count_type value); - - /** - * Retrieves the counter of a given cell. - * - * @param cell The cell index to retrieve the count for. - * - * @return The counter associated with *cell*. - */ - count_type Count(size_type cell) const; - - /** - * Retrieves the number of cells in the storage. - * - * @return The number of cells. - */ - size_type Size() const; - - bool Serialize(SerialInfo* info) const; - static CounterVector* Unserialize(UnserialInfo* info); - -protected: - DECLARE_SERIAL(CounterVector); - - CounterVector() { } - -private: - BitVector* bits_; - size_t width_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - * @tparam Codomain An integral type. - */ -class HashPolicy { -public: - typedef hash_t HashType; - typedef std::vector HashVector; - - virtual ~HashPolicy() { } - size_t K() const { return k_; } - virtual HashVector Hash(const void* x, size_t n) const = 0; - -protected: - /** - * A functor that computes a universal hash function. - * @tparam Codomain An integral type. - */ - class Hasher { - public: - Hasher(size_t seed); - - HashType operator()(const void* x, size_t n) const; - private: - // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in - // Hash.h. 
I do not know how this value impacts the hash function behavior - // so I'll just copy it verbatim. (Matthias) - H3 h3_; - }; - - HashPolicy(size_t k) : k_(k) { } - -private: - const size_t k_; -}; - -/** - * The *default* hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. - */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k) : HashPolicy(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; +class CounterVector; /** * The abstract base class for Bloom filters. @@ -146,8 +14,6 @@ class BloomFilter : public SerialObj { public: // At this point we won't let the user choose the hash policy, but we might // open up the interface in the future. - typedef DoubleHashing hash_policy; - virtual ~BloomFilter(); /** @@ -180,13 +46,19 @@ protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter(); - BloomFilter(size_t k); - virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; + /** + * Constructs a Bloom filter. + * + * @param hash_policy The hash policy to use for this Bloom filter. + */ + BloomFilter(const HashPolicy* hash_policy); + + virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; + virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; private: - HashPolicy* hash_; + const HashPolicy* hash_; }; /** @@ -223,24 +95,18 @@ public: static BasicBloomFilter* Merge(const BasicBloomFilter* x, const BasicBloomFilter* y); - /** - * Constructs a basic Bloom filter with a given false-positive rate and - * capacity. 
- */ - BasicBloomFilter(double fp, size_t capacity); - /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(size_t cells, size_t capacity); + BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: BitVector* bits_; @@ -254,16 +120,16 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(double fp, size_t capacity, size_t width); - CountingBloomFilter(size_t cells, size_t capacity, size_t width); + CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, + size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1537bb04b0..f2c7ce6bad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -255,6 +255,7 @@ set(bro_SRCS ChunkedIO.cc CompHash.cc Conn.cc + CounterVector.cc DFA.cc DbgBreakpoint.cc DbgHelp.cc @@ -278,6 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc + HashPolicy.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/CounterVector.cc b/src/CounterVector.cc new file mode 100644 index 0000000000..8ed4c30427 --- /dev/null +++ b/src/CounterVector.cc @@ -0,0 +1,75 @@ +#include "CounterVector.h" + +#include "BitVector.h" +#include "Serializer.h" + +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), 
width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! bits_->Serialize(info) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + diff --git a/src/CounterVector.h b/src/CounterVector.h new file mode 100644 index 0000000000..ecc8fe90e0 --- /dev/null +++ b/src/CounterVector.h @@ -0,0 +1,78 @@ +#ifndef CounterVector_h +#define CounterVector_h + +#include "SerialObj.h" + +class BitVector; + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : public SerialObj { +public: + typedef size_t size_type; + typedef uint64 count_type; + + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. 
+ * + * @param cells The number of cells in the bitvector. + */ + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector() { } + +private: + BitVector* bits_; + size_t width_; +}; + +#endif diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc new file mode 100644 index 0000000000..d6fb4f3da4 --- /dev/null +++ b/src/HashPolicy.cc @@ -0,0 +1,72 @@ +#include "HashPolicy.h" + +#include "digest.h" + +Hasher::Hasher(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::hash_type Hasher::operator()(const void* x, size_t n) const + { + return n == 0 ? 
0 : h_(x, n); + } + +size_t Hasher::compute_seed(size_t seed, const std::string& extra) + { + u_char digest[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, digest); + return *reinterpret_cast(digest); + } + + +HashPolicy::HashPolicy(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHashing::DefaultHashing(size_t k, const std::string& name) + : HashPolicy(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hashers_.push_back(Hasher(i, name)); + } + +HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const + { + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +DoubleHashing::DoubleHashing(size_t k, const std::string& name) + : HashPolicy(k, name), + hasher1_(1, name), + hasher2_(2, name) + { + } + +HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const + { + hash_type h1 = hasher1_(x, n); + hash_type h2 = hasher2_(x, n); + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/HashPolicy.h b/src/HashPolicy.h new file mode 100644 index 0000000000..4660bc0080 --- /dev/null +++ b/src/HashPolicy.h @@ -0,0 +1,90 @@ +#ifndef HashPolicy_h +#define HashPolicy_h + +#include "Hash.h" +#include "H3.h" + +/** + * A functor that computes a universal hash function. + */ +class Hasher { +public: + typedef hash_t hash_type; + + /** + * Constructs a hasher seeded by a given seed and optionally an extra + * descriptor. + * + * @param seed The seed to use. + * + * @param extra If not `NULL`, the hasher will not mix in the initial seed + * but instead use this NUL-terminated string as additional seed. 
+ */ + Hasher(size_t seed, const std::string& extra = ""); + + /** + * Computes the hash digest of contiguous data. + * + * @param x A pointer to the beginning of the byte sequence to hash. + * + * @param n The length of the sequence pointed to by *x*. + */ + hash_type operator()(const void* x, size_t n) const; + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; +}; + +/** + * The abstract base class for hash policies that hash elements *k* times. + */ +class HashPolicy { +public: + typedef Hasher::hash_type hash_type; + typedef std::vector hash_vector; + + virtual ~HashPolicy() { } + + virtual hash_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + HashPolicy(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const /* override */; + +private: + std::vector hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const; + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +#endif diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 9dd5c7f980..8b82916689 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,6 +605,7 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + assert( type_ ); if ( ! 
type_->Serialize(info) ) return false; return bloom_filter_->Serialize(info); diff --git a/src/bro.bif b/src/bro.bif index 9b80c90dbf..a89b808888 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4986,42 +4986,55 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr #include "BloomFilter.h" %%} -## Initializes a Bloom filter data structure. +## Creates a basic Bloom filter. ## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## -## max: The maximum counter value associated with each each element in the -## Bloom filter. If greater than 1, each element in the set has a counter of -## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then -## becomes a cell of size *w* bits. Since the number number of cells is a -## function ## of *fp* and *capacity*, it is important to consider the effects -## on space when tuning this value. +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. ## ## Returns: A Bloom filter handle. 
-function bloomfilter_init%(fp: double, capacity: count, - max: count &default=1%): opaque of bloomfilter +function bloomfilter_basic_init%(fp: double, capacity: count, + name: string &default=""%): opaque of bloomfilter %{ if ( fp < 0.0 || fp > 1.0 ) { reporter->Error("false-positive rate must take value between 0 and 1"); return NULL; } - BloomFilter* bf; - if ( max == 1 ) - { - bf = new BasicBloomFilter(fp, capacity); - } - else - { - uint16 width = 0; - while ( max >>= 1 ) - ++width; - bf = new CountingBloomFilter(fp, capacity, width); - } - return new BloomFilterVal(bf); + + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); + return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + %} + +## Creates a counting Bloom filter. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying counter vector. +## +## max: The maximum counter value associated with each each element described +## by *w = ceil(log_2(max))* bits. Each bit in the underlying counter vector +## becomes a cell of size *w* bits. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. +## +## Returns: A Bloom filter handle. +function bloomfilter_counting_init%(k: count, cells: count, max: count, + name: string &default=""%): opaque of bloomfilter + %{ + const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + uint16 width = 0; + while ( max >>= 1 ) + ++width; + return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); %} ## Adds an element to a Bloom filter. 
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 769cec1200..3ff6a6668e 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -4,7 +4,7 @@ event bro_init() { # Basic usage with counts. - local bf_cnt = bloomfilter_init(0.1, 1000); + local bf_cnt = bloomfilter_basic_init(0.1, 1000); bloomfilter_add(bf_cnt, 42); bloomfilter_add(bf_cnt, 84); bloomfilter_add(bf_cnt, 168); @@ -16,23 +16,23 @@ event bro_init() bloomfilter_add(bf_cnt, "foo"); # Type mismatch # Basic usage with strings. - local bf_str = bloomfilter_init(0.9, 10); + local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); # FP - print bloomfilter_lookup(bf_str, "qux"); # FP + print bloomfilter_lookup(bf_str, "b4z"); # FP + print bloomfilter_lookup(bf_str, "quux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch # Edge cases. - local bf_edge0 = bloomfilter_init(0.000000000001, 1); - local bf_edge1 = bloomfilter_init(0.00000001, 100000000); - local bf_edge2 = bloomfilter_init(0.9999999, 1); - local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_basic_init(0.9999999, 1); + local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000); # Invalid parameters. 
- local bf_bug0 = bloomfilter_init(-0.5, 42); - local bf_bug1 = bloomfilter_init(1.1, 42); + local bf_bug0 = bloomfilter_basic_init(-0.5, 42); + local bf_bug1 = bloomfilter_basic_init(1.1, 42); } diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index ac3b2c0874..b387f9d6bc 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -82,7 +82,7 @@ event bro_init() if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; - bloomfilter_handle = bloomfilter_init(0.1, 100); + bloomfilter_handle = bloomfilter_basic_init(0.1, 100); for ( e in bloomfilter_elements ) bloomfilter_add(bloomfilter_handle, e); } From 85668e7054dd22bc783a620eaf88b04f2e4bb952 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:16:44 -0700 Subject: [PATCH 33/45] Remove lingering debug code. --- src/bro.bif | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bro.bif b/src/bro.bif index a89b808888..7c81966317 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5009,7 +5009,6 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); - fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); return new BloomFilterVal(new BasicBloomFilter(hp, cells)); %} From e6e5f4926f5a850c773af05b51d7004fc4899a7c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:26:35 -0700 Subject: [PATCH 34/45] Create hash policies through factory. 
--- src/BloomFilter.cc | 5 +---- src/HashPolicy.cc | 5 +++++ src/HashPolicy.h | 7 +++++++ src/bro.bif | 4 ++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 0be64c18de..59d411d8e2 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -34,8 +34,6 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // FIXME: Since we have a fixed hashing policy, we just serialize the - // information needed to reconstruct it. if ( ! SERIALIZE(static_cast(hash_->K())) ) return false; return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); @@ -50,8 +48,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! UNSERIALIZE_STR(&name, 0) ) return false; - // FIXME: for now Bloom filters always use double hashing. - hash_ = new DefaultHashing(k, name); + hash_ = HashPolicy::Create(k, name); return true; } diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc index d6fb4f3da4..7ce754be3c 100644 --- a/src/HashPolicy.cc +++ b/src/HashPolicy.cc @@ -32,6 +32,11 @@ size_t Hasher::compute_seed(size_t seed, const std::string& extra) } +HashPolicy* HashPolicy::Create(size_t k, const std::string& name) + { + return new DefaultHashing(k, name); + } + HashPolicy::HashPolicy(size_t k, const std::string& name) : k_(k), name_(name) { diff --git a/src/HashPolicy.h b/src/HashPolicy.h index 4660bc0080..7bdb968bfe 100644 --- a/src/HashPolicy.h +++ b/src/HashPolicy.h @@ -42,6 +42,13 @@ private: */ class HashPolicy { public: + /** + * Constructs the hashing policy used by the implementation. This factory + * function exists because the HashingPolicy class hierachy is not yet + * serializable. 
+ */ + static HashPolicy* Create(size_t k, const std::string& name); + typedef Hasher::hash_type hash_type; typedef std::vector hash_vector; diff --git a/src/bro.bif b/src/bro.bif index 7c81966317..d0ce066139 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,7 +5008,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); return new BloomFilterVal(new BasicBloomFilter(hp, cells)); %} @@ -5029,7 +5029,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; From 273629de366290f411f381fe5970fc672adf465f Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 18 Jun 2013 10:23:07 -0700 Subject: [PATCH 35/45] Only serialize Bloom filter type if available. --- src/OpaqueVal.cc | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 8b82916689..5a673c4a40 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,9 +605,13 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - assert( type_ ); - if ( ! type_->Serialize(info) ) + + bool is_typed = type_ != NULL; + if ( ! SERIALIZE(is_typed) ) return false; + if ( is_typed && ! 
type_->Serialize(info) ) + return false; + return bloom_filter_->Serialize(info); } @@ -615,13 +619,16 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - type_ = BroType::Unserialize(info); - if ( ! type_ ) + bool is_typed; + if ( ! UNSERIALIZE(&is_typed) ) return false; - TypeList* tl = new TypeList(type_); - tl->Append(type_); - hash_ = new CompositeHash(tl); - Unref(tl); + if ( is_typed ) + { + BroType* type = BroType::Unserialize(info); + if ( ! Typify(type) ) + return false; + Unref(type); + } bloom_filter_ = BloomFilter::Unserialize(info); return bloom_filter_ != NULL; From 5f70452a9ac816346c4e480d8de52b213630b5b7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 18 Jun 2013 10:40:00 -0700 Subject: [PATCH 36/45] Small fixes and style tweaks. --- src/BitVector.cc | 2 +- src/BloomFilter.cc | 1 + src/OpaqueVal.h | 4 +--- src/Type.cc | 6 +++--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index f029230609..64db32131f 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -473,7 +473,7 @@ bool BitVector::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(bits_.size())) ) return false; - for (size_t i = 0; i < bits_.size(); ++i) + for ( size_t i = 0; i < bits_.size(); ++i ) if ( ! SERIALIZE(static_cast(bits_[i])) ) return false; diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 59d411d8e2..a7727630f7 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -49,6 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; hash_ = HashPolicy::Create(k, name); + delete [] name; return true; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 4b45cad519..2362fdacfc 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -139,9 +139,7 @@ private: { const T* a = dynamic_cast(x->bloom_filter_); const T* b = dynamic_cast(y->bloom_filter_); - if ( a && b ) - return new BloomFilterVal(T::Merge(a, b)); - return NULL; + return a && b ? new BloomFilterVal(T::Merge(a, b)) : NULL; } BroType* type_; diff --git a/src/Type.cc b/src/Type.cc index 6461bf2560..f19de461cd 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -1311,19 +1311,19 @@ IMPLEMENT_SERIAL(OpaqueType, SER_OPAQUE_TYPE); bool OpaqueType::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_OPAQUE_TYPE, BroType); - return SERIALIZE(name); + return SERIALIZE_STR(name.c_str(), name.size()); } bool OpaqueType::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BroType); - char const* n; + const char* n; if ( ! UNSERIALIZE_STR(&n, 0) ) return false; - name = n; delete [] n; + return true; } From 40201a180e54a560711003f2e65e14be87a7b8e9 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 9 Jul 2013 21:00:53 -0700 Subject: [PATCH 37/45] Fix for unserialization error. Because BloomFilter is a base class, with other classes derived from it, it needs special treatment. --- src/SerialTypes.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 9e4aef5b3b..85aed10bda 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -52,8 +52,6 @@ SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(BLOOMFILTER, 0x1700) -SERIAL_IS(BASICBLOOMFILTER, 0x1800) -SERIAL_IS(COUNTINGBLOOMFILTER, 0x1900) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -203,6 +201,11 @@ SERIAL_FUNC(BRO_FUNC, 2) SERIAL_FUNC(DEBUG_FUNC, 3) SERIAL_FUNC(BUILTIN_FUNC, 4) +#define SERIAL_BLOOMFILTER(name, val) SERIAL_CONST(name, val, BLOOMFILTER) +SERIAL_BLOOMFILTER(BLOOMFILTER, 1) +SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) +SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) @@ -210,8 +213,5 @@ SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) -SERIAL_CONST2(BLOOMFILTER) -SERIAL_CONST2(BASICBLOOMFILTER) -SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif From 446344ae998e8eef30a0f45a05dcea29efe4f032 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 10 Jul 2013 01:32:59 -0700 Subject: [PATCH 38/45] Add missing include for GCC. --- src/BloomFilter.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index a7727630f7..c59092b1e4 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,6 +1,7 @@ #include "BloomFilter.h" #include +#include #include "CounterVector.h" #include "Serializer.h" From fd2e155d1af26086d40e12d38f564b7954f4597e Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 17:34:25 +0200 Subject: [PATCH 39/45] Tweak hasher interface. 
--- src/BloomFilter.cc | 34 +++++++------- src/BloomFilter.h | 31 +++++++------ src/CMakeLists.txt | 2 +- src/HashPolicy.cc | 77 -------------------------------- src/HashPolicy.h | 97 ---------------------------------------- src/Hasher.cc | 79 ++++++++++++++++++++++++++++++++ src/Hasher.h | 109 +++++++++++++++++++++++++++++++++++++++++++++ src/bro.bif | 8 ++-- 8 files changed, 225 insertions(+), 212 deletions(-) delete mode 100644 src/HashPolicy.cc delete mode 100644 src/HashPolicy.h create mode 100644 src/Hasher.cc create mode 100644 src/Hasher.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c59092b1e4..f399bddeca 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -6,19 +6,19 @@ #include "Serializer.h" BloomFilter::BloomFilter() - : hash_(NULL) + : hasher_(NULL) { } -BloomFilter::BloomFilter(const HashPolicy* hash_policy) - : hash_(hash_policy) +BloomFilter::BloomFilter(const Hasher* hasher) + : hasher_(hasher) { } BloomFilter::~BloomFilter() { - if ( hash_ ) - delete hash_; + if ( hasher_ ) + delete hasher_; } bool BloomFilter::Serialize(SerialInfo* info) const @@ -35,9 +35,9 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) + if ( ! SERIALIZE(static_cast(hasher_->K())) ) return false; - return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); + return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -49,7 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; - hash_ = HashPolicy::Create(k, name); + hasher_ = Hasher::Create(k, name); delete [] name; return true; } @@ -70,7 +70,7 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same HashPolicy before proceeding. + // TODO: Ensure that x and y use the same Hasher before proceeding. BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; @@ -81,8 +81,8 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) - : BloomFilter(hash_policy), +BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) + : BloomFilter(hasher), bits_(new BitVector(cells)) { } @@ -102,13 +102,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -129,9 +129,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, +CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hash_policy) + : BloomFilter(hasher) { cells_ = new CounterVector(width, cells); } @@ -152,13 +152,13 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 189f4920b7..92f15c6070 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,7 +3,7 @@ #include #include "BitVector.h" -#include "HashPolicy.h" +#include "Hasher.h" class CounterVector; @@ -12,7 +12,7 @@ class CounterVector; */ class BloomFilter : public SerialObj { public: - // At this point we won't let the user choose the hash policy, but we might + // At this point we won't let the user choose the hasher, but we might // open up the interface in the future. virtual ~BloomFilter(); @@ -23,7 +23,7 @@ public: template void Add(const T& x) { - AddImpl(hash_->Hash(&x, sizeof(x))); + AddImpl((*hasher_)(x)); } /** @@ -36,7 +36,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(&x, sizeof(x))); + return CountImpl((*hasher_)(x)); } bool Serialize(SerialInfo* info) const; @@ -50,15 +50,15 @@ protected: /** * Constructs a Bloom filter. * - * @param hash_policy The hash policy to use for this Bloom filter. + * @param hasher The hasher to use for this Bloom filter. 
*/ - BloomFilter(const HashPolicy* hash_policy); + BloomFilter(const Hasher* hasher); - virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; private: - const HashPolicy* hash_; + const Hasher* hasher_; }; /** @@ -98,15 +98,15 @@ public: /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); + BasicBloomFilter(const Hasher* hasher, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: BitVector* bits_; @@ -120,16 +120,15 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, - size_t width); + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f2c7ce6bad..87a3db3b62 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -279,7 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc - HashPolicy.cc + Hasher.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc deleted file mode 100644 index 
7ce754be3c..0000000000 --- a/src/HashPolicy.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "HashPolicy.h" - -#include "digest.h" - -Hasher::Hasher(size_t seed, const std::string& extra) - : h_(compute_seed(seed, extra)) - { - } - -Hasher::hash_type Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 0 : h_(x, n); - } - -size_t Hasher::compute_seed(size_t seed, const std::string& extra) - { - u_char digest[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - else - { - sha256_update(&ctx, extra.c_str(), extra.size()); - } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, digest); - return *reinterpret_cast(digest); - } - - -HashPolicy* HashPolicy::Create(size_t k, const std::string& name) - { - return new DefaultHashing(k, name); - } - -HashPolicy::HashPolicy(size_t k, const std::string& name) - : k_(k), name_(name) - { - } - -DefaultHashing::DefaultHashing(size_t k, const std::string& name) - : HashPolicy(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hashers_.push_back(Hasher(i, name)); - } - -HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const - { - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - -DoubleHashing::DoubleHashing(size_t k, const std::string& name) - : HashPolicy(k, name), - hasher1_(1, name), - hasher2_(2, name) - { - } - -HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const - { - hash_type h1 = hasher1_(x, n); - hash_type h2 = hasher2_(x, n); - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - diff --git a/src/HashPolicy.h b/src/HashPolicy.h deleted file mode 100644 index 7bdb968bfe..0000000000 --- a/src/HashPolicy.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef HashPolicy_h -#define HashPolicy_h - -#include "Hash.h" 
-#include "H3.h" - -/** - * A functor that computes a universal hash function. - */ -class Hasher { -public: - typedef hash_t hash_type; - - /** - * Constructs a hasher seeded by a given seed and optionally an extra - * descriptor. - * - * @param seed The seed to use. - * - * @param extra If not `NULL`, the hasher will not mix in the initial seed - * but instead use this NUL-terminated string as additional seed. - */ - Hasher(size_t seed, const std::string& extra = ""); - - /** - * Computes the hash digest of contiguous data. - * - * @param x A pointer to the beginning of the byte sequence to hash. - * - * @param n The length of the sequence pointed to by *x*. - */ - hash_type operator()(const void* x, size_t n) const; - -private: - static size_t compute_seed(size_t seed, const std::string& extra); - - H3 h_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - */ -class HashPolicy { -public: - /** - * Constructs the hashing policy used by the implementation. This factory - * function exists because the HashingPolicy class hierachy is not yet - * serializable. - */ - static HashPolicy* Create(size_t k, const std::string& name); - - typedef Hasher::hash_type hash_type; - typedef std::vector hash_vector; - - virtual ~HashPolicy() { } - - virtual hash_vector Hash(const void* x, size_t n) const = 0; - - size_t K() const { return k_; } - const std::string& Name() const { return name_; } - -protected: - HashPolicy(size_t k, const std::string& name); - -private: - const size_t k_; - std::string name_; -}; - -/** - * The default hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const /* override */; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. 
- */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; - -#endif diff --git a/src/Hasher.cc b/src/Hasher.cc new file mode 100644 index 0000000000..045adcd174 --- /dev/null +++ b/src/Hasher.cc @@ -0,0 +1,79 @@ +#include "Hasher.h" + +#include "digest.h" + +Hasher::UHF::UHF(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } + +size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. 
+ return *reinterpret_cast(buf); + } + + +Hasher* Hasher::Create(size_t k, const std::string& name) + { + return new DefaultHasher(k, name); + } + +Hasher::Hasher(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHasher::DefaultHasher(size_t k, const std::string& name) + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } + +Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DoubleHasher::DoubleHasher(size_t k, const std::string& name) + : Hasher(k, name), + h1_(1, name), + h2_(2, name) + { + } + +Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/Hasher.h b/src/Hasher.h new file mode 100644 index 0000000000..8d0af6b03f --- /dev/null +++ b/src/Hasher.h @@ -0,0 +1,109 @@ +#ifndef Hasher_h +#define Hasher_h + +#include "Hash.h" +#include "H3.h" + +/** + * The abstract base class for hashers, i.e., constructs which hash elements + * *k* times. + */ +class Hasher { +public: + typedef hash_t digest; + typedef std::vector digest_vector; + + /** + * Constructs the hashing policy used by the implementation. + * + * @todo This factory function exists because the HashingPolicy class + * hierachy is not yet serializable. + */ + static Hasher* Create(size_t k, const std::string& name); + + virtual ~Hasher() { } + + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + /** + * A universal hash function family. 
+ */ + class UHF { + public: + /** + * Constructs an H3 hash function seeded with a given seed and an optional + * extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this parameter replaces the initial seed to + * compute the seed for t to compute the + * seed + * NUL-terminated string as additional seed. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + digest hash(const void* x, size_t n) const; + + private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; + }; + + Hasher(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHasher : public Hasher { +public: + DefaultHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + std::vector hash_functions_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions. 
+ */ +class DoubleHasher : public Hasher { +public: + DoubleHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + UHF h1_; + UHF h2_; +}; + +#endif diff --git a/src/bro.bif b/src/bro.bif index d0ce066139..71f8c0716f 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,8 +5008,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); - return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} ## Creates a counting Bloom filter. @@ -5029,11 +5029,11 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); + const Hasher* h = Hasher::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; - return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); %} ## Adds an element to a Bloom filter. From 79a2e4b5d5c28076a8db1857d3ea6a8891e1ef7c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 22:41:48 +0200 Subject: [PATCH 40/45] Implement missing CounterVector functions. 
--- src/CounterVector.cc | 66 ++++++++++++++++++++++++++++++++++++++------ src/CounterVector.h | 15 ++++++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 8ed4c30427..a661492313 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -1,5 +1,6 @@ #include "CounterVector.h" +#include #include "BitVector.h" #include "Serializer.h" @@ -15,23 +16,66 @@ CounterVector::~CounterVector() bool CounterVector::Increment(size_type cell, count_type value) { - // TODO - assert(! "not yet implemented"); + assert(cell < Size()); + assert(value != 0); + size_t lsb = cell * width_; + if (value >= Max()) + { + bool r = false; + for (size_t i = 0; i < width_; ++i) + if (! (*bits_)[lsb + i]) + { + bits_->Set(lsb + i); + if (! r) + r = true; + } + return r; + } + bool carry = false; + for (size_t i = 0; i < width_; ++i) + { + bool b1 = (*bits_)[lsb + i]; + bool b2 = value & (1 << i); + (*bits_)[lsb + i] ^= b2 != carry; // bit1 ^ bit2 ^ carry + carry = carry ? b1 || b2 : b1 && b2; + } + if (! carry) + return true; + for (size_t i = 0; i < width_; ++i) + bits_->Set(lsb + i); return false; } bool CounterVector::Decrement(size_type cell, count_type value) { - // TODO - assert(! "not yet implemented"); - return false; + assert(cell < Size()); + size_t lsb = cell * width_; + bool success; + while (value --> 0) + { + success = false; + for (size_t i = lsb; i < lsb + width_; ++i) + if ((*bits_)[i]) + { + bits_->Reset(i); + while (i && i > lsb) + bits_->Set(--i); + success = true; + break; + } + } + return success; } CounterVector::count_type CounterVector::Count(size_type cell) const { - // TODO - assert(! 
"not yet implemented"); - return 0; + assert(cell < Size()); + size_t cnt = 0, order = 1; + size_t lsb = cell * width_; + for (size_t i = lsb; i < lsb + width_; ++i, order <<= 1) + if ((*bits_)[i]) + cnt |= order; + return cnt; } CounterVector::size_type CounterVector::Size() const @@ -39,6 +83,12 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +size_t CounterVector::Max() const + { + return std::numeric_limits::max() + >> (std::numeric_limits::digits - width_); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/CounterVector.h b/src/CounterVector.h index ecc8fe90e0..868beaca9b 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -19,6 +19,8 @@ public: * @param width The number of bits that each cell occupies. * * @param cells The number of cells in the bitvector. + * + * @pre `cells > 0 && width > 0` */ CounterVector(size_t width, size_t cells = 1024); @@ -32,6 +34,8 @@ public: * @param value The value to add to the current counter in *cell*. * * @return `true` if adding *value* to the counter in *cell* succeeded. + * + * @pre `cell < Size()` */ bool Increment(size_type cell, count_type value); @@ -43,6 +47,8 @@ public: * @param value The value to subtract from the current counter in *cell*. * * @return `true` if subtracting *value* from the counter in *cell* succeeded. + * + * @pre `cell < Size()` */ bool Decrement(size_type cell, count_type value); @@ -52,6 +58,8 @@ public: * @param cell The cell index to retrieve the count for. * * @return The counter associated with *cell*. + * + * @pre `cell < Size()` */ count_type Count(size_type cell) const; @@ -62,6 +70,13 @@ public: */ size_type Size() const; + /** + * Computes the maximum counter value. + * + * @return The maximum counter value based on the width. 
+ */ + size_t Max() const; + bool Serialize(SerialInfo* info) const; static CounterVector* Unserialize(UnserialInfo* info); From 7a0240694ec69506b0789029ba48bb56ae703206 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 14:07:47 +0200 Subject: [PATCH 41/45] Fix and test counting Bloom filter. --- src/BloomFilter.cc | 9 ++++--- src/CounterVector.cc | 5 ++-- src/CounterVector.h | 4 +-- src/bro.bif | 8 +++++- .../btest/Baseline/bifs.bloomfilter/output | 6 +++++ testing/btest/bifs/bloomfilter.bro | 26 ++++++++++++++++++- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index f399bddeca..3c7bac80f1 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -131,9 +131,9 @@ CountingBloomFilter::CountingBloomFilter() CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hasher) + : BloomFilter(hasher), + cells_(new CounterVector(width, cells)) { - cells_ = new CounterVector(width, cells); } @@ -152,10 +152,12 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } +// TODO: Use partitioning in add/count to allow for reusing CMS bounds. + void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % cells_->Size(), 1); + cells_->Increment(h[i] % cells_->Size()); } size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const @@ -164,7 +166,6 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - // TODO: Use partitioning. 
CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/CounterVector.cc b/src/CounterVector.cc index a661492313..831b95386f 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -5,7 +5,8 @@ #include "Serializer.h" CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) + : bits_(new BitVector(width * cells)), + width_(width) { } @@ -80,7 +81,7 @@ CounterVector::count_type CounterVector::Count(size_type cell) const CounterVector::size_type CounterVector::Size() const { - return bits_->Blocks() / width_; + return bits_->Size() / width_; } size_t CounterVector::Max() const diff --git a/src/CounterVector.h b/src/CounterVector.h index 868beaca9b..2d99bb44d8 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -37,7 +37,7 @@ public: * * @pre `cell < Size()` */ - bool Increment(size_type cell, count_type value); + bool Increment(size_type cell, count_type value = 1); /** * Decrements a given cell. @@ -50,7 +50,7 @@ public: * * @pre `cell < Size()` */ - bool Decrement(size_type cell, count_type value); + bool Decrement(size_type cell, count_type value = 1); /** * Retrieves the counter of a given cell. 
diff --git a/src/bro.bif b/src/bro.bif index 71f8c0716f..a33a2248dd 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5029,8 +5029,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ + if ( max == 0 ) + { + reporter->Error("max counter value must be greater than 0"); + return NULL; + } + const Hasher* h = Hasher::Create(k, name->CheckString()); - uint16 width = 0; + uint16 width = 1; while ( max >>= 1 ) ++width; return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 65aaa8b07c..80847a81b9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -6,3 +6,9 @@ 1 1 1 +1 +2 +3 +3 +2 +3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3ff6a6668e..ab0bf86c22 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -1,7 +1,7 @@ # @TEST-EXEC: bro -b %INPUT >output # @TEST-EXEC: btest-diff output -event bro_init() +function test_basic_bloom_filter() { # Basic usage with counts. 
local bf_cnt = bloomfilter_basic_init(0.1, 1000); @@ -36,3 +36,27 @@ event bro_init() local bf_bug0 = bloomfilter_basic_init(-0.5, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42); } + +function test_counting_bloom_filter() + { + local bf = bloomfilter_counting_init(3, 16, 3); + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 1 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 2 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 3 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # still 3 + + bloomfilter_add(bf, "bar"); + bloomfilter_add(bf, "bar"); + print bloomfilter_lookup(bf, "bar"); # 2 + print bloomfilter_lookup(bf, "foo"); # still 3 + } + +event bro_init() + { + test_basic_bloom_filter(); + test_counting_bloom_filter(); + } From a3c61fe7eb6c43622de17df0e818def20cab7e90 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 15:39:13 +0200 Subject: [PATCH 42/45] Use half adder for bitwise addition and subtraction. --- src/CounterVector.cc | 53 +++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 831b95386f..f46fae1b98 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -20,52 +20,35 @@ bool CounterVector::Increment(size_type cell, count_type value) assert(cell < Size()); assert(value != 0); size_t lsb = cell * width_; - if (value >= Max()) - { - bool r = false; - for (size_t i = 0; i < width_; ++i) - if (! (*bits_)[lsb + i]) - { - bits_->Set(lsb + i); - if (! r) - r = true; - } - return r; - } bool carry = false; - for (size_t i = 0; i < width_; ++i) - { + for ( size_t i = 0; i < width_; ++i ) + { bool b1 = (*bits_)[lsb + i]; bool b2 = value & (1 << i); - (*bits_)[lsb + i] ^= b2 != carry; // bit1 ^ bit2 ^ carry - carry = carry ? b1 || b2 : b1 && b2; - } - if (! 
carry) - return true; - for (size_t i = 0; i < width_; ++i) - bits_->Set(lsb + i); - return false; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + if ( carry ) + for ( size_t i = 0; i < width_; ++i ) + bits_->Set(lsb + i); + return ! carry; } bool CounterVector::Decrement(size_type cell, count_type value) { assert(cell < Size()); + assert(value != 0); + value = ~value + 1; // A - B := A + ~B + 1 + bool carry = false; size_t lsb = cell * width_; - bool success; - while (value --> 0) + for ( size_t i = 0; i < width_; ++i ) { - success = false; - for (size_t i = lsb; i < lsb + width_; ++i) - if ((*bits_)[i]) - { - bits_->Reset(i); - while (i && i > lsb) - bits_->Set(--i); - success = true; - break; - } + bool b1 = bits_[lsb + i]; + bool b2 = value & (1 << i); + bits_[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); } - return success; + return carry; } CounterVector::count_type CounterVector::Count(size_type cell) const From 9c2f57a9d9d5667d05e43efd3c8541ff9d33382a Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 16:36:54 +0200 Subject: [PATCH 43/45] Make counter vectors mergeable. 
--- src/CounterVector.cc | 42 ++++++++++++++++++++++++++++++++++++++++-- src/CounterVector.h | 27 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index f46fae1b98..75c62b208a 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -43,9 +43,9 @@ bool CounterVector::Decrement(size_type cell, count_type value) size_t lsb = cell * width_; for ( size_t i = 0; i < width_; ++i ) { - bool b1 = bits_[lsb + i]; + bool b1 = (*bits_)[lsb + i]; bool b2 = value & (1 << i); - bits_[lsb + i] = b1 ^ b2 ^ carry; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); } return carry; @@ -67,12 +67,50 @@ CounterVector::size_type CounterVector::Size() const return bits_->Size() / width_; } +size_t CounterVector::Width() const + { + return width_; + } + size_t CounterVector::Max() const { return std::numeric_limits::max() >> (std::numeric_limits::digits - width_); } +CounterVector& CounterVector::Merge(const CounterVector& other) + { + assert(Size() == other.Size()); + assert(Width() == other.Width()); + for ( size_t cell = 0; cell < Size(); ++cell ) + { + size_t lsb = cell * width_; + bool carry = false; + for ( size_t i = 0; i < width_; ++i ) + { + bool b1 = (*bits_)[lsb + i]; + bool b2 = (*other.bits_)[lsb + i]; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + if ( carry ) + for ( size_t i = 0; i < width_; ++i ) + bits_->Set(lsb + i); + } + return *this; + } + +CounterVector& CounterVector::operator|=(const CounterVector& other) +{ + return Merge(other); +} + +CounterVector operator|(const CounterVector& x, const CounterVector& y) +{ + CounterVector cv(x); + return cv |= y; +} + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/CounterVector.h b/src/CounterVector.h index 2d99bb44d8..4ab221ff6b 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h 
@@ -70,6 +70,13 @@ public: */ size_type Size() const; + /** + * Retrieves the counter width. + * + * @return The number of bits per counter. + */ + size_t Width() const; + /** * Computes the maximum counter value. * @@ -77,6 +84,26 @@ public: */ size_t Max() const; + /** + * Merges another counter vector into this instance by *adding* the counters + * of each cells. + * + * @param other The counter vector to merge into this instance. + * + * @return A reference to `*this`. + * + * @pre `Size() == other.Size() && Width() == other.Width()` + */ + CounterVector& Merge(const CounterVector& other); + + /** + * An alias for ::Merge. + */ + CounterVector& operator|=(const CounterVector& other); + + friend CounterVector operator|(const CounterVector& x, + const CounterVector& y); + bool Serialize(SerialInfo* info) const; static CounterVector* Unserialize(UnserialInfo* info); From eb64f5f9616e84295bc17537e8db57ae4f089c41 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 18:03:55 +0200 Subject: [PATCH 44/45] Make hash functions equality comparable. --- src/H3.h | 12 ++++++ src/Hasher.cc | 101 +++++++++++++++++++++++++++++++------------------- src/Hasher.h | 18 +++++++++ 3 files changed, 93 insertions(+), 38 deletions(-) diff --git a/src/H3.h b/src/H3.h index e2dc865147..123dd6f374 100644 --- a/src/H3.h +++ b/src/H3.h @@ -58,6 +58,7 @@ #define H3_H #include +#include // The number of values representable by a byte. #define H3_BYTE_RANGE (UCHAR_MAX+1) @@ -112,6 +113,17 @@ public: return result; } + + friend bool operator==(const H3& x, const H3& y) + { + return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE); + } + + friend bool operator!=(const H3& x, const H3& y) + { + return ! 
(x == y); + } + private: T byte_lookup[N][H3_BYTE_RANGE]; }; diff --git a/src/Hasher.cc b/src/Hasher.cc index 045adcd174..7a8d9a67e0 100644 --- a/src/Hasher.cc +++ b/src/Hasher.cc @@ -8,56 +8,69 @@ Hasher::UHF::UHF(size_t seed, const std::string& extra) } Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const - { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h_(x, n); - } + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) - { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) { unsigned int first_seed = initial_seed(); sha256_update(&ctx, &first_seed, sizeof(first_seed)); } else { - sha256_update(&ctx, extra.c_str(), extra.size()); + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. + return *reinterpret_cast(buf); } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - // Take the first sizeof(size_t) bytes as seed. 
- return *reinterpret_cast(buf); - } Hasher* Hasher::Create(size_t k, const std::string& name) - { - return new DefaultHasher(k, name); - } + { + return new DefaultHasher(k, name); + } Hasher::Hasher(size_t k, const std::string& name) - : k_(k), name_(name) + : k_(k), name_(name) { } DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions_.push_back(UHF(i, name)); - } + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const - { - digest_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hash_functions_[i](x, n); - return h; - } + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DefaultHasher* DefaultHasher::Clone() const + { + return new DefaultHasher(*this); + } + +bool DefaultHasher::Equals(const Hasher* other) const /* final */ + { + if ( typeid(*this) != typeid(*other) ) + return false; + const DefaultHasher* o = static_cast(other); + return hash_functions_ == o->hash_functions_; + } DoubleHasher::DoubleHasher(size_t k, const std::string& name) : Hasher(k, name), @@ -67,13 +80,25 @@ DoubleHasher::DoubleHasher(size_t k, const std::string& name) } Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const - { - digest h1 = h1_(x, n); - digest h2 = h2_(x, n); - digest_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } +DoubleHasher* DoubleHasher::Clone() const + { + return new DoubleHasher(*this); + } + +bool DoubleHasher::Equals(const Hasher* other) const /* final */ + { + if ( typeid(*this) != typeid(*other) ) + return false; + const DoubleHasher* o = 
static_cast(other); + return h1_ == o->h1_ && h2_ == o->h2_; + } diff --git a/src/Hasher.h b/src/Hasher.h index 8d0af6b03f..12393e7217 100644 --- a/src/Hasher.h +++ b/src/Hasher.h @@ -31,6 +31,10 @@ public: virtual digest_vector Hash(const void* x, size_t n) const = 0; + virtual Hasher* Clone() const = 0; + + virtual bool Equals(const Hasher* other) const = 0; + size_t K() const { return k_; } const std::string& Name() const { return name_; } @@ -64,6 +68,16 @@ protected: return hash(x, n); } + friend bool operator==(const UHF& x, const UHF& y) + { + return x.h_ == y.h_; + } + + friend bool operator!=(const UHF& x, const UHF& y) + { + return ! (x == y); + } + digest hash(const void* x, size_t n) const; private: @@ -87,6 +101,8 @@ public: DefaultHasher(size_t k, const std::string& name); virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DefaultHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: std::vector hash_functions_; @@ -100,6 +116,8 @@ public: DoubleHasher(size_t k, const std::string& name); virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DoubleHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: UHF h1_; From a39f980cd493e64a6bb4016c47923e8754b059dc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 18:11:12 +0200 Subject: [PATCH 45/45] Implement and test Bloom filter merging. 
--- src/BloomFilter.cc | 22 ++++++++++++++---- src/BloomFilter.h | 1 - src/CounterVector.cc | 6 +++++ src/CounterVector.h | 8 +++++++ src/Hasher.cc | 4 ++-- src/OpaqueVal.cc | 2 +- src/OpaqueVal.h | 21 ++++++++++++++--- .../btest/Baseline/bifs.bloomfilter/output | 7 ++++++ testing/btest/bifs/bloomfilter.bro | 23 ++++++++++++++++++- 9 files changed, 81 insertions(+), 13 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 3c7bac80f1..889c7bafe1 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -70,8 +70,13 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same Hasher before proceeding. + if ( ! x->hasher_->Equals(y->hasher_) ) + { + reporter->InternalError("incompatible hashers during Bloom filter merge"); + return NULL; + } BasicBloomFilter* result = new BasicBloomFilter(); + result->hasher_ = x->hasher_->Clone(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; } @@ -119,10 +124,17 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, const CountingBloomFilter* y) -{ - assert(! "not yet implemented"); - return NULL; -} + { + if ( ! 
x->hasher_->Equals(y->hasher_) ) + { + reporter->InternalError("incompatible hashers during Bloom filter merge"); + return NULL; + } + CountingBloomFilter* result = new CountingBloomFilter(); + result->hasher_ = x->hasher_->Clone(); + result->cells_ = new CounterVector(*x->cells_ | *y->cells_); + return result; + } CountingBloomFilter::CountingBloomFilter() : cells_(NULL) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 92f15c6070..070aa2dc25 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -57,7 +57,6 @@ protected: virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; -private: const Hasher* hasher_; }; diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 75c62b208a..cf3083de9e 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -10,6 +10,12 @@ CounterVector::CounterVector(size_t width, size_t cells) { } +CounterVector::CounterVector(const CounterVector& other) + : bits_(new BitVector(*other.bits_)), + width_(other.width_) + { + } + CounterVector::~CounterVector() { delete bits_; diff --git a/src/CounterVector.h b/src/CounterVector.h index 4ab221ff6b..eced5956d4 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -9,6 +9,7 @@ class BitVector; * A vector of counters, each of which have a fixed number of bits. */ class CounterVector : public SerialObj { + CounterVector& operator=(const CounterVector&); public: typedef size_t size_type; typedef uint64 count_type; @@ -24,6 +25,13 @@ public: */ CounterVector(size_t width, size_t cells = 1024); + /** + * Copy-constructs a counter vector. + * + * @param other The counter vector to copy. 
+ */ + CounterVector(const CounterVector& other); + ~CounterVector(); /** diff --git a/src/Hasher.cc b/src/Hasher.cc index 7a8d9a67e0..2a889c7e09 100644 --- a/src/Hasher.cc +++ b/src/Hasher.cc @@ -64,7 +64,7 @@ DefaultHasher* DefaultHasher::Clone() const return new DefaultHasher(*this); } -bool DefaultHasher::Equals(const Hasher* other) const /* final */ +bool DefaultHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; @@ -94,7 +94,7 @@ DoubleHasher* DoubleHasher::Clone() const return new DoubleHasher(*this); } -bool DoubleHasher::Equals(const Hasher* other) const /* final */ +bool DoubleHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 5a673c4a40..36038d679a 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,6 +1,5 @@ #include "OpaqueVal.h" -#include "BloomFilter.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -587,6 +586,7 @@ BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, else if ( (result = DoMerge(x, y)) ) return result; + reporter->InternalError("failed to merge Bloom filters"); return NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 2362fdacfc..22c3dbfade 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -3,6 +3,7 @@ #ifndef OPAQUEVAL_H #define OPAQUEVAL_H +#include "BloomFilter.h" #include "RandTest.h" #include "Val.h" #include "digest.h" @@ -137,9 +138,23 @@ private: static BloomFilterVal* DoMerge(const BloomFilterVal* x, const BloomFilterVal* y) { - const T* a = dynamic_cast(x->bloom_filter_); - const T* b = dynamic_cast(y->bloom_filter_); - return a && b ? 
new BloomFilterVal(T::Merge(a, b)) : NULL; + if ( typeid(*x->bloom_filter_) != typeid(*y->bloom_filter_) ) + { + reporter->InternalError("cannot merge different Bloom filter types"); + return NULL; + } + if ( typeid(T) != typeid(*x->bloom_filter_) ) + return NULL; + const T* a = static_cast(x->bloom_filter_); + const T* b = static_cast(y->bloom_filter_); + BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); + assert(merged); + if ( ! merged->Typify(x->Type()) ) + { + reporter->InternalError("failed to set type on merged Bloom filter"); + return NULL; + } + return merged; } BroType* type_; diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 80847a81b9..4fe2ae1ecc 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -7,8 +7,15 @@ 1 1 1 +1 +1 +1 +1 2 3 3 2 3 +3 +3 +2 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index ab0bf86c22..f69ddbda0c 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -35,11 +35,21 @@ function test_basic_bloom_filter() # Invalid parameters. 
local bf_bug0 = bloomfilter_basic_init(-0.5, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42); + + # Merging + local bf_cnt2 = bloomfilter_basic_init(0.1, 1000); + bloomfilter_add(bf_cnt2, 42); + bloomfilter_add(bf_cnt, 100); + local bf_merged = bloomfilter_merge(bf_cnt, bf_cnt2); + print bloomfilter_lookup(bf_merged, 42); + print bloomfilter_lookup(bf_merged, 84); + print bloomfilter_lookup(bf_merged, 100); + print bloomfilter_lookup(bf_merged, 168); } function test_counting_bloom_filter() { - local bf = bloomfilter_counting_init(3, 16, 3); + local bf = bloomfilter_counting_init(3, 32, 3); bloomfilter_add(bf, "foo"); print bloomfilter_lookup(bf, "foo"); # 1 bloomfilter_add(bf, "foo"); @@ -49,10 +59,21 @@ function test_counting_bloom_filter() bloomfilter_add(bf, "foo"); print bloomfilter_lookup(bf, "foo"); # still 3 + bloomfilter_add(bf, "bar"); bloomfilter_add(bf, "bar"); print bloomfilter_lookup(bf, "bar"); # 2 print bloomfilter_lookup(bf, "foo"); # still 3 + + # Merging + local bf2 = bloomfilter_counting_init(3, 32, 3); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "bar"); + local bf_merged = bloomfilter_merge(bf, bf2); + print bloomfilter_lookup(bf_merged, "foo"); + print bloomfilter_lookup(bf_merged, "bar"); + print bloomfilter_lookup(bf_merged, "baz"); } event bro_init()