Mirror of https://github.com/zeek/zeek.git (synced 2025-10-07 17:18:20 +00:00)
Reformat Zeek in Spicy style
This largely copies over Spicy's `.clang-format` configuration file. The one place where we deviate is header include order, since Zeek depends on headers being included in a certain order.
parent 7b8e7ed72c
commit f5a76c1aed
786 changed files with 131714 additions and 153609 deletions
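
As a rough illustration of what this reformatting changes, here is a hypothetical function shown in both layouts (the function is made up for demonstration; only the brace placement and indentation matter). The hunks below show the affected regions in the new layout.

// Old Zeek layout, which this commit replaces: opening and closing braces on
// their own lines, indented together with the body.
int TwiceOrZero(int x)
    {
    if ( x > 0 )
        return 2 * x;

    return 0;
    }

// New Spicy/clang-format layout used throughout the reformatted code below:
// attached opening braces and four-space indentation.
int TwiceOrZero(int x) {
    if ( x > 0 )
        return 2 * x;

    return 0;
}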
@@ -7,346 +7,338 @@
#include <memory>
#include <vector>

namespace broker {
class data;
}

namespace zeek::probabilistic::detail {

/**
 * A vector of bits.
 */
class BitVector {
public:
    using block_type = uint64_t;
    using size_type = size_t;
    using const_reference = bool;

    static size_type npos;
    static block_type bits_per_block;

    /**
     * An lvalue proxy for individual bits.
     */
    class Reference {
    public:
        /**
         * Inverts the bits' values.
         */
        Reference& Flip();

        operator bool() const;
        bool operator~() const;
        Reference& operator=(bool x);
        Reference& operator=(const Reference& other);
        Reference& operator|=(bool x);
        Reference& operator&=(bool x);
        Reference& operator^=(bool x);
        Reference& operator-=(bool x);

    private:
        friend class BitVector;

        Reference(block_type& block, block_type i);
        void operator&();

        block_type& block;
        const block_type mask;
    };

    /**
     * Default-constructs an empty bit vector.
     */
    BitVector();

    /**
     * Constructs a bit vector of a given size.
     * @param size The number of bits.
     * @param value The value for each bit.
     */
    explicit BitVector(size_type size, bool value = false);

    /**
     * Constructs a bit vector from a sequence of blocks.
     *
     * @param first Start of range
     * @param last End of range.
     *
     */
    template<typename InputIterator>
    BitVector(InputIterator first, InputIterator last) {
        bits.insert(bits.end(), first, last);
        num_bits = bits.size() * bits_per_block;
    }

    /**
     * Copy-constructs a bit vector.
     * @param other The bit vector to copy.
     */
    BitVector(const BitVector& other);

    /**
     * Assigns another bit vector to this instance.
     * @param other The RHS of the assignment.
     */
    BitVector& operator=(const BitVector& other);

    //
    // Bitwise operations.
    //
    BitVector operator~() const;
    BitVector operator<<(size_type n) const;
    BitVector operator>>(size_type n) const;
    BitVector& operator<<=(size_type n);
    BitVector& operator>>=(size_type n);
    BitVector& operator&=(BitVector const& other);
    BitVector& operator|=(BitVector const& other);
    BitVector& operator^=(BitVector const& other);
    BitVector& operator-=(BitVector const& other);
    friend BitVector operator&(BitVector const& x, BitVector const& y);
    friend BitVector operator|(BitVector const& x, BitVector const& y);
    friend BitVector operator^(BitVector const& x, BitVector const& y);
    friend BitVector operator-(BitVector const& x, BitVector const& y);

    //
    // Relational operators
    //
    friend bool operator==(BitVector const& x, BitVector const& y);
    friend bool operator!=(BitVector const& x, BitVector const& y);
    friend bool operator<(BitVector const& x, BitVector const& y);

    //
    // Basic operations
    //

    /** Appends the bits in a sequence of values.
     * @tparam Iterator A forward iterator.
     * @param first An iterator pointing to the first element of the sequence.
     * @param last An iterator pointing to one past the last element of the
     * sequence.
     */
    template<typename ForwardIterator>
    void Append(ForwardIterator first, ForwardIterator last) {
        if ( first == last )
            return;

        block_type excess = extra_bits();
        typename std::iterator_traits<ForwardIterator>::difference_type delta = std::distance(first, last);

        bits.reserve(Blocks() + delta);

        if ( excess == 0 ) {
            bits.back() |= (*first << excess);

            do {
                block_type b = *first++ >> (bits_per_block - excess);
                bits.push_back(b | (first == last ? 0 : *first << excess));
            } while ( first != last );
        }

        else
            bits.insert(bits.end(), first, last);

        num_bits += bits_per_block * delta;
    }

    /**
     * Appends the bits in a given block.
     * @param block The block containing bits to append.
     */
    void Append(block_type block);

    /** Appends a single bit to the end of the bit vector.
     * @param bit The value of the bit.
     */
    void PushBack(bool bit);

    /**
     * Clears all bits in the bitvector.
     */
    void Clear();

    /**
     * Resizes the bit vector to a new number of bits.
     * @param n The new number of bits of the bit vector.
     * @param value The bit value of new values, if the vector expands.
     */
    void Resize(size_type n, bool value = false);

    /**
     * Sets a bit at a specific position to a given value.
     * @param i The bit position.
     * @param bit The value assigned to position *i*.
     * @return A reference to the bit vector instance.
     */
    BitVector& Set(size_type i, bool bit = true);

    /**
     * Sets all bits to 1.
     * @return A reference to the bit vector instance.
     */
    BitVector& Set();

    /**
     * Resets a bit at a specific position, i.e., sets it to 0.
     * @param i The bit position.
     * @return A reference to the bit vector instance.
     */
    BitVector& Reset(size_type i);

    /**
     * Sets all bits to 0.
     * @return A reference to the bit vector instance.
     */
    BitVector& Reset();

    /**
     * Toggles/flips a bit at a specific position.
     * @param i The bit position.
     * @return A reference to the bit vector instance.
     */
    BitVector& Flip(size_type i);

    /**
     * Computes the complement.
     * @return A reference to the bit vector instance.
     */
    BitVector& Flip();

    /** Retrieves a single bit.
     * @param i The bit position.
     * @return A mutable reference to the bit at position *i*.
     */
    Reference operator[](size_type i);

    /**
     * Retrieves a single bit.
     * @param i The bit position.
     * @return A const-reference to the bit at position *i*.
     */
    const_reference operator[](size_type i) const;

    /**
     * Counts the number of 1-bits in the bit vector. Also known as *population
     * count* or *Hamming weight*.
     * @return The number of bits set to 1.
     */
    size_type Count() const;

    /**
     * Retrieves the number of blocks of the underlying storage.
     * @param The number of blocks that represent `Size()` bits.
     */
    size_type Blocks() const;

    /**
     * Retrieves the number of bits the bitvector consist of.
     * @return The length of the bit vector in bits.
     */
    size_type Size() const;

    /**
     * Checks whether the bit vector is empty.
     * @return `true` iff the bitvector has zero length.
     */
    bool Empty() const;

    /**
     * Checks whether all bits are 0.
     * @return `true` iff all bits in all blocks are 0.
     */
    bool AllZero() const;

    /**
     * Finds the bit position of of the first 1-bit.
     * @return The position of the first bit that equals to one or `npos` if no
     * such bit exists.
     */
    size_type FindFirst() const;

    /**
     * Finds the next 1-bit from a given starting position.
     *
     * @param i The index where to start looking.
     *
     * @return The position of the first bit that equals to 1 after position
     * *i* or `npos` if no such bit exists.
     */
    size_type FindNext(size_type i) const;

    /** Computes a hash value of the internal representation.
     * This is mainly for debugging/testing purposes.
     *
     * @return The hash.
     */
    uint64_t Hash() const;

    broker::expected<broker::data> Serialize() const;
    static std::unique_ptr<BitVector> Unserialize(const broker::data& data);

private:
    /**
     * Computes the number of excess/unused bits in the bit vector.
     */
    block_type extra_bits() const;

    /**
     * If the number of bits in the vector are not a multiple of
     * bitvector::bits_per_block, then the last block exhibits unused bits which
     * this function resets.
     */
    void zero_unused_bits();

    /**
     * Looks for the first 1-bit starting at a given position.
     * @param i The block index to start looking.
     * @return The block index of the first 1-bit starting from *i* or
     * `bitvector::npos` if no 1-bit exists.
     */
    size_type find_from(size_type i) const;

    /**
     * Computes the block index for a given bit position.
     */
    static size_type block_index(size_type i) { return i / bits_per_block; }

    /**
     * Computes the bit index within a given block for a given bit position.
     */
    static block_type bit_index(size_type i) { return i % bits_per_block; }

    /**
     * Computes the bitmask block to extract a bit a given bit position.
     */
    static block_type bit_mask(size_type i) { return block_type(1) << bit_index(i); }

    /**
     * Computes the number of blocks needed to represent a given number of
     * bits.
     * @param bits the number of bits.
     * @return The number of blocks to represent *bits* number of bits.
     */
    static size_type bits_to_blocks(size_type bits) {
        return bits / bits_per_block + static_cast<size_type>(bits % bits_per_block != 0);
    }

    /**
     * Computes the bit position first 1-bit in a given block.
     * @param block The block to inspect.
     * @return The bit position where *block* has its first bit set to 1.
     */
    static size_type lowest_bit(block_type block);

    std::vector<block_type> bits;
    size_type num_bits;
};

} // namespace zeek::probabilistic::detail

@@ -11,368 +11,287 @@
#include "zeek/probabilistic/CounterVector.h"
#include "zeek/util.h"

namespace zeek::probabilistic {

BloomFilter::BloomFilter() { hasher = nullptr; }

BloomFilter::BloomFilter(const detail::Hasher* arg_hasher) { hasher = arg_hasher; }

BloomFilter::~BloomFilter() { delete hasher; }

broker::expected<broker::data> BloomFilter::Serialize() const {
    auto h = hasher->Serialize();

    if ( ! h )
        return broker::ec::invalid_data; // Cannot serialize

    auto d = DoSerialize();

    if ( ! d )
        return broker::ec::invalid_data; // Cannot serialize

    return {broker::vector{static_cast<uint64_t>(Type()), std::move(*h), std::move(*d)}};
}

std::unique_ptr<BloomFilter> BloomFilter::Unserialize(const broker::data& data) {
    auto v = broker::get_if<broker::vector>(&data);

    if ( ! (v && v->size() == 3) )
        return nullptr;

    auto type = broker::get_if<uint64_t>(&(*v)[0]);
    if ( ! type )
        return nullptr;

    auto hasher_ = detail::Hasher::Unserialize((*v)[1]);
    if ( ! hasher_ )
        return nullptr;

    std::unique_ptr<BloomFilter> bf;

    switch ( *type ) {
        case Basic: bf = std::unique_ptr<BloomFilter>(new BasicBloomFilter()); break;

        case Counting: bf = std::unique_ptr<BloomFilter>(new CountingBloomFilter()); break;

        default: reporter->Error("found invalid bloom filter type"); return nullptr;
    }

    if ( ! bf->DoUnserialize((*v)[2]) )
        return nullptr;

    bf->hasher = hasher_.release();
    return bf;
}

size_t BasicBloomFilter::M(double fp, size_t capacity) {
    double ln2 = std::log(2);
    return std::ceil(-(capacity * std::log(fp) / ln2 / ln2));
}

size_t BasicBloomFilter::K(size_t cells, size_t capacity) {
    double frac = static_cast<double>(cells) / static_cast<double>(capacity);
    return std::ceil(frac * std::log(2));
}

bool BasicBloomFilter::Empty() const { return bits->AllZero(); }

void BasicBloomFilter::Clear() { bits->Reset(); }

bool BasicBloomFilter::Merge(const BloomFilter* other) {
    if ( typeid(*this) != typeid(*other) )
        return false;

    const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);

    if ( ! hasher->Equals(o->hasher) ) {
        reporter->Error("incompatible hashers in BasicBloomFilter merge");
        return false;
    }

    else if ( bits->Size() != o->bits->Size() ) {
        reporter->Error("different bitvector size in BasicBloomFilter merge");
        return false;
    }

    (*bits) |= *o->bits;

    return true;
}

BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const {
    if ( typeid(*this) != typeid(*other) )
        return nullptr;

    const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);

    if ( ! hasher->Equals(o->hasher) ) {
        reporter->Error("incompatible hashers in BasicBloomFilter intersect");
        return nullptr;
    }

    else if ( bits->Size() != o->bits->Size() ) {
        reporter->Error("different bitvector size in BasicBloomFilter intersect");
        return nullptr;
    }

    auto copy = Clone();
    (*copy->bits) &= *o->bits;

    return copy;
}

BasicBloomFilter* BasicBloomFilter::Clone() const {
    BasicBloomFilter* copy = new BasicBloomFilter();

    copy->hasher = hasher->Clone();
    copy->bits = new detail::BitVector(*bits);

    return copy;
}

std::string BasicBloomFilter::InternalState() const { return util::fmt("%" PRIu64, bits->Hash()); }

BasicBloomFilter::BasicBloomFilter() { bits = nullptr; }

BasicBloomFilter::BasicBloomFilter(const detail::Hasher* hasher, size_t cells) : BloomFilter(hasher) {
    bits = new detail::BitVector(cells);
}

BasicBloomFilter::~BasicBloomFilter() { delete bits; }

void BasicBloomFilter::Add(const zeek::detail::HashKey* key) {
    detail::Hasher::digest_vector h = hasher->Hash(key);

    for ( size_t i = 0; i < h.size(); ++i )
        bits->Set(h[i] % bits->Size());
}

bool BasicBloomFilter::Decrement(const zeek::detail::HashKey* key) {
    // operation not supported by basic bloom filter
    return false;
}

size_t BasicBloomFilter::Count(const zeek::detail::HashKey* key) const {
    detail::Hasher::digest_vector h = hasher->Hash(key);

    for ( size_t i = 0; i < h.size(); ++i ) {
        if ( ! (*bits)[h[i] % bits->Size()] )
            return 0;
    }

    return 1;
}

broker::expected<broker::data> BasicBloomFilter::DoSerialize() const {
    auto b = bits->Serialize();
    return b;
}

bool BasicBloomFilter::DoUnserialize(const broker::data& data) {
    auto b = detail::BitVector::Unserialize(data);
    if ( ! b )
        return false;

    bits = b.release();
    return true;
}

CountingBloomFilter::CountingBloomFilter() { cells = nullptr; }

CountingBloomFilter::CountingBloomFilter(const detail::Hasher* hasher, size_t arg_cells, size_t width)
    : BloomFilter(hasher) {
    cells = new detail::CounterVector(width, arg_cells);
}

CountingBloomFilter::~CountingBloomFilter() { delete cells; }

bool CountingBloomFilter::Empty() const { return cells->AllZero(); }

void CountingBloomFilter::Clear() { cells->Reset(); }

bool CountingBloomFilter::Merge(const BloomFilter* other) {
    if ( typeid(*this) != typeid(*other) )
        return false;

    const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

    if ( ! hasher->Equals(o->hasher) ) {
        reporter->Error("incompatible hashers in CountingBloomFilter merge");
        return false;
    }

    else if ( cells->Size() != o->cells->Size() ) {
        reporter->Error("different bitvector size in CountingBloomFilter merge");
        return false;
    }

    (*cells) |= *o->cells;

    return true;
}

BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const {
    if ( typeid(*this) != typeid(*other) )
        return nullptr;

    const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);

    if ( ! hasher->Equals(o->hasher) ) {
        reporter->Error("incompatible hashers in CountingBloomFilter merge");
        return nullptr;
    }

    else if ( cells->Size() != o->cells->Size() ) {
        reporter->Error("different bitvector size in CountingBloomFilter merge");
        return nullptr;
    }

    auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());
    *outbf->bits |= cells->ToBitVector();
    *outbf->bits &= o->cells->ToBitVector();

    return outbf;
}

CountingBloomFilter* CountingBloomFilter::Clone() const {
    CountingBloomFilter* copy = new CountingBloomFilter();

    copy->hasher = hasher->Clone();
    copy->cells = new detail::CounterVector(*cells);

    return copy;
}

std::string CountingBloomFilter::InternalState() const { return util::fmt("%" PRIu64, cells->Hash()); }

// TODO: Use partitioning in add/count to allow for reusing CMS bounds.
void CountingBloomFilter::Add(const zeek::detail::HashKey* key) {
    detail::Hasher::digest_vector h = hasher->Hash(key);

    for ( size_t i = 0; i < h.size(); ++i )
        cells->Increment(h[i] % cells->Size());
}

bool CountingBloomFilter::Decrement(const zeek::detail::HashKey* key) {
    // Only decrement if a member.
    if ( Count(key) == 0 )
        return false;

    detail::Hasher::digest_vector h = hasher->Hash(key);

    for ( size_t i = 0; i < h.size(); ++i )
        cells->Decrement(h[i] % cells->Size());

    return true;
}

size_t CountingBloomFilter::Count(const zeek::detail::HashKey* key) const {
    detail::Hasher::digest_vector h = hasher->Hash(key);

    detail::CounterVector::size_type min = std::numeric_limits<detail::CounterVector::size_type>::max();

    for ( size_t i = 0; i < h.size(); ++i ) {
        detail::CounterVector::size_type cnt = cells->Count(h[i] % cells->Size());
        if ( cnt < min )
            min = cnt;
    }

    return min;
}

broker::expected<broker::data> CountingBloomFilter::DoSerialize() const {
    auto c = cells->Serialize();
    return c;
}

bool CountingBloomFilter::DoUnserialize(const broker::data& data) {
    auto c = detail::CounterVector::Unserialize(data);
    if ( ! c )
        return false;

    cells = c.release();
    return true;
}

} // namespace zeek::probabilistic

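A side note on the two sizing helpers that appear in the hunk above: BasicBloomFilter::M() and BasicBloomFilter::K() compute the textbook Bloom filter parameters, m = -n * ln(fp) / ln(2)^2 cells and k = (m/n) * ln(2) hash functions. The following stand-alone sketch reproduces that arithmetic with made-up example values (it does not use the Zeek classes themselves):

#include <cmath>
#include <cstdio>

int main() {
    double fp = 0.01;   // target false-positive rate (example value)
    double n = 10000.0; // expected number of elements (example value)

    // Same formula as BasicBloomFilter::M(): number of cells needed.
    double ln2 = std::log(2);
    double m = std::ceil(-(n * std::log(fp) / ln2 / ln2)); // -> 95851 cells

    // Same formula as BasicBloomFilter::K(): optimal number of hash functions.
    double k = std::ceil((m / n) * ln2); // -> 7 hash functions

    std::printf("m = %.0f cells, k = %.0f hash functions\n", m, k);
    return 0;
}
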
@@ -12,273 +12,263 @@
#include "zeek/probabilistic/BitVector.h"
#include "zeek/probabilistic/Hasher.h"

namespace broker {
class data;
}

namespace zeek::probabilistic {
namespace detail {
class CounterVector;
}

/** Types of derived BloomFilter classes. */
enum BloomFilterType { Basic, Counting };

/**
 * The abstract base class for Bloom filters.
 */
class BloomFilter {
public:
    /**
     * Destructor.
     */
    virtual ~BloomFilter();

    /**
     * Adds an element to the Bloom filter, or increments its value for counting
     * bloom filters
     *
     * @param key The key associated with the element to add.
     */
    virtual void Add(const zeek::detail::HashKey* key) = 0;

    /**
     * Decrements the value of an element in the bloom filter, if the underlying
     * filter supports the operation
     *
     * #param key The key associated with the element to decrement.
     *
     * @return True if the decrement operation succeeded.
     */
    virtual bool Decrement(const zeek::detail::HashKey* key) = 0;

    /**
     * Retrieves the associated count of a given value.
     *
     * @param key The key associated with the element to check.
     *
     * @return The counter associated with *key*.
     */
    virtual size_t Count(const zeek::detail::HashKey* key) const = 0;

    /**
     * Checks whether the Bloom filter is empty.
     *
     * @return `true` if the Bloom filter contains no elements.
     */
    virtual bool Empty() const = 0;

    /**
     * Removes all elements, i.e., resets all bits in the underlying bit vector.
     */
    virtual void Clear() = 0;

    /**
     * Merges another Bloom filter into this one.
     *
     * @param other The other Bloom filter.
     *
     * @return `true` on success.
     */
    virtual bool Merge(const BloomFilter* other) = 0;

    /**
     * Intersects another Bloom filter with a copy of this one and returns the copy.
     *
     * @param other The other Bloom filter.
     *
     * @return Intersecting BloomFilter on success, nullptr otherwise.
     */
    virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;

    /**
     * Constructs a copy of this Bloom filter.
     *
     * @return A copy of `*this`.
     */
    virtual BloomFilter* Clone() const = 0;

    /**
     * Returns a string with a representation of the Bloom filter's
     * internal state. This is for debugging/testing purposes only.
     */
    virtual std::string InternalState() const = 0;

    broker::expected<broker::data> Serialize() const;
    static std::unique_ptr<BloomFilter> Unserialize(const broker::data& data);

protected:
    /**
     * Default constructor.
     */
    BloomFilter();

    /**
     * Constructs a Bloom filter.
     *
     * @param hasher The hasher to use for this Bloom filter.
     */
    explicit BloomFilter(const detail::Hasher* hasher);

    virtual broker::expected<broker::data> DoSerialize() const = 0;
    virtual bool DoUnserialize(const broker::data& data) = 0;
    virtual BloomFilterType Type() const = 0;

    const detail::Hasher* hasher;
};

class CountingBloomFilter;

/**
 * A basic Bloom filter.
 */
class BasicBloomFilter : public BloomFilter {
    friend class CountingBloomFilter;

public:
    /**
     * Constructs a basic Bloom filter with a given number of cells. The
     * ideal number of cells can be computed with *M*.
     *
     * @param hasher The hasher to use. The ideal number of hash
     * functions can be computed with *K*.
     *
     * @param cells The number of cells.
     */
    BasicBloomFilter(const detail::Hasher* hasher, size_t cells);

    /**
     * Destructor.
     */
    ~BasicBloomFilter() override;

    /**
     * Computes the number of cells based on a given false positive rate
     * and capacity. In the literature, this parameter often has the name
     * *M*.
     *
     * @param fp The false positive rate.
     *
     * @param capacity The expected number of elements that will be
     * stored.
     *
     * Returns: The number cells needed to support a false positive rate
     * of *fp* with at most *capacity* elements.
     */
    static size_t M(double fp, size_t capacity);

    /**
     * Computes the optimal number of hash functions based on the number cells
     * and expected number of elements.
     *
     * @param cells The number of cells (*m*).
     *
     * @param capacity The maximum number of elements.
     *
     * Returns: the optimal number of hash functions for a false-positive
     * rate of *fp* for at most *capacity* elements.
     */
    static size_t K(size_t cells, size_t capacity);

    // Overridden from BloomFilter.
    bool Empty() const override;
    void Clear() override;
    bool Merge(const BloomFilter* other) override;
    BasicBloomFilter* Clone() const override;
    BasicBloomFilter* Intersect(const BloomFilter* other) const override;
    std::string InternalState() const override;

protected:
    friend class BloomFilter;

    /**
     * Default constructor.
     */
    BasicBloomFilter();

    // Overridden from BloomFilter.
    void Add(const zeek::detail::HashKey* key) override;
    bool Decrement(const zeek::detail::HashKey* key) override;
    size_t Count(const zeek::detail::HashKey* key) const override;
    broker::expected<broker::data> DoSerialize() const override;
    bool DoUnserialize(const broker::data& data) override;
    BloomFilterType Type() const override { return BloomFilterType::Basic; }

private:
    detail::BitVector* bits;
};

/**
 * A counting Bloom filter.
 */
class CountingBloomFilter : public BloomFilter {
public:
    /**
     * Constructs a counting Bloom filter.
     *
     * @param hasher The hasher to use. The ideal number of hash
     * functions can be computed with *K*.
     *
     * @param cells The number of cells to use.
     *
     * @param width The maximal bit-width of counter values.
     */
    CountingBloomFilter(const detail::Hasher* hasher, size_t cells, size_t width);

    /**
     * Destructor.
     */
    ~CountingBloomFilter() override;

    // Overridden from BloomFilter.
    bool Empty() const override;
    void Clear() override;
    bool Merge(const BloomFilter* other) override;
    CountingBloomFilter* Clone() const override;
    std::string InternalState() const override;

    /**
     * Intersects another Bloom filter this one and returns a new BasicBloomFilter.
     *
     * Please note that the Intersection of two Counting bloom filters results in a
     * basic bloom filter. The reason for this is that the counters loose meaning during
     * the intersection process. The BasicBloomFilter will have bits set in cases where
     * both Counting Bloom filters has cell values greater than zero.
     *
     * @param other The other Bloom filter.
     *
     * @return Intersecting BloomFilter on success, nullptr otherwise.
     */
    BasicBloomFilter* Intersect(const BloomFilter* other) const override;

protected:
    friend class BloomFilter;

    /**
     * Default constructor.
     */
    CountingBloomFilter();

    // Overridden from BloomFilter.
    void Add(const zeek::detail::HashKey* key) override;
    bool Decrement(const zeek::detail::HashKey* key) override;
    size_t Count(const zeek::detail::HashKey* key) const override;
    broker::expected<broker::data> DoSerialize() const override;
    bool DoUnserialize(const broker::data& data) override;
    BloomFilterType Type() const override { return BloomFilterType::Counting; }

private:
    detail::CounterVector* cells;
};

} // namespace zeek::probabilistic

@ -9,134 +9,119 @@
|
|||
|
||||
#include "zeek/Reporter.h"
|
||||
|
||||
namespace zeek::probabilistic::detail
|
||||
{
|
||||
namespace zeek::probabilistic::detail {
|
||||
|
||||
int CardinalityCounter::OptimalB(double error, double confidence) const
|
||||
{
|
||||
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
||||
int answer = (int)floor(initial_estimate);
|
||||
int CardinalityCounter::OptimalB(double error, double confidence) const {
|
||||
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
||||
int answer = (int)floor(initial_estimate);
|
||||
|
||||
// k is the number of standard deviations that we have to go to have
|
||||
// a confidence level of conf.
|
||||
// k is the number of standard deviations that we have to go to have
|
||||
// a confidence level of conf.
|
||||
|
||||
double k = 0;
|
||||
double k = 0;
|
||||
|
||||
do
|
||||
{
|
||||
answer++;
|
||||
k = pow(2, (answer - initial_estimate) / 2);
|
||||
} while ( erf(k / sqrt(2)) < confidence );
|
||||
do {
|
||||
answer++;
|
||||
k = pow(2, (answer - initial_estimate) / 2);
|
||||
} while ( erf(k / sqrt(2)) < confidence );
|
||||
|
||||
return answer;
|
||||
}
|
||||
return answer;
|
||||
}
|
||||
|
||||
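
// Worked example (illustrative, not part of the diff): for error = 0.02 and
// confidence = 0.95 the initial estimate is 2 * log2(1.04 / 0.02) ~= 11.4,
// and the loop stops at b = 14 (k ~= 2.46 standard deviations,
// erf(k / sqrt(2)) ~= 0.99 >= 0.95), i.e. m = 2^14 = 16384 buckets.
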
void CardinalityCounter::Init(uint64_t size) {
    m = size;

    // The following magic values are taken directly out of the
    // description of the HyperLogLog algorithm.

    if ( m == 16 )
        alpha_m = 0.673;

    else if ( m == 32 )
        alpha_m = 0.697;

    else if ( m == 64 )
        alpha_m = 0.709;

    else if ( m >= 128 )
        alpha_m = 0.7213 / (1 + 1.079 / m);

    else
        reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);

    double calc_p = log2(m);
    if ( trunc(calc_p) != calc_p )
        reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);

    p = calc_p;

    buckets.reserve(m);
    for ( uint64_t i = 0; i < m; i++ )
        buckets.push_back(0);

    assert(buckets.size() == m);

    V = m;
}

CardinalityCounter::CardinalityCounter(CardinalityCounter& other) : buckets(other.buckets) {
    V = other.V;
    alpha_m = other.alpha_m;
    m = other.m;
    p = other.p;
}

CardinalityCounter::CardinalityCounter(CardinalityCounter&& o) noexcept {
    V = o.V;
    alpha_m = o.alpha_m;
    m = o.m;
    p = o.p;

    o.m = 0;
    buckets = std::move(o.buckets);
}

CardinalityCounter::CardinalityCounter(double error_margin, double confidence) {
    int b = OptimalB(error_margin, confidence);
    Init((uint64_t)pow(2, b));

    assert(b == p);
}

CardinalityCounter::CardinalityCounter(uint64_t size) { Init(size); }

CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m) {
    m = arg_size;

    buckets.reserve(m);
    for ( uint64_t i = 0; i < m; i++ )
        buckets.push_back(0);

    alpha_m = arg_alpha_m;
    V = arg_V;
    p = log2(m);
}

uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const {
    hash_modified = hash_modified >> p;
    int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
    assert(answer > 0 && answer < 64);

    return answer;
}
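
// Worked example (illustrative, not part of the diff): with p = 4, a hash
// whose most significant bit is set gives Rank() == 1, while a hash equal to
// 2^60 (three leading zeros in the remaining 64 - p bits) gives Rank() == 4.
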
void CardinalityCounter::AddElement(uint64_t hash) {
    uint64_t index = hash % m;
    hash = hash - index;

    if ( buckets[index] == 0 )
        V--;

    uint8_t temp = Rank(hash);

    if ( temp > buckets[index] )
        buckets[index] = temp;
}

/**
 * Estimate the size by using the "raw" HyperLogLog estimate. Then,

@@ -147,99 +132,87 @@ void CardinalityCounter::AddElement(uint64_t hash)

 * Note - we deviate from the HLL algorithm in the paper here, because
 * of our 64-bit hashes.
 **/
double CardinalityCounter::Size() const {
    double answer = 0;
    for ( unsigned int i = 0; i < m; i++ )
        answer += pow(2, -((int)buckets[i]));

    answer = 1 / answer;
    answer = (alpha_m * m * m * answer);

    if ( answer <= 5.0 * (((double)m) / 2) )
        return m * log(((double)m) / V);

    else if ( answer <= (pow(2, 64) / 30) )
        return answer;

    else
        return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
}
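
// Illustrative usage sketch (not part of the diff): estimating the number of
// distinct 64-bit hash values in a stream, with roughly 2% error at 95%
// confidence. 'hashes' is a hypothetical container of uint64_t values.
//
//   zeek::probabilistic::detail::CardinalityCounter cc(0.02, 0.95);
//   for ( uint64_t h : hashes )
//       cc.AddElement(h);
//   double estimate = cc.Size();
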
bool CardinalityCounter::Merge(CardinalityCounter* c) {
    if ( m != c->GetM() )
        return false;

    const std::vector<uint8_t>& temp = c->GetBuckets();

    V = 0;

    for ( size_t i = 0; i < m; i++ ) {
        if ( temp[i] > buckets[i] )
            buckets[i] = temp[i];

        if ( buckets[i] == 0 )
            ++V;
    }

    return true;
}

const std::vector<uint8_t>& CardinalityCounter::GetBuckets() const { return buckets; }

uint64_t CardinalityCounter::GetM() const { return m; }

broker::expected<broker::data> CardinalityCounter::Serialize() const {
    broker::vector v = {m, V, alpha_m};
    v.reserve(3 + m);

    for ( size_t i = 0; i < m; ++i )
        v.emplace_back(static_cast<uint64_t>(buckets[i]));

    return {std::move(v)};
}

std::unique_ptr<CardinalityCounter> CardinalityCounter::Unserialize(const broker::data& data) {
    auto v = broker::get_if<broker::vector>(&data);
    if ( ! (v && v->size() >= 3) )
        return nullptr;

    auto m = broker::get_if<uint64_t>(&(*v)[0]);
    auto V = broker::get_if<uint64_t>(&(*v)[1]);
    auto alpha_m = broker::get_if<double>(&(*v)[2]);

    if ( ! (m && V && alpha_m) )
        return nullptr;
    if ( v->size() != 3 + *m )
        return nullptr;

    auto cc = std::unique_ptr<CardinalityCounter>(new CardinalityCounter(*m, *V, *alpha_m));
    if ( *m != cc->m )
        return nullptr;
    if ( cc->buckets.size() != *m )
        return nullptr;

    for ( size_t i = 0; i < *m; ++i ) {
        auto x = broker::get_if<uint64_t>(&(*v)[3 + i]);
        if ( ! x )
            return nullptr;

        cc->buckets[i] = *x;
    }

    return cc;
}

/**
 * The following function is copied from libc/string/flsll.c from the FreeBSD source

@@ -277,15 +250,14 @@ std::unique_ptr<CardinalityCounter> CardinalityCounter::Unserialize(const broker

/*
 * Find Last Set bit
 */
int CardinalityCounter::flsll(uint64_t mask) {
    int bit;

    if ( mask == 0 )
        return (0);
    for ( bit = 1; mask != 1; bit++ )
        mask = (uint64_t)mask >> 1;
    return (bit);
}

} // namespace zeek::probabilistic::detail
|
||||
|
|
|
@ -7,186 +7,183 @@
|
|||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace broker
|
||||
{
|
||||
namespace broker {
|
||||
class data;
|
||||
}
|
||||
}
|
||||
|
||||
namespace zeek::probabilistic::detail
|
||||
{
|
||||
namespace zeek::probabilistic::detail {
|
||||
|
||||
/**
|
||||
* A probabilistic cardinality counter using the HyperLogLog algorithm.
|
||||
*/
|
||||
class CardinalityCounter
|
||||
{
|
||||
class CardinalityCounter {
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* The number of buckets of the data structure is determined using
|
||||
* the error margin and the given confidence.
|
||||
*
|
||||
* For example, assume an error_margin of 2% and a confidence
|
||||
* of 95%. If the Size function returns an estimate of 100, this
|
||||
* means that we are 95% sure that the cardinality is between 98
|
||||
* and 102.
|
||||
*
|
||||
* @param error_margin error margin
|
||||
*
|
||||
* @param confidence confidence of the error. Default: 0.95
|
||||
*/
|
||||
explicit CardinalityCounter(double error_margin, double confidence = 0.95);
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* The number of buckets of the data structure is determined using
|
||||
* the error margin and the given confidence.
|
||||
*
|
||||
* For example, assume an error_margin of 2% and a confidence
|
||||
* of 95%. If the Size function returns an estimate of 100, this
|
||||
* means that we are 95% sure that the cardinality is between 98
|
||||
* and 102.
|
||||
*
|
||||
* @param error_margin error margin
|
||||
*
|
||||
* @param confidence confidence of the error. Default: 0.95
|
||||
*/
|
||||
explicit CardinalityCounter(double error_margin, double confidence = 0.95);
|
||||
|
||||
/**
|
||||
* Copy-Constructor
|
||||
*/
|
||||
CardinalityCounter(CardinalityCounter& other);
|
||||
/**
|
||||
* Copy-Constructor
|
||||
*/
|
||||
CardinalityCounter(CardinalityCounter& other);
|
||||
|
||||
/**
|
||||
* Move-Constructor
|
||||
*/
|
||||
CardinalityCounter(CardinalityCounter&& o) noexcept;
|
||||
/**
|
||||
* Move-Constructor
|
||||
*/
|
||||
CardinalityCounter(CardinalityCounter&& o) noexcept;
|
||||
|
||||
/**
|
||||
* Constructor for a known number of buckets.
|
||||
*
|
||||
* The error margin is 1.04/sqrt(size) with approximately 68%
|
||||
* probability.
|
||||
*
|
||||
* @param size number of buckets to create
|
||||
*/
|
||||
explicit CardinalityCounter(uint64_t size);
|
||||
/**
|
||||
* Constructor for a known number of buckets.
|
||||
*
|
||||
* The error margin is 1.04/sqrt(size) with approximately 68%
|
||||
* probability.
|
||||
*
|
||||
* @param size number of buckets to create
|
||||
*/
|
||||
explicit CardinalityCounter(uint64_t size);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~CardinalityCounter() = default;
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~CardinalityCounter() = default;
|
||||
|
||||
/**
|
||||
* Add a new element to the counter.
|
||||
*
|
||||
* The hash function generating the hashes needs to be uniformly
|
||||
* distributed over 64 bits.
|
||||
*
|
||||
* @param hash 64-bit hash value of the element to be added
|
||||
*/
|
||||
void AddElement(uint64_t hash);
|
||||
/**
|
||||
* Add a new element to the counter.
|
||||
*
|
||||
* The hash function generating the hashes needs to be uniformly
|
||||
* distributed over 64 bits.
|
||||
*
|
||||
* @param hash 64-bit hash value of the element to be added
|
||||
*/
|
||||
void AddElement(uint64_t hash);
|
||||
|
||||
/**
|
||||
* Get the current estimated number of elements in the data
|
||||
* structure
|
||||
*
|
||||
* @return Estimated number of elements
|
||||
**/
|
||||
double Size() const;
|
||||
/**
|
||||
* Get the current estimated number of elements in the data
|
||||
* structure
|
||||
*
|
||||
* @return Estimated number of elements
|
||||
**/
|
||||
double Size() const;
|
||||
|
||||
/**
|
||||
* Merges the argument cardinality counter with this one. The error
|
||||
* margins of both counters have to be the same, otherwise the merge
|
||||
* operation will not be carried out.
|
||||
*
|
||||
* @param c Cardinality counter to merge into the current counter.
|
||||
*
|
||||
* @return True if successful
|
||||
*/
|
||||
bool Merge(CardinalityCounter* c);
|
||||
/**
|
||||
* Merges the argument cardinality counter with this one. The error
|
||||
* margins of both counters have to be the same, otherwise the merge
|
||||
* operation will not be carried out.
|
||||
*
|
||||
* @param c Cardinality counter to merge into the current counter.
|
||||
*
|
||||
* @return True if successful
|
||||
*/
|
||||
bool Merge(CardinalityCounter* c);
|
||||
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static std::unique_ptr<CardinalityCounter> Unserialize(const broker::data& data);
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static std::unique_ptr<CardinalityCounter> Unserialize(const broker::data& data);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Return the number of buckets.
|
||||
*
|
||||
* @return Number of buckets
|
||||
*/
|
||||
uint64_t GetM() const;
|
||||
/**
|
||||
* Return the number of buckets.
|
||||
*
|
||||
* @return Number of buckets
|
||||
*/
|
||||
uint64_t GetM() const;
|
||||
|
||||
/**
|
||||
* Returns the buckets array that holds all of the rough cardinality
|
||||
* estimates.
|
||||
*
|
||||
* Use GetM() to determine the size.
|
||||
*
|
||||
* @return Array containing cardinality estimates
|
||||
*/
|
||||
const std::vector<uint8_t>& GetBuckets() const;
|
||||
/**
|
||||
* Returns the buckets array that holds all of the rough cardinality
|
||||
* estimates.
|
||||
*
|
||||
* Use GetM() to determine the size.
|
||||
*
|
||||
* @return Array containing cardinality estimates
|
||||
*/
|
||||
const std::vector<uint8_t>& GetBuckets() const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Constructor used when unserializing, i.e., all parameters are
|
||||
* known.
|
||||
*/
|
||||
explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m);
|
||||
/**
|
||||
* Constructor used when unserializing, i.e., all parameters are
|
||||
* known.
|
||||
*/
|
||||
explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m);
|
||||
|
||||
/**
|
||||
* Helper function with code used jointly by multiple constructors.
|
||||
*
|
||||
* @param arg_size: number of buckets that need to be kept
|
||||
*/
|
||||
void Init(uint64_t arg_size);
|
||||
/**
|
||||
* Helper function with code used jointly by multiple constructors.
|
||||
*
|
||||
* @param arg_size: number of buckets that need to be kept
|
||||
*/
|
||||
void Init(uint64_t arg_size);
|
||||
|
||||
/**
 * This function calculates the smallest value of b that will
 * satisfy the constraints of a specified error margin and
 * confidence level.
 *
 * The exact expression for b is as follows:
 * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
 *
 * After that initial estimate, the value of b is increased until the
 * standard deviation falls within the specified value.
 *
 * @param error error margin
 *
 * @param confidence confidence of the error
 *
 * @return minimal B-value satisfying the error-rate under confidence.
 */
int OptimalB(double error, double confidence) const;
|
||||
|
||||
/**
|
||||
* Determines at which index (counted from the front) the first one-bit
|
||||
* appears. The last b bits have to be 0 (the element has to be divisible
|
||||
* by m), hence they are ignored. Always adds 1 to the result. This is the
|
||||
* rho function from the original algorithm.
|
||||
*
|
||||
* @param hash_modified hash value
|
||||
*
|
||||
* @returns index of first one-bit
|
||||
*/
|
||||
uint8_t Rank(uint64_t hash_modified) const;
|
||||
/**
|
||||
* Determines at which index (counted from the front) the first one-bit
|
||||
* appears. The last b bits have to be 0 (the element has to be divisible
|
||||
* by m), hence they are ignored. Always adds 1 to the result. This is the
|
||||
* rho function from the original algorithm.
|
||||
*
|
||||
* @param hash_modified hash value
|
||||
*
|
||||
* @returns index of first one-bit
|
||||
*/
|
||||
uint8_t Rank(uint64_t hash_modified) const;
|
||||
|
||||
/**
|
||||
* flsll from FreeBSD; especially Linux does not have this.
|
||||
*/
|
||||
static int flsll(uint64_t mask);
|
||||
/**
|
||||
* flsll from FreeBSD; especially Linux does not have this.
|
||||
*/
|
||||
static int flsll(uint64_t mask);
|
||||
|
||||
/**
|
||||
* This is the number of buckets that will be stored. The standard
|
||||
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
||||
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
|
||||
*/
|
||||
uint64_t m = 0;
|
||||
/**
|
||||
* This is the number of buckets that will be stored. The standard
|
||||
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
||||
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
|
||||
*/
|
||||
uint64_t m = 0;
|
||||
|
||||
/**
|
||||
* These are the actual buckets that are storing an estimate of the
|
||||
* cardinality. All these need to do is count when the first 1 bit
|
||||
* appears in the bitstring and that location is at most 65, so not
|
||||
* that many bits are needed to store it.
|
||||
*/
|
||||
std::vector<uint8_t> buckets;
|
||||
/**
|
||||
* These are the actual buckets that are storing an estimate of the
|
||||
* cardinality. All these need to do is count when the first 1 bit
|
||||
* appears in the bitstring and that location is at most 65, so not
|
||||
* that many bits are needed to store it.
|
||||
*/
|
||||
std::vector<uint8_t> buckets;
|
||||
|
||||
/**
|
||||
* There are some state constants that need to be kept track of to
|
||||
* make the final estimate easier. V is the number of values in
|
||||
* buckets that are 0 and this is used in the small error correction.
|
||||
* alpha_m is a multiplicative constant used in the algorithm.
|
||||
*/
|
||||
uint64_t V = 0;
|
||||
double alpha_m = 0.0;
|
||||
int p = 0; // the log2 of m
|
||||
};
|
||||
/**
|
||||
* There are some state constants that need to be kept track of to
|
||||
* make the final estimate easier. V is the number of values in
|
||||
* buckets that are 0 and this is used in the small error correction.
|
||||
* alpha_m is a multiplicative constant used in the algorithm.
|
||||
*/
|
||||
uint64_t V = 0;
|
||||
double alpha_m = 0.0;
|
||||
int p = 0; // the log2 of m
|
||||
};
|
||||
|
||||
} // namespace zeek::probabilistic::detail
|
||||
} // namespace zeek::probabilistic::detail
|
||||
|
|
|
@ -10,197 +10,157 @@
|
|||
#include "zeek/probabilistic/BitVector.h"
|
||||
#include "zeek/util.h"
|
||||
|
||||
namespace zeek::probabilistic::detail
|
||||
{
|
||||
namespace zeek::probabilistic::detail {
|
||||
|
||||
CounterVector::CounterVector(size_t arg_width, size_t cells)
|
||||
{
|
||||
bits = new BitVector(arg_width * cells);
|
||||
width = arg_width;
|
||||
}
|
||||
CounterVector::CounterVector(size_t arg_width, size_t cells) {
|
||||
bits = new BitVector(arg_width * cells);
|
||||
width = arg_width;
|
||||
}
|
||||
|
||||
CounterVector::CounterVector(const CounterVector& other)
|
||||
{
|
||||
bits = new BitVector(*other.bits);
|
||||
width = other.width;
|
||||
}
|
||||
CounterVector::CounterVector(const CounterVector& other) {
|
||||
bits = new BitVector(*other.bits);
|
||||
width = other.width;
|
||||
}
|
||||
|
||||
CounterVector::~CounterVector()
|
||||
{
|
||||
delete bits;
|
||||
}
|
||||
CounterVector::~CounterVector() { delete bits; }
|
||||
|
||||
bool CounterVector::Increment(size_type cell, count_type value) {
    assert(cell < Size());
    assert(value != 0);

    size_t lsb = cell * width;
    bool carry = false;

    for ( size_t i = 0; i < width; ++i ) {
        bool b1 = (*bits)[lsb + i];
        bool b2 = value & (1 << i);
        (*bits)[lsb + i] = b1 ^ b2 ^ carry;
        carry = (b1 && b2) || (carry && (b1 != b2));
    }

    if ( carry ) {
        for ( size_t i = 0; i < width; ++i )
            bits->Set(lsb + i);
    }

    return ! carry;
}

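// Worked example (illustrative, not part of the diff): with width == 2 a cell
// saturates at 3, which matches Max() for that width. Incrementing a cell that
// already holds 3 produces a carry, the cell is pinned at 3 again, and
// Increment() returns false.
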
bool CounterVector::Decrement(size_type cell, count_type value) {
    assert(cell < Size());
    assert(value != 0);

    value = ~value + 1; // A - B := A + ~B + 1
    bool carry = false;
    size_t lsb = cell * width;

    for ( size_t i = 0; i < width; ++i ) {
        bool b1 = (*bits)[lsb + i];
        bool b2 = value & (1 << i);
        (*bits)[lsb + i] = b1 ^ b2 ^ carry;
        carry = (b1 && b2) || (carry && (b1 != b2));
    }

    return carry;
}

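// Worked example (illustrative, not part of the diff): subtraction is addition
// of the two's complement, so with width == 2 decrementing a cell holding 2 by
// 1 adds 0b11, leaves 1 in the cell, and returns true (a carry out means no
// borrow); decrementing a cell holding 0 by 1 returns false.
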
bool CounterVector::AllZero() const
|
||||
{
|
||||
return bits->AllZero();
|
||||
}
|
||||
bool CounterVector::AllZero() const { return bits->AllZero(); }
|
||||
|
||||
void CounterVector::Reset()
|
||||
{
|
||||
bits->Reset();
|
||||
}
|
||||
void CounterVector::Reset() { bits->Reset(); }
|
||||
|
||||
CounterVector::count_type CounterVector::Count(size_type cell) const
|
||||
{
|
||||
assert(cell < Size());
|
||||
CounterVector::count_type CounterVector::Count(size_type cell) const {
|
||||
assert(cell < Size());
|
||||
|
||||
size_t cnt = 0, order = 1;
|
||||
size_t lsb = cell * width;
|
||||
size_t cnt = 0, order = 1;
|
||||
size_t lsb = cell * width;
|
||||
|
||||
for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 )
|
||||
if ( (*bits)[i] )
|
||||
cnt |= order;
|
||||
for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 )
|
||||
if ( (*bits)[i] )
|
||||
cnt |= order;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
return cnt;
|
||||
}
|
||||
|
||||
CounterVector::size_type CounterVector::Size() const
|
||||
{
|
||||
return bits->Size() / width;
|
||||
}
|
||||
CounterVector::size_type CounterVector::Size() const { return bits->Size() / width; }
|
||||
|
||||
size_t CounterVector::Width() const
|
||||
{
|
||||
return width;
|
||||
}
|
||||
size_t CounterVector::Width() const { return width; }
|
||||
|
||||
size_t CounterVector::Max() const
|
||||
{
|
||||
return std::numeric_limits<size_t>::max() >> (std::numeric_limits<size_t>::digits - width);
|
||||
}
|
||||
size_t CounterVector::Max() const {
|
||||
return std::numeric_limits<size_t>::max() >> (std::numeric_limits<size_t>::digits - width);
|
||||
}
|
||||
|
||||
CounterVector& CounterVector::Merge(const CounterVector& other)
|
||||
{
|
||||
assert(Size() == other.Size());
|
||||
assert(Width() == other.Width());
|
||||
CounterVector& CounterVector::Merge(const CounterVector& other) {
|
||||
assert(Size() == other.Size());
|
||||
assert(Width() == other.Width());
|
||||
|
||||
for ( size_t cell = 0; cell < Size(); ++cell )
|
||||
{
|
||||
size_t lsb = cell * width;
|
||||
bool carry = false;
|
||||
for ( size_t cell = 0; cell < Size(); ++cell ) {
|
||||
size_t lsb = cell * width;
|
||||
bool carry = false;
|
||||
|
||||
for ( size_t i = 0; i < width; ++i )
|
||||
{
|
||||
bool b1 = (*bits)[lsb + i];
|
||||
bool b2 = (*other.bits)[lsb + i];
|
||||
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
|
||||
carry = (b1 && b2) || (carry && (b1 != b2));
|
||||
}
|
||||
for ( size_t i = 0; i < width; ++i ) {
|
||||
bool b1 = (*bits)[lsb + i];
|
||||
bool b2 = (*other.bits)[lsb + i];
|
||||
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
|
||||
carry = (b1 && b2) || (carry && (b1 != b2));
|
||||
}
|
||||
|
||||
if ( carry )
|
||||
{
|
||||
for ( size_t i = 0; i < width; ++i )
|
||||
bits->Set(lsb + i);
|
||||
}
|
||||
}
|
||||
if ( carry ) {
|
||||
for ( size_t i = 0; i < width; ++i )
|
||||
bits->Set(lsb + i);
|
||||
}
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector CounterVector::ToBitVector() const {
    auto newbits = BitVector(Size());

    for ( size_t cell = 0; cell < Size(); ++cell ) {
        size_t lsb = cell * width;
        bool set = false;

        // A cell maps to a set bit iff any of its width bits is set.
        for ( size_t i = 0; i < width; ++i )
            set |= (*bits)[lsb + i];

        newbits[cell] = set;
    }

    return newbits;
}
|
||||
|
||||
CounterVector& CounterVector::operator|=(const CounterVector& other)
|
||||
{
|
||||
return Merge(other);
|
||||
}
|
||||
CounterVector& CounterVector::operator|=(const CounterVector& other) { return Merge(other); }
|
||||
|
||||
CounterVector operator|(const CounterVector& x, const CounterVector& y)
|
||||
{
|
||||
CounterVector cv(x);
|
||||
return cv |= y;
|
||||
}
|
||||
CounterVector operator|(const CounterVector& x, const CounterVector& y) {
|
||||
CounterVector cv(x);
|
||||
return cv |= y;
|
||||
}
|
||||
|
||||
uint64_t CounterVector::Hash() const
|
||||
{
|
||||
return bits->Hash();
|
||||
}
|
||||
uint64_t CounterVector::Hash() const { return bits->Hash(); }
|
||||
|
||||
broker::expected<broker::data> CounterVector::Serialize() const
|
||||
{
|
||||
auto b = bits->Serialize();
|
||||
if ( ! b )
|
||||
return broker::ec::invalid_data; // Cannot serialize
|
||||
broker::expected<broker::data> CounterVector::Serialize() const {
|
||||
auto b = bits->Serialize();
|
||||
if ( ! b )
|
||||
return broker::ec::invalid_data; // Cannot serialize
|
||||
|
||||
return {broker::vector{static_cast<uint64_t>(width), std::move(*b)}};
|
||||
}
|
||||
return {broker::vector{static_cast<uint64_t>(width), std::move(*b)}};
|
||||
}
|
||||
|
||||
std::unique_ptr<CounterVector> CounterVector::Unserialize(const broker::data& data)
|
||||
{
|
||||
auto v = broker::get_if<broker::vector>(&data);
|
||||
if ( ! (v && v->size() >= 2) )
|
||||
return nullptr;
|
||||
std::unique_ptr<CounterVector> CounterVector::Unserialize(const broker::data& data) {
|
||||
auto v = broker::get_if<broker::vector>(&data);
|
||||
if ( ! (v && v->size() >= 2) )
|
||||
return nullptr;
|
||||
|
||||
auto width = broker::get_if<uint64_t>(&(*v)[0]);
|
||||
auto bits = BitVector::Unserialize((*v)[1]);
|
||||
auto width = broker::get_if<uint64_t>(&(*v)[0]);
|
||||
auto bits = BitVector::Unserialize((*v)[1]);
|
||||
|
||||
if ( ! (width && bits) )
|
||||
return nullptr;
|
||||
if ( ! (width && bits) )
|
||||
return nullptr;
|
||||
|
||||
auto cv = std::unique_ptr<CounterVector>(new CounterVector());
|
||||
cv->width = *width;
|
||||
cv->bits = bits.release();
|
||||
return cv;
|
||||
}
|
||||
auto cv = std::unique_ptr<CounterVector>(new CounterVector());
|
||||
cv->width = *width;
|
||||
cv->bits = bits.release();
|
||||
return cv;
|
||||
}
|
||||
|
||||
} // namespace zeek::probabilistic::detail
|
||||
} // namespace zeek::probabilistic::detail
|
||||
|
|
|
@ -9,162 +9,159 @@
|
|||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
namespace broker
|
||||
{
|
||||
namespace broker {
|
||||
class data;
|
||||
}
|
||||
}
|
||||
|
||||
namespace zeek::probabilistic::detail
|
||||
{
|
||||
namespace zeek::probabilistic::detail {
|
||||
|
||||
class BitVector;
|
||||
|
||||
/**
|
||||
* A vector of counters, each of which has a fixed number of bits.
|
||||
*/
|
||||
class CounterVector
|
||||
{
|
||||
class CounterVector {
|
||||
public:
|
||||
using size_type = size_t;
|
||||
using count_type = uint64_t;
|
||||
using size_type = size_t;
|
||||
using count_type = uint64_t;
|
||||
|
||||
/**
|
||||
* Constructs a counter vector having cells of a given width.
|
||||
*
|
||||
* @param width The number of bits that each cell occupies.
|
||||
*
|
||||
* @param cells The number of cells in the bitvector.
|
||||
*
|
||||
* @pre `cells > 0 && width > 0`
|
||||
*/
|
||||
explicit CounterVector(size_t width, size_t cells = 1024);
|
||||
/**
|
||||
* Constructs a counter vector having cells of a given width.
|
||||
*
|
||||
* @param width The number of bits that each cell occupies.
|
||||
*
|
||||
* @param cells The number of cells in the bitvector.
|
||||
*
|
||||
* @pre `cells > 0 && width > 0`
|
||||
*/
|
||||
explicit CounterVector(size_t width, size_t cells = 1024);
|
||||
|
||||
/**
|
||||
* Copy-constructs a counter vector.
|
||||
*
|
||||
* @param other The counter vector to copy.
|
||||
*/
|
||||
CounterVector(const CounterVector& other);
|
||||
/**
|
||||
* Copy-constructs a counter vector.
|
||||
*
|
||||
* @param other The counter vector to copy.
|
||||
*/
|
||||
CounterVector(const CounterVector& other);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~CounterVector();
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~CounterVector();
|
||||
|
||||
/**
|
||||
* Increments a given cell.
|
||||
*
|
||||
* @param cell The cell to increment.
|
||||
*
|
||||
* @param value The value to add to the current counter in *cell*.
|
||||
*
|
||||
* @return `true` if adding *value* to the counter in *cell* succeeded.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
bool Increment(size_type cell, count_type value = 1);
|
||||
/**
|
||||
* Increments a given cell.
|
||||
*
|
||||
* @param cell The cell to increment.
|
||||
*
|
||||
* @param value The value to add to the current counter in *cell*.
|
||||
*
|
||||
* @return `true` if adding *value* to the counter in *cell* succeeded.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
bool Increment(size_type cell, count_type value = 1);
|
||||
|
||||
/**
|
||||
* Decrements a given cell.
|
||||
*
|
||||
* @param cell The cell to decrement.
|
||||
*
|
||||
* @param value The value to subtract from the current counter in *cell*.
|
||||
*
|
||||
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
bool Decrement(size_type cell, count_type value = 1);
|
||||
/**
|
||||
* Decrements a given cell.
|
||||
*
|
||||
* @param cell The cell to decrement.
|
||||
*
|
||||
* @param value The value to subtract from the current counter in *cell*.
|
||||
*
|
||||
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
bool Decrement(size_type cell, count_type value = 1);
|
||||
|
||||
/**
|
||||
* Retrieves the counter of a given cell.
|
||||
*
|
||||
* @param cell The cell index to retrieve the count for.
|
||||
*
|
||||
* @return The counter associated with *cell*.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
count_type Count(size_type cell) const;
|
||||
/**
|
||||
* Retrieves the counter of a given cell.
|
||||
*
|
||||
* @param cell The cell index to retrieve the count for.
|
||||
*
|
||||
* @return The counter associated with *cell*.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
count_type Count(size_type cell) const;
|
||||
|
||||
/**
|
||||
* Checks whether all counters are 0.
|
||||
* @return `true` iff all counters have the value 0.
|
||||
*/
|
||||
bool AllZero() const;
|
||||
/**
|
||||
* Checks whether all counters are 0.
|
||||
* @return `true` iff all counters have the value 0.
|
||||
*/
|
||||
bool AllZero() const;
|
||||
|
||||
/**
|
||||
* Sets all counters to 0.
|
||||
*/
|
||||
void Reset();
|
||||
/**
|
||||
* Sets all counters to 0.
|
||||
*/
|
||||
void Reset();
|
||||
|
||||
/**
|
||||
* Retrieves the number of cells in the storage.
|
||||
*
|
||||
* @return The number of cells.
|
||||
*/
|
||||
size_type Size() const;
|
||||
/**
|
||||
* Retrieves the number of cells in the storage.
|
||||
*
|
||||
* @return The number of cells.
|
||||
*/
|
||||
size_type Size() const;
|
||||
|
||||
/**
|
||||
* Retrieves the counter width.
|
||||
*
|
||||
* @return The number of bits per counter.
|
||||
*/
|
||||
size_t Width() const;
|
||||
/**
|
||||
* Retrieves the counter width.
|
||||
*
|
||||
* @return The number of bits per counter.
|
||||
*/
|
||||
size_t Width() const;
|
||||
|
||||
/**
|
||||
* Computes the maximum counter value.
|
||||
*
|
||||
* @return The maximum counter value based on the width.
|
||||
*/
|
||||
size_t Max() const;
|
||||
/**
|
||||
* Computes the maximum counter value.
|
||||
*
|
||||
* @return The maximum counter value based on the width.
|
||||
*/
|
||||
size_t Max() const;
|
||||
|
||||
/**
 * Merges another counter vector into this instance by *adding* the
 * counters of each cell.
 *
 * @param other The counter vector to merge into this instance.
 *
 * @return A reference to `*this`.
 *
 * @pre `Size() == other.Size() && Width() == other.Width()`
 */
CounterVector& Merge(const CounterVector& other);
|
||||
|
||||
/**
 * Converts a counter vector into a BitVector. Each cell that has a value
 * of 1 or more is set in the BitVector; otherwise the bit remains unset.
 *
 * @return The newly created BitVector
 */
BitVector ToBitVector() const;
|
||||
|
||||
/**
|
||||
* An alias for ::Merge.
|
||||
*/
|
||||
CounterVector& operator|=(const CounterVector& other);
|
||||
/**
|
||||
* An alias for ::Merge.
|
||||
*/
|
||||
CounterVector& operator|=(const CounterVector& other);
|
||||
|
||||
/** Computes a hash value of the internal representation.
|
||||
* This is mainly for debugging/testing purposes.
|
||||
*
|
||||
* @return The hash.
|
||||
*/
|
||||
uint64_t Hash() const;
|
||||
/** Computes a hash value of the internal representation.
|
||||
* This is mainly for debugging/testing purposes.
|
||||
*
|
||||
* @return The hash.
|
||||
*/
|
||||
uint64_t Hash() const;
|
||||
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static std::unique_ptr<CounterVector> Unserialize(const broker::data& data);
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static std::unique_ptr<CounterVector> Unserialize(const broker::data& data);
|
||||
|
||||
protected:
|
||||
friend CounterVector operator|(const CounterVector& x, const CounterVector& y);
|
||||
friend CounterVector operator|(const CounterVector& x, const CounterVector& y);
|
||||
|
||||
CounterVector() = default;
|
||||
CounterVector() = default;
|
||||
|
||||
private:
|
||||
CounterVector& operator=(const CounterVector&); // Disable.
|
||||
CounterVector& operator=(const CounterVector&); // Disable.
|
||||
|
||||
BitVector* bits = nullptr;
|
||||
size_t width = 0;
|
||||
};
|
||||
BitVector* bits = nullptr;
|
||||
size_t width = 0;
|
||||
};
|
||||
|
||||
} // namespace zeek::probabilistic::detail
|
||||
} // namespace zeek::probabilistic::detail
|
||||
|
|
|
@ -11,171 +11,135 @@
|
|||
#include "zeek/Var.h"
|
||||
#include "zeek/digest.h"
|
||||
|
||||
namespace zeek::probabilistic::detail
|
||||
{
|
||||
namespace zeek::probabilistic::detail {
|
||||
|
||||
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
|
||||
{
|
||||
u_char buf[SHA256_DIGEST_LENGTH];
|
||||
seed_t tmpseed;
|
||||
EVP_MD_CTX* ctx = zeek::detail::hash_init(zeek::detail::Hash_SHA256);
|
||||
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size) {
|
||||
u_char buf[SHA256_DIGEST_LENGTH];
|
||||
seed_t tmpseed;
|
||||
EVP_MD_CTX* ctx = zeek::detail::hash_init(zeek::detail::Hash_SHA256);
|
||||
|
||||
assert(sizeof(tmpseed) == 16);
|
||||
assert(sizeof(tmpseed) == 16);
|
||||
|
||||
static auto global_hash_seed = id::find_val<StringVal>("global_hash_seed");
|
||||
static auto global_hash_seed = id::find_val<StringVal>("global_hash_seed");
|
||||
|
||||
if ( data )
|
||||
zeek::detail::hash_update(ctx, data, size);
|
||||
if ( data )
|
||||
zeek::detail::hash_update(ctx, data, size);
|
||||
|
||||
else if ( global_hash_seed->Len() > 0 )
|
||||
zeek::detail::hash_update(ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
|
||||
else if ( global_hash_seed->Len() > 0 )
|
||||
zeek::detail::hash_update(ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
|
||||
|
||||
else
|
||||
{
|
||||
unsigned int first_seed = util::detail::initial_seed();
|
||||
zeek::detail::hash_update(ctx, &first_seed, sizeof(first_seed));
|
||||
}
|
||||
else {
|
||||
unsigned int first_seed = util::detail::initial_seed();
|
||||
zeek::detail::hash_update(ctx, &first_seed, sizeof(first_seed));
|
||||
}
|
||||
|
||||
zeek::detail::hash_final(ctx, buf);
|
||||
memcpy(&tmpseed, buf, sizeof(tmpseed)); // Use the first bytes as seed.
|
||||
return tmpseed;
|
||||
}
|
||||
zeek::detail::hash_final(ctx, buf);
|
||||
memcpy(&tmpseed, buf, sizeof(tmpseed)); // Use the first bytes as seed.
|
||||
return tmpseed;
|
||||
}
|
||||
|
||||
Hasher::digest_vector Hasher::Hash(const zeek::detail::HashKey* key) const
|
||||
{
|
||||
return Hash(key->Key(), key->Size());
|
||||
}
|
||||
Hasher::digest_vector Hasher::Hash(const zeek::detail::HashKey* key) const { return Hash(key->Key(), key->Size()); }
|
||||
|
||||
Hasher::Hasher(size_t arg_k, seed_t arg_seed)
|
||||
{
|
||||
k = arg_k;
|
||||
seed = arg_seed;
|
||||
}
|
||||
Hasher::Hasher(size_t arg_k, seed_t arg_seed) {
|
||||
k = arg_k;
|
||||
seed = arg_seed;
|
||||
}
|
||||
|
||||
broker::expected<broker::data> Hasher::Serialize() const
|
||||
{
|
||||
return {broker::vector{static_cast<uint64_t>(Type()), static_cast<uint64_t>(k), seed.h[0],
|
||||
seed.h[1]}};
|
||||
}
|
||||
broker::expected<broker::data> Hasher::Serialize() const {
|
||||
return {broker::vector{static_cast<uint64_t>(Type()), static_cast<uint64_t>(k), seed.h[0], seed.h[1]}};
|
||||
}
|
||||
|
||||
std::unique_ptr<Hasher> Hasher::Unserialize(const broker::data& data)
|
||||
{
|
||||
auto v = broker::get_if<broker::vector>(&data);
|
||||
std::unique_ptr<Hasher> Hasher::Unserialize(const broker::data& data) {
|
||||
auto v = broker::get_if<broker::vector>(&data);
|
||||
|
||||
if ( ! (v && v->size() == 4) )
|
||||
return nullptr;
|
||||
if ( ! (v && v->size() == 4) )
|
||||
return nullptr;
|
||||
|
||||
auto type = broker::get_if<uint64_t>(&(*v)[0]);
|
||||
auto k = broker::get_if<uint64_t>(&(*v)[1]);
|
||||
auto h1 = broker::get_if<uint64_t>(&(*v)[2]);
|
||||
auto h2 = broker::get_if<uint64_t>(&(*v)[3]);
|
||||
auto type = broker::get_if<uint64_t>(&(*v)[0]);
|
||||
auto k = broker::get_if<uint64_t>(&(*v)[1]);
|
||||
auto h1 = broker::get_if<uint64_t>(&(*v)[2]);
|
||||
auto h2 = broker::get_if<uint64_t>(&(*v)[3]);
|
||||
|
||||
if ( ! (type && k && h1 && h2) )
|
||||
return nullptr;
|
||||
if ( ! (type && k && h1 && h2) )
|
||||
return nullptr;
|
||||
|
||||
std::unique_ptr<Hasher> hasher;
|
||||
std::unique_ptr<Hasher> hasher;
|
||||
|
||||
switch ( *type )
|
||||
{
|
||||
case Default:
|
||||
hasher = std::unique_ptr<Hasher>(new DefaultHasher(*k, {*h1, *h2}));
|
||||
break;
|
||||
switch ( *type ) {
|
||||
case Default: hasher = std::unique_ptr<Hasher>(new DefaultHasher(*k, {*h1, *h2})); break;
|
||||
|
||||
case Double:
|
||||
hasher = std::unique_ptr<Hasher>(new DoubleHasher(*k, {*h1, *h2}));
|
||||
break;
|
||||
}
|
||||
case Double: hasher = std::unique_ptr<Hasher>(new DoubleHasher(*k, {*h1, *h2})); break;
|
||||
}
|
||||
|
||||
// Note that the derived classes don't hold any further state of
// their own. They reconstruct all their information from their
// constructors' arguments.
|
||||
|
||||
return hasher;
|
||||
}
|
||||
return hasher;
|
||||
}
|
||||
|
||||
UHF::UHF()
|
||||
{
|
||||
memset(&seed, 0, sizeof(seed));
|
||||
}
|
||||
UHF::UHF() { memset(&seed, 0, sizeof(seed)); }
|
||||
|
||||
UHF::UHF(Hasher::seed_t arg_seed)
|
||||
{
|
||||
seed = arg_seed;
|
||||
}
|
||||
UHF::UHF(Hasher::seed_t arg_seed) { seed = arg_seed; }
|
||||
|
||||
// This function is almost equivalent to HashKey::HashBytes except that it
|
||||
// does not depend on global state and that we mix in the seed multiple
|
||||
// times.
|
||||
Hasher::digest UHF::hash(const void* x, size_t n) const
|
||||
{
|
||||
static_assert(std::is_same_v<highwayhash::SipHashState::Key, decltype(seed.h)>,
|
||||
"Seed value is not the same type as highwayhash key");
|
||||
return highwayhash::SipHash(seed.h, reinterpret_cast<const char*>(x), n);
|
||||
}
|
||||
Hasher::digest UHF::hash(const void* x, size_t n) const {
|
||||
static_assert(std::is_same_v<highwayhash::SipHashState::Key, decltype(seed.h)>,
|
||||
"Seed value is not the same type as highwayhash key");
|
||||
return highwayhash::SipHash(seed.h, reinterpret_cast<const char*>(x), n);
|
||||
}
|
||||
|
||||
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed) : Hasher(k, seed)
|
||||
{
|
||||
for ( size_t i = 1; i <= k; ++i )
|
||||
{
|
||||
seed_t s = Seed();
|
||||
s.h[0] += util::detail::prng(i);
|
||||
hash_functions.emplace_back(s);
|
||||
}
|
||||
}
|
||||
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed) : Hasher(k, seed) {
|
||||
for ( size_t i = 1; i <= k; ++i ) {
|
||||
seed_t s = Seed();
|
||||
s.h[0] += util::detail::prng(i);
|
||||
hash_functions.emplace_back(s);
|
||||
}
|
||||
}
|
||||
|
||||
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
|
||||
{
|
||||
digest_vector h(K(), 0);
|
||||
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const {
|
||||
digest_vector h(K(), 0);
|
||||
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
h[i] = hash_functions[i](x, n);
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
h[i] = hash_functions[i](x, n);
|
||||
|
||||
return h;
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
DefaultHasher* DefaultHasher::Clone() const
|
||||
{
|
||||
return new DefaultHasher(*this);
|
||||
}
|
||||
DefaultHasher* DefaultHasher::Clone() const { return new DefaultHasher(*this); }
|
||||
|
||||
bool DefaultHasher::Equals(const Hasher* other) const
|
||||
{
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
bool DefaultHasher::Equals(const Hasher* other) const {
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
|
||||
const DefaultHasher* o = static_cast<const DefaultHasher*>(other);
|
||||
return hash_functions == o->hash_functions;
|
||||
}
|
||||
const DefaultHasher* o = static_cast<const DefaultHasher*>(other);
|
||||
return hash_functions == o->hash_functions;
|
||||
}
|
||||
|
||||
DoubleHasher::DoubleHasher(size_t k, seed_t seed)
|
||||
: Hasher(k, seed), h1(seed + util::detail::prng(1)), h2(seed + util::detail::prng(2))
|
||||
{
|
||||
}
|
||||
: Hasher(k, seed), h1(seed + util::detail::prng(1)), h2(seed + util::detail::prng(2)) {}
|
||||
|
||||
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const
|
||||
{
|
||||
digest d1 = h1(x, n);
|
||||
digest d2 = h2(x, n);
|
||||
digest_vector h(K(), 0);
|
||||
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const {
|
||||
digest d1 = h1(x, n);
|
||||
digest d2 = h2(x, n);
|
||||
digest_vector h(K(), 0);
|
||||
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
h[i] = d1 + i * d2;
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
h[i] = d1 + i * d2;
|
||||
|
||||
return h;
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
DoubleHasher* DoubleHasher::Clone() const
|
||||
{
|
||||
return new DoubleHasher(*this);
|
||||
}
|
||||
DoubleHasher* DoubleHasher::Clone() const { return new DoubleHasher(*this); }
|
||||
|
||||
bool DoubleHasher::Equals(const Hasher* other) const
|
||||
{
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
bool DoubleHasher::Equals(const Hasher* other) const {
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
|
||||
const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
|
||||
return h1 == o->h1 && h2 == o->h2;
|
||||
}
|
||||
const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
|
||||
return h1 == o->h1 && h2 == o->h2;
|
||||
}
|
||||
|
||||
} // namespace zeek::probabilistic::detail
|
||||
} // namespace zeek::probabilistic::detail
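
// Illustrative usage sketch (not part of the diff): deriving a stable seed
// from a filter name and hashing a buffer with k = 5 hash functions. The
// buffer 'data'/'len' is a hypothetical input.
//
//   using namespace zeek::probabilistic::detail;
//   Hasher::seed_t seed = Hasher::MakeSeed("my-filter", 9);
//   DefaultHasher hasher(5, seed);
//   Hasher::digest_vector digests = hasher.Hash(data, len); // 5 digests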
|
||||
|
|
|
@ -7,252 +7,242 @@
|
|||
|
||||
#include "zeek/Hash.h"
|
||||
|
||||
namespace broker
|
||||
{
|
||||
namespace broker {
|
||||
class data;
|
||||
}
|
||||
}
|
||||
|
||||
namespace zeek::probabilistic::detail
|
||||
{
|
||||
namespace zeek::probabilistic::detail {
|
||||
|
||||
/** Types of derived Hasher classes. */
|
||||
enum HasherType
|
||||
{
|
||||
Default,
|
||||
Double
|
||||
};
|
||||
enum HasherType { Default, Double };
|
||||
|
||||
/**
|
||||
* Abstract base class for hashers. A hasher creates a family of hash
|
||||
* functions to hash an element *k* times.
|
||||
*/
|
||||
class Hasher
|
||||
{
|
||||
class Hasher {
|
||||
public:
|
||||
using digest = zeek::detail::hash_t;
|
||||
using digest_vector = std::vector<digest>;
|
||||
struct seed_t
|
||||
{
|
||||
// actually HH_U64, which has the same type
|
||||
alignas(16) unsigned long long h[2];
|
||||
using digest = zeek::detail::hash_t;
|
||||
using digest_vector = std::vector<digest>;
|
||||
struct seed_t {
|
||||
// actually HH_U64, which has the same type
|
||||
alignas(16) unsigned long long h[2];
|
||||
|
||||
friend seed_t operator+(seed_t lhs, const uint64_t rhs)
|
||||
{
|
||||
lhs.h[0] += rhs;
|
||||
return lhs;
|
||||
}
|
||||
};
|
||||
friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
|
||||
lhs.h[0] += rhs;
|
||||
return lhs;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Creates a valid hasher seed from an arbitrary string.
|
||||
*
|
||||
* @param data A pointer to contiguous data that should be crunched into a
|
||||
* seed. If 0, the function tries to find a global_hash_seed script variable
|
||||
* to derive a seed from. If this variable does not exist, the function uses
|
||||
* the initial seed generated at Zeek startup.
|
||||
*
|
||||
* @param size The number of bytes of *data*.
|
||||
*
|
||||
* @return A seed suitable for hashers.
|
||||
*/
|
||||
static seed_t MakeSeed(const void* data, size_t size);
|
||||
/**
|
||||
* Creates a valid hasher seed from an arbitrary string.
|
||||
*
|
||||
* @param data A pointer to contiguous data that should be crunched into a
|
||||
* seed. If 0, the function tries to find a global_hash_seed script variable
|
||||
* to derive a seed from. If this variable does not exist, the function uses
|
||||
* the initial seed generated at Zeek startup.
|
||||
*
|
||||
* @param size The number of bytes of *data*.
|
||||
*
|
||||
* @return A seed suitable for hashers.
|
||||
*/
|
||||
static seed_t MakeSeed(const void* data, size_t size);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~Hasher() { }
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~Hasher() {}
|
||||
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The element to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
template <typename T> digest_vector operator()(const T& x) const { return Hash(&x, sizeof(T)); }
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The element to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
template<typename T>
|
||||
digest_vector operator()(const T& x) const {
|
||||
return Hash(&x, sizeof(T));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The key of the value to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
digest_vector Hash(const zeek::detail::HashKey* key) const;
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The key of the value to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
digest_vector Hash(const zeek::detail::HashKey* key) const;
|
||||
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
* @param x Pointer to first byte to hash.
|
||||
*
|
||||
* @param n Number of bytes to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*
|
||||
*/
|
||||
virtual digest_vector Hash(const void* x, size_t n) const = 0;
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
* @param x Pointer to first byte to hash.
|
||||
*
|
||||
* @param n Number of bytes to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*
|
||||
*/
|
||||
virtual digest_vector Hash(const void* x, size_t n) const = 0;
|
||||
|
||||
/**
|
||||
* Returns a deep copy of the hasher.
|
||||
*/
|
||||
virtual Hasher* Clone() const = 0;
|
||||
/**
|
||||
* Returns a deep copy of the hasher.
|
||||
*/
|
||||
virtual Hasher* Clone() const = 0;
|
||||
|
||||
/**
|
||||
* Returns true if two hashers are identical.
|
||||
*/
|
||||
virtual bool Equals(const Hasher* other) const = 0;
|
||||
/**
|
||||
* Returns true if two hashers are identical.
|
||||
*/
|
||||
virtual bool Equals(const Hasher* other) const = 0;
|
||||
|
||||
/**
 * Returns the number *k* of hash functions the hasher applies.
 */
size_t K() const { return k; }
|
||||
|
||||
/**
|
||||
* Returns the seed used to construct the hasher.
|
||||
*/
|
||||
seed_t Seed() const { return seed; }
|
||||
/**
|
||||
* Returns the seed used to construct the hasher.
|
||||
*/
|
||||
seed_t Seed() const { return seed; }
|
||||
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static std::unique_ptr<Hasher> Unserialize(const broker::data& data);
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static std::unique_ptr<Hasher> Unserialize(const broker::data& data);
|
||||
|
||||
protected:
|
||||
Hasher() { }
|
||||
Hasher() {}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param arg_k the number of hash functions.
|
||||
*
|
||||
* @param arg_seed The seed for the hasher.
|
||||
*/
|
||||
Hasher(size_t arg_k, seed_t arg_seed);
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param arg_k the number of hash functions.
|
||||
*
|
||||
* @param arg_seed The seed for the hasher.
|
||||
*/
|
||||
Hasher(size_t arg_k, seed_t arg_seed);
|
||||
|
||||
virtual HasherType Type() const = 0;
|
||||
virtual HasherType Type() const = 0;
|
||||
|
||||
private:
|
||||
size_t k = 0;
|
||||
seed_t seed = {0};
|
||||
};
|
||||
size_t k = 0;
|
||||
seed_t seed = {0};
|
||||
};
|
||||
|
||||
/**
|
||||
* A universal hash function family. This is a helper class that Hasher
|
||||
* implementations can use in their implementation.
|
||||
*/
|
||||
class UHF
|
||||
{
|
||||
class UHF {
|
||||
public:
|
||||
/**
|
||||
* Default constructor with zero seed.
|
||||
*/
|
||||
UHF();
|
||||
/**
|
||||
* Default constructor with zero seed.
|
||||
*/
|
||||
UHF();
|
||||
|
||||
/**
 * Constructs a hash function seeded with a given seed and an
 * optional extra seed to replace the initial Zeek seed.
 *
 * @param arg_seed The seed to use for this instance.
 */
explicit UHF(Hasher::seed_t arg_seed);
|
||||
|
||||
template <typename T> Hasher::digest operator()(const T& x) const
|
||||
{
|
||||
return hash(&x, sizeof(T));
|
||||
}
|
||||
template<typename T>
|
||||
Hasher::digest operator()(const T& x) const {
|
||||
return hash(&x, sizeof(T));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The element to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
Hasher::digest operator()(const void* x, size_t n) const { return hash(x, n); }
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The element to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
Hasher::digest operator()(const void* x, size_t n) const { return hash(x, n); }
|
||||
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
* @param x Pointer to first byte to hash.
|
||||
*
|
||||
* @param n Number of bytes to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*
|
||||
*/
|
||||
Hasher::digest hash(const void* x, size_t n) const;
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
* @param x Pointer to first byte to hash.
|
||||
*
|
||||
* @param n Number of bytes to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*
|
||||
*/
|
||||
Hasher::digest hash(const void* x, size_t n) const;
|
||||
|
||||
friend bool operator==(const UHF& x, const UHF& y)
|
||||
{
|
||||
return (x.seed.h[0] == y.seed.h[0]) && (x.seed.h[1] == y.seed.h[1]);
|
||||
}
|
||||
friend bool operator==(const UHF& x, const UHF& y) {
|
||||
return (x.seed.h[0] == y.seed.h[0]) && (x.seed.h[1] == y.seed.h[1]);
|
||||
}
|
||||
|
||||
friend bool operator!=(const UHF& x, const UHF& y) { return ! (x == y); }
|
||||
friend bool operator!=(const UHF& x, const UHF& y) { return ! (x == y); }
|
||||
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static UHF Unserialize(const broker::data& data);
|
||||
broker::expected<broker::data> Serialize() const;
|
||||
static UHF Unserialize(const broker::data& data);
|
||||
|
||||
private:
|
||||
static size_t compute_seed(Hasher::seed_t seed);
|
||||
static size_t compute_seed(Hasher::seed_t seed);
|
||||
|
||||
Hasher::seed_t seed;
|
||||
};
|
||||
Hasher::seed_t seed;
|
||||
};
|
||||
|
||||
/**
|
||||
* A hasher implementing the default hashing policy. Uses *k* separate hash
|
||||
* functions internally.
|
||||
*/
|
||||
class DefaultHasher : public Hasher
|
||||
{
|
||||
class DefaultHasher : public Hasher {
|
||||
public:
|
||||
/**
|
||||
* Constructor for a hasher with *k* hash functions.
|
||||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DefaultHasher(size_t k, Hasher::seed_t seed);
|
||||
/**
|
||||
* Constructor for a hasher with *k* hash functions.
|
||||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DefaultHasher(size_t k, Hasher::seed_t seed);
|
||||
|
||||
// Overridden from Hasher.
|
||||
digest_vector Hash(const void* x, size_t n) const final;
|
||||
DefaultHasher* Clone() const final;
|
||||
bool Equals(const Hasher* other) const final;
|
||||
// Overridden from Hasher.
|
||||
digest_vector Hash(const void* x, size_t n) const final;
|
||||
DefaultHasher* Clone() const final;
|
||||
bool Equals(const Hasher* other) const final;
|
||||
|
||||
private:
|
||||
DefaultHasher() { }
|
||||
DefaultHasher() {}
|
||||
|
||||
HasherType Type() const override { return HasherType::Default; }
|
||||
HasherType Type() const override { return HasherType::Default; }
|
||||
|
||||
std::vector<UHF> hash_functions;
|
||||
};
|
||||
std::vector<UHF> hash_functions;
|
||||
};
|
||||
|
||||
/**
|
||||
* The *double-hashing* policy. Uses a linear combination of two hash
|
||||
* functions.
|
||||
*/
|
||||
class DoubleHasher : public Hasher
|
||||
{
|
||||
class DoubleHasher : public Hasher {
|
||||
public:
|
||||
/**
|
||||
* Constructor for a double hasher with *k* hash functions.
|
||||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DoubleHasher(size_t k, Hasher::seed_t seed);
|
||||
/**
|
||||
* Constructor for a double hasher with *k* hash functions.
|
||||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DoubleHasher(size_t k, Hasher::seed_t seed);
|
||||
|
||||
// Overridden from Hasher.
|
||||
digest_vector Hash(const void* x, size_t n) const final;
|
||||
DoubleHasher* Clone() const final;
|
||||
bool Equals(const Hasher* other) const final;
|
||||
// Overridden from Hasher.
|
||||
digest_vector Hash(const void* x, size_t n) const final;
|
||||
DoubleHasher* Clone() const final;
|
||||
bool Equals(const Hasher* other) const final;
|
||||
|
||||
private:
|
||||
DoubleHasher() { }
|
||||
DoubleHasher() {}
|
||||
|
||||
HasherType Type() const override { return HasherType::Double; }
|
||||
HasherType Type() const override { return HasherType::Double; }
|
||||
|
||||
UHF h1;
|
||||
UHF h2;
|
||||
};
|
||||
UHF h1;
|
||||
UHF h2;
|
||||
};
|
||||
|
||||
} // namespace zeek::probabilistic::detail
|
||||
} // namespace zeek::probabilistic::detail
|
||||
|
|
|
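The two concrete hashers differ only in how they derive their *k* digests: DefaultHasher keeps *k* independently seeded UHF instances, while DoubleHasher combines just two base hashes as h1 + i*h2. Below is a minimal standalone sketch of that double-hashing idea, not Zeek's implementation; std::hash and the "#salt" suffix are stand-ins for the two seeded UHF functions.

// Standalone sketch (assumptions noted above): derive k digests from two base hashes.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

static std::vector<uint64_t> double_hash(const std::string& data, size_t k) {
    // Two base digests; a salted copy stands in for a second, independently seeded hash.
    uint64_t h1 = std::hash<std::string>{}(data);
    uint64_t h2 = std::hash<std::string>{}(data + "#salt");

    std::vector<uint64_t> digests;
    digests.reserve(k);
    for ( size_t i = 0; i < k; i++ )
        digests.push_back(h1 + i * h2); // linear combination h1 + i*h2

    return digests;
}

int main() {
    for ( auto d : double_hash("192.168.0.1", 4) )
        std::cout << d << "\n";
}
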
@ -9,513 +9,467 @@
#include "zeek/Reporter.h"
#include "zeek/broker/Data.h"

namespace zeek::probabilistic::detail {

static void topk_element_hash_delete_func(void* val) {
    Element* e = (Element*)val;
    delete e;
}

void TopkVal::Typify(TypePtr t) {
    assert(! hash && ! type);
    type = std::move(t);
    auto tl = make_intrusive<TypeList>(type);
    tl->Append(type);
    hash = new zeek::detail::CompositeHash(std::move(tl));
}

zeek::detail::HashKey* TopkVal::GetHash(Val* v) const {
    auto key = hash->MakeHashKey(*v, true);
    assert(key);
    return key.release();
}

TopkVal::TopkVal(uint64_t arg_size) : OpaqueVal(topk_type) {
    elementDict = new PDict<Element>;
    elementDict->SetDeleteFunc(topk_element_hash_delete_func);
    size = arg_size;
    numElements = 0;
    pruned = false;
    hash = nullptr;
}

TopkVal::TopkVal() : OpaqueVal(topk_type) {
    elementDict = new PDict<Element>;
    elementDict->SetDeleteFunc(topk_element_hash_delete_func);
    size = 0;
    numElements = 0;
    hash = nullptr;
}

TopkVal::~TopkVal() {
    elementDict->Clear();
    delete elementDict;

    // Now all elements are already gone - delete the buckets.
    std::list<Bucket*>::iterator bi = buckets.begin();
    while ( bi != buckets.end() ) {
        delete *bi;
        bi++;
    }

    delete hash;
}

void TopkVal::Merge(const TopkVal* value, bool doPrune) {
    if ( ! value->type ) {
        // Merge-from is empty. Nothing to do.
        assert(value->numElements == 0);
        return;
    }

    if ( type == nullptr ) {
        assert(numElements == 0);
        Typify(value->type);
    }

    else {
        if ( ! same_type(type, value->type) ) {
            reporter->Error("Cannot merge top-k elements of differing types.");
            return;
        }
    }

    std::list<Bucket*>::const_iterator it = value->buckets.begin();
    while ( it != value->buckets.end() ) {
        Bucket* b = *it;
        uint64_t currcount = b->count;
        std::list<Element*>::const_iterator eit = b->elements.begin();

        while ( eit != b->elements.end() ) {
            Element* e = *eit;
            // Look up if we already know this one...
            zeek::detail::HashKey* key = GetHash(e->value);
            Element* olde = (Element*)elementDict->Lookup(key);

            if ( olde == nullptr ) {
                olde = new Element();
                olde->epsilon = 0;
                olde->value = e->value;
                // Insert at bucket position 0.
                if ( buckets.size() > 0 ) {
                    assert(buckets.front()->count > 0);
                }

                Bucket* newbucket = new Bucket();
                newbucket->count = 0;
                newbucket->bucketPos = buckets.insert(buckets.begin(), newbucket);

                olde->parent = newbucket;
                newbucket->elements.insert(newbucket->elements.end(), olde);

                elementDict->Insert(key, olde);
                numElements++;
            }

            // Now that we are sure that the old element is present - increment epsilon...
            olde->epsilon += e->epsilon;

            // ... and increment its position.
            IncrementCounter(olde, currcount);
            delete key;

            eit++;
        }

        it++;
    }

    // Now we have added everything, and our top-k table could be too big.
    // Prune everything...

    assert(size > 0);

    if ( ! doPrune )
        return;

    while ( numElements > size ) {
        pruned = true;
        assert(buckets.size() > 0);
        Bucket* b = buckets.front();
        assert(b->elements.size() > 0);

        Element* e = b->elements.front();
        zeek::detail::HashKey* key = GetHash(e->value);
        elementDict->RemoveEntry(key);
        delete key;
        delete e;

        b->elements.pop_front();

        if ( b->elements.size() == 0 ) {
            delete b;
            buckets.pop_front();
        }

        numElements--;
    }
}

ValPtr TopkVal::DoClone(CloneState* state) {
    auto clone = make_intrusive<TopkVal>(size);
    clone->Merge(this);
    return state->NewClone(this, std::move(clone));
}

VectorValPtr TopkVal::GetTopK(int k) const // returns vector
{
    if ( numElements == 0 ) {
        reporter->Error("Cannot return topk of empty");
        return nullptr;
    }

    auto v = make_intrusive<VectorType>(type);
    auto t = make_intrusive<VectorVal>(std::move(v));

    // This does no estimation of whether the result is correct!
    // In any case - just to make this future-proof (and I am lazy) - this can return more than k.

    int read = 0;
    std::list<Bucket*>::const_iterator it = buckets.end();
    it--;
    while ( read < k ) {
        // printf("Bucket %llu\n", (*it)->count);
        std::list<Element*>::iterator eit = (*it)->elements.begin();
        while ( eit != (*it)->elements.end() ) {
            // printf("Size: %ld\n", (*it)->elements.size());
            t->Assign(read, (*eit)->value);
            read++;
            eit++;
        }

        if ( it == buckets.begin() )
            break;

        it--;
    }

    return t;
}

uint64_t TopkVal::GetCount(Val* value) const {
    zeek::detail::HashKey* key = GetHash(value);
    Element* e = (Element*)elementDict->Lookup(key);
    delete key;

    if ( e == nullptr ) {
        reporter->Error("GetCount for element that is not in top-k");
        return 0;
    }

    return e->parent->count;
}

uint64_t TopkVal::GetEpsilon(Val* value) const {
    zeek::detail::HashKey* key = GetHash(value);
    Element* e = (Element*)elementDict->Lookup(key);
    delete key;

    if ( e == nullptr ) {
        reporter->Error("GetEpsilon for element that is not in top-k");
        return 0;
    }

    return e->epsilon;
}

uint64_t TopkVal::GetSum() const {
    uint64_t sum = 0;

    std::list<Bucket*>::const_iterator it = buckets.begin();
    while ( it != buckets.end() ) {
        sum += (*it)->elements.size() * (*it)->count;

        it++;
    }

    if ( pruned )
        reporter->Warning(
            "TopkVal::GetSum() was used on a pruned data structure. Result values do "
            "not represent total element count");

    return sum;
}

void TopkVal::Encountered(ValPtr encountered) {
    // OK, let's see if we already know this one.

    if ( numElements == 0 )
        Typify(encountered->GetType());
    else if ( ! same_type(type, encountered->GetType()) ) {
        reporter->Error("Trying to add element to topk with differing type from other elements");
        return;
    }

    // Step 1 - get the hash.
    zeek::detail::HashKey* key = GetHash(encountered);
    Element* e = (Element*)elementDict->Lookup(key);

    if ( e == nullptr ) {
        e = new Element();
        e->epsilon = 0;
        e->value = std::move(encountered);

        // Well, we do not know this one yet...
        if ( numElements < size ) {
            // Brilliant. Just add it at position 1.
            if ( buckets.size() == 0 || (*buckets.begin())->count > 1 ) {
                Bucket* b = new Bucket();
                b->count = 1;
                std::list<Bucket*>::iterator pos = buckets.insert(buckets.begin(), b);
                b->bucketPos = pos;
                b->elements.insert(b->elements.end(), e);
                e->parent = b;
            }
            else {
                Bucket* b = *buckets.begin();
                assert(b->count == 1);
                b->elements.insert(b->elements.end(), e);
                e->parent = b;
            }

            elementDict->Insert(key, e);
            numElements++;
            delete key;

            return; // Done. It is at pos 1.
        }

        else {
            // Replace the element with the minimal value.
            Bucket* b = *buckets.begin(); // bucket with smallest elements

            // Evict the oldest element with the least hits.
            assert(b->elements.size() > 0);
            zeek::detail::HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
            b->elements.erase(b->elements.begin());
            Element* deleteElement = (Element*)elementDict->RemoveEntry(deleteKey);
            assert(deleteElement); // there has to have been a minimal element...
            delete deleteElement;
            delete deleteKey;

            // ... and add the new one to the end.
            e->epsilon = b->count;
            b->elements.insert(b->elements.end(), e);
            elementDict->Insert(key, e);
            e->parent = b;

            // Fall through; the increment operation has to run!
        }
    }

    // OK, we now have an element in e.
    delete key;
    IncrementCounter(e); // well, this certainly was anticlimactic.
}

// Increment by count.
void TopkVal::IncrementCounter(Element* e, unsigned int count) {
    Bucket* currBucket = e->parent;
    uint64_t currcount = currBucket->count;

    // Well, let's test if there is a bucket for currcount + count.
    std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;

    Bucket* nextBucket = nullptr;

    bucketIter++;

    while ( bucketIter != buckets.end() && (*bucketIter)->count < currcount + count )
        bucketIter++;

    if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount + count )
        nextBucket = *bucketIter;

    if ( nextBucket == nullptr ) {
        // The bucket for the value that we want does not exist.
        // Create it...

        Bucket* b = new Bucket();
        b->count = currcount + count;

        std::list<Bucket*>::iterator nextBucketPos = buckets.insert(bucketIter, b);
        b->bucketPos = nextBucketPos; // and give it the iterator we know now.

        nextBucket = b;
    }

    // OK, now we have the new bucket in nextBucket. Shift the element over...
    currBucket->elements.remove(e);
    nextBucket->elements.insert(nextBucket->elements.end(), e);

    e->parent = nextBucket;

    // If currBucket is empty, we have to delete it now.
    if ( currBucket->elements.size() == 0 ) {
        buckets.remove(currBucket);
        delete currBucket;
        currBucket = nullptr;
    }
}

IMPLEMENT_OPAQUE_VALUE(TopkVal)

broker::expected<broker::data> TopkVal::DoSerialize() const {
    broker::vector d = {size, numElements, pruned};

    if ( type ) {
        auto t = SerializeType(type);
        if ( ! t )
            return broker::ec::invalid_data;

        d.emplace_back(std::move(*t));
    }
    else
        d.emplace_back(broker::none());

    uint64_t i = 0;
    std::list<Bucket*>::const_iterator it = buckets.begin();
    while ( it != buckets.end() ) {
        Bucket* b = *it;
        uint32_t elements_count = b->elements.size();

        d.emplace_back(static_cast<uint64_t>(b->elements.size()));
        d.emplace_back(b->count);

        std::list<Element*>::const_iterator eit = b->elements.begin();
        while ( eit != b->elements.end() ) {
            Element* element = *eit;
            d.emplace_back(element->epsilon);
            auto v = Broker::detail::val_to_data(element->value.get());
            if ( ! v )
                return broker::ec::invalid_data;

            d.emplace_back(*v);

            eit++;
            i++;
        }

        it++;
    }

    assert(i == numElements);
    return {std::move(d)};
}

bool TopkVal::DoUnserialize(const broker::data& data) {
    auto v = broker::get_if<broker::vector>(&data);

    if ( ! (v && v->size() >= 4) )
        return false;

    auto size_ = broker::get_if<uint64_t>(&(*v)[0]);
    auto numElements_ = broker::get_if<uint64_t>(&(*v)[1]);
    auto pruned_ = broker::get_if<bool>(&(*v)[2]);

    if ( ! (size_ && numElements_ && pruned_) )
        return false;

    size = *size_;
    numElements = *numElements_;
    pruned = *pruned_;

    auto no_type = broker::get_if<broker::none>(&(*v)[3]);
    if ( ! no_type ) {
        auto t = UnserializeType((*v)[3]);

        if ( ! t )
            return false;

        Typify(t);
    }

    uint64_t i = 0;
    uint64_t idx = 4;

    while ( i < numElements ) {
        auto elements_count = broker::get_if<uint64_t>(&(*v)[idx++]);
        auto count = broker::get_if<uint64_t>(&(*v)[idx++]);

        if ( ! (elements_count && count) )
            return false;

        Bucket* b = new Bucket();
        b->count = *count;
        b->bucketPos = buckets.insert(buckets.end(), b);

        for ( uint64_t j = 0; j < *elements_count; j++ ) {
            auto epsilon = broker::get_if<uint64_t>(&(*v)[idx++]);
            auto val = Broker::detail::data_to_val((*v)[idx++], type.get());

            if ( ! (epsilon && val) )
                return false;

            Element* e = new Element();
            e->epsilon = *epsilon;
            e->value = std::move(val);
            e->parent = b;

            b->elements.insert(b->elements.end(), e);

            zeek::detail::HashKey* key = GetHash(e->value);
            assert(elementDict->Lookup(key) == nullptr);

            elementDict->Insert(key, e);
            delete key;

            i++;
        }
    }

    assert(i == numElements);
    return true;
}

} // namespace zeek::probabilistic::detail

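Encountered() follows a Space-Saving-style scheme: a known element simply moves up one bucket, a new element is admitted directly while there is room, and once the structure is full the element with the smallest count is evicted and the newcomer inherits that count as its error bound (epsilon). The following is a minimal standalone sketch of that eviction step only, using a plain std::map instead of Zeek's PDict and bucket list; it is an illustration, not the TopkVal code.

// Standalone sketch (assumptions noted above) of the admit/evict logic.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

struct Counter {
    uint64_t count;
    uint64_t epsilon; // overestimation bound inherited from the evicted key
};

static void encountered(std::map<std::string, Counter>& table, size_t capacity, const std::string& key) {
    auto it = table.find(key);
    if ( it != table.end() ) {
        it->second.count++; // known element: just bump its count
        return;
    }

    if ( table.size() < capacity ) {
        table[key] = {1, 0}; // room left: admit with exact count
        return;
    }

    // Full: evict the entry with the minimal count and let the newcomer
    // start from that count, remembering it as its possible error.
    auto min_it = std::min_element(table.begin(), table.end(), [](const auto& a, const auto& b) {
        return a.second.count < b.second.count;
    });
    uint64_t min_count = min_it->second.count;
    table.erase(min_it);
    table[key] = {min_count + 1, min_count};
}

int main() {
    std::map<std::string, Counter> table;
    for ( std::string k : {"a", "b", "a", "c", "a", "d", "b"} )
        encountered(table, 3, k);

    for ( const auto& [k, c] : table )
        std::cout << k << ": count=" << c.count << " epsilon=" << c.epsilon << "\n";
}
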
@ -13,173 +13,167 @@
//
// Or - to be more precise - it implements an interpretation of it.

namespace zeek::detail {
class CompositeHash;
}

namespace zeek::probabilistic::detail {

struct Element;

struct Bucket {
    uint64_t count;
    std::list<Element*> elements;

    // Iterators only get invalidated for removed elements. This one
    // points to us - so it is invalid when we are no longer there. Cute,
    // isn't it?
    std::list<Bucket*>::iterator bucketPos;
};

struct Element {
    uint64_t epsilon;
    ValPtr value;
    Bucket* parent;
};

class TopkVal : public OpaqueVal {
public:
    /**
     * Construct a TopkVal.
     *
     * @param size specifies how many total elements are tracked
     *
     * @return A newly initialized TopkVal
     */
    explicit TopkVal(uint64_t size);

    /**
     * Destructor.
     */
    ~TopkVal() override;

    /**
     * Call this when a new value is encountered. Note that on the first
     * call, the Zeek type of the values that are counted is set. All
     * following calls to Encountered have to specify the same type.
     *
     * @param value The encountered element
     */
    void Encountered(ValPtr value);

    /**
     * Get the first *k* elements of the result vector. At the moment,
     * this does not check if it is in the right order or if we can prove
     * that these are the correct top-k. Use count and epsilon for this.
     *
     * @param k Number of top elements to return
     *
     * @returns The top-k encountered elements
     */
    VectorValPtr GetTopK(int k) const;

    /**
     * Get the current count tracked in the top-k data structure for a
     * certain val. Returns 0 if the val is unknown (and logs the error
     * to reporter).
     *
     * @param value Zeek value to get counts for
     *
     * @returns internal count for val, 0 if unknown
     */
    uint64_t GetCount(Val* value) const;

    /**
     * Get the current epsilon tracked in the top-k data structure for a
     * certain val.
     *
     * @param value Zeek value to get epsilons for
     *
     * @returns the epsilon. Returns 0 if the val is unknown (and logs
     * the error to reporter)
     */
    uint64_t GetEpsilon(Val* value) const;

    /**
     * Get the size set in the constructor.
     *
     * @returns size of the top-k structure
     */
    uint64_t GetSize() const { return size; }

    /**
     * Get the sum of all counts of all tracked elements. This is equal
     * to the number of total observations up to this moment, if no
     * elements were pruned from the data structure.
     *
     * @returns sum of all counts
     */
    uint64_t GetSum() const;

    /**
     * Merge another top-k data structure into this one. doPrune
     * specifies if the total count of elements is limited to size after
     * merging. Please note that pruning will invalidate the results of
     * GetSum.
     *
     * @param value TopkVal to merge into this TopkVal
     *
     * @param doPrune prune resulting TopkVal to size after merging
     */
    void Merge(const TopkVal* value, bool doPrune = false);

    /**
     * Clone the opaque type.
     *
     * @param state Clone state (tracking duplicate pointers)
     *
     * @returns cloned TopkVal
     */
    ValPtr DoClone(CloneState* state) override;

    DECLARE_OPAQUE_VALUE(TopkVal)

protected:
    /**
     * Construct an empty TopkVal. Only used for deserialization.
     */
    TopkVal();

private:
    /**
     * Increment the counter for a specific element.
     *
     * @param e element to increment counter for
     *
     * @param count increment counter by this much
     */
    void IncrementCounter(Element* e, unsigned int count = 1);

    /**
     * Get the hash key for a specific value.
     *
     * @param v value to generate key for
     *
     * @returns HashKey for value
     */
    zeek::detail::HashKey* GetHash(Val* v) const; // this probably should go somewhere else.
    zeek::detail::HashKey* GetHash(const ValPtr& v) const { return GetHash(v.get()); }

    /**
     * Set the type that this TopK instance tracks.
     *
     * @param t type that is tracked
     */
    void Typify(TypePtr t);

    TypePtr type;
    zeek::detail::CompositeHash* hash = nullptr;
    std::list<Bucket*> buckets;
    PDict<Element>* elementDict = nullptr;
    uint64_t size = 0;        // how many elements are we tracking?
    uint64_t numElements = 0; // how many elements do we have at the moment
    bool pruned = false;      // was this data structure pruned?
};

} // namespace zeek::probabilistic::detail

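As a rough guide to interpreting the two accessors above: GetCount() may overestimate an element's true frequency by at most its epsilon, so count minus epsilon is a guaranteed lower bound. A minimal standalone sketch of that interpretation (the concrete numbers are made up for illustration):

// Standalone sketch: interpreting a (count, epsilon) pair from the top-k structure.
#include <cstdint>
#include <iostream>

int main() {
    uint64_t count = 42;  // hypothetical GetCount() result
    uint64_t epsilon = 5; // hypothetical GetEpsilon() result

    // True frequency f satisfies: count - epsilon <= f <= count.
    uint64_t guaranteed = count - epsilon;
    std::cout << "seen at least " << guaranteed << " and at most " << count << " times\n";
}
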