Reformat Zeek in Spicy style

This largely copies over Spicy's `.clang-format` configuration file. The
one place where we deviate is header include order since Zeek depends on
headers being included in a certain order.
This commit is contained in:
Benjamin Bannier 2023-10-10 21:13:34 +02:00
parent 7b8e7ed72c
commit f5a76c1aed
786 changed files with 131714 additions and 153609 deletions

File diff suppressed because it is too large Load diff

View file

@ -7,346 +7,338 @@
#include <memory>
#include <vector>
namespace broker
{
namespace broker {
class data;
}
}
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
/**
* A vector of bits.
*/
class BitVector
{
class BitVector {
public:
using block_type = uint64_t;
using size_type = size_t;
using const_reference = bool;
using block_type = uint64_t;
using size_type = size_t;
using const_reference = bool;
static size_type npos;
static block_type bits_per_block;
static size_type npos;
static block_type bits_per_block;
/**
* An lvalue proxy for individual bits.
*/
class Reference
{
public:
/**
* Inverts the bits' values.
*/
Reference& Flip();
/**
* An lvalue proxy for individual bits.
*/
class Reference {
public:
/**
* Inverts the bits' values.
*/
Reference& Flip();
operator bool() const;
bool operator~() const;
Reference& operator=(bool x);
Reference& operator=(const Reference& other);
Reference& operator|=(bool x);
Reference& operator&=(bool x);
Reference& operator^=(bool x);
Reference& operator-=(bool x);
operator bool() const;
bool operator~() const;
Reference& operator=(bool x);
Reference& operator=(const Reference& other);
Reference& operator|=(bool x);
Reference& operator&=(bool x);
Reference& operator^=(bool x);
Reference& operator-=(bool x);
private:
friend class BitVector;
private:
friend class BitVector;
Reference(block_type& block, block_type i);
void operator&();
Reference(block_type& block, block_type i);
void operator&();
block_type& block;
const block_type mask;
};
block_type& block;
const block_type mask;
};
/**
* Default-constructs an empty bit vector.
*/
BitVector();
/**
* Default-constructs an empty bit vector.
*/
BitVector();
/**
* Constructs a bit vector of a given size.
* @param size The number of bits.
* @param value The value for each bit.
*/
explicit BitVector(size_type size, bool value = false);
/**
* Constructs a bit vector of a given size.
* @param size The number of bits.
* @param value The value for each bit.
*/
explicit BitVector(size_type size, bool value = false);
/**
* Constructs a bit vector from a sequence of blocks.
*
* @param first Start of range
* @param last End of range.
*
*/
template <typename InputIterator> BitVector(InputIterator first, InputIterator last)
{
bits.insert(bits.end(), first, last);
num_bits = bits.size() * bits_per_block;
}
/**
* Constructs a bit vector from a sequence of blocks.
*
* @param first Start of range
* @param last End of range.
*
*/
template<typename InputIterator>
BitVector(InputIterator first, InputIterator last) {
bits.insert(bits.end(), first, last);
num_bits = bits.size() * bits_per_block;
}
/**
* Copy-constructs a bit vector.
* @param other The bit vector to copy.
*/
BitVector(const BitVector& other);
/**
* Copy-constructs a bit vector.
* @param other The bit vector to copy.
*/
BitVector(const BitVector& other);
/**
* Assigns another bit vector to this instance.
* @param other The RHS of the assignment.
*/
BitVector& operator=(const BitVector& other);
/**
* Assigns another bit vector to this instance.
* @param other The RHS of the assignment.
*/
BitVector& operator=(const BitVector& other);
//
// Bitwise operations.
//
BitVector operator~() const;
BitVector operator<<(size_type n) const;
BitVector operator>>(size_type n) const;
BitVector& operator<<=(size_type n);
BitVector& operator>>=(size_type n);
BitVector& operator&=(BitVector const& other);
BitVector& operator|=(BitVector const& other);
BitVector& operator^=(BitVector const& other);
BitVector& operator-=(BitVector const& other);
friend BitVector operator&(BitVector const& x, BitVector const& y);
friend BitVector operator|(BitVector const& x, BitVector const& y);
friend BitVector operator^(BitVector const& x, BitVector const& y);
friend BitVector operator-(BitVector const& x, BitVector const& y);
//
// Bitwise operations.
//
BitVector operator~() const;
BitVector operator<<(size_type n) const;
BitVector operator>>(size_type n) const;
BitVector& operator<<=(size_type n);
BitVector& operator>>=(size_type n);
BitVector& operator&=(BitVector const& other);
BitVector& operator|=(BitVector const& other);
BitVector& operator^=(BitVector const& other);
BitVector& operator-=(BitVector const& other);
friend BitVector operator&(BitVector const& x, BitVector const& y);
friend BitVector operator|(BitVector const& x, BitVector const& y);
friend BitVector operator^(BitVector const& x, BitVector const& y);
friend BitVector operator-(BitVector const& x, BitVector const& y);
//
// Relational operators
//
friend bool operator==(BitVector const& x, BitVector const& y);
friend bool operator!=(BitVector const& x, BitVector const& y);
friend bool operator<(BitVector const& x, BitVector const& y);
//
// Relational operators
//
friend bool operator==(BitVector const& x, BitVector const& y);
friend bool operator!=(BitVector const& x, BitVector const& y);
friend bool operator<(BitVector const& x, BitVector const& y);
//
// Basic operations
//
//
// Basic operations
//
/** Appends the bits in a sequence of values.
* @tparam Iterator A forward iterator.
* @param first An iterator pointing to the first element of the sequence.
* @param last An iterator pointing to one past the last element of the
* sequence.
*/
template <typename ForwardIterator> void Append(ForwardIterator first, ForwardIterator last)
{
if ( first == last )
return;
/** Appends the bits in a sequence of values.
* @tparam Iterator A forward iterator.
* @param first An iterator pointing to the first element of the sequence.
* @param last An iterator pointing to one past the last element of the
* sequence.
*/
template<typename ForwardIterator>
void Append(ForwardIterator first, ForwardIterator last) {
if ( first == last )
return;
block_type excess = extra_bits();
typename std::iterator_traits<ForwardIterator>::difference_type delta = std::distance(first,
last);
block_type excess = extra_bits();
typename std::iterator_traits<ForwardIterator>::difference_type delta = std::distance(first, last);
bits.reserve(Blocks() + delta);
bits.reserve(Blocks() + delta);
if ( excess == 0 )
{
bits.back() |= (*first << excess);
if ( excess == 0 ) {
bits.back() |= (*first << excess);
do
{
block_type b = *first++ >> (bits_per_block - excess);
bits.push_back(b | (first == last ? 0 : *first << excess));
} while ( first != last );
}
do {
block_type b = *first++ >> (bits_per_block - excess);
bits.push_back(b | (first == last ? 0 : *first << excess));
} while ( first != last );
}
else
bits.insert(bits.end(), first, last);
else
bits.insert(bits.end(), first, last);
num_bits += bits_per_block * delta;
}
num_bits += bits_per_block * delta;
}
/**
* Appends the bits in a given block.
* @param block The block containing bits to append.
*/
void Append(block_type block);
/**
* Appends the bits in a given block.
* @param block The block containing bits to append.
*/
void Append(block_type block);
/** Appends a single bit to the end of the bit vector.
* @param bit The value of the bit.
*/
void PushBack(bool bit);
/** Appends a single bit to the end of the bit vector.
* @param bit The value of the bit.
*/
void PushBack(bool bit);
/**
* Clears all bits in the bitvector.
*/
void Clear();
/**
* Clears all bits in the bitvector.
*/
void Clear();
/**
* Resizes the bit vector to a new number of bits.
* @param n The new number of bits of the bit vector.
* @param value The bit value of new values, if the vector expands.
*/
void Resize(size_type n, bool value = false);
/**
* Resizes the bit vector to a new number of bits.
* @param n The new number of bits of the bit vector.
* @param value The bit value of new values, if the vector expands.
*/
void Resize(size_type n, bool value = false);
/**
* Sets a bit at a specific position to a given value.
* @param i The bit position.
* @param bit The value assigned to position *i*.
* @return A reference to the bit vector instance.
*/
BitVector& Set(size_type i, bool bit = true);
/**
* Sets a bit at a specific position to a given value.
* @param i The bit position.
* @param bit The value assigned to position *i*.
* @return A reference to the bit vector instance.
*/
BitVector& Set(size_type i, bool bit = true);
/**
* Sets all bits to 1.
* @return A reference to the bit vector instance.
*/
BitVector& Set();
/**
* Sets all bits to 1.
* @return A reference to the bit vector instance.
*/
BitVector& Set();
/**
* Resets a bit at a specific position, i.e., sets it to 0.
* @param i The bit position.
* @return A reference to the bit vector instance.
*/
BitVector& Reset(size_type i);
/**
* Resets a bit at a specific position, i.e., sets it to 0.
* @param i The bit position.
* @return A reference to the bit vector instance.
*/
BitVector& Reset(size_type i);
/**
* Sets all bits to 0.
* @return A reference to the bit vector instance.
*/
BitVector& Reset();
/**
* Sets all bits to 0.
* @return A reference to the bit vector instance.
*/
BitVector& Reset();
/**
* Toggles/flips a bit at a specific position.
* @param i The bit position.
* @return A reference to the bit vector instance.
*/
BitVector& Flip(size_type i);
/**
* Toggles/flips a bit at a specific position.
* @param i The bit position.
* @return A reference to the bit vector instance.
*/
BitVector& Flip(size_type i);
/**
* Computes the complement.
* @return A reference to the bit vector instance.
*/
BitVector& Flip();
/**
* Computes the complement.
* @return A reference to the bit vector instance.
*/
BitVector& Flip();
/** Retrieves a single bit.
* @param i The bit position.
* @return A mutable reference to the bit at position *i*.
*/
Reference operator[](size_type i);
/** Retrieves a single bit.
* @param i The bit position.
* @return A mutable reference to the bit at position *i*.
*/
Reference operator[](size_type i);
/**
* Retrieves a single bit.
* @param i The bit position.
* @return A const-reference to the bit at position *i*.
*/
const_reference operator[](size_type i) const;
/**
* Retrieves a single bit.
* @param i The bit position.
* @return A const-reference to the bit at position *i*.
*/
const_reference operator[](size_type i) const;
/**
* Counts the number of 1-bits in the bit vector. Also known as *population
* count* or *Hamming weight*.
* @return The number of bits set to 1.
*/
size_type Count() const;
/**
* Counts the number of 1-bits in the bit vector. Also known as *population
* count* or *Hamming weight*.
* @return The number of bits set to 1.
*/
size_type Count() const;
/**
* Retrieves the number of blocks of the underlying storage.
* @param The number of blocks that represent `Size()` bits.
*/
size_type Blocks() const;
/**
* Retrieves the number of blocks of the underlying storage.
* @param The number of blocks that represent `Size()` bits.
*/
size_type Blocks() const;
/**
* Retrieves the number of bits the bitvector consist of.
* @return The length of the bit vector in bits.
*/
size_type Size() const;
/**
* Retrieves the number of bits the bitvector consist of.
* @return The length of the bit vector in bits.
*/
size_type Size() const;
/**
* Checks whether the bit vector is empty.
* @return `true` iff the bitvector has zero length.
*/
bool Empty() const;
/**
* Checks whether the bit vector is empty.
* @return `true` iff the bitvector has zero length.
*/
bool Empty() const;
/**
* Checks whether all bits are 0.
* @return `true` iff all bits in all blocks are 0.
*/
bool AllZero() const;
/**
* Checks whether all bits are 0.
* @return `true` iff all bits in all blocks are 0.
*/
bool AllZero() const;
/**
* Finds the bit position of of the first 1-bit.
* @return The position of the first bit that equals to one or `npos` if no
* such bit exists.
*/
size_type FindFirst() const;
/**
* Finds the bit position of of the first 1-bit.
* @return The position of the first bit that equals to one or `npos` if no
* such bit exists.
*/
size_type FindFirst() const;
/**
* Finds the next 1-bit from a given starting position.
*
* @param i The index where to start looking.
*
* @return The position of the first bit that equals to 1 after position
* *i* or `npos` if no such bit exists.
*/
size_type FindNext(size_type i) const;
/**
* Finds the next 1-bit from a given starting position.
*
* @param i The index where to start looking.
*
* @return The position of the first bit that equals to 1 after position
* *i* or `npos` if no such bit exists.
*/
size_type FindNext(size_type i) const;
/** Computes a hash value of the internal representation.
* This is mainly for debugging/testing purposes.
*
* @return The hash.
*/
uint64_t Hash() const;
/** Computes a hash value of the internal representation.
* This is mainly for debugging/testing purposes.
*
* @return The hash.
*/
uint64_t Hash() const;
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<BitVector> Unserialize(const broker::data& data);
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<BitVector> Unserialize(const broker::data& data);
private:
/**
* Computes the number of excess/unused bits in the bit vector.
*/
block_type extra_bits() const;
/**
* Computes the number of excess/unused bits in the bit vector.
*/
block_type extra_bits() const;
/**
* If the number of bits in the vector are not a multiple of
* bitvector::bits_per_block, then the last block exhibits unused bits which
* this function resets.
*/
void zero_unused_bits();
/**
* If the number of bits in the vector are not a multiple of
* bitvector::bits_per_block, then the last block exhibits unused bits which
* this function resets.
*/
void zero_unused_bits();
/**
* Looks for the first 1-bit starting at a given position.
* @param i The block index to start looking.
* @return The block index of the first 1-bit starting from *i* or
* `bitvector::npos` if no 1-bit exists.
*/
size_type find_from(size_type i) const;
/**
* Looks for the first 1-bit starting at a given position.
* @param i The block index to start looking.
* @return The block index of the first 1-bit starting from *i* or
* `bitvector::npos` if no 1-bit exists.
*/
size_type find_from(size_type i) const;
/**
* Computes the block index for a given bit position.
*/
static size_type block_index(size_type i) { return i / bits_per_block; }
/**
* Computes the block index for a given bit position.
*/
static size_type block_index(size_type i) { return i / bits_per_block; }
/**
* Computes the bit index within a given block for a given bit position.
*/
static block_type bit_index(size_type i) { return i % bits_per_block; }
/**
* Computes the bit index within a given block for a given bit position.
*/
static block_type bit_index(size_type i) { return i % bits_per_block; }
/**
* Computes the bitmask block to extract a bit a given bit position.
*/
static block_type bit_mask(size_type i) { return block_type(1) << bit_index(i); }
/**
* Computes the bitmask block to extract a bit a given bit position.
*/
static block_type bit_mask(size_type i) { return block_type(1) << bit_index(i); }
/**
* Computes the number of blocks needed to represent a given number of
* bits.
* @param bits the number of bits.
* @return The number of blocks to represent *bits* number of bits.
*/
static size_type bits_to_blocks(size_type bits)
{
return bits / bits_per_block + static_cast<size_type>(bits % bits_per_block != 0);
}
/**
* Computes the number of blocks needed to represent a given number of
* bits.
* @param bits the number of bits.
* @return The number of blocks to represent *bits* number of bits.
*/
static size_type bits_to_blocks(size_type bits) {
return bits / bits_per_block + static_cast<size_type>(bits % bits_per_block != 0);
}
/**
* Computes the bit position first 1-bit in a given block.
* @param block The block to inspect.
* @return The bit position where *block* has its first bit set to 1.
*/
static size_type lowest_bit(block_type block);
/**
* Computes the bit position first 1-bit in a given block.
* @param block The block to inspect.
* @return The bit position where *block* has its first bit set to 1.
*/
static size_type lowest_bit(block_type block);
std::vector<block_type> bits;
size_type num_bits;
};
std::vector<block_type> bits;
size_type num_bits;
};
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -11,368 +11,287 @@
#include "zeek/probabilistic/CounterVector.h"
#include "zeek/util.h"
namespace zeek::probabilistic
{
namespace zeek::probabilistic {
BloomFilter::BloomFilter()
{
hasher = nullptr;
}
BloomFilter::BloomFilter(const detail::Hasher* arg_hasher)
{
hasher = arg_hasher;
}
BloomFilter::~BloomFilter()
{
delete hasher;
}
BloomFilter::BloomFilter() { hasher = nullptr; }
broker::expected<broker::data> BloomFilter::Serialize() const
{
auto h = hasher->Serialize();
if ( ! h )
return broker::ec::invalid_data; // Cannot serialize
BloomFilter::BloomFilter(const detail::Hasher* arg_hasher) { hasher = arg_hasher; }
auto d = DoSerialize();
BloomFilter::~BloomFilter() { delete hasher; }
if ( ! d )
return broker::ec::invalid_data; // Cannot serialize
return {broker::vector{static_cast<uint64_t>(Type()), std::move(*h), std::move(*d)}};
}
std::unique_ptr<BloomFilter> BloomFilter::Unserialize(const broker::data& data)
{
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() == 3) )
return nullptr;
auto type = broker::get_if<uint64_t>(&(*v)[0]);
if ( ! type )
return nullptr;
auto hasher_ = detail::Hasher::Unserialize((*v)[1]);
if ( ! hasher_ )
return nullptr;
std::unique_ptr<BloomFilter> bf;
broker::expected<broker::data> BloomFilter::Serialize() const {
auto h = hasher->Serialize();
switch ( *type )
{
case Basic:
bf = std::unique_ptr<BloomFilter>(new BasicBloomFilter());
break;
case Counting:
bf = std::unique_ptr<BloomFilter>(new CountingBloomFilter());
break;
default:
reporter->Error("found invalid bloom filter type");
return nullptr;
}
if ( ! bf->DoUnserialize((*v)[2]) )
return nullptr;
bf->hasher = hasher_.release();
return bf;
}
size_t BasicBloomFilter::M(double fp, size_t capacity)
{
double ln2 = std::log(2);
return std::ceil(-(capacity * std::log(fp) / ln2 / ln2));
}
size_t BasicBloomFilter::K(size_t cells, size_t capacity)
{
double frac = static_cast<double>(cells) / static_cast<double>(capacity);
return std::ceil(frac * std::log(2));
}
bool BasicBloomFilter::Empty() const
{
return bits->AllZero();
}
void BasicBloomFilter::Clear()
{
bits->Reset();
}
bool BasicBloomFilter::Merge(const BloomFilter* other)
{
if ( typeid(*this) != typeid(*other) )
return false;
const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) )
{
reporter->Error("incompatible hashers in BasicBloomFilter merge");
return false;
}
else if ( bits->Size() != o->bits->Size() )
{
reporter->Error("different bitvector size in BasicBloomFilter merge");
return false;
}
(*bits) |= *o->bits;
return true;
}
BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const
{
if ( typeid(*this) != typeid(*other) )
return nullptr;
const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) )
{
reporter->Error("incompatible hashers in BasicBloomFilter intersect");
return nullptr;
}
else if ( bits->Size() != o->bits->Size() )
{
reporter->Error("different bitvector size in BasicBloomFilter intersect");
return nullptr;
}
auto copy = Clone();
(*copy->bits) &= *o->bits;
return copy;
}
BasicBloomFilter* BasicBloomFilter::Clone() const
{
BasicBloomFilter* copy = new BasicBloomFilter();
copy->hasher = hasher->Clone();
copy->bits = new detail::BitVector(*bits);
return copy;
}
std::string BasicBloomFilter::InternalState() const
{
return util::fmt("%" PRIu64, bits->Hash());
}
BasicBloomFilter::BasicBloomFilter()
{
bits = nullptr;
}
BasicBloomFilter::BasicBloomFilter(const detail::Hasher* hasher, size_t cells) : BloomFilter(hasher)
{
bits = new detail::BitVector(cells);
}
BasicBloomFilter::~BasicBloomFilter()
{
delete bits;
}
void BasicBloomFilter::Add(const zeek::detail::HashKey* key)
{
detail::Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
bits->Set(h[i] % bits->Size());
}
bool BasicBloomFilter::Decrement(const zeek::detail::HashKey* key)
{
// operation not supported by basic bloom filter
return false;
}
size_t BasicBloomFilter::Count(const zeek::detail::HashKey* key) const
{
detail::Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
{
if ( ! (*bits)[h[i] % bits->Size()] )
return 0;
}
return 1;
}
broker::expected<broker::data> BasicBloomFilter::DoSerialize() const
{
auto b = bits->Serialize();
return b;
}
bool BasicBloomFilter::DoUnserialize(const broker::data& data)
{
auto b = detail::BitVector::Unserialize(data);
if ( ! b )
return false;
bits = b.release();
return true;
}
CountingBloomFilter::CountingBloomFilter()
{
cells = nullptr;
}
CountingBloomFilter::CountingBloomFilter(const detail::Hasher* hasher, size_t arg_cells,
size_t width)
: BloomFilter(hasher)
{
cells = new detail::CounterVector(width, arg_cells);
}
CountingBloomFilter::~CountingBloomFilter()
{
delete cells;
}
bool CountingBloomFilter::Empty() const
{
return cells->AllZero();
}
void CountingBloomFilter::Clear()
{
cells->Reset();
}
bool CountingBloomFilter::Merge(const BloomFilter* other)
{
if ( typeid(*this) != typeid(*other) )
return false;
const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) )
{
reporter->Error("incompatible hashers in CountingBloomFilter merge");
return false;
}
else if ( cells->Size() != o->cells->Size() )
{
reporter->Error("different bitvector size in CountingBloomFilter merge");
return false;
}
(*cells) |= *o->cells;
return true;
}
BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const
{
if ( typeid(*this) != typeid(*other) )
return nullptr;
const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) )
{
reporter->Error("incompatible hashers in CountingBloomFilter merge");
return nullptr;
}
else if ( cells->Size() != o->cells->Size() )
{
reporter->Error("different bitvector size in CountingBloomFilter merge");
return nullptr;
}
auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());
*outbf->bits |= cells->ToBitVector();
*outbf->bits &= o->cells->ToBitVector();
return outbf;
}
CountingBloomFilter* CountingBloomFilter::Clone() const
{
CountingBloomFilter* copy = new CountingBloomFilter();
copy->hasher = hasher->Clone();
copy->cells = new detail::CounterVector(*cells);
return copy;
}
std::string CountingBloomFilter::InternalState() const
{
return util::fmt("%" PRIu64, cells->Hash());
}
if ( ! h )
return broker::ec::invalid_data; // Cannot serialize
auto d = DoSerialize();
if ( ! d )
return broker::ec::invalid_data; // Cannot serialize
return {broker::vector{static_cast<uint64_t>(Type()), std::move(*h), std::move(*d)}};
}
std::unique_ptr<BloomFilter> BloomFilter::Unserialize(const broker::data& data) {
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() == 3) )
return nullptr;
auto type = broker::get_if<uint64_t>(&(*v)[0]);
if ( ! type )
return nullptr;
auto hasher_ = detail::Hasher::Unserialize((*v)[1]);
if ( ! hasher_ )
return nullptr;
std::unique_ptr<BloomFilter> bf;
switch ( *type ) {
case Basic: bf = std::unique_ptr<BloomFilter>(new BasicBloomFilter()); break;
case Counting: bf = std::unique_ptr<BloomFilter>(new CountingBloomFilter()); break;
default: reporter->Error("found invalid bloom filter type"); return nullptr;
}
if ( ! bf->DoUnserialize((*v)[2]) )
return nullptr;
bf->hasher = hasher_.release();
return bf;
}
size_t BasicBloomFilter::M(double fp, size_t capacity) {
double ln2 = std::log(2);
return std::ceil(-(capacity * std::log(fp) / ln2 / ln2));
}
size_t BasicBloomFilter::K(size_t cells, size_t capacity) {
double frac = static_cast<double>(cells) / static_cast<double>(capacity);
return std::ceil(frac * std::log(2));
}
bool BasicBloomFilter::Empty() const { return bits->AllZero(); }
void BasicBloomFilter::Clear() { bits->Reset(); }
bool BasicBloomFilter::Merge(const BloomFilter* other) {
if ( typeid(*this) != typeid(*other) )
return false;
const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) ) {
reporter->Error("incompatible hashers in BasicBloomFilter merge");
return false;
}
else if ( bits->Size() != o->bits->Size() ) {
reporter->Error("different bitvector size in BasicBloomFilter merge");
return false;
}
(*bits) |= *o->bits;
return true;
}
BasicBloomFilter* BasicBloomFilter::Intersect(const BloomFilter* other) const {
if ( typeid(*this) != typeid(*other) )
return nullptr;
const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) ) {
reporter->Error("incompatible hashers in BasicBloomFilter intersect");
return nullptr;
}
else if ( bits->Size() != o->bits->Size() ) {
reporter->Error("different bitvector size in BasicBloomFilter intersect");
return nullptr;
}
auto copy = Clone();
(*copy->bits) &= *o->bits;
return copy;
}
BasicBloomFilter* BasicBloomFilter::Clone() const {
BasicBloomFilter* copy = new BasicBloomFilter();
copy->hasher = hasher->Clone();
copy->bits = new detail::BitVector(*bits);
return copy;
}
std::string BasicBloomFilter::InternalState() const { return util::fmt("%" PRIu64, bits->Hash()); }
BasicBloomFilter::BasicBloomFilter() { bits = nullptr; }
BasicBloomFilter::BasicBloomFilter(const detail::Hasher* hasher, size_t cells) : BloomFilter(hasher) {
bits = new detail::BitVector(cells);
}
BasicBloomFilter::~BasicBloomFilter() { delete bits; }
void BasicBloomFilter::Add(const zeek::detail::HashKey* key) {
detail::Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
bits->Set(h[i] % bits->Size());
}
bool BasicBloomFilter::Decrement(const zeek::detail::HashKey* key) {
// operation not supported by basic bloom filter
return false;
}
size_t BasicBloomFilter::Count(const zeek::detail::HashKey* key) const {
detail::Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i ) {
if ( ! (*bits)[h[i] % bits->Size()] )
return 0;
}
return 1;
}
broker::expected<broker::data> BasicBloomFilter::DoSerialize() const {
auto b = bits->Serialize();
return b;
}
bool BasicBloomFilter::DoUnserialize(const broker::data& data) {
auto b = detail::BitVector::Unserialize(data);
if ( ! b )
return false;
bits = b.release();
return true;
}
CountingBloomFilter::CountingBloomFilter() { cells = nullptr; }
CountingBloomFilter::CountingBloomFilter(const detail::Hasher* hasher, size_t arg_cells, size_t width)
: BloomFilter(hasher) {
cells = new detail::CounterVector(width, arg_cells);
}
CountingBloomFilter::~CountingBloomFilter() { delete cells; }
bool CountingBloomFilter::Empty() const { return cells->AllZero(); }
void CountingBloomFilter::Clear() { cells->Reset(); }
bool CountingBloomFilter::Merge(const BloomFilter* other) {
if ( typeid(*this) != typeid(*other) )
return false;
const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) ) {
reporter->Error("incompatible hashers in CountingBloomFilter merge");
return false;
}
else if ( cells->Size() != o->cells->Size() ) {
reporter->Error("different bitvector size in CountingBloomFilter merge");
return false;
}
(*cells) |= *o->cells;
return true;
}
BasicBloomFilter* CountingBloomFilter::Intersect(const BloomFilter* other) const {
if ( typeid(*this) != typeid(*other) )
return nullptr;
const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);
if ( ! hasher->Equals(o->hasher) ) {
reporter->Error("incompatible hashers in CountingBloomFilter merge");
return nullptr;
}
else if ( cells->Size() != o->cells->Size() ) {
reporter->Error("different bitvector size in CountingBloomFilter merge");
return nullptr;
}
auto outbf = new BasicBloomFilter(hasher->Clone(), cells->Size());
*outbf->bits |= cells->ToBitVector();
*outbf->bits &= o->cells->ToBitVector();
return outbf;
}
CountingBloomFilter* CountingBloomFilter::Clone() const {
CountingBloomFilter* copy = new CountingBloomFilter();
copy->hasher = hasher->Clone();
copy->cells = new detail::CounterVector(*cells);
return copy;
}
std::string CountingBloomFilter::InternalState() const { return util::fmt("%" PRIu64, cells->Hash()); }
// TODO: Use partitioning in add/count to allow for reusing CMS bounds.
void CountingBloomFilter::Add(const zeek::detail::HashKey* key)
{
detail::Hasher::digest_vector h = hasher->Hash(key);
void CountingBloomFilter::Add(const zeek::detail::HashKey* key) {
detail::Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
cells->Increment(h[i] % cells->Size());
}
for ( size_t i = 0; i < h.size(); ++i )
cells->Increment(h[i] % cells->Size());
}
bool CountingBloomFilter::Decrement(const zeek::detail::HashKey* key)
{
// Only decrement if a member.
if ( Count(key) == 0 )
return false;
bool CountingBloomFilter::Decrement(const zeek::detail::HashKey* key) {
// Only decrement if a member.
if ( Count(key) == 0 )
return false;
detail::Hasher::digest_vector h = hasher->Hash(key);
detail::Hasher::digest_vector h = hasher->Hash(key);
for ( size_t i = 0; i < h.size(); ++i )
cells->Decrement(h[i] % cells->Size());
for ( size_t i = 0; i < h.size(); ++i )
cells->Decrement(h[i] % cells->Size());
return true;
}
return true;
}
size_t CountingBloomFilter::Count(const zeek::detail::HashKey* key) const
{
detail::Hasher::digest_vector h = hasher->Hash(key);
size_t CountingBloomFilter::Count(const zeek::detail::HashKey* key) const {
detail::Hasher::digest_vector h = hasher->Hash(key);
detail::CounterVector::size_type min =
std::numeric_limits<detail::CounterVector::size_type>::max();
detail::CounterVector::size_type min = std::numeric_limits<detail::CounterVector::size_type>::max();
for ( size_t i = 0; i < h.size(); ++i )
{
detail::CounterVector::size_type cnt = cells->Count(h[i] % cells->Size());
if ( cnt < min )
min = cnt;
}
for ( size_t i = 0; i < h.size(); ++i ) {
detail::CounterVector::size_type cnt = cells->Count(h[i] % cells->Size());
if ( cnt < min )
min = cnt;
}
return min;
}
return min;
}
broker::expected<broker::data> CountingBloomFilter::DoSerialize() const
{
auto c = cells->Serialize();
return c;
}
broker::expected<broker::data> CountingBloomFilter::DoSerialize() const {
auto c = cells->Serialize();
return c;
}
bool CountingBloomFilter::DoUnserialize(const broker::data& data)
{
auto c = detail::CounterVector::Unserialize(data);
if ( ! c )
return false;
bool CountingBloomFilter::DoUnserialize(const broker::data& data) {
auto c = detail::CounterVector::Unserialize(data);
if ( ! c )
return false;
cells = c.release();
return true;
}
cells = c.release();
return true;
}
} // namespace zeek::probabilistic
} // namespace zeek::probabilistic

View file

@ -12,273 +12,263 @@
#include "zeek/probabilistic/BitVector.h"
#include "zeek/probabilistic/Hasher.h"
namespace broker
{
namespace broker {
class data;
}
}
namespace zeek::probabilistic
{
namespace detail
{
namespace zeek::probabilistic {
namespace detail {
class CounterVector;
}
}
/** Types of derived BloomFilter classes. */
enum BloomFilterType
{
Basic,
Counting
};
enum BloomFilterType { Basic, Counting };
/**
* The abstract base class for Bloom filters.
*/
class BloomFilter
{
class BloomFilter {
public:
/**
* Destructor.
*/
virtual ~BloomFilter();
/**
* Destructor.
*/
virtual ~BloomFilter();
/**
* Adds an element to the Bloom filter, or increments its value for counting
* bloom filters
*
* @param key The key associated with the element to add.
*/
virtual void Add(const zeek::detail::HashKey* key) = 0;
/**
* Adds an element to the Bloom filter, or increments its value for counting
* bloom filters
*
* @param key The key associated with the element to add.
*/
virtual void Add(const zeek::detail::HashKey* key) = 0;
/**
* Decrements the value of an element in the bloom filter, if the underlying
* filter supports the operation
*
* #param key The key associated with the element to decrement.
*
* @return True if the decrement operation succeeded.
*/
virtual bool Decrement(const zeek::detail::HashKey* key) = 0;
/**
* Decrements the value of an element in the bloom filter, if the underlying
* filter supports the operation
*
* #param key The key associated with the element to decrement.
*
* @return True if the decrement operation succeeded.
*/
virtual bool Decrement(const zeek::detail::HashKey* key) = 0;
/**
* Retrieves the associated count of a given value.
*
* @param key The key associated with the element to check.
*
* @return The counter associated with *key*.
*/
virtual size_t Count(const zeek::detail::HashKey* key) const = 0;
/**
* Retrieves the associated count of a given value.
*
* @param key The key associated with the element to check.
*
* @return The counter associated with *key*.
*/
virtual size_t Count(const zeek::detail::HashKey* key) const = 0;
/**
* Checks whether the Bloom filter is empty.
*
* @return `true` if the Bloom filter contains no elements.
*/
virtual bool Empty() const = 0;
/**
* Checks whether the Bloom filter is empty.
*
* @return `true` if the Bloom filter contains no elements.
*/
virtual bool Empty() const = 0;
/**
* Removes all elements, i.e., resets all bits in the underlying bit vector.
*/
virtual void Clear() = 0;
/**
* Removes all elements, i.e., resets all bits in the underlying bit vector.
*/
virtual void Clear() = 0;
/**
* Merges another Bloom filter into this one.
*
* @param other The other Bloom filter.
*
* @return `true` on success.
*/
virtual bool Merge(const BloomFilter* other) = 0;
/**
* Merges another Bloom filter into this one.
*
* @param other The other Bloom filter.
*
* @return `true` on success.
*/
virtual bool Merge(const BloomFilter* other) = 0;
/**
* Intersects another Bloom filter with a copy of this one and returns the copy.
*
* @param other The other Bloom filter.
*
* @return Intersecting BloomFilter on success, nullptr otherwise.
*/
virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;
/**
* Intersects another Bloom filter with a copy of this one and returns the copy.
*
* @param other The other Bloom filter.
*
* @return Intersecting BloomFilter on success, nullptr otherwise.
*/
virtual BloomFilter* Intersect(const BloomFilter* other) const = 0;
/**
* Constructs a copy of this Bloom filter.
*
* @return A copy of `*this`.
*/
virtual BloomFilter* Clone() const = 0;
/**
* Constructs a copy of this Bloom filter.
*
* @return A copy of `*this`.
*/
virtual BloomFilter* Clone() const = 0;
/**
* Returns a string with a representation of the Bloom filter's
* internal state. This is for debugging/testing purposes only.
*/
virtual std::string InternalState() const = 0;
/**
* Returns a string with a representation of the Bloom filter's
* internal state. This is for debugging/testing purposes only.
*/
virtual std::string InternalState() const = 0;
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<BloomFilter> Unserialize(const broker::data& data);
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<BloomFilter> Unserialize(const broker::data& data);
protected:
/**
* Default constructor.
*/
BloomFilter();
/**
* Default constructor.
*/
BloomFilter();
/**
* Constructs a Bloom filter.
*
* @param hasher The hasher to use for this Bloom filter.
*/
explicit BloomFilter(const detail::Hasher* hasher);
/**
* Constructs a Bloom filter.
*
* @param hasher The hasher to use for this Bloom filter.
*/
explicit BloomFilter(const detail::Hasher* hasher);
virtual broker::expected<broker::data> DoSerialize() const = 0;
virtual bool DoUnserialize(const broker::data& data) = 0;
virtual BloomFilterType Type() const = 0;
virtual broker::expected<broker::data> DoSerialize() const = 0;
virtual bool DoUnserialize(const broker::data& data) = 0;
virtual BloomFilterType Type() const = 0;
const detail::Hasher* hasher;
};
const detail::Hasher* hasher;
};
class CountingBloomFilter;
/**
* A basic Bloom filter.
*/
class BasicBloomFilter : public BloomFilter
{
friend class CountingBloomFilter;
class BasicBloomFilter : public BloomFilter {
friend class CountingBloomFilter;
public:
/**
* Constructs a basic Bloom filter with a given number of cells. The
* ideal number of cells can be computed with *M*.
*
* @param hasher The hasher to use. The ideal number of hash
* functions can be computed with *K*.
*
* @param cells The number of cells.
*/
BasicBloomFilter(const detail::Hasher* hasher, size_t cells);
/**
* Constructs a basic Bloom filter with a given number of cells. The
* ideal number of cells can be computed with *M*.
*
* @param hasher The hasher to use. The ideal number of hash
* functions can be computed with *K*.
*
* @param cells The number of cells.
*/
BasicBloomFilter(const detail::Hasher* hasher, size_t cells);
/**
* Destructor.
*/
~BasicBloomFilter() override;
/**
* Destructor.
*/
~BasicBloomFilter() override;
/**
* Computes the number of cells based on a given false positive rate
* and capacity. In the literature, this parameter often has the name
* *M*.
*
* @param fp The false positive rate.
*
* @param capacity The expected number of elements that will be
* stored.
*
* Returns: The number cells needed to support a false positive rate
* of *fp* with at most *capacity* elements.
*/
static size_t M(double fp, size_t capacity);
/**
* Computes the number of cells based on a given false positive rate
* and capacity. In the literature, this parameter often has the name
* *M*.
*
* @param fp The false positive rate.
*
* @param capacity The expected number of elements that will be
* stored.
*
* Returns: The number cells needed to support a false positive rate
* of *fp* with at most *capacity* elements.
*/
static size_t M(double fp, size_t capacity);
/**
* Computes the optimal number of hash functions based on the number cells
* and expected number of elements.
*
* @param cells The number of cells (*m*).
*
* @param capacity The maximum number of elements.
*
* Returns: the optimal number of hash functions for a false-positive
* rate of *fp* for at most *capacity* elements.
*/
static size_t K(size_t cells, size_t capacity);
/**
* Computes the optimal number of hash functions based on the number cells
* and expected number of elements.
*
* @param cells The number of cells (*m*).
*
* @param capacity The maximum number of elements.
*
* Returns: the optimal number of hash functions for a false-positive
* rate of *fp* for at most *capacity* elements.
*/
static size_t K(size_t cells, size_t capacity);
// Overridden from BloomFilter.
bool Empty() const override;
void Clear() override;
bool Merge(const BloomFilter* other) override;
BasicBloomFilter* Clone() const override;
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
std::string InternalState() const override;
// Overridden from BloomFilter.
bool Empty() const override;
void Clear() override;
bool Merge(const BloomFilter* other) override;
BasicBloomFilter* Clone() const override;
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
std::string InternalState() const override;
protected:
friend class BloomFilter;
friend class BloomFilter;
/**
* Default constructor.
*/
BasicBloomFilter();
/**
* Default constructor.
*/
BasicBloomFilter();
// Overridden from BloomFilter.
void Add(const zeek::detail::HashKey* key) override;
bool Decrement(const zeek::detail::HashKey* key) override;
size_t Count(const zeek::detail::HashKey* key) const override;
broker::expected<broker::data> DoSerialize() const override;
bool DoUnserialize(const broker::data& data) override;
BloomFilterType Type() const override { return BloomFilterType::Basic; }
// Overridden from BloomFilter.
void Add(const zeek::detail::HashKey* key) override;
bool Decrement(const zeek::detail::HashKey* key) override;
size_t Count(const zeek::detail::HashKey* key) const override;
broker::expected<broker::data> DoSerialize() const override;
bool DoUnserialize(const broker::data& data) override;
BloomFilterType Type() const override { return BloomFilterType::Basic; }
private:
detail::BitVector* bits;
};
detail::BitVector* bits;
};
/**
* A counting Bloom filter.
*/
class CountingBloomFilter : public BloomFilter
{
class CountingBloomFilter : public BloomFilter {
public:
/**
* Constructs a counting Bloom filter.
*
* @param hasher The hasher to use. The ideal number of hash
* functions can be computed with *K*.
*
* @param cells The number of cells to use.
*
* @param width The maximal bit-width of counter values.
*/
CountingBloomFilter(const detail::Hasher* hasher, size_t cells, size_t width);
/**
* Constructs a counting Bloom filter.
*
* @param hasher The hasher to use. The ideal number of hash
* functions can be computed with *K*.
*
* @param cells The number of cells to use.
*
* @param width The maximal bit-width of counter values.
*/
CountingBloomFilter(const detail::Hasher* hasher, size_t cells, size_t width);
/**
* Destructor.
*/
~CountingBloomFilter() override;
/**
* Destructor.
*/
~CountingBloomFilter() override;
// Overridden from BloomFilter.
bool Empty() const override;
void Clear() override;
bool Merge(const BloomFilter* other) override;
CountingBloomFilter* Clone() const override;
std::string InternalState() const override;
// Overridden from BloomFilter.
bool Empty() const override;
void Clear() override;
bool Merge(const BloomFilter* other) override;
CountingBloomFilter* Clone() const override;
std::string InternalState() const override;
/**
* Intersects another Bloom filter this one and returns a new BasicBloomFilter.
*
* Please note that the Intersection of two Counting bloom filters results in a
* basic bloom filter. The reason for this is that the counters loose meaning during
* the intersection process. The BasicBloomFilter will have bits set in cases where
* both Counting Bloom filters has cell values greater than zero.
*
* @param other The other Bloom filter.
*
* @return Intersecting BloomFilter on success, nullptr otherwise.
*/
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
/**
* Intersects another Bloom filter this one and returns a new BasicBloomFilter.
*
* Please note that the Intersection of two Counting bloom filters results in a
* basic bloom filter. The reason for this is that the counters loose meaning during
* the intersection process. The BasicBloomFilter will have bits set in cases where
* both Counting Bloom filters has cell values greater than zero.
*
* @param other The other Bloom filter.
*
* @return Intersecting BloomFilter on success, nullptr otherwise.
*/
BasicBloomFilter* Intersect(const BloomFilter* other) const override;
protected:
friend class BloomFilter;
friend class BloomFilter;
/**
* Default constructor.
*/
CountingBloomFilter();
/**
* Default constructor.
*/
CountingBloomFilter();
// Overridden from BloomFilter.
void Add(const zeek::detail::HashKey* key) override;
bool Decrement(const zeek::detail::HashKey* key) override;
size_t Count(const zeek::detail::HashKey* key) const override;
broker::expected<broker::data> DoSerialize() const override;
bool DoUnserialize(const broker::data& data) override;
BloomFilterType Type() const override { return BloomFilterType::Counting; }
// Overridden from BloomFilter.
void Add(const zeek::detail::HashKey* key) override;
bool Decrement(const zeek::detail::HashKey* key) override;
size_t Count(const zeek::detail::HashKey* key) const override;
broker::expected<broker::data> DoSerialize() const override;
bool DoUnserialize(const broker::data& data) override;
BloomFilterType Type() const override { return BloomFilterType::Counting; }
private:
detail::CounterVector* cells;
};
detail::CounterVector* cells;
};
} // namespace zeek::probabilistic
} // namespace zeek::probabilistic

View file

@ -9,134 +9,119 @@
#include "zeek/Reporter.h"
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
int CardinalityCounter::OptimalB(double error, double confidence) const
{
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
int answer = (int)floor(initial_estimate);
int CardinalityCounter::OptimalB(double error, double confidence) const {
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
int answer = (int)floor(initial_estimate);
// k is the number of standard deviations that we have to go to have
// a confidence level of conf.
// k is the number of standard deviations that we have to go to have
// a confidence level of conf.
double k = 0;
double k = 0;
do
{
answer++;
k = pow(2, (answer - initial_estimate) / 2);
} while ( erf(k / sqrt(2)) < confidence );
do {
answer++;
k = pow(2, (answer - initial_estimate) / 2);
} while ( erf(k / sqrt(2)) < confidence );
return answer;
}
return answer;
}
void CardinalityCounter::Init(uint64_t size)
{
m = size;
void CardinalityCounter::Init(uint64_t size) {
m = size;
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
if ( m == 16 )
alpha_m = 0.673;
if ( m == 16 )
alpha_m = 0.673;
else if ( m == 32 )
alpha_m = 0.697;
else if ( m == 32 )
alpha_m = 0.697;
else if ( m == 64 )
alpha_m = 0.709;
else if ( m == 64 )
alpha_m = 0.709;
else if ( m >= 128 )
alpha_m = 0.7213 / (1 + 1.079 / m);
else if ( m >= 128 )
alpha_m = 0.7213 / (1 + 1.079 / m);
else
reporter->InternalError(
"Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
else
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
double calc_p = log2(m);
if ( trunc(calc_p) != calc_p )
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2",
size);
double calc_p = log2(m);
if ( trunc(calc_p) != calc_p )
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
p = calc_p;
p = calc_p;
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
assert(buckets.size() == m);
assert(buckets.size() == m);
V = m;
}
V = m;
}
CardinalityCounter::CardinalityCounter(CardinalityCounter& other) : buckets(other.buckets)
{
V = other.V;
alpha_m = other.alpha_m;
m = other.m;
p = other.p;
}
CardinalityCounter::CardinalityCounter(CardinalityCounter& other) : buckets(other.buckets) {
V = other.V;
alpha_m = other.alpha_m;
m = other.m;
p = other.p;
}
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o) noexcept
{
V = o.V;
alpha_m = o.alpha_m;
m = o.m;
p = o.p;
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o) noexcept {
V = o.V;
alpha_m = o.alpha_m;
m = o.m;
p = o.p;
o.m = 0;
buckets = std::move(o.buckets);
}
o.m = 0;
buckets = std::move(o.buckets);
}
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
{
int b = OptimalB(error_margin, confidence);
Init((uint64_t)pow(2, b));
CardinalityCounter::CardinalityCounter(double error_margin, double confidence) {
int b = OptimalB(error_margin, confidence);
Init((uint64_t)pow(2, b));
assert(b == p);
}
assert(b == p);
}
CardinalityCounter::CardinalityCounter(uint64_t size)
{
Init(size);
}
CardinalityCounter::CardinalityCounter(uint64_t size) { Init(size); }
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
{
m = arg_size;
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m) {
m = arg_size;
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
alpha_m = arg_alpha_m;
V = arg_V;
p = log2(m);
}
alpha_m = arg_alpha_m;
V = arg_V;
p = log2(m);
}
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
{
hash_modified = hash_modified >> p;
int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
assert(answer > 0 && answer < 64);
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const {
hash_modified = hash_modified >> p;
int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
assert(answer > 0 && answer < 64);
return answer;
}
return answer;
}
void CardinalityCounter::AddElement(uint64_t hash)
{
uint64_t index = hash % m;
hash = hash - index;
void CardinalityCounter::AddElement(uint64_t hash) {
uint64_t index = hash % m;
hash = hash - index;
if ( buckets[index] == 0 )
V--;
if ( buckets[index] == 0 )
V--;
uint8_t temp = Rank(hash);
uint8_t temp = Rank(hash);
if ( temp > buckets[index] )
buckets[index] = temp;
}
if ( temp > buckets[index] )
buckets[index] = temp;
}
/**
* Estimate the size by using the "raw" HyperLogLog estimate. Then,
@ -147,99 +132,87 @@ void CardinalityCounter::AddElement(uint64_t hash)
* Note - we deviate from the HLL algorithm in the paper here, because
* of our 64-bit hashes.
**/
double CardinalityCounter::Size() const
{
double answer = 0;
for ( unsigned int i = 0; i < m; i++ )
answer += pow(2, -((int)buckets[i]));
double CardinalityCounter::Size() const {
double answer = 0;
for ( unsigned int i = 0; i < m; i++ )
answer += pow(2, -((int)buckets[i]));
answer = 1 / answer;
answer = (alpha_m * m * m * answer);
answer = 1 / answer;
answer = (alpha_m * m * m * answer);
if ( answer <= 5.0 * (((double)m) / 2) )
return m * log(((double)m) / V);
if ( answer <= 5.0 * (((double)m) / 2) )
return m * log(((double)m) / V);
else if ( answer <= (pow(2, 64) / 30) )
return answer;
else if ( answer <= (pow(2, 64) / 30) )
return answer;
else
return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
}
else
return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
}
bool CardinalityCounter::Merge(CardinalityCounter* c)
{
if ( m != c->GetM() )
return false;
bool CardinalityCounter::Merge(CardinalityCounter* c) {
if ( m != c->GetM() )
return false;
const std::vector<uint8_t>& temp = c->GetBuckets();
const std::vector<uint8_t>& temp = c->GetBuckets();
V = 0;
V = 0;
for ( size_t i = 0; i < m; i++ )
{
if ( temp[i] > buckets[i] )
buckets[i] = temp[i];
for ( size_t i = 0; i < m; i++ ) {
if ( temp[i] > buckets[i] )
buckets[i] = temp[i];
if ( buckets[i] == 0 )
++V;
}
if ( buckets[i] == 0 )
++V;
}
return true;
}
return true;
}
const std::vector<uint8_t>& CardinalityCounter::GetBuckets() const
{
return buckets;
}
const std::vector<uint8_t>& CardinalityCounter::GetBuckets() const { return buckets; }
uint64_t CardinalityCounter::GetM() const
{
return m;
}
uint64_t CardinalityCounter::GetM() const { return m; }
broker::expected<broker::data> CardinalityCounter::Serialize() const
{
broker::vector v = {m, V, alpha_m};
v.reserve(3 + m);
broker::expected<broker::data> CardinalityCounter::Serialize() const {
broker::vector v = {m, V, alpha_m};
v.reserve(3 + m);
for ( size_t i = 0; i < m; ++i )
v.emplace_back(static_cast<uint64_t>(buckets[i]));
for ( size_t i = 0; i < m; ++i )
v.emplace_back(static_cast<uint64_t>(buckets[i]));
return {std::move(v)};
}
return {std::move(v)};
}
std::unique_ptr<CardinalityCounter> CardinalityCounter::Unserialize(const broker::data& data)
{
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() >= 3) )
return nullptr;
std::unique_ptr<CardinalityCounter> CardinalityCounter::Unserialize(const broker::data& data) {
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() >= 3) )
return nullptr;
auto m = broker::get_if<uint64_t>(&(*v)[0]);
auto V = broker::get_if<uint64_t>(&(*v)[1]);
auto alpha_m = broker::get_if<double>(&(*v)[2]);
auto m = broker::get_if<uint64_t>(&(*v)[0]);
auto V = broker::get_if<uint64_t>(&(*v)[1]);
auto alpha_m = broker::get_if<double>(&(*v)[2]);
if ( ! (m && V && alpha_m) )
return nullptr;
if ( v->size() != 3 + *m )
return nullptr;
if ( ! (m && V && alpha_m) )
return nullptr;
if ( v->size() != 3 + *m )
return nullptr;
auto cc = std::unique_ptr<CardinalityCounter>(new CardinalityCounter(*m, *V, *alpha_m));
if ( *m != cc->m )
return nullptr;
if ( cc->buckets.size() != *m )
return nullptr;
auto cc = std::unique_ptr<CardinalityCounter>(new CardinalityCounter(*m, *V, *alpha_m));
if ( *m != cc->m )
return nullptr;
if ( cc->buckets.size() != *m )
return nullptr;
for ( size_t i = 0; i < *m; ++i )
{
auto x = broker::get_if<uint64_t>(&(*v)[3 + i]);
if ( ! x )
return nullptr;
for ( size_t i = 0; i < *m; ++i ) {
auto x = broker::get_if<uint64_t>(&(*v)[3 + i]);
if ( ! x )
return nullptr;
cc->buckets[i] = *x;
}
cc->buckets[i] = *x;
}
return cc;
}
return cc;
}
/**
* The following function is copied from libc/string/flsll.c from the FreeBSD source
@ -277,15 +250,14 @@ std::unique_ptr<CardinalityCounter> CardinalityCounter::Unserialize(const broker
/*
* Find Last Set bit
*/
int CardinalityCounter::flsll(uint64_t mask)
{
int bit;
int CardinalityCounter::flsll(uint64_t mask) {
int bit;
if ( mask == 0 )
return (0);
for ( bit = 1; mask != 1; bit++ )
mask = (uint64_t)mask >> 1;
return (bit);
}
if ( mask == 0 )
return (0);
for ( bit = 1; mask != 1; bit++ )
mask = (uint64_t)mask >> 1;
return (bit);
}
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -7,186 +7,183 @@
#include <memory>
#include <vector>
namespace broker
{
namespace broker {
class data;
}
}
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
/**
* A probabilistic cardinality counter using the HyperLogLog algorithm.
*/
class CardinalityCounter
{
class CardinalityCounter {
public:
/**
* Constructor.
*
* The number of buckets of the data structure is determined using
* the error margin and the given confidence.
*
* For example, assume an error_margin of 2% and a confidence
* of 95%. If the Size function returns an estimate of 100, this
* means that we are 95% sure that the cardinality is between 98
* and 102.
*
* @param error_margin error margin
*
* @param confidence confidence of the error. Default: 0.95
*/
explicit CardinalityCounter(double error_margin, double confidence = 0.95);
/**
* Constructor.
*
* The number of buckets of the data structure is determined using
* the error margin and the given confidence.
*
* For example, assume an error_margin of 2% and a confidence
* of 95%. If the Size function returns an estimate of 100, this
* means that we are 95% sure that the cardinality is between 98
* and 102.
*
* @param error_margin error margin
*
* @param confidence confidence of the error. Default: 0.95
*/
explicit CardinalityCounter(double error_margin, double confidence = 0.95);
/**
* Copy-Constructor
*/
CardinalityCounter(CardinalityCounter& other);
/**
* Copy-Constructor
*/
CardinalityCounter(CardinalityCounter& other);
/**
* Move-Constructor
*/
CardinalityCounter(CardinalityCounter&& o) noexcept;
/**
* Move-Constructor
*/
CardinalityCounter(CardinalityCounter&& o) noexcept;
/**
* Constructor for a known number of buckets.
*
* The error margin is 1.04/sqrt(size) with approximately 68%
* probability.
*
* @param size number of buckets to create
*/
explicit CardinalityCounter(uint64_t size);
/**
* Constructor for a known number of buckets.
*
* The error margin is 1.04/sqrt(size) with approximately 68%
* probability.
*
* @param size number of buckets to create
*/
explicit CardinalityCounter(uint64_t size);
/**
* Destructor.
*/
~CardinalityCounter() = default;
/**
* Destructor.
*/
~CardinalityCounter() = default;
/**
* Add a new element to the counter.
*
* The hash function generating the hashes needs to be uniformly
* distributed over 64 bits.
*
* @param hash 64-bit hash value of the element to be added
*/
void AddElement(uint64_t hash);
/**
* Add a new element to the counter.
*
* The hash function generating the hashes needs to be uniformly
* distributed over 64 bits.
*
* @param hash 64-bit hash value of the element to be added
*/
void AddElement(uint64_t hash);
/**
* Get the current estimated number of elements in the data
* structure
*
* @return Estimated number of elements
**/
double Size() const;
/**
* Get the current estimated number of elements in the data
* structure
*
* @return Estimated number of elements
**/
double Size() const;
/**
* Merges the argument cardinality counter with this one. The error
* margins of both counters have to be the same, otherwise the merge
* operation will not be carried out.
*
* @param c Cardinality counter to merge into the current counter.
*
* @return True if successful
*/
bool Merge(CardinalityCounter* c);
/**
* Merges the argument cardinality counter with this one. The error
* margins of both counters have to be the same, otherwise the merge
* operation will not be carried out.
*
* @param c Cardinality counter to merge into the current counter.
*
* @return True if successful
*/
bool Merge(CardinalityCounter* c);
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<CardinalityCounter> Unserialize(const broker::data& data);
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<CardinalityCounter> Unserialize(const broker::data& data);
protected:
/**
* Return the number of buckets.
*
* @return Number of buckets
*/
uint64_t GetM() const;
/**
* Return the number of buckets.
*
* @return Number of buckets
*/
uint64_t GetM() const;
/**
* Returns the buckets array that holds all of the rough cardinality
* estimates.
*
* Use GetM() to determine the size.
*
* @return Array containing cardinality estimates
*/
const std::vector<uint8_t>& GetBuckets() const;
/**
* Returns the buckets array that holds all of the rough cardinality
* estimates.
*
* Use GetM() to determine the size.
*
* @return Array containing cardinality estimates
*/
const std::vector<uint8_t>& GetBuckets() const;
private:
/**
* Constructor used when unserializing, i.e., all parameters are
* known.
*/
explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m);
/**
* Constructor used when unserializing, i.e., all parameters are
* known.
*/
explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m);
/**
* Helper function with code used jointly by multiple constructors.
*
* @param arg_size: number of buckets that need to be kept
*/
void Init(uint64_t arg_size);
/**
* Helper function with code used jointly by multiple constructors.
*
* @param arg_size: number of buckets that need to be kept
*/
void Init(uint64_t arg_size);
/**
* This function calculates the smallest value of b that will
* satisfy these the constraints of a specified error margin and
* confidence level.
*
* The exact expression for b is as follows:
* Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
*
* After that initial estimate, the value of b is increased until the
* standard deviation falls within the specified value.
*
* @param error error margin
*
* @param confidence confidence of the error
*
* @return minimal B-value satisfying the error-rate under confidence.
*/
int OptimalB(double error, double confidence) const;
/**
* This function calculates the smallest value of b that will
* satisfy these the constraints of a specified error margin and
* confidence level.
*
* The exact expression for b is as follows:
* Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
*
* After that initial estimate, the value of b is increased until the
* standard deviation falls within the specified value.
*
* @param error error margin
*
* @param confidence confidence of the error
*
* @return minimal B-value satisfying the error-rate under confidence.
*/
int OptimalB(double error, double confidence) const;
/**
* Determines at which index (counted from the front) the first one-bit
* appears. The last b bits have to be 0 (the element has to be divisible
* by m), hence they are ignored. Always adds 1 to the result. This is the
* rho function from the original algorithm.
*
* @param hash_modified hash value
*
* @returns index of first one-bit
*/
uint8_t Rank(uint64_t hash_modified) const;
/**
* Determines at which index (counted from the front) the first one-bit
* appears. The last b bits have to be 0 (the element has to be divisible
* by m), hence they are ignored. Always adds 1 to the result. This is the
* rho function from the original algorithm.
*
* @param hash_modified hash value
*
* @returns index of first one-bit
*/
uint8_t Rank(uint64_t hash_modified) const;
/**
* flsll from FreeBSD; especially Linux does not have this.
*/
static int flsll(uint64_t mask);
/**
* flsll from FreeBSD; especially Linux does not have this.
*/
static int flsll(uint64_t mask);
/**
* This is the number of buckets that will be stored. The standard
* error is 1.04/sqrt(m), so the actual cardinality will be the
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
*/
uint64_t m = 0;
/**
* This is the number of buckets that will be stored. The standard
* error is 1.04/sqrt(m), so the actual cardinality will be the
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
*/
uint64_t m = 0;
/**
* These are the actual buckets that are storing an estimate of the
* cardinality. All these need to do is count when the first 1 bit
* appears in the bitstring and that location is at most 65, so not
* that many bits are needed to store it.
*/
std::vector<uint8_t> buckets;
/**
* These are the actual buckets that are storing an estimate of the
* cardinality. All these need to do is count when the first 1 bit
* appears in the bitstring and that location is at most 65, so not
* that many bits are needed to store it.
*/
std::vector<uint8_t> buckets;
/**
* There are some state constants that need to be kept track of to
* make the final estimate easier. V is the number of values in
* buckets that are 0 and this is used in the small error correction.
* alpha_m is a multiplicative constant used in the algorithm.
*/
uint64_t V = 0;
double alpha_m = 0.0;
int p = 0; // the log2 of m
};
/**
* There are some state constants that need to be kept track of to
* make the final estimate easier. V is the number of values in
* buckets that are 0 and this is used in the small error correction.
* alpha_m is a multiplicative constant used in the algorithm.
*/
uint64_t V = 0;
double alpha_m = 0.0;
int p = 0; // the log2 of m
};
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -10,197 +10,157 @@
#include "zeek/probabilistic/BitVector.h"
#include "zeek/util.h"
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
CounterVector::CounterVector(size_t arg_width, size_t cells)
{
bits = new BitVector(arg_width * cells);
width = arg_width;
}
CounterVector::CounterVector(size_t arg_width, size_t cells) {
bits = new BitVector(arg_width * cells);
width = arg_width;
}
CounterVector::CounterVector(const CounterVector& other)
{
bits = new BitVector(*other.bits);
width = other.width;
}
CounterVector::CounterVector(const CounterVector& other) {
bits = new BitVector(*other.bits);
width = other.width;
}
CounterVector::~CounterVector()
{
delete bits;
}
CounterVector::~CounterVector() { delete bits; }
bool CounterVector::Increment(size_type cell, count_type value)
{
assert(cell < Size());
assert(value != 0);
bool CounterVector::Increment(size_type cell, count_type value) {
assert(cell < Size());
assert(value != 0);
size_t lsb = cell * width;
bool carry = false;
size_t lsb = cell * width;
bool carry = false;
for ( size_t i = 0; i < width; ++i )
{
bool b1 = (*bits)[lsb + i];
bool b2 = value & (1 << i);
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
carry = (b1 && b2) || (carry && (b1 != b2));
}
for ( size_t i = 0; i < width; ++i ) {
bool b1 = (*bits)[lsb + i];
bool b2 = value & (1 << i);
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
carry = (b1 && b2) || (carry && (b1 != b2));
}
if ( carry )
{
for ( size_t i = 0; i < width; ++i )
bits->Set(lsb + i);
}
if ( carry ) {
for ( size_t i = 0; i < width; ++i )
bits->Set(lsb + i);
}
return ! carry;
}
return ! carry;
}
bool CounterVector::Decrement(size_type cell, count_type value)
{
assert(cell < Size());
assert(value != 0);
bool CounterVector::Decrement(size_type cell, count_type value) {
assert(cell < Size());
assert(value != 0);
value = ~value + 1; // A - B := A + ~B + 1
bool carry = false;
size_t lsb = cell * width;
value = ~value + 1; // A - B := A + ~B + 1
bool carry = false;
size_t lsb = cell * width;
for ( size_t i = 0; i < width; ++i )
{
bool b1 = (*bits)[lsb + i];
bool b2 = value & (1 << i);
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
carry = (b1 && b2) || (carry && (b1 != b2));
}
for ( size_t i = 0; i < width; ++i ) {
bool b1 = (*bits)[lsb + i];
bool b2 = value & (1 << i);
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
carry = (b1 && b2) || (carry && (b1 != b2));
}
return carry;
}
return carry;
}
bool CounterVector::AllZero() const
{
return bits->AllZero();
}
bool CounterVector::AllZero() const { return bits->AllZero(); }
void CounterVector::Reset()
{
bits->Reset();
}
void CounterVector::Reset() { bits->Reset(); }
CounterVector::count_type CounterVector::Count(size_type cell) const
{
assert(cell < Size());
CounterVector::count_type CounterVector::Count(size_type cell) const {
assert(cell < Size());
size_t cnt = 0, order = 1;
size_t lsb = cell * width;
size_t cnt = 0, order = 1;
size_t lsb = cell * width;
for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 )
if ( (*bits)[i] )
cnt |= order;
for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 )
if ( (*bits)[i] )
cnt |= order;
return cnt;
}
return cnt;
}
CounterVector::size_type CounterVector::Size() const
{
return bits->Size() / width;
}
CounterVector::size_type CounterVector::Size() const { return bits->Size() / width; }
size_t CounterVector::Width() const
{
return width;
}
size_t CounterVector::Width() const { return width; }
size_t CounterVector::Max() const
{
return std::numeric_limits<size_t>::max() >> (std::numeric_limits<size_t>::digits - width);
}
size_t CounterVector::Max() const {
return std::numeric_limits<size_t>::max() >> (std::numeric_limits<size_t>::digits - width);
}
CounterVector& CounterVector::Merge(const CounterVector& other)
{
assert(Size() == other.Size());
assert(Width() == other.Width());
CounterVector& CounterVector::Merge(const CounterVector& other) {
assert(Size() == other.Size());
assert(Width() == other.Width());
for ( size_t cell = 0; cell < Size(); ++cell )
{
size_t lsb = cell * width;
bool carry = false;
for ( size_t cell = 0; cell < Size(); ++cell ) {
size_t lsb = cell * width;
bool carry = false;
for ( size_t i = 0; i < width; ++i )
{
bool b1 = (*bits)[lsb + i];
bool b2 = (*other.bits)[lsb + i];
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
carry = (b1 && b2) || (carry && (b1 != b2));
}
for ( size_t i = 0; i < width; ++i ) {
bool b1 = (*bits)[lsb + i];
bool b2 = (*other.bits)[lsb + i];
(*bits)[lsb + i] = b1 ^ b2 ^ carry;
carry = (b1 && b2) || (carry && (b1 != b2));
}
if ( carry )
{
for ( size_t i = 0; i < width; ++i )
bits->Set(lsb + i);
}
}
if ( carry ) {
for ( size_t i = 0; i < width; ++i )
bits->Set(lsb + i);
}
}
return *this;
}
return *this;
}
BitVector CounterVector::ToBitVector() const
{
auto newbits = BitVector(Size());
BitVector CounterVector::ToBitVector() const {
auto newbits = BitVector(Size());
for ( size_t cell = 0; cell < Size(); ++cell )
{
size_t lsb = cell * width;
bool set = false;
for ( size_t cell = 0; cell < Size(); ++cell ) {
size_t lsb = cell * width;
bool set = false;
for ( size_t i = 0; i < width; ++i )
set |= (*bits)[lsb + 1];
for ( size_t i = 0; i < width; ++i )
set |= (*bits)[lsb + 1];
newbits[cell] = set;
}
newbits[cell] = set;
}
return newbits;
}
return newbits;
}
CounterVector& CounterVector::operator|=(const CounterVector& other)
{
return Merge(other);
}
CounterVector& CounterVector::operator|=(const CounterVector& other) { return Merge(other); }
CounterVector operator|(const CounterVector& x, const CounterVector& y)
{
CounterVector cv(x);
return cv |= y;
}
CounterVector operator|(const CounterVector& x, const CounterVector& y) {
CounterVector cv(x);
return cv |= y;
}
uint64_t CounterVector::Hash() const
{
return bits->Hash();
}
uint64_t CounterVector::Hash() const { return bits->Hash(); }
broker::expected<broker::data> CounterVector::Serialize() const
{
auto b = bits->Serialize();
if ( ! b )
return broker::ec::invalid_data; // Cannot serialize
broker::expected<broker::data> CounterVector::Serialize() const {
auto b = bits->Serialize();
if ( ! b )
return broker::ec::invalid_data; // Cannot serialize
return {broker::vector{static_cast<uint64_t>(width), std::move(*b)}};
}
return {broker::vector{static_cast<uint64_t>(width), std::move(*b)}};
}
std::unique_ptr<CounterVector> CounterVector::Unserialize(const broker::data& data)
{
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() >= 2) )
return nullptr;
std::unique_ptr<CounterVector> CounterVector::Unserialize(const broker::data& data) {
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() >= 2) )
return nullptr;
auto width = broker::get_if<uint64_t>(&(*v)[0]);
auto bits = BitVector::Unserialize((*v)[1]);
auto width = broker::get_if<uint64_t>(&(*v)[0]);
auto bits = BitVector::Unserialize((*v)[1]);
if ( ! (width && bits) )
return nullptr;
if ( ! (width && bits) )
return nullptr;
auto cv = std::unique_ptr<CounterVector>(new CounterVector());
cv->width = *width;
cv->bits = bits.release();
return cv;
}
auto cv = std::unique_ptr<CounterVector>(new CounterVector());
cv->width = *width;
cv->bits = bits.release();
return cv;
}
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -9,162 +9,159 @@
#include <cstdint>
#include <memory>
namespace broker
{
namespace broker {
class data;
}
}
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
class BitVector;
/**
* A vector of counters, each of which has a fixed number of bits.
*/
class CounterVector
{
class CounterVector {
public:
using size_type = size_t;
using count_type = uint64_t;
using size_type = size_t;
using count_type = uint64_t;
/**
* Constructs a counter vector having cells of a given width.
*
* @param width The number of bits that each cell occupies.
*
* @param cells The number of cells in the bitvector.
*
* @pre `cells > 0 && width > 0`
*/
explicit CounterVector(size_t width, size_t cells = 1024);
/**
* Constructs a counter vector having cells of a given width.
*
* @param width The number of bits that each cell occupies.
*
* @param cells The number of cells in the bitvector.
*
* @pre `cells > 0 && width > 0`
*/
explicit CounterVector(size_t width, size_t cells = 1024);
/**
* Copy-constructs a counter vector.
*
* @param other The counter vector to copy.
*/
CounterVector(const CounterVector& other);
/**
* Copy-constructs a counter vector.
*
* @param other The counter vector to copy.
*/
CounterVector(const CounterVector& other);
/**
* Destructor.
*/
virtual ~CounterVector();
/**
* Destructor.
*/
virtual ~CounterVector();
/**
* Increments a given cell.
*
* @param cell The cell to increment.
*
* @param value The value to add to the current counter in *cell*.
*
* @return `true` if adding *value* to the counter in *cell* succeeded.
*
* @pre `cell < Size()`
*/
bool Increment(size_type cell, count_type value = 1);
/**
* Increments a given cell.
*
* @param cell The cell to increment.
*
* @param value The value to add to the current counter in *cell*.
*
* @return `true` if adding *value* to the counter in *cell* succeeded.
*
* @pre `cell < Size()`
*/
bool Increment(size_type cell, count_type value = 1);
/**
* Decrements a given cell.
*
* @param cell The cell to decrement.
*
* @param value The value to subtract from the current counter in *cell*.
*
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
*
* @pre `cell < Size()`
*/
bool Decrement(size_type cell, count_type value = 1);
/**
* Decrements a given cell.
*
* @param cell The cell to decrement.
*
* @param value The value to subtract from the current counter in *cell*.
*
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
*
* @pre `cell < Size()`
*/
bool Decrement(size_type cell, count_type value = 1);
/**
* Retrieves the counter of a given cell.
*
* @param cell The cell index to retrieve the count for.
*
* @return The counter associated with *cell*.
*
* @pre `cell < Size()`
*/
count_type Count(size_type cell) const;
/**
* Retrieves the counter of a given cell.
*
* @param cell The cell index to retrieve the count for.
*
* @return The counter associated with *cell*.
*
* @pre `cell < Size()`
*/
count_type Count(size_type cell) const;
/**
* Checks whether all counters are 0.
* @return `true` iff all counters have the value 0.
*/
bool AllZero() const;
/**
* Checks whether all counters are 0.
* @return `true` iff all counters have the value 0.
*/
bool AllZero() const;
/**
* Sets all counters to 0.
*/
void Reset();
/**
* Sets all counters to 0.
*/
void Reset();
/**
* Retrieves the number of cells in the storage.
*
* @return The number of cells.
*/
size_type Size() const;
/**
* Retrieves the number of cells in the storage.
*
* @return The number of cells.
*/
size_type Size() const;
/**
* Retrieves the counter width.
*
* @return The number of bits per counter.
*/
size_t Width() const;
/**
* Retrieves the counter width.
*
* @return The number of bits per counter.
*/
size_t Width() const;
/**
* Computes the maximum counter value.
*
* @return The maximum counter value based on the width.
*/
size_t Max() const;
/**
* Computes the maximum counter value.
*
* @return The maximum counter value based on the width.
*/
size_t Max() const;
/**
* Merges another counter vector into this instance by *adding* the
* counters of each cells.
*
* @param other The counter vector to merge into this instance.
*
* @return A reference to `*this`.
*
* @pre `Size() == other.Size() && Width() == other.Width()`
*/
CounterVector& Merge(const CounterVector& other);
/**
* Merges another counter vector into this instance by *adding* the
* counters of each cells.
*
* @param other The counter vector to merge into this instance.
*
* @return A reference to `*this`.
*
* @pre `Size() == other.Size() && Width() == other.Width()`
*/
CounterVector& Merge(const CounterVector& other);
/**
* Converts a counter vector into a BitVector. Each cell that has a value
* of 1 or more set is set in the BitVector; otherwise the bit remains unset.
*
* @return The newly created BitVector
*/
BitVector ToBitVector() const;
/**
* Converts a counter vector into a BitVector. Each cell that has a value
* of 1 or more set is set in the BitVector; otherwise the bit remains unset.
*
* @return The newly created BitVector
*/
BitVector ToBitVector() const;
/**
* An alias for ::Merge.
*/
CounterVector& operator|=(const CounterVector& other);
/**
* An alias for ::Merge.
*/
CounterVector& operator|=(const CounterVector& other);
/** Computes a hash value of the internal representation.
* This is mainly for debugging/testing purposes.
*
* @return The hash.
*/
uint64_t Hash() const;
/** Computes a hash value of the internal representation.
* This is mainly for debugging/testing purposes.
*
* @return The hash.
*/
uint64_t Hash() const;
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<CounterVector> Unserialize(const broker::data& data);
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<CounterVector> Unserialize(const broker::data& data);
protected:
friend CounterVector operator|(const CounterVector& x, const CounterVector& y);
friend CounterVector operator|(const CounterVector& x, const CounterVector& y);
CounterVector() = default;
CounterVector() = default;
private:
CounterVector& operator=(const CounterVector&); // Disable.
CounterVector& operator=(const CounterVector&); // Disable.
BitVector* bits = nullptr;
size_t width = 0;
};
BitVector* bits = nullptr;
size_t width = 0;
};
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -11,171 +11,135 @@
#include "zeek/Var.h"
#include "zeek/digest.h"
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
{
u_char buf[SHA256_DIGEST_LENGTH];
seed_t tmpseed;
EVP_MD_CTX* ctx = zeek::detail::hash_init(zeek::detail::Hash_SHA256);
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size) {
u_char buf[SHA256_DIGEST_LENGTH];
seed_t tmpseed;
EVP_MD_CTX* ctx = zeek::detail::hash_init(zeek::detail::Hash_SHA256);
assert(sizeof(tmpseed) == 16);
assert(sizeof(tmpseed) == 16);
static auto global_hash_seed = id::find_val<StringVal>("global_hash_seed");
static auto global_hash_seed = id::find_val<StringVal>("global_hash_seed");
if ( data )
zeek::detail::hash_update(ctx, data, size);
if ( data )
zeek::detail::hash_update(ctx, data, size);
else if ( global_hash_seed->Len() > 0 )
zeek::detail::hash_update(ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
else if ( global_hash_seed->Len() > 0 )
zeek::detail::hash_update(ctx, global_hash_seed->Bytes(), global_hash_seed->Len());
else
{
unsigned int first_seed = util::detail::initial_seed();
zeek::detail::hash_update(ctx, &first_seed, sizeof(first_seed));
}
else {
unsigned int first_seed = util::detail::initial_seed();
zeek::detail::hash_update(ctx, &first_seed, sizeof(first_seed));
}
zeek::detail::hash_final(ctx, buf);
memcpy(&tmpseed, buf, sizeof(tmpseed)); // Use the first bytes as seed.
return tmpseed;
}
zeek::detail::hash_final(ctx, buf);
memcpy(&tmpseed, buf, sizeof(tmpseed)); // Use the first bytes as seed.
return tmpseed;
}
Hasher::digest_vector Hasher::Hash(const zeek::detail::HashKey* key) const
{
return Hash(key->Key(), key->Size());
}
Hasher::digest_vector Hasher::Hash(const zeek::detail::HashKey* key) const { return Hash(key->Key(), key->Size()); }
Hasher::Hasher(size_t arg_k, seed_t arg_seed)
{
k = arg_k;
seed = arg_seed;
}
Hasher::Hasher(size_t arg_k, seed_t arg_seed) {
k = arg_k;
seed = arg_seed;
}
broker::expected<broker::data> Hasher::Serialize() const
{
return {broker::vector{static_cast<uint64_t>(Type()), static_cast<uint64_t>(k), seed.h[0],
seed.h[1]}};
}
broker::expected<broker::data> Hasher::Serialize() const {
return {broker::vector{static_cast<uint64_t>(Type()), static_cast<uint64_t>(k), seed.h[0], seed.h[1]}};
}
std::unique_ptr<Hasher> Hasher::Unserialize(const broker::data& data)
{
auto v = broker::get_if<broker::vector>(&data);
std::unique_ptr<Hasher> Hasher::Unserialize(const broker::data& data) {
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() == 4) )
return nullptr;
if ( ! (v && v->size() == 4) )
return nullptr;
auto type = broker::get_if<uint64_t>(&(*v)[0]);
auto k = broker::get_if<uint64_t>(&(*v)[1]);
auto h1 = broker::get_if<uint64_t>(&(*v)[2]);
auto h2 = broker::get_if<uint64_t>(&(*v)[3]);
auto type = broker::get_if<uint64_t>(&(*v)[0]);
auto k = broker::get_if<uint64_t>(&(*v)[1]);
auto h1 = broker::get_if<uint64_t>(&(*v)[2]);
auto h2 = broker::get_if<uint64_t>(&(*v)[3]);
if ( ! (type && k && h1 && h2) )
return nullptr;
if ( ! (type && k && h1 && h2) )
return nullptr;
std::unique_ptr<Hasher> hasher;
std::unique_ptr<Hasher> hasher;
switch ( *type )
{
case Default:
hasher = std::unique_ptr<Hasher>(new DefaultHasher(*k, {*h1, *h2}));
break;
switch ( *type ) {
case Default: hasher = std::unique_ptr<Hasher>(new DefaultHasher(*k, {*h1, *h2})); break;
case Double:
hasher = std::unique_ptr<Hasher>(new DoubleHasher(*k, {*h1, *h2}));
break;
}
case Double: hasher = std::unique_ptr<Hasher>(new DoubleHasher(*k, {*h1, *h2})); break;
}
// Note that the derived classed don't hold any further state of
// their own. They reconstruct all their information from their
// constructors' arguments.
// Note that the derived classed don't hold any further state of
// their own. They reconstruct all their information from their
// constructors' arguments.
return hasher;
}
return hasher;
}
UHF::UHF()
{
memset(&seed, 0, sizeof(seed));
}
UHF::UHF() { memset(&seed, 0, sizeof(seed)); }
UHF::UHF(Hasher::seed_t arg_seed)
{
seed = arg_seed;
}
UHF::UHF(Hasher::seed_t arg_seed) { seed = arg_seed; }
// This function is almost equivalent to HashKey::HashBytes except that it
// does not depend on global state and that we mix in the seed multiple
// times.
Hasher::digest UHF::hash(const void* x, size_t n) const
{
static_assert(std::is_same_v<highwayhash::SipHashState::Key, decltype(seed.h)>,
"Seed value is not the same type as highwayhash key");
return highwayhash::SipHash(seed.h, reinterpret_cast<const char*>(x), n);
}
Hasher::digest UHF::hash(const void* x, size_t n) const {
static_assert(std::is_same_v<highwayhash::SipHashState::Key, decltype(seed.h)>,
"Seed value is not the same type as highwayhash key");
return highwayhash::SipHash(seed.h, reinterpret_cast<const char*>(x), n);
}
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed) : Hasher(k, seed)
{
for ( size_t i = 1; i <= k; ++i )
{
seed_t s = Seed();
s.h[0] += util::detail::prng(i);
hash_functions.emplace_back(s);
}
}
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed) : Hasher(k, seed) {
for ( size_t i = 1; i <= k; ++i ) {
seed_t s = Seed();
s.h[0] += util::detail::prng(i);
hash_functions.emplace_back(s);
}
}
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
{
digest_vector h(K(), 0);
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const {
digest_vector h(K(), 0);
for ( size_t i = 0; i < h.size(); ++i )
h[i] = hash_functions[i](x, n);
for ( size_t i = 0; i < h.size(); ++i )
h[i] = hash_functions[i](x, n);
return h;
}
return h;
}
DefaultHasher* DefaultHasher::Clone() const
{
return new DefaultHasher(*this);
}
DefaultHasher* DefaultHasher::Clone() const { return new DefaultHasher(*this); }
bool DefaultHasher::Equals(const Hasher* other) const
{
if ( typeid(*this) != typeid(*other) )
return false;
bool DefaultHasher::Equals(const Hasher* other) const {
if ( typeid(*this) != typeid(*other) )
return false;
const DefaultHasher* o = static_cast<const DefaultHasher*>(other);
return hash_functions == o->hash_functions;
}
const DefaultHasher* o = static_cast<const DefaultHasher*>(other);
return hash_functions == o->hash_functions;
}
DoubleHasher::DoubleHasher(size_t k, seed_t seed)
: Hasher(k, seed), h1(seed + util::detail::prng(1)), h2(seed + util::detail::prng(2))
{
}
: Hasher(k, seed), h1(seed + util::detail::prng(1)), h2(seed + util::detail::prng(2)) {}
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const
{
digest d1 = h1(x, n);
digest d2 = h2(x, n);
digest_vector h(K(), 0);
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const {
digest d1 = h1(x, n);
digest d2 = h2(x, n);
digest_vector h(K(), 0);
for ( size_t i = 0; i < h.size(); ++i )
h[i] = d1 + i * d2;
for ( size_t i = 0; i < h.size(); ++i )
h[i] = d1 + i * d2;
return h;
}
return h;
}
DoubleHasher* DoubleHasher::Clone() const
{
return new DoubleHasher(*this);
}
DoubleHasher* DoubleHasher::Clone() const { return new DoubleHasher(*this); }
bool DoubleHasher::Equals(const Hasher* other) const
{
if ( typeid(*this) != typeid(*other) )
return false;
bool DoubleHasher::Equals(const Hasher* other) const {
if ( typeid(*this) != typeid(*other) )
return false;
const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
return h1 == o->h1 && h2 == o->h2;
}
const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
return h1 == o->h1 && h2 == o->h2;
}
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -7,252 +7,242 @@
#include "zeek/Hash.h"
namespace broker
{
namespace broker {
class data;
}
}
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
/** Types of derived Hasher classes. */
enum HasherType
{
Default,
Double
};
enum HasherType { Default, Double };
/**
* Abstract base class for hashers. A hasher creates a family of hash
* functions to hash an element *k* times.
*/
class Hasher
{
class Hasher {
public:
using digest = zeek::detail::hash_t;
using digest_vector = std::vector<digest>;
struct seed_t
{
// actually HH_U64, which has the same type
alignas(16) unsigned long long h[2];
using digest = zeek::detail::hash_t;
using digest_vector = std::vector<digest>;
struct seed_t {
// actually HH_U64, which has the same type
alignas(16) unsigned long long h[2];
friend seed_t operator+(seed_t lhs, const uint64_t rhs)
{
lhs.h[0] += rhs;
return lhs;
}
};
friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
lhs.h[0] += rhs;
return lhs;
}
};
/**
* Creates a valid hasher seed from an arbitrary string.
*
* @param data A pointer to contiguous data that should be crunched into a
* seed. If 0, the function tries to find a global_hash_seed script variable
* to derive a seed from. If this variable does not exist, the function uses
* the initial seed generated at Zeek startup.
*
* @param size The number of bytes of *data*.
*
* @return A seed suitable for hashers.
*/
static seed_t MakeSeed(const void* data, size_t size);
/**
* Creates a valid hasher seed from an arbitrary string.
*
* @param data A pointer to contiguous data that should be crunched into a
* seed. If 0, the function tries to find a global_hash_seed script variable
* to derive a seed from. If this variable does not exist, the function uses
* the initial seed generated at Zeek startup.
*
* @param size The number of bytes of *data*.
*
* @return A seed suitable for hashers.
*/
static seed_t MakeSeed(const void* data, size_t size);
/**
* Destructor.
*/
virtual ~Hasher() { }
/**
* Destructor.
*/
virtual ~Hasher() {}
/**
* Computes hash values for an element.
*
* @param x The element to hash.
*
* @return Vector of *k* hash values.
*/
template <typename T> digest_vector operator()(const T& x) const { return Hash(&x, sizeof(T)); }
/**
* Computes hash values for an element.
*
* @param x The element to hash.
*
* @return Vector of *k* hash values.
*/
template<typename T>
digest_vector operator()(const T& x) const {
return Hash(&x, sizeof(T));
}
/**
* Computes hash values for an element.
*
* @param x The key of the value to hash.
*
* @return Vector of *k* hash values.
*/
digest_vector Hash(const zeek::detail::HashKey* key) const;
/**
* Computes hash values for an element.
*
* @param x The key of the value to hash.
*
* @return Vector of *k* hash values.
*/
digest_vector Hash(const zeek::detail::HashKey* key) const;
/**
* Computes the hashes for a set of bytes.
*
* @param x Pointer to first byte to hash.
*
* @param n Number of bytes to hash.
*
* @return Vector of *k* hash values.
*
*/
virtual digest_vector Hash(const void* x, size_t n) const = 0;
/**
* Computes the hashes for a set of bytes.
*
* @param x Pointer to first byte to hash.
*
* @param n Number of bytes to hash.
*
* @return Vector of *k* hash values.
*
*/
virtual digest_vector Hash(const void* x, size_t n) const = 0;
/**
* Returns a deep copy of the hasher.
*/
virtual Hasher* Clone() const = 0;
/**
* Returns a deep copy of the hasher.
*/
virtual Hasher* Clone() const = 0;
/**
* Returns true if two hashers are identical.
*/
virtual bool Equals(const Hasher* other) const = 0;
/**
* Returns true if two hashers are identical.
*/
virtual bool Equals(const Hasher* other) const = 0;
/**
* Returns the number *k* of hash functions the hashers applies.
*/
size_t K() const { return k; }
/**
* Returns the number *k* of hash functions the hashers applies.
*/
size_t K() const { return k; }
/**
* Returns the seed used to construct the hasher.
*/
seed_t Seed() const { return seed; }
/**
* Returns the seed used to construct the hasher.
*/
seed_t Seed() const { return seed; }
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<Hasher> Unserialize(const broker::data& data);
broker::expected<broker::data> Serialize() const;
static std::unique_ptr<Hasher> Unserialize(const broker::data& data);
protected:
Hasher() { }
Hasher() {}
/**
* Constructor.
*
* @param arg_k the number of hash functions.
*
* @param arg_seed The seed for the hasher.
*/
Hasher(size_t arg_k, seed_t arg_seed);
/**
* Constructor.
*
* @param arg_k the number of hash functions.
*
* @param arg_seed The seed for the hasher.
*/
Hasher(size_t arg_k, seed_t arg_seed);
virtual HasherType Type() const = 0;
virtual HasherType Type() const = 0;
private:
size_t k = 0;
seed_t seed = {0};
};
size_t k = 0;
seed_t seed = {0};
};
/**
* A universal hash function family. This is a helper class that Hasher
* implementations can use in their implementation.
*/
class UHF
{
class UHF {
public:
/**
* Default constructor with zero seed.
*/
UHF();
/**
* Default constructor with zero seed.
*/
UHF();
/**
* Constructs an hash function seeded with a given seed and an
* optional extra seed to replace the initial Zeek seed.
*
* @param arg_seed The seed to use for this instance.
*/
explicit UHF(Hasher::seed_t arg_seed);
/**
* Constructs an hash function seeded with a given seed and an
* optional extra seed to replace the initial Zeek seed.
*
* @param arg_seed The seed to use for this instance.
*/
explicit UHF(Hasher::seed_t arg_seed);
template <typename T> Hasher::digest operator()(const T& x) const
{
return hash(&x, sizeof(T));
}
template<typename T>
Hasher::digest operator()(const T& x) const {
return hash(&x, sizeof(T));
}
/**
* Computes hash values for an element.
*
* @param x The element to hash.
*
* @return Vector of *k* hash values.
*/
Hasher::digest operator()(const void* x, size_t n) const { return hash(x, n); }
/**
* Computes hash values for an element.
*
* @param x The element to hash.
*
* @return Vector of *k* hash values.
*/
Hasher::digest operator()(const void* x, size_t n) const { return hash(x, n); }
/**
* Computes the hashes for a set of bytes.
*
* @param x Pointer to first byte to hash.
*
* @param n Number of bytes to hash.
*
* @return Vector of *k* hash values.
*
*/
Hasher::digest hash(const void* x, size_t n) const;
/**
* Computes the hashes for a set of bytes.
*
* @param x Pointer to first byte to hash.
*
* @param n Number of bytes to hash.
*
* @return Vector of *k* hash values.
*
*/
Hasher::digest hash(const void* x, size_t n) const;
friend bool operator==(const UHF& x, const UHF& y)
{
return (x.seed.h[0] == y.seed.h[0]) && (x.seed.h[1] == y.seed.h[1]);
}
friend bool operator==(const UHF& x, const UHF& y) {
return (x.seed.h[0] == y.seed.h[0]) && (x.seed.h[1] == y.seed.h[1]);
}
friend bool operator!=(const UHF& x, const UHF& y) { return ! (x == y); }
friend bool operator!=(const UHF& x, const UHF& y) { return ! (x == y); }
broker::expected<broker::data> Serialize() const;
static UHF Unserialize(const broker::data& data);
broker::expected<broker::data> Serialize() const;
static UHF Unserialize(const broker::data& data);
private:
static size_t compute_seed(Hasher::seed_t seed);
static size_t compute_seed(Hasher::seed_t seed);
Hasher::seed_t seed;
};
Hasher::seed_t seed;
};
/**
* A hasher implementing the default hashing policy. Uses *k* separate hash
* functions internally.
*/
class DefaultHasher : public Hasher
{
class DefaultHasher : public Hasher {
public:
/**
* Constructor for a hasher with *k* hash functions.
*
* @param k The number of hash functions to use.
*
* @param seed The seed for the hasher.
*/
DefaultHasher(size_t k, Hasher::seed_t seed);
/**
* Constructor for a hasher with *k* hash functions.
*
* @param k The number of hash functions to use.
*
* @param seed The seed for the hasher.
*/
DefaultHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
digest_vector Hash(const void* x, size_t n) const final;
DefaultHasher* Clone() const final;
bool Equals(const Hasher* other) const final;
// Overridden from Hasher.
digest_vector Hash(const void* x, size_t n) const final;
DefaultHasher* Clone() const final;
bool Equals(const Hasher* other) const final;
private:
DefaultHasher() { }
DefaultHasher() {}
HasherType Type() const override { return HasherType::Default; }
HasherType Type() const override { return HasherType::Default; }
std::vector<UHF> hash_functions;
};
std::vector<UHF> hash_functions;
};
/**
* The *double-hashing* policy. Uses a linear combination of two hash
* functions.
*/
class DoubleHasher : public Hasher
{
class DoubleHasher : public Hasher {
public:
/**
* Constructor for a double hasher with *k* hash functions.
*
* @param k The number of hash functions to use.
*
* @param seed The seed for the hasher.
*/
DoubleHasher(size_t k, Hasher::seed_t seed);
/**
* Constructor for a double hasher with *k* hash functions.
*
* @param k The number of hash functions to use.
*
* @param seed The seed for the hasher.
*/
DoubleHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
digest_vector Hash(const void* x, size_t n) const final;
DoubleHasher* Clone() const final;
bool Equals(const Hasher* other) const final;
// Overridden from Hasher.
digest_vector Hash(const void* x, size_t n) const final;
DoubleHasher* Clone() const final;
bool Equals(const Hasher* other) const final;
private:
DoubleHasher() { }
DoubleHasher() {}
HasherType Type() const override { return HasherType::Double; }
HasherType Type() const override { return HasherType::Double; }
UHF h1;
UHF h2;
};
UHF h1;
UHF h2;
};
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -9,513 +9,467 @@
#include "zeek/Reporter.h"
#include "zeek/broker/Data.h"
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
static void topk_element_hash_delete_func(void* val)
{
Element* e = (Element*)val;
delete e;
}
static void topk_element_hash_delete_func(void* val) {
Element* e = (Element*)val;
delete e;
}
void TopkVal::Typify(TypePtr t)
{
assert(! hash && ! type);
type = std::move(t);
auto tl = make_intrusive<TypeList>(type);
tl->Append(type);
hash = new zeek::detail::CompositeHash(std::move(tl));
}
void TopkVal::Typify(TypePtr t) {
assert(! hash && ! type);
type = std::move(t);
auto tl = make_intrusive<TypeList>(type);
tl->Append(type);
hash = new zeek::detail::CompositeHash(std::move(tl));
}
zeek::detail::HashKey* TopkVal::GetHash(Val* v) const
{
auto key = hash->MakeHashKey(*v, true);
assert(key);
return key.release();
}
zeek::detail::HashKey* TopkVal::GetHash(Val* v) const {
auto key = hash->MakeHashKey(*v, true);
assert(key);
return key.release();
}
TopkVal::TopkVal(uint64_t arg_size) : OpaqueVal(topk_type)
{
elementDict = new PDict<Element>;
elementDict->SetDeleteFunc(topk_element_hash_delete_func);
size = arg_size;
numElements = 0;
pruned = false;
hash = nullptr;
}
TopkVal::TopkVal(uint64_t arg_size) : OpaqueVal(topk_type) {
elementDict = new PDict<Element>;
elementDict->SetDeleteFunc(topk_element_hash_delete_func);
size = arg_size;
numElements = 0;
pruned = false;
hash = nullptr;
}
TopkVal::TopkVal() : OpaqueVal(topk_type)
{
elementDict = new PDict<Element>;
elementDict->SetDeleteFunc(topk_element_hash_delete_func);
size = 0;
numElements = 0;
hash = nullptr;
}
TopkVal::TopkVal() : OpaqueVal(topk_type) {
elementDict = new PDict<Element>;
elementDict->SetDeleteFunc(topk_element_hash_delete_func);
size = 0;
numElements = 0;
hash = nullptr;
}
TopkVal::~TopkVal()
{
elementDict->Clear();
delete elementDict;
TopkVal::~TopkVal() {
elementDict->Clear();
delete elementDict;
// now all elements are already gone - delete the buckets
std::list<Bucket*>::iterator bi = buckets.begin();
while ( bi != buckets.end() )
{
delete *bi;
bi++;
}
// now all elements are already gone - delete the buckets
std::list<Bucket*>::iterator bi = buckets.begin();
while ( bi != buckets.end() ) {
delete *bi;
bi++;
}
delete hash;
}
delete hash;
}
void TopkVal::Merge(const TopkVal* value, bool doPrune)
{
if ( ! value->type )
{
// Merge-from is empty. Nothing to do.
assert(value->numElements == 0);
return;
}
void TopkVal::Merge(const TopkVal* value, bool doPrune) {
if ( ! value->type ) {
// Merge-from is empty. Nothing to do.
assert(value->numElements == 0);
return;
}
if ( type == nullptr )
{
assert(numElements == 0);
Typify(value->type);
}
if ( type == nullptr ) {
assert(numElements == 0);
Typify(value->type);
}
else
{
if ( ! same_type(type, value->type) )
{
reporter->Error("Cannot merge top-k elements of differing types.");
return;
}
}
else {
if ( ! same_type(type, value->type) ) {
reporter->Error("Cannot merge top-k elements of differing types.");
return;
}
}
std::list<Bucket*>::const_iterator it = value->buckets.begin();
while ( it != value->buckets.end() )
{
Bucket* b = *it;
uint64_t currcount = b->count;
std::list<Element*>::const_iterator eit = b->elements.begin();
std::list<Bucket*>::const_iterator it = value->buckets.begin();
while ( it != value->buckets.end() ) {
Bucket* b = *it;
uint64_t currcount = b->count;
std::list<Element*>::const_iterator eit = b->elements.begin();
while ( eit != b->elements.end() )
{
Element* e = *eit;
// lookup if we already know this one...
zeek::detail::HashKey* key = GetHash(e->value);
Element* olde = (Element*)elementDict->Lookup(key);
while ( eit != b->elements.end() ) {
Element* e = *eit;
// lookup if we already know this one...
zeek::detail::HashKey* key = GetHash(e->value);
Element* olde = (Element*)elementDict->Lookup(key);
if ( olde == nullptr )
{
olde = new Element();
olde->epsilon = 0;
olde->value = e->value;
// insert at bucket position 0
if ( buckets.size() > 0 )
{
assert(buckets.front()->count > 0);
}
if ( olde == nullptr ) {
olde = new Element();
olde->epsilon = 0;
olde->value = e->value;
// insert at bucket position 0
if ( buckets.size() > 0 ) {
assert(buckets.front()->count > 0);
}
Bucket* newbucket = new Bucket();
newbucket->count = 0;
newbucket->bucketPos = buckets.insert(buckets.begin(), newbucket);
Bucket* newbucket = new Bucket();
newbucket->count = 0;
newbucket->bucketPos = buckets.insert(buckets.begin(), newbucket);
olde->parent = newbucket;
newbucket->elements.insert(newbucket->elements.end(), olde);
olde->parent = newbucket;
newbucket->elements.insert(newbucket->elements.end(), olde);
elementDict->Insert(key, olde);
numElements++;
}
elementDict->Insert(key, olde);
numElements++;
}
// now that we are sure that the old element is present - increment epsilon
olde->epsilon += e->epsilon;
// now that we are sure that the old element is present - increment epsilon
olde->epsilon += e->epsilon;
// and increment position...
IncrementCounter(olde, currcount);
delete key;
// and increment position...
IncrementCounter(olde, currcount);
delete key;
eit++;
}
eit++;
}
it++;
}
it++;
}
// now we have added everything. And our top-k table could be too big.
// prune everything...
// now we have added everything. And our top-k table could be too big.
// prune everything...
assert(size > 0);
assert(size > 0);
if ( ! doPrune )
return;
if ( ! doPrune )
return;
while ( numElements > size )
{
pruned = true;
assert(buckets.size() > 0);
Bucket* b = buckets.front();
assert(b->elements.size() > 0);
while ( numElements > size ) {
pruned = true;
assert(buckets.size() > 0);
Bucket* b = buckets.front();
assert(b->elements.size() > 0);
Element* e = b->elements.front();
zeek::detail::HashKey* key = GetHash(e->value);
elementDict->RemoveEntry(key);
delete key;
delete e;
Element* e = b->elements.front();
zeek::detail::HashKey* key = GetHash(e->value);
elementDict->RemoveEntry(key);
delete key;
delete e;
b->elements.pop_front();
b->elements.pop_front();
if ( b->elements.size() == 0 )
{
delete b;
buckets.pop_front();
}
if ( b->elements.size() == 0 ) {
delete b;
buckets.pop_front();
}
numElements--;
}
}
numElements--;
}
}
// Deep-copy this top-k structure by merging its contents into a fresh
// instance of the same capacity.
ValPtr TopkVal::DoClone(CloneState* state)
{
auto clone = make_intrusive<TopkVal>(size);
// Merge without pruning copies all buckets and elements over.
clone->Merge(this);
return state->NewClone(this, std::move(clone));
}
// Deep-copy this top-k structure by merging its contents into a fresh
// instance of the same capacity.
ValPtr TopkVal::DoClone(CloneState* state) {
    auto copy = make_intrusive<TopkVal>(size);
    copy->Merge(this); // no pruning: copies all buckets and elements
    return state->NewClone(this, std::move(copy));
}
VectorValPtr TopkVal::GetTopK(int k) const // returns vector
{
if ( numElements == 0 )
{
reporter->Error("Cannot return topk of empty");
return nullptr;
}
{
if ( numElements == 0 ) {
reporter->Error("Cannot return topk of empty");
return nullptr;
}
auto v = make_intrusive<VectorType>(type);
auto t = make_intrusive<VectorVal>(std::move(v));
auto v = make_intrusive<VectorType>(type);
auto t = make_intrusive<VectorVal>(std::move(v));
// this does no estimation if the results is correct!
// in any case - just to make this future-proof (and I am lazy) - this can return more than k.
// this does no estimation if the results is correct!
// in any case - just to make this future-proof (and I am lazy) - this can return more than k.
int read = 0;
std::list<Bucket*>::const_iterator it = buckets.end();
it--;
while ( read < k )
{
// printf("Bucket %llu\n", (*it)->count);
std::list<Element*>::iterator eit = (*it)->elements.begin();
while ( eit != (*it)->elements.end() )
{
// printf("Size: %ld\n", (*it)->elements.size());
t->Assign(read, (*eit)->value);
read++;
eit++;
}
int read = 0;
std::list<Bucket*>::const_iterator it = buckets.end();
it--;
while ( read < k ) {
// printf("Bucket %llu\n", (*it)->count);
std::list<Element*>::iterator eit = (*it)->elements.begin();
while ( eit != (*it)->elements.end() ) {
// printf("Size: %ld\n", (*it)->elements.size());
t->Assign(read, (*eit)->value);
read++;
eit++;
}
if ( it == buckets.begin() )
break;
if ( it == buckets.begin() )
break;
it--;
}
it--;
}
return t;
}
return t;
}
uint64_t TopkVal::GetCount(Val* value) const
{
zeek::detail::HashKey* key = GetHash(value);
Element* e = (Element*)elementDict->Lookup(key);
delete key;
uint64_t TopkVal::GetCount(Val* value) const {
zeek::detail::HashKey* key = GetHash(value);
Element* e = (Element*)elementDict->Lookup(key);
delete key;
if ( e == nullptr )
{
reporter->Error("GetCount for element that is not in top-k");
return 0;
}
if ( e == nullptr ) {
reporter->Error("GetCount for element that is not in top-k");
return 0;
}
return e->parent->count;
}
return e->parent->count;
}
uint64_t TopkVal::GetEpsilon(Val* value) const
{
zeek::detail::HashKey* key = GetHash(value);
Element* e = (Element*)elementDict->Lookup(key);
delete key;
uint64_t TopkVal::GetEpsilon(Val* value) const {
zeek::detail::HashKey* key = GetHash(value);
Element* e = (Element*)elementDict->Lookup(key);
delete key;
if ( e == nullptr )
{
reporter->Error("GetEpsilon for element that is not in top-k");
return 0;
}
if ( e == nullptr ) {
reporter->Error("GetEpsilon for element that is not in top-k");
return 0;
}
return e->epsilon;
}
return e->epsilon;
}
uint64_t TopkVal::GetSum() const
{
uint64_t sum = 0;
uint64_t TopkVal::GetSum() const {
uint64_t sum = 0;
std::list<Bucket*>::const_iterator it = buckets.begin();
while ( it != buckets.end() )
{
sum += (*it)->elements.size() * (*it)->count;
std::list<Bucket*>::const_iterator it = buckets.begin();
while ( it != buckets.end() ) {
sum += (*it)->elements.size() * (*it)->count;
it++;
}
it++;
}
if ( pruned )
reporter->Warning("TopkVal::GetSum() was used on a pruned data structure. Result values do "
"not represent total element count");
if ( pruned )
reporter->Warning(
"TopkVal::GetSum() was used on a pruned data structure. Result values do "
"not represent total element count");
return sum;
}
return sum;
}
void TopkVal::Encountered(ValPtr encountered)
{
// ok, let's see if we already know this one.
void TopkVal::Encountered(ValPtr encountered) {
// ok, let's see if we already know this one.
if ( numElements == 0 )
Typify(encountered->GetType());
else if ( ! same_type(type, encountered->GetType()) )
{
reporter->Error("Trying to add element to topk with differing type from other elements");
return;
}
if ( numElements == 0 )
Typify(encountered->GetType());
else if ( ! same_type(type, encountered->GetType()) ) {
reporter->Error("Trying to add element to topk with differing type from other elements");
return;
}
// Step 1 - get the hash.
zeek::detail::HashKey* key = GetHash(encountered);
Element* e = (Element*)elementDict->Lookup(key);
// Step 1 - get the hash.
zeek::detail::HashKey* key = GetHash(encountered);
Element* e = (Element*)elementDict->Lookup(key);
if ( e == nullptr )
{
e = new Element();
e->epsilon = 0;
e->value = std::move(encountered);
if ( e == nullptr ) {
e = new Element();
e->epsilon = 0;
e->value = std::move(encountered);
// well, we do not know this one yet...
if ( numElements < size )
{
// brilliant. just add it at position 1
if ( buckets.size() == 0 || (*buckets.begin())->count > 1 )
{
Bucket* b = new Bucket();
b->count = 1;
std::list<Bucket*>::iterator pos = buckets.insert(buckets.begin(), b);
b->bucketPos = pos;
b->elements.insert(b->elements.end(), e);
e->parent = b;
}
else
{
Bucket* b = *buckets.begin();
assert(b->count == 1);
b->elements.insert(b->elements.end(), e);
e->parent = b;
}
// well, we do not know this one yet...
if ( numElements < size ) {
// brilliant. just add it at position 1
if ( buckets.size() == 0 || (*buckets.begin())->count > 1 ) {
Bucket* b = new Bucket();
b->count = 1;
std::list<Bucket*>::iterator pos = buckets.insert(buckets.begin(), b);
b->bucketPos = pos;
b->elements.insert(b->elements.end(), e);
e->parent = b;
}
else {
Bucket* b = *buckets.begin();
assert(b->count == 1);
b->elements.insert(b->elements.end(), e);
e->parent = b;
}
elementDict->Insert(key, e);
numElements++;
delete key;
elementDict->Insert(key, e);
numElements++;
delete key;
return; // done. it is at pos 1.
}
return; // done. it is at pos 1.
}
else
{
// replace element with min-value
Bucket* b = *buckets.begin(); // bucket with smallest elements
else {
// replace element with min-value
Bucket* b = *buckets.begin(); // bucket with smallest elements
// evict oldest element with least hits.
assert(b->elements.size() > 0);
zeek::detail::HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
b->elements.erase(b->elements.begin());
Element* deleteElement = (Element*)elementDict->RemoveEntry(deleteKey);
assert(deleteElement); // there has to have been a minimal element...
delete deleteElement;
delete deleteKey;
// evict oldest element with least hits.
assert(b->elements.size() > 0);
zeek::detail::HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
b->elements.erase(b->elements.begin());
Element* deleteElement = (Element*)elementDict->RemoveEntry(deleteKey);
assert(deleteElement); // there has to have been a minimal element...
delete deleteElement;
delete deleteKey;
// and add the new one to the end
e->epsilon = b->count;
b->elements.insert(b->elements.end(), e);
elementDict->Insert(key, e);
e->parent = b;
// and add the new one to the end
e->epsilon = b->count;
b->elements.insert(b->elements.end(), e);
elementDict->Insert(key, e);
e->parent = b;
// fallthrough, increment operation has to run!
}
}
// fallthrough, increment operation has to run!
}
}
// ok, we now have an element in e
delete key;
IncrementCounter(e); // well, this certainly was anticlimactic.
}
// ok, we now have an element in e
delete key;
IncrementCounter(e); // well, this certainly was anticlimactic.
}
// increment by count
void TopkVal::IncrementCounter(Element* e, unsigned int count)
{
Bucket* currBucket = e->parent;
uint64_t currcount = currBucket->count;
void TopkVal::IncrementCounter(Element* e, unsigned int count) {
Bucket* currBucket = e->parent;
uint64_t currcount = currBucket->count;
// well, let's test if there is a bucket for currcount++
std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
// well, let's test if there is a bucket for currcount++
std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
Bucket* nextBucket = nullptr;
Bucket* nextBucket = nullptr;
bucketIter++;
bucketIter++;
while ( bucketIter != buckets.end() && (*bucketIter)->count < currcount + count )
bucketIter++;
while ( bucketIter != buckets.end() && (*bucketIter)->count < currcount + count )
bucketIter++;
if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount + count )
nextBucket = *bucketIter;
if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount + count )
nextBucket = *bucketIter;
if ( nextBucket == nullptr )
{
// the bucket for the value that we want does not exist.
// create it...
if ( nextBucket == nullptr ) {
// the bucket for the value that we want does not exist.
// create it...
Bucket* b = new Bucket();
b->count = currcount + count;
Bucket* b = new Bucket();
b->count = currcount + count;
std::list<Bucket*>::iterator nextBucketPos = buckets.insert(bucketIter, b);
b->bucketPos = nextBucketPos; // and give it the iterator we know now.
std::list<Bucket*>::iterator nextBucketPos = buckets.insert(bucketIter, b);
b->bucketPos = nextBucketPos; // and give it the iterator we know now.
nextBucket = b;
}
nextBucket = b;
}
// ok, now we have the new bucket in nextBucket. Shift the element over...
currBucket->elements.remove(e);
nextBucket->elements.insert(nextBucket->elements.end(), e);
// ok, now we have the new bucket in nextBucket. Shift the element over...
currBucket->elements.remove(e);
nextBucket->elements.insert(nextBucket->elements.end(), e);
e->parent = nextBucket;
e->parent = nextBucket;
// if currBucket is empty, we have to delete it now
if ( currBucket->elements.size() == 0 )
{
buckets.remove(currBucket);
delete currBucket;
currBucket = nullptr;
}
}
// if currBucket is empty, we have to delete it now
if ( currBucket->elements.size() == 0 ) {
buckets.remove(currBucket);
delete currBucket;
currBucket = nullptr;
}
}
IMPLEMENT_OPAQUE_VALUE(TopkVal)
broker::expected<broker::data> TopkVal::DoSerialize() const
{
broker::vector d = {size, numElements, pruned};
broker::expected<broker::data> TopkVal::DoSerialize() const {
broker::vector d = {size, numElements, pruned};
if ( type )
{
auto t = SerializeType(type);
if ( ! t )
return broker::ec::invalid_data;
if ( type ) {
auto t = SerializeType(type);
if ( ! t )
return broker::ec::invalid_data;
d.emplace_back(std::move(*t));
}
else
d.emplace_back(broker::none());
d.emplace_back(std::move(*t));
}
else
d.emplace_back(broker::none());
uint64_t i = 0;
std::list<Bucket*>::const_iterator it = buckets.begin();
while ( it != buckets.end() )
{
Bucket* b = *it;
uint32_t elements_count = b->elements.size();
uint64_t i = 0;
std::list<Bucket*>::const_iterator it = buckets.begin();
while ( it != buckets.end() ) {
Bucket* b = *it;
uint32_t elements_count = b->elements.size();
d.emplace_back(static_cast<uint64_t>(b->elements.size()));
d.emplace_back(b->count);
d.emplace_back(static_cast<uint64_t>(b->elements.size()));
d.emplace_back(b->count);
std::list<Element*>::const_iterator eit = b->elements.begin();
while ( eit != b->elements.end() )
{
Element* element = *eit;
d.emplace_back(element->epsilon);
auto v = Broker::detail::val_to_data(element->value.get());
if ( ! v )
return broker::ec::invalid_data;
std::list<Element*>::const_iterator eit = b->elements.begin();
while ( eit != b->elements.end() ) {
Element* element = *eit;
d.emplace_back(element->epsilon);
auto v = Broker::detail::val_to_data(element->value.get());
if ( ! v )
return broker::ec::invalid_data;
d.emplace_back(*v);
d.emplace_back(*v);
eit++;
i++;
}
eit++;
i++;
}
it++;
}
it++;
}
assert(i == numElements);
return {std::move(d)};
}
assert(i == numElements);
return {std::move(d)};
}
bool TopkVal::DoUnserialize(const broker::data& data)
{
auto v = broker::get_if<broker::vector>(&data);
bool TopkVal::DoUnserialize(const broker::data& data) {
auto v = broker::get_if<broker::vector>(&data);
if ( ! (v && v->size() >= 4) )
return false;
if ( ! (v && v->size() >= 4) )
return false;
auto size_ = broker::get_if<uint64_t>(&(*v)[0]);
auto numElements_ = broker::get_if<uint64_t>(&(*v)[1]);
auto pruned_ = broker::get_if<bool>(&(*v)[2]);
auto size_ = broker::get_if<uint64_t>(&(*v)[0]);
auto numElements_ = broker::get_if<uint64_t>(&(*v)[1]);
auto pruned_ = broker::get_if<bool>(&(*v)[2]);
if ( ! (size_ && numElements_ && pruned_) )
return false;
if ( ! (size_ && numElements_ && pruned_) )
return false;
size = *size_;
numElements = *numElements_;
pruned = *pruned_;
size = *size_;
numElements = *numElements_;
pruned = *pruned_;
auto no_type = broker::get_if<broker::none>(&(*v)[3]);
if ( ! no_type )
{
auto t = UnserializeType((*v)[3]);
auto no_type = broker::get_if<broker::none>(&(*v)[3]);
if ( ! no_type ) {
auto t = UnserializeType((*v)[3]);
if ( ! t )
return false;
if ( ! t )
return false;
Typify(t);
}
Typify(t);
}
uint64_t i = 0;
uint64_t idx = 4;
uint64_t i = 0;
uint64_t idx = 4;
while ( i < numElements )
{
auto elements_count = broker::get_if<uint64_t>(&(*v)[idx++]);
auto count = broker::get_if<uint64_t>(&(*v)[idx++]);
while ( i < numElements ) {
auto elements_count = broker::get_if<uint64_t>(&(*v)[idx++]);
auto count = broker::get_if<uint64_t>(&(*v)[idx++]);
if ( ! (elements_count && count) )
return false;
if ( ! (elements_count && count) )
return false;
Bucket* b = new Bucket();
b->count = *count;
b->bucketPos = buckets.insert(buckets.end(), b);
Bucket* b = new Bucket();
b->count = *count;
b->bucketPos = buckets.insert(buckets.end(), b);
for ( uint64_t j = 0; j < *elements_count; j++ )
{
auto epsilon = broker::get_if<uint64_t>(&(*v)[idx++]);
auto val = Broker::detail::data_to_val((*v)[idx++], type.get());
for ( uint64_t j = 0; j < *elements_count; j++ ) {
auto epsilon = broker::get_if<uint64_t>(&(*v)[idx++]);
auto val = Broker::detail::data_to_val((*v)[idx++], type.get());
if ( ! (epsilon && val) )
return false;
if ( ! (epsilon && val) )
return false;
Element* e = new Element();
e->epsilon = *epsilon;
e->value = std::move(val);
e->parent = b;
Element* e = new Element();
e->epsilon = *epsilon;
e->value = std::move(val);
e->parent = b;
b->elements.insert(b->elements.end(), e);
b->elements.insert(b->elements.end(), e);
zeek::detail::HashKey* key = GetHash(e->value);
assert(elementDict->Lookup(key) == nullptr);
zeek::detail::HashKey* key = GetHash(e->value);
assert(elementDict->Lookup(key) == nullptr);
elementDict->Insert(key, e);
delete key;
elementDict->Insert(key, e);
delete key;
i++;
}
}
i++;
}
}
assert(i == numElements);
return true;
}
assert(i == numElements);
return true;
}
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail

View file

@ -13,173 +13,167 @@
//
// Or - to be more precise - it implements an interpretation of it.
namespace zeek::detail
{
namespace zeek::detail {
class CompositeHash;
}
}
namespace zeek::probabilistic::detail
{
namespace zeek::probabilistic::detail {
struct Element;
struct Bucket
{
uint64_t count;
std::list<Element*> elements;
struct Bucket {
uint64_t count;
std::list<Element*> elements;
// Iterators only get invalidated for removed elements. This one
// points to us - so it is invalid when we are no longer there. Cute,
// isn't it?
std::list<Bucket*>::iterator bucketPos;
};
// Iterators only get invalidated for removed elements. This one
// points to us - so it is invalid when we are no longer there. Cute,
// isn't it?
std::list<Bucket*>::iterator bucketPos;
};
// A single tracked value together with its error term and a link to
// the count bucket it currently belongs to.
struct Element
{
uint64_t epsilon; // error estimate for this element's count (see TopkVal::GetEpsilon)
ValPtr value; // the tracked Zeek value
Bucket* parent; // bucket whose count is this element's current count estimate
};
class TopkVal : public OpaqueVal
{
// A single tracked value together with its error term and a link to
// the count bucket it currently belongs to.
struct Element {
    uint64_t epsilon; // error estimate for this element's count (see TopkVal::GetEpsilon)
    ValPtr value;     // the tracked Zeek value
    Bucket* parent;   // bucket whose count is this element's current count estimate
};
class TopkVal : public OpaqueVal {
public:
/**
* Construct a TopkVal.
*
* @param size specifies how many total elements are tracked
*
* @return A newly initialized TopkVal
*/
explicit TopkVal(uint64_t size);
/**
* Construct a TopkVal.
*
* @param size specifies how many total elements are tracked
*
* @return A newly initialized TopkVal
*/
explicit TopkVal(uint64_t size);
/**
* Destructor.
*/
~TopkVal() override;
/**
* Destructor.
*/
~TopkVal() override;
/**
* Call this when a new value is encountered. Note that on the first
* call, the Zeek type of the value types that are counted is set. All
* following calls to encountered have to specify the same type.
*
* @param value The encountered element
*/
void Encountered(ValPtr value);
/**
* Call this when a new value is encountered. Note that on the first
* call, the Zeek type of the value types that are counted is set. All
* following calls to encountered have to specify the same type.
*
* @param value The encountered element
*/
void Encountered(ValPtr value);
/**
* Get the first *k* elements of the result vector. At the moment,
* this does not check if it is in the right order or if we can prove
* that these are the correct top-k. Use count and epsilon for this.
*
* @param k Number of top-elements to return
*
* @returns The top-k encountered elements
*/
VectorValPtr GetTopK(int k) const;
/**
* Get the first *k* elements of the result vector. At the moment,
* this does not check if it is in the right order or if we can prove
* that these are the correct top-k. Use count and epsilon for this.
*
* @param k Number of top-elements to return
*
* @returns The top-k encountered elements
*/
VectorValPtr GetTopK(int k) const;
/**
* Get the current count tracked in the top-k data structure for a
* certain val. Returns 0 if the val is unknown (and logs the error
* to reporter).
*
* @param value Zeek value to get counts for
*
* @returns internal count for val, 0 if unknown
*/
uint64_t GetCount(Val* value) const;
/**
* Get the current count tracked in the top-k data structure for a
* certain val. Returns 0 if the val is unknown (and logs the error
* to reporter).
*
* @param value Zeek value to get counts for
*
* @returns internal count for val, 0 if unknown
*/
uint64_t GetCount(Val* value) const;
/**
* Get the current epsilon tracked in the top-k data structure for a
* certain val.
*
* @param value Zeek value to get epsilons for
*
* @returns the epsilon. Returns 0 if the val is unknown (and logs
* the error to reporter)
*/
uint64_t GetEpsilon(Val* value) const;
/**
* Get the current epsilon tracked in the top-k data structure for a
* certain val.
*
* @param value Zeek value to get epsilons for
*
* @returns the epsilon. Returns 0 if the val is unknown (and logs
* the error to reporter)
*/
uint64_t GetEpsilon(Val* value) const;
/**
* Get the size set in the constructor
*
* @returns size of the top-k structure
*/
uint64_t GetSize() const { return size; }
/**
* Get the size set in the constructor
*
* @returns size of the top-k structure
*/
uint64_t GetSize() const { return size; }
/**
* Get the sum of all counts of all tracked elements. This is equal
* to the number of total observations up to this moment, if no
* elements were pruned from the data structure.
*
* @returns sum of all counts
*/
uint64_t GetSum() const;
/**
* Get the sum of all counts of all tracked elements. This is equal
* to the number of total observations up to this moment, if no
* elements were pruned from the data structure.
*
* @returns sum of all counts
*/
uint64_t GetSum() const;
/**
* Merge another top-k data structure into this one. doPrune
* specifies if the total count of elements is limited to size after
* merging. Please note, that pruning will invalidate the results of
* getSum.
*
* @param value TopkVal to merge into this TopkVal
*
* @param doPrune prune resulting TopkVal to size after merging
*/
void Merge(const TopkVal* value, bool doPrune = false);
/**
* Merge another top-k data structure into this one. doPrune
* specifies if the total count of elements is limited to size after
* merging. Please note, that pruning will invalidate the results of
* getSum.
*
* @param value TopkVal to merge into this TopkVal
*
* @param doPrune prune resulting TopkVal to size after merging
*/
void Merge(const TopkVal* value, bool doPrune = false);
/**
* Clone the Opaque Type
*
* @param state Clone state (tracking duplicate pointers)
*
* @returns cloned TopkVal
*/
ValPtr DoClone(CloneState* state) override;
/**
* Clone the Opaque Type
*
* @param state Clone state (tracking duplicate pointers)
*
* @returns cloned TopkVal
*/
ValPtr DoClone(CloneState* state) override;
DECLARE_OPAQUE_VALUE(TopkVal)
DECLARE_OPAQUE_VALUE(TopkVal)
protected:
/**
* Construct an empty TopkVal. Only used for deserialization
*/
TopkVal();
/**
* Construct an empty TopkVal. Only used for deserialization
*/
TopkVal();
private:
/**
* Increment the counter for a specific element
*
* @param e element to increment counter for
*
* @param count increment counter by this much
*/
void IncrementCounter(Element* e, unsigned int count = 1);
/**
* Increment the counter for a specific element
*
* @param e element to increment counter for
*
* @param count increment counter by this much
*/
void IncrementCounter(Element* e, unsigned int count = 1);
/**
* get the hashkey for a specific value
*
* @param v value to generate key for
*
* @returns HashKey for value
*/
zeek::detail::HashKey* GetHash(Val* v) const; // this probably should go somewhere else.
zeek::detail::HashKey* GetHash(const ValPtr& v) const { return GetHash(v.get()); }
/**
* get the hashkey for a specific value
*
* @param v value to generate key for
*
* @returns HashKey for value
*/
zeek::detail::HashKey* GetHash(Val* v) const; // this probably should go somewhere else.
zeek::detail::HashKey* GetHash(const ValPtr& v) const { return GetHash(v.get()); }
/**
* Set the type that this TopK instance tracks
*
* @param t type that is tracked
*/
void Typify(TypePtr t);
/**
* Set the type that this TopK instance tracks
*
* @param t type that is tracked
*/
void Typify(TypePtr t);
TypePtr type;
zeek::detail::CompositeHash* hash = nullptr;
std::list<Bucket*> buckets;
PDict<Element>* elementDict = nullptr;
uint64_t size = 0; // how many elements are we tracking?
uint64_t numElements = 0; // how many elements do we have at the moment
bool pruned = false; // was this data structure pruned?
};
TypePtr type;
zeek::detail::CompositeHash* hash = nullptr;
std::list<Bucket*> buckets;
PDict<Element>* elementDict = nullptr;
uint64_t size = 0; // how many elements are we tracking?
uint64_t numElements = 0; // how many elements do we have at the moment
bool pruned = false; // was this data structure pruned?
};
} // namespace zeek::probabilistic::detail
} // namespace zeek::probabilistic::detail