mirror of
https://github.com/zeek/zeek.git
synced 2025-10-04 15:48:19 +00:00
Merge remote-tracking branch 'origin/topic/bernhard/hyperloglog'
* origin/topic/bernhard/hyperloglog: (32 commits) add clustered leak test for hll. No issues. make gcc happy (hopefully) fix refcounting problem in hll/bloom-filter opaque vals. Thanks Robin. re-use same hash class for all add operations get hll ready for merging and forgot a file... adapt to new structure fix opaqueval-related memleak. make it compile on case-sensitive file systems and fix warnings make error rate configureable add persistence test not using predetermined random seeds. update cluster test to also use hll persistence really works. well, with this commit synchronizing the data structure should work.. ...if we had consistent hashing. and also serialize the other things we need ok, this bug was hard to find. serialization compiles. change plugin after feedback of seth Forgot a file. Again. Like always. Basically. do away with old file. ...
This commit is contained in:
commit
4dcf8fc0db
29 changed files with 991 additions and 18 deletions
|
@ -11,9 +11,11 @@ set(probabilistic_SRCS
|
|||
BloomFilter.cc
|
||||
CounterVector.cc
|
||||
Hasher.cc
|
||||
HyperLogLog.cc
|
||||
Topk.cc)
|
||||
|
||||
bif_target(bloom-filter.bif)
|
||||
bif_target(hyper-loglog.bif)
|
||||
bif_target(top-k.bif)
|
||||
bro_add_subdir_library(probabilistic ${probabilistic_SRCS})
|
||||
|
||||
|
|
187
src/probabilistic/HyperLogLog.cc
Normal file
187
src/probabilistic/HyperLogLog.cc
Normal file
|
@ -0,0 +1,187 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <iostream>
|
||||
#include "HyperLogLog.h"
|
||||
#include "Reporter.h"
|
||||
#include "Serializer.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
int CardinalityCounter::OptimalB(double error)
|
||||
{
|
||||
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
||||
int answer = (int) floor(initial_estimate);
|
||||
|
||||
double k = 0;
|
||||
|
||||
do {
|
||||
answer++;
|
||||
k = pow(2, (answer - initial_estimate) / 2);
|
||||
} while ( erf(k / sqrt(2)) < HLL_CONF );
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
/**
 * Sets up a counter with the given number of buckets: selects the constant
 * alpha_m for that size, allocates the buckets with all entries zero, and
 * records that every bucket is still empty (V == m).
 *
 * @param size The number of buckets. Has to be 16, 32, 64, or at least
 * 128; any other value is reported through reporter->InternalError().
 */
void CardinalityCounter::Init(uint64 size)
	{
	// Validate the size (and pick alpha_m) before allocating anything.
	if ( size == 16 )
		alpha_m = 0.673;

	else if ( size == 32 )
		alpha_m = 0.697;

	else if ( size == 64 )
		alpha_m = 0.709;

	else if ( size >= 128 )
		alpha_m = 0.7213 / (1 + 1.079 / size);

	else
		reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or at least 128", size);

	m = size;

	// The trailing "()" value-initializes the array, so every bucket
	// starts out as 0.
	buckets = new uint8_t[m]();

	// No bucket has been written yet.
	V = m;
	}
|
||||
|
||||
/**
 * Creates a counter sized for the given error margin: the exponent b comes
 * from OptimalB() and the counter is initialized with 2^b buckets.
 *
 * @param error_margin The desired error margin.
 */
CardinalityCounter::CardinalityCounter(double error_margin)
	{
	const int exponent = OptimalB(error_margin);
	const uint64 bucket_count = (uint64) pow(2, exponent);

	Init(bucket_count);
	}
|
||||
|
||||
/**
 * Creates a counter with an explicitly given number of buckets. All
 * setup, including validation of the size, is done by Init().
 *
 * @param size The number of buckets.
 */
CardinalityCounter::CardinalityCounter(uint64 size)
	{
	this->Init(size);
	}
|
||||
|
||||
/**
 * Creates a counter from already known state. The bucket array is
 * allocated but its contents are not initialized here; Unserialize() fills
 * it in after construction.
 *
 * @param arg_size The number of buckets.
 *
 * @param arg_V The number of buckets that are zero.
 *
 * @param arg_alpha_m The multiplicative constant to use.
 */
CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
	: m(arg_size),
	  buckets(new uint8_t[arg_size]),
	  V(arg_V),
	  alpha_m(arg_alpha_m)
	{
	}
|
||||
|
||||
/**
 * Releases the bucket array owned by the counter.
 */
CardinalityCounter::~CardinalityCounter()
	{
	delete [] this->buckets;
	}
|
||||
|
||||
/**
 * Returns the 1-based position of the lowest set bit among the hash bits
 * that remain after the bucket index has been removed.
 *
 * @param hash_modified The hash with the bucket-selecting remainder
 * already subtracted (see AddElement()), i.e., a multiple of m.
 *
 * @return One plus the number of trailing zero bits of hash_modified / m.
 * If no bit is set at all, one plus the number of bits available after
 * removing the bucket index.
 */
uint8_t CardinalityCounter::Rank(uint64 hash_modified)
	{
	// Drop the part of the hash that AddElement() used as bucket index.
	uint64 remaining = hash_modified / m;

	uint8_t rank = 1;

	if ( remaining == 0 )
		{
		// There is no 1 bit to find. Searching for one would never
		// terminate, so report the position just past the last usable
		// bit instead. The largest possible quotient tells us how many
		// usable bits there are.
		for ( uint64 span = ~((uint64) 0) / m; span != 0; span /= 2 )
			++rank;

		return rank;
		}

	while ( remaining % 2 == 0 )
		{
		remaining /= 2;
		++rank;
		}

	return rank;
	}
|
||||
|
||||
/**
 * Adds a hashed element to the counter.
 *
 * The remainder of the hash modulo m selects the bucket; the rank of the
 * rest of the hash is stored there if it exceeds the bucket's current value.
 *
 * @param hash The hash value of the element to add.
 */
void CardinalityCounter::AddElement(uint64 hash)
	{
	const uint64 slot = hash % m;

	// Rank() always returns at least 1, so a bucket that is still zero
	// stops being empty with this call.
	if ( buckets[slot] == 0 )
		V--;

	// Rank() expects the bucket-selecting remainder to be removed.
	const uint8_t rank = Rank(hash - slot);

	if ( buckets[slot] < rank )
		buckets[slot] = rank;
	}
|
||||
|
||||
double CardinalityCounter::Size()
|
||||
{
|
||||
double answer = 0;
|
||||
for ( unsigned int i = 0; i < m; i++ )
|
||||
answer += pow(2, -((int)buckets[i]));
|
||||
|
||||
answer = 1 / answer;
|
||||
answer = (alpha_m * m * m * answer);
|
||||
|
||||
if ( answer <= 5.0 * (m/2) )
|
||||
return m * log((double)(m / V));
|
||||
|
||||
else if ( answer <= (pow(2, 64) / 30) )
|
||||
return answer;
|
||||
|
||||
else
|
||||
return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
|
||||
}
|
||||
|
||||
/**
 * Merges another counter into this one by keeping, for each bucket, the
 * larger of the two values. V is recomputed from the merged buckets.
 *
 * Both counters need to have the same number of buckets. If they do not,
 * an error is reported and this counter is left unchanged.
 *
 * @param c The counter to merge into this one. It is not modified.
 */
void CardinalityCounter::Merge(CardinalityCounter* c)
	{
	// Walking both arrays with our own m would read past the end of the
	// other counter's buckets if it has fewer of them.
	if ( c->GetM() != m )
		{
		reporter->Error("cannot merge HLL cardinality counters with different numbers of buckets");
		return;
		}

	uint8_t* temp = c->GetBuckets();

	V = 0;

	for ( uint64 i = 0; i < m; i++ )
		{
		if ( temp[i] > buckets[i] )
			buckets[i] = temp[i];

		if ( buckets[i] == 0 )
			++V;
		}
	}
|
||||
|
||||
uint8_t* CardinalityCounter::GetBuckets()
|
||||
{
|
||||
return buckets;
|
||||
}
|
||||
|
||||
/**
 * Returns m, the number of buckets of this counter.
 */
uint64 CardinalityCounter::GetM()
	{
	return this->m;
	}
|
||||
|
||||
/**
 * Serializes the counter: first m, V and alpha_m, then each of the m
 * buckets in order.
 *
 * @param info The serialization information to use.
 *
 * @return True if every individual write succeeded.
 */
bool CardinalityCounter::Serialize(SerialInfo* info) const
	{
	bool valid = true;

	// Header fields, in the order in which Unserialize() reads them.
	valid &= SERIALIZE(m);
	valid &= SERIALIZE(V);
	valid &= SERIALIZE(alpha_m);

	// Bucket contents. Writing continues even after a failure; the
	// failure is only reflected in the return value.
	unsigned int i = 0;

	while ( i < m )
		{
		valid &= SERIALIZE(buckets[i]);
		++i;
		}

	return valid;
	}
|
||||
|
||||
/**
 * Unserializes a counter: reads m, V and alpha_m, creates a counter with
 * these values and then reads the m buckets.
 *
 * @param info The serialization information to use.
 *
 * @return A newly allocated counter that the caller owns, or null if any
 * read failed.
 */
CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
	{
	uint64_t m = 0;
	uint64 V = 0;
	double alpha_m = 0;

	bool valid = true;
	valid &= UNSERIALIZE(&m);
	valid &= UNSERIALIZE(&V);
	valid &= UNSERIALIZE(&alpha_m);

	// Without a successfully read header, m is not a usable bucket count,
	// so do not allocate anything based on it.
	if ( ! valid )
		return 0;

	CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);

	uint8_t* buckets = c->buckets;

	for ( uint64_t i = 0; i < m; i++ )
		{
		uint8_t* currbucket = buckets + i;
		valid &= UNSERIALIZE(currbucket);
		}

	if ( ! valid )
		{
		// Do not leak the partially filled counter.
		delete c;
		return 0;
		}

	return c;
	}
|
167
src/probabilistic/HyperLogLog.h
Normal file
167
src/probabilistic/HyperLogLog.h
Normal file
|
@ -0,0 +1,167 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#ifndef hyperloglog_h
|
||||
#define hyperloglog_h
|
||||
|
||||
#include <stdint.h>
|
||||
#include <OpaqueVal.h>
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
/**
|
||||
* A probabilisitc cardinality counter using the HyperLogLog algorithm.
|
||||
*/
|
||||
class CardinalityCounter {
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* Based on the error_margin, the number of buckets that need to be
|
||||
* kept will be determined. Based on the max_size, the number of bits
|
||||
* that will be used from the hash function will be determined.
|
||||
*
|
||||
* We need the hash function to return integers that are uniformly
|
||||
* distributed from 0 to 2^L-1. And if that happens, the maximum
|
||||
* cardinality that this counter can handle is approximately 2^L. By
|
||||
* default, we will assume a value of 64 bits.
|
||||
*/
|
||||
CardinalityCounter(double error_margin);
|
||||
|
||||
/**
|
||||
* Constructor used for cloning.
|
||||
*
|
||||
* The error margin will be 1.04/sqrt(m) with approximately 68%
|
||||
* probability.
|
||||
*/
|
||||
CardinalityCounter(uint64 size);
|
||||
|
||||
/**
|
||||
* Deletes the class variables.
|
||||
*/
|
||||
~CardinalityCounter();
|
||||
|
||||
/**
|
||||
* This will add an element to the counter. It's responsible for
|
||||
* adding an element and updating the value of V, if that applies.
|
||||
*/
|
||||
void AddElement(uint64 hash);
|
||||
|
||||
/**
|
||||
* Returns the size estimate of the set. First, it has the "raw"
|
||||
* HyperLogLog estimate. And then, we check if it's too "large" or
|
||||
* "small" because the raw estimate doesn't do well in those cases.
|
||||
* Thus, we correct for those errors as specified in the paper.
|
||||
*/
|
||||
double Size();
|
||||
|
||||
/**
|
||||
* Returns the buckets array that holds all of the rough cardinality
|
||||
* estimates.
|
||||
*/
|
||||
uint8_t* GetBuckets();
|
||||
|
||||
/**
|
||||
* Merges the argument cardinality counter with this one. The error
|
||||
* margins are assumed to be the same, so they have the same number of
|
||||
* buckets. If any of the conditions are violated, then the return
|
||||
* value of size() is meaningless.
|
||||
*/
|
||||
void Merge(CardinalityCounter* c);
|
||||
|
||||
/**
|
||||
* Returns the value of m. Should be used only for statistical
|
||||
* purposes.
|
||||
*/
|
||||
uint64 GetM();
|
||||
|
||||
/**
|
||||
c * Serializes the cardinality counter.
|
||||
*
|
||||
* @param info The serializaton information to use.
|
||||
*
|
||||
* @return True if successful.
|
||||
*/
|
||||
bool Serialize(SerialInfo* info) const;
|
||||
|
||||
/**
|
||||
* Unserializes a cardinality counter.
|
||||
*
|
||||
* @param info The serializaton information to use.
|
||||
*
|
||||
* @return The unserialized cardinality counter, or null if an error
|
||||
* occured.
|
||||
*/
|
||||
static CardinalityCounter* Unserialize(UnserialInfo* info);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Constructor used when unserializing, i.e., all parameters are
|
||||
* known.
|
||||
*/
|
||||
CardinalityCounter(uint64 size, uint64 V, double alpha_m);
|
||||
|
||||
/**
|
||||
* Helper function with code used jointly by multiple constructors.
|
||||
*/
|
||||
void Init(uint64 arg_size);
|
||||
|
||||
/**
|
||||
* This function will calculate the smallest value of b that will
|
||||
* satisfy these the constraints of a specified error margin and
|
||||
* confidence level.
|
||||
*
|
||||
* The exact expression for b is as follows:
|
||||
* Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x
|
||||
*
|
||||
* error is the error margin.
|
||||
*
|
||||
* k is the number of standard deviations that we have to go to have
|
||||
* a confidence level of conf.
|
||||
*/
|
||||
int OptimalB(double error);
|
||||
|
||||
/**
|
||||
* Computes when the first one appears in the element. It looks at the
|
||||
* bitstring from the end though. A precondition is that the argument
|
||||
* is already divisible by m, so we just ignore the last b bits, since
|
||||
* m = 2^b and the last b bits will always be 0.
|
||||
*/
|
||||
uint8_t Rank(uint64 hash_modified);
|
||||
|
||||
/**
|
||||
* Confidence in the estimate given by a cardinality counter is.
|
||||
*
|
||||
* In other words, if the cardinality is estimated to be 100 with 2%
|
||||
* error margin and HLL_CONFis 0.95, then we are 95% sure that the
|
||||
* actual cardinality is between 98 and 102.
|
||||
*/
|
||||
static const double HLL_CONF = .95;
|
||||
|
||||
/**
|
||||
* This is the number of buckets that will be stored. The standard
|
||||
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
||||
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
|
||||
*/
|
||||
uint64 m;
|
||||
|
||||
/**
|
||||
* These are the actual buckets that are storing an estimate of the
|
||||
* cardinality. All these need to do is count when the first 1 bit
|
||||
* appears in the bitstring and that location is at most 65, so not
|
||||
* that many bits are needed to store it.
|
||||
*/
|
||||
uint8_t* buckets;
|
||||
|
||||
/**
|
||||
* There are some state constants that need to be kept track of to
|
||||
* make the final estimate easier. V is the number of values in
|
||||
* buckets that are 0 and this is used in the small error correction.
|
||||
* alpha_m is a multiplicative constant used in the algorithm.
|
||||
*/
|
||||
uint64 V;
|
||||
double alpha_m;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
137
src/probabilistic/hyper-loglog.bif
Normal file
137
src/probabilistic/hyper-loglog.bif
Normal file
|
@ -0,0 +1,137 @@
|
|||
# ===========================================================================
|
||||
#
|
||||
# HyperLogLog Functions
|
||||
#
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
%%{
|
||||
#include "probabilistic/HyperLogLog.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
%%}
|
||||
|
||||
module GLOBAL;
|
||||
|
||||
## Creates a new probabilistic cardinality counter based on the HyperLogLog
## algorithm.
##
## err: the desired error rate (e.g. 0.01).
##
## Returns: a HLL cardinality handle.
##
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add
##    hll_cardinality_copy
function hll_cardinality_init%(err: double%): opaque of cardinality
	%{
	// The opaque value wraps the freshly created counter.
	return new CardinalityVal(new CardinalityCounter(err));
	%}
|
||||
|
||||
## Adds an element to a HyperLogLog cardinality counter.
##
## handle: the HLL handle.
##
## elem: the element to add.
##
## Returns: true on success; false if the type of the counter could not be
##          set or does not match the type of the element.
##
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into
##    hll_cardinality_init hll_cardinality_copy
function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool
	%{
	CardinalityVal* counter = static_cast<CardinalityVal*>(handle);

	// A counter without a type is typified with the element's type first.
	if ( ! counter->Type() && ! counter->Typify(elem->Type()) )
		{
		reporter->Error("failed to set HLL type");
		return new Val(0, TYPE_BOOL);
		}

	// The previous branch returns, so this check only runs for counters
	// that have a type by now.
	if ( ! same_type(counter->Type(), elem->Type()) )
		{
		reporter->Error("incompatible HLL data type");
		return new Val(0, TYPE_BOOL);
		}

	counter->Add(elem);

	return new Val(1, TYPE_BOOL);
	%}
|
||||
|
||||
## Merges a HLL cardinality counter into another.
##
## .. note:: The same restrictions as for Bloom filter merging apply, see
##    :bro:id:`bloomfilter_merge`. In addition, both counters need to have
##    the same number of buckets, i.e., need to have been created with the
##    same error rate.
##
## handle1: the first HLL handle, which will contain the merged result
##
## handle2: the second HLL handle, which will be merged into the first
##
## Returns: true on success; false if the two counters have incompatible
##          types or different numbers of buckets.
##
## .. bro:see:: hll_cardinality_estimate  hll_cardinality_add
##    hll_cardinality_init hll_cardinality_copy
function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: opaque of cardinality%): bool
	%{
	CardinalityVal* v1 = static_cast<CardinalityVal*>(handle1);
	CardinalityVal* v2 = static_cast<CardinalityVal*>(handle2);

	if ( (v1->Type() != v2->Type()) && // both 0 is ok
	     (v1->Type() != 0) && // any one 0 also is ok
	     (v2->Type() != 0) &&
	     ! same_type(v1->Type(), v2->Type()) )
		{
		reporter->Error("incompatible HLL types");
		return new Val(0, TYPE_BOOL);
		}

	CardinalityCounter* h1 = v1->Get();
	CardinalityCounter* h2 = v2->Get();

	// Merging walks both bucket arrays in lockstep, so the counters must
	// have the same number of buckets.
	if ( h1->GetM() != h2->GetM() )
		{
		reporter->Error("cannot merge HLL cardinality counters with different numbers of buckets");
		return new Val(0, TYPE_BOOL);
		}

	h1->Merge(h2);

	return new Val(1, TYPE_BOOL);
	%}
|
||||
|
||||
## Estimate the current cardinality of an HLL cardinality counter.
##
## handle: the HLL handle
##
## Returns: the cardinality estimate. Returns 0.0 if the counter is empty.
##
## .. bro:see:: hll_cardinality_merge_into hll_cardinality_add
##    hll_cardinality_init hll_cardinality_copy
function hll_cardinality_estimate%(handle: opaque of cardinality%): double
	%{
	CardinalityVal* cv = static_cast<CardinalityVal*>(handle);
	CardinalityCounter* h = cv->Get();

	// The estimate, including the range corrections, is computed by the
	// counter itself.
	double estimate = h->Size();

	return new Val(estimate, TYPE_DOUBLE);
	%}
|
||||
|
||||
## Copy a HLL cardinality counter.
##
## handle: cardinality counter to copy
##
## Returns: copy of handle
##
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add
##    hll_cardinality_init
function hll_cardinality_copy%(handle: opaque of cardinality%): opaque of cardinality
	%{
	CardinalityVal* cv = static_cast<CardinalityVal*>(handle);
	CardinalityCounter* h = cv->Get();

	// Create an empty counter with the same number of buckets and merge
	// the original into it; that leaves the new counter with the same
	// bucket values.
	uint64_t m = h->GetM();
	CardinalityCounter* h2 = new CardinalityCounter(m);

	h2->Merge(h);
	CardinalityVal* out = new CardinalityVal(h2);

	return out;
	%}
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue