mirror of
https://github.com/zeek/zeek.git
synced 2025-10-06 08:38:20 +00:00
Making the confidence configurable.
This commit is contained in:
parent
fb3ceae6d5
commit
295987c8d0
8 changed files with 49 additions and 40 deletions
|
@ -10,7 +10,7 @@
|
|||
|
||||
using namespace probabilistic;
|
||||
|
||||
int CardinalityCounter::OptimalB(double error)
|
||||
int CardinalityCounter::OptimalB(double error, double confidence)
|
||||
{
|
||||
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
||||
int answer = (int) floor(initial_estimate);
|
||||
|
@ -20,7 +20,7 @@ int CardinalityCounter::OptimalB(double error)
|
|||
do {
|
||||
answer++;
|
||||
k = pow(2, (answer - initial_estimate) / 2);
|
||||
} while ( erf(k / sqrt(2)) < HLL_CONF );
|
||||
} while ( erf(k / sqrt(2)) < confidence );
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
@ -30,6 +30,9 @@ void CardinalityCounter::Init(uint64 size)
|
|||
m = size;
|
||||
buckets = new uint8_t[m];
|
||||
|
||||
// The following magic values are taken directly out of the
|
||||
// description of the HyperLogLog algorithm.
|
||||
|
||||
if ( m == 16 )
|
||||
alpha_m = 0.673;
|
||||
|
||||
|
@ -51,9 +54,9 @@ void CardinalityCounter::Init(uint64 size)
|
|||
V = m;
|
||||
}
|
||||
|
||||
CardinalityCounter::CardinalityCounter(double error_margin)
|
||||
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
|
||||
{
|
||||
int b = OptimalB(error_margin);
|
||||
int b = OptimalB(error_margin, confidence);
|
||||
Init((uint64) pow(2, b));
|
||||
}
|
||||
|
||||
|
|
|
@ -14,18 +14,24 @@ namespace probabilistic {
|
|||
class CardinalityCounter {
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* Based on the error_margin, the number of buckets that need to be
|
||||
* kept will be determined. Based on the max_size, the number of bits
|
||||
* that will be used from the hash function will be determined.
|
||||
*
|
||||
* We need the hash function to return integers that are uniformly
|
||||
* distributed from 0 to 2^L-1. And if that happens, the maximum
|
||||
* cardinality that this counter can handle is approximately 2^L. By
|
||||
* default, we will assume a value of 64 bits.
|
||||
*/
|
||||
CardinalityCounter(double error_margin);
|
||||
* Constructor.
|
||||
*
|
||||
* Based on the error_margin, the number of buckets that need to be
|
||||
* kept will be determined. Based on the max_size, the number of bits
|
||||
* that will be used from the hash function will be determined.
|
||||
*
|
||||
* We need the hash function to return integers that are uniformly
|
||||
* distributed from 0 to 2^L-1. And if that happens, the maximum
|
||||
* cardinality that this counter can handle is approximately 2^L. By
|
||||
* default, we will assume a value of 64 bits.
|
||||
*
|
||||
* The confidence is the probability that the actual cardinality lies
* within the given error margin of the estimate.
|
||||
*
|
||||
* In other words, if the cardinality is estimated to be 100 with 2%
|
||||
* error margin and confidence is 0.95, then we are 95% sure that the
|
||||
* actual cardinality is between 98 and 102.
|
||||
*/
|
||||
CardinalityCounter(double error_margin, double confidence = 0.95);
|
||||
|
||||
/**
|
||||
* Constructor used for cloning.
|
||||
|
@ -117,8 +123,10 @@ private:
|
|||
*
|
||||
* k is the number of standard deviations that we have to go to have
|
||||
* a confidence level of conf.
|
||||
*
|
||||
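* confidence: the confidence level (e.g., 0.95) that the error bound
* has to hold with; b is increased until this level is reached.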
|
||||
*/
|
||||
int OptimalB(double error);
|
||||
int OptimalB(double error, double confidence);
|
||||
|
||||
/**
|
||||
* Computes when the first one appears in the element. It looks at the
|
||||
|
@ -128,15 +136,6 @@ private:
|
|||
*/
|
||||
uint8_t Rank(uint64 hash_modified);
|
||||
|
||||
/**
|
||||
* Confidence in the estimate given by a cardinality counter is.
|
||||
*
|
||||
* In other words, if the cardinality is estimated to be 100 with 2%
|
||||
* error margin and HLL_CONFis 0.95, then we are 95% sure that the
|
||||
* actual cardinality is between 98 and 102.
|
||||
*/
|
||||
static const double HLL_CONF = .95;
|
||||
|
||||
/**
|
||||
* This is the number of buckets that will be stored. The standard
|
||||
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
||||
|
|
|
@ -16,14 +16,16 @@ module GLOBAL;
|
|||
## Initializes a probabilistic cardinality counter that uses the HyperLogLog algorithm.
|
||||
##
|
||||
## err: the desired error rate (e.g. 0.01).
|
||||
##
|
||||
## confidence: the desired confidence for the error rate (e.g., 0.95).
|
||||
##
|
||||
## Returns: a HLL cardinality handle.
|
||||
##
|
||||
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add
|
||||
## hll_cardinality_copy
|
||||
function hll_cardinality_init%(err: double%): opaque of cardinality
|
||||
function hll_cardinality_init%(err: double, confidence: double%): opaque of cardinality
|
||||
%{
|
||||
CardinalityCounter* c = new CardinalityCounter(err);
|
||||
CardinalityCounter* c = new CardinalityCounter(err, confidence);
|
||||
CardinalityVal* cv = new CardinalityVal(c);
|
||||
|
||||
return cv;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue