mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Making the confidence configurable.
This commit is contained in:
parent
fb3ceae6d5
commit
295987c8d0
8 changed files with 49 additions and 40 deletions
|
@ -4,8 +4,11 @@ module SumStats;
|
||||||
|
|
||||||
export {
|
export {
|
||||||
redef record Reducer += {
|
redef record Reducer += {
|
||||||
## The threshold when we switch to hll
|
## The error margin for HLL.
|
||||||
hll_error_margin: double &default=0.01;
|
hll_error_margin: double &default=0.01;
|
||||||
|
|
||||||
|
## The confidence for HLL.
|
||||||
|
hll_confidence: double &default=0.95;
|
||||||
};
|
};
|
||||||
|
|
||||||
redef enum Calculation += {
|
redef enum Calculation += {
|
||||||
|
@ -26,8 +29,9 @@ redef record ResultVal += {
|
||||||
# specialized bifs.
|
# specialized bifs.
|
||||||
card: opaque of cardinality &optional;
|
card: opaque of cardinality &optional;
|
||||||
|
|
||||||
# We need this in the compose hook.
|
# We need these in the compose hook.
|
||||||
hll_error_margin: double &optional;
|
hll_error_margin: double &optional;
|
||||||
|
hll_confidence: double &optional;
|
||||||
};
|
};
|
||||||
|
|
||||||
hook register_observe_plugins()
|
hook register_observe_plugins()
|
||||||
|
@ -36,8 +40,9 @@ hook register_observe_plugins()
|
||||||
{
|
{
|
||||||
if ( ! rv?$card )
|
if ( ! rv?$card )
|
||||||
{
|
{
|
||||||
rv$card = hll_cardinality_init(r$hll_error_margin);
|
rv$card = hll_cardinality_init(r$hll_error_margin, r$hll_confidence);
|
||||||
rv$hll_error_margin = r$hll_error_margin;
|
rv$hll_error_margin = r$hll_error_margin;
|
||||||
|
rv$hll_confidence = r$hll_confidence;
|
||||||
rv$hll_unique = 0;
|
rv$hll_unique = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,7 +53,7 @@ hook register_observe_plugins()
|
||||||
|
|
||||||
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
|
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
|
||||||
{
|
{
|
||||||
local rhll = hll_cardinality_init(rv1$hll_error_margin);
|
local rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence);
|
||||||
hll_cardinality_merge_into(rhll, rv1$card);
|
hll_cardinality_merge_into(rhll, rv1$card);
|
||||||
hll_cardinality_merge_into(rhll, rv2$card);
|
hll_cardinality_merge_into(rhll, rv2$card);
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
#include "Reporter.h"
|
#include "Reporter.h"
|
||||||
#include "Serializer.h"
|
#include "Serializer.h"
|
||||||
#include "probabilistic/BloomFilter.h"
|
#include "probabilistic/BloomFilter.h"
|
||||||
#include "probabilistic/HyperLogLog.h"
|
#include "probabilistic/CardinalityCounter.h"
|
||||||
|
|
||||||
bool HashVal::IsValid() const
|
bool HashVal::IsValid() const
|
||||||
{
|
{
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
|
|
||||||
using namespace probabilistic;
|
using namespace probabilistic;
|
||||||
|
|
||||||
int CardinalityCounter::OptimalB(double error)
|
int CardinalityCounter::OptimalB(double error, double confidence)
|
||||||
{
|
{
|
||||||
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
||||||
int answer = (int) floor(initial_estimate);
|
int answer = (int) floor(initial_estimate);
|
||||||
|
@ -20,7 +20,7 @@ int CardinalityCounter::OptimalB(double error)
|
||||||
do {
|
do {
|
||||||
answer++;
|
answer++;
|
||||||
k = pow(2, (answer - initial_estimate) / 2);
|
k = pow(2, (answer - initial_estimate) / 2);
|
||||||
} while ( erf(k / sqrt(2)) < HLL_CONF );
|
} while ( erf(k / sqrt(2)) < confidence );
|
||||||
|
|
||||||
return answer;
|
return answer;
|
||||||
}
|
}
|
||||||
|
@ -30,6 +30,9 @@ void CardinalityCounter::Init(uint64 size)
|
||||||
m = size;
|
m = size;
|
||||||
buckets = new uint8_t[m];
|
buckets = new uint8_t[m];
|
||||||
|
|
||||||
|
// The following magic values are taken directly out of the
|
||||||
|
// description of the HyperLogLog algorithm.
|
||||||
|
|
||||||
if ( m == 16 )
|
if ( m == 16 )
|
||||||
alpha_m = 0.673;
|
alpha_m = 0.673;
|
||||||
|
|
||||||
|
@ -51,9 +54,9 @@ void CardinalityCounter::Init(uint64 size)
|
||||||
V = m;
|
V = m;
|
||||||
}
|
}
|
||||||
|
|
||||||
CardinalityCounter::CardinalityCounter(double error_margin)
|
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
|
||||||
{
|
{
|
||||||
int b = OptimalB(error_margin);
|
int b = OptimalB(error_margin, confidence);
|
||||||
Init((uint64) pow(2, b));
|
Init((uint64) pow(2, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,18 +14,24 @@ namespace probabilistic {
|
||||||
class CardinalityCounter {
|
class CardinalityCounter {
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Constructor.
|
* Constructor.
|
||||||
*
|
*
|
||||||
* Based on the error_margin, the number of buckets that need to be
|
* Based on the error_margin, the number of buckets that need to be
|
||||||
* kept will be determined. Based on the max_size, the number of bits
|
* kept will be determined. Based on the max_size, the number of bits
|
||||||
* that will be used from the hash function will be determined.
|
* that will be used from the hash function will be determined.
|
||||||
*
|
*
|
||||||
* We need the hash function to return integers that are uniformly
|
* We need the hash function to return integers that are uniformly
|
||||||
* distributed from 0 to 2^L-1. And if that happens, the maximum
|
* distributed from 0 to 2^L-1. And if that happens, the maximum
|
||||||
* cardinality that this counter can handle is approximately 2^L. By
|
* cardinality that this counter can handle is approximately 2^L. By
|
||||||
* default, we will assume a value of 64 bits.
|
* default, we will assume a value of 64 bits.
|
||||||
*/
|
*
|
||||||
CardinalityCounter(double error_margin);
|
* The confidence is the probability that the estimate given by a cardinality counter lies within the error margin.
|
||||||
|
*
|
||||||
|
* In other words, if the cardinality is estimated to be 100 with 2%
|
||||||
|
* error margin and the confidence is 0.95, then we are 95% sure that the
|
||||||
|
* actual cardinality is between 98 and 102.
|
||||||
|
*/
|
||||||
|
CardinalityCounter(double error_margin, double confidence = 0.95);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor used for cloning.
|
* Constructor used for cloning.
|
||||||
|
@ -117,8 +123,10 @@ private:
|
||||||
*
|
*
|
||||||
* k is the number of standard deviations that we have to go to have
|
* k is the number of standard deviations that we have to go to have
|
||||||
* a confidence level of conf.
|
* a confidence level of conf.
|
||||||
|
*
|
||||||
|
* confidence: the desired confidence level for the error margin (e.g., 0.95).
|
||||||
*/
|
*/
|
||||||
int OptimalB(double error);
|
int OptimalB(double error, double confidence);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes when the first one appears in the element. It looks at the
|
* Computes when the first one appears in the element. It looks at the
|
||||||
|
@ -128,15 +136,6 @@ private:
|
||||||
*/
|
*/
|
||||||
uint8_t Rank(uint64 hash_modified);
|
uint8_t Rank(uint64 hash_modified);
|
||||||
|
|
||||||
/**
|
|
||||||
* The confidence is the probability that the estimate given by a cardinality counter lies within the error margin.
|
|
||||||
*
|
|
||||||
* In other words, if the cardinality is estimated to be 100 with 2%
|
|
||||||
* error margin and HLL_CONF is 0.95, then we are 95% sure that the
|
|
||||||
* actual cardinality is between 98 and 102.
|
|
||||||
*/
|
|
||||||
static const double HLL_CONF = .95;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the number of buckets that will be stored. The standard
|
* This is the number of buckets that will be stored. The standard
|
||||||
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
||||||
|
|
|
@ -16,14 +16,16 @@ module GLOBAL;
|
||||||
## Initializes a probabilistic cardinality counter that uses the HyperLogLog algorithm.
|
## Initializes a probabilistic cardinality counter that uses the HyperLogLog algorithm.
|
||||||
##
|
##
|
||||||
## err: the desired error rate (e.g. 0.01).
|
## err: the desired error rate (e.g. 0.01).
|
||||||
|
##
|
||||||
|
## confidence: the desired confidence for the error rate (e.g., 0.95).
|
||||||
##
|
##
|
||||||
## Returns: a HLL cardinality handle.
|
## Returns: a HLL cardinality handle.
|
||||||
##
|
##
|
||||||
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add
|
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add
|
||||||
## hll_cardinality_copy
|
## hll_cardinality_copy
|
||||||
function hll_cardinality_init%(err: double%): opaque of cardinality
|
function hll_cardinality_init%(err: double, confidence: double%): opaque of cardinality
|
||||||
%{
|
%{
|
||||||
CardinalityCounter* c = new CardinalityCounter(err);
|
CardinalityCounter* c = new CardinalityCounter(err, confidence);
|
||||||
CardinalityVal* cv = new CardinalityVal(c);
|
CardinalityVal* cv = new CardinalityVal(c);
|
||||||
|
|
||||||
return cv;
|
return cv;
|
||||||
|
|
|
@ -5,8 +5,8 @@
|
||||||
|
|
||||||
event bro_init()
|
event bro_init()
|
||||||
{
|
{
|
||||||
local c1 = hll_cardinality_init(0.01);
|
local c1 = hll_cardinality_init(0.01, 0.95);
|
||||||
local c2 = hll_cardinality_init(0.01);
|
local c2 = hll_cardinality_init(0.01, 0.95);
|
||||||
|
|
||||||
local add1 = 2001;
|
local add1 = 2001;
|
||||||
local add2 = 2002;
|
local add2 = 2002;
|
||||||
|
@ -46,7 +46,7 @@ event bro_init()
|
||||||
print "This value should be about 12:";
|
print "This value should be about 12:";
|
||||||
print hll_cardinality_estimate(c2);
|
print hll_cardinality_estimate(c2);
|
||||||
|
|
||||||
local m2 = hll_cardinality_init(0.02);
|
local m2 = hll_cardinality_init(0.02, 0.95);
|
||||||
|
|
||||||
print "This value should be around 0:";
|
print "This value should be around 0:";
|
||||||
print hll_cardinality_estimate(m2);
|
print hll_cardinality_estimate(m2);
|
||||||
|
@ -56,7 +56,7 @@ event bro_init()
|
||||||
print "This value should be around 13:";
|
print "This value should be around 13:";
|
||||||
print hll_cardinality_estimate(c3);
|
print hll_cardinality_estimate(c3);
|
||||||
|
|
||||||
c3 = hll_cardinality_init(0.01);
|
c3 = hll_cardinality_init(0.01, 0.95);
|
||||||
print "This value should be 0:";
|
print "This value should be 0:";
|
||||||
print hll_cardinality_estimate(c3);
|
print hll_cardinality_estimate(c3);
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ global runnumber: count &redef; # differentiate runs
|
||||||
|
|
||||||
event remote_connection_handshake_done(p: event_peer)
|
event remote_connection_handshake_done(p: event_peer)
|
||||||
{
|
{
|
||||||
local c = hll_cardinality_init(0.01);
|
local c = hll_cardinality_init(0.01, 0.95);
|
||||||
|
|
||||||
local add1 = 2001;
|
local add1 = 2001;
|
||||||
local add2 = 2002;
|
local add2 = 2002;
|
||||||
|
@ -92,7 +92,7 @@ global hll: opaque of cardinality;
|
||||||
|
|
||||||
event bro_init()
|
event bro_init()
|
||||||
{
|
{
|
||||||
hll = hll_cardinality_init(0.01);
|
hll = hll_cardinality_init(0.01, 0.95);
|
||||||
}
|
}
|
||||||
|
|
||||||
event hll_data(data: opaque of cardinality)
|
event hll_data(data: opaque of cardinality)
|
||||||
|
|
|
@ -13,7 +13,7 @@ event bro_init()
|
||||||
|
|
||||||
if ( runnumber == 1 )
|
if ( runnumber == 1 )
|
||||||
{
|
{
|
||||||
card = hll_cardinality_init(0.01);
|
card = hll_cardinality_init(0.01, 0.95);
|
||||||
|
|
||||||
hll_cardinality_add(card, "a");
|
hll_cardinality_add(card, "a");
|
||||||
hll_cardinality_add(card, "b");
|
hll_cardinality_add(card, "b");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue