Making the confidence configurable.

This commit is contained in:
Robin Sommer 2013-08-31 10:23:00 -07:00
parent fb3ceae6d5
commit 295987c8d0
8 changed files with 49 additions and 40 deletions

View file

@ -4,8 +4,11 @@ module SumStats;
export { export {
redef record Reducer += { redef record Reducer += {
## The threshold when we switch to hll ## The error margin for HLL.
hll_error_margin: double &default=0.01; hll_error_margin: double &default=0.01;
## The confidence for HLL.
hll_confidence: double &default=0.95;
}; };
redef enum Calculation += { redef enum Calculation += {
@ -26,8 +29,9 @@ redef record ResultVal += {
# specialized bifs. # specialized bifs.
card: opaque of cardinality &optional; card: opaque of cardinality &optional;
# We need this in the compose hook. # We need these in the compose hook.
hll_error_margin: double &optional; hll_error_margin: double &optional;
hll_confidence: double &optional;
}; };
hook register_observe_plugins() hook register_observe_plugins()
@ -36,8 +40,9 @@ hook register_observe_plugins()
{ {
if ( ! rv?$card ) if ( ! rv?$card )
{ {
rv$card = hll_cardinality_init(r$hll_error_margin); rv$card = hll_cardinality_init(r$hll_error_margin, r$hll_confidence);
rv$hll_error_margin = r$hll_error_margin; rv$hll_error_margin = r$hll_error_margin;
rv$hll_confidence = r$hll_confidence;
rv$hll_unique = 0; rv$hll_unique = 0;
} }
@ -48,7 +53,7 @@ hook register_observe_plugins()
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{ {
local rhll = hll_cardinality_init(rv1$hll_error_margin); local rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence);
hll_cardinality_merge_into(rhll, rv1$card); hll_cardinality_merge_into(rhll, rv1$card);
hll_cardinality_merge_into(rhll, rv2$card); hll_cardinality_merge_into(rhll, rv2$card);

View file

@ -5,7 +5,7 @@
#include "Reporter.h" #include "Reporter.h"
#include "Serializer.h" #include "Serializer.h"
#include "probabilistic/BloomFilter.h" #include "probabilistic/BloomFilter.h"
#include "probabilistic/HyperLogLog.h" #include "probabilistic/CardinalityCounter.h"
bool HashVal::IsValid() const bool HashVal::IsValid() const
{ {

View file

@ -10,7 +10,7 @@
using namespace probabilistic; using namespace probabilistic;
int CardinalityCounter::OptimalB(double error) int CardinalityCounter::OptimalB(double error, double confidence)
{ {
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2); double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
int answer = (int) floor(initial_estimate); int answer = (int) floor(initial_estimate);
@ -20,7 +20,7 @@ int CardinalityCounter::OptimalB(double error)
do { do {
answer++; answer++;
k = pow(2, (answer - initial_estimate) / 2); k = pow(2, (answer - initial_estimate) / 2);
} while ( erf(k / sqrt(2)) < HLL_CONF ); } while ( erf(k / sqrt(2)) < confidence );
return answer; return answer;
} }
@ -30,6 +30,9 @@ void CardinalityCounter::Init(uint64 size)
m = size; m = size;
buckets = new uint8_t[m]; buckets = new uint8_t[m];
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
if ( m == 16 ) if ( m == 16 )
alpha_m = 0.673; alpha_m = 0.673;
@ -51,9 +54,9 @@ void CardinalityCounter::Init(uint64 size)
V = m; V = m;
} }
CardinalityCounter::CardinalityCounter(double error_margin) CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
{ {
int b = OptimalB(error_margin); int b = OptimalB(error_margin, confidence);
Init((uint64) pow(2, b)); Init((uint64) pow(2, b));
} }

View file

@ -14,18 +14,24 @@ namespace probabilistic {
class CardinalityCounter { class CardinalityCounter {
public: public:
/** /**
* Constructor. * Constructor.
* *
* Based on the error_margin, the number of buckets that need to be * Based on the error_margin, the number of buckets that need to be
* kept will be determined. Based on the max_size, the number of bits * kept will be determined. Based on the max_size, the number of bits
* that will be used from the hash function will be determined. * that will be used from the hash function will be determined.
* *
* We need the hash function to return integers that are uniformly * We need the hash function to return integers that are uniformly
* distributed from 0 to 2^L-1. And if that happens, the maximum * distributed from 0 to 2^L-1. And if that happens, the maximum
* cardinality that this counter can handle is approximately 2^L. By * cardinality that this counter can handle is approximately 2^L. By
* default, we will assume a value of 64 bits. * default, we will assume a value of 64 bits.
*/ *
CardinalityCounter(double error_margin); * The confidence is the probability that the actual cardinality lies within the error margin of the estimate.
*
* In other words, if the cardinality is estimated to be 100 with 2%
* error margin and the confidence is 0.95, then we are 95% sure that the
* actual cardinality is between 98 and 102.
*/
CardinalityCounter(double error_margin, double confidence = 0.95);
/** /**
* Constructor used for cloning. * Constructor used for cloning.
@ -117,8 +123,10 @@ private:
* *
* k is the number of standard deviations that we have to go to have * k is the number of standard deviations that we have to go to have
* a confidence level of conf. * a confidence level of conf.
*
* confidence: the confidence level (e.g., 0.95) that the bucket count must satisfy for the given error.
*/ */
int OptimalB(double error); int OptimalB(double error, double confidence);
/** /**
* Computes when the first one appears in the element. It looks at the * Computes when the first one appears in the element. It looks at the
@ -128,15 +136,6 @@ private:
*/ */
uint8_t Rank(uint64 hash_modified); uint8_t Rank(uint64 hash_modified);
/**
* Confidence in the estimate given by a cardinality counter is.
*
* In other words, if the cardinality is estimated to be 100 with 2%
* error margin and HLL_CONFis 0.95, then we are 95% sure that the
* actual cardinality is between 98 and 102.
*/
static const double HLL_CONF = .95;
/** /**
* This is the number of buckets that will be stored. The standard * This is the number of buckets that will be stored. The standard
* error is 1.04/sqrt(m), so the actual cardinality will be the * error is 1.04/sqrt(m), so the actual cardinality will be the

View file

@ -16,14 +16,16 @@ module GLOBAL;
## Initializes a probabilistic cardinality counter that uses the HyperLogLog algorithm. ## Initializes a probabilistic cardinality counter that uses the HyperLogLog algorithm.
## ##
## err: the desired error rate (e.g. 0.01). ## err: the desired error rate (e.g. 0.01).
##
## confidence: the desired confidence for the error rate (e.g., 0.95).
## ##
## Returns: a HLL cardinality handle. ## Returns: a HLL cardinality handle.
## ##
## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add ## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add
## hll_cardinality_copy ## hll_cardinality_copy
function hll_cardinality_init%(err: double%): opaque of cardinality function hll_cardinality_init%(err: double, confidence: double%): opaque of cardinality
%{ %{
CardinalityCounter* c = new CardinalityCounter(err); CardinalityCounter* c = new CardinalityCounter(err, confidence);
CardinalityVal* cv = new CardinalityVal(c); CardinalityVal* cv = new CardinalityVal(c);
return cv; return cv;

View file

@ -5,8 +5,8 @@
event bro_init() event bro_init()
{ {
local c1 = hll_cardinality_init(0.01); local c1 = hll_cardinality_init(0.01, 0.95);
local c2 = hll_cardinality_init(0.01); local c2 = hll_cardinality_init(0.01, 0.95);
local add1 = 2001; local add1 = 2001;
local add2 = 2002; local add2 = 2002;
@ -46,7 +46,7 @@ event bro_init()
print "This value should be about 12:"; print "This value should be about 12:";
print hll_cardinality_estimate(c2); print hll_cardinality_estimate(c2);
local m2 = hll_cardinality_init(0.02); local m2 = hll_cardinality_init(0.02, 0.95);
print "This value should be around 0:"; print "This value should be around 0:";
print hll_cardinality_estimate(m2); print hll_cardinality_estimate(m2);
@ -56,7 +56,7 @@ event bro_init()
print "This value should be around 13:"; print "This value should be around 13:";
print hll_cardinality_estimate(c3); print hll_cardinality_estimate(c3);
c3 = hll_cardinality_init(0.01); c3 = hll_cardinality_init(0.01, 0.95);
print "This value should be 0:"; print "This value should be 0:";
print hll_cardinality_estimate(c3); print hll_cardinality_estimate(c3);

View file

@ -36,7 +36,7 @@ global runnumber: count &redef; # differentiate runs
event remote_connection_handshake_done(p: event_peer) event remote_connection_handshake_done(p: event_peer)
{ {
local c = hll_cardinality_init(0.01); local c = hll_cardinality_init(0.01, 0.95);
local add1 = 2001; local add1 = 2001;
local add2 = 2002; local add2 = 2002;
@ -92,7 +92,7 @@ global hll: opaque of cardinality;
event bro_init() event bro_init()
{ {
hll = hll_cardinality_init(0.01); hll = hll_cardinality_init(0.01, 0.95);
} }
event hll_data(data: opaque of cardinality) event hll_data(data: opaque of cardinality)

View file

@ -13,7 +13,7 @@ event bro_init()
if ( runnumber == 1 ) if ( runnumber == 1 )
{ {
card = hll_cardinality_init(0.01); card = hll_cardinality_init(0.01, 0.95);
hll_cardinality_add(card, "a"); hll_cardinality_add(card, "a");
hll_cardinality_add(card, "b"); hll_cardinality_add(card, "b");