diff --git a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro index 56595ca434..1110e6e105 100644 --- a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro +++ b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro @@ -4,8 +4,11 @@ module SumStats; export { redef record Reducer += { - ## The threshold when we switch to hll + ## The error margin for HLL. hll_error_margin: double &default=0.01; + + ## The confidence for HLL. + hll_confidence: double &default=0.95; }; redef enum Calculation += { @@ -26,8 +29,9 @@ redef record ResultVal += { # specialized bifs. card: opaque of cardinality &optional; - # We need this in the compose hook. + # We need these in the compose hook. hll_error_margin: double &optional; + hll_confidence: double &optional; }; hook register_observe_plugins() @@ -36,8 +40,9 @@ hook register_observe_plugins() { if ( ! rv?$card ) { - rv$card = hll_cardinality_init(r$hll_error_margin); + rv$card = hll_cardinality_init(r$hll_error_margin, r$hll_confidence); rv$hll_error_margin = r$hll_error_margin; + rv$hll_confidence = r$hll_confidence; rv$hll_unique = 0; } @@ -48,7 +53,7 @@ hook register_observe_plugins() hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) { - local rhll = hll_cardinality_init(rv1$hll_error_margin); + local rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence); hll_cardinality_merge_into(rhll, rv1$card); hll_cardinality_merge_into(rhll, rv2$card); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index d9999c148b..b7ccd770ce 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -5,7 +5,7 @@ #include "Reporter.h" #include "Serializer.h" #include "probabilistic/BloomFilter.h" -#include "probabilistic/HyperLogLog.h" +#include "probabilistic/CardinalityCounter.h" bool HashVal::IsValid() const { diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index 5f6b60d0a7..91686bbb69 
100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -10,7 +10,7 @@ using namespace probabilistic; -int CardinalityCounter::OptimalB(double error) +int CardinalityCounter::OptimalB(double error, double confidence) { double initial_estimate = 2 * (log(1.04) - log(error)) / log(2); int answer = (int) floor(initial_estimate); @@ -20,7 +20,7 @@ int CardinalityCounter::OptimalB(double error) do { answer++; k = pow(2, (answer - initial_estimate) / 2); - } while ( erf(k / sqrt(2)) < HLL_CONF ); + } while ( erf(k / sqrt(2)) < confidence ); return answer; } @@ -30,6 +30,9 @@ void CardinalityCounter::Init(uint64 size) m = size; buckets = new uint8_t[m]; + // The following magic values are taken directly out of the + // description of the HyperLogLog algorithm. + if ( m == 16 ) alpha_m = 0.673; @@ -51,9 +54,9 @@ void CardinalityCounter::Init(uint64 size) V = m; } -CardinalityCounter::CardinalityCounter(double error_margin) +CardinalityCounter::CardinalityCounter(double error_margin, double confidence) { - int b = OptimalB(error_margin); + int b = OptimalB(error_margin, confidence); Init((uint64) pow(2, b)); } diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index 909de7a153..6e1c444d95 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -14,18 +14,24 @@ namespace probabilistic { class CardinalityCounter { public: /** - * Constructor. - * - * Based on the error_margin, the number of buckets that need to be - * kept will be determined. Based on the max_size, the number of bits - * that will be used from the hash function will be determined. - * - * We need the hash function to return integers that are uniformly - * distributed from 0 to 2^L-1. And if that happens, the maximum - * cardinality that this counter can handle is approximately 2^L. By - * default, we will assume a value of 64 bits. 
- */ - CardinalityCounter(double error_margin); + * Constructor. + * + * Based on the error_margin, the number of buckets that need to be + * kept will be determined. Based on the max_size, the number of bits + * that will be used from the hash function will be determined. + * + * We need the hash function to return integers that are uniformly + * distributed from 0 to 2^L-1. And if that happens, the maximum + * cardinality that this counter can handle is approximately 2^L. By + * default, we will assume a value of 64 bits. + * + * The confidence is the confidence in the estimate given by the counter. + * + * In other words, if the cardinality is estimated to be 100 with 2% + * error margin and confidence is 0.95, then we are 95% sure that the + * actual cardinality is between 98 and 102. + */ + CardinalityCounter(double error_margin, double confidence = 0.95); /** * Constructor used for cloning. @@ -117,8 +123,10 @@ private: * * k is the number of standard deviations that we have to go to have * a confidence level of conf. + * + * confidence: the desired confidence level (e.g., 0.95). */ - int OptimalB(double error); + int OptimalB(double error, double confidence); /** * Computes when the first one appears in the element. It looks at the @@ -128,15 +136,6 @@ private: */ uint8_t Rank(uint64 hash_modified); - /** - * Confidence in the estimate given by a cardinality counter is. - * - * In other words, if the cardinality is estimated to be 100 with 2% - * error margin and HLL_CONFis 0.95, then we are 95% sure that the - * actual cardinality is between 98 and 102. - */ - static const double HLL_CONF = .95; - /** * This is the number of buckets that will be stored. 
The standard * error is 1.04/sqrt(m), so the actual cardinality will be the diff --git a/src/probabilistic/cardinality-counter.bif b/src/probabilistic/cardinality-counter.bif index 28f886ff6c..1ec07529dc 100644 --- a/src/probabilistic/cardinality-counter.bif +++ b/src/probabilistic/cardinality-counter.bif @@ -16,14 +16,16 @@ module GLOBAL; ## Initializes a probabilistic cardinality counter that uses the HyperLogLog algorithm. ## ## err: the desired error rate (e.g. 0.01). +## +## confidence: the desired confidence for the error rate (e.g., 0.95). ## ## Returns: a HLL cardinality handle. ## ## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add ## hll_cardinality_copy -function hll_cardinality_init%(err: double%): opaque of cardinality +function hll_cardinality_init%(err: double, confidence: double%): opaque of cardinality %{ - CardinalityCounter* c = new CardinalityCounter(err); + CardinalityCounter* c = new CardinalityCounter(err, confidence); CardinalityVal* cv = new CardinalityVal(c); return cv; diff --git a/testing/btest/bifs/hll_cardinality.bro b/testing/btest/bifs/hll_cardinality.bro index 96371d268f..d1b0807416 100644 --- a/testing/btest/bifs/hll_cardinality.bro +++ b/testing/btest/bifs/hll_cardinality.bro @@ -5,8 +5,8 @@ event bro_init() { - local c1 = hll_cardinality_init(0.01); - local c2 = hll_cardinality_init(0.01); + local c1 = hll_cardinality_init(0.01, 0.95); + local c2 = hll_cardinality_init(0.01, 0.95); local add1 = 2001; local add2 = 2002; @@ -46,7 +46,7 @@ event bro_init() print "This value should be about 12:"; print hll_cardinality_estimate(c2); - local m2 = hll_cardinality_init(0.02); + local m2 = hll_cardinality_init(0.02, 0.95); print "This value should be around 0:"; print hll_cardinality_estimate(m2); @@ -56,7 +56,7 @@ event bro_init() print "This value should be around 13:"; print hll_cardinality_estimate(c3); - c3 = hll_cardinality_init(0.01); + c3 = hll_cardinality_init(0.01, 0.95); print "This value 
should be 0:"; print hll_cardinality_estimate(c3); diff --git a/testing/btest/core/leaks/hll_cluster.bro b/testing/btest/core/leaks/hll_cluster.bro index 0e342325c3..65fe8da447 100644 --- a/testing/btest/core/leaks/hll_cluster.bro +++ b/testing/btest/core/leaks/hll_cluster.bro @@ -36,7 +36,7 @@ global runnumber: count &redef; # differentiate runs event remote_connection_handshake_done(p: event_peer) { - local c = hll_cardinality_init(0.01); + local c = hll_cardinality_init(0.01, 0.95); local add1 = 2001; local add2 = 2002; @@ -92,7 +92,7 @@ global hll: opaque of cardinality; event bro_init() { - hll = hll_cardinality_init(0.01); + hll = hll_cardinality_init(0.01, 0.95); } event hll_data(data: opaque of cardinality) diff --git a/testing/btest/istate/hll.bro b/testing/btest/istate/hll.bro index 1dfca6c30f..511a892644 100644 --- a/testing/btest/istate/hll.bro +++ b/testing/btest/istate/hll.bro @@ -13,7 +13,7 @@ event bro_init() if ( runnumber == 1 ) { - card = hll_cardinality_init(0.01); + card = hll_cardinality_init(0.01, 0.95); hll_cardinality_add(card, "a"); hll_cardinality_add(card, "b");