From 9834755948c92992b80e87058e2813a0a774fed0 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 11 Sep 2013 12:50:29 -0700 Subject: [PATCH 1/3] fix case where hll_error_margin could be undefined (thanks John) --- .../base/frameworks/sumstats/plugins/hll_unique.bro | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro index fe1bfbd476..63a12d3092 100644 --- a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro +++ b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro @@ -55,9 +55,18 @@ hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) if ( ! (rv1?$card || rv2?$card) ) return; - local rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence); + local rhll: opaque of cardinality; + if ( rv1?$card ) + { + rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence); hll_cardinality_merge_into(rhll, rv1$card); + } + else # if we do not have rv1, we have to have rv2... + { + rhll = hll_cardinality_init(rv2$hll_error_margin, rv2$hll_confidence); + } + if ( rv2?$card ) hll_cardinality_merge_into(rhll, rv2$card); From c0f780c728503006c3d285cd3369ea3969a5024c Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 16 Sep 2013 10:40:25 -0700 Subject: [PATCH 2/3] update hll documentation, make a few functions private and create a new copy constructor. 
--- src/probabilistic/CardinalityCounter.cc | 29 ++++- src/probabilistic/CardinalityCounter.h | 128 +++++++++++++--------- src/probabilistic/cardinality-counter.bif | 14 +-- 3 files changed, 108 insertions(+), 63 deletions(-) diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index ec82c31e7f..3e274da886 100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -15,6 +15,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) double initial_estimate = 2 * (log(1.04) - log(error)) / log(2); int answer = (int) floor(initial_estimate); + // k is the number of standard deviations that we have to go to have + // a confidence level of conf. + double k = 0; do { @@ -54,6 +57,12 @@ void CardinalityCounter::Init(uint64 size) V = m; } +CardinalityCounter::CardinalityCounter(CardinalityCounter& other) + { + Init(other.GetM()); + Merge(&other); + } + CardinalityCounter::CardinalityCounter(double error_margin, double confidence) { int b = OptimalB(error_margin, confidence); @@ -107,7 +116,16 @@ void CardinalityCounter::AddElement(uint64 hash) buckets[index] = temp; } -double CardinalityCounter::Size() +/** + * Estimate the size by using the "raw" HyperLogLog estimate. Then, + * check if it's too "large" or "small" because the raw estimate doesn't + * do well in those cases. + * Thus, we correct for those errors as specified in the paper. + * + * Note - we deviate from the HLL algorithm in the paper here, because + * of our 64-bit hashes. 
+ **/ +double CardinalityCounter::Size() const { double answer = 0; for ( unsigned int i = 0; i < m; i++ ) @@ -126,8 +144,11 @@ double CardinalityCounter::Size() return -pow(2, 64) * log(1 - (answer / pow(2, 64))); } -void CardinalityCounter::Merge(CardinalityCounter* c) +bool CardinalityCounter::Merge(CardinalityCounter* c) { + if ( m != c->GetM() ) + return false; + uint8_t* temp = c->GetBuckets(); V = 0; @@ -140,6 +161,8 @@ void CardinalityCounter::Merge(CardinalityCounter* c) if ( buckets[i] == 0 ) ++V; } + + return true; } uint8_t* CardinalityCounter::GetBuckets() @@ -147,7 +170,7 @@ uint8_t* CardinalityCounter::GetBuckets() return buckets; } -uint64 CardinalityCounter::GetM() +uint64 CardinalityCounter::GetM() const { return m; } diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index 2707c53808..a2aeea50eb 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -9,81 +9,78 @@ namespace probabilistic { /** - * A probabilisitc cardinality counter using the HyperLogLog algorithm. - * - * TODO: Update doc string. + * A probabilistic cardinality counter using the HyperLogLog algorithm. */ class CardinalityCounter { public: /** * Constructor. * - * Based on the error_margin, the number of buckets that need to be - * kept will be determined. Based on the max_size, the number of bits - * that will be used from the hash function will be determined. + * The number of buckets of the data structure is determined using + * the error margin and the given confidence. * - * We need the hash function to return integers that are uniformly - * distributed from 0 to 2^L-1. And if that happens, the maximum - * cardinality that this counter can handle is approximately 2^L. By - * default, we will assume a value of 64 bits. + * For example, assume an error_margin of 2% and a confidence + * of 95%. 
If the Size function returns an estimate of 100, this + * means that we are 95% sure that the cardinality is between 98 + * and 102. * - * Confidence in the estimate given by a cardinality counter is. + * @param error_margin error margin * - * In other words, if the cardinality is estimated to be 100 with 2% - * error margin and HLL_CONFis 0.95, then we are 95% sure that the - * actual cardinality is between 98 and 102. + * @param confidence confidence of the error. Default: 0.95 */ CardinalityCounter(double error_margin, double confidence = 0.95); /** - * Constructor used for cloning. + * Copy-Constructor + */ + CardinalityCounter(CardinalityCounter& other); + + /** + * Constructor for a known number of buckets. * - * The error margin will be 1.04/sqrt(m) with approximately 68% + * The error margin is 1.04/sqrt(size) with approximately 68% * probability. + * + * @param size number of buckets to create */ CardinalityCounter(uint64 size); /** - * Deletes the class variables. + * Destructor. */ ~CardinalityCounter(); /** - * This will add an element to the counter. It's responsible for - * adding an element and updating the value of V, if that applies. + * Add a new element to the counter. + * + * The hash function generating the hashes needs to be uniformly + * distributed over 64 bits. + * + * @param hash 64-bit hash value of the element to be added */ void AddElement(uint64 hash); /** - * Returns the size estimate of the set. First, it has the "raw" - * HyperLogLog estimate. And then, we check if it's too "large" or - * "small" because the raw estimate doesn't do well in those cases. - * Thus, we correct for those errors as specified in the paper. - */ - double Size(); - - /** - * Returns the buckets array that holds all of the rough cardinality - * estimates. 
- */ - uint8_t* GetBuckets(); + * Get the current estimated number of elements in the data + * structure + * + * @return Estimated number of elements + **/ + double Size() const; /** * Merges the argument cardinality counter with this one. The error - * margins are assumed to be the same, so they have the same number of - * buckets. If any of the conditions are violated, then the return - * value of size() is meaningless. + * margins of both counters have to be the same, otherwise the merge + * operation will not be carried out. + * + * @param c Cardinality counter to merge into the current counter. + * + * @return True if successful */ - void Merge(CardinalityCounter* c); + bool Merge(CardinalityCounter* c); /** - * Returns the value of m. Should be used only for statistical - * purposes. - */ - uint64 GetM(); - - /** -c * Serializes the cardinality counter. + * Serializes the cardinality counter. * * @param info The serializaton information to use. * @@ -97,10 +94,28 @@ c * Serializes the cardinality counter. * @param info The serializaton information to use. * * @return The unserialized cardinality counter, or null if an error - * occured. + * occurred. */ static CardinalityCounter* Unserialize(UnserialInfo* info); +protected: + /** + * Return the number of buckets. + * + * @return Number of buckets + */ + uint64 GetM() const; + + /** + * Returns the buckets array that holds all of the rough cardinality + * estimates. + * + * Use GetM() to determine the size. + * + * @return Array containing cardinality estimates + */ + uint8_t* GetBuckets(); + private: /** * Constructor used when unserializing, i.e., all parameters are @@ -110,31 +125,38 @@ private: /** * Helper function with code used jointly by multiple constructors. 
+ * + * @param arg_size: number of buckets that need to be kept */ void Init(uint64 arg_size); /** - * This function will calculate the smallest value of b that will + * This function calculates the smallest value of b that will * satisfy these the constraints of a specified error margin and * confidence level. * * The exact expression for b is as follows: - * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x + * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x. * - * error is the error margin. + * After that initial estimate, the value of b is increased until the + * standard deviation falls within the specified value. * - * k is the number of standard deviations that we have to go to have - * a confidence level of conf. + * @param error error margin * - * confidence: TODO. + * @param confidence confidence of the error + * + * @return minimal B-value satisfying the error-rate under confidence. */ int OptimalB(double error, double confidence); /** - * Computes when the first one appears in the element. It looks at the - * bitstring from the end though. A precondition is that the argument - * is already divisible by m, so we just ignore the last b bits, since - * m = 2^b and the last b bits will always be 0. + * Determines at which index (counted from the back) the first one-bit + * appears. The last b bits have to be 0 (the element has to be divisible + * by m), hence they are ignored. 
+ * + * @param hash_modified hash value + * + * @returns index of first one-bit */ uint8_t Rank(uint64 hash_modified); diff --git a/src/probabilistic/cardinality-counter.bif b/src/probabilistic/cardinality-counter.bif index 5201ce95bb..46323bc212 100644 --- a/src/probabilistic/cardinality-counter.bif +++ b/src/probabilistic/cardinality-counter.bif @@ -91,7 +91,12 @@ function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: op CardinalityCounter* h1 = v1->Get(); CardinalityCounter* h2 = v2->Get(); - h1->Merge(h2); + bool res = h1->Merge(h2); + if ( res == false ) + { + reporter->Error("Carinality counters with different parameters cannot be merged"); + return new Val(0, TYPE_BOOL); + } return new Val(1, TYPE_BOOL); %} @@ -126,12 +131,7 @@ function hll_cardinality_copy%(handle: opaque of cardinality%): opaque of cardin %{ CardinalityVal* cv = static_cast(handle); CardinalityCounter* h = cv->Get(); - - uint64_t m = h->GetM(); - CardinalityCounter* h2 = new CardinalityCounter(m); - - int i = 0; - h2->Merge(h); + CardinalityCounter* h2 = new CardinalityCounter(*h); CardinalityVal* out = new CardinalityVal(h2); return out; From ecc20b932a3697452208ed8ec1ebaa59ac3f7061 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 16 Sep 2013 11:00:54 -0700 Subject: [PATCH 3/3] and const 2 more functions --- src/probabilistic/CardinalityCounter.cc | 4 ++-- src/probabilistic/CardinalityCounter.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index 3e274da886..ed9f4ae078 100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -10,7 +10,7 @@ using namespace probabilistic; -int CardinalityCounter::OptimalB(double error, double confidence) +int CardinalityCounter::OptimalB(double error, double confidence) const { double initial_estimate = 2 * (log(1.04) - log(error)) / log(2); int answer = (int) 
floor(initial_estimate); @@ -87,7 +87,7 @@ CardinalityCounter::~CardinalityCounter() delete [] buckets; } -uint8_t CardinalityCounter::Rank(uint64 hash_modified) +uint8_t CardinalityCounter::Rank(uint64 hash_modified) const { uint8_t answer = 0; diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index a2aeea50eb..1d1e581d5d 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -147,7 +147,7 @@ private: * * @return minimal B-value satisfying the error-rate under confidence. */ - int OptimalB(double error, double confidence); + int OptimalB(double error, double confidence) const; /** * Determines at which index (counted from the back) the first one-bit @@ -158,7 +158,7 @@ private: * * @returns index of first one-bit */ - uint8_t Rank(uint64 hash_modified); + uint8_t Rank(uint64 hash_modified) const; /** * This is the number of buckets that will be stored. The standard