diff --git a/CHANGES b/CHANGES
index fd5b4d400f..085e0376c0 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,4 +1,11 @@
+2.1-1357 | 2013-09-18 14:58:52 -0700
+
+  * Update HLL API and its documentation. (Bernhard Amann)
+
+  * Fix case in HLL where hll_error_margin could be undefined.
+    (Bernhard Amann)
+
 2.1-1352 | 2013-09-18 14:42:28 -0700
 
   * Fix a number of compiler warnings. (Daniel Thayer)
diff --git a/VERSION b/VERSION
index f3d1bad61e..c10ccc6394 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1-1352
+2.1-1357
diff --git a/aux/bro-aux b/aux/bro-aux
index cfe77b184c..3e1c8bf5f3 160000
--- a/aux/bro-aux
+++ b/aux/bro-aux
@@ -1 +1 @@
-Subproject commit cfe77b184c2362fe85d36a597a1cda776aac0a80
+Subproject commit 3e1c8bf5f34868759d578f2afc6707fd1797f958
diff --git a/aux/broctl b/aux/broctl
index 4c87f14d17..a4912816d7 160000
--- a/aux/broctl
+++ b/aux/broctl
@@ -1 +1 @@
-Subproject commit 4c87f14d1797b254934dac34e739c08ede89c052
+Subproject commit a4912816d7a50c882fa537dbeadac13449ca3716
diff --git a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro
index fe1bfbd476..494cbf4667 100644
--- a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro
+++ b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro
@@ -55,9 +55,19 @@ hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
 	if ( ! (rv1?$card || rv2?$card) )
 		return;
 
-	local rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence);
+	# Now at least one of rv1?$card or rv2?$card will be set, and
+	# potentially both.
+
+	local rhll: opaque of cardinality;
+	if ( rv1?$card )
+		{
+		rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence);
 		hll_cardinality_merge_into(rhll, rv1$card);
+		}
+	else # If we do not have rv1, we have to have rv2 ...
+		rhll = hll_cardinality_init(rv2$hll_error_margin, rv2$hll_confidence);
+
 	if ( rv2?$card )
 		hll_cardinality_merge_into(rhll, rv2$card);
diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc
index ec82c31e7f..ed9f4ae078 100644
--- a/src/probabilistic/CardinalityCounter.cc
+++ b/src/probabilistic/CardinalityCounter.cc
@@ -10,11 +10,14 @@
 using namespace probabilistic;
 
-int CardinalityCounter::OptimalB(double error, double confidence)
+int CardinalityCounter::OptimalB(double error, double confidence) const
 	{
 	double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
 	int answer = (int) floor(initial_estimate);
 
+	// k is the number of standard deviations that we have to go to have
+	// a confidence level of conf.
+
 	double k = 0;
 
 	do
 		{
@@ -54,6 +57,12 @@ void CardinalityCounter::Init(uint64 size)
 	V = m;
 	}
 
+CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
+	{
+	Init(other.GetM());
+	Merge(&other);
+	}
+
 CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
 	{
 	int b = OptimalB(error_margin, confidence);
@@ -78,7 +87,7 @@ CardinalityCounter::~CardinalityCounter()
 	delete [] buckets;
 	}
 
-uint8_t CardinalityCounter::Rank(uint64 hash_modified)
+uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
 	{
 	uint8_t answer = 0;
 
@@ -107,7 +116,16 @@ void CardinalityCounter::AddElement(uint64 hash)
 	buckets[index] = temp;
 	}
 
-double CardinalityCounter::Size()
+/**
+ * Estimate the size by using the "raw" HyperLogLog estimate. Then,
+ * check if it's too "large" or "small" because the raw estimate doesn't
+ * do well in those cases.
+ * Thus, we correct for those errors as specified in the paper.
+ *
+ * Note - we deviate from the HLL algorithm in the paper here, because
+ * of our 64-bit hashes.
+ **/
+double CardinalityCounter::Size() const
 	{
 	double answer = 0;
 	for ( unsigned int i = 0; i < m; i++ )
@@ -126,8 +144,11 @@ double CardinalityCounter::Size()
 	return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
 	}
 
-void CardinalityCounter::Merge(CardinalityCounter* c)
+bool CardinalityCounter::Merge(CardinalityCounter* c)
 	{
+	if ( m != c->GetM() )
+		return false;
+
 	uint8_t* temp = c->GetBuckets();
 
 	V = 0;
@@ -140,6 +161,8 @@ void CardinalityCounter::Merge(CardinalityCounter* c)
 		if ( buckets[i] == 0 )
 			++V;
 		}
+
+	return true;
 	}
 
 uint8_t* CardinalityCounter::GetBuckets()
 	{
 	return buckets;
 	}
 
-uint64 CardinalityCounter::GetM()
+uint64 CardinalityCounter::GetM() const
 	{
 	return m;
 	}
diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h
index 2707c53808..8e079a5d84 100644
--- a/src/probabilistic/CardinalityCounter.h
+++ b/src/probabilistic/CardinalityCounter.h
@@ -9,81 +9,78 @@ namespace probabilistic {
 
 /**
- * A probabilisitc cardinality counter using the HyperLogLog algorithm.
- *
- * TODO: Update doc string.
+ * A probabilistic cardinality counter using the HyperLogLog algorithm.
  */
 class CardinalityCounter {
 public:
 	/**
 	 * Constructor.
 	 *
-	 * Based on the error_margin, the number of buckets that need to be
-	 * kept will be determined. Based on the max_size, the number of bits
-	 * that will be used from the hash function will be determined.
+	 * The number of buckets of the data structure is determined using
+	 * the error margin and the given confidence.
 	 *
-	 * We need the hash function to return integers that are uniformly
-	 * distributed from 0 to 2^L-1. And if that happens, the maximum
-	 * cardinality that this counter can handle is approximately 2^L. By
-	 * default, we will assume a value of 64 bits.
+	 * For example, assume an error_margin of 2% and a confidence
+	 * of 95%. If the Size function returns an estimate of 100, this
+	 * means that we are 95% sure that the cardinality is between 98
+	 * and 102.
 	 *
-	 * Confidence in the estimate given by a cardinality counter is.
+	 * @param error_margin error margin
 	 *
-	 * In other words, if the cardinality is estimated to be 100 with 2%
-	 * error margin and HLL_CONFis 0.95, then we are 95% sure that the
-	 * actual cardinality is between 98 and 102.
+	 * @param confidence confidence of the error. Default: 0.95
 	 */
 	CardinalityCounter(double error_margin, double confidence = 0.95);
 
 	/**
-	 * Constructor used for cloning.
+	 * Copy constructor.
+	 */
+	CardinalityCounter(CardinalityCounter& other);
+
+	/**
+	 * Constructor for a known number of buckets.
 	 *
-	 * The error margin will be 1.04/sqrt(m) with approximately 68%
+	 * The error margin is 1.04/sqrt(size) with approximately 68%
 	 * probability.
+	 *
+	 * @param size number of buckets to create
 	 */
 	CardinalityCounter(uint64 size);
 
 	/**
-	 * Deletes the class variables.
+	 * Destructor.
 	 */
 	~CardinalityCounter();
 
 	/**
-	 * This will add an element to the counter. It's responsible for
-	 * adding an element and updating the value of V, if that applies.
+	 * Add a new element to the counter.
+	 *
+	 * The hash function generating the hashes needs to be uniformly
+	 * distributed over 64 bits.
+	 *
+	 * @param hash 64-bit hash value of the element to be added
 	 */
 	void AddElement(uint64 hash);
 
 	/**
-	 * Returns the size estimate of the set. First, it has the "raw"
-	 * HyperLogLog estimate. And then, we check if it's too "large" or
-	 * "small" because the raw estimate doesn't do well in those cases.
-	 * Thus, we correct for those errors as specified in the paper.
-	 */
-	double Size();
-
-	/**
-	 * Returns the buckets array that holds all of the rough cardinality
-	 * estimates.
-	 */
-	uint8_t* GetBuckets();
+	 * Get the current estimated number of elements in the data
+	 * structure.
+	 *
+	 * @return Estimated number of elements
+	 **/
+	double Size() const;
 
 	/**
 	 * Merges the argument cardinality counter with this one. The error
-	 * margins are assumed to be the same, so they have the same number of
-	 * buckets. If any of the conditions are violated, then the return
-	 * value of size() is meaningless.
+	 * margins of both counters have to be the same, otherwise the merge
+	 * operation will not be carried out.
+	 *
+	 * @param c Cardinality counter to merge into the current counter.
+	 *
+	 * @return True if successful
 	 */
-	void Merge(CardinalityCounter* c);
+	bool Merge(CardinalityCounter* c);
 
 	/**
-	 * Returns the value of m. Should be used only for statistical
-	 * purposes.
-	 */
-	uint64 GetM();
-
-	/**
-c	 * Serializes the cardinality counter.
+	 * Serializes the cardinality counter.
 	 *
@@ -97,10 +94,28 @@ c * Serializes the cardinality counter.
 	 * @param info The serializaton information to use.
 	 *
 	 * @return The unserialized cardinality counter, or null if an error
-	 * occured.
+	 *         occurred.
 	 */
 	static CardinalityCounter* Unserialize(UnserialInfo* info);
 
+protected:
+	/**
+	 * Return the number of buckets.
+	 *
+	 * @return Number of buckets
+	 */
+	uint64 GetM() const;
+
+	/**
+	 * Returns the buckets array that holds all of the rough cardinality
+	 * estimates.
+	 *
+	 * Use GetM() to determine the size.
+	 *
+	 * @return Array containing cardinality estimates
+	 */
+	uint8_t* GetBuckets();
+
 private:
 	/**
 	 * Constructor used when unserializing, i.e., all parameters are
@@ -110,33 +125,40 @@ private:
 
 	/**
 	 * Helper function with code used jointly by multiple constructors.
+	 *
+	 * @param arg_size number of buckets that need to be kept
 	 */
 	void Init(uint64 arg_size);
 
 	/**
-	 * This function will calculate the smallest value of b that will
+	 * This function calculates the smallest value of b that will
 	 * satisfy these the constraints of a specified error margin and
 	 * confidence level.
 	 *
 	 * The exact expression for b is as follows:
-	 * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x
+	 * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
 	 *
-	 * error is the error margin.
+	 * After that initial estimate, the value of b is increased until the
+	 * standard deviation falls within the specified value.
 	 *
-	 * k is the number of standard deviations that we have to go to have
-	 * a confidence level of conf.
+	 * @param error error margin
 	 *
-	 * confidence: TODO.
+	 * @param confidence confidence of the error
+	 *
+	 * @return minimal value of b satisfying the error margin at the given confidence
 	 */
-	int OptimalB(double error, double confidence);
+	int OptimalB(double error, double confidence) const;
 
 	/**
-	 * Computes when the first one appears in the element. It looks at the
-	 * bitstring from the end though. A precondition is that the argument
-	 * is already divisible by m, so we just ignore the last b bits, since
-	 * m = 2^b and the last b bits will always be 0.
+	 * Determines at which index (counted from the back) the first one-bit
+	 * appears. The last b bits have to be 0 (the element has to be divisible
+	 * by m), hence they are ignored.
+	 *
+	 * @param hash_modified hash value
+	 *
+	 * @return index of first one-bit
 	 */
-	uint8_t Rank(uint64 hash_modified);
+	uint8_t Rank(uint64 hash_modified) const;
 
 	/**
 	 * This is the number of buckets that will be stored. The standard
diff --git a/src/probabilistic/cardinality-counter.bif b/src/probabilistic/cardinality-counter.bif
index 5201ce95bb..3f3f9653e6 100644
--- a/src/probabilistic/cardinality-counter.bif
+++ b/src/probabilistic/cardinality-counter.bif
@@ -91,7 +91,12 @@ function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: op
 	CardinalityCounter* h1 = v1->Get();
 	CardinalityCounter* h2 = v2->Get();
 
-	h1->Merge(h2);
+	bool res = h1->Merge(h2);
+	if ( ! res )
+		{
+		reporter->Error("Cardinality counters with different parameters cannot be merged");
+		return new Val(0, TYPE_BOOL);
+		}
 
 	return new Val(1, TYPE_BOOL);
 	%}
@@ -126,12 +131,7 @@ function hll_cardinality_copy%(handle: opaque of cardinality%): opaque of cardin
 	%{
 	CardinalityVal* cv = static_cast<CardinalityVal*>(handle);
 	CardinalityCounter* h = cv->Get();
-
-	uint64_t m = h->GetM();
-	CardinalityCounter* h2 = new CardinalityCounter(m);
-
-	int i = 0;
-	h2->Merge(h);
+	CardinalityCounter* h2 = new CardinalityCounter(*h);
 
 	CardinalityVal* out = new CardinalityVal(h2);
 	return out;
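
Usage sketch (not part of the patch): a minimal Bro script showing the script-land
HLL API touched by this change. hll_cardinality_init(), hll_cardinality_merge_into()
and hll_cardinality_copy() appear in the diff above; hll_cardinality_add() and
hll_cardinality_estimate() are assumed here from the existing bif set, which this
diff does not modify.

event bro_init()
	{
	# Two counters with identical parameters: 2% error margin, 95% confidence.
	local c1 = hll_cardinality_init(0.02, 0.95);
	local c2 = hll_cardinality_init(0.02, 0.95);

	# Assumed existing bifs: add elements to each counter.
	hll_cardinality_add(c1, 1.2.3.4);
	hll_cardinality_add(c2, 2.3.4.5);

	# With this change, merging counters whose parameters differ is refused:
	# hll_cardinality_merge_into() reports an error and returns F instead of
	# producing a meaningless estimate.
	if ( hll_cardinality_merge_into(c1, c2) )
		print hll_cardinality_estimate(c1);  # assumed existing bif

	# hll_cardinality_copy() now clones via the new copy constructor.
	local c3 = hll_cardinality_copy(c1);
	}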