diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index ec82c31e7f..3e274da886 100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -15,6 +15,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) double initial_estimate = 2 * (log(1.04) - log(error)) / log(2); int answer = (int) floor(initial_estimate); + // k is the number of standard deviations that we have to go to have + // a confidence level of conf. + double k = 0; do { @@ -54,6 +57,12 @@ void CardinalityCounter::Init(uint64 size) V = m; } +CardinalityCounter::CardinalityCounter(CardinalityCounter& other) + { + Init(other.GetM()); + Merge(&other); + } + CardinalityCounter::CardinalityCounter(double error_margin, double confidence) { int b = OptimalB(error_margin, confidence); @@ -107,7 +116,16 @@ void CardinalityCounter::AddElement(uint64 hash) buckets[index] = temp; } -double CardinalityCounter::Size() +/** + * Estimate the size by using the "raw" HyperLogLog estimate. Then, + * check if it's too "large" or "small" because the raw estimate doesn't + * do well in those cases. + * Thus, we correct for those errors as specified in the paper. + * + * Note - we deviate from the HLL algorithm in the paper here, because + * of our 64-bit hashes. 
+ **/ +double CardinalityCounter::Size() const { double answer = 0; for ( unsigned int i = 0; i < m; i++ ) @@ -126,8 +144,11 @@ double CardinalityCounter::Size() return -pow(2, 64) * log(1 - (answer / pow(2, 64))); } -void CardinalityCounter::Merge(CardinalityCounter* c) +bool CardinalityCounter::Merge(CardinalityCounter* c) { + if ( m != c->GetM() ) + return false; + uint8_t* temp = c->GetBuckets(); V = 0; @@ -140,6 +161,8 @@ void CardinalityCounter::Merge(CardinalityCounter* c) if ( buckets[i] == 0 ) ++V; } + + return true; } uint8_t* CardinalityCounter::GetBuckets() @@ -147,7 +170,7 @@ uint8_t* CardinalityCounter::GetBuckets() return buckets; } -uint64 CardinalityCounter::GetM() +uint64 CardinalityCounter::GetM() const { return m; } diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index 2707c53808..a2aeea50eb 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -9,81 +9,78 @@ namespace probabilistic { /** - * A probabilisitc cardinality counter using the HyperLogLog algorithm. - * - * TODO: Update doc string. + * A probabilistic cardinality counter using the HyperLogLog algorithm. */ class CardinalityCounter { public: /** * Constructor. * - * Based on the error_margin, the number of buckets that need to be - * kept will be determined. Based on the max_size, the number of bits - * that will be used from the hash function will be determined. + * The number of buckets of the data structure is determined using + * the error margin and the given confidence. * - * We need the hash function to return integers that are uniformly - * distributed from 0 to 2^L-1. And if that happens, the maximum - * cardinality that this counter can handle is approximately 2^L. By - * default, we will assume a value of 64 bits. + * For example, assume an error_margin of 2% and a confidence + * of 95%. 
If the Size function returns an estimate of 100, this + * means that we are 95% sure that the cardinality is between 98 + * and 102. * - * Confidence in the estimate given by a cardinality counter is. + * @param error_margin error margin * - * In other words, if the cardinality is estimated to be 100 with 2% - * error margin and HLL_CONFis 0.95, then we are 95% sure that the - * actual cardinality is between 98 and 102. + * @param confidence confidence of the error. Default: 0.95 */ CardinalityCounter(double error_margin, double confidence = 0.95); /** - * Constructor used for cloning. + * Copy-Constructor + */ + CardinalityCounter(CardinalityCounter& other); + + /** + * Constructor for a known number of buckets. * - * The error margin will be 1.04/sqrt(m) with approximately 68% + * The error margin is 1.04/sqrt(size) with approximately 68% * probability. + * + * @param size number of buckets to create */ CardinalityCounter(uint64 size); /** - * Deletes the class variables. + * Destructor. */ ~CardinalityCounter(); /** - * This will add an element to the counter. It's responsible for - * adding an element and updating the value of V, if that applies. + * Add a new element to the counter. + * + * The hash function generating the hashes needs to be uniformly + * distributed over 64 bits. + * + * @param hash 64-bit hash value of the element to be added */ void AddElement(uint64 hash); /** - * Returns the size estimate of the set. First, it has the "raw" - * HyperLogLog estimate. And then, we check if it's too "large" or - * "small" because the raw estimate doesn't do well in those cases. - * Thus, we correct for those errors as specified in the paper. - */ - double Size(); - - /** - * Returns the buckets array that holds all of the rough cardinality - * estimates. 
- */ uint8_t* GetBuckets(); + * Get the current estimated number of elements in the data + * structure + * + * @return Estimated number of elements + **/ + double Size() const; /** * Merges the argument cardinality counter with this one. The error - * margins are assumed to be the same, so they have the same number of - * buckets. If any of the conditions are violated, then the return - * value of size() is meaningless. + * margins of both counters have to be the same, otherwise the merge + * operation will not be carried out. + * + * @param c Cardinality counter to merge into the current counter. + * + * @return True if successful */ - void Merge(CardinalityCounter* c); + bool Merge(CardinalityCounter* c); /** - * Returns the value of m. Should be used only for statistical - * purposes. - */ - uint64 GetM(); - - /** -c * Serializes the cardinality counter. + * Serializes the cardinality counter. * * @param info The serializaton information to use. * @@ -97,10 +94,28 @@ c * Serializes the cardinality counter. * @param info The serializaton information to use. * * @return The unserialized cardinality counter, or null if an error - * occured. + * occurred. */ static CardinalityCounter* Unserialize(UnserialInfo* info); +protected: + /** + * Return the number of buckets. + * + * @return Number of buckets + */ + uint64 GetM() const; + + /** + * Returns the buckets array that holds all of the rough cardinality + * estimates. 
+ * + * @param arg_size: number of buckets that need to be kept */ void Init(uint64 arg_size); /** - * This function will calculate the smallest value of b that will + * This function calculates the smallest value of b that will * satisfy these the constraints of a specified error margin and * confidence level. * * The exact expression for b is as follows: - * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x + * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x. * - * error is the error margin. + * After that initial estimate, the value of b is increased until the + * standard deviation falls within the specified value. * - * k is the number of standard deviations that we have to go to have - * a confidence level of conf. + * @param error error margin * - * confidence: TODO. + * @param confidence confidence of the error + * + * @return minimal B-value satisfying the error-rate under confidence. */ int OptimalB(double error, double confidence); /** - * Computes when the first one appears in the element. It looks at the - * bitstring from the end though. A precondition is that the argument - * is already divisible by m, so we just ignore the last b bits, since - * m = 2^b and the last b bits will always be 0. + * Determines at which index (counted from the back) the first one-bit + * appears. The last b bits have to be 0 (the element has to be divisible + * by m), hence they are ignored. 
+ * + * @param hash_modified hash value + * + * @returns index of first one-bit */ uint8_t Rank(uint64 hash_modified); diff --git a/src/probabilistic/cardinality-counter.bif b/src/probabilistic/cardinality-counter.bif index 5201ce95bb..46323bc212 100644 --- a/src/probabilistic/cardinality-counter.bif +++ b/src/probabilistic/cardinality-counter.bif @@ -91,7 +91,12 @@ function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: op CardinalityCounter* h1 = v1->Get(); CardinalityCounter* h2 = v2->Get(); - h1->Merge(h2); + bool res = h1->Merge(h2); + if ( res == false ) + { + reporter->Error("Carinality counters with different parameters cannot be merged"); + return new Val(0, TYPE_BOOL); + } return new Val(1, TYPE_BOOL); %} @@ -126,12 +131,7 @@ function hll_cardinality_copy%(handle: opaque of cardinality%): opaque of cardin %{ CardinalityVal* cv = static_cast(handle); CardinalityCounter* h = cv->Get(); - - uint64_t m = h->GetM(); - CardinalityCounter* h2 = new CardinalityCounter(m); - - int i = 0; - h2->Merge(h); + CardinalityCounter* h2 = new CardinalityCounter(*h); CardinalityVal* out = new CardinalityVal(h2); return out;