diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index ed9f4ae078..9c40e6a17d 100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const return answer; } -void CardinalityCounter::Init(uint64 size) +void CardinalityCounter::Init(uint64_t size) { m = size; - buckets = new uint8_t[m]; // The following magic values are taken directly out of the // description of the HyperLogLog algorithn. @@ -51,60 +50,80 @@ void CardinalityCounter::Init(uint64 size) else reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size); - for ( uint64 i = 0; i < m; i++ ) - buckets[i] = 0; + double calc_p = log2(m); + if ( trunc(calc_p) != calc_p ) + reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size); + p = calc_p; + + buckets.reserve(m); + for ( uint64_t i = 0; i < m; i++ ) + buckets.push_back(0); + + assert(buckets.size() == m); V = m; } CardinalityCounter::CardinalityCounter(CardinalityCounter& other) + : buckets(other.buckets) { - Init(other.GetM()); - Merge(&other); + V = other.V; + alpha_m = other.alpha_m; + m = other.m; + p = other.p; + } + +CardinalityCounter::CardinalityCounter(CardinalityCounter&& o) + { + V = o.V; + alpha_m = o.alpha_m; + m = o.m; + p = o.p; + + o.m = 0; + buckets = std::move(o.buckets); } CardinalityCounter::CardinalityCounter(double error_margin, double confidence) { int b = OptimalB(error_margin, confidence); Init((uint64) pow(2, b)); + + assert(b == p); } -CardinalityCounter::CardinalityCounter(uint64 size) +CardinalityCounter::CardinalityCounter(uint64_t size) { Init(size); } -CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m) +CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m) { m = arg_size; - buckets = new uint8_t[m]; + + buckets.reserve(m); + for ( uint64_t i = 0; i < m; i++ ) + buckets.push_back(0); + alpha_m = arg_alpha_m; V = arg_V; + p = log2(m); } CardinalityCounter::~CardinalityCounter() { - delete [] buckets; } -uint8_t CardinalityCounter::Rank(uint64 hash_modified) const +uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const { - uint8_t answer = 0; - - hash_modified = (uint64)(hash_modified / m); - hash_modified *= 2; - - do { - hash_modified = (uint64)(hash_modified / 2); - answer++; - } while ( hash_modified % 2 == 0); - + hash_modified = hash_modified >> p; + int answer = 64 - p - fls(hash_modified) + 1; return answer; } -void CardinalityCounter::AddElement(uint64 hash) +void CardinalityCounter::AddElement(uint64_t hash) { - uint64 index = hash % m; + uint64_t index = hash % m; hash = hash-index; if( buckets[index] == 0 ) @@ -114,11 +133,14 @@ void CardinalityCounter::AddElement(uint64 hash) if ( temp > buckets[index] ) buckets[index] = temp; + + if ( buckets[index] == 0 ) + V++; } /** * Estimate the size by using the the "raw" HyperLogLog estimate. Then, - * check if it's too "large" or "small" because the raw estimate doesn't + * check if it's too "large" or "small" because the raw estimate doesn't * do well in those cases. * Thus, we correct for those errors as specified in the paper. * @@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c) if ( m != c->GetM() ) return false; - uint8_t* temp = c->GetBuckets(); + const vector temp = c->GetBuckets(); V = 0; @@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c) return true; } -uint8_t* CardinalityCounter::GetBuckets() +const vector &CardinalityCounter::GetBuckets() const { return buckets; } -uint64 CardinalityCounter::GetM() const +uint64_t CardinalityCounter::GetM() const { return m; } @@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info) { uint64_t m; - uint64 V; + uint64_t V; double alpha_m; bool valid = true; @@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info) CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m); - uint8_t* buckets = c->buckets; + vector& buckets = c->buckets; for ( unsigned int i = 0; i < m; i++ ) { char c; valid &= UNSERIALIZE(&c); - buckets[i] = (uint8)c; + buckets[i] = (uint8_t)c; } if ( ! valid ) diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index 8e079a5d84..cac66eedda 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -28,13 +28,18 @@ public: * * @param confidence confidence of the error. Default: 0.95 */ - CardinalityCounter(double error_margin, double confidence = 0.95); + explicit CardinalityCounter(double error_margin, double confidence = 0.95); /** * Copy-Constructor */ CardinalityCounter(CardinalityCounter& other); + /** + * Move-Constructor + */ + CardinalityCounter(CardinalityCounter&& o); + /** * Constructor for a known number of buckets. * @@ -43,7 +48,7 @@ public: * * @param size number of buckets to create */ - CardinalityCounter(uint64 size); + explicit CardinalityCounter(uint64_t size); /** * Destructor. @@ -58,7 +63,7 @@ public: * * @param hash 64-bit hash value of the element to be added */ - void AddElement(uint64 hash); + void AddElement(uint64_t hash); /** * Get the current estimated number of elements in the data @@ -104,7 +109,7 @@ protected: * * @return Number of buckets */ - uint64 GetM() const; + uint64_t GetM() const; /** * Returns the buckets array that holds all of the rough cardinality @@ -114,21 +119,21 @@ protected: * * @return Array containing cardinality estimates */ - uint8_t* GetBuckets(); + const std::vector &GetBuckets() const; private: /** * Constructor used when unserializing, i.e., all parameters are * known. */ - CardinalityCounter(uint64 size, uint64 V, double alpha_m); + explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m); /** * Helper function with code used jointly by multiple constructors. * * @param arg_size: number of buckets that need to be kept */ - void Init(uint64 arg_size); + void Init(uint64_t arg_size); /** * This function calculates the smallest value of b that will @@ -158,14 +163,14 @@ private: * * @returns index of first one-bit */ - uint8_t Rank(uint64 hash_modified) const; + uint8_t Rank(uint64_t hash_modified) const; /** * This is the number of buckets that will be stored. The standard * error is 1.04/sqrt(m), so the actual cardinality will be the * estimate +/- 1.04/sqrt(m) with approximately 68% probability. */ - uint64 m; + uint64_t m; /** * These are the actual buckets that are storing an estimate of the @@ -173,7 +178,7 @@ private: * appears in the bitstring and that location is at most 65, so not * that many bits are needed to store it. */ - uint8_t* buckets; + std::vector buckets; /** * There are some state constants that need to be kept track of to @@ -181,8 +186,9 @@ private: * buckets that are 0 and this is used in the small error correction. * alpha_m is a multiplicative constant used in the algorithm. */ - uint64 V; + uint64_t V; double alpha_m; + int p; // the log2 of m }; }