HyperLogLog: change calculation of rho

This commit changes the calculation of the rho value to be in line with the original research paper: it now counts the number of zero bits that precede the first set bit of the hashed data. This also fixes an infinite loop that occurred when the hash value was 0. I also cleaned up the code a bit, converting the raw pointers that were used into an STL vector. Addresses BIT-1612.
2025-10-04 23:58:20 +00:00 · 2016-06-13 15:14:39 -07:00 · 2016-06-13 15:14:39 -07:00 · 3aabe83ec6
commit 3aabe83ec6
parent 151f9d6ced
2 changed files with 69 additions and 41 deletions
--- a/src/probabilistic/CardinalityCounter.cc
+++ b/src/probabilistic/CardinalityCounter.cc
@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const
 	return answer;
 	}

-void CardinalityCounter::Init(uint64 size)
+void CardinalityCounter::Init(uint64_t size)
 	{
 	m = size;
-	buckets = new uint8_t[m];

 	// The following magic values are taken directly out of the
 	// description of the HyperLogLog algorithm.
@ -51,60 +50,80 @@ void CardinalityCounter::Init(uint64 size)
 	else
 		reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);

-	for ( uint64 i = 0; i < m; i++ )
-		buckets[i] = 0;
+	double calc_p = log2(m);
+	if ( trunc(calc_p) != calc_p )
+		reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
+	p = calc_p;
+
+	buckets.reserve(m);
+	for ( uint64_t i = 0; i < m; i++ )
+		buckets.push_back(0);
+
+	assert(buckets.size() == m);

 	V = m;
 	}

 CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
+	: buckets(other.buckets)
 	{
-	Init(other.GetM());
-	Merge(&other);
+	V = other.V;
+	alpha_m = other.alpha_m;
+	m = other.m;
+	p = other.p;
+	}
+
+CardinalityCounter::CardinalityCounter(CardinalityCounter&& o)
+	{
+	V = o.V;
+	alpha_m = o.alpha_m;
+	m = o.m;
+	p = o.p;
+
+	o.m = 0;
+	buckets = std::move(o.buckets);
 	}

 CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
 	{
 	int b = OptimalB(error_margin, confidence);
 	Init((uint64) pow(2, b));
+
+	assert(b == p);
 	}

-CardinalityCounter::CardinalityCounter(uint64 size)
+CardinalityCounter::CardinalityCounter(uint64_t size)
 	{
 	Init(size);
 	}

-CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
+CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
 	{
 	m = arg_size;
-	buckets = new uint8_t[m];
+
+	buckets.reserve(m);
+	for ( uint64_t i = 0; i < m; i++ )
+		buckets.push_back(0);
+
 	alpha_m = arg_alpha_m;
 	V = arg_V;
+	p = log2(m);
 	}

 CardinalityCounter::~CardinalityCounter()
 	{
-	delete [] buckets;
 	}

-uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
+uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
 	{
-	uint8_t answer = 0;
-
-	hash_modified = (uint64)(hash_modified / m);
-	hash_modified *= 2;
-
-	do {
-		hash_modified = (uint64)(hash_modified / 2);
-		answer++;
-	} while ( hash_modified % 2 == 0);
-
+	hash_modified = hash_modified >> p;
+	int answer = 64 - p - fls(hash_modified) + 1;
 	return answer;
 	}

-void CardinalityCounter::AddElement(uint64 hash)
+void CardinalityCounter::AddElement(uint64_t hash)
 	{
-	uint64 index = hash % m;
+	uint64_t index = hash % m;
 	hash = hash-index;

 	if( buckets[index] == 0 )
@ -114,11 +133,14 @@ void CardinalityCounter::AddElement(uint64 hash)

 	if ( temp > buckets[index] )
 		buckets[index] = temp;
+
+	if ( buckets[index] == 0 )
+		V++;
 	}

 /**
* Estimate the size by using the "raw" HyperLogLog estimate. Then,
- * check if it's too "large" or "small" because the raw estimate doesn't 
+ * check if it's too "large" or "small" because the raw estimate doesn't
 * do well in those cases.
 * Thus, we correct for those errors as specified in the paper.
 *
@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
 	if ( m != c->GetM() )
 		return false;

-	uint8_t* temp = c->GetBuckets();
+	const vector<uint8_t> temp = c->GetBuckets();

 	V = 0;

@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
 	return true;
 	}

-uint8_t* CardinalityCounter::GetBuckets()
+const vector<uint8_t> &CardinalityCounter::GetBuckets() const
 	{
 	return buckets;
 	}

-uint64 CardinalityCounter::GetM() const
+uint64_t CardinalityCounter::GetM() const
 	{
 	return m;
 	}
@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const
 CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
 	{
 	uint64_t m;
-	uint64 V;
+	uint64_t V;
 	double alpha_m;

 	bool valid = true;
@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)

 	CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);

-	uint8_t* buckets = c->buckets;
+	vector<uint8_t>& buckets = c->buckets;

 	for ( unsigned int i = 0; i < m; i++ )
 		{
 		char c;
 		valid &= UNSERIALIZE(&c);
-		buckets[i] = (uint8)c;
+		buckets[i] = (uint8_t)c;
 		}

 	if ( ! valid )