HyperLogLog: change calculation of rho

This commit changes the calculation of the rho-value to be in line with
the implementation of the original research paper, counting the number
of zero bits before the data.

This also fixes an infinite loop in case the hash value is 0.

I also cleaned up the code a bit, replacing the raw pointers that were
used with an STL vector.

Addresses BIT-1612
This commit is contained in:
Johanna Amann 2016-06-13 15:14:39 -07:00
parent 151f9d6ced
commit 3aabe83ec6
2 changed files with 69 additions and 41 deletions

View file

@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const
return answer;
}
void CardinalityCounter::Init(uint64 size)
void CardinalityCounter::Init(uint64_t size)
{
m = size;
buckets = new uint8_t[m];
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
@ -51,60 +50,80 @@ void CardinalityCounter::Init(uint64 size)
else
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
for ( uint64 i = 0; i < m; i++ )
buckets[i] = 0;
double calc_p = log2(m);
if ( trunc(calc_p) != calc_p )
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
p = calc_p;
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
assert(buckets.size() == m);
V = m;
}
// Copy constructor: builds a counter with the same state as `other`.
//
// NOTE(review): this span appears to interleave the removed and the added
// lines of the diff. The Init()/Merge() pair looks like the previous
// implementation (re-initialize with other's size, then merge other's
// buckets in), while the member-initializer copy of `buckets` together with
// the V / alpha_m / m / p assignments looks like its replacement -- confirm
// against the marked diff before reading this as one function body.
CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
: buckets(other.buckets)
{
Init(other.GetM());
Merge(&other);
// Copy the scalar state field by field.
V = other.V;
alpha_m = other.alpha_m;
m = other.m;
p = other.p;
}
// Move constructor: takes over the bucket storage of `o` and copies its
// scalar state (m, p, V, alpha_m).
//
// Afterwards o.m is 0 and o.buckets is in a moved-from state; o's V,
// alpha_m and p are left untouched.
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o)
	: buckets(std::move(o.buckets))
	{
	m = o.m;
	p = o.p;
	V = o.V;
	alpha_m = o.alpha_m;

	// Mark the source as holding no buckets any more.
	o.m = 0;
	}
// Builds a counter sized from a target error margin and confidence:
// OptimalB() supplies the exponent, and the counter is initialized with
// 2^exponent buckets.
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
	{
	const int exponent = OptimalB(error_margin, confidence);
	const uint64 bucket_count = static_cast<uint64>(pow(2, exponent));

	Init(bucket_count);

	// Init() sets p to log2 of the size, so it has to match the exponent.
	assert(exponent == p);
	}
// Constructs a counter with `size` buckets by delegating all setup to Init().
//
// NOTE(review): the two signature lines differ only in the parameter type
// (uint64 vs. uint64_t); presumably the first is the removed line of the
// diff and the second its replacement -- confirm.
CardinalityCounter::CardinalityCounter(uint64 size)
CardinalityCounter::CardinalityCounter(uint64_t size)
{
Init(size);
}
// Constructs a counter directly from explicit state instead of going through
// Init(): arg_size zeroed buckets, with V and alpha_m taken as given and p
// set to log2(m). Unserialize() uses this constructor and then fills in the
// buckets itself.
//
// NOTE(review): this span appears to show both sides of the diff -- two
// signature lines (uint64 vs. uint64_t), and the raw `new uint8_t[m]`
// allocation next to the reserve()/push_back() lines that presumably replace
// it. Confirm against the marked diff.
CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
{
m = arg_size;
buckets = new uint8_t[m];
// Pre-size the storage, then append m zero-valued buckets.
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
alpha_m = arg_alpha_m;
V = arg_V;
// Unlike Init(), no check is made here that m is a power of two.
p = log2(m);
}
// Destructor: releases the bucket array with delete[].
//
// NOTE(review): this pairs with the raw `new uint8_t[m]` allocation. The
// commit message says the raw pointers are converted to an STL vector, so
// this explicit delete presumably belongs to the removed side of the diff --
// confirm.
CardinalityCounter::~CardinalityCounter()
{
delete [] buckets;
}
// Computes the rank (rho) value for a hash, as used by the HyperLogLog
// estimator.
//
// NOTE(review): this span appears to interleave the removed and the added
// implementation (two signature lines, two declarations of `answer`).
// Presumably the division/do-while lines are the old code and the
// shift/fls() lines are the new code -- confirm against the marked diff.
uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
{
uint8_t answer = 0;
// Divide out the bucket count, then double the result so that the first
// halving inside the loop below restores it.
hash_modified = (uint64)(hash_modified / m);
hash_modified *= 2;
// Halve until the lowest bit is set, counting the steps. If the value is 0
// the condition stays true forever; this is the infinite loop the commit
// message refers to.
do {
hash_modified = (uint64)(hash_modified / 2);
answer++;
} while ( hash_modified % 2 == 0);
// Drop the low p bits, leaving a (64 - p)-bit value.
hash_modified = hash_modified >> p;
// Assuming fls() returns the 1-based position of the highest set bit (0 for
// a zero argument), this is the number of leading zero bits in that
// (64 - p)-bit value, plus one.
// NOTE(review): fls() conventionally takes an int while hash_modified is
// 64 bits wide -- confirm that the upper bits are not truncated here and
// whether a 64-bit variant is needed.
int answer = 64 - p - fls(hash_modified) + 1;
return answer;
}
void CardinalityCounter::AddElement(uint64 hash)
void CardinalityCounter::AddElement(uint64_t hash)
{
uint64 index = hash % m;
uint64_t index = hash % m;
hash = hash-index;
if( buckets[index] == 0 )
@ -114,11 +133,14 @@ void CardinalityCounter::AddElement(uint64 hash)
if ( temp > buckets[index] )
buckets[index] = temp;
if ( buckets[index] == 0 )
V++;
}
/**
* Estimate the size by using the "raw" HyperLogLog estimate. Then,
* check if it's too "large" or "small" because the raw estimate doesn't
* check if it's too "large" or "small" because the raw estimate doesn't
* do well in those cases.
* Thus, we correct for those errors as specified in the paper.
*
@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
if ( m != c->GetM() )
return false;
uint8_t* temp = c->GetBuckets();
const vector<uint8_t> temp = c->GetBuckets();
V = 0;
@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
return true;
}
// Accessor for the bucket storage; returns the `buckets` member.
//
// NOTE(review): two signature lines are shown -- one returning a mutable
// uint8_t* and one returning a const reference to a vector<uint8_t> from a
// const method. Presumably these are the removed and the added line of the
// diff, respectively -- confirm.
uint8_t* CardinalityCounter::GetBuckets()
const vector<uint8_t> &CardinalityCounter::GetBuckets() const
{
return buckets;
}
// Returns m, the size the counter was initialized with (Init() creates m
// buckets).
//
// NOTE(review): the two signature lines differ only in the return type
// (uint64 vs. uint64_t); presumably the first is the removed line of the
// diff -- confirm.
uint64 CardinalityCounter::GetM() const
uint64_t CardinalityCounter::GetM() const
{
return m;
}
@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const
CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
{
uint64_t m;
uint64 V;
uint64_t V;
double alpha_m;
bool valid = true;
@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);
uint8_t* buckets = c->buckets;
vector<uint8_t>& buckets = c->buckets;
for ( unsigned int i = 0; i < m; i++ )
{
char c;
valid &= UNSERIALIZE(&c);
buckets[i] = (uint8)c;
buckets[i] = (uint8_t)c;
}
if ( ! valid )