Hyperloglog: change calculation of Rho

This commit changes the calculation of the rho-value to be in line with
the implementation of the original research paper, counting the number
of zero bits that precede the first one-bit of the hash.

This also fixes an infinite loop in case the hash value is 0.

I also cleaned up the code a bit, converting the raw pointers that were
used to an STL vector.

Addresses BIT-1612
This commit is contained in:
Johanna Amann 2016-06-13 15:14:39 -07:00
parent 151f9d6ced
commit 3aabe83ec6
2 changed files with 69 additions and 41 deletions

View file

@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const
return answer;
}
void CardinalityCounter::Init(uint64 size)
void CardinalityCounter::Init(uint64_t size)
{
m = size;
buckets = new uint8_t[m];
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
@ -51,60 +50,80 @@ void CardinalityCounter::Init(uint64 size)
else
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
for ( uint64 i = 0; i < m; i++ )
buckets[i] = 0;
double calc_p = log2(m);
if ( trunc(calc_p) != calc_p )
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
p = calc_p;
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
assert(buckets.size() == m);
V = m;
}
// Copy constructor. As listed, it copy-initializes the bucket vector from
// `other` and then copies V, alpha_m, m and p member by member.
// NOTE(review): this listing carries no +/- markers; the Init()/Merge() calls
// look like the pre-change body that the member-wise copies replace - confirm
// against the commit.
CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
: buckets(other.buckets)
{
Init(other.GetM());
Merge(&other);
V = other.V;
alpha_m = other.alpha_m;
m = other.m;
p = other.p;
}
// Move constructor: takes over the bucket vector of `o`, copies its scalar
// state (m, p, V, alpha_m) and finally resets o.m to 0.
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o)
	: buckets(std::move(o.buckets))
	{
	m = o.m;
	p = o.p;
	V = o.V;
	alpha_m = o.alpha_m;

	// The source no longer owns any buckets, so its bucket count is zeroed.
	o.m = 0;
	}
// Builds a counter sized for the requested accuracy: OptimalB() picks the
// exponent for the given error margin and confidence, and Init() is then
// called with 2^exponent buckets.
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
	{
	const int exponent = OptimalB(error_margin, confidence);
	const uint64 bucket_count = (uint64) pow(2, exponent);

	Init(bucket_count);

	// Init() stores log2 of the bucket count in p, so both must agree.
	assert(exponent == p);
	}
// Constructs a counter with `size` buckets; all setup is delegated to Init().
// NOTE(review): two signatures are listed (uint64 and uint64_t) - presumably
// the old and new spelling of the same constructor in this diff; confirm.
CardinalityCounter::CardinalityCounter(uint64 size)
CardinalityCounter::CardinalityCounter(uint64_t size)
{
Init(size);
}
// Constructor that takes all state explicitly (the class declaration describes
// it as the one used when unserializing): sets m, creates m zeroed buckets,
// stores alpha_m and V, and derives p from m.
// NOTE(review): the listing has no +/- markers; the uint64 signature and the
// `new uint8_t[m]` allocation look like the pre-change lines - confirm.
CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
{
m = arg_size;
buckets = new uint8_t[m];
// Reserve once, then append m zero-valued buckets.
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
alpha_m = arg_alpha_m;
V = arg_V;
// p is declared as int, so the double returned by log2() is truncated here.
// NOTE(review): unlike Init(), nothing in this constructor checks that m is a
// power of two - verify callers guarantee it.
p = log2(m);
}
// Destructor: releases the bucket storage with delete[].
// NOTE(review): the class declaration in this listing also shows `buckets` as
// a std::vector<uint8_t>, so this delete[] presumably belongs to the
// pre-change version - confirm against the commit.
CardinalityCounter::~CardinalityCounter()
{
delete [] buckets;
}
// Computes the rank value derived from a 64-bit hash.
// NOTE(review): the listing shows two versions without +/- markers - one that
// divides by m and loops while the value is even, and one that shifts right by
// p and uses fls(); confirm which lines survive the commit.
uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
{
uint8_t answer = 0;
hash_modified = (uint64)(hash_modified / m);
hash_modified *= 2;
// If hash_modified is 0 at this point, every halving leaves it 0 (and even),
// so this loop never terminates.
do {
hash_modified = (uint64)(hash_modified / 2);
answer++;
} while ( hash_modified % 2 == 0);
// Discard the low p bits (p is documented in the class as the log2 of m).
hash_modified = hash_modified >> p;
// NOTE(review): the BSD fls() takes an int, while hash_modified is 64 bits
// wide - verify the argument is not truncated (flsll() or an equivalent may
// be needed).
int answer = 64 - p - fls(hash_modified) + 1;
return answer;
}
void CardinalityCounter::AddElement(uint64 hash)
void CardinalityCounter::AddElement(uint64_t hash)
{
uint64 index = hash % m;
uint64_t index = hash % m;
hash = hash-index;
if( buckets[index] == 0 )
@ -114,6 +133,9 @@ void CardinalityCounter::AddElement(uint64 hash)
if ( temp > buckets[index] )
buckets[index] = temp;
if ( buckets[index] == 0 )
V++;
}
/**
@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
if ( m != c->GetM() )
return false;
uint8_t* temp = c->GetBuckets();
const vector<uint8_t> temp = c->GetBuckets();
V = 0;
@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
return true;
}
// Accessor for the bucket storage; returns the `buckets` member.
// NOTE(review): two signatures are listed (raw pointer and const vector
// reference) - presumably the old and new form in this diff; confirm.
uint8_t* CardinalityCounter::GetBuckets()
const vector<uint8_t> &CardinalityCounter::GetBuckets() const
{
return buckets;
}
// Returns m, the number of buckets.
// NOTE(review): two signatures are listed (uint64 and uint64_t) - presumably
// the old and new spelling in this diff; confirm.
uint64 CardinalityCounter::GetM() const
uint64_t CardinalityCounter::GetM() const
{
return m;
}
@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const
CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
{
uint64_t m;
uint64 V;
uint64_t V;
double alpha_m;
bool valid = true;
@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);
uint8_t* buckets = c->buckets;
vector<uint8_t>& buckets = c->buckets;
for ( unsigned int i = 0; i < m; i++ )
{
char c;
valid &= UNSERIALIZE(&c);
buckets[i] = (uint8)c;
buckets[i] = (uint8_t)c;
}
if ( ! valid )

View file

@ -28,13 +28,18 @@ public:
*
* @param confidence confidence of the error. Default: 0.95
*/
CardinalityCounter(double error_margin, double confidence = 0.95);
explicit CardinalityCounter(double error_margin, double confidence = 0.95);
/**
* Copy-Constructor
*/
CardinalityCounter(CardinalityCounter& other);
/**
* Move-Constructor
*/
CardinalityCounter(CardinalityCounter&& o);
/**
* Constructor for a known number of buckets.
*
@ -43,7 +48,7 @@ public:
*
* @param size number of buckets to create
*/
CardinalityCounter(uint64 size);
explicit CardinalityCounter(uint64_t size);
/**
* Destructor.
@ -58,7 +63,7 @@ public:
*
* @param hash 64-bit hash value of the element to be added
*/
void AddElement(uint64 hash);
void AddElement(uint64_t hash);
/**
* Get the current estimated number of elements in the data
@ -104,7 +109,7 @@ protected:
*
* @return Number of buckets
*/
uint64 GetM() const;
uint64_t GetM() const;
/**
* Returns the buckets array that holds all of the rough cardinality
@ -114,21 +119,21 @@ protected:
*
* @return Array containing cardinality estimates
*/
uint8_t* GetBuckets();
const std::vector<uint8_t> &GetBuckets() const;
private:
/**
* Constructor used when unserializing, i.e., all parameters are
* known.
*/
CardinalityCounter(uint64 size, uint64 V, double alpha_m);
explicit CardinalityCounter(uint64_t size, uint64_t V, double alpha_m);
/**
* Helper function with code used jointly by multiple constructors.
*
* @param arg_size: number of buckets that need to be kept
*/
void Init(uint64 arg_size);
void Init(uint64_t arg_size);
/**
* This function calculates the smallest value of b that will
@ -158,14 +163,14 @@ private:
*
* @returns index of first one-bit
*/
uint8_t Rank(uint64 hash_modified) const;
uint8_t Rank(uint64_t hash_modified) const;
/**
* This is the number of buckets that will be stored. The standard
* error is 1.04/sqrt(m), so the actual cardinality will be the
* estimate +/- 1.04/sqrt(m) with approximately 68% probability.
*/
uint64 m;
uint64_t m;
/**
* These are the actual buckets that are storing an estimate of the
@ -173,7 +178,7 @@ private:
* appears in the bitstring and that location is at most 65, so not
* that many bits are needed to store it.
*/
uint8_t* buckets;
std::vector<uint8_t> buckets;
/**
* There are some state constants that need to be kept track of to
@ -181,8 +186,9 @@ private:
* buckets that are 0 and this is used in the small error correction.
* alpha_m is a multiplicative constant used in the algorithm.
*/
uint64 V;
uint64_t V;
double alpha_m;
int p; // the log2 of m
};
}