mirror of
https://github.com/zeek/zeek.git
synced 2025-10-04 23:58:20 +00:00
Hyperloglog: change calculation of Rho
This commit changes the calculation of the rho-value to be in line with the implementation of the original research paper, counting the number of zero bits before the data. This also fixes an infinite loop in case the hash value is 0. I also cleaned up the code a bit, converting the raw pointers that were used to an STL vector. Addresses BIT-1612.
This commit is contained in:
parent
151f9d6ced
commit
3aabe83ec6
2 changed files with 69 additions and 41 deletions
|
@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const
|
|||
return answer;
|
||||
}
|
||||
|
||||
void CardinalityCounter::Init(uint64 size)
|
||||
void CardinalityCounter::Init(uint64_t size)
|
||||
{
|
||||
m = size;
|
||||
buckets = new uint8_t[m];
|
||||
|
||||
// The following magic values are taken directly out of the
|
||||
// description of the HyperLogLog algorithm.
|
||||
|
@ -51,60 +50,80 @@ void CardinalityCounter::Init(uint64 size)
|
|||
else
|
||||
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
|
||||
|
||||
for ( uint64 i = 0; i < m; i++ )
|
||||
buckets[i] = 0;
|
||||
double calc_p = log2(m);
|
||||
if ( trunc(calc_p) != calc_p )
|
||||
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
|
||||
p = calc_p;
|
||||
|
||||
buckets.reserve(m);
|
||||
for ( uint64_t i = 0; i < m; i++ )
|
||||
buckets.push_back(0);
|
||||
|
||||
assert(buckets.size() == m);
|
||||
|
||||
V = m;
|
||||
}
|
||||
|
||||
CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
|
||||
: buckets(other.buckets)
|
||||
{
|
||||
Init(other.GetM());
|
||||
Merge(&other);
|
||||
V = other.V;
|
||||
alpha_m = other.alpha_m;
|
||||
m = other.m;
|
||||
p = other.p;
|
||||
}
|
||||
|
||||
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o)
|
||||
{
|
||||
V = o.V;
|
||||
alpha_m = o.alpha_m;
|
||||
m = o.m;
|
||||
p = o.p;
|
||||
|
||||
o.m = 0;
|
||||
buckets = std::move(o.buckets);
|
||||
}
|
||||
|
||||
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
|
||||
{
|
||||
int b = OptimalB(error_margin, confidence);
|
||||
Init((uint64) pow(2, b));
|
||||
|
||||
assert(b == p);
|
||||
}
|
||||
|
||||
CardinalityCounter::CardinalityCounter(uint64 size)
|
||||
CardinalityCounter::CardinalityCounter(uint64_t size)
|
||||
{
|
||||
Init(size);
|
||||
}
|
||||
|
||||
CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
|
||||
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
|
||||
{
|
||||
m = arg_size;
|
||||
buckets = new uint8_t[m];
|
||||
|
||||
buckets.reserve(m);
|
||||
for ( uint64_t i = 0; i < m; i++ )
|
||||
buckets.push_back(0);
|
||||
|
||||
alpha_m = arg_alpha_m;
|
||||
V = arg_V;
|
||||
p = log2(m);
|
||||
}
|
||||
|
||||
CardinalityCounter::~CardinalityCounter()
|
||||
{
|
||||
delete [] buckets;
|
||||
}
|
||||
|
||||
uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
|
||||
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
|
||||
{
|
||||
uint8_t answer = 0;
|
||||
|
||||
hash_modified = (uint64)(hash_modified / m);
|
||||
hash_modified *= 2;
|
||||
|
||||
do {
|
||||
hash_modified = (uint64)(hash_modified / 2);
|
||||
answer++;
|
||||
} while ( hash_modified % 2 == 0);
|
||||
|
||||
hash_modified = hash_modified >> p;
|
||||
int answer = 64 - p - fls(hash_modified) + 1;
|
||||
return answer;
|
||||
}
|
||||
|
||||
void CardinalityCounter::AddElement(uint64 hash)
|
||||
void CardinalityCounter::AddElement(uint64_t hash)
|
||||
{
|
||||
uint64 index = hash % m;
|
||||
uint64_t index = hash % m;
|
||||
hash = hash-index;
|
||||
|
||||
if( buckets[index] == 0 )
|
||||
|
@ -114,11 +133,14 @@ void CardinalityCounter::AddElement(uint64 hash)
|
|||
|
||||
if ( temp > buckets[index] )
|
||||
buckets[index] = temp;
|
||||
|
||||
if ( buckets[index] == 0 )
|
||||
V++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate the size by using the "raw" HyperLogLog estimate. Then,
|
||||
* check if it's too "large" or "small" because the raw estimate doesn't
|
||||
* check if it's too "large" or "small" because the raw estimate doesn't
|
||||
* do well in those cases.
|
||||
* Thus, we correct for those errors as specified in the paper.
|
||||
*
|
||||
|
@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
|
|||
if ( m != c->GetM() )
|
||||
return false;
|
||||
|
||||
uint8_t* temp = c->GetBuckets();
|
||||
const vector<uint8_t> temp = c->GetBuckets();
|
||||
|
||||
V = 0;
|
||||
|
||||
|
@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
|
|||
return true;
|
||||
}
|
||||
|
||||
uint8_t* CardinalityCounter::GetBuckets()
|
||||
const vector<uint8_t> &CardinalityCounter::GetBuckets() const
|
||||
{
|
||||
return buckets;
|
||||
}
|
||||
|
||||
uint64 CardinalityCounter::GetM() const
|
||||
uint64_t CardinalityCounter::GetM() const
|
||||
{
|
||||
return m;
|
||||
}
|
||||
|
@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const
|
|||
CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
|
||||
{
|
||||
uint64_t m;
|
||||
uint64 V;
|
||||
uint64_t V;
|
||||
double alpha_m;
|
||||
|
||||
bool valid = true;
|
||||
|
@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
|
|||
|
||||
CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);
|
||||
|
||||
uint8_t* buckets = c->buckets;
|
||||
vector<uint8_t>& buckets = c->buckets;
|
||||
|
||||
for ( unsigned int i = 0; i < m; i++ )
|
||||
{
|
||||
char c;
|
||||
valid &= UNSERIALIZE(&c);
|
||||
buckets[i] = (uint8)c;
|
||||
buckets[i] = (uint8_t)c;
|
||||
}
|
||||
|
||||
if ( ! valid )
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue