mirror of
https://github.com/zeek/zeek.git
synced 2025-10-04 15:48:19 +00:00
update hll documentation, make a few functions private and create
a new copy constructor.
This commit is contained in:
parent
eb1d7ccc4a
commit
c0f780c728
3 changed files with 108 additions and 63 deletions
|
@ -15,6 +15,9 @@ int CardinalityCounter::OptimalB(double error, double confidence)
|
||||||
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
|
||||||
int answer = (int) floor(initial_estimate);
|
int answer = (int) floor(initial_estimate);
|
||||||
|
|
||||||
|
// k is the number of standard deviations that we have to go to have
|
||||||
|
// a confidence level of conf.
|
||||||
|
|
||||||
double k = 0;
|
double k = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
|
@ -54,6 +57,12 @@ void CardinalityCounter::Init(uint64 size)
|
||||||
V = m;
|
V = m;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
|
||||||
|
{
|
||||||
|
Init(other.GetM());
|
||||||
|
Merge(&other);
|
||||||
|
}
|
||||||
|
|
||||||
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
|
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
|
||||||
{
|
{
|
||||||
int b = OptimalB(error_margin, confidence);
|
int b = OptimalB(error_margin, confidence);
|
||||||
|
@ -107,7 +116,16 @@ void CardinalityCounter::AddElement(uint64 hash)
|
||||||
buckets[index] = temp;
|
buckets[index] = temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
double CardinalityCounter::Size()
|
/**
|
||||||
|
* Estimate the size by using the "raw" HyperLogLog estimate. Then,
|
||||||
|
* check if it's too "large" or "small" because the raw estimate doesn't
|
||||||
|
* do well in those cases.
|
||||||
|
* Thus, we correct for those errors as specified in the paper.
|
||||||
|
*
|
||||||
|
* Note - we deviate from the HLL algorithm in the paper here, because
|
||||||
|
* of our 64-bit hashes.
|
||||||
|
**/
|
||||||
|
double CardinalityCounter::Size() const
|
||||||
{
|
{
|
||||||
double answer = 0;
|
double answer = 0;
|
||||||
for ( unsigned int i = 0; i < m; i++ )
|
for ( unsigned int i = 0; i < m; i++ )
|
||||||
|
@ -126,8 +144,11 @@ double CardinalityCounter::Size()
|
||||||
return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
|
return -pow(2, 64) * log(1 - (answer / pow(2, 64)));
|
||||||
}
|
}
|
||||||
|
|
||||||
void CardinalityCounter::Merge(CardinalityCounter* c)
|
bool CardinalityCounter::Merge(CardinalityCounter* c)
|
||||||
{
|
{
|
||||||
|
if ( m != c->GetM() )
|
||||||
|
return false;
|
||||||
|
|
||||||
uint8_t* temp = c->GetBuckets();
|
uint8_t* temp = c->GetBuckets();
|
||||||
|
|
||||||
V = 0;
|
V = 0;
|
||||||
|
@ -140,6 +161,8 @@ void CardinalityCounter::Merge(CardinalityCounter* c)
|
||||||
if ( buckets[i] == 0 )
|
if ( buckets[i] == 0 )
|
||||||
++V;
|
++V;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t* CardinalityCounter::GetBuckets()
|
uint8_t* CardinalityCounter::GetBuckets()
|
||||||
|
@ -147,7 +170,7 @@ uint8_t* CardinalityCounter::GetBuckets()
|
||||||
return buckets;
|
return buckets;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64 CardinalityCounter::GetM()
|
uint64 CardinalityCounter::GetM() const
|
||||||
{
|
{
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,81 +9,78 @@
|
||||||
namespace probabilistic {
|
namespace probabilistic {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A probabilisitc cardinality counter using the HyperLogLog algorithm.
|
* A probabilistic cardinality counter using the HyperLogLog algorithm.
|
||||||
*
|
|
||||||
* TODO: Update doc string.
|
|
||||||
*/
|
*/
|
||||||
class CardinalityCounter {
|
class CardinalityCounter {
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Constructor.
|
* Constructor.
|
||||||
*
|
*
|
||||||
* Based on the error_margin, the number of buckets that need to be
|
* The number of buckets of the data structure is determined using
|
||||||
* kept will be determined. Based on the max_size, the number of bits
|
* the error margin and the given confidence.
|
||||||
* that will be used from the hash function will be determined.
|
|
||||||
*
|
*
|
||||||
* We need the hash function to return integers that are uniformly
|
* For example, assume an error_margin of 2% and a confidence
|
||||||
* distributed from 0 to 2^L-1. And if that happens, the maximum
|
* of 95%. If the Size function returns an estimate of 100, this
|
||||||
* cardinality that this counter can handle is approximately 2^L. By
|
* means that we are 95% sure that the cardinality is between 98
|
||||||
* default, we will assume a value of 64 bits.
|
* and 102.
|
||||||
*
|
*
|
||||||
* Confidence in the estimate given by a cardinality counter is.
|
* @param error_margin error margin
|
||||||
*
|
*
|
||||||
* In other words, if the cardinality is estimated to be 100 with 2%
|
* @param confidence confidence of the error. Default: 0.95
|
||||||
* error margin and HLL_CONFis 0.95, then we are 95% sure that the
|
|
||||||
* actual cardinality is between 98 and 102.
|
|
||||||
*/
|
*/
|
||||||
CardinalityCounter(double error_margin, double confidence = 0.95);
|
CardinalityCounter(double error_margin, double confidence = 0.95);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor used for cloning.
|
* Copy-Constructor
|
||||||
|
*/
|
||||||
|
CardinalityCounter(CardinalityCounter& other);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a known number of buckets.
|
||||||
*
|
*
|
||||||
* The error margin will be 1.04/sqrt(m) with approximately 68%
|
* The error margin is 1.04/sqrt(size) with approximately 68%
|
||||||
* probability.
|
* probability.
|
||||||
|
*
|
||||||
|
* @param size number of buckets to create
|
||||||
*/
|
*/
|
||||||
CardinalityCounter(uint64 size);
|
CardinalityCounter(uint64 size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Deletes the class variables.
|
* Destructor.
|
||||||
*/
|
*/
|
||||||
~CardinalityCounter();
|
~CardinalityCounter();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This will add an element to the counter. It's responsible for
|
* Add a new element to the counter.
|
||||||
* adding an element and updating the value of V, if that applies.
|
*
|
||||||
|
* The hash function generating the hashes needs to be uniformly
|
||||||
|
* distributed over 64 bits.
|
||||||
|
*
|
||||||
|
* @param hash 64-bit hash value of the element to be added
|
||||||
*/
|
*/
|
||||||
void AddElement(uint64 hash);
|
void AddElement(uint64 hash);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the size estimate of the set. First, it has the "raw"
|
* Get the current estimated number of elements in the data
|
||||||
* HyperLogLog estimate. And then, we check if it's too "large" or
|
* structure
|
||||||
* "small" because the raw estimate doesn't do well in those cases.
|
*
|
||||||
* Thus, we correct for those errors as specified in the paper.
|
* @return Estimated number of elements
|
||||||
*/
|
**/
|
||||||
double Size();
|
double Size() const;
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the buckets array that holds all of the rough cardinality
|
|
||||||
* estimates.
|
|
||||||
*/
|
|
||||||
uint8_t* GetBuckets();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Merges the argument cardinality counter with this one. The error
|
* Merges the argument cardinality counter with this one. The error
|
||||||
* margins are assumed to be the same, so they have the same number of
|
* margins of both counters have to be the same, otherwise the merge
|
||||||
* buckets. If any of the conditions are violated, then the return
|
* operation will not be carried out.
|
||||||
* value of size() is meaningless.
|
*
|
||||||
|
* @param c Cardinality counter to merge into the current counter.
|
||||||
|
*
|
||||||
|
* @return True if successful
|
||||||
*/
|
*/
|
||||||
void Merge(CardinalityCounter* c);
|
bool Merge(CardinalityCounter* c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the value of m. Should be used only for statistical
|
* Serializes the cardinality counter.
|
||||||
* purposes.
|
|
||||||
*/
|
|
||||||
uint64 GetM();
|
|
||||||
|
|
||||||
/**
|
|
||||||
c * Serializes the cardinality counter.
|
|
||||||
*
|
*
|
||||||
* @param info The serialization information to use.
|
* @param info The serialization information to use.
|
||||||
*
|
*
|
||||||
|
@ -101,6 +98,24 @@ c * Serializes the cardinality counter.
|
||||||
*/
|
*/
|
||||||
static CardinalityCounter* Unserialize(UnserialInfo* info);
|
static CardinalityCounter* Unserialize(UnserialInfo* info);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
/**
|
||||||
|
* Return the number of buckets.
|
||||||
|
*
|
||||||
|
* @return Number of buckets
|
||||||
|
*/
|
||||||
|
uint64 GetM() const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the buckets array that holds all of the rough cardinality
|
||||||
|
* estimates.
|
||||||
|
*
|
||||||
|
* Use GetM() to determine the size.
|
||||||
|
*
|
||||||
|
* @return Array containing cardinality estimates
|
||||||
|
*/
|
||||||
|
uint8_t* GetBuckets();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/**
|
/**
|
||||||
* Constructor used when unserializing, i.e., all parameters are
|
* Constructor used when unserializing, i.e., all parameters are
|
||||||
|
@ -110,31 +125,38 @@ private:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper function with code used jointly by multiple constructors.
|
* Helper function with code used jointly by multiple constructors.
|
||||||
|
*
|
||||||
|
* @param arg_size: number of buckets that need to be kept
|
||||||
*/
|
*/
|
||||||
void Init(uint64 arg_size);
|
void Init(uint64 arg_size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function will calculate the smallest value of b that will
|
* This function calculates the smallest value of b that will
|
||||||
* satisfy the constraints of a specified error margin and
|
* satisfy the constraints of a specified error margin and
|
||||||
* confidence level.
|
* confidence level.
|
||||||
*
|
*
|
||||||
* The exact expression for b is as follows:
|
* The exact expression for b is as follows:
|
||||||
* Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x
|
* Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
|
||||||
*
|
*
|
||||||
* error is the error margin.
|
* After that initial estimate, the value of b is increased until the
|
||||||
|
* standard deviation falls within the specified value.
|
||||||
*
|
*
|
||||||
* k is the number of standard deviations that we have to go to have
|
* @param error error margin
|
||||||
* a confidence level of conf.
|
|
||||||
*
|
*
|
||||||
* confidence: TODO.
|
* @param confidence confidence of the error
|
||||||
|
*
|
||||||
|
* @return minimal B-value satisfying the error-rate under confidence.
|
||||||
*/
|
*/
|
||||||
int OptimalB(double error, double confidence);
|
int OptimalB(double error, double confidence);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes when the first one appears in the element. It looks at the
|
* Determines at which index (counted from the back) the first one-bit
|
||||||
* bitstring from the end though. A precondition is that the argument
|
* appears. The last b bits have to be 0 (the element has to be divisible
|
||||||
* is already divisible by m, so we just ignore the last b bits, since
|
* by m), hence they are ignored.
|
||||||
* m = 2^b and the last b bits will always be 0.
|
*
|
||||||
|
* @param hash_modified hash value
|
||||||
|
*
|
||||||
|
* @returns index of first one-bit
|
||||||
*/
|
*/
|
||||||
uint8_t Rank(uint64 hash_modified);
|
uint8_t Rank(uint64 hash_modified);
|
||||||
|
|
||||||
|
|
|
@ -91,7 +91,12 @@ function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: op
|
||||||
CardinalityCounter* h1 = v1->Get();
|
CardinalityCounter* h1 = v1->Get();
|
||||||
CardinalityCounter* h2 = v2->Get();
|
CardinalityCounter* h2 = v2->Get();
|
||||||
|
|
||||||
h1->Merge(h2);
|
bool res = h1->Merge(h2);
|
||||||
|
if ( res == false )
|
||||||
|
{
|
||||||
|
reporter->Error("Carinality counters with different parameters cannot be merged");
|
||||||
|
return new Val(0, TYPE_BOOL);
|
||||||
|
}
|
||||||
|
|
||||||
return new Val(1, TYPE_BOOL);
|
return new Val(1, TYPE_BOOL);
|
||||||
%}
|
%}
|
||||||
|
@ -126,12 +131,7 @@ function hll_cardinality_copy%(handle: opaque of cardinality%): opaque of cardin
|
||||||
%{
|
%{
|
||||||
CardinalityVal* cv = static_cast<CardinalityVal*>(handle);
|
CardinalityVal* cv = static_cast<CardinalityVal*>(handle);
|
||||||
CardinalityCounter* h = cv->Get();
|
CardinalityCounter* h = cv->Get();
|
||||||
|
CardinalityCounter* h2 = new CardinalityCounter(*h);
|
||||||
uint64_t m = h->GetM();
|
|
||||||
CardinalityCounter* h2 = new CardinalityCounter(m);
|
|
||||||
|
|
||||||
int i = 0;
|
|
||||||
h2->Merge(h);
|
|
||||||
CardinalityVal* out = new CardinalityVal(h2);
|
CardinalityVal* out = new CardinalityVal(h2);
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue