mirror of
https://github.com/zeek/zeek.git
synced 2025-10-10 02:28:21 +00:00
adapt to new structure
This commit is contained in:
parent
9e0fd963e0
commit
b7cdfc0e6e
9 changed files with 23 additions and 140 deletions
|
@ -10,9 +10,12 @@ set(probabilistic_SRCS
|
|||
BitVector.cc
|
||||
BloomFilter.cc
|
||||
CounterVector.cc
|
||||
Hasher.cc)
|
||||
Hasher.cc
|
||||
HyperLogLog.cc)
|
||||
|
||||
bif_target(bloom-filter.bif)
|
||||
set(BIF_OUTPUT_CC_SAVE ${BIF_OUTPUT_CC})
|
||||
bif_target(hyper-loglog.bif)
|
||||
|
||||
bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC})
|
||||
bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC_SAVE} ${BIF_OUTPUT_CC})
|
||||
add_dependencies(bro_probabilistic generate_outputs)
|
||||
|
|
140
src/probabilistic/HyperLogLog.cc
Normal file
140
src/probabilistic/HyperLogLog.cc
Normal file
|
@ -0,0 +1,140 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include "HyperLogLog.h"
|
||||
#include <iostream>
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
int CardinalityCounter::optimalB(double error)
|
||||
{
|
||||
double initial_estimate = 2*(log(1.04)-log(error))/log(2);
|
||||
int answer = (int) floor(initial_estimate);
|
||||
double k;
|
||||
|
||||
do
|
||||
{
|
||||
answer++;
|
||||
k = pow(2, (answer - initial_estimate)/2);
|
||||
}
|
||||
while (erf(k/sqrt(2)) < HLL_CONF);
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
/*
 * Builds a counter with exactly `size` buckets, all of them empty.
 *
 * size: the number of buckets (m) to allocate.
 */
CardinalityCounter::CardinalityCounter(uint64_t size)
	: m(size), buckets(new uint8_t[size]()), V(size)
	{
	// The trailing "()" value-initializes the array, so every bucket starts at
	// 0 and the count of empty buckets (V) equals the number of buckets.

	// Bias-correction constant: fixed values for the three smallest standard
	// sizes, the closed-form expression for everything else.
	switch ( m )
		{
		case 16:
			alpha_m = 0.673;
			break;

		case 32:
			alpha_m = 0.697;
			break;

		case 64:
			alpha_m = 0.709;
			break;

		default:
			alpha_m = 0.7213/(1+1.079/m);
			break;
		}
	}
|
||||
|
||||
/*
 * Builds a counter whose number of buckets is derived from the requested
 * error margin: m = 2^b with b = optimalB(error_margin).
 *
 * error_margin: the tolerated relative error of the estimate.
 *
 * The exponent is clamped to [0, 63]. Without the lower bound a large error
 * margin yields a negative exponent and therefore m == 0, which makes the
 * "hash % m" in addElement() divide by zero. Without the upper bound 2^b does
 * not fit into a uint64_t.
 */
CardinalityCounter::CardinalityCounter(double error_margin)
	{
	int b = optimalB(error_margin);

	if ( b < 0 )
		b = 0;
	else if ( b > 63 )
		b = 63;

	// Exact power of two without a round trip through floating point.
	m = static_cast<uint64_t>(1) << b;

	// The trailing "()" value-initializes the array: every bucket starts at 0.
	buckets = new uint8_t[m]();

	// Bias-correction constant: fixed values for the three smallest standard
	// sizes, the closed-form expression for everything else.
	if ( m == 16 )
		alpha_m = 0.673;
	else if ( m == 32 )
		alpha_m = 0.697;
	else if ( m == 64 )
		alpha_m = 0.709;
	else
		alpha_m = 0.7213/(1+1.079/m);

	// All buckets are empty at this point.
	V = m;
	}
|
||||
|
||||
/*
 * Releases the bucket array that the constructors allocated with new[].
 */
CardinalityCounter::~CardinalityCounter()
	{
	delete[] this->buckets;
	}
|
||||
|
||||
uint8_t CardinalityCounter::rank(uint64_t hash_modified)
|
||||
{
|
||||
uint8_t answer = 0;
|
||||
hash_modified = (uint64_t)(hash_modified/m);
|
||||
hash_modified *= 2;
|
||||
do
|
||||
{
|
||||
hash_modified = (uint64_t) (hash_modified/2);
|
||||
answer++;
|
||||
}
|
||||
while (hash_modified%2 == 0);
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
void CardinalityCounter::addElement(uint64_t hash)
|
||||
{
|
||||
uint64_t index = hash % m;
|
||||
hash = hash-index;
|
||||
|
||||
if(buckets[index] == 0)
|
||||
V--;
|
||||
|
||||
uint8_t temp = rank(hash);
|
||||
|
||||
if (temp > buckets[index])
|
||||
buckets[index] = temp;
|
||||
}
|
||||
|
||||
double CardinalityCounter::size()
|
||||
{
|
||||
double answer = 0;
|
||||
for (unsigned int i = 0; i < m; i++)
|
||||
answer += pow(2, -(int)buckets[i]);
|
||||
|
||||
answer = 1/answer;
|
||||
answer = alpha_m*m*m*answer;
|
||||
|
||||
if (answer <= 5*(double)(m/2))
|
||||
return m*log((double) m/V);
|
||||
else if(answer <= pow(2,64)/30)
|
||||
return answer;
|
||||
else
|
||||
return -pow(2,64)*log(1-answer/pow(2,64));
|
||||
}
|
||||
|
||||
/*
 * Merges another counter into this one by taking the bucket-wise maximum and
 * recounting the empty buckets.
 *
 * c: the counter to merge in. It must have the same number of buckets as this
 * counter. A null pointer or a counter with a different bucket count is
 * ignored and leaves this counter unchanged; reading a smaller bucket array up
 * to this counter's m would run past its end.
 */
void CardinalityCounter::merge(CardinalityCounter* c)
	{
	if ( ! c || c->getM() != m )
		return;

	const uint8_t* other = c->getBuckets();
	uint64_t empty = 0;

	// The index must be as wide as m; a 32-bit index could never reach a
	// bucket count above 2^32.
	for ( uint64_t i = 0; i < m; i++ )
		{
		if ( other[i] > buckets[i] )
			buckets[i] = other[i];

		if ( buckets[i] == 0 )
			++empty;
		}

	V = empty;
	}
|
||||
|
||||
uint8_t* CardinalityCounter::getBuckets()
|
||||
{
|
||||
return buckets;
|
||||
}
|
||||
|
||||
uint64_t CardinalityCounter::getM()
|
||||
{
|
||||
return m;
|
||||
}
|
125
src/probabilistic/HyperLogLog.h
Normal file
125
src/probabilistic/HyperLogLog.h
Normal file
|
@ -0,0 +1,125 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#ifndef hyperloglog_h
|
||||
#define hyperloglog_h
|
||||
|
||||
#include <stdint.h>
|
||||
#include <OpaqueVal.h>
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
/*
|
||||
* "conf" is how confident the estimate given by the counter is.
|
||||
*
|
||||
* In other words, if the cardinality is estimated to be 100 with 2% error margin and HLL_CONF is
|
||||
* 0.95, then we are 95% sure that the actual cardinality is between 98 and 102.
|
||||
*/
|
||||
#define HLL_CONF .95
|
||||
|
||||
|
||||
/*
 * A HyperLogLog cardinality counter: estimates the number of distinct
 * elements in a stream from the 64-bit hashes of those elements.
 *
 * The counter owns a heap-allocated bucket array and frees it in its
 * destructor. Copying is therefore disabled (see the end of the class); a
 * member-wise copy would make two objects free the same array.
 */
class CardinalityCounter {
	friend class CardinalityVal;

private:
	/*
	 * This is the number of buckets that will be stored. The standard error is 1.04/sqrt(m), so the
	 * actual cardinality will be the estimate +/- 1.04/sqrt(m) (relative) with approximately 68%
	 * probability.
	 */
	uint64_t m;

	/*
	 * These are the actual buckets that are storing an estimate of the cardinality. All these need to
	 * do is record where the first 1 bit appears in the bitstring. That location is at most 65, so
	 * one byte per bucket is enough to store it.
	 */
	uint8_t* buckets;

	/*
	 * Some state is kept alongside the buckets to make the final estimate easier.
	 * V is the number of buckets that are still 0; it is used in the small range correction.
	 * alpha_m is the multiplicative bias-correction constant of the algorithm, chosen from m.
	 */
	uint64_t V;
	double alpha_m;

	/*
	 * This function calculates the smallest value of b that satisfies the constraints of
	 * a specified error margin and the confidence level HLL_CONF.
	 *
	 * The exact expression for b is as follows:
	 * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
	 *
	 * error is the error margin.
	 * k is the number of standard deviations that we have to go to have a confidence level of
	 * HLL_CONF.
	 */
	int optimalB(double error);

	/*
	 * Computes where the first 1 bit appears in the element, counting from the least significant
	 * end of the bitstring (1-based).
	 * A precondition is that the argument is already divisible by m, so the last b bits are simply
	 * dropped: with m = 2^b those bits are always 0.
	 */
	uint8_t rank(uint64_t hash_modified);

public:
	/*
	 * Creates a counter with exactly "size" buckets. This will be used when cloning. The error
	 * margin will be 1.04/sqrt(m) with approximately 68% probability.
	 *
	 * Note that an argument of type int matches this constructor and the one taking a double
	 * equally well, so callers have to pass a uint64_t or a double explicitly.
	 */
	CardinalityCounter(uint64_t size);

	/*
	 * Creates a counter for a given error margin. Based on error_margin, the number of buckets
	 * that need to be kept is determined.
	 *
	 * We need the hash function to return integers that are uniformly distributed from 0 to 2^L-1.
	 * If that holds, the maximum cardinality that this counter can handle is approximately 2^L.
	 * The implementation assumes L = 64 bits.
	 */
	CardinalityCounter(double error_margin);

	/*
	 * Frees the bucket array.
	 */
	~CardinalityCounter();

	/*
	 * This will add an element to the counter. It updates the bucket selected by the hash and
	 * the value of V, if that applies.
	 */
	void addElement(uint64_t hash);

	/*
	 * Returns the size estimate of the set. First, it computes the "raw" HyperLogLog estimate.
	 * Then it checks whether that estimate is too "large" or too "small", because the raw estimate
	 * doesn't do well in those cases, and corrects for those errors as specified in the paper.
	 */
	double size();

	/*
	 * Returns the buckets array that holds all of the rough cardinality estimates. The array has
	 * getM() entries and remains owned by this counter.
	 */
	uint8_t* getBuckets();

	/*
	 * Merges the argument cardinality counter with this one. The error margins are assumed to be
	 * the same, so they have the same number of buckets. If any of the conditions are violated,
	 * then the return value of size() is meaningless.
	 */
	void merge(CardinalityCounter* c);

	/*
	 * Returns the value of m. Should be used only for statistical purposes.
	 */
	uint64_t getM();

private:
	/*
	 * Copying is disabled: the implicitly generated versions would copy the "buckets" pointer,
	 * and both objects would then delete[] the same array. Declared but intentionally not
	 * implemented.
	 */
	CardinalityCounter(const CardinalityCounter&);
	CardinalityCounter& operator=(const CardinalityCounter&);
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue