mirror of
https://github.com/zeek/zeek.git
synced 2025-10-13 12:08:20 +00:00
Added the HyperLogLog files to the repository, and the size method works as well. The add method has an error with the hashkeys thus far and no other methods outside of init and size have been tested yet.
This commit is contained in:
parent
a376f2244e
commit
bbaa35434b
4 changed files with 250 additions and 2 deletions
29
mytests.bro
Normal file
29
mytests.bro
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
# Smoke test for the hll_cardinality_* built-ins: sets up one counter, feeds it
# thirteen distinct values and prints the resulting estimate.
event bro_init()
    {
    # Names used to refer to counters. Only the first one is used below.
    local counter_name = "measurement1";
    local unused_name = "measurement2";

    # NOTE(review): 0.01 is presumably the requested error margin -- confirm
    # against the built-in's definition.
    hll_cardinality_init(0.01, counter_name);

    # Two strings and one count, to exercise more than one element type.
    hll_cardinality_add("hey", counter_name);
    hll_cardinality_add("hi", counter_name);
    hll_cardinality_add(123, counter_name);

    # Ten more distinct single-letter strings.
    hll_cardinality_add("a", counter_name);
    hll_cardinality_add("b", counter_name);
    hll_cardinality_add("c", counter_name);
    hll_cardinality_add("d", counter_name);
    hll_cardinality_add("e", counter_name);
    hll_cardinality_add("f", counter_name);
    hll_cardinality_add("g", counter_name);
    hll_cardinality_add("h", counter_name);
    hll_cardinality_add("i", counter_name);
    hll_cardinality_add("j", counter_name);

    local estimate = hll_cardinality_estimate(counter_name);
    print estimate;
    }
|
113
src/HyperLogLog.cc
Normal file
113
src/HyperLogLog.cc
Normal file
|
@ -0,0 +1,113 @@
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "HyperLogLog.h"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
int CardinalityCounter::optimalB(double error){
|
||||||
|
double initial_estimate = 2*(log(1.04)-log(error))/log(2);
|
||||||
|
int answer = (int) floor(initial_estimate);
|
||||||
|
double k;
|
||||||
|
|
||||||
|
do{
|
||||||
|
answer++;
|
||||||
|
k = pow(2, (answer - initial_estimate)/2);
|
||||||
|
}while(erf(k/sqrt(2)) < conf);
|
||||||
|
|
||||||
|
return answer;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Builds an empty counter for the requested error margin.
 *
 * The bucket count m is 2^b, with b supplied by optimalB(). All buckets start at 0, so the
 * number of empty buckets V starts out equal to m. alpha_m is the multiplicative constant
 * used by size(): fixed values for m = 16, 32 and 64, and 0.7213/(1 + 1.079/m) otherwise.
 */
CardinalityCounter::CardinalityCounter(double error_margin)
{
    const int b = optimalB(error_margin);

    m = (uint64_t) pow(2, b);
    V = m;

    // The trailing "()" value-initialises the array, i.e. every bucket is 0.
    buckets = new uint8_t[m]();

    switch (m)
    {
    case 16:
        alpha_m = 0.673;
        break;
    case 32:
        alpha_m = 0.697;
        break;
    case 64:
        alpha_m = 0.709;
        break;
    default:
        alpha_m = 0.7213/(1+1.079/m);
        break;
    }
}
|
||||||
|
|
||||||
|
/*
 * Releases the bucket array, which is the only storage this class allocates with new.
 *
 * m, V and alpha_m are ordinary data members that live inside the object itself; they must
 * not be passed to delete (doing so is undefined behaviour), so nothing else is freed here.
 */
CardinalityCounter::~CardinalityCounter()
{
    delete [] buckets;
}
|
||||||
|
|
||||||
|
/*
 * Returns the 1-based position of the lowest set bit of hash_modified once the low index bits
 * have been discarded (the value is divided by m, so with m = 2^b the last b bits are ignored).
 *
 * If no bit is set in the remaining part, the result is one more than the number of remaining
 * bits (64 - b + 1). The scan is bounded by that number of bits, so an all-zero remainder can
 * no longer make the loop spin forever.
 */
uint8_t CardinalityCounter::rank(uint64_t hash_modified)
{
    uint64_t remaining = hash_modified / m;

    // All-ones value with exactly as many bits as `remaining` can occupy;
    // shifting it alongside `remaining` tells us when every bit has been examined.
    uint64_t unseen = ~static_cast<uint64_t>(0) / m;

    uint8_t answer = 1;
    while (unseen != 0 && (remaining & 1) == 0)
    {
        remaining >>= 1;
        unseen >>= 1;
        ++answer;
    }

    return answer;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void CardinalityCounter::addElement(uint64_t hash){
|
||||||
|
uint64_t index = hash % m;
|
||||||
|
hash = hash-index;
|
||||||
|
|
||||||
|
if(buckets[index] == 0)
|
||||||
|
V--;
|
||||||
|
uint8_t temp = rank(hash);
|
||||||
|
if(temp > buckets[index]){
|
||||||
|
buckets[index] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double CardinalityCounter::size(){
|
||||||
|
double answer = 0;
|
||||||
|
for(int i = 0; i < m; i++){
|
||||||
|
answer += pow(2, -(int)buckets[i]);
|
||||||
|
}
|
||||||
|
answer = 1/answer;
|
||||||
|
answer = alpha_m*m*m*answer;
|
||||||
|
|
||||||
|
if(answer <= 5*(double)(m/2)){
|
||||||
|
return m*log((double) m/V);
|
||||||
|
}
|
||||||
|
else if(answer <= pow(2,64)/30){
|
||||||
|
return answer;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
return -pow(2,64)*log(1-answer/pow(2,64));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Folds the counter c into this one: every bucket takes the larger of the two values, and V
 * is recomputed as the number of buckets that are still 0 afterwards.
 *
 * Both counters must have the same number of buckets. If c is null or its bucket count
 * differs from this counter's, nothing is merged and this counter is left unchanged; reading
 * m entries from a smaller array would run past its end.
 */
void CardinalityCounter::merge(CardinalityCounter* c)
{
    if (!c || c->getM() != m)
        return;

    const uint8_t* other = c->getBuckets();

    V = 0;
    for (uint64_t i = 0; i < m; i++)
    {
        if (other[i] > buckets[i])
            buckets[i] = other[i];

        if (buckets[i] == 0)
            V += 1;
    }
}
|
||||||
|
|
||||||
|
uint8_t* CardinalityCounter::getBuckets(){
|
||||||
|
return buckets;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t CardinalityCounter::getM(){
|
||||||
|
return m;
|
||||||
|
}
|
105
src/HyperLogLog.h
Normal file
105
src/HyperLogLog.h
Normal file
|
@ -0,0 +1,105 @@
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* "conf" is how confident the estimate given by the counter is.
|
||||||
|
*
|
||||||
|
* In other words, if the cardinality is estimated to be 100 with 2% error margin and conf is
|
||||||
|
* 0.95, then we are 95% sure that the actual cardinality is between 98 and 102.
|
||||||
|
*/
|
||||||
|
#define conf .95
|
||||||
|
|
||||||
|
/*
 * HyperLogLog-style cardinality counter: estimates the number of distinct elements added to
 * it from 64-bit hash values, using a fixed array of small per-bucket counters.
 */
class CardinalityCounter {

private:
    /*
     * This is the number of buckets that will be stored. The standard error is 1.04/sqrt(m),
     * so the actual cardinality will be within a relative error of 1.04/sqrt(m) of the
     * estimate with approximately 68% probability.
     */
    uint64_t m;

    /*
     * These are the actual buckets that are storing an estimate of the cardinality. All these
     * need to do is count when the first 1 bit appears in the bitstring, and that location is
     * at most 65, so not that many bits are needed to store it.
     */
    uint8_t* buckets;

    /*
     * There are some state constants that need to be kept track of to make the final estimate
     * easier. V is the number of values in buckets that are 0, and this is used in the small
     * error correction. alpha_m is a multiplicative constant used in the algorithm.
     */
    uint64_t V;
    double alpha_m;

    /*
     * Copying is disabled: both operations are declared here but intentionally never defined.
     * The destructor frees the buckets array, so a member-wise copy would leave two objects
     * freeing the same array.
     */
    CardinalityCounter(const CardinalityCounter&);
    CardinalityCounter& operator=(const CardinalityCounter&);

    /*
     * This function will calculate the smallest value of b that will satisfy the constraints
     * of a specified error margin and confidence level.
     *
     * The exact expression for b is as follows:
     * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
     *
     * error is the error margin.
     * k is the number of standard deviations that we have to go to have a confidence level
     * of conf.
     */
    int optimalB(double error);

    /*
     * Computes when the first one appears in the element. It looks at the bitstring from the
     * end though. A precondition is that the argument is already divisible by m, so we just
     * ignore the last b bits, since m = 2^b and the last b bits will always be 0.
     */
    uint8_t rank(uint64_t hash_modified);

public:
    /*
     * This will initialize the cardinality counter. Based on the error_margin, the number of
     * buckets that need to be kept will be determined.
     *
     * We need the hash function to return integers that are uniformly distributed from 0 to
     * 2^L-1. And if that happens, the maximum cardinality that this counter can handle is
     * approximately 2^L. By default, we will assume a value of 64 bits.
     */
    CardinalityCounter(double error_margin);

    /*
     * Releases the storage owned by the counter.
     */
    ~CardinalityCounter();

    /*
     * This will add an element to the counter. It's responsible for adding an element and
     * updating the value of V, if that applies.
     */
    void addElement(uint64_t hash);

    /*
     * Returns the size estimate of the set. First, it has the "raw" HyperLogLog estimate. And
     * then, we check if it's too "large" or "small" because the raw estimate doesn't do well
     * in those cases. Thus, we correct for those errors as specified in the paper.
     */
    double size();

    /*
     * Returns the buckets array that holds all of the rough cardinality estimates.
     */
    uint8_t* getBuckets();

    /*
     * Merges the argument cardinality counter with this one. The error margins are assumed to
     * be the same, so they have the same number of buckets. If any of the conditions are
     * violated, then the return value of size() is meaningless.
     */
    void merge(CardinalityCounter* c);

    /*
     * Returns the value of m. Should be used only for statistical purposes.
     */
    uint64_t getM();
};
|
|
@ -5900,14 +5900,15 @@ function hll_cardinality_add%(elem: any, index: any%): bool
|
||||||
%{
|
%{
|
||||||
BroString* s = convert_index_to_string(index);
|
BroString* s = convert_index_to_string(index);
|
||||||
int status = 0;
|
int status = 0;
|
||||||
|
uint64_t a = 1230123;
|
||||||
|
|
||||||
CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type()));
|
CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type()));
|
||||||
HashKey* key;
|
HashKey* key;
|
||||||
if(hll_counters.count(*s) > 0)
|
if(hll_counters.count(*s) > 0)
|
||||||
{
|
{
|
||||||
CardinalityCounter* h = hll_counters[*s];
|
CardinalityCounter* h = hll_counters[*s];
|
||||||
key = hll_hash->ComputeHash(elem,1);
|
key = hll_hash->ComputeHash(elem,1);
|
||||||
(*h).addElement(key->Hash());
|
h->addElement(a);
|
||||||
status = 1;
|
status = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue