diff --git a/mytests.bro b/mytests.bro
new file mode 100644
index 0000000000..3e5af497f3
--- /dev/null
+++ b/mytests.bro
@@ -0,0 +1,29 @@
+event bro_init()
+	{
+	local m1 = "measurement1";
+	local m2 = "measurement2";
+
+	hll_cardinality_init(0.01, m1);
+
+	local add1 = "hey";
+	local add2 = "hi";
+	local add3 = 123;
+
+	hll_cardinality_add(add1, m1);
+	hll_cardinality_add(add2, m1);
+	hll_cardinality_add(add3, m1);
+	hll_cardinality_add("a", m1);
+	hll_cardinality_add("b", m1);
+	hll_cardinality_add("c", m1);
+	hll_cardinality_add("d", m1);
+	hll_cardinality_add("e", m1);
+	hll_cardinality_add("f", m1);
+	hll_cardinality_add("g", m1);
+	hll_cardinality_add("h", m1);
+	hll_cardinality_add("i", m1);
+	hll_cardinality_add("j", m1);
+
+	local e = hll_cardinality_estimate(m1);
+	print e;
+
+	}
diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc
new file mode 100644
index 0000000000..22f522d1ab
--- /dev/null
+++ b/src/HyperLogLog.cc
@@ -0,0 +1,113 @@
+#include <math.h>
+#include <stdint.h>
+
+#include "HyperLogLog.h"
+
+using namespace std;
+
+int CardinalityCounter::optimalB(double error)
+	{
+	// Start from the b implied by the standard error 1.04/sqrt(m), then
+	// increase it until the requested confidence level is reached.
+	double initial_estimate = 2 * (log(1.04) - log(error)) / log(2);
+	int answer = (int) floor(initial_estimate);
+	double k;
+
+	do
+		{
+		answer++;
+		k = pow(2, (answer - initial_estimate) / 2);
+		} while ( erf(k / sqrt(2)) < conf );
+
+	return answer;
+	}
+
+CardinalityCounter::CardinalityCounter(double error_margin)
+	{
+	int b = optimalB(error_margin);
+	m = (uint64_t) pow(2, b);
+	buckets = new uint8_t[m];
+
+	// Bias-correction constant alpha_m from the HyperLogLog paper.
+	if ( m == 16 )
+		alpha_m = 0.673;
+	else if ( m == 32 )
+		alpha_m = 0.697;
+	else if ( m == 64 )
+		alpha_m = 0.709;
+	else
+		alpha_m = 0.7213 / (1 + 1.079 / m);
+
+	for ( uint64_t i = 0; i < m; i++ )
+		buckets[i] = 0;
+
+	// All buckets start out empty.
+	V = m;
+	}
+
+CardinalityCounter::~CardinalityCounter()
+	{
+	// Only the bucket array lives on the heap; m, V and alpha_m are plain members.
+	delete [] buckets;
+	}
+
+uint8_t CardinalityCounter::rank(uint64_t hash_modified)
+	{
+	// The argument is divisible by m, so the low b bits can be dropped before
+	// looking for the first 1-bit from the least significant end.
+	uint8_t answer = 0;
+	hash_modified = hash_modified / m;
+	hash_modified *= 2;
+
+	do
+		{
+		hash_modified /= 2;
+		answer++;
+		// The cap at 65 keeps a degenerate all-zero remainder from looping forever.
+		} while ( hash_modified % 2 == 0 && answer < 65 );
+
+	return answer;
+	}
+
+void CardinalityCounter::addElement(uint64_t hash)
+	{
+	// The low b bits pick the bucket; zero them out so rank() only sees the rest.
+	uint64_t index = hash % m;
+	hash = hash - index;
+
+	if ( buckets[index] == 0 )
+		V--;
+
+	uint8_t temp = rank(hash);
+
+	if ( temp > buckets[index] )
+		buckets[index] = temp;
+	}
+
+double CardinalityCounter::size()
+	{
+	// Raw HyperLogLog estimate: alpha_m * m^2 / sum(2^-buckets[i]).
+	double answer = 0;
+
+	for ( uint64_t i = 0; i < m; i++ )
+		answer += pow(2, -(int)buckets[i]);
+
+	answer = 1 / answer;
+	answer = alpha_m * m * m * answer;
+
+	if ( answer <= 5.0 * m / 2 )
+		{
+		// Small-range correction: fall back to linear counting while some
+		// buckets are still empty.
+		if ( V > 0 )
+			return m * log((double) m / V);
+
+		return answer;
+		}
+	else if ( answer <= pow(2, 64) / 30 )
+		return answer;
+	else
+		// Large-range correction for estimates approaching 2^64.
+		return -pow(2, 64) * log(1 - answer / pow(2, 64));
+	}
+
+void CardinalityCounter::merge(CardinalityCounter* c)
+	{
+	uint8_t* temp = c->getBuckets();
+	V = 0;
+
+	for ( uint64_t i = 0; i < m; i++ )
+		{
+		if ( temp[i] > buckets[i] )
+			buckets[i] = temp[i];
+
+		if ( buckets[i] == 0 )
+			V += 1;
+		}
+	}
+
+uint8_t* CardinalityCounter::getBuckets()
+	{
+	return buckets;
+	}
+
+uint64_t CardinalityCounter::getM()
+	{
+	return m;
+	}
diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h
new file mode 100644
index 0000000000..3cbe4cfb03
--- /dev/null
+++ b/src/HyperLogLog.h
@@ -0,0 +1,105 @@
+#include <stdint.h>
+
+/*
+ * "conf" is how confident the estimate given by the counter is.
+ *
+ * In other words, if the cardinality is estimated to be 100 with a 2% error margin and conf is
+ * 0.95, then we are 95% sure that the actual cardinality is between 98 and 102.
+ */
+#define conf .95
+
+class CardinalityCounter {
+
+private:
+	/*
+	 * The number of buckets that will be stored. The standard error is 1.04/sqrt(m), so the
+	 * true cardinality will lie within the estimate times (1 +/- 1.04/sqrt(m)) with roughly
+	 * 68% probability.
+	 */
+	uint64_t m;
+
+	/*
+	 * The buckets that store the rough cardinality estimates. Each one only records the
+	 * position of the first 1-bit seen in the hashes mapped to it, and that position is at
+	 * most 65, so 8 bits per bucket are enough.
+	 */
+	uint8_t* buckets;
+
+	/*
+	 * Some state kept to make the final estimate easier. V is the number of buckets that are
+	 * still 0 and is used in the small-range correction. alpha_m is a multiplicative constant
+	 * used in the algorithm.
+	 */
+	uint64_t V;
+	double alpha_m;
+
+	/*
+	 * Calculates the smallest value of b that satisfies the constraints of a specified error
+	 * margin and confidence level.
+	 *
+	 * The exact expression for b is as follows:
+	 * define x = 2*(log(1.04*k/error)/log(2)); then b is the ceiling of x.
+	 *
+	 * error is the error margin; k is the number of standard deviations needed to reach a
+	 * confidence level of conf.
+	 */
+	int optimalB(double error);
+
+	/*
+	 * Computes the position of the first 1-bit in the element, counted from the least
+	 * significant end. A precondition is that the argument is already divisible by m, so the
+	 * last b bits are ignored, since m = 2^b and those bits are always 0.
+	 */
+	uint8_t rank(uint64_t hash_modified);
+
+public:
+	/*
+	 * Initializes the cardinality counter. The error_margin determines how many buckets need
+	 * to be kept.
+	 *
+	 * The hash function is expected to return integers uniformly distributed from 0 to 2^L-1;
+	 * if it does, the maximum cardinality this counter can handle is approximately 2^L. By
+	 * default, 64-bit hash values are assumed.
+	 */
+	CardinalityCounter(double error_margin);
+
+	/*
+	 * Deletes the class variables.
+	 */
+	~CardinalityCounter();
+
+	/*
+	 * Adds an element to the counter and updates the value of V, if that applies.
+	 */
+	void addElement(uint64_t hash);
+
+	/*
+	 * Returns the size estimate of the set. It starts from the "raw" HyperLogLog estimate and
+	 * then checks whether that estimate is too "large" or too "small", because the raw
+	 * estimate does poorly in those ranges; those cases are corrected as specified in the
+	 * paper.
+	 */
+	double size();
+
+	/*
+	 * Returns the buckets array that holds all of the rough cardinality estimates.
+	 */
+	uint8_t* getBuckets();
+
+	/*
+	 * Merges the argument cardinality counter into this one. The error margins are assumed to
+	 * be the same, so both counters have the same number of buckets. If that condition is
+	 * violated, the return value of size() is meaningless.
+	 */
+	void merge(CardinalityCounter* c);
+
+	/*
+	 * Returns the value of m. Should be used only for statistical purposes.
+	 */
+	uint64_t getM();
+};
diff --git a/src/bro.bif b/src/bro.bif
index 6e41aaad99..49446a1e83 100644
--- a/src/bro.bif
+++ b/src/bro.bif
@@ -5900,14 +5900,14 @@ function hll_cardinality_add%(elem: any, index: any%): bool
 	%{
 	BroString* s = convert_index_to_string(index);
 	int status = 0;
 
 	CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type()));
 	HashKey* key;
 	if(hll_counters.count(*s) > 0)
 		{
 		CardinalityCounter* h = hll_counters[*s];
 		key = hll_hash->ComputeHash(elem,1);
-		(*h).addElement(key->Hash());
+		h->addElement(key->Hash());
 		status = 1;
 		}
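
For reference, and not part of the patch itself: a minimal standalone harness along these lines can exercise `CardinalityCounter` directly, without going through the Bro script layer. It assumes the `HyperLogLog.h`/`HyperLogLog.cc` files above are compiled in, and uses `std::hash` only as a stand-in for the `CompositeHash`-based hashing that `hll_cardinality_add` performs, so the exact estimate it prints will differ; the point is just that roughly 1000 distinct inserts should come back as an estimate near 1000.

```cpp
// hll_demo.cc -- standalone sketch, not part of the patch.
// Build together with HyperLogLog.cc; std::hash stands in for Bro's CompositeHash.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>

#include "HyperLogLog.h"

int main()
	{
	// 1% target error: optimalB() picks b, so m = 2^b buckets are allocated.
	CardinalityCounter counter(0.01);
	std::hash<std::string> hasher;

	// Feed 1000 distinct strings; the estimate should come back close to 1000.
	for ( int i = 0; i < 1000; i++ )
		counter.addElement((uint64_t) hasher("element-" + std::to_string(i)));

	// A duplicate must not move the estimate.
	counter.addElement((uint64_t) hasher("element-0"));

	printf("estimated cardinality: %f (m = %llu buckets)\n",
	       counter.size(), (unsigned long long) counter.getM());

	return 0;
	}
```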