From a376f2244e9af06e636a47238d25099c1fcbb5c0 Mon Sep 17 00:00:00 2001 From: Soumya Basu Date: Thu, 9 Aug 2012 17:11:57 -0700 Subject: [PATCH 01/32] Initial commit. Everything compiles, but it seg faults when you try adding an element to the cardinality counter. --- src/CMakeLists.txt | 1 + src/bro.bif | 207 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 206 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ce440852d7..56d8faee98 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -332,6 +332,7 @@ set(bro_SRCS HTTP.cc HTTP-binpac.cc Hash.cc + HyperLogLog.cc ICMP.cc ID.cc Ident.cc diff --git a/src/bro.bif b/src/bro.bif index 2a37429ad6..6e41aaad99 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -1,5 +1,3 @@ -##! A collection of built-in functions that implement a variety of things -##! such as general programming algorithms, string processing, math functions, ##! introspection, type conversion, file/directory manipulation, packet ##! filtering, inter-process communication and controlling protocol analyzer ##! behavior. @@ -5860,3 +5858,208 @@ function generate_idmef%(src_ip: addr, src_port: port, return new Val(0, TYPE_BOOL); #endif %} + + +## This is where my code starts... +##Just a note about notation. I'm specifying everything with the prefix hll just +## in case in the future, there's a better way to count cardinalities or something. +## That way, code written that depends on the HyperLogLog algorithm will still be +## working. Though, I'm fairly certain that anything that might be better won't +## be significantly better. + + +%%{ +#include "HyperLogLog.h" +static map hll_counters; +%%} + +## Initializes the hash for the HyperLogLog cardinality counting algorithm. +## It returns true if it was successful in creating a structure and false +## if it wasn't. 
+ +function hll_cardinality_init%(err: double,index: any%): bool + %{ + BroString* s = convert_index_to_string(index); + int status = 0; + + if ( hll_counters.count(*s) < 1 ) + { + hll_counters[*s] = new CardinalityCounter(err); + status = 1; + } + + delete s; + return new Val(status, TYPE_BOOL); + %} + +## Adds an element to the HyperLogLog data structure located at index. + +##elem->Type() to get the type of elem. + +function hll_cardinality_add%(elem: any, index: any%): bool + %{ + BroString* s = convert_index_to_string(index); + int status = 0; + + CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type())); + HashKey* key; + if(hll_counters.count(*s) > 0) + { + CardinalityCounter* h = hll_counters[*s]; + key = hll_hash->ComputeHash(elem,1); + (*h).addElement(key->Hash()); + status = 1; + } + + delete s; + return new Val(status, TYPE_BOOL); + %} + +## The data structure at index1 will contain the combined count for the +## elements measured by index1 and index2. +## It returns true if it either cloned the value at index2 into index1 +## or if it merged the two data structures together. 
+ +function hll_cardinality_merge_into%(index1: any, index2: any%): bool + %{ + BroString* s1 = convert_index_to_string(index1); + BroString* s2 = convert_index_to_string(index2); + int status = 0; + + if(hll_counters.count(*s1) < 1) + { + if(hll_counters.count(*s2) < 1) + { + status = 0; + } + else + { + uint64_t m = (*hll_counters[*s2]).getM(); + double error = 1.04/sqrt(m); + CardinalityCounter* newInst = new CardinalityCounter(error); + int i = 0; + while((*newInst).getM() != m) + { + i += 1; + newInst = new CardinalityCounter(error/i); + } + hll_counters[*s1] = newInst; + (*newInst).merge(hll_counters[*s2]); + status = 1; + } + } + else + { + if(hll_counters.count(*s2) < 1) + { + status = 0; + } + else + { + if((*hll_counters[*s2]).getM() == (*hll_counters[*s1]).getM()) + { + status = 1; + (*hll_counters[*s1]).merge(hll_counters[*s2]); + } + } + } + + delete s1; + delete s2; + return new Val(status, TYPE_BOOL); + + %} + +## Returns true if it destroyed something. False if it didn't. +function hll_cardinality_destroy%(index: any%): bool + %{ + BroString* s = convert_index_to_string(index); + int status = 0; + + if(hll_counters.count(*s) > 0) + { + delete hll_counters[*s]; + } + + delete s; + return new Val(status, TYPE_BOOL); + %} + +## Returns the cardinality estimate. Returns -1.0 if there is nothing in that index. +function hll_cardinality_estimate%(index: any%): double + %{ + BroString* s = convert_index_to_string(index); + double estimate = -1.0; + + if(hll_counters.count(*s) > 0) + { + estimate = (*hll_counters[*s]).size(); + } + + delete s; + return new Val(estimate, TYPE_DOUBLE); + %} + +##I'm really not sure about the notation of this function... 
+ +function hll_cardinality_keys%(%): bool + %{ +// TableVal* a = new TableVal(string_set); +// map::iterator it; + +// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) +// { +// a->Assign((*it).first); +// } +// return a; + return new Val(1, TYPE_BOOL); + %} + +## Stores the data structure at index2 into index1. Deletes the data structure at index1 +## if there was any. Returns True if the data structure at index1 was changed in any way. + +function hll_cardinality_clone%(index1: any, index2: any%): bool + %{ + BroString* s1 = convert_index_to_string(index1); + BroString* s2 = convert_index_to_string(index2); + int status = 0; + + if(hll_counters.count(*s2) < 1) + { + if(hll_counters.count(*s1) < 1) + { + status = 0; + } + else + { + delete hll_counters[*s1]; + status = 1; + } + } + else + { + uint64_t m = (*hll_counters[*s2]).getM(); + double error = 1.04/sqrt(m); + CardinalityCounter* newInst = new CardinalityCounter(error); + int i = 0; + while((*newInst).getM() != m) + { + i += 1; + newInst = new CardinalityCounter(error/i); + } + (*newInst).merge(hll_counters[*s2]); + if(hll_counters.count(*s1) < 1) + { + hll_counters[*s1] = newInst; + } + else + { + delete hll_counters[*s1]; + hll_counters[*s1] = newInst; + } + status = 1; + } + delete s1; + delete s2; + return new Val(status, TYPE_BOOL); + %} From bbaa35434b653bd680632b83ffaa0d0b9e71deeb Mon Sep 17 00:00:00 2001 From: Soumya Basu Date: Sat, 11 Aug 2012 16:45:58 -0700 Subject: [PATCH 02/32] Added the HyperLogLog files to the repository, and the size method works as well. The add method has an error with the hashkeys thus far and no other methods outside of init and size have been tested yet. 
--- mytests.bro | 29 ++++++++++++ src/HyperLogLog.cc | 113 +++++++++++++++++++++++++++++++++++++++++++++ src/HyperLogLog.h | 105 +++++++++++++++++++++++++++++++++++++++++ src/bro.bif | 5 +- 4 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 mytests.bro create mode 100644 src/HyperLogLog.cc create mode 100644 src/HyperLogLog.h diff --git a/mytests.bro b/mytests.bro new file mode 100644 index 0000000000..3e5af497f3 --- /dev/null +++ b/mytests.bro @@ -0,0 +1,29 @@ +event bro_init() + { + local m1 = "measurement1"; + local m2 = "measurement2"; + + hll_cardinality_init(0.01, m1); + + local add1 = "hey"; + local add2 = "hi"; + local add3 = 123; + + hll_cardinality_add(add1, m1); + hll_cardinality_add(add2, m1); + hll_cardinality_add(add3, m1); + hll_cardinality_add("a", m1); + hll_cardinality_add("b", m1); + hll_cardinality_add("c", m1); + hll_cardinality_add("d", m1); + hll_cardinality_add("e", m1); + hll_cardinality_add("f", m1); + hll_cardinality_add("g", m1); + hll_cardinality_add("h", m1); + hll_cardinality_add("i", m1); + hll_cardinality_add("j", m1); + + local e = hll_cardinality_estimate(m1); + print e; + + } diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc new file mode 100644 index 0000000000..22f522d1ab --- /dev/null +++ b/src/HyperLogLog.cc @@ -0,0 +1,113 @@ +#include +#include +#include "HyperLogLog.h" +#include + +using namespace std; + + int CardinalityCounter::optimalB(double error){ + double initial_estimate = 2*(log(1.04)-log(error))/log(2); + int answer = (int) floor(initial_estimate); + double k; + + do{ + answer++; + k = pow(2, (answer - initial_estimate)/2); + }while(erf(k/sqrt(2)) < conf); + + return answer; + } + + + CardinalityCounter :: CardinalityCounter(double error_margin){ + int b = optimalB(error_margin); + m = (uint64_t) pow(2, b); + buckets = new uint8_t[m]; + + if(m == 16) + alpha_m = 0.673; + else if(m == 32) + alpha_m = 0.697; + else if(m == 64) + alpha_m = 0.709; + else + alpha_m = 0.7213/(1+1.079/m); + + 
for(uint64_t i = 0; i < m; i++){ + buckets[i] = 0; + } + + V = m; + } + + CardinalityCounter :: ~CardinalityCounter(){ + delete [] buckets; + delete &m; + delete &V; + delete &alpha_m; + } + + uint8_t CardinalityCounter :: rank(uint64_t hash_modified){ + uint8_t answer = 0; + hash_modified = (uint64_t)(hash_modified/m); + hash_modified *= 2; + do{ + hash_modified = (uint64_t) (hash_modified/2); + answer++; + }while(hash_modified%2 == 0); + return answer; + } + + + + void CardinalityCounter::addElement(uint64_t hash){ + uint64_t index = hash % m; + hash = hash-index; + + if(buckets[index] == 0) + V--; + uint8_t temp = rank(hash); + if(temp > buckets[index]){ + buckets[index] = temp; + } + } + + double CardinalityCounter::size(){ + double answer = 0; + for(int i = 0; i < m; i++){ + answer += pow(2, -(int)buckets[i]); + } + answer = 1/answer; + answer = alpha_m*m*m*answer; + + if(answer <= 5*(double)(m/2)){ + return m*log((double) m/V); + } + else if(answer <= pow(2,64)/30){ + return answer; + } + else{ + return -pow(2,64)*log(1-answer/pow(2,64)); + } + } + + void CardinalityCounter::merge(CardinalityCounter* c){ + uint8_t* temp = (*c).getBuckets(); + V = 0; + for(int i = 0; i < m; i++){ + if(temp[i] > buckets[i]){ + buckets[i] = temp[i]; + } + if(buckets[i] == 0){ + V += 1; + } + } + } + + uint8_t* CardinalityCounter::getBuckets(){ + return buckets; + } + + uint64_t CardinalityCounter::getM(){ + return m; + } diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h new file mode 100644 index 0000000000..3cbe4cfb03 --- /dev/null +++ b/src/HyperLogLog.h @@ -0,0 +1,105 @@ +#include + +/* + * "conf" is how confident the estimate given by the counter is. + * + * In other words, if the cardinality is estimated to be 100 with 2% error margin and conf is + * 0.95, then we are 95% sure that the actual cardinality is between 98 and 102. + */ +#define conf .95 + +class CardinalityCounter { + + private: + /* + * This is the number of buckets that will be stored. 
The standard error is 1.04/sqrt(m), so the + * actual cardinality will be the estimate +/- 1.04/sqrt(m) with approximately 68% probability. + */ + uint64_t m; + + /* + * These are the actual buckets that are storing an estimate of the cardinality. All these need to + * do is count when the first 1 bit appears in the bitstring and that location is at most 65, so + * not that many bits are needed to store it. + */ + uint8_t* buckets; + + /* + * There are some state constants that need to be kept track of to make the final estimate easier. + * V is the number of values in buckets that are 0 and this is used in the small error correction. + * alpha_m is a multiplicative constant used in the algorithm. + */ + uint64_t V; + double alpha_m; + + /* + * This function will calculate the smallest value of b that will satisfy these the constraints of + * a specified error margin and confidence level. + * + * The exact expression for b is as follows: + * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x + * + * error is the error margin. + * k is the number of standard deviations that we have to go to have a confidence level of conf. + */ + + int optimalB(double error); + + /* + * Computes when the first one appears in the element. It looks at the bitstring from the end though. + * A precondition is that the argument is already divisible by m, so we just ignore the last b bits, + * since m = 2^b and the last b bits will always be 0. + */ + uint8_t rank(uint64_t hash_modified); + + public: + /* + * This will initialize the Cardinality counter.Based on the error_margin, the number of buckets + * that need to be kept will be determined. Based on the max_size, the number of bits that will + * be used from the hash function will be determined. + * + * We need the hash function to return integers that are uniformly distributed from 0 to 2^L-1. + * And if that happens, the maximum cardinality that this counter can handle is approximately 2^L. 
+ * By default, we will assume a value of 64 bits. + */ + + CardinalityCounter(double error_margin); + + /* + * Deletes the class variables. + */ + + ~CardinalityCounter(); + + /* + * This will add an element to the counter. It's responsible for adding an element and updating + * the value of V, if that applies. + */ + void addElement(uint64_t hash); + + /* + * Returns the size estimate of the set. First, it has the "raw" HyperLogLog estimate. And then, we + * check if it's too "large" or "small" because the raw estimate doesn't do well in those cases. + * Thus, we correct for those errors as specified in the paper. + */ + + double size(); + + /* + * Returns the buckets array that holds all of the rough cardinality estimates. + */ + + uint8_t* getBuckets(); + + /* + * Merges the argument cardinality counter with this one. The error margins are assumed to be the same, + * so they have the same number of buckets. If any of the conditions are violated, then the return value + * of size() is meaningless. + */ + void merge(CardinalityCounter* c); + + /* + * Returns the value of m. Should be used only for statistical purposes. + */ + uint64_t getM(); +}; diff --git a/src/bro.bif b/src/bro.bif index 6e41aaad99..49446a1e83 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5900,14 +5900,15 @@ function hll_cardinality_add%(elem: any, index: any%): bool %{ BroString* s = convert_index_to_string(index); int status = 0; - + uint64_t a = 1230123; + CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type())); HashKey* key; if(hll_counters.count(*s) > 0) { CardinalityCounter* h = hll_counters[*s]; key = hll_hash->ComputeHash(elem,1); - (*h).addElement(key->Hash()); + h->addElement(a); status = 1; } From a41efd495de3f54457c80436ea3dd409a717abcd Mon Sep 17 00:00:00 2001 From: Soumya Basu Date: Mon, 13 Aug 2012 16:33:27 -0700 Subject: [PATCH 03/32] Still segfaults. No real updates. 
--- src/bro.bif | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 49446a1e83..4c07053981 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5900,18 +5900,20 @@ function hll_cardinality_add%(elem: any, index: any%): bool %{ BroString* s = convert_index_to_string(index); int status = 0; - uint64_t a = 1230123; + uint64_t a = 123456; CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type())); - HashKey* key; if(hll_counters.count(*s) > 0) { CardinalityCounter* h = hll_counters[*s]; - key = hll_hash->ComputeHash(elem,1); + HashKey* key = hll_hash->ComputeHash(elem,1); + a = (key->Hash()); h->addElement(a); status = 1; + delete key; } + delete hll_hash; delete s; return new Val(status, TYPE_BOOL); %} From ae4066dcf8dce6fab8957408595b1ddba9982c38 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 14 Aug 2012 17:16:53 -0700 Subject: [PATCH 04/32] Fixing problem with type list. --- src/bro.bif | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 4c07053981..42191d3b22 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5902,11 +5902,15 @@ function hll_cardinality_add%(elem: any, index: any%): bool int status = 0; uint64_t a = 123456; - CompositeHash* hll_hash = new CompositeHash(new TypeList(elem->Type())); - if(hll_counters.count(*s) > 0) + TypeList* tl = new TypeList(elem->Type()); + tl->Append(elem->Type()); + CompositeHash* hll_hash = new CompositeHash(tl); + Unref(tl); + + if( hll_counters.count(*s) > 0 ) { CardinalityCounter* h = hll_counters[*s]; - HashKey* key = hll_hash->ComputeHash(elem,1); + HashKey* key = hll_hash->ComputeHash(elem, 1); a = (key->Hash()); h->addElement(a); status = 1; From 7e07ce3cb12ed6454d8d6b0320150849997c7470 Mon Sep 17 00:00:00 2001 From: Soumya Basu Date: Mon, 20 Aug 2012 21:57:13 -0700 Subject: [PATCH 05/32] Basic functionality works. 
Merging two counters, cloning two counters and iterating over all of the counters still needs work. --- mytests.bro | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++-- src/bro.bif | 31 +++++---- 2 files changed, 201 insertions(+), 15 deletions(-) diff --git a/mytests.bro b/mytests.bro index 3e5af497f3..4de6f29dab 100644 --- a/mytests.bro +++ b/mytests.bro @@ -3,7 +3,12 @@ event bro_init() local m1 = "measurement1"; local m2 = "measurement2"; - hll_cardinality_init(0.01, m1); + print "This value should be true:"; + print hll_cardinality_init(0.01, m1); + hll_cardinality_init(0.01, m2); + + print "This value should be false:"; + print hll_cardinality_init(0.02, "measurement1"); local add1 = "hey"; local add2 = "hi"; @@ -21,9 +26,181 @@ event bro_init() hll_cardinality_add("g", m1); hll_cardinality_add("h", m1); hll_cardinality_add("i", m1); - hll_cardinality_add("j", m1); - local e = hll_cardinality_estimate(m1); - print e; + print "This value should be true:"; + print hll_cardinality_add("j", m1); + print "This value should be false:"; + print hll_cardinality_add("asdf", "something"); + + + hll_cardinality_add(add1, m2); + hll_cardinality_add(add2, m2); + hll_cardinality_add(add3, m2); + hll_cardinality_add(1, m2); + hll_cardinality_add("b", m2); + hll_cardinality_add(2, m2); + hll_cardinality_add(3, m2); + hll_cardinality_add(4, m2); + hll_cardinality_add(5, m2); + hll_cardinality_add(6, m2); + hll_cardinality_add(7, m2); + hll_cardinality_add(8, m2); + + print "This value should be around 13:"; + print hll_cardinality_estimate("measurement1"); + + print "This value should be -1.0:"; + print hll_cardinality_estimate("m2"); + + hll_cardinality_init(0.02, "m2"); + + print "This value should be around 0:"; + print hll_cardinality_estimate("m2"); + + print "This value should be true:"; + print hll_cardinality_destroy("m2"); + + print "This value should be false:"; + print hll_cardinality_destroy("m2"); + + print "This value should be -1.0:"; + print 
hll_cardinality_estimate("m2"); + + print "This next thing should be false:"; + print hll_cardinality_clone("m3", "m2"); + + print "This next thing should be true:"; + print hll_cardinality_clone("measurement3", "measurement1"); + + print "This value should be around 13:"; + print hll_cardinality_estimate("measurement3"); + + hll_cardinality_destroy("measurement3"); + + print "This next thing should be equal to -1.0:"; + print hll_cardinality_estimate("measurement3"); + + print "This value should be around 13:"; + print hll_cardinality_estimate("measurement1"); } + +### The data structure at index1 will contain the combined count for the +## elements measured by index1 and index2. +## It returns true if it either cloned the value at index2 into index1 +## or if it merged the two data structures together. + +#function hll_cardinality_merge_into%(index1: any, index2: any%): bool +# %{ +# BroString* s1 = convert_index_to_string(index1); +# BroString* s2 = convert_index_to_string(index2); +# int status = 0; +# +# if(hll_counters.count(*s1) < 1) +# { +# if(hll_counters.count(*s2) < 1) +# { +# status = 0; +# } +# else +# { +# uint64_t m = (*hll_counters[*s2]).getM(); +# double error = 1.04/sqrt(m); +# CardinalityCounter* newInst = new CardinalityCounter(error); +# int i = 0; +# while((*newInst).getM() != m) +# { +# i += 1; +# newInst = new CardinalityCounter(error/i); +# } +# hll_counters[*s1] = newInst; +# (*newInst).merge(hll_counters[*s2]); +# status = 1; +# } +# } +# else +# { +# if(hll_counters.count(*s2) < 1) +# { +# status = 0; +# } +# else +# { +# if((*hll_counters[*s2]).getM() == (*hll_counters[*s1]).getM()) +# { +# status = 1; +## (*hll_counters[*s1]).merge(hll_counters[*s2]); +## } +# } +# } +# +# delete s1; +# delete s2; +# return new Val(status, TYPE_BOOL); +# +# %} + +##I'm really not sure about the notation of this function... 
+# +#function hll_cardinality_keys%(%): bool +# %{ +#// TableVal* a = new TableVal(string_set); +#// map::iterator it; +# +#// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) +#// { +#// a->Assign((*it).first); +#// } +#// return a; +# return new Val(1, TYPE_BOOL); +# %} + +## Stores the data structure at index2 into index1. Deletes the data structure at index1 +## if there was any. Returns True if the data structure at index1 was changed in any way. + +#function hll_cardinality_clone%(index1: any, index2: any%): bool +# %{ +# BroString* s1 = convert_index_to_string(index1); +# BroString* s2 = convert_index_to_string(index2); +# int status = 0; +# +# if(hll_counters.count(*s2) < 1) +# { +# if(hll_counters.count(*s1) < 1) +## { +# status = 0; +# } +# else +# { +# delete hll_counters[*s1]; +# status = 1; +# } +# } +# else +# { +# uint64_t m = (*hll_counters[*s2]).getM(); +# double error = 1.04/sqrt(m); +# CardinalityCounter* newInst = new CardinalityCounter(error); +# int i = 0; +# while((*newInst).getM() != m) +# { +# i += 1; +# newInst = new CardinalityCounter(error/i); +# } +# (*newInst).merge(hll_counters[*s2]); +# if(hll_counters.count(*s1) < 1) +# { +# #hll_counters[*s1] = newInst; +# } +# else +# { +# delete hll_counters[*s1]; +# hll_counters[*s1] = newInst; +# } +# status = 1; +# } +# delete s1; +# delete s2; +# return new Val(status, TYPE_BOOL); +# %}} + diff --git a/src/bro.bif b/src/bro.bif index 42191d3b22..e8fbbfc169 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5911,7 +5911,7 @@ function hll_cardinality_add%(elem: any, index: any%): bool { CardinalityCounter* h = hll_counters[*s]; HashKey* key = hll_hash->ComputeHash(elem, 1); - a = (key->Hash()); + a = key->Hash(); h->addElement(a); status = 1; delete key; @@ -5949,6 +5949,10 @@ function hll_cardinality_merge_into%(index1: any, index2: any%): bool { i += 1; newInst = new CardinalityCounter(error/i); + if(i >= 5) + { + break; + } } hll_counters[*s1] = newInst; 
(*newInst).merge(hll_counters[*s2]); @@ -5985,7 +5989,8 @@ function hll_cardinality_destroy%(index: any%): bool if(hll_counters.count(*s) > 0) { - delete hll_counters[*s]; + hll_counters.erase(*s); + status = 1; } delete s; @@ -6009,17 +6014,17 @@ function hll_cardinality_estimate%(index: any%): double ##I'm really not sure about the notation of this function... -function hll_cardinality_keys%(%): bool +function hll_cardinality_keys%(%): string_set %{ -// TableVal* a = new TableVal(string_set); -// map::iterator it; + TableVal* a = new TableVal(string_set); + map::iterator it; + int i = 0; -// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) -// { -// a->Assign((*it).first); -// } -// return a; - return new Val(1, TYPE_BOOL); + for(it = hll_counters.begin() ; it != hll_counters.end(); it++) + { + a->Assign(new Val(i++, TYPE_INT),new Val(&(*it).first, TYPE_STRING)); + } + return a; %} ## Stores the data structure at index2 into index1. Deletes the data structure at index1 @@ -6053,6 +6058,10 @@ function hll_cardinality_clone%(index1: any, index2: any%): bool { i += 1; newInst = new CardinalityCounter(error/i); + if(i >=5 ) + { + break; + } } (*newInst).merge(hll_counters[*s2]); if(hll_counters.count(*s1) < 1) From 3ffb4cab64d7940001761be738ed83238151f47a Mon Sep 17 00:00:00 2001 From: Soumya Basu Date: Thu, 30 Aug 2012 01:09:44 -0700 Subject: [PATCH 06/32] Everything works, and I just need to put the tests in the proper folder to get this functionality working. 
--- mytests.bro | 144 ++++++++++++--------------------------------- src/HyperLogLog.cc | 22 ++++++- src/HyperLogLog.h | 6 ++ src/bro.bif | 34 +++-------- 4 files changed, 70 insertions(+), 136 deletions(-) diff --git a/mytests.bro b/mytests.bro index 4de6f29dab..9392b205b0 100644 --- a/mytests.bro +++ b/mytests.bro @@ -83,65 +83,44 @@ event bro_init() print "This value should be around 13:"; print hll_cardinality_estimate("measurement1"); + + print "This value should be true:"; + print hll_cardinality_merge_into("measurement3", "measurement2"); + + print "This value should be false:"; + print hll_cardinality_merge_into("measurement4", "measurement6"); + + print "This value should be about 12:"; + print hll_cardinality_estimate("measurement3"); + + print "This value should be false:"; + print hll_cardinality_merge_into("measurement3", "measurement15"); + + print "This value should be about 12:"; + print hll_cardinality_estimate("measurement3"); + + print "This value should be true:"; + print hll_cardinality_merge_into("measurement2", "measurement1"); + + print "This value should be about 21:"; + print hll_cardinality_estimate("measurement2"); + + print "This value should be about 13:"; + print hll_cardinality_estimate("measurement1"); + + print "This value should be about 12:"; + print hll_cardinality_estimate("measurement3"); + + local keys = hll_cardinality_keys(); + for(key in keys) + { + print "The key is:"; + print key; + print "The value is:"; + print hll_cardinality_estimate(key); + } } -### The data structure at index1 will contain the combined count for the -## elements measured by index1 and index2. -## It returns true if it either cloned the value at index2 into index1 -## or if it merged the two data structures together. 
- -#function hll_cardinality_merge_into%(index1: any, index2: any%): bool -# %{ -# BroString* s1 = convert_index_to_string(index1); -# BroString* s2 = convert_index_to_string(index2); -# int status = 0; -# -# if(hll_counters.count(*s1) < 1) -# { -# if(hll_counters.count(*s2) < 1) -# { -# status = 0; -# } -# else -# { -# uint64_t m = (*hll_counters[*s2]).getM(); -# double error = 1.04/sqrt(m); -# CardinalityCounter* newInst = new CardinalityCounter(error); -# int i = 0; -# while((*newInst).getM() != m) -# { -# i += 1; -# newInst = new CardinalityCounter(error/i); -# } -# hll_counters[*s1] = newInst; -# (*newInst).merge(hll_counters[*s2]); -# status = 1; -# } -# } -# else -# { -# if(hll_counters.count(*s2) < 1) -# { -# status = 0; -# } -# else -# { -# if((*hll_counters[*s2]).getM() == (*hll_counters[*s1]).getM()) -# { -# status = 1; -## (*hll_counters[*s1]).merge(hll_counters[*s2]); -## } -# } -# } -# -# delete s1; -# delete s2; -# return new Val(status, TYPE_BOOL); -# -# %} - -##I'm really not sure about the notation of this function... -# #function hll_cardinality_keys%(%): bool # %{ #// TableVal* a = new TableVal(string_set); @@ -153,54 +132,3 @@ event bro_init() #// } #// return a; # return new Val(1, TYPE_BOOL); -# %} - -## Stores the data structure at index2 into index1. Deletes the data structure at index1 -## if there was any. Returns True if the data structure at index1 was changed in any way. 
- -#function hll_cardinality_clone%(index1: any, index2: any%): bool -# %{ -# BroString* s1 = convert_index_to_string(index1); -# BroString* s2 = convert_index_to_string(index2); -# int status = 0; -# -# if(hll_counters.count(*s2) < 1) -# { -# if(hll_counters.count(*s1) < 1) -## { -# status = 0; -# } -# else -# { -# delete hll_counters[*s1]; -# status = 1; -# } -# } -# else -# { -# uint64_t m = (*hll_counters[*s2]).getM(); -# double error = 1.04/sqrt(m); -# CardinalityCounter* newInst = new CardinalityCounter(error); -# int i = 0; -# while((*newInst).getM() != m) -# { -# i += 1; -# newInst = new CardinalityCounter(error/i); -# } -# (*newInst).merge(hll_counters[*s2]); -# if(hll_counters.count(*s1) < 1) -# { -# #hll_counters[*s1] = newInst; -# } -# else -# { -# delete hll_counters[*s1]; -# hll_counters[*s1] = newInst; -# } -# status = 1; -# } -# delete s1; -# delete s2; -# return new Val(status, TYPE_BOOL); -# %}} - diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc index 22f522d1ab..22a06ee6c7 100644 --- a/src/HyperLogLog.cc +++ b/src/HyperLogLog.cc @@ -18,12 +18,32 @@ using namespace std; return answer; } + CardinalityCounter :: CardinalityCounter(uint64_t size){ + m = size; + buckets = new uint8_t[m]; + + if(m == 16) + alpha_m = 0.673; + else if(m == 32) + alpha_m = 0.697; + else if(m == 64) + alpha_m = 0.709; + else + alpha_m = 0.7213/(1+1.079/m); + + for(uint64_t i = 0; i < m; i++){ + buckets[i] = 0; + } + + V = m; + + } CardinalityCounter :: CardinalityCounter(double error_margin){ int b = optimalB(error_margin); m = (uint64_t) pow(2, b); buckets = new uint8_t[m]; - + if(m == 16) alpha_m = 0.673; else if(m == 32) diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h index 3cbe4cfb03..ba9a46f1bd 100644 --- a/src/HyperLogLog.h +++ b/src/HyperLogLog.h @@ -53,6 +53,12 @@ class CardinalityCounter { uint8_t rank(uint64_t hash_modified); public: + /* + * This will be used when cloning. The error margin will be 1.04/sqrt(m) with approximately 68% + * probability. 
+ */ + CardinalityCounter(uint64_t size); + /* * This will initialize the Cardinality counter.Based on the error_margin, the number of buckets * that need to be kept will be determined. Based on the max_size, the number of bits that will diff --git a/src/bro.bif b/src/bro.bif index e8fbbfc169..e75acfa653 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5942,18 +5942,7 @@ function hll_cardinality_merge_into%(index1: any, index2: any%): bool else { uint64_t m = (*hll_counters[*s2]).getM(); - double error = 1.04/sqrt(m); - CardinalityCounter* newInst = new CardinalityCounter(error); - int i = 0; - while((*newInst).getM() != m) - { - i += 1; - newInst = new CardinalityCounter(error/i); - if(i >= 5) - { - break; - } - } + CardinalityCounter* newInst = new CardinalityCounter(m); hll_counters[*s1] = newInst; (*newInst).merge(hll_counters[*s2]); status = 1; @@ -6022,7 +6011,8 @@ function hll_cardinality_keys%(%): string_set for(it = hll_counters.begin() ; it != hll_counters.end(); it++) { - a->Assign(new Val(i++, TYPE_INT),new Val(&(*it).first, TYPE_STRING)); + BroString* s = (BroString*) &(it->first); + a->Assign(new StringVal(s), 0); } return a; %} @@ -6050,20 +6040,10 @@ function hll_cardinality_clone%(index1: any, index2: any%): bool } else { - uint64_t m = (*hll_counters[*s2]).getM(); - double error = 1.04/sqrt(m); - CardinalityCounter* newInst = new CardinalityCounter(error); - int i = 0; - while((*newInst).getM() != m) - { - i += 1; - newInst = new CardinalityCounter(error/i); - if(i >=5 ) - { - break; - } - } - (*newInst).merge(hll_counters[*s2]); + uint64_t m = (*hll_counters[*s2]).getM(); + CardinalityCounter* newInst = new CardinalityCounter(m); + int i = 0; + (*newInst).merge(hll_counters[*s2]); if(hll_counters.count(*s1) < 1) { hll_counters[*s1] = newInst; From 80cdfbcab406c52ae4548b12876037b52f0b0f5a Mon Sep 17 00:00:00 2001 From: Soumya Basu Date: Thu, 15 Nov 2012 13:04:48 -0800 Subject: [PATCH 07/32] Moved the testing file to the correct directory --- 
testing/btest/bifs/hll_cardinality.bro | 138 +++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 testing/btest/bifs/hll_cardinality.bro diff --git a/testing/btest/bifs/hll_cardinality.bro b/testing/btest/bifs/hll_cardinality.bro new file mode 100644 index 0000000000..093de134df --- /dev/null +++ b/testing/btest/bifs/hll_cardinality.bro @@ -0,0 +1,138 @@ +# +# @TEST-EXEC: bro %INPUT>out +# @TEST-EXEC: btest-diff out + +event bro_init() + { + local m1 = "measurement1"; + local m2 = "measurement2"; + + print "This value should be true:"; + print hll_cardinality_init(0.01, m1); + hll_cardinality_init(0.01, m2); + + print "This value should be false:"; + print hll_cardinality_init(0.02, "measurement1"); + + local add1 = "hey"; + local add2 = "hi"; + local add3 = 123; + + hll_cardinality_add(add1, m1); + hll_cardinality_add(add2, m1); + hll_cardinality_add(add3, m1); + hll_cardinality_add("a", m1); + hll_cardinality_add("b", m1); + hll_cardinality_add("c", m1); + hll_cardinality_add("d", m1); + hll_cardinality_add("e", m1); + hll_cardinality_add("f", m1); + hll_cardinality_add("g", m1); + hll_cardinality_add("h", m1); + hll_cardinality_add("i", m1); + + print "This value should be true:"; + print hll_cardinality_add("j", m1); + + print "This value should be false:"; + print hll_cardinality_add("asdf", "something"); + + + hll_cardinality_add(add1, m2); + hll_cardinality_add(add2, m2); + hll_cardinality_add(add3, m2); + hll_cardinality_add(1, m2); + hll_cardinality_add("b", m2); + hll_cardinality_add(2, m2); + hll_cardinality_add(3, m2); + hll_cardinality_add(4, m2); + hll_cardinality_add(5, m2); + hll_cardinality_add(6, m2); + hll_cardinality_add(7, m2); + hll_cardinality_add(8, m2); + + print "This value should be around 13:"; + print hll_cardinality_estimate("measurement1"); + + print "This value should be -1.0:"; + print hll_cardinality_estimate("m2"); + + hll_cardinality_init(0.02, "m2"); + + print "This value should be around 0:"; + print 
hll_cardinality_estimate("m2"); + + print "This value should be true:"; + print hll_cardinality_destroy("m2"); + + print "This value should be false:"; + print hll_cardinality_destroy("m2"); + + print "This value should be -1.0:"; + print hll_cardinality_estimate("m2"); + + print "This next thing should be false:"; + print hll_cardinality_clone("m3", "m2"); + + print "This next thing should be true:"; + print hll_cardinality_clone("measurement3", "measurement1"); + + print "This value should be around 13:"; + print hll_cardinality_estimate("measurement3"); + + hll_cardinality_destroy("measurement3"); + + print "This next thing should be equal to -1.0:"; + print hll_cardinality_estimate("measurement3"); + + print "This value should be around 13:"; + print hll_cardinality_estimate("measurement1"); + + print "This value should be true:"; + print hll_cardinality_merge_into("measurement3", "measurement2"); + + print "This value should be false:"; + print hll_cardinality_merge_into("measurement4", "measurement6"); + + print "This value should be about 12:"; + print hll_cardinality_estimate("measurement3"); + + print "This value should be false:"; + print hll_cardinality_merge_into("measurement3", "measurement15"); + + print "This value should be about 12:"; + print hll_cardinality_estimate("measurement3"); + + print "This value should be true:"; + print hll_cardinality_merge_into("measurement2", "measurement1"); + + print "This value should be about 21:"; + print hll_cardinality_estimate("measurement2"); + + print "This value should be about 13:"; + print hll_cardinality_estimate("measurement1"); + + print "This value should be about 12:"; + print hll_cardinality_estimate("measurement3"); + + local keys = hll_cardinality_keys(); + for(key in keys) + { + print "The key is:"; + print key; + print "The value is:"; + print hll_cardinality_estimate(key); + } + } + +#function hll_cardinality_keys%(%): bool +# %{ +#// TableVal* a = new TableVal(string_set); +#// map::iterator 
it; +# +#// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) +#// { +#// a->Assign((*it).first); +#// } +#// return a; +# return new Val(1, TYPE_BOOL); From b5cdf13469335b0d4b49b6a2503b995e351d9155 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 2 Apr 2013 00:35:37 +0200 Subject: [PATCH 08/32] and re-add a function that I apparently deleted accidentally --- src/bro.bif | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 70e5568c6b..13e0d6e407 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5651,6 +5651,15 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr %%{ #include "HyperLogLog.h" static map hll_counters; + +BroString* convert_index_to_string(Val* index) + { + ODesc d; + index->Describe(&d); + BroString* s = new BroString(1, d.TakeBytes(), d.Len()); + s->SetUseFreeToDelete(1); + return s; + } %%} ## Initializes the hash for the HyperLogLog cardinality counting algorithm. From fd51db1c89e024227a46e9770499ea9eceb87fb1 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 2 Apr 2013 11:24:03 +0200 Subject: [PATCH 09/32] purely aesthetical - make whitespacing fit bro coding style. Second step will be to change the bifs a bit... --- src/HyperLogLog.cc | 242 +++++++++++++++++++++++---------------------- src/HyperLogLog.h | 2 + 2 files changed, 128 insertions(+), 116 deletions(-) diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc index 22a06ee6c7..436d754b4d 100644 --- a/src/HyperLogLog.cc +++ b/src/HyperLogLog.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ #include #include #include "HyperLogLog.h" @@ -5,129 +7,137 @@ using namespace std; - int CardinalityCounter::optimalB(double error){ - double initial_estimate = 2*(log(1.04)-log(error))/log(2); - int answer = (int) floor(initial_estimate); - double k; +int CardinalityCounter::optimalB(double error) + { + double initial_estimate = 2*(log(1.04)-log(error))/log(2); + int answer = (int) floor(initial_estimate); + double k; - do{ - answer++; - k = pow(2, (answer - initial_estimate)/2); - }while(erf(k/sqrt(2)) < conf); + do + { + answer++; + k = pow(2, (answer - initial_estimate)/2); + } + while (erf(k/sqrt(2)) < conf); - return answer; - } + return answer; + } - CardinalityCounter :: CardinalityCounter(uint64_t size){ - m = size; - buckets = new uint8_t[m]; +CardinalityCounter::CardinalityCounter(uint64_t size) + { + m = size; + buckets = new uint8_t[m]; - if(m == 16) - alpha_m = 0.673; - else if(m == 32) - alpha_m = 0.697; - else if(m == 64) - alpha_m = 0.709; - else - alpha_m = 0.7213/(1+1.079/m); + if(m == 16) + alpha_m = 0.673; + else if(m == 32) + alpha_m = 0.697; + else if(m == 64) + alpha_m = 0.709; + else + alpha_m = 0.7213/(1+1.079/m); - for(uint64_t i = 0; i < m; i++){ - buckets[i] = 0; - } - - V = m; - - } - - CardinalityCounter :: CardinalityCounter(double error_margin){ - int b = optimalB(error_margin); - m = (uint64_t) pow(2, b); - buckets = new uint8_t[m]; - - if(m == 16) - alpha_m = 0.673; - else if(m == 32) - alpha_m = 0.697; - else if(m == 64) - alpha_m = 0.709; - else - alpha_m = 0.7213/(1+1.079/m); - - for(uint64_t i = 0; i < m; i++){ - buckets[i] = 0; - } - - V = m; - } - - CardinalityCounter :: ~CardinalityCounter(){ - delete [] buckets; - delete &m; - delete &V; - delete &alpha_m; - } - - uint8_t CardinalityCounter :: rank(uint64_t hash_modified){ - uint8_t answer = 0; - hash_modified = (uint64_t)(hash_modified/m); - hash_modified *= 2; - do{ - hash_modified = (uint64_t) (hash_modified/2); - answer++; - }while(hash_modified%2 == 0); - return 
answer; - } - - - - void CardinalityCounter::addElement(uint64_t hash){ - uint64_t index = hash % m; - hash = hash-index; - - if(buckets[index] == 0) - V--; - uint8_t temp = rank(hash); - if(temp > buckets[index]){ - buckets[index] = temp; - } - } + for (uint64_t i = 0; i < m; i++) + buckets[i] = 0; - double CardinalityCounter::size(){ - double answer = 0; - for(int i = 0; i < m; i++){ - answer += pow(2, -(int)buckets[i]); - } - answer = 1/answer; - answer = alpha_m*m*m*answer; + V = m; + } + +CardinalityCounter :: CardinalityCounter(double error_margin) + { + int b = optimalB(error_margin); + m = (uint64_t) pow(2, b); + buckets = new uint8_t[m]; - if(answer <= 5*(double)(m/2)){ - return m*log((double) m/V); - } - else if(answer <= pow(2,64)/30){ - return answer; - } - else{ - return -pow(2,64)*log(1-answer/pow(2,64)); - } - } + if(m == 16) + alpha_m = 0.673; + else if(m == 32) + alpha_m = 0.697; + else if(m == 64) + alpha_m = 0.709; + else + alpha_m = 0.7213/(1+1.079/m); - void CardinalityCounter::merge(CardinalityCounter* c){ - uint8_t* temp = (*c).getBuckets(); - V = 0; - for(int i = 0; i < m; i++){ - if(temp[i] > buckets[i]){ - buckets[i] = temp[i]; - } - if(buckets[i] == 0){ - V += 1; - } - } - } + for (uint64_t i = 0; i < m; i++) + buckets[i] = 0; - uint8_t* CardinalityCounter::getBuckets(){ - return buckets; - } + V = m; +} - uint64_t CardinalityCounter::getM(){ - return m; - } +CardinalityCounter::~CardinalityCounter() + { + delete [] buckets; + delete &m; + delete &V; + delete &alpha_m; + } + +uint8_t CardinalityCounter::rank(uint64_t hash_modified) + { + uint8_t answer = 0; + hash_modified = (uint64_t)(hash_modified/m); + hash_modified *= 2; + do + { + hash_modified = (uint64_t) (hash_modified/2); + answer++; + } + while (hash_modified%2 == 0); + + return answer; + } + +void CardinalityCounter::addElement(uint64_t hash) + { + uint64_t index = hash % m; + hash = hash-index; + + if(buckets[index] == 0) + V--; + + uint8_t temp = rank(hash); + + if (temp > 
buckets[index]) + buckets[index] = temp; +} + +double CardinalityCounter::size() + { + double answer = 0; + for (int i = 0; i < m; i++) + answer += pow(2, -(int)buckets[i]); + + answer = 1/answer; + answer = alpha_m*m*m*answer; + + if (answer <= 5*(double)(m/2)) + return m*log((double) m/V); + else if(answer <= pow(2,64)/30) + return answer; + else + return -pow(2,64)*log(1-answer/pow(2,64)); +} + +void CardinalityCounter::merge(CardinalityCounter* c) + { + uint8_t* temp = (*c).getBuckets(); + V = 0; + for (int i = 0; i < m; i++) + { + if (temp[i] > buckets[i]) + buckets[i] = temp[i]; + + if (buckets[i] == 0) + V += 1; + } + } + +uint8_t* CardinalityCounter::getBuckets() + { + return buckets; + } + +uint64_t CardinalityCounter::getM() + { + return m; + } diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h index ba9a46f1bd..81c19067a1 100644 --- a/src/HyperLogLog.h +++ b/src/HyperLogLog.h @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. + #include /* From 53d6f3aae70a770c977dcbf97d98b9ed4d267058 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Sun, 7 Apr 2013 23:05:14 +0200 Subject: [PATCH 10/32] rework cardinality interface to use opaque. I like it better... 
--- src/OpaqueVal.cc | 22 +++ src/OpaqueVal.h | 18 +++ src/bro.bif | 207 ++++++++----------------- testing/btest/bifs/hll_cardinality.bro | 145 ++++++----------- 4 files changed, 147 insertions(+), 245 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 51f975edf8..604ce2938e 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -2,6 +2,28 @@ #include "Reporter.h" #include "Serializer.h" + +CardinalityVal::CardinalityVal() + { + valid = false; + } + +CardinalityVal::~CardinalityVal() + { + if ( valid && c ) + delete c; + } + +bool CardinalityVal::Init(CardinalityCounter* arg_c) + { + if ( valid ) + return false; + + valid = true; + c = arg_c; + return valid; + } + bool HashVal::IsValid() const { return valid; diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 0428e50bdb..01f86529c7 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -7,6 +7,24 @@ #include "Val.h" #include "digest.h" +class CardinalityCounter; + +class CardinalityVal: public OpaqueVal { +public: + CardinalityVal(); + ~CardinalityVal(); + bool Init(CardinalityCounter*); + bool IsValid() const { return valid; }; + CardinalityCounter* Get() { return c; }; + +private: + bool valid; + CardinalityCounter* c; + +// DECLARE_SERIAL(CardinalityVal); Fixme? +}; + + class HashVal : public OpaqueVal { public: virtual bool IsValid() const; diff --git a/src/bro.bif b/src/bro.bif index 13e0d6e407..6e39ee16b0 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5650,44 +5650,36 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr %%{ #include "HyperLogLog.h" -static map hll_counters; - -BroString* convert_index_to_string(Val* index) - { - ODesc d; - index->Describe(&d); - BroString* s = new BroString(1, d.TakeBytes(), d.Len()); - s->SetUseFreeToDelete(1); - return s; - } %%} ## Initializes the hash for the HyperLogLog cardinality counting algorithm. ## It returns true if it was successful in creating a structure and false ## if it wasn't. 
-function hll_cardinality_init%(err: double,index: any%): bool +function hll_cardinality_init%(err: double%): opaque of cardinality %{ - BroString* s = convert_index_to_string(index); - int status = 0; + CardinalityCounter* c = new CardinalityCounter(err); + CardinalityVal* cv = new CardinalityVal(); - if ( hll_counters.count(*s) < 1 ) - { - hll_counters[*s] = new CardinalityCounter(err); - status = 1; - } + if ( !c ) + reporter->Error("Failed initialize Cardinality counter"); + else + cv->Init(c); - delete s; - return new Val(status, TYPE_BOOL); + return cv; %} ## Adds an element to the HyperLogLog data structure located at index. ##elem->Type() to get the type of elem. -function hll_cardinality_add%(elem: any, index: any%): bool +function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool %{ - BroString* s = convert_index_to_string(index); + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + int status = 0; uint64_t a = 123456; @@ -5696,19 +5688,13 @@ function hll_cardinality_add%(elem: any, index: any%): bool CompositeHash* hll_hash = new CompositeHash(tl); Unref(tl); - if( hll_counters.count(*s) > 0 ) - { - CardinalityCounter* h = hll_counters[*s]; - HashKey* key = hll_hash->ComputeHash(elem, 1); - a = key->Hash(); - h->addElement(a); - status = 1; - delete key; - } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); + HashKey* key = hll_hash->ComputeHash(elem, 1); + a = key->Hash(); + h->addElement(a); delete hll_hash; - delete s; - return new Val(status, TYPE_BOOL); + return new Val(1, TYPE_BOOL); %} ## The data structure at index1 will contain the combined count for the @@ -5716,135 +5702,68 @@ function hll_cardinality_add%(elem: any, index: any%): bool ## It returns true if it either cloned the value at index2 into index1 ## or if it merged the two data structures together. 
-function hll_cardinality_merge_into%(index1: any, index2: any%): bool +function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: opaque of cardinality%): bool %{ - BroString* s1 = convert_index_to_string(index1); - BroString* s2 = convert_index_to_string(index2); - int status = 0; + CardinalityVal* v1 = (CardinalityVal*) handle1; + CardinalityVal* v2 = (CardinalityVal*) handle2; - if(hll_counters.count(*s1) < 1) - { - if(hll_counters.count(*s2) < 1) - { - status = 0; - } - else - { - uint64_t m = (*hll_counters[*s2]).getM(); - CardinalityCounter* newInst = new CardinalityCounter(m); - hll_counters[*s1] = newInst; - (*newInst).merge(hll_counters[*s2]); - status = 1; - } - } - else - { - if(hll_counters.count(*s2) < 1) - { - status = 0; - } - else - { - if((*hll_counters[*s2]).getM() == (*hll_counters[*s1]).getM()) - { - status = 1; - (*hll_counters[*s1]).merge(hll_counters[*s2]); - } - } - } + if ( !v1->IsValid() || !v2->IsValid() ) { + reporter->Error("need valid handles"); + return new Val(0, TYPE_BOOL); + } - delete s1; - delete s2; - return new Val(status, TYPE_BOOL); + CardinalityCounter* h1 = v1->Get(); + CardinalityCounter* h2 = v2->Get(); + h1->merge(h2); + + return new Val(1, TYPE_BOOL); %} ## Returns true if it destroyed something. False if it didn't. -function hll_cardinality_destroy%(index: any%): bool - %{ - BroString* s = convert_index_to_string(index); - int status = 0; - - if(hll_counters.count(*s) > 0) - { - hll_counters.erase(*s); - status = 1; - } - - delete s; - return new Val(status, TYPE_BOOL); - %} +#function hll_cardinality_destroy%(handle: opaque of cardinality%): bool +# %{ +# if ( !((CardinalityVal*) handle)->IsValid() ) { +# reporter->Error("Need valid handle"); +# return new Val(0, TYPE_BOOL); +# } +# CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); +# delete h; +# h = 0; +# return new Val(1, TYPE_BOOL); +# %} ## Returns the cardinality estimate. Returns -1.0 if there is nothing in that index. 
-function hll_cardinality_estimate%(index: any%): double +function hll_cardinality_estimate%(handle: opaque of cardinality%): double %{ - BroString* s = convert_index_to_string(index); - double estimate = -1.0; + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - if(hll_counters.count(*s) > 0) - { - estimate = (*hll_counters[*s]).size(); - } + double estimate = h->size(); - delete s; return new Val(estimate, TYPE_DOUBLE); %} -##I'm really not sure about the notation of this function... - -function hll_cardinality_keys%(%): string_set - %{ - TableVal* a = new TableVal(string_set); - map::iterator it; - int i = 0; - - for(it = hll_counters.begin() ; it != hll_counters.end(); it++) - { - BroString* s = (BroString*) &(it->first); - a->Assign(new StringVal(s), 0); - } - return a; - %} - ## Stores the data structure at index2 into index1. Deletes the data structure at index1 ## if there was any. Returns True if the data structure at index1 was changed in any way. 
-function hll_cardinality_clone%(index1: any, index2: any%): bool +function hll_cardinality_clone%(handle: opaque of cardinality%): opaque of cardinality %{ - BroString* s1 = convert_index_to_string(index1); - BroString* s2 = convert_index_to_string(index2); - int status = 0; + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - if(hll_counters.count(*s2) < 1) - { - if(hll_counters.count(*s1) < 1) - { - status = 0; - } - else - { - delete hll_counters[*s1]; - status = 1; - } - } - else - { - uint64_t m = (*hll_counters[*s2]).getM(); - CardinalityCounter* newInst = new CardinalityCounter(m); - int i = 0; - (*newInst).merge(hll_counters[*s2]); - if(hll_counters.count(*s1) < 1) - { - hll_counters[*s1] = newInst; - } - else - { - delete hll_counters[*s1]; - hll_counters[*s1] = newInst; - } - status = 1; - } - delete s1; - delete s2; - return new Val(status, TYPE_BOOL); + + uint64_t m = h->getM(); + CardinalityCounter* h2 = new CardinalityCounter(m); + int i = 0; + h2->merge(h); + CardinalityVal* cv = new CardinalityVal(); + cv->Init(h2); + return cv; %} diff --git a/testing/btest/bifs/hll_cardinality.bro b/testing/btest/bifs/hll_cardinality.bro index 093de134df..774e8f6e28 100644 --- a/testing/btest/bifs/hll_cardinality.bro +++ b/testing/btest/bifs/hll_cardinality.bro @@ -4,135 +4,78 @@ event bro_init() { - local m1 = "measurement1"; - local m2 = "measurement2"; - - print "This value should be true:"; - print hll_cardinality_init(0.01, m1); - hll_cardinality_init(0.01, m2); - - print "This value should be false:"; - print hll_cardinality_init(0.02, "measurement1"); + local c1 = hll_cardinality_init(0.01); + local c2 = hll_cardinality_init(0.01); local add1 = "hey"; local add2 = "hi"; local add3 = 123; - hll_cardinality_add(add1, m1); - hll_cardinality_add(add2, m1); - hll_cardinality_add(add3, m1); - hll_cardinality_add("a", m1); - 
hll_cardinality_add("b", m1); - hll_cardinality_add("c", m1); - hll_cardinality_add("d", m1); - hll_cardinality_add("e", m1); - hll_cardinality_add("f", m1); - hll_cardinality_add("g", m1); - hll_cardinality_add("h", m1); - hll_cardinality_add("i", m1); + hll_cardinality_add(c1, add1); + hll_cardinality_add(c1, add2); + hll_cardinality_add(c1, add3); + hll_cardinality_add(c1, "a"); + hll_cardinality_add(c1, "b"); + hll_cardinality_add(c1, "c"); + hll_cardinality_add(c1, "d"); + hll_cardinality_add(c1, "e"); + hll_cardinality_add(c1, "f"); + hll_cardinality_add(c1, "g"); + hll_cardinality_add(c1, "h"); + hll_cardinality_add(c1, "i"); + hll_cardinality_add(c1, "j"); - print "This value should be true:"; - print hll_cardinality_add("j", m1); - - print "This value should be false:"; - print hll_cardinality_add("asdf", "something"); - - - hll_cardinality_add(add1, m2); - hll_cardinality_add(add2, m2); - hll_cardinality_add(add3, m2); - hll_cardinality_add(1, m2); - hll_cardinality_add("b", m2); - hll_cardinality_add(2, m2); - hll_cardinality_add(3, m2); - hll_cardinality_add(4, m2); - hll_cardinality_add(5, m2); - hll_cardinality_add(6, m2); - hll_cardinality_add(7, m2); - hll_cardinality_add(8, m2); + hll_cardinality_add(c2, add1); + hll_cardinality_add(c2, add2); + hll_cardinality_add(c2, add3); + hll_cardinality_add(c2, 1); + hll_cardinality_add(c2, "b"); + hll_cardinality_add(c2, 2); + hll_cardinality_add(c2, 3); + hll_cardinality_add(c2, 4); + hll_cardinality_add(c2, 5); + hll_cardinality_add(c2, 6); + hll_cardinality_add(c2, 7); + hll_cardinality_add(c2, 8); print "This value should be around 13:"; - print hll_cardinality_estimate("measurement1"); + print hll_cardinality_estimate(c1); - print "This value should be -1.0:"; - print hll_cardinality_estimate("m2"); + print "This value should be about 12:"; + print hll_cardinality_estimate(c2); - hll_cardinality_init(0.02, "m2"); + local m2 = hll_cardinality_init(0.02); print "This value should be around 0:"; - print 
hll_cardinality_estimate("m2"); + print hll_cardinality_estimate(m2); - print "This value should be true:"; - print hll_cardinality_destroy("m2"); - - print "This value should be false:"; - print hll_cardinality_destroy("m2"); - - print "This value should be -1.0:"; - print hll_cardinality_estimate("m2"); - - print "This next thing should be false:"; - print hll_cardinality_clone("m3", "m2"); - - print "This next thing should be true:"; - print hll_cardinality_clone("measurement3", "measurement1"); + local c3 = hll_cardinality_clone(c1); print "This value should be around 13:"; - print hll_cardinality_estimate("measurement3"); + print hll_cardinality_estimate(c3); - hll_cardinality_destroy("measurement3"); - - print "This next thing should be equal to -1.0:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be around 13:"; - print hll_cardinality_estimate("measurement1"); + c3 = hll_cardinality_init(0.01); + print "This value should be 0:"; + print hll_cardinality_estimate(c3); print "This value should be true:"; - print hll_cardinality_merge_into("measurement3", "measurement2"); - - print "This value should be false:"; - print hll_cardinality_merge_into("measurement4", "measurement6"); + print hll_cardinality_merge_into(c3, c2); print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be false:"; - print hll_cardinality_merge_into("measurement3", "measurement15"); - - print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); + print hll_cardinality_estimate(c2); + print hll_cardinality_estimate(c3); print "This value should be true:"; - print hll_cardinality_merge_into("measurement2", "measurement1"); + print hll_cardinality_merge_into(c2, c1); print "This value should be about 21:"; - print hll_cardinality_estimate("measurement2"); + print hll_cardinality_estimate(c2); print "This value should be about 13:"; - print 
hll_cardinality_estimate("measurement1"); + print hll_cardinality_estimate(c1); print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); + print hll_cardinality_estimate(c3); - local keys = hll_cardinality_keys(); - for(key in keys) - { - print "The key is:"; - print key; - print "The value is:"; - print hll_cardinality_estimate(key); - } } -#function hll_cardinality_keys%(%): bool -# %{ -#// TableVal* a = new TableVal(string_set); -#// map::iterator it; -# -#// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) -#// { -#// a->Assign((*it).first); -#// } -#// return a; -# return new Val(1, TYPE_BOOL); From 7f5e2b130172c3c6350fbbc188d57d5e44bfd3ad Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 8 Apr 2013 09:44:24 +0200 Subject: [PATCH 11/32] and test results. are those stable accross platforms? Or do we have to do some kind of rounding? --- .../btest/Baseline/bifs.hll_cardinality/out | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 testing/btest/Baseline/bifs.hll_cardinality/out diff --git a/testing/btest/Baseline/bifs.hll_cardinality/out b/testing/btest/Baseline/bifs.hll_cardinality/out new file mode 100644 index 0000000000..8d20248cc3 --- /dev/null +++ b/testing/btest/Baseline/bifs.hll_cardinality/out @@ -0,0 +1,23 @@ +This value should be around 13: +13.00129 +This value should be about 12: +12.001099 +This value should be around 0: +0.0 +This value should be around 13: +13.00129 +This value should be 0: +0.0 +This value should be true: +T +This value should be about 12: +12.001099 +12.001099 +This value should be true: +T +This value should be about 21: +21.003365 +This value should be about 13: +13.00129 +This value should be about 12: +12.001099 From 7eee2f0d17d7b51dcd893a0cacb87474112a4fb3 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 8 Apr 2013 10:00:34 +0200 Subject: [PATCH 12/32] measurement framework with hll unique --- 
scripts/base/frameworks/measurement/plugins/__load__.bro | 3 ++- .../scripts.base.frameworks.measurement.basic/.stdout | 6 +++--- testing/btest/scripts/base/frameworks/measurement/basic.bro | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/base/frameworks/measurement/plugins/__load__.bro b/scripts/base/frameworks/measurement/plugins/__load__.bro index 0d4c2ed302..0a51a081a9 100644 --- a/scripts/base/frameworks/measurement/plugins/__load__.bro +++ b/scripts/base/frameworks/measurement/plugins/__load__.bro @@ -5,4 +5,5 @@ @load ./std-dev @load ./sum @load ./unique -@load ./variance \ No newline at end of file +@load ./variance +@load ./hll_unique diff --git a/testing/btest/Baseline/scripts.base.frameworks.measurement.basic/.stdout b/testing/btest/Baseline/scripts.base.frameworks.measurement.basic/.stdout index 208b6103b7..5f1c5ab5e4 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.measurement.basic/.stdout +++ b/testing/btest/Baseline/scripts.base.frameworks.measurement.basic/.stdout @@ -1,3 +1,3 @@ -Host: 6.5.4.3 - num:1 - sum:2.0 - var:0.0 - avg:2.0 - max:2.0 - min:2.0 - std_dev:0.0 - unique:1 -Host: 1.2.3.4 - num:5 - sum:221.0 - var:1144.2 - avg:44.2 - max:94.0 - min:5.0 - std_dev:33.8 - unique:4 -Host: 7.2.1.5 - num:1 - sum:1.0 - var:0.0 - avg:1.0 - max:1.0 - min:1.0 - std_dev:0.0 - unique:1 +Host: 6.5.4.3 - num:1 - sum:2.0 - var:0.0 - avg:2.0 - max:2.0 - min:2.0 - std_dev:0.0 - unique:1 - hllunique:1.0 +Host: 1.2.3.4 - num:5 - sum:221.0 - var:1144.2 - avg:44.2 - max:94.0 - min:5.0 - std_dev:33.8 - unique:4 - hllunique:4.0 +Host: 7.2.1.5 - num:1 - sum:1.0 - var:0.0 - avg:1.0 - max:1.0 - min:1.0 - std_dev:0.0 - unique:1 - hllunique:1.0 diff --git a/testing/btest/scripts/base/frameworks/measurement/basic.bro b/testing/btest/scripts/base/frameworks/measurement/basic.bro index e9dd21e0ef..4706a7c9b1 100644 --- a/testing/btest/scripts/base/frameworks/measurement/basic.bro +++ 
b/testing/btest/scripts/base/frameworks/measurement/basic.bro @@ -10,7 +10,8 @@ event bro_init() &priority=5 Measurement::MAX, Measurement::MIN, Measurement::STD_DEV, - Measurement::UNIQUE)]; + Measurement::UNIQUE, + Measurement::HLLUNIQUE)]; Measurement::create([$epoch=3secs, $reducers=set(r1), $epoch_finished(data: Measurement::ResultTable) = @@ -18,7 +19,7 @@ event bro_init() &priority=5 for ( key in data ) { local r = data[key]["test.metric"]; - print fmt("Host: %s - num:%d - sum:%.1f - var:%.1f - avg:%.1f - max:%.1f - min:%.1f - std_dev:%.1f - unique:%d", key$host, r$num, r$sum, r$variance, r$average, r$max, r$min, r$std_dev, r$unique); + print fmt("Host: %s - num:%d - sum:%.1f - var:%.1f - avg:%.1f - max:%.1f - min:%.1f - std_dev:%.1f - unique:%d - hllunique:%.1f", key$host, r$num, r$sum, r$variance, r$average, r$max, r$min, r$std_dev, r$unique, hll_cardinality_estimate(r$hllunique)); } } ]); From ac0e211c6cb02e1160fc7153d5e25111acdb4ebd Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 8 Apr 2013 10:01:55 +0200 Subject: [PATCH 13/32] do away with old file. 
--- mytests.bro | 134 ---------------------------------------------------- 1 file changed, 134 deletions(-) delete mode 100644 mytests.bro diff --git a/mytests.bro b/mytests.bro deleted file mode 100644 index 9392b205b0..0000000000 --- a/mytests.bro +++ /dev/null @@ -1,134 +0,0 @@ -event bro_init() - { - local m1 = "measurement1"; - local m2 = "measurement2"; - - print "This value should be true:"; - print hll_cardinality_init(0.01, m1); - hll_cardinality_init(0.01, m2); - - print "This value should be false:"; - print hll_cardinality_init(0.02, "measurement1"); - - local add1 = "hey"; - local add2 = "hi"; - local add3 = 123; - - hll_cardinality_add(add1, m1); - hll_cardinality_add(add2, m1); - hll_cardinality_add(add3, m1); - hll_cardinality_add("a", m1); - hll_cardinality_add("b", m1); - hll_cardinality_add("c", m1); - hll_cardinality_add("d", m1); - hll_cardinality_add("e", m1); - hll_cardinality_add("f", m1); - hll_cardinality_add("g", m1); - hll_cardinality_add("h", m1); - hll_cardinality_add("i", m1); - - print "This value should be true:"; - print hll_cardinality_add("j", m1); - - print "This value should be false:"; - print hll_cardinality_add("asdf", "something"); - - - hll_cardinality_add(add1, m2); - hll_cardinality_add(add2, m2); - hll_cardinality_add(add3, m2); - hll_cardinality_add(1, m2); - hll_cardinality_add("b", m2); - hll_cardinality_add(2, m2); - hll_cardinality_add(3, m2); - hll_cardinality_add(4, m2); - hll_cardinality_add(5, m2); - hll_cardinality_add(6, m2); - hll_cardinality_add(7, m2); - hll_cardinality_add(8, m2); - - print "This value should be around 13:"; - print hll_cardinality_estimate("measurement1"); - - print "This value should be -1.0:"; - print hll_cardinality_estimate("m2"); - - hll_cardinality_init(0.02, "m2"); - - print "This value should be around 0:"; - print hll_cardinality_estimate("m2"); - - print "This value should be true:"; - print hll_cardinality_destroy("m2"); - - print "This value should be false:"; - print 
hll_cardinality_destroy("m2"); - - print "This value should be -1.0:"; - print hll_cardinality_estimate("m2"); - - print "This next thing should be false:"; - print hll_cardinality_clone("m3", "m2"); - - print "This next thing should be true:"; - print hll_cardinality_clone("measurement3", "measurement1"); - - print "This value should be around 13:"; - print hll_cardinality_estimate("measurement3"); - - hll_cardinality_destroy("measurement3"); - - print "This next thing should be equal to -1.0:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be around 13:"; - print hll_cardinality_estimate("measurement1"); - - print "This value should be true:"; - print hll_cardinality_merge_into("measurement3", "measurement2"); - - print "This value should be false:"; - print hll_cardinality_merge_into("measurement4", "measurement6"); - - print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be false:"; - print hll_cardinality_merge_into("measurement3", "measurement15"); - - print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be true:"; - print hll_cardinality_merge_into("measurement2", "measurement1"); - - print "This value should be about 21:"; - print hll_cardinality_estimate("measurement2"); - - print "This value should be about 13:"; - print hll_cardinality_estimate("measurement1"); - - print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); - - local keys = hll_cardinality_keys(); - for(key in keys) - { - print "The key is:"; - print key; - print "The value is:"; - print hll_cardinality_estimate(key); - } - } - -#function hll_cardinality_keys%(%): bool -# %{ -#// TableVal* a = new TableVal(string_set); -#// map::iterator it; -# -#// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) -#// { -#// a->Assign((*it).first); -#// } -#// return a; -# return new Val(1, 
TYPE_BOOL); From bcd610fd50252b2095639e0d5821af1088e36325 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 8 Apr 2013 10:55:00 +0200 Subject: [PATCH 14/32] Forgot a file. Again. Like always. Basically. --- .../measurement/plugins/hll_unique.bro | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scripts/base/frameworks/measurement/plugins/hll_unique.bro diff --git a/scripts/base/frameworks/measurement/plugins/hll_unique.bro b/scripts/base/frameworks/measurement/plugins/hll_unique.bro new file mode 100644 index 0000000000..0e95e6fcdd --- /dev/null +++ b/scripts/base/frameworks/measurement/plugins/hll_unique.bro @@ -0,0 +1,39 @@ + +module Measurement; + +export { + redef enum Calculation += { + ## Calculate the number of unique values. + HLLUNIQUE + }; + + redef record ResultVal += { + ## If cardinality is being tracked, the number of unique + ## items is tracked here. + hllunique: opaque of cardinality &default=hll_cardinality_init(0.01); + }; +} + +hook init_resultval_hook(r: Reducer, rv: ResultVal) + { + if ( HLLUNIQUE in r$apply && ! 
rv?$hllunique ) + rv$hllunique = hll_cardinality_init(0.01); + } + + +hook add_to_reducer_hook(r: Reducer, val: double, data: DataPoint, rv: ResultVal) + { + if ( HLLUNIQUE in r$apply ) + { + hll_cardinality_add(rv$hllunique, data); + } + } + +hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) + { + local rhll = hll_cardinality_init(0.01); + hll_cardinality_merge_into(rhll, rv1$hllunique); + hll_cardinality_merge_into(rhll, rv2$hllunique); + + result$hllunique = rhll; + } From f10ed9e29a6abf56e8110e2cc400f8e796e4c07a Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 10 Apr 2013 10:45:45 -0400 Subject: [PATCH 15/32] change plugin after feedback of seth --- .../measurement/plugins/hll_unique.bro | 25 +++++++++++++------ .../base/frameworks/measurement/basic.bro | 2 +- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/base/frameworks/measurement/plugins/hll_unique.bro b/scripts/base/frameworks/measurement/plugins/hll_unique.bro index 0e95e6fcdd..ccdb872606 100644 --- a/scripts/base/frameworks/measurement/plugins/hll_unique.bro +++ b/scripts/base/frameworks/measurement/plugins/hll_unique.bro @@ -10,14 +10,23 @@ export { redef record ResultVal += { ## If cardinality is being tracked, the number of unique ## items is tracked here. - hllunique: opaque of cardinality &default=hll_cardinality_init(0.01); + hllunique: count &default=0; }; } +redef record ResultVal += { + # Internal use only. This is not meant to be publically available + # because probabilistic data structures have to be examined using + # specialized bifs. + card: opaque of cardinality &default=hll_cardinality_init(0.01); +}; + + hook init_resultval_hook(r: Reducer, rv: ResultVal) { - if ( HLLUNIQUE in r$apply && ! rv?$hllunique ) - rv$hllunique = hll_cardinality_init(0.01); + if ( HLLUNIQUE in r$apply && ! 
rv?$card ) + rv$card = hll_cardinality_init(0.01); + rv$hllunique = 0; } @@ -25,15 +34,17 @@ hook add_to_reducer_hook(r: Reducer, val: double, data: DataPoint, rv: ResultVal { if ( HLLUNIQUE in r$apply ) { - hll_cardinality_add(rv$hllunique, data); + hll_cardinality_add(rv$card, data); + rv$hllunique = double_to_count(hll_cardinality_estimate(rv$card)); } } hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) { local rhll = hll_cardinality_init(0.01); - hll_cardinality_merge_into(rhll, rv1$hllunique); - hll_cardinality_merge_into(rhll, rv2$hllunique); + hll_cardinality_merge_into(rhll, rv1$card); + hll_cardinality_merge_into(rhll, rv2$card); - result$hllunique = rhll; + result$card = rhll; + result$hllunique = double_to_count(hll_cardinality_estimate(rhll)); } diff --git a/testing/btest/scripts/base/frameworks/measurement/basic.bro b/testing/btest/scripts/base/frameworks/measurement/basic.bro index 4706a7c9b1..701b79fbb3 100644 --- a/testing/btest/scripts/base/frameworks/measurement/basic.bro +++ b/testing/btest/scripts/base/frameworks/measurement/basic.bro @@ -19,7 +19,7 @@ event bro_init() &priority=5 for ( key in data ) { local r = data[key]["test.metric"]; - print fmt("Host: %s - num:%d - sum:%.1f - var:%.1f - avg:%.1f - max:%.1f - min:%.1f - std_dev:%.1f - unique:%d - hllunique:%.1f", key$host, r$num, r$sum, r$variance, r$average, r$max, r$min, r$std_dev, r$unique, hll_cardinality_estimate(r$hllunique)); + print fmt("Host: %s - num:%d - sum:%.1f - var:%.1f - avg:%.1f - max:%.1f - min:%.1f - std_dev:%.1f - unique:%d - hllunique:%d", key$host, r$num, r$sum, r$variance, r$average, r$max, r$min, r$std_dev, r$unique, r$hllunique); } } ]); From a37ffab0ea6a2780469be2e6001709748f1034e7 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 10 Apr 2013 13:15:31 -0400 Subject: [PATCH 16/32] serialization compiles. Not entirely sure if it works too... 
--- src/HyperLogLog.h | 3 +++ src/OpaqueVal.cc | 46 +++++++++++++++++++++++++++++++++++++- src/OpaqueVal.h | 6 ++--- src/SerialTypes.h | 1 + src/SerializationFormat.cc | 30 +++++++++++++++++++++++++ src/SerializationFormat.h | 6 +++++ src/Serializer.h | 1 + 7 files changed, 89 insertions(+), 4 deletions(-) diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h index 81c19067a1..34a2afd9b7 100644 --- a/src/HyperLogLog.h +++ b/src/HyperLogLog.h @@ -10,7 +10,10 @@ */ #define conf .95 +class CardinalityVal; + class CardinalityCounter { + friend class CardinalityVal; private: /* diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 604ce2938e..d1d97c1f8c 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,9 +1,10 @@ #include "OpaqueVal.h" #include "Reporter.h" #include "Serializer.h" +#include "HyperLogLog.h" -CardinalityVal::CardinalityVal() +CardinalityVal::CardinalityVal() : OpaqueVal(new OpaqueType("cardinality")) { valid = false; } @@ -14,6 +15,49 @@ CardinalityVal::~CardinalityVal() delete c; } +IMPLEMENT_SERIAL(CardinalityVal, SER_CARDINALITY_VAL); + +bool CardinalityVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_CARDINALITY_VAL, OpaqueVal); + + if ( ! IsValid() ) + return true; + + assert(c); + + bool valid = true; + + valid &= SERIALIZE(c->m); + for ( int i = 0; i < c->m; i++ ) + { + valid &= SERIALIZE(c->buckets[i]); + } + + return valid; + } + +bool CardinalityVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + + if ( ! 
IsValid() ) + return true; + + uint64_t m; + bool valid = UNSERIALIZE(&m); + + c = new CardinalityCounter(m); + uint8_t* buckets = c->buckets; + for ( int i = 0; i < m; i++ ) + { + uint8_t* currbucket = buckets + i; + valid &= UNSERIALIZE( currbucket ); + } + + return valid; + } + bool CardinalityVal::Init(CardinalityCounter* arg_c) { if ( valid ) diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 01f86529c7..1bdd842cad 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -11,17 +11,17 @@ class CardinalityCounter; class CardinalityVal: public OpaqueVal { public: - CardinalityVal(); - ~CardinalityVal(); bool Init(CardinalityCounter*); bool IsValid() const { return valid; }; CardinalityCounter* Get() { return c; }; + CardinalityVal(); + ~CardinalityVal(); private: bool valid; CardinalityCounter* c; -// DECLARE_SERIAL(CardinalityVal); Fixme? + DECLARE_SERIAL(CardinalityVal); }; diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 723badab1e..56a1c5e8dd 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -104,6 +104,7 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) +SERIAL_VAL(CARDINALITY_VAL, 20) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) diff --git a/src/SerializationFormat.cc b/src/SerializationFormat.cc index 10dd4f29ea..271163d5db 100644 --- a/src/SerializationFormat.cc +++ b/src/SerializationFormat.cc @@ -107,6 +107,16 @@ bool BinarySerializationFormat::Read(int* v, const char* tag) return true; } +bool BinarySerializationFormat::Read(uint8* v, const char* tag) + { + if ( ! ReadData(v, sizeof(*v)) ) + return false; + + *v = ntohs(*v); + DBG_LOG(DBG_SERIAL, "Read uint8 %hu [%s]", *v, tag); + return true; + } + bool BinarySerializationFormat::Read(uint16* v, const char* tag) { if ( ! 
ReadData(v, sizeof(*v)) ) @@ -301,6 +311,13 @@ bool BinarySerializationFormat::Write(char v, const char* tag) return WriteData(&v, 1); } +bool BinarySerializationFormat::Write(uint8 v, const char* tag) + { + DBG_LOG(DBG_SERIAL, "Write uint8 %hu [%s]", v, tag); + v = htons(v); + return WriteData(&v, sizeof(v)); + } + bool BinarySerializationFormat::Write(uint16 v, const char* tag) { DBG_LOG(DBG_SERIAL, "Write uint16 %hu [%s]", v, tag); @@ -447,6 +464,12 @@ bool XMLSerializationFormat::Read(int* v, const char* tag) return false; } +bool XMLSerializationFormat::Read(uint8* v, const char* tag) + { + reporter->InternalError("no reading of xml"); + return false; + } + bool XMLSerializationFormat::Read(uint16* v, const char* tag) { reporter->InternalError("no reading of xml"); @@ -530,6 +553,13 @@ bool XMLSerializationFormat::Write(char v, const char* tag) return WriteElem(tag, "char", &v, 1); } +bool XMLSerializationFormat::Write(uint8 v, const char* tag) + { + const char* tmp = fmt("%" PRIu8, v); + return WriteElem(tag, "uint8", tmp, strlen(tmp)); + } + + bool XMLSerializationFormat::Write(uint16 v, const char* tag) { const char* tmp = fmt("%" PRIu16, v); diff --git a/src/SerializationFormat.h b/src/SerializationFormat.h index f270b61bae..05cf56d961 100644 --- a/src/SerializationFormat.h +++ b/src/SerializationFormat.h @@ -23,6 +23,7 @@ public: virtual void EndRead(); virtual bool Read(int* v, const char* tag) = 0; + virtual bool Read(uint8* v, const char* tag) = 0; virtual bool Read(uint16* v, const char* tag) = 0; virtual bool Read(uint32* v, const char* tag) = 0; virtual bool Read(int64* v, const char* tag) = 0; @@ -47,6 +48,7 @@ public: virtual uint32 EndWrite(char** data); // passes ownership virtual bool Write(int v, const char* tag) = 0; + virtual bool Write(uint8 v, const char* tag) = 0; virtual bool Write(uint16 v, const char* tag) = 0; virtual bool Write(uint32 v, const char* tag) = 0; virtual bool Write(int64 v, const char* tag) = 0; @@ -92,6 +94,7 @@ public: 
virtual ~BinarySerializationFormat(); virtual bool Read(int* v, const char* tag); + virtual bool Read(uint8* v, const char* tag); virtual bool Read(uint16* v, const char* tag); virtual bool Read(uint32* v, const char* tag); virtual bool Read(int64* v, const char* tag); @@ -106,6 +109,7 @@ public: virtual bool Read(struct in_addr* addr, const char* tag); virtual bool Read(struct in6_addr* addr, const char* tag); virtual bool Write(int v, const char* tag); + virtual bool Write(uint8 v, const char* tag); virtual bool Write(uint16 v, const char* tag); virtual bool Write(uint32 v, const char* tag); virtual bool Write(int64 v, const char* tag); @@ -132,6 +136,7 @@ public: // We don't write anything if tag is nil. virtual bool Write(int v, const char* tag); + virtual bool Write(uint8 v, const char* tag); virtual bool Write(uint16 v, const char* tag); virtual bool Write(uint32 v, const char* tag); virtual bool Write(int64 v, const char* tag); @@ -152,6 +157,7 @@ public: // Not implemented. virtual bool Read(int* v, const char* tag); + virtual bool Read(uint8* v, const char* tag); virtual bool Read(uint16* v, const char* tag); virtual bool Read(uint32* v, const char* tag); virtual bool Read(int64* v, const char* tag); diff --git a/src/Serializer.h b/src/Serializer.h index 72e0723880..719d4dc527 100644 --- a/src/Serializer.h +++ b/src/Serializer.h @@ -54,6 +54,7 @@ public: DECLARE_WRITE(type) DECLARE_IO(int) + DECLARE_IO(uint8) DECLARE_IO(uint16) DECLARE_IO(uint32) DECLARE_IO(int64) From 240d667e30568128422da75fc37a83c703635555 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 10 Apr 2013 13:45:21 -0400 Subject: [PATCH 17/32] ok, this bug was hard to find. hyperloglog.h was missing guards and randomly deleting memory at addresses equal to variable contents. I am not entirely sure why that did not crash before... 
--- src/HyperLogLog.cc | 5 +---- src/HyperLogLog.h | 11 ++++++++--- src/OpaqueVal.cc | 9 ++++++--- src/OpaqueVal.h | 4 ++-- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc index 436d754b4d..58cbd72e70 100644 --- a/src/HyperLogLog.cc +++ b/src/HyperLogLog.cc @@ -18,7 +18,7 @@ int CardinalityCounter::optimalB(double error) answer++; k = pow(2, (answer - initial_estimate)/2); } - while (erf(k/sqrt(2)) < conf); + while (erf(k/sqrt(2)) < HLL_CONF); return answer; } @@ -67,9 +67,6 @@ CardinalityCounter :: CardinalityCounter(double error_margin) CardinalityCounter::~CardinalityCounter() { delete [] buckets; - delete &m; - delete &V; - delete &alpha_m; } uint8_t CardinalityCounter::rank(uint64_t hash_modified) diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h index 34a2afd9b7..68b7f0ecfc 100644 --- a/src/HyperLogLog.h +++ b/src/HyperLogLog.h @@ -1,16 +1,19 @@ // See the file "COPYING" in the main distribution directory for copyright. +#ifndef hyperloglog_h +#define hyperloglog_h + #include +#include /* * "conf" is how confident the estimate given by the counter is. * - * In other words, if the cardinality is estimated to be 100 with 2% error margin and conf is + * In other words, if the cardinality is estimated to be 100 with 2% error margin and HLL_CONF is * 0.95, then we are 95% sure that the actual cardinality is between 98 and 102. 
*/ -#define conf .95 +#define HLL_CONF .95 -class CardinalityVal; class CardinalityCounter { friend class CardinalityVal; @@ -114,3 +117,5 @@ class CardinalityCounter { */ uint64_t getM(); }; + +#endif diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index d1d97c1f8c..4e8b0744f9 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -11,14 +11,16 @@ CardinalityVal::CardinalityVal() : OpaqueVal(new OpaqueType("cardinality")) CardinalityVal::~CardinalityVal() { - if ( valid && c ) + if ( valid && c != 0 ) delete c; + c = 0; + valid = false; } - IMPLEMENT_SERIAL(CardinalityVal, SER_CARDINALITY_VAL); bool CardinalityVal::DoSerialize(SerialInfo* info) const { + printf("Serializing\n"); DO_SERIALIZE(SER_CARDINALITY_VAL, OpaqueVal); if ( ! IsValid() ) @@ -39,6 +41,7 @@ bool CardinalityVal::DoSerialize(SerialInfo* info) const bool CardinalityVal::DoUnserialize(UnserialInfo* info) { + printf("Unserializing\n"); DO_UNSERIALIZE(OpaqueVal); if ( ! IsValid() ) @@ -56,7 +59,7 @@ bool CardinalityVal::DoUnserialize(UnserialInfo* info) } return valid; - } + } bool CardinalityVal::Init(CardinalityCounter* arg_c) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 1bdd842cad..dd70eaf96b 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -11,11 +11,11 @@ class CardinalityCounter; class CardinalityVal: public OpaqueVal { public: + CardinalityVal(); + ~CardinalityVal(); bool Init(CardinalityCounter*); bool IsValid() const { return valid; }; CardinalityCounter* Get() { return c; }; - CardinalityVal(); - ~CardinalityVal(); private: bool valid; From 5291bb29f2c50732d620a1b98aac7c4d4f983588 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 10 Apr 2013 16:05:24 -0400 Subject: [PATCH 18/32] and also serialize the other things we need --- src/OpaqueVal.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 4e8b0744f9..90230c37fe 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -31,6 +31,8 @@ bool 
CardinalityVal::DoSerialize(SerialInfo* info) const bool valid = true; valid &= SERIALIZE(c->m); + valid &= SERIALIZE(c->V); + valid &= SERIALIZE(c->alpha_m); for ( int i = 0; i < c->m; i++ ) { valid &= SERIALIZE(c->buckets[i]); @@ -51,6 +53,9 @@ bool CardinalityVal::DoUnserialize(UnserialInfo* info) bool valid = UNSERIALIZE(&m); c = new CardinalityCounter(m); + valid &= UNSERIALIZE(&c->V); + valid &= UNSERIALIZE(&c->alpha_m); + uint8_t* buckets = c->buckets; for ( int i = 0; i < m; i++ ) { From 70c020e412e02e9b177863766340e3eb72f6cf3f Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 16 Apr 2013 05:16:32 -0700 Subject: [PATCH 19/32] well, with this commit synchronizing the data structure should work.. ...if we had consistent hashing. --- src/OpaqueVal.cc | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 90230c37fe..6d56f3ae71 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -20,47 +20,48 @@ IMPLEMENT_SERIAL(CardinalityVal, SER_CARDINALITY_VAL); bool CardinalityVal::DoSerialize(SerialInfo* info) const { - printf("Serializing\n"); DO_SERIALIZE(SER_CARDINALITY_VAL, OpaqueVal); + bool serialvalid = true; + serialvalid &= SERIALIZE(&valid); + if ( ! IsValid() ) - return true; + return serialvalid; assert(c); - bool valid = true; - - valid &= SERIALIZE(c->m); - valid &= SERIALIZE(c->V); - valid &= SERIALIZE(c->alpha_m); + serialvalid &= SERIALIZE(c->m); + serialvalid &= SERIALIZE(c->V); + serialvalid &= SERIALIZE(c->alpha_m); for ( int i = 0; i < c->m; i++ ) { - valid &= SERIALIZE(c->buckets[i]); + serialvalid &= SERIALIZE(c->buckets[i]); } - return valid; + return serialvalid; } bool CardinalityVal::DoUnserialize(UnserialInfo* info) { - printf("Unserializing\n"); DO_UNSERIALIZE(OpaqueVal); + bool serialvalid = UNSERIALIZE(&valid); + if ( ! 
IsValid() ) - return true; + return serialvalid; uint64_t m; - bool valid = UNSERIALIZE(&m); + serialvalid &= UNSERIALIZE(&m); c = new CardinalityCounter(m); - valid &= UNSERIALIZE(&c->V); - valid &= UNSERIALIZE(&c->alpha_m); + serialvalid &= UNSERIALIZE(&c->V); + serialvalid &= UNSERIALIZE(&c->alpha_m); uint8_t* buckets = c->buckets; for ( int i = 0; i < m; i++ ) { uint8_t* currbucket = buckets + i; - valid &= UNSERIALIZE( currbucket ); + serialvalid &= UNSERIALIZE( currbucket ); } return valid; From 8340af55d1ac15a659488f117c2e1a8691542457 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Fri, 19 Apr 2013 09:52:45 -0700 Subject: [PATCH 20/32] persistence really works. It took me way too long to find this - I got the uint8 serialize/deserialize wrong :/ --- src/HyperLogLog.cc | 6 +-- src/OpaqueVal.cc | 5 +-- src/SerializationFormat.cc | 2 - .../btest/Baseline/bifs.hll_persistence/out | 6 +++ testing/btest/bifs/hll_persistence.bro | 40 +++++++++++++++++++ 5 files changed, 50 insertions(+), 9 deletions(-) create mode 100644 testing/btest/Baseline/bifs.hll_persistence/out create mode 100644 testing/btest/bifs/hll_persistence.bro diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc index 58cbd72e70..a399f5a495 100644 --- a/src/HyperLogLog.cc +++ b/src/HyperLogLog.cc @@ -43,7 +43,7 @@ CardinalityCounter::CardinalityCounter(uint64_t size) V = m; } -CardinalityCounter :: CardinalityCounter(double error_margin) +CardinalityCounter::CardinalityCounter(double error_margin) { int b = optimalB(error_margin); m = (uint64_t) pow(2, b); @@ -101,9 +101,9 @@ void CardinalityCounter::addElement(uint64_t hash) double CardinalityCounter::size() { double answer = 0; - for (int i = 0; i < m; i++) + for (int i = 0; i < m; i++) answer += pow(2, -(int)buckets[i]); - + answer = 1/answer; answer = alpha_m*m*m*answer; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 6d56f3ae71..b712f728f7 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -34,9 +34,7 @@ bool 
CardinalityVal::DoSerialize(SerialInfo* info) const serialvalid &= SERIALIZE(c->V); serialvalid &= SERIALIZE(c->alpha_m); for ( int i = 0; i < c->m; i++ ) - { - serialvalid &= SERIALIZE(c->buckets[i]); - } + serialvalid &= SERIALIZE( c->buckets[i] ); return serialvalid; } @@ -63,7 +61,6 @@ bool CardinalityVal::DoUnserialize(UnserialInfo* info) uint8_t* currbucket = buckets + i; serialvalid &= UNSERIALIZE( currbucket ); } - return valid; } diff --git a/src/SerializationFormat.cc b/src/SerializationFormat.cc index 271163d5db..a005a103cf 100644 --- a/src/SerializationFormat.cc +++ b/src/SerializationFormat.cc @@ -112,7 +112,6 @@ bool BinarySerializationFormat::Read(uint8* v, const char* tag) if ( ! ReadData(v, sizeof(*v)) ) return false; - *v = ntohs(*v); DBG_LOG(DBG_SERIAL, "Read uint8 %hu [%s]", *v, tag); return true; } @@ -314,7 +313,6 @@ bool BinarySerializationFormat::Write(char v, const char* tag) bool BinarySerializationFormat::Write(uint8 v, const char* tag) { DBG_LOG(DBG_SERIAL, "Write uint8 %hu [%s]", v, tag); - v = htons(v); return WriteData(&v, sizeof(v)); } diff --git a/testing/btest/Baseline/bifs.hll_persistence/out b/testing/btest/Baseline/bifs.hll_persistence/out new file mode 100644 index 0000000000..f5bb99d960 --- /dev/null +++ b/testing/btest/Baseline/bifs.hll_persistence/out @@ -0,0 +1,6 @@ +1 +10.000763 +2 +10.000763 +3 +11.000923 diff --git a/testing/btest/bifs/hll_persistence.bro b/testing/btest/bifs/hll_persistence.bro new file mode 100644 index 0000000000..a966ef9343 --- /dev/null +++ b/testing/btest/bifs/hll_persistence.bro @@ -0,0 +1,40 @@ +# @TEST-EXEC: bro -b %INPUT runnumber=1 >out +# @TEST-EXEC: bro -b %INPUT runnumber=2 >>out +# @TEST-EXEC: bro -b %INPUT runnumber=3 >>out +# @TEST-EXEC: btest-diff out + +global runnumber: count &redef; # differentiate first and second run + +global card: opaque of cardinality &persistent; + +event bro_init() + { + print runnumber; + + if ( runnumber == 1 ) + { + card = hll_cardinality_init(0.01); + + 
hll_cardinality_add(card, "a"); + hll_cardinality_add(card, "b"); + hll_cardinality_add(card, "c"); + hll_cardinality_add(card, "d"); + hll_cardinality_add(card, "e"); + hll_cardinality_add(card, "f"); + hll_cardinality_add(card, "g"); + hll_cardinality_add(card, "h"); + hll_cardinality_add(card, "i"); + hll_cardinality_add(card, "j"); + } + + print hll_cardinality_estimate(card); + + if ( runnumber == 2 ) + { + hll_cardinality_add(card, "a"); + hll_cardinality_add(card, "b"); + hll_cardinality_add(card, "c"); + hll_cardinality_add(card, "aa"); + } + } + From 6e532e89608fc188e00d266a41a90ba6fdc9b668 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Fri, 19 Apr 2013 09:58:57 -0700 Subject: [PATCH 21/32] update cluster test to also use hll --- .../manager-1..stdout | 8 ++++---- .../scripts/base/frameworks/sumstats/basic-cluster.bro | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/testing/btest/Baseline/scripts.base.frameworks.sumstats.basic-cluster/manager-1..stdout b/testing/btest/Baseline/scripts.base.frameworks.sumstats.basic-cluster/manager-1..stdout index ea8904d2e6..ab25d52947 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.sumstats.basic-cluster/manager-1..stdout +++ b/testing/btest/Baseline/scripts.base.frameworks.sumstats.basic-cluster/manager-1..stdout @@ -1,4 +1,4 @@ -Host: 6.5.4.3 - num:2 - sum:6.0 - avg:3.0 - max:5.0 - min:1.0 - var:8.0 - std_dev:2.8 - unique:2 -Host: 10.10.10.10 - num:1 - sum:5.0 - avg:5.0 - max:5.0 - min:5.0 - var:0.0 - std_dev:0.0 - unique:1 -Host: 1.2.3.4 - num:9 - sum:437.0 - avg:48.6 - max:95.0 - min:3.0 - var:758.8 - std_dev:27.5 - unique:8 -Host: 7.2.1.5 - num:2 - sum:145.0 - avg:72.5 - max:91.0 - min:54.0 - var:684.5 - std_dev:26.2 - unique:2 +Host: 6.5.4.3 - num:2 - sum:6.0 - avg:3.0 - max:5.0 - min:1.0 - var:8.0 - std_dev:2.8 - unique:2 - hllunique:2 +Host: 10.10.10.10 - num:1 - sum:5.0 - avg:5.0 - max:5.0 - min:5.0 - var:0.0 - std_dev:0.0 - unique:1 - hllunique:1 +Host: 1.2.3.4 - num:9 - 
sum:437.0 - avg:48.6 - max:95.0 - min:3.0 - var:758.8 - std_dev:27.5 - unique:8 - hllunique:8 +Host: 7.2.1.5 - num:2 - sum:145.0 - avg:72.5 - max:91.0 - min:54.0 - var:684.5 - std_dev:26.2 - unique:2 - hllunique:2 diff --git a/testing/btest/scripts/base/frameworks/sumstats/basic-cluster.bro b/testing/btest/scripts/base/frameworks/sumstats/basic-cluster.bro index 1b7903ca1a..080697a824 100644 --- a/testing/btest/scripts/base/frameworks/sumstats/basic-cluster.bro +++ b/testing/btest/scripts/base/frameworks/sumstats/basic-cluster.bro @@ -22,7 +22,7 @@ global n = 0; event bro_init() &priority=5 { - local r1: SumStats::Reducer = [$stream="test", $apply=set(SumStats::SUM, SumStats::MIN, SumStats::MAX, SumStats::AVERAGE, SumStats::STD_DEV, SumStats::VARIANCE, SumStats::UNIQUE)]; + local r1: SumStats::Reducer = [$stream="test", $apply=set(SumStats::SUM, SumStats::MIN, SumStats::MAX, SumStats::AVERAGE, SumStats::STD_DEV, SumStats::VARIANCE, SumStats::UNIQUE, SumStats::HLLUNIQUE)]; SumStats::create([$epoch=5secs, $reducers=set(r1), $epoch_finished(rt: SumStats::ResultTable) = @@ -30,7 +30,7 @@ event bro_init() &priority=5 for ( key in rt ) { local r = rt[key]["test"]; - print fmt("Host: %s - num:%d - sum:%.1f - avg:%.1f - max:%.1f - min:%.1f - var:%.1f - std_dev:%.1f - unique:%d", key$host, r$num, r$sum, r$average, r$max, r$min, r$variance, r$std_dev, r$unique); + print fmt("Host: %s - num:%d - sum:%.1f - avg:%.1f - max:%.1f - min:%.1f - var:%.1f - std_dev:%.1f - unique:%d - hllunique:%d", key$host, r$num, r$sum, r$average, r$max, r$min, r$variance, r$std_dev, r$unique, r$hllunique); } terminate(); From f2967f485b0f27365421523842121b1a44c1721c Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 24 Apr 2013 16:03:40 -0700 Subject: [PATCH 22/32] add persistence test not using predetermined random seeds. This is failing at the moment. 
--- .../btest/bifs/hll_persistence_twoseeds.bro | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 testing/btest/bifs/hll_persistence_twoseeds.bro diff --git a/testing/btest/bifs/hll_persistence_twoseeds.bro b/testing/btest/bifs/hll_persistence_twoseeds.bro new file mode 100644 index 0000000000..4d828dafc9 --- /dev/null +++ b/testing/btest/bifs/hll_persistence_twoseeds.bro @@ -0,0 +1,40 @@ +# @TEST-EXEC: BRO_SEED_FILE="" bro -b %INPUT runnumber=1 >out +# @TEST-EXEC: BRO_SEED_FILE="" bro -b %INPUT runnumber=2 >>out +# @TEST-EXEC: BRO_SEED_FILE="" bro -b %INPUT runnumber=3 >>out +# @TEST-EXEC: btest-diff out + +global runnumber: count &redef; # differentiate first and second run + +global card: opaque of cardinality &persistent; + +event bro_init() + { + print runnumber; + + if ( runnumber == 1 ) + { + card = hll_cardinality_init(0.01); + + hll_cardinality_add(card, "a"); + hll_cardinality_add(card, "b"); + hll_cardinality_add(card, "c"); + hll_cardinality_add(card, "d"); + hll_cardinality_add(card, "e"); + hll_cardinality_add(card, "f"); + hll_cardinality_add(card, "g"); + hll_cardinality_add(card, "h"); + hll_cardinality_add(card, "i"); + hll_cardinality_add(card, "j"); + } + + print hll_cardinality_estimate(card); + + if ( runnumber == 2 ) + { + hll_cardinality_add(card, "a"); + hll_cardinality_add(card, "b"); + hll_cardinality_add(card, "c"); + hll_cardinality_add(card, "aa"); + } + } + From 5608caf79afd850f5bc610acfca7bd69b982ad88 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Thu, 25 Apr 2013 14:20:13 -0700 Subject: [PATCH 23/32] make error rate configureable --- .../frameworks/sumstats/plugins/hll_unique.bro | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro index 47ded2ca60..41889532f3 100644 --- a/scripts/base/frameworks/sumstats/plugins/hll_unique.bro +++ 
b/scripts/base/frameworks/sumstats/plugins/hll_unique.bro @@ -3,6 +3,11 @@ module SumStats; export { + redef record Reducer += { + ## The error margin passed to hll_cardinality_init for this reducer. + hll_error_margin: double &default=0.01; + }; + redef enum Calculation += { ## Calculate the number of unique values. HLLUNIQUE @@ -19,14 +24,18 @@ redef record ResultVal += { # Internal use only. This is not meant to be publically available # because probabilistic data structures have to be examined using # specialized bifs. - card: opaque of cardinality &default=hll_cardinality_init(0.01); + card: opaque of cardinality &optional; + + # we need this in the compose hook. + hll_error_margin: double &optional; }; hook init_resultval_hook(r: Reducer, rv: ResultVal) { if ( HLLUNIQUE in r$apply && ! rv?$card ) - rv$card = hll_cardinality_init(0.01); + rv$card = hll_cardinality_init(r$hll_error_margin); + rv$hll_error_margin = r$hll_error_margin; rv$hllunique = 0; } @@ -42,7 +51,7 @@ hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) { - local rhll = hll_cardinality_init(0.01); + local rhll = hll_cardinality_init(rv1$hll_error_margin); hll_cardinality_merge_into(rhll, rv1$card); hll_cardinality_merge_into(rhll, rv2$card); From 1cf506071df6498a87b495472f5d9236f79e165e Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 1 May 2013 18:12:20 -0700 Subject: [PATCH 24/32] make it compile on case-sensitive file systems and fix warnings --- src/HyperLogLog.cc | 4 ++-- src/HyperLogLog.h | 2 +- src/OpaqueVal.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc index a399f5a495..6dacab33a2 100644 --- a/src/HyperLogLog.cc +++ b/src/HyperLogLog.cc @@ -101,7 +101,7 @@ void CardinalityCounter::addElement(uint64_t hash) double CardinalityCounter::size() { double answer = 0; - for (int i = 0; i < m; i++) + for (unsigned int i = 0; i < m; i++) answer += 
pow(2, -(int)buckets[i]); answer = 1/answer; @@ -119,7 +119,7 @@ void CardinalityCounter::merge(CardinalityCounter* c) { uint8_t* temp = (*c).getBuckets(); V = 0; - for (int i = 0; i < m; i++) + for (unsigned int i = 0; i < m; i++) { if (temp[i] > buckets[i]) buckets[i] = temp[i]; diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h index 68b7f0ecfc..f07167502a 100644 --- a/src/HyperLogLog.h +++ b/src/HyperLogLog.h @@ -4,7 +4,7 @@ #define hyperloglog_h #include -#include +#include /* * "conf" is how confident the estimate given by the counter is. diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index b712f728f7..54c771b366 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -33,7 +33,7 @@ bool CardinalityVal::DoSerialize(SerialInfo* info) const serialvalid &= SERIALIZE(c->m); serialvalid &= SERIALIZE(c->V); serialvalid &= SERIALIZE(c->alpha_m); - for ( int i = 0; i < c->m; i++ ) + for ( unsigned int i = 0; i < c->m; i++ ) serialvalid &= SERIALIZE( c->buckets[i] ); return serialvalid; @@ -56,7 +56,7 @@ bool CardinalityVal::DoUnserialize(UnserialInfo* info) serialvalid &= UNSERIALIZE(&c->alpha_m); uint8_t* buckets = c->buckets; - for ( int i = 0; i < m; i++ ) + for ( unsigned int i = 0; i < m; i++ ) { uint8_t* currbucket = buckets + i; serialvalid &= UNSERIALIZE( currbucket ); From f6e99fce11e7e10b50607cfaea82a801e690585b Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Fri, 3 May 2013 23:03:31 -0700 Subject: [PATCH 25/32] fix opaqueval-related memleak. 
--- src/NetVar.cc | 2 ++ src/NetVar.h | 1 + src/OpaqueVal.cc | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/NetVar.cc b/src/NetVar.cc index 012e4a85bc..3ee899634a 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -243,6 +243,7 @@ OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* cardinality_type; #include "const.bif.netvar_def" #include "types.bif.netvar_def" @@ -308,6 +309,7 @@ void init_general_global_var() sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + cardinality_type = new OpaqueType("cardinality"); } void init_net_var() diff --git a/src/NetVar.h b/src/NetVar.h index d7590b20e7..2239bbb560 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -248,6 +248,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* cardinality_type; // Initializes globals that don't pertain to network/event analysis. 
extern void init_general_global_var(); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 39eab973b7..0515ae2989 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -5,7 +5,7 @@ #include "HyperLogLog.h" -CardinalityVal::CardinalityVal() : OpaqueVal(new OpaqueType("cardinality")) +CardinalityVal::CardinalityVal() : OpaqueVal(cardinality_type) { valid = false; } From b7cdfc0e6eb627b2df451b81355466f35936a64a Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 24 Jul 2013 12:50:01 -0700 Subject: [PATCH 26/32] adapt to new structure --- scripts/base/init-bare.bro | 1 + src/CMakeLists.txt | 1 - src/Func.cc | 2 + src/OpaqueVal.cc | 6 +- src/OpaqueVal.h | 15 ++- src/bro.bif | 125 ------------------------- src/probabilistic/CMakeLists.txt | 7 +- src/{ => probabilistic}/HyperLogLog.cc | 2 +- src/{ => probabilistic}/HyperLogLog.h | 4 + 9 files changed, 23 insertions(+), 140 deletions(-) rename src/{ => probabilistic}/HyperLogLog.cc (98%) rename src/{ => probabilistic}/HyperLogLog.h (99%) diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index c368b9d610..33978091f1 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -706,6 +706,7 @@ type entropy_test_result: record { @load base/bif/bro.bif @load base/bif/reporter.bif @load base/bif/bloom-filter.bif +@load base/bif/hyper-loglog.bif ## Deprecated. This is superseded by the new logging framework. global log_file_name: function(tag: string): string &redef; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d23f7a2b4..0c979df19f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -277,7 +277,6 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc - HyperLogLog.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/Func.cc b/src/Func.cc index 483699668f..4d8d7f3193 100644 --- a/src/Func.cc +++ b/src/Func.cc @@ -562,6 +562,7 @@ void builtin_error(const char* msg, BroObj* arg) // TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. 
#include "probabilistic/bloom-filter.bif.h" +#include "probabilistic/hyper-loglog.bif.h" void init_builtin_funcs() { @@ -579,6 +580,7 @@ void init_builtin_funcs() // TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. #include "probabilistic/bloom-filter.bif.init.cc" +#include "probabilistic/hyper-loglog.bif.init.cc" did_builtin_init = true; } diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 1d8214fd85..67e39aa2cc 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -4,7 +4,7 @@ #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" -#include "HyperLogLog.h" +#include "probabilistic/HyperLogLog.h" CardinalityVal::CardinalityVal() : OpaqueVal(cardinality_type) @@ -54,7 +54,7 @@ bool CardinalityVal::DoUnserialize(UnserialInfo* info) uint64_t m; serialvalid &= UNSERIALIZE(&m); - c = new CardinalityCounter(m); + c = new probabilistic::CardinalityCounter(m); serialvalid &= UNSERIALIZE(&c->V); serialvalid &= UNSERIALIZE(&c->alpha_m); @@ -67,7 +67,7 @@ bool CardinalityVal::DoUnserialize(UnserialInfo* info) return valid; } -bool CardinalityVal::Init(CardinalityCounter* arg_c) +bool CardinalityVal::Init(probabilistic::CardinalityCounter* arg_c) { if ( valid ) return false; diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 23df0d50d7..aeae4d9d51 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -10,28 +10,27 @@ #include "digest.h" #include "probabilistic/BloomFilter.h" -class CardinalityCounter; +namespace probabilistic { + class BloomFilter; + class CardinalityCounter; +} class CardinalityVal: public OpaqueVal { public: CardinalityVal(); ~CardinalityVal(); - bool Init(CardinalityCounter*); + bool Init(probabilistic::CardinalityCounter*); bool IsValid() const { return valid; }; - CardinalityCounter* Get() { return c; }; + probabilistic::CardinalityCounter* Get() { return c; }; private: bool valid; - CardinalityCounter* c; + probabilistic::CardinalityCounter* c; DECLARE_SERIAL(CardinalityVal); }; -namespace probabilistic { - 
class BloomFilter; -} - class HashVal : public OpaqueVal { public: virtual bool IsValid() const; diff --git a/src/bro.bif b/src/bro.bif index c3e46b501d..a01d68c585 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4974,130 +4974,5 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr } %} -## This is where my code starts... -##Just a note about notation. I'm specifying everything with the prefix hll just -## in case in the future, there's a better way to count cardinalities or something. -## That way, code written that depends on the HyperLogLog algorithm will still be -## working. Though, I'm fairly certain that anything that might be better won't -## be significantly better. -%%{ -#include "HyperLogLog.h" -%%} - -## Initializes the hash for the HyperLogLog cardinality counting algorithm. -## It returns true if it was successful in creating a structure and false -## if it wasn't. - -function hll_cardinality_init%(err: double%): opaque of cardinality - %{ - CardinalityCounter* c = new CardinalityCounter(err); - CardinalityVal* cv = new CardinalityVal(); - - if ( !c ) - reporter->Error("Failed initialize Cardinality counter"); - else - cv->Init(c); - - return cv; - %} - -## Adds an element to the HyperLogLog data structure located at index. - -##elem->Type() to get the type of elem. 
- -function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool - %{ - if ( !((CardinalityVal*) handle)->IsValid() ) { - reporter->Error("Need valid handle"); - return new Val(0, TYPE_BOOL); - } - - int status = 0; - uint64_t a = 123456; - - TypeList* tl = new TypeList(elem->Type()); - tl->Append(elem->Type()); - CompositeHash* hll_hash = new CompositeHash(tl); - Unref(tl); - - CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - HashKey* key = hll_hash->ComputeHash(elem, 1); - a = key->Hash(); - h->addElement(a); - - delete hll_hash; - return new Val(1, TYPE_BOOL); - %} - -## The data structure at index1 will contain the combined count for the -## elements measured by index1 and index2. -## It returns true if it either cloned the value at index2 into index1 -## or if it merged the two data structures together. - -function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: opaque of cardinality%): bool - %{ - CardinalityVal* v1 = (CardinalityVal*) handle1; - CardinalityVal* v2 = (CardinalityVal*) handle2; - - if ( !v1->IsValid() || !v2->IsValid() ) { - reporter->Error("need valid handles"); - return new Val(0, TYPE_BOOL); - } - - CardinalityCounter* h1 = v1->Get(); - CardinalityCounter* h2 = v2->Get(); - - h1->merge(h2); - - return new Val(1, TYPE_BOOL); - %} - -## Returns true if it destroyed something. False if it didn't. -#function hll_cardinality_destroy%(handle: opaque of cardinality%): bool -# %{ -# if ( !((CardinalityVal*) handle)->IsValid() ) { -# reporter->Error("Need valid handle"); -# return new Val(0, TYPE_BOOL); -# } -# CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); -# delete h; -# h = 0; -# return new Val(1, TYPE_BOOL); -# %} - -## Returns the cardinality estimate. Returns -1.0 if there is nothing in that index. 
-function hll_cardinality_estimate%(handle: opaque of cardinality%): double - %{ - if ( !((CardinalityVal*) handle)->IsValid() ) { - reporter->Error("Need valid handle"); - return new Val(0, TYPE_BOOL); - } - CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - - double estimate = h->size(); - - return new Val(estimate, TYPE_DOUBLE); - %} - -## Stores the data structure at index2 into index1. Deletes the data structure at index1 -## if there was any. Returns True if the data structure at index1 was changed in any way. - -function hll_cardinality_clone%(handle: opaque of cardinality%): opaque of cardinality - %{ - if ( !((CardinalityVal*) handle)->IsValid() ) { - reporter->Error("Need valid handle"); - return new Val(0, TYPE_BOOL); - } - CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - - - uint64_t m = h->getM(); - CardinalityCounter* h2 = new CardinalityCounter(m); - int i = 0; - h2->merge(h); - CardinalityVal* cv = new CardinalityVal(); - cv->Init(h2); - return cv; - %} diff --git a/src/probabilistic/CMakeLists.txt b/src/probabilistic/CMakeLists.txt index 961c07fb33..6f3c64f67e 100644 --- a/src/probabilistic/CMakeLists.txt +++ b/src/probabilistic/CMakeLists.txt @@ -10,9 +10,12 @@ set(probabilistic_SRCS BitVector.cc BloomFilter.cc CounterVector.cc - Hasher.cc) + Hasher.cc + HyperLogLog.cc) bif_target(bloom-filter.bif) +set(BIF_OUTPUT_CC_SAVE ${BIF_OUTPUT_CC}) +bif_target(hyper-loglog.bif) -bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC}) +bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC_SAVE} ${BIF_OUTPUT_CC}) add_dependencies(bro_probabilistic generate_outputs) diff --git a/src/HyperLogLog.cc b/src/probabilistic/HyperLogLog.cc similarity index 98% rename from src/HyperLogLog.cc rename to src/probabilistic/HyperLogLog.cc index 6dacab33a2..b1deb39552 100644 --- a/src/HyperLogLog.cc +++ b/src/probabilistic/HyperLogLog.cc @@ -5,7 +5,7 @@ #include "HyperLogLog.h" #include -using namespace std; 
+using namespace probabilistic; int CardinalityCounter::optimalB(double error) { diff --git a/src/HyperLogLog.h b/src/probabilistic/HyperLogLog.h similarity index 99% rename from src/HyperLogLog.h rename to src/probabilistic/HyperLogLog.h index f07167502a..0a7ea6ac2f 100644 --- a/src/HyperLogLog.h +++ b/src/probabilistic/HyperLogLog.h @@ -6,6 +6,8 @@ #include #include +namespace probabilistic { + /* * "conf" is how confident the estimate given by the counter is. * @@ -118,4 +120,6 @@ class CardinalityCounter { uint64_t getM(); }; +} + #endif From efdffaec9e810482edcf9aad7afcdf4f779ea8b9 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 24 Jul 2013 12:51:31 -0700 Subject: [PATCH 27/32] and forgot a file... --- src/probabilistic/hyper-loglog.bif | 131 +++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 src/probabilistic/hyper-loglog.bif diff --git a/src/probabilistic/hyper-loglog.bif b/src/probabilistic/hyper-loglog.bif new file mode 100644 index 0000000000..24b18e0c40 --- /dev/null +++ b/src/probabilistic/hyper-loglog.bif @@ -0,0 +1,131 @@ +# =========================================================================== +# +# HyperLogLog Functions +# +# =========================================================================== + + +%%{ +#include "probabilistic/HyperLogLog.h" + +using namespace probabilistic; +%%} + +module GLOBAL; + +## Initializes the hash for the HyperLogLog cardinality counting algorithm. +## It returns true if it was successful in creating a structure and false +## if it wasn't. + +function hll_cardinality_init%(err: double%): opaque of cardinality + %{ + CardinalityCounter* c = new CardinalityCounter(err); + CardinalityVal* cv = new CardinalityVal(); + + if ( !c ) + reporter->Error("Failed initialize Cardinality counter"); + else + cv->Init(c); + + return cv; + %} + +## Adds an element to the HyperLogLog data structure located at index. + +##elem->Type() to get the type of elem. 
+ +function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool + %{ + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + + int status = 0; + uint64_t a = 123456; + + TypeList* tl = new TypeList(elem->Type()); + tl->Append(elem->Type()); + CompositeHash* hll_hash = new CompositeHash(tl); + Unref(tl); + + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); + HashKey* key = hll_hash->ComputeHash(elem, 1); + a = key->Hash(); + h->addElement(a); + + delete hll_hash; + return new Val(1, TYPE_BOOL); + %} + +## The data structure at index1 will contain the combined count for the +## elements measured by index1 and index2. +## It returns true if it either cloned the value at index2 into index1 +## or if it merged the two data structures together. + +function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: opaque of cardinality%): bool + %{ + CardinalityVal* v1 = (CardinalityVal*) handle1; + CardinalityVal* v2 = (CardinalityVal*) handle2; + + if ( !v1->IsValid() || !v2->IsValid() ) { + reporter->Error("need valid handles"); + return new Val(0, TYPE_BOOL); + } + + CardinalityCounter* h1 = v1->Get(); + CardinalityCounter* h2 = v2->Get(); + + h1->merge(h2); + + return new Val(1, TYPE_BOOL); + %} + +## Returns true if it destroyed something. False if it didn't. +#function hll_cardinality_destroy%(handle: opaque of cardinality%): bool +# %{ +# if ( !((CardinalityVal*) handle)->IsValid() ) { +# reporter->Error("Need valid handle"); +# return new Val(0, TYPE_BOOL); +# } +# CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); +# delete h; +# h = 0; +# return new Val(1, TYPE_BOOL); +# %} + +## Returns the cardinality estimate. Returns -1.0 if there is nothing in that index. 
+function hll_cardinality_estimate%(handle: opaque of cardinality%): double + %{ + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); + + double estimate = h->size(); + + return new Val(estimate, TYPE_DOUBLE); + %} + +## Stores the data structure at index2 into index1. Deletes the data structure at index1 +## if there was any. Returns True if the data structure at index1 was changed in any way. + +function hll_cardinality_clone%(handle: opaque of cardinality%): opaque of cardinality + %{ + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); + + + uint64_t m = h->getM(); + CardinalityCounter* h2 = new CardinalityCounter(m); + int i = 0; + h2->merge(h); + CardinalityVal* cv = new CardinalityVal(); + cv->Init(h2); + return cv; + %} + From 18c10f3cb53f63b24d393e56c96044b83e291a77 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 30 Jul 2013 16:47:26 -0700 Subject: [PATCH 28/32] get hll ready for merging --- src/OpaqueVal.cc | 171 +++++++++++------- src/OpaqueVal.h | 38 ++-- src/probabilistic/HyperLogLog.cc | 39 ++-- src/probabilistic/hyper-loglog.bif | 139 +++++++------- .../Baseline/bifs.hll_cardinality/.stderr | 1 + .../{bifs.hll_persistence => istate.hll}/out | 0 testing/btest/bifs/hll_cardinality.bro | 30 +-- .../btest/bifs/hll_persistence_twoseeds.bro | 40 ---- .../hll_persistence.bro => istate/hll.bro} | 0 9 files changed, 240 insertions(+), 218 deletions(-) create mode 100644 testing/btest/Baseline/bifs.hll_cardinality/.stderr rename testing/btest/Baseline/{bifs.hll_persistence => istate.hll}/out (100%) delete mode 100644 testing/btest/bifs/hll_persistence_twoseeds.bro rename testing/btest/{bifs/hll_persistence.bro => istate/hll.bro} (100%) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc 
index 64990af3fc..e2e7e4f967 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -7,75 +7,6 @@ #include "probabilistic/HyperLogLog.h" -CardinalityVal::CardinalityVal() : OpaqueVal(cardinality_type) - { - valid = false; - } - -CardinalityVal::~CardinalityVal() - { - if ( valid && c != 0 ) - delete c; - c = 0; - valid = false; - } -IMPLEMENT_SERIAL(CardinalityVal, SER_CARDINALITY_VAL); - -bool CardinalityVal::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_CARDINALITY_VAL, OpaqueVal); - - bool serialvalid = true; - serialvalid &= SERIALIZE(&valid); - - if ( ! IsValid() ) - return serialvalid; - - assert(c); - - serialvalid &= SERIALIZE(c->m); - serialvalid &= SERIALIZE(c->V); - serialvalid &= SERIALIZE(c->alpha_m); - for ( unsigned int i = 0; i < c->m; i++ ) - serialvalid &= SERIALIZE( c->buckets[i] ); - - return serialvalid; - } - -bool CardinalityVal::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(OpaqueVal); - - bool serialvalid = UNSERIALIZE(&valid); - - if ( ! IsValid() ) - return serialvalid; - - uint64_t m; - - serialvalid &= UNSERIALIZE(&m); - c = new probabilistic::CardinalityCounter(m); - serialvalid &= UNSERIALIZE(&c->V); - serialvalid &= UNSERIALIZE(&c->alpha_m); - - uint8_t* buckets = c->buckets; - for ( unsigned int i = 0; i < m; i++ ) - { - uint8_t* currbucket = buckets + i; - serialvalid &= UNSERIALIZE( currbucket ); - } - return valid; - } - -bool CardinalityVal::Init(probabilistic::CardinalityCounter* arg_c) - { - if ( valid ) - return false; - - valid = true; - c = arg_c; - return valid; - } bool HashVal::IsValid() const { @@ -738,3 +669,105 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) bloom_filter = probabilistic::BloomFilter::Unserialize(info); return bloom_filter != 0; } + +CardinalityVal::CardinalityVal() : OpaqueVal(cardinality_type) + { + c = 0; + type = 0; + hash = 0; + } + +CardinalityVal::CardinalityVal(probabilistic::CardinalityCounter* arg_c) : OpaqueVal(cardinality_type) + { + c = arg_c; + type = 0; + 
hash = 0; + } + +CardinalityVal::~CardinalityVal() + { + Unref(type); + delete c; + delete hash; + } + +IMPLEMENT_SERIAL(CardinalityVal, SER_CARDINALITY_VAL); + +bool CardinalityVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_CARDINALITY_VAL, OpaqueVal); + + bool valid = true; + + bool is_typed = (type != 0); + + valid &= SERIALIZE(is_typed); + + if ( is_typed ) + valid &= type->Serialize(info); + + assert(c); + + valid &= SERIALIZE(c->m); + valid &= SERIALIZE(c->V); + valid &= SERIALIZE(c->alpha_m); + for ( unsigned int i = 0; i < c->m; i++ ) + valid &= SERIALIZE( c->buckets[i] ); + + return valid; + } + +bool CardinalityVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + + uint64_t m; + bool valid = true; + + bool is_typed; + if ( ! UNSERIALIZE(&is_typed) ) + return false; + + if ( is_typed ) + { + BroType* t = BroType::Unserialize(info); + if ( ! Typify(t) ) + return false; + + Unref(t); + } + + valid &= UNSERIALIZE(&m); + c = new probabilistic::CardinalityCounter(m); + valid &= UNSERIALIZE(&c->V); + valid &= UNSERIALIZE(&c->alpha_m); + + uint8_t* buckets = c->buckets; + for ( unsigned int i = 0; i < m; i++ ) + { + uint8_t* currbucket = buckets + i; + valid &= UNSERIALIZE( currbucket ); + } + return valid; + } + +bool CardinalityVal::Typify(BroType* arg_type) + { + if ( type ) + return false; + + type = arg_type; + type->Ref(); + + TypeList* tl = new TypeList(type); + tl->Append(type); + hash = new CompositeHash(tl); + Unref(tl); + + return true; + } + +BroType* CardinalityVal::Type() const + { + return type; + } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index a4414acd73..3a4b548308 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -15,22 +15,6 @@ namespace probabilistic { class CardinalityCounter; } -class CardinalityVal: public OpaqueVal { -public: - CardinalityVal(); - ~CardinalityVal(); - bool Init(probabilistic::CardinalityCounter*); - bool IsValid() const { return valid; }; - probabilistic::CardinalityCounter* 
Get() { return c; }; - -private: - bool valid; - probabilistic::CardinalityCounter* c; - - DECLARE_SERIAL(CardinalityVal); -}; - - class HashVal : public OpaqueVal { public: virtual bool IsValid() const; @@ -164,4 +148,26 @@ private: probabilistic::BloomFilter* bloom_filter; }; + +class CardinalityVal: public OpaqueVal { +public: + explicit CardinalityVal(probabilistic::CardinalityCounter*); + virtual ~CardinalityVal(); + + BroType* Type() const; + bool Typify(BroType* type); + + probabilistic::CardinalityCounter* Get() { return c; }; + +protected: + CardinalityVal(); + +private: + BroType* type; + CompositeHash* hash; + probabilistic::CardinalityCounter* c; + + DECLARE_SERIAL(CardinalityVal); +}; + #endif diff --git a/src/probabilistic/HyperLogLog.cc b/src/probabilistic/HyperLogLog.cc index b1deb39552..ea847b935f 100644 --- a/src/probabilistic/HyperLogLog.cc +++ b/src/probabilistic/HyperLogLog.cc @@ -4,23 +4,24 @@ #include #include "HyperLogLog.h" #include +#include "Reporter.h" using namespace probabilistic; int CardinalityCounter::optimalB(double error) { - double initial_estimate = 2*(log(1.04)-log(error))/log(2); - int answer = (int) floor(initial_estimate); - double k; + double initial_estimate = 2*(log(1.04)-log(error))/log(2); + int answer = (int) floor(initial_estimate); + double k; - do - { - answer++; - k = pow(2, (answer - initial_estimate)/2); - } - while (erf(k/sqrt(2)) < HLL_CONF); + do + { + answer++; + k = pow(2, (answer - initial_estimate)/2); + } + while (erf(k/sqrt(2)) < HLL_CONF); - return answer; + return answer; } CardinalityCounter::CardinalityCounter(uint64_t size) @@ -28,14 +29,16 @@ CardinalityCounter::CardinalityCounter(uint64_t size) m = size; buckets = new uint8_t[m]; - if(m == 16) + if (m == 16) alpha_m = 0.673; - else if(m == 32) + else if (m == 32) alpha_m = 0.697; - else if(m == 64) + else if (m == 64) alpha_m = 0.709; - else + else if (m >= 128) alpha_m = 0.7213/(1+1.079/m); + else + reporter->InternalError("Invalid size %lld. 
Size either has to be 16, 32, 64 or bigger than 128", size); for (uint64_t i = 0; i < m; i++) buckets[i] = 0; @@ -55,8 +58,10 @@ CardinalityCounter::CardinalityCounter(double error_margin) alpha_m = 0.697; else if(m == 64) alpha_m = 0.709; - else + else if(m >= 128) alpha_m = 0.7213/(1+1.079/m); + else + reporter->InternalError("Invalid m %lld calculated for error margin %f", m, error_margin); for (uint64_t i = 0; i < m; i++) buckets[i] = 0; @@ -96,7 +101,7 @@ void CardinalityCounter::addElement(uint64_t hash) if (temp > buckets[index]) buckets[index] = temp; -} + } double CardinalityCounter::size() { @@ -113,7 +118,7 @@ double CardinalityCounter::size() return answer; else return -pow(2,64)*log(1-answer/pow(2,64)); -} + } void CardinalityCounter::merge(CardinalityCounter* c) { diff --git a/src/probabilistic/hyper-loglog.bif b/src/probabilistic/hyper-loglog.bif index 24b18e0c40..012b2d2988 100644 --- a/src/probabilistic/hyper-loglog.bif +++ b/src/probabilistic/hyper-loglog.bif @@ -14,64 +14,88 @@ using namespace probabilistic; module GLOBAL; ## Initializes the hash for the HyperLogLog cardinality counting algorithm. -## It returns true if it was successful in creating a structure and false -## if it wasn't. - +## +## err: the desired error rate (e.g. 0.01). +## +## Returns: a hll cardinality handle. +## +## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add +## hll_cardinality_copy function hll_cardinality_init%(err: double%): opaque of cardinality %{ CardinalityCounter* c = new CardinalityCounter(err); - CardinalityVal* cv = new CardinalityVal(); - - if ( !c ) - reporter->Error("Failed initialize Cardinality counter"); - else - cv->Init(c); + CardinalityVal* cv = new CardinalityVal(c); return cv; %} -## Adds an element to the HyperLogLog data structure located at index. - -##elem->Type() to get the type of elem. - +## Adds an element to the HyperLogLog data structure +## +## handle: the hll handle. 
+## +## elem: the element to add +## +## Returns: 1 on success +## +## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into +## hll_cardinality_init hll_cardinality_copy function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool %{ - if ( !((CardinalityVal*) handle)->IsValid() ) { - reporter->Error("Need valid handle"); + CardinalityVal* cv = static_cast(handle); + + if ( ! cv->Type() && ! cv->Typify(elem->Type()) ) + { + reporter->Error("failed to set HLL type"); return new Val(0, TYPE_BOOL); - } + } + else if ( ! same_type(cv->Type(), elem->Type()) ) + { + reporter->Error("incompatible HLL data type"); + return new Val(0, TYPE_BOOL); + } int status = 0; - uint64_t a = 123456; TypeList* tl = new TypeList(elem->Type()); tl->Append(elem->Type()); CompositeHash* hll_hash = new CompositeHash(tl); Unref(tl); - CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); HashKey* key = hll_hash->ComputeHash(elem, 1); - a = key->Hash(); - h->addElement(a); + uint64_t hash = key->Hash(); + + CardinalityCounter* h = cv->Get(); + h->addElement(hash); delete hll_hash; return new Val(1, TYPE_BOOL); %} -## The data structure at index1 will contain the combined count for the -## elements measured by index1 and index2. -## It returns true if it either cloned the value at index2 into index1 -## or if it merged the two data structures together. - +## Merges the second hll data structure into the first +## +## .. note:: The same restrictions as for bloom-filter merging apply +## +## handle1: the first hll handle, which will contain the merged result +## +## handle2: the second hll handle, which will be merged into the first +## +## Returns: 1 on success +## +## .. 
bro:see:: hll_cardinality_estimate hll_cardinality_add +## hll_cardinality_init hll_cardinality_copy function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: opaque of cardinality%): bool %{ - CardinalityVal* v1 = (CardinalityVal*) handle1; - CardinalityVal* v2 = (CardinalityVal*) handle2; + CardinalityVal* v1 = static_cast(handle1); + CardinalityVal* v2 = static_cast(handle2); - if ( !v1->IsValid() || !v2->IsValid() ) { - reporter->Error("need valid handles"); + if ( ( v1->Type() != v2->Type() ) && // both 0 is ok + ( v1->Type() != 0 ) && // any one 0 also is ok + ( v2->Type() != 0 ) && + ! same_type(v1->Type(), v2->Type()) ) + { + reporter->Error("incompatible HLL types"); return new Val(0, TYPE_BOOL); - } + } CardinalityCounter* h1 = v1->Get(); CardinalityCounter* h2 = v2->Get(); @@ -81,51 +105,42 @@ function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: op return new Val(1, TYPE_BOOL); %} -## Returns true if it destroyed something. False if it didn't. -#function hll_cardinality_destroy%(handle: opaque of cardinality%): bool -# %{ -# if ( !((CardinalityVal*) handle)->IsValid() ) { -# reporter->Error("Need valid handle"); -# return new Val(0, TYPE_BOOL); -# } -# CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); -# delete h; -# h = 0; -# return new Val(1, TYPE_BOOL); -# %} - -## Returns the cardinality estimate. Returns -1.0 if there is nothing in that index. +## Estimate the cardinality of the HLL data structure. +## +## handle: the hll handle +## +## Returns the cardinality estimate. Returns -1.0 if the structure is empty +## +## .. 
bro:see:: hll_cardinality_merge_into hll_cardinality_add +## hll_cardinality_init hll_cardinality_copy function hll_cardinality_estimate%(handle: opaque of cardinality%): double %{ - if ( !((CardinalityVal*) handle)->IsValid() ) { - reporter->Error("Need valid handle"); - return new Val(0, TYPE_BOOL); - } - CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); + CardinalityVal* cv = static_cast(handle); + CardinalityCounter* h = cv->Get(); double estimate = h->size(); return new Val(estimate, TYPE_DOUBLE); %} -## Stores the data structure at index2 into index1. Deletes the data structure at index1 -## if there was any. Returns True if the data structure at index1 was changed in any way. - -function hll_cardinality_clone%(handle: opaque of cardinality%): opaque of cardinality +## Copy a hll data structure +## +## handle: data structure to copy +## +## Returns: copy of handle +## +## .. bro:see:: hll_cardinality_estimate hll_cardinality_merge_into hll_cardinality_add +## hll_cardinality_init +function hll_cardinality_copy%(handle: opaque of cardinality%): opaque of cardinality %{ - if ( !((CardinalityVal*) handle)->IsValid() ) { - reporter->Error("Need valid handle"); - return new Val(0, TYPE_BOOL); - } - CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - + CardinalityVal* cv = static_cast(handle); + CardinalityCounter* h = cv->Get(); uint64_t m = h->getM(); CardinalityCounter* h2 = new CardinalityCounter(m); int i = 0; h2->merge(h); - CardinalityVal* cv = new CardinalityVal(); - cv->Init(h2); - return cv; + CardinalityVal* out = new CardinalityVal(h2); + return out; %} diff --git a/testing/btest/Baseline/bifs.hll_cardinality/.stderr b/testing/btest/Baseline/bifs.hll_cardinality/.stderr new file mode 100644 index 0000000000..840ee3363e --- /dev/null +++ b/testing/btest/Baseline/bifs.hll_cardinality/.stderr @@ -0,0 +1 @@ +error: incompatible HLL data type diff --git a/testing/btest/Baseline/bifs.hll_persistence/out 
b/testing/btest/Baseline/istate.hll/out similarity index 100% rename from testing/btest/Baseline/bifs.hll_persistence/out rename to testing/btest/Baseline/istate.hll/out diff --git a/testing/btest/bifs/hll_cardinality.bro b/testing/btest/bifs/hll_cardinality.bro index 774e8f6e28..6d4075a4bf 100644 --- a/testing/btest/bifs/hll_cardinality.bro +++ b/testing/btest/bifs/hll_cardinality.bro @@ -1,35 +1,37 @@ # # @TEST-EXEC: bro %INPUT>out # @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr event bro_init() { local c1 = hll_cardinality_init(0.01); local c2 = hll_cardinality_init(0.01); - local add1 = "hey"; - local add2 = "hi"; - local add3 = 123; + local add1 = 2001; + local add2 = 2002; + local add3 = 2003; hll_cardinality_add(c1, add1); hll_cardinality_add(c1, add2); hll_cardinality_add(c1, add3); - hll_cardinality_add(c1, "a"); - hll_cardinality_add(c1, "b"); - hll_cardinality_add(c1, "c"); - hll_cardinality_add(c1, "d"); - hll_cardinality_add(c1, "e"); - hll_cardinality_add(c1, "f"); - hll_cardinality_add(c1, "g"); - hll_cardinality_add(c1, "h"); - hll_cardinality_add(c1, "i"); - hll_cardinality_add(c1, "j"); + hll_cardinality_add(c1, 1000); + hll_cardinality_add(c1, 1001); + hll_cardinality_add(c1, 101); + hll_cardinality_add(c1, 1003); + hll_cardinality_add(c1, 1004); + hll_cardinality_add(c1, 1005); + hll_cardinality_add(c1, 1006); + hll_cardinality_add(c1, 1007); + hll_cardinality_add(c1, 1008); + hll_cardinality_add(c1, 1009); hll_cardinality_add(c2, add1); hll_cardinality_add(c2, add2); hll_cardinality_add(c2, add3); hll_cardinality_add(c2, 1); hll_cardinality_add(c2, "b"); + hll_cardinality_add(c2, 101); hll_cardinality_add(c2, 2); hll_cardinality_add(c2, 3); hll_cardinality_add(c2, 4); @@ -49,7 +51,7 @@ event bro_init() print "This value should be around 0:"; print hll_cardinality_estimate(m2); - local c3 = hll_cardinality_clone(c1); + local c3 = hll_cardinality_copy(c1); print "This value should be around 13:"; print hll_cardinality_estimate(c3); 
diff --git a/testing/btest/bifs/hll_persistence_twoseeds.bro b/testing/btest/bifs/hll_persistence_twoseeds.bro deleted file mode 100644 index 4d828dafc9..0000000000 --- a/testing/btest/bifs/hll_persistence_twoseeds.bro +++ /dev/null @@ -1,40 +0,0 @@ -# @TEST-EXEC: BRO_SEED_FILE="" bro -b %INPUT runnumber=1 >out -# @TEST-EXEC: BRO_SEED_FILE="" bro -b %INPUT runnumber=2 >>out -# @TEST-EXEC: BRO_SEED_FILE="" bro -b %INPUT runnumber=3 >>out -# @TEST-EXEC: btest-diff out - -global runnumber: count &redef; # differentiate first and second run - -global card: opaque of cardinality &persistent; - -event bro_init() - { - print runnumber; - - if ( runnumber == 1 ) - { - card = hll_cardinality_init(0.01); - - hll_cardinality_add(card, "a"); - hll_cardinality_add(card, "b"); - hll_cardinality_add(card, "c"); - hll_cardinality_add(card, "d"); - hll_cardinality_add(card, "e"); - hll_cardinality_add(card, "f"); - hll_cardinality_add(card, "g"); - hll_cardinality_add(card, "h"); - hll_cardinality_add(card, "i"); - hll_cardinality_add(card, "j"); - } - - print hll_cardinality_estimate(card); - - if ( runnumber == 2 ) - { - hll_cardinality_add(card, "a"); - hll_cardinality_add(card, "b"); - hll_cardinality_add(card, "c"); - hll_cardinality_add(card, "aa"); - } - } - diff --git a/testing/btest/bifs/hll_persistence.bro b/testing/btest/istate/hll.bro similarity index 100% rename from testing/btest/bifs/hll_persistence.bro rename to testing/btest/istate/hll.bro From 83ce77e575cc99d1d922ca5bd6f47204e8ffd62e Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 30 Jul 2013 18:48:05 -0700 Subject: [PATCH 29/32] re-use same hash class for all add operations --- src/OpaqueVal.cc | 8 ++++++++ src/OpaqueVal.h | 2 ++ src/probabilistic/hyper-loglog.bif | 15 +-------------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index e2e7e4f967..dcf3e84430 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -771,3 +771,11 @@ BroType* 
CardinalityVal::Type() const { return type; } + +void CardinalityVal::Add(const Val* val) + { + HashKey* key = hash->ComputeHash(val, 1); + c->addElement(key->Hash()); + delete key; + } + diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 3a4b548308..a7c51657c4 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -154,6 +154,8 @@ public: explicit CardinalityVal(probabilistic::CardinalityCounter*); virtual ~CardinalityVal(); + void Add(const Val* val); + BroType* Type() const; bool Typify(BroType* type); diff --git a/src/probabilistic/hyper-loglog.bif b/src/probabilistic/hyper-loglog.bif index 012b2d2988..6e69e70735 100644 --- a/src/probabilistic/hyper-loglog.bif +++ b/src/probabilistic/hyper-loglog.bif @@ -54,20 +54,7 @@ function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool return new Val(0, TYPE_BOOL); } - int status = 0; - - TypeList* tl = new TypeList(elem->Type()); - tl->Append(elem->Type()); - CompositeHash* hll_hash = new CompositeHash(tl); - Unref(tl); - - HashKey* key = hll_hash->ComputeHash(elem, 1); - uint64_t hash = key->Hash(); - - CardinalityCounter* h = cv->Get(); - h->addElement(hash); - - delete hll_hash; + cv->Add(elem); return new Val(1, TYPE_BOOL); %} From 07634fd95eaefb436ebc26a07156121fc34ab176 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 31 Jul 2013 12:36:56 -0700 Subject: [PATCH 30/32] (hopefully) fix refcounting problem in hll/bloom-filter opaque vals. Thanks Robin. 
--- src/OpaqueVal.cc | 4 ++-- src/Serializer.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index dcf3e84430..211426a434 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -554,7 +554,7 @@ bool BloomFilterVal::Typify(BroType* arg_type) type->Ref(); TypeList* tl = new TypeList(type); - tl->Append(type); + tl->Append(type->Ref()); hash = new CompositeHash(tl); Unref(tl); @@ -760,7 +760,7 @@ bool CardinalityVal::Typify(BroType* arg_type) type->Ref(); TypeList* tl = new TypeList(type); - tl->Append(type); + tl->Append(type->Ref()); hash = new CompositeHash(tl); Unref(tl); diff --git a/src/Serializer.h b/src/Serializer.h index 719d4dc527..f2cea1000e 100644 --- a/src/Serializer.h +++ b/src/Serializer.h @@ -126,7 +126,7 @@ protected: // This will be increased whenever there is an incompatible change // in the data format. - static const uint32 DATA_FORMAT_VERSION = 23; + static const uint32 DATA_FORMAT_VERSION = 24; ChunkedIO* io; From 39c0f5abadb9c523f056e22eb746cde6754a4ae3 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 31 Jul 2013 12:43:33 -0700 Subject: [PATCH 31/32] make gcc happy --- src/probabilistic/HyperLogLog.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/probabilistic/HyperLogLog.h b/src/probabilistic/HyperLogLog.h index 0a7ea6ac2f..f9920ab2a8 100644 --- a/src/probabilistic/HyperLogLog.h +++ b/src/probabilistic/HyperLogLog.h @@ -18,7 +18,7 @@ namespace probabilistic { class CardinalityCounter { - friend class CardinalityVal; + friend class ::CardinalityVal; private: /* From 4cd20c6375bcf8d417cfaa6f7eb3285a12f102c4 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Thu, 29 Aug 2013 14:01:22 -0700 Subject: [PATCH 32/32] add clustered leak test for hll. No issues. 
--- .../core.leaks.hll_cluster/manager-1..stdout | 2 + .../core.leaks.hll_cluster/worker-1..stdout | 2 + .../core.leaks.hll_cluster/worker-2..stdout | 2 + testing/btest/core/leaks/hll_cluster.bro | 111 ++++++++++++++++++ 4 files changed, 117 insertions(+) create mode 100644 testing/btest/Baseline/core.leaks.hll_cluster/manager-1..stdout create mode 100644 testing/btest/Baseline/core.leaks.hll_cluster/worker-1..stdout create mode 100644 testing/btest/Baseline/core.leaks.hll_cluster/worker-2..stdout create mode 100644 testing/btest/core/leaks/hll_cluster.bro diff --git a/testing/btest/Baseline/core.leaks.hll_cluster/manager-1..stdout b/testing/btest/Baseline/core.leaks.hll_cluster/manager-1..stdout new file mode 100644 index 0000000000..910a87642c --- /dev/null +++ b/testing/btest/Baseline/core.leaks.hll_cluster/manager-1..stdout @@ -0,0 +1,2 @@ +This value should be about 21: +21.003365 diff --git a/testing/btest/Baseline/core.leaks.hll_cluster/worker-1..stdout b/testing/btest/Baseline/core.leaks.hll_cluster/worker-1..stdout new file mode 100644 index 0000000000..e64c2b30aa --- /dev/null +++ b/testing/btest/Baseline/core.leaks.hll_cluster/worker-1..stdout @@ -0,0 +1,2 @@ +This value should be around 13: +13.00129 diff --git a/testing/btest/Baseline/core.leaks.hll_cluster/worker-2..stdout b/testing/btest/Baseline/core.leaks.hll_cluster/worker-2..stdout new file mode 100644 index 0000000000..d2b4f08b8d --- /dev/null +++ b/testing/btest/Baseline/core.leaks.hll_cluster/worker-2..stdout @@ -0,0 +1,2 @@ +This value should be about 12: +12.001099 diff --git a/testing/btest/core/leaks/hll_cluster.bro b/testing/btest/core/leaks/hll_cluster.bro new file mode 100644 index 0000000000..01653a895f --- /dev/null +++ b/testing/btest/core/leaks/hll_cluster.bro @@ -0,0 +1,111 @@ +# Needs perftools support. 
+# +# @TEST-SERIALIZE: comm +# @TEST-GROUP: leaks +# +# @TEST-REQUIRES: bro --help 2>&1 | grep -q mem-leaks +# +# @TEST-EXEC: bro %INPUT>out +# @TEST-EXEC: btest-bg-run manager-1 HEAP_CHECK_DUMP_DIRECTORY=. HEAPCHECK=local BROPATH=$BROPATH:.. CLUSTER_NODE=manager-1 bro %INPUT +# @TEST-EXEC: sleep 2 +# @TEST-EXEC: btest-bg-run worker-1 HEAP_CHECK_DUMP_DIRECTORY=. HEAPCHECK=local BROPATH=$BROPATH:.. CLUSTER_NODE=worker-1 bro runnumber=1 %INPUT +# @TEST-EXEC: btest-bg-run worker-2 HEAP_CHECK_DUMP_DIRECTORY=. HEAPCHECK=local BROPATH=$BROPATH:.. CLUSTER_NODE=worker-2 bro runnumber=2 %INPUT +# @TEST-EXEC: btest-bg-wait -k 10 +# +# @TEST-EXEC: btest-diff manager-1/.stdout +# @TEST-EXEC: btest-diff worker-1/.stdout +# @TEST-EXEC: btest-diff worker-2/.stdout + +@TEST-START-FILE cluster-layout.bro +redef Cluster::nodes = { + ["manager-1"] = [$node_type=Cluster::MANAGER, $ip=127.0.0.1, $p=37757/tcp, $workers=set("worker-1", "worker-2")], + ["worker-1"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=37760/tcp, $manager="manager-1"], + ["worker-2"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=37761/tcp, $manager="manager-1"], +}; +@TEST-END-FILE + +redef Log::default_rotation_interval = 0secs; + +global hll_data: event(data: opaque of cardinality); + +redef Cluster::worker2manager_events += /hll_data/; + +@if ( Cluster::local_node_type() == Cluster::WORKER ) + +global runnumber: count &redef; # differentiate runs + +event remote_connection_handshake_done(p: event_peer) + { + local c = hll_cardinality_init(0.01); + + local add1 = 2001; + local add2 = 2002; + local add3 = 2003; + + if ( runnumber == 1 ) + { + hll_cardinality_add(c, add1); + hll_cardinality_add(c, add2); + hll_cardinality_add(c, add3); + hll_cardinality_add(c, 1000); + hll_cardinality_add(c, 1001); + hll_cardinality_add(c, 101); + hll_cardinality_add(c, 1003); + hll_cardinality_add(c, 1004); + hll_cardinality_add(c, 1005); + hll_cardinality_add(c, 1006); + hll_cardinality_add(c, 1007); + 
hll_cardinality_add(c, 1008); + hll_cardinality_add(c, 1009); + print "This value should be around 13:"; + print hll_cardinality_estimate(c); + } + else if ( runnumber == 2 ) + { + hll_cardinality_add(c, add1); + hll_cardinality_add(c, add2); + hll_cardinality_add(c, add3); + hll_cardinality_add(c, 1); + hll_cardinality_add(c, 101); + hll_cardinality_add(c, 2); + hll_cardinality_add(c, 3); + hll_cardinality_add(c, 4); + hll_cardinality_add(c, 5); + hll_cardinality_add(c, 6); + hll_cardinality_add(c, 7); + hll_cardinality_add(c, 8); + print "This value should be about 12:"; + print hll_cardinality_estimate(c); + } + + event hll_data(c); + + terminate(); + } + +@endif + +@if ( Cluster::local_node_type() == Cluster::MANAGER ) + +global result_count = 0; +global hll: opaque of cardinality; + +event bro_init() + { + hll = hll_cardinality_init(0.01); + } + +event hll_data(data: opaque of cardinality) + { + hll_cardinality_merge_into(hll, data); + ++result_count; + + if ( result_count == 2 ) + { + print "This value should be about 21:"; + print hll_cardinality_estimate(hll); + terminate(); + } + } + +@endif