diff --git a/src/HyperLogLog.cc b/src/HyperLogLog.cc index 22a06ee6c7..436d754b4d 100644 --- a/src/HyperLogLog.cc +++ b/src/HyperLogLog.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. + #include #include #include "HyperLogLog.h" @@ -5,129 +7,137 @@ using namespace std; - int CardinalityCounter::optimalB(double error){ - double initial_estimate = 2*(log(1.04)-log(error))/log(2); - int answer = (int) floor(initial_estimate); - double k; +int CardinalityCounter::optimalB(double error) + { + double initial_estimate = 2*(log(1.04)-log(error))/log(2); + int answer = (int) floor(initial_estimate); + double k; - do{ - answer++; - k = pow(2, (answer - initial_estimate)/2); - }while(erf(k/sqrt(2)) < conf); + do + { + answer++; + k = pow(2, (answer - initial_estimate)/2); + } + while (erf(k/sqrt(2)) < conf); - return answer; - } + return answer; + } - CardinalityCounter :: CardinalityCounter(uint64_t size){ - m = size; - buckets = new uint8_t[m]; +CardinalityCounter::CardinalityCounter(uint64_t size) + { + m = size; + buckets = new uint8_t[m]; - if(m == 16) - alpha_m = 0.673; - else if(m == 32) - alpha_m = 0.697; - else if(m == 64) - alpha_m = 0.709; - else - alpha_m = 0.7213/(1+1.079/m); + if(m == 16) + alpha_m = 0.673; + else if(m == 32) + alpha_m = 0.697; + else if(m == 64) + alpha_m = 0.709; + else + alpha_m = 0.7213/(1+1.079/m); - for(uint64_t i = 0; i < m; i++){ - buckets[i] = 0; - } - - V = m; - - } - - CardinalityCounter :: CardinalityCounter(double error_margin){ - int b = optimalB(error_margin); - m = (uint64_t) pow(2, b); - buckets = new uint8_t[m]; - - if(m == 16) - alpha_m = 0.673; - else if(m == 32) - alpha_m = 0.697; - else if(m == 64) - alpha_m = 0.709; - else - alpha_m = 0.7213/(1+1.079/m); - - for(uint64_t i = 0; i < m; i++){ - buckets[i] = 0; - } - - V = m; - } - - CardinalityCounter :: ~CardinalityCounter(){ - delete [] buckets; - delete &m; - delete &V; - delete &alpha_m; - } - - uint8_t CardinalityCounter :: rank(uint64_t hash_modified){ - uint8_t answer = 0; - hash_modified = (uint64_t)(hash_modified/m); - hash_modified *= 2; - do{ - hash_modified = (uint64_t) (hash_modified/2); - answer++; - }while(hash_modified%2 == 0); - return answer; - } - - - - void CardinalityCounter::addElement(uint64_t hash){ - uint64_t index = hash % m; - hash = hash-index; - - if(buckets[index] == 0) - V--; - uint8_t temp = rank(hash); - if(temp > buckets[index]){ - buckets[index] = temp; - } - } + for (uint64_t i = 0; i < m; i++) + buckets[i] = 0; - double CardinalityCounter::size(){ - double answer = 0; - for(int i = 0; i < m; i++){ - answer += pow(2, -(int)buckets[i]); - } - answer = 1/answer; - answer = alpha_m*m*m*answer; + V = m; + } + +CardinalityCounter :: CardinalityCounter(double error_margin) + { + int b = optimalB(error_margin); + m = (uint64_t) pow(2, b); + buckets = new uint8_t[m]; - if(answer <= 5*(double)(m/2)){ - return m*log((double) m/V); - } - else if(answer <= pow(2,64)/30){ - return answer; - } - else{ - return -pow(2,64)*log(1-answer/pow(2,64)); - } - } + if(m == 16) + alpha_m = 0.673; + else if(m == 32) + alpha_m = 0.697; + else if(m == 64) + alpha_m = 0.709; + else + alpha_m = 0.7213/(1+1.079/m); - void CardinalityCounter::merge(CardinalityCounter* c){ - uint8_t* temp = (*c).getBuckets(); - V = 0; - for(int i = 0; i < m; i++){ - if(temp[i] > buckets[i]){ - buckets[i] = temp[i]; - } - if(buckets[i] == 0){ - V += 1; - } - } - } + for (uint64_t i = 0; i < m; i++) + buckets[i] = 0; - uint8_t* CardinalityCounter::getBuckets(){ - return buckets; - } + V = m; +} - uint64_t CardinalityCounter::getM(){ - return m; - } +CardinalityCounter::~CardinalityCounter() + { + delete [] buckets; + delete &m; + delete &V; + delete &alpha_m; + } + +uint8_t CardinalityCounter::rank(uint64_t hash_modified) + { + uint8_t answer = 0; + hash_modified = (uint64_t)(hash_modified/m); + hash_modified *= 2; + do + { + hash_modified = (uint64_t) (hash_modified/2); + answer++; + } + while (hash_modified%2 == 0); + + return answer; + } + +void CardinalityCounter::addElement(uint64_t hash) + { + uint64_t index = hash % m; + hash = hash-index; + + if(buckets[index] == 0) + V--; + + uint8_t temp = rank(hash); + + if (temp > buckets[index]) + buckets[index] = temp; +} + +double CardinalityCounter::size() + { + double answer = 0; + for (int i = 0; i < m; i++) + answer += pow(2, -(int)buckets[i]); + + answer = 1/answer; + answer = alpha_m*m*m*answer; + + if (answer <= 5*(double)(m/2)) + return m*log((double) m/V); + else if(answer <= pow(2,64)/30) + return answer; + else + return -pow(2,64)*log(1-answer/pow(2,64)); +} + +void CardinalityCounter::merge(CardinalityCounter* c) + { + uint8_t* temp = (*c).getBuckets(); + V = 0; + for (int i = 0; i < m; i++) + { + if (temp[i] > buckets[i]) + buckets[i] = temp[i]; + + if (buckets[i] == 0) + V += 1; + } + } + +uint8_t* CardinalityCounter::getBuckets() + { + return buckets; + } + +uint64_t CardinalityCounter::getM() + { + return m; + } diff --git a/src/HyperLogLog.h b/src/HyperLogLog.h index ba9a46f1bd..81c19067a1 100644 --- a/src/HyperLogLog.h +++ b/src/HyperLogLog.h @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. + #include /* diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 51f975edf8..604ce2938e 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -2,6 +2,28 @@ #include "Reporter.h" #include "Serializer.h" + +CardinalityVal::CardinalityVal() + { + valid = false; + } + +CardinalityVal::~CardinalityVal() + { + if ( valid && c ) + delete c; + } + +bool CardinalityVal::Init(CardinalityCounter* arg_c) + { + if ( valid ) + return false; + + valid = true; + c = arg_c; + return valid; + } + bool HashVal::IsValid() const { return valid; diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 0428e50bdb..01f86529c7 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -7,6 +7,24 @@ #include "Val.h" #include "digest.h" +class CardinalityCounter; + +class CardinalityVal: public OpaqueVal { +public: + CardinalityVal(); + ~CardinalityVal(); + bool Init(CardinalityCounter*); + bool IsValid() const { return valid; }; + CardinalityCounter* Get() { return c; }; + +private: + bool valid; + CardinalityCounter* c; + +// DECLARE_SERIAL(CardinalityVal); Fixme? +}; + + class HashVal : public OpaqueVal { public: virtual bool IsValid() const; diff --git a/src/bro.bif b/src/bro.bif index 13e0d6e407..6e39ee16b0 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5650,44 +5650,36 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr %%{ #include "HyperLogLog.h" -static map hll_counters; - -BroString* convert_index_to_string(Val* index) - { - ODesc d; - index->Describe(&d); - BroString* s = new BroString(1, d.TakeBytes(), d.Len()); - s->SetUseFreeToDelete(1); - return s; - } %%} ## Initializes the hash for the HyperLogLog cardinality counting algorithm. ## It returns true if it was successful in creating a structure and false ## if it wasn't. -function hll_cardinality_init%(err: double,index: any%): bool +function hll_cardinality_init%(err: double%): opaque of cardinality %{ - BroString* s = convert_index_to_string(index); - int status = 0; + CardinalityCounter* c = new CardinalityCounter(err); + CardinalityVal* cv = new CardinalityVal(); - if ( hll_counters.count(*s) < 1 ) - { - hll_counters[*s] = new CardinalityCounter(err); - status = 1; - } + if ( !c ) + reporter->Error("Failed initialize Cardinality counter"); + else + cv->Init(c); - delete s; - return new Val(status, TYPE_BOOL); + return cv; %} ## Adds an element to the HyperLogLog data structure located at index. ##elem->Type() to get the type of elem. -function hll_cardinality_add%(elem: any, index: any%): bool +function hll_cardinality_add%(handle: opaque of cardinality, elem: any%): bool %{ - BroString* s = convert_index_to_string(index); + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + int status = 0; uint64_t a = 123456; @@ -5696,19 +5688,13 @@ function hll_cardinality_add%(elem: any, index: any%): bool CompositeHash* hll_hash = new CompositeHash(tl); Unref(tl); - if( hll_counters.count(*s) > 0 ) - { - CardinalityCounter* h = hll_counters[*s]; - HashKey* key = hll_hash->ComputeHash(elem, 1); - a = key->Hash(); - h->addElement(a); - status = 1; - delete key; - } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); + HashKey* key = hll_hash->ComputeHash(elem, 1); + a = key->Hash(); + h->addElement(a); delete hll_hash; - delete s; - return new Val(status, TYPE_BOOL); + return new Val(1, TYPE_BOOL); %} ## The data structure at index1 will contain the combined count for the @@ -5716,135 +5702,68 @@ function hll_cardinality_add%(elem: any, index: any%): bool ## It returns true if it either cloned the value at index2 into index1 ## or if it merged the two data structures together. -function hll_cardinality_merge_into%(index1: any, index2: any%): bool +function hll_cardinality_merge_into%(handle1: opaque of cardinality, handle2: opaque of cardinality%): bool %{ - BroString* s1 = convert_index_to_string(index1); - BroString* s2 = convert_index_to_string(index2); - int status = 0; + CardinalityVal* v1 = (CardinalityVal*) handle1; + CardinalityVal* v2 = (CardinalityVal*) handle2; - if(hll_counters.count(*s1) < 1) - { - if(hll_counters.count(*s2) < 1) - { - status = 0; - } - else - { - uint64_t m = (*hll_counters[*s2]).getM(); - CardinalityCounter* newInst = new CardinalityCounter(m); - hll_counters[*s1] = newInst; - (*newInst).merge(hll_counters[*s2]); - status = 1; - } - } - else - { - if(hll_counters.count(*s2) < 1) - { - status = 0; - } - else - { - if((*hll_counters[*s2]).getM() == (*hll_counters[*s1]).getM()) - { - status = 1; - (*hll_counters[*s1]).merge(hll_counters[*s2]); - } - } - } + if ( !v1->IsValid() || !v2->IsValid() ) { + reporter->Error("need valid handles"); + return new Val(0, TYPE_BOOL); + } - delete s1; - delete s2; - return new Val(status, TYPE_BOOL); + CardinalityCounter* h1 = v1->Get(); + CardinalityCounter* h2 = v2->Get(); + h1->merge(h2); + + return new Val(1, TYPE_BOOL); %} ## Returns true if it destroyed something. False if it didn't. -function hll_cardinality_destroy%(index: any%): bool - %{ - BroString* s = convert_index_to_string(index); - int status = 0; - - if(hll_counters.count(*s) > 0) - { - hll_counters.erase(*s); - status = 1; - } - - delete s; - return new Val(status, TYPE_BOOL); - %} +#function hll_cardinality_destroy%(handle: opaque of cardinality%): bool +# %{ +# if ( !((CardinalityVal*) handle)->IsValid() ) { +# reporter->Error("Need valid handle"); +# return new Val(0, TYPE_BOOL); +# } +# CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); +# delete h; +# h = 0; +# return new Val(1, TYPE_BOOL); +# %} ## Returns the cardinality estimate. Returns -1.0 if there is nothing in that index. -function hll_cardinality_estimate%(index: any%): double +function hll_cardinality_estimate%(handle: opaque of cardinality%): double %{ - BroString* s = convert_index_to_string(index); - double estimate = -1.0; + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - if(hll_counters.count(*s) > 0) - { - estimate = (*hll_counters[*s]).size(); - } + double estimate = h->size(); - delete s; return new Val(estimate, TYPE_DOUBLE); %} -##I'm really not sure about the notation of this function... - -function hll_cardinality_keys%(%): string_set - %{ - TableVal* a = new TableVal(string_set); - map::iterator it; - int i = 0; - - for(it = hll_counters.begin() ; it != hll_counters.end(); it++) - { - BroString* s = (BroString*) &(it->first); - a->Assign(new StringVal(s), 0); - } - return a; - %} - ## Stores the data structure at index2 into index1. Deletes the data structure at index1 ## if there was any. Returns True if the data structure at index1 was changed in any way. -function hll_cardinality_clone%(index1: any, index2: any%): bool +function hll_cardinality_clone%(handle: opaque of cardinality%): opaque of cardinality %{ - BroString* s1 = convert_index_to_string(index1); - BroString* s2 = convert_index_to_string(index2); - int status = 0; + if ( !((CardinalityVal*) handle)->IsValid() ) { + reporter->Error("Need valid handle"); + return new Val(0, TYPE_BOOL); + } + CardinalityCounter* h = ((CardinalityVal*) handle)->Get(); - if(hll_counters.count(*s2) < 1) - { - if(hll_counters.count(*s1) < 1) - { - status = 0; - } - else - { - delete hll_counters[*s1]; - status = 1; - } - } - else - { - uint64_t m = (*hll_counters[*s2]).getM(); - CardinalityCounter* newInst = new CardinalityCounter(m); - int i = 0; - (*newInst).merge(hll_counters[*s2]); - if(hll_counters.count(*s1) < 1) - { - hll_counters[*s1] = newInst; - } - else - { - delete hll_counters[*s1]; - hll_counters[*s1] = newInst; - } - status = 1; - } - delete s1; - delete s2; - return new Val(status, TYPE_BOOL); + + uint64_t m = h->getM(); + CardinalityCounter* h2 = new CardinalityCounter(m); + int i = 0; + h2->merge(h); + CardinalityVal* cv = new CardinalityVal(); + cv->Init(h2); + return cv; %} diff --git a/testing/btest/Baseline/bifs.hll_cardinality/out b/testing/btest/Baseline/bifs.hll_cardinality/out new file mode 100644 index 0000000000..8d20248cc3 --- /dev/null +++ b/testing/btest/Baseline/bifs.hll_cardinality/out @@ -0,0 +1,23 @@ +This value should be around 13: +13.00129 +This value should be about 12: +12.001099 +This value should be around 0: +0.0 +This value should be around 13: +13.00129 +This value should be 0: +0.0 +This value should be true: +T +This value should be about 12: +12.001099 +12.001099 +This value should be true: +T +This value should be about 21: +21.003365 +This value should be about 13: +13.00129 +This value should be about 12: +12.001099 diff --git a/testing/btest/bifs/hll_cardinality.bro b/testing/btest/bifs/hll_cardinality.bro index 093de134df..774e8f6e28 100644 --- a/testing/btest/bifs/hll_cardinality.bro +++ b/testing/btest/bifs/hll_cardinality.bro @@ -4,135 +4,78 @@ event bro_init() { - local m1 = "measurement1"; - local m2 = "measurement2"; - - print "This value should be true:"; - print hll_cardinality_init(0.01, m1); - hll_cardinality_init(0.01, m2); - - print "This value should be false:"; - print hll_cardinality_init(0.02, "measurement1"); + local c1 = hll_cardinality_init(0.01); + local c2 = hll_cardinality_init(0.01); local add1 = "hey"; local add2 = "hi"; local add3 = 123; - hll_cardinality_add(add1, m1); - hll_cardinality_add(add2, m1); - hll_cardinality_add(add3, m1); - hll_cardinality_add("a", m1); - hll_cardinality_add("b", m1); - hll_cardinality_add("c", m1); - hll_cardinality_add("d", m1); - hll_cardinality_add("e", m1); - hll_cardinality_add("f", m1); - hll_cardinality_add("g", m1); - hll_cardinality_add("h", m1); - hll_cardinality_add("i", m1); + hll_cardinality_add(c1, add1); + hll_cardinality_add(c1, add2); + hll_cardinality_add(c1, add3); + hll_cardinality_add(c1, "a"); + hll_cardinality_add(c1, "b"); + hll_cardinality_add(c1, "c"); + hll_cardinality_add(c1, "d"); + hll_cardinality_add(c1, "e"); + hll_cardinality_add(c1, "f"); + hll_cardinality_add(c1, "g"); + hll_cardinality_add(c1, "h"); + hll_cardinality_add(c1, "i"); + hll_cardinality_add(c1, "j"); - print "This value should be true:"; - print hll_cardinality_add("j", m1); - - print "This value should be false:"; - print hll_cardinality_add("asdf", "something"); - - - hll_cardinality_add(add1, m2); - hll_cardinality_add(add2, m2); - hll_cardinality_add(add3, m2); - hll_cardinality_add(1, m2); - hll_cardinality_add("b", m2); - hll_cardinality_add(2, m2); - hll_cardinality_add(3, m2); - hll_cardinality_add(4, m2); - hll_cardinality_add(5, m2); - hll_cardinality_add(6, m2); - hll_cardinality_add(7, m2); - hll_cardinality_add(8, m2); + hll_cardinality_add(c2, add1); + hll_cardinality_add(c2, add2); + hll_cardinality_add(c2, add3); + hll_cardinality_add(c2, 1); + hll_cardinality_add(c2, "b"); + hll_cardinality_add(c2, 2); + hll_cardinality_add(c2, 3); + hll_cardinality_add(c2, 4); + hll_cardinality_add(c2, 5); + hll_cardinality_add(c2, 6); + hll_cardinality_add(c2, 7); + hll_cardinality_add(c2, 8); print "This value should be around 13:"; - print hll_cardinality_estimate("measurement1"); + print hll_cardinality_estimate(c1); - print "This value should be -1.0:"; - print hll_cardinality_estimate("m2"); + print "This value should be about 12:"; + print hll_cardinality_estimate(c2); - hll_cardinality_init(0.02, "m2"); + local m2 = hll_cardinality_init(0.02); print "This value should be around 0:"; - print hll_cardinality_estimate("m2"); + print hll_cardinality_estimate(m2); - print "This value should be true:"; - print hll_cardinality_destroy("m2"); - - print "This value should be false:"; - print hll_cardinality_destroy("m2"); - - print "This value should be -1.0:"; - print hll_cardinality_estimate("m2"); - - print "This next thing should be false:"; - print hll_cardinality_clone("m3", "m2"); - - print "This next thing should be true:"; - print hll_cardinality_clone("measurement3", "measurement1"); + local c3 = hll_cardinality_clone(c1); print "This value should be around 13:"; - print hll_cardinality_estimate("measurement3"); + print hll_cardinality_estimate(c3); - hll_cardinality_destroy("measurement3"); - - print "This next thing should be equal to -1.0:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be around 13:"; - print hll_cardinality_estimate("measurement1"); + c3 = hll_cardinality_init(0.01); + print "This value should be 0:"; + print hll_cardinality_estimate(c3); print "This value should be true:"; - print hll_cardinality_merge_into("measurement3", "measurement2"); - - print "This value should be false:"; - print hll_cardinality_merge_into("measurement4", "measurement6"); + print hll_cardinality_merge_into(c3, c2); print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); - - print "This value should be false:"; - print hll_cardinality_merge_into("measurement3", "measurement15"); - - print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); + print hll_cardinality_estimate(c2); + print hll_cardinality_estimate(c3); print "This value should be true:"; - print hll_cardinality_merge_into("measurement2", "measurement1"); + print hll_cardinality_merge_into(c2, c1); print "This value should be about 21:"; - print hll_cardinality_estimate("measurement2"); + print hll_cardinality_estimate(c2); print "This value should be about 13:"; - print hll_cardinality_estimate("measurement1"); + print hll_cardinality_estimate(c1); print "This value should be about 12:"; - print hll_cardinality_estimate("measurement3"); + print hll_cardinality_estimate(c3); - local keys = hll_cardinality_keys(); - for(key in keys) - { - print "The key is:"; - print key; - print "The value is:"; - print hll_cardinality_estimate(key); - } } -#function hll_cardinality_keys%(%): bool -# %{ -#// TableVal* a = new TableVal(string_set); -#// map::iterator it; -# -#// for(it = hll_counters.begin() ; it != hll_counters.end(); it++) -#// { -#// a->Assign((*it).first); -#// } -#// return a; -# return new Val(1, TYPE_BOOL);