diff --git a/src/probabilistic/Topk.cc b/src/probabilistic/Topk.cc index dbfa3cdb83..fa3cda2e9a 100644 --- a/src/probabilistic/Topk.cc +++ b/src/probabilistic/Topk.cc @@ -19,19 +19,22 @@ static void topk_element_hash_delete_func(void* val) Element::~Element() { Unref(value); - value = 0; + } + +void TopkVal::Typify(BroType* t) + { + assert(!hash && !type); + type = t->Ref(); + TypeList* tl = new TypeList(t); + tl->Append(t->Ref()); + hash = new CompositeHash(tl); + Unref(tl); } HashKey* TopkVal::GetHash(Val* v) const { - TypeList* tl = new TypeList(v->Type()); - tl->Append(v->Type()->Ref()); - CompositeHash* topk_hash = new CompositeHash(tl); - Unref(tl); - - HashKey* key = topk_hash->ComputeHash(v, 1); + HashKey* key = hash->ComputeHash(v, 1); assert(key); - delete topk_hash; return key; } @@ -43,6 +46,7 @@ TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(topk_type) type = 0; numElements = 0; pruned = false; + hash = 0; } TopkVal::TopkVal() : OpaqueVal(topk_type) @@ -52,6 +56,7 @@ TopkVal::TopkVal() : OpaqueVal(topk_type) size = 0; type = 0; numElements = 0; + hash = 0; } TopkVal::~TopkVal() @@ -68,7 +73,7 @@ TopkVal::~TopkVal() } Unref(type); - type = 0; + delete hash; } void TopkVal::Merge(const TopkVal* value, bool doPrune) @@ -76,7 +81,7 @@ void TopkVal::Merge(const TopkVal* value, bool doPrune) if ( type == 0 ) { assert(numElements == 0); - type = value->type->Ref(); + Typify(value->type); } else @@ -230,7 +235,10 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) v &= UNSERIALIZE(&type_present); if ( type_present ) { - type = BroType::Unserialize(info); + BroType* deserialized_type = BroType::Unserialize(info); + + Typify(deserialized_type); + Unref(deserialized_type); assert(type); } else @@ -270,7 +278,7 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) } -VectorVal* TopkVal::getTopK(int k) const // returns vector +VectorVal* TopkVal::GetTopK(int k) const // returns vector { if ( numElements == 0 ) { @@ -311,14 +319,14 @@ VectorVal* TopkVal::getTopK(int k) 
const // returns vector return t; } -uint64_t TopkVal::getCount(Val* value) const +uint64_t TopkVal::GetCount(Val* value) const { HashKey* key = GetHash(value); Element* e = (Element*) elementDict->Lookup(key); if ( e == 0 ) { - reporter->Error("getCount for element that is not in top-k"); + reporter->Error("GetCount for element that is not in top-k"); return 0; } @@ -326,14 +334,14 @@ uint64_t TopkVal::getCount(Val* value) const return e->parent->count; } -uint64_t TopkVal::getEpsilon(Val* value) const +uint64_t TopkVal::GetEpsilon(Val* value) const { HashKey* key = GetHash(value); Element* e = (Element*) elementDict->Lookup(key); if ( e == 0 ) { - reporter->Error("getEpsilon for element that is not in top-k"); + reporter->Error("GetEpsilon for element that is not in top-k"); return 0; } @@ -341,7 +349,7 @@ uint64_t TopkVal::getEpsilon(Val* value) const return e->epsilon; } -uint64_t TopkVal::getSum() const +uint64_t TopkVal::GetSum() const { uint64_t sum = 0; @@ -353,8 +361,8 @@ uint64_t TopkVal::getSum() const it++; } - if ( pruned ) - reporter->Warning("TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count"); + if ( pruned ) + reporter->Warning("TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count"); return sum; } @@ -362,11 +370,9 @@ uint64_t TopkVal::getSum() const void TopkVal::Encountered(Val* encountered) { // ok, let's see if we already know this one. - - //printf("NumElements: %d\n", numElements); - // check type compatibility - if ( numElements == 0 ) - type = encountered->Type()->Ref(); + + if ( numElements == 0 ) + Typify(encountered->Type()); else if ( ! 
same_type(type, encountered->Type()) ) { diff --git a/src/probabilistic/Topk.h b/src/probabilistic/Topk.h index af15acf955..9189b80984 100644 --- a/src/probabilistic/Topk.h +++ b/src/probabilistic/Topk.h @@ -38,51 +38,101 @@ declare(PDict, Element); class TopkVal : public OpaqueVal { public: - // Initialize a TopkVal. Size specifies how many total elements are - // tracked + /** + * Construct a TopkVal. + * + * @param size specifies how many total elements are tracked + * + * @return A newly initialized TopkVal + */ TopkVal(uint64 size); + + /** + * Destructor. + */ ~TopkVal(); - // Call this, when a new value is encountered. Note that on the first call, - // the Bro-Type of the value types that are counted is set. All following calls - // to encountered have to specify the same type - void Encountered(Val* value); + /** + * Call this, when a new value is encountered. Note that on the first call, + * the Bro-Type of the value types that are counted is set. All following calls + * to encountered have to specify the same type. + * + * @param value The encountered element + */ + void Encountered(Val* value); - // Return the first k elements of the result vector. At the moment, this does - // not check if it is in the right order or if we can prove that these are - // the correct top-k. Use count and epsilon for this. - VectorVal* getTopK(int k) const; // returns vector + /** + * Get the first k elements of the result vector. At the moment, this does + * not check if it is in the right order or if we can prove that these are + * the correct top-k. Use count and epsilon for this. + * + * @param k Number of top-elements to return + * + * @returns The top-k encountered elements + */ + VectorVal* GetTopK(int k) const; - // Get the current count tracked in the top-k data structure for a certain val. 
- // Returns 0 if the val is unknown (and logs the error to reporter) - uint64_t getCount(Val* value) const; + /** + * Get the current count tracked in the top-k data structure for a certain val. + * Returns 0 if the val is unknown (and logs the error to reporter) + * + * @param value Bro value to get counts for + * + * @returns internal count for val, 0 if unknown + */ + uint64_t GetCount(Val* value) const; - // Get the current epsilon tracked in the top-k data structure for a certain val. - // Returns 0 if the val is unknown (and logs the error to reporter) - uint64_t getEpsilon(Val* value) const; + /** + * Get the current epsilon tracked in the top-k data structure for a certain val. + * + * @param value Bro value to get epsilons for + * + * @returns the epsilon. Returns 0 if the val is unknown (and logs the error to reporter) + */ + uint64_t GetEpsilon(Val* value) const; - // Get the size set in the constructor - uint64_t getSize() const { return size; } + /** + * Get the size set in the constructor + * + * @returns size of the top-k structure + */ + uint64_t GetSize() const { return size; } - // Get the sum of all counts of all tracked elements. This is equal to the number - // of total observations up to this moment, if no elements were pruned from the data - // structure. - uint64_t getSum() const; + /** + * Get the sum of all counts of all tracked elements. This is equal to the number + * of total observations up to this moment, if no elements were pruned from the data + * structure. + * + * @returns sum of all counts + */ + uint64_t GetSum() const; - // Merge another top-k data structure in this one. - // doPrune specifies if the total count of elements is limited to size after - // merging. - // Please note, that pruning will invalidate the results of getSum. + /** + * Merge another top-k data structure into this one. + * doPrune specifies if the total count of elements is limited to size after + * merging. 
+ * Please note, that pruning will invalidate the results of GetSum. + * + * @param value TopkVal to merge into this TopkVal + * + * @param doPrune prune resulting TopkVal to size after merging + */ void Merge(const TopkVal* value, bool doPrune=false); protected: - TopkVal(); // for deserialize + /** + * Construct an empty TopkVal. + * Only used for deserialization + */ + TopkVal(); private: void IncrementCounter(Element* e, unsigned int count = 1); HashKey* GetHash(Val*) const; // this probably should go somewhere else. - + void Typify(BroType*); + BroType* type; + CompositeHash* hash; std::list buckets; PDict(Element)* elementDict; uint64 size; // how many elements are we tracking? diff --git a/src/probabilistic/top-k.bif b/src/probabilistic/top-k.bif index f4e7753d90..a3ffeae03b 100644 --- a/src/probabilistic/top-k.bif +++ b/src/probabilistic/top-k.bif @@ -11,6 +11,8 @@ ## Creates a top-k data structure which tracks *size* elements. ## +## size: number of elements to track +## ## Returns: Opaque pointer to the data structure. function topk_init%(size: count%): opaque of topk %{ @@ -18,9 +20,14 @@ function topk_init%(size: count%): opaque of topk return v; %} -## Add a new observed object to the data structure. The first -## added object sets the type of data tracked by the top-k data -## structure. All following values have to be of the same type +## Add a new observed object to the data structure. +## +## .. note:: The first added object sets the type of data tracked by +## the top-k data structure.
All following values have to be of the same type +## +## handle: the TopK handle +## +## value: observed value function topk_add%(handle: opaque of topk, value: any%): any %{ assert(handle); @@ -32,63 +39,86 @@ function topk_add%(handle: opaque of topk, value: any%): any ## Get the first k elements of the top-k data structure ## +## handle: the TopK handle +## +## k: number of elements to return +## ## Returns: vector of the first k elements function topk_get_top%(handle: opaque of topk, k: count%): any %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return h->getTopK(k); + return h->GetTopK(k); %} ## Get an overestimated count of how often value has been encountered. -## value has to be part of the currently tracked elements, otherwise -## 0 will be returned and an error message will be added to reporter. +## +## .. note:: value has to be part of the currently tracked elements, otherwise +## 0 will be returned and an error message will be added to reporter. +## +## handle: the TopK handle +## +## value: Value to look up count for. ## ## Returns: Overestimated number for how often the element has been encountered function topk_count%(handle: opaque of topk, value: any%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getCount(value), TYPE_COUNT); + return new Val(h->GetCount(value), TYPE_COUNT); %} -## Get a the maximal overestimation for count. Same restrictiosn as for topk_count -## apply. +## Get the maximal overestimation for count. +## +## .. note:: Same restrictions as for topk_count apply. +## +## handle: the TopK handle +## +## value: Value to look up epsilon for. ## ## Returns: Number which represents the maximal overesimation for the count of this element.
function topk_epsilon%(handle: opaque of topk, value: any%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getEpsilon(value), TYPE_COUNT); + return new Val(h->GetEpsilon(value), TYPE_COUNT); %} ## Get the number of elements this data structure is supposed to track (given on init). -## Note that the actual number of elements in the data structure can be lower or higher -## than this. (higher due to non-pruned merges) ## -## Returns: size given during initialization +## .. note:: The actual number of elements in the data structure can be lower +## or higher (due to non-pruned merges) than this. +## +## handle: the TopK handle +## +## Returns: size given during initialization function topk_size%(handle: opaque of topk%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getSize(), TYPE_COUNT); + return new Val(h->GetSize(), TYPE_COUNT); %} -## Get the sum of all counts of all elements in the data structure. Is equal to the number -## of all inserted objects if the data structure never has been pruned. Do not use after -## calling topk_merge_prune (will throw a warning message if used afterwards) +## Get the sum of all counts of all elements in the data structure. +## +## .. note:: This is equal to the number of all inserted objects if the data structure +## never has been pruned. Do not use after calling topk_merge_prune (will throw a +## warning message if used afterwards) +## +## handle: the TopK handle ## ## Returns: sum of all counts function topk_sum%(handle: opaque of topk%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getSum(), TYPE_COUNT); + return new Val(h->GetSum(), TYPE_COUNT); %} -## Merge the second topk data structure into the first. Does not remove any elements, the -## resulting data structure can be bigger than the maximum size given on initialization.
+## Merge the second topk data structure into the first. +## +## .. note:: This does not remove any elements, the resulting data structure can +## be bigger than the maximum size given on initialization. function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any %{ assert(handle1); @@ -103,9 +133,15 @@ function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any %} ## Merge the second topk data structure into the first and prunes the final data structure -## back to the size given on initialization. Use with care and only when being aware of the -## restrictions this imposed. Do not call topk_size or topk_add afterwards, results will -## probably not be what you expect. +## back to the size given on initialization. +## +## .. note:: Use with care and only when being aware of the restrictions this entails. +## Do not call topk_size or topk_add afterwards, results will probably not be what you +## expect. +## +## handle1: the TopK handle into which the second TopK structure is merged +## +## handle2: the TopK handle that is merged into the first TopK structure function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any %{ assert(handle1); diff --git a/testing/btest/Baseline/bifs.topk/.stderr b/testing/btest/Baseline/bifs.topk/.stderr index 80626107aa..a711333fc0 100644 --- a/testing/btest/Baseline/bifs.topk/.stderr +++ b/testing/btest/Baseline/bifs.topk/.stderr @@ -1,11 +1,11 @@ -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -warning: TopkVal::getSum() was used on a pruned data structure.
Result values do not represent total element count -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +warning: TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k