mirror of
https://github.com/zeek/zeek.git
synced 2025-10-04 15:48:19 +00:00
Merge remote-tracking branch 'origin/topic/bernhard/topk' into topic/robin/topk-merge
* origin/topic/bernhard/topk: update documentation, rename get* to Get* and make hasher persistent Conflicts: src/probabilistic/Topk.cc src/probabilistic/Topk.h src/probabilistic/top-k.bif
This commit is contained in:
commit
f6e5de91fa
4 changed files with 178 additions and 86 deletions
|
@ -19,19 +19,22 @@ static void topk_element_hash_delete_func(void* val)
|
|||
Element::~Element()
|
||||
{
|
||||
Unref(value);
|
||||
value = 0;
|
||||
}
|
||||
|
||||
void TopkVal::Typify(BroType* t)
|
||||
{
|
||||
assert(!hash && !type);
|
||||
type = t->Ref();
|
||||
TypeList* tl = new TypeList(t);
|
||||
tl->Append(t->Ref());
|
||||
hash = new CompositeHash(tl);
|
||||
Unref(tl);
|
||||
}
|
||||
|
||||
HashKey* TopkVal::GetHash(Val* v) const
|
||||
{
|
||||
TypeList* tl = new TypeList(v->Type());
|
||||
tl->Append(v->Type()->Ref());
|
||||
CompositeHash* topk_hash = new CompositeHash(tl);
|
||||
Unref(tl);
|
||||
|
||||
HashKey* key = topk_hash->ComputeHash(v, 1);
|
||||
HashKey* key = hash->ComputeHash(v, 1);
|
||||
assert(key);
|
||||
delete topk_hash;
|
||||
return key;
|
||||
}
|
||||
|
||||
|
@ -43,6 +46,7 @@ TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(topk_type)
|
|||
type = 0;
|
||||
numElements = 0;
|
||||
pruned = false;
|
||||
hash = 0;
|
||||
}
|
||||
|
||||
TopkVal::TopkVal() : OpaqueVal(topk_type)
|
||||
|
@ -52,6 +56,7 @@ TopkVal::TopkVal() : OpaqueVal(topk_type)
|
|||
size = 0;
|
||||
type = 0;
|
||||
numElements = 0;
|
||||
hash = 0;
|
||||
}
|
||||
|
||||
TopkVal::~TopkVal()
|
||||
|
@ -68,7 +73,7 @@ TopkVal::~TopkVal()
|
|||
}
|
||||
|
||||
Unref(type);
|
||||
type = 0;
|
||||
delete hash;
|
||||
}
|
||||
|
||||
void TopkVal::Merge(const TopkVal* value, bool doPrune)
|
||||
|
@ -76,7 +81,7 @@ void TopkVal::Merge(const TopkVal* value, bool doPrune)
|
|||
if ( type == 0 )
|
||||
{
|
||||
assert(numElements == 0);
|
||||
type = value->type->Ref();
|
||||
Typify(value->type);
|
||||
}
|
||||
|
||||
else
|
||||
|
@ -230,7 +235,10 @@ bool TopkVal::DoUnserialize(UnserialInfo* info)
|
|||
v &= UNSERIALIZE(&type_present);
|
||||
if ( type_present )
|
||||
{
|
||||
type = BroType::Unserialize(info);
|
||||
BroType* deserialized_type = BroType::Unserialize(info);
|
||||
|
||||
Typify(deserialized_type);
|
||||
Unref(deserialized_type);
|
||||
assert(type);
|
||||
}
|
||||
else
|
||||
|
@ -270,7 +278,7 @@ bool TopkVal::DoUnserialize(UnserialInfo* info)
|
|||
}
|
||||
|
||||
|
||||
VectorVal* TopkVal::getTopK(int k) const // returns vector
|
||||
VectorVal* TopkVal::GetTopK(int k) const // returns vector
|
||||
{
|
||||
if ( numElements == 0 )
|
||||
{
|
||||
|
@ -311,14 +319,14 @@ VectorVal* TopkVal::getTopK(int k) const // returns vector
|
|||
return t;
|
||||
}
|
||||
|
||||
uint64_t TopkVal::getCount(Val* value) const
|
||||
uint64_t TopkVal::GetCount(Val* value) const
|
||||
{
|
||||
HashKey* key = GetHash(value);
|
||||
Element* e = (Element*) elementDict->Lookup(key);
|
||||
|
||||
if ( e == 0 )
|
||||
{
|
||||
reporter->Error("getCount for element that is not in top-k");
|
||||
reporter->Error("GetCount for element that is not in top-k");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -326,14 +334,14 @@ uint64_t TopkVal::getCount(Val* value) const
|
|||
return e->parent->count;
|
||||
}
|
||||
|
||||
uint64_t TopkVal::getEpsilon(Val* value) const
|
||||
uint64_t TopkVal::GetEpsilon(Val* value) const
|
||||
{
|
||||
HashKey* key = GetHash(value);
|
||||
Element* e = (Element*) elementDict->Lookup(key);
|
||||
|
||||
if ( e == 0 )
|
||||
{
|
||||
reporter->Error("getEpsilon for element that is not in top-k");
|
||||
reporter->Error("GetEpsilon for element that is not in top-k");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -341,7 +349,7 @@ uint64_t TopkVal::getEpsilon(Val* value) const
|
|||
return e->epsilon;
|
||||
}
|
||||
|
||||
uint64_t TopkVal::getSum() const
|
||||
uint64_t TopkVal::GetSum() const
|
||||
{
|
||||
uint64_t sum = 0;
|
||||
|
||||
|
@ -353,8 +361,8 @@ uint64_t TopkVal::getSum() const
|
|||
it++;
|
||||
}
|
||||
|
||||
if ( pruned )
|
||||
reporter->Warning("TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count");
|
||||
if ( pruned )
|
||||
reporter->Warning("TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count");
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
@ -362,11 +370,9 @@ uint64_t TopkVal::getSum() const
|
|||
void TopkVal::Encountered(Val* encountered)
|
||||
{
|
||||
// ok, let's see if we already know this one.
|
||||
|
||||
//printf("NumElements: %d\n", numElements);
|
||||
// check type compatibility
|
||||
if ( numElements == 0 )
|
||||
type = encountered->Type()->Ref();
|
||||
|
||||
if ( numElements == 0 )
|
||||
Typify(encountered->Type());
|
||||
else
|
||||
if ( ! same_type(type, encountered->Type()) )
|
||||
{
|
||||
|
|
|
@ -38,51 +38,101 @@ declare(PDict, Element);
|
|||
class TopkVal : public OpaqueVal {
|
||||
|
||||
public:
|
||||
// Initialize a TopkVal. Size specifies how many total elements are
|
||||
// tracked
|
||||
/**
|
||||
* Construct a TopkVal.
|
||||
*
|
||||
* @param size specifies how many total elements are tracked
|
||||
*
|
||||
* @return A newly initialized TopkVal
|
||||
*/
|
||||
TopkVal(uint64 size);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~TopkVal();
|
||||
|
||||
// Call this, when a new value is encountered. Note that on the first call,
|
||||
// the Bro-Type of the value types that are counted is set. All following calls
|
||||
// to encountered have to specify the same type
|
||||
void Encountered(Val* value);
|
||||
/**
|
||||
* Call this, when a new value is encountered. Note that on the first call,
|
||||
* the Bro-Type of the value types that are counted is set. All following calls
|
||||
* to encountered have to specify the same type.
|
||||
*
|
||||
* @param value The encountered element
|
||||
*/
|
||||
void Encountered(Val* value);
|
||||
|
||||
// Return the first k elements of the result vector. At the moment, this does
|
||||
// not check if it is in the right order or if we can prove that these are
|
||||
// the correct top-k. Use count and epsilon for this.
|
||||
VectorVal* getTopK(int k) const; // returns vector
|
||||
/**
|
||||
* Get the first k elements of the result vector. At the moment, this does
|
||||
* not check if it is in the right order or if we can prove that these are
|
||||
* the correct top-k. Use count and epsilon for this.
|
||||
*
|
||||
* @param k Number of top-elements to return
|
||||
*
|
||||
* @returns The top-k encountered elements
|
||||
*/
|
||||
VectorVal* GetTopK(int k) const;
|
||||
|
||||
// Get the current count tracked in the top-k data structure for a certain val.
|
||||
// Returns 0 if the val is unknown (and logs the error to reporter)
|
||||
uint64_t getCount(Val* value) const;
|
||||
/**
|
||||
* Get the current count tracked in the top-k data structure for a certain val.
|
||||
* Returns 0 if the val is unknown (and logs the error to reporter)
|
||||
*
|
||||
* @param value Bro value to get counts for
|
||||
*
|
||||
* @returns internal count for val, 0 if unknown
|
||||
*/
|
||||
uint64_t GetCount(Val* value) const;
|
||||
|
||||
// Get the current epsilon tracked in the top-k data structure for a certain val.
|
||||
// Returns 0 if the val is unknown (and logs the error to reporter)
|
||||
uint64_t getEpsilon(Val* value) const;
|
||||
/**
|
||||
* Get the current epsilon tracked in the top-k data structure for a certain val.
|
||||
*
|
||||
* @param value Bro value to get epsilons for
|
||||
*
|
||||
* @returns the epsilon. Returns 0 if the val is unknown (and logs the error to reporter)
|
||||
*/
|
||||
uint64_t GetEpsilon(Val* value) const;
|
||||
|
||||
// Get the size set in the constructor
|
||||
uint64_t getSize() const { return size; }
|
||||
/**
|
||||
* Get the size set in the constructor
|
||||
*
|
||||
* @returns size of the top-k structure
|
||||
*/
|
||||
uint64_t GetSize() const { return size; }
|
||||
|
||||
// Get the sum of all counts of all tracked elements. This is equal to the number
|
||||
// of total observations up to this moment, if no elements were pruned from the data
|
||||
// structure.
|
||||
uint64_t getSum() const;
|
||||
/**
|
||||
* Get the sum of all counts of all tracked elements. This is equal to the number
|
||||
* of total observations up to this moment, if no elements were pruned from the data
|
||||
* structure.
|
||||
*
|
||||
* @returns sum of all counts
|
||||
*/
|
||||
uint64_t GetSum() const;
|
||||
|
||||
// Merge another top-k data structure in this one.
|
||||
// doPrune specifies if the total count of elements is limited to size after
|
||||
// merging.
|
||||
// Please note, that pruning will invalidate the results of getSum.
|
||||
/**
|
||||
* Merge another top-k data structure into this one.
|
||||
* doPrune specifies if the total count of elements is limited to size after
|
||||
* merging.
|
||||
* Please note that pruning will invalidate the results of GetSum.
|
||||
*
|
||||
* @param value TopkVal to merge into this TopkVal
|
||||
*
|
||||
* @param doPrune prune resulting TopkVal to size after merging
|
||||
*/
|
||||
void Merge(const TopkVal* value, bool doPrune=false);
|
||||
|
||||
protected:
|
||||
TopkVal(); // for deserialize
|
||||
/**
|
||||
* Construct an empty TopkVal.
|
||||
* Only used for deserialization
|
||||
*/
|
||||
TopkVal();
|
||||
|
||||
private:
|
||||
void IncrementCounter(Element* e, unsigned int count = 1);
|
||||
HashKey* GetHash(Val*) const; // this probably should go somewhere else.
|
||||
|
||||
void Typify(BroType*);
|
||||
|
||||
BroType* type;
|
||||
CompositeHash* hash;
|
||||
std::list<Bucket*> buckets;
|
||||
PDict(Element)* elementDict;
|
||||
uint64 size; // how many elements are we tracking?
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
|
||||
## Creates a top-k data structure which tracks *size* elements.
|
||||
##
|
||||
## size: number of elements to track
|
||||
##
|
||||
## Returns: Opaque pointer to the data structure.
|
||||
function topk_init%(size: count%): opaque of topk
|
||||
%{
|
||||
|
@ -18,9 +20,14 @@ function topk_init%(size: count%): opaque of topk
|
|||
return v;
|
||||
%}
|
||||
|
||||
## Add a new observed object to the data structure. The first
|
||||
## added object sets the type of data tracked by the top-k data
|
||||
## structure. All following values have to be of the same type
|
||||
## Add a new observed object to the data structure.
|
||||
##
|
||||
## .. note:: The first added object sets the type of data tracked by
|
||||
## the top-k data structure. All following values have to be of the same type
|
||||
##
|
||||
## handle: the TopK handle
|
||||
##
|
||||
## value: observed value
|
||||
function topk_add%(handle: opaque of topk, value: any%): any
|
||||
%{
|
||||
assert(handle);
|
||||
|
@ -32,63 +39,86 @@ function topk_add%(handle: opaque of topk, value: any%): any
|
|||
|
||||
## Get the first k elements of the top-k data structure
|
||||
##
|
||||
## handle: the TopK handle
|
||||
##
|
||||
## k: number of elements to return
|
||||
##
|
||||
## Returns: vector of the first k elements
|
||||
function topk_get_top%(handle: opaque of topk, k: count%): any
|
||||
%{
|
||||
assert(handle);
|
||||
probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle;
|
||||
return h->getTopK(k);
|
||||
return h->GetTopK(k);
|
||||
%}
|
||||
|
||||
## Get an overestimated count of how often value has been encountered.
|
||||
## value has to be part of the currently tracked elements, otherwise
|
||||
## 0 will be returned and an error message will be added to reporter.
|
||||
##
|
||||
## .. note:: value has to be part of the currently tracked elements, otherwise
|
||||
## 0 will be returned and an error message will be added to reporter.
|
||||
##
|
||||
## handle: the TopK handle
|
||||
##
|
||||
## value: Value to look up count for.
|
||||
##
|
||||
## Returns: Overestimated number for how often the element has been encountered
|
||||
function topk_count%(handle: opaque of topk, value: any%): count
|
||||
%{
|
||||
assert(handle);
|
||||
probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle;
|
||||
return new Val(h->getCount(value), TYPE_COUNT);
|
||||
return new Val(h->GetCount(value), TYPE_COUNT);
|
||||
%}
|
||||
|
||||
## Get the maximal overestimation for count. Same restrictions as for topk_count
|
||||
## apply.
|
||||
## Get the maximal overestimation for count.
|
||||
##
|
||||
## .. note:: Same restrictions as for topk_count apply.
|
||||
##
|
||||
## handle: the TopK handle
|
||||
##
|
||||
## value: Value to look up epsilon for.
|
||||
##
|
||||
## Returns: Number which represents the maximal overestimation for the count of this element.
|
||||
function topk_epsilon%(handle: opaque of topk, value: any%): count
|
||||
%{
|
||||
assert(handle);
|
||||
probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle;
|
||||
return new Val(h->getEpsilon(value), TYPE_COUNT);
|
||||
return new Val(h->GetEpsilon(value), TYPE_COUNT);
|
||||
%}
|
||||
|
||||
## Get the number of elements this data structure is supposed to track (given on init).
|
||||
## Note that the actual number of elements in the data structure can be lower or higher
|
||||
## than this. (higher due to non-pruned merges)
|
||||
##
|
||||
## Returns: size given during initialization
|
||||
## .. note:: The actual number of elements in the data structure can be lower
|
||||
## or higher (due to non-pruned merges) than this.
|
||||
##
|
||||
## handle: the TopK handle
|
||||
##
|
||||
## Returns: size given during initialization
|
||||
function topk_size%(handle: opaque of topk%): count
|
||||
%{
|
||||
assert(handle);
|
||||
probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle;
|
||||
return new Val(h->getSize(), TYPE_COUNT);
|
||||
return new Val(h->GetSize(), TYPE_COUNT);
|
||||
%}
|
||||
|
||||
## Get the sum of all counts of all elements in the data structure. Is equal to the number
|
||||
## of all inserted objects if the data structure never has been pruned. Do not use after
|
||||
## calling topk_merge_prune (will throw a warning message if used afterwards)
|
||||
## Get the sum of all counts of all elements in the data structure.
|
||||
##
|
||||
## .. note:: This is equal to the number of all inserted objects if the data structure
|
||||
## never has been pruned. Do not use after calling topk_merge_prune (will throw a
|
||||
## warning message if used afterwards)
|
||||
##
|
||||
## handle: the TopK handle
|
||||
##
|
||||
## Returns: sum of all counts
|
||||
function topk_sum%(handle: opaque of topk%): count
|
||||
%{
|
||||
assert(handle);
|
||||
probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle;
|
||||
return new Val(h->getSum(), TYPE_COUNT);
|
||||
return new Val(h->GetSum(), TYPE_COUNT);
|
||||
%}
|
||||
|
||||
## Merge the second topk data structure into the first. Does not remove any elements, the
|
||||
## resulting data structure can be bigger than the maximum size given on initialization.
|
||||
## Merge the second topk data structure into the first.
|
||||
##
|
||||
## .. note:: This does not remove any elements, the resulting data structure can
|
||||
## be bigger than the maximum size given on initialization.
|
||||
function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any
|
||||
%{
|
||||
assert(handle1);
|
||||
|
@ -103,9 +133,15 @@ function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any
|
|||
%}
|
||||
|
||||
## Merge the second topk data structure into the first and prune the final data structure
|
||||
## back to the size given on initialization. Use with care and only when being aware of the
|
||||
## restrictions this imposed. Do not call topk_size or topk_add afterwards, results will
|
||||
## probably not be what you expect.
|
||||
## back to the size given on initialization.
|
||||
##
|
||||
## .. note:: Use with care and only when being aware of the restrictions this entails.
|
||||
## Do not call topk_size or topk_add afterwards, results will probably not be what you
|
||||
## expect.
|
||||
##
|
||||
## handle1: the TopK handle in which the second TopK structure is merged
|
||||
##
|
||||
## handle2: the TopK handle which is merged into the first TopK structure
|
||||
function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any
|
||||
%{
|
||||
assert(handle1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue