mirror of
https://github.com/zeek/zeek.git
synced 2025-10-08 09:38:19 +00:00
implement topk.
This is _completely_ untested. It compiles. It will probably do nothing else (well, besides crashing Bro).
This commit is contained in:
parent
9a88dc500a
commit
c21c18ea45
3 changed files with 281 additions and 0 deletions
|
@ -408,6 +408,7 @@ set(bro_SRCS
|
||||||
Telnet.cc
|
Telnet.cc
|
||||||
Teredo.cc
|
Teredo.cc
|
||||||
Timer.cc
|
Timer.cc
|
||||||
|
Topk.cc
|
||||||
Traverse.cc
|
Traverse.cc
|
||||||
Trigger.cc
|
Trigger.cc
|
||||||
TunnelEncapsulation.cc
|
TunnelEncapsulation.cc
|
||||||
|
|
224
src/Topk.cc
Normal file
224
src/Topk.cc
Normal file
|
@ -0,0 +1,224 @@
|
||||||
|
// See the file "COPYING" in the main distribution directory for copyright.
|
||||||
|
|
||||||
|
#include "Topk.h"
|
||||||
|
#include "CompHash.h"
|
||||||
|
#include "Reporter.h"
|
||||||
|
|
||||||
|
namespace Topk {
|
||||||
|
|
||||||
|
static void topk_element_hash_delete_func(void* val)
|
||||||
|
{
|
||||||
|
Element* e = (Element*) val;
|
||||||
|
delete e;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drops the reference this element holds on its tracked value (if any) and
// clears the pointer.
Element::~Element()
	{
	if ( value != 0 )
		{
		Unref(value);
		value = 0;
		}
	}
|
||||||
|
|
||||||
|
// Computes the hash key used to index `v` in the element dictionary.
//
// The returned key is heap-allocated; the caller owns it and must delete it
// (Encountered() does so after the lookup/insert).
HashKey* Topk::GetHash(Val* v)
	{
	// Describe v as a single-entry type list so CompositeHash can hash it.
	TypeList* tl = new TypeList(v->Type());
	tl->Append(v->Type());
	CompositeHash* topk_hash = new CompositeHash(tl);
	Unref(tl);

	HashKey* key = topk_hash->ComputeHash(v, 1);

	// The hasher is only needed for this one computation. Previously it was
	// never released, leaking one CompositeHash per call.
	// NOTE(review): assumes the returned HashKey does not point into storage
	// owned by topk_hash - confirm against CompHash before merging.
	delete topk_hash;

	assert(key);
	return key;
	}
|
||||||
|
|
||||||
|
// Creates an empty tracker that follows at most `arg_size` distinct elements.
Topk::Topk(uint64 arg_size)
	{
	// The dictionary owns the Elements; the delete func frees them.
	elementDict = new PDict(Element);
	elementDict->SetDeleteFunc(topk_element_hash_delete_func);

	size = arg_size;

	// The element type is only known once the first value is encountered.
	type = 0;

	// Must start at zero: getTopK() and Encountered() both branch on it, and
	// it was previously left uninitialized.
	numElements = 0;
	}
|
||||||
|
|
||||||
|
// Tears down the tracker: first the dictionary (and with it the elements),
// then the buckets, then the reference on the element type.
Topk::~Topk()
	{
	elementDict->Clear();
	delete elementDict;

	// The elements are gone at this point, so only the buckets themselves
	// remain to be freed.
	for ( std::list<Bucket*>::iterator b = buckets.begin(); b != buckets.end(); b++ )
		delete *b;

	if ( type )
		Unref(type);

	type = 0;
	}
|
||||||
|
|
||||||
|
// Returns a vector holding the most frequently encountered values, highest
// count first.
//
// k: number of values requested. Buckets are emitted whole, so the result may
//    contain more than k entries; it contains fewer if fewer are tracked.
//
// Returns 0 (after reporting an error) if nothing is tracked yet. Otherwise
// the caller receives a newly created VectorVal.
//
// Note: this does no estimation of whether the result is correct.
VectorVal* Topk::getTopK(int k) // returns vector
	{
	if ( numElements == 0 || buckets.empty() )
		{
		reporter->Error("Cannot return topk of empty");
		return 0;
		}

	TypeList* vector_index = new TypeList(type);
	vector_index->Append(type);
	VectorType* v = new VectorType(vector_index);
	VectorVal* t = new VectorVal(v);

	// Buckets are kept in ascending count order, so walk the list backwards
	// from the last bucket. The iterator is stepped back *before* each
	// dereference: end() itself must never be dereferenced, and the walk
	// stops once the first bucket has been emitted.
	int read = 0;
	std::list<Bucket*>::iterator it = buckets.end();

	while ( read < k && it != buckets.begin() )
		{
		--it;

		std::list<Element*>& elems = (*it)->elements;
		for ( std::list<Element*>::iterator eit = elems.begin(); eit != elems.end(); ++eit )
			{
			t->Assign(read, (*eit)->value->Ref());
			++read;
			}
		}

	Unref(v);
	return t;
	}
|
||||||
|
|
||||||
|
// Records one occurrence of `encountered`.
//
// - If the value is already tracked, its counter is incremented.
// - If it is new and there is still room, it is added with count 1.
// - If it is new and the tracker is full, the oldest element of the
//   lowest-count bucket is evicted; the new element inherits that bucket's
//   count as its error bound (epsilon) and is then incremented.
//
// Reports an error and ignores the value if its type differs from the type of
// the values seen so far, or if the tracker was created with size 0.
void Topk::Encountered(Val* encountered)
	{
	// With zero capacity there is never a bucket to evict from; bail out
	// before touching the (empty) bucket list.
	if ( size == 0 )
		{
		reporter->Error("Cannot add element to topk of size 0");
		return;
		}

	// check type compatibility
	if ( numElements == 0 )
		type = encountered->Type()->Ref();
	else if ( !same_type(type, encountered->Type()) )
		{
		reporter->Error("Trying to add element to topk with differing type from other elements");
		return;
		}

	// Step 1 - get the hash and see if we already know this one.
	HashKey* key = GetHash(encountered);
	Element* e = (Element*) elementDict->Lookup(key);

	if ( e == 0 )
		{
		// well, we do not know this one yet...
		e = new Element();
		e->epsilon = 0;
		e->value = encountered->Ref(); // or no ref?

		if ( numElements < size )
			{
			// Still room: the element belongs in the count-1 bucket, which
			// is the first bucket if it exists at all.
			Bucket* b = 0;

			if ( buckets.size() == 0 || (*buckets.begin())->count > 1 )
				{
				b = new Bucket();
				b->count = 1;
				b->bucketPos = buckets.insert(buckets.begin(), b);
				}
			else
				{
				b = *buckets.begin();
				assert(b->count == 1);
				}

			b->elements.insert(b->elements.end(), e);
			e->parent = b;

			elementDict->Insert(key, e);
			numElements++;
			delete key;
			return; // done. it is at pos 1.
			}

		// Full: replace the element with the minimal count.
		Bucket* b = *buckets.begin(); // bucket with smallest elements

		// evict oldest element with least hits.
		assert(b->elements.size() > 0);
		HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
		b->elements.erase(b->elements.begin());
		Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey);
		assert(deleteElement); // there has to have been a minimal element...
		delete deleteElement;
		delete deleteKey;

		// and add the new one to the end
		e->epsilon = b->count;
		// The new element must know its bucket: IncrementCounter() below
		// starts from e->parent, which was previously never set on this path.
		e->parent = b;
		b->elements.insert(b->elements.end(), e);
		elementDict->Insert(key, e);
		// fall through, the increment operation has to run!
		}

	// ok, we now have an element in e
	delete key;
	IncrementCounter(e);
	}
|
||||||
|
|
||||||
|
// Moves `e` from its current bucket to the bucket whose count is one higher,
// creating that bucket if it does not exist yet and deleting the old bucket
// if it ends up empty.
//
// e: a tracked element whose `parent` points at the bucket it currently
//    lives in.
void Topk::IncrementCounter(Element* e)
	{
	Bucket* currBucket = e->parent;
	uint64 currcount = currBucket->count;

	// The bucket for currcount+1, if present, can only be the direct
	// successor of the current bucket.
	std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
	bucketIter++;

	Bucket* nextBucket = 0;

	if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount+1 )
		nextBucket = *bucketIter;

	if ( nextBucket == 0 )
		{
		// the bucket for the value that we want does not exist.
		// create it directly behind the current bucket...
		nextBucket = new Bucket();
		nextBucket->count = currcount+1;

		// ...and give it the iterator that points at itself.
		nextBucket->bucketPos = buckets.insert(bucketIter, nextBucket);
		}

	// ok, now we have the new bucket in nextBucket. Shift the element over...
	currBucket->elements.remove(e);
	nextBucket->elements.insert(nextBucket->elements.end(), e);
	e->parent = nextBucket;

	// if currBucket is empty, we have to delete it now
	if ( currBucket->elements.size() == 0 )
		{
		// Every bucket stores its own position in the list, so it can be
		// unlinked directly instead of searching the whole list for it.
		buckets.erase(currBucket->bucketPos);
		delete currBucket;
		currBucket = 0;
		}
	}
|
||||||
|
|
||||||
|
};
|
56
src/Topk.h
Normal file
56
src/Topk.h
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
// See the file "COPYING" in the main distribution directory for copyright.
|
||||||
|
|
||||||
|
#ifndef topk_h
|
||||||
|
#define topk_h
|
||||||
|
|
||||||
|
#include <list>
|
||||||
|
#include "Val.h"
|
||||||
|
#include "CompHash.h"
|
||||||
|
|
||||||
|
// This class implements the top-k algorithm. Or - to be more precise - my interpretation of it.
|
||||||
|
|
||||||
|
namespace Topk {
|
||||||
|
|
||||||
|
struct Element;
|
||||||
|
|
||||||
|
struct Bucket {
|
||||||
|
uint64 count;
|
||||||
|
std::list<Element*> elements;
|
||||||
|
std::list<Bucket*>::iterator bucketPos; // iterators only get invalidated for removed elements. This one points to us - so it is invalid when we are no longer there. Cute, isn't it?
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Element {
|
||||||
|
uint64 epsilon;
|
||||||
|
Val* value;
|
||||||
|
Bucket* parent;
|
||||||
|
|
||||||
|
~Element();
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
declare(PDict, Element);
|
||||||
|
|
||||||
|
class Topk {
|
||||||
|
|
||||||
|
public:
|
||||||
|
Topk(uint64 size);
|
||||||
|
~Topk();
|
||||||
|
void Encountered(Val* value); // we saw something
|
||||||
|
VectorVal* getTopK(int k); // returns vector
|
||||||
|
|
||||||
|
private:
|
||||||
|
void IncrementCounter(Element* e);
|
||||||
|
HashKey* GetHash(Val*); // this probably should go somewhere else.
|
||||||
|
|
||||||
|
BroType* type;
|
||||||
|
std::list<Bucket*> buckets;
|
||||||
|
PDict(Element)* elementDict;
|
||||||
|
uint64 size; // how many elements are we tracking?
|
||||||
|
uint64 numElements; // how many elements do we have at the moment
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
Loading…
Add table
Add a link
Reference in a new issue