From c21c18ea45d24a20f48c42b0d828c184d1f66ebd Mon Sep 17 00:00:00 2001
From: Bernhard Amann
Date: Mon, 22 Apr 2013 01:10:29 -0700
Subject: [PATCH] implement topk.

This is _completely_ untested. It compiles. It will probably do nothing
else (well, besides crashing Bro).
---
Review notes (not part of the commit message): line breaks and the
std::list template arguments were restored; code fixes are marked by
comments. The blob ids on the index lines predate these changes.

 src/CMakeLists.txt |   1 +
 src/Topk.cc        | 262 +++++++++++++++++++++++++++++++++++++++++++++
 src/Topk.h         |  60 ++++++++++
 3 files changed, 323 insertions(+)
 create mode 100644 src/Topk.cc
 create mode 100644 src/Topk.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 83a018ccde..bc2512af68 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -408,6 +408,7 @@ set(bro_SRCS
     Telnet.cc
     Teredo.cc
     Timer.cc
+    Topk.cc
     Traverse.cc
     Trigger.cc
     TunnelEncapsulation.cc
diff --git a/src/Topk.cc b/src/Topk.cc
new file mode 100644
index 0000000000..ef7d7bfbd8
--- /dev/null
+++ b/src/Topk.cc
@@ -0,0 +1,262 @@
+// See the file "COPYING" in the main distribution directory for copyright.
+
+#include "Topk.h"
+#include "CompHash.h"
+#include "Reporter.h"
+
+namespace Topk {
+
+// Delete callback installed on the element dictionary; frees one Element.
+static void topk_element_hash_delete_func(void* val)
+	{
+	Element* e = (Element*) val;
+	delete e;
+	}
+
+// Releases the reference the element holds on its value.
+Element::~Element()
+	{
+	if ( value )
+		Unref(value);
+	value = 0;
+	}
+
+// Computes the dictionary key for v. The caller owns the returned key and
+// has to delete it.
+HashKey* Topk::GetHash(Val* v)
+	{
+	TypeList* tl = new TypeList(v->Type());
+	// Append() adopts the reference it is handed (see Type.h), so give it
+	// one of its own instead of the one v is holding.
+	tl->Append(v->Type()->Ref());
+	CompositeHash* topk_hash = new CompositeHash(tl);
+	Unref(tl);
+
+	HashKey* key = topk_hash->ComputeHash(v, 1);
+
+	// The hasher is only needed to compute the key; it used to be leaked.
+	delete topk_hash;
+
+	assert(key);
+	return key;
+	}
+
+// Creates an empty tracker that keeps counters for at most arg_size values.
+Topk::Topk(uint64 arg_size)
+	{
+	elementDict = new PDict(Element);
+	elementDict->SetDeleteFunc(topk_element_hash_delete_func);
+	size = arg_size;
+	type = 0;
+	numElements = 0; // was never initialized although it is read everywhere
+	}
+
+// Frees all elements and buckets and releases the element type.
+Topk::~Topk()
+	{
+	elementDict->Clear();
+	delete elementDict;
+
+	// now all elements are already gone - delete the buckets
+	std::list<Bucket*>::iterator bi = buckets.begin();
+	while ( bi != buckets.end() )
+		{
+		delete *bi;
+		++bi;
+		}
+
+	if ( type )
+		Unref(type);
+	type = 0;
+	}
+
+// Returns a new vector with the values that have the highest counters,
+// starting with the bucket holding the largest count. Buckets are always
+// copied completely, so the result can have more than k entries. Reports
+// an error and returns 0 if nothing has been recorded yet.
+VectorVal* Topk::getTopK(int k) // returns vector
+	{
+	if ( numElements == 0 )
+		{
+		reporter->Error("Cannot return topk of empty");
+		return 0;
+		}
+
+	TypeList* vector_index = new TypeList(type);
+	vector_index->Append(type->Ref()); // Append() adopts a reference
+	VectorType* v = new VectorType(vector_index);
+	VectorVal* t = new VectorVal(v);
+
+	// this does no estimation if the results is correct!
+	// in any case - just to make this future-proof (and I am lazy) - this can return more than k.
+
+	int read = 0;
+
+	// Buckets are sorted by ascending count, so walk backwards from the
+	// last one. end() itself must not be dereferenced; numElements > 0
+	// guarantees that there is a bucket to step back to.
+	std::list<Bucket*>::iterator it = buckets.end();
+	--it;
+
+	while ( read < k )
+		{
+		std::list<Element*>::iterator eit = (*it)->elements.begin();
+		while ( eit != (*it)->elements.end() )
+			{
+			t->Assign(read, (*eit)->value->Ref());
+			read++;
+			++eit; // was missing, the loop never terminated
+			}
+
+		if ( it == buckets.begin() )
+			break;
+
+		--it; // was missing, the same bucket got visited forever
+		}
+
+	Unref(v);
+	return t;
+	}
+
+// Records one more sighting of encountered. Known values get their counter
+// incremented. Unknown values are added with count 1 while there is room;
+// once size values are tracked, a new value replaces the oldest element of
+// the bucket with the lowest count and takes over that count as its
+// epsilon before being incremented. Values whose type differs from the
+// first value seen are rejected with an error.
+void Topk::Encountered(Val* encountered)
+	{
+	// A tracker without capacity has nothing to evict; bail out instead
+	// of dereferencing the empty bucket list below.
+	if ( size == 0 )
+		return;
+
+	// ok, let's see if we already know this one.
+
+	// check type compatibility
+	if ( numElements == 0 )
+		type = encountered->Type()->Ref();
+	else
+		if ( !same_type(type, encountered->Type()) )
+			{
+			reporter->Error("Trying to add element to topk with differing type from other elements");
+			return;
+			}
+
+	// Step 1 - get the hash.
+	HashKey* key = GetHash(encountered);
+	Element* e = (Element*) elementDict->Lookup(key);
+
+	if ( e == 0 )
+		{
+		e = new Element();
+		e->epsilon = 0;
+		e->value = encountered->Ref(); // or no ref?
+
+		// well, we do not know this one yet...
+		if ( numElements < size )
+			{
+			// brilliant. just add it at position 1
+			if ( buckets.size() == 0 || (*buckets.begin())->count > 1 )
+				{
+				Bucket* b = new Bucket();
+				b->count = 1;
+				std::list<Bucket*>::iterator pos = buckets.insert(buckets.begin(), b);
+				b->bucketPos = pos;
+				b->elements.insert(b->elements.end(), e);
+				e->parent = b;
+				}
+			else
+				{
+				Bucket* b = *buckets.begin();
+				assert(b->count == 1);
+				b->elements.insert(b->elements.end(), e);
+				e->parent = b;
+				}
+
+			elementDict->Insert(key, e);
+			numElements++;
+			delete key;
+			return; // done. it is at pos 1.
+			}
+		else
+			{
+			// replace element with min-value
+			Bucket* b = *buckets.begin(); // bucket with smallest elements
+
+			// evict oldest element with least hits.
+			assert(b->elements.size() > 0);
+			HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
+			b->elements.erase(b->elements.begin());
+			Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey);
+			assert(deleteElement); // there has to have been a minimal element...
+			delete deleteElement;
+			delete deleteKey;
+
+			// and add the new one to the end
+			e->epsilon = b->count;
+			b->elements.insert(b->elements.end(), e);
+			e->parent = b; // was missing; IncrementCounter() starts from the parent
+			elementDict->Insert(key, e);
+			// fallthrough, increment operation has to run!
+			}
+		}
+
+	// ok, we now have an element in e
+	delete key;
+	IncrementCounter(e); // well, this certainly was anticlimatic.
+	}
+
+// Moves e from its current bucket to the bucket for the next higher count,
+// creating that bucket if it does not exist and removing the old bucket if
+// e was its last element.
+void Topk::IncrementCounter(Element* e)
+	{
+	Bucket* currBucket = e->parent;
+	uint64 currcount = currBucket->count;
+
+	// well, let's test if there is a bucket for currcount++
+	std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
+
+	Bucket* nextBucket = 0;
+
+	++bucketIter;
+
+	if ( bucketIter != buckets.end() )
+		{
+		if ( (*bucketIter)->count == currcount+1 )
+			nextBucket = *bucketIter;
+		}
+
+	if ( nextBucket == 0 )
+		{
+		// the bucket for the value that we want does not exist.
+		// create it...
+
+		Bucket* b = new Bucket();
+		b->count = currcount+1;
+
+		std::list<Bucket*>::iterator nextBucketPos = buckets.insert(bucketIter, b);
+		b->bucketPos = nextBucketPos; // and give it the iterator we know now.
+
+		nextBucket = b;
+		}
+
+	// ok, now we have the new bucket in nextBucket. Shift the element over...
+	currBucket->elements.remove(e);
+	nextBucket->elements.insert(nextBucket->elements.end(), e);
+
+	e->parent = nextBucket;
+
+	// if currBucket is empty, we have to delete it now
+	if ( currBucket->elements.size() == 0 )
+		{
+		// bucketPos points at this very bucket, so unlink it directly
+		// rather than searching the list for the pointer.
+		buckets.erase(currBucket->bucketPos);
+		delete currBucket;
+		currBucket = 0;
+		}
+	}
+
+} // namespace Topk
diff --git a/src/Topk.h b/src/Topk.h
new file mode 100644
index 0000000000..b38e1e8ab3
--- /dev/null
+++ b/src/Topk.h
@@ -0,0 +1,60 @@
+// See the file "COPYING" in the main distribution directory for copyright.
+
+#ifndef topk_h
+#define topk_h
+
+#include <list>
+#include "Val.h"
+#include "CompHash.h"
+
+// This class implements the top-k algorithm. Or - to be more precise - my interpretation of it.
+
+namespace Topk {
+
+struct Element;
+
+// All elements that currently share the same counter value.
+struct Bucket {
+	uint64 count;
+	std::list<Element*> elements;
+	std::list<Bucket*>::iterator bucketPos; // iterators only get invalidated for removed elements. This one points to us - so it is invalid when we are no longer there. Cute, isn't it?
+};
+
+// One tracked value together with its bookkeeping.
+struct Element {
+	uint64 epsilon; // count inherited from the bucket when replacing an evicted element, 0 otherwise
+	Val* value; // holds a reference that the destructor releases
+	Bucket* parent; // bucket the element currently lives in
+
+	~Element();
+};
+
+
+declare(PDict, Element);
+
+// Tracks the most frequently encountered values using a bounded number
+// of counters.
+class Topk {
+
+public:
+	Topk(uint64 size);
+	~Topk();
+	void Encountered(Val* value); // we saw something
+	VectorVal* getTopK(int k); // returns vector
+
+private:
+	void IncrementCounter(Element* e);
+	HashKey* GetHash(Val*); // this probably should go somewhere else.
+
+	BroType* type;
+	std::list<Bucket*> buckets;
+	PDict(Element)* elementDict;
+	uint64 size; // how many elements are we tracking?
+	uint64 numElements; // how many elements do we have at the moment
+
+
+};
+
+} // namespace Topk
+
+#endif