implement topk.

This is _completely_ untested. It compiles. It will probably do nothing else (well, besides crashing Bro).
2025-10-08 17:48:21 +00:00 · 2013-04-22 01:10:29 -07:00 · 2013-04-22 01:10:29 -07:00 · c21c18ea45
commit c21c18ea45
parent 9a88dc500a
3 changed files with 281 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -408,6 +408,7 @@ set(bro_SRCS
    Telnet.cc
    Teredo.cc
    Timer.cc
    Topk.cc
    Traverse.cc
    Trigger.cc
    TunnelEncapsulation.cc
--- a/src/Topk.cc
+++ b/src/Topk.cc
@ -0,0 +1,224 @@
 // See the file "COPYING" in the main distribution directory for copyright.
 #include "Topk.h"
 #include "CompHash.h"
 #include "Reporter.h"
 namespace Topk {
 static void topk_element_hash_delete_func(void* val)
 	{
 	Element* e = (Element*) val;
 	delete e;
 	}
 // Drops the reference held on the tracked value, if there is one.
 Element::~Element()
 	{
 	if ( value != 0 )
 		{
 		Unref(value);
 		value = 0;
 		}
 	}
 // Computes the hash key under which v is looked up and stored in the element
 // dictionary. The returned key is newly allocated; the caller has to delete it.
 HashKey* Topk::GetHash(Val* v)
 	{
 	TypeList* tl = new TypeList(v->Type());
 	// NOTE(review): TypeList is understood to release every appended type when
 	// it is destroyed, so it is handed a reference of its own here - confirm
 	// against TypeList::Append.
 	tl->Append(v->Type()->Ref());
 	CompositeHash* topk_hash = new CompositeHash(tl);
 	Unref(tl);
 	HashKey* key = topk_hash->ComputeHash(v, 1);
 	// The hasher is only needed for this single computation; without this
 	// delete one CompositeHash was leaked per call.
 	delete topk_hash;
 	assert(key);
 	return key;
 	}
 // Creates an empty summary that tracks at most arg_size distinct values.
 Topk::Topk(uint64 arg_size)
 	{
 	elementDict = new PDict(Element);
 	// Let the dictionary destroy the Elements it still holds when it is cleared.
 	elementDict->SetDeleteFunc(topk_element_hash_delete_func);
 	size = arg_size;
 	// Nothing is tracked yet. This counter is read by Encountered() and
 	// getTopK(), so it must not be left uninitialized.
 	numElements = 0;
 	// The element type is fixed by the first value that is encountered.
 	type = 0;
 	}
 // Releases the dictionary (and with it the elements), the buckets and the
 // reference held on the element type.
 Topk::~Topk()
 	{
 	elementDict->Clear();
 	delete elementDict;
 	// The elements are gone at this point; what is left are the buckets.
 	for ( std::list<Bucket*>::iterator pos = buckets.begin(); pos != buckets.end(); pos++ )
 		delete *pos;
 	if ( type != 0 )
 		Unref(type);
 	type = 0;
 	}
 // Returns a new vector with the tracked values that have the highest counts,
 // highest-count bucket first. Buckets are always copied as a whole, so the
 // result can contain more than k entries when the last bucket that is visited
 // holds several elements. No estimate is made of whether the reported ranking
 // is actually correct. Reports an error and returns 0 when nothing has been
 // counted yet; a k of zero or less yields an empty vector.
 VectorVal* Topk::getTopK(int k) // returns vector
 	{
 	if ( numElements == 0 )
 		{
 		reporter->Error("Cannot return topk of empty");
 		return 0;
 		}
 	TypeList* vector_index = new TypeList(type);
 	// NOTE(review): TypeList is understood to release every appended type when
 	// it is destroyed, so it is handed a reference of its own here - confirm
 	// against TypeList::Append.
 	vector_index->Append(type->Ref());
 	VectorType* v = new VectorType(vector_index);
 	VectorVal* t = new VectorVal(v);
 	int read = 0;
 	// The bucket list is kept in ascending count order (new buckets are
 	// inserted at the front with count 1 or directly behind their
 	// predecessor), so walk it backwards. The previous loop started by
 	// dereferencing buckets.end() and never advanced either iterator.
 	std::list<Bucket*>::reverse_iterator bit = buckets.rbegin();
 	while ( bit != buckets.rend() && read < k )
 		{
 		std::list<Element*>& members = (*bit)->elements;
 		for ( std::list<Element*>::iterator eit = members.begin(); eit != members.end(); ++eit )
 			{
 			// The vector gets its own reference to each value.
 			t->Assign(read, (*eit)->value->Ref());
 			++read;
 			}
 		++bit;
 		}
 	Unref(v);
 	return t;
 	}
 // Records one occurrence of the given value.
 //
 // A value that is already tracked has its counter incremented. An unknown
 // value is added with count 1 while there is still room. Once the summary is
 // full, the oldest element of the lowest-count bucket is evicted and the new
 // value takes its place, inheriting that bucket's count as its epsilon before
 // being incremented. Values whose type differs from the first value seen are
 // rejected with an error.
 void Topk::Encountered(Val* encountered)
 	{
 	// A summary without capacity can never hold an element, and there would
 	// be nothing to evict further down either.
 	if ( size == 0 )
 		return;
 	// All tracked values share one type; the first value determines it.
 	if ( numElements == 0 )
 		type = encountered->Type()->Ref();
 	else if ( ! same_type(type, encountered->Type()) )
 		{
 		reporter->Error("Trying to add element to topk with differing type from other elements");
 		return;
 		}
 	// Step 1 - get the hash and see if we already know this value.
 	HashKey* key = GetHash(encountered);
 	Element* e = (Element*) elementDict->Lookup(key);
 	if ( e == 0 )
 		{
 		// Not tracked yet: build a fresh element for it.
 		e = new Element();
 		e->epsilon = 0;
 		e->value = encountered->Ref();
 		if ( numElements < size )
 			{
 			// Still room. The element goes into the bucket for count 1,
 			// which is created at the front of the list if it is missing.
 			Bucket* b = 0;
 			if ( buckets.size() == 0 || (*buckets.begin())->count > 1 )
 				{
 				b = new Bucket();
 				b->count = 1;
 				b->bucketPos = buckets.insert(buckets.begin(), b);
 				}
 			else
 				{
 				b = *buckets.begin();
 				assert(b->count == 1);
 				}
 			b->elements.insert(b->elements.end(), e);
 			e->parent = b;
 			elementDict->Insert(key, e);
 			numElements++;
 			delete key;
 			return; // done, it already sits at count 1.
 			}
 		// Full: replace the oldest element of the bucket with the smallest count.
 		Bucket* b = *buckets.begin();
 		assert(b->elements.size() > 0);
 		HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
 		b->elements.erase(b->elements.begin());
 		Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey);
 		assert(deleteElement); // there has to have been a minimal element...
 		delete deleteElement;
 		delete deleteKey;
 		// The newcomer takes over the evicted element's slot at the end of
 		// the bucket and remembers the count it inherited.
 		e->epsilon = b->count;
 		b->elements.insert(b->elements.end(), e);
 		// IncrementCounter() starts from e->parent, which was left unset on
 		// this path before.
 		e->parent = b;
 		elementDict->Insert(key, e);
 		// fall through, the increment below still has to run.
 		}
 	// e is now a tracked element, either found or freshly swapped in.
 	delete key;
 	IncrementCounter(e);
 	}
 // Moves e from its current bucket into the bucket for count+1, creating that
 // bucket directly behind the current one if it does not exist yet, and
 // removes the old bucket once it has become empty.
 void Topk::IncrementCounter(Element* e)
 	{
 	Bucket* currBucket = e->parent;
 	uint64 currcount = currBucket->count;
 	// Look at the bucket that follows the current one; it is the only
 	// candidate for holding count+1.
 	std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
 	++bucketIter;
 	Bucket* nextBucket = 0;
 	if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount+1 )
 		nextBucket = *bucketIter;
 	if ( nextBucket == 0 )
 		{
 		// No bucket for the new count yet: insert one right after the
 		// current bucket and let it remember its own position.
 		nextBucket = new Bucket();
 		nextBucket->count = currcount+1;
 		nextBucket->bucketPos = buckets.insert(bucketIter, nextBucket);
 		}
 	// Shift the element over to its new bucket.
 	currBucket->elements.remove(e);
 	nextBucket->elements.insert(nextBucket->elements.end(), e);
 	e->parent = nextBucket;
 	if ( currBucket->elements.size() == 0 )
 		{
 		// The stored position lets us unlink the empty bucket directly
 		// instead of searching the whole list for it.
 		buckets.erase(currBucket->bucketPos);
 		delete currBucket;
 		}
 	}
 };
--- a/src/Topk.h
+++ b/src/Topk.h
@ -0,0 +1,56 @@
 // See the file "COPYING" in the main distribution directory for copyright.
 #ifndef topk_h
 #define topk_h
 #include <list>
 #include "Val.h"
 #include "CompHash.h"
 // This class implements the top-k algorithm. Or - to be more precise - my interpretation of it.
 namespace Topk {
 struct Element;
 // A group of tracked elements that currently share the same count.
 struct Bucket {
 	uint64 count; // the count shared by all elements of this bucket
 	std::list<Element*> elements; // members in insertion order; new ones are appended, eviction takes the front
 	// Position of this bucket inside the Topk bucket list. List iterators are
 	// only invalidated for removed entries, and this one points at the bucket
 	// itself - so it stays valid for exactly as long as the bucket is in the list.
 	std::list<Bucket*>::iterator bucketPos;
 };
 // One tracked value together with its bookkeeping data.
 struct Element {
 	uint64 epsilon; // 0 if added while there was room, otherwise the count of the bucket in which it replaced an evicted element
 	Val* value; // the tracked value; the element holds a reference that its destructor releases
 	Bucket* parent; // the bucket this element currently belongs to
 	~Element();
 };
 // Declares the PDict(Element) dictionary type used for Topk::elementDict.
 declare(PDict, Element);
 // Keeps counts for a bounded number of distinct values and reports the ones
 // with the highest counts.
 class Topk {
 public:
 	// size is the maximum number of distinct values tracked at any time.
 	Topk(uint64 size);
 	~Topk();
 	// Records one occurrence of value. All values have to be of the same
 	// type as the first one; others are rejected with an error.
 	void Encountered(Val* value); // we saw something
 	// Returns a new vector with the highest-count values; it may hold more
 	// than k entries. Returns 0 (after reporting an error) if nothing has
 	// been recorded yet.
 	VectorVal* getTopK(int k); // returns vector
 private:
 	// Moves e into the bucket for the next higher count.
 	void IncrementCounter(Element* e);
 	// Returns a newly allocated hash key for the given value; the caller deletes it.
 	HashKey* GetHash(Val*); // this probably should go somewhere else.
 	BroType* type; // type of the tracked values, 0 until the first value has been seen
 	std::list<Bucket*> buckets; // buckets in ascending order of their count
 	PDict(Element)* elementDict; // maps a value's hash key to its Element
 	uint64 size; // how many elements are we tracking?
 	uint64 numElements; // how many elements do we have at the moment
 };
 };
 #endif