From c21c18ea45d24a20f48c42b0d828c184d1f66ebd Mon Sep 17 00:00:00 2001
From: Bernhard Amann <bernhard@icsi.berkeley.edu>
Date: Mon, 22 Apr 2013 01:10:29 -0700
Subject: [PATCH] implement topk.

This is _completely_ untested. It compiles. It will probably do
nothing else (well, besides crashing Bro).
---
 src/CMakeLists.txt |   1 +
 src/Topk.cc        | 224 +++++++++++++++++++++++++++++++++++++++++++++
 src/Topk.h         |  56 ++++++++++++
 3 files changed, 281 insertions(+)
 create mode 100644 src/Topk.cc
 create mode 100644 src/Topk.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 83a018ccde..bc2512af68 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -408,6 +408,7 @@ set(bro_SRCS
     Telnet.cc
     Teredo.cc
     Timer.cc
+    Topk.cc
     Traverse.cc
     Trigger.cc
     TunnelEncapsulation.cc
diff --git a/src/Topk.cc b/src/Topk.cc
new file mode 100644
index 0000000000..ef7d7bfbd8
--- /dev/null
+++ b/src/Topk.cc
@@ -0,0 +1,224 @@
+// See the file "COPYING" in the main distribution directory for copyright.
+
+#include "Topk.h"
+#include "CompHash.h"
+#include "Reporter.h"
+
+namespace Topk {
+
+// Delete function installed on the element dictionary: val is an Element*.
+static void topk_element_hash_delete_func(void* val)
+	{
+	delete static_cast<Element*>(val);
+	}
+
+// Gives up the reference held on the tracked value, if there is one.
+Element::~Element()
+	{
+	if ( value != 0 ) Unref(value);
+	value = 0;
+	}
+
+// Builds a hash key for v. The caller owns the returned key and must delete it.
+HashKey* Topk::GetHash(Val* v)
+	{
+	TypeList* tl = new TypeList(v->Type());
+	tl->Append(v->Type()->Ref()); // Append() keeps the reference it is handed
+	CompositeHash topk_hash(tl); // on the stack: the heap copy was never freed
+	Unref(tl);
+	HashKey* key = topk_hash.ComputeHash(v, 1);
+	assert(key);
+	return key;
+	}
+
+// Creates an empty tracker that keeps at most arg_size elements at a time.
+Topk::Topk(uint64 arg_size)
+	: type(0), elementDict(new PDict(Element)), size(arg_size), numElements(0)
+	{
+	// numElements used to be left uninitialized although it is read everywhere.
+	elementDict->SetDeleteFunc(topk_element_hash_delete_func);
+	}
+
+// Tears the tracker down: the elements go first (via the dictionary and its
+// delete function), then the buckets, and last the reference on the type.
+Topk::~Topk()
+	{
+	elementDict->Clear();
+	delete elementDict;
+
+	// now all elements are already gone - what remains of each bucket is
+	// the bucket object itself.
+	typedef std::list<Bucket*>::iterator BucketIter;
+	for ( BucketIter cur = buckets.begin(); cur != buckets.end(); ++cur )
+		delete *cur;
+
+	if ( type != 0 )
+		Unref(type);
+	type = 0;
+	}
+
+// Returns a vector holding the values with the highest counts, taken bucket by
+// bucket from the largest count downwards. A bucket is always copied in full,
+// so more than k values may come back. No check is made whether the reported
+// elements really are the true top k. Returns 0 if nothing was observed yet.
+VectorVal* Topk::getTopK(int k) // returns vector
+	{
+	if ( numElements == 0 || type == 0 )
+		{
+		reporter->Error("Cannot return topk of empty");
+		return 0;
+		}
+
+	TypeList* vector_index = new TypeList(type);
+	vector_index->Append(type->Ref()); // Append() keeps the reference it gets
+	VectorType* v = new VectorType(vector_index);
+	VectorVal* t = new VectorVal(v);
+
+	// Buckets are kept in ascending count order, so walk the list backwards.
+	// The old loop dereferenced buckets.end() and never advanced an iterator.
+	int read = 0;
+	std::list<Bucket*>::reverse_iterator it = buckets.rbegin();
+	for ( ; read < k && it != buckets.rend(); ++it )
+		{
+		std::list<Element*>& members = (*it)->elements;
+		std::list<Element*>::iterator eit = members.begin();
+		for ( ; eit != members.end(); ++eit )
+			{
+			t->Assign(read, (*eit)->value->Ref());
+			++read;
+			}
+		}
+
+	Unref(v);
+	return t;
+	}
+
+// Records one observation of 'encountered'. A value that is already tracked
+// just has its counter incremented. An unknown value is added with a count of
+// 1 while fewer than 'size' elements are tracked; once the tracker is full it
+// replaces the oldest element of the lowest-count bucket, takes that bucket's
+// count as its epsilon and is then incremented like any other element.
+void Topk::Encountered(Val* encountered)
+	{
+	// A tracker of size 0 can neither store a value nor evict one for it.
+	if ( size == 0 )
+		return;
+
+	// check type compatibility
+	if ( numElements == 0 )
+		type = encountered->Type()->Ref();
+	else if ( ! same_type(type, encountered->Type()) )
+		{
+		reporter->Error("Trying to add element to topk with differing type from other elements");
+		return;
+		}
+
+	// Step 1 - get the hash and see if we already know this one.
+	HashKey* key = GetHash(encountered);
+	Element* e = (Element*) elementDict->Lookup(key);
+
+	if ( e == 0 )
+		{
+		// well, we do not know this one yet...
+		e = new Element();
+		e->epsilon = 0;
+		e->value = encountered->Ref();
+
+		if ( numElements < size )
+			{
+			// brilliant. just add it to the bucket for count 1, which is the
+			// first bucket of the list if it exists at all.
+			if ( buckets.empty() || buckets.front()->count > 1 )
+				{
+				Bucket* b = new Bucket();
+				b->count = 1;
+				b->bucketPos = buckets.insert(buckets.begin(), b);
+				}
+
+			Bucket* first = buckets.front();
+			assert(first->count == 1);
+			first->elements.push_back(e);
+			e->parent = first;
+
+			elementDict->Insert(key, e);
+			numElements++;
+			delete key;
+			return; // done. it is at pos 1.
+			}
+
+		// Tracker is full: replace an element of the bucket with the
+		// smallest count, evicting the oldest element with the least hits.
+		Bucket* b = buckets.front();
+		assert(b->elements.size() > 0);
+		HashKey* deleteKey = GetHash(b->elements.front()->value);
+		b->elements.pop_front();
+		Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey);
+		assert(deleteElement); // there has to have been a minimal element...
+		delete deleteElement;
+		delete deleteKey;
+
+		// and add the new one to the end. Its parent has to be set here: the
+		// increment below starts from e->parent, which this branch used to
+		// leave unset.
+		e->epsilon = b->count;
+		b->elements.push_back(e);
+		e->parent = b;
+		elementDict->Insert(key, e);
+		// fallthrough, increment operation has to run!
+		}
+
+	// ok, we now have an element in e
+	delete key;
+	IncrementCounter(e);
+	}
+
+// Moves e from the bucket it is filed under into the bucket for the next
+// higher count. That bucket is created right behind the current one if it
+// does not exist yet, and the current bucket is destroyed if e was its last
+// element. e->parent must point to a bucket that is part of 'buckets', and
+// that bucket's bucketPos must be its position in the list.
+void Topk::IncrementCounter(Element* e)
+	{
+	Bucket* currBucket = e->parent;
+	uint64 currcount = currBucket->count;
+
+	// well, let's test if there is a bucket for currcount+1. The list is
+	// ordered by count, so it can only be the direct successor.
+	std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
+	++bucketIter;
+
+	Bucket* nextBucket = 0;
+
+	if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount+1 )
+		nextBucket = *bucketIter;
+
+	if ( nextBucket == 0 )
+		{
+		// the bucket for the value that we want does not exist.
+		// create it in front of the successor, i.e. right after currBucket...
+		nextBucket = new Bucket();
+		nextBucket->count = currcount+1;
+
+		// ...and give it the iterator that insert() hands back.
+		nextBucket->bucketPos = buckets.insert(bucketIter, nextBucket);
+		}
+
+	// ok, now we have the new bucket in nextBucket. Shift the element over...
+	currBucket->elements.remove(e);
+	nextBucket->elements.push_back(e);
+
+	e->parent = nextBucket;
+
+	// if currBucket is empty, we have to delete it now
+	if ( currBucket->elements.empty() )
+		{
+		// Every bucket stores its own list position in bucketPos, so it can
+		// be unlinked directly; buckets.remove() searched the whole list
+		// for the pointer first.
+		buckets.erase(currBucket->bucketPos);
+		delete currBucket;
+		currBucket = 0;
+		}
+	}
+
+};
diff --git a/src/Topk.h b/src/Topk.h
new file mode 100644
index 0000000000..b38e1e8ab3
--- /dev/null
+++ b/src/Topk.h
@@ -0,0 +1,56 @@
+// See the file "COPYING" in the main distribution directory for copyright.
+
+#ifndef topk_h
+#define topk_h
+
+#include <list>
+#include "Val.h"
+#include "CompHash.h"
+
+// This class implements the top-k algorithm. Or - to be more precise - my interpretation of it.
+
+namespace Topk {
+
+struct Element;
+
+struct Bucket { // groups all tracked elements that currently share one count
+	uint64 count; // the count every element in this bucket has
+	std::list<Element*> elements; // members; new ones are appended at the end, evictions take the front
+	std::list<Bucket*>::iterator bucketPos; // position of this bucket in the list that holds it. List iterators are only invalidated when the element they point to is removed, so this stays usable for as long as the bucket is in the list.
+};
+
+struct Element { // one tracked value plus its bookkeeping
+	uint64 epsilon; // count of the bucket whose member this element replaced, else 0
+	Val* value; // the tracked value; released with Unref() by the destructor
+	Bucket* parent; // bucket the element is currently filed under
+	Element() : epsilon(0), value(0), parent(0) { } // no field starts out unset
+	~Element();
+};
+
+
+declare(PDict, Element);
+
+class Topk { // counts observed values, keeping at most 'size' of them at a time
+public:
+	Topk(uint64 size); // size: maximum number of elements tracked at once
+	~Topk();
+	void Encountered(Val* value); // we saw something
+	VectorVal* getTopK(int k); // returns vector
+
+private:
+	// Owns elementDict and the buckets via raw pointers: copies would double-free.
+	Topk(const Topk&); // declared, never defined
+	Topk& operator=(const Topk&); // declared, never defined
+	void IncrementCounter(Element* e);
+	HashKey* GetHash(Val*); // this probably should go somewhere else.
+
+	BroType* type; // type of the tracked values; 0 until the first one is seen
+	std::list<Bucket*> buckets; // ordered by count, smallest count first
+	PDict(Element)* elementDict; // finds the Element for a value's hash key
+	uint64 size; // how many elements are we tracking?
+	uint64 numElements; // how many elements do we have at the moment
+};
+
+};
+
+#endif