adapt to new folder structure

2025-10-17 22:18:20 +00:00 · 2013-07-31 12:06:59 -07:00 · 2013-07-31 12:06:59 -07:00 · 5122bf4a7c
commit 5122bf4a7c
parent daaf091bc3
8 changed files with 128 additions and 119 deletions
--- a/src/probabilistic/CMakeLists.txt
+++ b/src/probabilistic/CMakeLists.txt
@ -10,9 +10,11 @@ set(probabilistic_SRCS
    BitVector.cc
    BloomFilter.cc
    CounterVector.cc
-    Hasher.cc)
+    Hasher.cc
+    Topk.cc)

 bif_target(bloom-filter.bif)
+bif_target(top-k.bif)
 bro_add_subdir_library(probabilistic ${probabilistic_SRCS})

 add_dependencies(bro_probabilistic generate_outputs)
--- a/src/probabilistic/Topk.cc
+++ b/src/probabilistic/Topk.cc
@ -0,0 +1,492 @@
+// See the file "COPYING" in the main distribution directory for copyright.
+
+#include "probabilistic/Topk.h"
+#include "CompHash.h"
+#include "Reporter.h"
+#include "Serializer.h"
+#include "NetVar.h"
+
+
+namespace probabilistic {
+
+IMPLEMENT_SERIAL(TopkVal, SER_TOPK_VAL);
+
+// Delete callback for the element dictionary: the stored values arrive as
+// void* and are always Element objects here.
+static void topk_element_hash_delete_func(void* val)
+	{
+	delete static_cast<Element*>(val);
+	}
+
+// Drops the reference this element holds on its tracked value, if any.
+Element::~Element()
+	{
+	Val* held = value;
+	value = 0;
+
+	if ( held != 0 )
+		Unref(held);
+	}
+
+// Computes the key under which v is looked up in elementDict.
+// The returned key belongs to the caller, who has to delete it.
+HashKey* TopkVal::GetHash(Val* v) const
+	{
+	// One-entry type list describing v.
+	TypeList* key_types = new TypeList(v->Type());
+	key_types->Append(v->Type()->Ref());
+
+	// The hasher only lives for the duration of this call.
+	CompositeHash hasher(key_types);
+	Unref(key_types);
+
+	HashKey* result = hasher.ComputeHash(v, 1);
+	assert(result);
+	return result;
+	}
+
+// Creates an empty top-k tracker sized for arg_size elements. The element
+// type stays unset (0) until a value or a merge provides it.
+TopkVal::TopkVal(uint64 arg_size)
+	: OpaqueVal(topk_type), type(0), elementDict(new PDict(Element)),
+	  size(arg_size), numElements(0), pruned(false)
+	{
+	// Elements stored in the dictionary are freed through this callback.
+	elementDict->SetDeleteFunc(topk_element_hash_delete_func);
+	}
+
+// Creates an empty tracker of size 0. Used for unserialization, where
+// DoUnserialize() fills in the real state afterwards.
+TopkVal::TopkVal() : OpaqueVal(topk_type)
+	{
+	elementDict = new PDict(Element);
+	elementDict->SetDeleteFunc(topk_element_hash_delete_func);
+	size = 0;
+	type = 0;
+	numElements = 0;
+	// Must not stay indeterminate: getSum() and DoSerialize() read it even
+	// if unserialization never got far enough to overwrite it.
+	pruned = false;
+	}
+
+// Frees the elements (through the dictionary), then the buckets, and finally
+// drops the reference on the element type.
+TopkVal::~TopkVal()
+	{
+	// Clearing the dictionary disposes of the elements via its delete
+	// function.
+	elementDict->Clear();
+	delete elementDict;
+
+	// With the elements gone only the bucket objects themselves are left.
+	for ( std::list<Bucket*>::iterator pos = buckets.begin();
+	      pos != buckets.end(); ++pos )
+		delete *pos;
+
+	if ( type != 0 )
+		Unref(type);
+
+	type = 0;
+	}
+
+// Merges the counters of another top-k structure into this one.
+//
+// value: the structure to merge in; it is left unmodified.
+// doPrune: if true, elements are evicted from the lowest buckets afterwards
+//          until no more than `size` elements remain. The structure is
+//          marked as pruned as soon as one element had to be evicted.
+//
+// If both structures already track values of differing types, an error is
+// reported and nothing is merged.
+void TopkVal::Merge(const TopkVal* value, bool doPrune)
+	{
+	if ( value->type == 0 )
+		{
+		// The other side never saw a value: there is nothing to merge in
+		// and no type to adopt or compare against. Pruning still applies.
+		assert(value->numElements == 0);
+		}
+	else if ( type == 0 )
+		{
+		assert(numElements == 0);
+		type = value->type->Ref();
+		}
+	else if ( !same_type(type, value->type) )
+		{
+		reporter->Error("Tried to merge top-k elements of differing types. Aborted");
+		return;
+		}
+
+	for ( std::list<Bucket*>::const_iterator it = value->buckets.begin();
+	      it != value->buckets.end(); ++it )
+		{
+		const Bucket* b = *it;
+		// NOTE(review): IncrementCounter() takes its increment as unsigned
+		// int, so larger bucket counts get truncated in the call below.
+		uint64_t currcount = b->count;
+
+		for ( std::list<Element*>::const_iterator eit = b->elements.begin();
+		      eit != b->elements.end(); ++eit )
+			{
+			const Element* e = *eit;
+
+			// Do we already track this value?
+			HashKey* key = GetHash(e->value);
+			Element* olde = (Element*) elementDict->Lookup(key);
+
+			if ( olde == 0 )
+				{
+				olde = new Element();
+				olde->epsilon = 0;
+				olde->value = e->value->Ref();
+
+				// Park the new element in a temporary bucket with count 0
+				// at the front of the list. IncrementCounter() below moves
+				// it to its real bucket and removes the temporary one.
+				if ( buckets.size() > 0 )
+					{
+					assert (buckets.front()-> count > 0 );
+					}
+
+				Bucket* newbucket = new Bucket();
+				newbucket->count = 0;
+				newbucket->bucketPos = buckets.insert(buckets.begin(), newbucket);
+
+				olde->parent = newbucket;
+				newbucket->elements.insert(newbucket->elements.end(), olde);
+
+				elementDict->Insert(key, olde);
+				numElements++;
+				}
+
+			// The element exists now: add up the errors and the counts.
+			olde->epsilon += e->epsilon;
+			IncrementCounter(olde, currcount);
+			delete key;
+			}
+		}
+
+	// Everything has been added, so the table may now hold more than
+	// `size` elements.
+	assert(size > 0);
+
+	if ( doPrune )
+		{
+		while ( numElements > size )
+			{
+			pruned = true;
+			assert(buckets.size() > 0 );
+			Bucket* b = buckets.front();
+			assert(b->elements.size() > 0);
+
+			Element* e = b->elements.front();
+			HashKey* key = GetHash(e->value);
+			elementDict->RemoveEntry(key);
+			// The key is ours (see GetHash); it used to be leaked here.
+			delete key;
+			delete e;
+
+			b->elements.pop_front();
+
+			if ( b->elements.size() == 0 )
+				{
+				delete b;
+				buckets.pop_front();
+				}
+
+			numElements--;
+			}
+		}
+	}
+
+// Writes size, element count and pruned flag, then the element type (if one
+// is set), then every bucket in list order: its number of elements, its
+// count and, per element, epsilon followed by the value.
+// Returns false if any of the write calls returned false.
+//
+// NOTE(review): the SERIALIZE() arguments are spelled exactly as before in
+// case the macro derives field names from them - confirm before renaming.
+bool TopkVal::DoSerialize(SerialInfo* info) const
+	{
+	DO_SERIALIZE(SER_TOPK_VAL, OpaqueVal);
+
+	bool v = true;
+
+	v &= SERIALIZE(size);
+	v &= SERIALIZE(numElements);
+	v &= SERIALIZE(pruned);
+
+	bool type_present = (type != 0);
+	v &= SERIALIZE(type_present);
+
+	if ( ! type_present )
+		{
+		assert(numElements == 0);
+		}
+	else
+		v &= type->Serialize(info);
+
+	// Counts the elements written, as a cross-check against numElements.
+	uint64_t i = 0;
+
+	for ( std::list<Bucket*>::const_iterator it = buckets.begin();
+	      it != buckets.end(); ++it )
+		{
+		Bucket* b = *it;
+		uint32_t elements_count = b->elements.size();
+		v &= SERIALIZE(elements_count);
+		v &= SERIALIZE(b->count);
+
+		for ( std::list<Element*>::const_iterator eit = b->elements.begin();
+		      eit != b->elements.end(); ++eit, ++i )
+			{
+			Element* element = *eit;
+			v &= SERIALIZE(element->epsilon);
+			v &= element->value->Serialize(info);
+			}
+		}
+
+	assert(i == numElements);
+
+	return v;
+	}
+
+// Restores the state written by DoSerialize(): size, element count, pruned
+// flag, the optional element type and then the buckets in list order, each
+// followed by its elements.
+//
+// Returns false as soon as a read fails or the data is inconsistent, rather
+// than carrying on with values that were never read.
+bool TopkVal::DoUnserialize(UnserialInfo* info)
+	{
+	DO_UNSERIALIZE(OpaqueVal);
+
+	bool v = true;
+
+	v &= UNSERIALIZE(&size);
+	v &= UNSERIALIZE(&numElements);
+	v &= UNSERIALIZE(&pruned);
+	bool type_present = false;
+	v &= UNSERIALIZE(&type_present);
+
+	if ( ! v )
+		return false;
+
+	if ( type_present )
+		{
+		type = BroType::Unserialize(info);
+
+		if ( type == 0 )
+			return false;
+		}
+	else if ( numElements != 0 )
+		// Elements cannot be restored without knowing their type.
+		return false;
+
+	uint64_t i = 0;
+	while ( i < numElements )
+		{
+		Bucket* b = new Bucket();
+		uint32_t elements_count = 0;
+		v &= UNSERIALIZE(&elements_count);
+		v &= UNSERIALIZE(&b->count);
+		// Linked in before any early return so the destructor frees it.
+		b->bucketPos = buckets.insert(buckets.end(), b);
+
+		// An empty bucket would never advance i (endless loop), and a
+		// bucket larger than what is left contradicts numElements.
+		if ( ! v || elements_count == 0 || elements_count > numElements - i )
+			return false;
+
+		for ( uint64_t j = 0; j < elements_count; j++ )
+			{
+			Element* e = new Element();
+			v &= UNSERIALIZE(&e->epsilon);
+			e->value = Val::Unserialize(info, type);
+
+			if ( ! v || e->value == 0 )
+				{
+				delete e;
+				return false;
+				}
+
+			HashKey* key = GetHash(e->value);
+
+			if ( elementDict->Lookup(key) != 0 )
+				{
+				// Each value may be tracked only once.
+				delete key;
+				delete e;
+				return false;
+				}
+
+			e->parent = b;
+			b->elements.insert(b->elements.end(), e);
+
+			elementDict->Insert(key, e);
+			delete key;
+
+			i++;
+			}
+		}
+
+	assert(i == numElements);
+
+	return v;
+	}
+
+
+// Returns a vector of tracked values, collected from the back of the bucket
+// list towards the front. Buckets are always emitted as a whole, so the
+// result can hold more than k values. No attempt is made to verify that
+// these really are the top k.
+// Reports an error and returns 0 if no element is tracked.
+VectorVal* TopkVal::getTopK(int k) const // returns vector
+	{
+	if ( numElements == 0 )
+		{
+		reporter->Error("Cannot return topk of empty");
+		return 0;
+		}
+
+	TypeList* index_types = new TypeList(type);
+	index_types->Append(type->Ref());
+	VectorType* vtype = new VectorType(index_types);
+	VectorVal* result = new VectorVal(vtype);
+
+	int read = 0;
+
+	// Stop once k values are collected or the buckets run out.
+	for ( std::list<Bucket*>::const_reverse_iterator rit = buckets.rbegin();
+	      read < k && rit != buckets.rend(); ++rit )
+		{
+		const std::list<Element*>& members = (*rit)->elements;
+
+		for ( std::list<Element*>::const_iterator eit = members.begin();
+		      eit != members.end(); ++eit )
+			{
+			result->Assign(read, (*eit)->value->Ref());
+			read++;
+			}
+		}
+
+	Unref(vtype);
+	return result;
+	}
+
+// Returns the count tracked for value, i.e. the count of the bucket it is
+// currently in. If value is not tracked, an error is reported and 0 is
+// returned.
+uint64_t TopkVal::getCount(Val* value) const
+	{
+	HashKey* key = GetHash(value);
+	Element* e = (Element*) elementDict->Lookup(key);
+
+	// The key is only needed for the lookup. Releasing it here keeps the
+	// error path below from leaking it.
+	delete key;
+
+	if ( e == 0 )
+		{
+		reporter->Error("getCount for element that is not in top-k");
+		return 0;
+		}
+
+	return e->parent->count;
+	}
+
+uint64_t TopkVal::getEpsilon(Val* value) const
+	{
+	HashKey* key = GetHash(value);
+	Element* e = (Element*) elementDict->Lookup(key);
+
+	if ( e == 0 ) 
+		{
+		reporter->Error("getEpsilon for element that is not in top-k");	
+		return 0;
+		}
+
+	delete key;
+	return e->epsilon;
+	}
+
+// Adds up the counts of all tracked elements (bucket count times number of
+// elements in the bucket). Emits a warning if the structure was pruned,
+// but still returns the sum.
+uint64_t TopkVal::getSum() const
+	{
+	uint64_t total = 0;
+
+	for ( std::list<Bucket*>::const_iterator pos = buckets.begin();
+	      pos != buckets.end(); ++pos )
+		{
+		const Bucket* bucket = *pos;
+		total += bucket->elements.size() * bucket->count;
+		}
+
+	if ( pruned )
+		reporter->Warning("TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count");
+
+	return total;
+	}
+	
+// Records one observation of a value.
+//
+// A value that is already tracked moves up by one count. A new value is
+// added with count 1 while there is room; otherwise it replaces the first
+// element of the lowest bucket, takes that bucket's count as its epsilon
+// and is then incremented.
+//
+// Reports an error and ignores the value if its type differs from the
+// values seen so far, or if the structure has size 0 and holds nothing that
+// could be replaced.
+void TopkVal::Encountered(Val* encountered)
+	{
+	if ( size == 0 && buckets.empty() )
+		{
+		// No room for a new element and no minimum element to evict in
+		// exchange; the replacement branch below would read from an
+		// empty bucket list.
+		reporter->Error("Cannot add element to top-k data structure of size 0");
+		return;
+		}
+
+	// The first value determines the element type; later ones must match.
+	if ( numElements == 0 )
+		type = encountered->Type()->Ref();
+	else
+		if ( !same_type(type, encountered->Type()) )
+			{
+			reporter->Error("Trying to add element to topk with differing type from other elements");
+			return;
+			}
+
+	// Step 1 - get the hash and see whether we already know this value.
+	HashKey* key = GetHash(encountered);
+	Element* e = (Element*) elementDict->Lookup(key);
+
+	if ( e == 0 )
+		{
+		e = new Element();
+		e->epsilon = 0;
+		// The element keeps its own reference; ~Element() releases it.
+		e->value = encountered->Ref();
+
+		if ( numElements < size )
+			{
+			// There is room left: put it into the bucket with count 1,
+			// creating that bucket at the front if necessary.
+			if ( buckets.size() == 0 || (*buckets.begin())->count > 1 )
+				{
+				Bucket* b = new Bucket();
+				b->count = 1;
+				std::list<Bucket*>::iterator pos = buckets.insert(buckets.begin(), b);
+				b->bucketPos = pos;
+				b->elements.insert(b->elements.end(), e);
+				e->parent = b;
+				}
+			else
+				{
+				Bucket* b = *buckets.begin();
+				assert(b->count == 1);
+				b->elements.insert(b->elements.end(), e);
+				e->parent = b;
+				}
+
+			elementDict->Insert(key, e);
+			numElements++;
+			delete key;
+			return; // done, it sits in the count-1 bucket.
+			}
+		else
+			{
+			// Full: replace the oldest element of the lowest bucket.
+			Bucket* b = *buckets.begin();
+			assert(b->elements.size() > 0);
+			HashKey* deleteKey = GetHash((*(b->elements.begin()))->value);
+			b->elements.erase(b->elements.begin());
+			Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey);
+			assert(deleteElement); // there has to have been a minimal element...
+			delete deleteElement;
+			delete deleteKey;
+
+			// The newcomer is appended to that bucket with the bucket's
+			// count as its epsilon.
+			e->epsilon = b->count;
+			b->elements.insert(b->elements.end(), e);
+			elementDict->Insert(key, e);
+			e->parent = b;
+			// No return here: the increment below still has to run.
+			}
+		}
+
+	// e is a tracked element at this point; count this observation.
+	delete key;
+	IncrementCounter(e);
+	}
+
+// Moves e from its current bucket into the bucket whose count is higher by
+// `count`. That bucket is created if it does not exist yet, and the old
+// bucket is removed if e was its last element.
+void TopkVal::IncrementCounter(Element* e, unsigned int count)
+	{
+	Bucket* currBucket = e->parent;
+	uint64 currcount = currBucket->count;
+	uint64 targetcount = currcount + count;
+
+	// Search from the bucket after the current one for the first bucket
+	// whose count is not below the target.
+	std::list<Bucket*>::iterator bucketIter = currBucket->bucketPos;
+	bucketIter++;
+
+	while ( bucketIter != buckets.end() && (*bucketIter)->count < targetcount )
+		bucketIter++;
+
+	Bucket* nextBucket = 0;
+
+	if ( bucketIter != buckets.end() && (*bucketIter)->count == targetcount )
+		nextBucket = *bucketIter;
+	else
+		{
+		// No bucket for the target count yet: create it right before the
+		// position the search stopped at.
+		nextBucket = new Bucket();
+		nextBucket->count = targetcount;
+		nextBucket->bucketPos = buckets.insert(bucketIter, nextBucket);
+		}
+
+	// Shift the element over to its new bucket.
+	currBucket->elements.remove(e);
+	nextBucket->elements.insert(nextBucket->elements.end(), e);
+	e->parent = nextBucket;
+
+	// An emptied bucket is dropped. Every bucket stores its own list
+	// position, so it can be erased directly without scanning the list.
+	if ( currBucket->elements.empty() )
+		{
+		buckets.erase(currBucket->bucketPos);
+		delete currBucket;
+		}
+	}
+
+};
--- a/src/probabilistic/Topk.h
+++ b/src/probabilistic/Topk.h
@ -0,0 +1,92 @@
+// See the file "COPYING" in the main distribution directory for copyright.
+
+#ifndef topk_h
+#define topk_h
+
+#include <list>
+#include "Val.h"
+#include "CompHash.h"
+#include "OpaqueVal.h"
+
+// This class implements the top-k algorithm. Or - to be more precise - my interpretation of it.
+
+namespace probabilistic {
+
+struct Element;
+
+struct Bucket { // one counter value together with all elements that currently have it
+	uint64 count; // the count shared by every element in this bucket
+	std::list<Element*> elements; // the elements currently at this count
+	std::list<Bucket*>::iterator bucketPos; // this bucket's own position in the bucket list. List iterators only become invalid when the entry they point to is removed, so this stays usable for as long as the bucket is in the list.
+};
+
+struct Element { // one tracked value
+	uint64 epsilon; // maximal overestimation of this element's count
+	Val* value; // the tracked value; the element holds a reference on it
+	Bucket* parent; // the bucket this element is currently in
+
+	~Element(); // releases the reference on value
+};
+
+
+declare(PDict, Element);
+
+class TopkVal : public OpaqueVal {
+
+public:
+	// Creates an empty top-k tracker. size is the number of elements to track.
+	TopkVal(uint64 size);
+	~TopkVal();
+
+	// Call this whenever a value is observed. The first value fixes the
+	// Bro type of the tracked values; every later value has to be of the
+	// same type, otherwise an error is reported and the value is ignored.
+	void Encountered(Val* value); 
+
+	// Returns a vector of tracked values, taken bucket by bucket from the back
+	// of the bucket list until at least k values are collected (so maybe more than k).
+	// Does not prove that these are the correct top-k. Use count and epsilon for this.
+	VectorVal* getTopK(int k) const; // returns vector; 0 (plus a reported error) if nothing is tracked
+
+	// Get the current count tracked in the top-k data structure for a certain val.
+	// Returns 0 if the val is unknown (and logs the error to reporter).
+	uint64_t getCount(Val* value) const;
+
+	// Get the current epsilon tracked in the top-k data structure for a certain val.
+	// Returns 0 if the val is unknown (and logs the error to reporter).
+	uint64_t getEpsilon(Val* value) const;
+
+	// Get the size set in the constructor.
+	uint64_t getSize() const { return size; }
+
+	// Get the sum of all counts of all tracked elements. This is equal to the number
+	// of total observations up to this moment, if no elements were pruned from the data
+	// structure. A warning is logged when this is called on a pruned structure.
+	uint64_t getSum() const;
+
+	// Merges another top-k data structure into this one.
+	// doPrune specifies whether the number of elements is cut back to size
+	// after merging, by evicting elements from the lowest buckets.
+	// Please note that pruning will invalidate the results of getSum.
+	void Merge(const TopkVal* value, bool doPrune=false);
+
+protected:
+	TopkVal(); // only used when unserializing
+
+private:
+	void IncrementCounter(Element* e, unsigned int count = 1); // moves e to the bucket for (its current count + count)
+	HashKey* GetHash(Val*) const; // computes the dictionary key for a value; the caller deletes it. This probably should go somewhere else.
+		
+	BroType* type; // type of the tracked values; 0 as long as none is known
+	std::list<Bucket*> buckets; // all buckets, lowest count at the front
+	PDict(Element)* elementDict; // finds the element for a value's hash key
+	uint64 size; // how many elements are we tracking?
+	uint64 numElements; // how many elements do we have at the moment
+	bool pruned; // was this data structure pruned?
+
+	DECLARE_SERIAL(TopkVal); 
+};
+
+};
+
+#endif
--- a/src/probabilistic/top-k.bif
+++ b/src/probabilistic/top-k.bif
@ -0,0 +1,122 @@
+# ===========================================================================
+#
+#                                  Top-K Functions
+#
+# ===========================================================================
+
+
+%%{
+#include "probabilistic/Topk.h"
+%%}
+
+## Creates a top-k data structure that tracks *size* elements.
+##
+## size: The number of elements to keep track of.
+##
+## Returns: Opaque handle to the new data structure.
+function topk_init%(size: count%): opaque of topk
+	%{
+	return new probabilistic::TopkVal(size);
+	%}
+
+## Adds a newly observed value to the data structure. The first value added
+## determines the type of data tracked by the top-k data structure; all
+## following values have to be of the same type.
+##
+## handle: The top-k data structure.
+##
+## value: The observed value.
+function topk_add%(handle: opaque of topk, value: any%): any
+	%{
+	assert(handle);
+	probabilistic::TopkVal* tracker = (probabilistic::TopkVal*) handle;
+
+	tracker->Encountered(value);
+	return 0;
+	%}
+
+## Gets the top elements of the top-k data structure. Elements are returned
+## bucket by bucket, so the result can contain more than *k* elements.
+##
+## handle: The top-k data structure.
+##
+## k: The number of elements requested.
+##
+## Returns: Vector of the top elements.
+function topk_get_top%(handle: opaque of topk, k: count%): any
+	%{
+	assert(handle);
+	probabilistic::TopkVal* tracker = (probabilistic::TopkVal*) handle;
+
+	VectorVal* top = tracker->getTopK(k);
+	return top;
+	%}
+
+## Gets an overestimated count of how often *value* has been encountered.
+## *value* has to be one of the currently tracked elements; otherwise 0 is
+## returned and an error message is added to reporter.
+##
+## handle: The top-k data structure.
+##
+## value: The value to look up.
+##
+## Returns: Overestimated number of times the element has been encountered.
+function topk_count%(handle: opaque of topk, value: any%): count
+	%{
+	assert(handle);
+	probabilistic::TopkVal* tracker = (probabilistic::TopkVal*) handle;
+
+	uint64_t observed = tracker->getCount(value);
+	return new Val(observed, TYPE_COUNT);
+	%}
+
+## Gets the maximal overestimation for the count of *value*. The same
+## restrictions as for topk_count apply.
+##
+## handle: The top-k data structure.
+##
+## value: The value to look up.
+##
+## Returns: Number which represents the maximal overestimation for the count
+##          of this element.
+function topk_epsilon%(handle: opaque of topk, value: any%): count
+	%{
+	assert(handle);
+	probabilistic::TopkVal* tracker = (probabilistic::TopkVal*) handle;
+
+	uint64_t error_bound = tracker->getEpsilon(value);
+	return new Val(error_bound, TYPE_COUNT);
+	%}
+
+## Gets the number of elements this data structure is supposed to track (as
+## given on init). The actual number of elements in the data structure can
+## be lower, or higher due to non-pruned merges.
+##
+## handle: The top-k data structure.
+##
+## Returns: Size given during initialization.
+function topk_size%(handle: opaque of topk%): count
+	%{
+	assert(handle);
+	probabilistic::TopkVal* tracker = (probabilistic::TopkVal*) handle;
+
+	uint64_t capacity = tracker->getSize();
+	return new Val(capacity, TYPE_COUNT);
+	%}
+
+## Gets the sum of all counts of all elements in the data structure. This is
+## equal to the number of all inserted objects if the data structure has
+## never been pruned. Do not use after calling topk_merge_prune (a warning
+## message is raised if used afterwards).
+##
+## handle: The top-k data structure.
+##
+## Returns: Sum of all counts.
+function topk_sum%(handle: opaque of topk%): count
+	%{
+	assert(handle);
+	probabilistic::TopkVal* tracker = (probabilistic::TopkVal*) handle;
+
+	uint64_t total = tracker->getSum();
+	return new Val(total, TYPE_COUNT);
+	%}
+
+## Merges the second top-k data structure into the first. No elements are
+## removed, so the resulting data structure can be bigger than the maximum
+## size given on initialization.
+##
+## handle1: The data structure that receives the merged elements.
+##
+## handle2: The data structure that is merged into *handle1*.
+function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any
+	%{
+	assert(handle1);
+	assert(handle2);
+
+	probabilistic::TopkVal* target = (probabilistic::TopkVal*) handle1;
+	probabilistic::TopkVal* source = (probabilistic::TopkVal*) handle2;
+
+	target->Merge(source, false);
+	return 0;
+	%}
+
+## Merges the second top-k data structure into the first and prunes the
+## result back to the size given on initialization. Use with care and only
+## when aware of the restrictions this imposes. Do not call topk_sum or
+## topk_add afterwards; results will probably not be what you expect.
+##
+## handle1: The data structure that receives the merged elements and is
+##          pruned afterwards.
+##
+## handle2: The data structure that is merged into *handle1*.
+function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any
+	%{
+	assert(handle1);
+	assert(handle2);
+
+	probabilistic::TopkVal* target = (probabilistic::TopkVal*) handle1;
+	probabilistic::TopkVal* source = (probabilistic::TopkVal*) handle2;
+
+	target->Merge(source, true);
+	return 0;
+	%}
+
+