Well, a test that works.

Note: merging top-k data structures is not yet possible (and is
actually quite awkward/expensive). I will have to think about
how to do that for a bit...
This commit is contained in:
Bernhard Amann 2013-04-22 02:40:42 -07:00
parent c21c18ea45
commit ce7ad003f2
5 changed files with 115 additions and 12 deletions

View file

@ -19,7 +19,7 @@ Element::~Element()
value=0;
}
HashKey* Topk::GetHash(Val* v)
HashKey* TopkVal::GetHash(Val* v)
{
TypeList* tl = new TypeList(v->Type());
tl->Append(v->Type());
@ -31,15 +31,16 @@ HashKey* Topk::GetHash(Val* v)
return key;
}
Topk::Topk(uint64 arg_size)
TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(new OpaqueType("topk"))
{
elementDict = new PDict(Element);
elementDict->SetDeleteFunc(topk_element_hash_delete_func);
size = arg_size;
type = 0;
numElements = 0;
}
Topk::~Topk()
TopkVal::~TopkVal()
{
elementDict->Clear();
delete elementDict;
@ -57,7 +58,7 @@ Topk::~Topk()
type = 0;
}
VectorVal* Topk::getTopK(int k) // returns vector
VectorVal* TopkVal::getTopK(int k) // returns vector
{
if ( numElements == 0 )
{
@ -75,17 +76,23 @@ VectorVal* Topk::getTopK(int k) // returns vector
int read = 0;
std::list<Bucket*>::iterator it = buckets.end();
it--;
while (read < k )
{
//printf("Bucket %llu\n", (*it)->count);
std::list<Element*>::iterator eit = (*it)->elements.begin();
while (eit != (*it)->elements.end() )
{
//printf("Size: %ld\n", (*it)->elements.size());
t->Assign(read, (*eit)->value->Ref());
read++;
eit++;
}
if ( it == buckets.begin() )
break;
it--;
}
@ -93,13 +100,14 @@ VectorVal* Topk::getTopK(int k) // returns vector
return t;
}
void Topk::Encountered(Val* encountered)
void TopkVal::Encountered(Val* encountered)
{
// ok, let's see if we already know this one.
//printf("NumElements: %d\n", numElements);
// check type compatibility
if ( numElements == 0 )
type = encountered->Type()->Ref();
type = encountered->Type()->Ref()->Ref();
else
if ( !same_type(type, encountered->Type()) )
{
@ -161,6 +169,7 @@ void Topk::Encountered(Val* encountered)
e->epsilon = b->count;
b->elements.insert(b->elements.end(), e);
elementDict->Insert(key, e);
e->parent = b;
// fallthrough, increment operation has to run!
}
@ -172,7 +181,7 @@ void Topk::Encountered(Val* encountered)
}
void Topk::IncrementCounter(Element* e)
void TopkVal::IncrementCounter(Element* e)
{
Bucket* currBucket = e->parent;
uint64 currcount = currBucket->count;

View file

@ -6,6 +6,7 @@
#include <list>
#include "Val.h"
#include "CompHash.h"
#include "OpaqueVal.h"
// This class implements the top-k algorithm. Or - to be more precise - my interpretation of it.
@ -30,11 +31,11 @@ struct Element {
declare(PDict, Element);
class Topk {
class TopkVal : public OpaqueVal {
public:
Topk(uint64 size);
~Topk();
TopkVal(uint64 size);
~TopkVal();
void Encountered(Val* value); // we saw something
VectorVal* getTopK(int k); // returns vector
@ -47,8 +48,6 @@ private:
PDict(Element)* elementDict;
uint64 size; // how many elements are we tracking?
uint64 numElements; // how many elements do we have at the moment
};
};

View file

@ -5642,3 +5642,30 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr
}
%}
%%{
#include "Topk.h"
%%}
## Creates a new top-k tracker and hands it to the script layer as an
## opaque value.
##
## size: forwarded unchanged to the TopkVal constructor.
##
## Returns: the newly allocated opaque topk value.
function topk_init%(size: count%): opaque of topk
	%{
	// Allocate the tracker and return it directly; the caller receives
	// the pointer produced by new.
	return new Topk::TopkVal(size);
	%}
## Reports one observed value to a top-k tracker.
##
## handle: the opaque topk value to update.
##
## value: the observed value; forwarded to TopkVal::Encountered.
##
## Returns: nothing useful (the implementation returns a null pointer).
function topk_add%(handle: opaque of topk, value: any%): any
	%{
	assert(handle);

	// The handle is treated as a TopkVal without a runtime type check.
	((Topk::TopkVal*) handle)->Encountered(value);

	return 0;
	%}
## Queries a top-k tracker for its current top elements.
##
## handle: the opaque topk value to query.
##
## k: forwarded to TopkVal::getTopK as the requested element count.
##
## Returns: the vector produced by TopkVal::getTopK.
function topk_get_top%(handle: opaque of topk, k: count%): any
	%{
	assert(handle);

	// The handle is treated as a TopkVal without a runtime type check.
	Topk::TopkVal* tracker = (Topk::TopkVal*) handle;
	VectorVal* top = tracker->getTopK(k);

	return top;
	%}

View file

@ -0,0 +1,7 @@
[b, c]
[d, c]
[d, e]
[f, e]
[f, e]
[g, e]
[c, e, d]

View file

@ -0,0 +1,61 @@
# @TEST-EXEC: bro -b %INPUT > out
# @TEST-EXEC: btest-diff out
# Exercises topk_init/topk_add/topk_get_top. Every print below produces one
# line of the btest baseline, so statement order must not change.
event bro_init()
	{
	# Phase 1: a tracker created with size 2 that is fed more than two
	# distinct values.
	local tracker = topk_init(2);

	# first - peculiarity check...
	topk_add(tracker, "a");
	topk_add(tracker, "b");
	topk_add(tracker, "b");
	topk_add(tracker, "c");

	# Ask for 5 entries each time, although the tracker was created with size 2.
	local top = topk_get_top(tracker, 5);
	print top;

	topk_add(tracker, "d");
	top = topk_get_top(tracker, 5);
	print top;

	topk_add(tracker, "e");
	top = topk_get_top(tracker, 5);
	print top;

	topk_add(tracker, "f");
	top = topk_get_top(tracker, 5);
	print top;

	# "e" is seen a second time here.
	topk_add(tracker, "e");
	top = topk_get_top(tracker, 5);
	print top;

	topk_add(tracker, "g");
	top = topk_get_top(tracker, 5);
	print top;

	# Phase 2: a fresh tracker created with size 100, fed six distinct
	# values: a x1, b x2, c x6, d x4, e x5, f x1.
	tracker = topk_init(100);

	topk_add(tracker, "a");

	topk_add(tracker, "b");
	topk_add(tracker, "b");

	topk_add(tracker, "c");
	topk_add(tracker, "c");
	topk_add(tracker, "c");
	topk_add(tracker, "c");
	topk_add(tracker, "c");
	topk_add(tracker, "c");

	topk_add(tracker, "d");
	topk_add(tracker, "d");
	topk_add(tracker, "d");
	topk_add(tracker, "d");

	topk_add(tracker, "e");
	topk_add(tracker, "e");
	topk_add(tracker, "e");
	topk_add(tracker, "e");
	topk_add(tracker, "e");

	topk_add(tracker, "f");

	# Request only the top three of the six tracked values.
	top = topk_get_top(tracker, 3);
	print top;
	}