From c21c18ea45d24a20f48c42b0d828c184d1f66ebd Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 22 Apr 2013 01:10:29 -0700 Subject: [PATCH 01/40] implement topk. This is _completely_ untested. It compiles. It will probably do nothing else (well, besides crashing Bro). --- src/CMakeLists.txt | 1 + src/Topk.cc | 224 +++++++++++++++++++++++++++++++++++++++++++++ src/Topk.h | 56 ++++++++++++ 3 files changed, 281 insertions(+) create mode 100644 src/Topk.cc create mode 100644 src/Topk.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 83a018ccde..bc2512af68 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -408,6 +408,7 @@ set(bro_SRCS Telnet.cc Teredo.cc Timer.cc + Topk.cc Traverse.cc Trigger.cc TunnelEncapsulation.cc diff --git a/src/Topk.cc b/src/Topk.cc new file mode 100644 index 0000000000..ef7d7bfbd8 --- /dev/null +++ b/src/Topk.cc @@ -0,0 +1,224 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "Topk.h" +#include "CompHash.h" +#include "Reporter.h" + +namespace Topk { + +static void topk_element_hash_delete_func(void* val) + { + Element* e = (Element*) val; + delete e; + } + +Element::~Element() + { + if ( value ) + Unref(value); + value=0; + } + +HashKey* Topk::GetHash(Val* v) + { + TypeList* tl = new TypeList(v->Type()); + tl->Append(v->Type()); + CompositeHash* topk_hash = new CompositeHash(tl); + Unref(tl); + + HashKey* key = topk_hash->ComputeHash(v, 1); + assert(key); + return key; + } + +Topk::Topk(uint64 arg_size) + { + elementDict = new PDict(Element); + elementDict->SetDeleteFunc(topk_element_hash_delete_func); + size = arg_size; + type = 0; + } + +Topk::~Topk() + { + elementDict->Clear(); + delete elementDict; + + // now all elements are already gone - delete the buckets + std::list::iterator bi = buckets.begin(); + while ( bi != buckets.end() ) + { + delete *bi; + bi++; + } + + if ( type ) + Unref(type); + type = 0; + } + +VectorVal* Topk::getTopK(int k) // returns vector + { + if ( 
numElements == 0 ) + { + reporter->Error("Cannot return topk of empty"); + return 0; + } + + TypeList* vector_index = new TypeList(type); + vector_index->Append(type); + VectorType* v = new VectorType(vector_index); + VectorVal* t = new VectorVal(v); + + // this does no estimation if the results is correct! + // in any case - just to make this future-proof (and I am lazy) - this can return more than k. + + int read = 0; + std::list::iterator it = buckets.end(); + while (read < k ) + { + std::list::iterator eit = (*it)->elements.begin(); + while (eit != (*it)->elements.end() ) + { + t->Assign(read, (*eit)->value->Ref()); + read++; + } + + if ( it == buckets.begin() ) + break; + } + + + Unref(v); + return t; + } + +void Topk::Encountered(Val* encountered) + { + // ok, let's see if we already know this one. + + // check type compatibility + if ( numElements == 0 ) + type = encountered->Type()->Ref(); + else + if ( !same_type(type, encountered->Type()) ) + { + reporter->Error("Trying to add element to topk with differing type from other elements"); + return; + } + + + // Step 1 - get the hash. + HashKey* key = GetHash(encountered); + Element* e = (Element*) elementDict->Lookup(key); + + if ( e == 0 ) + { + e = new Element(); + e->epsilon = 0; + e->value = encountered->Ref(); // or no ref? + + + // well, we do not know this one yet... + if ( numElements < size ) + { + // brilliant. just add it at position 1 + if ( buckets.size() == 0 || (*buckets.begin())->count > 1 ) + { + Bucket* b = new Bucket(); + b->count = 1; + std::list::iterator pos = buckets.insert(buckets.begin(), b); + b->bucketPos = pos; + b->elements.insert(b->elements.end(), e); + e->parent = b; + } + else + { + Bucket* b = *buckets.begin(); + assert(b->count == 1); + b->elements.insert(b->elements.end(), e); + e->parent = b; + } + + elementDict->Insert(key, e); + numElements++; + delete key; + return; // done. it is at pos 1. 
+ } + else + { + // replace element with min-value + Bucket* b = *buckets.begin(); // bucket with smallest elements + // evict oldest element with least hits. + assert(b->elements.size() > 0); + HashKey* deleteKey = GetHash((*(b->elements.begin()))->value); + b->elements.erase(b->elements.begin()); + Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey); + assert(deleteElement); // there has to have been a minimal element... + delete deleteElement; + delete deleteKey; + // and add the new one to the end + e->epsilon = b->count; + b->elements.insert(b->elements.end(), e); + elementDict->Insert(key, e); + // fallthrough, increment operation has to run! + } + + } + + // ok, we now have an element in e + delete key; + IncrementCounter(e); // well, this certainly was anticlimatic. + + } + +void Topk::IncrementCounter(Element* e) + { + Bucket* currBucket = e->parent; + uint64 currcount = currBucket->count; + + // well, let's test if there is a bucket for currcount++ + std::list::iterator bucketIter = currBucket->bucketPos; + + Bucket* nextBucket = 0; + + bucketIter++; + + if ( bucketIter != buckets.end() ) + { + if ( (*bucketIter)->count == currcount+1 ) + nextBucket = *bucketIter; + } + + if ( nextBucket == 0 ) + { + // the bucket for the value that we want does not exist. + // create it... + + Bucket* b = new Bucket(); + b->count = currcount+1; + + std::list::iterator nextBucketPos = buckets.insert(bucketIter, b); + b->bucketPos = nextBucketPos; // and give it the iterator we know now. + + nextBucket = b; + } + + // ok, now we have the new bucket in nextBucket. Shift the element over... 
+ currBucket->elements.remove(e); + nextBucket->elements.insert(nextBucket->elements.end(), e); + + e->parent = nextBucket; + + // if currBucket is empty, we have to delete it now + if ( currBucket->elements.size() == 0 ) + { + buckets.remove(currBucket); + delete currBucket; + currBucket = 0; + } + + + } + +}; diff --git a/src/Topk.h b/src/Topk.h new file mode 100644 index 0000000000..b38e1e8ab3 --- /dev/null +++ b/src/Topk.h @@ -0,0 +1,56 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef topk_h +#define topk_h + +#include +#include "Val.h" +#include "CompHash.h" + +// This class implements the top-k algorithm. Or - to be more precise - my interpretation of it. + +namespace Topk { + +struct Element; + +struct Bucket { + uint64 count; + std::list elements; + std::list::iterator bucketPos; // iterators only get invalidated for removed elements. This one points to us - so it is invalid when we are no longer there. Cute, isn't it? +}; + +struct Element { + uint64 epsilon; + Val* value; + Bucket* parent; + + ~Element(); +}; + + +declare(PDict, Element); + +class Topk { + +public: + Topk(uint64 size); + ~Topk(); + void Encountered(Val* value); // we saw something + VectorVal* getTopK(int k); // returns vector + +private: + void IncrementCounter(Element* e); + HashKey* GetHash(Val*); // this probably should go somewhere else. + + BroType* type; + std::list buckets; + PDict(Element)* elementDict; + uint64 size; // how many elements are we tracking? + uint64 numElements; // how many elements do we have at the moment + + +}; + +}; + +#endif From ce7ad003f251e8c76be3d07190907157cd9a87c1 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Mon, 22 Apr 2013 02:40:42 -0700 Subject: [PATCH 02/40] well, a test that works.. Note: merging top-k data structures is not yet possible (and is actually quite awkward/expensive). I will have to think about how to do that for a bit... 
--- src/Topk.cc | 23 +++++++---- src/Topk.h | 9 ++-- src/bro.bif | 27 ++++++++++++ testing/btest/Baseline/bifs.topk/out | 7 ++++ testing/btest/bifs/topk.bro | 61 ++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 12 deletions(-) create mode 100644 testing/btest/Baseline/bifs.topk/out create mode 100644 testing/btest/bifs/topk.bro diff --git a/src/Topk.cc b/src/Topk.cc index ef7d7bfbd8..8f4d63ed78 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -19,7 +19,7 @@ Element::~Element() value=0; } -HashKey* Topk::GetHash(Val* v) +HashKey* TopkVal::GetHash(Val* v) { TypeList* tl = new TypeList(v->Type()); tl->Append(v->Type()); @@ -31,15 +31,16 @@ HashKey* Topk::GetHash(Val* v) return key; } -Topk::Topk(uint64 arg_size) +TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(new OpaqueType("topk")) { elementDict = new PDict(Element); elementDict->SetDeleteFunc(topk_element_hash_delete_func); size = arg_size; type = 0; + numElements = 0; } -Topk::~Topk() +TopkVal::~TopkVal() { elementDict->Clear(); delete elementDict; @@ -57,7 +58,7 @@ Topk::~Topk() type = 0; } -VectorVal* Topk::getTopK(int k) // returns vector +VectorVal* TopkVal::getTopK(int k) // returns vector { if ( numElements == 0 ) { @@ -75,17 +76,23 @@ VectorVal* Topk::getTopK(int k) // returns vector int read = 0; std::list::iterator it = buckets.end(); + it--; while (read < k ) { + //printf("Bucket %llu\n", (*it)->count); std::list::iterator eit = (*it)->elements.begin(); while (eit != (*it)->elements.end() ) { + //printf("Size: %ld\n", (*it)->elements.size()); t->Assign(read, (*eit)->value->Ref()); read++; + eit++; } if ( it == buckets.begin() ) break; + + it--; } @@ -93,13 +100,14 @@ VectorVal* Topk::getTopK(int k) // returns vector return t; } -void Topk::Encountered(Val* encountered) +void TopkVal::Encountered(Val* encountered) { // ok, let's see if we already know this one. 
+ //printf("NumElements: %d\n", numElements); // check type compatibility if ( numElements == 0 ) - type = encountered->Type()->Ref(); + type = encountered->Type()->Ref()->Ref(); else if ( !same_type(type, encountered->Type()) ) { @@ -161,6 +169,7 @@ void Topk::Encountered(Val* encountered) e->epsilon = b->count; b->elements.insert(b->elements.end(), e); elementDict->Insert(key, e); + e->parent = b; // fallthrough, increment operation has to run! } @@ -172,7 +181,7 @@ void Topk::Encountered(Val* encountered) } -void Topk::IncrementCounter(Element* e) +void TopkVal::IncrementCounter(Element* e) { Bucket* currBucket = e->parent; uint64 currcount = currBucket->count; diff --git a/src/Topk.h b/src/Topk.h index b38e1e8ab3..7c983ebdfc 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -6,6 +6,7 @@ #include #include "Val.h" #include "CompHash.h" +#include "OpaqueVal.h" // This class implements the top-k algorithm. Or - to be more precise - my interpretation of it. @@ -30,11 +31,11 @@ struct Element { declare(PDict, Element); -class Topk { +class TopkVal : public OpaqueVal { public: - Topk(uint64 size); - ~Topk(); + TopkVal(uint64 size); + ~TopkVal(); void Encountered(Val* value); // we saw something VectorVal* getTopK(int k); // returns vector @@ -47,8 +48,6 @@ private: PDict(Element)* elementDict; uint64 size; // how many elements are we tracking? 
uint64 numElements; // how many elements do we have at the moment - - }; }; diff --git a/src/bro.bif b/src/bro.bif index ac54da0e75..695337bcf1 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5642,3 +5642,30 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr } %} + + +%%{ +#include "Topk.h" +%%} + +function topk_init%(size: count%): opaque of topk + %{ + Topk::TopkVal* v = new Topk::TopkVal(size); + return v; + %} + +function topk_add%(handle: opaque of topk, value: any%): any + %{ + assert(handle); + Topk::TopkVal* h = (Topk::TopkVal*) handle; + h->Encountered(value); + + return 0; + %} + +function topk_get_top%(handle: opaque of topk, k: count%): any + %{ + assert(handle); + Topk::TopkVal* h = (Topk::TopkVal*) handle; + return h->getTopK(k); + %} diff --git a/testing/btest/Baseline/bifs.topk/out b/testing/btest/Baseline/bifs.topk/out new file mode 100644 index 0000000000..94aa5bd572 --- /dev/null +++ b/testing/btest/Baseline/bifs.topk/out @@ -0,0 +1,7 @@ +[b, c] +[d, c] +[d, e] +[f, e] +[f, e] +[g, e] +[c, e, d] diff --git a/testing/btest/bifs/topk.bro b/testing/btest/bifs/topk.bro new file mode 100644 index 0000000000..af1f38c773 --- /dev/null +++ b/testing/btest/bifs/topk.bro @@ -0,0 +1,61 @@ +# @TEST-EXEC: bro -b %INPUT > out +# @TEST-EXEC: btest-diff out + +event bro_init() + { + local k1 = topk_init(2); + + # first - peculiarity check... 
+ topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + + local s = topk_get_top(k1, 5); + print s; + + topk_add(k1, "d"); + s = topk_get_top(k1, 5); + print s; + + topk_add(k1, "e"); + s = topk_get_top(k1, 5); + print s; + + topk_add(k1, "f"); + s = topk_get_top(k1, 5); + print s; + + topk_add(k1, "e"); + s = topk_get_top(k1, 5); + print s; + + topk_add(k1, "g"); + s = topk_get_top(k1, 5); + print s; + + k1 = topk_init(100); + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "f"); + s = topk_get_top(k1, 3); + print s; + + +} From de5769a88fd123cf6f34f978cf67ec5ee494de15 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 23 Apr 2013 15:19:01 -0700 Subject: [PATCH 03/40] topk for sumstats --- .../frameworks/sumstats/plugins/__load__.bro | 3 +- .../base/frameworks/sumstats/plugins/topk.bro | 24 ++++++++++ src/Topk.cc | 33 ++++++++++++- src/Topk.h | 4 +- src/bro.bif | 15 ++++++ testing/btest/Baseline/bifs.topk/.stderr | 6 +++ testing/btest/Baseline/bifs.topk/out | 30 ++++++++++++ .../.stdout | 8 ++++ testing/btest/bifs/topk.bro | 31 ++++++++++++ .../scripts/base/frameworks/sumstats/topk.bro | 48 +++++++++++++++++++ 10 files changed, 198 insertions(+), 4 deletions(-) create mode 100644 scripts/base/frameworks/sumstats/plugins/topk.bro create mode 100644 testing/btest/Baseline/bifs.topk/.stderr create mode 100644 testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout create mode 100644 testing/btest/scripts/base/frameworks/sumstats/topk.bro diff --git a/scripts/base/frameworks/sumstats/plugins/__load__.bro b/scripts/base/frameworks/sumstats/plugins/__load__.bro index 
0d4c2ed302..35191a4776 100644 --- a/scripts/base/frameworks/sumstats/plugins/__load__.bro +++ b/scripts/base/frameworks/sumstats/plugins/__load__.bro @@ -4,5 +4,6 @@ @load ./sample @load ./std-dev @load ./sum +@load ./topk @load ./unique -@load ./variance \ No newline at end of file +@load ./variance diff --git a/scripts/base/frameworks/sumstats/plugins/topk.bro b/scripts/base/frameworks/sumstats/plugins/topk.bro new file mode 100644 index 0000000000..f64e9fb18d --- /dev/null +++ b/scripts/base/frameworks/sumstats/plugins/topk.bro @@ -0,0 +1,24 @@ +@load base/frameworks/sumstats + +module SumStats; + +export { + redef enum Calculation += { + TOPK + }; + + redef record ResultVal += { + topk: opaque of topk &default=topk_init(500); + }; + +} + +hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) + { + if ( TOPK in r$apply ) + { + topk_add(rv$topk, obs); + } + } + + diff --git a/src/Topk.cc b/src/Topk.cc index 8f4d63ed78..d5866b4f41 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -19,7 +19,7 @@ Element::~Element() value=0; } -HashKey* TopkVal::GetHash(Val* v) +HashKey* TopkVal::GetHash(Val* v) const { TypeList* tl = new TypeList(v->Type()); tl->Append(v->Type()); @@ -58,7 +58,8 @@ TopkVal::~TopkVal() type = 0; } -VectorVal* TopkVal::getTopK(int k) // returns vector + +VectorVal* TopkVal::getTopK(int k) // returns vector { if ( numElements == 0 ) { @@ -100,6 +101,34 @@ VectorVal* TopkVal::getTopK(int k) // returns vector return t; } +uint64_t TopkVal::getCount(Val* value) const + { + HashKey* key = GetHash(value); + Element* e = (Element*) elementDict->Lookup(key); + + if ( e == 0 ) + { + reporter->Error("getCount for element that is not in top-k"); + return 0; + } + + return e->parent->count; + } + +uint64_t TopkVal::getEpsilon(Val* value) const + { + HashKey* key = GetHash(value); + Element* e = (Element*) elementDict->Lookup(key); + + if ( e == 0 ) + { + reporter->Error("getEpsilon for element that is not in top-k"); + return 0; + } + + return 
e->epsilon; + } + void TopkVal::Encountered(Val* encountered) { // ok, let's see if we already know this one. diff --git a/src/Topk.h b/src/Topk.h index 7c983ebdfc..e4c6aa5aea 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -38,10 +38,12 @@ public: ~TopkVal(); void Encountered(Val* value); // we saw something VectorVal* getTopK(int k); // returns vector + uint64_t getCount(Val* value) const; + uint64_t getEpsilon(Val* value) const; private: void IncrementCounter(Element* e); - HashKey* GetHash(Val*); // this probably should go somewhere else. + HashKey* GetHash(Val*) const; // this probably should go somewhere else. BroType* type; std::list buckets; diff --git a/src/bro.bif b/src/bro.bif index 695337bcf1..e8e78c7872 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5669,3 +5669,18 @@ function topk_get_top%(handle: opaque of topk, k: count%): any Topk::TopkVal* h = (Topk::TopkVal*) handle; return h->getTopK(k); %} + +function topk_count%(handle: opaque of topk, value: any%): count + %{ + assert(handle); + Topk::TopkVal* h = (Topk::TopkVal*) handle; + return new Val(h->getCount(value), TYPE_COUNT); + %} + +function topk_epsilon%(handle: opaque of topk, value: any%): count + %{ + assert(handle); + Topk::TopkVal* h = (Topk::TopkVal*) handle; + return new Val(h->getEpsilon(value), TYPE_COUNT); + %} + diff --git a/testing/btest/Baseline/bifs.topk/.stderr b/testing/btest/Baseline/bifs.topk/.stderr new file mode 100644 index 0000000000..f57e35ca51 --- /dev/null +++ b/testing/btest/Baseline/bifs.topk/.stderr @@ -0,0 +1,6 @@ +error: getCount for element that is not in top-k +error: getEpsilon for element that is not in top-k +error: getCount for element that is not in top-k +error: getEpsilon for element that is not in top-k +error: getCount for element that is not in top-k +error: getEpsilon for element that is not in top-k diff --git a/testing/btest/Baseline/bifs.topk/out b/testing/btest/Baseline/bifs.topk/out index 94aa5bd572..2116a30a12 100644 --- 
a/testing/btest/Baseline/bifs.topk/out +++ b/testing/btest/Baseline/bifs.topk/out @@ -1,7 +1,37 @@ [b, c] +0 +0 +2 +0 +2 +1 [d, c] +0 +0 +2 +1 +3 +2 [d, e] +3 +2 +3 +2 [f, e] +4 +3 +3 +2 [f, e] +4 +3 +4 +2 [g, e] +0 +0 +4 +2 +5 +4 [c, e, d] diff --git a/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout new file mode 100644 index 0000000000..c85316eecc --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout @@ -0,0 +1,8 @@ +Top entries for key counter +Num: 1, count: 99, epsilon: 0 +Num: 2, count: 98, epsilon: 0 +Num: 3, count: 97, epsilon: 0 +Num: 4, count: 96, epsilon: 0 +Num: 5, count: 95, epsilon: 0 +Top entries for key two +Num: 1, count: 2, epsilon: 0 diff --git a/testing/btest/bifs/topk.bro b/testing/btest/bifs/topk.bro index af1f38c773..9d936ce2f4 100644 --- a/testing/btest/bifs/topk.bro +++ b/testing/btest/bifs/topk.bro @@ -1,5 +1,6 @@ # @TEST-EXEC: bro -b %INPUT > out # @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr event bro_init() { @@ -13,26 +14,56 @@ event bro_init() local s = topk_get_top(k1, 5); print s; + print topk_count(k1, "a"); + print topk_epsilon(k1, "a"); + print topk_count(k1, "b"); + print topk_epsilon(k1, "b"); + print topk_count(k1, "c"); + print topk_epsilon(k1, "c"); topk_add(k1, "d"); s = topk_get_top(k1, 5); print s; + print topk_count(k1, "b"); + print topk_epsilon(k1, "b"); + print topk_count(k1, "c"); + print topk_epsilon(k1, "c"); + print topk_count(k1, "d"); + print topk_epsilon(k1, "d"); topk_add(k1, "e"); s = topk_get_top(k1, 5); print s; + print topk_count(k1, "d"); + print topk_epsilon(k1, "d"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); topk_add(k1, "f"); s = topk_get_top(k1, 5); print s; + print topk_count(k1, "f"); + print topk_epsilon(k1, "f"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); topk_add(k1, "e"); s = topk_get_top(k1, 5); print s; + print 
topk_count(k1, "f"); + print topk_epsilon(k1, "f"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); topk_add(k1, "g"); s = topk_get_top(k1, 5); print s; + print topk_count(k1, "f"); + print topk_epsilon(k1, "f"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); + print topk_count(k1, "g"); + print topk_epsilon(k1, "g"); k1 = topk_init(100); topk_add(k1, "a"); diff --git a/testing/btest/scripts/base/frameworks/sumstats/topk.bro b/testing/btest/scripts/base/frameworks/sumstats/topk.bro new file mode 100644 index 0000000000..22a5af1bc7 --- /dev/null +++ b/testing/btest/scripts/base/frameworks/sumstats/topk.bro @@ -0,0 +1,48 @@ +# @TEST-EXEC: bro %INPUT +# @TEST-EXEC: btest-diff .stdout + +event bro_init() &priority=5 + { + local r1: SumStats::Reducer = [$stream="test.metric", + $apply=set(SumStats::TOPK)]; + SumStats::create([$epoch=3secs, + $reducers=set(r1), + $epoch_finished(data: SumStats::ResultTable) = + { + for ( key in data ) + { + local r = data[key]["test.metric"]; + + local s: vector of SumStats::Observation; + s = topk_get_top(r$topk, 5); + + print fmt("Top entries for key %s", key$str); + for ( element in s ) + { + print fmt("Num: %d, count: %d, epsilon: %d", s[element]$num, topk_count(r$topk, s[element]), topk_epsilon(r$topk, s[element])); + } + + } + } + ]); + + + const loop_v: vector of count = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100}; + + local a: count; + a = 0; + + for ( i in loop_v ) + { + a = a + 1; + for ( j in loop_v ) + { + if ( i < j ) + SumStats::observe("test.metric", [$str="counter"], [$num=a]); + } + } + + + SumStats::observe("test.metric", [$str="two"], [$num=1]); + 
SumStats::observe("test.metric", [$str="two"], [$num=1]); + } From a426c7612270b5cc40143cc14b4e3c42c6499617 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 23 Apr 2013 18:23:34 -0700 Subject: [PATCH 04/40] make the get function const --- src/Topk.cc | 4 ++-- src/Topk.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Topk.cc b/src/Topk.cc index d5866b4f41..b89fa2e96f 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -59,7 +59,7 @@ TopkVal::~TopkVal() } -VectorVal* TopkVal::getTopK(int k) // returns vector +VectorVal* TopkVal::getTopK(int k) const // returns vector { if ( numElements == 0 ) { @@ -76,7 +76,7 @@ VectorVal* TopkVal::getTopK(int k) // returns vector // in any case - just to make this future-proof (and I am lazy) - this can return more than k. int read = 0; - std::list::iterator it = buckets.end(); + std::list::const_iterator it = buckets.end(); it--; while (read < k ) { diff --git a/src/Topk.h b/src/Topk.h index e4c6aa5aea..f486948c5c 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -37,7 +37,7 @@ public: TopkVal(uint64 size); ~TopkVal(); void Encountered(Val* value); // we saw something - VectorVal* getTopK(int k); // returns vector + VectorVal* getTopK(int k) const; // returns vector uint64_t getCount(Val* value) const; uint64_t getEpsilon(Val* value) const; From 6f863d2259a5d388068f4ad70571ab62dcaa9cd4 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 23 Apr 2013 23:24:02 -0700 Subject: [PATCH 05/40] add serialization for topk --- src/SerialTypes.h | 1 + src/Topk.cc | 108 ++++++++++++++++++ src/Topk.h | 5 + .../btest/Baseline/bifs.topk_persistence/out | 21 ++++ testing/btest/bifs/topk_persistence.bro | 74 ++++++++++++ 5 files changed, 209 insertions(+) create mode 100644 testing/btest/Baseline/bifs.topk_persistence/out create mode 100644 testing/btest/bifs/topk_persistence.bro diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 723badab1e..f07392eff4 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -104,6 
+104,7 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) +SERIAL_VAL(TOPK_VAL, 20) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) diff --git a/src/Topk.cc b/src/Topk.cc index b89fa2e96f..a31f49adf4 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -3,9 +3,13 @@ #include "Topk.h" #include "CompHash.h" #include "Reporter.h" +#include "Serializer.h" + namespace Topk { +IMPLEMENT_SERIAL(TopkVal, SER_TOPK_VAL); + static void topk_element_hash_delete_func(void* val) { Element* e = (Element*) val; @@ -40,6 +44,15 @@ TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(new OpaqueType("topk")) numElements = 0; } +TopkVal::TopkVal() : OpaqueVal(new OpaqueType("topk")) + { + elementDict = new PDict(Element); + elementDict->SetDeleteFunc(topk_element_hash_delete_func); + size = 0; + type = 0; + numElements = 0; + } + TopkVal::~TopkVal() { elementDict->Clear(); @@ -59,6 +72,101 @@ TopkVal::~TopkVal() } +bool TopkVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_TOPK_VAL, OpaqueVal); + + bool v = true; + + v &= SERIALIZE(size); + v &= SERIALIZE(numElements); + bool type_present = (type != 0); + v &= SERIALIZE(type_present); + if ( type_present ) + v &= type->Serialize(info); + else + assert(numElements == 0); + + int i = 0; + std::list::const_iterator it = buckets.begin(); + while ( it != buckets.end() ) + { + Bucket* b = *it; + uint32_t elements_count = b->elements.size(); + v &= SERIALIZE(elements_count); + v &= SERIALIZE(b->count); + std::list::const_iterator eit = b->elements.begin(); + while ( eit != b->elements.end() ) + { + Element* element = *eit; + v &= SERIALIZE(element->epsilon); + v &= element->value->Serialize(info); + + eit++; + i++; + } + + it++; + } + + assert(i == numElements); + + return v; + } + +bool TopkVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + + bool v = true; + + v &= UNSERIALIZE(&size); + v &= UNSERIALIZE(&numElements); + bool 
type_present = false; + v &= UNSERIALIZE(&type_present); + if ( type_present ) + { + type = BroType::Unserialize(info); + assert(type); + } + else + assert(numElements == 0); + + int i = 0; + while ( i < numElements ) + { + Bucket* b = new Bucket(); + uint32_t elements_count; + v &= UNSERIALIZE(&elements_count); + v &= UNSERIALIZE(&b->count); + b->bucketPos = buckets.insert(buckets.end(), b); + + for ( int j = 0; j < elements_count; j++ ) + { + Element* e = new Element(); + v &= UNSERIALIZE(&e->epsilon); + e->value = Val::Unserialize(info, type); + e->parent = b; + + b->elements.insert(b->elements.end(), e); + + HashKey* key = GetHash(e->value); + assert ( elementDict->Lookup(key) == 0 ); + + elementDict->Insert(key, e); + delete key; + + + i++; + } + } + + assert(i == numElements); + + return v; + } + + VectorVal* TopkVal::getTopK(int k) const // returns vector { if ( numElements == 0 ) diff --git a/src/Topk.h b/src/Topk.h index f486948c5c..0e38319380 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -41,6 +41,9 @@ public: uint64_t getCount(Val* value) const; uint64_t getEpsilon(Val* value) const; +protected: + TopkVal(); // for deserialize + private: void IncrementCounter(Element* e); HashKey* GetHash(Val*) const; // this probably should go somewhere else. @@ -50,6 +53,8 @@ private: PDict(Element)* elementDict; uint64 size; // how many elements are we tracking? 
uint64 numElements; // how many elements do we have at the moment + + DECLARE_SERIAL(TopkVal); }; }; diff --git a/testing/btest/Baseline/bifs.topk_persistence/out b/testing/btest/Baseline/bifs.topk_persistence/out new file mode 100644 index 0000000000..ef3d0cef30 --- /dev/null +++ b/testing/btest/Baseline/bifs.topk_persistence/out @@ -0,0 +1,21 @@ +1 +2 +6 +4 +5 +1 +[c, e, d] +1 +2 +6 +4 +5 +1 +[c, e, d] +2 +4 +12 +8 +10 +2 +[c, e, d] diff --git a/testing/btest/bifs/topk_persistence.bro b/testing/btest/bifs/topk_persistence.bro new file mode 100644 index 0000000000..4d599c2780 --- /dev/null +++ b/testing/btest/bifs/topk_persistence.bro @@ -0,0 +1,74 @@ +# @TEST-EXEC: bro -b %INPUT runnumber=1 >out +# @TEST-EXEC: bro -b %INPUT runnumber=2 >>out +# @TEST-EXEC: bro -b %INPUT runnumber=3 >>out +# @TEST-EXEC: btest-diff out + +global runnumber: count &redef; # differentiate runs + +global k1: opaque of topk &persistent; +global k2: opaque of topk &persistent; + +event bro_init() + { + + k2 = topk_init(20); + + if ( runnumber == 1 ) + { + k1 = topk_init(100); + + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "f"); + } + + local s = topk_get_top(k1, 3); + print topk_count(k1, "a"); + print topk_count(k1, "b"); + print topk_count(k1, "c"); + print topk_count(k1, "d"); + print topk_count(k1, "e"); + print topk_count(k1, "f"); + + if ( runnumber == 2 ) + { + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + 
topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "f"); + } + + print s; + + } From 2f48008c423019c05a209939f00d8c52d29eb1ee Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 24 Apr 2013 06:17:51 -0700 Subject: [PATCH 06/40] implement merging for top-k. I am not (entirely) sure that this is mathematically correct, but I am (more and more) getting the feeling that it... might be. In any case - this was the last step and now it should work in cluster settings. --- .../base/frameworks/sumstats/plugins/topk.bro | 7 ++ src/Topk.cc | 106 ++++++++++++++++-- src/Topk.h | 3 +- src/bro.bif | 13 +++ testing/btest/Baseline/bifs.topk/.stderr | 4 + testing/btest/Baseline/bifs.topk/out | 20 ++++ testing/btest/bifs/topk.bro | 28 +++++ 7 files changed, 173 insertions(+), 8 deletions(-) diff --git a/scripts/base/frameworks/sumstats/plugins/topk.bro b/scripts/base/frameworks/sumstats/plugins/topk.bro index f64e9fb18d..6107a252ae 100644 --- a/scripts/base/frameworks/sumstats/plugins/topk.bro +++ b/scripts/base/frameworks/sumstats/plugins/topk.bro @@ -22,3 +22,10 @@ hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) } +hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) + { + result$topk = topk_init(500); + + topk_merge(result$topk, rv1$topk); + topk_merge(result$topk, rv2$topk); + } diff --git a/src/Topk.cc b/src/Topk.cc index a31f49adf4..8ad2113235 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -71,6 +71,97 @@ TopkVal::~TopkVal() type = 0; } +void TopkVal::Merge(const TopkVal* value) + { + + if ( type == 0 ) + { + assert(numElements == 0); + type = value->type->Ref(); + } + else + if ( !same_type(type, value->type) ) + { + reporter->Error("Tried to merge top-k elements of differing types. 
Aborted"); + return; + } + + std::list::const_iterator it = value->buckets.begin(); + while ( it != value->buckets.end() ) + { + Bucket* b = *it; + uint64_t currcount = b->count; + std::list::const_iterator eit = b->elements.begin(); + + while ( eit != b->elements.end() ) + { + Element* e = *eit; + // lookup if we already know this one... + HashKey* key = GetHash(e->value); + Element* olde = (Element*) elementDict->Lookup(key); + + if ( olde == 0 ) + { + olde = new Element(); + olde->epsilon=0; + olde->value = e->value->Ref(); + // insert at bucket position 0 + if ( buckets.size() > 0 ) + { + assert (buckets.front()-> count > 0 ); + } + + Bucket* newbucket = new Bucket(); + newbucket->count = 0; + newbucket->bucketPos = buckets.insert(buckets.begin(), newbucket); + + olde->parent = newbucket; + newbucket->elements.insert(newbucket->elements.end(), olde); + + elementDict->Insert(key, olde); + numElements++; + + } + + // now that we are sure that the old element is present - increment epsilon + olde->epsilon += e->epsilon; + // and increment position... + IncrementCounter(olde, currcount); + delete key; + + eit++; + } + + it++; + } + + // now we have added everything. And our top-k table could be too big. + // prune everything... 
+ + assert(size > 0); + while ( numElements > size ) + { + assert(buckets.size() > 0 ); + Bucket* b = buckets.front(); + assert(b->elements.size() > 0); + + Element* e = b->elements.front(); + HashKey* key = GetHash(e->value); + elementDict->RemoveEntry(key); + delete e; + + b->elements.pop_front(); + + if ( b->elements.size() == 0 ) + { + delete b; + buckets.pop_front(); + } + + numElements--; + } + + } bool TopkVal::DoSerialize(SerialInfo* info) const { @@ -318,7 +409,8 @@ void TopkVal::Encountered(Val* encountered) } -void TopkVal::IncrementCounter(Element* e) +// increment by count +void TopkVal::IncrementCounter(Element* e, unsigned int count) { Bucket* currBucket = e->parent; uint64 currcount = currBucket->count; @@ -330,11 +422,11 @@ void TopkVal::IncrementCounter(Element* e) bucketIter++; - if ( bucketIter != buckets.end() ) - { - if ( (*bucketIter)->count == currcount+1 ) - nextBucket = *bucketIter; - } + while ( bucketIter != buckets.end() && (*bucketIter)->count < currcount+count ) + bucketIter++; + + if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount+count ) + nextBucket = *bucketIter; if ( nextBucket == 0 ) { @@ -342,7 +434,7 @@ void TopkVal::IncrementCounter(Element* e) // create it... Bucket* b = new Bucket(); - b->count = currcount+1; + b->count = currcount+count; std::list::iterator nextBucketPos = buckets.insert(bucketIter, b); b->bucketPos = nextBucketPos; // and give it the iterator we know now. diff --git a/src/Topk.h b/src/Topk.h index 0e38319380..30e87f7a99 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -40,12 +40,13 @@ public: VectorVal* getTopK(int k) const; // returns vector uint64_t getCount(Val* value) const; uint64_t getEpsilon(Val* value) const; + void Merge(const TopkVal* value); protected: TopkVal(); // for deserialize private: - void IncrementCounter(Element* e); + void IncrementCounter(Element* e, unsigned int count = 1); HashKey* GetHash(Val*) const; // this probably should go somewhere else. 
BroType* type; diff --git a/src/bro.bif b/src/bro.bif index e8e78c7872..b6f101c025 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5684,3 +5684,16 @@ function topk_epsilon%(handle: opaque of topk, value: any%): count return new Val(h->getEpsilon(value), TYPE_COUNT); %} +function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any + %{ + assert(handle1); + assert(handle2); + + Topk::TopkVal* h1 = (Topk::TopkVal*) handle1; + Topk::TopkVal* h2 = (Topk::TopkVal*) handle2; + + h1->Merge(h2); + + return 0; + %} + diff --git a/testing/btest/Baseline/bifs.topk/.stderr b/testing/btest/Baseline/bifs.topk/.stderr index f57e35ca51..f2bd316fd8 100644 --- a/testing/btest/Baseline/bifs.topk/.stderr +++ b/testing/btest/Baseline/bifs.topk/.stderr @@ -4,3 +4,7 @@ error: getCount for element that is not in top-k error: getEpsilon for element that is not in top-k error: getCount for element that is not in top-k error: getEpsilon for element that is not in top-k +error: getCount for element that is not in top-k +error: getEpsilon for element that is not in top-k +error: getCount for element that is not in top-k +error: getEpsilon for element that is not in top-k diff --git a/testing/btest/Baseline/bifs.topk/out b/testing/btest/Baseline/bifs.topk/out index 2116a30a12..8db55eeca8 100644 --- a/testing/btest/Baseline/bifs.topk/out +++ b/testing/btest/Baseline/bifs.topk/out @@ -35,3 +35,23 @@ 5 4 [c, e, d] +6 +0 +5 +0 +4 +0 +[c, e] +6 +0 +5 +0 +0 +0 +[c, e] +12 +0 +10 +0 +0 +0 diff --git a/testing/btest/bifs/topk.bro b/testing/btest/bifs/topk.bro index 9d936ce2f4..92a68999cc 100644 --- a/testing/btest/bifs/topk.bro +++ b/testing/btest/bifs/topk.bro @@ -87,6 +87,34 @@ event bro_init() topk_add(k1, "f"); s = topk_get_top(k1, 3); print s; + print topk_count(k1, "c"); + print topk_epsilon(k1, "c"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "d"); + print topk_count(k1, "d"); + print topk_epsilon(k1, "d"); + local k3 = topk_init(2); + topk_merge(k3, k1); + + s = 
topk_get_top(k3, 3); + print s; + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + topk_merge(k3, k1); + + s = topk_get_top(k3, 3); + print s; + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); } From c0890f2a0f1448dc81f4b3e6b3e54a45a6e5a08b Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 24 Apr 2013 15:01:06 -0700 Subject: [PATCH 07/40] make size of topk-list configureable when using sumstats --- scripts/base/frameworks/sumstats/plugins/topk.bro | 15 +++++++++++++-- src/Topk.h | 1 + src/bro.bif | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/scripts/base/frameworks/sumstats/plugins/topk.bro b/scripts/base/frameworks/sumstats/plugins/topk.bro index 6107a252ae..a830b1c5ec 100644 --- a/scripts/base/frameworks/sumstats/plugins/topk.bro +++ b/scripts/base/frameworks/sumstats/plugins/topk.bro @@ -3,16 +3,27 @@ module SumStats; export { + redef record Reducer += { + ## number of elements to keep in the top-k list + topk_size: count &default=500; + }; + redef enum Calculation += { TOPK }; redef record ResultVal += { - topk: opaque of topk &default=topk_init(500); + topk: opaque of topk &optional; }; } +hook init_resultval_hook(r: Reducer, rv: ResultVal) + { + if ( TOPK in r$apply && ! 
rv?$topk ) + rv$topk = topk_init(r$topk_size); + } + hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) { if ( TOPK in r$apply ) @@ -24,7 +35,7 @@ hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) { - result$topk = topk_init(500); + result$topk = topk_init(topk_size(rv1$topk)); topk_merge(result$topk, rv1$topk); topk_merge(result$topk, rv2$topk); diff --git a/src/Topk.h b/src/Topk.h index 30e87f7a99..51a2d75251 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -40,6 +40,7 @@ public: VectorVal* getTopK(int k) const; // returns vector uint64_t getCount(Val* value) const; uint64_t getEpsilon(Val* value) const; + uint64_t getSize() const { return size; } void Merge(const TopkVal* value); protected: diff --git a/src/bro.bif b/src/bro.bif index b6f101c025..4d0db54c8c 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5684,6 +5684,13 @@ function topk_epsilon%(handle: opaque of topk, value: any%): count return new Val(h->getEpsilon(value), TYPE_COUNT); %} +function topk_size%(handle: opaque of topk%): count + %{ + assert(handle); + Topk::TopkVal* h = (Topk::TopkVal*) handle; + return new Val(h->getSize(), TYPE_COUNT); + %} + function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any %{ assert(handle1); From 12cbf20ce07a8cd8a9eb6fd866d2c83855eda865 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 24 Apr 2013 15:30:24 -0700 Subject: [PATCH 08/40] add topk cluster test --- .../manager-1..stdout | 9 ++ .../base/frameworks/sumstats/topk-cluster.bro | 110 ++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout create mode 100644 testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro diff --git a/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout 
b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout new file mode 100644 index 0000000000..2d076eeac7 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout @@ -0,0 +1,9 @@ +Top entries for key counter +Num: 995, count: 100, epsilon: 0 +Num: 1, count: 99, epsilon: 0 +Num: 2, count: 98, epsilon: 0 +Num: 3, count: 97, epsilon: 0 +Num: 4, count: 96, epsilon: 0 +Top entries for key two +Num: 2, count: 4, epsilon: 0 +Num: 1, count: 3, epsilon: 0 diff --git a/testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro b/testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro new file mode 100644 index 0000000000..0ade38e86c --- /dev/null +++ b/testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro @@ -0,0 +1,110 @@ +# @TEST-SERIALIZE: comm +# +# @TEST-EXEC: btest-bg-run manager-1 BROPATH=$BROPATH:.. CLUSTER_NODE=manager-1 bro %INPUT +# @TEST-EXEC: sleep 1 +# @TEST-EXEC: btest-bg-run worker-1 BROPATH=$BROPATH:.. CLUSTER_NODE=worker-1 bro %INPUT +# @TEST-EXEC: btest-bg-run worker-2 BROPATH=$BROPATH:.. 
CLUSTER_NODE=worker-2 bro %INPUT +# @TEST-EXEC: btest-bg-wait 15 + +# @TEST-EXEC: btest-diff manager-1/.stdout +# +@TEST-START-FILE cluster-layout.bro +redef Cluster::nodes = { + ["manager-1"] = [$node_type=Cluster::MANAGER, $ip=127.0.0.1, $p=37757/tcp, $workers=set("worker-1", "worker-2")], + ["worker-1"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=37760/tcp, $manager="manager-1", $interface="eth0"], + ["worker-2"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=37761/tcp, $manager="manager-1", $interface="eth1"], +}; +@TEST-END-FILE + +redef Log::default_rotation_interval = 0secs; + + +event bro_init() &priority=5 + { + local r1: SumStats::Reducer = [$stream="test.metric", + $apply=set(SumStats::TOPK)]; + SumStats::create([$epoch=5secs, + $reducers=set(r1), + $epoch_finished(data: SumStats::ResultTable) = + { + for ( key in data ) + { + local r = data[key]["test.metric"]; + + local s: vector of SumStats::Observation; + s = topk_get_top(r$topk, 5); + + print fmt("Top entries for key %s", key$str); + for ( element in s ) + { + print fmt("Num: %d, count: %d, epsilon: %d", s[element]$num, topk_count(r$topk, s[element]), topk_epsilon(r$topk, s[element])); + } + + terminate(); + } + } + ]); + + + } + +event remote_connection_closed(p: event_peer) + { + terminate(); + } + +global ready_for_data: event(); +redef Cluster::manager2worker_events += /^ready_for_data$/; + +event ready_for_data() + { + const loop_v: vector of count = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100}; + + + if ( Cluster::node == "worker-1" ) + { + + local a: count; + a = 0; + + for ( i in loop_v ) + { + a = a + 1; + for ( j in loop_v ) + { + if ( i < j 
) + SumStats::observe("test.metric", [$str="counter"], [$num=a]); + } + } + + + SumStats::observe("test.metric", [$str="two"], [$num=1]); + SumStats::observe("test.metric", [$str="two"], [$num=1]); + } + if ( Cluster::node == "worker-2" ) + { + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=1]); + + for ( i in loop_v ) + { + SumStats::observe("test.metric", [$str="counter"], [$num=995]); + } + } + } + +@if ( Cluster::local_node_type() == Cluster::MANAGER ) + +global peer_count = 0; +event remote_connection_handshake_done(p: event_peer) &priority=-5 + { + ++peer_count; + if ( peer_count == 2 ) + event ready_for_data(); + } + +@endif + From fd2e0503068117d406a87b9a6d7d843b59289f59 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Fri, 26 Apr 2013 11:34:07 -0700 Subject: [PATCH 09/40] fix warnings --- src/Topk.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Topk.cc b/src/Topk.cc index 8ad2113235..26f14c4fcb 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -178,7 +178,7 @@ bool TopkVal::DoSerialize(SerialInfo* info) const else assert(numElements == 0); - int i = 0; + uint64_t i = 0; std::list::const_iterator it = buckets.begin(); while ( it != buckets.end() ) { @@ -223,7 +223,7 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) else assert(numElements == 0); - int i = 0; + uint64_t i = 0; while ( i < numElements ) { Bucket* b = new Bucket(); @@ -232,7 +232,7 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) v &= UNSERIALIZE(&b->count); b->bucketPos = buckets.insert(buckets.end(), b); - for ( int j = 0; j < elements_count; j++ ) + for ( uint64_t j = 0; j < elements_count; j++ ) { Element* e = new Element(); v &= UNSERIALIZE(&e->epsilon); From 1accee41edb48f7ed162f879bdf593f505d84b5b Mon Sep 17 
00:00:00 2001 From: Bernhard Amann Date: Fri, 26 Apr 2013 14:06:38 -0700 Subject: [PATCH 10/40] fix memory leaks --- src/Topk.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Topk.cc b/src/Topk.cc index 26f14c4fcb..116a4d3de4 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -26,12 +26,13 @@ Element::~Element() HashKey* TopkVal::GetHash(Val* v) const { TypeList* tl = new TypeList(v->Type()); - tl->Append(v->Type()); + tl->Append(v->Type()->Ref()); CompositeHash* topk_hash = new CompositeHash(tl); Unref(tl); HashKey* key = topk_hash->ComputeHash(v, 1); assert(key); + delete topk_hash; return key; } @@ -311,6 +312,7 @@ uint64_t TopkVal::getCount(Val* value) const return 0; } + delete key; return e->parent->count; } @@ -325,6 +327,7 @@ uint64_t TopkVal::getEpsilon(Val* value) const return 0; } + delete key; return e->epsilon; } From 07ecd31bbdcb85db52a206cb24ced7ea9b32c9f7 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Sun, 28 Apr 2013 21:21:22 -0700 Subject: [PATCH 11/40] in cluster settings, the resultvals can apparently been uninitialized in some special cases --- .../base/frameworks/sumstats/plugins/topk.bro | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/base/frameworks/sumstats/plugins/topk.bro b/scripts/base/frameworks/sumstats/plugins/topk.bro index a830b1c5ec..ed6074b081 100644 --- a/scripts/base/frameworks/sumstats/plugins/topk.bro +++ b/scripts/base/frameworks/sumstats/plugins/topk.bro @@ -35,8 +35,18 @@ hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) { - result$topk = topk_init(topk_size(rv1$topk)); + if ( rv1?$topk ) + { + result$topk = topk_init(topk_size(rv1$topk)); - topk_merge(result$topk, rv1$topk); - topk_merge(result$topk, rv2$topk); + topk_merge(result$topk, rv1$topk); + if ( rv2?$topk ) + topk_merge(result$topk, rv2$topk); + } + else if ( rv2?$topk ) + { + result$topk 
= topk_init(topk_size(rv2$topk)); + topk_merge(result$topk, rv2$topk); + } + } From 160da6f1a6c246532705ac8a0f5ab49eee51c00e Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Sun, 28 Apr 2013 21:55:06 -0700 Subject: [PATCH 12/40] add sum function that can be used to get the number of total observed elements. Add methods to merge with and without pruning (before only merge method was with pruning, which invalidates the number of total observed elements) --- src/Topk.cc | 60 +++++++++++++++++------- src/Topk.h | 32 ++++++++++++- src/bro.bif | 54 +++++++++++++++++++++ testing/btest/Baseline/bifs.topk/.stderr | 1 + testing/btest/Baseline/bifs.topk/out | 24 ++++++++++ testing/btest/bifs/topk.bro | 38 ++++++++++++++- 6 files changed, 187 insertions(+), 22 deletions(-) diff --git a/src/Topk.cc b/src/Topk.cc index 116a4d3de4..2527ecc4bc 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -43,6 +43,7 @@ TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(new OpaqueType("topk")) size = arg_size; type = 0; numElements = 0; + pruned = false; } TopkVal::TopkVal() : OpaqueVal(new OpaqueType("topk")) @@ -72,7 +73,7 @@ TopkVal::~TopkVal() type = 0; } -void TopkVal::Merge(const TopkVal* value) +void TopkVal::Merge(const TopkVal* value, bool doPrune) { if ( type == 0 ) @@ -140,26 +141,31 @@ void TopkVal::Merge(const TopkVal* value) // prune everything... 
assert(size > 0); - while ( numElements > size ) + + if ( doPrune ) { - assert(buckets.size() > 0 ); - Bucket* b = buckets.front(); - assert(b->elements.size() > 0); - - Element* e = b->elements.front(); - HashKey* key = GetHash(e->value); - elementDict->RemoveEntry(key); - delete e; - - b->elements.pop_front(); - - if ( b->elements.size() == 0 ) + while ( numElements > size ) { - delete b; - buckets.pop_front(); - } + pruned = true; + assert(buckets.size() > 0 ); + Bucket* b = buckets.front(); + assert(b->elements.size() > 0); - numElements--; + Element* e = b->elements.front(); + HashKey* key = GetHash(e->value); + elementDict->RemoveEntry(key); + delete e; + + b->elements.pop_front(); + + if ( b->elements.size() == 0 ) + { + delete b; + buckets.pop_front(); + } + + numElements--; + } } } @@ -330,6 +336,24 @@ uint64_t TopkVal::getEpsilon(Val* value) const delete key; return e->epsilon; } + +uint64_t TopkVal::getSum() const + { + uint64_t sum = 0; + + std::list::const_iterator it = buckets.begin(); + while ( it != buckets.end() ) + { + sum += (*it)->elements.size() * (*it)->count; + + it++; + } + + if ( pruned ) + reporter->Warning("TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count"); + + return sum; + } void TopkVal::Encountered(Val* encountered) { diff --git a/src/Topk.h b/src/Topk.h index 51a2d75251..608b810ddb 100644 --- a/src/Topk.h +++ b/src/Topk.h @@ -34,14 +34,41 @@ declare(PDict, Element); class TopkVal : public OpaqueVal { public: + // Initialize a TopkVal. Size specifies how many total elements are tracked TopkVal(uint64 size); ~TopkVal(); - void Encountered(Val* value); // we saw something + + // Call this, when a new value is encountered. Note that on the first call, + // the Bro-Type of the value types that are counted is set. All following calls + // to encountered have to specify the same type + void Encountered(Val* value); + + // Return the first k elements of the result vector. 
At the moment, this does + // not check if it is in the right order or if we can prove that these are + // the correct top-k. Use count and epsilon for this. VectorVal* getTopK(int k) const; // returns vector + + // Get the current count tracked in the top-k data structure for a certain val. + // Returns 0 if the val is unknown (and logs the error to reporter) uint64_t getCount(Val* value) const; + + // Get the current epsilon tracked in the top-k data structure for a certain val. + // Returns 0 if the val is unknown (and logs the error to reporter) uint64_t getEpsilon(Val* value) const; + + // Get the size set in the constructor uint64_t getSize() const { return size; } - void Merge(const TopkVal* value); + + // Get the sum of all counts of all tracked elements. This is equal to the number + // of total observations up to this moment, if no elements were pruned from the data + // structure. + uint64_t getSum() const; + + // Merge another top-k data structure in this one. + // doPrune specifies if the total count of elements is limited to size after + // merging. + // Please note, that pruning will invalidate the results of getSum. + void Merge(const TopkVal* value, bool doPrune=false); protected: TopkVal(); // for deserialize @@ -55,6 +82,7 @@ private: PDict(Element)* elementDict; uint64 size; // how many elements are we tracking? uint64 numElements; // how many elements do we have at the moment + bool pruned; // was this data structure pruned? DECLARE_SERIAL(TopkVal); }; diff --git a/src/bro.bif b/src/bro.bif index 195e4c2bde..4c46b23241 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5739,12 +5739,18 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr #include "Topk.h" %%} +## Creates a top-k data structure which tracks size elements. +## +## Returns: Opaque pointer to the data structure. 
function topk_init%(size: count%): opaque of topk %{ Topk::TopkVal* v = new Topk::TopkVal(size); return v; %} +## Add a new observed object to the data structure. The first +## added object sets the type of data tracked by the top-k data +## structure. All following values have to be of the same type function topk_add%(handle: opaque of topk, value: any%): any %{ assert(handle); @@ -5754,6 +5760,9 @@ function topk_add%(handle: opaque of topk, value: any%): any return 0; %} +## Get the first k elements of the top-k data structure +## +## Returns: vector of the first k elements function topk_get_top%(handle: opaque of topk, k: count%): any %{ assert(handle); @@ -5761,6 +5770,11 @@ function topk_get_top%(handle: opaque of topk, k: count%): any return h->getTopK(k); %} +## Get an overestimated count of how often value has been encountered. +## value has to be part of the currently tracked elements, otherwise +## 0 will be returned and an error message will be added to reporter. +## +## Returns: Overestimated number for how often the element has been encountered function topk_count%(handle: opaque of topk, value: any%): count %{ assert(handle); @@ -5768,6 +5782,10 @@ function topk_count%(handle: opaque of topk, value: any%): count return new Val(h->getCount(value), TYPE_COUNT); %} +## Get a the maximal overestimation for count. Same restrictiosn as for topk_count +## apply. +## +## Returns: Number which represents the maximal overesimation for the count of this element. function topk_epsilon%(handle: opaque of topk, value: any%): count %{ assert(handle); @@ -5775,6 +5793,11 @@ function topk_epsilon%(handle: opaque of topk, value: any%): count return new Val(h->getEpsilon(value), TYPE_COUNT); %} +## Get the number of elements this data structure is supposed to track (given on init). +## Note that the actual number of elements in the data structure can be lower or higher +## than this. 
(higher due to non-pruned merges) +## +## Returns: size given during initialization function topk_size%(handle: opaque of topk%): count %{ assert(handle); @@ -5782,6 +5805,20 @@ function topk_size%(handle: opaque of topk%): count return new Val(h->getSize(), TYPE_COUNT); %} +## Get the sum of all counts of all elements in the data structure. Is equal to the number +## of all inserted objects if the data structure never has been pruned. Do not use after +## calling topk_merge_prune (will throw a warning message if used afterwards) +## +## Returns: sum of all counts +function topk_sum%(handle: opaque of topk%): count + %{ + assert(handle); + Topk::TopkVal* h = (Topk::TopkVal*) handle; + return new Val(h->getSum(), TYPE_COUNT); + %} + +## Merge the second topk data structure into the first. Does not remove any elements, the +## resulting data structure can be bigger than the maximum size given on initialization. function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any %{ assert(handle1); @@ -5795,3 +5832,20 @@ function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any return 0; %} +## Merge the second topk data structure into the first and prunes the final data structure +## back to the size given on initialization. Use with care and only when being aware of the +## restrictions this imposed. Do not call topk_size or topk_add afterwards, results will +## probably not be what you expect. 
+function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any + %{ + assert(handle1); + assert(handle2); + + Topk::TopkVal* h1 = (Topk::TopkVal*) handle1; + Topk::TopkVal* h2 = (Topk::TopkVal*) handle2; + + h1->Merge(h2, true); + + return 0; + %} + diff --git a/testing/btest/Baseline/bifs.topk/.stderr b/testing/btest/Baseline/bifs.topk/.stderr index f2bd316fd8..80626107aa 100644 --- a/testing/btest/Baseline/bifs.topk/.stderr +++ b/testing/btest/Baseline/bifs.topk/.stderr @@ -6,5 +6,6 @@ error: getCount for element that is not in top-k error: getEpsilon for element that is not in top-k error: getCount for element that is not in top-k error: getEpsilon for element that is not in top-k +warning: TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count error: getCount for element that is not in top-k error: getEpsilon for element that is not in top-k diff --git a/testing/btest/Baseline/bifs.topk/out b/testing/btest/Baseline/bifs.topk/out index 8db55eeca8..1ce5c4b850 100644 --- a/testing/btest/Baseline/bifs.topk/out +++ b/testing/btest/Baseline/bifs.topk/out @@ -1,4 +1,5 @@ [b, c] +4 0 0 2 @@ -6,6 +7,7 @@ 2 1 [d, c] +5 0 0 2 @@ -13,21 +15,25 @@ 3 2 [d, e] +6 3 2 3 2 [f, e] +7 4 3 3 2 [f, e] +8 4 3 4 2 [g, e] +9 0 0 4 @@ -35,6 +41,7 @@ 5 4 [c, e, d] +19 6 0 5 @@ -49,9 +56,26 @@ 0 0 [c, e] +22 12 0 10 0 0 0 +[c, e] +19 +6 +0 +5 +0 +4 +0 +[c, e, d] +38 +12 +0 +10 +0 +8 +0 diff --git a/testing/btest/bifs/topk.bro b/testing/btest/bifs/topk.bro index 92a68999cc..02d13c4195 100644 --- a/testing/btest/bifs/topk.bro +++ b/testing/btest/bifs/topk.bro @@ -14,6 +14,7 @@ event bro_init() local s = topk_get_top(k1, 5); print s; + print topk_sum(k1); print topk_count(k1, "a"); print topk_epsilon(k1, "a"); print topk_count(k1, "b"); @@ -24,6 +25,7 @@ event bro_init() topk_add(k1, "d"); s = topk_get_top(k1, 5); print s; + print topk_sum(k1); print topk_count(k1, "b"); print topk_epsilon(k1, "b"); print topk_count(k1, 
"c"); @@ -34,6 +36,7 @@ event bro_init() topk_add(k1, "e"); s = topk_get_top(k1, 5); print s; + print topk_sum(k1); print topk_count(k1, "d"); print topk_epsilon(k1, "d"); print topk_count(k1, "e"); @@ -42,6 +45,7 @@ event bro_init() topk_add(k1, "f"); s = topk_get_top(k1, 5); print s; + print topk_sum(k1); print topk_count(k1, "f"); print topk_epsilon(k1, "f"); print topk_count(k1, "e"); @@ -50,6 +54,7 @@ event bro_init() topk_add(k1, "e"); s = topk_get_top(k1, 5); print s; + print topk_sum(k1); print topk_count(k1, "f"); print topk_epsilon(k1, "f"); print topk_count(k1, "e"); @@ -58,6 +63,7 @@ event bro_init() topk_add(k1, "g"); s = topk_get_top(k1, 5); print s; + print topk_sum(k1); print topk_count(k1, "f"); print topk_epsilon(k1, "f"); print topk_count(k1, "e"); @@ -87,6 +93,7 @@ event bro_init() topk_add(k1, "f"); s = topk_get_top(k1, 3); print s; + print topk_sum(k1); print topk_count(k1, "c"); print topk_epsilon(k1, "c"); print topk_count(k1, "e"); @@ -95,7 +102,7 @@ event bro_init() print topk_epsilon(k1, "d"); local k3 = topk_init(2); - topk_merge(k3, k1); + topk_merge_prune(k3, k1); s = topk_get_top(k3, 3); print s; @@ -106,10 +113,11 @@ event bro_init() print topk_count(k3, "d"); print topk_epsilon(k3, "d"); - topk_merge(k3, k1); + topk_merge_prune(k3, k1); s = topk_get_top(k3, 3); print s; + print topk_sum(k3); # this gives a warning and a wrong result. 
print topk_count(k3, "c"); print topk_epsilon(k3, "c"); print topk_count(k3, "e"); @@ -117,4 +125,30 @@ event bro_init() print topk_count(k3, "d"); print topk_epsilon(k3, "d"); + k3 = topk_init(2); + topk_merge(k3, k1); + print s; + print topk_sum(k3); + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + topk_merge(k3, k1); + + s = topk_get_top(k3, 3); + print s; + print topk_sum(k3); + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + + + } From c6e69ddc05cc6cdcaa8006e0d2fb2535a885f30b Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 1 May 2013 17:06:45 -0700 Subject: [PATCH 13/40] potentially found wrong Ref. --- src/Topk.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Topk.cc b/src/Topk.cc index 2527ecc4bc..9c0a04607d 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -274,7 +274,7 @@ VectorVal* TopkVal::getTopK(int k) const // returns vector } TypeList* vector_index = new TypeList(type); - vector_index->Append(type); + vector_index->Append(type->Ref()); VectorType* v = new VectorType(vector_index); VectorVal* t = new VectorVal(v); @@ -362,7 +362,7 @@ void TopkVal::Encountered(Val* encountered) //printf("NumElements: %d\n", numElements); // check type compatibility if ( numElements == 0 ) - type = encountered->Type()->Ref()->Ref(); + type = encountered->Type()->Ref(); else if ( !same_type(type, encountered->Type()) ) { From 075bfc5b3ded8b2f223805eaa48c4d04451868a8 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Thu, 2 May 2013 12:09:35 -0700 Subject: [PATCH 14/40] synchronize pruned attribute --- src/Topk.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Topk.cc b/src/Topk.cc index 9c0a04607d..2b84b389b0 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -178,6 
+178,7 @@ bool TopkVal::DoSerialize(SerialInfo* info) const v &= SERIALIZE(size); v &= SERIALIZE(numElements); + v &= SERIALIZE(pruned); bool type_present = (type != 0); v &= SERIALIZE(type_present); if ( type_present ) @@ -220,6 +221,7 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) v &= UNSERIALIZE(&size); v &= UNSERIALIZE(&numElements); + v &= UNSERIALIZE(&pruned); bool type_present = false; v &= UNSERIALIZE(&type_present); if ( type_present ) From cf6e768ad641988320a3379b5ead701ea09a0ff1 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Fri, 3 May 2013 23:08:26 -0700 Subject: [PATCH 15/40] fix opaqueval-related memleak --- src/NetVar.cc | 2 ++ src/NetVar.h | 1 + src/Topk.cc | 5 +++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/NetVar.cc b/src/NetVar.cc index 012e4a85bc..1a2e604e90 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -243,6 +243,7 @@ OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* topk_type; #include "const.bif.netvar_def" #include "types.bif.netvar_def" @@ -308,6 +309,7 @@ void init_general_global_var() sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + topk_type = new OpaqueType("topk"); } void init_net_var() diff --git a/src/NetVar.h b/src/NetVar.h index d7590b20e7..37ed3c7c85 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -248,6 +248,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* topk_type; // Initializes globals that don't pertain to network/event analysis. 
extern void init_general_global_var(); diff --git a/src/Topk.cc b/src/Topk.cc index 2b84b389b0..10374f3087 100644 --- a/src/Topk.cc +++ b/src/Topk.cc @@ -4,6 +4,7 @@ #include "CompHash.h" #include "Reporter.h" #include "Serializer.h" +#include "NetVar.h" namespace Topk { @@ -36,7 +37,7 @@ HashKey* TopkVal::GetHash(Val* v) const return key; } -TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(new OpaqueType("topk")) +TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(topk_type) { elementDict = new PDict(Element); elementDict->SetDeleteFunc(topk_element_hash_delete_func); @@ -46,7 +47,7 @@ TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(new OpaqueType("topk")) pruned = false; } -TopkVal::TopkVal() : OpaqueVal(new OpaqueType("topk")) +TopkVal::TopkVal() : OpaqueVal(topk_type) { elementDict = new PDict(Element); elementDict->SetDeleteFunc(topk_element_hash_delete_func); From e482897f885e2f1039b96782d5e4bc080d74a535 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 25 Jul 2013 15:16:53 +0200 Subject: [PATCH 16/40] Add docs and use default value for hasher names. --- src/probabilistic/Hasher.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 62c5d58d1f..d266565284 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -63,7 +63,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. TODO: What's this? + * Returns the hasher's name. If not empty, the hasher uses this descriptor + * to seed its *k* hash functions. Otherwise the hasher mixes in the initial + * seed derived from the environment variable `$BRO_SEED`. */ const std::string& Name() const { return name; } @@ -83,7 +85,7 @@ public: protected: Hasher(size_t k, const std::string& name); - private: +private: const size_t k; std::string name; }; @@ -166,7 +168,7 @@ public: * * @param name The name of the hasher. 
*/ - DefaultHasher(size_t k, const std::string& name); + DefaultHasher(size_t k, const std::string& name = ""); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -190,7 +192,7 @@ public: * * @param name The name of the hasher. */ - DoubleHasher(size_t k, const std::string& name); + DoubleHasher(size_t k, const std::string& name = ""); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; From 2fc5ca53ff8f90aa959b2bc65626b319a1dee529 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 25 Jul 2013 17:35:35 +0200 Subject: [PATCH 17/40] Make hashers serializable. There exists still a small bug that I could not find; the unit test istate/opaque.bro fails. If someone sees why, please chime in. --- src/SerialTypes.h | 6 ++ src/probabilistic/BloomFilter.cc | 19 +----- src/probabilistic/BloomFilter.h | 3 - src/probabilistic/Hasher.cc | 99 ++++++++++++++++++++++++++---- src/probabilistic/Hasher.h | 33 +++++----- src/probabilistic/bloom-filter.bif | 4 +- 6 files changed, 117 insertions(+), 47 deletions(-) diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 85aed10bda..9933d005f0 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -52,6 +52,7 @@ SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(BLOOMFILTER, 0x1700) +SERIAL_IS(HASHER, 0x1800) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -206,6 +207,11 @@ SERIAL_BLOOMFILTER(BLOOMFILTER, 1) SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) +#define SERIAL_HASHER(name, val) SERIAL_CONST(name, val, HASHER) +SERIAL_HASHER(HASHER, 1) +SERIAL_HASHER(DEFAULTHASHER, 2) +SERIAL_HASHER(DOUBLEHASHER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 7f769cbf7c..d446643ed3 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -38,28 +38,15 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hasher->K())) ) - return false; - - return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size()); + return hasher->Serialize(info); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - uint16 k; - if ( ! UNSERIALIZE(&k) ) - return false; - - const char* name; - if ( ! UNSERIALIZE_STR(&name, 0) ) - return false; - - hasher = Hasher::Create(k, name); - - delete [] name; - return true; + hasher = Hasher::Unserialize(info); + return hasher != 0; } size_t BasicBloomFilter::M(double fp, size_t capacity) diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index b6cf18672f..4865ae145c 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -13,9 +13,6 @@ class CounterVector; /** * The abstract base class for Bloom filters. - * - * At this point we won't let the user choose the hasher, but we might open - * up the interface in the future. 
*/ class BloomFilter : public SerialObj { public: diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index f9ce7bdd6b..7db363142d 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -4,9 +4,56 @@ #include "Hasher.h" #include "digest.h" +#include "Serializer.h" using namespace probabilistic; +bool Hasher::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +Hasher* Hasher::Unserialize(UnserialInfo* info) + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_HASHER)); + } + +bool Hasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_HASHER, SerialObj); + + if ( ! SERIALIZE(static_cast(k)) ) + return false; + + return SERIALIZE_STR(name.c_str(), name.size()); + } + +bool Hasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint16 serial_k; + if ( ! UNSERIALIZE(&serial_k) ) + return false; + k = serial_k; + assert(k > 0); + + const char* serial_name; + if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + return false; + name = serial_name; + delete [] serial_name; + + return true; + } + +Hasher::Hasher(size_t k, const std::string& arg_name) + : k(k) + { + name = arg_name; + } + + UHF::UHF(size_t seed, const std::string& extra) : h(compute_seed(seed, extra)) { @@ -40,17 +87,6 @@ size_t UHF::compute_seed(size_t seed, const std::string& extra) return *reinterpret_cast(buf); } -Hasher* Hasher::Create(size_t k, const std::string& name) - { - return new DefaultHasher(k, name); - } - -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) - { - name = arg_name; - } - DefaultHasher::DefaultHasher(size_t k, const std::string& name) : Hasher(k, name) { @@ -82,6 +118,27 @@ bool DefaultHasher::Equals(const Hasher* other) const return hash_functions == o->hash_functions; } +IMPLEMENT_SERIAL(DefaultHasher, SER_DEFAULTHASHER) + +bool DefaultHasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_DEFAULTHASHER, Hasher); + + // Nothing to do here, the 
base class has all we need serialized already. + return true; + } + +bool DefaultHasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(Hasher); + + hash_functions.clear(); + for ( size_t i = 0; i < K(); ++i ) + hash_functions.push_back(UHF(i, Name())); + + return true; + } + DoubleHasher::DoubleHasher(size_t k, const std::string& name) : Hasher(k, name), h1(1, name), h2(2, name) { @@ -112,3 +169,23 @@ bool DoubleHasher::Equals(const Hasher* other) const const DoubleHasher* o = static_cast(other); return h1 == o->h1 && h2 == o->h2; } + +IMPLEMENT_SERIAL(DoubleHasher, SER_DOUBLEHASHER) + +bool DoubleHasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_DOUBLEHASHER, Hasher); + + // Nothing to do here, the base class has all we need serialized already. + return true; + } + +bool DoubleHasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(Hasher); + + h1 = UHF(1, Name()); + h2 = UHF(2, Name()); + + return true; + } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index d266565284..7e6a8ba134 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -5,6 +5,7 @@ #include "Hash.h" #include "H3.h" +#include "SerialObj.h" namespace probabilistic { @@ -12,7 +13,7 @@ namespace probabilistic { * Abstract base class for hashers. A hasher creates a family of hash * functions to hash an element *k* times. */ -class Hasher { +class Hasher : public SerialObj { public: typedef hash_t digest; typedef std::vector digest_vector; @@ -69,24 +70,18 @@ public: */ const std::string& Name() const { return name; } - /** - * Constructs the hasher used by the implementation. This hardcodes a - * specific hashing policy. It exists only because the HashingPolicy - * class hierachy is not yet serializable. - * - * @param k The number of hash functions to apply. - * - * @param name The hasher's name. - * - * @return Returns a new hasher instance. 
- */ - static Hasher* Create(size_t k, const std::string& name); + bool Serialize(SerialInfo* info) const; + static Hasher* Unserialize(UnserialInfo* info); protected: + DECLARE_ABSTRACT_SERIAL(Hasher); + + Hasher() { } + Hasher(size_t k, const std::string& name); private: - const size_t k; + size_t k; std::string name; }; @@ -106,7 +101,7 @@ public: * seed to compute the seed for t to compute the seed NUL-terminated * string as additional seed. */ - UHF(size_t seed, const std::string& extra = ""); + UHF(size_t seed = 0, const std::string& extra = ""); template Hasher::digest operator()(const T& x) const @@ -175,7 +170,11 @@ public: virtual DefaultHasher* Clone() const /* final */; virtual bool Equals(const Hasher* other) const /* final */; + DECLARE_SERIAL(DefaultHasher); + private: + DefaultHasher() { } + std::vector hash_functions; }; @@ -199,7 +198,11 @@ public: virtual DoubleHasher* Clone() const /* final */; virtual bool Equals(const Hasher* other) const /* final */; + DECLARE_SERIAL(DoubleHasher); + private: + DoubleHasher() { } + UHF h1; UHF h2; }; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index dd21688fdd..f03e3d149b 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -40,7 +40,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} @@ -68,7 +68,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = Hasher::Create(k, name->CheckString()); + const Hasher* h = new DefaultHasher(k, name->CheckString()); uint16 width = 1; while ( max >>= 1 ) From d84f6e012ca726cd37fb1b1fe8131620ba2021a2 Mon Sep 17 00:00:00 2001 
From: Jon Siwek Date: Mon, 29 Jul 2013 15:41:34 -0500 Subject: [PATCH 18/40] Fix various documentation, mostly related to file analysis. - Fix examples/references in the file analysis how-to/usage doc. - Add Broxygen-generated docs for file analyzer plugins. - Break FTP::Info type declaration out in to its own file to get rid of some circular dependencies (between s/b/p/ftp/main and s/b/p/ftp/utils). --- doc/file-analysis.rst | 27 +++---- doc/index.rst | 1 + doc/scripts/CMakeLists.txt | 50 +++++++------ doc/scripts/DocSourcesList.cmake | 1 + scripts/base/frameworks/files/main.bro | 2 +- .../base/frameworks/packet-filter/main.bro | 2 +- scripts/base/init-bare.bro | 17 ++--- scripts/base/protocols/ftp/__load__.bro | 1 + scripts/base/protocols/ftp/files.bro | 1 + scripts/base/protocols/ftp/gridftp.bro | 1 + scripts/base/protocols/ftp/info.bro | 72 +++++++++++++++++++ scripts/base/protocols/ftp/main.bro | 67 +---------------- scripts/base/protocols/ftp/utils.bro | 5 +- .../policy/frameworks/packet-filter/shunt.bro | 4 +- scripts/policy/misc/load-balancing.bro | 12 ++-- .../tuning/defaults/packet-fragments.bro | 12 ++-- src/BroDoc.cc | 65 +++++++++++++++-- src/BroDoc.h | 6 ++ src/file_analysis/Manager.cc | 5 ++ src/file_analysis/Manager.h | 6 ++ src/main.cc | 1 + .../canonified_loaded_scripts.log | 7 +- 22 files changed, 227 insertions(+), 138 deletions(-) create mode 100644 scripts/base/protocols/ftp/info.bro diff --git a/doc/file-analysis.rst b/doc/file-analysis.rst index f312e06471..0a96a8efb7 100644 --- a/doc/file-analysis.rst +++ b/doc/file-analysis.rst @@ -82,9 +82,9 @@ attached, they start receiving the contents of the file as Bro extracts it from an ongoing network connection. What they do with the file contents is up to the particular file analyzer implementation, but they'll typically either report further information about the file via -events (e.g. :bro:see:`FileAnalysis::ANALYZER_MD5` will report the +events (e.g. 
:bro:see:`Files::ANALYZER_MD5` will report the file's MD5 checksum via :bro:see:`file_hash` once calculated) or they'll -have some side effect (e.g. :bro:see:`FileAnalysis::ANALYZER_EXTRACT` +have some side effect (e.g. :bro:see:`Files::ANALYZER_EXTRACT` will write the contents of the file out to the local file system). In the future there may be file analyzers that automatically attach to @@ -98,7 +98,7 @@ explicit attachment decision: { print "new file", f$id; if ( f?$mime_type && f$mime_type == "text/plain" ) - FileAnalysis::add_analyzer(f, [$tag=FileAnalysis::ANALYZER_MD5]); + Files::add_analyzer(f, Files::ANALYZER_MD5); } event file_hash(f: fa_file, kind: string, hash: string) @@ -113,26 +113,27 @@ output:: file_hash, Cx92a0ym5R8, md5, 397168fd09991a0e712254df7bc639ac Some file analyzers might have tunable parameters that need to be -specified in the call to :bro:see:`FileAnalysis::add_analyzer`: +specified in the call to :bro:see:`Files::add_analyzer`: .. code:: bro event file_new(f: fa_file) { - FileAnalysis::add_analyzer(f, [$tag=FileAnalysis::ANALYZER_EXTRACT, - $extract_filename="./myfile"]); + Files::add_analyzer(f, Files::ANALYZER_EXTRACT, + [$extract_filename="myfile"]); } In this case, the file extraction analyzer doesn't generate any further -events, but does have the side effect of writing out the file contents -to the local file system at the specified location of ``./myfile``. Of -course, for a network with more than a single file being transferred, -it's probably preferable to specify a different extraction path for each -file, unlike this example. +events, but does have the effect of writing out the file contents to the +local file system at the location resulting from the concatenation of +the path specified by :bro:see:`FileExtract::prefix` and the string, +``myfile``. Of course, for a network with more than a single file being +transferred, it's probably preferable to specify a different extraction +path for each file, unlike this example. 
Regardless of which file analyzers end up acting on a file, general information about the file (e.g. size, time of last data transferred, -MIME type, etc.) are logged in ``file_analysis.log``. +MIME type, etc.) are logged in ``files.log``. Input Framework Integration =========================== @@ -150,7 +151,7 @@ a network interface it's monitoring. It only requires a call to event file_new(f: fa_file) { print "new file", f$id; - FileAnalysis::add_analyzer(f, [$tag=FileAnalysis::ANALYZER_MD5]); + Files::add_analyzer(f, Files::ANALYZER_MD5); } event file_state_remove(f: fa_file) diff --git a/doc/index.rst b/doc/index.rst index ad05f7bf82..aa33d8797d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -47,6 +47,7 @@ Script Reference scripts/index scripts/builtins scripts/proto-analyzers + scripts/file-analyzers Other Bro Components -------------------- diff --git a/doc/scripts/CMakeLists.txt b/doc/scripts/CMakeLists.txt index e7e39d0b3f..fa234e74f2 100644 --- a/doc/scripts/CMakeLists.txt +++ b/doc/scripts/CMakeLists.txt @@ -124,28 +124,34 @@ endmacro(REST_TARGET) # Schedule Bro scripts for which to generate documentation. include(DocSourcesList.cmake) -# This reST target is independent of a particular Bro script... 
-add_custom_command(OUTPUT proto-analyzers.rst - # delete any leftover state from previous bro runs - COMMAND "${CMAKE_COMMAND}" - ARGS -E remove_directory .state - # generate the reST documentation using bro - COMMAND BROPATH=${BROPATH}:${srcDir} BROMAGIC=${CMAKE_SOURCE_DIR}/magic/database ${CMAKE_BINARY_DIR}/src/bro - ARGS -b -Z base/init-bare.bro || (rm -rf .state *.log *.rst && exit 1) - # move generated doc into a new directory tree that - # defines the final structure of documents - COMMAND "${CMAKE_COMMAND}" - ARGS -E make_directory ${dstDir} - COMMAND "${CMAKE_COMMAND}" - ARGS -E copy proto-analyzers.rst ${dstDir} - # clean up the build directory - COMMAND rm - ARGS -rf .state *.log *.rst - DEPENDS bro - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - COMMENT "[Bro] Generating reST docs for proto-analyzers.rst" -) -list(APPEND ALL_REST_OUTPUTS proto-analyzers.rst) +# Macro for generating reST docs that are independent of any particular Bro +# script. +macro(INDEPENDENT_REST_TARGET reST_file) + add_custom_command(OUTPUT ${reST_file} + # delete any leftover state from previous bro runs + COMMAND "${CMAKE_COMMAND}" + ARGS -E remove_directory .state + # generate the reST documentation using bro + COMMAND BROPATH=${BROPATH}:${srcDir} BROMAGIC=${CMAKE_SOURCE_DIR}/magic/database ${CMAKE_BINARY_DIR}/src/bro + ARGS -b -Z base/init-bare.bro || (rm -rf .state *.log *.rst && exit 1) + # move generated doc into a new directory tree that + # defines the final structure of documents + COMMAND "${CMAKE_COMMAND}" + ARGS -E make_directory ${dstDir} + COMMAND "${CMAKE_COMMAND}" + ARGS -E copy ${reST_file} ${dstDir} + # clean up the build directory + COMMAND rm + ARGS -rf .state *.log *.rst + DEPENDS bro + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "[Bro] Generating reST docs for ${reST_file}" + ) + list(APPEND ALL_REST_OUTPUTS ${reST_file}) +endmacro(INDEPENDENT_REST_TARGET) + +independent_rest_target(proto-analyzers.rst) +independent_rest_target(file-analyzers.rst) # create 
temporary list of all docs to include in the master policy/index file file(WRITE ${MASTER_POLICY_INDEX} "${MASTER_POLICY_INDEX_TEXT}") diff --git a/doc/scripts/DocSourcesList.cmake b/doc/scripts/DocSourcesList.cmake index b2c932d117..d61db99db1 100644 --- a/doc/scripts/DocSourcesList.cmake +++ b/doc/scripts/DocSourcesList.cmake @@ -140,6 +140,7 @@ rest_target(${psd} base/protocols/dns/consts.bro) rest_target(${psd} base/protocols/dns/main.bro) rest_target(${psd} base/protocols/ftp/files.bro) rest_target(${psd} base/protocols/ftp/gridftp.bro) +rest_target(${psd} base/protocols/ftp/info.bro) rest_target(${psd} base/protocols/ftp/main.bro) rest_target(${psd} base/protocols/ftp/utils-commands.bro) rest_target(${psd} base/protocols/ftp/utils.bro) diff --git a/scripts/base/frameworks/files/main.bro b/scripts/base/frameworks/files/main.bro index d0c381545b..a87608054d 100644 --- a/scripts/base/frameworks/files/main.bro +++ b/scripts/base/frameworks/files/main.bro @@ -204,7 +204,7 @@ export { ## ## tag: Tag for the protocol analyzer having a callback being registered. ## - ## reg: A :bro:see:`ProtoRegistration` record. + ## reg: A :bro:see:`Files::ProtoRegistration` record. ## ## Returns: true if the protocol being registered was not previously registered. global register_protocol: function(tag: Analyzer::Tag, reg: ProtoRegistration): bool; diff --git a/scripts/base/frameworks/packet-filter/main.bro b/scripts/base/frameworks/packet-filter/main.bro index 72b2b62f34..929b10fbe1 100644 --- a/scripts/base/frameworks/packet-filter/main.bro +++ b/scripts/base/frameworks/packet-filter/main.bro @@ -109,7 +109,7 @@ export { ## Enables the old filtering approach of "only watch common ports for ## analyzed protocols". - ## + ## ## Unless you know what you are doing, leave this set to F. 
const enable_auto_protocol_capture_filters = F &redef; diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 594991c85a..92b806092c 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -531,22 +531,19 @@ type record_field_table: table[string] of record_field; # dependent on the names remaining as they are now. ## Set of BPF capture filters to use for capturing, indexed by a user-definable -## ID (which must be unique). If Bro is *not* configured to examine -## :bro:id:`PacketFilter::all_packets`, all packets matching at least -## one of the filters in this table (and all in :bro:id:`restrict_filters`) -## will be analyzed. +## ID (which must be unique). If Bro is *not* configured with +## :bro:id:`PacketFilter::enable_auto_protocol_capture_filters`, +## all packets matching at least one of the filters in this table (and all in +## :bro:id:`restrict_filters`) will be analyzed. ## -## .. bro:see:: PacketFilter PacketFilter::all_packets +## .. bro:see:: PacketFilter PacketFilter::enable_auto_protocol_capture_filters ## PacketFilter::unrestricted_filter restrict_filters global capture_filters: table[string] of string &redef; ## Set of BPF filters to restrict capturing, indexed by a user-definable ID (which -## must be unique). If Bro is *not* configured to examine -## :bro:id:`PacketFilter::all_packets`, only packets matching *all* of the -## filters in this table (and any in :bro:id:`capture_filters`) will be -## analyzed. +## must be unique). ## -## .. bro:see:: PacketFilter PacketFilter::all_packets +## .. 
bro:see:: PacketFilter PacketFilter::enable_auto_protocol_capture_filters ## PacketFilter::unrestricted_filter capture_filters global restrict_filters: table[string] of string &redef; diff --git a/scripts/base/protocols/ftp/__load__.bro b/scripts/base/protocols/ftp/__load__.bro index ebb09e702c..3ddd8a2dc2 100644 --- a/scripts/base/protocols/ftp/__load__.bro +++ b/scripts/base/protocols/ftp/__load__.bro @@ -1,4 +1,5 @@ @load ./utils-commands +@load ./info @load ./main @load ./utils @load ./files diff --git a/scripts/base/protocols/ftp/files.bro b/scripts/base/protocols/ftp/files.bro index 9ed17ab2a4..b507ca32a7 100644 --- a/scripts/base/protocols/ftp/files.bro +++ b/scripts/base/protocols/ftp/files.bro @@ -1,3 +1,4 @@ +@load ./info @load ./main @load ./utils @load base/utils/conn-ids diff --git a/scripts/base/protocols/ftp/gridftp.bro b/scripts/base/protocols/ftp/gridftp.bro index 57752b1cbd..73bd656544 100644 --- a/scripts/base/protocols/ftp/gridftp.bro +++ b/scripts/base/protocols/ftp/gridftp.bro @@ -19,6 +19,7 @@ ##! sizes are not logged, but at the benefit of saving CPU cycles that ##! otherwise go to analyzing the large (and likely benign) connections. +@load ./info @load ./main @load base/protocols/conn @load base/protocols/ssl diff --git a/scripts/base/protocols/ftp/info.bro b/scripts/base/protocols/ftp/info.bro new file mode 100644 index 0000000000..f6fceb071e --- /dev/null +++ b/scripts/base/protocols/ftp/info.bro @@ -0,0 +1,72 @@ +##! Defines data structures for tracking and logging FTP sessions. + +module FTP; + +@load ./utils-commands + +export { + + ## This setting changes if passwords used in FTP sessions are + ## captured or not. + const default_capture_password = F &redef; + + ## The expected endpoints of an FTP data channel. + type ExpectedDataChannel: record { + ## Whether PASV mode is toggled for control channel. + passive: bool &log; + ## The host that will be initiating the data connection. 
+ orig_h: addr &log; + ## The host that will be accepting the data connection. + resp_h: addr &log; + ## The port at which the acceptor is listening for the data connection. + resp_p: port &log; + }; + + type Info: record { + ## Time when the command was sent. + ts: time &log; + ## Unique ID for the connection. + uid: string &log; + ## The connection's 4-tuple of endpoint addresses/ports. + id: conn_id &log; + ## User name for the current FTP session. + user: string &log &default=""; + ## Password for the current FTP session if captured. + password: string &log &optional; + ## Command given by the client. + command: string &log &optional; + ## Argument for the command if one is given. + arg: string &log &optional; + + ## Libmagic "sniffed" file type if the command indicates a file transfer. + mime_type: string &log &optional; + ## Size of the file if the command indicates a file transfer. + file_size: count &log &optional; + + ## Reply code from the server in response to the command. + reply_code: count &log &optional; + ## Reply message from the server in response to the command. + reply_msg: string &log &optional; + + ## Expected FTP data channel. + data_channel: ExpectedDataChannel &log &optional; + + ## Current working directory that this session is in. By making + ## the default value '.', we can indicate that unless something + ## more concrete is discovered that the existing but unknown + ## directory is ok to use. + cwd: string &default="."; + + ## Command that is currently waiting for a response. + cmdarg: CmdArg &optional; + ## Queue for commands that have been sent but not yet responded to + ## are tracked here. + pending_commands: PendingCmds; + + ## Indicates if the session is in active or passive mode. + passive: bool &default=F; + + ## Determines if the password will be captured for this request. 
+ capture_password: bool &default=default_capture_password; + }; +} diff --git a/scripts/base/protocols/ftp/main.bro b/scripts/base/protocols/ftp/main.bro index c9549a14ec..df66235d49 100644 --- a/scripts/base/protocols/ftp/main.bro +++ b/scripts/base/protocols/ftp/main.bro @@ -3,6 +3,8 @@ ##! will take on the full path that the client is at along with the requested ##! file name. +@load ./info +@load ./utils @load ./utils-commands @load base/utils/paths @load base/utils/numbers @@ -20,72 +22,9 @@ export { "EPSV" } &redef; - ## This setting changes if passwords used in FTP sessions are captured or not. - const default_capture_password = F &redef; - ## User IDs that can be considered "anonymous". const guest_ids = { "anonymous", "ftp", "ftpuser", "guest" } &redef; - ## The expected endpoints of an FTP data channel. - type ExpectedDataChannel: record { - ## Whether PASV mode is toggled for control channel. - passive: bool &log; - ## The host that will be initiating the data connection. - orig_h: addr &log; - ## The host that will be accepting the data connection. - resp_h: addr &log; - ## The port at which the acceptor is listening for the data connection. - resp_p: port &log; - }; - - type Info: record { - ## Time when the command was sent. - ts: time &log; - ## Unique ID for the connection. - uid: string &log; - ## The connection's 4-tuple of endpoint addresses/ports. - id: conn_id &log; - ## User name for the current FTP session. - user: string &log &default=""; - ## Password for the current FTP session if captured. - password: string &log &optional; - ## Command given by the client. - command: string &log &optional; - ## Argument for the command if one is given. - arg: string &log &optional; - - ## Libmagic "sniffed" file type if the command indicates a file transfer. - mime_type: string &log &optional; - ## Size of the file if the command indicates a file transfer. - file_size: count &log &optional; - - ## Reply code from the server in response to the command. 
- reply_code: count &log &optional; - ## Reply message from the server in response to the command. - reply_msg: string &log &optional; - - ## Expected FTP data channel. - data_channel: ExpectedDataChannel &log &optional; - - ## Current working directory that this session is in. By making - ## the default value '.', we can indicate that unless something - ## more concrete is discovered that the existing but unknown - ## directory is ok to use. - cwd: string &default="."; - - ## Command that is currently waiting for a response. - cmdarg: CmdArg &optional; - ## Queue for commands that have been sent but not yet responded to - ## are tracked here. - pending_commands: PendingCmds; - - ## Indicates if the session is in active or passive mode. - passive: bool &default=F; - - ## Determines if the password will be captured for this request. - capture_password: bool &default=default_capture_password; - }; - ## This record is to hold a parsed FTP reply code. For example, for the ## 201 status code, the digits would be parsed as: x->2, y->0, z=>1. type ReplyCode: record { @@ -102,8 +41,6 @@ export { global log_ftp: event(rec: Info); } -@load ./utils - # Add the state tracking information variable to the connection record redef record connection += { ftp: Info &optional; diff --git a/scripts/base/protocols/ftp/utils.bro b/scripts/base/protocols/ftp/utils.bro index 629b87e5a8..a0b473e086 100644 --- a/scripts/base/protocols/ftp/utils.bro +++ b/scripts/base/protocols/ftp/utils.bro @@ -1,7 +1,8 @@ ##! Utilities specific for FTP processing. 
-@load ./main +@load ./info @load base/utils/addrs +@load base/utils/paths module FTP; @@ -44,4 +45,4 @@ function build_url_ftp(rec: Info): string function describe(rec: Info): string { return build_url_ftp(rec); - } \ No newline at end of file + } diff --git a/scripts/policy/frameworks/packet-filter/shunt.bro b/scripts/policy/frameworks/packet-filter/shunt.bro index b87369ee62..85ec189a17 100644 --- a/scripts/policy/frameworks/packet-filter/shunt.bro +++ b/scripts/policy/frameworks/packet-filter/shunt.bro @@ -34,8 +34,8 @@ export { global current_shunted_host_pairs: function(): set[conn_id]; redef enum Notice::Type += { - ## Indicative that :bro:id:`max_bpf_shunts` connections are already - ## being shunted with BPF filters and no more are allowed. + ## Indicative that :bro:id:`PacketFilter::max_bpf_shunts` connections + ## are already being shunted with BPF filters and no more are allowed. No_More_Conn_Shunts_Available, ## Limitations in BPF make shunting some connections with BPF impossible. diff --git a/scripts/policy/misc/load-balancing.bro b/scripts/policy/misc/load-balancing.bro index fe07dd64da..889d18119a 100644 --- a/scripts/policy/misc/load-balancing.bro +++ b/scripts/policy/misc/load-balancing.bro @@ -12,12 +12,12 @@ export { ## Apply BPF filters to each worker in a way that causes them to ## automatically flow balance traffic between them. AUTO_BPF, - ## Load balance traffic across the workers by making each one apply - ## a restrict filter to only listen to a single MAC address. This - ## is a somewhat common deployment option for sites doing network - ## based load balancing with MAC address rewriting and passing the - ## traffic to a single interface. Multiple MAC addresses will show - ## up on the same interface and need filtered to a single address. + # Load balance traffic across the workers by making each one apply + # a restrict filter to only listen to a single MAC address. 
This + # is a somewhat common deployment option for sites doing network + # based load balancing with MAC address rewriting and passing the + # traffic to a single interface. Multiple MAC addresses will show + # up on the same interface and need filtered to a single address. #MAC_ADDR_BPF, }; diff --git a/scripts/policy/tuning/defaults/packet-fragments.bro b/scripts/policy/tuning/defaults/packet-fragments.bro index 24b18d5917..f95c826547 100644 --- a/scripts/policy/tuning/defaults/packet-fragments.bro +++ b/scripts/policy/tuning/defaults/packet-fragments.bro @@ -1,10 +1,10 @@ -## Capture TCP fragments, but not UDP (or ICMP), since those are a lot more -## common due to high-volume, fragmenting protocols such as NFS :-(. +# Capture TCP fragments, but not UDP (or ICMP), since those are a lot more +# common due to high-volume, fragmenting protocols such as NFS :-(. -## This normally isn't used because of the default open packet filter -## but we set it anyway in case the user is using a packet filter. -## Note: This was removed because the default model now is to have a wide -## open packet filter. +# This normally isn't used because of the default open packet filter +# but we set it anyway in case the user is using a packet filter. +# Note: This was removed because the default model now is to have a wide +# open packet filter. 
#redef capture_filters += { ["frag"] = "(ip[6:2] & 0x3fff != 0) and tcp" }; ## Shorten the fragment timeout from never expiring to expiring fragments after diff --git a/src/BroDoc.cc b/src/BroDoc.cc index c04cd92eca..3cb271bdbf 100644 --- a/src/BroDoc.cc +++ b/src/BroDoc.cc @@ -11,6 +11,7 @@ #include "plugin/Manager.h" #include "analyzer/Manager.h" #include "analyzer/Component.h" +#include "file_analysis/Manager.h" BroDoc::BroDoc(const std::string& rel, const std::string& abs) { @@ -479,6 +480,17 @@ static void WriteAnalyzerComponent(FILE* f, const analyzer::Component* c) fprintf(f, ":bro:enum:`Analyzer::%s`\n\n", tag.c_str()); } +static void WriteAnalyzerComponent(FILE* f, const file_analysis::Component* c) + { + EnumType* atag = file_mgr->GetTagEnumType(); + string tag = fmt("ANALYZER_%s", c->CanonicalName()); + + if ( atag->Lookup("Files", tag.c_str()) < 0 ) + reporter->InternalError("missing analyzer tag for %s", tag.c_str()); + + fprintf(f, ":bro:enum:`Files::%s`\n\n", tag.c_str()); + } + static void WritePluginComponents(FILE* f, const plugin::Plugin* p) { plugin::Plugin::component_list components = p->Components(); @@ -494,6 +506,10 @@ static void WritePluginComponents(FILE* f, const plugin::Plugin* p) WriteAnalyzerComponent(f, dynamic_cast(*it)); break; + case plugin::component::FILE_ANALYZER: + WriteAnalyzerComponent(f, + dynamic_cast(*it)); + break; case plugin::component::READER: reporter->InternalError("docs for READER component unimplemented"); case plugin::component::WRITER: @@ -537,12 +553,13 @@ static void WritePluginBifItems(FILE* f, const plugin::Plugin* p, } } -static void WriteAnalyzerTagDefn(FILE* f, EnumType* e) +static void WriteAnalyzerTagDefn(FILE* f, EnumType* e, const string& module) { + string tag_id= module + "::Tag"; e = new CommentedEnumType(e); - e->SetTypeID(copy_string("Analyzer::Tag")); + e->SetTypeID(copy_string(tag_id.c_str())); - ID* dummy_id = new ID(copy_string("Analyzer::Tag"), SCOPE_GLOBAL, true); + ID* dummy_id = new 
ID(copy_string(tag_id.c_str()), SCOPE_GLOBAL, true); dummy_id->SetType(e); dummy_id->MakeType(); @@ -554,13 +571,17 @@ static void WriteAnalyzerTagDefn(FILE* f, EnumType* e) bdo.WriteReST(f); } -static bool IsAnalyzerPlugin(const plugin::Plugin* p) +static bool ComponentsMatch(const plugin::Plugin* p, plugin::component::Type t, + bool match_empty = false) { plugin::Plugin::component_list components = p->Components(); plugin::Plugin::component_list::const_iterator it; + if ( components.empty() ) + return match_empty; + for ( it = components.begin(); it != components.end(); ++it ) - if ( (*it)->Type() != plugin::component::ANALYZER ) + if ( (*it)->Type() != t ) return false; return true; @@ -573,14 +594,44 @@ void CreateProtoAnalyzerDoc(const char* filename) fprintf(f, "Protocol Analyzer Reference\n"); fprintf(f, "===========================\n\n"); - WriteAnalyzerTagDefn(f, analyzer_mgr->GetTagEnumType()); + WriteAnalyzerTagDefn(f, analyzer_mgr->GetTagEnumType(), "Analyzer"); plugin::Manager::plugin_list plugins = plugin_mgr->Plugins(); plugin::Manager::plugin_list::const_iterator it; for ( it = plugins.begin(); it != plugins.end(); ++it ) { - if ( ! IsAnalyzerPlugin(*it) ) + if ( ! 
ComponentsMatch(*it, plugin::component::ANALYZER, true) ) + continue; + + WritePluginSectionHeading(f, *it); + WritePluginComponents(f, *it); + WritePluginBifItems(f, *it, plugin::BifItem::CONSTANT, + "Options/Constants"); + WritePluginBifItems(f, *it, plugin::BifItem::GLOBAL, "Globals"); + WritePluginBifItems(f, *it, plugin::BifItem::TYPE, "Types"); + WritePluginBifItems(f, *it, plugin::BifItem::EVENT, "Events"); + WritePluginBifItems(f, *it, plugin::BifItem::FUNCTION, "Functions"); + } + + fclose(f); + } + +void CreateFileAnalyzerDoc(const char* filename) + { + FILE* f = fopen(filename, "w"); + + fprintf(f, "File Analyzer Reference\n"); + fprintf(f, "===========================\n\n"); + + WriteAnalyzerTagDefn(f, file_mgr->GetTagEnumType(), "Files"); + + plugin::Manager::plugin_list plugins = plugin_mgr->Plugins(); + plugin::Manager::plugin_list::const_iterator it; + + for ( it = plugins.begin(); it != plugins.end(); ++it ) + { + if ( ! ComponentsMatch(*it, plugin::component::FILE_ANALYZER) ) continue; WritePluginSectionHeading(f, *it); diff --git a/src/BroDoc.h b/src/BroDoc.h index 9f92f821f8..081df698d9 100644 --- a/src/BroDoc.h +++ b/src/BroDoc.h @@ -413,4 +413,10 @@ private: */ void CreateProtoAnalyzerDoc(const char* filename); +/** + * Writes out plugin index documentation for all file analyzer plugins. + * @param filename the name of the file to write. 
+ */ +void CreateFileAnalyzerDoc(const char* filename); + #endif diff --git a/src/file_analysis/Manager.cc b/src/file_analysis/Manager.cc index 4e25bb0b0e..fb74a409b4 100644 --- a/src/file_analysis/Manager.cc +++ b/src/file_analysis/Manager.cc @@ -394,3 +394,8 @@ const char* Manager::GetAnalyzerName(int tag) const return it->second->CanonicalName(); } + +EnumType* Manager::GetTagEnumType() + { + return tag_enum_type; + } diff --git a/src/file_analysis/Manager.h b/src/file_analysis/Manager.h index 84b606173d..a93e78c638 100644 --- a/src/file_analysis/Manager.h +++ b/src/file_analysis/Manager.h @@ -214,6 +214,12 @@ public: */ const char* GetAnalyzerName(int tag) const; + /** + * Returns the enum type that corresponds to the script-level type + * \c Files::Tag. + */ + EnumType* GetTagEnumType(); + protected: friend class FileTimer; diff --git a/src/main.cc b/src/main.cc index 56193a935b..6a58832964 100644 --- a/src/main.cc +++ b/src/main.cc @@ -872,6 +872,7 @@ int main(int argc, char** argv) if ( generate_documentation ) { CreateProtoAnalyzerDoc("proto-analyzers.rst"); + CreateFileAnalyzerDoc("file-analyzers.rst"); std::list::iterator it; diff --git a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index f67d4b6158..c34e2e2e87 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-23-05-48-10 +#open 2013-07-29-20-08-38 #fields name #types string scripts/base/init-bare.bro @@ -156,8 +156,9 @@ scripts/base/init-default.bro scripts/base/protocols/dns/main.bro scripts/base/protocols/ftp/__load__.bro scripts/base/protocols/ftp/utils-commands.bro + scripts/base/protocols/ftp/info.bro scripts/base/protocols/ftp/main.bro - 
scripts/base/protocols/ftp/utils.bro + scripts/base/protocols/ftp/utils.bro scripts/base/protocols/ftp/files.bro scripts/base/protocols/ftp/gridftp.bro scripts/base/protocols/ssl/__load__.bro @@ -196,4 +197,4 @@ scripts/base/init-default.bro scripts/base/files/extract/main.bro scripts/base/misc/find-checksum-offloading.bro scripts/policy/misc/loaded-scripts.bro -#close 2013-07-23-05-48-10 +#close 2013-07-29-20-08-38 From 43825212db25ce540c6a12905844d246f8784c05 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 30 Jul 2013 12:17:53 +0200 Subject: [PATCH 19/40] Update submodules. --- aux/binpac | 2 +- aux/bro-aux | 2 +- aux/broccoli | 2 +- aux/broctl | 2 +- cmake | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aux/binpac b/aux/binpac index c39bd478b9..314fa8f65f 160000 --- a/aux/binpac +++ b/aux/binpac @@ -1 +1 @@ -Subproject commit c39bd478b9d0ecd05b1b83aa9d09a7887893977c +Subproject commit 314fa8f65fc240e960c23c3bba98623436a72b98 diff --git a/aux/bro-aux b/aux/bro-aux index a9942558c7..91d258cc8b 160000 --- a/aux/bro-aux +++ b/aux/bro-aux @@ -1 +1 @@ -Subproject commit a9942558c7d3dfd80148b8aaded64c82ade3d117 +Subproject commit 91d258cc8b2f74cd02fc93dfe61f73ec9f0dd489 diff --git a/aux/broccoli b/aux/broccoli index 889f9c6594..d59c73b6e0 160000 --- a/aux/broccoli +++ b/aux/broccoli @@ -1 +1 @@ -Subproject commit 889f9c65944ceac20ad9230efc39d33e6e1221c3 +Subproject commit d59c73b6e0966ad63bbc63a35741b5f68263e7b1 diff --git a/aux/broctl b/aux/broctl index 0cd102805e..52fd91261f 160000 --- a/aux/broctl +++ b/aux/broctl @@ -1 +1 @@ -Subproject commit 0cd102805e73343cab3f9fd4a76552e13940dad9 +Subproject commit 52fd91261f41fa1528f7b964837a364d7991889e diff --git a/cmake b/cmake index 0187b33a29..026639f836 160000 --- a/cmake +++ b/cmake @@ -1 +1 @@ -Subproject commit 0187b33a29d5ec824f940feff60dc5d8c2fe314f +Subproject commit 026639f8368e56742c0cb5d9fb390ea64e60ec50 From af9e181731b82167187b7a9ec8995b991920c0e1 Mon Sep 17 00:00:00 2001 
From: Robin Sommer Date: Tue, 30 Jul 2013 10:29:27 -0700 Subject: [PATCH 20/40] Updating submodule(s). [nomail] --- magic | 1 + 1 file changed, 1 insertion(+) create mode 160000 magic diff --git a/magic b/magic new file mode 160000 index 0000000000..e87fe13a7b --- /dev/null +++ b/magic @@ -0,0 +1 @@ +Subproject commit e87fe13a7b776182ffc8c75076d42702f5c28fed From 8df4df0b8b7c8760d830c0f99e26e8f4db66967a Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Tue, 30 Jul 2013 15:19:48 -0500 Subject: [PATCH 21/40] Add a distinct tag class for file analyzers. This should prevent assignment mismatches between file and protocol analyzer tags. --- src/CMakeLists.txt | 1 + src/Tag.cc | 82 ++++++++++++++++++ src/Tag.h | 138 +++++++++++++++++++++++++++++++ src/analyzer/Component.cc | 2 +- src/analyzer/Manager.cc | 2 +- src/analyzer/Tag.cc | 84 ++----------------- src/analyzer/Tag.h | 93 ++++++--------------- src/file_analysis/CMakeLists.txt | 1 + src/file_analysis/Component.cc | 8 +- src/file_analysis/Component.h | 16 ++-- src/file_analysis/Manager.h | 2 +- src/file_analysis/Tag.cc | 24 ++++++ src/file_analysis/Tag.h | 115 ++++++++++++++++++++++++++ 13 files changed, 407 insertions(+), 161 deletions(-) create mode 100644 src/Tag.cc create mode 100644 src/Tag.h create mode 100644 src/file_analysis/Tag.cc create mode 100644 src/file_analysis/Tag.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e353dd4695..082f34fba1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -315,6 +315,7 @@ set(bro_SRCS StateAccess.cc Stats.cc Stmt.cc + Tag.cc Timer.cc Traverse.cc Trigger.cc diff --git a/src/Tag.cc b/src/Tag.cc new file mode 100644 index 0000000000..178edaa71e --- /dev/null +++ b/src/Tag.cc @@ -0,0 +1,82 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#include "Tag.h" +#include "Val.h" + +Tag::Tag(EnumType* etype, type_t arg_type, subtype_t arg_subtype) + { + assert(arg_type > 0); + + type = arg_type; + subtype = arg_subtype; + int64_t i = (int64)(type) | ((int64)subtype << 31); + Ref(etype); + val = new EnumVal(i, etype); + } + +Tag::Tag(EnumVal* arg_val) + { + assert(arg_val); + + val = arg_val; + Ref(val); + + int64 i = val->InternalInt(); + type = i & 0xffffffff; + subtype = (i >> 31) & 0xffffffff; + } + +Tag::Tag(const Tag& other) + { + type = other.type; + subtype = other.subtype; + val = other.val; + + if ( val ) + Ref(val); + } + +Tag::Tag() + { + type = 0; + subtype = 0; + val = 0; + } + +Tag::~Tag() + { + Unref(val); + val = 0; + } + +Tag& Tag::operator=(const Tag& other) + { + if ( this != &other ) + { + type = other.type; + subtype = other.subtype; + val = other.val; + + if ( val ) + Ref(val); + } + + return *this; + } + +EnumVal* Tag::AsEnumVal(EnumType* etype) const + { + if ( ! val ) + { + assert(type == 0 && subtype == 0); + Ref(etype); + val = new EnumVal(0, etype); + } + + return val; + } + +std::string Tag::AsString() const + { + return fmt("%" PRIu32 "/%" PRIu32, type, subtype); + } diff --git a/src/Tag.h b/src/Tag.h new file mode 100644 index 0000000000..a0c218019e --- /dev/null +++ b/src/Tag.h @@ -0,0 +1,138 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef TAG_H +#define TAG_H + +#include "config.h" +#include "util.h" +#include "Type.h" + +class EnumVal; + +/** + * Class to identify an analyzer type. + * + * Each analyzer type gets a tag consisting of a main type and subtype. The + * former is an identifier that's unique all analyzer classes. The latter is + * passed through to the analyzer instances for their use, yet not further + * interpreted by the analyzer infrastructure; it allows an analyzer to + * branch out into a set of sub-analyzers internally. Jointly, main type and + * subtype form an analyzer "tag". 
Each unique tag corresponds to a single + * "analyzer" from the user's perspective. At the script layer, these tags + * are mapped into enums of type \c Analyzer::Tag or Files::Tag. Internally, + * the analyzer::Manager and file_analysis::Manager maintain the mapping of tag + * to analyzer (and it also assigns them their main types), and + * analyzer::Component and file_analysis::Component create new tag. + * + * The Tag class supports all operations necessary to act as an index in a + * \c std::map. + */ +class Tag { +public: + /** + * Type for the analyzer's main type. + */ + typedef uint32 type_t; + + /** + * Type for the analyzer's subtype. + */ + typedef uint32 subtype_t; + + /** + * Returns the tag's main type. + */ + type_t Type() const { return type; } + + /** + * Returns the tag's subtype. + */ + subtype_t Subtype() const { return subtype; } + + /** + * Returns the numerical values for main and subtype inside a string + * suitable for printing. This is primarily for debugging. + */ + std::string AsString() const; + +protected: + /* + * Copy constructor. + */ + Tag(const Tag& other); + + /** + * Default constructor. This initializes the tag with an error value + * that will make \c operator \c bool return false. + */ + Tag(); + + /** + * Destructor. + */ + ~Tag(); + + /** + * Assignment operator. + */ + Tag& operator=(const Tag& other); + + /** + * Compares two tags for equality. + */ + bool operator==(const Tag& other) const + { + return type == other.type && subtype == other.subtype; + } + + /** + * Compares two tags for inequality. + */ + bool operator!=(const Tag& other) const + { + return type != other.type || subtype != other.subtype; + } + + /** + * Compares two tags for less-than relationship. + */ + bool operator<(const Tag& other) const + { + return type != other.type ? type < other.type : (subtype < other.subtype); + } + + /** + * Returns the script-layer enum that corresponds to this tag. 
+ * The returned value does not have its ref-count increased. + * + * @param etype the script-layer enum type associated with the tag. + */ + EnumVal* AsEnumVal(EnumType* etype) const; + + /** + * Constructor. + * + * @param etype the script-layer enum type associated with the tag. + * + * @param type The main type. Note that the manager class manages the + * the value space internally, so noone else should assign main types. + * + * @param subtype The sub type, which is left to an analyzer for + * interpretation. By default it's set to zero. + */ + Tag(EnumType* etype, type_t type, subtype_t subtype = 0); + + /** + * Constructor. + * + * @param val An enum value of script type \c Analyzer::Tag. + */ + Tag(EnumVal* val); + +private: + type_t type; // Main type. + subtype_t subtype; // Subtype. + mutable EnumVal* val; // Script-layer value. +}; + +#endif diff --git a/src/analyzer/Component.cc b/src/analyzer/Component.cc index cbb0f40c20..ded0a1a2d5 100644 --- a/src/analyzer/Component.cc +++ b/src/analyzer/Component.cc @@ -8,7 +8,7 @@ using namespace analyzer; -Tag::type_t Component::type_counter = 0; +analyzer::Tag::type_t Component::type_counter = 0; Component::Component(const char* arg_name, factory_callback arg_factory, Tag::subtype_t arg_subtype, bool arg_enabled, bool arg_partial) : plugin::Component(plugin::component::ANALYZER) diff --git a/src/analyzer/Manager.cc b/src/analyzer/Manager.cc index 5695dec625..3f97cbb0c8 100644 --- a/src/analyzer/Manager.cc +++ b/src/analyzer/Manager.cc @@ -341,7 +341,7 @@ const char* Manager::GetAnalyzerName(Val* val) return GetAnalyzerName(Tag(val->AsEnumVal())); } -Tag Manager::GetAnalyzerTag(const char* name) +analyzer::Tag Manager::GetAnalyzerTag(const char* name) { Component* c = Lookup(name); return c ? 
c->Tag() : Tag(); diff --git a/src/analyzer/Tag.cc b/src/analyzer/Tag.cc index 2f04ff17da..3ab41daf78 100644 --- a/src/analyzer/Tag.cc +++ b/src/analyzer/Tag.cc @@ -3,90 +3,20 @@ #include "Tag.h" #include "Manager.h" -#include "../NetVar.h" +analyzer::Tag analyzer::Tag::Error; -using namespace analyzer; - -Tag Tag::Error; - -Tag::Tag(type_t arg_type, subtype_t arg_subtype) +analyzer::Tag::Tag(type_t type, subtype_t subtype) + : ::Tag(analyzer_mgr->GetTagEnumType(), type, subtype) { - assert(arg_type > 0); - - type = arg_type; - subtype = arg_subtype; - int64_t i = (int64)(type) | ((int64)subtype << 31); - - EnumType* etype = analyzer_mgr->GetTagEnumType(); - Ref(etype); - val = new EnumVal(i, etype); } -Tag::Tag(EnumVal* arg_val) +analyzer::Tag& analyzer::Tag::operator=(const analyzer::Tag& other) { - assert(arg_val); - - val = arg_val; - Ref(val); - - int64 i = val->InternalInt(); - type = i & 0xffffffff; - subtype = (i >> 31) & 0xffffffff; - } - -Tag::Tag(const Tag& other) - { - type = other.type; - subtype = other.subtype; - val = other.val; - - if ( val ) - Ref(val); - } - -Tag::Tag() - { - type = 0; - subtype = 0; - val = 0; - } - -Tag::~Tag() - { - Unref(val); - val = 0; - } - -Tag& Tag::operator=(const Tag& other) - { - if ( this != &other ) - { - type = other.type; - subtype = other.subtype; - val = other.val; - - if ( val ) - Ref(val); - } - + ::Tag::operator=(other); return *this; } -EnumVal* Tag::AsEnumVal() const +EnumVal* analyzer::Tag::AsEnumVal() const { - if ( ! 
val ) - { - assert(analyzer_mgr); - assert(type == 0 && subtype == 0); - EnumType* etype = analyzer_mgr->GetTagEnumType(); - Ref(etype); - val = new EnumVal(0, etype); - } - - return val; - } - -std::string Tag::AsString() const - { - return fmt("%" PRIu32 "/%" PRIu32, type, subtype); + return ::Tag::AsEnumVal(analyzer_mgr->GetTagEnumType()); } diff --git a/src/analyzer/Tag.h b/src/analyzer/Tag.h index edb0ade8a7..8ac151e4b5 100644 --- a/src/analyzer/Tag.h +++ b/src/analyzer/Tag.h @@ -5,90 +5,44 @@ #include "config.h" #include "util.h" +#include "../Tag.h" class EnumVal; -namespace file_analysis { -class Manager; -class Component; -} - namespace analyzer { class Manager; class Component; /** - * Class to identify an analyzer type. + * Class to identify a protocol analyzer type. * - * Each analyzer type gets a tag consisting of a main type and subtype. The - * former is an identifier that's unique all analyzer classes. The latter is - * passed through to the analyzer instances for their use, yet not further - * interpreted by the analyzer infrastructure; it allows an analyzer to - * branch out into a set of sub-analyzers internally. Jointly, main type and - * subtype form an analyzer "tag". Each unique tag corresponds to a single - * "analyzer" from the user's perspective. At the script layer, these tags - * are mapped into enums of type \c Analyzer::Tag. Internally, the - * analyzer::Manager maintains the mapping of tag to analyzer (and it also - * assigns them their main types), and analyzer::Component creates new - * tags. - * - * The Tag class supports all operations necessary to act as an index in a - * \c std::map. + * The script-layer analogue is Analyzer::Tag. */ -class Tag { +class Tag : public ::Tag { public: - /** - * Type for the analyzer's main type. - */ - typedef uint32 type_t; - - /** - * Type for the analyzer's subtype. - */ - typedef uint32 subtype_t; - /* * Copy constructor. 
*/ - Tag(const Tag& other); + Tag(const Tag& other) : ::Tag(other) {} /** * Default constructor. This initializes the tag with an error value * that will make \c operator \c bool return false. */ - Tag(); + Tag() : ::Tag() {} /** * Destructor. */ - ~Tag(); - - /** - * Returns the tag's main type. - */ - type_t Type() const { return type; } - - /** - * Returns the tag's subtype. - */ - subtype_t Subtype() const { return subtype; } - - /** - * Returns the \c Analyzer::Tag enum that corresponds to this tag. - * The returned value is \a does not have its ref-count increased. - */ - EnumVal* AsEnumVal() const; - - /** - * Returns the numerical values for main and subtype inside a string - * suitable for printing. This is primarily for debugging. - */ - std::string AsString() const; + ~Tag() {} /** * Returns false if the tag represents an error value rather than a * legal analyzer type. + * TODO: make this conversion operator "explicit" (C++11) or use a + * "safe bool" idiom (not necessary if "explicit" is available), + * otherwise this may allow nonsense/undesired comparison operations. */ operator bool() const { return *this != Tag(); } @@ -102,7 +56,7 @@ public: */ bool operator==(const Tag& other) const { - return type == other.type && subtype == other.subtype; + return ::Tag::operator==(other); } /** @@ -110,7 +64,7 @@ public: */ bool operator!=(const Tag& other) const { - return type != other.type || subtype != other.subtype; + return ::Tag::operator!=(other); } /** @@ -118,23 +72,29 @@ public: */ bool operator<(const Tag& other) const { - return type != other.type ? type < other.type : (subtype < other.subtype); + return ::Tag::operator<(other); } + /** + * Returns the \c Analyzer::Tag enum that corresponds to this tag. + * The returned value does not have its ref-count increased. + * + * @param etype the script-layer enum type associated with the tag. 
+ */ + EnumVal* AsEnumVal() const; + static Tag Error; protected: friend class analyzer::Manager; friend class analyzer::Component; - friend class file_analysis::Manager; - friend class file_analysis::Component; /** * Constructor. * * @param type The main type. Note that the \a analyzer::Manager * manages the value space internally, so noone else should assign - * any main tyoes. + * any main types. * * @param subtype The sub type, which is left to an analyzer for * interpretation. By default it's set to zero. @@ -144,14 +104,9 @@ protected: /** * Constructor. * - * @param val An enuam value of script type \c Analyzer::Tag. + * @param val An enum value of script type \c Analyzer::Tag. */ - Tag(EnumVal* val); - -private: - type_t type; // Main type. - subtype_t subtype; // Subtype. - mutable EnumVal* val; // Analyzer::Tag value. + Tag(EnumVal* val) : ::Tag(val) {} }; } diff --git a/src/file_analysis/CMakeLists.txt b/src/file_analysis/CMakeLists.txt index f22c293cc4..709790cfaf 100644 --- a/src/file_analysis/CMakeLists.txt +++ b/src/file_analysis/CMakeLists.txt @@ -14,6 +14,7 @@ set(file_analysis_SRCS Analyzer.h AnalyzerSet.cc Component.cc + Tag.cc ) bif_target(file_analysis.bif) diff --git a/src/file_analysis/Component.cc b/src/file_analysis/Component.cc index 99531e40f5..8ddd9cceaf 100644 --- a/src/file_analysis/Component.cc +++ b/src/file_analysis/Component.cc @@ -8,17 +8,17 @@ using namespace file_analysis; -analyzer::Tag::type_t Component::type_counter = 0; +file_analysis::Tag::type_t Component::type_counter = 0; Component::Component(const char* arg_name, factory_callback arg_factory, - analyzer::Tag::subtype_t arg_subtype) + file_analysis::Tag::subtype_t arg_subtype) : plugin::Component(plugin::component::FILE_ANALYZER) { name = copy_string(arg_name); canon_name = canonify_name(arg_name); factory = arg_factory; - tag = analyzer::Tag(++type_counter, arg_subtype); + tag = file_analysis::Tag(++type_counter, arg_subtype); } Component::Component(const Component& 
other) @@ -36,7 +36,7 @@ Component::~Component() delete [] canon_name; } -analyzer::Tag Component::Tag() const +file_analysis::Tag Component::Tag() const { return tag; } diff --git a/src/file_analysis/Component.h b/src/file_analysis/Component.h index 3cdc69efdf..bd690bc081 100644 --- a/src/file_analysis/Component.h +++ b/src/file_analysis/Component.h @@ -3,7 +3,7 @@ #ifndef FILE_ANALYZER_PLUGIN_COMPONENT_H #define FILE_ANALYZER_PLUGIN_COMPONENT_H -#include "analyzer/Tag.h" +#include "Tag.h" #include "plugin/Component.h" #include "Val.h" @@ -41,12 +41,12 @@ public: * * @param subtype A subtype associated with this component that * further distinguishes it. The subtype will be integrated into - * the analyzer::Tag that the manager associates with this analyzer, - * and analyzer instances can accordingly access it via analyzer::Tag(). - * If not used, leave at zero. + * the file_analysis::Tag that the manager associates with this analyzer, + * and analyzer instances can accordingly access it via + * file_analysis::Tag(). If not used, leave at zero. */ Component(const char* name, factory_callback factory, - analyzer::Tag::subtype_t subtype = 0); + file_analysis::Tag::subtype_t subtype = 0); /** * Copy constructor. @@ -84,7 +84,7 @@ public: * generated for each new Components, and hence unique across all of * them. */ - analyzer::Tag Tag() const; + file_analysis::Tag Tag() const; /** * Generates a human-readable description of the component's main @@ -98,10 +98,10 @@ private: const char* name; // The analyzer's name. const char* canon_name; // The analyzer's canonical name. factory_callback factory; // The analyzer's factory callback. - analyzer::Tag tag; // The automatically assigned analyzer tag. + file_analysis::Tag tag; // The automatically assigned analyzer tag. // Global counter used to generate unique tags. 
- static analyzer::Tag::type_t type_counter; + static file_analysis::Tag::type_t type_counter; }; } diff --git a/src/file_analysis/Manager.h b/src/file_analysis/Manager.h index a93e78c638..55ff0896d7 100644 --- a/src/file_analysis/Manager.h +++ b/src/file_analysis/Manager.h @@ -294,7 +294,7 @@ protected: private: typedef map analyzer_map_by_name; - typedef map analyzer_map_by_tag; + typedef map analyzer_map_by_tag; typedef map analyzer_map_by_val; void RegisterAnalyzerComponent(Component* component); diff --git a/src/file_analysis/Tag.cc b/src/file_analysis/Tag.cc new file mode 100644 index 0000000000..6f0774a4b4 --- /dev/null +++ b/src/file_analysis/Tag.cc @@ -0,0 +1,24 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "Tag.h" +#include "Manager.h" + +using namespace file_analysis; + +file_analysis::Tag file_analysis::Tag::Error; + +file_analysis::Tag::Tag(type_t type, subtype_t subtype) + : ::Tag(file_mgr->GetTagEnumType(), type, subtype) + { + } + +file_analysis::Tag& file_analysis::Tag::operator=(const file_analysis::Tag& other) + { + ::Tag::operator=(other); + return *this; + } + +EnumVal* file_analysis::Tag::AsEnumVal() const + { + return ::Tag::AsEnumVal(file_mgr->GetTagEnumType()); + } diff --git a/src/file_analysis/Tag.h b/src/file_analysis/Tag.h new file mode 100644 index 0000000000..85c20da5b5 --- /dev/null +++ b/src/file_analysis/Tag.h @@ -0,0 +1,115 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef FILE_ANALYZER_TAG_H +#define FILE_ANALYZER_TAG_H + +#include "config.h" +#include "util.h" +#include "../Tag.h" + +class EnumVal; + +namespace file_analysis { + +class Manager; +class Component; + +/** + * Class to identify a file analyzer type. + * + * The script-layer analogue is Files::Tag. + */ +class Tag : public ::Tag { +public: + /* + * Copy constructor. + */ + Tag(const Tag& other) : ::Tag(other) {} + + /** + * Default constructor. 
This initializes the tag with an error value + * that will make \c operator \c bool return false. + */ + Tag() : ::Tag() {} + + /** + * Destructor. + */ + ~Tag() {} + + /** + * Returns false if the tag represents an error value rather than a + * legal analyzer type. + * TODO: make this conversion operator "explicit" (C++11) or use a + * "safe bool" idiom (not necessary if "explicit" is available), + * otherwise this may allow nonsense/undesired comparison operations. + * + */ + operator bool() const { return *this != Tag(); } + + /** + * Assignment operator. + */ + Tag& operator=(const Tag& other); + + /** + * Compares two tags for equality. + */ + bool operator==(const Tag& other) const + { + return ::Tag::operator==(other); + } + + /** + * Compares two tags for inequality. + */ + bool operator!=(const Tag& other) const + { + return ::Tag::operator!=(other); + } + + /** + * Compares two tags for less-than relationship. + */ + bool operator<(const Tag& other) const + { + return ::Tag::operator<(other); + } + + /** + * Returns the \c Files::Tag enum that corresponds to this tag. + * The returned value does not have its ref-count increased. + * + * @param etype the script-layer enum type associated with the tag. + */ + EnumVal* AsEnumVal() const; + + static Tag Error; + +protected: + friend class file_analysis::Manager; + friend class file_analysis::Component; + + /** + * Constructor. + * + * @param type The main type. Note that the \a file_analysis::Manager + * manages the value space internally, so noone else should assign + * main tyoes. + * + * @param subtype The sub type, which is left to an analyzer for + * interpretation. By default it's set to zero. + */ + Tag(type_t type, subtype_t subtype = 0); + + /** + * Constructor. + * + * @param val An enum value of script type \c Files::Tag. 
+ */ + Tag(EnumVal* val) : ::Tag(val) {} +}; + +} + +#endif From edb04e6d8bfe68dddf6968ec37cf39ea3a47feab Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Tue, 30 Jul 2013 16:10:06 -0700 Subject: [PATCH 22/40] fix segfault that could be caused by merging an empty bloom-filter with a bloom-filter already containing values. I assume that it is ok to merge an empty bloom-filter with any bloom-filter - if not we have to change the patch to return an error in this case. --- src/OpaqueVal.cc | 5 ++++- src/probabilistic/bloom-filter.bif | 5 ++++- testing/btest/Baseline/bifs.bloomfilter/output | 1 + testing/btest/bifs/bloomfilter.bro | 5 +++++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 66b3c081e7..2b3236203b 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -591,7 +591,10 @@ bool BloomFilterVal::Empty() const BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { - if ( ! same_type(x->Type(), y->Type()) ) + if ( ( x->Type() != y->Type() ) && // both 0 is ok + ( x->Type() != 0 ) && // any one 0 also is ok + ( y->Type() != 0 ) && + ! same_type(x->Type(), y->Type()) ) { reporter->Error("cannot merge Bloom filters with different types"); return 0; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index a3567ad6f7..5b06f95673 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -186,7 +186,10 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); - if ( ! same_type(bfv1->Type(), bfv2->Type()) ) + if ( ( bfv1->Type() != bfv2->Type() ) && // both 0 is ok + (bfv1->Type() != 0) && + (bfv2->Type() != 0) && + ! 
same_type(bfv1->Type(), bfv2->Type()) ) { reporter->Error("incompatible Bloom filter types"); return 0; diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 14e1f038c0..991308e10b 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -17,6 +17,7 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +1 2 3 3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3b40f29553..f2c882d175 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -45,6 +45,11 @@ function test_basic_bloom_filter() print bloomfilter_lookup(bf_merged, 84); print bloomfilter_lookup(bf_merged, 100); print bloomfilter_lookup(bf_merged, 168); + + #empty filter tests + local bf_empty = bloomfilter_basic_init(0.1, 1000); + local bf_empty_merged = bloomfilter_merge(bf_merged, bf_empty); + print bloomfilter_lookup(bf_empty_merged, 42); } function test_counting_bloom_filter() From 5fa9c5865b6748c642d91a01f3537331ee5747a9 Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Wed, 31 Jul 2013 09:48:19 -0500 Subject: [PATCH 23/40] Factor out the need for a tag field in Files::AnalyzerArgs record. This cleans up internals of how analyzer instances get identified by the tag plus any args given to it and doesn't change script code a user would write. 
--- scripts/base/frameworks/files/main.bro | 12 +--- src/file_analysis/Analyzer.cc | 11 ++++ src/file_analysis/Analyzer.h | 30 ++-------- src/file_analysis/AnalyzerSet.cc | 56 ++++++++++--------- src/file_analysis/AnalyzerSet.h | 26 ++++++--- src/file_analysis/CMakeLists.txt | 2 +- src/file_analysis/File.cc | 18 +++--- src/file_analysis/File.h | 7 ++- src/file_analysis/Manager.cc | 56 ++++++++++++++----- src/file_analysis/Manager.h | 42 ++++++++++++-- .../analyzer/data_event/DataEvent.cc | 5 +- src/file_analysis/analyzer/extract/Extract.cc | 4 +- src/file_analysis/analyzer/hash/Hash.cc | 3 +- src/file_analysis/file_analysis.bif | 12 ++-- 14 files changed, 177 insertions(+), 107 deletions(-) create mode 100644 src/file_analysis/Analyzer.cc diff --git a/scripts/base/frameworks/files/main.bro b/scripts/base/frameworks/files/main.bro index a87608054d..c1883e037f 100644 --- a/scripts/base/frameworks/files/main.bro +++ b/scripts/base/frameworks/files/main.bro @@ -228,11 +228,6 @@ redef record fa_file += { info: Info &optional; }; -redef record AnalyzerArgs += { - # This is used interally for the core file analyzer api. - tag: Files::Tag &optional; -}; - # Store the callbacks for protocol analyzers that have files. global registered_protocols: table[Analyzer::Tag] of ProtoRegistration = table(); @@ -275,14 +270,12 @@ function set_timeout_interval(f: fa_file, t: interval): bool function add_analyzer(f: fa_file, tag: Files::Tag, args: AnalyzerArgs): bool { - # This is to construct the correct args for the core API. - args$tag = tag; add f$info$analyzers[Files::analyzer_name(tag)]; if ( tag in analyzer_add_callbacks ) analyzer_add_callbacks[tag](f, args); - if ( ! __add_analyzer(f$id, args) ) + if ( ! 
__add_analyzer(f$id, tag, args) ) { Reporter::warning(fmt("Analyzer %s not added successfully to file %s.", tag, f$id)); return F; @@ -297,8 +290,7 @@ function register_analyzer_add_callback(tag: Files::Tag, callback: function(f: f function remove_analyzer(f: fa_file, tag: Files::Tag, args: AnalyzerArgs): bool { - args$tag = tag; - return __remove_analyzer(f$id, args); + return __remove_analyzer(f$id, tag, args); } function stop(f: fa_file): bool diff --git a/src/file_analysis/Analyzer.cc b/src/file_analysis/Analyzer.cc new file mode 100644 index 0000000000..d472f4c80c --- /dev/null +++ b/src/file_analysis/Analyzer.cc @@ -0,0 +1,11 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "Analyzer.h" +#include "Manager.h" + +file_analysis::Analyzer::~Analyzer() + { + DBG_LOG(DBG_FILE_ANALYSIS, "Destroy file analyzer %s", + file_mgr->GetAnalyzerName(tag)); + Unref(args); + } diff --git a/src/file_analysis/Analyzer.h b/src/file_analysis/Analyzer.h index 0a5aa9e25c..e20e2802cf 100644 --- a/src/file_analysis/Analyzer.h +++ b/src/file_analysis/Analyzer.h @@ -5,14 +5,12 @@ #include "Val.h" #include "NetVar.h" -#include "analyzer/Tag.h" +#include "Tag.h" #include "file_analysis/file_analysis.bif.h" namespace file_analysis { -typedef int FA_Tag; - class File; /** @@ -25,11 +23,7 @@ public: * Destructor. Nothing special about it. Virtual since we definitely expect * to delete instances of derived classes via pointers to this class. */ - virtual ~Analyzer() - { - DBG_LOG(DBG_FILE_ANALYSIS, "Destroy file analyzer %d", tag); - Unref(args); - } + virtual ~Analyzer(); /** * Subclasses may override this metod to receive file data non-sequentially. @@ -76,7 +70,7 @@ public: /** * @return the analyzer type enum value. */ - FA_Tag Tag() const { return tag; } + file_analysis::Tag Tag() const { return tag; } /** * @return the AnalyzerArgs associated with the analyzer. 
@@ -88,18 +82,6 @@ public: */ File* GetFile() const { return file; } - /** - * Retrieves an analyzer tag field from full analyzer argument record. - * @param args an \c AnalyzerArgs (script-layer type) value. - * @return the analyzer tag equivalent of the 'tag' field from the - * \c AnalyzerArgs value \a args. - */ - static FA_Tag ArgsTag(const RecordVal* args) - { - using BifType::Record::Files::AnalyzerArgs; - return args->Lookup(AnalyzerArgs->FieldOffset("tag"))->AsEnum(); - } - protected: /** @@ -108,15 +90,15 @@ protected: * tunable options, if any, related to a particular analyzer type. * @param arg_file the file to which the the analyzer is being attached. */ - Analyzer(RecordVal* arg_args, File* arg_file) - : tag(file_analysis::Analyzer::ArgsTag(arg_args)), + Analyzer(file_analysis::Tag arg_tag, RecordVal* arg_args, File* arg_file) + : tag(arg_tag), args(arg_args->Ref()->AsRecordVal()), file(arg_file) {} private: - FA_Tag tag; /**< The particular analyzer type of the analyzer instance. */ + file_analysis::Tag tag; /**< The particular type of the analyzer instance. */ RecordVal* args; /**< \c AnalyzerArgs val gives tunable analyzer params. */ File* file; /**< The file to which the analyzer is attached. 
*/ }; diff --git a/src/file_analysis/AnalyzerSet.cc b/src/file_analysis/AnalyzerSet.cc index c710d8b085..befb676c87 100644 --- a/src/file_analysis/AnalyzerSet.cc +++ b/src/file_analysis/AnalyzerSet.cc @@ -15,6 +15,7 @@ static void analyzer_del_func(void* v) AnalyzerSet::AnalyzerSet(File* arg_file) : file(arg_file) { TypeList* t = new TypeList(); + t->Append(file_mgr->GetTagEnumType()); t->Append(BifType::Record::Files::AnalyzerArgs->Ref()); analyzer_hash = new CompositeHash(t); Unref(t); @@ -34,20 +35,20 @@ AnalyzerSet::~AnalyzerSet() delete analyzer_hash; } -bool AnalyzerSet::Add(RecordVal* args) +bool AnalyzerSet::Add(file_analysis::Tag tag, RecordVal* args) { - HashKey* key = GetKey(args); + HashKey* key = GetKey(tag, args); if ( analyzer_map.Lookup(key) ) { - DBG_LOG(DBG_FILE_ANALYSIS, "Instantiate analyzer %d skipped for file id" - " %s: already exists", file_analysis::Analyzer::ArgsTag(args), + DBG_LOG(DBG_FILE_ANALYSIS, "Instantiate analyzer %s skipped for file id" + " %s: already exists", file_mgr->GetAnalyzerName(tag), file->GetID().c_str()); delete key; return true; } - file_analysis::Analyzer* a = InstantiateAnalyzer(args); + file_analysis::Analyzer* a = InstantiateAnalyzer(tag, args); if ( ! a ) { @@ -60,10 +61,10 @@ bool AnalyzerSet::Add(RecordVal* args) return true; } -bool AnalyzerSet::QueueAdd(RecordVal* args) +bool AnalyzerSet::QueueAdd(file_analysis::Tag tag, RecordVal* args) { - HashKey* key = GetKey(args); - file_analysis::Analyzer* a = InstantiateAnalyzer(args); + HashKey* key = GetKey(tag, args); + file_analysis::Analyzer* a = InstantiateAnalyzer(tag, args); if ( ! 
a ) { @@ -80,8 +81,9 @@ bool AnalyzerSet::AddMod::Perform(AnalyzerSet* set) { if ( set->analyzer_map.Lookup(key) ) { - DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %d skipped for file id" - " %s: already exists", a->Tag(), a->GetFile()->GetID().c_str()); + DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %s skipped for file id" + " %s: already exists", file_mgr->GetAnalyzerName(a->Tag()), + a->GetFile()->GetID().c_str()); Abort(); return true; @@ -91,12 +93,12 @@ bool AnalyzerSet::AddMod::Perform(AnalyzerSet* set) return true; } -bool AnalyzerSet::Remove(const RecordVal* args) +bool AnalyzerSet::Remove(file_analysis::Tag tag, RecordVal* args) { - return Remove(file_analysis::Analyzer::ArgsTag(args), GetKey(args)); + return Remove(tag, GetKey(tag, args)); } -bool AnalyzerSet::Remove(FA_Tag tag, HashKey* key) +bool AnalyzerSet::Remove(file_analysis::Tag tag, HashKey* key) { file_analysis::Analyzer* a = (file_analysis::Analyzer*) analyzer_map.Remove(key); @@ -105,22 +107,22 @@ bool AnalyzerSet::Remove(FA_Tag tag, HashKey* key) if ( ! 
a ) { - DBG_LOG(DBG_FILE_ANALYSIS, "Skip remove analyzer %d for file id %s", - tag, file->GetID().c_str()); + DBG_LOG(DBG_FILE_ANALYSIS, "Skip remove analyzer %s for file id %s", + file_mgr->GetAnalyzerName(tag), file->GetID().c_str()); return false; } - DBG_LOG(DBG_FILE_ANALYSIS, "Remove analyzer %d for file id %s", a->Tag(), + DBG_LOG(DBG_FILE_ANALYSIS, "Remove analyzer %s for file id %s", + file_mgr->GetAnalyzerName(tag), file->GetID().c_str()); delete a; return true; } -bool AnalyzerSet::QueueRemove(const RecordVal* args) +bool AnalyzerSet::QueueRemove(file_analysis::Tag tag, RecordVal* args) { - HashKey* key = GetKey(args); - FA_Tag tag = file_analysis::Analyzer::ArgsTag(args); + HashKey* key = GetKey(tag, args); mod_queue.push(new RemoveMod(tag, key)); @@ -132,18 +134,22 @@ bool AnalyzerSet::RemoveMod::Perform(AnalyzerSet* set) return set->Remove(tag, key); } -HashKey* AnalyzerSet::GetKey(const RecordVal* args) const +HashKey* AnalyzerSet::GetKey(file_analysis::Tag t, RecordVal* args) const { - HashKey* key = analyzer_hash->ComputeHash(args, 1); + ListVal* lv = new ListVal(TYPE_ANY); + lv->Append(t.AsEnumVal()->Ref()); + lv->Append(args->Ref()); + HashKey* key = analyzer_hash->ComputeHash(lv, 1); + Unref(lv); if ( ! key ) reporter->InternalError("AnalyzerArgs type mismatch"); return key; } -file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(RecordVal* args) const +file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(Tag tag, + RecordVal* args) const { - FA_Tag tag = file_analysis::Analyzer::ArgsTag(args); file_analysis::Analyzer* a = file_mgr->InstantiateAnalyzer(tag, args, file); if ( ! 
a ) @@ -158,8 +164,8 @@ file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(RecordVal* args) const void AnalyzerSet::Insert(file_analysis::Analyzer* a, HashKey* key) { - DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %d for file id %s", a->Tag(), - file->GetID().c_str()); + DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %s for file id %s", + file_mgr->GetAnalyzerName(a->Tag()), file->GetID().c_str()); analyzer_map.Insert(key, a); delete key; } diff --git a/src/file_analysis/AnalyzerSet.h b/src/file_analysis/AnalyzerSet.h index 6f14149e30..42a54f4943 100644 --- a/src/file_analysis/AnalyzerSet.h +++ b/src/file_analysis/AnalyzerSet.h @@ -9,6 +9,7 @@ #include "Dict.h" #include "CompHash.h" #include "Val.h" +#include "Tag.h" namespace file_analysis { @@ -38,31 +39,35 @@ public: /** * Attach an analyzer to #file immediately. + * @param tag the analyzer tag of the file analyzer to add. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return true if analyzer was instantiated/attached, else false. */ - bool Add(RecordVal* args); + bool Add(file_analysis::Tag tag, RecordVal* args); /** * Queue the attachment of an analyzer to #file. + * @param tag the analyzer tag of the file analyzer to add. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return true if analyzer was able to be instantiated, else false. */ - bool QueueAdd(RecordVal* args); + bool QueueAdd(file_analysis::Tag tag, RecordVal* args); /** * Remove an analyzer from #file immediately. + * @param tag the analyzer tag of the file analyzer to remove. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return false if analyzer didn't exist and so wasn't removed, else true. */ - bool Remove(const RecordVal* args); + bool Remove(file_analysis::Tag tag, RecordVal* args); /** * Queue the removal of an analyzer from #file. + * @param tag the analyzer tag of the file analyzer to remove. * @param args an \c AnalyzerArgs value which specifies an analyzer. 
* @return true if analyzer exists at time of call, else false; */ - bool QueueRemove(const RecordVal* args); + bool QueueRemove(file_analysis::Tag tag, RecordVal* args); /** * Perform all queued modifications to the current analyzer set. @@ -91,17 +96,20 @@ protected: /** * Get a hash key which represents an analyzer instance. + * @param tag the file analyzer tag. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return the hash key calculated from \a args */ - HashKey* GetKey(const RecordVal* args) const; + HashKey* GetKey(file_analysis::Tag tag, RecordVal* args) const; /** * Create an instance of a file analyzer. + * @param tag the tag of a file analyzer. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return a new file analyzer instance. */ - file_analysis::Analyzer* InstantiateAnalyzer(RecordVal* args) const; + file_analysis::Analyzer* InstantiateAnalyzer(file_analysis::Tag tag, + RecordVal* args) const; /** * Insert an analyzer instance in to the set. @@ -116,7 +124,7 @@ protected: * just used for debugging messages. * @param key the hash key which represents the analyzer's \c AnalyzerArgs. */ - bool Remove(FA_Tag tag, HashKey* key); + bool Remove(file_analysis::Tag tag, HashKey* key); private: @@ -175,14 +183,14 @@ private: * @param arg_a an analyzer instance to add to an analyzer set. * @param arg_key hash key representing the analyzer's \c AnalyzerArgs. 
*/ - RemoveMod(FA_Tag arg_tag, HashKey* arg_key) + RemoveMod(file_analysis::Tag arg_tag, HashKey* arg_key) : Modification(), tag(arg_tag), key(arg_key) {} virtual ~RemoveMod() {} virtual bool Perform(AnalyzerSet* set); virtual void Abort() { delete key; } protected: - FA_Tag tag; + file_analysis::Tag tag; HashKey* key; }; diff --git a/src/file_analysis/CMakeLists.txt b/src/file_analysis/CMakeLists.txt index 709790cfaf..846fc4bf15 100644 --- a/src/file_analysis/CMakeLists.txt +++ b/src/file_analysis/CMakeLists.txt @@ -11,7 +11,7 @@ set(file_analysis_SRCS Manager.cc File.cc FileTimer.cc - Analyzer.h + Analyzer.cc AnalyzerSet.cc Component.cc Tag.cc diff --git a/src/file_analysis/File.cc b/src/file_analysis/File.cc index 7189d90932..3db8ecdbee 100644 --- a/src/file_analysis/File.cc +++ b/src/file_analysis/File.cc @@ -230,14 +230,14 @@ void File::ScheduleInactivityTimer() const timer_mgr->Add(new FileTimer(network_time, id, GetTimeoutInterval())); } -bool File::AddAnalyzer(RecordVal* args) +bool File::AddAnalyzer(file_analysis::Tag tag, RecordVal* args) { - return done ? false : analyzers.QueueAdd(args); + return done ? false : analyzers.QueueAdd(tag, args); } -bool File::RemoveAnalyzer(const RecordVal* args) +bool File::RemoveAnalyzer(file_analysis::Tag tag, RecordVal* args) { - return done ? false : analyzers.QueueRemove(args); + return done ? false : analyzers.QueueRemove(tag, args); } bool File::BufferBOF(const u_char* data, uint64 len) @@ -320,7 +320,7 @@ void File::DataIn(const u_char* data, uint64 len, uint64 offset) while ( (a = analyzers.NextEntry(c)) ) { if ( ! a->DeliverChunk(data, len, offset) ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } analyzers.DrainModifications(); @@ -355,7 +355,7 @@ void File::DataIn(const u_char* data, uint64 len) { if ( ! 
a->DeliverStream(data, len) ) { - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); continue; } @@ -363,7 +363,7 @@ void File::DataIn(const u_char* data, uint64 len) LookupFieldDefaultCount(missing_bytes_idx); if ( ! a->DeliverChunk(data, len, offset) ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } analyzers.DrainModifications(); @@ -388,7 +388,7 @@ void File::EndOfFile() while ( (a = analyzers.NextEntry(c)) ) { if ( ! a->EndOfFile() ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } FileEvent(file_state_remove); @@ -410,7 +410,7 @@ void File::Gap(uint64 offset, uint64 len) while ( (a = analyzers.NextEntry(c)) ) { if ( ! a->Undelivered(offset, len) ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } if ( FileEventAvailable(file_gap) ) diff --git a/src/file_analysis/File.h b/src/file_analysis/File.h index 794734d24b..12c1e061a8 100644 --- a/src/file_analysis/File.h +++ b/src/file_analysis/File.h @@ -10,6 +10,7 @@ #include "Conn.h" #include "Val.h" +#include "Tag.h" #include "AnalyzerSet.h" #include "BroString.h" @@ -94,17 +95,19 @@ public: /** * Queues attaching an analyzer. Only one analyzer per type can be attached * at a time unless the arguments differ. + * @param tag the analyzer tag of the file analyzer to add. * @param args an \c AnalyzerArgs value representing a file analyzer. * @return false if analyzer can't be instantiated, else true. */ - bool AddAnalyzer(RecordVal* args); + bool AddAnalyzer(file_analysis::Tag tag, RecordVal* args); /** * Queues removal of an analyzer. + * @param tag the analyzer tag of the file analyzer to remove. * @param args an \c AnalyzerArgs value representing a file analyzer. * @return true if analyzer was active at time of call, else false. 
*/ - bool RemoveAnalyzer(const RecordVal* args); + bool RemoveAnalyzer(file_analysis::Tag tag, RecordVal* args); /** * Pass in non-sequential data and deliver to attached analyzers. diff --git a/src/file_analysis/Manager.cc b/src/file_analysis/Manager.cc index fb74a409b4..c7fb1fc3b0 100644 --- a/src/file_analysis/Manager.cc +++ b/src/file_analysis/Manager.cc @@ -206,24 +206,26 @@ bool Manager::SetTimeoutInterval(const string& file_id, double interval) const return true; } -bool Manager::AddAnalyzer(const string& file_id, RecordVal* args) const +bool Manager::AddAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const { File* file = Lookup(file_id); if ( ! file ) return false; - return file->AddAnalyzer(args); + return file->AddAnalyzer(tag, args); } -bool Manager::RemoveAnalyzer(const string& file_id, const RecordVal* args) const +bool Manager::RemoveAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const { File* file = Lookup(file_id); if ( ! 
file ) return false; - return file->RemoveAnalyzer(args); + return file->RemoveAnalyzer(tag, args); } File* Manager::GetFile(const string& file_id, Connection* conn, @@ -367,13 +369,13 @@ bool Manager::IsDisabled(analyzer::Tag tag) return rval; } -Analyzer* Manager::InstantiateAnalyzer(int tag, RecordVal* args, File* f) const +Analyzer* Manager::InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const { - analyzer_map_by_val::const_iterator it = analyzers_by_val.find(tag); + analyzer_map_by_tag::const_iterator it = analyzers_by_tag.find(tag); - if ( it == analyzers_by_val.end() ) - reporter->InternalError("cannot instantiate unknown file analyzer: %d", - tag); + if ( it == analyzers_by_tag.end() ) + reporter->InternalError("cannot instantiate unknown file analyzer: %s", + tag.AsString().c_str()); Component* c = it->second; @@ -384,17 +386,43 @@ Analyzer* Manager::InstantiateAnalyzer(int tag, RecordVal* args, File* f) const return c->Factory()(args, f); } -const char* Manager::GetAnalyzerName(int tag) const +const char* Manager::GetAnalyzerName(Val* v) const { - analyzer_map_by_val::const_iterator it = analyzers_by_val.find(tag); + return GetAnalyzerName(file_analysis::Tag(v->AsEnumVal())); + } - if ( it == analyzers_by_val.end() ) - reporter->InternalError("cannot get name of unknown file analyzer: %d", - tag); +const char* Manager::GetAnalyzerName(file_analysis::Tag tag) const + { + analyzer_map_by_tag::const_iterator it = analyzers_by_tag.find(tag); + + if ( it == analyzers_by_tag.end() ) + reporter->InternalError("cannot get name of unknown file analyzer: %s", + tag.AsString().c_str()); return it->second->CanonicalName(); } +file_analysis::Tag Manager::GetAnalyzerTag(const char* name) const + { + analyzer_map_by_name::const_iterator it = analyzers_by_name.find(name); + + if ( it == analyzers_by_name.end() ) + return file_analysis::Tag(); + + return it->second->Tag(); + } + +file_analysis::Tag Manager::GetAnalyzerTag(Val* v) const + { + 
analyzer_map_by_val::const_iterator it = + analyzers_by_val.find(v->AsEnumVal()->InternalInt()); + + if ( it == analyzers_by_val.end() ) + return file_analysis::Tag(); + + return it->second->Tag(); + } + EnumType* Manager::GetTagEnumType() { return tag_enum_type; diff --git a/src/file_analysis/Manager.h b/src/file_analysis/Manager.h index 55ff0896d7..9a37042669 100644 --- a/src/file_analysis/Manager.h +++ b/src/file_analysis/Manager.h @@ -177,18 +177,22 @@ public: * analyzers of a given type can be attached per file identifier at a time * as long as the arguments differ. * @param file_id the file identifier/hash. + * @param tag the analyzer tag of the file analyzer to add. * @param args a \c AnalyzerArgs value which describes a file analyzer. * @return false if the analyzer failed to be instantiated, else true. */ - bool AddAnalyzer(const string& file_id, RecordVal* args) const; + bool AddAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const; /** * Queue removal of an analyzer for a given file identifier. * @param file_id the file identifier/hash. + * @param tag the analyzer tag of the file analyzer to remove. * @param args a \c AnalyzerArgs value which describes a file analyzer. * @return true if the analyzer is active at the time of call, else false. */ - bool RemoveAnalyzer(const string& file_id, const RecordVal* args) const; + bool RemoveAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const; /** * Tells whether analysis for a file is active or ignored. @@ -204,15 +208,43 @@ public: * @param f The file analzer is to be associated with. * @return The new analyzer instance or null if tag is invalid. */ - Analyzer* InstantiateAnalyzer(int tag, RecordVal* args, File* f) const; + Analyzer* InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const; /** * Translates a script-level file analyzer tag in to corresponding file * analyzer name. - * @param tag The enum val of a file analyzer. 
+ * @param v The enum val of a file analyzer. * @return The human-readable name of the file analyzer. */ - const char* GetAnalyzerName(int tag) const; + const char* GetAnalyzerName(Val* v) const; + + /** + * Translates a script-level file analyzer tag in to corresponding file + * analyzer name. + * @param tag The analyzer tag of a file analyzer. + * @return The human-readable name of the file analyzer. + */ + const char* GetAnalyzerName(file_analysis::Tag tag) const; + + /** + * Translates an analyzer name into the corresponding tag. + * + * @param name The name. + * + * @return The tag. If the name does not correspond to a valid + * analyzer, the returned tag will evaluate to false. + */ + file_analysis::Tag GetAnalyzerTag(const char* name) const; + + /** + * Translates an analyzer enum value into the corresponding tag. + * + * @param v the enum val of the file analyzer. + * + * @return The tag. If the val does not correspond to a valid + * analyzer, the returned tag will evaluate to false. 
+ */ + file_analysis::Tag GetAnalyzerTag(Val* v) const; /** * Returns the enum type that corresponds to the script-level type diff --git a/src/file_analysis/analyzer/data_event/DataEvent.cc b/src/file_analysis/analyzer/data_event/DataEvent.cc index 1b04111c44..44498f41e1 100644 --- a/src/file_analysis/analyzer/data_event/DataEvent.cc +++ b/src/file_analysis/analyzer/data_event/DataEvent.cc @@ -6,12 +6,15 @@ #include "EventRegistry.h" #include "Event.h" #include "util.h" +#include "file_analysis/Manager.h" using namespace file_analysis; DataEvent::DataEvent(RecordVal* args, File* file, EventHandlerPtr ce, EventHandlerPtr se) - : file_analysis::Analyzer(args, file), chunk_event(ce), stream_event(se) + : file_analysis::Analyzer(file_mgr->GetAnalyzerTag("DATA_EVENT"), + args, file), + chunk_event(ce), stream_event(se) { } diff --git a/src/file_analysis/analyzer/extract/Extract.cc b/src/file_analysis/analyzer/extract/Extract.cc index ef37425003..0de1402939 100644 --- a/src/file_analysis/analyzer/extract/Extract.cc +++ b/src/file_analysis/analyzer/extract/Extract.cc @@ -4,11 +4,13 @@ #include "Extract.h" #include "util.h" +#include "file_analysis/Manager.h" using namespace file_analysis; Extract::Extract(RecordVal* args, File* file, const string& arg_filename) - : file_analysis::Analyzer(args, file), filename(arg_filename) + : file_analysis::Analyzer(file_mgr->GetAnalyzerTag("EXTRACT"), args, file), + filename(arg_filename) { fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0666); diff --git a/src/file_analysis/analyzer/hash/Hash.cc b/src/file_analysis/analyzer/hash/Hash.cc index 9835f343b6..12463df8bf 100644 --- a/src/file_analysis/analyzer/hash/Hash.cc +++ b/src/file_analysis/analyzer/hash/Hash.cc @@ -5,11 +5,12 @@ #include "Hash.h" #include "util.h" #include "Event.h" +#include "file_analysis/Manager.h" using namespace file_analysis; Hash::Hash(RecordVal* args, File* file, HashVal* hv, const char* arg_kind) - : file_analysis::Analyzer(args, file), hash(hv), 
fed(false), kind(arg_kind) + : file_analysis::Analyzer(file_mgr->GetAnalyzerTag(to_upper(string(arg_kind)).c_str()), args, file), hash(hv), fed(false), kind(arg_kind) { hash->Init(); } diff --git a/src/file_analysis/file_analysis.bif b/src/file_analysis/file_analysis.bif index b6c80ac800..7e07ddf6bb 100644 --- a/src/file_analysis/file_analysis.bif +++ b/src/file_analysis/file_analysis.bif @@ -16,21 +16,23 @@ function Files::__set_timeout_interval%(file_id: string, t: interval%): bool %} ## :bro:see:`Files::add_analyzer`. -function Files::__add_analyzer%(file_id: string, args: any%): bool +function Files::__add_analyzer%(file_id: string, tag: Files::Tag, args: any%): bool %{ using BifType::Record::Files::AnalyzerArgs; RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs); - bool result = file_mgr->AddAnalyzer(file_id->CheckString(), rv); + bool result = file_mgr->AddAnalyzer(file_id->CheckString(), + file_mgr->GetAnalyzerTag(tag), rv); Unref(rv); return new Val(result, TYPE_BOOL); %} ## :bro:see:`Files::remove_analyzer`. -function Files::__remove_analyzer%(file_id: string, args: any%): bool +function Files::__remove_analyzer%(file_id: string, tag: Files::Tag, args: any%): bool %{ using BifType::Record::Files::AnalyzerArgs; RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs); - bool result = file_mgr->RemoveAnalyzer(file_id->CheckString(), rv); + bool result = file_mgr->RemoveAnalyzer(file_id->CheckString(), + file_mgr->GetAnalyzerTag(tag) , rv); Unref(rv); return new Val(result, TYPE_BOOL); %} @@ -45,7 +47,7 @@ function Files::__stop%(file_id: string%): bool ## :bro:see:`Files::analyzer_name`. 
function Files::__analyzer_name%(tag: Files::Tag%) : string %{ - return new StringVal(file_mgr->GetAnalyzerName(tag->InternalInt())); + return new StringVal(file_mgr->GetAnalyzerName(tag)); %} module GLOBAL; From 8ca76dd4eea561f196b8ee39083a479121092337 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 31 Jul 2013 17:59:08 +0200 Subject: [PATCH 24/40] Introduce global_hash_seed script variable. This commit adds support for script-level specification of a seed to be used by hashers. For example, if the given name of a Bloom filter is not empty, then the seed used by the underlying hasher only depends on the Bloom filter name. If the name is empty, we check whether the user defined a non-empty global_hash_seed string variable at script and use it instead. If that script variable does not exist, then we fall back to the initial seed computed a Bro startup (which is affected ultimately by $BRO_SEED). See Hasher::MakeSeed for details. --- src/NetVar.cc | 4 ++ src/NetVar.h | 2 + src/probabilistic/Hasher.cc | 85 ++++++++++++++---------------- src/probabilistic/Hasher.h | 45 +++++++++------- src/probabilistic/bloom-filter.bif | 9 +++- testing/btest/bifs/bloomfilter.bro | 4 +- 6 files changed, 82 insertions(+), 67 deletions(-) diff --git a/src/NetVar.cc b/src/NetVar.cc index 388aa46f10..2fee46e2da 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -238,6 +238,8 @@ TableType* record_field_table; StringVal* cmd_line_bpf_filter; +StringVal* global_hash_seed; + OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; @@ -304,6 +306,8 @@ void init_general_global_var() cmd_line_bpf_filter = internal_val("cmd_line_bpf_filter")->AsStringVal(); + global_hash_seed = opt_internal_string("global_hash_seed"); + md5_type = new OpaqueType("md5"); sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); diff --git a/src/NetVar.h b/src/NetVar.h index 7ce33d1a1a..3615108f73 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -242,6 +242,8 @@ extern 
TableType* record_field_table; extern StringVal* cmd_line_bpf_filter; +extern StringVal* global_hash_seed; + class OpaqueType; extern OpaqueType* md5_type; extern OpaqueType* sha1_type; diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 17597b9a82..e24a207e6e 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -3,11 +3,34 @@ #include #include "Hasher.h" +#include "NetVar.h" #include "digest.h" #include "Serializer.h" using namespace probabilistic; +size_t Hasher::MakeSeed(const void* data, size_t size) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + if ( data ) + sha256_update(&ctx, data, size); + + else if ( global_hash_seed && global_hash_seed->Len() > 0 ) + sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len()); + + else + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -25,7 +48,7 @@ bool Hasher::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(k)) ) return false; - return SERIALIZE_STR(name.c_str(), name.size()); + return SERIALIZE(static_cast(seed)); } bool Hasher::DoUnserialize(UnserialInfo* info) @@ -35,30 +58,26 @@ bool Hasher::DoUnserialize(UnserialInfo* info) uint16 serial_k; if ( ! UNSERIALIZE(&serial_k) ) return false; - k = serial_k; assert(k > 0); - const char* serial_name; - if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + uint64 serial_seed; + if ( ! 
UNSERIALIZE(&serial_seed) ) return false; - - name = serial_name; - delete [] serial_name; + seed = serial_seed; return true; } -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) +Hasher::Hasher(size_t arg_k, size_t arg_seed) { - k = k; - name = arg_name; + k = arg_k; + seed = arg_seed; } -UHF::UHF(size_t seed, const std::string& extra) - : h(compute_seed(seed, extra)) +UHF::UHF(size_t seed) + : h(seed) { } @@ -68,33 +87,11 @@ Hasher::digest UHF::hash(const void* x, size_t n) const return n == 0 ? 0 : h(x, n); } -size_t UHF::compute_seed(size_t seed, const std::string& extra) +DefaultHasher::DefaultHasher(size_t k, size_t seed) + : Hasher(k, seed) { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - - else - sha256_update(&ctx, extra.c_str(), extra.size()); - - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - - // Take the first sizeof(size_t) bytes as seed. 
- return *reinterpret_cast(buf); - } - -DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions.push_back(UHF(i, name)); + for ( size_t i = 1; i <= k; ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -137,13 +134,13 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info) hash_functions.clear(); for ( size_t i = 0; i < K(); ++i ) - hash_functions.push_back(UHF(i, Name())); + hash_functions.push_back(UHF(Seed() + bro_prng(i))); return true; } -DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), h1(1, name), h2(2, name) +DoubleHasher::DoubleHasher(size_t k, size_t seed) + : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } @@ -187,8 +184,8 @@ bool DoubleHasher::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(Hasher); - h1 = UHF(1, Name()); - h2 = UHF(2, Name()); + h1 = UHF(Seed() + bro_prng(1)); + h2 = UHF(Seed() + bro_prng(2)); return true; } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 3acd5c5867..bd8f5ce5ff 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -18,6 +18,20 @@ public: typedef hash_t digest; typedef std::vector digest_vector; + /** + * Creates a valid hasher seed from an arbitrary string. + * + * @param data A pointer to contiguous data that should be crunched into a + * seed. If 0, the function tries to find a global_hash_seed script variable + * to derive a seed from. If this variable does not exist, the function uses + * the initial seed generated at Bro startup. + * + * @param size The number of bytes of *data*. + * + * @return A seed suitable for hashers. + */ + static size_t MakeSeed(const void* data, size_t size); + /** * Destructor. */ @@ -64,11 +78,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. 
If not empty, the hasher uses this descriptor - * to seed its *k* hash functions. Otherwise the hasher mixes in the initial - * seed derived from the environment variable `$BRO_SEED`. + * Returns the seed used to construct the hasher. */ - const std::string& Name() const { return name; } + size_t Seed() const { return seed; } bool Serialize(SerialInfo* info) const; static Hasher* Unserialize(UnserialInfo* info); @@ -81,16 +93,15 @@ protected: /** * Constructor. * - * @param k the number of hash functions. + * @param arg_k the number of hash functions. * - * @param name A name for the hasher. Hashers with the same name - * should provide consistent results. + * @param arg_seed The seed for the hasher. */ - Hasher(size_t k, const std::string& name); + Hasher(size_t arg_k, size_t arg_seed); private: size_t k; - std::string name; + size_t seed; }; /** @@ -104,12 +115,8 @@ public: * optional extra seed to replace the initial Bro seed. * * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial - * seed to compute the seed for t to compute the seed NUL-terminated - * string as additional seed. */ - UHF(size_t seed = 0, const std::string& extra = ""); + UHF(size_t seed = 0); template Hasher::digest operator()(const T& x) const @@ -152,7 +159,7 @@ public: } private: - static size_t compute_seed(size_t seed, const std::string& extra); + static size_t compute_seed(size_t seed); H3 h; }; @@ -169,9 +176,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, const std::string& name = ""); + DefaultHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -197,9 +204,9 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. 
*/ - DoubleHasher(size_t k, const std::string& name = ""); + DoubleHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index a3567ad6f7..d936b77e3b 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -48,7 +48,9 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(optimal_k, seed); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} @@ -86,7 +88,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = new DefaultHasher(k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? 
name->Bytes() : 0, + name->Len()); + + const Hasher* h = new DefaultHasher(k, seed); uint16 width = 1; while ( max >>= 1 ) diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3b40f29553..e6091e25fa 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -21,8 +21,8 @@ function test_basic_bloom_filter() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4z"); # FP - print bloomfilter_lookup(bf_str, "quux"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"); # FP + print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch From d50b8a147d739e3fdce9cf235e47d7291adbe212 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 31 Jul 2013 18:21:37 +0200 Subject: [PATCH 25/40] Add new BiF for low-level Bloom filter initialization. For symmetry reasons, the new Bif bloomfilter_basic_init2 also allows users to manually specify the memory bounds and number of hash functions to use. --- NEWS | 1 + src/probabilistic/bloom-filter.bif | 69 +++++++++++++++---- .../btest/Baseline/bifs.bloomfilter/output | 2 + testing/btest/bifs/bloomfilter.bro | 7 ++ 4 files changed, 67 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index c421e7d675..64058054d6 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,7 @@ New Functionality the frequency of elements. 
The corresponding functions are: bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter bloomfilter_add(bf: opaque of bloomfilter, x: any) bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index d936b77e3b..0c4a67ac6f 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -35,8 +35,8 @@ module GLOBAL; ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter %{ @@ -55,6 +55,47 @@ function bloomfilter_basic_init%(fp: double, capacity: count, return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} +## Creates a basic Bloom filter. This function serves as a low-level +## alternative to bloomfilter_basic_init where the user has full control over +## the number of hash functions and cells in the underlying bit vector. +## +## .. note:: A Bloom filter can have a name associated with it. In the future, +## Bloom filters with the same name will be compatible across indepedent Bro +## instances, i.e., it will be possible to merge them. Currently, however, that is +## not yet supported. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying bit vector. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will remain tied to the current Bro process. 
+## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloom_filter_basic_init bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge +function bloomfilter_basic_init2%(k: count, cells: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( k == 0 ) + { + reporter->Error("number of hash functions must be non-negative"); + return 0; + } + if ( cells == 0 ) + { + reporter->Error("number of cells must be non-negative"); + return 0; + } + + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(k, seed); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} + ## Creates a counting Bloom filter. ## ## .. note:: A Bloom filter can have a name associated with it. In the future, @@ -77,8 +118,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ @@ -106,8 +147,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear +## bloomfilter_merge function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -132,8 +174,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. ## -## .. 
bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_clear +## bloomfilter_merge function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); @@ -159,8 +202,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## bf: The Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_merge function bloomfilter_clear%(bf: opaque of bloomfilter%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -183,8 +227,9 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any ## ## Returns: The union of *bf1* and *bf2*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_clear +## .. 
bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear function bloomfilter_merge%(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter%): opaque of bloomfilter %{ diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 14e1f038c0..731b7c7ce9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -17,6 +17,8 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +1 +1 2 3 3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index e6091e25fa..c2a1c47ca8 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -15,6 +15,13 @@ function test_basic_bloom_filter() bloomfilter_add(bf_cnt, 0.5); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch + # Alternative constructor. + local bf_dbl = bloomfilter_basic_init2(4, 10); + bloomfilter_add(bf_dbl, 4.2); + bloomfilter_add(bf_dbl, 3.14); + print bloomfilter_lookup(bf_dbl, 4.2); + print bloomfilter_lookup(bf_dbl, 3.14); + # Basic usage with strings. 
local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); From 5122bf4a7cbe5e78802042729d53009d5cc28ab5 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Wed, 31 Jul 2013 12:06:59 -0700 Subject: [PATCH 26/40] adapt to new folder structure --- src/CMakeLists.txt | 1 - src/bro.bif | 114 ---------------- src/probabilistic/CMakeLists.txt | 4 +- src/{ => probabilistic}/Topk.cc | 4 +- src/{ => probabilistic}/Topk.h | 2 +- src/probabilistic/top-k.bif | 122 ++++++++++++++++++ .../out | 0 .../topk_persistence.bro => istate/topk.bro} | 0 8 files changed, 128 insertions(+), 119 deletions(-) rename src/{ => probabilistic}/Topk.cc (99%) rename src/{ => probabilistic}/Topk.h (99%) create mode 100644 src/probabilistic/top-k.bif rename testing/btest/Baseline/{bifs.topk_persistence => istate.topk}/out (100%) rename testing/btest/{bifs/topk_persistence.bro => istate/topk.bro} (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2693e1f280..4a65ddd4d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -320,7 +320,6 @@ set(bro_SRCS Stats.cc Stmt.cc Timer.cc - Topk.cc Traverse.cc Trigger.cc TunnelEncapsulation.cc diff --git a/src/bro.bif b/src/bro.bif index fab11c7e90..efb913bbf7 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4976,117 +4976,3 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr } %} -%%{ -#include "Topk.h" -%%} - -## Creates a top-k data structure which tracks size elements. -## -## Returns: Opaque pointer to the data structure. -function topk_init%(size: count%): opaque of topk - %{ - Topk::TopkVal* v = new Topk::TopkVal(size); - return v; - %} - -## Add a new observed object to the data structure. The first -## added object sets the type of data tracked by the top-k data -## structure. 
All following values have to be of the same type -function topk_add%(handle: opaque of topk, value: any%): any - %{ - assert(handle); - Topk::TopkVal* h = (Topk::TopkVal*) handle; - h->Encountered(value); - - return 0; - %} - -## Get the first k elements of the top-k data structure -## -## Returns: vector of the first k elements -function topk_get_top%(handle: opaque of topk, k: count%): any - %{ - assert(handle); - Topk::TopkVal* h = (Topk::TopkVal*) handle; - return h->getTopK(k); - %} - -## Get an overestimated count of how often value has been encountered. -## value has to be part of the currently tracked elements, otherwise -## 0 will be returned and an error message will be added to reporter. -## -## Returns: Overestimated number for how often the element has been encountered -function topk_count%(handle: opaque of topk, value: any%): count - %{ - assert(handle); - Topk::TopkVal* h = (Topk::TopkVal*) handle; - return new Val(h->getCount(value), TYPE_COUNT); - %} - -## Get a the maximal overestimation for count. Same restrictiosn as for topk_count -## apply. -## -## Returns: Number which represents the maximal overesimation for the count of this element. -function topk_epsilon%(handle: opaque of topk, value: any%): count - %{ - assert(handle); - Topk::TopkVal* h = (Topk::TopkVal*) handle; - return new Val(h->getEpsilon(value), TYPE_COUNT); - %} - -## Get the number of elements this data structure is supposed to track (given on init). -## Note that the actual number of elements in the data structure can be lower or higher -## than this. (higher due to non-pruned merges) -## -## Returns: size given during initialization -function topk_size%(handle: opaque of topk%): count - %{ - assert(handle); - Topk::TopkVal* h = (Topk::TopkVal*) handle; - return new Val(h->getSize(), TYPE_COUNT); - %} - -## Get the sum of all counts of all elements in the data structure. Is equal to the number -## of all inserted objects if the data structure never has been pruned. 
Do not use after -## calling topk_merge_prune (will throw a warning message if used afterwards) -## -## Returns: sum of all counts -function topk_sum%(handle: opaque of topk%): count - %{ - assert(handle); - Topk::TopkVal* h = (Topk::TopkVal*) handle; - return new Val(h->getSum(), TYPE_COUNT); - %} - -## Merge the second topk data structure into the first. Does not remove any elements, the -## resulting data structure can be bigger than the maximum size given on initialization. -function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any - %{ - assert(handle1); - assert(handle2); - - Topk::TopkVal* h1 = (Topk::TopkVal*) handle1; - Topk::TopkVal* h2 = (Topk::TopkVal*) handle2; - - h1->Merge(h2); - - return 0; - %} - -## Merge the second topk data structure into the first and prunes the final data structure -## back to the size given on initialization. Use with care and only when being aware of the -## restrictions this imposed. Do not call topk_size or topk_add afterwards, results will -## probably not be what you expect. 
-function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any - %{ - assert(handle1); - assert(handle2); - - Topk::TopkVal* h1 = (Topk::TopkVal*) handle1; - Topk::TopkVal* h2 = (Topk::TopkVal*) handle2; - - h1->Merge(h2, true); - - return 0; - %} - diff --git a/src/probabilistic/CMakeLists.txt b/src/probabilistic/CMakeLists.txt index af062b24ae..a36dfbbd6b 100644 --- a/src/probabilistic/CMakeLists.txt +++ b/src/probabilistic/CMakeLists.txt @@ -10,9 +10,11 @@ set(probabilistic_SRCS BitVector.cc BloomFilter.cc CounterVector.cc - Hasher.cc) + Hasher.cc + Topk.cc) bif_target(bloom-filter.bif) +bif_target(top-k.bif) bro_add_subdir_library(probabilistic ${probabilistic_SRCS}) add_dependencies(bro_probabilistic generate_outputs) diff --git a/src/Topk.cc b/src/probabilistic/Topk.cc similarity index 99% rename from src/Topk.cc rename to src/probabilistic/Topk.cc index 10374f3087..d03a10ccfc 100644 --- a/src/Topk.cc +++ b/src/probabilistic/Topk.cc @@ -1,13 +1,13 @@ // See the file "COPYING" in the main distribution directory for copyright. -#include "Topk.h" +#include "probabilistic/Topk.h" #include "CompHash.h" #include "Reporter.h" #include "Serializer.h" #include "NetVar.h" -namespace Topk { +namespace probabilistic { IMPLEMENT_SERIAL(TopkVal, SER_TOPK_VAL); diff --git a/src/Topk.h b/src/probabilistic/Topk.h similarity index 99% rename from src/Topk.h rename to src/probabilistic/Topk.h index 608b810ddb..2c47fbd181 100644 --- a/src/Topk.h +++ b/src/probabilistic/Topk.h @@ -10,7 +10,7 @@ // This class implements the top-k algorithm. Or - to be more precise - my interpretation of it. 
-namespace Topk { +namespace probabilistic { struct Element; diff --git a/src/probabilistic/top-k.bif b/src/probabilistic/top-k.bif new file mode 100644 index 0000000000..83d8e275c1 --- /dev/null +++ b/src/probabilistic/top-k.bif @@ -0,0 +1,122 @@ +# =========================================================================== +# +# Top-K Functions +# +# =========================================================================== + + +%%{ +#include "probabilistic/Topk.h" +%%} + +## Creates a top-k data structure which tracks size elements. +## +## Returns: Opaque pointer to the data structure. +function topk_init%(size: count%): opaque of topk + %{ + probabilistic::TopkVal* v = new probabilistic::TopkVal(size); + return v; + %} + +## Add a new observed object to the data structure. The first +## added object sets the type of data tracked by the top-k data +## structure. All following values have to be of the same type +function topk_add%(handle: opaque of topk, value: any%): any + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + h->Encountered(value); + + return 0; + %} + +## Get the first k elements of the top-k data structure +## +## Returns: vector of the first k elements +function topk_get_top%(handle: opaque of topk, k: count%): any + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return h->getTopK(k); + %} + +## Get an overestimated count of how often value has been encountered. +## value has to be part of the currently tracked elements, otherwise +## 0 will be returned and an error message will be added to reporter. +## +## Returns: Overestimated number for how often the element has been encountered +function topk_count%(handle: opaque of topk, value: any%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->getCount(value), TYPE_COUNT); + %} + +## Get a the maximal overestimation for count. 
Same restrictiosn as for topk_count +## apply. +## +## Returns: Number which represents the maximal overesimation for the count of this element. +function topk_epsilon%(handle: opaque of topk, value: any%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->getEpsilon(value), TYPE_COUNT); + %} + +## Get the number of elements this data structure is supposed to track (given on init). +## Note that the actual number of elements in the data structure can be lower or higher +## than this. (higher due to non-pruned merges) +## +## Returns: size given during initialization +function topk_size%(handle: opaque of topk%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->getSize(), TYPE_COUNT); + %} + +## Get the sum of all counts of all elements in the data structure. Is equal to the number +## of all inserted objects if the data structure never has been pruned. Do not use after +## calling topk_merge_prune (will throw a warning message if used afterwards) +## +## Returns: sum of all counts +function topk_sum%(handle: opaque of topk%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->getSum(), TYPE_COUNT); + %} + +## Merge the second topk data structure into the first. Does not remove any elements, the +## resulting data structure can be bigger than the maximum size given on initialization. +function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any + %{ + assert(handle1); + assert(handle2); + + probabilistic::TopkVal* h1 = (probabilistic::TopkVal*) handle1; + probabilistic::TopkVal* h2 = (probabilistic::TopkVal*) handle2; + + h1->Merge(h2); + + return 0; + %} + +## Merge the second topk data structure into the first and prunes the final data structure +## back to the size given on initialization. 
Use with care and only when being aware of the +## restrictions this imposed. Do not call topk_size or topk_add afterwards, results will +## probably not be what you expect. +function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any + %{ + assert(handle1); + assert(handle2); + + probabilistic::TopkVal* h1 = (probabilistic::TopkVal*) handle1; + probabilistic::TopkVal* h2 = (probabilistic::TopkVal*) handle2; + + h1->Merge(h2, true); + + return 0; + %} + + diff --git a/testing/btest/Baseline/bifs.topk_persistence/out b/testing/btest/Baseline/istate.topk/out similarity index 100% rename from testing/btest/Baseline/bifs.topk_persistence/out rename to testing/btest/Baseline/istate.topk/out diff --git a/testing/btest/bifs/topk_persistence.bro b/testing/btest/istate/topk.bro similarity index 100% rename from testing/btest/bifs/topk_persistence.bro rename to testing/btest/istate/topk.bro From 2a0790c2316380209b5a9d6f3abfffc94aa8120e Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Wed, 31 Jul 2013 17:14:02 -0700 Subject: [PATCH 27/40] Changing the Bloom filter hashing so that it's independent of CompositeHash. We do this by hashing values added to a BloomFilter another time more with a stable hash seeded only by either the filter's name or the global_hash_seed (or Bro's random() seed if neither is defined). I'm also adding a new bif bloomfilter_internal_state() that returns a string representation of a Bloom filter's current internal state. This is solely for writing tests that check that the filters end up consistent when seeded with the same value. 
--- scripts/base/init-bare.bro | 5 ++ src/OpaqueVal.cc | 9 ++- src/OpaqueVal.h | 1 + src/probabilistic/BitVector.cc | 10 ++++ src/probabilistic/BitVector.h | 7 +++ src/probabilistic/BloomFilter.cc | 28 ++++++++-- src/probabilistic/BloomFilter.h | 56 +++++++------------ src/probabilistic/CounterVector.cc | 5 ++ src/probabilistic/CounterVector.h | 7 +++ src/probabilistic/Hasher.cc | 6 +- src/probabilistic/Hasher.h | 9 +++ src/probabilistic/bloom-filter.bif | 25 ++++----- .../Baseline/bifs.bloomfilter-seed/output | 8 +++ testing/btest/bifs/bloomfilter-seed.bro | 40 +++++++++++++ 14 files changed, 157 insertions(+), 59 deletions(-) create mode 100644 testing/btest/Baseline/bifs.bloomfilter-seed/output create mode 100644 testing/btest/bifs/bloomfilter-seed.bro diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 9876ad03f7..e5300cdc9f 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -3042,6 +3042,11 @@ module GLOBAL; ## Number of bytes per packet to capture from live interfaces. const snaplen = 8192 &redef; +## Seed for hashes computed internally for probabilistic data structures. Using +## the same value here will make the hashes compatible between independent Bro +## instances. If left unset, Bro will use a temporary local seed. +const global_hash_seed: string = "" &redef; + # Load BiFs defined by plugins. 
@load base/bif/plugins diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 66b3c081e7..e79b4435b3 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -566,14 +566,14 @@ BroType* BloomFilterVal::Type() const void BloomFilterVal::Add(const Val* val) { HashKey* key = hash->ComputeHash(val, 1); - bloom_filter->Add(key->Hash()); + bloom_filter->Add(key); delete key; } size_t BloomFilterVal::Count(const Val* val) const { HashKey* key = hash->ComputeHash(val, 1); - size_t cnt = bloom_filter->Count(key->Hash()); + size_t cnt = bloom_filter->Count(key); delete key; return cnt; } @@ -588,6 +588,11 @@ bool BloomFilterVal::Empty() const return bloom_filter->Empty(); } +string BloomFilterVal::InternalState() const + { + return bloom_filter->InternalState(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 52c9583fc7..08a20b1a31 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -127,6 +127,7 @@ public: size_t Count(const Val* val) const; void Clear(); bool Empty() const; + string InternalState() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 6e642e62c1..f820e6df27 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -490,6 +490,16 @@ BitVector::size_type BitVector::FindNext(size_type i) const return block ? 
bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); } +size_t BitVector::Hash() const + { + size_t hash = 0; + + for ( size_type i = 0; i < Blocks(); ++i ) + hash += bits[i]; + + return hash; + } + BitVector::size_type BitVector::lowest_bit(block_type block) { block_type x = block - (block & (block - 1)); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index d9c55d53c6..8e24336345 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -276,6 +276,13 @@ public: */ size_type FindNext(size_type i) const; + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. * diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 23b812269c..bcab6c9b54 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -9,6 +9,8 @@ #include "CounterVector.h" #include "Serializer.h" +#include "../util.h" + using namespace probabilistic; BloomFilter::BloomFilter() @@ -107,6 +109,11 @@ BasicBloomFilter* BasicBloomFilter::Clone() const return copy; } +std::string BasicBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)bits->Hash()); + } + BasicBloomFilter::BasicBloomFilter() { bits = 0; @@ -133,14 +140,18 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return (bits != 0); } -void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) +void BasicBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) bits->Set(h[i] % bits->Size()); } -size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t BasicBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) { if ( ! 
(*bits)[h[i] % bits->Size()] ) @@ -206,6 +217,11 @@ CountingBloomFilter* CountingBloomFilter::Clone() const return copy; } +string CountingBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)cells->Hash()); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -222,14 +238,18 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) } // TODO: Use partitioning in add/count to allow for reusing CMS bounds. -void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) +void CountingBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) cells->Increment(h[i] % cells->Size()); } -size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t CountingBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 4865ae145c..65dda2396d 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -22,27 +22,20 @@ public: virtual ~BloomFilter(); /** - * Adds an element of type T to the Bloom filter. - * @param x The element to add + * Adds an element to the Bloom filter. + * + * @param key The key associated with the element to add. */ - template - void Add(const T& x) - { - AddImpl((*hasher)(x)); - } + virtual void Add(const HashKey* key) = 0; /** * Retrieves the associated count of a given value. * - * @param x The value of type `T` to check. + * @param key The key associated with the element to check. * - * @return The counter associated with *x*. + * @return The counter associated with *key*. 
*/ - template - size_t Count(const T& x) const - { - return CountImpl((*hasher)(x)); - } + virtual size_t Count(const HashKey* key) const = 0; /** * Checks whether the Bloom filter is empty. @@ -72,6 +65,12 @@ public: */ virtual BloomFilter* Clone() const = 0; + /** + * Returns a string with a representation of the Bloom filter's + * internal state. This is for debugging/testing purposes only. + */ + virtual string InternalState() const = 0; + /** * Serializes the Bloom filter. * @@ -106,25 +105,6 @@ protected: */ BloomFilter(const Hasher* hasher); - /** - * Abstract method for implementinng the *Add* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - */ - virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - - /** - * Abstract method for implementing the *Count* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - * @return Returns the counter associated with the hashed element. - */ - virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; - const Hasher* hasher; }; @@ -177,6 +157,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -187,8 +168,8 @@ protected: BasicBloomFilter(); // Overridden from BloomFilter. 
- virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: BitVector* bits; @@ -216,6 +197,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(CountingBloomFilter); @@ -226,8 +208,8 @@ protected: CountingBloomFilter(); // Overridden from BloomFilter. - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: CounterVector* cells; diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index d5635fc0f2..8a6feae5fd 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -153,6 +153,11 @@ CounterVector operator|(const CounterVector& x, const CounterVector& y) } +size_t CounterVector::Hash() const + { + return bits->Hash(); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index df6fc57ac2..9ce522d61c 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -126,6 +126,13 @@ public: */ CounterVector& operator|=(const CounterVector& other); + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. 
* diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 8b34aa5c77..b59274df7d 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -31,6 +31,11 @@ size_t Hasher::MakeSeed(const void* data, size_t size) return *reinterpret_cast(buf); // Use the first bytes as seed. } +Hasher::digest_vector Hasher::Hash(const HashKey* key) const + { + return Hash(key->Key(), key->Size()); + } + bool Hasher::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -77,7 +82,6 @@ Hasher::Hasher(size_t arg_k, size_t arg_seed) seed = arg_seed; } - UHF::UHF(size_t seed) : h(seed) { diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index bd8f5ce5ff..6b75fa1bea 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -50,6 +50,15 @@ public: return Hash(&x, sizeof(T)); } + /** + * Computes hash values for an element. + * + * @param x The key of the value to hash. + * + * @return Vector of *k* hash values. + */ + digest_vector Hash(const HashKey* key) const; + /** * Computes the hashes for a set of bytes. * diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index c288171e5d..98c8dd59a8 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -20,11 +20,6 @@ module GLOBAL; ## Creates a basic Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive @@ -61,11 +56,6 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## alternative to bloomfilter_basic_init where the user has full control over ## the number of hash functions and cells in the underlying bit vector. 
## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## k: The number of hash functions to use. ## ## cells: The number of cells of the underlying bit vector. @@ -102,11 +92,6 @@ function bloomfilter_basic_init2%(k: count, cells: count, ## Creates a counting Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## k: The number of hash functions to use. ## ## cells: The number of cells of the underlying counter vector. As there's no @@ -250,3 +235,13 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, return BloomFilterVal::Merge(bfv1, bfv2); %} + +## Returns a string with a representation of a Bloom filter's internal +## state. This is for debugging/testing purposes only. +## +## bf: The Bloom filter handle. 
+function bloomfilter_internal_state%(bf: opaque of bloomfilter%): string + %{ + BloomFilterVal* bfv = static_cast(bf); + return new StringVal(bfv->InternalState()); + %} diff --git a/testing/btest/Baseline/bifs.bloomfilter-seed/output b/testing/btest/Baseline/bifs.bloomfilter-seed/output new file mode 100644 index 0000000000..53e0f583f2 --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter-seed/output @@ -0,0 +1,8 @@ +bf1, global_seed, 1 +bf2, global_seed, 5 +bf3, my_seed, 5 +bf4, my_seed, 6 +bf1, global_seed, 5 +bf2, global_seed, 6 +bf3, my_seed, 5 +bf4, my_seed, 6 diff --git a/testing/btest/bifs/bloomfilter-seed.bro b/testing/btest/bifs/bloomfilter-seed.bro new file mode 100644 index 0000000000..436638e2af --- /dev/null +++ b/testing/btest/bifs/bloomfilter-seed.bro @@ -0,0 +1,40 @@ +# @TEST-EXEC: bro -b %INPUT global_hash_seed="foo" >>output +# @TEST-EXEC: bro -b %INPUT global_hash_seed="my_seed" >>output +# @TEST-EXEC: btest-diff output + +type Foo: record + { + a: count; + b: string; + }; + +function test_bloom_filter() + { + local bf1 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf1, "foo"); + bloomfilter_add(bf1, "bar"); + + local bf2 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf2, Foo($a=1, $b="xx")); + bloomfilter_add(bf2, Foo($a=2, $b="yy")); + + local bf3 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf3, "foo"); + bloomfilter_add(bf3, "bar"); + + local bf4 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf4, Foo($a=1, $b="xx")); + bloomfilter_add(bf4, Foo($a=2, $b="yy")); + + print "bf1, global_seed", bloomfilter_internal_state(bf1); + print "bf2, global_seed", bloomfilter_internal_state(bf2); + print "bf3, my_seed", bloomfilter_internal_state(bf3); + print "bf4, my_seed", bloomfilter_internal_state(bf4); + + + } + +event bro_init() + { + test_bloom_filter(); + } From ba12f4af859e68fbc380259aa61ea7fdb9e416a0 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Wed, 31 Jul 2013 20:47:47 -0700 
Subject: [PATCH 28/40] Updating submodule(s). [nomail] --- aux/bro-aux | 2 +- aux/btest | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aux/bro-aux b/aux/bro-aux index 91d258cc8b..d9963983c0 160000 --- a/aux/bro-aux +++ b/aux/bro-aux @@ -1 +1 @@ -Subproject commit 91d258cc8b2f74cd02fc93dfe61f73ec9f0dd489 +Subproject commit d9963983c0b4d426b24836f8d154d014d5aecbba diff --git a/aux/btest b/aux/btest index ce366206e3..57c07a2c4d 160000 --- a/aux/btest +++ b/aux/btest @@ -1 +1 @@ -Subproject commit ce366206e3407e534a786ad572c342e9f9fef26b +Subproject commit 57c07a2c4da3693a7e0ec088fae0463361a1f0d5 From 279c7b1af5936d71555eca5b57f2875f49fa5253 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Wed, 31 Jul 2013 20:51:01 -0700 Subject: [PATCH 29/40] Updating submodule(s). [nomail] --- aux/btest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aux/btest b/aux/btest index 57c07a2c4d..69606f8f3c 160000 --- a/aux/btest +++ b/aux/btest @@ -1 +1 @@ -Subproject commit 57c07a2c4da3693a7e0ec088fae0463361a1f0d5 +Subproject commit 69606f8f3cc84d694ca1da14868a5fecd4abbc96 From 2ccc963e221bc747b48f908ec1df7dc167f41723 Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Thu, 1 Aug 2013 11:19:23 -0400 Subject: [PATCH 30/40] Small fix to deal with a bug in the SSL log delay mechanism. - It looks like we might have some parser problem, but I just want to work around this problem for the moment. --- CHANGES | 4 ++++ VERSION | 2 +- scripts/base/protocols/ssl/main.bro | 9 +++------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGES b/CHANGES index e9b5b1c925..5cfbd0b17a 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,8 @@ +2.1-951 | 2013-08-01 11:19:23 -0400 + + * Small fix to deal with a bug in the SSL log delay mechanism. 
+ 2.1-948 | 2013-07-31 20:08:28 -0700 * Fix segfault caused by merging an empty bloom-filter with a diff --git a/VERSION b/VERSION index aaa6984d5f..48cc941701 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-948 +2.1-951 diff --git a/scripts/base/protocols/ssl/main.bro b/scripts/base/protocols/ssl/main.bro index 65526182ac..0d4a8435f0 100644 --- a/scripts/base/protocols/ssl/main.bro +++ b/scripts/base/protocols/ssl/main.bro @@ -67,11 +67,8 @@ export { ## (especially with large file transfers). const disable_analyzer_after_detection = T &redef; - ## The maximum amount of time a script can delay records from being logged. - const max_log_delay = 15secs &redef; - ## Delays an SSL record for a specific token: the record will not be logged - ## as longs the token exists or until :bro:id:`SSL::max_log_delay` elapses. + ## as longs the token exists or until 15 seconds elapses. global delay_log: function(info: Info, token: string); ## Undelays an SSL record for a previously inserted token, allowing the @@ -90,7 +87,7 @@ redef record connection += { redef record Info += { # Adding a string "token" to this set will cause the SSL script # to delay logging the record until either the token has been removed or - # the record has been delayed for :bro:id:`SSL::max_log_delay`. + # the record has been delayed. delay_tokens: set[string] &optional; }; @@ -138,7 +135,7 @@ function log_record(info: Info) { log_record(info); } - timeout SSL::max_log_delay + timeout 15secs { Reporter::info(fmt("SSL delay tokens not released in time (%s tokens remaining)", |info$delay_tokens|)); From 99c89b42d7b945108fd2d75ad59a89a0b93c2144 Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Thu, 1 Aug 2013 10:35:47 -0500 Subject: [PATCH 31/40] Internal refactoring of how plugin components are tagged/managed. Made some class templates for code that seemed duplicated between file/protocol tags and managers. 
Seems like it helps a bit and hopefully can be also be used to transition other things that have enum value "tags" (e.g. logging writers, input readers) to the plugin system. --- src/CMakeLists.txt | 2 + src/DebugLogger.cc | 3 +- src/DebugLogger.h | 1 + src/RuleAction.cc | 10 +- src/analyzer/Analyzer.cc | 10 +- src/analyzer/Component.cc | 19 +- src/analyzer/Component.h | 15 +- src/analyzer/Manager.cc | 114 ++------ src/analyzer/Manager.h | 54 +--- src/analyzer/Tag.h | 5 +- src/analyzer/analyzer.bif | 4 +- src/file_analysis/Analyzer.cc | 2 +- src/file_analysis/AnalyzerSet.cc | 12 +- src/file_analysis/Component.cc | 22 +- src/file_analysis/Component.h | 24 +- src/file_analysis/File.cc | 2 +- src/file_analysis/FileTimer.cc | 2 +- src/file_analysis/Manager.cc | 84 +----- src/file_analysis/Manager.h | 59 +---- src/file_analysis/Tag.h | 9 +- .../analyzer/data_event/DataEvent.cc | 2 +- src/file_analysis/analyzer/extract/Extract.cc | 2 +- src/file_analysis/analyzer/hash/Hash.cc | 2 +- src/file_analysis/file_analysis.bif | 6 +- src/plugin/ComponentManager.h | 248 ++++++++++++++++++ src/plugin/TaggedComponent.h | 85 ++++++ 26 files changed, 432 insertions(+), 366 deletions(-) create mode 100644 src/plugin/ComponentManager.h create mode 100644 src/plugin/TaggedComponent.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7960579c8a..e64dcbb9f6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -363,6 +363,8 @@ set(bro_SRCS 3rdparty/sqlite3.c plugin/Component.cc + plugin/ComponentManager.h + plugin/TaggedComponent.h plugin/Manager.cc plugin/Plugin.cc plugin/Macros.h diff --git a/src/DebugLogger.cc b/src/DebugLogger.cc index 380f21aa5f..dc557c4a0a 100644 --- a/src/DebugLogger.cc +++ b/src/DebugLogger.cc @@ -16,7 +16,8 @@ DebugLogger::Stream DebugLogger::streams[NUM_DBGS] = { { "notifiers", 0, false }, { "main-loop", 0, false }, { "dpd", 0, false }, { "tm", 0, false }, { "logging", 0, false }, {"input", 0, false }, - { "threading", 0, false }, { 
"file_analysis", 0, false } + { "threading", 0, false }, { "file_analysis", 0, false }, + { "plugins", 0, false} }; DebugLogger::DebugLogger(const char* filename) diff --git a/src/DebugLogger.h b/src/DebugLogger.h index e293b326a8..c5744642f5 100644 --- a/src/DebugLogger.h +++ b/src/DebugLogger.h @@ -27,6 +27,7 @@ enum DebugStream { DBG_INPUT, // Input streams DBG_THREADING, // Threading system DBG_FILE_ANALYSIS, // File analysis + DBG_PLUGINS, NUM_DBGS // Has to be last }; diff --git a/src/RuleAction.cc b/src/RuleAction.cc index a13392ee40..ec57c96bd2 100644 --- a/src/RuleAction.cc +++ b/src/RuleAction.cc @@ -40,7 +40,7 @@ RuleActionAnalyzer::RuleActionAnalyzer(const char* arg_analyzer) string str(arg_analyzer); string::size_type pos = str.find(':'); string arg = str.substr(0, pos); - analyzer = analyzer_mgr->GetAnalyzerTag(arg.c_str()); + analyzer = analyzer_mgr->GetComponentTag(arg.c_str()); if ( ! analyzer ) reporter->Warning("unknown analyzer '%s' specified in rule", arg.c_str()); @@ -48,7 +48,7 @@ RuleActionAnalyzer::RuleActionAnalyzer(const char* arg_analyzer) if ( pos != string::npos ) { arg = str.substr(pos + 1); - child_analyzer = analyzer_mgr->GetAnalyzerTag(arg.c_str()); + child_analyzer = analyzer_mgr->GetComponentTag(arg.c_str()); if ( ! child_analyzer ) reporter->Warning("unknown analyzer '%s' specified in rule", arg.c_str()); @@ -60,11 +60,11 @@ RuleActionAnalyzer::RuleActionAnalyzer(const char* arg_analyzer) void RuleActionAnalyzer::PrintDebug() { if ( ! 
child_analyzer ) - fprintf(stderr, "|%s|\n", analyzer_mgr->GetAnalyzerName(analyzer)); + fprintf(stderr, "|%s|\n", analyzer_mgr->GetComponentName(analyzer)); else fprintf(stderr, "|%s:%s|\n", - analyzer_mgr->GetAnalyzerName(analyzer), - analyzer_mgr->GetAnalyzerName(child_analyzer)); + analyzer_mgr->GetComponentName(analyzer), + analyzer_mgr->GetComponentName(child_analyzer)); } diff --git a/src/analyzer/Analyzer.cc b/src/analyzer/Analyzer.cc index ecd3c9f686..b8b739f3cb 100644 --- a/src/analyzer/Analyzer.cc +++ b/src/analyzer/Analyzer.cc @@ -70,12 +70,12 @@ void AnalyzerTimer::Init(Analyzer* arg_analyzer, analyzer_timer_func arg_timer, Ref(analyzer->Conn()); } -analyzer::ID Analyzer::id_counter = 0;; +analyzer::ID Analyzer::id_counter = 0; const char* Analyzer::GetAnalyzerName() const { assert(tag); - return analyzer_mgr->GetAnalyzerName(tag); + return analyzer_mgr->GetComponentName(tag); } void Analyzer::SetAnalyzerTag(const Tag& arg_tag) @@ -87,7 +87,7 @@ void Analyzer::SetAnalyzerTag(const Tag& arg_tag) bool Analyzer::IsAnalyzer(const char* name) { assert(tag); - return strcmp(analyzer_mgr->GetAnalyzerName(tag), name) == 0; + return strcmp(analyzer_mgr->GetComponentName(tag), name) == 0; } // Used in debugging output. @@ -98,7 +98,7 @@ static string fmt_analyzer(Analyzer* a) Analyzer::Analyzer(const char* name, Connection* conn) { - Tag tag = analyzer_mgr->GetAnalyzerTag(name); + Tag tag = analyzer_mgr->GetComponentTag(name); if ( ! tag ) reporter->InternalError("unknown analyzer name %s; mismatch with tag analyzer::Component?", name); @@ -494,7 +494,7 @@ Analyzer* Analyzer::FindChild(Tag arg_tag) Analyzer* Analyzer::FindChild(const char* name) { - Tag tag = analyzer_mgr->GetAnalyzerTag(name); + Tag tag = analyzer_mgr->GetComponentTag(name); return tag ? 
FindChild(tag) : 0; } diff --git a/src/analyzer/Component.cc b/src/analyzer/Component.cc index ded0a1a2d5..66ab2213bb 100644 --- a/src/analyzer/Component.cc +++ b/src/analyzer/Component.cc @@ -8,29 +8,26 @@ using namespace analyzer; -analyzer::Tag::type_t Component::type_counter = 0; - Component::Component(const char* arg_name, factory_callback arg_factory, Tag::subtype_t arg_subtype, bool arg_enabled, bool arg_partial) - : plugin::Component(plugin::component::ANALYZER) + : plugin::Component(plugin::component::ANALYZER), + plugin::TaggedComponent(arg_subtype) { name = copy_string(arg_name); canon_name = canonify_name(arg_name); factory = arg_factory; enabled = arg_enabled; partial = arg_partial; - - tag = analyzer::Tag(++type_counter, arg_subtype); } Component::Component(const Component& other) - : plugin::Component(Type()) + : plugin::Component(Type()), + plugin::TaggedComponent(other) { name = copy_string(other.name); canon_name = copy_string(other.canon_name); factory = other.factory; enabled = other.enabled; partial = other.partial; - tag = other.tag; } Component::~Component() @@ -39,11 +36,6 @@ Component::~Component() delete [] canon_name; } -analyzer::Tag Component::Tag() const - { - return tag; - } - void Component::Describe(ODesc* d) const { plugin::Component::Describe(d); @@ -63,13 +55,14 @@ void Component::Describe(ODesc* d) const Component& Component::operator=(const Component& other) { + plugin::TaggedComponent::operator=(other); + if ( &other != this ) { name = copy_string(other.name); factory = other.factory; enabled = other.enabled; partial = other.partial; - tag = other.tag; } return *this; diff --git a/src/analyzer/Component.h b/src/analyzer/Component.h index 9e12ed347e..9bc8b357d7 100644 --- a/src/analyzer/Component.h +++ b/src/analyzer/Component.h @@ -5,6 +5,7 @@ #include "Tag.h" #include "plugin/Component.h" +#include "plugin/TaggedComponent.h" #include "../config.h" #include "../util.h" @@ -21,7 +22,8 @@ class Analyzer; * A plugin can provide a 
specific protocol analyzer by registering this * analyzer component, describing the analyzer. */ -class Component : public plugin::Component { +class Component : public plugin::Component, + public plugin::TaggedComponent { public: typedef Analyzer* (*factory_callback)(Connection* conn); @@ -100,13 +102,6 @@ public: */ bool Enabled() const { return enabled; } - /** - * Returns the analyzer's tag. Note that this is automatically - * generated for each new Components, and hence unique across all of - * them. - */ - analyzer::Tag Tag() const; - /** * Enables or disables this analyzer. * @@ -128,11 +123,7 @@ private: const char* canon_name; // The analyzer's canonical name. factory_callback factory; // The analyzer's factory callback. bool partial; // True if the analyzer supports partial connections. - analyzer::Tag tag; // The automatically assigned analyzer tag. bool enabled; // True if the analyzer is enabled. - - // Global counter used to generate unique tags. - static analyzer::Tag::type_t type_counter; }; } diff --git a/src/analyzer/Manager.cc b/src/analyzer/Manager.cc index 6eb162f204..82453aef06 100644 --- a/src/analyzer/Manager.cc +++ b/src/analyzer/Manager.cc @@ -60,10 +60,8 @@ bool Manager::ConnIndex::operator<(const ConnIndex& other) const } Manager::Manager() + : plugin::ComponentManager("Analyzer") { - tag_enum_type = new EnumType("Analyzer::Tag"); - ::ID* id = install_ID("Tag", "Analyzer", true, true); - add_type(id, tag_enum_type, 0, 0); } Manager::~Manager() @@ -91,14 +89,14 @@ void Manager::InitPreScript() std::list analyzers = plugin_mgr->Components(); for ( std::list::const_iterator i = analyzers.begin(); i != analyzers.end(); i++ ) - RegisterAnalyzerComponent(*i); + RegisterComponent(*i, "ANALYZER_"); // Cache these tags. 
- analyzer_backdoor = GetAnalyzerTag("BACKDOOR"); - analyzer_connsize = GetAnalyzerTag("CONNSIZE"); - analyzer_interconn = GetAnalyzerTag("INTERCONN"); - analyzer_stepping = GetAnalyzerTag("STEPPINGSTONE"); - analyzer_tcpstats = GetAnalyzerTag("TCPSTATS"); + analyzer_backdoor = GetComponentTag("BACKDOOR"); + analyzer_connsize = GetComponentTag("CONNSIZE"); + analyzer_interconn = GetComponentTag("INTERCONN"); + analyzer_stepping = GetComponentTag("STEPPINGSTONE"); + analyzer_tcpstats = GetComponentTag("TCPSTATS"); } void Manager::InitPostScript() @@ -109,8 +107,9 @@ void Manager::DumpDebug() { #ifdef DEBUG DBG_LOG(DBG_ANALYZER, "Available analyzers after bro_init():"); - for ( analyzer_map_by_name::const_iterator i = analyzers_by_name.begin(); i != analyzers_by_name.end(); i++ ) - DBG_LOG(DBG_ANALYZER, " %s (%s)", i->second->Name(), IsEnabled(i->second->Tag()) ? "enabled" : "disabled"); + list all_analyzers = GetComponents(); + for ( list::const_iterator i = all_analyzers.begin(); i != all_analyzers.end(); ++i ) + DBG_LOG(DBG_ANALYZER, " %s (%s)", (*i)->Name(), IsEnabled((*i)->Tag()) ? 
"enabled" : "disabled"); DBG_LOG(DBG_ANALYZER, ""); DBG_LOG(DBG_ANALYZER, "Analyzers by port:"); @@ -120,7 +119,7 @@ void Manager::DumpDebug() string s; for ( tag_set::const_iterator j = i->second->begin(); j != i->second->end(); j++ ) - s += string(GetAnalyzerName(*j)) + " "; + s += string(GetComponentName(*j)) + " "; DBG_LOG(DBG_ANALYZER, " %d/tcp: %s", i->first, s.c_str()); } @@ -130,7 +129,7 @@ void Manager::DumpDebug() string s; for ( tag_set::const_iterator j = i->second->begin(); j != i->second->end(); j++ ) - s += string(GetAnalyzerName(*j)) + " "; + s += string(GetComponentName(*j)) + " "; DBG_LOG(DBG_ANALYZER, " %d/udp: %s", i->first, s.c_str()); } @@ -142,25 +141,6 @@ void Manager::Done() { } -void Manager::RegisterAnalyzerComponent(Component* component) - { - const char* cname = component->CanonicalName(); - - if ( Lookup(cname) ) - reporter->FatalError("Analyzer %s defined more than once", cname); - - DBG_LOG(DBG_ANALYZER, "Registering analyzer %s (tag %s)", - component->Name(), component->Tag().AsString().c_str()); - - analyzers_by_name.insert(std::make_pair(cname, component)); - analyzers_by_tag.insert(std::make_pair(component->Tag(), component)); - analyzers_by_val.insert(std::make_pair(component->Tag().AsEnumVal()->InternalInt(), component)); - - // Install enum "Analyzer::ANALYZER_*" - string id = fmt("ANALYZER_%s", cname); - tag_enum_type->AddName("Analyzer", id.c_str(), component->Tag().AsEnumVal()->InternalInt(), true); - } - bool Manager::EnableAnalyzer(Tag tag) { Component* p = Lookup(tag); @@ -217,8 +197,9 @@ void Manager::DisableAllAnalyzers() { DBG_LOG(DBG_ANALYZER, "Disabling all analyzers"); - for ( analyzer_map_by_tag::const_iterator i = analyzers_by_tag.begin(); i != analyzers_by_tag.end(); i++ ) - i->second->SetEnabled(false); + list all_analyzers = GetComponents(); + for ( list::const_iterator i = all_analyzers.begin(); i != all_analyzers.end(); ++i ) + (*i)->SetEnabled(false); } bool Manager::IsEnabled(Tag tag) @@ -270,7 +251,7 @@ 
bool Manager::RegisterAnalyzerForPort(Tag tag, TransportProto proto, uint32 port tag_set* l = LookupPort(proto, port, true); #ifdef DEBUG - const char* name = GetAnalyzerName(tag); + const char* name = GetComponentName(tag); DBG_LOG(DBG_ANALYZER, "Registering analyzer %s for port %" PRIu32 "/%d", name, port, proto); #endif @@ -283,7 +264,7 @@ bool Manager::UnregisterAnalyzerForPort(Tag tag, TransportProto proto, uint32 po tag_set* l = LookupPort(proto, port, true); #ifdef DEBUG - const char* name = GetAnalyzerName(tag); + const char* name = GetComponentName(tag); DBG_LOG(DBG_ANALYZER, "Unregistering analyzer %s for port %" PRIu32 "/%d", name, port, proto); #endif @@ -302,7 +283,7 @@ Analyzer* Manager::InstantiateAnalyzer(Tag tag, Connection* conn) return 0; if ( ! c->Factory() ) - reporter->InternalError("analyzer %s cannot be instantiated dynamically", GetAnalyzerName(tag)); + reporter->InternalError("analyzer %s cannot be instantiated dynamically", GetComponentName(tag)); Analyzer* a = c->Factory()(conn); @@ -316,59 +297,10 @@ Analyzer* Manager::InstantiateAnalyzer(Tag tag, Connection* conn) Analyzer* Manager::InstantiateAnalyzer(const char* name, Connection* conn) { - Tag tag = GetAnalyzerTag(name); + Tag tag = GetComponentTag(name); return tag ? InstantiateAnalyzer(tag, conn) : 0; } -const char* Manager::GetAnalyzerName(Tag tag) - { - static const char* error = ""; - - if ( ! tag ) - return error; - - Component* c = Lookup(tag); - - if ( ! c ) - reporter->InternalError("request for name of unknown analyzer tag %s", tag.AsString().c_str()); - - return c->CanonicalName(); - } - -const char* Manager::GetAnalyzerName(Val* val) - { - return GetAnalyzerName(Tag(val->AsEnumVal())); - } - -analyzer::Tag Manager::GetAnalyzerTag(const char* name) - { - Component* c = Lookup(name); - return c ? 
c->Tag() : Tag(); - } - -EnumType* Manager::GetTagEnumType() - { - return tag_enum_type; - } - -Component* Manager::Lookup(const char* name) - { - analyzer_map_by_name::const_iterator i = analyzers_by_name.find(to_upper(name)); - return i != analyzers_by_name.end() ? i->second : 0; - } - -Component* Manager::Lookup(const Tag& tag) - { - analyzer_map_by_tag::const_iterator i = analyzers_by_tag.find(tag); - return i != analyzers_by_tag.end() ? i->second : 0; - } - -Component* Manager::Lookup(EnumVal* val) - { - analyzer_map_by_val::const_iterator i = analyzers_by_val.find(val->InternalInt()); - return i != analyzers_by_val.end() ? i->second : 0; - } - Manager::tag_set* Manager::LookupPort(TransportProto proto, uint32 port, bool add_if_not_found) { analyzer_map_by_port* m = 0; @@ -461,7 +393,7 @@ bool Manager::BuildInitialAnalyzerTree(Connection* conn) root->AddChildAnalyzer(analyzer, false); DBG_ANALYZER_ARGS(conn, "activated %s analyzer as scheduled", - analyzer_mgr->GetAnalyzerName(*i)); + analyzer_mgr->GetComponentName(*i)); } } @@ -487,7 +419,7 @@ bool Manager::BuildInitialAnalyzerTree(Connection* conn) root->AddChildAnalyzer(analyzer, false); DBG_ANALYZER_ARGS(conn, "activated %s analyzer due to port %d", - analyzer_mgr->GetAnalyzerName(*j), resp_port); + analyzer_mgr->GetComponentName(*j), resp_port); } } } @@ -613,7 +545,7 @@ void Manager::ExpireScheduledAnalyzers() conns.erase(i); DBG_LOG(DBG_ANALYZER, "Expiring expected analyzer %s for connection %s", - analyzer_mgr->GetAnalyzerName(a->analyzer), + analyzer_mgr->GetComponentName(a->analyzer), fmt_conn_id(a->conn.orig, 0, a->conn.resp, a->conn.resp_p)); delete a; @@ -655,7 +587,7 @@ void Manager::ScheduleAnalyzer(const IPAddr& orig, const IPAddr& resp, TransportProto proto, const char* analyzer, double timeout) { - Tag tag = GetAnalyzerTag(analyzer); + Tag tag = GetComponentTag(analyzer); if ( tag != Tag() ) ScheduleAnalyzer(orig, resp, resp_p, proto, tag, timeout); diff --git a/src/analyzer/Manager.h 
b/src/analyzer/Manager.h index efae629971..d151709eda 100644 --- a/src/analyzer/Manager.h +++ b/src/analyzer/Manager.h @@ -26,6 +26,7 @@ #include "Analyzer.h" #include "Component.h" #include "Tag.h" +#include "plugin/ComponentManager.h" #include "../Dict.h" #include "../net_util.h" @@ -49,7 +50,7 @@ namespace analyzer { * classes. This allows to external analyzer code to potentially use a * different C++ standard library. */ -class Manager { +class Manager : public plugin::ComponentManager { public: /** * Constructor. @@ -231,42 +232,6 @@ public: */ Analyzer* InstantiateAnalyzer(const char* name, Connection* c); - /** - * Translates an analyzer tag into corresponding analyzer name. - * - * @param tag The analyzer tag. - * - * @return The name, or an empty string if the tag is invalid. - */ - const char* GetAnalyzerName(Tag tag); - - /** - * Translates an script-level analyzer tag into corresponding - * analyzer name. - * - * @param val The analyzer tag as an script-level enum value of type - * \c Analyzer::Tag. - * - * @return The name, or an empty string if the tag is invalid. - */ - const char* GetAnalyzerName(Val* val); - - /** - * Translates an analyzer name into the corresponding tag. - * - * @param name The name. - * - * @return The tag. If the name does not correspond to a valid - * analyzer, the returned tag will evaluate to false. - */ - Tag GetAnalyzerTag(const char* name); - - /** - * Returns the enum type that corresponds to the script-level type \c - * Analyzer::Tag. - */ - EnumType* GetTagEnumType(); - /** * Given the first packet of a connection, builds its initial * analyzer tree. @@ -350,18 +315,8 @@ public: private: typedef set tag_set; - typedef map analyzer_map_by_name; - typedef map analyzer_map_by_tag; - typedef map analyzer_map_by_val; typedef map analyzer_map_by_port; - void RegisterAnalyzerComponent(Component* component); // Takes ownership. 
- - Component* Lookup(const string& name); - Component* Lookup(const char* name); - Component* Lookup(const Tag& tag); - Component* Lookup(EnumVal* val); - tag_set* LookupPort(PortVal* val, bool add_if_not_found); tag_set* LookupPort(TransportProto proto, uint32 port, bool add_if_not_found); @@ -370,9 +325,6 @@ private: analyzer_map_by_port analyzers_by_port_tcp; analyzer_map_by_port analyzers_by_port_udp; - analyzer_map_by_name analyzers_by_name; - analyzer_map_by_tag analyzers_by_tag; - analyzer_map_by_val analyzers_by_val; Tag analyzer_backdoor; Tag analyzer_connsize; @@ -380,8 +332,6 @@ private: Tag analyzer_stepping; Tag analyzer_tcpstats; - EnumType* tag_enum_type; - //// Data structures to track analyzed scheduled for future connections. // The index for a scheduled connection. diff --git a/src/analyzer/Tag.h b/src/analyzer/Tag.h index 8ac151e4b5..d01c8902ee 100644 --- a/src/analyzer/Tag.h +++ b/src/analyzer/Tag.h @@ -6,6 +6,8 @@ #include "config.h" #include "util.h" #include "../Tag.h" +#include "plugin/TaggedComponent.h" +#include "plugin/ComponentManager.h" class EnumVal; @@ -87,7 +89,8 @@ public: protected: friend class analyzer::Manager; - friend class analyzer::Component; + friend class plugin::ComponentManager; + friend class plugin::TaggedComponent; /** * Constructor. 
diff --git a/src/analyzer/analyzer.bif b/src/analyzer/analyzer.bif index 4d70816075..ebf8083624 100644 --- a/src/analyzer/analyzer.bif +++ b/src/analyzer/analyzer.bif @@ -41,11 +41,11 @@ function Analyzer::__schedule_analyzer%(orig: addr, resp: addr, resp_p: port, function __name%(atype: Analyzer::Tag%) : string %{ - return new StringVal(analyzer_mgr->GetAnalyzerName(atype)); + return new StringVal(analyzer_mgr->GetComponentName(atype)); %} function __tag%(name: string%) : Analyzer::Tag %{ - analyzer::Tag t = analyzer_mgr->GetAnalyzerTag(name->CheckString()); + analyzer::Tag t = analyzer_mgr->GetComponentTag(name->CheckString()); return t.AsEnumVal()->Ref(); %} diff --git a/src/file_analysis/Analyzer.cc b/src/file_analysis/Analyzer.cc index d472f4c80c..e0b5011aa8 100644 --- a/src/file_analysis/Analyzer.cc +++ b/src/file_analysis/Analyzer.cc @@ -6,6 +6,6 @@ file_analysis::Analyzer::~Analyzer() { DBG_LOG(DBG_FILE_ANALYSIS, "Destroy file analyzer %s", - file_mgr->GetAnalyzerName(tag)); + file_mgr->GetComponentName(tag)); Unref(args); } diff --git a/src/file_analysis/AnalyzerSet.cc b/src/file_analysis/AnalyzerSet.cc index befb676c87..2dc4902314 100644 --- a/src/file_analysis/AnalyzerSet.cc +++ b/src/file_analysis/AnalyzerSet.cc @@ -42,7 +42,7 @@ bool AnalyzerSet::Add(file_analysis::Tag tag, RecordVal* args) if ( analyzer_map.Lookup(key) ) { DBG_LOG(DBG_FILE_ANALYSIS, "Instantiate analyzer %s skipped for file id" - " %s: already exists", file_mgr->GetAnalyzerName(tag), + " %s: already exists", file_mgr->GetComponentName(tag), file->GetID().c_str()); delete key; return true; @@ -82,7 +82,7 @@ bool AnalyzerSet::AddMod::Perform(AnalyzerSet* set) if ( set->analyzer_map.Lookup(key) ) { DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %s skipped for file id" - " %s: already exists", file_mgr->GetAnalyzerName(a->Tag()), + " %s: already exists", file_mgr->GetComponentName(a->Tag()), a->GetFile()->GetID().c_str()); Abort(); @@ -108,12 +108,12 @@ bool 
AnalyzerSet::Remove(file_analysis::Tag tag, HashKey* key) if ( ! a ) { DBG_LOG(DBG_FILE_ANALYSIS, "Skip remove analyzer %s for file id %s", - file_mgr->GetAnalyzerName(tag), file->GetID().c_str()); + file_mgr->GetComponentName(tag), file->GetID().c_str()); return false; } DBG_LOG(DBG_FILE_ANALYSIS, "Remove analyzer %s for file id %s", - file_mgr->GetAnalyzerName(tag), + file_mgr->GetComponentName(tag), file->GetID().c_str()); delete a; @@ -155,7 +155,7 @@ file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(Tag tag, if ( ! a ) { reporter->Error("Failed file analyzer %s instantiation for file id %s", - file_mgr->GetAnalyzerName(tag), file->GetID().c_str()); + file_mgr->GetComponentName(tag), file->GetID().c_str()); return 0; } @@ -165,7 +165,7 @@ file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(Tag tag, void AnalyzerSet::Insert(file_analysis::Analyzer* a, HashKey* key) { DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %s for file id %s", - file_mgr->GetAnalyzerName(a->Tag()), file->GetID().c_str()); + file_mgr->GetComponentName(a->Tag()), file->GetID().c_str()); analyzer_map.Insert(key, a); delete key; } diff --git a/src/file_analysis/Component.cc b/src/file_analysis/Component.cc index 8ddd9cceaf..9c47f2c75e 100644 --- a/src/file_analysis/Component.cc +++ b/src/file_analysis/Component.cc @@ -8,26 +8,22 @@ using namespace file_analysis; -file_analysis::Tag::type_t Component::type_counter = 0; - -Component::Component(const char* arg_name, factory_callback arg_factory, - file_analysis::Tag::subtype_t arg_subtype) - : plugin::Component(plugin::component::FILE_ANALYZER) +Component::Component(const char* arg_name, factory_callback arg_factory) + : plugin::Component(plugin::component::FILE_ANALYZER), + plugin::TaggedComponent() { name = copy_string(arg_name); canon_name = canonify_name(arg_name); factory = arg_factory; - - tag = file_analysis::Tag(++type_counter, arg_subtype); } Component::Component(const Component& other) - : plugin::Component(Type()) + : 
plugin::Component(Type()), + plugin::TaggedComponent(other) { name = copy_string(other.name); canon_name = copy_string(other.canon_name); factory = other.factory; - tag = other.tag; } Component::~Component() @@ -36,11 +32,6 @@ Component::~Component() delete [] canon_name; } -file_analysis::Tag Component::Tag() const - { - return tag; - } - void Component::Describe(ODesc* d) const { plugin::Component::Describe(d); @@ -58,11 +49,12 @@ void Component::Describe(ODesc* d) const Component& Component::operator=(const Component& other) { + plugin::TaggedComponent::operator=(other); + if ( &other != this ) { name = copy_string(other.name); factory = other.factory; - tag = other.tag; } return *this; diff --git a/src/file_analysis/Component.h b/src/file_analysis/Component.h index bd690bc081..4cf2dced60 100644 --- a/src/file_analysis/Component.h +++ b/src/file_analysis/Component.h @@ -5,6 +5,7 @@ #include "Tag.h" #include "plugin/Component.h" +#include "plugin/TaggedComponent.h" #include "Val.h" @@ -22,7 +23,8 @@ class Analyzer; * A plugin can provide a specific file analyzer by registering this * analyzer component, describing the analyzer. */ -class Component : public plugin::Component { +class Component : public plugin::Component, + public plugin::TaggedComponent { public: typedef Analyzer* (*factory_callback)(RecordVal* args, File* file); @@ -38,15 +40,8 @@ public: * from file_analysis::Analyzer. This is typically a static \c * Instatiate() method inside the class that just allocates and * returns a new instance. - * - * @param subtype A subtype associated with this component that - * further distinguishes it. The subtype will be integrated into - * the file_analysis::Tag that the manager associates with this analyzer, - * and analyzer instances can accordingly access it via - * file_analysis::Tag(). If not used, leave at zero. 
*/ - Component(const char* name, factory_callback factory, - file_analysis::Tag::subtype_t subtype = 0); + Component(const char* name, factory_callback factory); /** * Copy constructor. @@ -79,13 +74,6 @@ public: */ factory_callback Factory() const { return factory; } - /** - * Returns the analyzer's tag. Note that this is automatically - * generated for each new Components, and hence unique across all of - * them. - */ - file_analysis::Tag Tag() const; - /** * Generates a human-readable description of the component's main * parameters. This goes into the output of \c "bro -NN". @@ -98,10 +86,6 @@ private: const char* name; // The analyzer's name. const char* canon_name; // The analyzer's canonical name. factory_callback factory; // The analyzer's factory callback. - file_analysis::Tag tag; // The automatically assigned analyzer tag. - - // Global counter used to generate unique tags. - static file_analysis::Tag::type_t type_counter; }; } diff --git a/src/file_analysis/File.cc b/src/file_analysis/File.cc index a27070174b..1197cd06f6 100644 --- a/src/file_analysis/File.cc +++ b/src/file_analysis/File.cc @@ -88,7 +88,7 @@ File::File(const string& file_id, Connection* conn, analyzer::Tag tag, if ( conn ) { // add source, connection, is_orig fields - SetSource(analyzer_mgr->GetAnalyzerName(tag)); + SetSource(analyzer_mgr->GetComponentName(tag)); val->Assign(is_orig_idx, new Val(is_orig, TYPE_BOOL)); UpdateConnectionFields(conn, is_orig); } diff --git a/src/file_analysis/FileTimer.cc b/src/file_analysis/FileTimer.cc index 575857fd15..6b1d70f136 100644 --- a/src/file_analysis/FileTimer.cc +++ b/src/file_analysis/FileTimer.cc @@ -14,7 +14,7 @@ FileTimer::FileTimer(double t, const string& id, double interval) void FileTimer::Dispatch(double t, int is_expire) { - File* file = file_mgr->Lookup(file_id); + File* file = file_mgr->LookupFile(file_id); if ( ! 
file ) return; diff --git a/src/file_analysis/Manager.cc b/src/file_analysis/Manager.cc index 243786b83a..b7f4335717 100644 --- a/src/file_analysis/Manager.cc +++ b/src/file_analysis/Manager.cc @@ -18,10 +18,8 @@ TableVal* Manager::disabled = 0; string Manager::salt; Manager::Manager() + : ComponentManager("Files") { - tag_enum_type = new EnumType("Files::Tag"); - ::ID* id = install_ID("Tag", "Files", true, true); - add_type(id, tag_enum_type, 0, 0); } Manager::~Manager() @@ -35,27 +33,7 @@ void Manager::InitPreScript() for ( std::list::const_iterator i = analyzers.begin(); i != analyzers.end(); ++i ) - RegisterAnalyzerComponent(*i); - } - -void Manager::RegisterAnalyzerComponent(Component* component) - { - const char* cname = component->CanonicalName(); - - if ( tag_enum_type->Lookup("Files", cname) != -1 ) - reporter->FatalError("File Analyzer %s defined more than once", cname); - - DBG_LOG(DBG_FILE_ANALYSIS, "Registering analyzer %s (tag %s)", - component->Name(), component->Tag().AsString().c_str()); - - analyzers_by_name.insert(std::make_pair(cname, component)); - analyzers_by_tag.insert(std::make_pair(component->Tag(), component)); - analyzers_by_val.insert(std::make_pair( - component->Tag().AsEnumVal()->InternalInt(), component)); - - string id = fmt("ANALYZER_%s", cname); - tag_enum_type->AddName("Files", id.c_str(), - component->Tag().AsEnumVal()->InternalInt(), true); + RegisterComponent(*i, "ANALYZER_"); } void Manager::InitPostScript() @@ -193,7 +171,7 @@ void Manager::SetSize(uint64 size, analyzer::Tag tag, Connection* conn, bool Manager::SetTimeoutInterval(const string& file_id, double interval) const { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! 
file ) return false; @@ -208,7 +186,7 @@ bool Manager::SetTimeoutInterval(const string& file_id, double interval) const bool Manager::AddAnalyzer(const string& file_id, file_analysis::Tag tag, RecordVal* args) const { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! file ) return false; @@ -219,7 +197,7 @@ bool Manager::AddAnalyzer(const string& file_id, file_analysis::Tag tag, bool Manager::RemoveAnalyzer(const string& file_id, file_analysis::Tag tag, RecordVal* args) const { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! file ) return false; @@ -257,7 +235,7 @@ File* Manager::GetFile(const string& file_id, Connection* conn, return rval; } -File* Manager::Lookup(const string& file_id) const +File* Manager::LookupFile(const string& file_id) const { IDMap::const_iterator it = id_map.find(file_id); @@ -269,7 +247,7 @@ File* Manager::Lookup(const string& file_id) const void Manager::Timeout(const string& file_id, bool is_terminating) { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! file ) return; @@ -370,59 +348,15 @@ bool Manager::IsDisabled(analyzer::Tag tag) Analyzer* Manager::InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const { - analyzer_map_by_tag::const_iterator it = analyzers_by_tag.find(tag); + Component* c = Lookup(tag); - if ( it == analyzers_by_tag.end() ) + if ( ! c ) reporter->InternalError("cannot instantiate unknown file analyzer: %s", tag.AsString().c_str()); - Component* c = it->second; - if ( ! 
c->Factory() ) reporter->InternalError("file analyzer %s cannot be instantiated " "dynamically", c->CanonicalName()); return c->Factory()(args, f); } - -const char* Manager::GetAnalyzerName(Val* v) const - { - return GetAnalyzerName(file_analysis::Tag(v->AsEnumVal())); - } - -const char* Manager::GetAnalyzerName(file_analysis::Tag tag) const - { - analyzer_map_by_tag::const_iterator it = analyzers_by_tag.find(tag); - - if ( it == analyzers_by_tag.end() ) - reporter->InternalError("cannot get name of unknown file analyzer: %s", - tag.AsString().c_str()); - - return it->second->CanonicalName(); - } - -file_analysis::Tag Manager::GetAnalyzerTag(const char* name) const - { - analyzer_map_by_name::const_iterator it = analyzers_by_name.find(name); - - if ( it == analyzers_by_name.end() ) - return file_analysis::Tag(); - - return it->second->Tag(); - } - -file_analysis::Tag Manager::GetAnalyzerTag(Val* v) const - { - analyzer_map_by_val::const_iterator it = - analyzers_by_val.find(v->AsEnumVal()->InternalInt()); - - if ( it == analyzers_by_val.end() ) - return file_analysis::Tag(); - - return it->second->Tag(); - } - -EnumType* Manager::GetTagEnumType() - { - return tag_enum_type; - } diff --git a/src/file_analysis/Manager.h b/src/file_analysis/Manager.h index 9a37042669..dcf33edc99 100644 --- a/src/file_analysis/Manager.h +++ b/src/file_analysis/Manager.h @@ -18,7 +18,8 @@ #include "File.h" #include "FileTimer.h" #include "Component.h" - +#include "Tag.h" +#include "plugin/ComponentManager.h" #include "analyzer/Tag.h" #include "file_analysis/file_analysis.bif.h" @@ -28,7 +29,7 @@ namespace file_analysis { /** * Main entry point for interacting with file analysis. */ -class Manager { +class Manager : public plugin::ComponentManager { public: /** @@ -210,48 +211,6 @@ public: */ Analyzer* InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const; - /** - * Translates a script-level file analyzer tag in to corresponding file - * analyzer name. 
- * @param v The enum val of a file analyzer. - * @return The human-readable name of the file analyzer. - */ - const char* GetAnalyzerName(Val* v) const; - - /** - * Translates a script-level file analyzer tag in to corresponding file - * analyzer name. - * @param tag The analyzer tag of a file analyzer. - * @return The human-readable name of the file analyzer. - */ - const char* GetAnalyzerName(file_analysis::Tag tag) const; - - /** - * Translates an analyzer name into the corresponding tag. - * - * @param name The name. - * - * @return The tag. If the name does not correspond to a valid - * analyzer, the returned tag will evaluate to false. - */ - file_analysis::Tag GetAnalyzerTag(const char* name) const; - - /** - * Translates an analyzer enum value into the corresponding tag. - * - * @param v the enum val of the file analyzer. - * - * @return The tag. If the val does not correspond to a valid - * analyzer, the returned tag will evaluate to false. - */ - file_analysis::Tag GetAnalyzerTag(Val* v) const; - - /** - * Returns the enum type that corresponds to the script-level type - * \c Files::Tag. - */ - EnumType* GetTagEnumType(); - protected: friend class FileTimer; @@ -285,7 +244,7 @@ protected: * @return the File object mapped to \a file_id, or a null pointer if no * mapping exists. */ - File* Lookup(const string& file_id) const; + File* LookupFile(const string& file_id) const; /** * Evaluate timeout policy for a file and remove the File object mapped to @@ -325,20 +284,10 @@ protected: static bool IsDisabled(analyzer::Tag tag); private: - typedef map analyzer_map_by_name; - typedef map analyzer_map_by_tag; - typedef map analyzer_map_by_val; - - void RegisterAnalyzerComponent(Component* component); IDMap id_map; /**< Map file ID to file_analysis::File records. */ IDSet ignored; /**< Ignored files. Will be finally removed on EOF. */ string current_file_id; /**< Hash of what get_file_handle event sets. */ - EnumType* tag_enum_type; /**< File analyzer tag type. 
*/ - - analyzer_map_by_name analyzers_by_name; - analyzer_map_by_tag analyzers_by_tag; - analyzer_map_by_val analyzers_by_val; static TableVal* disabled; /**< Table of disabled analyzers. */ static string salt; /**< A salt added to file handles before hashing. */ diff --git a/src/file_analysis/Tag.h b/src/file_analysis/Tag.h index 85c20da5b5..aa38836403 100644 --- a/src/file_analysis/Tag.h +++ b/src/file_analysis/Tag.h @@ -6,12 +6,13 @@ #include "config.h" #include "util.h" #include "../Tag.h" +#include "plugin/TaggedComponent.h" +#include "plugin/ComponentManager.h" class EnumVal; namespace file_analysis { -class Manager; class Component; /** @@ -87,15 +88,15 @@ public: static Tag Error; protected: - friend class file_analysis::Manager; - friend class file_analysis::Component; + friend class plugin::ComponentManager; + friend class plugin::TaggedComponent; /** * Constructor. * * @param type The main type. Note that the \a file_analysis::Manager * manages the value space internally, so noone else should assign - * main tyoes. + * main types. * * @param subtype The sub type, which is left to an analyzer for * interpretation. By default it's set to zero. 
diff --git a/src/file_analysis/analyzer/data_event/DataEvent.cc b/src/file_analysis/analyzer/data_event/DataEvent.cc index 44498f41e1..cf2d7e52ec 100644 --- a/src/file_analysis/analyzer/data_event/DataEvent.cc +++ b/src/file_analysis/analyzer/data_event/DataEvent.cc @@ -12,7 +12,7 @@ using namespace file_analysis; DataEvent::DataEvent(RecordVal* args, File* file, EventHandlerPtr ce, EventHandlerPtr se) - : file_analysis::Analyzer(file_mgr->GetAnalyzerTag("DATA_EVENT"), + : file_analysis::Analyzer(file_mgr->GetComponentTag("DATA_EVENT"), args, file), chunk_event(ce), stream_event(se) { diff --git a/src/file_analysis/analyzer/extract/Extract.cc b/src/file_analysis/analyzer/extract/Extract.cc index 0de1402939..28b5cf5a63 100644 --- a/src/file_analysis/analyzer/extract/Extract.cc +++ b/src/file_analysis/analyzer/extract/Extract.cc @@ -9,7 +9,7 @@ using namespace file_analysis; Extract::Extract(RecordVal* args, File* file, const string& arg_filename) - : file_analysis::Analyzer(file_mgr->GetAnalyzerTag("EXTRACT"), args, file), + : file_analysis::Analyzer(file_mgr->GetComponentTag("EXTRACT"), args, file), filename(arg_filename) { fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0666); diff --git a/src/file_analysis/analyzer/hash/Hash.cc b/src/file_analysis/analyzer/hash/Hash.cc index 12463df8bf..9829934301 100644 --- a/src/file_analysis/analyzer/hash/Hash.cc +++ b/src/file_analysis/analyzer/hash/Hash.cc @@ -10,7 +10,7 @@ using namespace file_analysis; Hash::Hash(RecordVal* args, File* file, HashVal* hv, const char* arg_kind) - : file_analysis::Analyzer(file_mgr->GetAnalyzerTag(to_upper(string(arg_kind)).c_str()), args, file), hash(hv), fed(false), kind(arg_kind) + : file_analysis::Analyzer(file_mgr->GetComponentTag(to_upper(arg_kind).c_str()), args, file), hash(hv), fed(false), kind(arg_kind) { hash->Init(); } diff --git a/src/file_analysis/file_analysis.bif b/src/file_analysis/file_analysis.bif index 7e07ddf6bb..0e904f298f 100644 --- 
a/src/file_analysis/file_analysis.bif +++ b/src/file_analysis/file_analysis.bif @@ -21,7 +21,7 @@ function Files::__add_analyzer%(file_id: string, tag: Files::Tag, args: any%): b using BifType::Record::Files::AnalyzerArgs; RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs); bool result = file_mgr->AddAnalyzer(file_id->CheckString(), - file_mgr->GetAnalyzerTag(tag), rv); + file_mgr->GetComponentTag(tag), rv); Unref(rv); return new Val(result, TYPE_BOOL); %} @@ -32,7 +32,7 @@ function Files::__remove_analyzer%(file_id: string, tag: Files::Tag, args: any%) using BifType::Record::Files::AnalyzerArgs; RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs); bool result = file_mgr->RemoveAnalyzer(file_id->CheckString(), - file_mgr->GetAnalyzerTag(tag) , rv); + file_mgr->GetComponentTag(tag) , rv); Unref(rv); return new Val(result, TYPE_BOOL); %} @@ -47,7 +47,7 @@ function Files::__stop%(file_id: string%): bool ## :bro:see:`Files::analyzer_name`. function Files::__analyzer_name%(tag: Files::Tag%) : string %{ - return new StringVal(file_mgr->GetAnalyzerName(tag)); + return new StringVal(file_mgr->GetComponentName(tag)); %} module GLOBAL; diff --git a/src/plugin/ComponentManager.h b/src/plugin/ComponentManager.h new file mode 100644 index 0000000000..16f9d80743 --- /dev/null +++ b/src/plugin/ComponentManager.h @@ -0,0 +1,248 @@ +#ifndef PLUGIN_COMPONENT_MANAGER_H +#define PLUGIN_COMPONENT_MANAGER_H + +#include +#include +#include + +#include "Type.h" +#include "ID.h" +#include "Var.h" +#include "Val.h" +#include "Reporter.h" + +namespace plugin { + +/** + * A class that manages tracking of plugin components (e.g. analyzers) and + * installs identifiers in the script-layer to identify them by a unique tag, + * (a script-layer enum value). + * + * @tparam T A ::Tag type or derivative. + * @tparam C A plugin::TaggedComponent type derivative. 
+ */ +template +class ComponentManager { +public: + + /** + * Constructor creates a new enum type called a "Tag" to associate with + * a component. + * + * @param module The script-layer module in which to install the "Tag" ID + * representing an enum type. + */ + ComponentManager(const string& module); + + /** + * @return The script-layer module in which the component's "Tag" ID lives. + */ + const char* GetModule() const; + + /** + * @return A list of all registered components. + */ + list GetComponents() const; + + /** + * @return The enum type associated with the script-layer "Tag". + */ + EnumType* GetTagEnumType() const; + + /** + * Get a component name from its tag. + * + * @param tag A component's tag. + * @return The canonical component name. + */ + const char* GetComponentName(T tag) const; + + /** + * Get a component name from it's enum value. + * + * @param val A component's enum value. + * @return The canonical component name. + */ + const char* GetComponentName(Val* val) const; + + /** + * Get a component tag from its name. + * + * @param name A component's canonical name. + * @return The component's tag, or a tag representing an error if + * no such component assoicated with the name exists. + */ + T GetComponentTag(const string& name) const; + + /** + * Get a component tag from its enum value. + * + * @param v A component's enum value. + * @return The component's tag, or a tag representing an error if + * no such component assoicated with the value exists. + */ + T GetComponentTag(Val* v) const; + +protected: + + /** + * Add a component the internal maps used to keep track of it and create + * a script-layer ID for the component's enum value. + * + * @param component A component to track. + * @param prefix The script-layer ID associated with the component's enum + * value will be a concatenation of this prefix and the component's + * canonical name. 
+ */ + void RegisterComponent(C* component, const string& prefix = ""); + + /** + * @param name The canonical name of a component. + * @return The component associated with the name or a null pointer if no + * such component exists. + */ + C* Lookup(const string& name) const; + + /** + * @param name A component tag. + * @return The component associated with the tag or a null pointer if no + * such component exists. + */ + C* Lookup(const T& tag) const; + + /** + * @param name A component's enum value. + * @return The component associated with the value or a null pointer if no + * such component exists. + */ + C* Lookup(EnumVal* val) const; + +private: + + string module; /**< Script layer module in which component tags live. */ + EnumType* tag_enum_type; /**< Enum type of component tags. */ + map components_by_name; + map components_by_tag; + map components_by_val; +}; + +template +ComponentManager::ComponentManager(const string& arg_module) + : module(arg_module) + { + tag_enum_type = new EnumType(module + "::Tag"); + ::ID* id = install_ID("Tag", module.c_str(), true, true); + add_type(id, tag_enum_type, 0, 0); + } + +template +const char* ComponentManager::GetModule() const + { + return module.c_str(); + } + +template +list ComponentManager::GetComponents() const + { + list rval; + typename map::const_iterator i; + + for ( i = components_by_tag.begin(); i != components_by_tag.end(); ++i ) + rval.push_back(i->second); + + return rval; + } + +template +EnumType* ComponentManager::GetTagEnumType() const + { + return tag_enum_type; + } + +template +const char* ComponentManager::GetComponentName(T tag) const + { + static const char* error = ""; + + if ( ! tag ) + return error; + + C* c = Lookup(tag); + + if ( ! 
c ) + reporter->InternalError("request for name of unknown component tag %s", + tag.AsString().c_str()); + + return c->CanonicalName(); + } + +template +const char* ComponentManager::GetComponentName(Val* val) const + { + return GetComponentName(T(val->AsEnumVal())); + } + +template +T ComponentManager::GetComponentTag(const string& name) const + { + C* c = Lookup(name); + return c ? c->Tag() : T(); + } + +template +T ComponentManager::GetComponentTag(Val* v) const + { + C* c = Lookup(v->AsEnumVal()); + return c ? c->Tag() : T(); + } + +template +C* ComponentManager::Lookup(const string& name) const + { + typename map::const_iterator i = + components_by_name.find(to_upper(name)); + return i != components_by_name.end() ? i->second : 0; + } + +template +C* ComponentManager::Lookup(const T& tag) const + { + typename map::const_iterator i = components_by_tag.find(tag); + return i != components_by_tag.end() ? i->second : 0; + } + +template +C* ComponentManager::Lookup(EnumVal* val) const + { + typename map::const_iterator i = + components_by_val.find(val->InternalInt()); + return i != components_by_val.end() ? 
i->second : 0; + } + +template +void ComponentManager::RegisterComponent(C* component, + const string& prefix) + { + const char* cname = component->CanonicalName(); + + if ( Lookup(cname) ) + reporter->FatalError("Component '%s::%s' defined more than once", + module.c_str(), cname); + + DBG_LOG(DBG_PLUGINS, "Registering component %s (tag %s)", + component->Name(), component->Tag().AsString().c_str()); + + components_by_name.insert(std::make_pair(cname, component)); + components_by_tag.insert(std::make_pair(component->Tag(), component)); + components_by_val.insert(std::make_pair( + component->Tag().AsEnumVal()->InternalInt(), component)); + + // Install an identfier for enum value + string id = fmt("%s%s", prefix.c_str(), cname); + tag_enum_type->AddName(module, id.c_str(), + component->Tag().AsEnumVal()->InternalInt(), true); + } + +} // namespace plugin + +#endif diff --git a/src/plugin/TaggedComponent.h b/src/plugin/TaggedComponent.h new file mode 100644 index 0000000000..99eab9f230 --- /dev/null +++ b/src/plugin/TaggedComponent.h @@ -0,0 +1,85 @@ +#ifndef PLUGIN_TAGGED_COMPONENT_H +#define PLUGIN_TAGGED_COMPONENT_H + +namespace plugin { + +/** + * A class which has a tag of a given type associated with it. + * + * @tparam T A ::Tag type or derivative. + */ +template +class TaggedComponent { +public: + + /** + * Constructor creates a unique tag value for this component. + * + * @param subtype A subtype associated with this component that + * further distinguishes it. The subtype will be integrated into + * the Tag that the manager associates with this component, + * and component instances can accordingly access it via Tag(). + * If not used, leave at zero. + */ + TaggedComponent(typename T::subtype_t subtype = 0); + + /** + * Copy constructor. + * + * @param other Another component from which to copy its tag value. + */ + TaggedComponent(const TaggedComponent& other); + + /** + * Assignment operator. + * + * @param other A component to assign. 
+ * @return The assigned object. + */ + TaggedComponent& operator=(const TaggedComponent& other); + + /** + * @return The component's tag. + */ + T Tag() const; + +private: + + T tag; /**< The automatically assigned analyzer tag. */ + static typename T::type_t type_counter; /**< Used to generate globally + unique tags. */ +}; + +template +TaggedComponent::TaggedComponent(typename T::subtype_t subtype) + { + tag = T(++type_counter, subtype); + } + +template +TaggedComponent::TaggedComponent(const TaggedComponent& other) + { + tag = other.tag; + } + +template +TaggedComponent& +TaggedComponent::operator =(const TaggedComponent& other) + { + if ( &other != this ) + tag = other.tag; + + return *this; + } + +template +T TaggedComponent::Tag() const + { + return tag; + } + +template typename T::type_t TaggedComponent::type_counter(0); + +} // namespace plugin + +#endif From 238e4a8d5c24c5cc2c6637902effe4b322a9fae5 Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Thu, 1 Aug 2013 10:45:44 -0500 Subject: [PATCH 32/40] Minor fix to file/protocol analyzer plugin reference doc. 
--- src/BroDoc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/BroDoc.cc b/src/BroDoc.cc index 3cb271bdbf..55dc8ce558 100644 --- a/src/BroDoc.cc +++ b/src/BroDoc.cc @@ -564,7 +564,7 @@ static void WriteAnalyzerTagDefn(FILE* f, EnumType* e, const string& module) dummy_id->MakeType(); list* r = new list(); - r->push_back("Unique identifiers for protocol analyzers."); + r->push_back("Unique identifiers for analyzers."); BroDocObj bdo(dummy_id, r, true); @@ -622,7 +622,7 @@ void CreateFileAnalyzerDoc(const char* filename) FILE* f = fopen(filename, "w"); fprintf(f, "File Analyzer Reference\n"); - fprintf(f, "===========================\n\n"); + fprintf(f, "=======================\n\n"); WriteAnalyzerTagDefn(f, file_mgr->GetTagEnumType(), "Files"); From 34965b4e77b3091dd0d959873b21239f3da02ac4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 1 Aug 2013 19:15:28 +0200 Subject: [PATCH 33/40] Support UHF hashing for >= UHASH_KEY_SIZE bytes. --- src/probabilistic/Hasher.cc | 23 +++++++++++++++++++---- src/probabilistic/Hasher.h | 5 +++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index b59274df7d..fe8eb66ad9 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -1,6 +1,7 @@ // See the file "COPYING" in the main distribution directory for copyright. #include +#include #include "Hasher.h" #include "NetVar.h" @@ -82,15 +83,29 @@ Hasher::Hasher(size_t arg_k, size_t arg_seed) seed = arg_seed; } -UHF::UHF(size_t seed) - : h(seed) +UHF::UHF(size_t arg_seed) + : h(arg_seed) { + seed = arg_seed; } +// This function is almost equivalent to HashKey::HashBytes except that it does +// not depend on global state and that we mix in the seed multiple times. Hasher::digest UHF::hash(const void* x, size_t n) const { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h(x, n); + if ( n <= UHASH_KEY_SIZE ) + return n == 0 ? 
0 : h(x, n); + + unsigned char d[16]; + MD5(reinterpret_cast(x), n, d); + + const unsigned char* s = reinterpret_cast(&seed); + for ( size_t i = 0; i < 16; ++i ) + d[i] ^= s[i % sizeof(seed)]; + + MD5(d, 16, d); + + return d[0]; } DefaultHasher::DefaultHasher(size_t k, size_t seed) diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 6b75fa1bea..a3322f5e37 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -123,9 +123,9 @@ public: * Constructs an H3 hash function seeded with a given seed and an * optional extra seed to replace the initial Bro seed. * - * @param seed The seed to use for this instance. + * @param arg_seed The seed to use for this instance. */ - UHF(size_t seed = 0); + UHF(size_t arg_seed = 0); template Hasher::digest operator()(const T& x) const @@ -171,6 +171,7 @@ private: static size_t compute_seed(size_t seed); H3 h; + size_t seed; }; From ee7dba806d4016af59f969237a4a06d1cf158013 Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Thu, 1 Aug 2013 12:17:51 -0500 Subject: [PATCH 34/40] Fix some build errors. On GCC, some namespace sensitivity and file analyzer plugins now need to link in Analyzer since it's not just a header anymore. 
--- src/file_analysis/Manager.cc | 3 ++- src/file_analysis/analyzer/data_event/CMakeLists.txt | 2 +- src/file_analysis/analyzer/extract/CMakeLists.txt | 2 +- src/file_analysis/analyzer/hash/CMakeLists.txt | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/file_analysis/Manager.cc b/src/file_analysis/Manager.cc index b7f4335717..5975133356 100644 --- a/src/file_analysis/Manager.cc +++ b/src/file_analysis/Manager.cc @@ -18,7 +18,8 @@ TableVal* Manager::disabled = 0; string Manager::salt; Manager::Manager() - : ComponentManager("Files") + : plugin::ComponentManager("Files") { } diff --git a/src/file_analysis/analyzer/data_event/CMakeLists.txt b/src/file_analysis/analyzer/data_event/CMakeLists.txt index 81551feda2..49e23d49a0 100644 --- a/src/file_analysis/analyzer/data_event/CMakeLists.txt +++ b/src/file_analysis/analyzer/data_event/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) bro_plugin_begin(Bro FileDataEvent) -bro_plugin_cc(DataEvent.cc Plugin.cc) +bro_plugin_cc(DataEvent.cc Plugin.cc ../../Analyzer.cc) bro_plugin_end() diff --git a/src/file_analysis/analyzer/extract/CMakeLists.txt b/src/file_analysis/analyzer/extract/CMakeLists.txt index df3fa2646d..e413196db2 100644 --- a/src/file_analysis/analyzer/extract/CMakeLists.txt +++ b/src/file_analysis/analyzer/extract/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) bro_plugin_begin(Bro FileExtract) -bro_plugin_cc(Extract.cc Plugin.cc) +bro_plugin_cc(Extract.cc Plugin.cc ../../Analyzer.cc) bro_plugin_end() diff --git a/src/file_analysis/analyzer/hash/CMakeLists.txt b/src/file_analysis/analyzer/hash/CMakeLists.txt index 5734740198..0e3143ee05 100644 --- a/src/file_analysis/analyzer/hash/CMakeLists.txt +++ b/src/file_analysis/analyzer/hash/CMakeLists.txt @@ -4,6 +4,6 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) 
bro_plugin_begin(Bro FileHash) -bro_plugin_cc(Hash.cc Plugin.cc) +bro_plugin_cc(Hash.cc Plugin.cc ../../Analyzer.cc) bro_plugin_bif(events.bif) bro_plugin_end() From 7ab21706411bb1bb6c191cce7e86b16d2facae78 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 1 Aug 2013 10:46:05 -0700 Subject: [PATCH 35/40] Using a real hash function for hashing a BitVector's internal state. --- src/probabilistic/BitVector.cc | 17 ++++++++++++----- .../btest/Baseline/bifs.bloomfilter-seed/output | 16 ++++++++-------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index f820e6df27..e8c2b2f80e 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -1,10 +1,12 @@ // See the file "COPYING" in the main distribution directory for copyright. -#include "BitVector.h" - +#include #include #include + +#include "BitVector.h" #include "Serializer.h" +#include "digest.h" using namespace probabilistic; @@ -494,10 +496,15 @@ size_t BitVector::Hash() const { size_t hash = 0; - for ( size_type i = 0; i < Blocks(); ++i ) - hash += bits[i]; + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); - return hash; + for ( size_type i = 0; i < Blocks(); ++i ) + sha256_update(&ctx, &bits[i], sizeof(bits[i])); + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. 
} BitVector::size_type BitVector::lowest_bit(block_type block) diff --git a/testing/btest/Baseline/bifs.bloomfilter-seed/output b/testing/btest/Baseline/bifs.bloomfilter-seed/output index 53e0f583f2..533085900f 100644 --- a/testing/btest/Baseline/bifs.bloomfilter-seed/output +++ b/testing/btest/Baseline/bifs.bloomfilter-seed/output @@ -1,8 +1,8 @@ -bf1, global_seed, 1 -bf2, global_seed, 5 -bf3, my_seed, 5 -bf4, my_seed, 6 -bf1, global_seed, 5 -bf2, global_seed, 6 -bf3, my_seed, 5 -bf4, my_seed, 6 +bf1, global_seed, 11979365913534242684 +bf2, global_seed, 12550100962110750449 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 +bf1, global_seed, 12550100962110750449 +bf2, global_seed, 945716460325754659 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 From 948441e1761611e6f9f1829519ee0da794f1fa06 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 1 Aug 2013 10:52:15 -0700 Subject: [PATCH 36/40] Test expected false positive, but it isn't one any more. Matthias, please check if this is correct. --- scripts/base/init-bare.bro | 2 +- testing/btest/Baseline/bifs.bloomfilter/output | 1 + testing/btest/bifs/bloomfilter.bro | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index e5300cdc9f..6068796fbd 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -3044,7 +3044,7 @@ const snaplen = 8192 &redef; ## Seed for hashes computed internally for probabilistic data structures. Using ## the same value here will make the hashes compatible between independent Bro -## instances. If left unset, Bro will use a temporary local seed. +## instances. If left unset, Bro will use a temporary local seed. const global_hash_seed: string = "" &redef; # Load BiFs defined by plugins. 
diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 731b7c7ce9..82414f0686 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -12,6 +12,7 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +0, no fp 1 1 1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3d3133fa84..95455bc74c 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -28,7 +28,7 @@ function test_basic_bloom_filter() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4zzz"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"), "no fp"; # FP print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch From 0d39e00bc4b989d36bef379ebae7fcd7cd81ed3a Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Thu, 1 Aug 2013 14:39:35 -0500 Subject: [PATCH 37/40] Fix a ref counting bug. 
BIT-1049 #request-merge --- src/file_analysis/AnalyzerSet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/file_analysis/AnalyzerSet.cc b/src/file_analysis/AnalyzerSet.cc index 2dc4902314..f7abc01dc2 100644 --- a/src/file_analysis/AnalyzerSet.cc +++ b/src/file_analysis/AnalyzerSet.cc @@ -15,7 +15,7 @@ static void analyzer_del_func(void* v) AnalyzerSet::AnalyzerSet(File* arg_file) : file(arg_file) { TypeList* t = new TypeList(); - t->Append(file_mgr->GetTagEnumType()); + t->Append(file_mgr->GetTagEnumType()->Ref()); t->Append(BifType::Record::Files::AnalyzerArgs->Ref()); analyzer_hash = new CompositeHash(t); Unref(t); From 6a45a67eb52e208b3fd1156354b555487e614558 Mon Sep 17 00:00:00 2001 From: Bernhard Amann Date: Thu, 1 Aug 2013 14:07:39 -0700 Subject: [PATCH 38/40] update documentation, rename get* to Get* and make hasher persistent --- src/probabilistic/Topk.cc | 54 ++++++------ src/probabilistic/Topk.h | 101 +++++++++++++++++------ src/probabilistic/top-k.bif | 80 +++++++++++++----- testing/btest/Baseline/bifs.topk/.stderr | 22 ++--- 4 files changed, 174 insertions(+), 83 deletions(-) diff --git a/src/probabilistic/Topk.cc b/src/probabilistic/Topk.cc index d03a10ccfc..6828fb0d7f 100644 --- a/src/probabilistic/Topk.cc +++ b/src/probabilistic/Topk.cc @@ -19,21 +19,23 @@ static void topk_element_hash_delete_func(void* val) Element::~Element() { - if ( value ) - Unref(value); - value=0; + Unref(value); + } + +void TopkVal::Typify(BroType* t) + { + assert(!hash && !type); + type = t->Ref(); + TypeList* tl = new TypeList(t); + tl->Append(t->Ref()); + hash = new CompositeHash(tl); + Unref(tl); } HashKey* TopkVal::GetHash(Val* v) const { - TypeList* tl = new TypeList(v->Type()); - tl->Append(v->Type()->Ref()); - CompositeHash* topk_hash = new CompositeHash(tl); - Unref(tl); - - HashKey* key = topk_hash->ComputeHash(v, 1); + HashKey* key = hash->ComputeHash(v, 1); assert(key); - delete topk_hash; return key; } @@ -45,6 +47,7 @@ 
TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(topk_type) type = 0; numElements = 0; pruned = false; + hash = 0; } TopkVal::TopkVal() : OpaqueVal(topk_type) @@ -54,6 +57,7 @@ TopkVal::TopkVal() : OpaqueVal(topk_type) size = 0; type = 0; numElements = 0; + hash = 0; } TopkVal::~TopkVal() @@ -69,9 +73,8 @@ TopkVal::~TopkVal() bi++; } - if ( type ) - Unref(type); - type = 0; + Unref(type); + delete hash; } void TopkVal::Merge(const TopkVal* value, bool doPrune) @@ -80,7 +83,7 @@ void TopkVal::Merge(const TopkVal* value, bool doPrune) if ( type == 0 ) { assert(numElements == 0); - type = value->type->Ref(); + Typify(value->type); } else if ( !same_type(type, value->type) ) @@ -227,7 +230,10 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) v &= UNSERIALIZE(&type_present); if ( type_present ) { - type = BroType::Unserialize(info); + BroType* deserialized_type = BroType::Unserialize(info); + + Typify(deserialized_type); + Unref(deserialized_type); assert(type); } else @@ -268,7 +274,7 @@ bool TopkVal::DoUnserialize(UnserialInfo* info) } -VectorVal* TopkVal::getTopK(int k) const // returns vector +VectorVal* TopkVal::GetTopK(int k) const // returns vector { if ( numElements == 0 ) { @@ -310,14 +316,14 @@ VectorVal* TopkVal::getTopK(int k) const // returns vector return t; } -uint64_t TopkVal::getCount(Val* value) const +uint64_t TopkVal::GetCount(Val* value) const { HashKey* key = GetHash(value); Element* e = (Element*) elementDict->Lookup(key); if ( e == 0 ) { - reporter->Error("getCount for element that is not in top-k"); + reporter->Error("GetCount for element that is not in top-k"); return 0; } @@ -325,14 +331,14 @@ uint64_t TopkVal::getCount(Val* value) const return e->parent->count; } -uint64_t TopkVal::getEpsilon(Val* value) const +uint64_t TopkVal::GetEpsilon(Val* value) const { HashKey* key = GetHash(value); Element* e = (Element*) elementDict->Lookup(key); if ( e == 0 ) { - reporter->Error("getEpsilon for element that is not in top-k"); + 
reporter->Error("GetEpsilon for element that is not in top-k"); return 0; } @@ -340,7 +346,7 @@ uint64_t TopkVal::getEpsilon(Val* value) const return e->epsilon; } -uint64_t TopkVal::getSum() const +uint64_t TopkVal::GetSum() const { uint64_t sum = 0; @@ -353,7 +359,7 @@ uint64_t TopkVal::getSum() const } if ( pruned ) - reporter->Warning("TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count"); + reporter->Warning("TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count"); return sum; } @@ -362,10 +368,8 @@ void TopkVal::Encountered(Val* encountered) { // ok, let's see if we already know this one. - //printf("NumElements: %d\n", numElements); - // check type compatibility if ( numElements == 0 ) - type = encountered->Type()->Ref(); + Typify(encountered->Type()); else if ( !same_type(type, encountered->Type()) ) { diff --git a/src/probabilistic/Topk.h b/src/probabilistic/Topk.h index 2c47fbd181..4f0599025d 100644 --- a/src/probabilistic/Topk.h +++ b/src/probabilistic/Topk.h @@ -34,50 +34,101 @@ declare(PDict, Element); class TopkVal : public OpaqueVal { public: - // Initialize a TopkVal. Size specifies how many total elements are tracked + /** + * Construct a TopkVal. + * + * @param size specifies how many total elements are tracked + * + * @return A newly initialized TopkVal + */ TopkVal(uint64 size); + + /** + * Destructor. + */ ~TopkVal(); - // Call this, when a new value is encountered. Note that on the first call, - // the Bro-Type of the value types that are counted is set. All following calls - // to encountered have to specify the same type + /** + * Call this, when a new value is encountered. Note that on the first call, + * the Bro-Type of the value types that are counted is set. All following calls + * to encountered have to specify the same type. 
+ * + * @param value The encountered element + */ void Encountered(Val* value); - // Return the first k elements of the result vector. At the moment, this does - // not check if it is in the right order or if we can prove that these are - // the correct top-k. Use count and epsilon for this. - VectorVal* getTopK(int k) const; // returns vector + /** + * Get the first k elements of the result vector. At the moment, this does + * not check if it is in the right order or if we can prove that these are + * the correct top-k. Use count and epsilon for this. + * + * @param k Number of top-elements to return + * + * @returns The top-k encountered elements + */ + VectorVal* GetTopK(int k) const; - // Get the current count tracked in the top-k data structure for a certain val. - // Returns 0 if the val is unknown (and logs the error to reporter) - uint64_t getCount(Val* value) const; + /** + * Get the current count tracked in the top-k data structure for a certain val. + * Returns 0 if the val is unknown (and logs the error to reporter) + * + * @param value Bro value to get counts for + * + * @returns internal count for val, 0 if unknown + */ + uint64_t GetCount(Val* value) const; - // Get the current epsilon tracked in the top-k data structure for a certain val. - // Returns 0 if the val is unknown (and logs the error to reporter) - uint64_t getEpsilon(Val* value) const; + /** + * Get the current epsilon tracked in the top-k data structure for a certain val. + * + * @param value Bro value to get epsilons for + * + * @returns the epsilon. Returns 0 if the val is unknown (and logs the error to reporter) + */ + uint64_t GetEpsilon(Val* value) const; - // Get the size set in the constructor - uint64_t getSize() const { return size; } + /** + * Get the size set in the constructor + * + * @returns size of the top-k structure + */ + uint64_t GetSize() const { return size; } - // Get the sum of all counts of all tracked elements. 
This is equal to the number - // of total observations up to this moment, if no elements were pruned from the data - // structure. - uint64_t getSum() const; + /** + * Get the sum of all counts of all tracked elements. This is equal to the number + * of total observations up to this moment, if no elements were pruned from the data + * structure. + * + * @returns sum of all counts + */ + uint64_t GetSum() const; - // Merge another top-k data structure in this one. - // doPrune specifies if the total count of elements is limited to size after - // merging. - // Please note, that pruning will invalidate the results of getSum. + /** + * Merge another top-k data structure into this one. + * doPrune specifies if the total count of elements is limited to size after + * merging. + * Please note that pruning will invalidate the results of GetSum. + * + * @param value TopkVal to merge into this TopkVal + * + * @param doPrune prune resulting TopkVal to size after merging + */ void Merge(const TopkVal* value, bool doPrune=false); protected: - TopkVal(); // for deserialize + /** + * Construct an empty TopkVal. + * Only used for deserialization + */ + TopkVal(); private: void IncrementCounter(Element* e, unsigned int count = 1); HashKey* GetHash(Val*) const; // this probably should go somewhere else. + void Typify(BroType*); BroType* type; + CompositeHash* hash; std::list buckets; PDict(Element)* elementDict; uint64 size; // how many elements are we tracking? diff --git a/src/probabilistic/top-k.bif b/src/probabilistic/top-k.bif index 83d8e275c1..156e96a65e 100644 --- a/src/probabilistic/top-k.bif +++ b/src/probabilistic/top-k.bif @@ -11,6 +11,8 @@ ## Creates a top-k data structure which tracks size elements. ## +## size: number of elements to track +## ## Returns: Opaque pointer to the data structure. 
function topk_init%(size: count%): opaque of topk %{ @@ -18,9 +20,14 @@ function topk_init%(size: count%): opaque of topk return v; %} -## Add a new observed object to the data structure. The first -## added object sets the type of data tracked by the top-k data -## structure. All following values have to be of the same type +## Add a new observed object to the data structure. +## +## .. note:: The first added object sets the type of data tracked by +## the top-k data structure. All following values have to be of the same type +## +## handle: the TopK handle +## +## value: observed value function topk_add%(handle: opaque of topk, value: any%): any %{ assert(handle); @@ -32,63 +39,86 @@ function topk_add%(handle: opaque of topk, value: any%): any ## Get the first k elements of the top-k data structure ## +## handle: the TopK handle +## +## k: number of elements to return +## ## Returns: vector of the first k elements function topk_get_top%(handle: opaque of topk, k: count%): any %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return h->getTopK(k); + return h->GetTopK(k); %} ## Get an overestimated count of how often value has been encountered. -## value has to be part of the currently tracked elements, otherwise -## 0 will be returned and an error message will be added to reporter. +## +## .. note:: value has to be part of the currently tracked elements, otherwise +## 0 will be returned and an error message will be added to reporter. +## +## handle: the TopK handle +## +## value: Value to look up count for. ## ## Returns: Overestimated number for how often the element has been encountered function topk_count%(handle: opaque of topk, value: any%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getCount(value), TYPE_COUNT); + return new Val(h->GetCount(value), TYPE_COUNT); %} -## Get a the maximal overestimation for count. Same restrictiosn as for topk_count -## apply. 
+## Get the maximal overestimation for count. +## +## .. note:: Same restrictions as for topk_count apply. +## +## handle: the TopK handle +## +## value: Value to look up epsilon for. ## ## Returns: Number which represents the maximal overesimation for the count of this element. function topk_epsilon%(handle: opaque of topk, value: any%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getEpsilon(value), TYPE_COUNT); + return new Val(h->GetEpsilon(value), TYPE_COUNT); %} ## Get the number of elements this data structure is supposed to track (given on init). -## Note that the actual number of elements in the data structure can be lower or higher -## than this. (higher due to non-pruned merges) +## +## .. note:: Note that the actual number of elements in the data structure can be lower +## or higher (due to non-pruned merges) than this. ## +## handle: the TopK handle +## ## Returns: size given during initialization function topk_size%(handle: opaque of topk%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getSize(), TYPE_COUNT); + return new Val(h->GetSize(), TYPE_COUNT); %} -## Get the sum of all counts of all elements in the data structure. Is equal to the number -## of all inserted objects if the data structure never has been pruned. Do not use after -## calling topk_merge_prune (will throw a warning message if used afterwards) +## Get the sum of all counts of all elements in the data structure. +## +## .. note:: This is equal to the number of all inserted objects if the data structure +## never has been pruned. 
Do not use after calling topk_merge_prune (will throw a +## warning message if used afterwards) +## +## handle: the TopK handle ## ## Returns: sum of all counts function topk_sum%(handle: opaque of topk%): count %{ assert(handle); probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; - return new Val(h->getSum(), TYPE_COUNT); + return new Val(h->GetSum(), TYPE_COUNT); %} -## Merge the second topk data structure into the first. Does not remove any elements, the -## resulting data structure can be bigger than the maximum size given on initialization. +## Merge the second topk data structure into the first. +## +## .. note:: This does not remove any elements, the resulting data structure can +## be bigger than the maximum size given on initialization. function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any %{ assert(handle1); @@ -103,9 +133,15 @@ function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any %} ## Merge the second topk data structure into the first and prunes the final data structure -## back to the size given on initialization. Use with care and only when being aware of the -## restrictions this imposed. Do not call topk_size or topk_add afterwards, results will -## probably not be what you expect. +## back to the size given on initialization. +## +## .. note:: Use with care and only when being aware of the restrictions this entails. +## Do not call topk_size or topk_add afterwards, results will probably not be what you +## expect. 
+## +## handle1: the TopK handle in which the second TopK structure is merged +## +## handle2: the TopK handle which is merged into the first TopK structure function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any %{ assert(handle1); diff --git a/testing/btest/Baseline/bifs.topk/.stderr b/testing/btest/Baseline/bifs.topk/.stderr index 80626107aa..a711333fc0 100644 --- a/testing/btest/Baseline/bifs.topk/.stderr +++ b/testing/btest/Baseline/bifs.topk/.stderr @@ -1,11 +1,11 @@ -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k -warning: TopkVal::getSum() was used on a pruned data structure. Result values do not represent total element count -error: getCount for element that is not in top-k -error: getEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +warning: TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k From 3c0be747595a4e6df146fc7aba0cc1d5359ba231 Mon Sep 17 00:00:00 2001 From: Bernhard Amann <bernhard@ICSI.Berkeley.EDU> Date: Thu, 1 Aug 2013 14:13:20 -0700 Subject: [PATCH 39/40] 3 more functions to document. 
--- src/probabilistic/Topk.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/probabilistic/Topk.h b/src/probabilistic/Topk.h index 4f0599025d..48ca5c29a4 100644 --- a/src/probabilistic/Topk.h +++ b/src/probabilistic/Topk.h @@ -123,9 +123,30 @@ protected: TopkVal(); private: + /** + * Increment the counter for a specific element + * + * @param e element to increment counter for + * + * @param count increment counter by this much + */ void IncrementCounter(Element* e, unsigned int count = 1); - HashKey* GetHash(Val*) const; // this probably should go somewhere else. - void Typify(BroType*); + + /** + * get the hashkey for a specific value + * + * @param v value to generate key for + * + * @returns HashKey for value + */ + HashKey* GetHash(Val* v) const; // this probably should go somewhere else. + + /** + * Set the type that this TopK instance tracks + * + * @param t type that is tracked + */ + void Typify(BroType* t); BroType* type; CompositeHash* hash; From 8a9344e3ea3cc35a055b3b5686ba78f166718772 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 1 Aug 2013 15:41:10 -0700 Subject: [PATCH 40/40] Missing base line update. 
--- doc/scripts/DocSourcesList.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/scripts/DocSourcesList.cmake b/doc/scripts/DocSourcesList.cmake index fecd728afd..570de8bb6f 100644 --- a/doc/scripts/DocSourcesList.cmake +++ b/doc/scripts/DocSourcesList.cmake @@ -73,6 +73,7 @@ rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/plugins/Bro_UDP.events.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/plugins/Bro_ZIP.events.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/reporter.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/strings.bif.bro) +rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/top-k.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/types.bif.bro) rest_target(${psd} base/files/extract/main.bro) rest_target(${psd} base/files/hash/main.bro) @@ -129,6 +130,7 @@ rest_target(${psd} base/frameworks/sumstats/plugins/min.bro) rest_target(${psd} base/frameworks/sumstats/plugins/sample.bro) rest_target(${psd} base/frameworks/sumstats/plugins/std-dev.bro) rest_target(${psd} base/frameworks/sumstats/plugins/sum.bro) +rest_target(${psd} base/frameworks/sumstats/plugins/topk.bro) rest_target(${psd} base/frameworks/sumstats/plugins/unique.bro) rest_target(${psd} base/frameworks/sumstats/plugins/variance.bro) rest_target(${psd} base/frameworks/tunnels/main.bro)