Work on parameter estimation and serialization.

This commit is contained in:
Matthias Vallentin 2013-06-03 22:55:21 -07:00
parent f529df33e0
commit f708cd4a36
6 changed files with 198 additions and 22 deletions

View file

@ -1,23 +1,130 @@
#include "BloomFilter.h" #include "BloomFilter.h"
#include <cmath>
#include "Serializer.h"
// Stand-in for C++11's std::round(): shifts x by half a unit away from zero
// and converts the result to T. Values that are not greater than zero
// (including 0.0 itself) take the "subtract 0.5" branch.
namespace {

template <typename T>
T round(double x)
	{
	if ( x > 0.0 )
		return static_cast<T>(x + 0.5);

	return static_cast<T>(x - 0.5);
	}

} // namespace <anonymous>
IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR)

// Writes this counter vector to the serializer behind `info`.
//
// After the DO_SERIALIZE preamble (macro defined elsewhere), the bit storage
// is written first, followed by the cell width cast to uint64. Returns false
// as soon as one of the writes reports failure.
bool CounterVector::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj);

	if ( SERIALIZE(&bits_) )
		return SERIALIZE(static_cast<uint64>(width_));

	return false;
	}
// Restores a CounterVector from the unserializer behind `info`.
//
// Work in progress: once the DO_UNSERIALIZE preamble has run, this function
// always reports failure, because restoring bits_ is not implemented yet.
// DoSerialize() writes bits_ ahead of the width, so the width cannot simply be
// read on its own either.
bool CounterVector::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(SerialObj);
// Unconditional failure; every statement below this line is unreachable
// until the TODO is resolved.
return false;
// TODO: Ask Robin how to unserialize non-pointer members.
//if ( ! UNSERIALIZE(&bits_) )
// return false;
// Intended tail: read the width as uint64 and narrow it back to unsigned.
uint64 width;
if ( ! UNSERIALIZE(&width) )
return false;
width_ = static_cast<unsigned>(width);
return true;
}
// Computes K() hash values for the n bytes starting at x, one per configured
// hash function: element i of the result is produced by hashers_[i].
//
// @param x Pointer to the data to hash.
// @param n Number of bytes at x.
// @return A vector with K() hash values.
HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const
	{
	HashVector h(K(), 0);

	for ( size_t i = 0; i < h.size(); ++i )
		h[i] = hashers_[i](x, n);

	return h;
	}
// Computes K() hash values for the n bytes starting at x from only two
// underlying hash functions: element i of the result is h1 + i * h2.
//
// @param x Pointer to the data to hash.
// @param n Number of bytes at x.
// @return A vector with K() hash values.
HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const
	{
	// Pass the length along, as DefaultHashing::Hash does; without it the
	// hashers have no way of knowing how many bytes x refers to.
	// NOTE(review): assumes hasher1_/hasher2_ accept (x, n) like the
	// elements of DefaultHashing::hashers_ -- confirm against the header.
	HashType h1 = hasher1_(x, n);
	HashType h2 = hasher2_(x, n);

	HashVector h(K(), 0);

	for ( size_t i = 0; i < h.size(); ++i )
		h[i] = h1 + i * h2;

	return h;
	}
// Public serialization entry point; forwards to SerialObj::Serialize() and
// hands its result back to the caller.
bool BloomFilter::Serialize(SerialInfo* info) const
	{
	const bool ok = SerialObj::Serialize(info);
	return ok;
	}
// Reads a Bloom filter from the unserializer behind `info`.
//
// Delegates to SerialObj::Unserialize(), requesting an object of type
// SER_BLOOMFILTER, and converts the result to a BloomFilter pointer.
//
// @return Whatever SerialObj::Unserialize() produced, as a BloomFilter*.
BloomFilter* BloomFilter::Unserialize(UnserialInfo* info)
	{
	// A base-to-derived conversion must use static_cast so that the compiler
	// can apply any pointer adjustment; reinterpret_cast merely relabels the
	// address.
	return static_cast<BloomFilter*>(
		SerialObj::Unserialize(info, SER_BLOOMFILTER));
	}
// FIXME: should abstract base classes also have IMPLEMENT_SERIAL?
//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER)

// Writes the state kept in the BloomFilter base class. After the
// DO_SERIALIZE preamble, only the element counter is written (cast to
// uint64); the hash policy is not serialized yet.
bool BloomFilter::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_BLOOMFILTER, SerialObj);

	// TODO: Make the hash policy serializable.
	//if ( ! SERIALIZE(hash_) )
	//	return false;

	if ( ! SERIALIZE(static_cast<uint64>(elements_)) )
		return false;

	return true;
	}
// Restores the state kept in the BloomFilter base class. After the
// DO_UNSERIALIZE preamble, the element counter is read back as uint64 and
// narrowed to size_t; the hash policy is not restored yet.
//
// @return false if reading the element counter fails, true otherwise.
bool BloomFilter::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(SerialObj);

	// TODO: Make the hash policy serializable.
	//if ( ! hash_ = HashPolicy::Unserialize(info) )
	//	return false;

	uint64 elements;

	// Bail out when the read FAILS. The check used to be inverted, which
	// rejected every successful read and, on failure, went on to use the
	// uninitialized value.
	if ( ! UNSERIALIZE(&elements) )
		return false;

	elements_ = static_cast<size_t>(elements);
	return true;
	}
// Computes the number of cells for a filter, given a false-positive rate and
// a capacity, as ceil(-(capacity * ln(fp)) / ln(2)^2).
//
// @param fp The false-positive rate fed into the logarithm. The value is not
// validated here.
// @param capacity The number of elements the filter is sized for.
// @return The computed cell count.
size_t BasicBloomFilter::Cells(double fp, size_t capacity)
	{
	const double ln2 = std::log(2);

	// Divide by ln(2) twice, in this order, rather than once by ln(2)^2,
	// to keep the floating-point result bit-for-bit stable.
	double cells = capacity * std::log(fp);
	cells /= ln2;
	cells /= ln2;

	return std::ceil(-cells);
	}
// Computes the number of hash functions for a filter with the given number of
// cells and capacity, as round((cells / capacity) * ln(2)).
//
// There is no guard against capacity == 0, and the rounded result can be 0
// when cells is small relative to capacity.
//
// @param cells The number of cells in the filter.
// @param capacity The number of elements the filter is sized for.
// @return The computed number of hash functions.
size_t BasicBloomFilter::K(size_t cells, size_t capacity)
	{
	const double cells_per_element =
		static_cast<double>(cells) / static_cast<double>(capacity);
	const double optimal = cells_per_element * std::log(2);

	return round<size_t>(optimal);
	}
// Constructs a basic Bloom filter.
//
// @param cells Passed to the constructor of the bits_ member.
// @param hash The hashing policy, handed to the BloomFilter base class.
BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash)
	: BloomFilter(hash),
	  bits_(cells)
	{
	}
IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER)
// Serialization hook for BasicBloomFilter.
//
// Work in progress: apart from the DO_SERIALIZE preamble, which names
// BloomFilter as the parent class, nothing is written. In particular the
// bits_ member is not serialized yet (see the disabled code below).
bool BasicBloomFilter::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter);
// TODO: Serialize the bit storage (bits_); the write below is still disabled.
//if ( ! SERIALIZE(&bits_) )
// return false;
return true;
}
// Unserialization hook for BasicBloomFilter.
//
// Work in progress: apart from the DO_UNSERIALIZE preamble, which names
// BloomFilter as the parent class, nothing is read back; bits_ is left
// untouched by this function.
bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(BloomFilter);
// TODO: Non-pointer member deserialization?
return true;
}
void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h)
{ {
for ( size_t i = 0; i < h.size(); ++i ) for ( size_t i = 0; i < h.size(); ++i )
@ -31,3 +138,23 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const
return 0; return 0;
return 1; return 1;
} }
// Adds an element, given its hash values, by incrementing the counter of
// every cell the hash values map to.
//
// Hash values are reduced modulo the number of cells. The modulus used to be
// h.size(), i.e. the number of hash functions, which confined all updates to
// the first h.size() cells regardless of the filter's size.
//
// NOTE(review): assumes CounterVector::Size() returns the number of cells --
// confirm against the CounterVector declaration.
//
// @param h The hash values of the element.
void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h)
	{
	const size_t num_cells = cells_.Size();

	// Nothing to index into; also avoids a modulo by zero.
	if ( num_cells == 0 )
		return;

	for ( size_t i = 0; i < h.size(); ++i )
		cells_.Increment(h[i] % num_cells, 1);
	}

// Returns the count of an element, given its hash values: the minimum over
// the counters of all cells the hash values map to. The cell mapping is the
// same as in AddImpl().
//
// With an empty hash vector the loop does not run and the maximum value of
// CounterVector::size_type is returned, as before.
//
// @param h The hash values of the element.
// @return The smallest counter among the addressed cells, or 0 if the
// counter vector has no cells.
size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const
	{
	const size_t num_cells = cells_.Size();

	// Nothing can have been counted; also avoids a modulo by zero.
	if ( num_cells == 0 )
		return 0;

	CounterVector::size_type min =
		std::numeric_limits<CounterVector::size_type>::max();

	for ( size_t i = 0; i < h.size(); ++i )
		{
		CounterVector::size_type cnt = cells_.Count(h[i] % num_cells);

		if ( cnt < min )
			min = cnt;
		}

	return min;
	}

View file

@ -65,7 +65,7 @@ public:
protected: protected:
DECLARE_SERIAL(CounterVector); DECLARE_SERIAL(CounterVector);
CounterVector(); CounterVector() { }
private: private:
BitVector bits_; BitVector bits_;
@ -82,7 +82,7 @@ public:
typedef std::vector<HashType> HashVector; typedef std::vector<HashType> HashVector;
virtual ~HashPolicy() { } virtual ~HashPolicy() { }
size_t k() const { return k_; } size_t K() const { return k_; }
virtual HashVector Hash(const void* x, size_t n) const = 0; virtual HashVector Hash(const void* x, size_t n) const = 0;
protected: protected:
@ -130,7 +130,7 @@ private:
}; };
/** /**
* The *double-hashing* policy. Uses a linear combination of 2 hash functions. * The *double-hashing* policy. Uses a linear combination of two hash functions.
*/ */
class DoubleHashing : public HashPolicy { class DoubleHashing : public HashPolicy {
public: public:
@ -185,25 +185,20 @@ public:
return elements_; return elements_;
} }
protected: bool Serialize(SerialInfo* info) const;
/** static BloomFilter* Unserialize(UnserialInfo* info);
* Default-constructs a Bloom filter.
*/
BloomFilter();
/** protected:
* Constructs a BloomFilter. DECLARE_SERIAL(BloomFilter);
* @param hash The hashing policy.
*/ BloomFilter() { };
BloomFilter(HashPolicy* hash); BloomFilter(HashPolicy* hash) : hash_(hash) { }
virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0;
virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0;
private: private:
HashPolicy* hash_; // Owned by *this. HashPolicy* hash_;
size_t elements_; size_t elements_;
}; };
@ -212,12 +207,17 @@ private:
*/ */
class BasicBloomFilter : public BloomFilter { class BasicBloomFilter : public BloomFilter {
public: public:
BasicBloomFilter(); static size_t Cells(double fp, size_t capacity);
BasicBloomFilter(HashPolicy* hash); static size_t K(size_t cells, size_t capacity);
BasicBloomFilter(size_t cells, HashPolicy* hash);
protected: protected:
virtual void AddImpl(const HashPolicy::HashVector& h); DECLARE_SERIAL(BasicBloomFilter);
BasicBloomFilter() { }
virtual void AddImpl(const HashPolicy::HashVector& h);
virtual size_t CountImpl(const HashPolicy::HashVector& h) const; virtual size_t CountImpl(const HashPolicy::HashVector& h) const;
private: private:
@ -232,10 +232,11 @@ public:
CountingBloomFilter(unsigned width, HashPolicy* hash); CountingBloomFilter(unsigned width, HashPolicy* hash);
protected: protected:
DECLARE_SERIAL(CountingBloomFilter);
CountingBloomFilter(); CountingBloomFilter();
virtual void AddImpl(const HashPolicy::HashVector& h); virtual void AddImpl(const HashPolicy::HashVector& h);
virtual size_t CountImpl(const HashPolicy::HashVector& h) const; virtual size_t CountImpl(const HashPolicy::HashVector& h) const;
private: private:

View file

@ -244,6 +244,7 @@ OpaqueType* md5_type;
OpaqueType* sha1_type; OpaqueType* sha1_type;
OpaqueType* sha256_type; OpaqueType* sha256_type;
OpaqueType* entropy_type; OpaqueType* entropy_type;
OpaqueType* bloomfilter_type;
#include "const.bif.netvar_def" #include "const.bif.netvar_def"
#include "types.bif.netvar_def" #include "types.bif.netvar_def"
@ -310,6 +311,7 @@ void init_general_global_var()
sha1_type = new OpaqueType("sha1"); sha1_type = new OpaqueType("sha1");
sha256_type = new OpaqueType("sha256"); sha256_type = new OpaqueType("sha256");
entropy_type = new OpaqueType("entropy"); entropy_type = new OpaqueType("entropy");
bloomfilter_type = new OpaqueType("bloomfilter");
} }
void init_net_var() void init_net_var()

View file

@ -1,4 +1,6 @@
#include "OpaqueVal.h" #include "OpaqueVal.h"
#include "BloomFilter.h"
#include "NetVar.h" #include "NetVar.h"
#include "Reporter.h" #include "Reporter.h"
#include "Serializer.h" #include "Serializer.h"
@ -515,3 +517,24 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info)
return true; return true;
} }
// Constructs a BloomFilterVal of the given opaque type with no Bloom filter
// attached.
//
// @param t The opaque type, handed to the OpaqueVal base class.
BloomFilterVal::BloomFilterVal(OpaqueType* t)
	: OpaqueVal(t),
	  bloom_filter_(0)	// Start out null instead of with an indeterminate pointer.
	{
	}
IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL);
// Serialization hook for BloomFilterVal.
//
// Placeholder: apart from the DO_SERIALIZE preamble, which names OpaqueVal as
// the parent class, nothing is written; the bloom_filter_ member is not
// serialized.
bool BloomFilterVal::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal);
// TODO: implement.
return true;
}
// Unserialization hook for BloomFilterVal.
//
// Placeholder: apart from the DO_UNSERIALIZE preamble, which names OpaqueVal
// as the parent class, nothing is read back; the bloom_filter_ member is not
// restored.
bool BloomFilterVal::DoUnserialize(UnserialInfo* info)
{
DO_UNSERIALIZE(OpaqueVal);
// TODO: implement.
return true;
}

View file

@ -7,6 +7,8 @@
#include "Val.h" #include "Val.h"
#include "digest.h" #include "digest.h"
class BloomFilter;
class HashVal : public OpaqueVal { class HashVal : public OpaqueVal {
public: public:
virtual bool IsValid() const; virtual bool IsValid() const;
@ -107,4 +109,18 @@ private:
RandTest state; RandTest state;
}; };
// An opaque value that holds a pointer to a BloomFilter.
class BloomFilterVal : public OpaqueVal {
public:
// Default constructor.
BloomFilterVal();
protected:
// Val is granted access to the protected members below.
friend class Val;
// Constructs the value with an explicit opaque type.
BloomFilterVal(OpaqueType* t);
DECLARE_SERIAL(BloomFilterVal);
private:
// The wrapped filter. NOTE(review): no destructor is declared here, so
// ownership of the pointee is unclear -- confirm who deletes it.
BloomFilter* bloom_filter_;
};
#endif #endif

View file

@ -50,6 +50,9 @@ SERIAL_IS_BO(CASE, 0x1200)
SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(LOCATION, 0x1300)
SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(RE_MATCHER, 0x1400)
SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(BITVECTOR, 0x1500)
SERIAL_IS(COUNTERVECTOR, 0xa000)
SERIAL_IS(BLOOMFILTER, 0xa100)
SERIAL_IS(BASICBLOOMFILTER, 0xa200)
// These are the externally visible types. // These are the externally visible types.
const SerialType SER_NONE = 0; const SerialType SER_NONE = 0;
@ -105,6 +108,7 @@ SERIAL_VAL(MD5_VAL, 16)
SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA1_VAL, 17)
SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(SHA256_VAL, 18)
SERIAL_VAL(ENTROPY_VAL, 19) SERIAL_VAL(ENTROPY_VAL, 19)
SERIAL_VAL(BLOOMFILTER_VAL, 20)
#define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR)
SERIAL_EXPR(EXPR, 1) SERIAL_EXPR(EXPR, 1)
@ -204,5 +208,8 @@ SERIAL_CONST2(CASE)
SERIAL_CONST2(LOCATION) SERIAL_CONST2(LOCATION)
SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(RE_MATCHER)
SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(BITVECTOR)
SERIAL_CONST2(COUNTERVECTOR)
SERIAL_CONST2(BLOOMFILTER)
SERIAL_CONST2(BASICBLOOMFILTER)
#endif #endif