Merge remote-tracking branch 'origin/topic/matthias/bloom-filter'

* origin/topic/matthias/bloom-filter:
  Update submodules.
  Make hashers serializable.
  Add docs and use default value for hasher names.
This commit is contained in:
Robin Sommer 2013-07-30 10:05:10 -07:00
commit 629c331ca0
9 changed files with 133 additions and 53 deletions

View file

@ -1,4 +1,11 @@
2.1-945 | 2013-07-30 10:05:10 -0700
* Make hashers serializable. (Matthias Vallentin)
* Add docs and use default value for hasher names. (Matthias
Vallentin)
2.1-939 | 2013-07-29 15:42:38 -0700 2.1-939 | 2013-07-29 15:42:38 -0700
* Added Exec, Dir, and ActiveHTTP modules. (Seth Hall) * Added Exec, Dir, and ActiveHTTP modules. (Seth Hall)

View file

@ -1 +1 @@
2.1-939 2.1-945

1
magic

@ -1 +0,0 @@
Subproject commit e87fe13a7b776182ffc8c75076d42702f5c28fed

View file

@ -52,6 +52,7 @@ SERIAL_IS(RE_MATCHER, 0x1400)
SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(BITVECTOR, 0x1500)
SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(COUNTERVECTOR, 0x1600)
SERIAL_IS(BLOOMFILTER, 0x1700) SERIAL_IS(BLOOMFILTER, 0x1700)
SERIAL_IS(HASHER, 0x1800)
// These are the externally visible types. // These are the externally visible types.
const SerialType SER_NONE = 0; const SerialType SER_NONE = 0;
@ -206,6 +207,11 @@ SERIAL_BLOOMFILTER(BLOOMFILTER, 1)
SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2)
SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3)
#define SERIAL_HASHER(name, val) SERIAL_CONST(name, val, HASHER)
SERIAL_HASHER(HASHER, 1)
SERIAL_HASHER(DEFAULTHASHER, 2)
SERIAL_HASHER(DOUBLEHASHER, 3)
SERIAL_CONST2(ID) SERIAL_CONST2(ID)
SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(STATE_ACCESS)
SERIAL_CONST2(CASE) SERIAL_CONST2(CASE)

View file

@ -40,28 +40,15 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const
{ {
DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); DO_SERIALIZE(SER_BLOOMFILTER, SerialObj);
if ( ! SERIALIZE(static_cast<uint16>(hasher->K())) ) return hasher->Serialize(info);
return false;
return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size());
} }
bool BloomFilter::DoUnserialize(UnserialInfo* info) bool BloomFilter::DoUnserialize(UnserialInfo* info)
{ {
DO_UNSERIALIZE(SerialObj); DO_UNSERIALIZE(SerialObj);
uint16 k; hasher = Hasher::Unserialize(info);
if ( ! UNSERIALIZE(&k) ) return hasher != 0;
return false;
const char* name;
if ( ! UNSERIALIZE_STR(&name, 0) )
return false;
hasher = Hasher::Create(k, name);
delete [] name;
return true;
} }
size_t BasicBloomFilter::M(double fp, size_t capacity) size_t BasicBloomFilter::M(double fp, size_t capacity)

View file

@ -13,9 +13,6 @@ class CounterVector;
/** /**
* The abstract base class for Bloom filters. * The abstract base class for Bloom filters.
*
* At this point we won't let the user choose the hasher, but we might open
* up the interface in the future.
*/ */
class BloomFilter : public SerialObj { class BloomFilter : public SerialObj {
public: public:

View file

@ -4,9 +4,59 @@
#include "Hasher.h" #include "Hasher.h"
#include "digest.h" #include "digest.h"
#include "Serializer.h"
using namespace probabilistic; using namespace probabilistic;
// Public serialization entry point for hashers. It simply forwards to the
// SerialObj base class and reports that call's result to the caller.
//
// info: serialization context handed through unchanged to SerialObj.
bool Hasher::Serialize(SerialInfo* info) const
	{
	const bool serialized = SerialObj::Serialize(info);
	return serialized;
	}
// Reads a hasher back from *info* by asking SerialObj to unserialize an
// object of type SER_HASHER, then narrows the result to a Hasher pointer.
//
// info: unserialization context handed through unchanged to SerialObj.
//
// Returns the pointer produced by SerialObj::Unserialize, viewed as a
// Hasher. Callers in this change set treat a null result as failure.
Hasher* Hasher::Unserialize(UnserialInfo* info)
	{
	// Hasher derives publicly from SerialObj, so the base-to-derived
	// conversion is a static_cast. Unlike reinterpret_cast it lets the
	// compiler verify the relationship and apply any pointer adjustment.
	// NOTE(review): assumes SerialObj::Unserialize returns SerialObj* — confirm.
	return static_cast<Hasher*>(SerialObj::Unserialize(info, SER_HASHER));
	}
// Writes the state common to every hasher: the number of hash functions
// (k) followed by the hasher's name.
//
// Returns false as soon as one of the writes fails; otherwise the result of
// writing the name.
bool Hasher::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_HASHER, SerialObj);

	// k is stored as a uint16, mirroring what DoUnserialize reads back.
	const bool wrote_k = SERIALIZE(static_cast<uint16>(k));

	if ( wrote_k )
		return SERIALIZE_STR(name.c_str(), name.size());

	return false;
	}
// Restores the state common to every hasher: the number of hash functions
// (k) and the hasher's name, in the order DoSerialize wrote them.
//
// Returns true when both fields were read and k is usable; false if a read
// fails or the stored k is zero.
bool Hasher::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(SerialObj);

	uint16 serial_k;
	if ( ! UNSERIALIZE(&serial_k) )
		return false;

	// The value comes from serialized input, so a bad k is reported as an
	// ordinary unserialization failure. An assert would abort debug builds
	// and check nothing at all when NDEBUG is defined.
	if ( serial_k == 0 )
		return false;

	k = serial_k;

	const char* serial_name;
	if ( ! UNSERIALIZE_STR(&serial_name, 0) )
		return false;

	// Copy into the member first; the buffer handed back by
	// UNSERIALIZE_STR is released here with delete [].
	name = serial_name;
	delete [] serial_name;

	return true;
	}
// Constructs the shared hasher state.
//
// k: the number of hash functions the hasher applies.
// arg_name: the hasher's name, stored as given.
//
// Both members are set in the initializer list, in their declaration order
// (k, then name). The former body statement `k = k;` only assigned the
// parameter to itself, because the parameter shadows the member.
Hasher::Hasher(size_t k, const std::string& arg_name)
	: k(k), name(arg_name)
	{
	}
UHF::UHF(size_t seed, const std::string& extra) UHF::UHF(size_t seed, const std::string& extra)
: h(compute_seed(seed, extra)) : h(compute_seed(seed, extra))
{ {
@ -40,17 +90,6 @@ size_t UHF::compute_seed(size_t seed, const std::string& extra)
return *reinterpret_cast<size_t*>(buf); return *reinterpret_cast<size_t*>(buf);
} }
Hasher* Hasher::Create(size_t k, const std::string& name)
{
return new DefaultHasher(k, name);
}
Hasher::Hasher(size_t k, const std::string& arg_name)
: k(k)
{
name = arg_name;
}
DefaultHasher::DefaultHasher(size_t k, const std::string& name) DefaultHasher::DefaultHasher(size_t k, const std::string& name)
: Hasher(k, name) : Hasher(k, name)
{ {
@ -82,6 +121,27 @@ bool DefaultHasher::Equals(const Hasher* other) const
return hash_functions == o->hash_functions; return hash_functions == o->hash_functions;
} }
// Ties DefaultHasher to the serial type SER_DEFAULTHASHER.
// NOTE(review): presumably this macro supplies the factory/registration glue
// matching DECLARE_SERIAL in the header — confirm against SerialObj.h.
IMPLEMENT_SERIAL(DefaultHasher, SER_DEFAULTHASHER)
// Serializes a DefaultHasher. Only the Hasher base-class state is written via
// DO_SERIALIZE; returns true once that step has been passed.
bool DefaultHasher::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_DEFAULTHASHER, Hasher);
// Nothing to do here, the base class has all we need serialized already.
// (DoUnserialize rebuilds the hash functions from K() and Name().)
return true;
}
// Restores a DefaultHasher. The base class recovers k and the name; the hash
// functions are not stored, so they are rebuilt here: one UHF per index in
// [0, K()), each seeded with its index and the hasher's name.
//
// Returns true once the hash functions have been rebuilt.
bool DefaultHasher::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(Hasher);

	const size_t count = K();
	const std::string& seed_name = Name();

	// Drop whatever functions this instance held before refilling.
	hash_functions.clear();

	for ( size_t seed = 0; seed != count; ++seed )
		hash_functions.push_back(UHF(seed, seed_name));

	return true;
	}
DoubleHasher::DoubleHasher(size_t k, const std::string& name) DoubleHasher::DoubleHasher(size_t k, const std::string& name)
: Hasher(k, name), h1(1, name), h2(2, name) : Hasher(k, name), h1(1, name), h2(2, name)
{ {
@ -112,3 +172,23 @@ bool DoubleHasher::Equals(const Hasher* other) const
const DoubleHasher* o = static_cast<const DoubleHasher*>(other); const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
return h1 == o->h1 && h2 == o->h2; return h1 == o->h1 && h2 == o->h2;
} }
// Ties DoubleHasher to the serial type SER_DOUBLEHASHER.
// NOTE(review): presumably this macro supplies the factory/registration glue
// matching DECLARE_SERIAL in the header — confirm against SerialObj.h.
IMPLEMENT_SERIAL(DoubleHasher, SER_DOUBLEHASHER)
// Serializes a DoubleHasher. Only the Hasher base-class state is written via
// DO_SERIALIZE; returns true once that step has been passed.
bool DoubleHasher::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_DOUBLEHASHER, Hasher);
// Nothing to do here, the base class has all we need serialized already.
// (DoUnserialize rebuilds h1 and h2 from Name().)
return true;
}
// Restores a DoubleHasher. The base class recovers k and the name; the two
// underlying hash functions are not stored, so they are rebuilt here with the
// same seeds the constructor uses (1 and 2) plus the hasher's name.
//
// Returns true once both functions have been rebuilt.
bool DoubleHasher::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(Hasher);

	const std::string& seed_name = Name();

	h1 = UHF(1, seed_name);
	h2 = UHF(2, seed_name);

	return true;
	}

View file

@ -5,6 +5,7 @@
#include "Hash.h" #include "Hash.h"
#include "H3.h" #include "H3.h"
#include "SerialObj.h"
namespace probabilistic { namespace probabilistic {
@ -12,7 +13,7 @@ namespace probabilistic {
* Abstract base class for hashers. A hasher creates a family of hash * Abstract base class for hashers. A hasher creates a family of hash
* functions to hash an element *k* times. * functions to hash an element *k* times.
*/ */
class Hasher { class Hasher : public SerialObj {
public: public:
typedef hash_t digest; typedef hash_t digest;
typedef std::vector<digest> digest_vector; typedef std::vector<digest> digest_vector;
@ -63,25 +64,20 @@ public:
size_t K() const { return k; } size_t K() const { return k; }
/** /**
* Returns the hasher's name. TODO: What's this? * Returns the hasher's name. If not empty, the hasher uses this descriptor
* to seed its *k* hash functions. Otherwise the hasher mixes in the initial
* seed derived from the environment variable `$BRO_SEED`.
*/ */
const std::string& Name() const { return name; } const std::string& Name() const { return name; }
/** bool Serialize(SerialInfo* info) const;
* Constructs the hasher used by the implementation. This hardcodes a static Hasher* Unserialize(UnserialInfo* info);
* specific hashing policy. It exists only because the HashingPolicy
* class hierachy is not yet serializable.
*
* @param k The number of hash functions to apply.
*
* @param name The hasher's name. Hashers with the same name should
* provide consistent results.
*
* @return Returns a new hasher instance.
*/
static Hasher* Create(size_t k, const std::string& name);
protected: protected:
DECLARE_ABSTRACT_SERIAL(Hasher);
Hasher() { }
/** /**
* Constructor. * Constructor.
* *
@ -93,7 +89,7 @@ protected:
Hasher(size_t k, const std::string& name); Hasher(size_t k, const std::string& name);
private: private:
const size_t k; size_t k;
std::string name; std::string name;
}; };
@ -113,7 +109,7 @@ public:
* seed to compute the seed for t to compute the seed NUL-terminated * seed to compute the seed for t to compute the seed NUL-terminated
* string as additional seed. * string as additional seed.
*/ */
UHF(size_t seed, const std::string& extra = ""); UHF(size_t seed = 0, const std::string& extra = "");
template <typename T> template <typename T>
Hasher::digest operator()(const T& x) const Hasher::digest operator()(const T& x) const
@ -175,14 +171,18 @@ public:
* *
* @param name The name of the hasher. * @param name The name of the hasher.
*/ */
DefaultHasher(size_t k, const std::string& name); DefaultHasher(size_t k, const std::string& name = "");
// Overridden from Hasher. // Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const /* final */; virtual digest_vector Hash(const void* x, size_t n) const /* final */;
virtual DefaultHasher* Clone() const /* final */; virtual DefaultHasher* Clone() const /* final */;
virtual bool Equals(const Hasher* other) const /* final */; virtual bool Equals(const Hasher* other) const /* final */;
DECLARE_SERIAL(DefaultHasher);
private: private:
DefaultHasher() { }
std::vector<UHF> hash_functions; std::vector<UHF> hash_functions;
}; };
@ -199,14 +199,18 @@ public:
* *
* @param name The name of the hasher. * @param name The name of the hasher.
*/ */
DoubleHasher(size_t k, const std::string& name); DoubleHasher(size_t k, const std::string& name = "");
// Overridden from Hasher. // Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const /* final */; virtual digest_vector Hash(const void* x, size_t n) const /* final */;
virtual DoubleHasher* Clone() const /* final */; virtual DoubleHasher* Clone() const /* final */;
virtual bool Equals(const Hasher* other) const /* final */; virtual bool Equals(const Hasher* other) const /* final */;
DECLARE_SERIAL(DoubleHasher);
private: private:
DoubleHasher() { }
UHF h1; UHF h1;
UHF h2; UHF h2;
}; };

View file

@ -48,7 +48,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
size_t cells = BasicBloomFilter::M(fp, capacity); size_t cells = BasicBloomFilter::M(fp, capacity);
size_t optimal_k = BasicBloomFilter::K(cells, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity);
const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); const Hasher* h = new DefaultHasher(optimal_k, name->CheckString());
return new BloomFilterVal(new BasicBloomFilter(h, cells)); return new BloomFilterVal(new BasicBloomFilter(h, cells));
%} %}
@ -86,7 +86,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
return 0; return 0;
} }
const Hasher* h = Hasher::Create(k, name->CheckString()); const Hasher* h = new DefaultHasher(k, name->CheckString());
uint16 width = 1; uint16 width = 1;
while ( max >>= 1 ) while ( max >>= 1 )