Tweak hasher interface.

This commit is contained in:
Matthias Vallentin 2013-07-21 17:34:25 +02:00
parent 446344ae99
commit fd2e155d1a
8 changed files with 225 additions and 212 deletions

View file

@ -6,19 +6,19 @@
#include "Serializer.h"
BloomFilter::BloomFilter()
: hash_(NULL)
: hasher_(NULL)
{
}
BloomFilter::BloomFilter(const HashPolicy* hash_policy)
: hash_(hash_policy)
BloomFilter::BloomFilter(const Hasher* hasher)
: hasher_(hasher)
{
}
BloomFilter::~BloomFilter()
{
if ( hash_ )
delete hash_;
if ( hasher_ )
delete hasher_;
}
bool BloomFilter::Serialize(SerialInfo* info) const
@ -35,9 +35,9 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info)
bool BloomFilter::DoSerialize(SerialInfo* info) const
{
DO_SERIALIZE(SER_BLOOMFILTER, SerialObj);
if ( ! SERIALIZE(static_cast<uint16>(hash_->K())) )
if ( ! SERIALIZE(static_cast<uint16>(hasher_->K())) )
return false;
return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size());
return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size());
}
bool BloomFilter::DoUnserialize(UnserialInfo* info)
@ -49,7 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info)
const char* name;
if ( ! UNSERIALIZE_STR(&name, 0) )
return false;
hash_ = HashPolicy::Create(k, name);
hasher_ = Hasher::Create(k, name);
delete [] name;
return true;
}
@ -70,7 +70,7 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity)
BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x,
const BasicBloomFilter* y)
{
// TODO: Ensure that x and y use the same HashPolicy before proceeding.
// TODO: Ensure that x and y use the same Hasher before proceeding.
BasicBloomFilter* result = new BasicBloomFilter();
result->bits_ = new BitVector(*x->bits_ | *y->bits_);
return result;
@ -81,8 +81,8 @@ BasicBloomFilter::BasicBloomFilter()
{
}
BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells)
: BloomFilter(hash_policy),
BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells)
: BloomFilter(hasher),
bits_(new BitVector(cells))
{
}
@ -102,13 +102,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
return bits_ != NULL;
}
void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h)
void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h)
{
for ( size_t i = 0; i < h.size(); ++i )
bits_->Set(h[i] % bits_->Size());
}
size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const
size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const
{
for ( size_t i = 0; i < h.size(); ++i )
if ( ! (*bits_)[h[i] % bits_->Size()] )
@ -129,9 +129,9 @@ CountingBloomFilter::CountingBloomFilter()
{
}
CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy,
CountingBloomFilter::CountingBloomFilter(const Hasher* hasher,
size_t cells, size_t width)
: BloomFilter(hash_policy)
: BloomFilter(hasher)
{
cells_ = new CounterVector(width, cells);
}
@ -152,13 +152,13 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info)
return cells_ != NULL;
}
void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h)
void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h)
{
for ( size_t i = 0; i < h.size(); ++i )
cells_->Increment(h[i] % cells_->Size(), 1);
}
size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const
size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const
{
CounterVector::size_type min =
std::numeric_limits<CounterVector::size_type>::max();

View file

@ -3,7 +3,7 @@
#include <vector>
#include "BitVector.h"
#include "HashPolicy.h"
#include "Hasher.h"
class CounterVector;
@ -12,7 +12,7 @@ class CounterVector;
*/
class BloomFilter : public SerialObj {
public:
// At this point we won't let the user choose the hash policy, but we might
// At this point we won't let the user choose the hasher, but we might
// open up the interface in the future.
virtual ~BloomFilter();
@ -23,7 +23,7 @@ public:
template <typename T>
void Add(const T& x)
{
AddImpl(hash_->Hash(&x, sizeof(x)));
AddImpl((*hasher_)(x));
}
/**
@ -36,7 +36,7 @@ public:
template <typename T>
size_t Count(const T& x) const
{
return CountImpl(hash_->Hash(&x, sizeof(x)));
return CountImpl((*hasher_)(x));
}
bool Serialize(SerialInfo* info) const;
@ -50,15 +50,15 @@ protected:
/**
* Constructs a Bloom filter.
*
* @param hash_policy The hash policy to use for this Bloom filter.
* @param hasher The hasher to use for this Bloom filter.
*/
BloomFilter(const HashPolicy* hash_policy);
BloomFilter(const Hasher* hasher);
virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0;
virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0;
virtual void AddImpl(const Hasher::digest_vector& hashes) = 0;
virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0;
private:
const HashPolicy* hash_;
const Hasher* hasher_;
};
/**
@ -98,15 +98,15 @@ public:
/**
* Constructs a basic Bloom filter with a given number of cells.
*/
BasicBloomFilter(const HashPolicy* hash_policy, size_t cells);
BasicBloomFilter(const Hasher* hasher, size_t cells);
protected:
DECLARE_SERIAL(BasicBloomFilter);
BasicBloomFilter();
virtual void AddImpl(const HashPolicy::hash_vector& h);
virtual size_t CountImpl(const HashPolicy::hash_vector& h) const;
virtual void AddImpl(const Hasher::digest_vector& h);
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
private:
BitVector* bits_;
@ -120,16 +120,15 @@ public:
static CountingBloomFilter* Merge(const CountingBloomFilter* x,
const CountingBloomFilter* y);
CountingBloomFilter(const HashPolicy* hash_policy, size_t cells,
size_t width);
CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width);
protected:
DECLARE_SERIAL(CountingBloomFilter);
CountingBloomFilter();
virtual void AddImpl(const HashPolicy::hash_vector& h);
virtual size_t CountImpl(const HashPolicy::hash_vector& h) const;
virtual void AddImpl(const Hasher::digest_vector& h);
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
private:
CounterVector* cells_;

View file

@ -279,7 +279,7 @@ set(bro_SRCS
Frame.cc
Func.cc
Hash.cc
HashPolicy.cc
Hasher.cc
ID.cc
IntSet.cc
IOSource.cc

View file

@ -1,77 +0,0 @@
#include "HashPolicy.h"
#include "digest.h"
// Constructs the functor. The underlying hash function h_ is initialized with
// the value that compute_seed() derives from *seed* and *extra*.
Hasher::Hasher(size_t seed, const std::string& extra)
: h_(compute_seed(seed, extra))
{
}
// Computes the digest of the n bytes starting at x. A sequence of length 0
// always maps to 0 and never reaches the underlying hash function.
Hasher::hash_type Hasher::operator()(const void* x, size_t n) const
	{
	if ( n == 0 )
		return 0;

	return h_(x, n);
	}
// Derives the seed handed to the underlying hash function.
//
// A SHA-256 context is fed with (a) *extra* if it is non-empty, or otherwise
// the value returned by initial_seed(), followed by (b) the raw bytes of
// *seed*. The first sizeof(size_t) bytes of the resulting digest are returned.
size_t Hasher::compute_seed(size_t seed, const std::string& extra)
	{
	u_char digest[SHA256_DIGEST_LENGTH];
	SHA256_CTX ctx;
	sha256_init(&ctx);

	if ( extra.empty() )
		{
		unsigned int first_seed = initial_seed();
		sha256_update(&ctx, &first_seed, sizeof(first_seed));
		}
	else
		sha256_update(&ctx, extra.c_str(), extra.size());

	sha256_update(&ctx, &seed, sizeof(seed));
	sha256_final(&ctx, digest);

	// Copy the leading digest bytes into the result one byte at a time
	// instead of dereferencing reinterpret_cast<size_t*>(digest): the byte
	// array carries no alignment guarantee for size_t, and reading it through
	// a size_t lvalue violates the aliasing rules. Writing through an
	// unsigned char pointer is well-defined and yields the same value a
	// memcpy would.
	size_t result = 0;
	unsigned char* out = reinterpret_cast<unsigned char*>(&result);

	for ( size_t i = 0; i < sizeof(result); ++i )
		out[i] = digest[i];

	return result;
	}
// Factory for hash policies. At present it unconditionally hands back a
// DefaultHashing instance allocated with new.
HashPolicy* HashPolicy::Create(size_t k, const std::string& name)
	{
	HashPolicy* policy = new DefaultHashing(k, name);
	return policy;
	}
// Stores the number of hash functions (*k*) and the policy's *name*; both are
// later exposed through K() and Name().
HashPolicy::HashPolicy(size_t k, const std::string& name)
: k_(k), name_(name)
{
}
// Creates the *k* hash functions of this policy. The i-th function is
// constructed as Hasher(i, name).
DefaultHashing::DefaultHashing(size_t k, const std::string& name)
	: HashPolicy(k, name)
	{
	// Size the vector once up front; otherwise push_back() may reallocate
	// and copy the already-constructed hash functions several times.
	hashers_.reserve(k);

	for ( size_t i = 0; i < k; ++i )
		hashers_.push_back(Hasher(i, name));
	}
// Applies each of the K() hash functions to the n bytes starting at x and
// returns the digests in function order.
HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const
	{
	hash_vector digests(K(), 0);

	hash_vector::size_type idx = 0;
	while ( idx < digests.size() )
		{
		digests[idx] = hashers_[idx](x, n);
		++idx;
		}

	return digests;
	}
// Sets up the two hash functions the policy combines; they are seeded with
// the fixed values 1 and 2, each together with *name*.
DoubleHashing::DoubleHashing(size_t k, const std::string& name)
: HashPolicy(k, name),
hasher1_(1, name),
hasher2_(2, name)
{
}
// Produces K() digests from only two hash computations: digest i is the
// linear combination h1 + i * h2 of the two base digests.
HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const
	{
	const hash_type first = hasher1_(x, n);
	const hash_type second = hasher2_(x, n);

	hash_vector digests(K(), 0);

	size_t i = 0;
	for ( hash_vector::iterator it = digests.begin(); it != digests.end(); ++it, ++i )
		*it = first + i * second;

	return digests;
	}

View file

@ -1,97 +0,0 @@
#ifndef HashPolicy_h
#define HashPolicy_h
#include "Hash.h"
#include "H3.h"
/**
* A functor that computes a universal hash function.
*/
class Hasher {
public:
// The digest type produced by this functor.
typedef hash_t hash_type;
/**
* Constructs a hasher seeded by a given seed and optionally an extra
* descriptor.
*
* @param seed The seed to use.
*
* @param extra If not empty, the hasher will not mix in the initial seed
* but instead use this string as additional seed.
*/
Hasher(size_t seed, const std::string& extra = "");
/**
* Computes the hash digest of contiguous data.
*
* @param x A pointer to the beginning of the byte sequence to hash.
*
* @param n The length of the sequence pointed to by *x*.
*
* @return The digest of the sequence, or 0 if *n* is 0.
*/
hash_type operator()(const void* x, size_t n) const;
private:
// Combines *seed* with either *extra* or the initial seed into the value
// used to initialize h_.
static size_t compute_seed(size_t seed, const std::string& extra);
// The underlying H3 hash function.
H3<hash_type, UHASH_KEY_SIZE> h_;
};
/**
* The abstract base class for hash policies that hash elements *k* times.
*/
class HashPolicy {
public:
/**
* Constructs the hashing policy used by the implementation. This factory
* function exists because the HashPolicy class hierarchy is not yet
* serializable.
*
* @param k The number of hash functions, later available through K().
*
* @param name The name of the policy, later available through Name().
*
* @return A policy allocated with new.
*/
static HashPolicy* Create(size_t k, const std::string& name);
typedef Hasher::hash_type hash_type;
typedef std::vector<hash_type> hash_vector;
virtual ~HashPolicy() { }
/**
* Hashes a contiguous byte sequence.
*
* @param x A pointer to the beginning of the byte sequence to hash.
*
* @param n The length of the sequence pointed to by *x*.
*
* @return The computed digests.
*/
virtual hash_vector Hash(const void* x, size_t n) const = 0;
// Returns the value of *k* given at construction.
size_t K() const { return k_; }
// Returns the name given at construction.
const std::string& Name() const { return name_; }
protected:
// Only derived policies may construct the base; see Create().
HashPolicy(size_t k, const std::string& name);
private:
const size_t k_;
std::string name_;
};
/**
* The default hashing policy. Performs *k* hash function computations, one
* per stored hash function.
*/
class DefaultHashing : public HashPolicy {
public:
DefaultHashing(size_t k, const std::string& name);
virtual hash_vector Hash(const void* x, size_t n) const /* override */;
private:
// The *k* hash functions; the i-th one is seeded with (i, name).
std::vector<Hasher> hashers_;
};
/**
* The *double-hashing* policy. Uses a linear combination of two hash functions:
* digest *i* is computed as h1 + i * h2.
*/
class DoubleHashing : public HashPolicy {
public:
DoubleHashing(size_t k, const std::string& name);
virtual hash_vector Hash(const void* x, size_t n) const;
private:
// The two base hash functions, seeded with 1 and 2 respectively.
Hasher hasher1_;
Hasher hasher2_;
};
#endif

79
src/Hasher.cc Normal file
View file

@ -0,0 +1,79 @@
#include "Hasher.h"
#include "digest.h"
// Constructs the hash function. The underlying H3 instance h_ is initialized
// with the value that compute_seed() derives from *seed* and *extra*.
Hasher::UHF::UHF(size_t seed, const std::string& extra)
: h_(compute_seed(seed, extra))
{
}
// Computes the digest of the n bytes starting at x. The length must not
// exceed UHASH_KEY_SIZE (checked by assertion). A sequence of length 0 always
// maps to 0 and never reaches the underlying hash function.
Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const
	{
	assert(n <= UHASH_KEY_SIZE);

	if ( n == 0 )
		return 0;

	return h_(x, n);
	}
// Derives the seed handed to the underlying H3 hash function.
//
// A SHA-256 context is fed with (a) *extra* if it is non-empty, or otherwise
// the value returned by initial_seed(), followed by (b) the raw bytes of
// *seed*. The first sizeof(size_t) bytes of the resulting digest are returned.
size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra)
	{
	u_char buf[SHA256_DIGEST_LENGTH];
	SHA256_CTX ctx;
	sha256_init(&ctx);

	if ( extra.empty() )
		{
		unsigned int first_seed = initial_seed();
		sha256_update(&ctx, &first_seed, sizeof(first_seed));
		}
	else
		sha256_update(&ctx, extra.c_str(), extra.size());

	sha256_update(&ctx, &seed, sizeof(seed));
	sha256_final(&ctx, buf);

	// Take the first sizeof(size_t) bytes as seed. They are copied one byte
	// at a time instead of dereferencing reinterpret_cast<size_t*>(buf): the
	// byte array carries no alignment guarantee for size_t, and reading it
	// through a size_t lvalue violates the aliasing rules. Writing through
	// an unsigned char pointer is well-defined and yields the same value a
	// memcpy would.
	size_t result = 0;
	unsigned char* out = reinterpret_cast<unsigned char*>(&result);

	for ( size_t i = 0; i < sizeof(result); ++i )
		out[i] = buf[i];

	return result;
	}
// Factory for hashers. At present it unconditionally hands back a
// DefaultHasher instance allocated with new.
Hasher* Hasher::Create(size_t k, const std::string& name)
	{
	Hasher* hasher = new DefaultHasher(k, name);
	return hasher;
	}
// Stores the number of hash functions (*k*) and the hasher's *name*; both are
// later exposed through K() and Name().
Hasher::Hasher(size_t k, const std::string& name)
: k_(k), name_(name)
{
}
// Creates the *k* universal hash functions of this hasher. The i-th function
// is constructed as UHF(i, name).
DefaultHasher::DefaultHasher(size_t k, const std::string& name)
	: Hasher(k, name)
	{
	// Size the vector once up front; otherwise push_back() may reallocate
	// and copy the already-constructed hash functions several times.
	hash_functions_.reserve(k);

	for ( size_t i = 0; i < k; ++i )
		hash_functions_.push_back(UHF(i, name));
	}
// Applies each of the K() hash functions to the n bytes starting at x and
// returns the digests in function order.
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
	{
	digest_vector digests(K(), 0);

	digest_vector::size_type idx = 0;
	while ( idx < digests.size() )
		{
		digests[idx] = hash_functions_[idx](x, n);
		++idx;
		}

	return digests;
	}
// Sets up the two hash functions the hasher combines; they are seeded with
// the fixed values 1 and 2, each together with *name*.
DoubleHasher::DoubleHasher(size_t k, const std::string& name)
: Hasher(k, name),
h1_(1, name),
h2_(2, name)
{
}
// Produces K() digests from only two hash computations: digest i is the
// linear combination h1 + i * h2 of the two base digests.
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const
	{
	const digest first = h1_(x, n);
	const digest second = h2_(x, n);

	digest_vector digests(K(), 0);

	size_t i = 0;
	for ( digest_vector::iterator it = digests.begin(); it != digests.end(); ++it, ++i )
		*it = first + i * second;

	return digests;
	}

109
src/Hasher.h Normal file
View file

@ -0,0 +1,109 @@
#ifndef Hasher_h
#define Hasher_h
#include "Hash.h"
#include "H3.h"
/**
* The abstract base class for hashers, i.e., constructs which hash elements
* *k* times.
*/
class Hasher {
public:
// The type of a single hash value and of the sequence of hash values
// computed for one element.
typedef hash_t digest;
typedef std::vector<digest> digest_vector;
/**
* Constructs the hasher used by the implementation.
*
* @param k The number of hash functions, later available through K().
*
* @param name The name of the hasher, later available through Name().
*
* @return A hasher allocated with new.
*
* @todo This factory function exists because the Hasher class
* hierarchy is not yet serializable.
*/
static Hasher* Create(size_t k, const std::string& name);
virtual ~Hasher() { }
/**
* Hashes an object by passing its address and `sizeof(T)` to Hash().
*
* @param x The object to hash.
*
* @return The digests computed by Hash().
*/
template <typename T>
digest_vector operator()(const T& x) const
{
return Hash(&x, sizeof(T));
}
/**
* Hashes a contiguous byte sequence.
*
* @param x A pointer to the beginning of the byte sequence to hash.
*
* @param n The length of the sequence pointed to by *x*.
*
* @return The computed digests.
*/
virtual digest_vector Hash(const void* x, size_t n) const = 0;
// Returns the value of *k* given at construction.
size_t K() const { return k_; }
// Returns the name given at construction.
const std::string& Name() const { return name_; }
protected:
/**
* A universal hash function family.
*/
class UHF {
public:
/**
* Constructs an H3 hash function seeded with a given seed and an optional
* extra seed to replace the initial Bro seed.
*
* @param seed The seed to use for this instance.
*
* @param extra If not empty, this string replaces the initial Bro seed
* when computing the seed of the hash function.
*/
UHF(size_t seed, const std::string& extra = "");
// Hashes an object by passing its address and sizeof(T) to hash().
template <typename T>
digest operator()(const T& x) const
{
return hash(&x, sizeof(T));
}
// Hashes the n bytes starting at x; forwards to hash().
digest operator()(const void* x, size_t n) const
{
return hash(x, n);
}
/**
* Computes the digest of a contiguous byte sequence.
*
* @param x A pointer to the beginning of the byte sequence to hash.
*
* @param n The length of the sequence pointed to by *x*; asserted to be at
* most UHASH_KEY_SIZE.
*
* @return The digest of the sequence, or 0 if *n* is 0.
*/
digest hash(const void* x, size_t n) const;
private:
// Combines *seed* with either *extra* or the initial seed into the value
// used to initialize h_.
static size_t compute_seed(size_t seed, const std::string& extra);
// The underlying H3 hash function.
H3<digest, UHASH_KEY_SIZE> h_;
};
// Only derived hashers may construct the base; see Create().
Hasher(size_t k, const std::string& name);
private:
const size_t k_;
std::string name_;
};
/**
* The default hasher. Performs *k* hash function computations, one per
* stored hash function.
*/
class DefaultHasher : public Hasher {
public:
DefaultHasher(size_t k, const std::string& name);
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
private:
// The *k* hash functions; the i-th one is seeded with (i, name).
std::vector<UHF> hash_functions_;
};
/**
* The *double-hashing* policy. Uses a linear combination of two hash functions:
* digest *i* is computed as h1 + i * h2.
*/
class DoubleHasher : public Hasher {
public:
DoubleHasher(size_t k, const std::string& name);
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
private:
// The two base hash functions, seeded with 1 and 2 respectively.
UHF h1_;
UHF h2_;
};
#endif

View file

@ -5008,8 +5008,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
size_t cells = BasicBloomFilter::M(fp, capacity);
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString());
return new BloomFilterVal(new BasicBloomFilter(hp, cells));
const Hasher* h = Hasher::Create(optimal_k, name->CheckString());
return new BloomFilterVal(new BasicBloomFilter(h, cells));
%}
## Creates a counting Bloom filter.
@ -5029,11 +5029,11 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
function bloomfilter_counting_init%(k: count, cells: count, max: count,
name: string &default=""%): opaque of bloomfilter
%{
const HashPolicy* hp = HashPolicy::Create(k, name->CheckString());
const Hasher* h = Hasher::Create(k, name->CheckString());
uint16 width = 0;
while ( max >>= 1 )
++width;
return new BloomFilterVal(new CountingBloomFilter(hp, cells, width));
return new BloomFilterVal(new CountingBloomFilter(h, cells, width));
%}
## Adds an element to a Bloom filter.