From fd2e155d1af26086d40e12d38f564b7954f4597e Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 17:34:25 +0200 Subject: [PATCH] Tweak hasher interface. --- src/BloomFilter.cc | 34 +++++++------- src/BloomFilter.h | 31 +++++++------ src/CMakeLists.txt | 2 +- src/HashPolicy.cc | 77 -------------------------------- src/HashPolicy.h | 97 ---------------------------------------- src/Hasher.cc | 79 ++++++++++++++++++++++++++++++++ src/Hasher.h | 109 +++++++++++++++++++++++++++++++++++++++++++++ src/bro.bif | 8 ++-- 8 files changed, 225 insertions(+), 212 deletions(-) delete mode 100644 src/HashPolicy.cc delete mode 100644 src/HashPolicy.h create mode 100644 src/Hasher.cc create mode 100644 src/Hasher.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c59092b1e4..f399bddeca 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -6,19 +6,19 @@ #include "Serializer.h" BloomFilter::BloomFilter() - : hash_(NULL) + : hasher_(NULL) { } -BloomFilter::BloomFilter(const HashPolicy* hash_policy) - : hash_(hash_policy) +BloomFilter::BloomFilter(const Hasher* hasher) + : hasher_(hasher) { } BloomFilter::~BloomFilter() { - if ( hash_ ) - delete hash_; + if ( hasher_ ) + delete hasher_; } bool BloomFilter::Serialize(SerialInfo* info) const @@ -35,9 +35,9 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) + if ( ! SERIALIZE(static_cast(hasher_->K())) ) return false; - return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); + return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -49,7 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; - hash_ = HashPolicy::Create(k, name); + hasher_ = Hasher::Create(k, name); delete [] name; return true; } @@ -70,7 +70,7 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same HashPolicy before proceeding. + // TODO: Ensure that x and y use the same Hasher before proceeding. BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; @@ -81,8 +81,8 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) - : BloomFilter(hash_policy), +BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) + : BloomFilter(hasher), bits_(new BitVector(cells)) { } @@ -102,13 +102,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -129,9 +129,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, +CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hash_policy) + : BloomFilter(hasher) { cells_ = new CounterVector(width, cells); } @@ -152,13 +152,13 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 189f4920b7..92f15c6070 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,7 +3,7 @@ #include #include "BitVector.h" -#include "HashPolicy.h" +#include "Hasher.h" class CounterVector; @@ -12,7 +12,7 @@ class CounterVector; */ class BloomFilter : public SerialObj { public: - // At this point we won't let the user choose the hash policy, but we might + // At this point we won't let the user choose the hasher, but we might // open up the interface in the future. virtual ~BloomFilter(); @@ -23,7 +23,7 @@ public: template void Add(const T& x) { - AddImpl(hash_->Hash(&x, sizeof(x))); + AddImpl((*hasher_)(x)); } /** @@ -36,7 +36,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(&x, sizeof(x))); + return CountImpl((*hasher_)(x)); } bool Serialize(SerialInfo* info) const; @@ -50,15 +50,15 @@ protected: /** * Constructs a Bloom filter. * - * @param hash_policy The hash policy to use for this Bloom filter. + * @param hasher The hasher to use for this Bloom filter. 
*/ - BloomFilter(const HashPolicy* hash_policy); + BloomFilter(const Hasher* hasher); - virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; private: - const HashPolicy* hash_; + const Hasher* hasher_; }; /** @@ -98,15 +98,15 @@ public: /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); + BasicBloomFilter(const Hasher* hasher, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: BitVector* bits_; @@ -120,16 +120,15 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, - size_t width); + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f2c7ce6bad..87a3db3b62 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -279,7 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc - HashPolicy.cc + Hasher.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc deleted file mode 100644 index 
7ce754be3c..0000000000 --- a/src/HashPolicy.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "HashPolicy.h" - -#include "digest.h" - -Hasher::Hasher(size_t seed, const std::string& extra) - : h_(compute_seed(seed, extra)) - { - } - -Hasher::hash_type Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 0 : h_(x, n); - } - -size_t Hasher::compute_seed(size_t seed, const std::string& extra) - { - u_char digest[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - else - { - sha256_update(&ctx, extra.c_str(), extra.size()); - } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, digest); - return *reinterpret_cast(digest); - } - - -HashPolicy* HashPolicy::Create(size_t k, const std::string& name) - { - return new DefaultHashing(k, name); - } - -HashPolicy::HashPolicy(size_t k, const std::string& name) - : k_(k), name_(name) - { - } - -DefaultHashing::DefaultHashing(size_t k, const std::string& name) - : HashPolicy(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hashers_.push_back(Hasher(i, name)); - } - -HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const - { - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - -DoubleHashing::DoubleHashing(size_t k, const std::string& name) - : HashPolicy(k, name), - hasher1_(1, name), - hasher2_(2, name) - { - } - -HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const - { - hash_type h1 = hasher1_(x, n); - hash_type h2 = hasher2_(x, n); - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - diff --git a/src/HashPolicy.h b/src/HashPolicy.h deleted file mode 100644 index 7bdb968bfe..0000000000 --- a/src/HashPolicy.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef HashPolicy_h -#define HashPolicy_h - -#include "Hash.h" 
-#include "H3.h" - -/** - * A functor that computes a universal hash function. - */ -class Hasher { -public: - typedef hash_t hash_type; - - /** - * Constructs a hasher seeded by a given seed and optionally an extra - * descriptor. - * - * @param seed The seed to use. - * - * @param extra If not `NULL`, the hasher will not mix in the initial seed - * but instead use this NUL-terminated string as additional seed. - */ - Hasher(size_t seed, const std::string& extra = ""); - - /** - * Computes the hash digest of contiguous data. - * - * @param x A pointer to the beginning of the byte sequence to hash. - * - * @param n The length of the sequence pointed to by *x*. - */ - hash_type operator()(const void* x, size_t n) const; - -private: - static size_t compute_seed(size_t seed, const std::string& extra); - - H3 h_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - */ -class HashPolicy { -public: - /** - * Constructs the hashing policy used by the implementation. This factory - * function exists because the HashingPolicy class hierachy is not yet - * serializable. - */ - static HashPolicy* Create(size_t k, const std::string& name); - - typedef Hasher::hash_type hash_type; - typedef std::vector hash_vector; - - virtual ~HashPolicy() { } - - virtual hash_vector Hash(const void* x, size_t n) const = 0; - - size_t K() const { return k_; } - const std::string& Name() const { return name_; } - -protected: - HashPolicy(size_t k, const std::string& name); - -private: - const size_t k_; - std::string name_; -}; - -/** - * The default hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const /* override */; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. 
- */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; - -#endif diff --git a/src/Hasher.cc b/src/Hasher.cc new file mode 100644 index 0000000000..045adcd174 --- /dev/null +++ b/src/Hasher.cc @@ -0,0 +1,79 @@ +#include "Hasher.h" + +#include "digest.h" + +Hasher::UHF::UHF(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } + +size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. 
+ return *reinterpret_cast(buf); + } + + +Hasher* Hasher::Create(size_t k, const std::string& name) + { + return new DefaultHasher(k, name); + } + +Hasher::Hasher(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHasher::DefaultHasher(size_t k, const std::string& name) + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } + +Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DoubleHasher::DoubleHasher(size_t k, const std::string& name) + : Hasher(k, name), + h1_(1, name), + h2_(2, name) + { + } + +Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/Hasher.h b/src/Hasher.h new file mode 100644 index 0000000000..8d0af6b03f --- /dev/null +++ b/src/Hasher.h @@ -0,0 +1,109 @@ +#ifndef Hasher_h +#define Hasher_h + +#include "Hash.h" +#include "H3.h" + +/** + * The abstract base class for hashers, i.e., constructs which hash elements + * *k* times. + */ +class Hasher { +public: + typedef hash_t digest; + typedef std::vector digest_vector; + + /** + * Constructs the hashing policy used by the implementation. + * + * @todo This factory function exists because the Hasher class + * hierarchy is not yet serializable. + */ + static Hasher* Create(size_t k, const std::string& name); + + virtual ~Hasher() { } + + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + /** + * A universal hash function family. 
+ */ + class UHF { + public: + /** + * Constructs an H3 hash function seeded with a given seed and an optional + * extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this string replaces the initial Bro seed + * when computing the seed for this instance: it is hashed + * together with *seed* in place of the + * initial seed. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + digest hash(const void* x, size_t n) const; + + private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; + }; + + Hasher(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHasher : public Hasher { +public: + DefaultHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + std::vector hash_functions_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions. 
+ */ +class DoubleHasher : public Hasher { +public: + DoubleHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + UHF h1_; + UHF h2_; +}; + +#endif diff --git a/src/bro.bif b/src/bro.bif index d0ce066139..71f8c0716f 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,8 +5008,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); - return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} ## Creates a counting Bloom filter. @@ -5029,11 +5029,11 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); + const Hasher* h = Hasher::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; - return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); %} ## Adds an element to a Bloom filter.