Also switch BloomFilters from H3 to siphash.

This removes all dependencies on H3 in our source tree.
This commit is contained in:
Johanna Amann 2016-07-13 09:04:10 -07:00
parent e1218cc7fa
commit f1bae871e9
8 changed files with 78 additions and 178 deletions

View file

@ -241,7 +241,8 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
return c;
}
/* The following function is copied from libc/string/flsll.c from the FreeBSD source
/**
* The following function is copied from libc/string/flsll.c from the FreeBSD source
* tree. Original copyright message follows
*/
/*-

View file

@ -155,9 +155,10 @@ private:
int OptimalB(double error, double confidence) const;
/**
* Determines at which index (counted from the back) the first one-bit
* Determines at which index (counted from the front) the first one-bit
* appears. The last b bits have to be 0 (the element has to be divisible
* by m), hence they are ignored.
* by m), hence they are ignored. Always adds 1 to the result. This is the
* rho function from the original algorithm.
*
* @param hash_modified hash value
*
@ -165,6 +166,9 @@ private:
*/
uint8_t Rank(uint64_t hash_modified) const;
/**
* flsll from FreeBSD; provided here because some platforms (notably Linux) lack it.
*/
static int flsll(uint64_t mask);
/**

View file

@ -8,15 +8,21 @@
#include "digest.h"
#include "Serializer.h"
extern "C" {
extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k );
}
using namespace probabilistic;
uint64 Hasher::MakeSeed(const void* data, size_t size)
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
{
u_char buf[SHA256_DIGEST_LENGTH];
uint64 tmpseed;
seed_t tmpseed;
SHA256_CTX ctx;
sha256_init(&ctx);
assert(sizeof(tmpseed) == 16);
if ( data )
sha256_update(&ctx, data, size);
@ -56,7 +62,8 @@ bool Hasher::DoSerialize(SerialInfo* info) const
if ( ! SERIALIZE(static_cast<uint16>(k)) )
return false;
return SERIALIZE(static_cast<uint64>(seed));
return SERIALIZE(static_cast<uint64>(seed.h1));
return SERIALIZE(static_cast<uint64>(seed.h2));
}
bool Hasher::DoUnserialize(UnserialInfo* info)
@ -70,8 +77,10 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
k = serial_k;
assert(k > 0);
uint64 serial_seed;
if ( ! UNSERIALIZE(&serial_seed) )
seed_t serial_seed;
if ( ! UNSERIALIZE(&serial_seed.h1) )
return false;
if ( ! UNSERIALIZE(&serial_seed.h2) )
return false;
seed = serial_seed;
@ -79,14 +88,18 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
return true;
}
Hasher::Hasher(size_t arg_k, size_t arg_seed)
// Records the hasher's k value and its seed; no other work is done here.
Hasher::Hasher(size_t arg_k, seed_t arg_seed)
	: k(arg_k), seed(arg_seed)
	{
	}
UHF::UHF(size_t arg_seed)
: h(arg_seed)
// Default constructor: starts from an all-zero seed (both halves cleared).
UHF::UHF()
	{
	seed.h1 = 0;
	seed.h2 = 0;
	}
// Constructs a hash function that uses arg_seed as its seed.
UHF::UHF(Hasher::seed_t arg_seed)
	: seed(arg_seed)
	{
	}
@ -96,8 +109,13 @@ UHF::UHF(size_t arg_seed)
// times.
Hasher::digest UHF::hash(const void* x, size_t n) const
{
assert(sizeof(Hasher::seed_t) == 16);
hash_t outdigest;
if ( n <= UHASH_KEY_SIZE )
return n == 0 ? 0 : h(x, n);
{
siphash(reinterpret_cast<uint8_t*>(&outdigest), reinterpret_cast<const uint8_t*>(x), n, reinterpret_cast<const uint8_t*>(&seed));
return outdigest;
}
unsigned char d[16];
MD5(reinterpret_cast<const unsigned char*>(x), n, d);
@ -111,11 +129,15 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
return *reinterpret_cast<const Hasher::digest*>(d);
}
DefaultHasher::DefaultHasher(size_t k, size_t seed)
// Builds k hash functions, each seeded with the hasher's seed whose first
// half (h1) is offset by bro_prng(i); the second half (h2) is left as is.
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
: Hasher(k, seed)
{
// NOTE(review): this loop runs i over [1, k], whereas the rebuild loop in
// DefaultHasher::DoUnserialize runs i over [0, K()). The two would derive
// different per-function seeds -- confirm which range is intended.
for ( size_t i = 1; i <= k; ++i )
// The next line appears to be the pre-change (removed) loop body.
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
{
// Copy the base seed and perturb only h1 for the i-th function.
seed_t s = Seed();
s.h1 += bro_prng(i);
hash_functions.push_back(UHF(s));
}
}
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
@ -158,12 +180,16 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
hash_functions.clear();
for ( size_t i = 0; i < K(); ++i )
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
{
Hasher::seed_t s = Seed();
s.h1 += bro_prng(i);
hash_functions.push_back(UHF(s));
}
return true;
}
DoubleHasher::DoubleHasher(size_t k, size_t seed)
// Sets up the two underlying hash functions. Each one receives the given
// seed offset by a different bro_prng() value; seed_t's operator+ applies
// that offset to the h1 half only.
DoubleHasher::DoubleHasher(size_t k, seed_t seed)
	: Hasher(k, seed),
	  h1(seed + bro_prng(1)),
	  h2(seed + bro_prng(2))
	{
	}

View file

@ -4,7 +4,6 @@
#define PROBABILISTIC_HASHER_H
#include "Hash.h"
#include "H3.h"
#include "SerialObj.h"
namespace probabilistic {
@ -17,6 +16,15 @@ class Hasher : public SerialObj {
public:
typedef hash_t digest;
typedef std::vector<digest> digest_vector;
/**
 * Seed for the hashers, made up of two 64-bit halves.
 */
struct seed_t {
	uint64_t h1;
	uint64_t h2;

	/**
	 * Returns a copy of lhs whose h1 half has been advanced by rhs
	 * (unsigned wrap-around applies); h2 is carried over unchanged.
	 */
	friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
		seed_t shifted = lhs;
		shifted.h1 = lhs.h1 + rhs;
		return shifted;
	}
};
/**
* Creates a valid hasher seed from an arbitrary string.
@ -30,7 +38,7 @@ public:
*
* @return A seed suitable for hashers.
*/
static uint64 MakeSeed(const void* data, size_t size);
static seed_t MakeSeed(const void* data, size_t size);
/**
* Destructor.
@ -89,7 +97,7 @@ public:
/**
* Returns the seed used to construct the hasher.
*/
size_t Seed() const { return seed; }
seed_t Seed() const { return seed; }
bool Serialize(SerialInfo* info) const;
static Hasher* Unserialize(UnserialInfo* info);
@ -106,11 +114,11 @@ protected:
*
* @param arg_seed The seed for the hasher.
*/
Hasher(size_t arg_k, size_t arg_seed);
Hasher(size_t arg_k, seed_t arg_seed);
private:
size_t k;
size_t seed;
seed_t seed;
};
/**
@ -120,12 +128,17 @@ private:
class UHF {
public:
/**
* Constructs an H3 hash function seeded with a given seed and an
* Default constructor with zero seed.
*/
UHF();
/**
* Constructs a hash function seeded with a given seed and an
* optional extra seed to replace the initial Bro seed.
*
* @param arg_seed The seed to use for this instance.
*/
UHF(size_t arg_seed = 0);
UHF(Hasher::seed_t arg_seed);
template <typename T>
Hasher::digest operator()(const T& x) const
@ -159,7 +172,8 @@ public:
/**
 * Equality of two hash functions. In the seed-based form below, two
 * UHFs are equal exactly when both halves of their seeds match.
 */
friend bool operator==(const UHF& x, const UHF& y)
{
// The next line appears to be the pre-change (removed) comparison of the
// H3 member.
return x.h == y.h;
return (x.seed.h1 == y.seed.h1) &&
(x.seed.h2 == y.seed.h2);
}
friend bool operator!=(const UHF& x, const UHF& y)
@ -168,10 +182,9 @@ public:
}
private:
static size_t compute_seed(size_t seed);
static size_t compute_seed(Hasher::seed_t seed);
H3<Hasher::digest, UHASH_KEY_SIZE> h;
size_t seed;
Hasher::seed_t seed;
};
@ -188,7 +201,7 @@ public:
*
* @param seed The seed for the hasher.
*/
DefaultHasher(size_t k, size_t seed);
DefaultHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final;
@ -216,7 +229,7 @@ public:
*
* @param seed The seed for the hasher.
*/
DoubleHasher(size_t k, size_t seed);
DoubleHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final;

View file

@ -42,7 +42,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
size_t cells = BasicBloomFilter::M(fp, capacity);
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DoubleHasher(optimal_k, seed);
@ -66,7 +66,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
function bloomfilter_basic_init2%(k: count, cells: count,
name: string &default=""%): opaque of bloomfilter
@ -82,7 +82,7 @@ function bloomfilter_basic_init2%(k: count, cells: count,
return 0;
}
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DoubleHasher(k, seed);
@ -121,7 +121,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
return 0;
}
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DefaultHasher(k, seed);