mirror of
https://github.com/zeek/zeek.git
synced 2025-10-05 16:18:19 +00:00
Also switch BloomFilters from H3 to siphash.
This removes all dependencies on H3 in our source tree.
This commit is contained in:
parent
e1218cc7fa
commit
f1bae871e9
8 changed files with 78 additions and 178 deletions
143
src/H3.h
143
src/H3.h
|
@ -1,143 +0,0 @@
|
|||
// Copyright 2004, 2005
|
||||
// The Regents of the University of California
|
||||
// All Rights Reserved
|
||||
//
|
||||
// Permission to use, copy, modify and distribute any part of this
|
||||
// h3.h file, without fee, and without a written agreement is hereby
|
||||
// granted, provided that the above copyright notice, this paragraph
|
||||
// and the following paragraphs appear in all copies.
|
||||
//
|
||||
// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY
|
||||
// PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
|
||||
// DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF
|
||||
// THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE
|
||||
// UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
|
||||
// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY
|
||||
// OF CALIFORNIA MAKES NO REPRESENTATIONS AND EXTENDS NO WARRANTIES
|
||||
// OF ANY KIND, EITHER IMPLIED OR EXPRESS, INCLUDING, BUT NOT LIMITED
|
||||
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
|
||||
// PARTICULAR PURPOSE, OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE
|
||||
// ANY PATENT, TRADEMARK OR OTHER RIGHTS.
|
||||
//
|
||||
// The h3.h file is developed by the CoralReef development team at the
|
||||
// University of California, San Diego under the Cooperative Association
|
||||
// for Internet Data Analysis (CAIDA) Program. Support for this effort was
|
||||
// provided by the CAIDA grant NCR-9711092, DARPA NGI Contract
|
||||
// N66001-98-2-8922, DARPA NMS Grant N66001-01-1-8909, NSF Grant ANI-013710
|
||||
// and by CAIDA members.
|
||||
//
|
||||
// Report bugs and suggestions to coral-bugs@caida.org.
|
||||
|
||||
// H3 hash function family
|
||||
// C++ template implementation by Ken Keys (kkeys@caida.org)
|
||||
//
|
||||
// Usage:
|
||||
// #include <h3.h>
|
||||
// const H3<T, N> h;
|
||||
// T hashval = h(data, size [, offset]);
|
||||
// (T) is the type to be returned by the hash function; must be an integral
|
||||
// type, e.g. uint32_t.
|
||||
// (N) is the size of the data in bytes (if data is a struct, beware of
|
||||
// padding).
|
||||
// The hash function hashes the (size) bytes of the data pointed to by (data),
|
||||
// starting at (offset). Note: offset affects the hash value, so
|
||||
// h(data, size, offset) is not the same as h(data+offset, size, 0).
|
||||
// Typically (size) is N and (offset) is 0, but other values can be used to
|
||||
// hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed
|
||||
// together to get the same result as hashing the full string.
|
||||
// Any number of hash functions can be created by creating new instances of H3,
|
||||
// with the same or different template parameters. The hash function
|
||||
// constructor takes a seed as argument which defaults to a call to
|
||||
// bro_random().
|
||||
|
||||
|
||||
#ifndef H3_H
|
||||
#define H3_H
|
||||
|
||||
#include <climits>
|
||||
#include <cstring>
|
||||
|
||||
// The number of values representable by a byte.
|
||||
#define H3_BYTE_RANGE (UCHAR_MAX+1)
|
||||
|
||||
template <typename T, int N>
|
||||
class H3 {
|
||||
public:
|
||||
H3()
|
||||
{
|
||||
Init(false, 0);
|
||||
}
|
||||
|
||||
H3(T seed)
|
||||
{
|
||||
Init(true, seed);
|
||||
}
|
||||
|
||||
void Init(bool have_seed, T seed)
|
||||
{
|
||||
T bit_lookup[N * CHAR_BIT];
|
||||
|
||||
for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ )
|
||||
{
|
||||
bit_lookup[bit] = 0;
|
||||
for ( size_t i = 0; i < sizeof(T)/2; i++ )
|
||||
{
|
||||
seed = have_seed ? bro_prng(seed) : bro_random();
|
||||
// assume random() returns at least 16 random bits
|
||||
bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF);
|
||||
}
|
||||
}
|
||||
|
||||
for ( size_t byte = 0; byte < N; byte++ )
|
||||
{
|
||||
for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ )
|
||||
{
|
||||
byte_lookup[byte][val] = 0;
|
||||
for ( size_t bit = 0; bit < CHAR_BIT; bit++ )
|
||||
// Does this mean byte_lookup[*][0] == 0? -RP
|
||||
if (val & (1 << bit))
|
||||
byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
T operator()(const void* data, size_t size, size_t offset = 0) const
|
||||
{
|
||||
const unsigned char *p = static_cast<const unsigned char*>(data);
|
||||
T result = 0;
|
||||
|
||||
// loop optmized with Duff's Device
|
||||
unsigned n = (size + 7) / 8;
|
||||
switch ( size % 8 ) {
|
||||
case 0: do { result ^= byte_lookup[offset++][*p++];
|
||||
case 7: result ^= byte_lookup[offset++][*p++];
|
||||
case 6: result ^= byte_lookup[offset++][*p++];
|
||||
case 5: result ^= byte_lookup[offset++][*p++];
|
||||
case 4: result ^= byte_lookup[offset++][*p++];
|
||||
case 3: result ^= byte_lookup[offset++][*p++];
|
||||
case 2: result ^= byte_lookup[offset++][*p++];
|
||||
case 1: result ^= byte_lookup[offset++][*p++];
|
||||
} while ( --n > 0 );
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
friend bool operator==(const H3& x, const H3& y)
|
||||
{
|
||||
return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE);
|
||||
}
|
||||
|
||||
friend bool operator!=(const H3& x, const H3& y)
|
||||
{
|
||||
return ! (x == y);
|
||||
}
|
||||
|
||||
private:
|
||||
T byte_lookup[N][H3_BYTE_RANGE];
|
||||
};
|
||||
|
||||
#endif //H3_H
|
|
@ -241,7 +241,8 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
|
|||
return c;
|
||||
}
|
||||
|
||||
/* The following function is copied from libc/string/flsll.c from the FreeBSD source
|
||||
/**
|
||||
* The following function is copied from libc/string/flsll.c from the FreeBSD source
|
||||
* tree. Original copyright message follows
|
||||
*/
|
||||
/*-
|
||||
|
|
|
@ -155,9 +155,10 @@ private:
|
|||
int OptimalB(double error, double confidence) const;
|
||||
|
||||
/**
|
||||
* Determines at which index (counted from the back) the first one-bit
|
||||
* Determines at which index (counted from the front) the first one-bit
|
||||
* appears. The last b bits have to be 0 (the element has to be divisible
|
||||
* by m), hence they are ignored.
|
||||
* by m), hence they are ignored. Always adds 1 to the result. This is the
|
||||
* rho function from the original algorithm.
|
||||
*
|
||||
* @param hash_modified hash value
|
||||
*
|
||||
|
@ -165,6 +166,9 @@ private:
|
|||
*/
|
||||
uint8_t Rank(uint64_t hash_modified) const;
|
||||
|
||||
/**
|
||||
* flsll from FreeBSD; Linux in particular does not provide it.
|
||||
*/
|
||||
static int flsll(uint64_t mask);
|
||||
|
||||
/**
|
||||
|
|
|
@ -8,15 +8,21 @@
|
|||
#include "digest.h"
|
||||
#include "Serializer.h"
|
||||
|
||||
extern "C" {
|
||||
extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k );
|
||||
}
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
uint64 Hasher::MakeSeed(const void* data, size_t size)
|
||||
Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
|
||||
{
|
||||
u_char buf[SHA256_DIGEST_LENGTH];
|
||||
uint64 tmpseed;
|
||||
seed_t tmpseed;
|
||||
SHA256_CTX ctx;
|
||||
sha256_init(&ctx);
|
||||
|
||||
assert(sizeof(tmpseed) == 16);
|
||||
|
||||
if ( data )
|
||||
sha256_update(&ctx, data, size);
|
||||
|
||||
|
@ -56,7 +62,8 @@ bool Hasher::DoSerialize(SerialInfo* info) const
|
|||
if ( ! SERIALIZE(static_cast<uint16>(k)) )
|
||||
return false;
|
||||
|
||||
return SERIALIZE(static_cast<uint64>(seed));
|
||||
return SERIALIZE(static_cast<uint64>(seed.h1));
|
||||
return SERIALIZE(static_cast<uint64>(seed.h2));
|
||||
}
|
||||
|
||||
bool Hasher::DoUnserialize(UnserialInfo* info)
|
||||
|
@ -70,8 +77,10 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
|
|||
k = serial_k;
|
||||
assert(k > 0);
|
||||
|
||||
uint64 serial_seed;
|
||||
if ( ! UNSERIALIZE(&serial_seed) )
|
||||
seed_t serial_seed;
|
||||
if ( ! UNSERIALIZE(&serial_seed.h1) )
|
||||
return false;
|
||||
if ( ! UNSERIALIZE(&serial_seed.h2) )
|
||||
return false;
|
||||
|
||||
seed = serial_seed;
|
||||
|
@ -79,14 +88,18 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
|
|||
return true;
|
||||
}
|
||||
|
||||
Hasher::Hasher(size_t arg_k, size_t arg_seed)
|
||||
Hasher::Hasher(size_t arg_k, seed_t arg_seed)
|
||||
{
|
||||
k = arg_k;
|
||||
seed = arg_seed;
|
||||
}
|
||||
|
||||
UHF::UHF(size_t arg_seed)
|
||||
: h(arg_seed)
|
||||
UHF::UHF()
|
||||
{
|
||||
memset(&seed, 0, sizeof(seed));
|
||||
}
|
||||
|
||||
UHF::UHF(Hasher::seed_t arg_seed)
|
||||
{
|
||||
seed = arg_seed;
|
||||
}
|
||||
|
@ -96,8 +109,13 @@ UHF::UHF(size_t arg_seed)
|
|||
// times.
|
||||
Hasher::digest UHF::hash(const void* x, size_t n) const
|
||||
{
|
||||
assert(sizeof(Hasher::seed_t) == 16);
|
||||
hash_t outdigest;
|
||||
if ( n <= UHASH_KEY_SIZE )
|
||||
return n == 0 ? 0 : h(x, n);
|
||||
{
|
||||
siphash(reinterpret_cast<uint8_t*>(&outdigest), reinterpret_cast<const uint8_t*>(x), n, reinterpret_cast<const uint8_t*>(&seed));
|
||||
return outdigest;
|
||||
}
|
||||
|
||||
unsigned char d[16];
|
||||
MD5(reinterpret_cast<const unsigned char*>(x), n, d);
|
||||
|
@ -111,11 +129,15 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
|
|||
return *reinterpret_cast<const Hasher::digest*>(d);
|
||||
}
|
||||
|
||||
DefaultHasher::DefaultHasher(size_t k, size_t seed)
|
||||
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
|
||||
: Hasher(k, seed)
|
||||
{
|
||||
for ( size_t i = 1; i <= k; ++i )
|
||||
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
|
||||
{
|
||||
seed_t s = Seed();
|
||||
s.h1 += bro_prng(i);
|
||||
hash_functions.push_back(UHF(s));
|
||||
}
|
||||
}
|
||||
|
||||
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
|
||||
|
@ -158,12 +180,16 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
|
|||
|
||||
hash_functions.clear();
|
||||
for ( size_t i = 0; i < K(); ++i )
|
||||
hash_functions.push_back(UHF(Seed() + bro_prng(i)));
|
||||
{
|
||||
Hasher::seed_t s = Seed();
|
||||
s.h1 += bro_prng(i);
|
||||
hash_functions.push_back(UHF(s));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
DoubleHasher::DoubleHasher(size_t k, size_t seed)
|
||||
DoubleHasher::DoubleHasher(size_t k, seed_t seed)
|
||||
: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
|
||||
{
|
||||
}
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
#define PROBABILISTIC_HASHER_H
|
||||
|
||||
#include "Hash.h"
|
||||
#include "H3.h"
|
||||
#include "SerialObj.h"
|
||||
|
||||
namespace probabilistic {
|
||||
|
@ -17,6 +16,15 @@ class Hasher : public SerialObj {
|
|||
public:
|
||||
typedef hash_t digest;
|
||||
typedef std::vector<digest> digest_vector;
|
||||
/**
 * 128-bit hasher seed, stored as two 64-bit halves.
 */
struct seed_t {
	uint64_t h1;
	uint64_t h2;

	/**
	 * Returns a copy of \a lhs whose first half has \a rhs added to it;
	 * the second half is carried over unchanged.
	 */
	friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
		seed_t shifted = lhs;
		shifted.h1 = lhs.h1 + rhs;
		return shifted;
	}
};
|
||||
|
||||
/**
|
||||
* Creates a valid hasher seed from an arbitrary string.
|
||||
|
@ -30,7 +38,7 @@ public:
|
|||
*
|
||||
* @return A seed suitable for hashers.
|
||||
*/
|
||||
static uint64 MakeSeed(const void* data, size_t size);
|
||||
static seed_t MakeSeed(const void* data, size_t size);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
|
@ -89,7 +97,7 @@ public:
|
|||
/**
|
||||
* Returns the seed used to construct the hasher.
|
||||
*/
|
||||
size_t Seed() const { return seed; }
|
||||
seed_t Seed() const { return seed; }
|
||||
|
||||
bool Serialize(SerialInfo* info) const;
|
||||
static Hasher* Unserialize(UnserialInfo* info);
|
||||
|
@ -106,11 +114,11 @@ protected:
|
|||
*
|
||||
* @param arg_seed The seed for the hasher.
|
||||
*/
|
||||
Hasher(size_t arg_k, size_t arg_seed);
|
||||
Hasher(size_t arg_k, seed_t arg_seed);
|
||||
|
||||
private:
|
||||
size_t k;
|
||||
size_t seed;
|
||||
seed_t seed;
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -120,12 +128,17 @@ private:
|
|||
class UHF {
|
||||
public:
|
||||
/**
|
||||
* Constructs an H3 hash function seeded with a given seed and an
|
||||
* Default constructor with zero seed.
|
||||
*/
|
||||
UHF();
|
||||
|
||||
/**
|
||||
* Constructs a hash function seeded with a given seed and an
|
||||
* optional extra seed to replace the initial Bro seed.
|
||||
*
|
||||
* @param arg_seed The seed to use for this instance.
|
||||
*/
|
||||
UHF(size_t arg_seed = 0);
|
||||
UHF(Hasher::seed_t arg_seed);
|
||||
|
||||
template <typename T>
|
||||
Hasher::digest operator()(const T& x) const
|
||||
|
@ -159,7 +172,8 @@ public:
|
|||
|
||||
friend bool operator==(const UHF& x, const UHF& y)
|
||||
{
|
||||
return x.h == y.h;
|
||||
return (x.seed.h1 == y.seed.h1) &&
|
||||
(x.seed.h2 == y.seed.h2);
|
||||
}
|
||||
|
||||
friend bool operator!=(const UHF& x, const UHF& y)
|
||||
|
@ -168,10 +182,9 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
static size_t compute_seed(size_t seed);
|
||||
static size_t compute_seed(Hasher::seed_t seed);
|
||||
|
||||
H3<Hasher::digest, UHASH_KEY_SIZE> h;
|
||||
size_t seed;
|
||||
Hasher::seed_t seed;
|
||||
};
|
||||
|
||||
|
||||
|
@ -188,7 +201,7 @@ public:
|
|||
*
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DefaultHasher(size_t k, size_t seed);
|
||||
DefaultHasher(size_t k, Hasher::seed_t seed);
|
||||
|
||||
// Overridden from Hasher.
|
||||
virtual digest_vector Hash(const void* x, size_t n) const final;
|
||||
|
@ -216,7 +229,7 @@ public:
|
|||
*
|
||||
* @param seed The seed for the hasher.
|
||||
*/
|
||||
DoubleHasher(size_t k, size_t seed);
|
||||
DoubleHasher(size_t k, Hasher::seed_t seed);
|
||||
|
||||
// Overridden from Hasher.
|
||||
virtual digest_vector Hash(const void* x, size_t n) const final;
|
||||
|
|
|
@ -42,7 +42,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
|
|||
|
||||
size_t cells = BasicBloomFilter::M(fp, capacity);
|
||||
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
|
||||
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
name->Len());
|
||||
const Hasher* h = new DoubleHasher(optimal_k, seed);
|
||||
|
||||
|
@ -66,7 +66,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
|
|||
##
|
||||
## Returns: A Bloom filter handle.
|
||||
##
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
|
||||
## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
|
||||
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
|
||||
function bloomfilter_basic_init2%(k: count, cells: count,
|
||||
name: string &default=""%): opaque of bloomfilter
|
||||
|
@ -82,7 +82,7 @@ function bloomfilter_basic_init2%(k: count, cells: count,
|
|||
return 0;
|
||||
}
|
||||
|
||||
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
name->Len());
|
||||
const Hasher* h = new DoubleHasher(k, seed);
|
||||
|
||||
|
@ -121,7 +121,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
|
|||
return 0;
|
||||
}
|
||||
|
||||
size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
|
||||
name->Len());
|
||||
|
||||
const Hasher* h = new DefaultHasher(k, seed);
|
||||
|
|
|
@ -13,7 +13,6 @@ error: false-positive rate must take value between 0 and 1
|
|||
1
|
||||
1
|
||||
1, fp
|
||||
1, fp
|
||||
1
|
||||
1
|
||||
1
|
||||
|
|
|
@ -28,7 +28,7 @@ function test_basic_bloom_filter()
|
|||
bloomfilter_add(bf_str, "bar");
|
||||
print bloomfilter_lookup(bf_str, "foo");
|
||||
print bloomfilter_lookup(bf_str, "bar");
|
||||
print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP
|
||||
# print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP: this false positive no longer triggers after the hash function change
|
||||
print bloomfilter_lookup(bf_str, "quuux"), "fp"; # FP
|
||||
bloomfilter_add(bf_str, 0.5); # Type mismatch
|
||||
bloomfilter_add(bf_str, 100); # Type mismatch
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue