diff --git a/src/H3.h b/src/H3.h
deleted file mode 100644
index 3b4b9ee539..0000000000
--- a/src/H3.h
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright 2004, 2005
-// The Regents of the University of California
-// All Rights Reserved
-//
-// Permission to use, copy, modify and distribute any part of this
-// h3.h file, without fee, and without a written agreement is hereby
-// granted, provided that the above copyright notice, this paragraph
-// and the following paragraphs appear in all copies.
-//
-// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY
-// PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
-// DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//
-// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE
-// UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
-// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY
-// OF CALIFORNIA MAKES NO REPRESENTATIONS AND EXTENDS NO WARRANTIES
-// OF ANY KIND, EITHER IMPLIED OR EXPRESS, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
-// PARTICULAR PURPOSE, OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE
-// ANY PATENT, TRADEMARK OR OTHER RIGHTS.
-//
-// The h3.h file is developed by the CoralReef development team at the
-// University of California, San Diego under the Cooperative Association
-// for Internet Data Analysis (CAIDA) Program. Support for this effort was
-// provided by the CAIDA grant NCR-9711092, DARPA NGI Contract
-// N66001-98-2-8922, DARPA NMS Grant N66001-01-1-8909, NSF Grant ANI-013710
-// and by CAIDA members.
-//
-// Report bugs and suggestions to coral-bugs@caida.org.
-
-// H3 hash function family
-// C++ template implementation by Ken Keys (kkeys@caida.org)
-//
-// Usage:
-// #include
-// const H3 h;
-// T hashval = h(data, size [, offset]);
-// (T) is the type to be returned by the hash function; must be an integral
-// type, e.g. uint32_t.
-// (N) is the size of the data in bytes (if data is a struct, beware of
-// padding).
-// The hash function hashes the (size) bytes of the data pointed to by (data),
-// starting at (offset). Note: offset affects the hash value, so
-// h(data, size, offset) is not the same as h(data+offset, size, 0).
-// Typically (size) is N and (offset) is 0, but other values can be used to
-// hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed
-// together to get the same result as hashing the full string.
-// Any number of hash functions can be created by creating new instances of H3,
-// with the same or different template parameters. The hash function
-// constructor takes a seed as argument which defaults to a call to
-// bro_random().
-
-
-#ifndef H3_H
-#define H3_H
-
-#include
-#include
-
-// The number of values representable by a byte.
-#define H3_BYTE_RANGE (UCHAR_MAX+1)
-
-template
-class H3 {
-public:
- H3()
- {
- Init(false, 0);
- }
-
- H3(T seed)
- {
- Init(true, seed);
- }
-
- void Init(bool have_seed, T seed)
- {
- T bit_lookup[N * CHAR_BIT];
-
- for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ )
- {
- bit_lookup[bit] = 0;
- for ( size_t i = 0; i < sizeof(T)/2; i++ )
- {
- seed = have_seed ? bro_prng(seed) : bro_random();
- // assume random() returns at least 16 random bits
- bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF);
- }
- }
-
- for ( size_t byte = 0; byte < N; byte++ )
- {
- for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ )
- {
- byte_lookup[byte][val] = 0;
- for ( size_t bit = 0; bit < CHAR_BIT; bit++ )
- // Does this mean byte_lookup[*][0] == 0? -RP
- if (val & (1 << bit))
- byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit];
- }
- }
- }
-
- T operator()(const void* data, size_t size, size_t offset = 0) const
- {
- const unsigned char *p = static_cast(data);
- T result = 0;
-
- // loop optmized with Duff's Device
- unsigned n = (size + 7) / 8;
- switch ( size % 8 ) {
- case 0: do { result ^= byte_lookup[offset++][*p++];
- case 7: result ^= byte_lookup[offset++][*p++];
- case 6: result ^= byte_lookup[offset++][*p++];
- case 5: result ^= byte_lookup[offset++][*p++];
- case 4: result ^= byte_lookup[offset++][*p++];
- case 3: result ^= byte_lookup[offset++][*p++];
- case 2: result ^= byte_lookup[offset++][*p++];
- case 1: result ^= byte_lookup[offset++][*p++];
- } while ( --n > 0 );
- }
-
- return result;
- }
-
- friend bool operator==(const H3& x, const H3& y)
- {
- return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE);
- }
-
- friend bool operator!=(const H3& x, const H3& y)
- {
- return ! (x == y);
- }
-
-private:
- T byte_lookup[N][H3_BYTE_RANGE];
-};
-
-#endif //H3_H
diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc
index ea16711a21..d6d0de3657 100644
--- a/src/probabilistic/CardinalityCounter.cc
+++ b/src/probabilistic/CardinalityCounter.cc
@@ -241,7 +241,8 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
return c;
}
-/* The following function is copied from libc/string/flsll.c from the FreeBSD source
+/**
+ * The following function is copied from libc/string/flsll.c from the FreeBSD source
* tree. Original copyright message follows
*/
/*-
diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h
index 2576c0276d..e8784c1607 100644
--- a/src/probabilistic/CardinalityCounter.h
+++ b/src/probabilistic/CardinalityCounter.h
@@ -155,9 +155,10 @@ private:
int OptimalB(double error, double confidence) const;
/**
- * Determines at which index (counted from the back) the first one-bit
+ * Determines at which index (counted from the front) the first one-bit
* appears. The last b bits have to be 0 (the element has to be divisible
- * by m), hence they are ignored.
+ * by m), hence they are ignored. Always adds 1 to the result. This is the
+ * rho function from the original algorithm.
*
* @param hash_modified hash value
*
@@ -165,6 +166,9 @@ private:
*/
uint8_t Rank(uint64_t hash_modified) const;
+ /**
+ * flsll from FreeBSD; provided here because some platforms (notably Linux) lack it.
+ */
static int flsll(uint64_t mask);
/**
diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc
index 0f209bfb5b..725d90b893 100644
--- a/src/probabilistic/Hasher.cc
+++ b/src/probabilistic/Hasher.cc
@@ -8,15 +8,21 @@
#include "digest.h"
#include "Serializer.h"
+extern "C" {
+extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k );
+}
+
using namespace probabilistic;
-uint64 Hasher::MakeSeed(const void* data, size_t size)
+Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size)
{
u_char buf[SHA256_DIGEST_LENGTH];
- uint64 tmpseed;
+ seed_t tmpseed;
SHA256_CTX ctx;
sha256_init(&ctx);
+ static_assert(sizeof(tmpseed) == 16, "seed_t must be exactly 128 bits"); // compile-time; also checked under NDEBUG
+
if ( data )
sha256_update(&ctx, data, size);
@@ -56,7 +62,8 @@ bool Hasher::DoSerialize(SerialInfo* info) const
if ( ! SERIALIZE(static_cast(k)) )
return false;
- return SERIALIZE(static_cast(seed));
+ return SERIALIZE(static_cast<uint64>(seed.h1)) &&
+        SERIALIZE(static_cast<uint64>(seed.h2)); // write both halves: DoUnserialize reads h1 then h2
}
bool Hasher::DoUnserialize(UnserialInfo* info)
@@ -70,8 +77,10 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
k = serial_k;
assert(k > 0);
- uint64 serial_seed;
- if ( ! UNSERIALIZE(&serial_seed) )
+ seed_t serial_seed;
+ if ( ! UNSERIALIZE(&serial_seed.h1) )
+ return false;
+ if ( ! UNSERIALIZE(&serial_seed.h2) )
return false;
seed = serial_seed;
@@ -79,14 +88,18 @@ bool Hasher::DoUnserialize(UnserialInfo* info)
return true;
}
-Hasher::Hasher(size_t arg_k, size_t arg_seed)
+Hasher::Hasher(size_t arg_k, seed_t arg_seed)
{
k = arg_k;
seed = arg_seed;
}
-UHF::UHF(size_t arg_seed)
- : h(arg_seed)
+UHF::UHF()
+ {
+ memset(&seed, 0, sizeof(seed));
+ }
+
+UHF::UHF(Hasher::seed_t arg_seed)
{
seed = arg_seed;
}
@@ -96,8 +109,13 @@ UHF::UHF(size_t arg_seed)
// times.
Hasher::digest UHF::hash(const void* x, size_t n) const
{
+ static_assert(sizeof(Hasher::seed_t) == 16, "siphash expects a 128-bit key"); // seed is passed to siphash as the key
+ hash_t outdigest;
if ( n <= UHASH_KEY_SIZE )
- return n == 0 ? 0 : h(x, n);
+ {
+ siphash(reinterpret_cast(&outdigest), reinterpret_cast(x), n, reinterpret_cast(&seed));
+ return outdigest;
+ }
unsigned char d[16];
MD5(reinterpret_cast(x), n, d);
@@ -111,11 +129,15 @@ Hasher::digest UHF::hash(const void* x, size_t n) const
return *reinterpret_cast(d);
}
-DefaultHasher::DefaultHasher(size_t k, size_t seed)
+DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
: Hasher(k, seed)
{
for ( size_t i = 1; i <= k; ++i )
- hash_functions.push_back(UHF(Seed() + bro_prng(i)));
+ {
+ seed_t s = Seed();
+ s.h1 += bro_prng(i);
+ hash_functions.push_back(UHF(s));
+ }
}
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
@@ -158,12 +180,16 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info)
hash_functions.clear();
- for ( size_t i = 0; i < K(); ++i )
+ for ( size_t i = 1; i <= K(); ++i ) // same 1..k range as the constructor, so seeds match after unserializing
- hash_functions.push_back(UHF(Seed() + bro_prng(i)));
+ {
+ Hasher::seed_t s = Seed();
+ s.h1 += bro_prng(i);
+ hash_functions.push_back(UHF(s));
+ }
return true;
}
-DoubleHasher::DoubleHasher(size_t k, size_t seed)
+DoubleHasher::DoubleHasher(size_t k, seed_t seed)
: Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2))
{
}
diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h
index 6ce13c6302..da83104e9d 100644
--- a/src/probabilistic/Hasher.h
+++ b/src/probabilistic/Hasher.h
@@ -4,7 +4,6 @@
#define PROBABILISTIC_HASHER_H
#include "Hash.h"
-#include "H3.h"
#include "SerialObj.h"
namespace probabilistic {
@@ -17,6 +16,15 @@ class Hasher : public SerialObj {
public:
typedef hash_t digest;
typedef std::vector digest_vector;
+ struct seed_t {
+ uint64_t h1;
+ uint64_t h2;
+
+ friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
+ lhs.h1 += rhs;
+ return lhs;
+ }
+ };
/**
* Creates a valid hasher seed from an arbitrary string.
@@ -30,7 +38,7 @@ public:
*
* @return A seed suitable for hashers.
*/
- static uint64 MakeSeed(const void* data, size_t size);
+ static seed_t MakeSeed(const void* data, size_t size);
/**
* Destructor.
@@ -89,7 +97,7 @@ public:
/**
* Returns the seed used to construct the hasher.
*/
- size_t Seed() const { return seed; }
+ seed_t Seed() const { return seed; }
bool Serialize(SerialInfo* info) const;
static Hasher* Unserialize(UnserialInfo* info);
@@ -106,11 +114,11 @@ protected:
*
* @param arg_seed The seed for the hasher.
*/
- Hasher(size_t arg_k, size_t arg_seed);
+ Hasher(size_t arg_k, seed_t arg_seed);
private:
size_t k;
- size_t seed;
+ seed_t seed;
};
/**
@@ -120,12 +128,17 @@ private:
class UHF {
public:
/**
- * Constructs an H3 hash function seeded with a given seed and an
+ * Default constructor with zero seed.
+ */
+ UHF();
+
+ /**
+ * Constructs a hash function seeded with a given seed and an
* optional extra seed to replace the initial Bro seed.
*
* @param arg_seed The seed to use for this instance.
*/
- UHF(size_t arg_seed = 0);
+ UHF(Hasher::seed_t arg_seed);
template
Hasher::digest operator()(const T& x) const
@@ -159,7 +172,8 @@ public:
friend bool operator==(const UHF& x, const UHF& y)
{
- return x.h == y.h;
+ return (x.seed.h1 == y.seed.h1) &&
+ (x.seed.h2 == y.seed.h2);
}
friend bool operator!=(const UHF& x, const UHF& y)
@@ -168,10 +182,9 @@ public:
}
private:
- static size_t compute_seed(size_t seed);
+ static size_t compute_seed(Hasher::seed_t seed);
- H3 h;
- size_t seed;
+ Hasher::seed_t seed;
};
@@ -188,7 +201,7 @@ public:
*
* @param seed The seed for the hasher.
*/
- DefaultHasher(size_t k, size_t seed);
+ DefaultHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final;
@@ -216,7 +229,7 @@ public:
*
* @param seed The seed for the hasher.
*/
- DoubleHasher(size_t k, size_t seed);
+ DoubleHasher(size_t k, Hasher::seed_t seed);
// Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final;
diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif
index 3e6b89fa4f..46ec4699a0 100644
--- a/src/probabilistic/bloom-filter.bif
+++ b/src/probabilistic/bloom-filter.bif
@@ -42,7 +42,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
size_t cells = BasicBloomFilter::M(fp, capacity);
size_t optimal_k = BasicBloomFilter::K(cells, capacity);
- size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+ Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DoubleHasher(optimal_k, seed);
@@ -66,7 +66,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count,
##
## Returns: A Bloom filter handle.
##
-## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
+## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add
## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed
function bloomfilter_basic_init2%(k: count, cells: count,
name: string &default=""%): opaque of bloomfilter
@@ -82,7 +82,7 @@ function bloomfilter_basic_init2%(k: count, cells: count,
return 0;
}
- size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+ Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DoubleHasher(k, seed);
@@ -121,7 +121,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count,
return 0;
}
- size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
+ Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0,
name->Len());
const Hasher* h = new DefaultHasher(k, seed);
diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output
index 788b1848d1..660f390153 100644
--- a/testing/btest/Baseline/bifs.bloomfilter/output
+++ b/testing/btest/Baseline/bifs.bloomfilter/output
@@ -13,7 +13,6 @@ error: false-positive rate must take value between 0 and 1
1
1
1, fp
-1, fp
1
1
1
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro
index f9dae7f7b5..c0ccc2a552 100644
--- a/testing/btest/bifs/bloomfilter.bro
+++ b/testing/btest/bifs/bloomfilter.bro
@@ -28,7 +28,7 @@ function test_basic_bloom_filter()
bloomfilter_add(bf_str, "bar");
print bloomfilter_lookup(bf_str, "foo");
print bloomfilter_lookup(bf_str, "bar");
- print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP
+ # print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP: this false positive no longer triggers after the hash function change
print bloomfilter_lookup(bf_str, "quuux"), "fp"; # FP
bloomfilter_add(bf_str, 0.5); # Type mismatch
bloomfilter_add(bf_str, 100); # Type mismatch