diff --git a/src/H3.h b/src/H3.h deleted file mode 100644 index 3b4b9ee539..0000000000 --- a/src/H3.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2004, 2005 -// The Regents of the University of California -// All Rights Reserved -// -// Permission to use, copy, modify and distribute any part of this -// h3.h file, without fee, and without a written agreement is hereby -// granted, provided that the above copyright notice, this paragraph -// and the following paragraphs appear in all copies. -// -// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY -// PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL -// DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. -// -// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE -// UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, -// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY -// OF CALIFORNIA MAKES NO REPRESENTATIONS AND EXTENDS NO WARRANTIES -// OF ANY KIND, EITHER IMPLIED OR EXPRESS, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A -// PARTICULAR PURPOSE, OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE -// ANY PATENT, TRADEMARK OR OTHER RIGHTS. -// -// The h3.h file is developed by the CoralReef development team at the -// University of California, San Diego under the Cooperative Association -// for Internet Data Analysis (CAIDA) Program. Support for this effort was -// provided by the CAIDA grant NCR-9711092, DARPA NGI Contract -// N66001-98-2-8922, DARPA NMS Grant N66001-01-1-8909, NSF Grant ANI-013710 -// and by CAIDA members. -// -// Report bugs and suggestions to coral-bugs@caida.org. 
- -// H3 hash function family -// C++ template implementation by Ken Keys (kkeys@caida.org) -// -// Usage: -// #include -// const H3 h; -// T hashval = h(data, size [, offset]); -// (T) is the type to be returned by the hash function; must be an integral -// type, e.g. uint32_t. -// (N) is the size of the data in bytes (if data is a struct, beware of -// padding). -// The hash function hashes the (size) bytes of the data pointed to by (data), -// starting at (offset). Note: offset affects the hash value, so -// h(data, size, offset) is not the same as h(data+offset, size, 0). -// Typically (size) is N and (offset) is 0, but other values can be used to -// hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed -// together to get the same result as hashing the full string. -// Any number of hash functions can be created by creating new instances of H3, -// with the same or different template parameters. The hash function -// constructor takes a seed as argument which defaults to a call to -// bro_random(). - - -#ifndef H3_H -#define H3_H - -#include -#include - -// The number of values representable by a byte. -#define H3_BYTE_RANGE (UCHAR_MAX+1) - -template -class H3 { -public: - H3() - { - Init(false, 0); - } - - H3(T seed) - { - Init(true, seed); - } - - void Init(bool have_seed, T seed) - { - T bit_lookup[N * CHAR_BIT]; - - for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) - { - bit_lookup[bit] = 0; - for ( size_t i = 0; i < sizeof(T)/2; i++ ) - { - seed = have_seed ? bro_prng(seed) : bro_random(); - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); - } - } - - for ( size_t byte = 0; byte < N; byte++ ) - { - for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) - { - byte_lookup[byte][val] = 0; - for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) - // Does this mean byte_lookup[*][0] == 0? 
-RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } - } - - T operator()(const void* data, size_t size, size_t offset = 0) const - { - const unsigned char *p = static_cast(data); - T result = 0; - - // loop optmized with Duff's Device - unsigned n = (size + 7) / 8; - switch ( size % 8 ) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; - case 6: result ^= byte_lookup[offset++][*p++]; - case 5: result ^= byte_lookup[offset++][*p++]; - case 4: result ^= byte_lookup[offset++][*p++]; - case 3: result ^= byte_lookup[offset++][*p++]; - case 2: result ^= byte_lookup[offset++][*p++]; - case 1: result ^= byte_lookup[offset++][*p++]; - } while ( --n > 0 ); - } - - return result; - } - - friend bool operator==(const H3& x, const H3& y) - { - return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE); - } - - friend bool operator!=(const H3& x, const H3& y) - { - return ! (x == y); - } - -private: - T byte_lookup[N][H3_BYTE_RANGE]; -}; - -#endif //H3_H diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index ea16711a21..d6d0de3657 100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -241,7 +241,8 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info) return c; } -/* The following function is copied from libc/string/flsll.c from the FreeBSD source +/** + * The following function is copied from libc/string/flsll.c from the FreeBSD source * tree. 
Original copyright message follows */ /*- diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index 2576c0276d..e8784c1607 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -155,9 +155,10 @@ private: int OptimalB(double error, double confidence) const; /** - * Determines at which index (counted from the back) the first one-bit + * Determines at which index (counted from the front) the first one-bit * appears. The last b bits have to be 0 (the element has to be divisible - * by m), hence they are ignored. + * by m), hence they are ignored. Always adds 1 to the result. This is the + * rho function from the original algorithm. * * @param hash_modified hash value * @@ -165,6 +166,9 @@ private: */ uint8_t Rank(uint64_t hash_modified) const; + /** + * Local copy of flsll from FreeBSD; not all platforms (notably Linux) provide it. + */ static int flsll(uint64_t mask); /** diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index 0f209bfb5b..725d90b893 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -8,15 +8,21 @@ #include "digest.h" #include "Serializer.h" +extern "C" { +extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k ); +} + using namespace probabilistic; -uint64 Hasher::MakeSeed(const void* data, size_t size) +Hasher::seed_t Hasher::MakeSeed(const void* data, size_t size) { u_char buf[SHA256_DIGEST_LENGTH]; - uint64 tmpseed; + seed_t tmpseed; SHA256_CTX ctx; sha256_init(&ctx); + assert(sizeof(tmpseed) == 16); + if ( data ) sha256_update(&ctx, data, size); @@ -56,7 +62,8 @@ bool Hasher::DoSerialize(SerialInfo* info) const if ( !
SERIALIZE(static_cast(k)) ) return false; - return SERIALIZE(static_cast(seed)); + return SERIALIZE(static_cast(seed.h1)) && /* h2 must be written too: DoUnserialize reads h1, then h2 */ + SERIALIZE(static_cast(seed.h2)); } bool Hasher::DoUnserialize(UnserialInfo* info) @@ -70,8 +77,10 @@ bool Hasher::DoUnserialize(UnserialInfo* info) k = serial_k; assert(k > 0); - uint64 serial_seed; - if ( ! UNSERIALIZE(&serial_seed) ) + seed_t serial_seed; + if ( ! UNSERIALIZE(&serial_seed.h1) ) + return false; + if ( ! UNSERIALIZE(&serial_seed.h2) ) return false; seed = serial_seed; @@ -79,14 +88,18 @@ bool Hasher::DoUnserialize(UnserialInfo* info) return true; } -Hasher::Hasher(size_t arg_k, size_t arg_seed) +Hasher::Hasher(size_t arg_k, seed_t arg_seed) { k = arg_k; seed = arg_seed; } -UHF::UHF(size_t arg_seed) - : h(arg_seed) +UHF::UHF() + { + memset(&seed, 0, sizeof(seed)); + } + +UHF::UHF(Hasher::seed_t arg_seed) { seed = arg_seed; } @@ -96,8 +109,13 @@ UHF::UHF(size_t arg_seed) // times. Hasher::digest UHF::hash(const void* x, size_t n) const { + assert(sizeof(Hasher::seed_t) == 16); + hash_t outdigest; if ( n <= UHASH_KEY_SIZE ) - return n == 0 ?
0 : h(x, n); + { + siphash(reinterpret_cast(&outdigest), reinterpret_cast(x), n, reinterpret_cast(&seed)); + return outdigest; + } unsigned char d[16]; MD5(reinterpret_cast(x), n, d); @@ -111,11 +129,15 @@ Hasher::digest UHF::hash(const void* x, size_t n) const return *reinterpret_cast(d); } -DefaultHasher::DefaultHasher(size_t k, size_t seed) +DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed) : Hasher(k, seed) { for ( size_t i = 1; i <= k; ++i ) - hash_functions.push_back(UHF(Seed() + bro_prng(i))); + { + seed_t s = Seed(); + s.h1 += bro_prng(i); + hash_functions.push_back(UHF(s)); + } } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -158,12 +180,16 @@ bool DefaultHasher::DoUnserialize(UnserialInfo* info) hash_functions.clear(); - for ( size_t i = 0; i < K(); ++i ) - hash_functions.push_back(UHF(Seed() + bro_prng(i))); + for ( size_t i = 1; i <= K(); ++i ) /* seed indices 1..k, matching the DefaultHasher constructor */ + { + Hasher::seed_t s = Seed(); + s.h1 += bro_prng(i); + hash_functions.push_back(UHF(s)); + } return true; } -DoubleHasher::DoubleHasher(size_t k, size_t seed) +DoubleHasher::DoubleHasher(size_t k, seed_t seed) : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 6ce13c6302..da83104e9d 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -4,7 +4,6 @@ #define PROBABILISTIC_HASHER_H #include "Hash.h" -#include "H3.h" #include "SerialObj.h" namespace probabilistic { @@ -17,6 +16,15 @@ class Hasher : public SerialObj { public: typedef hash_t digest; typedef std::vector digest_vector; + struct seed_t { + uint64_t h1; + uint64_t h2; + + friend seed_t operator+(seed_t lhs, const uint64_t rhs) { + lhs.h1 += rhs; + return lhs; + } + }; /** * Creates a valid hasher seed from an arbitrary string. @@ -30,7 +38,7 @@ public: * * @return A seed suitable for hashers. */ - static uint64 MakeSeed(const void* data, size_t size); + static seed_t MakeSeed(const void* data, size_t size); /** * Destructor.
@@ -89,7 +97,7 @@ public: /** * Returns the seed used to construct the hasher. */ - size_t Seed() const { return seed; } + seed_t Seed() const { return seed; } bool Serialize(SerialInfo* info) const; static Hasher* Unserialize(UnserialInfo* info); @@ -106,11 +114,11 @@ protected: * * @param arg_seed The seed for the hasher. */ - Hasher(size_t arg_k, size_t arg_seed); + Hasher(size_t arg_k, seed_t arg_seed); private: size_t k; - size_t seed; + seed_t seed; }; /** @@ -120,12 +128,17 @@ private: class UHF { public: /** - * Constructs an H3 hash function seeded with a given seed and an + * Default constructor with zero seed. + */ + UHF(); + + /** + * Constructs a hash function seeded with a given seed and an * optional extra seed to replace the initial Bro seed. * * @param arg_seed The seed to use for this instance. */ - UHF(size_t arg_seed = 0); + UHF(Hasher::seed_t arg_seed); template Hasher::digest operator()(const T& x) const @@ -159,7 +172,8 @@ public: friend bool operator==(const UHF& x, const UHF& y) { - return x.h == y.h; + return (x.seed.h1 == y.seed.h1) && + (x.seed.h2 == y.seed.h2); } friend bool operator!=(const UHF& x, const UHF& y) @@ -168,10 +182,9 @@ public: } private: - static size_t compute_seed(size_t seed); + static size_t compute_seed(Hasher::seed_t seed); - H3 h; - size_t seed; + Hasher::seed_t seed; }; @@ -188,7 +201,7 @@ public: * * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, size_t seed); + DefaultHasher(size_t k, Hasher::seed_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const final; @@ -216,7 +229,7 @@ public: * * @param seed The seed for the hasher. */ - DoubleHasher(size_t k, size_t seed); + DoubleHasher(size_t k, Hasher::seed_t seed); // Overridden from Hasher.
virtual digest_vector Hash(const void* x, size_t n) const final; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 3e6b89fa4f..46ec4699a0 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -42,7 +42,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, name->Len()); const Hasher* h = new DoubleHasher(optimal_k, seed); @@ -66,7 +66,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add +## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add ## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_basic_init2%(k: count, cells: count, name: string &default=""%): opaque of bloomfilter @@ -82,7 +82,7 @@ function bloomfilter_basic_init2%(k: count, cells: count, return 0; } - size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, name->Len()); const Hasher* h = new DoubleHasher(k, seed); @@ -121,7 +121,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + Hasher::seed_t seed = Hasher::MakeSeed(name->Len() > 0 ? 
name->Bytes() : 0, name->Len()); const Hasher* h = new DefaultHasher(k, seed); diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 788b1848d1..660f390153 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -13,7 +13,6 @@ error: false-positive rate must take value between 0 and 1 1 1 1, fp -1, fp 1 1 1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index f9dae7f7b5..c0ccc2a552 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -28,7 +28,7 @@ function test_basic_bloom_filter() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP + # print bloomfilter_lookup(bf_str, "bazzz"), "fp"; # FP; this false positive no longer triggers after the hash function change print bloomfilter_lookup(bf_str, "quuux"), "fp"; # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch