From e1218cc7fa8a090a8f00f77bd0d3edc52fc34111 Mon Sep 17 00:00:00 2001 From: Johanna Amann Date: Wed, 13 Jul 2016 06:35:32 -0700 Subject: [PATCH] Change Hashing from H3 to Siphash. This commit mostly changes the hash function that is used for Internal hashing of data < 36 bytes from H3 to Siphash. This change is motivated by the fact that it turns out that H3 apparently does not deliver a very good source of data uniqueness; running HLL with H3 as a hashing function results in quite poor results (up to of 75% off in my tests). In difference, running HLL with Siphash (or HMAC-MD5) changes this factor to ~2%. This also fixes a long-standing bug in Hash.h which truncated our hash values to 32 bit on most machines. Furthermore, it once again fixes a problem with the Rank function in HLL. --- src/CMakeLists.txt | 1 + src/Hash.cc | 17 ++- src/Hash.h | 3 +- src/main.cc | 2 +- src/probabilistic/CardinalityCounter.cc | 51 +++++++- src/probabilistic/CardinalityCounter.h | 2 + src/siphash24.c | 165 ++++++++++++++++++++++++ src/util.cc | 32 +++-- src/util.h | 5 +- testing/btest/random.seed | 4 + 10 files changed, 257 insertions(+), 25 deletions(-) create mode 100644 src/siphash24.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b521125e4..55951a84a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -348,6 +348,7 @@ set(bro_SRCS PacketDumper.cc strsep.c modp_numtoa.c + siphash24.c threading/BasicThread.cc threading/Formatter.cc diff --git a/src/Hash.cc b/src/Hash.cc index d723601635..2186c6b1cd 100644 --- a/src/Hash.cc +++ b/src/Hash.cc @@ -19,14 +19,15 @@ #include "Hash.h" -#include "H3.h" -const H3* h3; +extern "C" { +extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k ); +} void init_hash_function() { // Make sure we have already called init_random_seed(). - ASSERT(hmac_key_set); - h3 = new H3(); + assert(hmac_key_set); + assert(siphash_key_set); } HashKey::HashKey(bro_int_t i) @@ -164,14 +165,16 @@ void* HashKey::CopyKey(const void* k, int s) const hash_t HashKey::HashBytes(const void* bytes, int size) { + assert(sizeof(hash_t) == 8); + hash_t digest[2]; // 2x hash_t (uint64) = 128 bits = 32 hex chars = sizeof md5 + if ( size <= UHASH_KEY_SIZE ) { - // H3 doesn't check if size is zero - return ( size == 0 ) ? 0 : (*h3)(bytes, size); + siphash((uint8_t*)digest, (const uint8_t*)bytes, size, shared_siphash_key); + return digest[0]; } // Fall back to HMAC/MD5 for longer data (which is usually rare). - hash_t digest[16]; hmac_md5(size, (const unsigned char*) bytes, (unsigned char*) digest); return digest[0]; } diff --git a/src/Hash.h b/src/Hash.h index 00db53d075..b8c998f461 100644 --- a/src/Hash.h +++ b/src/Hash.h @@ -81,7 +81,8 @@ protected: void* key; int is_our_dynamic; - int size, hash; + int size; + hash_t hash; }; extern void init_hash_function(); diff --git a/src/main.cc b/src/main.cc index abe57330d5..c75bf3ee1b 100644 --- a/src/main.cc +++ b/src/main.cc @@ -667,7 +667,7 @@ int main(int argc, char** argv) case 'K': MD5((const u_char*) optarg, strlen(optarg), shared_hmac_md5_key); - hmac_key_set = 1; + hmac_key_set = true; break; case 'N': diff --git a/src/probabilistic/CardinalityCounter.cc b/src/probabilistic/CardinalityCounter.cc index 8ce7156ea4..ea16711a21 100644 --- a/src/probabilistic/CardinalityCounter.cc +++ b/src/probabilistic/CardinalityCounter.cc @@ -117,7 +117,9 @@ CardinalityCounter::~CardinalityCounter() uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const { hash_modified = hash_modified >> p; - int answer = 64 - p - fls(hash_modified) + 1; + int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1; + assert(answer > 0 && answer < 64); + return answer; } @@ -238,3 +240,50 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info) return c; } + +/* The following function is copied from libc/string/flsll.c from the FreeBSD source + * tree. Original copyright message follows + */ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Find Last Set bit + */ +int +CardinalityCounter::flsll(uint64_t mask) +{ + int bit; + + if (mask == 0) + return (0); + for (bit = 1; mask != 1; bit++) + mask = (uint64_t)mask >> 1; + return (bit); +} diff --git a/src/probabilistic/CardinalityCounter.h b/src/probabilistic/CardinalityCounter.h index cac66eedda..2576c0276d 100644 --- a/src/probabilistic/CardinalityCounter.h +++ b/src/probabilistic/CardinalityCounter.h @@ -165,6 +165,8 @@ private: */ uint8_t Rank(uint64_t hash_modified) const; + static int flsll(uint64_t mask); + /** * This is the number of buckets that will be stored. The standard * error is 1.04/sqrt(m), so the actual cardinality will be the diff --git a/src/siphash24.c b/src/siphash24.c new file mode 100644 index 0000000000..060a558928 --- /dev/null +++ b/src/siphash24.c @@ -0,0 +1,165 @@ +/* + SipHash reference C implementation + + Copyright (c) 2012-2014 Jean-Philippe Aumasson + + Copyright (c) 2012-2014 Daniel J. Bernstein + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along + with + this software. If not, see + . + */ +#include +#include +#include + +/* default: SipHash-2-4 */ +#define cROUNDS 2 +#define dROUNDS 4 + +#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v)); \ + (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); \ + (p)[3] = (uint8_t)((v) >> 24); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v))); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U8TO64_LE(p) \ + (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \ + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) + +#define SIPROUND \ + do { \ + v0 += v1; \ + v1 = ROTL(v1, 13); \ + v1 ^= v0; \ + v0 = ROTL(v0, 32); \ + v2 += v3; \ + v3 = ROTL(v3, 16); \ + v3 ^= v2; \ + v0 += v3; \ + v3 = ROTL(v3, 21); \ + v3 ^= v0; \ + v2 += v1; \ + v1 = ROTL(v1, 17); \ + v1 ^= v2; \ + v2 = ROTL(v2, 32); \ + } while (0) + +#ifdef SIPHASHDEBUG +#define TRACE \ + do { \ + printf("(%3d) v0 %08x %08x\n", (int)inlen, (uint32_t)(v0 >> 32), \ + (uint32_t)v0); \ + printf("(%3d) v1 %08x %08x\n", (int)inlen, (uint32_t)(v1 >> 32), \ + (uint32_t)v1); \ + printf("(%3d) v2 %08x %08x\n", (int)inlen, (uint32_t)(v2 >> 32), \ + (uint32_t)v2); \ + printf("(%3d) v3 %08x %08x\n", (int)inlen, (uint32_t)(v3 >> 32), \ + (uint32_t)v3); \ + } while (0) +#else +#define TRACE +#endif + +int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k) { + /* "somepseudorandomlygeneratedbytes" */ + uint64_t v0 = 0x736f6d6570736575ULL; + uint64_t v1 = 0x646f72616e646f6dULL; + uint64_t v2 = 0x6c7967656e657261ULL; + uint64_t v3 = 0x7465646279746573ULL; + uint64_t b; + uint64_t k0 = U8TO64_LE(k); + uint64_t k1 = U8TO64_LE(k + 8); + uint64_t m; + int i; + const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t)); + const int left = inlen & 7; + b = ((uint64_t)inlen) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + +#ifdef DOUBLE + v1 ^= 0xee; +#endif + + for (; in != end; in += 8) { + m = U8TO64_LE(in); + v3 ^= m; + + TRACE; + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= m; + } + + switch (left) { + case 7: + b |= ((uint64_t)in[6]) << 48; + case 6: + b |= ((uint64_t)in[5]) << 40; + case 5: + b |= ((uint64_t)in[4]) << 32; + case 4: + b |= ((uint64_t)in[3]) << 24; + case 3: + b |= ((uint64_t)in[2]) << 16; + case 2: + b |= ((uint64_t)in[1]) << 8; + case 1: + b |= ((uint64_t)in[0]); + break; + case 0: + break; + } + + v3 ^= b; + + TRACE; + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= b; + +#ifndef DOUBLE + v2 ^= 0xff; +#else + v2 ^= 0xee; +#endif + + TRACE; + for (i = 0; i < dROUNDS; ++i) + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; + U64TO8_LE(out, b); + +#ifdef DOUBLE + v1 ^= 0xdd; + + TRACE; + for (i = 0; i < dROUNDS; ++i) + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; + U64TO8_LE(out + 8, b); +#endif + + return 0; +} + diff --git a/src/util.cc b/src/util.cc index e6015cc20a..489b10e112 100644 --- a/src/util.cc +++ b/src/util.cc @@ -695,8 +695,10 @@ std::string strstrip(std::string s) return s; } -int hmac_key_set = 0; +bool hmac_key_set = 0; uint8 shared_hmac_md5_key[16]; +bool siphash_key_set = false; +uint8 shared_siphash_key[16]; void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16]) { @@ -791,7 +793,7 @@ void bro_srandom(unsigned int seed) void init_random_seed(uint32 seed, const char* read_file, const char* write_file) { - static const int bufsiz = 16; + static const int bufsiz = 20; uint32 buf[bufsiz]; memset(buf, 0, sizeof(buf)); int pos = 0; // accumulates entropy @@ -812,12 +814,13 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file gettimeofday((struct timeval *)(buf + pos), 0); pos += sizeof(struct timeval) / sizeof(uint32); + // use urandom. For reasons see e.g. http://www.2uo.de/myths-about-urandom/ #if defined(O_NONBLOCK) - int fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + int fd = open("/dev/urandom", O_RDONLY | O_NONBLOCK); #elif defined(O_NDELAY) - int fd = open("/dev/random", O_RDONLY | O_NDELAY); + int fd = open("/dev/urandom", O_RDONLY | O_NDELAY); #else - int fd = open("/dev/random", O_RDONLY); + int fd = open("/dev/urandom", O_RDONLY); #endif if ( fd >= 0 ) @@ -835,12 +838,7 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file } if ( pos < bufsiz ) - { - buf[pos++] = getpid(); - - if ( pos < bufsiz ) - buf[pos++] = getuid(); - } + reporter->InternalError("Could not read enough random data from /dev/urandom. Wanted %d, got %d", bufsiz, pos); if ( ! seed ) { @@ -864,8 +862,16 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file if ( ! hmac_key_set ) { - MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key); - hmac_key_set = 1; + assert(sizeof(buf)-16 == 64); + MD5((const u_char*) buf, sizeof(buf)-16, shared_hmac_md5_key); // The last 128 bits of buf are for siphash + hmac_key_set = true; + } + + if ( ! siphash_key_set ) + { + assert(sizeof(buf)-64 == 16); + memcpy(shared_siphash_key, buf+64, 16); + siphash_key_set = true; } if ( write_file && ! write_random_seeds(write_file, seed, buf, bufsiz) ) diff --git a/src/util.h b/src/util.h index 70095fba8d..e6ba7f348b 100644 --- a/src/util.h +++ b/src/util.h @@ -181,10 +181,11 @@ extern std::string strreplace(const std::string& s, const std::string& o, const // Remove all leading and trailing white space from string. extern std::string strstrip(std::string s); +extern bool hmac_key_set; extern uint8 shared_hmac_md5_key[16]; +extern bool siphash_key_set; +extern uint8 shared_siphash_key[16]; -extern int hmac_key_set; -extern unsigned char shared_hmac_md5_key[16]; extern void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16]); diff --git a/testing/btest/random.seed b/testing/btest/random.seed index e70c8f85ef..560e74f4a1 100644 --- a/testing/btest/random.seed +++ b/testing/btest/random.seed @@ -15,3 +15,7 @@ 3912865238 3596260151 517973768 +3606168384 +119014752 +1013039866 +2458585167