Change Hashing from H3 to Siphash.

This commit mostly changes the hash function that is used for internal
hashing of data < 36 bytes from H3 to Siphash. This change is motivated
by the fact that H3 apparently does not deliver a very good source of
data uniqueness; running HLL with H3 as the hashing function gives
quite poor results (up to 75% off in my tests).
In contrast, running HLL with Siphash (or HMAC-MD5) reduces this
error to ~2%.

This also fixes a long-standing bug in Hash.h which truncated our hash
values to 32 bit on most machines.

Furthermore, it once again fixes a problem with the Rank function in
HLL.
This commit is contained in:
Johanna Amann 2016-07-13 06:35:32 -07:00
parent c15f48661d
commit e1218cc7fa
10 changed files with 257 additions and 25 deletions

View file

@ -348,6 +348,7 @@ set(bro_SRCS
PacketDumper.cc PacketDumper.cc
strsep.c strsep.c
modp_numtoa.c modp_numtoa.c
siphash24.c
threading/BasicThread.cc threading/BasicThread.cc
threading/Formatter.cc threading/Formatter.cc

View file

@ -19,14 +19,15 @@
#include "Hash.h" #include "Hash.h"
#include "H3.h" extern "C" {
const H3<hash_t, UHASH_KEY_SIZE>* h3; extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k );
}
void init_hash_function() void init_hash_function()
{ {
// Make sure we have already called init_random_seed(). // Make sure we have already called init_random_seed().
ASSERT(hmac_key_set); assert(hmac_key_set);
h3 = new H3<hash_t, UHASH_KEY_SIZE>(); assert(siphash_key_set);
} }
HashKey::HashKey(bro_int_t i) HashKey::HashKey(bro_int_t i)
@ -164,14 +165,16 @@ void* HashKey::CopyKey(const void* k, int s) const
hash_t HashKey::HashBytes(const void* bytes, int size) hash_t HashKey::HashBytes(const void* bytes, int size)
{ {
assert(sizeof(hash_t) == 8);
hash_t digest[2]; // 2x hash_t (uint64) = 128 bits = 32 hex chars = sizeof md5
if ( size <= UHASH_KEY_SIZE ) if ( size <= UHASH_KEY_SIZE )
{ {
// H3 doesn't check if size is zero siphash((uint8_t*)digest, (const uint8_t*)bytes, size, shared_siphash_key);
return ( size == 0 ) ? 0 : (*h3)(bytes, size); return digest[0];
} }
// Fall back to HMAC/MD5 for longer data (which is usually rare). // Fall back to HMAC/MD5 for longer data (which is usually rare).
hash_t digest[16];
hmac_md5(size, (const unsigned char*) bytes, (unsigned char*) digest); hmac_md5(size, (const unsigned char*) bytes, (unsigned char*) digest);
return digest[0]; return digest[0];
} }

View file

@ -81,7 +81,8 @@ protected:
void* key; void* key;
int is_our_dynamic; int is_our_dynamic;
int size, hash; int size;
hash_t hash;
}; };
extern void init_hash_function(); extern void init_hash_function();

View file

@ -667,7 +667,7 @@ int main(int argc, char** argv)
case 'K': case 'K':
MD5((const u_char*) optarg, strlen(optarg), shared_hmac_md5_key); MD5((const u_char*) optarg, strlen(optarg), shared_hmac_md5_key);
hmac_key_set = 1; hmac_key_set = true;
break; break;
case 'N': case 'N':

View file

@ -117,7 +117,9 @@ CardinalityCounter::~CardinalityCounter()
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
{ {
hash_modified = hash_modified >> p; hash_modified = hash_modified >> p;
int answer = 64 - p - fls(hash_modified) + 1; int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
assert(answer > 0 && answer < 64);
return answer; return answer;
} }
@ -238,3 +240,50 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
return c; return c;
} }
/* The following function is copied from libc/string/flsll.c from the FreeBSD source
* tree. Original copyright message follows
*/
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
 * Find Last Set bit.
 *
 * Returns the 1-based position of the most significant set bit in mask
 * (1 for the least significant bit, 64 for the most significant bit),
 * or 0 when mask has no bit set.
 */
int
CardinalityCounter::flsll(uint64_t mask)
	{
	int position = 0;

	// Each shift that is needed to empty the mask accounts for one bit
	// position at or below the highest set bit.
	while ( mask != 0 )
		{
		++position;
		mask >>= 1;
		}

	return position;
	}

View file

@ -165,6 +165,8 @@ private:
*/ */
uint8_t Rank(uint64_t hash_modified) const; uint8_t Rank(uint64_t hash_modified) const;
static int flsll(uint64_t mask);
/** /**
* This is the number of buckets that will be stored. The standard * This is the number of buckets that will be stored. The standard
* error is 1.04/sqrt(m), so the actual cardinality will be the * error is 1.04/sqrt(m), so the actual cardinality will be the

165
src/siphash24.c Normal file
View file

@ -0,0 +1,165 @@
/*
SipHash reference C implementation
Copyright (c) 2012-2014 Jean-Philippe Aumasson
<jeanphilippe.aumasson@gmail.com>
Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along
with
this software. If not, see
<http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* default: SipHash-2-4 */
#define cROUNDS 2
#define dROUNDS 4

/* Rotate the 64-bit word x left by b bits; every call site uses 0 < b < 64. */
static inline uint64_t sip_rotl64(uint64_t x, int b) {
    return (x << b) | (x >> (64 - b));
}

/* Assemble a 64-bit word from the 8 bytes at p, least significant byte first. */
static inline uint64_t sip_load64_le(const uint8_t *p) {
    uint64_t word = 0;
    int idx;

    for (idx = 7; idx >= 0; --idx)
        word = (word << 8) | (uint64_t)p[idx];
    return word;
}

/* Write word into the 8 bytes at p, least significant byte first. */
static inline void sip_store64_le(uint8_t *p, uint64_t word) {
    int idx;

    for (idx = 0; idx < 8; ++idx)
        p[idx] = (uint8_t)(word >> (8 * idx));
}

/* Apply one SipRound to the four state words s[0..3] (v0..v3). */
static inline void sip_round(uint64_t s[4]) {
    s[0] += s[1];
    s[1] = sip_rotl64(s[1], 13) ^ s[0];
    s[0] = sip_rotl64(s[0], 32);

    s[2] += s[3];
    s[3] = sip_rotl64(s[3], 16) ^ s[2];

    s[0] += s[3];
    s[3] = sip_rotl64(s[3], 21) ^ s[0];

    s[2] += s[1];
    s[1] = sip_rotl64(s[1], 17) ^ s[2];
    s[2] = sip_rotl64(s[2], 32);
}

/* Apply count consecutive SipRounds to the state. */
static inline void sip_rounds(uint64_t s[4], int count) {
    int n;

    for (n = 0; n < count; ++n)
        sip_round(s);
}

#ifdef SIPHASHDEBUG
/* Debug aid: print every state word as two 32-bit hex halves, tagged with
   the total input length. Only compiled in when SIPHASHDEBUG is defined. */
static void sip_trace(uint64_t inlen, const uint64_t s[4]) {
    printf("(%3d) v0 %08x %08x\n", (int)inlen, (uint32_t)(s[0] >> 32),
           (uint32_t)s[0]);
    printf("(%3d) v1 %08x %08x\n", (int)inlen, (uint32_t)(s[1] >> 32),
           (uint32_t)s[1]);
    printf("(%3d) v2 %08x %08x\n", (int)inlen, (uint32_t)(s[2] >> 32),
           (uint32_t)s[2]);
    printf("(%3d) v3 %08x %08x\n", (int)inlen, (uint32_t)(s[3] >> 32),
           (uint32_t)s[3]);
}
#define SIP_TRACE(len, state) sip_trace((len), (state))
#else
#define SIP_TRACE(len, state) ((void)0)
#endif

/*
 * Compute the keyed SipHash (cROUNDS compression rounds, dROUNDS
 * finalization rounds) of a byte string.
 *
 *   out   - receives the digest: 8 bytes, or 16 bytes when the file is
 *           compiled with DOUBLE defined.
 *   in    - the message bytes.
 *   inlen - number of message bytes.
 *   k     - the 16-byte key, read as two little-endian 64-bit words.
 *
 * Always returns 0.
 */
int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k) {
    const uint64_t key_lo = sip_load64_le(k);
    const uint64_t key_hi = sip_load64_le(k + 8);
    /* Bytes left over after the last complete 8-byte word. */
    const int tail_len = (int)(inlen & 7);
    const uint8_t *const tail = in + (inlen - (uint64_t)tail_len);
    uint64_t state[4];
    uint64_t word;
    int idx;

    /* The constants spell "somepseudorandomlygeneratedbytes". */
    state[0] = 0x736f6d6570736575ULL ^ key_lo;
    state[1] = 0x646f72616e646f6dULL ^ key_hi;
    state[2] = 0x6c7967656e657261ULL ^ key_lo;
    state[3] = 0x7465646279746573ULL ^ key_hi;

#ifdef DOUBLE
    state[1] ^= 0xee;
#endif

    /* Compression: absorb the message one complete 8-byte word at a time. */
    while (in != tail) {
        word = sip_load64_le(in);
        state[3] ^= word;

        SIP_TRACE(inlen, state);
        sip_rounds(state, cROUNDS);

        state[0] ^= word;
        in += 8;
    }

    /* Final word: the leftover bytes in little-endian order, with the
       message length (mod 256) in the most significant byte. */
    word = inlen << 56;
    for (idx = 0; idx < tail_len; ++idx)
        word |= (uint64_t)tail[idx] << (8 * idx);

    state[3] ^= word;

    SIP_TRACE(inlen, state);
    sip_rounds(state, cROUNDS);

    state[0] ^= word;

    /* Finalization. */
#ifdef DOUBLE
    state[2] ^= 0xee;
#else
    state[2] ^= 0xff;
#endif

    SIP_TRACE(inlen, state);
    sip_rounds(state, dROUNDS);

    sip_store64_le(out, state[0] ^ state[1] ^ state[2] ^ state[3]);

#ifdef DOUBLE
    /* Second 64-bit half of the 128-bit digest. */
    state[1] ^= 0xdd;

    SIP_TRACE(inlen, state);
    sip_rounds(state, dROUNDS);

    sip_store64_le(out + 8, state[0] ^ state[1] ^ state[2] ^ state[3]);
#endif

    return 0;
}

View file

@ -695,8 +695,10 @@ std::string strstrip(std::string s)
return s; return s;
} }
int hmac_key_set = 0; bool hmac_key_set = 0;
uint8 shared_hmac_md5_key[16]; uint8 shared_hmac_md5_key[16];
bool siphash_key_set = false;
uint8 shared_siphash_key[16];
void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16]) void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16])
{ {
@ -791,7 +793,7 @@ void bro_srandom(unsigned int seed)
void init_random_seed(uint32 seed, const char* read_file, const char* write_file) void init_random_seed(uint32 seed, const char* read_file, const char* write_file)
{ {
static const int bufsiz = 16; static const int bufsiz = 20;
uint32 buf[bufsiz]; uint32 buf[bufsiz];
memset(buf, 0, sizeof(buf)); memset(buf, 0, sizeof(buf));
int pos = 0; // accumulates entropy int pos = 0; // accumulates entropy
@ -812,12 +814,13 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
gettimeofday((struct timeval *)(buf + pos), 0); gettimeofday((struct timeval *)(buf + pos), 0);
pos += sizeof(struct timeval) / sizeof(uint32); pos += sizeof(struct timeval) / sizeof(uint32);
// use urandom. For reasons see e.g. http://www.2uo.de/myths-about-urandom/
#if defined(O_NONBLOCK) #if defined(O_NONBLOCK)
int fd = open("/dev/random", O_RDONLY | O_NONBLOCK); int fd = open("/dev/urandom", O_RDONLY | O_NONBLOCK);
#elif defined(O_NDELAY) #elif defined(O_NDELAY)
int fd = open("/dev/random", O_RDONLY | O_NDELAY); int fd = open("/dev/urandom", O_RDONLY | O_NDELAY);
#else #else
int fd = open("/dev/random", O_RDONLY); int fd = open("/dev/urandom", O_RDONLY);
#endif #endif
if ( fd >= 0 ) if ( fd >= 0 )
@ -835,12 +838,7 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
} }
if ( pos < bufsiz ) if ( pos < bufsiz )
{ reporter->InternalError("Could not read enough random data from /dev/urandom. Wanted %d, got %d", bufsiz, pos);
buf[pos++] = getpid();
if ( pos < bufsiz )
buf[pos++] = getuid();
}
if ( ! seed ) if ( ! seed )
{ {
@ -864,8 +862,16 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
if ( ! hmac_key_set ) if ( ! hmac_key_set )
{ {
MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key); assert(sizeof(buf)-16 == 64);
hmac_key_set = 1; MD5((const u_char*) buf, sizeof(buf)-16, shared_hmac_md5_key); // The last 128 bits of buf are for siphash
hmac_key_set = true;
}
if ( ! siphash_key_set )
{
assert(sizeof(buf)-64 == 16);
memcpy(shared_siphash_key, buf+64, 16);
siphash_key_set = true;
} }
if ( write_file && ! write_random_seeds(write_file, seed, buf, bufsiz) ) if ( write_file && ! write_random_seeds(write_file, seed, buf, bufsiz) )

View file

@ -181,10 +181,11 @@ extern std::string strreplace(const std::string& s, const std::string& o, const
// Remove all leading and trailing white space from string. // Remove all leading and trailing white space from string.
extern std::string strstrip(std::string s); extern std::string strstrip(std::string s);
extern bool hmac_key_set;
extern uint8 shared_hmac_md5_key[16]; extern uint8 shared_hmac_md5_key[16];
extern bool siphash_key_set;
extern uint8 shared_siphash_key[16];
extern int hmac_key_set;
extern unsigned char shared_hmac_md5_key[16];
extern void hmac_md5(size_t size, const unsigned char* bytes, extern void hmac_md5(size_t size, const unsigned char* bytes,
unsigned char digest[16]); unsigned char digest[16]);

View file

@ -15,3 +15,7 @@
3912865238 3912865238
3596260151 3596260151
517973768 517973768
3606168384
119014752
1013039866
2458585167