mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Change Hashing from H3 to Siphash.
This commit mostly changes the hash function that is used for Internal hashing of data < 36 bytes from H3 to Siphash. This change is motivated by the fact that it turns out that H3 apparently does not deliver a very good source of data uniqueness; running HLL with H3 as a hashing function results in quite poor results (up to of 75% off in my tests). In difference, running HLL with Siphash (or HMAC-MD5) changes this factor to ~2%. This also fixes a long-standing bug in Hash.h which truncated our hash values to 32 bit on most machines. Furthermore, it once again fixes a problem with the Rank function in HLL.
This commit is contained in:
parent
c15f48661d
commit
e1218cc7fa
10 changed files with 257 additions and 25 deletions
|
@ -348,6 +348,7 @@ set(bro_SRCS
|
|||
PacketDumper.cc
|
||||
strsep.c
|
||||
modp_numtoa.c
|
||||
siphash24.c
|
||||
|
||||
threading/BasicThread.cc
|
||||
threading/Formatter.cc
|
||||
|
|
17
src/Hash.cc
17
src/Hash.cc
|
@ -19,14 +19,15 @@
|
|||
|
||||
#include "Hash.h"
|
||||
|
||||
#include "H3.h"
|
||||
const H3<hash_t, UHASH_KEY_SIZE>* h3;
|
||||
extern "C" {
|
||||
extern int siphash( uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k );
|
||||
}
|
||||
|
||||
void init_hash_function()
|
||||
{
|
||||
// Make sure we have already called init_random_seed().
|
||||
ASSERT(hmac_key_set);
|
||||
h3 = new H3<hash_t, UHASH_KEY_SIZE>();
|
||||
assert(hmac_key_set);
|
||||
assert(siphash_key_set);
|
||||
}
|
||||
|
||||
HashKey::HashKey(bro_int_t i)
|
||||
|
@ -164,14 +165,16 @@ void* HashKey::CopyKey(const void* k, int s) const
|
|||
|
||||
hash_t HashKey::HashBytes(const void* bytes, int size)
|
||||
{
|
||||
assert(sizeof(hash_t) == 8);
|
||||
hash_t digest[2]; // 2x hash_t (uint64) = 128 bits = 32 hex chars = sizeof md5
|
||||
|
||||
if ( size <= UHASH_KEY_SIZE )
|
||||
{
|
||||
// H3 doesn't check if size is zero
|
||||
return ( size == 0 ) ? 0 : (*h3)(bytes, size);
|
||||
siphash((uint8_t*)digest, (const uint8_t*)bytes, size, shared_siphash_key);
|
||||
return digest[0];
|
||||
}
|
||||
|
||||
// Fall back to HMAC/MD5 for longer data (which is usually rare).
|
||||
hash_t digest[16];
|
||||
hmac_md5(size, (const unsigned char*) bytes, (unsigned char*) digest);
|
||||
return digest[0];
|
||||
}
|
||||
|
|
|
@ -81,7 +81,8 @@ protected:
|
|||
|
||||
void* key;
|
||||
int is_our_dynamic;
|
||||
int size, hash;
|
||||
int size;
|
||||
hash_t hash;
|
||||
};
|
||||
|
||||
extern void init_hash_function();
|
||||
|
|
|
@ -667,7 +667,7 @@ int main(int argc, char** argv)
|
|||
|
||||
case 'K':
|
||||
MD5((const u_char*) optarg, strlen(optarg), shared_hmac_md5_key);
|
||||
hmac_key_set = 1;
|
||||
hmac_key_set = true;
|
||||
break;
|
||||
|
||||
case 'N':
|
||||
|
|
|
@ -117,7 +117,9 @@ CardinalityCounter::~CardinalityCounter()
|
|||
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
|
||||
{
|
||||
hash_modified = hash_modified >> p;
|
||||
int answer = 64 - p - fls(hash_modified) + 1;
|
||||
int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
|
||||
assert(answer > 0 && answer < 64);
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
|
@ -238,3 +240,50 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
|
|||
|
||||
return c;
|
||||
}
|
||||
|
||||
/* The following function is copied from libc/string/flsll.c from the FreeBSD source
|
||||
* tree. Original copyright message follows
|
||||
*/
|
||||
/*-
|
||||
* Copyright (c) 1990, 1993
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Find Last Set bit
|
||||
*/
|
||||
int
|
||||
CardinalityCounter::flsll(uint64_t mask)
|
||||
{
|
||||
int bit;
|
||||
|
||||
if (mask == 0)
|
||||
return (0);
|
||||
for (bit = 1; mask != 1; bit++)
|
||||
mask = (uint64_t)mask >> 1;
|
||||
return (bit);
|
||||
}
|
||||
|
|
|
@ -165,6 +165,8 @@ private:
|
|||
*/
|
||||
uint8_t Rank(uint64_t hash_modified) const;
|
||||
|
||||
static int flsll(uint64_t mask);
|
||||
|
||||
/**
|
||||
* This is the number of buckets that will be stored. The standard
|
||||
* error is 1.04/sqrt(m), so the actual cardinality will be the
|
||||
|
|
165
src/siphash24.c
Normal file
165
src/siphash24.c
Normal file
|
@ -0,0 +1,165 @@
|
|||
/*
|
||||
SipHash reference C implementation
|
||||
|
||||
Copyright (c) 2012-2014 Jean-Philippe Aumasson
|
||||
<jeanphilippe.aumasson@gmail.com>
|
||||
Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to>
|
||||
|
||||
To the extent possible under law, the author(s) have dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
You should have received a copy of the CC0 Public Domain Dedication along
|
||||
with
|
||||
this software. If not, see
|
||||
<http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/* default: SipHash-2-4 */
|
||||
#define cROUNDS 2
|
||||
#define dROUNDS 4
|
||||
|
||||
#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
|
||||
|
||||
#define U32TO8_LE(p, v) \
|
||||
(p)[0] = (uint8_t)((v)); \
|
||||
(p)[1] = (uint8_t)((v) >> 8); \
|
||||
(p)[2] = (uint8_t)((v) >> 16); \
|
||||
(p)[3] = (uint8_t)((v) >> 24);
|
||||
|
||||
#define U64TO8_LE(p, v) \
|
||||
U32TO8_LE((p), (uint32_t)((v))); \
|
||||
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
|
||||
|
||||
#define U8TO64_LE(p) \
|
||||
(((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \
|
||||
((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \
|
||||
((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \
|
||||
((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
|
||||
|
||||
#define SIPROUND \
|
||||
do { \
|
||||
v0 += v1; \
|
||||
v1 = ROTL(v1, 13); \
|
||||
v1 ^= v0; \
|
||||
v0 = ROTL(v0, 32); \
|
||||
v2 += v3; \
|
||||
v3 = ROTL(v3, 16); \
|
||||
v3 ^= v2; \
|
||||
v0 += v3; \
|
||||
v3 = ROTL(v3, 21); \
|
||||
v3 ^= v0; \
|
||||
v2 += v1; \
|
||||
v1 = ROTL(v1, 17); \
|
||||
v1 ^= v2; \
|
||||
v2 = ROTL(v2, 32); \
|
||||
} while (0)
|
||||
|
||||
#ifdef SIPHASHDEBUG
|
||||
#define TRACE \
|
||||
do { \
|
||||
printf("(%3d) v0 %08x %08x\n", (int)inlen, (uint32_t)(v0 >> 32), \
|
||||
(uint32_t)v0); \
|
||||
printf("(%3d) v1 %08x %08x\n", (int)inlen, (uint32_t)(v1 >> 32), \
|
||||
(uint32_t)v1); \
|
||||
printf("(%3d) v2 %08x %08x\n", (int)inlen, (uint32_t)(v2 >> 32), \
|
||||
(uint32_t)v2); \
|
||||
printf("(%3d) v3 %08x %08x\n", (int)inlen, (uint32_t)(v3 >> 32), \
|
||||
(uint32_t)v3); \
|
||||
} while (0)
|
||||
#else
|
||||
#define TRACE
|
||||
#endif
|
||||
|
||||
int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k) {
|
||||
/* "somepseudorandomlygeneratedbytes" */
|
||||
uint64_t v0 = 0x736f6d6570736575ULL;
|
||||
uint64_t v1 = 0x646f72616e646f6dULL;
|
||||
uint64_t v2 = 0x6c7967656e657261ULL;
|
||||
uint64_t v3 = 0x7465646279746573ULL;
|
||||
uint64_t b;
|
||||
uint64_t k0 = U8TO64_LE(k);
|
||||
uint64_t k1 = U8TO64_LE(k + 8);
|
||||
uint64_t m;
|
||||
int i;
|
||||
const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
|
||||
const int left = inlen & 7;
|
||||
b = ((uint64_t)inlen) << 56;
|
||||
v3 ^= k1;
|
||||
v2 ^= k0;
|
||||
v1 ^= k1;
|
||||
v0 ^= k0;
|
||||
|
||||
#ifdef DOUBLE
|
||||
v1 ^= 0xee;
|
||||
#endif
|
||||
|
||||
for (; in != end; in += 8) {
|
||||
m = U8TO64_LE(in);
|
||||
v3 ^= m;
|
||||
|
||||
TRACE;
|
||||
for (i = 0; i < cROUNDS; ++i)
|
||||
SIPROUND;
|
||||
|
||||
v0 ^= m;
|
||||
}
|
||||
|
||||
switch (left) {
|
||||
case 7:
|
||||
b |= ((uint64_t)in[6]) << 48;
|
||||
case 6:
|
||||
b |= ((uint64_t)in[5]) << 40;
|
||||
case 5:
|
||||
b |= ((uint64_t)in[4]) << 32;
|
||||
case 4:
|
||||
b |= ((uint64_t)in[3]) << 24;
|
||||
case 3:
|
||||
b |= ((uint64_t)in[2]) << 16;
|
||||
case 2:
|
||||
b |= ((uint64_t)in[1]) << 8;
|
||||
case 1:
|
||||
b |= ((uint64_t)in[0]);
|
||||
break;
|
||||
case 0:
|
||||
break;
|
||||
}
|
||||
|
||||
v3 ^= b;
|
||||
|
||||
TRACE;
|
||||
for (i = 0; i < cROUNDS; ++i)
|
||||
SIPROUND;
|
||||
|
||||
v0 ^= b;
|
||||
|
||||
#ifndef DOUBLE
|
||||
v2 ^= 0xff;
|
||||
#else
|
||||
v2 ^= 0xee;
|
||||
#endif
|
||||
|
||||
TRACE;
|
||||
for (i = 0; i < dROUNDS; ++i)
|
||||
SIPROUND;
|
||||
|
||||
b = v0 ^ v1 ^ v2 ^ v3;
|
||||
U64TO8_LE(out, b);
|
||||
|
||||
#ifdef DOUBLE
|
||||
v1 ^= 0xdd;
|
||||
|
||||
TRACE;
|
||||
for (i = 0; i < dROUNDS; ++i)
|
||||
SIPROUND;
|
||||
|
||||
b = v0 ^ v1 ^ v2 ^ v3;
|
||||
U64TO8_LE(out + 8, b);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
32
src/util.cc
32
src/util.cc
|
@ -695,8 +695,10 @@ std::string strstrip(std::string s)
|
|||
return s;
|
||||
}
|
||||
|
||||
int hmac_key_set = 0;
|
||||
bool hmac_key_set = 0;
|
||||
uint8 shared_hmac_md5_key[16];
|
||||
bool siphash_key_set = false;
|
||||
uint8 shared_siphash_key[16];
|
||||
|
||||
void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16])
|
||||
{
|
||||
|
@ -791,7 +793,7 @@ void bro_srandom(unsigned int seed)
|
|||
|
||||
void init_random_seed(uint32 seed, const char* read_file, const char* write_file)
|
||||
{
|
||||
static const int bufsiz = 16;
|
||||
static const int bufsiz = 20;
|
||||
uint32 buf[bufsiz];
|
||||
memset(buf, 0, sizeof(buf));
|
||||
int pos = 0; // accumulates entropy
|
||||
|
@ -812,12 +814,13 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
|
|||
gettimeofday((struct timeval *)(buf + pos), 0);
|
||||
pos += sizeof(struct timeval) / sizeof(uint32);
|
||||
|
||||
// use urandom. For reasons see e.g. http://www.2uo.de/myths-about-urandom/
|
||||
#if defined(O_NONBLOCK)
|
||||
int fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
|
||||
int fd = open("/dev/urandom", O_RDONLY | O_NONBLOCK);
|
||||
#elif defined(O_NDELAY)
|
||||
int fd = open("/dev/random", O_RDONLY | O_NDELAY);
|
||||
int fd = open("/dev/urandom", O_RDONLY | O_NDELAY);
|
||||
#else
|
||||
int fd = open("/dev/random", O_RDONLY);
|
||||
int fd = open("/dev/urandom", O_RDONLY);
|
||||
#endif
|
||||
|
||||
if ( fd >= 0 )
|
||||
|
@ -835,12 +838,7 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
|
|||
}
|
||||
|
||||
if ( pos < bufsiz )
|
||||
{
|
||||
buf[pos++] = getpid();
|
||||
|
||||
if ( pos < bufsiz )
|
||||
buf[pos++] = getuid();
|
||||
}
|
||||
reporter->InternalError("Could not read enough random data from /dev/urandom. Wanted %d, got %d", bufsiz, pos);
|
||||
|
||||
if ( ! seed )
|
||||
{
|
||||
|
@ -864,8 +862,16 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
|
|||
|
||||
if ( ! hmac_key_set )
|
||||
{
|
||||
MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key);
|
||||
hmac_key_set = 1;
|
||||
assert(sizeof(buf)-16 == 64);
|
||||
MD5((const u_char*) buf, sizeof(buf)-16, shared_hmac_md5_key); // The last 128 bits of buf are for siphash
|
||||
hmac_key_set = true;
|
||||
}
|
||||
|
||||
if ( ! siphash_key_set )
|
||||
{
|
||||
assert(sizeof(buf)-64 == 16);
|
||||
memcpy(shared_siphash_key, buf+64, 16);
|
||||
siphash_key_set = true;
|
||||
}
|
||||
|
||||
if ( write_file && ! write_random_seeds(write_file, seed, buf, bufsiz) )
|
||||
|
|
|
@ -181,10 +181,11 @@ extern std::string strreplace(const std::string& s, const std::string& o, const
|
|||
// Remove all leading and trailing white space from string.
|
||||
extern std::string strstrip(std::string s);
|
||||
|
||||
extern bool hmac_key_set;
|
||||
extern uint8 shared_hmac_md5_key[16];
|
||||
extern bool siphash_key_set;
|
||||
extern uint8 shared_siphash_key[16];
|
||||
|
||||
extern int hmac_key_set;
|
||||
extern unsigned char shared_hmac_md5_key[16];
|
||||
extern void hmac_md5(size_t size, const unsigned char* bytes,
|
||||
unsigned char digest[16]);
|
||||
|
||||
|
|
|
@ -15,3 +15,7 @@
|
|||
3912865238
|
||||
3596260151
|
||||
517973768
|
||||
3606168384
|
||||
119014752
|
||||
1013039866
|
||||
2458585167
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue