Change Hashing from H3 to Siphash.

This commit mostly changes the hash function that is used for Internal
hashing of data < 36 bytes from H3 to Siphash. This change is motivated
by the fact that it turns out that H3 apparently does not deliver a very
good source of data uniqueness; running HLL with H3 as a hashing
function results in quite poor results (up to of 75% off in my tests).
In difference, running HLL with Siphash (or HMAC-MD5) changes this
factor to ~2%.

This also fixes a long-standing bug in Hash.h which truncated our hash
values to 32 bit on most machines.

Furthermore, it once again fixes a problem with the Rank function in
HLL.
This commit is contained in:
Johanna Amann 2016-07-13 06:35:32 -07:00
parent c15f48661d
commit e1218cc7fa
10 changed files with 257 additions and 25 deletions

165
src/siphash24.c Normal file
View file

@ -0,0 +1,165 @@
/*
SipHash reference C implementation
Copyright (c) 2012-2014 Jean-Philippe Aumasson
<jeanphilippe.aumasson@gmail.com>
Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to>
To the extent possible under law, the author(s) have dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
You should have received a copy of the CC0 Public Domain Dedication along
with
this software. If not, see
<http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* default: SipHash-2-4 */
#define cROUNDS 2
#define dROUNDS 4
#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
#define U32TO8_LE(p, v) \
(p)[0] = (uint8_t)((v)); \
(p)[1] = (uint8_t)((v) >> 8); \
(p)[2] = (uint8_t)((v) >> 16); \
(p)[3] = (uint8_t)((v) >> 24);
#define U64TO8_LE(p, v) \
U32TO8_LE((p), (uint32_t)((v))); \
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
#define U8TO64_LE(p) \
(((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \
((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \
((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \
((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
#define SIPROUND \
do { \
v0 += v1; \
v1 = ROTL(v1, 13); \
v1 ^= v0; \
v0 = ROTL(v0, 32); \
v2 += v3; \
v3 = ROTL(v3, 16); \
v3 ^= v2; \
v0 += v3; \
v3 = ROTL(v3, 21); \
v3 ^= v0; \
v2 += v1; \
v1 = ROTL(v1, 17); \
v1 ^= v2; \
v2 = ROTL(v2, 32); \
} while (0)
#ifdef SIPHASHDEBUG
#define TRACE \
do { \
printf("(%3d) v0 %08x %08x\n", (int)inlen, (uint32_t)(v0 >> 32), \
(uint32_t)v0); \
printf("(%3d) v1 %08x %08x\n", (int)inlen, (uint32_t)(v1 >> 32), \
(uint32_t)v1); \
printf("(%3d) v2 %08x %08x\n", (int)inlen, (uint32_t)(v2 >> 32), \
(uint32_t)v2); \
printf("(%3d) v3 %08x %08x\n", (int)inlen, (uint32_t)(v3 >> 32), \
(uint32_t)v3); \
} while (0)
#else
#define TRACE
#endif
int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k) {
/* "somepseudorandomlygeneratedbytes" */
uint64_t v0 = 0x736f6d6570736575ULL;
uint64_t v1 = 0x646f72616e646f6dULL;
uint64_t v2 = 0x6c7967656e657261ULL;
uint64_t v3 = 0x7465646279746573ULL;
uint64_t b;
uint64_t k0 = U8TO64_LE(k);
uint64_t k1 = U8TO64_LE(k + 8);
uint64_t m;
int i;
const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
const int left = inlen & 7;
b = ((uint64_t)inlen) << 56;
v3 ^= k1;
v2 ^= k0;
v1 ^= k1;
v0 ^= k0;
#ifdef DOUBLE
v1 ^= 0xee;
#endif
for (; in != end; in += 8) {
m = U8TO64_LE(in);
v3 ^= m;
TRACE;
for (i = 0; i < cROUNDS; ++i)
SIPROUND;
v0 ^= m;
}
switch (left) {
case 7:
b |= ((uint64_t)in[6]) << 48;
case 6:
b |= ((uint64_t)in[5]) << 40;
case 5:
b |= ((uint64_t)in[4]) << 32;
case 4:
b |= ((uint64_t)in[3]) << 24;
case 3:
b |= ((uint64_t)in[2]) << 16;
case 2:
b |= ((uint64_t)in[1]) << 8;
case 1:
b |= ((uint64_t)in[0]);
break;
case 0:
break;
}
v3 ^= b;
TRACE;
for (i = 0; i < cROUNDS; ++i)
SIPROUND;
v0 ^= b;
#ifndef DOUBLE
v2 ^= 0xff;
#else
v2 ^= 0xee;
#endif
TRACE;
for (i = 0; i < dROUNDS; ++i)
SIPROUND;
b = v0 ^ v1 ^ v2 ^ v3;
U64TO8_LE(out, b);
#ifdef DOUBLE
v1 ^= 0xdd;
TRACE;
for (i = 0; i < dROUNDS; ++i)
SIPROUND;
b = v0 ^ v1 ^ v2 ^ v3;
U64TO8_LE(out + 8, b);
#endif
return 0;
}