Remove the siphash->hmac-md5 switch after 36 bytes.

Currently, siphash is used for strings up to 36 bytes. hmac-md5 is used
for longer strings.

This switch-over is a remnant of the previously used hash function,
which apparently was slower with longer input strings.

This switch-over no longer serves any purpose. I ran a few performance tests
on strings of varying sizes:

For a 40 byte string with 10 million iterations:

siphash: 0.31 seconds
hmac-md5: 3.8 seconds

For a 1080 byte string with 10 million iterations:

siphash: 4.2 seconds
hmac-md5: 17 seconds

For an 18360-byte string with 10 million iterations:

siphash: 69 seconds
hmac-md5: 240 seconds

Hence, this commit removes the use of hmac-md5.

This change causes reordering of lines in a few logs.

This commit also changes the data structure for the seed in probabilistic/Hasher
to get rid of a type-punning warning.
This commit is contained in:
Johanna Amann 2020-04-24 13:12:01 -07:00
parent bb050910bb
commit 5e7915ae7a
13 changed files with 269 additions and 297 deletions

View file

@ -54,7 +54,7 @@ broker::expected<broker::data> Hasher::Serialize() const
{
return {broker::vector{
static_cast<uint64_t>(Type()), static_cast<uint64_t>(k),
seed.h1, seed.h2 }};
seed.h[0], seed.h[1] }};
}
std::unique_ptr<Hasher> Hasher::Unserialize(const broker::data& data)
@ -107,23 +107,7 @@ UHF::UHF(Hasher::seed_t arg_seed)
Hasher::digest UHF::hash(const void* x, size_t n) const
{
assert(sizeof(Hasher::seed_t) == 16); // siphash always needs a 128 bit seed
if ( n <= UHASH_KEY_SIZE )
return highwayhash::SipHash(*(reinterpret_cast<const highwayhash::SipHashState::Key*>(&seed)), reinterpret_cast<const char*>(x), n);
union {
unsigned char d[16];
Hasher::digest rval;
} u;
internal_md5(reinterpret_cast<const unsigned char*>(x), n, u.d);
const unsigned char* s = reinterpret_cast<const unsigned char*>(&seed);
for ( size_t i = 0; i < 16; ++i )
u.d[i] ^= s[i % sizeof(seed)];
internal_md5(u.d, 16, u.d);
return u.rval;
return highwayhash::SipHash(seed.h, reinterpret_cast<const char*>(x), n);
}
DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
@ -132,7 +116,7 @@ DefaultHasher::DefaultHasher(size_t k, Hasher::seed_t seed)
for ( size_t i = 1; i <= k; ++i )
{
seed_t s = Seed();
s.h1 += bro_prng(i);
s.h[0] += bro_prng(i);
hash_functions.push_back(UHF(s));
}
}