Start refactoring hashing.

This commit moves some of the hash data structures and code from
util.cc into Hash.cc - where it seems more appropriate.

It also starts to make more keyed hash functions available - still
using siphash as the default 64-bit keyed hash, but also making
128- and 256-bit highway hashes available.

There are already a few other functions that are declared but not
yet implemented - these will be "static" keyed hashes, which use
an installation-specific key. These will be used to, e.g., get
rid of md5 hashing for the generation of file UIDs.
This commit is contained in:
Johanna Amann 2020-04-24 17:11:16 -07:00
parent d34532f847
commit 360c06a3f8
7 changed files with 123 additions and 67 deletions

View file

@ -1,32 +1,58 @@
// See the file "COPYING" in the main distribution directory for copyright.
// The hash function works as follows:
//
// 1) For short data we have a number of universal hash functions:
// UHASH_CW (ax + b (mod p)), H3, Dietzfelbinger and UMAC_NH (UMAC_NH is
// not as strongly universal as the others, but probably enough). All
// these functions require a number of random bits linear in the data
// length. And we use them for data no longer than UHASH_KEY_SIZE.
// They are faster than HMAC/MD5 used for longer data, and most hash
// operations are on short data.
//
// 2) As a fall-back, we use HMAC/MD5 (keyed MD5) for data of arbitrary
// length. MD5 is used as a scrambling scheme so that it is difficult
// for the adversary to construct conflicts, though I do not know if
// HMAC/MD5 is provably universal.
#include "zeek-config.h"
#include "Hash.h"
#include "digest.h"
#include "Reporter.h"
#include "BroString.h"
#include "highwayhash/sip_hash.h"
#include "highwayhash/highwayhash_target.h"
#include "highwayhash/instruction_sets.h"
// we use the following lines to not pull in the highwayhash headers in Hash.h - but to check the types did not change underneath us.
static_assert(std::is_same<hash64_t, highwayhash::HHResult64>::value, "Highwayhash return values must match hash_x_t");
static_assert(std::is_same<hash128_t, highwayhash::HHResult128>::value, "Highwayhash return values must match hash_x_t");
static_assert(std::is_same<hash256_t, highwayhash::HHResult256>::value, "Highwayhash return values must match hash_x_t");
void KeyedHash::InitializeSeeds(const std::array<uint32_t, SEED_INIT_SIZE>& seed_data)
{
static_assert(std::is_same<decltype(KeyedHash::shared_siphash_key), highwayhash::SipHashState::Key>::value, "Highwayhash Key is not unsigned long long[2]");
static_assert(std::is_same<decltype(KeyedHash::shared_highwayhash_key), highwayhash::HHKey>::value, "Highwayhash HHKey is not uint64_t[4]");
if ( seeds_initialized )
return;
internal_md5((const u_char*) seed_data.data(), sizeof(seed_data) - 16, shared_hmac_md5_key); // The last 128 bits of buf are for siphash
// yes, we use the same buffer twice to initialize two different keys. This should not really be a
// security problem of any kind: hmac-md5 is not really used anymore - and even if it was, the hashes
// should not reveal any information about their initialization vector.
static_assert(sizeof(shared_highwayhash_key) == SHA256_DIGEST_LENGTH);
calculate_digest(Hash_SHA256, (const u_char*) seed_data.data(), sizeof(seed_data) - 16, reinterpret_cast<unsigned char*>(shared_highwayhash_key));
memcpy(shared_siphash_key, reinterpret_cast<const char*>(seed_data.data()) + 64, 16);
seeds_initialized = true;
}
// Returns the 64-bit SipHash of the `size` bytes starting at `bytes`,
// keyed with shared_siphash_key.
hash64_t KeyedHash::Hash64(const void* bytes, uint64_t size)
	{
	const auto* data = static_cast<const char*>(bytes);
	return highwayhash::SipHash(shared_siphash_key, data, size);
	}
// Computes the 128-bit HighwayHash of the `size` bytes starting at `bytes`,
// keyed with shared_highwayhash_key, and stores it in `*result`.
void KeyedHash::Hash128(const void* bytes, uint64_t size, hash128_t* result)
	{
	const auto* data = static_cast<const char*>(bytes);
	highwayhash::InstructionSets::Run<highwayhash::HighwayHash>(shared_highwayhash_key, data, size, result);
	}
// Computes the 256-bit HighwayHash of the `size` bytes starting at `bytes`,
// keyed with shared_highwayhash_key, and stores it in `*result`.
void KeyedHash::Hash256(const void* bytes, uint64_t size, hash256_t* result)
	{
	const auto* data = static_cast<const char*>(bytes);
	highwayhash::InstructionSets::Run<highwayhash::HighwayHash>(shared_highwayhash_key, data, size, result);
	}
// Checks that the keyed-hash seeds have been set up and raises an internal
// error through the reporter if KeyedHash does not report itself as
// initialized.
void init_hash_function()
	{
	// Make sure we have already called init_random_seed().
	// Only the KeyedHash check is kept: the stale guard on hmac_key_set /
	// siphash_key_set that preceded it nested the two conditions, so the
	// error could be suppressed by the old flags.
	if ( ! KeyedHash::IsInitialized() )
		reporter->InternalError("Zeek's hash functions aren't fully initialized");
	}
@ -156,6 +182,5 @@ void* HashKey::CopyKey(const void* k, int s) const
// Returns the keyed 64-bit hash of the `size` bytes starting at `bytes`.
// Delegates to KeyedHash::Hash64 instead of calling SipHash directly with
// the key; the earlier direct call returned first and left this delegation
// unreachable.
// Note: `size` is an int here but Hash64 takes a uint64_t, so a negative
// value would be converted to a very large length - callers must pass a
// non-negative size.
hash_t HashKey::HashBytes(const void* bytes, int size)
	{
	return KeyedHash::Hash64(bytes, size);
	}