Start refactoring hashing.

This commit moves some of the hash data structures and code from
util.cc into Hash.cc - where they seem more appropriate.

It also starts to make more keyed hash functions available - still
using siphash as the default 64-bit keyed hash, but also making
128-bit and 256-bit highway hashes available.

There are already a few other functions that are declared but not
yet implemented - these will be "static" keyed hashes, which use
an installation-specific key. These will be used to, e.g., get
rid of md5 hashing for the generation of file UIDs.
This commit is contained in:
Johanna Amann 2020-04-24 17:11:16 -07:00
parent d34532f847
commit 360c06a3f8
7 changed files with 123 additions and 67 deletions

View file

@ -1,32 +1,58 @@
// See the file "COPYING" in the main distribution directory for copyright.
// The hash function works as follows:
//
// 1) For short data we have a number of universal hash functions:
// UHASH_CW (ax + b (mod p)), H3, Dietzfelbinger and UMAC_NH (UMAC_NH is
// not as strongly universal as the others, but probably enough). All
// these functions require number of random bits linear to the data
// length. And we use them for data no longer than UHASH_KEY_SIZE.
// They are faster than HMAC/MD5 used for longer data, and most hash
// operations are on short data.
//
// 2) As a fall-back, we use HMAC/MD5 (keyed MD5) for data of arbitrary
// length. MD5 is used as a scrambling scheme so that it is difficult
// for the adversary to construct conflicts, though I do not know if
// HMAC/MD5 is provably universal.
#include "zeek-config.h"
#include "Hash.h"
#include "digest.h"
#include "Reporter.h"
#include "BroString.h"
#include "highwayhash/sip_hash.h"
#include "highwayhash/highwayhash_target.h"
#include "highwayhash/instruction_sets.h"
// we use the following lines to not pull in the highwayhash headers in Hash.h - but to check the types did not change underneath us.
static_assert(std::is_same<hash64_t, highwayhash::HHResult64>::value, "Highwayhash return values must match hash_x_t");
static_assert(std::is_same<hash128_t, highwayhash::HHResult128>::value, "Highwayhash return values must match hash_x_t");
static_assert(std::is_same<hash256_t, highwayhash::HHResult256>::value, "Highwayhash return values must match hash_x_t");
void KeyedHash::InitializeSeeds(const std::array<uint32_t, SEED_INIT_SIZE>& seed_data)
{
static_assert(std::is_same<decltype(KeyedHash::shared_siphash_key), highwayhash::SipHashState::Key>::value, "Highwayhash Key is not unsigned long long[2]");
static_assert(std::is_same<decltype(KeyedHash::shared_highwayhash_key), highwayhash::HHKey>::value, "Highwayhash HHKey is not uint64_t[4]");
if ( seeds_initialized )
return;
internal_md5((const u_char*) seed_data.data(), sizeof(seed_data) - 16, shared_hmac_md5_key); // The last 128 bits of buf are for siphash
// yes, we use the same buffer twice to initialize two different keys. This should not really be a
// security problem of any kind: hmac-md5 is not really used anymore - and even if it was, the hashes
// should not reveal any information about their initialization vector.
static_assert(sizeof(shared_highwayhash_key) == SHA256_DIGEST_LENGTH);
calculate_digest(Hash_SHA256, (const u_char*) seed_data.data(), sizeof(seed_data) - 16, reinterpret_cast<unsigned char*>(shared_highwayhash_key));
memcpy(shared_siphash_key, reinterpret_cast<const char*>(seed_data.data()) + 64, 16);
seeds_initialized = true;
}
/**
 * Computes a 64-bit keyed hash over a byte range using SipHash and the
 * shared siphash key.
 *
 * @param bytes Start of the data to hash.
 * @param size  Number of bytes to hash.
 * @return The 64-bit SipHash value.
 */
hash64_t KeyedHash::Hash64(const void* bytes, uint64_t size)
	{
	const char* const data = reinterpret_cast<const char*>(bytes);
	const hash64_t digest = highwayhash::SipHash(shared_siphash_key, data, size);
	return digest;
	}
void KeyedHash::Hash128(const void* bytes, uint64_t size, hash128_t* result)
{
highwayhash::InstructionSets::Run<highwayhash::HighwayHash>(shared_highwayhash_key, reinterpret_cast<const char *>(bytes), size, result);
}
void KeyedHash::Hash256(const void* bytes, uint64_t size, hash256_t* result)
{
highwayhash::InstructionSets::Run<highwayhash::HighwayHash>(shared_highwayhash_key, reinterpret_cast<const char *>(bytes), size, result);
}
/**
 * Sanity check that the keyed-hash seeds are in place before hashing is used.
 * Reports an internal error if KeyedHash has not been initialized yet.
 */
void init_hash_function()
	{
	// Make sure we have already called init_random_seed().
	// The former hmac_key_set / siphash_key_set globals are gone; KeyedHash
	// now tracks initialization of all keys with a single flag.
	if ( ! KeyedHash::IsInitialized() )
		reporter->InternalError("Zeek's hash functions aren't fully initialized");
	}
@ -156,6 +182,5 @@ void* HashKey::CopyKey(const void* k, int s) const
/**
 * Hashes a raw byte range with the default 64-bit keyed hash.
 *
 * @param bytes Start of the data to hash.
 * @param size  Number of bytes to hash.
 * @return The 64-bit keyed hash of the data.
 */
hash_t HashKey::HashBytes(const void* bytes, int size)
	{
	// Delegate to KeyedHash rather than calling SipHash with a global key
	// directly; the key material now lives inside KeyedHash.
	return KeyedHash::Hash64(bytes, size);
	}

View file

@ -1,5 +1,22 @@
// See the file "COPYING" in the main distribution directory for copyright.
/***
* This file contains functions to generate hashes using keyed hash functions.
* Keyed hash functions make it difficult/impossible to find information about the
* output of a hash when the key is unknown to the attacker. This fact holds, even
* when the input value is known.
*
* We use these kinds of hashes heavily internally - e.g. for scriptland hash generation.
* It is important that these hashes are not easily guessable to prevent complexity attacks.
*
* The HashKey class is the actual class that is used to generate Hash keys that are used internally,
* e.g. for lookups in hash-tables; the Hashes are also used for connection ID generation.
*
* This means that the hashes created by most functions in this file will be different each run, unless
* a seed file is used. There are a few functions that create hashes that are static over runs
* and use an installation-wide seed value; these are specifically called out.
*/
#pragma once
#include "util.h" // for bro_int_t
@ -8,7 +25,43 @@
class BroString;
// to allow bro_md5_hmac access to the hmac seed
#include "ZeekArgs.h"
class Val;
class Frame;
namespace BifFunc {
extern Val* bro_md5_hmac(Frame* frame, const zeek::Args*);
}
// Result types of the hash functions. The 64/128/256-bit variants are plain
// integers / integer arrays so that this header does not need to pull in the
// highwayhash headers.
using hash_t = uint64_t;
using hash64_t = uint64_t;
using hash128_t = uint64_t[2];
using hash256_t = uint64_t[4];
/**
 * Collection of static keyed hash functions together with the key material
 * they share. All members are static; the class is never instantiated in the
 * code shown here.
 */
class KeyedHash {
public:
	/** Number of 32-bit words of seed material expected by InitializeSeeds(). */
	static constexpr int SEED_INIT_SIZE = 20;

	/**
	 * Sets up the shared keys from the given seed material.
	 *
	 * @param seed_data SEED_INIT_SIZE 32-bit words of seed material.
	 */
	static void InitializeSeeds(const std::array<uint32_t, SEED_INIT_SIZE>& seed_data);

	/** @return true once the seeds have been initialized. */
	static bool IsInitialized() { return seeds_initialized; }

	/**
	 * Keyed hashes of a byte range in 64, 128 and 256 bit widths. The 64-bit
	 * variant returns its result; the wider variants write it to *result*.
	 */
	static hash64_t Hash64(const void* bytes, uint64_t size);
	static void Hash128(const void* bytes, uint64_t size, hash128_t* result);
	static void Hash256(const void* bytes, uint64_t size, hash256_t* result);

	/**
	 * "Static" counterparts of the functions above.
	 * NOTE(review): only the declarations are visible here; presumably these
	 * are the hashes that stay stable across runs via an installation-wide
	 * seed - confirm once they are implemented.
	 */
	static hash64_t StaticHash64(const void* bytes, uint64_t size);
	static void StaticHash128(const void* bytes, uint64_t size, hash128_t* result);
	static void StaticHash256(const void* bytes, uint64_t size, hash256_t* result);

private:
	// Key for the 128/256-bit hashes. Actually a highwayhash HHKey; spelled
	// out so the highwayhash headers are not needed in this header.
	alignas(32) static inline uint64_t shared_highwayhash_key[4];

	// Key for the 64-bit hash. Actually HH_U64[2], which has the same type.
	alignas(16) static inline unsigned long long shared_siphash_key[2];

	// Key used by the HMAC-MD5 helpers that are befriended below.
	static inline uint8_t shared_hmac_md5_key[16];

	// Set once the keys above have been initialized.
	static inline bool seeds_initialized = false;

	// These two need direct access to the HMAC-MD5 key.
	friend void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16]);
	friend Val* BifFunc::bro_md5_hmac(Frame* frame, const zeek::Args*);
};
typedef enum {
HASH_KEY_INT,

View file

@ -106,7 +106,7 @@ UHF::UHF(Hasher::seed_t arg_seed)
// times.
/**
 * Hashes a byte range with SipHash, keyed by this hasher's seed.
 *
 * @param x Start of the data to hash.
 * @param n Number of bytes to hash.
 * @return The resulting digest.
 */
Hasher::digest UHF::hash(const void* x, size_t n) const
	{
	// Both invariants are known at compile time, so check them there instead
	// of with a runtime assert that disappears in NDEBUG builds.
	static_assert(sizeof(Hasher::seed_t) == 16, "siphash always needs a 128 bit seed");
	static_assert(std::is_same<highwayhash::SipHashState::Key, decltype(seed.h)>::value, "Seed value is not the same type as highwayhash key");

	return highwayhash::SipHash(seed.h, reinterpret_cast<const char*>(x), n);
	}

View file

@ -3,6 +3,7 @@
#pragma once
#include "Hash.h"
#include "highwayhash/sip_hash.h"
#include <broker/expected.hh>
@ -24,7 +25,8 @@ public:
typedef hash_t digest;
typedef std::vector<digest> digest_vector;
struct seed_t {
alignas(16) highwayhash::HH_U64 h[2];
// actually HH_U64, which has the same type
alignas(16) unsigned long long h[2];
friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
lhs.h[0] += rhs;

View file

@ -55,6 +55,7 @@
#include "iosource/Manager.h"
#include "iosource/PktSrc.h"
#include "ConvertUTF.h"
#include "Hash.h"
#include "3rdparty/doctest.h"
@ -997,27 +998,21 @@ std::string strstrip(std::string s)
return s;
}
bool hmac_key_set = false;
uint8_t shared_hmac_md5_key[16];
bool siphash_key_set = false;
alignas(16) highwayhash::HH_U64 shared_siphash_key[2];
/**
 * Computes a keyed MD5 digest of the given data.
 *
 * The data is hashed with MD5, the result is XORed byte-wise with the shared
 * HMAC-MD5 key held by KeyedHash, and the outcome is hashed with MD5 again.
 * Reports an internal error if the key has not been initialized yet.
 *
 * @param size   Number of bytes to hash.
 * @param bytes  Start of the data to hash.
 * @param digest Output buffer receiving the 16-byte digest.
 */
void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16])
	{
	// The key now lives in KeyedHash (this function is a friend of it); the
	// former hmac_key_set / shared_hmac_md5_key globals no longer exist.
	if ( ! KeyedHash::seeds_initialized )
		reporter->InternalError("HMAC-MD5 invoked before the HMAC key is set");

	internal_md5(bytes, size, digest);

	for ( int i = 0; i < 16; ++i )
		digest[i] ^= KeyedHash::shared_hmac_md5_key[i];

	// Second MD5 pass over the keyed intermediate digest, in place.
	internal_md5(digest, 16, digest);
	}
static bool read_random_seeds(const char* read_file, uint32_t* seed,
uint32_t* buf, int bufsiz)
std::array<uint32_t, KeyedHash::SEED_INIT_SIZE>& buf)
{
FILE* f = nullptr;
@ -1035,8 +1030,8 @@ static bool read_random_seeds(const char* read_file, uint32_t* seed,
return false;
}
// Read seeds for MD5.
for ( int i = 0; i < bufsiz; ++i )
// Read seeds for hmac-md5/siphash/highwayhash.
for ( int i = 0; i < KeyedHash::SEED_INIT_SIZE; ++i )
{
int tmp;
if ( fscanf(f, "%u", &tmp) != 1 )
@ -1053,7 +1048,7 @@ static bool read_random_seeds(const char* read_file, uint32_t* seed,
}
static bool write_random_seeds(const char* write_file, uint32_t seed,
uint32_t* buf, int bufsiz)
std::array<uint32_t, KeyedHash::SEED_INIT_SIZE>& buf)
{
FILE* f = nullptr;
@ -1066,7 +1061,7 @@ static bool write_random_seeds(const char* write_file, uint32_t seed,
fprintf(f, "%u\n", seed);
for ( int i = 0; i < bufsiz; ++i )
for ( int i = 0; i < KeyedHash::SEED_INIT_SIZE; ++i )
fprintf(f, "%u\n", buf[i]);
fclose(f);
@ -1096,16 +1091,14 @@ void bro_srandom(unsigned int seed)
void init_random_seed(const char* read_file, const char* write_file)
{
static const int bufsiz = 20;
uint32_t buf[bufsiz];
memset(buf, 0, sizeof(buf));
int pos = 0; // accumulates entropy
std::array<uint32_t, KeyedHash::SEED_INIT_SIZE> buf = {};
size_t pos = 0; // accumulates entropy
bool seeds_done = false;
uint32_t seed = 0;
if ( read_file )
{
if ( ! read_random_seeds(read_file, &seed, buf, bufsiz) )
if ( ! read_random_seeds(read_file, &seed, buf) )
reporter->FatalError("Could not load seeds from file '%s'.\n",
read_file);
else
@ -1115,7 +1108,7 @@ void init_random_seed(const char* read_file, const char* write_file)
#ifdef HAVE_GETRANDOM
if ( ! seeds_done )
{
ssize_t nbytes = getrandom(buf, sizeof(buf), 0);
ssize_t nbytes = getrandom(buf.data(), sizeof(buf), 0);
seeds_done = nbytes == ssize_t(sizeof(buf));
}
#endif
@ -1123,7 +1116,7 @@ void init_random_seed(const char* read_file, const char* write_file)
if ( ! seeds_done )
{
// Gather up some entropy.
gettimeofday((struct timeval *)(buf + pos), 0);
gettimeofday((struct timeval *)(buf.data() + pos), 0);
pos += sizeof(struct timeval) / sizeof(uint32_t);
// use urandom. For reasons see e.g. http://www.2uo.de/myths-about-urandom/
@ -1137,8 +1130,8 @@ void init_random_seed(const char* read_file, const char* write_file)
if ( fd >= 0 )
{
int amt = read(fd, buf + pos,
sizeof(uint32_t) * (bufsiz - pos));
int amt = read(fd, buf.data() + pos,
sizeof(uint32_t) * (KeyedHash::SEED_INIT_SIZE - pos));
safe_close(fd);
if ( amt > 0 )
@ -1149,12 +1142,12 @@ void init_random_seed(const char* read_file, const char* write_file)
errno = 0;
}
if ( pos < bufsiz )
reporter->FatalError("Could not read enough random data from /dev/urandom. Wanted %d, got %d", bufsiz, pos);
if ( pos < KeyedHash::SEED_INIT_SIZE )
reporter->FatalError("Could not read enough random data from /dev/urandom. Wanted %d, got %lu", KeyedHash::SEED_INIT_SIZE, pos);
if ( ! seed )
{
for ( int i = 0; i < pos; ++i )
for ( size_t i = 0; i < pos; ++i )
{
seed ^= buf[i];
seed = (seed << 1) | (seed >> 31);
@ -1172,22 +1165,10 @@ void init_random_seed(const char* read_file, const char* write_file)
first_seed_saved = true;
}
if ( ! hmac_key_set )
{
assert(sizeof(buf) - 16 == 64);
internal_md5((const u_char*) buf, sizeof(buf) - 16, shared_hmac_md5_key); // The last 128 bits of buf are for siphash
hmac_key_set = true;
}
if ( ! KeyedHash::IsInitialized() )
KeyedHash::InitializeSeeds(buf);
if ( ! siphash_key_set )
{
assert(sizeof(buf) - 64 == 16); // siphash key length is always 128 bytes, independent of implementation
assert(sizeof(shared_siphash_key) == 16);
memcpy(shared_siphash_key, reinterpret_cast<const char*>(buf) + 64, 16);
siphash_key_set = true;
}
if ( write_file && ! write_random_seeds(write_file, seed, buf, bufsiz) )
if ( write_file && ! write_random_seeds(write_file, seed, buf) )
reporter->Error("Could not write seeds to file '%s'.\n",
write_file);
}

View file

@ -25,7 +25,6 @@
#include <stdarg.h>
#include <libgen.h>
#include <memory> // std::unique_ptr
#include "highwayhash/sip_hash.h"
#include "zeek-config.h"
@ -200,11 +199,6 @@ extern std::string strreplace(const std::string& s, const std::string& o, const
// Remove all leading and trailing white space from string.
extern std::string strstrip(std::string s);
extern bool hmac_key_set;
extern uint8_t shared_hmac_md5_key[16];
extern bool siphash_key_set;
extern highwayhash::HH_U64 shared_siphash_key[2];
extern void hmac_md5(size_t size, const unsigned char* bytes,
unsigned char digest[16]);

View file

@ -27,6 +27,7 @@
#include "iosource/PktDumper.h"
#include "IntrusivePtr.h"
#include "input.h"
#include "Hash.h"
using namespace std;
@ -615,7 +616,7 @@ function sha256_hash%(...%): string
## Computes an HMAC-MD5 over the given arguments, keyed with the shared
## HMAC-MD5 key held by KeyedHash, and returns the printed digest as a string.
function md5_hmac%(...%): string
	%{
	unsigned char hmac[MD5_DIGEST_LENGTH];
	// The key is owned by KeyedHash; the old global of the same name is gone.
	MD5Val::hmac(@ARG@, KeyedHash::shared_hmac_md5_key, hmac);
	return new StringVal(md5_digest_print(hmac));
	%}