zeek/src/probabilistic/Hasher.h
Dominik Charousset 647fdf7737 Add facade types to avoid using raw Broker types
By not using `broker::data` directly, we gain a degree of freedom
that allows us to swap out `broker::data` for something else (e.g.,
`broker::variant`) in the future. Furthermore, it also helps us to keep
Broker types "local" to the Broker manager and gives us a nicer
interface.

Also replaces uses of `broker::expected` with `std::optional`. While an
`expected` can carry additional information as to why a value is not
present, nothing in Zeek ever cared about that. Hence, using
`std::optional` removes an unnecessary dependency on a Broker detail
while also being more efficient (no extra heap allocation when no value
is present).
2023-12-04 15:23:28 +01:00

249 lines
6 KiB
C++

// See the file "COPYING" in the main distribution directory for copyright.
#pragma once
#include <broker/expected.hh>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <vector>

#include "zeek/Hash.h"
// Forward declarations of the Broker facade types. In this header they are
// only named in the signatures of the Serialize()/Unserialize() member
// functions, so declarations (rather than full definitions) are sufficient.
namespace zeek {
class BrokerData;
class BrokerDataView;
} // namespace zeek
namespace zeek::probabilistic::detail {
/**
 * Tags identifying the concrete class derived from Hasher.
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the enumerators receive implicitly.
 */
enum HasherType {
    Default = 0,
    Double = 1,
};
/**
 * Abstract base class for hashers. A hasher creates a family of hash
 * functions to hash an element *k* times.
 */
class Hasher {
public:
    using digest = zeek::detail::hash_t;
    using digest_vector = std::vector<digest>;

    /**
     * Seed material for a hasher: two `unsigned long long` words with
     * 16-byte alignment.
     */
    struct seed_t {
        // actually HH_U64, which has the same type
        alignas(16) unsigned long long h[2];

        /**
         * Derives a new seed by adding *rhs* to the first seed word only;
         * the second word is carried over unchanged.
         */
        friend seed_t operator+(seed_t lhs, const uint64_t rhs) {
            lhs.h[0] += rhs;
            return lhs;
        }
    };

    /**
     * Creates a valid hasher seed from an arbitrary string.
     *
     * @param data A pointer to contiguous data that should be crunched into a
     * seed. If nullptr, the function tries to find a global_hash_seed script
     * variable to derive a seed from. If this variable does not exist, the
     * function uses the initial seed generated at Zeek startup.
     *
     * @param size The number of bytes of *data*.
     *
     * @return A seed suitable for hashers.
     */
    static seed_t MakeSeed(const void* data, size_t size);

    /**
     * Destructor.
     */
    virtual ~Hasher() = default;

    /**
     * Computes hash values for an element by hashing its in-memory object
     * representation, i.e., the `sizeof(T)` bytes starting at `&x`. Data that
     * *x* merely points to is not part of the hashed bytes.
     *
     * @param x The element to hash.
     *
     * @return Vector of *k* hash values.
     */
    template<typename T>
    digest_vector operator()(const T& x) const {
        return Hash(&x, sizeof(T));
    }

    /**
     * Computes hash values for an element identified by a hash key.
     *
     * @param key The key of the value to hash.
     *
     * @return Vector of *k* hash values.
     */
    digest_vector Hash(const zeek::detail::HashKey* key) const;

    /**
     * Computes the hashes for a set of bytes.
     *
     * @param x Pointer to first byte to hash.
     *
     * @param n Number of bytes to hash.
     *
     * @return Vector of *k* hash values.
     *
     */
    virtual digest_vector Hash(const void* x, size_t n) const = 0;

    /**
     * Returns a deep copy of the hasher.
     *
     * NOTE(review): the copy is handed out as a raw pointer; presumably the
     * caller takes ownership of it -- confirm at the call sites.
     */
    virtual Hasher* Clone() const = 0;

    /**
     * Returns true if two hashers are identical.
     */
    virtual bool Equals(const Hasher* other) const = 0;

    /**
     * Returns the number *k* of hash functions the hashers applies.
     */
    size_t K() const { return k; }

    /**
     * Returns the seed used to construct the hasher.
     */
    seed_t Seed() const { return seed; }

    /**
     * Serializes the hasher into a BrokerData value.
     *
     * @return The serialized representation. NOTE(review): presumably
     * std::nullopt signals failure -- confirm against the implementation.
     */
    std::optional<BrokerData> Serialize() const;

    /**
     * Reconstructs a hasher from serialized data.
     *
     * @param data The serialized representation to read from.
     *
     * @return The reconstructed hasher. NOTE(review): presumably a null
     * pointer signals failure -- confirm against the implementation.
     */
    static std::unique_ptr<Hasher> Unserialize(BrokerDataView data);

protected:
    /**
     * Default constructor. Leaves *k* at 0 and the seed zeroed, as given by
     * the member initializers.
     */
    Hasher() = default;

    /**
     * Constructor.
     *
     * @param arg_k the number of hash functions.
     *
     * @param arg_seed The seed for the hasher.
     */
    Hasher(size_t arg_k, seed_t arg_seed);

    /**
     * Returns the tag identifying the concrete hasher class.
     */
    virtual HasherType Type() const = 0;

private:
    size_t k = 0;
    seed_t seed = {}; // value-initialized: both seed words are zero
};
/**
 * A universal hash function family. This is a helper class that Hasher
 * implementations can use in their implementation.
 */
class UHF {
public:
    /**
     * Default constructor with zero seed.
     */
    UHF();

    /**
     * Constructs a hash function seeded with a given seed.
     *
     * @param arg_seed The seed to use for this instance.
     */
    explicit UHF(Hasher::seed_t arg_seed);

    /**
     * Computes the hash value for an element by hashing its in-memory object
     * representation, i.e., the `sizeof(T)` bytes starting at `&x`.
     *
     * @param x The element to hash.
     *
     * @return The hash value of the element's bytes.
     */
    template<typename T>
    Hasher::digest operator()(const T& x) const {
        return hash(&x, sizeof(T));
    }

    /**
     * Computes the hash value for a set of bytes. Equivalent to calling
     * hash() with the same arguments.
     *
     * @param x Pointer to first byte to hash.
     *
     * @param n Number of bytes to hash.
     *
     * @return The hash value of the given bytes.
     */
    Hasher::digest operator()(const void* x, size_t n) const { return hash(x, n); }

    /**
     * Computes the hash value for a set of bytes.
     *
     * @param x Pointer to first byte to hash.
     *
     * @param n Number of bytes to hash.
     *
     * @return The hash value of the given bytes.
     *
     */
    Hasher::digest hash(const void* x, size_t n) const;

    /**
     * Two hash functions compare equal iff both words of their seeds match.
     */
    friend bool operator==(const UHF& x, const UHF& y) {
        return (x.seed.h[0] == y.seed.h[0]) && (x.seed.h[1] == y.seed.h[1]);
    }

    /**
     * Negation of operator==.
     */
    friend bool operator!=(const UHF& x, const UHF& y) { return ! (x == y); }

    /**
     * Serializes the hash function into a BrokerData value.
     *
     * @return The serialized representation. NOTE(review): presumably
     * std::nullopt signals failure -- confirm against the implementation.
     */
    std::optional<BrokerData> Serialize() const;

    /**
     * Reconstructs a hash function from serialized data.
     *
     * @param data The serialized representation to read from.
     *
     * @return The reconstructed hash function. NOTE(review): how malformed
     * input is reported is not visible in this header -- confirm against the
     * implementation.
     */
    static UHF Unserialize(BrokerDataView data);

private:
    /**
     * Maps a seed to a size_t. NOTE(review): the mapping itself lives in the
     * implementation file and is not visible here.
     */
    static size_t compute_seed(Hasher::seed_t seed);

    // Zero-initialized in-class so the seed never holds an indeterminate
    // value, whichever constructor runs.
    Hasher::seed_t seed = {};
};
/**
 * Hasher realizing the default hashing policy: internally it relies on *k*
 * separate hash functions.
 */
class DefaultHasher : public Hasher {
public:
    /**
     * Builds a hasher that applies *k* hash functions.
     *
     * @param k How many hash functions to use.
     *
     * @param seed The seed the hasher is constructed from.
     */
    DefaultHasher(size_t k, Hasher::seed_t seed);

    // Implementations of the Hasher interface.
    digest_vector Hash(const void* x, size_t n) const final;
    DefaultHasher* Clone() const final;
    bool Equals(const Hasher* other) const final;

private:
    // Not part of the public interface; only reachable from within the class.
    DefaultHasher() = default;

    // Reports this class's tag.
    HasherType Type() const override { return HasherType::Default; }

    // The individual hash functions backing this hasher.
    std::vector<UHF> hash_functions;
};
/**
 * Hasher realizing the *double-hashing* policy: hash values are formed as a
 * linear combination of two hash functions.
 */
class DoubleHasher : public Hasher {
public:
    /**
     * Builds a double hasher that applies *k* hash functions.
     *
     * @param k How many hash functions to use.
     *
     * @param seed The seed the hasher is constructed from.
     */
    DoubleHasher(size_t k, Hasher::seed_t seed);

    // Implementations of the Hasher interface.
    digest_vector Hash(const void* x, size_t n) const final;
    DoubleHasher* Clone() const final;
    bool Equals(const Hasher* other) const final;

private:
    // Not part of the public interface; only reachable from within the class.
    DoubleHasher() = default;

    // Reports this class's tag.
    HasherType Type() const override { return HasherType::Double; }

    // The two underlying hash functions that get combined.
    UHF h1;
    UHF h2;
};
} // namespace zeek::probabilistic::detail