zeek/src/util.h

// See the file "COPYING" in the main distribution directory for copyright.

#pragma once

#include "zeek/zeek-config.h"

// Expose C99 functionality from inttypes.h, which would otherwise not be
// available in C++.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif

#include <libgen.h>
#include <unistd.h>
#include <cinttypes>
#include <cstdarg>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <string_view>
#include <vector>

#ifdef TIME_WITH_SYS_TIME
#include <sys/time.h>
#include <ctime>
#elif defined(HAVE_SYS_TIME_H)
#include <sys/time.h>
#else
#include <ctime>
#endif

#ifdef DEBUG

#include <cassert>

#ifdef ASSERT
#undef ASSERT
#endif
#define ASSERT(x) assert(x)
#define DEBUG_MSG(...) fprintf(stderr, __VA_ARGS__)
#define DEBUG_fputs fputs

#else
#ifdef MSTCPIP_ASSERT_UNDEFINED
#undef ASSERT
#endif

#define ASSERT(x)
#define DEBUG_MSG(...)
#define DEBUG_fputs(...)

#endif

#ifdef USE_PERFTOOLS_DEBUG
#include <gperftools/heap-checker.h>
#include <gperftools/heap-profiler.h>
extern HeapLeakChecker* heap_checker;
#endif

#include <stdint.h>

extern "C" {
#include "zeek/3rdparty/modp_numtoa.h"
}

#ifdef HAVE_LINUX
#include <pthread.h>
#include <sys/prctl.h>
#endif

#ifdef __FreeBSD__
#include <pthread_np.h>
#endif

#ifdef _MSC_VER
#include <pthread.h>
#include <filesystem>
namespace zeek {
namespace filesystem = std::filesystem;
}
inline constexpr std::string_view path_list_separator = ";";
#else
// Expose ghc::filesystem as zeek::filesystem until we can
// switch to std::filesystem on all platforms.
#include "zeek/3rdparty/ghc/filesystem.hpp"
namespace zeek {
namespace filesystem = ghc::filesystem;
}
inline constexpr std::string_view path_list_separator = ":";
#endif

#include "zeek/3rdparty/nonstd/expected.hpp"
namespace zeek {
template<typename T, typename E>
using expected = nonstd::expected<T, E>;

template<typename E>
using unexpected = nonstd::unexpected<E>;
} // namespace zeek

#include "zeek/Span.h"

using zeek_int_t = int64_t;
using zeek_uint_t = uint64_t;

#ifndef HAVE_STRCASESTR
extern char* strcasestr(const char* s, const char* find);
#endif

// This is used by the patricia code and so it remains outside of the namespace.
extern "C" void out_of_memory(const char* where);

namespace zeek {

class ODesc;
class RecordVal;

// Byte buffer types used by serialization code in storage and cluster.
using byte_buffer = std::vector<std::byte>;
using byte_buffer_span = Span<const std::byte>;

namespace util {
namespace detail {

std::string extract_ip(const std::string& i);
std::string extract_ip_and_len(const std::string& i, int* len);

// Returns the character corresponding to the given escape sequence (s points
// just past the '\'), and updates s to point just beyond the last character
// of the sequence.
extern int expand_escape(const char*& s);

extern const char* fmt_access_time(double time);

extern bool ensure_intermediate_dirs(const char* dirname);
extern bool ensure_dir(const char* dirname);

extern void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16]);

// Initializes RNGs for zeek::random_number() and hmac-md5/siphash/highwayhash usage.
// If load_file is given, the seeds (both random & hashes) are loaded from that file.  This
// takes precedence over the "seed_string" and "use_empty_seeds" arguments. The content of
// "seed_string" is used as seeds if not empty next. Otherwise, when "use_empty_seeds" is
// set it zero-initializes all seed values. If neither of these provides initial seed values,
// platform specific random data is used as seeds. If write_file is given, the seeds are
// written to that file.
extern void init_random_seed(const char* load_file, const char* write_file, bool use_empty_seeds,
                             const std::string& seed_string = {});

// Retrieves the initial seed computed after the very first call to
// init_random_seed(). Repeated calls to init_random_seed() will not affect
// the return value of this function.
unsigned int initial_seed();

// Returns true if the user explicitly set a seed via init_random_seed();
extern bool have_random_seed();

/**
 * A platform-independent PRNG implementation.  Note that this is not
 * necessarily a "statistically sound" implementation as the main purpose is
 * not for production use, but rather for regression testing.
 * @param state  The value used to generate the next random number.
 * @return  A new random value generated from *state* and that can be passed
 * back into subsequent calls to generate further random numbers.
 */
long int prng(long int state);

/**
 * Wrapper for system random() in the default case, but when running in
 * deterministic mode, uses the platform-independent zeek::prng()
 * to obtain consistent results since implementations of rand() may vary.
 * @return  A value in the range [0, zeek::max_random()].
 */
long int random_number();

/**
 * @return The maximum value that can be returned from zeek::random_number().
 * When not using deterministic-mode, this is always equivalent to RAND_MAX.
 */
long int max_random();

/**
 * Wrapper for system srandom() in the default case, but when running in
 * deterministic mode, updates the state used for calling zeek::prng()
 * inside of zeek::random_number().
 * @param seed  Value to use for initializing the PRNG.
 */
void seed_random(unsigned int seed);

/**
 * Set the process/thread name.  May not be supported on all OSs.
 * @param name  new name for the process/thread.  OS limitations typically
 * truncate the name to 15 bytes maximum.
 * @param tid  handle of thread whose name shall change
 */
void set_thread_name(const char* name, pthread_t tid = pthread_self());

// Each event source that may generate events gets an internally unique ID.
// This is always LOCAL for a local Zeek. For remote event sources, it gets
// assigned by the RemoteSerializer.
//
// FIXME: Find a nicer place for this type definition.
// Unfortunately, it introduces circular dependencies when defined in one of
// the obvious places (like Event.h or RemoteSerializer.h)

using SourceID = std::uintptr_t;
constexpr SourceID SOURCE_LOCAL = 0;

// TODO: This is a temporary marker to flag events coming in via Broker.
// Those are remote events but we don't have any further peer information
// available for them (as the old communication code would have). Once we
// remove RemoteSerializer, we can turn the SourceID into a simple boolean
// indicating whether it's a local or remote event.
constexpr SourceID SOURCE_BROKER = 0xffffffff;

bool is_package_loader(const std::string& path);

extern void add_to_zeek_path(const std::string& dir);

/**
 * Shared base for wrappers around path functions such as dirname(3) or
 * basename(3). The wrappers leave the caller's path argument untouched and
 * may optionally abort execution on error.
 */
class SafePathOp {
public:
    // Output of the wrapped path operation; empty right after construction.
    std::string result;

    // False right after construction.
    // NOTE(review): presumably flipped by CheckValid() when the wrapped call
    // fails -- confirm against the implementation.
    bool error = false;

protected:
    // Construction is restricted to derived wrapper classes.
    SafePathOp() {}

    // NOTE(review): the definition is not visible in this header. Judging by
    // the signature, it inspects the raw result of the wrapped call for the
    // given path and, if error_aborts is set, may terminate on failure --
    // confirm against the implementation.
    void CheckValid(const char* result, const char* path, bool error_aborts);
};

/**
 * Flatten a script name by replacing '/' path separators with '.'.
 * @param name A path to a Zeek script.  If it is a __load__.zeek, that part
 *             is discarded when constructing the flattened name.
 * @param prefix A string to prepend to the flattened script name.
 * @return The flattened script name.
 */
std::string flatten_script_name(const std::string& name, const std::string& prefix = "");

/**
 * Return a canonical/shortened path string by removing superfluous elements
 * (path delimiters, dots referring to CWD or parent dir).
 * @param path A filesystem path.
 * @return A canonical/shortened version of \a path.
 */
std::string normalize_path(std::string_view path);

/**
 * Strip the ZEEKPATH component from a path.
 * @param path A file/directory path that may be within a ZEEKPATH component.
 * @return *path* minus the common ZEEKPATH component (if any) removed.
 */
std::string without_zeekpath_component(std::string_view path);

/**
 * Gets the full path used to invoke some executable.
 * @param invocation  any possible string that may be seen in argv[0], such as
 *                    absolute path, relative path, or name to lookup in PATH.
 * @return the absolute path to the executable file
 */
std::string get_exe_path(const std::string& invocation);

/** Opens a Zeek script package.
 * @param path Location of a Zeek script package (a directory).  Will be changed
 *             to the path of the package's loader script.
 * @param mode An fopen(3) mode.
 * @return The return value of fopen(3) on the loader script or null if one
 *         doesn't exist.
 */
FILE* open_package(std::string& path, const std::string& mode = "r");

// This mimics the script-level function with the same name.
const char* log_file_name(const char* tag);

// Terminates processing gracefully, similar to pressing CTRL-C.
void terminate_processing();

// Sets the current status of the Zeek process to the given string.
// If the option --status-file has been set, this is written into
// the corresponding file.  Otherwise, the function is a no-op.
void set_processing_status(const char* status, const char* reason);

// Renames the given file to a new temporary name, and opens a new file with
// the original name. Returns new file or NULL on error. Inits rotate_info if
// given (open time is set to network time).
extern FILE* rotate_file(const char* name, RecordVal* rotate_info);

// Parse a time string of the form "HH:MM" (as used for the rotation base
// time) into a double representing the number of seconds. Returns -1 if the
// string cannot be parsed. The function's result is intended to be used with
// calc_next_rotate().
//
// This function is not thread-safe.
double parse_rotate_base_time(const char* rotate_base_time);

// Calculate the duration until the next time a file is to be rotated, based
// on the given rotate_interval and rotate_base_time. 'current' is the
// current time to be used as base, 'rotate_interval' the rotation interval,
// and 'base' the value returned by parse_rotate_base_time(). For the latter,
// if the function returned -1, that's fine, calc_next_rotate() handles that.
//
// This function is thread-safe.
double calc_next_rotate(double current, double rotate_interval, double base);

int setvbuf(FILE* stream, char* buf, int type, size_t size);

} // namespace detail

// Calls delete on every element of the container pointed to by t. The
// container itself is left unmodified, i.e., it keeps its (now dangling)
// pointers and its size.
template<class T>
void delete_each(T* t) {
    for ( auto&& element : *t )
        delete element;
}

// Writes the two lower-case hexadecimal digits of byte to hex_out[0] (high
// nibble) and hex_out[1] (low nibble). No terminating NUL is written.
inline void bytetohex(unsigned char byte, char* hex_out) {
    static constexpr char digits[] = "0123456789abcdef";

    const unsigned int high_nibble = byte >> 4;
    const unsigned int low_nibble = byte & 0x0f;

    hex_out[0] = digits[high_nibble];
    hex_out[1] = digits[low_nibble];
}

std::string get_unescaped_string(const std::string& str);

ODesc* get_escaped_string(ODesc* d, const char* str, size_t len, bool escape_all);
std::string get_escaped_string(const char* str, size_t len, bool escape_all);

// Convenience overload for std::string input: forwards the string's buffer
// and length to the (pointer, length) overload.
inline std::string get_escaped_string(const std::string& str, bool escape_all) {
    const char* const bytes = str.data();
    const size_t num_bytes = str.size();
    return get_escaped_string(bytes, num_bytes, escape_all);
}

std::vector<std::string>* tokenize_string(std::string_view input, std::string_view delim,
                                          std::vector<std::string>* rval = nullptr, int limit = 0);

std::vector<std::string_view> tokenize_string(std::string_view input, const char delim) noexcept;

extern char* copy_string(const char* str, size_t len);
extern char* copy_string(const char* s);
extern bool streq(const char* s1, const char* s2);
extern bool starts_with(std::string_view s, std::string_view beginning);
extern bool ends_with(std::string_view s, std::string_view ending);

extern char* skip_whitespace(char* s);
extern const char* skip_whitespace(const char* s);
extern char* skip_whitespace(char* s, char* end_of_s);
extern const char* skip_whitespace(const char* s, const char* end_of_s);
extern char* skip_digits(char* s);
extern char* get_word(char*& s);
extern void get_word(int length, const char* s, int& pwlen, const char*& pw);
extern void to_upper(char* s);
extern std::string to_upper(const std::string& s);
extern int decode_hex(char ch);
extern unsigned char encode_hex(int h);
template<class T>
int atoi_n(int len, const char* s, const char** end, int base, T& result);
extern char* uitoa_n(uint64_t value, char* str, int n, int base, const char* prefix = nullptr);
extern const char* strpbrk_n(size_t len, const char* s, const char* charset);
int strstr_n(const int big_len, const unsigned char* big, const int little_len, const unsigned char* little);

// Replaces all occurrences of *o* in *s* with *n*.
extern std::string strreplace(const std::string& s, const std::string& o, const std::string& n);

// Remove all leading and trailing white space from string.
extern std::string strstrip(std::string s);

// Return a lower-cased version of the string.
extern std::string strtolower(const std::string& s);

// Return an upper-cased version of the string.
extern std::string strtoupper(const std::string& s);

extern int fputs(int len, const char* s, FILE* fp);
extern bool is_printable(const char* s, int len);

extern const char* fmt_bytes(const char* data, int len);

// Note: returns a pointer into a shared buffer.
extern const char* vfmt(const char* format, va_list args);
// Note: returns a pointer into a shared buffer.
extern const char* fmt(const char* format, ...) __attribute__((format(printf, 1, 2)));

// Returns true if path exists and is a directory.
bool is_dir(const std::string& path);

// Returns true if path exists and is a file.
bool is_file(const std::string& path);

extern int int_list_cmp(const void* v1, const void* v2);

extern const std::string& zeek_path();
extern const char* zeek_plugin_path();
extern const char* zeek_plugin_activate();
extern std::string zeek_prefixes();

// Path helper built on detail::SafePathOp; the outcome is exposed through the
// inherited public "result" and "error" members.
// NOTE(review): going by the name and the base class documentation, this
// wraps dirname(3) -- the implementation is not visible in this header.
class SafeDirname : public detail::SafePathOp {
public:
    // Both constructors take the path to process and a flag that defaults to
    // aborting on error; they differ only in how the path is passed.
    explicit SafeDirname(const char* path, bool error_aborts = true);
    explicit SafeDirname(const std::string& path, bool error_aborts = true);

private:
    // NOTE(review): presumably the shared worker both constructors delegate
    // to -- confirm against the implementation.
    void DoFunc(const std::string& path, bool error_aborts = true);
};

// Path helper built on detail::SafePathOp; the outcome is exposed through the
// inherited public "result" and "error" members.
// NOTE(review): going by the name and the base class documentation, this
// wraps basename(3) -- the implementation is not visible in this header.
class SafeBasename : public detail::SafePathOp {
public:
    // Both constructors take the path to process and a flag that defaults to
    // aborting on error; they differ only in how the path is passed.
    explicit SafeBasename(const char* path, bool error_aborts = true);
    explicit SafeBasename(const std::string& path, bool error_aborts = true);

private:
    // NOTE(review): presumably the shared worker both constructors delegate
    // to -- confirm against the implementation.
    void DoFunc(const std::string& path, bool error_aborts = true);
};

std::string implode_string_vector(const std::vector<std::string>& v, const std::string& delim = "\n");

/**
 * Locate a file within a given search path.
 * @param filename Name of a file to find.
 * @param path_set Colon-delimited set of paths to search for the file.
 * @param opt_ext A filename extension/suffix to allow.
 * @return Path to the found file, or an empty string if not found.
 */
std::string find_file(const std::string& filename, const std::string& path_set, const std::string& opt_ext = "");

/**
 * Locate a script file within a given search path.
 * @param filename Name of a file to find.
 * @param path_set Colon-delimited set of paths to search for the file.
 * @return Path to the found file, or an empty string if not found.
 */
std::string find_script_file(const std::string& filename, const std::string& path_set);

// Wrapper around fopen(3).  Emits an error when failing to open.
FILE* open_file(const std::string& path, const std::string& mode = "r");

// Returns the current time.
// (In pseudo-realtime mode this is faked to be the start time of the
// trace plus the time interval Zeek has been running. To avoid this,
// call with real=true).
extern double current_time(bool real = false);

// Convert a time represented as a double to a timeval struct.
extern struct timeval double_to_timeval(double t);

// Return > 0 if tv_a > tv_b, 0 if equal, < 0 if tv_a < tv_b.
extern int time_compare(struct timeval* tv_a, struct timeval* tv_b);

// Returns the CPU time consumed to date.
extern double curr_CPU_time();

// Returns an integer that's very likely to be unique, even across Zeek
// instances. The integer can be drawn from different pools, which is helpful
// when the random number generator is seeded to be deterministic. In that
// case, the same sequence of integers is generated per pool.
#define UID_POOL_DEFAULT_INTERNAL 1
#define UID_POOL_DEFAULT_SCRIPT 2
#define UID_POOL_CUSTOM_SCRIPT 10 // First available custom script level pool.
extern uint64_t calculate_unique_id();
extern uint64_t calculate_unique_id(const size_t pool);

// Use for map's string keys.
struct ltstr {
    // Strict weak ordering over C strings by content (strcmp) rather than by
    // pointer value. Both arguments must be valid NUL-terminated strings.
    bool operator()(const char* s1, const char* s2) const {
        const int order = strcmp(s1, s2);
        return order < 0;
    }
};

// Estimates the number of bytes an allocation of the given size occupies
// once allocator padding is taken into account.
constexpr size_t pad_size(size_t size) {
    // We emulate glibc here (values measured on Linux i386).
    // FIXME: We should better copy the portable value definitions from glibc.
    constexpr size_t granule = 8;
    constexpr size_t smallest_chunk = 2 * granule;

    if ( size == 0 )
        return 0; // glibc allocated 16 bytes anyway.

    if ( size < 12 )
        return smallest_chunk;

    // Add 3 bytes, round down to a multiple of the granule, then add one
    // more granule.
    const size_t whole_granules = (size + 3) / granule;
    return (whole_granules + 1) * granule;
}

#define padded_sizeof(x) (zeek::util::pad_size(sizeof(x)))

// Like write() but handles interrupted system calls by restarting. Returns
// true if the write was successful, otherwise sets errno. This function is
// thread-safe as long as no two threads write to the same descriptor.
extern bool safe_write(int fd, const char* data, int len);

// Same as safe_write(), but for pwrite().
extern bool safe_pwrite(int fd, const unsigned char* data, size_t len, size_t offset);

// Like fsync() but handles interrupted system calls by retrying and
// aborts on unrecoverable errors.
extern bool safe_fsync(int fd);

// Wraps close(2) to emit error messages and abort on unrecoverable errors.
extern void safe_close(int fd);

// Versions of realloc/malloc which abort() on out of memory

// realloc() wrapper that reports failure via out_of_memory() instead of
// returning null to the caller. A null result for a zero-sized request is
// passed through and not treated as a failure.
inline void* safe_realloc(void* ptr, size_t size) {
    void* const resized = realloc(ptr, size);

    if ( resized == nullptr && size != 0 )
        out_of_memory("realloc");

    return resized;
}

// malloc() wrapper that reports failure via out_of_memory() instead of
// returning null to the caller.
//
// malloc(0) may legitimately return a null pointer (the result is
// implementation-defined), so, matching safe_realloc(), a null result is
// only treated as an out-of-memory condition for non-zero sizes.
inline void* safe_malloc(size_t size) {
    void* ptr = malloc(size);
    if ( size && ! ptr )
        out_of_memory("malloc");

    return ptr;
}

// Bounded string copy that, unlike plain strncpy(), always NUL-terminates.
// Copies at most n - 1 bytes from src into dest and stores a terminator at
// dest[n - 1]; n is the full capacity of dest in bytes. Returns dest.
inline char* safe_strncpy(char* dest, const char* src, size_t n) {
    // A zero-sized buffer has no room even for the terminator. Without this
    // guard "n - 1" wraps around to SIZE_MAX, turning the call into an
    // unbounded copy followed by an out-of-bounds write.
    if ( n == 0 )
        return dest;

    char* result = strncpy(dest, src, n - 1);
    dest[n - 1] = '\0';
    return result;
}

// Memory alignment helpers.

// Returns true if at most one bit is set in x. Note that this also holds for
// x == 0, which is therefore reported as a power of two.
inline bool is_power_of_2(zeek_uint_t x) {
    const zeek_uint_t lowest_bit_cleared = x & (x - 1);
    return lowest_bit_cleared == 0;
}

// Rounds the given pointer up to the nearest multiple of the
// given size, if not already a multiple.
const void* memory_align(const void* ptr, size_t size);

// Rounds the given pointer up to the nearest multiple of the
// given size, padding the skipped region with 0 bytes.
void* memory_align_and_pad(void* ptr, size_t size);

// Returns offset rounded up so it can correctly align data of the given size.
int memory_size_align(size_t offset, size_t size);

// Returns total memory allocations and (if available) amount actually
// handed out by malloc.
extern void get_memory_usage(uint64_t* total, uint64_t* malloced);

// Class to be used as a third argument for STL maps to be able to use
// char*'s as keys. Otherwise the pointer values will be compared instead of
// the actual string values.
struct CompareString {
    // Orders two NUL-terminated strings by content: true if a sorts strictly
    // before b according to strcmp().
    bool operator()(char const* a, char const* b) const {
        const int order = strcmp(a, b);
        return order < 0;
    }
};

/**
 * Canonicalizes a name by converting it to uppercase letters and replacing
 * all non-alphanumeric characters with an underscore.
 * @param name The string to canonicalize.
 * @return The canonicalized version of \a name.
 */
std::string canonify_name(const std::string& name);

/**
 * Reentrant version of strerror(). Takes care of the difference between the
 * XSI-compliant and the GNU-specific version of strerror_r().
 */
void zeek_strerror_r(int zeek_errno, char* buf, size_t buflen);

/**
 * Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used
 * by the JSON writer and BIF methods.
 * @param val the input string to be escaped
 * @return the escaped string
 */
std::string json_escape_utf8(const std::string& val, bool escape_printable_controls = true);

/**
 * Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used
 * by the JSON writer and BIF methods.
 * @param val the character data to be escaped
 * @param val_size the length of the character data
 * @return the escaped string
 */
std::string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls = true);

/**
 * Checks for values that are approximately equal.
 * @param a first value to compare
 * @param b second value to compare
 * @param tolerance how close they need to be to deem them "approximately equal"
 * @return true if `a` is within the given tolerance of `b`, false otherwise
 */
bool approx_equal(double a, double b, double tolerance = std::numeric_limits<double>::epsilon());

/**
 * Splits a string at all occurrences of a delimiter. Matches are located
 * left to right and never overlap. Adjacent delimiters, as well as a
 * delimiter at the very start or end of the input, yield empty pieces, so
 * the result always holds one more element than the number of matches.
 *
 * If the delimiter is empty or longer than the input, the result consists of
 * the unmodified input as its single element.
 *
 * \note This function is not UTF8-aware.
 *
 * @param s the string to split
 * @param delim the delimiter to split the string on
 * @return a vector containing the separate parts of the string.
 */
template<typename T>
std::vector<T> split(T s, const T& delim) {
    // Nothing to split on, or the delimiter can't possibly fit: hand the
    // input back as-is.
    if ( delim.empty() || s.size() < delim.size() )
        return {std::move(s)};

    std::vector<T> pieces;
    size_t start = 0;

    // Walk the input by offset instead of repeatedly re-assigning the
    // remainder to s. This avoids copying the tail on every match, and the
    // final piece (possibly empty, when the input ends in a delimiter) falls
    // out naturally. Deciding that up front by inspecting the tail of the
    // original input is wrong for delimiters overlapping themselves: "aaa"
    // split on "aa" must yield {"", "a"}, without an extra trailing "".
    while ( true ) {
        const size_t hit = s.find(delim, start);

        if ( hit == std::string::npos ) {
            pieces.push_back(s.substr(start));
            break;
        }

        pieces.push_back(s.substr(start, hit - start));
        start = hit + delim.size();
    }

    return pieces;
}

/**
 * Specialized version of util::split that allows for differing string and delimiter types,
 * with the requirement that the delimiter must be of the same type as what is stored in the
 * string type. For example, this allows passing a std::string as the string to split with
 * a const char* delimiter.
 *
 * @param s the string to split
 * @param delim the delimiter to split the string on; converted to \a T before splitting
 * @return a vector containing the separate parts of the string.
 */
template<typename T, typename U = typename T::value_type*>
std::vector<T> split(T s, U delim) {
    // s is our own by-value copy already; move it along rather than copying
    // it a second time for the callee's by-value parameter.
    return split(std::move(s), T{delim});
}

/**
 * Specialized version of util::split that takes a const char* string and delimiter.
 *
 * @param s the string to split
 * @param delim the delimiter to split the string on
 * @return a vector of string_view objects containing the separate parts of the string.
 */
inline std::vector<std::string_view> split(const char* s, const char* delim) {
    return split(std::string_view(s), std::string_view(delim));
}

/**
 * Specialized version of util::split that takes a const wchar_t* string and delimiter.
 *
 * @param s the string to split
 * @param delim the delimiter to split the string on
 * @return a vector of wstring_view objects containing the separate parts of the string.
 */
inline std::vector<std::wstring_view> split(const wchar_t* s, const wchar_t* delim) {
    return split(std::wstring_view(s), std::wstring_view(delim));
}

} // namespace util
} // namespace zeek