zeek/src/util.h

669 lines
23 KiB
C++

// See the file "COPYING" in the main distribution directory for copyright.
#pragma once
#include "zeek/zeek-config.h"
// Expose C99 functionality from inttypes.h, which would otherwise not be
// available in C++.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif
#include <libgen.h>
#include <unistd.h>
#include <cinttypes>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#ifdef TIME_WITH_SYS_TIME
#include <sys/time.h>
#include <ctime>
#elif defined(HAVE_SYS_TIME_H)
#include <sys/time.h>
#else
#include <ctime>
#endif
// Debug-only helper macros. With DEBUG defined, ASSERT() forwards to assert(),
// DEBUG_MSG() prints printf-style to stderr and DEBUG_fputs expands to fputs.
// Without DEBUG all three expand to nothing, so their arguments are not
// evaluated.
#ifdef DEBUG
#include <cassert>
#ifdef ASSERT
#undef ASSERT
#endif
#define ASSERT(x) assert(x)
#define DEBUG_MSG(...) fprintf(stderr, __VA_ARGS__)
#define DEBUG_fputs fputs
#else
// NOTE(review): MSTCPIP_ASSERT_UNDEFINED presumably signals that a platform
// header already defined ASSERT -- confirm.
#ifdef MSTCPIP_ASSERT_UNDEFINED
#undef ASSERT
#endif
#define ASSERT(x)
#define DEBUG_MSG(...)
#define DEBUG_fputs(...)
#endif
#ifdef USE_PERFTOOLS_DEBUG
#include <gperftools/heap-checker.h>
#include <gperftools/heap-profiler.h>
extern HeapLeakChecker* heap_checker;
#endif
#include <stdint.h>
extern "C" {
#include "zeek/3rdparty/modp_numtoa.h"
}
#ifdef HAVE_LINUX
#include <pthread.h>
#include <sys/prctl.h>
#endif
#ifdef __FreeBSD__
#include <pthread_np.h>
#endif
// Selects the implementation behind zeek::filesystem and defines
// path_list_separator: ";" for MSVC builds, ":" everywhere else.
#ifdef _MSC_VER
#include <pthread.h>
#include <filesystem>
namespace zeek {
namespace filesystem = std::filesystem;
}
inline constexpr std::string_view path_list_separator = ";";
#else
// Expose ghc::filesystem as zeek::filesystem until we can
// switch to std::filesystem on all platforms.
#include "zeek/3rdparty/ghc/filesystem.hpp"
namespace zeek {
namespace filesystem = ghc::filesystem;
}
inline constexpr std::string_view path_list_separator = ":";
#endif
#include "zeek/3rdparty/nonstd/expected.hpp"
namespace zeek {
// Aliases for the bundled nonstd implementation, so that code can refer to
// zeek::expected<T, E> and zeek::unexpected<E>.
template<typename T, typename E>
using expected = nonstd::expected<T, E>;
template<typename E>
using unexpected = nonstd::unexpected<E>;
} // namespace zeek
#include "zeek/Span.h"
// 64-bit signed and unsigned integer aliases.
using zeek_int_t = int64_t;
using zeek_uint_t = uint64_t;
// Declared here only when HAVE_STRCASESTR is not defined.
#ifndef HAVE_STRCASESTR
extern char* strcasestr(const char* s, const char* find);
#endif
// This is used by the patricia code and so it remains outside of the namespace.
extern "C" void out_of_memory(const char* where);
namespace zeek {
// Forward declarations; this header only uses pointers to these types.
class ODesc;
class RecordVal;
// Byte buffer types used by serialization code in storage and cluster.
// byte_buffer owns its bytes; byte_buffer_span is a Span over constant bytes.
using byte_buffer = std::vector<std::byte>;
using byte_buffer_span = Span<const std::byte>;
namespace util {
namespace detail {
// NOTE(review): judging by the names, these pull an IP address out of the
// input string, the second one additionally reporting a length through *len --
// confirm against the implementation.
std::string extract_ip(const std::string& i);
std::string extract_ip_and_len(const std::string& i, int* len);
// Returns the character corresponding to the given escape sequence (s points
// just past the '\'), and updates s to point just beyond the last character
// of the sequence.
extern int expand_escape(const char*& s);
// NOTE(review): presumably formats a timestamp for display; the lifetime of
// the returned pointer is not visible here -- confirm before storing it.
extern const char* fmt_access_time(double time);
// NOTE(review): presumably create the directory (and, for the first function,
// any missing parent directories), returning false on failure -- confirm.
extern bool ensure_intermediate_dirs(const char* dirname);
extern bool ensure_dir(const char* dirname);
// NOTE(review): presumably computes an HMAC-MD5 over the `size` bytes at
// `bytes`; the result is written into the caller-provided 16-byte `digest`.
extern void hmac_md5(size_t size, const unsigned char* bytes, unsigned char digest[16]);
// Initializes RNGs for zeek::random_number() and hmac-md5/siphash/highwayhash usage.
// If load_file is given, the seeds (both random & hashes) are loaded from that file. This
// takes precedence over the "seed_string" and "use_empty_seeds" arguments. Otherwise, the
// content of "seed_string" is used as seeds if it is not empty. Otherwise, when
// "use_empty_seeds" is set, all seed values are zero-initialized. If none of these provides
// initial seed values, platform-specific random data is used as seeds. If write_file is
// given, the seeds are written to that file.
extern void init_random_seed(const char* load_file, const char* write_file, bool use_empty_seeds,
const std::string& seed_string = {});
// Retrieves the initial seed computed after the very first call to
// init_random_seed(). Repeated calls to init_random_seed() will not affect
// the return value of this function.
unsigned int initial_seed();
// Returns true if the user explicitly set a seed via init_random_seed().
extern bool have_random_seed();
/**
* A platform-independent PRNG implementation. Note that this is not
* necessarily a "statistically sound" implementation as the main purpose is
* not for production use, but rather for regression testing.
* @param state The value used to generate the next random number.
* @return A new random value generated from *state* that can be passed
* back into subsequent calls to generate further random numbers.
*/
long int prng(long int state);
/**
* Wrapper for system random() in the default case, but when running in
* deterministic mode, uses the platform-independent
* zeek::util::detail::prng() to obtain consistent results since
* implementations of rand() may vary.
* @return A value in the range [0, zeek::util::detail::max_random()].
*/
long int random_number();
/**
* @return The maximum value that can be returned from
* zeek::util::detail::random_number(). When not using deterministic-mode,
* this is always equivalent to RAND_MAX.
*/
long int max_random();
/**
* Wrapper for system srandom() in the default case, but when running in
* deterministic mode, updates the state used for calling
* zeek::util::detail::prng() inside of zeek::util::detail::random_number().
* @param seed Value to use for initializing the PRNG.
*/
void seed_random(unsigned int seed);
/**
* Set the process/thread name. May not be supported on all OSs.
* @param name new name for the process/thread. OS limitations typically
* truncate the name to 15 bytes maximum.
* @param tid handle of thread whose name shall change. Defaults to
* pthread_self(); because default arguments are evaluated at the call site,
* that is the thread making the call.
*/
void set_thread_name(const char* name, pthread_t tid = pthread_self());
// Each event source that may generate events gets an internally unique ID.
// This is always LOCAL for a local Zeek. For remote event sources, it gets
// assigned by the RemoteSerializer.
//
// FIXME: Find a nicer place for this type definition.
// Unfortunately, it introduces circular dependencies when defined in one of
// the obvious places (like Event.h or RemoteSerializer.h)
using SourceID = std::uintptr_t;
constexpr SourceID SOURCE_LOCAL = 0;
// TODO: This is a temporary marker to flag events coming in via Broker.
// Those are remote events but we don't have any further peer information
// available for them (as the old communication code would have). Once we
// remove RemoteSerializer, we can turn the SourceID into a simple boolean
// indicating whether it's a local or remote event.
constexpr SourceID SOURCE_BROKER = 0xffffffff;
// NOTE(review): presumably reports whether `path` names a package's loader
// script -- confirm against the implementation.
bool is_package_loader(const std::string& path);
// NOTE(review): presumably adds `dir` to the script search path -- confirm
// against the implementation.
extern void add_to_zeek_path(const std::string& dir);
/**
 * Common base for wrappers around functions like dirname(3) or basename(3)
 * that must not modify their path argument and that may optionally abort
 * execution on error. Only derived classes can construct it; the outcome is
 * published through the two public members.
 */
class SafePathOp {
public:
    std::string result;   // Starts out empty.
    bool error = false;   // Starts out false.

protected:
    // Deliberately user-provided rather than defaulted, so the class keeps
    // its non-aggregate status.
    SafePathOp() {}

    void CheckValid(const char* result, const char* path, bool error_aborts);
};
/**
* Flatten a script name by replacing '/' path separators with '.'.
* @param name A path to a Zeek script. If it is a __load__.zeek, that part
* is discarded when constructing the flattened name.
* @param prefix A string to prepend to the flattened script name.
* @return The flattened script name.
*/
std::string flatten_script_name(const std::string& name, const std::string& prefix = "");
/**
* Return a canonical/shortened path string by removing superfluous elements
* (path delimiters, dots referring to CWD or parent dir).
* @param path A filesystem path.
* @return A canonical/shortened version of \a path.
*/
std::string normalize_path(std::string_view path);
/**
* Strip the ZEEKPATH component from a path.
* @param path A file/directory path that may be within a ZEEKPATH component.
* @return *path* minus the common ZEEKPATH component (if any) removed.
*/
std::string without_zeekpath_component(std::string_view path);
/**
* Gets the full path used to invoke some executable.
* @param invocation any possible string that may be seen in argv[0], such as
* absolute path, relative path, or name to lookup in PATH.
* @return the absolute path to the executable file
*/
std::string get_exe_path(const std::string& invocation);
/** Opens a Zeek script package.
* @param path Location of a Zeek script package (a directory). Will be changed
* to the path of the package's loader script.
* @param mode An fopen(3) mode.
* @return The return value of fopen(3) on the loader script or null if one
* doesn't exist.
*/
FILE* open_package(std::string& path, const std::string& mode = "r");
// This mimics the script-level function with the same name.
const char* log_file_name(const char* tag);
// Terminates processing gracefully, similar to pressing CTRL-C.
void terminate_processing();
// Sets the current status of the Zeek process to the given string.
// If the option --status-file has been set, this is written into
// the corresponding file. Otherwise, the function is a no-op.
// NOTE(review): the role of `reason` is not described here -- confirm against
// the implementation.
void set_processing_status(const char* status, const char* reason);
// Renames the given file to a new temporary name, and opens a new file with
// the original name. Returns new file or NULL on error. Inits rotate_info if
// given (open time is set to network time).
extern FILE* rotate_file(const char* name, RecordVal* rotate_info);
// Parse a time string of the form "HH:MM" (as used for the rotation base
// time) into a double representing the number of seconds. Returns -1 if the
// string cannot be parsed. The function's result is intended to be used with
// calc_next_rotate().
//
// This function is not thread-safe.
double parse_rotate_base_time(const char* rotate_base_time);
// Calculate the duration until the next time a file is to be rotated, based
// on the given rotate_interval and rotate_base_time. 'current' is the
// current time to be used as base, 'rotate_interval' the rotation interval,
// and 'base' the value returned by parse_rotate_base_time(). For the latter,
// if the function returned -1, that's fine, calc_next_rotate() handles that.
//
// This function is thread-safe.
double calc_next_rotate(double current, double rotate_interval, double base);
// Has the same parameter list as the C library's setvbuf(3).
// NOTE(review): presumably a wrapper around it -- confirm against the
// implementation.
int setvbuf(FILE* stream, char* buf, int type, size_t size);
} // namespace detail
// Calls delete on every element of the container *t. The container itself is
// left untouched, so it still holds the (now dangling) pointers afterwards.
template<class T>
void delete_each(T* t) {
    for ( auto& element : *t )
        delete element;
}
// Writes the two lowercase hex digits of `byte` to hex_out[0] (high nibble)
// and hex_out[1] (low nibble). No terminating NUL is written.
inline void bytetohex(unsigned char byte, char* hex_out) {
    static constexpr char digits[] = "0123456789abcdef";

    const unsigned high_nibble = (byte >> 4) & 0x0f;
    const unsigned low_nibble = byte & 0x0f;

    hex_out[0] = digits[high_nibble];
    hex_out[1] = digits[low_nibble];
}
// NOTE(review): presumably the inverse of get_escaped_string() -- confirm
// against the implementation.
std::string get_unescaped_string(const std::string& str);
// Escaping helpers for the `len` bytes at `str`. The first overload takes an
// ODesc and returns an ODesc pointer; the second returns the escaped text by
// value.
// NOTE(review): which bytes are escaped when escape_all is false is defined by
// the implementation -- confirm there.
ODesc* get_escaped_string(ODesc* d, const char* str, size_t len, bool escape_all);
std::string get_escaped_string(const char* str, size_t len, bool escape_all);
// Convenience overload for std::string input: forwards the string's buffer
// and length to the (const char*, size_t, bool) overload.
inline std::string get_escaped_string(const std::string& str, bool escape_all) {
    const char* bytes = str.data();
    const size_t count = str.length();
    return get_escaped_string(bytes, count, escape_all);
}
// Splits `input` on the string `delim`, returning a pointer to a vector of
// owned strings.
// NOTE(review): presumably appends to *rval when given and allocates a new
// vector otherwise (which the caller then owns), with `limit` capping the
// number of splits and 0 meaning no limit -- confirm against the implementation.
std::vector<std::string>* tokenize_string(std::string_view input, std::string_view delim,
std::vector<std::string>* rval = nullptr, int limit = 0);
// Splits `input` on the single character `delim`. The returned string_views do
// not own their characters, so they are only valid while the buffer behind
// `input` is.
std::vector<std::string_view> tokenize_string(std::string_view input, const char delim) noexcept;
// C-string helpers; all of them are implemented outside this header.
// NOTE(review): copy_string() presumably returns a newly allocated copy --
// confirm how the caller is expected to release it.
extern char* copy_string(const char* str, size_t len);
extern char* copy_string(const char* s);
// NOTE(review): presumably true when both strings compare equal -- confirm.
extern bool streq(const char* s1, const char* s2);
// NOTE(review): presumably prefix/suffix tests on `s` -- confirm.
extern bool starts_with(std::string_view s, std::string_view beginning);
extern bool ends_with(std::string_view s, std::string_view ending);
// skip_whitespace() comes in mutable and const flavors, each either relying on
// NUL termination or taking an explicit end pointer.
extern char* skip_whitespace(char* s);
extern const char* skip_whitespace(const char* s);
extern char* skip_whitespace(char* s, char* end_of_s);
extern const char* skip_whitespace(const char* s, const char* end_of_s);
extern char* skip_digits(char* s);
// NOTE(review): the first get_word() takes `s` by reference and so may advance
// it; the second reports the word through pw/pwlen -- confirm exact semantics.
extern char* get_word(char*& s);
extern void get_word(int length, const char* s, int& pwlen, const char*& pw);
// to_upper() has an in-place C-string overload and one returning a new string.
extern void to_upper(char* s);
extern std::string to_upper(const std::string& s);
// NOTE(review): presumably convert between a hex digit character and its
// numeric value -- confirm error handling for invalid input.
extern int decode_hex(char ch);
extern unsigned char encode_hex(int h);
// Length-bounded numeric conversions; the template is only declared here.
// NOTE(review): the meaning of the int return value is not visible here --
// confirm.
template<class T>
int atoi_n(int len, const char* s, const char** end, int base, T& result);
extern char* uitoa_n(uint64_t value, char* str, int n, int base, const char* prefix = nullptr);
// Length-bounded counterparts of strpbrk()/strstr(), going by their names.
// NOTE(review): strstr_n() returns an int, presumably an offset with a negative
// value when not found -- confirm.
extern const char* strpbrk_n(size_t len, const char* s, const char* charset);
int strstr_n(const int big_len, const unsigned char* big, const int little_len, const unsigned char* little);
// Replaces all occurrences of *o* in *s* with *n*.
extern std::string strreplace(const std::string& s, const std::string& o, const std::string& n);
// Remove all leading and trailing white space from string.
extern std::string strstrip(std::string s);
// Return a lower-cased version of the string.
extern std::string strtolower(const std::string& s);
// Return an upper-cased version of the string.
extern std::string strtoupper(const std::string& s);
// Overload of fputs() taking an explicit length.
// NOTE(review): presumably writes exactly `len` bytes of `s` to `fp` -- confirm.
extern int fputs(int len, const char* s, FILE* fp);
// NOTE(review): presumably true when all `len` bytes of `s` are printable --
// confirm.
extern bool is_printable(const char* s, int len);
// NOTE(review): presumably renders `len` bytes of `data` as readable text; the
// lifetime of the returned pointer is not visible here -- confirm.
extern const char* fmt_bytes(const char* data, int len);
// Note: returns a pointer into a shared buffer.
extern const char* vfmt(const char* format, va_list args);
// Note: returns a pointer into a shared buffer.
// The format attribute asks the compiler to check the arguments against the
// printf-style format string in parameter 1.
extern const char* fmt(const char* format, ...) __attribute__((format(printf, 1, 2)));
// Returns true if path exists and is a directory.
bool is_dir(const std::string& path);
// Returns true if path exists and is a file.
bool is_file(const std::string& path);
// Comparison callback with the qsort(3)-style signature.
// NOTE(review): presumably compares the two ints pointed to -- confirm.
extern int int_list_cmp(const void* v1, const void* v2);
// Accessors for path/plugin/prefix settings.
// NOTE(review): where these values come from (environment, configuration) is
// not visible here -- confirm against the implementation.
extern const std::string& zeek_path();
extern const char* zeek_plugin_path();
extern const char* zeek_plugin_activate();
extern std::string zeek_prefixes();
// Path helper derived from detail::SafePathOp; the outcome is exposed through
// the inherited `result` and `error` members.
// NOTE(review): presumably wraps dirname(3) -- confirm against the
// implementation.
class SafeDirname : public detail::SafePathOp {
public:
// Both constructors are explicit; error_aborts defaults to true.
explicit SafeDirname(const char* path, bool error_aborts = true);
explicit SafeDirname(const std::string& path, bool error_aborts = true);
private:
// NOTE(review): presumably the shared worker behind both constructors --
// confirm.
void DoFunc(const std::string& path, bool error_aborts = true);
};
// Path helper derived from detail::SafePathOp; the outcome is exposed through
// the inherited `result` and `error` members.
// NOTE(review): presumably wraps basename(3) -- confirm against the
// implementation.
class SafeBasename : public detail::SafePathOp {
public:
// Both constructors are explicit; error_aborts defaults to true.
explicit SafeBasename(const char* path, bool error_aborts = true);
explicit SafeBasename(const std::string& path, bool error_aborts = true);
private:
// NOTE(review): presumably the shared worker behind both constructors --
// confirm.
void DoFunc(const std::string& path, bool error_aborts = true);
};
// NOTE(review): presumably joins the elements of `v` into one string with
// `delim` (a newline by default) between them -- confirm whether a trailing
// delimiter is emitted.
std::string implode_string_vector(const std::vector<std::string>& v, const std::string& delim = "\n");
/**
* Locate a file within a given search path.
* @param filename Name of a file to find.
* @param path_set Colon-delimited set of paths to search for the file.
* @param opt_ext A filename extension/suffix to allow.
* @return Path to the found file, or an empty string if not found.
*/
std::string find_file(const std::string& filename, const std::string& path_set, const std::string& opt_ext = "");
/**
* Locate a script file within a given search path.
* @param filename Name of a file to find.
* @param path_set Colon-delimited set of paths to search for the file.
* @return Path to the found file, or an empty string if not found.
*/
std::string find_script_file(const std::string& filename, const std::string& path_set);
// Wrapper around fopen(3). Emits an error when failing to open.
FILE* open_file(const std::string& path, const std::string& mode = "r");
// Returns the current time.
// (In pseudo-realtime mode this is faked to be the start time of the
// trace plus the time interval Zeek has been running. To avoid this,
// call with real=true).
extern double current_time(bool real = false);
// Convert a time represented as a double to a timeval struct.
extern struct timeval double_to_timeval(double t);
// Return > 0 if tv_a > tv_b, 0 if equal, < 0 if tv_a < tv_b.
extern int time_compare(struct timeval* tv_a, struct timeval* tv_b);
// Returns the CPU time consumed to date.
extern double curr_CPU_time();
// Returns an integer that's very likely to be unique, even across Zeek
// instances. The integer can be drawn from different pools, which is helpful
// when the random number generator is seeded to be deterministic. In that
// case, the same sequence of integers is generated per pool.
#define UID_POOL_DEFAULT_INTERNAL 1
#define UID_POOL_DEFAULT_SCRIPT 2
#define UID_POOL_CUSTOM_SCRIPT 10 // First available custom script level pool.
extern uint64_t calculate_unique_id();
extern uint64_t calculate_unique_id(const size_t pool);
// Strict-weak-ordering functor for maps keyed by C strings: orders keys by
// their characters (via strcmp) rather than by pointer value.
struct ltstr {
    bool operator()(const char* s1, const char* s2) const {
        const int order = strcmp(s1, s2);
        return order < 0;
    }
};
// Estimates the padded size of an allocation of `size` bytes.
// We emulate glibc here (values measured on Linux i386).
// FIXME: We should better copy the portable value definitions from glibc.
constexpr size_t pad_size(size_t size) {
    constexpr size_t granule = 8;

    // glibc allocated 16 bytes anyway, but an empty request counts as 0 here.
    if ( size == 0 )
        return 0;

    // Small requests all land in the minimum chunk of two granules.
    if ( size < 12 )
        return 2 * granule;

    return (1 + (size + 3) / granule) * granule;
}
#define padded_sizeof(x) (zeek::util::pad_size(sizeof(x)))
// Like write() but handles interrupted system calls by restarting. Returns
// true if the write was successful, otherwise sets errno. This function is
// thread-safe as long as no two threads write to the same descriptor.
extern bool safe_write(int fd, const char* data, int len);
// Same as safe_write(), but for pwrite().
extern bool safe_pwrite(int fd, const unsigned char* data, size_t len, size_t offset);
// Like fsync() but handles interrupted system calls by retrying and
// aborts on unrecoverable errors.
extern bool safe_fsync(int fd);
// Wraps close(2) to emit error messages and abort on unrecoverable errors.
extern void safe_close(int fd);
// Versions of realloc/malloc which abort() on out of memory
// realloc() wrapper that calls out_of_memory("realloc") when a non-zero
// request comes back null. A null result for a zero-sized request is not
// treated as a failure.
inline void* safe_realloc(void* ptr, size_t size) {
    void* resized = realloc(ptr, size);

    if ( resized == nullptr && size != 0 )
        out_of_memory("realloc");

    return resized;
}
// malloc() wrapper that calls out_of_memory("malloc") whenever the allocation
// comes back null, including for a zero-sized request.
inline void* safe_malloc(size_t size) {
    void* const block = malloc(size);

    if ( block == nullptr )
        out_of_memory("malloc");

    return block;
}
// strncpy() variant that always NUL-terminates: copies at most n - 1
// characters from src into dest and writes a terminator at dest[n - 1].
//
// @param dest destination buffer with room for at least n bytes
// @param src NUL-terminated source string
// @param n total size of dest, including the terminator
// @return dest
//
// A zero-sized destination is left untouched. Without that guard, n - 1 would
// wrap around to SIZE_MAX, turning the call into an unbounded copy followed by
// an out-of-bounds write.
inline char* safe_strncpy(char* dest, const char* src, size_t n) {
    if ( n == 0 )
        return dest;

    char* result = strncpy(dest, src, n - 1);
    dest[n - 1] = '\0';
    return result;
}
// Memory alignment helpers.
// Returns true if x is a power of two, i.e. has exactly one bit set.
// (x - 1) & x clears the lowest set bit, so it is zero only when at most one
// bit was set; zero itself is excluded explicitly since it is not a power of
// two.
inline bool is_power_of_2(zeek_uint_t x) { return x != 0 && ((x - 1) & x) == 0; }
// Rounds the given pointer up to the nearest multiple of the
// given size, if not already a multiple.
const void* memory_align(const void* ptr, size_t size);
// Rounds the given pointer up to the nearest multiple of the
// given size, padding the skipped region with 0 bytes.
void* memory_align_and_pad(void* ptr, size_t size);
// Returns offset rounded up so it can correctly align data of the given size.
int memory_size_align(size_t offset, size_t size);
// Returns total memory allocations and (if available) amount actually
// handed out by malloc.
extern void get_memory_usage(uint64_t* total, uint64_t* malloced);
// Comparator meant to be passed as the third template argument of STL maps
// keyed by char*: it orders keys by their string contents (via strcmp), where
// the default comparator would order them by pointer value.
struct CompareString {
    bool operator()(char const* a, char const* b) const {
        return 0 > strcmp(a, b);
    }
};
/**
* Canonicalizes a name by converting it to uppercase letters and replacing
* all non-alphanumeric characters with an underscore.
* @param name The string to canonicalize.
* @return The canonicalized version of \a name, returned by value.
*/
std::string canonify_name(const std::string& name);
/**
* Reentrant version of strerror(). Takes care of the difference between the
* XSI-compliant and the GNU-specific version of strerror_r().
*/
void zeek_strerror_r(int zeek_errno, char* buf, size_t buflen);
/**
* Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used
* by the JSON writer and BIF methods.
* @param val the input string to be escaped
* @param escape_printable_controls defaults to true. NOTE(review): presumably
* selects whether printable control characters are escaped as well -- confirm
* against the implementation.
* @return the escaped string
*/
std::string json_escape_utf8(const std::string& val, bool escape_printable_controls = true);
/**
* Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used
* by the JSON writer and BIF methods.
* @param val the character data to be escaped
* @param val_size the length of the character data
* @param escape_printable_controls defaults to true. NOTE(review): presumably
* selects whether printable control characters are escaped as well -- confirm
* against the implementation.
* @return the escaped string
*/
std::string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls = true);
/**
* Checks for values that are approximately equal.
* @param a first value to compare
* @param b second value to compare
* @param tolerance how close they need to be to deem them "approximately equal"
* @return true if `a` is within the given tolerance of `b`, false otherwise
*/
bool approx_equal(double a, double b, double tolerance = std::numeric_limits<double>::epsilon());
/**
 * Splits a string at all occurrences of a delimiter. Successive occurrences
 * of the delimiter will be split into multiple pieces, so empty pieces appear
 * between adjacent delimiters as well as before a leading and after a
 * trailing one. Matching proceeds left to right and matches never overlap.
 *
 * If the delimiter is empty or does not occur, the result holds the input as
 * its only element.
 *
 * @param s the string to split
 * @param delim the delimiter to split the string on
 * @return the pieces of \a s, in order; never empty.
 *
 * \note This function is not UTF8-aware.
 */
template<typename T>
std::vector<T> split(T s, const T& delim) {
    std::vector<T> pieces;

    // An empty delimiter would match everywhere without making progress;
    // hand back the input as one piece instead.
    if ( delim.empty() ) {
        pieces.push_back(std::move(s));
        return pieces;
    }

    // Scan with a moving offset rather than repeatedly cutting off the front
    // of s. The final piece is whatever follows the last match, which is
    // empty exactly when the input ends in a (non-overlapping) delimiter.
    typename T::size_type start = 0;
    for ( ;; ) {
        const auto hit = s.find(delim, start);
        if ( hit == T::npos ) {
            pieces.push_back(s.substr(start));
            break;
        }

        pieces.push_back(s.substr(start, hit - start));
        start = hit + delim.size();
    }

    return pieces;
}
/**
 * Overload of util::split for a delimiter whose type differs from the string
 * type, as long as a T can be constructed from it. For example, this allows
 * splitting a std::string on a const char* delimiter.
 *
 * @param s the string to split
 * @param delim the delimiter to split the string on
 * @return a vector containing the separate parts of the string.
 */
template<typename T, typename U = typename T::value_type*>
std::vector<T> split(T s, U delim) {
    // Convert the delimiter to the string type, then defer to the
    // same-type overload.
    const T typed_delim{delim};
    return split(s, typed_delim);
}
/**
* Specialized version of util::split that takes a const char* string and delimiter.
*
* @param s the string to split
* @param delim the delimiter to split the string on
* @return a vector of string_view objects containing the separate parts of the string.
*/
inline std::vector<std::string_view> split(const char* s, const char* delim) {
return split(std::string_view(s), std::string_view(delim));
}
/**
* Specialized version of util::split that takes a const wchar_t* string and delimiter.
*
* @param s the string to split
* @param delim the delimiter to split the string on
* @return a vector of wstring_view objects containing the separate parts of the string.
*/
inline std::vector<std::wstring_view> split(const wchar_t* s, const wchar_t* delim) {
return split(std::wstring_view(s), std::wstring_view(delim));
}
} // namespace util
} // namespace zeek