Introduce telemetry framework

Adds base/frameworks/telemetry with wrappers around telemetry.bif
and updates telemetry/Manager to support collecting metrics from
script land.

Add policy/frameworks/telemetry/log, which logs metrics data into
a new telemetry.log and telemetry_histogram.log, and add it to
local.zeek by default.
This commit is contained in:
Arne Welzel 2022-06-30 17:01:13 +02:00
parent 95fba8fd29
commit 3fe930dbf2
32 changed files with 1950 additions and 27 deletions

View file

@ -0,0 +1,199 @@
##! Implementation of a telemetry.log and telemetry_histogram.log file
##! using metrics accessible via the Telemetry module.
@load base/frameworks/telemetry
module Telemetry;
export {
## Identifiers of the two log streams created by this script:
## ``LOG`` for counter and gauge metrics (``telemetry.log``) and
## ``LOG_HISTOGRAM`` for histogram metrics (``telemetry_histogram.log``).
redef enum Log::ID += { LOG, LOG_HISTOGRAM };
## How often metrics are reported. This is the delay used when
## scheduling the next periodic logging run. Independent of this
## interval, metrics are logged once more during shutdown.
option log_interval = 60sec;
## Only metrics with prefixes in this set will be included in the
## `telemetry.log` and `telemetry_histogram.log` files by default.
## Setting this option to an empty set includes all prefixes.
##
## For more fine-grained customization, setting this option to an
## empty set and implementing the :zeek:see:`Telemetry::log_policy`
## and :zeek:see:`Telemetry::log_policy_histogram` hooks to filter
## individual records is recommended.
option log_prefixes: set[string] = {"process", "zeek"};
## Record type used for logging counter and gauge metrics.
## One record is written per collected non-histogram metric.
type Info: record {
## Timestamp of reporting (network time of the logging run).
ts: time &log;
## Peer that generated this log (the value of ``peer_description``).
peer: string &log;
## Contains the value "counter" or "gauge" depending on
## the underlying metric type, or "unknown" as a fallback
## for any other metric type.
metric_type: string &log;
## The prefix (namespace) of the metric.
prefix: string &log;
## The name of the metric.
name: string &log;
## The unit of this metric, or unset if unit-less.
## A unit of "1" is treated as unit-less.
unit: string &log &optional;
## The names of the individual labels.
labels: vector of string &log;
## The values of the labels as listed in ``labels``.
label_values: vector of string &log;
## The value of this metric.
value: double &log;
};
## Record type used for logging histogram metrics.
## One record is written per collected histogram metric.
type HistogramInfo: record {
## Timestamp of reporting (network time of the logging run).
ts: time &log;
## Peer that generated this log (the value of ``peer_description``).
peer: string &log;
## The prefix (namespace) of the metric.
prefix: string &log;
## The name of the metric.
name: string &log;
## The unit of this metric, or unset if unit-less.
## A unit of "1" is treated as unit-less.
unit: string &log &optional;
## The names of the individual labels.
labels: vector of string &log;
## The values of the labels as listed in ``labels``.
label_values: vector of string &log;
## The bounds of the individual buckets.
bounds: vector of double &log;
## The number of observations within each individual bucket.
values: vector of double &log;
## The sum over all observations.
sum: double &log;
## The total number of observations.
observations: double &log;
};
## A default logging policy hook for the stream.
global log_policy: Log::PolicyHook;
## A default logging policy hook for the histogram stream.
global log_policy_histogram: Log::PolicyHook;
## Event triggered for every record in the stream.
global log_telemetry: event(rec: Info);
## Event triggered for every record in the histogram stream.
global log_telemetry_histogram: event(rec: HistogramInfo);
}
# Takes a snapshot of all metrics and writes them out: counters and gauges
# go to the LOG stream, histograms go to the LOG_HISTOGRAM stream. Metrics
# whose prefix is not listed in log_prefixes are dropped, unless that set
# is empty, in which case nothing is filtered by prefix.
function do_log()
	{
	local now = network_time();
	local collected = Telemetry::collect_metrics();

	for ( idx in collected )
		{
		local metric = collected[idx];
		local mt = metric$opts$metric_type;

		# A histogram has no single value to report; histograms
		# are handled separately further below.
		if ( mt == DOUBLE_HISTOGRAM || mt == INT_HISTOGRAM )
			next;

		# Prefix filtering is only active for a non-empty set.
		if ( |log_prefixes| != 0 && metric$opts$prefix !in log_prefixes )
			next;

		# Short textual form of the metric type. Ending up with
		# "unknown" shouldn't really happen, but let's keep it
		# as a fallback.
		local kind_str = "unknown";

		if ( mt == DOUBLE_COUNTER || mt == INT_COUNTER )
			kind_str = "counter";
		else if ( mt == DOUBLE_GAUGE || mt == INT_GAUGE )
			kind_str = "gauge";

		local entry: Info = [
			$ts=now,
			$peer=peer_description,
			$metric_type=kind_str,
			$prefix=metric$opts$prefix,
			$name=metric$opts$name,
			$labels=metric$opts$labels,
			$label_values=metric$labels,
			$value=metric$value
		];

		# A unit of "1" denotes a unit-less metric: leave the
		# optional field unset in that case.
		if ( metric$opts$unit != "1" )
			entry$unit = metric$opts$unit;

		Log::write(LOG, entry);
		}

	# Second pass: histogram metrics.
	now = network_time();
	local histograms = Telemetry::collect_histogram_metrics();

	for ( hidx in histograms )
		{
		local hist = histograms[hidx];

		if ( |log_prefixes| != 0 && hist$opts$prefix !in log_prefixes )
			next;

		local hist_entry: HistogramInfo = [
			$ts=now,
			$peer=peer_description,
			$prefix=hist$opts$prefix,
			$name=hist$opts$name,
			$labels=hist$opts$labels,
			$label_values=hist$labels,
			$bounds=hist$opts$bounds,
			$values=hist$values,
			$sum=hist$sum,
			$observations=hist$observations
		];

		# Same unit-less convention as for counters and gauges.
		if ( hist$opts$unit != "1" )
			hist_entry$unit = hist$opts$unit;

		Log::write(LOG_HISTOGRAM, hist_entry);
		}
	}
# Periodic logging tick: writes the current metrics and re-arms itself
# after log_interval.
event Telemetry::log()
	{
	# While Zeek is terminating neither log nor reschedule: the
	# zeek_done() handler explicitly writes the final report.
	if ( ! zeek_is_terminating() )
		{
		do_log();
		schedule log_interval { Telemetry::log() };
		}
	}
# Creates both log streams and schedules the first periodic logging run.
event zeek_init() &priority=5
	{
	# Stream for counter and gauge metrics (telemetry.log).
	local metrics_stream: Log::Stream = [
		$columns=Info,
		$ev=log_telemetry,
		$path="telemetry",
		$policy=log_policy
	];
	Log::create_stream(LOG, metrics_stream);

	# Stream for histogram metrics (telemetry_histogram.log).
	local histogram_stream: Log::Stream = [
		$columns=HistogramInfo,
		$ev=log_telemetry_histogram,
		$path="telemetry_histogram",
		$policy=log_policy_histogram
	];
	Log::create_stream(LOG_HISTOGRAM, histogram_stream);

	# Kick off the periodic reporting.
	schedule log_interval { Telemetry::log() };
	}
# Write one last report while shutting down. The very low priority makes
# this handler run late during zeek_done(); metric updates happening after
# it has run will not show up in the log anymore.
event zeek_done() &priority=-1000
	{
	do_log();
	}