Introduce telemetry framework

Adds base/frameworks/telemetry with wrappers around telemetry.bif
and updates telemetry/Manager to support collecting metrics from
script land.

Add policy/frameworks/telemetry/log for logging metrics data
into the new telemetry.log and telemetry_histogram.log, and load it
in local.zeek by default.
This commit is contained in:
Arne Welzel 2022-06-30 17:01:13 +02:00
parent 95fba8fd29
commit 3fe930dbf2
32 changed files with 1950 additions and 27 deletions

View file

@ -0,0 +1,116 @@
# @TEST-DOC: Exercise and list counters, gauges, and histograms through the telemetry module.
# @TEST-EXEC: zcat <$TRACES/echo-connections.pcap.gz | zeek -b -Cr - %INPUT > out
# @TEST-EXEC: btest-diff out
# @TEST-EXEC-FAIL: test -f reporter.log
@load base/frameworks/telemetry
# Counter families under the "btest" prefix. All three use the same two
# label dimensions ("x", "y") and differ only in name and help text.
global btest_a_cf = Telemetry::register_counter_family([
	$prefix = "btest", $name = "a_test",
	$unit = "1",
	$help_text = "A btest metric",
	$labels = vector("x", "y")
]);

global btest_b_cf = Telemetry::register_counter_family([
	$prefix = "btest", $name = "b_test",
	$unit = "1",
	$help_text = "Another btest metric",
	$labels = vector("x", "y")
]);

global btest_c_cf = Telemetry::register_counter_family([
	$prefix = "btest", $name = "c_test",
	$unit = "1",
	$help_text = "The last btest metric",
	$labels = vector("x", "y")
]);

# Gauge family with a single "name" label under the "system" prefix.
global system_sensor_temp_gf = Telemetry::register_gauge_family([
	$prefix = "system", $name = "sensor_temperature",
	$unit = "celsius",
	$help_text = "Temperatures reported by sensors in the system",
	$labels = vector("name")
]);

# Histogram family with five bucket bounds and a single "dim" label.
# zeek_done queries it through Telemetry::collect_histogram_metrics.
global btest_sample_histogram_hf = Telemetry::register_histogram_family([
	$prefix = "btest", $name = "sample_histogram",
	$unit = "1",
	$help_text = "A sample histogram that is not returned by Telemetry::collect_metrics",
	$bounds = vector(1.0, 2.0, 3.0, 4.0, 5.0),
	$labels = vector("dim")
]);
# Print a "### <what> |<number of metrics>|" header followed by one line
# per metric: metric type, prefix, name, label names, label values and
# value. When a metric has its count_value field set, that value is
# printed on an additional line.
function print_metrics(what: string, metrics: vector of Telemetry::Metric)
	{
	print fmt("### %s |%s|", what, |metrics|);

	for ( idx in metrics )
		{
		local metric = metrics[idx];
		local opts = metric$opts;

		print opts$metric_type, opts$prefix, opts$name, opts$labels, metric$labels, metric$value;

		# Nothing more to print for metrics without a count_value.
		if ( ! metric?$count_value )
			next;

		print "count_value", metric$count_value;
		}
	}
# Print a "### <what> |<number of metrics>|" header followed by one line
# per histogram metric: metric type, prefix, name, bounds, label names,
# label values, bucket values, sum and number of observations.
function print_histogram_metrics(what: string, metrics: vector of Telemetry::HistogramMetric)
	{
	print fmt("### %s |%s|", what, |metrics|);

	for ( idx in metrics )
		{
		local hist = metrics[idx];
		local opts = hist$opts;

		print opts$metric_type, opts$prefix, opts$name, opts$bounds, opts$labels, hist$labels, hist$values, hist$sum, hist$observations;
		}
	}
# Populate the families registered above, then collect and print metrics
# using several prefix/name patterns. The statement order is kept as-is
# because the printed output is compared against a baseline.
event zeek_done() &priority=-100
	{
	# Counters: a_test gets default increments, b_test explicit amounts,
	# c_test is set twice on the same label values.
	Telemetry::counter_family_inc(btest_a_cf, vector("a", "b"));
	Telemetry::counter_family_inc(btest_a_cf, vector("a", "c"));
	Telemetry::counter_family_inc(btest_a_cf, vector("a", "c"));

	Telemetry::counter_family_inc(btest_b_cf, vector("a", "b"), 10.0);
	Telemetry::counter_family_inc(btest_b_cf, vector("a", "c"), 20.0);

	Telemetry::counter_family_set(btest_c_cf, vector("a", "b"), 100.0);
	Telemetry::counter_family_set(btest_c_cf, vector("a", "b"), 200.0);

	# Gauges: set three sensors, then increment cpu1 and decrement cpu3.
	Telemetry::gauge_family_set(system_sensor_temp_gf, vector("cpu0"), 43.0);
	Telemetry::gauge_family_set(system_sensor_temp_gf, vector("cpu1"), 43.1);
	Telemetry::gauge_family_inc(system_sensor_temp_gf, vector("cpu1"));
	Telemetry::gauge_family_set(system_sensor_temp_gf, vector("cpu3"), 43.2);
	Telemetry::gauge_family_dec(system_sensor_temp_gf, vector("cpu3"));

	# Histogram: five observations for dim "a", two for dim "b".
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("a"), 0.5);
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("a"), 0.9);
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("a"), 1.1);
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("a"), 2.0);
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("a"), 7.0);
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("b"), 0.5);
	Telemetry::histogram_family_observe(btest_sample_histogram_hf, vector("b"), 7.0);

	# Collect with different prefix/name patterns and print each result.
	print_metrics("zeek_session_metrics",
	              Telemetry::collect_metrics("zeek", "*session*"));
	print_metrics("bt* metrics",
	              Telemetry::collect_metrics("bt*", "*"));
	print_metrics("btest_a_metrics",
	              Telemetry::collect_metrics("btest", "a_*"));
	print_metrics("btest_b_metrics",
	              Telemetry::collect_metrics("btest", "b_*"));
	print_metrics("system_metrics",
	              Telemetry::collect_metrics("system"));

	print_histogram_metrics("btest_histogram_metrics",
	                        Telemetry::collect_histogram_metrics("btest"));
	}

View file

@ -0,0 +1,48 @@
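# @TEST-DOC: Observe connection durations in histograms, with and without labels, and print the collected histogram metrics.
# @TEST-EXEC: zcat <$TRACES/echo-connections.pcap.gz | zeek -b -Cr - %INPUT > out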
# @TEST-EXEC: btest-diff out
# @TEST-EXEC-FAIL: test -f reporter.log
@load base/frameworks/telemetry
# Histogram family without labels for connection durations in seconds.
global connection_duration_hf = Telemetry::register_histogram_family([
	$prefix="zeek",
	$name="connection_duration",
	$unit="seconds",
	$help_text="Monitored connection durations",
	$bounds=vector(2.0, 3.0, 4.0, 5.0, 6.0, 10.0)
]);

# Histogram family for connection durations in seconds, labeled by
# transport protocol. All bounds are written as double literals so the
# vector does not rely on implicit count-to-double promotion.
global realistic_connection_duration_hf = Telemetry::register_histogram_family([
	$prefix="zeek",
	$name="realistic_connection_duration",
	$unit="seconds",
	$help_text="Monitored connection durations by protocol",
	$bounds=vector(0.1, 1.0, 10.0, 30.0, 60.0, 120.0, 300.0, 900.0, 1800.0),
	$labels=vector("proto")
]);

# Histogram handle obtained from the label-less family; used by
# connection_state_remove for direct observations.
global connection_duration_h = Telemetry::histogram_with(connection_duration_hf);
# Record each removed connection's duration in both histograms: once in
# the label-less histogram and once in the family keyed by the lower-cased
# transport protocol of the responder port.
event connection_state_remove(c: connection)
	{
	local duration_secs = interval_to_double(c$duration);
	local proto = to_lower(cat(get_port_transport_proto(c$id$resp_p)));

	Telemetry::histogram_observe(connection_duration_h, duration_secs);
	Telemetry::histogram_family_observe(realistic_connection_duration_hf, vector(proto), duration_secs);
	}
# Collect all "zeek" histogram metrics whose name matches
# "*connection_duration" and print their options, labels, bucket values,
# observation count and sum.
event zeek_done() &priority=-100
	{
	local collected = Telemetry::collect_histogram_metrics("zeek", "*connection_duration");

	for ( idx in collected )
		{
		local entry = collected[idx];
		local opts = entry$opts;

		print opts$metric_type, opts$prefix, opts$name;
		print opts$labels;
		print entry$labels;
		print opts$bounds;
		print entry$values;
		print entry$observations, entry$sum;
		}
	}

View file

@ -0,0 +1,43 @@
# @TEST-DOC: Query some internal broker/caf related metrics as they use the int64_t versions, too.
# @TEST-EXEC: zcat <$TRACES/echo-connections.pcap.gz | zeek -b -Cr - %INPUT > out
# @TEST-EXEC: btest-diff out
# @TEST-EXEC-FAIL: test -f reporter.log
@load base/frameworks/telemetry
# Print a "### <what> |<number of metrics>|" header followed by one line
# per histogram metric: metric type, prefix, name, bounds, label names and
# label values. When the options have count_bounds set, those are printed
# on an additional line.
function print_histogram_metrics(what: string, metrics: vector of Telemetry::HistogramMetric)
	{
	print fmt("### %s |%s|", what, |metrics|);

	for ( idx in metrics )
		{
		local hist = metrics[idx];
		local opts = hist$opts;

		print opts$metric_type, opts$prefix, opts$name, opts$bounds, opts$labels, hist$labels;

		# Don't output actual values as they are runtime dependent.
		# print hist$values, hist$sum, hist$observations;

		if ( ! opts?$count_bounds )
			next;

		print opts$count_bounds;
		}
	}
# Print a "### <what> |<number of metrics>|" header followed by one line
# per metric: metric type, prefix, name, label names, label values and
# value. When a metric has its count_value field set, that value is
# printed on an additional line.
function print_metrics(what: string, metrics: vector of Telemetry::Metric)
	{
	print fmt("### %s |%s|", what, |metrics|);

	for ( idx in metrics )
		{
		local metric = metrics[idx];
		local opts = metric$opts;

		print opts$metric_type, opts$prefix, opts$name, opts$labels, metric$labels, metric$value;

		# Nothing more to print for metrics without a count_value.
		if ( ! metric?$count_value )
			next;

		print "count_value", metric$count_value;
		}
	}
# Collect and print the metrics under the "broker" prefix and under
# prefixes matching "caf*", first plain metrics and then histograms.
event zeek_done() &priority=-100
	{
	print_metrics("broker", Telemetry::collect_metrics("broker", "*"));
	print_metrics("caf", Telemetry::collect_metrics("caf*", "*"));
	print_histogram_metrics("caf", Telemetry::collect_histogram_metrics("caf*", "*"));
	}