From f396c2b16ed1a56627e0b189f3d4d011d0e46829 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 23 May 2023 15:33:12 +0200 Subject: [PATCH] stats: Add zeek-net-packet-lag-seconds metric While writing documentation about troubleshooting and looking a bit at the older stats.log, I realized the packet lag is not exposed as a telemetry metric. Add it. Example from a Zeek instance whose network time lags wallclock time by ~6 seconds because it is heavily overloaded: zeek_net_packet_lag_seconds{endpoint=""} 6.169406 1684848998092 --- NEWS | 12 ++++++++++++ scripts/policy/misc/stats.zeek | 23 +++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index c80c94584b..41a0765aac 100644 --- a/NEWS +++ b/NEWS @@ -161,6 +161,18 @@ New Functionality - Add logging metrics for streams (``zeek-log-stream-writes``) and writers (``zeek-log-writer-writes-total``). +- Add networking metrics via the telemetry framework. These are enabled + when the ``misc/stats`` script is loaded. + + zeek-net-dropped-packets + zeek-net-link-packets + zeek-net-received-bytes + zeek-net-packet-lag-seconds + zeek-net-received-packets-total + + Except for lag, metrics originate from the ``get_net_stats()`` bif and are + updated through the ``Telemetry::sync()`` hook every 15 seconds by default. + - The DNS analyzer now parses RFC 2535's AD ("authentic data") and CD ("checking disabled") flags from DNS requests and responses, making them available in the ``dns_msg`` record provided by many of the ``dns_*`` events.
The existing diff --git a/scripts/policy/misc/stats.zeek b/scripts/policy/misc/stats.zeek index d64406e217..4562f91a93 100644 --- a/scripts/policy/misc/stats.zeek +++ b/scripts/policy/misc/stats.zeek @@ -123,22 +123,33 @@ global packets_filtered_cf = Telemetry::register_counter_family([ $help_text="Total number of packets filtered", ]); +global packet_lag_gf = Telemetry::register_gauge_family([ + $prefix="zeek", + $name="net-packet-lag", + $unit="seconds", + $help_text="Difference of network time and wallclock time in seconds.", +]); + +global no_labels: vector of string; + hook Telemetry::sync() { local net_stats = get_net_stats(); - Telemetry::counter_family_set(bytes_received_cf, vector(), net_stats$bytes_recvd); - Telemetry::counter_family_set(packets_received_cf, vector(), net_stats$pkts_recvd); + Telemetry::counter_family_set(bytes_received_cf, no_labels, net_stats$bytes_recvd); + Telemetry::counter_family_set(packets_received_cf, no_labels, net_stats$pkts_recvd); if ( reading_live_traffic() ) { - Telemetry::counter_family_set(packets_dropped_cf, vector(), net_stats$pkts_dropped); - Telemetry::counter_family_set(link_packets_cf, vector(), net_stats$pkts_link); + Telemetry::counter_family_set(packets_dropped_cf, no_labels, net_stats$pkts_dropped); + Telemetry::counter_family_set(link_packets_cf, no_labels, net_stats$pkts_link); if ( net_stats?$pkts_filtered ) - Telemetry::counter_family_set(packets_filtered_cf, vector(), net_stats$pkts_filtered); + Telemetry::counter_family_set(packets_filtered_cf, no_labels, net_stats$pkts_filtered); + + Telemetry::gauge_family_set(packet_lag_gf, no_labels, + interval_to_double(current_time() - network_time())); } } - event zeek_init() &priority=5 { Log::create_stream(Stats::LOG, [$columns=Info, $ev=log_stats, $path="stats", $policy=log_policy]);