From 88a0cda8ca33b3912478df0ab1afcb4a273a1ee0 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Wed, 16 Apr 2025 18:00:28 -0700 Subject: [PATCH] Add cluster framework telemetry for Broker's send-buffer use This hooks into Telemetry::sync() to update Broker-level metrics tracking the peerings' send buffer state. We do this in the cluster framework so we can label the resulting metrics with Zeek cluster node names, not Broker's endpoint IDs. --- scripts/base/frameworks/cluster/__load__.zeek | 5 +- .../frameworks/cluster/broker-telemetry.zeek | 69 +++++++++++++++++++ .../coverage.init-default/missing_loads | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 scripts/base/frameworks/cluster/broker-telemetry.zeek diff --git a/scripts/base/frameworks/cluster/__load__.zeek b/scripts/base/frameworks/cluster/__load__.zeek index 0d6372e3d4..85fef40c5f 100644 --- a/scripts/base/frameworks/cluster/__load__.zeek +++ b/scripts/base/frameworks/cluster/__load__.zeek @@ -14,8 +14,11 @@ redef Broker::log_topic = Cluster::rr_log_topic; # Add a cluster prefix. @prefixes += cluster -# This should soon condition on loading only when Broker is in use. +# Broker-specific additions: +@if ( Cluster::backend == Cluster::CLUSTER_BACKEND_BROKER ) @load ./broker-backpressure +@load ./broker-telemetry +@endif @if ( Supervisor::is_supervised() ) # When running a supervised cluster, populate Cluster::nodes from the node table diff --git a/scripts/base/frameworks/cluster/broker-telemetry.zeek b/scripts/base/frameworks/cluster/broker-telemetry.zeek new file mode 100644 index 0000000000..7aa1e8fb3f --- /dev/null +++ b/scripts/base/frameworks/cluster/broker-telemetry.zeek @@ -0,0 +1,69 @@ +# Additional Broker-specific metrics that use Zeek cluster-level node names. + +@load base/frameworks/telemetry + +module Cluster; + +## This gauge tracks the current number of locally queued messages in each +## Broker peering's send buffer. The "peer" label identifies the remote side of +## the peering, containing a Zeek cluster node name. +global broker_peer_buffer_messages_gf = Telemetry::register_gauge_family([ + $prefix="zeek", + $name="broker-peer-buffer-messages", + $unit="", + $label_names=vector("peer"), + $help_text="Number of messages queued in Broker's send buffers", +]); + +## This gauge tracks recent maximum queue lengths for each Broker peering's send +## buffer. Most of the time the send buffers are nearly empty, so this gauge +## helps understand recent bursts of messages. "Recent" here means +## :zeek:see:`Broker::buffer_stats_reset_interval`. The time window advances in +## increments of at least the stats interval, not incrementally with every new +## observed message. That is, Zeek keeps a timestamp of when the window started, +## and once it notices that the interval has passed, it moves the start of the +## window to current time. +global broker_peer_buffer_recent_max_messages_gf = Telemetry::register_gauge_family([ + $prefix="zeek", + $name="broker-peer-buffer-recent-max-messages", + $unit="", + $label_names=vector("peer"), + $help_text="Maximum number of messages recently queued in Broker's send buffers", +]); + +## This counter tracks for each Broker peering the number of times its send +## buffer has overflowed. For the "disconnect" policy this can at most be 1, +## since Broker stops the peering at this time. For the "drop_oldest" and +## "drop_newest" policies (see :zeek:see:`Broker:peer_overflow_policy`) the count +## instead reflects the number of messages lost. +global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([ + $prefix="zeek", + $name="broker-peer-buffer-overflows", + $unit="", + $label_names=vector("peer"), + $help_text="Number of overflows in Broker's send buffers", +]); + +hook Telemetry::sync() + { + local peers = Broker::peering_stats(); + local nn: NamedNode; + + for ( peer, stats in peers ) + { + # Translate the Broker IDs to Zeek-level node names. We skip + # telemetry for peers where this mapping fails, i.e. ones for + # connections to external systems. + nn = nodeid_to_node(peer); + + if ( |nn$name| > 0 ) + { + Telemetry::gauge_family_set(broker_peer_buffer_messages_gf, + vector(nn$name), stats$num_queued); + Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf, + vector(nn$name), stats$max_queued_recently); + Telemetry::counter_family_set(broker_peer_buffer_overflows_cf, + vector(nn$name), stats$num_overflows); + } + } + } diff --git a/testing/btest/Baseline/coverage.init-default/missing_loads b/testing/btest/Baseline/coverage.init-default/missing_loads index 9997ec4fd8..93af34614e 100644 --- a/testing/btest/Baseline/coverage.init-default/missing_loads +++ b/testing/btest/Baseline/coverage.init-default/missing_loads @@ -1,6 +1,7 @@ ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. -./frameworks/cluster/broker-backpressure.zeek -./frameworks/cluster/broker-stores.zeek +-./frameworks/cluster/broker-telemetry.zeek -./frameworks/cluster/nodes/logger.zeek -./frameworks/cluster/nodes/manager.zeek -./frameworks/cluster/nodes/proxy.zeek