From ace5c1104897ca5d1753c45c15c39784ed044b5d Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Wed, 7 May 2025 16:40:02 -0700
Subject: [PATCH] Bugfix: accurately track Broker buffer overflows w/ multiple
 peerings

When a node restarts, or a peering between two nodes starts over for
other reasons, the Broker manager resets its internal tracking state
(it is kept per-peering), and with it the message overflow counter.
The script layer was unaware of this and threw errors when it tried to
reset the corresponding counter metric down to zero at sync time.

We now track past buffer overflows in a separate epoch table, using
Broker peer ID comparisons to detect new peerings, and set the counter
to the sum of past and current overflows. I considered just making
this a gauge, but it seems more helpful to have a counter that shows
whether any messages have ever been dropped over the lifetime of the
node process.

As an aside, this also avoids repeatedly creating the labels vector,
reusing the same one for each metric.

Thanks to @pbcullen for identifying this one!
---
 .../frameworks/cluster/broker-telemetry.zeek  | 53 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/scripts/base/frameworks/cluster/broker-telemetry.zeek b/scripts/base/frameworks/cluster/broker-telemetry.zeek
index 7aa1e8fb3f..913bf1ee08 100644
--- a/scripts/base/frameworks/cluster/broker-telemetry.zeek
+++ b/scripts/base/frameworks/cluster/broker-telemetry.zeek
@@ -44,26 +44,61 @@ global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([
 	$help_text="Number of overflows in Broker's send buffers",
 	]);
 
+
+# A helper to track overflow counts over past peerings as well as the current
+# one. The peer_id field allows us to identify when the counter has reset: a
+# Broker ID different from the one on file means it's a new peering.
+type EpochData: record {
+	peer_id: string;
+	num_overflows: count &default=0;
+	num_past_overflows: count &default=0;
+};
+
+# This maps from a cluster node name to its EpochData.
+global peering_epoch_data: table[string] of EpochData;
+
 hook Telemetry::sync()
 	{
 	local peers = Broker::peering_stats();
 	local nn: NamedNode;
+	local labels: vector of string;
+	local ed: EpochData;
 
-	for ( peer, stats in peers )
+	for ( peer_id, stats in peers )
 		{
 		# Translate the Broker IDs to Zeek-level node names. We skip
 		# telemetry for peers where this mapping fails, i.e. ones for
 		# connections to external systems.
-		nn = nodeid_to_node(peer);
+		nn = nodeid_to_node(peer_id);
 
-		if ( |nn$name| > 0 )
+		if ( |nn$name| == 0 )
+			next;
+
+		labels = vector(nn$name);
+
+		Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
+		    labels, stats$num_queued);
+		Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
+		    labels, stats$max_queued_recently);
+
+		if ( nn$name !in peering_epoch_data )
+			peering_epoch_data[nn$name] = EpochData($peer_id=peer_id);
+
+		ed = peering_epoch_data[nn$name];
+
+		if ( peer_id != ed$peer_id )
 			{
-			Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
-			    vector(nn$name), stats$num_queued);
-			Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
-			    vector(nn$name), stats$max_queued_recently);
-			Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
-			    vector(nn$name), stats$num_overflows);
+			# A new peering. Ensure that we account for overflows in
+			# past ones. There is a risk here that we might have
+			# missed a peering altogether if we scrape infrequently,
+			# but re-peering should be a rare event.
+			ed$peer_id = peer_id;
+			ed$num_past_overflows += ed$num_overflows;
 			}
+
+		ed$num_overflows = stats$num_overflows;
+
+		Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
+		    labels, ed$num_past_overflows + ed$num_overflows);
 		}
 	}
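
To illustrate the epoch accounting, here is a minimal standalone sketch that
walks the logic above through a simulated re-peering. The node name, peer IDs,
and overflow counts are invented for the example, and the observe() helper is
hypothetical; the patched hook performs the same arithmetic against the output
of Broker::peering_stats().

# Per-node epoch state, mirroring the patch's EpochData record.
type EpochData: record {
	peer_id: string;
	num_overflows: count &default=0;
	num_past_overflows: count &default=0;
};

global epochs: table[string] of EpochData;

# Fold one scrape's per-peering overflow count into the node's epoch
# state and return the value the counter metric would be set to.
function observe(node: string, peer_id: string, overflows: count): count
	{
	if ( node !in epochs )
		epochs[node] = EpochData($peer_id=peer_id);

	# Records have reference semantics, so mutating ed updates the
	# table entry in place.
	local ed = epochs[node];

	if ( peer_id != ed$peer_id )
		{
		# A new peering: bank the previous epoch's overflows.
		ed$peer_id = peer_id;
		ed$num_past_overflows += ed$num_overflows;
		}

	ed$num_overflows = overflows;
	return ed$num_past_overflows + ed$num_overflows;
	}

event zeek_init()
	{
	print observe("worker-1", "id-A", 3); # 3
	print observe("worker-1", "id-A", 5); # 5
	# Re-peering: Broker's per-peering count restarts at 2, but the
	# reported total stays monotonic: 5 banked + 2 current = 7.
	print observe("worker-1", "id-B", 2); # 7
	}

Running this through zeek should print 3, 5, and 7: across the re-peering the
metric keeps growing instead of dropping back to 2, which is exactly why the
counter no longer triggers errors at sync time.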