Merge branch 'topic/christian/fix-broker-peering-overflows-metric'

* topic/christian/fix-broker-peering-overflows-metric: Bugfix: accurately track Broker buffer overflows w/ multiple peerings
2025-10-02 14:48:21 +00:00 · 2025-05-08 15:07:09 -07:00 · 2025-05-08 15:07:09 -07:00 · 8d79429555
commit 8d79429555
parent 2c17c85f55 738ce1c235
3 changed files with 49 additions and 10 deletions
--- a/4
+++ b/4
@@ -1,3 +1,7 @@
+8.0.0-dev.76 | 2025-05-08 15:07:09 -0700
+
+  * Bugfix: accurately track Broker buffer overflows w/ multiple peerings (Christian Kreibich, Corelight)
+
 8.0.0-dev.74 | 2025-05-08 13:46:54 -0700

  * Downgrade broker clone FatalError to an Error (Tim Wojtulewicz, Corelight)
--- a/2
+++ b/2
@@ -1 +1 @@
-8.0.0-dev.74
+8.0.0-dev.76
--- a/scripts/base/frameworks/cluster/broker-telemetry.zeek
+++ b/scripts/base/frameworks/cluster/broker-telemetry.zeek
@@ -44,26 +44,61 @@ global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([
    $help_text="Number of overflows in Broker's send buffers",
 ]);

+
+# A helper to track overflow counts over past peerings as well as the current
+# one.  The peer_id field allows us to identify when the counter has reset: a
+# Broker ID different from the one on file means it's a new peering.
+type EpochData: record {
+	peer_id: string;
+	num_overflows: count &default=0;
+	num_past_overflows: count &default=0;
+};
+
+# This maps from a cluster node name to its EpochData.
+global peering_epoch_data: table[string] of EpochData;
+
 hook Telemetry::sync()
 	{
 	local peers = Broker::peering_stats();
 	local nn: NamedNode;
+	local labels: vector of string;
+	local ed: EpochData;

-	for ( peer, stats in peers )
+	for ( peer_id, stats in peers )
 		{
 		# Translate the Broker IDs to Zeek-level node names. We skip
 		# telemetry for peers where this mapping fails, i.e. ones for
 		# connections to external systems.
-		nn = nodeid_to_node(peer);
+		nn = nodeid_to_node(peer_id);
+
+		if ( |nn$name| == 0 )
+			next;
+
+		labels = vector(nn$name);

-		if ( |nn$name| > 0 )
-			{
 		Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
-			    vector(nn$name), stats$num_queued);
+		    labels, stats$num_queued);
 		Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
-			    vector(nn$name), stats$max_queued_recently);
+		    labels, stats$max_queued_recently);
+
+		if ( nn$name !in peering_epoch_data )
+			peering_epoch_data[nn$name] = EpochData($peer_id=peer_id);
+
+		ed = peering_epoch_data[nn$name];
+
+		if ( peer_id != ed$peer_id )
+			{
+			# A new peering. Ensure that we account for overflows in
+			# past ones. There is a risk here that we might have
+			# missed a peering altogether if we scrape infrequently,
+			# but re-peering should be a rare event.
+			ed$peer_id = peer_id;
+			ed$num_past_overflows += ed$num_overflows;
+			}
+
+		ed$num_overflows = stats$num_overflows;
+
 		Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
-			    vector(nn$name), stats$num_overflows);
-			}
+		    labels, ed$num_past_overflows + ed$num_overflows);
 		}
 	}