diff --git a/CHANGES b/CHANGES
index 8c12ab7b5f..14553a9fe4 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+8.0.0-dev.76 | 2025-05-08 15:07:09 -0700
+
+  * Bugfix: accurately track Broker buffer overflows w/ multiple peerings (Christian Kreibich, Corelight)
+
 8.0.0-dev.74 | 2025-05-08 13:46:54 -0700
 
   * Downgrade broker clone FatalError to an Error (Tim Wojtulewicz, Corelight)
diff --git a/VERSION b/VERSION
index b578bb24ef..d71cce4e6f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-8.0.0-dev.74
+8.0.0-dev.76
diff --git a/scripts/base/frameworks/cluster/broker-telemetry.zeek b/scripts/base/frameworks/cluster/broker-telemetry.zeek
index 7aa1e8fb3f..913bf1ee08 100644
--- a/scripts/base/frameworks/cluster/broker-telemetry.zeek
+++ b/scripts/base/frameworks/cluster/broker-telemetry.zeek
@@ -44,26 +44,61 @@ global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([
 	$help_text="Number of overflows in Broker's send buffers",
 ]);
 
+
+# A helper to track overflow counts over past peerings as well as the current
+# one. The peer_id field allows us to identify when the counter has reset: a
+# Broker ID different from the one on file means it's a new peering.
+type EpochData: record {
+	peer_id: string; # Broker ID of the peering currently on file for this node.
+	num_overflows: count &default=0; # Overflow count last reported for that peering.
+	num_past_overflows: count &default=0; # Sum of the final counts seen for earlier peerings.
+};
+
+# This maps from a cluster node name to its EpochData.
+global peering_epoch_data: table[string] of EpochData;
+
 hook Telemetry::sync()
 	{
 	local peers = Broker::peering_stats();
 	local nn: NamedNode;
+	local labels: vector of string;
+	local ed: EpochData;
 
-	for ( peer, stats in peers )
+	for ( peer_id, stats in peers )
 		{
 		# Translate the Broker IDs to Zeek-level node names. We skip
 		# telemetry for peers where this mapping fails, i.e. ones for
 		# connections to external systems.
-		nn = nodeid_to_node(peer);
+		nn = nodeid_to_node(peer_id);
 
-		if ( |nn$name| > 0 )
+		if ( |nn$name| == 0 )
+			next;
+
+		labels = vector(nn$name);
+
+		Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
+		    labels, stats$num_queued);
+		Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
+		    labels, stats$max_queued_recently);
+
+		if ( nn$name !in peering_epoch_data )
+			peering_epoch_data[nn$name] = EpochData($peer_id=peer_id);
+
+		ed = peering_epoch_data[nn$name]; # Records are references: updates to ed below land in the table entry.
+
+		if ( peer_id != ed$peer_id )
 			{
-			Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
-			    vector(nn$name), stats$num_queued);
-			Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
-			    vector(nn$name), stats$max_queued_recently);
-			Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
-			    vector(nn$name), stats$num_overflows);
+			# A new peering. Ensure that we account for overflows in
+			# past ones. There is a risk here that we might have
+			# missed a peering altogether if we scrape infrequently,
+			# but re-peering should be a rare event.
+			ed$peer_id = peer_id;
+			ed$num_past_overflows += ed$num_overflows;
 			}
+
+		ed$num_overflows = stats$num_overflows; # Count reported for the current peering.
+
+		Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
+		    labels, ed$num_past_overflows + ed$num_overflows);
 		}
 	}