diff --git a/scripts/base/frameworks/cluster/broker-telemetry.zeek b/scripts/base/frameworks/cluster/broker-telemetry.zeek index 7aa1e8fb3f..913bf1ee08 100644 --- a/scripts/base/frameworks/cluster/broker-telemetry.zeek +++ b/scripts/base/frameworks/cluster/broker-telemetry.zeek @@ -44,26 +44,61 @@ global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([ $help_text="Number of overflows in Broker's send buffers", ]); + +# A helper to track overflow counts over past peerings as well as the current +# one. The peer_id field allows us to identify when the counter has reset: a +# Broker ID different from the one on file means it's a new peering. +type EpochData: record { + peer_id: string; + num_overflows: count &default=0; + num_past_overflows: count &default=0; +}; + +# This maps from a cluster node name to its EpochData. +global peering_epoch_data: table[string] of EpochData; + hook Telemetry::sync() { local peers = Broker::peering_stats(); local nn: NamedNode; + local labels: vector of string; + local ed: EpochData; - for ( peer, stats in peers ) + for ( peer_id, stats in peers ) { # Translate the Broker IDs to Zeek-level node names. We skip # telemetry for peers where this mapping fails, i.e. ones for # connections to external systems. - nn = nodeid_to_node(peer); + nn = nodeid_to_node(peer_id); - if ( |nn$name| > 0 ) + if ( |nn$name| == 0 ) + next; + + labels = vector(nn$name); + + Telemetry::gauge_family_set(broker_peer_buffer_messages_gf, + labels, stats$num_queued); + Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf, + labels, stats$max_queued_recently); + + if ( nn$name !in peering_epoch_data ) + peering_epoch_data[nn$name] = EpochData($peer_id=peer_id); + + ed = peering_epoch_data[nn$name]; + + if ( peer_id != ed$peer_id ) { - Telemetry::gauge_family_set(broker_peer_buffer_messages_gf, - vector(nn$name), stats$num_queued); - Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf, - vector(nn$name), stats$max_queued_recently); - Telemetry::counter_family_set(broker_peer_buffer_overflows_cf, - vector(nn$name), stats$num_overflows); + # A new peering. Ensure that we account for overflows in + # past ones. There is a risk here that we might have + # missed a peering altogether if we scrape infrequently, + # but re-peering should be a rare event. + ed$peer_id = peer_id; + ed$num_past_overflows += ed$num_overflows; } + + ed$num_overflows = stats$num_overflows; + + Telemetry::counter_family_set(broker_peer_buffer_overflows_cf, + labels, ed$num_past_overflows + ed$num_overflows); } }