Merge branch 'topic/christian/fix-broker-peering-overflows-metric'

* topic/christian/fix-broker-peering-overflows-metric:
  Bugfix: accurately track Broker buffer overflows w/ multiple peerings
This commit is contained in:
Christian Kreibich 2025-05-08 15:07:09 -07:00
commit 8d79429555
3 changed files with 49 additions and 10 deletions

View file

@@ -1,3 +1,7 @@
8.0.0-dev.76 | 2025-05-08 15:07:09 -0700
* Bugfix: accurately track Broker buffer overflows w/ multiple peerings (Christian Kreibich, Corelight)
8.0.0-dev.74 | 2025-05-08 13:46:54 -0700
* Downgrade broker clone FatalError to an Error (Tim Wojtulewicz, Corelight)

View file

@@ -1 +1 @@
8.0.0-dev.74
8.0.0-dev.76

View file

@@ -44,26 +44,61 @@ global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([
$help_text="Number of overflows in Broker's send buffers",
]);
# A helper to track overflow counts over past peerings as well as the current
# one. The peer_id field allows us to identify when the counter has reset: a
# Broker ID different from the one on file means it's a new peering.
type EpochData: record {
peer_id: string;
num_overflows: count &default=0;
num_past_overflows: count &default=0;
};
# This maps from a cluster node name to its EpochData.
global peering_epoch_data: table[string] of EpochData;
hook Telemetry::sync()
{
local peers = Broker::peering_stats();
local nn: NamedNode;
local labels: vector of string;
local ed: EpochData;
for ( peer, stats in peers )
for ( peer_id, stats in peers )
{
# Translate the Broker IDs to Zeek-level node names. We skip
# telemetry for peers where this mapping fails, i.e. ones for
# connections to external systems.
nn = nodeid_to_node(peer);
nn = nodeid_to_node(peer_id);
if ( |nn$name| == 0 )
next;
labels = vector(nn$name);
if ( |nn$name| > 0 )
{
Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
vector(nn$name), stats$num_queued);
labels, stats$num_queued);
Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
vector(nn$name), stats$max_queued_recently);
labels, stats$max_queued_recently);
if ( nn$name !in peering_epoch_data )
peering_epoch_data[nn$name] = EpochData($peer_id=peer_id);
ed = peering_epoch_data[nn$name];
if ( peer_id != ed$peer_id )
{
# A new peering. Ensure that we account for overflows in
# past ones. There is a risk here that we might have
# missed a peering altogether if we scrape infrequently,
# but re-peering should be a rare event.
ed$peer_id = peer_id;
ed$num_past_overflows += ed$num_overflows;
}
ed$num_overflows = stats$num_overflows;
Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
vector(nn$name), stats$num_overflows);
}
labels, ed$num_past_overflows + ed$num_overflows);
}
}