zeek/scripts/base/frameworks/cluster/broker-telemetry.zeek

# Additional Broker-specific metrics that use Zeek cluster-level node names.

@load base/frameworks/telemetry

module Cluster;

## Gauge family reporting how many messages are currently queued locally in
## the send buffer of each Broker peering. Its single "peer" label carries the
## Zeek cluster node name of the remote end of the peering.
global broker_peer_buffer_messages_gf = Telemetry::register_gauge_family(
    Telemetry::MetricOpts(
        $prefix="zeek",
        $name="broker-peer-buffer-messages",
        $unit="",
        $label_names=vector("peer"),
        $help_text="Number of messages queued in Broker's send buffers"
    )
);

## Gauge family reporting the recent peak queue length of each Broker
## peering's send buffer. Send buffers are nearly empty most of the time, so
## this gauge is what reveals recent message bursts. "Recent" refers to
## :zeek:see:`Broker::buffer_stats_reset_interval`. The observation window does
## not slide with every observed message; it advances in steps of at least the
## stats interval: Zeek remembers when the window started and, once it notices
## that the interval has elapsed, restarts the window at the current time.
global broker_peer_buffer_recent_max_messages_gf = Telemetry::register_gauge_family(
    Telemetry::MetricOpts(
        $prefix="zeek",
        $name="broker-peer-buffer-recent-max-messages",
        $unit="",
        $label_names=vector("peer"),
        $help_text="Maximum number of messages recently queued in Broker's send buffers"
    )
);

## This counter tracks for each Broker peering the number of times its send
## buffer has overflowed. For the "disconnect" policy this can at most be 1,
## since Broker stops the peering at this time. For the "drop_oldest" and
## "drop_newest" policies (see :zeek:see:`Broker::peer_overflow_policy`) the
## count instead reflects the number of messages lost. The "peer" label
## identifies the remote side of the peering by its Zeek cluster node name.
global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([
    $prefix="zeek",
    $name="broker-peer-buffer-overflows",
    $unit="",
    $label_names=vector("peer"),
    $help_text="Number of overflows in Broker's send buffers",
]);


# A helper to track overflow counts over past peerings as well as the current
# one.  The peer_id field allows us to identify when the counter has reset: a
# Broker ID different from the one on file means it's a new peering.
type EpochData: record {
	# Broker ID of the peering that num_overflows currently refers to.
	peer_id: string;
	# Overflow count most recently reported for the current peering.
	num_overflows: count &default=0;
	# Sum of the final overflow counts observed for earlier peerings with
	# the same cluster node; added to num_overflows when reporting.
	num_past_overflows: count &default=0;
};

# This maps from a cluster node name to its EpochData. Entries are created
# the first time the Telemetry::sync hook in this file sees a node and are
# updated in place on each later run of that hook.
global peering_epoch_data: table[string] of EpochData;

# Refreshes the Broker send-buffer metrics from the current peering
# statistics: two gauges (queued messages, recent maximum) and one cumulative
# overflow counter per peer that maps to a Zeek cluster node.
hook Telemetry::sync()
	{
	local epoch: EpochData;
	local all_peerings = Broker::peering_stats();

	for ( broker_id, peering in all_peerings )
		{
		# Metrics are labeled with Zeek node names, not Broker IDs.
		# Peers without a node name (connections to external
		# systems) are left out of the telemetry entirely.
		local remote = nodeid_to_node(broker_id);
		local node_name = remote$name;

		if ( node_name == "" )
			next;

		local label_values = vector(node_name);

		Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
		    label_values, peering$num_queued);
		Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
		    label_values, peering$max_queued_recently);

		if ( node_name in peering_epoch_data )
			{
			epoch = peering_epoch_data[node_name];

			if ( epoch$peer_id != broker_id )
				{
				# The Broker ID changed, so this is a fresh
				# peering whose overflow count started over.
				# Bank the last count seen for the previous
				# peering. With infrequent scrapes an entire
				# peering could go unnoticed, but re-peering
				# should be a rare event.
				epoch$num_past_overflows += epoch$num_overflows;
				epoch$peer_id = broker_id;
				}
			}
		else
			{
			# First sighting of this node: start its bookkeeping.
			epoch = EpochData($peer_id=broker_id);
			peering_epoch_data[node_name] = epoch;
			}

		# Records are references, so this also updates the table entry.
		epoch$num_overflows = peering$num_overflows;

		local overflow_total = epoch$num_past_overflows + epoch$num_overflows;

		Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
		    label_values, overflow_total);
		}
	}