From ead6134501cd5d7eba217f4812f4c1d99e8789bf Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Mon, 18 Nov 2024 16:09:26 -0800
Subject: [PATCH] Add backpressure disconnect notification to cluster.log and via telemetry

This adds a Broker-specific script to the cluster framework, loaded only when
Zeek is running in cluster mode. It adds logging in cluster.log as well as
telemetry via a metrics counter for Broker-observed backpressure disconnects.

The new zeek_broker_backpressure_disconnects counter, labeled by the neighboring
peer that the reporting node has determined to be unresponsive, counts the
number of unpeerings for this reason. Here the node "worker" has observed node
"proxy" falling behind once:

  # HELP zeek_broker_backpressure_disconnects_total Number of Broker peerings dropped due to a neighbor falling behind in message I/O
  # TYPE zeek_broker_backpressure_disconnects_total counter
  zeek_broker_backpressure_disconnects_total{endpoint="worker",peer="proxy"} 1

Includes small btest baseline update to reflect @load of a new script.
---
 scripts/base/frameworks/cluster/__load__.zeek |  3 ++
 .../cluster/broker-backpressure.zeek          | 29 +++++++++++++++++++
 .../coverage.init-default/missing_loads       |  1 +
 3 files changed, 33 insertions(+)
 create mode 100644 scripts/base/frameworks/cluster/broker-backpressure.zeek

diff --git a/scripts/base/frameworks/cluster/__load__.zeek b/scripts/base/frameworks/cluster/__load__.zeek
index a854302636..0d6372e3d4 100644
--- a/scripts/base/frameworks/cluster/__load__.zeek
+++ b/scripts/base/frameworks/cluster/__load__.zeek
@@ -14,6 +14,9 @@ redef Broker::log_topic = Cluster::rr_log_topic;
 # Add a cluster prefix.
 @prefixes += cluster
 
+# This should soon condition on loading only when Broker is in use.
+@load ./broker-backpressure
+
 @if ( Supervisor::is_supervised() )
 # When running a supervised cluster, populate Cluster::nodes from the node table
 # the Supervisor provides to new Zeek nodes. The management framework configures
diff --git a/scripts/base/frameworks/cluster/broker-backpressure.zeek b/scripts/base/frameworks/cluster/broker-backpressure.zeek
new file mode 100644
index 0000000000..e3fe4c9cdd
--- /dev/null
+++ b/scripts/base/frameworks/cluster/broker-backpressure.zeek
@@ -0,0 +1,29 @@
+# Notifications for Broker-reported backpressure overflow.
+# See base/frameworks/broker/backpressure.zeek for context.
+
+@load base/frameworks/telemetry
+
+module Cluster;
+
+global broker_backpressure_disconnects_cf = Telemetry::register_counter_family([
+	$prefix="zeek",
+	$name="broker-backpressure-disconnects",
+	$unit="",
+	$label_names=vector("peer"),
+	$help_text="Number of Broker peerings dropped due to a neighbor falling behind in message I/O",
+]);
+
+event Broker::peer_removed(endpoint: Broker::EndpointInfo, msg: string)
+	{
+	if ( ! endpoint?$network || "caf::sec::backpressure_overflow" !in msg )
+		return;
+
+	local nn = nodeid_to_node(endpoint$id);
+
+	Cluster::log(fmt("removed due to backpressure overflow: %s%s:%s (%s)",
+	                 nn$name != "" ? "" : "non-cluster peer ",
+	                 endpoint$network$address, endpoint$network$bound_port,
+	                 nn$name != "" ? nn$name : endpoint$id));
+	Telemetry::counter_family_inc(broker_backpressure_disconnects_cf,
+	                              vector(nn$name != "" ? nn$name : "unknown"));
+	}
diff --git a/testing/btest/Baseline/coverage.init-default/missing_loads b/testing/btest/Baseline/coverage.init-default/missing_loads
index e16624e1fb..9997ec4fd8 100644
--- a/testing/btest/Baseline/coverage.init-default/missing_loads
+++ b/testing/btest/Baseline/coverage.init-default/missing_loads
@@ -1,4 +1,5 @@
 ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
+-./frameworks/cluster/broker-backpressure.zeek
 -./frameworks/cluster/broker-stores.zeek
 -./frameworks/cluster/nodes/logger.zeek
 -./frameworks/cluster/nodes/manager.zeek