mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00

This adds a Broker-specific script to the cluster framework, loaded only when Zeek is running in cluster mode. It adds logging in cluster.log as well as telemetry via a metrics counter for Broker-observed backpressure disconnects.

The new zeek_broker_backpressure_disconnects counter, labeled by the neighboring peer that the reporting node has determined to be unresponsive, counts the number of unpeerings for this reason. Here the node "worker" has observed node "proxy" falling behind once:

    # HELP zeek_broker_backpressure_disconnects_total Number of Broker peering drops due to a neighbor falling too far behind in message I/O
    # TYPE zeek_broker_backpressure_disconnects_total counter
    zeek_broker_backpressure_disconnects_total{endpoint="worker",peer="proxy"} 1

Includes small btest baseline update to reflect @load of a new script.
29 lines
1.1 KiB
Text
29 lines
1.1 KiB
Text
# Notifications for Broker-reported backpressure overflow.
|
|
# See base/frameworks/broker/backpressure.zeek for context.
|
|
|
|
@load base/frameworks/telemetry
|
|
|
|
module Cluster;
|
|
|
|
# Counter family tracking Broker peerings that were dropped because of
# backpressure overflow. It carries a single label, "peer", which the
# Broker::peer_removed handler in this script fills in when incrementing.
global broker_backpressure_disconnects_cf = Telemetry::register_counter_family([
	# Metric identity.
	$prefix="zeek",
	$name="broker-backpressure-disconnects",
	$unit="",
	# Human-readable description of the metric.
	$help_text="Number of Broker peerings dropped due to a neighbor falling behind in message I/O",
	# One label value is supplied per increment.
	$label_names=vector("peer"),
]);
|
|
|
|
# Reacts to Broker peer removals whose message mentions a backpressure
# overflow: logs the removal via Cluster::log and increments the
# backpressure-disconnect counter, labeled with the affected peer.
event Broker::peer_removed(endpoint: Broker::EndpointInfo, msg: string)
	{
	# The log line needs the peer's address and port; skip if absent.
	if ( ! endpoint?$network )
		return;

	# Ignore removals that were not caused by backpressure overflow.
	if ( "caf::sec::backpressure_overflow" !in msg )
		return;

	local peer_node = nodeid_to_node(endpoint$id);

	# An empty name is treated as "not a known cluster node" below.
	local in_cluster = peer_node$name != "";
	local net = endpoint$network;

	local origin_note = in_cluster ? "" : "non-cluster peer ";
	local log_ident = in_cluster ? peer_node$name : endpoint$id;
	local metric_label = in_cluster ? peer_node$name : "unknown";

	Cluster::log(fmt("removed due to backpressure overflow: %s%s:%s (%s)",
	                 origin_note, net$address, net$bound_port, log_ident));

	Telemetry::counter_family_inc(broker_backpressure_disconnects_cf,
	                              vector(metric_label));
	}
|