Mirror of https://github.com/zeek/zeek.git (synced 2025-10-02 06:38:20 +00:00)
Merge remote-tracking branch 'origin/topic/awelzel/4431-zeromq-drop-policy-v2'
* origin/topic/awelzel/4431-zeromq-drop-policy-v2:
  cluster.bif: Improve Cluster::publish() docstring
  btest/cluster/zeromq: Add tests for overload behavior
  cluster/zeromq: Metric for msg errors
  cluster/zeromq: Drop events when overloaded
  cluster/zeromq: Comments and move lookups to InitPostScript()
  cluster/zeromq: Rework lambdas to member functions
  cluster/zeromq: Support local XPUB/XSUB hwm and buf configurability
  cluster/OnLoop: Support DontBlock and Force flags for queueing
  cluster/ThreadedBackend: Injectable OnLoopProcess instance
commit cd7836dda2
30 changed files with 1259 additions and 208 deletions
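The drop policy added here is driven by two high-water-mark options and surfaced through two counters. The sketch below is illustrative only; the option and metric names are taken from the test files in this diff, and the concrete values are arbitrary:

# Sketch: cap the XPUB send queue and the onloop queue so that a publish
# burst overflows them and events are dropped instead of buffered forever.
# A value of 0 means "unlimited" and disables dropping.
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = 20;
redef Cluster::Backend::ZeroMQ::onloop_queue_hwm = 20;

# Drops are counted in the telemetry metrics
# cluster_zeromq_xpub_drops_total and cluster_zeromq_onloop_drops_total.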
@@ -0,0 +1,13 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
nodes_up, 1
nodes_up, 2
nodes_up, 3
sending finish
nodes_down, 1
nodes_down, 2
nodes_down, 3
had xpub_drops?, F
had onloop_drops?, T
node proxy dropped=T
node worker-1 dropped=T
node worker-2 dropped=T
@@ -0,0 +1,3 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, T
had onloop_drops?, F
@@ -0,0 +1,3 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, T
had onloop_drops?, F
@@ -0,0 +1,3 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, T
had onloop_drops?, F
@@ -0,0 +1,13 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
nodes_up, 1
nodes_up, 2
nodes_up, 3
sending finish
nodes_down, 1
nodes_down, 2
nodes_down, 3
had xpub_drops?, F
had onloop_drops?, F
node proxy dropped=0 count=100000
node worker-1 dropped=0 count=100000
node worker-2 dropped=0 count=100000
@@ -0,0 +1,3 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, F
had onloop_drops?, F
@@ -0,0 +1,3 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, F
had onloop_drops?, F
@@ -0,0 +1,3 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, F
had onloop_drops?, F
@@ -0,0 +1,13 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
nodes_up, 1
nodes_up, 2
nodes_up, 3
nodes_done, 1
nodes_done, 2
nodes_done, 3
sending finish
nodes_down, 1
nodes_down, 2
nodes_down, 3
had xpub_drops?, F
had onloop_drops?, F
@@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, T
had onloop_drops?, T
node worker-1 dropped=T
node worker-2 dropped=T
@@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, T
had onloop_drops?, T
node proxy dropped=T
node worker-2 dropped=T
@@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, T
had onloop_drops?, T
node proxy dropped=T
node worker-1 dropped=T
@@ -0,0 +1,13 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
nodes_up, 1
nodes_up, 2
nodes_up, 3
nodes_done, 1
nodes_done, 2
nodes_done, 3
sending finish
nodes_down, 1
nodes_down, 2
nodes_down, 3
had xpub_drops?, F
had onloop_drops?, F
@@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, F
had onloop_drops?, F
node worker-1 dropped=0 count=100000
node worker-2 dropped=0 count=100000
@@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, F
had onloop_drops?, F
node proxy dropped=0 count=100000
node worker-2 dropped=0 count=100000
@@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
had xpub_drops?, F
had onloop_drops?, F
node proxy dropped=0 count=100000
node worker-1 dropped=0 count=100000
testing/btest/Files/zeromq/metrics.zeek (new file, 20 lines)
@@ -0,0 +1,20 @@
module Cluster::Backend::ZeroMQ;

export {
    global xpub_drops: function(): count;
    global onloop_drops: function(): count;
}

function xpub_drops(): count
    {
    local ms = Telemetry::collect_metrics("zeek", "cluster_zeromq_xpub_drops_total");
    assert |ms| == 1, fmt("%s", |ms|);
    return double_to_count(ms[0]$value);
    }

function onloop_drops(): count
    {
    local ms = Telemetry::collect_metrics("zeek", "cluster_zeromq_onloop_drops_total");
    assert |ms| == 1, fmt("%s", |ms|);
    return double_to_count(ms[0]$value);
    }
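The tests below copy this file next to them as zeromq-metrics.zeek and call the two helpers from zeek_done() to report whether any drops occurred, along the lines of:

@load ./zeromq-metrics

event zeek_done()
    {
    print "had xpub_drops?", Cluster::Backend::ZeroMQ::xpub_drops() > 0;
    print "had onloop_drops?", Cluster::Backend::ZeroMQ::onloop_drops() > 0;
    }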
testing/btest/cluster/zeromq/overload-drop.zeek (new file, 166 lines)
@@ -0,0 +1,166 @@
# @TEST-DOC: Workers and proxy publish to the manager topic. They publish so fast that messages are dropped a) on their end and b) on the manager as well. The test checks that metrics are incremented and the manager also verifies that not all messages arrived.
#
# @TEST-REQUIRES: have-zeromq
#
# @TEST-GROUP: cluster-zeromq
#
# @TEST-PORT: XPUB_PORT
# @TEST-PORT: XSUB_PORT
# @TEST-PORT: LOG_PULL_PORT
#
# @TEST-EXEC: cp $FILES/zeromq/cluster-layout-no-logger.zeek cluster-layout.zeek
# @TEST-EXEC: cp $FILES/zeromq/test-bootstrap.zeek zeromq-test-bootstrap.zeek
# @TEST-EXEC: cp $FILES/zeromq/metrics.zeek zeromq-metrics.zeek
#
# @TEST-EXEC: zeek --parse-only manager.zeek
# @TEST-EXEC: zeek --parse-only other.zeek
#
# @TEST-EXEC: btest-bg-run manager "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=manager zeek -b ../manager.zeek >out"
# @TEST-EXEC: btest-bg-run proxy "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=proxy zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-1 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-1 zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-2 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-2 zeek -b ../other.zeek >out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff manager/out
# @TEST-EXEC: btest-diff proxy/out
# @TEST-EXEC: btest-diff worker-1/out
# @TEST-EXEC: btest-diff worker-2/out

# @TEST-START-FILE common.zeek
@load ./zeromq-test-bootstrap
@load ./zeromq-metrics

global tick: event() &is_used;
global finish: event(name: string) &is_used;
global ping: event(sender: string, c: count) &is_used;

# How many messages each node publishes in total.
const total_publishes = 100000;
# How many events to publish per tick()
const batch = 100;

# Lower HWMs to provoke drops
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = batch / 5;
redef Cluster::Backend::ZeroMQ::onloop_queue_hwm = batch / 5;

global test_nodes = set( "proxy", "worker-1", "worker-2" ) &ordered;
# @TEST-END-FILE

# @TEST-START-FILE manager.zeek
@load ./common.zeek

global nodes_up: set[string] = set();
global nodes_down: set[string] = set();

global sent_finish = F;

event send_finish()
    {
    if ( sent_finish )
        return;

    print "sending finish";
    for ( n in test_nodes )
        Cluster::publish(Cluster::node_topic(n), finish, Cluster::node);

    sent_finish = T;
    }

event Cluster::node_up(name: string, id: string)
    {
    add nodes_up[name];
    print "nodes_up", |nodes_up|;

    # Get the ball rolling once all nodes are available, sending the
    # first tick() to proxy and workers.
    if ( |nodes_up| == |test_nodes| )
        {
        Cluster::publish(Cluster::worker_topic, tick);
        Cluster::publish(Cluster::proxy_topic, tick);
        }
    }

event Cluster::node_down(name: string, id: string)
    {
    add nodes_down[name];
    print "nodes_down", |nodes_down|;
    if ( |nodes_down| == |test_nodes| )
        terminate();
    }

global last_c: table[string] of count &default=0;
global drop_c: table[string] of count &default=0;

event ping(sender: string, c: count)
    {
    local dropped = c - last_c[sender] - 1;
    if ( dropped > 0 )
        drop_c[sender] += dropped;

    last_c[sender] = c;

    # Check if all senders sent enough messages. If not,
    # wait for the next ping to arrive.
    for ( _, lc in last_c )
        if ( lc < total_publishes )
            return;

    # Send finish just once.
    event send_finish();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;

    for ( n in test_nodes )
        print fmt("node %s dropped=%s", n, drop_c[n] > 0);

    }
# @TEST-END-FILE


# @TEST-START-FILE other.zeek
@load ./common.zeek

global publishes = 0;

event tick()
    {
    local i = batch;
    while ( i > 0 )
        {
        --i;
        ++publishes;
        Cluster::publish(Cluster::manager_topic, ping, Cluster::node, publishes);

        # Continue sending a single publish for every tick() even
        # if we've published enough in order for the manager to
        # detect we're done. We need to continue here because this
        # node, but also the manager node, may have dropped events.
        if ( publishes >= total_publishes )
            break;
        }

    # Relax publishing if we published enough so the manager
    # isn't totally overloaded.
    local s = publishes < total_publishes ? 0sec : 0.05sec;
    schedule s { tick() };
    }

event finish(name: string)
    {
    terminate();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;
    }
# @TEST-END-FILE
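The manager's ping() handler above infers drops purely from counter gaps: each sender's counter increases by one per publish, so any jump larger than one means the intermediate events were lost. A standalone sketch of the same arithmetic (the helper name is hypothetical, not part of the test):

function gap_drops(last_seen: count, cur: count): count
    {
    # gap_drops(10, 14) == 3: pings 11, 12 and 13 from that sender never arrived.
    if ( cur <= last_seen )
        return 0;
    return cur - last_seen - 1;
    }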
testing/btest/cluster/zeromq/overload-no-drop.zeek (new file, 160 lines)
@@ -0,0 +1,160 @@
# @TEST-DOC: Workers and proxy publish to the manager topic. They publish so fast that messages would be dropped by sender and receiver, but the HWM settings are 0 so nothing is dropped at the expense of using more memory. This is verified via metrics and checking the received pings on the manager.
#
# @TEST-REQUIRES: have-zeromq
#
# @TEST-GROUP: cluster-zeromq
#
# @TEST-PORT: XPUB_PORT
# @TEST-PORT: XSUB_PORT
# @TEST-PORT: LOG_PULL_PORT
#
# @TEST-EXEC: cp $FILES/zeromq/cluster-layout-no-logger.zeek cluster-layout.zeek
# @TEST-EXEC: cp $FILES/zeromq/test-bootstrap.zeek zeromq-test-bootstrap.zeek
# @TEST-EXEC: cp $FILES/zeromq/metrics.zeek zeromq-metrics.zeek
#
# @TEST-EXEC: zeek --parse-only manager.zeek
# @TEST-EXEC: zeek --parse-only other.zeek
#
# @TEST-EXEC: btest-bg-run manager "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=manager zeek -b ../manager.zeek >out"
# @TEST-EXEC: btest-bg-run proxy "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=proxy zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-1 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-1 zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-2 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-2 zeek -b ../other.zeek >out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff manager/out
# @TEST-EXEC: btest-diff proxy/out
# @TEST-EXEC: btest-diff worker-1/out
# @TEST-EXEC: btest-diff worker-2/out

# @TEST-START-FILE common.zeek
@load ./zeromq-test-bootstrap
@load ./zeromq-metrics

global tick: event() &is_used;
global finish: event(name: string) &is_used;
global ping: event(sender: string, c: count) &is_used;

# How many messages each node publishes in total.
const total_publishes = 100000;
# How many events to publish per tick()
const batch = 100;

# Unlimited buffering.
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = 0;
redef Cluster::Backend::ZeroMQ::onloop_queue_hwm = 0;

global test_nodes = set( "proxy", "worker-1", "worker-2" ) &ordered;
# @TEST-END-FILE

# @TEST-START-FILE manager.zeek
@load ./common.zeek

global nodes_up: set[string] = set();
global nodes_down: set[string] = set();

global sent_finish = F;

event send_finish()
    {
    if ( sent_finish )
        return;

    print "sending finish";
    for ( n in test_nodes )
        Cluster::publish(Cluster::node_topic(n), finish, Cluster::node);

    sent_finish = T;
    }

event Cluster::node_up(name: string, id: string)
    {
    add nodes_up[name];
    print "nodes_up", |nodes_up|;

    # Get the ball rolling once all nodes are available, sending the
    # first tick() to proxy and workers.
    if ( |nodes_up| == |test_nodes| )
        {
        Cluster::publish(Cluster::worker_topic, tick);
        Cluster::publish(Cluster::proxy_topic, tick);
        }
    }

event Cluster::node_down(name: string, id: string)
    {
    add nodes_down[name];
    print "nodes_down", |nodes_down|;
    if ( |nodes_down| == |test_nodes| )
        terminate();
    }

global last_c: table[string] of count &default=0;
global drop_c: table[string] of count &default=0;

event ping(sender: string, c: count)
    {
    local dropped = c - last_c[sender] - 1;
    if ( dropped > 0 )
        drop_c[sender] += dropped;

    last_c[sender] = c;

    # Check if all senders sent enough messages. If not,
    # wait for the next ping to arrive.
    for ( _, lc in last_c )
        if ( lc < total_publishes )
            return;

    # Send finish just once.
    event send_finish();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;

    for ( n in test_nodes )
        print fmt("node %s dropped=%s count=%s", n, drop_c[n], last_c[n]);
    }
# @TEST-END-FILE


# @TEST-START-FILE other.zeek
@load ./common.zeek

global publishes = 0;

event tick()
    {
    local i = batch;
    while ( i > 0 )
        {
        --i;
        ++publishes;
        Cluster::publish(Cluster::manager_topic, ping, Cluster::node, publishes);

        # Return once all messages were published. Nothing's supposed
        # to be dropped, so that should be fine.
        if ( publishes >= total_publishes )
            return;
        }

    schedule 0sec { tick() };
    }

event finish(name: string)
    {
    terminate();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;
    }
# @TEST-END-FILE
@@ -0,0 +1,183 @@
# @TEST-DOC: Workers and proxy publish to the worker and proxy topics. They publish so fast that messages are dropped a) on their end and b) in their own onloop queue as well. The test checks that metrics are incremented and there's no lockup. The manager only coordinates startup and shutdown.
#
# @TEST-REQUIRES: have-zeromq
#
# @TEST-GROUP: cluster-zeromq
#
# @TEST-PORT: XPUB_PORT
# @TEST-PORT: XSUB_PORT
# @TEST-PORT: LOG_PULL_PORT
#
# @TEST-EXEC: cp $FILES/zeromq/cluster-layout-no-logger.zeek cluster-layout.zeek
# @TEST-EXEC: cp $FILES/zeromq/test-bootstrap.zeek zeromq-test-bootstrap.zeek
# @TEST-EXEC: cp $FILES/zeromq/metrics.zeek zeromq-metrics.zeek
#
# @TEST-EXEC: zeek --parse-only manager.zeek
# @TEST-EXEC: zeek --parse-only other.zeek
#
# @TEST-EXEC: btest-bg-run manager "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=manager zeek -b ../manager.zeek >out"
# @TEST-EXEC: btest-bg-run proxy "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=proxy zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-1 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-1 zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-2 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-2 zeek -b ../other.zeek >out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff manager/out
# @TEST-EXEC: btest-diff proxy/out
# @TEST-EXEC: btest-diff worker-1/out
# @TEST-EXEC: btest-diff worker-2/out

# @TEST-START-FILE common.zeek
@load ./zeromq-test-bootstrap
@load ./zeromq-metrics

global tick: event() &is_used;
global done: event(name: string) &is_used;
global finish: event(name: string) &is_used;
global ping: event(sender: string, c: count) &is_used;

# How many messages each node publishes in total.
const total_publishes = 100000;
# How many events to publish per tick()
const batch = 100;

# Lower HWMs to provoke drops
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = batch / 5;
redef Cluster::Backend::ZeroMQ::onloop_queue_hwm = batch / 5;

global test_nodes = set( "proxy", "worker-1", "worker-2" ) &ordered;
# @TEST-END-FILE

# @TEST-START-FILE manager.zeek
@load ./common.zeek

global nodes_up: set[string] = set();
global nodes_done: set[string] = set();
global nodes_down: set[string] = set();

global sent_finish = F;

event send_finish()
    {
    if ( sent_finish )
        return;

    print "sending finish";
    for ( n in test_nodes )
        Cluster::publish(Cluster::node_topic(n), finish, Cluster::node);

    sent_finish = T;
    }

event Cluster::node_up(name: string, id: string)
    {
    add nodes_up[name];
    print "nodes_up", |nodes_up|;

    # Get the ball rolling once all nodes are available, sending the
    # first tick() to proxy and workers.
    if ( |nodes_up| == |test_nodes| )
        {
        Cluster::publish(Cluster::worker_topic, tick);
        Cluster::publish(Cluster::proxy_topic, tick);
        }
    }

event Cluster::node_down(name: string, id: string)
    {
    add nodes_down[name];
    print "nodes_down", |nodes_down|;
    if ( |nodes_down| == |test_nodes| )
        terminate();
    }

event done(sender: string)
    {
    local prev = |nodes_done|;
    add nodes_done[sender];
    if ( prev < |nodes_done| )
        print "nodes_done", |nodes_done|;

    if ( |nodes_done| == |test_nodes| )
        event send_finish();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;
    }
# @TEST-END-FILE


# @TEST-START-FILE other.zeek
@load ./common.zeek
global last_c: table[string] of count &default=0;
global drop_c: table[string] of count &default=0;

event ping(sender: string, c: count)
    {
    local dropped = c - last_c[sender] - 1;
    if ( dropped > 0 )
        drop_c[sender] += dropped;

    last_c[sender] = c;

    # Check if all senders sent enough messages. If not,
    # wait for the next ping to arrive.
    if ( |last_c| < |test_nodes| - 1 )
        return;

    for ( _, lc in last_c )
        if ( lc < total_publishes )
            return;

    # If all nodes sent enough pings, send "done" to the manager.
    Cluster::publish(Cluster::manager_topic, done, Cluster::node);
    }

global publishes = 0;

event tick()
    {
    local i = batch;
    while ( i > 0 )
        {
        --i;
        ++publishes;
        Cluster::publish(Cluster::worker_topic, ping, Cluster::node, publishes);
        Cluster::publish(Cluster::proxy_topic, ping, Cluster::node, publishes);

        # Continue sending a single publish for every tick() even
        # if we've published enough in order for the manager to
        # detect we're done. We need to continue here because this
        # node, but also the manager node, may have dropped events.
        if ( publishes >= total_publishes )
            break;
        }

    # Relax publishing if we published enough so as not to
    # continue overloading the cluster and have a better
    # chance of termination events going through.
    local s = publishes < total_publishes ? 0sec : 0.05sec;
    schedule s { tick() };
    }

event finish(name: string)
    {
    terminate();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;

    for ( n in test_nodes )
        if ( n != Cluster::node )
            print fmt("node %s dropped=%s", n, drop_c[n] > 0);
    }
# @TEST-END-FILE
@@ -0,0 +1,181 @@
# @TEST-DOC: Workers and proxy publish to the worker and proxy topics. They publish so fast that messages would be dropped by sender and receiver, but the HWM settings are 0 so nothing is dropped at the expense of using more memory. This is verified via metrics and by checking the received pings on each node. The manager only coordinates startup and shutdown.
#
# @TEST-REQUIRES: have-zeromq
#
# @TEST-GROUP: cluster-zeromq
#
# @TEST-PORT: XPUB_PORT
# @TEST-PORT: XSUB_PORT
# @TEST-PORT: LOG_PULL_PORT
#
# @TEST-EXEC: cp $FILES/zeromq/cluster-layout-no-logger.zeek cluster-layout.zeek
# @TEST-EXEC: cp $FILES/zeromq/test-bootstrap.zeek zeromq-test-bootstrap.zeek
# @TEST-EXEC: cp $FILES/zeromq/metrics.zeek zeromq-metrics.zeek
#
# @TEST-EXEC: zeek --parse-only manager.zeek
# @TEST-EXEC: zeek --parse-only other.zeek
#
# @TEST-EXEC: btest-bg-run manager "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=manager zeek -b ../manager.zeek >out"
# @TEST-EXEC: btest-bg-run proxy "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=proxy zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-1 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-1 zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-2 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-2 zeek -b ../other.zeek >out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff manager/out
# @TEST-EXEC: btest-diff proxy/out
# @TEST-EXEC: btest-diff worker-1/out
# @TEST-EXEC: btest-diff worker-2/out

# @TEST-START-FILE common.zeek
@load ./zeromq-test-bootstrap
@load ./zeromq-metrics

global tick: event() &is_used;
global done: event(name: string) &is_used;
global finish: event(name: string) &is_used;
global ping: event(sender: string, c: count) &is_used;

# How many messages each node publishes in total.
const total_publishes = 100000;
# How many events to publish per tick()
const batch = 100;

# Unlimited buffering.
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = 0;
redef Cluster::Backend::ZeroMQ::onloop_queue_hwm = 0;

global test_nodes = set( "proxy", "worker-1", "worker-2" ) &ordered;
# @TEST-END-FILE

# @TEST-START-FILE manager.zeek
@load ./common.zeek

global nodes_up: set[string] = set();
global nodes_done: set[string] = set();
global nodes_down: set[string] = set();

global sent_finish = F;

event send_finish()
    {
    if ( sent_finish )
        return;

    print "sending finish";
    for ( n in test_nodes )
        Cluster::publish(Cluster::node_topic(n), finish, Cluster::node);

    sent_finish = T;
    }

event Cluster::node_up(name: string, id: string)
    {
    add nodes_up[name];
    print "nodes_up", |nodes_up|;

    # Get the ball rolling once all nodes are available, sending the
    # first tick() to proxy and workers.
    if ( |nodes_up| == |test_nodes| )
        {
        Cluster::publish(Cluster::worker_topic, tick);
        Cluster::publish(Cluster::proxy_topic, tick);
        }
    }

event Cluster::node_down(name: string, id: string)
    {
    add nodes_down[name];
    print "nodes_down", |nodes_down|;
    if ( |nodes_down| == |test_nodes| )
        terminate();
    }

event done(sender: string)
    {
    add nodes_done[sender];
    print "nodes_done", |nodes_done|;
    if ( |nodes_done| == |test_nodes| )
        event send_finish();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;
    }
# @TEST-END-FILE


# @TEST-START-FILE other.zeek
@load ./common.zeek
global last_c: table[string] of count &default=0;
global drop_c: table[string] of count &default=0;

global sent_done = F;

event ping(sender: string, c: count)
    {
    local dropped = c - last_c[sender] - 1;
    if ( dropped > 0 )
        drop_c[sender] += dropped;

    last_c[sender] = c;

    # Check if all senders sent enough messages. If not,
    # wait for the next ping to arrive.
    if ( |last_c| < |test_nodes| - 1 )
        return;

    for ( _, lc in last_c )
        if ( lc < total_publishes )
            return;

    # If all nodes sent enough pings, send "done" to the manager.
    if ( ! sent_done )
        {
        Cluster::publish(Cluster::manager_topic, done, Cluster::node);
        sent_done = T;
        }
    }

global publishes = 0;

event tick()
    {
    local i = batch;
    while ( i > 0 )
        {
        --i;
        ++publishes;
        Cluster::publish(Cluster::worker_topic, ping, Cluster::node, publishes);
        Cluster::publish(Cluster::proxy_topic, ping, Cluster::node, publishes);

        # Return once all messages were published. Nothing's supposed
        # to be dropped, so that should be fine.
        if ( publishes >= total_publishes )
            return;
        }

    schedule 0sec { tick() };
    }

event finish(name: string)
    {
    terminate();
    }

event zeek_done()
    {
    local xpub_drops = Cluster::Backend::ZeroMQ::xpub_drops();
    local onloop_drops = Cluster::Backend::ZeroMQ::onloop_drops();
    print "had xpub_drops?", xpub_drops > 0;
    print "had onloop_drops?", onloop_drops > 0;

    for ( n in test_nodes )
        if ( n != Cluster::node )
            print fmt("node %s dropped=%s count=%s", n, drop_c[n], last_c[n]);

    }
# @TEST-END-FILE