From 5008f586ea5085c0cf265ac3598d9defed35471a Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 24 Apr 2025 16:07:26 -0700 Subject: [PATCH 1/4] Deprecate Broker::congestion_queue_size and stop using it internally Since a reorg in the Broker library (commit b04195183) that revamped flow control and that we pulled in with Zeek 5.0, this setting hasn't done anything. Broker's endpoint::make_subscriber() and endpoint::make_status_subscriber() take a queue size argument (with a default value) that simply gets dropped in the eventual subscriber::make() call. See: https://github.com/zeek/broker/commit/b04195183515a1db8eb493011a744ba787239e98#diff-5c0d2baa7981caeb6a4080708ddca6ad929746d10c73d66598e46d7c2c03c8deL34-R178 --- scripts/base/frameworks/broker/main.zeek | 2 +- src/broker/Manager.cc | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/base/frameworks/broker/main.zeek b/scripts/base/frameworks/broker/main.zeek index 7402d39ecb..1dccd123bd 100644 --- a/scripts/base/frameworks/broker/main.zeek +++ b/scripts/base/frameworks/broker/main.zeek @@ -72,7 +72,7 @@ export { ## The number of buffered messages at the Broker/CAF layer after which ## a subscriber considers themselves congested (i.e. tune the congestion ## control mechanisms). - const congestion_queue_size = 200 &redef; + const congestion_queue_size = 200 &redef &deprecated="Remove in v8.1. Non-functional since v5.0"; ## The max number of log entries per log stream to batch together when ## sending log messages to a remote logger. diff --git a/src/broker/Manager.cc b/src/broker/Manager.cc index 853255523b..9b89452674 100644 --- a/src/broker/Manager.cc +++ b/src/broker/Manager.cc @@ -404,11 +404,9 @@ class BrokerState { public: using LogSeverityLevel = Observer::LogSeverityLevel; - BrokerState(broker::configuration config, size_t congestion_queue_size, LoggerQueuePtr queue, - PeerBufferStatePtr pbstate) + BrokerState(broker::configuration config, LoggerQueuePtr queue, PeerBufferStatePtr pbstate) : endpoint(std::move(config), telemetry_mgr->GetRegistry()), - subscriber( - endpoint.make_subscriber({broker::topic::statuses(), broker::topic::errors()}, congestion_queue_size)), + subscriber(endpoint.make_subscriber({broker::topic::statuses(), broker::topic::errors()})), loggerQueue(std::move(queue)), peerBufferState(std::move(pbstate)) { peerBufferState->SetEndpoint(&endpoint); @@ -594,8 +592,7 @@ void Manager::DoInitPostScript() { auto observer = std::make_shared(adapterVerbosity, queue, pbstate); broker::logger(observer); // *must* be called before creating the BrokerState - auto cqs = get_option("Broker::congestion_queue_size")->AsCount(); - bstate = std::make_shared(std::move(config), cqs, queue, pbstate); + bstate = std::make_shared(std::move(config), queue, pbstate); bstate->logSeverity = static_cast(logSeverityVal); bstate->stderrSeverity = static_cast(stderrSeverityVal); From 841a40ff8888fe392b7a974939cd4a39ababf791 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 24 Apr 2025 16:15:56 -0700 Subject: [PATCH 2/4] Switch Broker's default backpressure policy to drop_oldest, bump buffer sizes At every site where we've dug into backpressure disconnect findings, it has been the case that the default values were too small. 8192, so 4x the old default, suffices at every site to drown out premature disconnects. With metrics now available for the send buffers regardless of backpressure overflow policy, this also switches the default from "disconnect" to "drop_oldest" (for both peers and websockets), meaning that peerings remain untouched but the oldest queued message simply gets dropped when a new message is enqueued. With this policy, the number of backpressure overflows is then simply the count of discarded messages, something that users can tune to see drop to zero in everyday use. Another benefit is that marginal overflows cause less message loss than when an entire buffer's worth (plus potentially more in-flight messages) gets thrown out with a disconnect. --- scripts/base/frameworks/broker/main.zeek | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/base/frameworks/broker/main.zeek b/scripts/base/frameworks/broker/main.zeek index 1dccd123bd..87fb28bd01 100644 --- a/scripts/base/frameworks/broker/main.zeek +++ b/scripts/base/frameworks/broker/main.zeek @@ -89,20 +89,20 @@ export { ## Max number of items we buffer at most per peer. What action to take when ## the buffer reaches its maximum size is determined by ## :zeek:see:`Broker::peer_overflow_policy`. - const peer_buffer_size = 2048 &redef; + const peer_buffer_size = 8192 &redef; ## Configures how Broker responds to peers that cannot keep up with the ## incoming message rate. Available strategies: ## - disconnect: drop the connection to the unresponsive peer ## - drop_newest: replace the newest message in the buffer ## - drop_oldest: removed the olsted message from the buffer, then append - const peer_overflow_policy = "disconnect" &redef; + const peer_overflow_policy = "drop_oldest" &redef; ## Same as :zeek:see:`Broker::peer_buffer_size` but for WebSocket clients. - const web_socket_buffer_size = 512 &redef; + const web_socket_buffer_size = 8192 &redef; ## Same as :zeek:see:`Broker::peer_overflow_policy` but for WebSocket clients. - const web_socket_overflow_policy = "disconnect" &redef; + const web_socket_overflow_policy = "drop_oldest" &redef; ## How frequently Zeek resets some peering/client buffer statistics, ## such as ``max_queued_recently`` in :zeek:see:`BrokerPeeringStats`. From 7540d48fd56fa3100c83bc706c0223c25e25b6cf Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 24 Apr 2025 17:29:26 -0700 Subject: [PATCH 3/4] Bump cluster testsuite This pulls in an update for the backpressure disconnect tests, which now need to set the policy explicitly. --- testing/external/commit-hash.zeek-testing-cluster | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/external/commit-hash.zeek-testing-cluster b/testing/external/commit-hash.zeek-testing-cluster index 929da9460c..9f94b829d3 100644 --- a/testing/external/commit-hash.zeek-testing-cluster +++ b/testing/external/commit-hash.zeek-testing-cluster @@ -1 +1 @@ -2d1f0ae518b26938e24bd26f701dab17e174a626 +de6bc382b2320185c168e9f429e47904034510d3 From 68fadd04641e16ba433f8305bd1511281ae33cfd Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 24 Apr 2025 16:22:17 -0700 Subject: [PATCH 4/4] Lower listen/connect retry intervals in Broker and the cluster framework to 1sec The former defaults (30sec, 1min) can slow down cluster startup and recovery considerably, and other systems have more aggressive intervals still. --- scripts/base/frameworks/broker/main.zeek | 4 ++-- scripts/base/frameworks/cluster/main.zeek | 2 +- testing/btest/broker/telemetry.zeek | 1 - testing/btest/scripts/policy/misc/weird-stats-cluster.zeek | 4 ---- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/scripts/base/frameworks/broker/main.zeek b/scripts/base/frameworks/broker/main.zeek index 87fb28bd01..10606c74af 100644 --- a/scripts/base/frameworks/broker/main.zeek +++ b/scripts/base/frameworks/broker/main.zeek @@ -19,7 +19,7 @@ export { ## use already. Use of the ZEEK_DEFAULT_LISTEN_RETRY environment variable ## (set as a number of seconds) will override this option and also ## any values given to :zeek:see:`Broker::listen`. - const default_listen_retry = 30sec &redef; + const default_listen_retry = 1sec &redef; ## Default address on which to listen. ## @@ -36,7 +36,7 @@ export { ## ZEEK_DEFAULT_CONNECT_RETRY environment variable (set as number of ## seconds) will override this option and also any values given to ## :zeek:see:`Broker::peer`. - const default_connect_retry = 30sec &redef; + const default_connect_retry = 1sec &redef; ## If true, do not use SSL for network connections. By default, SSL will ## even be used if no certificates / CAs have been configured. In that case diff --git a/scripts/base/frameworks/cluster/main.zeek b/scripts/base/frameworks/cluster/main.zeek index 30511f9b82..f6b79b07ca 100644 --- a/scripts/base/frameworks/cluster/main.zeek +++ b/scripts/base/frameworks/cluster/main.zeek @@ -262,7 +262,7 @@ export { ## Interval for retrying failed connections between cluster nodes. ## If set, the ZEEK_DEFAULT_CONNECT_RETRY (given in number of seconds) ## environment variable overrides this option. - const retry_interval = 1min &redef; + const retry_interval = 1sec &redef; ## When using broker-enabled cluster framework, nodes broadcast this event ## to exchange their user-defined name along with a string that uniquely diff --git a/testing/btest/broker/telemetry.zeek b/testing/btest/broker/telemetry.zeek index 3008dd72f0..e7874fefad 100644 --- a/testing/btest/broker/telemetry.zeek +++ b/testing/btest/broker/telemetry.zeek @@ -23,7 +23,6 @@ redef Cluster::nodes = { redef exit_only_after_terminate = T; redef Log::enable_local_logging = T; redef Log::default_rotation_interval = 0secs; -redef Cluster::retry_interval = 1sec; function print_metrics(metrics: vector of Telemetry::Metric) { diff --git a/testing/btest/scripts/policy/misc/weird-stats-cluster.zeek b/testing/btest/scripts/policy/misc/weird-stats-cluster.zeek index b905917d57..b84675beb1 100644 --- a/testing/btest/scripts/policy/misc/weird-stats-cluster.zeek +++ b/testing/btest/scripts/policy/misc/weird-stats-cluster.zeek @@ -20,10 +20,6 @@ redef Cluster::nodes = { @load misc/weird-stats @load policy/frameworks/cluster/experimental -redef Cluster::retry_interval = 1sec; -redef Broker::default_listen_retry = 1sec; -redef Broker::default_connect_retry = 1sec; - redef Log::enable_local_logging = T; redef Log::default_rotation_interval = 0secs; redef WeirdStats::weird_stat_interval = 5secs;