Merge branch 'topic/christian/broker-tuning'

* topic/christian/broker-tuning:
  Lower listen/connect retry intervals in Broker and the cluster framework to 1sec
  Bump cluster testsuite
  Switch Broker's default backpressure policy to drop_oldest, bump buffer sizes
  Deprecate Broker::congestion_queue_size and stop using it internally
This commit is contained in:
Christian Kreibich 2025-04-25 10:23:30 -07:00
commit ebd0207352
8 changed files with 23 additions and 21 deletions

10
CHANGES
View file

@ -1,3 +1,13 @@
7.2.0-dev.649 | 2025-04-25 10:23:30 -0700
* Lower listen/connect retry intervals in Broker and the cluster framework to 1sec (Christian Kreibich, Corelight)
* Bump cluster testsuite (Christian Kreibich, Corelight)
* Switch Broker's default backpressure policy to drop_oldest, bump buffer sizes (Christian Kreibich, Corelight)
* Deprecate Broker::congestion_queue_size and stop using it internally (Christian Kreibich, Corelight)
7.2.0-dev.644 | 2025-04-25 10:02:58 -0700 7.2.0-dev.644 | 2025-04-25 10:02:58 -0700
* Add basic btest to verify that Broker peering telemetry is available. (Christian Kreibich, Corelight) * Add basic btest to verify that Broker peering telemetry is available. (Christian Kreibich, Corelight)

View file

@ -1 +1 @@
7.2.0-dev.644 7.2.0-dev.649

View file

@ -19,7 +19,7 @@ export {
## use already. Use of the ZEEK_DEFAULT_LISTEN_RETRY environment variable ## use already. Use of the ZEEK_DEFAULT_LISTEN_RETRY environment variable
## (set as a number of seconds) will override this option and also ## (set as a number of seconds) will override this option and also
## any values given to :zeek:see:`Broker::listen`. ## any values given to :zeek:see:`Broker::listen`.
const default_listen_retry = 30sec &redef; const default_listen_retry = 1sec &redef;
## Default address on which to listen. ## Default address on which to listen.
## ##
@ -36,7 +36,7 @@ export {
## ZEEK_DEFAULT_CONNECT_RETRY environment variable (set as number of ## ZEEK_DEFAULT_CONNECT_RETRY environment variable (set as number of
## seconds) will override this option and also any values given to ## seconds) will override this option and also any values given to
## :zeek:see:`Broker::peer`. ## :zeek:see:`Broker::peer`.
const default_connect_retry = 30sec &redef; const default_connect_retry = 1sec &redef;
## If true, do not use SSL for network connections. By default, SSL will ## If true, do not use SSL for network connections. By default, SSL will
## even be used if no certificates / CAs have been configured. In that case ## even be used if no certificates / CAs have been configured. In that case
@ -72,7 +72,7 @@ export {
## The number of buffered messages at the Broker/CAF layer after which ## The number of buffered messages at the Broker/CAF layer after which
## a subscriber considers themselves congested (i.e. tune the congestion ## a subscriber considers themselves congested (i.e. tune the congestion
## control mechanisms). ## control mechanisms).
const congestion_queue_size = 200 &redef; const congestion_queue_size = 200 &redef &deprecated="Remove in v8.1. Non-functional since v5.0";
## The max number of log entries per log stream to batch together when ## The max number of log entries per log stream to batch together when
## sending log messages to a remote logger. ## sending log messages to a remote logger.
@ -89,20 +89,20 @@ export {
## Maximum number of items we buffer per peer. What action to take when ## Maximum number of items we buffer per peer. What action to take when
## the buffer reaches its maximum size is determined by ## the buffer reaches its maximum size is determined by
## :zeek:see:`Broker::peer_overflow_policy`. ## :zeek:see:`Broker::peer_overflow_policy`.
const peer_buffer_size = 2048 &redef; const peer_buffer_size = 8192 &redef;
## Configures how Broker responds to peers that cannot keep up with the ## Configures how Broker responds to peers that cannot keep up with the
## incoming message rate. Available strategies: ## incoming message rate. Available strategies:
## - disconnect: drop the connection to the unresponsive peer ## - disconnect: drop the connection to the unresponsive peer
## - drop_newest: replace the newest message in the buffer ## - drop_newest: replace the newest message in the buffer
## - drop_oldest: remove the oldest message from the buffer, then append ## - drop_oldest: remove the oldest message from the buffer, then append
const peer_overflow_policy = "disconnect" &redef; const peer_overflow_policy = "drop_oldest" &redef;
## Same as :zeek:see:`Broker::peer_buffer_size` but for WebSocket clients. ## Same as :zeek:see:`Broker::peer_buffer_size` but for WebSocket clients.
const web_socket_buffer_size = 512 &redef; const web_socket_buffer_size = 8192 &redef;
## Same as :zeek:see:`Broker::peer_overflow_policy` but for WebSocket clients. ## Same as :zeek:see:`Broker::peer_overflow_policy` but for WebSocket clients.
const web_socket_overflow_policy = "disconnect" &redef; const web_socket_overflow_policy = "drop_oldest" &redef;
## How frequently Zeek resets some peering/client buffer statistics, ## How frequently Zeek resets some peering/client buffer statistics,
## such as ``max_queued_recently`` in :zeek:see:`BrokerPeeringStats`. ## such as ``max_queued_recently`` in :zeek:see:`BrokerPeeringStats`.

View file

@ -262,7 +262,7 @@ export {
## Interval for retrying failed connections between cluster nodes. ## Interval for retrying failed connections between cluster nodes.
## If set, the ZEEK_DEFAULT_CONNECT_RETRY (given in number of seconds) ## If set, the ZEEK_DEFAULT_CONNECT_RETRY (given in number of seconds)
## environment variable overrides this option. ## environment variable overrides this option.
const retry_interval = 1min &redef; const retry_interval = 1sec &redef;
## When using broker-enabled cluster framework, nodes broadcast this event ## When using broker-enabled cluster framework, nodes broadcast this event
## to exchange their user-defined name along with a string that uniquely ## to exchange their user-defined name along with a string that uniquely

View file

@ -404,11 +404,9 @@ class BrokerState {
public: public:
using LogSeverityLevel = Observer::LogSeverityLevel; using LogSeverityLevel = Observer::LogSeverityLevel;
BrokerState(broker::configuration config, size_t congestion_queue_size, LoggerQueuePtr queue, BrokerState(broker::configuration config, LoggerQueuePtr queue, PeerBufferStatePtr pbstate)
PeerBufferStatePtr pbstate)
: endpoint(std::move(config), telemetry_mgr->GetRegistry()), : endpoint(std::move(config), telemetry_mgr->GetRegistry()),
subscriber( subscriber(endpoint.make_subscriber({broker::topic::statuses(), broker::topic::errors()})),
endpoint.make_subscriber({broker::topic::statuses(), broker::topic::errors()}, congestion_queue_size)),
loggerQueue(std::move(queue)), loggerQueue(std::move(queue)),
peerBufferState(std::move(pbstate)) { peerBufferState(std::move(pbstate)) {
peerBufferState->SetEndpoint(&endpoint); peerBufferState->SetEndpoint(&endpoint);
@ -594,8 +592,7 @@ void Manager::DoInitPostScript() {
auto observer = std::make_shared<Observer>(adapterVerbosity, queue, pbstate); auto observer = std::make_shared<Observer>(adapterVerbosity, queue, pbstate);
broker::logger(observer); // *must* be called before creating the BrokerState broker::logger(observer); // *must* be called before creating the BrokerState
auto cqs = get_option("Broker::congestion_queue_size")->AsCount(); bstate = std::make_shared<BrokerState>(std::move(config), queue, pbstate);
bstate = std::make_shared<BrokerState>(std::move(config), cqs, queue, pbstate);
bstate->logSeverity = static_cast<BrokerSeverityLevel>(logSeverityVal); bstate->logSeverity = static_cast<BrokerSeverityLevel>(logSeverityVal);
bstate->stderrSeverity = static_cast<BrokerSeverityLevel>(stderrSeverityVal); bstate->stderrSeverity = static_cast<BrokerSeverityLevel>(stderrSeverityVal);

View file

@ -23,7 +23,6 @@ redef Cluster::nodes = {
redef exit_only_after_terminate = T; redef exit_only_after_terminate = T;
redef Log::enable_local_logging = T; redef Log::enable_local_logging = T;
redef Log::default_rotation_interval = 0secs; redef Log::default_rotation_interval = 0secs;
redef Cluster::retry_interval = 1sec;
function print_metrics(metrics: vector of Telemetry::Metric) function print_metrics(metrics: vector of Telemetry::Metric)
{ {

View file

@ -20,10 +20,6 @@ redef Cluster::nodes = {
@load misc/weird-stats @load misc/weird-stats
@load policy/frameworks/cluster/experimental @load policy/frameworks/cluster/experimental
redef Cluster::retry_interval = 1sec;
redef Broker::default_listen_retry = 1sec;
redef Broker::default_connect_retry = 1sec;
redef Log::enable_local_logging = T; redef Log::enable_local_logging = T;
redef Log::default_rotation_interval = 0secs; redef Log::default_rotation_interval = 0secs;
redef WeirdStats::weird_stat_interval = 5secs; redef WeirdStats::weird_stat_interval = 5secs;

View file

@ -1 +1 @@
2d1f0ae518b26938e24bd26f701dab17e174a626 de6bc382b2320185c168e9f429e47904034510d3