mirror of https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
cluster/zeromq: Add support for configurable overflow_policy
The current ZeroMQ default behavior is to block when the local XPUB queue is full. This commit adds a Cluster::Backend::ZeroMQ::overflow_policy setting that supports dropping messages locally when the XPUB socket's queue reaches its HWM. Note that Cluster::publish() continues to return T even for dropped messages, because the dropping happens on a separate thread.
parent 49d16ad6e1
commit f95c7e3499
13 changed files with 482 additions and 14 deletions
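
For context, a minimal sketch of opting into the new behavior from a site script. The redef and the return-value caveat use only names this commit introduces or documents; the `ping` event is a hypothetical placeholder:

    # Sketch only: select the drop policy (the default remains BLOCK).
    redef Cluster::Backend::ZeroMQ::overflow_policy = Cluster::Backend::ZeroMQ::DROP;

    global ping: event(sender: string, c: count);  # hypothetical example event

    event zeek_init() {
        # As the commit message notes, Cluster::publish() still returns T even
        # if the ZeroMQ thread later drops the message, so the return value
        # cannot be used to detect drops.
        local ok = Cluster::publish(Cluster::manager_topic, ping, Cluster::node, 1);
        print ok;  # T either way
    }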
@@ -28,6 +28,45 @@
 module Cluster::Backend::ZeroMQ;
 
 export {
+	## Behavior when the local XPUB socket's queue reaches the high-water-mark (HWM).
+	type OverflowPolicy: enum {
+		BLOCK, ##< Block publishing operations if overloaded.
+		DROP, ##< Drop events if publishing operations would block due to overload.
+	};
+
+	## Overflow policy for ZeroMQ.
+	##
+	## When publishing an event via :zeek:see:`Cluster::publish` would fail
+	## due to a node's local XPUB socket reaching its high water mark
+	## (see :zeek:see:`Cluster::Backend::ZeroMQ::xpub_sndhwm`), Zeek may
+	## either block until there's room available to queue the event, or
+	## drop the event. The default behavior is to block. Note that this can
+	## eventually result in :zeek:see:`Cluster::publish` calls blocking,
+	## thereby halting Zeek-script execution and packet processing until
+	## an event can be published. If packet processing is more important than
+	## cluster communication, it is recommended to select the drop policy.
+	##
+	## For PCAP processing or cluster performance testing, where packet drops
+	## are usually of no concern, the block policy might be more appropriate.
+	##
+	## Note that the return value of :zeek:see:`Cluster::publish` is currently
+	## not affected, even if events are dropped. It will return **T**
+	## even if the event is later dropped locally, as this logic runs
+	## on a separate thread.
+	##
+	## With either policy, metrics are incremented for Zeek operators to
+	## notice a potential cluster overload. For the blocking policy,
+	## a counter labeled ``zeek_cluster_zeromq_xpub_blocks_total`` is
+	## incremented on every blocking operation. For the drop policy, the
+	## ``zeek_cluster_zeromq_xpub_drops_total`` metric is incremented for
+	## every dropped event.
+	##
+	## Increasing :zeek:see:`Cluster::Backend::ZeroMQ::xpub_sndhwm` can help
+	## to account for bursty publish behavior, but should be carefully evaluated,
+	## as it may result in increased peak memory usage or even out-of-memory
+	## situations if set to 0.
+	const overflow_policy = BLOCK &redef;
+
 	## The central broker's XPUB endpoint to connect to.
 	##
 	## A node connects with its XSUB socket to the XPUB socket

@@ -188,7 +227,10 @@ export {
 	##
 	## Whether to configure ``ZMQ_XPUB_NODROP`` on the XPUB socket
 	## connecting to the proxy to detect when sending a message fails
-	## due to reaching the high-water-mark.
+	## due to reaching the high-water-mark. If you set this to **F**,
+	## the :zeek:see:`Cluster::Backend::ZeroMQ::overflow_policy` setting
+	## has no effect as sending on the XPUB socket will always succeed,
+	## but may silently drop messages.
 	##
 	## See ZeroMQ's `ZMQ_XPUB_NODROP documentation <http://api.zeromq.org/4-2:zmq-setsockopt#toc61>`_
 	## for more details.
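As a hedged illustration of the docstring above, an operator could poll the two counters it names from Zeek script. The helper name here is made up, and the Telemetry calls mirror the ones the new btests below use:

    function count_xpub_drops(): count {
        # Query the drop counter named in the documentation above; use
        # "cluster_zeromq_xpub_blocks_total" for the blocking policy instead.
        local ms = Telemetry::collect_metrics("zeek", "cluster_zeromq_xpub_drops_total");
        if ( |ms| != 1 )
            return 0;  # Metric not populated (yet).

        return double_to_count(ms[0]$value);
    }

    event zeek_done() {
        print "xpub drops", count_xpub_drops();
    }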
@@ -15,6 +15,7 @@
 #include <zmq.hpp>
 
 #include "zeek/DebugLogger.h"
+#include "zeek/Desc.h"
 #include "zeek/EventHandler.h"
 #include "zeek/EventRegistry.h"
 #include "zeek/ID.h"
@@ -114,16 +115,34 @@ void ZeroMQBackend::DoInitPostScript() {
     event_unsubscription = zeek::event_registry->Register("Cluster::Backend::ZeroMQ::unsubscription");
     event_subscription = zeek::event_registry->Register("Cluster::Backend::ZeroMQ::subscription");
 
-    total_xpub_stalls =
-        zeek::telemetry_mgr
-            ->CounterInstance("zeek", "cluster_zeromq_xpub_stalls", {},
-                              "Counter for how many times sending on the XPUB socket stalled due to EAGAIN.");
-
     // xpub/xsub hwm configuration
     xpub_sndhwm = static_cast<int>(zeek::id::find_val<zeek::IntVal>("Cluster::Backend::ZeroMQ::xpub_sndhwm")->AsInt());
     xpub_sndbuf = static_cast<int>(zeek::id::find_val<zeek::IntVal>("Cluster::Backend::ZeroMQ::xpub_sndbuf")->AsInt());
     xsub_rcvhwm = static_cast<int>(zeek::id::find_val<zeek::IntVal>("Cluster::Backend::ZeroMQ::xsub_rcvhwm")->AsInt());
     xsub_rcvbuf = static_cast<int>(zeek::id::find_val<zeek::IntVal>("Cluster::Backend::ZeroMQ::xsub_rcvbuf")->AsInt());
 
+    // The values of ZeroMQ::OverflowPolicy.
+    static const auto& block_policy = zeek::id::find_val<zeek::EnumVal>("Cluster::Backend::ZeroMQ::BLOCK");
+    static const auto& drop_policy = zeek::id::find_val<zeek::EnumVal>("Cluster::Backend::ZeroMQ::DROP");
+    static const auto& selected_policy =
+        zeek::id::find_const<zeek::EnumVal>("Cluster::Backend::ZeroMQ::overflow_policy");
+
+    // There are only two policies right now. If there are ever many more, or this
+    // can be generalized across backends, consider introducing a class.
+    if ( selected_policy == block_policy )
+        overflow_policy = OverflowPolicy::Block;
+    else if ( selected_policy == drop_policy )
+        overflow_policy = OverflowPolicy::Drop;
+    else
+        zeek::reporter->InternalError("Unknown ZeroMQ overflow_policy: %s", obj_desc(selected_policy).c_str());
+
+    // Metrics for blocking and dropping.
+    total_xpub_blocks =
+        zeek::telemetry_mgr->CounterInstance("zeek", "cluster_zeromq_xpub_blocks", {},
+                                             "Number of times publishing events stalled due to XPUB socket HWM.");
+
+    total_xpub_drops = zeek::telemetry_mgr->CounterInstance("zeek", "cluster_zeromq_xpub_drops", {},
+                                                            "Number of events dropped due to XPUB socket HWM.");
 }
 
 void ZeroMQBackend::DoTerminate() {
@@ -490,7 +509,13 @@ void ZeroMQBackend::Run() {
                 }
             }
             else if ( msg.size() == 4 ) {
+                // Send out a 4-part message over the XPUB socket.
+                bool skip_msg = false;
+
                 for ( auto& part : msg ) {
+                    if ( skip_msg )
+                        break;
+
                     zmq::send_flags flags = zmq::send_flags::dontwait;
                     if ( part.more() )
                         flags = flags | zmq::send_flags::sndmore;
@@ -510,16 +535,28 @@ void ZeroMQBackend::Run() {
                    }
 
                    // Empty result means xpub.send() returned EAGAIN. The socket reached
-                   // its high-water-mark and we cannot send right now. We simply attempt
-                   // to re-send the message without the dontwait flag after increasing
-                   // the xpub stall metric. This way, ZeroMQ will block in xpub.send() until
-                   // there's enough room available.
+                   // its high-water-mark and we cannot send right now.
+                   //
+                   // If the overflow policy is set to "drop", increment the metric and
+                   // skip processing all parts of this message by setting skip_msg=true.
+                   //
                    if ( ! result ) {
-                       total_xpub_stalls->Inc();
+                       if ( overflow_policy == OverflowPolicy::Drop ) {
+                           total_xpub_drops->Inc();
+                           skip_msg = true;
+                           break; // Breaks the do-while; skip_msg=true above breaks the outer for loop.
+                       }
+
+                       // Under the block policy, re-send the message without the dontwait
+                       // flag after increasing the blocks metric. This way, ZeroMQ will block
+                       // in xpub.send() until there's enough room available.
+                       assert(overflow_policy == OverflowPolicy::Block);
+                       total_xpub_blocks->Inc();
 
                        try {
-                           // We sent non-blocking above so we are able to observe and report stalls
-                           // in a metric. Now that we have done that switch to blocking send.
+                           // We sent non-blocking above so we are able to observe and report
+                           // blocking as a metric. Now that we have done that, switch to an actual
+                           // blocking send.
                            zmq::send_flags block_flags =
                                zmq::send_flags::none | (flags & zmq::send_flags::sndmore);
                            result = xpub.send(part, block_flags);
@@ -2,6 +2,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <memory>
 #include <thread>
 #include <zmq.hpp>

@@ -20,6 +21,14 @@ using CounterPtr = std::shared_ptr<Counter>;
 
 namespace cluster::zeromq {
 
+/**
+ * The ZeroMQ overflow policies.
+ */
+enum class OverflowPolicy : uint8_t {
+    Block,
+    Drop,
+};
+
 class ZeroMQBackend : public cluster::ThreadedBackend {
 public:
     /**

@@ -126,7 +135,10 @@ private:
     std::map<std::string, SubscribeCallback> subscription_callbacks;
     std::set<std::string> xpub_subscriptions;
 
-    zeek::telemetry::CounterPtr total_xpub_stalls;
+    // Overflow policy, initialized in DoInitPostScript().
+    OverflowPolicy overflow_policy = OverflowPolicy::Block;
+    zeek::telemetry::CounterPtr total_xpub_blocks;
+    zeek::telemetry::CounterPtr total_xpub_drops;
 };
 
 } // namespace cluster::zeromq
@@ -0,0 +1,18 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
B nodes_up, 2
B nodes_up, 3
B nodes_up, 4
nodes_down, 2
nodes_down, 3
nodes_down, 4
drop_c, {
[proxy] = 0,
[worker-2] = 0,
[worker-1] = 0
}
last_c, {
[proxy] = 50000,
[worker-2] = 50000,
[worker-1] = 50000
}
GOOD: Observed no XPUB blocks on manager

@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
start, XXXXXXXXXX.XXXXXX
zeek_done, XXXXXXXXXX.XXXXXX
GOOD: Observed XPUB blocks

@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
start, XXXXXXXXXX.XXXXXX
zeek_done, XXXXXXXXXX.XXXXXX
GOOD: Observed XPUB blocks

@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
start, XXXXXXXXXX.XXXXXX
zeek_done, XXXXXXXXXX.XXXXXX
GOOD: Observed XPUB blocks

@@ -0,0 +1,11 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
B nodes_up, 2
B nodes_up, 3
B nodes_up, 4
nodes_down, 2
nodes_down, 3
nodes_down, 4
proxy, dropped?, T
worker-2, dropped?, T
worker-1, dropped?, T
GOOD: Observed no XPUB drops on manager

@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
start, XXXXXXXXXX.XXXXXX
zeek_done, XXXXXXXXXX.XXXXXX
GOOD: Observed XPUB drops

@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
start, XXXXXXXXXX.XXXXXX
zeek_done, XXXXXXXXXX.XXXXXX
GOOD: Observed XPUB drops

@@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
start, XXXXXXXXXX.XXXXXX
zeek_done, XXXXXXXXXX.XXXXXX
GOOD: Observed XPUB drops
testing/btest/cluster/zeromq/overflow-policy-block.zeek (new file, 155 lines)

@@ -0,0 +1,155 @@
# @TEST-DOC: Workers and proxy publish to the manager topic. They publish so fast that their XPUB socket blocks. Check that all messages make it through and that the blocks metric is incremented.
#
# @TEST-REQUIRES: have-zeromq
#
# @TEST-GROUP: cluster-zeromq
#
# @TEST-PORT: XPUB_PORT
# @TEST-PORT: XSUB_PORT
# @TEST-PORT: LOG_PULL_PORT
#
# @TEST-EXEC: cp $FILES/zeromq/cluster-layout-no-logger.zeek cluster-layout.zeek
# @TEST-EXEC: cp $FILES/zeromq/test-bootstrap.zeek zeromq-test-bootstrap.zeek
#
# @TEST-EXEC: zeek --parse-only manager.zeek
# @TEST-EXEC: zeek --parse-only other.zeek
#
# @TEST-EXEC: btest-bg-run manager "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=manager zeek -b ../manager.zeek >out"
# @TEST-EXEC: btest-bg-run proxy "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=proxy zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-1 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-1 zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-2 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-2 zeek -b ../other.zeek >out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff manager/out
# @TEST-EXEC: btest-diff proxy/out
# @TEST-EXEC: btest-diff worker-1/out
# @TEST-EXEC: btest-diff worker-2/out

# @TEST-START-FILE common.zeek
@load ./zeromq-test-bootstrap

global start: event();
global finish: event(name: string);
global ping: event(sender: string, c: count);

# Lower high watermarks from 1000 (default) to something much lower to provoke blocking.
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = 20;
redef Cluster::Backend::ZeroMQ::xsub_rcvhwm = 20;

const total_publishes = 50000;

function get_zeromq_blocks(): count {
	local ms = Telemetry::collect_metrics("zeek", "cluster_zeromq_xpub_blocks_total");
	assert |ms| == 1, fmt("%s", |ms|);
	return double_to_count(ms[0]$value);
}
# @TEST-END-FILE

# @TEST-START-FILE manager.zeek
@load ./common.zeek

global nodes_up: set[string] = {"manager"};
global nodes_down: set[string] = {"manager"};

event send_finish() {
	for ( n in nodes_up )
		Cluster::publish(Cluster::node_topic(n), finish, Cluster::node);
}

event Cluster::node_up(name: string, id: string) {
	add nodes_up[name];
	print "B nodes_up", |nodes_up|;

	if ( |nodes_up| == 4 ) {
		Cluster::publish(Cluster::worker_topic, start);
		Cluster::publish(Cluster::proxy_topic, start);
	}
}

event Cluster::node_down(name: string, id: string) {
	add nodes_down[name];
	print "nodes_down", |nodes_down|;
	if ( |nodes_down| == |Cluster::nodes| )
		terminate();
}

global last_c: table[string] of count;
global drop_c: table[string] of count;

event zeek_init() {
	for ( name, _ in Cluster::nodes ) {
		if ( name == "manager" )
			next;

		last_c[name] = 0;
		drop_c[name] = 0;
	}
}

event ping(sender: string, c: count) {
	local dropped = c - last_c[sender] - 1;
	if ( dropped > 0 ) {
		print "DROP", sender, c, last_c[sender], dropped;
		drop_c[sender] += dropped;
	}

	last_c[sender] = c;

	# Check whether all senders have sent enough messages.
	for ( _, lc in last_c )
		if ( lc < total_publishes )
			return;

	event send_finish();
}

event zeek_done() {
	print "drop_c", drop_c;
	print "last_c", last_c;

	local blocks = get_zeromq_blocks();
	if ( blocks == 0 )
		print "GOOD: Observed no XPUB blocks on manager";
	else
		print "FAIL: XPUB blocks on manager";
}
# @TEST-END-FILE

# @TEST-START-FILE other.zeek
@load ./common.zeek

global publishes = 0;
const batch = 100;

event tick() {
	local i = batch;
	while ( i > 0 ) {
		--i;
		++publishes;
		Cluster::publish(Cluster::manager_topic, ping, Cluster::node, publishes);

		if ( publishes >= total_publishes )
			return;
	}

	schedule 0.01msec { tick() };
}

event start() {
	print "start", current_time();
	event tick();
}

event finish(name: string) {
	terminate();
}

event zeek_done() {
	print "zeek_done", current_time();
	local blocks = get_zeromq_blocks();
	if ( blocks > 0 )
		print "GOOD: Observed XPUB blocks";
	else
		print "FAIL: No XPUB blocks";
}
# @TEST-END-FILE
testing/btest/cluster/zeromq/overflow-policy-drop.zeek (new file, 169 lines)

@@ -0,0 +1,169 @@
# @TEST-DOC: Workers and proxy publish to the manager topic. They publish so fast that messages are dropped. Check that the metric is incremented and that we're missing some pings.
#
# @TEST-REQUIRES: have-zeromq
#
# @TEST-GROUP: cluster-zeromq
#
# @TEST-PORT: XPUB_PORT
# @TEST-PORT: XSUB_PORT
# @TEST-PORT: LOG_PULL_PORT
#
# @TEST-EXEC: cp $FILES/zeromq/cluster-layout-no-logger.zeek cluster-layout.zeek
# @TEST-EXEC: cp $FILES/zeromq/test-bootstrap.zeek zeromq-test-bootstrap.zeek
#
# @TEST-EXEC: zeek --parse-only manager.zeek
# @TEST-EXEC: zeek --parse-only other.zeek
#
# @TEST-EXEC: btest-bg-run manager "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=manager zeek -b ../manager.zeek >out"
# @TEST-EXEC: btest-bg-run proxy "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=proxy zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-1 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-1 zeek -b ../other.zeek >out"
# @TEST-EXEC: btest-bg-run worker-2 "ZEEKPATH=$ZEEKPATH:.. && CLUSTER_NODE=worker-2 zeek -b ../other.zeek >out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff manager/out
# @TEST-EXEC: btest-diff proxy/out
# @TEST-EXEC: btest-diff worker-1/out
# @TEST-EXEC: btest-diff worker-2/out

# @TEST-START-FILE common.zeek
@load ./zeromq-test-bootstrap

global start: event();
global finish: event(name: string);
global ping: event(sender: string, c: count);

# Lower high watermarks from 1000 (default) to something much lower to provoke drops.
redef Cluster::Backend::ZeroMQ::xpub_sndhwm = 20;
redef Cluster::Backend::ZeroMQ::xsub_rcvhwm = 20;

# Drop'em!
redef Cluster::Backend::ZeroMQ::overflow_policy = Cluster::Backend::ZeroMQ::DROP;

const total_publishes = 50000;

function get_zeromq_drops(): count {
	local ms = Telemetry::collect_metrics("zeek", "cluster_zeromq_xpub_drops_total");
	assert |ms| == 1, fmt("%s", |ms|);
	return double_to_count(ms[0]$value);
}
# @TEST-END-FILE

# @TEST-START-FILE manager.zeek
@load ./common.zeek

global nodes_up: set[string] = {"manager"};
global nodes_down: set[string] = {"manager"};

event send_finish() {
	for ( n in nodes_up )
		Cluster::publish(Cluster::node_topic(n), finish, Cluster::node);
}

event Cluster::node_up(name: string, id: string) {
	add nodes_up[name];
	print "B nodes_up", |nodes_up|;

	if ( |nodes_up| == 4 ) {
		Cluster::publish(Cluster::worker_topic, start);
		Cluster::publish(Cluster::proxy_topic, start);
	}
}

event Cluster::node_down(name: string, id: string) {
	add nodes_down[name];
	print "nodes_down", |nodes_down|;
	if ( |nodes_down| == |Cluster::nodes| )
		terminate();
}

global last_c: table[string] of count;
global drop_c: table[string] of count;

event zeek_init() {
	for ( name, _ in Cluster::nodes ) {
		if ( name == "manager" )
			next;

		last_c[name] = 0;
		drop_c[name] = 0;
	}
}

global sent_finish = F;

event ping(sender: string, c: count) {
	local dropped = c - last_c[sender] - 1;
	if ( dropped > 0 )
		drop_c[sender] += dropped;

	last_c[sender] = c;

	# Check whether all senders have sent enough messages.
	for ( _, lc in last_c )
		if ( lc < total_publishes )
			return;

	if ( ! sent_finish ) {
		event send_finish();
		sent_finish = T;
	}
}

event zeek_done() {
	for ( n, dropped in drop_c )
		print n, "dropped?", dropped > 0;

	local drops = get_zeromq_drops();
	if ( drops == 0 )
		print "GOOD: Observed no XPUB drops on manager";
	else
		print "FAIL: XPUB drops on manager";
}
# @TEST-END-FILE

# @TEST-START-FILE other.zeek
@load ./common.zeek

global publishes = 0;
const batch = 100;

event tick() {
	local i = batch;
	while ( i > 0 ) {
		--i;
		++publishes;
		Cluster::publish(Cluster::manager_topic, ping, Cluster::node, publishes);

		# Continue sending a single publish for every tick() even
		# if we've published enough, so that the manager can detect
		# we're done even if some events were dropped.
		if ( publishes >= total_publishes )
			break;
	}

	# Relax publishing once we've published enough so the manager
	# isn't totally overloaded.
	local s = publishes < total_publishes ? 0.01msec : 0.01sec;
	schedule s { tick() };
}

event start() {
	print "start", current_time();
	event tick();
}

event finish(name: string) {
	terminate();
}

event zeek_done() {
	print "zeek_done", current_time();
	local drops = get_zeromq_drops();
	if ( drops > 0 )
		print "GOOD: Observed XPUB drops";
	else
		print "FAIL: No XPUB drops";
}
# @TEST-END-FILE