mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Merge branch 'topic/christian/broker-backpressure-metrics'
* topic/christian/broker-backpressure-metrics:
  Add basic btest to verify that Broker peering telemetry is available.
  Add cluster framework telemetry for Broker's send-buffer use
  Add peer buffer update tracking to the Broker manager's event_observer
  Rename the Broker manager's LoggerAdapter
  Avoid race in the cluster/broker/publish-any btest
This commit is contained in:
commit
c1a5f70df8
16 changed files with 412 additions and 23 deletions
|
@ -104,6 +104,10 @@ export {
|
|||
## Same as :zeek:see:`Broker::peer_overflow_policy` but for WebSocket clients.
|
||||
const web_socket_overflow_policy = "disconnect" &redef;
|
||||
|
||||
## How frequently Zeek resets some peering/client buffer statistics,
|
||||
## such as ``max_queued_recently`` in :zeek:see:`BrokerPeeringStats`.
|
||||
const buffer_stats_reset_interval = 1min &redef;
|
||||
|
||||
## The CAF scheduling policy to use. Available options are "sharing" and
|
||||
## "stealing". The "sharing" policy uses a single, global work queue along
|
||||
## with mutex and condition variable used for accessing it, which may be
|
||||
|
@ -392,6 +396,12 @@ export {
|
|||
## Returns: a unique identifier for the local broker endpoint.
|
||||
global node_id: function(): string;
|
||||
|
||||
## Obtain each peering's send-buffer statistics. The keys are Broker
|
||||
## endpoint IDs.
|
||||
##
|
||||
## Returns: per-peering statistics.
|
||||
global peering_stats: function(): table[string] of BrokerPeeringStats;
|
||||
|
||||
## Sends all pending log messages to remote peers. This normally
|
||||
## doesn't need to be used except for test cases that are time-sensitive.
|
||||
global flush_logs: function(): count;
|
||||
|
@ -554,6 +564,11 @@ function node_id(): string
|
|||
return __node_id();
|
||||
}
|
||||
|
||||
function peering_stats(): table[string] of BrokerPeeringStats
|
||||
{
|
||||
return __peering_stats();
|
||||
}
|
||||
|
||||
function flush_logs(): count
|
||||
{
|
||||
return __flush_logs();
|
||||
|
|
|
@ -14,8 +14,11 @@ redef Broker::log_topic = Cluster::rr_log_topic;
|
|||
# Add a cluster prefix.
|
||||
@prefixes += cluster
|
||||
|
||||
# This should soon condition on loading only when Broker is in use.
|
||||
# Broker-specific additions:
|
||||
@if ( Cluster::backend == Cluster::CLUSTER_BACKEND_BROKER )
|
||||
@load ./broker-backpressure
|
||||
@load ./broker-telemetry
|
||||
@endif
|
||||
|
||||
@if ( Supervisor::is_supervised() )
|
||||
# When running a supervised cluster, populate Cluster::nodes from the node table
|
||||
|
|
69
scripts/base/frameworks/cluster/broker-telemetry.zeek
Normal file
69
scripts/base/frameworks/cluster/broker-telemetry.zeek
Normal file
|
@ -0,0 +1,69 @@
|
|||
# Additional Broker-specific metrics that use Zeek cluster-level node names.
|
||||
|
||||
@load base/frameworks/telemetry
|
||||
|
||||
module Cluster;
|
||||
|
||||
## This gauge tracks the current number of locally queued messages in each
|
||||
## Broker peering's send buffer. The "peer" label identifies the remote side of
|
||||
## the peering, containing a Zeek cluster node name.
|
||||
global broker_peer_buffer_messages_gf = Telemetry::register_gauge_family([
|
||||
$prefix="zeek",
|
||||
$name="broker-peer-buffer-messages",
|
||||
$unit="",
|
||||
$label_names=vector("peer"),
|
||||
$help_text="Number of messages queued in Broker's send buffers",
|
||||
]);
|
||||
|
||||
## This gauge tracks recent maximum queue lengths for each Broker peering's send
|
||||
## buffer. Most of the time the send buffers are nearly empty, so this gauge
|
||||
## helps understand recent bursts of messages. "Recent" here means
|
||||
## :zeek:see:`Broker::buffer_stats_reset_interval`. The time window advances in
|
||||
## increments of at least the stats interval, not incrementally with every new
|
||||
## observed message. That is, Zeek keeps a timestamp of when the window started,
|
||||
## and once it notices that the interval has passed, it moves the start of the
|
||||
## window to current time.
|
||||
global broker_peer_buffer_recent_max_messages_gf = Telemetry::register_gauge_family([
|
||||
$prefix="zeek",
|
||||
$name="broker-peer-buffer-recent-max-messages",
|
||||
$unit="",
|
||||
$label_names=vector("peer"),
|
||||
$help_text="Maximum number of messages recently queued in Broker's send buffers",
|
||||
]);
|
||||
|
||||
## This counter tracks for each Broker peering the number of times its send
|
||||
## buffer has overflowed. For the "disconnect" policy this can at most be 1,
|
||||
## since Broker stops the peering at this time. For the "drop_oldest" and
|
||||
## "drop_newest" policies (see :zeek:see:`Broker::peer_overflow_policy`) the count
|
||||
## instead reflects the number of messages lost.
|
||||
global broker_peer_buffer_overflows_cf = Telemetry::register_counter_family([
|
||||
$prefix="zeek",
|
||||
$name="broker-peer-buffer-overflows",
|
||||
$unit="",
|
||||
$label_names=vector("peer"),
|
||||
$help_text="Number of overflows in Broker's send buffers",
|
||||
]);
|
||||
|
||||
hook Telemetry::sync()
|
||||
{
|
||||
local peers = Broker::peering_stats();
|
||||
local nn: NamedNode;
|
||||
|
||||
for ( peer, stats in peers )
|
||||
{
|
||||
# Translate the Broker IDs to Zeek-level node names. We skip
|
||||
# telemetry for peers where this mapping fails, i.e. ones for
|
||||
# connections to external systems.
|
||||
nn = nodeid_to_node(peer);
|
||||
|
||||
if ( |nn$name| > 0 )
|
||||
{
|
||||
Telemetry::gauge_family_set(broker_peer_buffer_messages_gf,
|
||||
vector(nn$name), stats$num_queued);
|
||||
Telemetry::gauge_family_set(broker_peer_buffer_recent_max_messages_gf,
|
||||
vector(nn$name), stats$max_queued_recently);
|
||||
Telemetry::counter_family_set(broker_peer_buffer_overflows_cf,
|
||||
vector(nn$name), stats$num_overflows);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1135,6 +1135,20 @@ type BrokerStats: record {
|
|||
num_ids_outgoing: count;
|
||||
};
|
||||
|
||||
## Broker statistics for an individual peering.
|
||||
##
|
||||
type BrokerPeeringStats: record {
|
||||
## The number of messages currently queued locally for transmission.
|
||||
num_queued: count;
|
||||
## The maximum number of messages queued in the recent
|
||||
## :zeek:see:`Broker::buffer_stats_reset_interval` time interval.
|
||||
max_queued_recently: count;
|
||||
## The number of times the send buffer has overflowed.
|
||||
num_overflows: count;
|
||||
};
|
||||
|
||||
type BrokerPeeringStatsTable: table[string] of BrokerPeeringStats;
|
||||
|
||||
## Statistics about reporter messages and weirds.
|
||||
##
|
||||
## .. zeek:see:: get_reporter_stats
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue