cluster/zeromq: Metric for msg errors

This commit is contained in:
Arne Welzel 2025-07-22 12:44:34 +02:00
parent 073de9f5fd
commit d2bb86f8b4
2 changed files with 9 additions and 1 deletions

View file

@ -91,7 +91,10 @@ ZeroMQBackend::ZeroMQBackend(std::unique_ptr<EventSerializer> es, std::unique_pt
"Number of published events dropped due to XPUB socket HWM.")),
total_onloop_drops(
zeek::telemetry_mgr->CounterInstance("zeek", "cluster_zeromq_onloop_drops", {},
"Number of received events dropped due to OnLoop queue full.")) {}
"Number of received events dropped due to OnLoop queue full.")),
total_msg_errors(
zeek::telemetry_mgr->CounterInstance("zeek", "cluster_zeromq_msg_errors", {},
"Number of events with the wrong number of message parts.")) {}
ZeroMQBackend::~ZeroMQBackend() {
try {
@ -513,6 +516,7 @@ void ZeroMQBackend::HandleInprocMessages(std::vector<MultipartMessage>& msgs) {
}
else {
ZEROMQ_THREAD_PRINTF("inproc: error: expected 2 or 4 parts, have %zu!\n", msg.size());
total_msg_errors->Inc();
}
}
}
@ -522,6 +526,7 @@ void ZeroMQBackend::HandleLogMessages(const std::vector<MultipartMessage>& msgs)
// sender, format, type, payload
if ( msg.size() != 4 ) {
ZEROMQ_THREAD_PRINTF("log: error: expected 4 parts, have %zu!\n", msg.size());
total_msg_errors->Inc();
continue;
}
@ -540,6 +545,7 @@ void ZeroMQBackend::HandleXPubMessages(const std::vector<MultipartMessage>& msgs
for ( const auto& msg : msgs ) {
if ( msg.size() != 1 ) {
ZEROMQ_THREAD_PRINTF("xpub: error: expected 1 part, have %zu!\n", msg.size());
total_msg_errors->Inc();
continue;
}
@ -576,6 +582,7 @@ void ZeroMQBackend::HandleXSubMessages(const std::vector<MultipartMessage>& msgs
for ( const auto& msg : msgs ) {
if ( msg.size() != 4 ) {
ZEROMQ_THREAD_PRINTF("xsub: error: expected 4 parts, have %zu!\n", msg.size());
total_msg_errors->Inc();
continue;
}

View file

@ -140,6 +140,7 @@ private:
zeek::telemetry::CounterPtr total_xpub_drops; // events dropped due to XPUB socket hwm reached
zeek::telemetry::CounterPtr total_onloop_drops; // events dropped due to onloop queue full
zeek::telemetry::CounterPtr total_msg_errors; // messages with the wrong number of parts
// Could rework to log-once-every X seconds if needed.
double xpub_drop_last_warn_at = 0.0;