Merge remote-tracking branch 'origin/topic/timw/3081-prof-log-metrics'

* origin/topic/timw/3081-prof-log-metrics:
  Remove some unnecessary #includes
  Avoid capturing 'this' for callback in telemetry::Manager
  Make telemetry metrics out of MsgThread statistics
  Split cpu time metric into user/system components like prof.log
  Move broker statistics to be telemetry metrics
  Add timer counts as telemetry metrics
  Move thread manager stats to telemetry metric
  Add extra metrics to session_mgr
  Move dns_mgr stats to telemetry instruments
  Move trigger stats to telemetry instruments
  Move pulling of global state inside 'expensive' check for stats
This commit is contained in:
Tim Wojtulewicz 2024-08-05 14:39:56 -07:00
commit b483fad3d0
27 changed files with 503 additions and 108 deletions

View file

@ -45,6 +45,7 @@ using ztd::out_ptr::out_ptr;
#include "zeek/Val.h" #include "zeek/Val.h"
#include "zeek/ZeekString.h" #include "zeek/ZeekString.h"
#include "zeek/iosource/Manager.h" #include "zeek/iosource/Manager.h"
#include "zeek/telemetry/Manager.h"
// Number of seconds we'll wait for a reply. // Number of seconds we'll wait for a reply.
constexpr int DNS_TIMEOUT = 5; constexpr int DNS_TIMEOUT = 5;
@ -545,6 +546,55 @@ void DNS_Mgr::InitSource() {
} }
void DNS_Mgr::InitPostScript() { void DNS_Mgr::InitPostScript() {
num_requests_metric =
telemetry_mgr->CounterInstance("zeek", "dnsmgr_requests", {}, "Total number of requests through DNS_Mgr");
successful_metric = telemetry_mgr->CounterInstance("zeek", "dnsmgr_successful_requests", {},
"Total number of successful requests through DNS_Mgr");
failed_metric = telemetry_mgr->CounterInstance("zeek", "dnsmgr_failed_requests", {},
"Total number of failed requests through DNS_Mgr");
asyncs_pending_metric = telemetry_mgr->GaugeInstance("zeek", "dnsmgr_pending_asyncs_requests", {},
"Number of pending async requests through DNS_Mgr");
cached_hosts_metric =
telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "host"}},
"Number of cached hosts in DNS_Mgr", "", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = 0;
if ( dns_mgr ) {
dns_mgr->UpdateCachedStats(false);
metric.gauge.value = static_cast<double>(dns_mgr->last_cached_stats.hosts);
}
return metric;
});
cached_addresses_metric =
telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "address"}},
"Number of cached addresses in DNS_Mgr", "", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = 0;
if ( dns_mgr ) {
dns_mgr->UpdateCachedStats(false);
metric.gauge.value =
static_cast<double>(dns_mgr->last_cached_stats.addresses);
}
return metric;
});
cached_texts_metric =
telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "text"}},
"Number of cached texts in DNS_Mgr", "", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = 0;
if ( dns_mgr ) {
dns_mgr->UpdateCachedStats(false);
metric.gauge.value = static_cast<double>(dns_mgr->last_cached_stats.texts);
}
return metric;
});
if ( ! doctest::is_running_in_test ) { if ( ! doctest::is_running_in_test ) {
dm_rec = id::find_type<RecordType>("dns_mapping"); dm_rec = id::find_type<RecordType>("dns_mapping");
@ -1158,7 +1208,7 @@ void DNS_Mgr::IssueAsyncRequests() {
AsyncRequest* req = asyncs_queued.front(); AsyncRequest* req = asyncs_queued.front();
asyncs_queued.pop_front(); asyncs_queued.pop_front();
++num_requests; num_requests_metric->Inc();
req->time = util::current_time(); req->time = util::current_time();
if ( req->type == T_PTR ) if ( req->type == T_PTR )
@ -1173,6 +1223,7 @@ void DNS_Mgr::IssueAsyncRequests() {
dns_req->MakeRequest(channel, this); dns_req->MakeRequest(channel, this);
++asyncs_pending; ++asyncs_pending;
asyncs_pending_metric->Inc();
} }
} }
@ -1182,11 +1233,11 @@ void DNS_Mgr::CheckAsyncHostRequest(const std::string& host, bool timeout) {
if ( i != asyncs.end() ) { if ( i != asyncs.end() ) {
if ( timeout ) { if ( timeout ) {
++failed; failed_metric->Inc();
i->second->Timeout(); i->second->Timeout();
} }
else if ( auto addrs = LookupNameInCache(host, true, false) ) { else if ( auto addrs = LookupNameInCache(host, true, false) ) {
++successful; successful_metric->Inc();
i->second->Resolved(addrs); i->second->Resolved(addrs);
} }
else else
@ -1195,6 +1246,7 @@ void DNS_Mgr::CheckAsyncHostRequest(const std::string& host, bool timeout) {
delete i->second; delete i->second;
asyncs.erase(i); asyncs.erase(i);
--asyncs_pending; --asyncs_pending;
asyncs_pending_metric->Dec();
} }
} }
@ -1207,11 +1259,11 @@ void DNS_Mgr::CheckAsyncAddrRequest(const IPAddr& addr, bool timeout) {
if ( i != asyncs.end() ) { if ( i != asyncs.end() ) {
if ( timeout ) { if ( timeout ) {
++failed; failed_metric->Inc();
i->second->Timeout(); i->second->Timeout();
} }
else if ( auto name = LookupAddrInCache(addr, true, false) ) { else if ( auto name = LookupAddrInCache(addr, true, false) ) {
++successful; successful_metric->Inc();
i->second->Resolved(name->CheckString()); i->second->Resolved(name->CheckString());
} }
else else
@ -1220,6 +1272,7 @@ void DNS_Mgr::CheckAsyncAddrRequest(const IPAddr& addr, bool timeout) {
delete i->second; delete i->second;
asyncs.erase(i); asyncs.erase(i);
--asyncs_pending; --asyncs_pending;
asyncs_pending_metric->Dec();
} }
} }
@ -1229,11 +1282,11 @@ void DNS_Mgr::CheckAsyncOtherRequest(const std::string& host, bool timeout, int
auto i = asyncs.find(std::make_pair(request_type, host)); auto i = asyncs.find(std::make_pair(request_type, host));
if ( i != asyncs.end() ) { if ( i != asyncs.end() ) {
if ( timeout ) { if ( timeout ) {
++failed; failed_metric->Inc();
i->second->Timeout(); i->second->Timeout();
} }
else if ( auto name = LookupOtherInCache(host, request_type, true) ) { else if ( auto name = LookupOtherInCache(host, request_type, true) ) {
++successful; successful_metric->Inc();
i->second->Resolved(name->CheckString()); i->second->Resolved(name->CheckString());
} }
else else
@ -1242,6 +1295,7 @@ void DNS_Mgr::CheckAsyncOtherRequest(const std::string& host, bool timeout, int
delete i->second; delete i->second;
asyncs.erase(i); asyncs.erase(i);
--asyncs_pending; --asyncs_pending;
asyncs_pending_metric->Dec();
} }
} }
@ -1293,26 +1347,35 @@ void DNS_Mgr::Process() {
ares_process_fd(channel, ARES_SOCKET_BAD, ARES_SOCKET_BAD); ares_process_fd(channel, ARES_SOCKET_BAD, ARES_SOCKET_BAD);
} }
void DNS_Mgr::UpdateCachedStats(bool force) {
double now = util::current_time();
if ( force || last_cached_stats_update < now - 0.01 ) {
last_cached_stats.hosts = 0;
last_cached_stats.addresses = 0;
last_cached_stats.texts = 0;
last_cached_stats.total = all_mappings.size();
for ( const auto& [key, mapping] : all_mappings ) {
if ( mapping->ReqType() == T_PTR )
last_cached_stats.addresses++;
else if ( mapping->ReqType() == T_A )
last_cached_stats.hosts++;
else
last_cached_stats.texts++;
}
last_cached_stats_update = now;
}
}
void DNS_Mgr::GetStats(Stats* stats) { void DNS_Mgr::GetStats(Stats* stats) {
// TODO: can this use the telemetry framework? stats->requests = static_cast<unsigned long>(num_requests_metric->Value());
stats->requests = num_requests; stats->successful = static_cast<unsigned long>(successful_metric->Value());
stats->successful = successful; stats->failed = static_cast<unsigned long>(failed_metric->Value());
stats->failed = failed;
stats->pending = asyncs_pending; stats->pending = asyncs_pending;
stats->cached_hosts = 0; UpdateCachedStats(true);
stats->cached_addresses = 0; stats->cached = last_cached_stats;
stats->cached_texts = 0;
stats->cached_total = all_mappings.size();
for ( const auto& [key, mapping] : all_mappings ) {
if ( mapping->ReqType() == T_PTR )
stats->cached_addresses++;
else if ( mapping->ReqType() == T_A )
stats->cached_hosts++;
else
stats->cached_texts++;
}
} }
void DNS_Mgr::AsyncRequest::Resolved(const std::string& name) { void DNS_Mgr::AsyncRequest::Resolved(const std::string& name) {

View file

@ -42,6 +42,13 @@ using TableValPtr = IntrusivePtr<TableVal>;
using StringValPtr = IntrusivePtr<StringVal>; using StringValPtr = IntrusivePtr<StringVal>;
using RecordValPtr = IntrusivePtr<RecordVal>; using RecordValPtr = IntrusivePtr<RecordVal>;
namespace telemetry {
class Gauge;
class Counter;
using GaugePtr = std::shared_ptr<Gauge>;
using CounterPtr = std::shared_ptr<Counter>;
} // namespace telemetry
} // namespace zeek } // namespace zeek
namespace zeek::detail { namespace zeek::detail {
@ -198,15 +205,19 @@ public:
*/ */
bool Save(); bool Save();
struct CachedStats {
unsigned long hosts;
unsigned long addresses;
unsigned long texts;
unsigned long total;
};
struct Stats { struct Stats {
unsigned long requests; // These count only async requests. unsigned long requests; // These count only async requests.
unsigned long successful; unsigned long successful;
unsigned long failed; unsigned long failed;
unsigned long pending; unsigned long pending;
unsigned long cached_hosts; CachedStats cached;
unsigned long cached_addresses;
unsigned long cached_texts;
unsigned long cached_total;
}; };
/** /**
@ -285,6 +296,8 @@ protected:
const char* Tag() override { return "DNS_Mgr"; } const char* Tag() override { return "DNS_Mgr"; }
double GetNextTimeout() override; double GetNextTimeout() override;
void UpdateCachedStats(bool force);
DNS_MgrMode mode; DNS_MgrMode mode;
MappingMap all_mappings; MappingMap all_mappings;
@ -293,7 +306,6 @@ protected:
std::string dir; // directory in which cache_name resides std::string dir; // directory in which cache_name resides
bool did_init = false; bool did_init = false;
int asyncs_pending = 0;
RecordTypePtr dm_rec; RecordTypePtr dm_rec;
@ -327,9 +339,19 @@ protected:
using QueuedList = std::list<AsyncRequest*>; using QueuedList = std::list<AsyncRequest*>;
QueuedList asyncs_queued; QueuedList asyncs_queued;
unsigned long num_requests = 0; telemetry::CounterPtr num_requests_metric;
unsigned long successful = 0; telemetry::CounterPtr successful_metric;
unsigned long failed = 0; telemetry::CounterPtr failed_metric;
telemetry::GaugePtr asyncs_pending_metric;
telemetry::GaugePtr cached_hosts_metric;
telemetry::GaugePtr cached_addresses_metric;
telemetry::GaugePtr cached_texts_metric;
double last_cached_stats_update = 0;
CachedStats last_cached_stats;
int asyncs_pending = 0;
std::set<int> socket_fds; std::set<int> socket_fds;
std::set<int> write_socket_fds; std::set<int> write_socket_fds;

View file

@ -119,7 +119,7 @@ void ProfileLogger::Log() {
// TODO: This previously output the number of connections, but now that we're storing // TODO: This previously output the number of connections, but now that we're storing
// sessions as well as connections, this might need to be renamed. // sessions as well as connections, this might need to be renamed.
file->Write(util::fmt("%.06f Conns: total=%" PRIu64 " current=%" PRIu64 "/%u\n", run_state::network_time, file->Write(util::fmt("%.06f Conns: total=%" PRIu64 " current=%" PRIu64 "/%zu\n", run_state::network_time,
Connection::TotalConnections(), Connection::CurrentConnections(), Connection::TotalConnections(), Connection::CurrentConnections(),
session_mgr->CurrentSessions())); session_mgr->CurrentSessions()));
@ -173,7 +173,7 @@ void ProfileLogger::Log() {
util::fmt("%.06f DNS_Mgr: requests=%lu successful=%lu failed=%lu pending=%lu " util::fmt("%.06f DNS_Mgr: requests=%lu successful=%lu failed=%lu pending=%lu "
"cached_hosts=%lu cached_addrs=%lu\n", "cached_hosts=%lu cached_addrs=%lu\n",
run_state::network_time, dstats.requests, dstats.successful, dstats.failed, dstats.pending, run_state::network_time, dstats.requests, dstats.successful, dstats.failed, dstats.pending,
dstats.cached_hosts, dstats.cached_addresses)); dstats.cached.hosts, dstats.cached.addresses));
trigger::Manager::Stats tstats; trigger::Manager::Stats tstats;
trigger_mgr->GetStats(&tstats); trigger_mgr->GetStats(&tstats);
@ -188,7 +188,7 @@ void ProfileLogger::Log() {
timer_type_to_string(static_cast<TimerType>(i)), current_timers[i])); timer_type_to_string(static_cast<TimerType>(i)), current_timers[i]));
} }
file->Write(util::fmt("%0.6f Threads: current=%d\n", run_state::network_time, thread_mgr->NumThreads())); file->Write(util::fmt("%0.6f Threads: current=%zu\n", run_state::network_time, thread_mgr->NumThreads()));
const threading::Manager::msg_stats_list& thread_stats = thread_mgr->GetMsgThreadStats(); const threading::Manager::msg_stats_list& thread_stats = thread_mgr->GetMsgThreadStats();
for ( threading::Manager::msg_stats_list::const_iterator i = thread_stats.begin(); i != thread_stats.end(); ++i ) { for ( threading::Manager::msg_stats_list::const_iterator i = thread_stats.begin(); i != thread_stats.end(); ++i ) {
@ -213,14 +213,12 @@ void ProfileLogger::Log() {
cs.num_events_outgoing, cs.num_logs_incoming, cs.num_logs_outgoing, cs.num_ids_incoming, cs.num_events_outgoing, cs.num_logs_incoming, cs.num_logs_outgoing, cs.num_ids_incoming,
cs.num_ids_outgoing)); cs.num_ids_outgoing));
// Script-level state.
const auto& globals = global_scope()->Vars();
if ( expensive ) { if ( expensive ) {
// Script-level state.
int total_table_entries = 0; int total_table_entries = 0;
int total_table_rentries = 0; int total_table_rentries = 0;
for ( const auto& global : globals ) { for ( const auto& global : global_scope()->Vars() ) {
auto& id = global.second; auto& id = global.second;
// We don't show/count internal globals as they are always // We don't show/count internal globals as they are always

View file

@ -9,7 +9,7 @@
#include "zeek/RunState.h" #include "zeek/RunState.h"
#include "zeek/broker/Manager.h" #include "zeek/broker/Manager.h"
#include "zeek/iosource/Manager.h" #include "zeek/iosource/Manager.h"
#include "zeek/iosource/PktSrc.h" #include "zeek/telemetry/Manager.h"
#include "zeek/util.h" #include "zeek/util.h"
namespace zeek::detail { namespace zeek::detail {
@ -97,6 +97,34 @@ void TimerMgr::InitPostScript() {
iosource_mgr->Register(this, true); iosource_mgr->Register(this, true);
dispatch_all_expired = zeek::detail::max_timer_expires == 0; dispatch_all_expired = zeek::detail::max_timer_expires == 0;
cumulative_num_metric = telemetry_mgr->CounterInstance("zeek", "timers", {}, "Cumulative number of timers", "",
[]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.counter.value =
static_cast<double>(timer_mgr->CumulativeNum());
return metric;
});
lag_time_metric = telemetry_mgr->GaugeInstance("zeek", "timers_lag_time", {},
"Lag between current network time and last expired timer", "seconds",
[]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value =
run_state::network_time - timer_mgr->last_timestamp;
return metric;
});
std::shared_ptr<telemetry::GaugeFamily> family =
telemetry_mgr->GaugeFamily("zeek", "timers_pending", {"type"}, "Number of timers for a certain type");
for ( int i = 0; i < NUM_TIMER_TYPES; i++ ) {
current_timer_metrics[i] = family->GetOrAdd({{"type", timer_type_to_string(static_cast<TimerType>(i))}},
[i]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = TimerMgr::CurrentTimers()[i];
return metric;
});
}
} }
void TimerMgr::Add(Timer* timer) { void TimerMgr::Add(Timer* timer) {

View file

@ -10,7 +10,14 @@
namespace zeek { namespace zeek {
class ODesc; class ODesc;
}
namespace telemetry {
class Gauge;
class Counter;
using GaugePtr = std::shared_ptr<Gauge>;
using CounterPtr = std::shared_ptr<Counter>;
} // namespace telemetry
} // namespace zeek
namespace zeek::detail { namespace zeek::detail {
@ -153,10 +160,12 @@ private:
// for the max_timer_expires=0 case. // for the max_timer_expires=0 case.
bool dispatch_all_expired = false; bool dispatch_all_expired = false;
size_t peak_size = 0;
size_t cumulative_num = 0;
static unsigned int current_timers[NUM_TIMER_TYPES]; static unsigned int current_timers[NUM_TIMER_TYPES];
telemetry::CounterPtr cumulative_num_metric;
telemetry::GaugePtr lag_time_metric;
telemetry::GaugePtr current_timer_metrics[NUM_TIMER_TYPES];
std::unique_ptr<PriorityQueue> q; std::unique_ptr<PriorityQueue> q;
}; };

View file

@ -13,6 +13,7 @@
#include "zeek/Traverse.h" #include "zeek/Traverse.h"
#include "zeek/Val.h" #include "zeek/Val.h"
#include "zeek/iosource/Manager.h" #include "zeek/iosource/Manager.h"
#include "zeek/telemetry/Manager.h"
using namespace zeek::detail; using namespace zeek::detail;
using namespace zeek::detail::trigger; using namespace zeek::detail::trigger;
@ -437,7 +438,19 @@ Manager::Manager() : iosource::IOSource() { pending = new TriggerList(); }
Manager::~Manager() { delete pending; } Manager::~Manager() { delete pending; }
void Manager::InitPostScript() { iosource_mgr->Register(this, true); } void Manager::InitPostScript() {
trigger_count = telemetry_mgr->CounterInstance("zeek", "triggers", {}, "Total number of triggers scheduled");
trigger_pending =
telemetry_mgr->GaugeInstance("zeek", "pending_triggers", {}, "Pending number of triggers", "",
[]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value =
trigger_mgr ? static_cast<double>(trigger_mgr->pending->size()) : 0.0;
return metric;
});
iosource_mgr->Register(this, true);
}
double Manager::GetNextTimeout() { return pending->empty() ? -1 : run_state::network_time + 0.100; } double Manager::GetNextTimeout() { return pending->empty() ? -1 : run_state::network_time + 0.100; }
@ -468,13 +481,13 @@ void Manager::Queue(Trigger* trigger) {
if ( std::find(pending->begin(), pending->end(), trigger) == pending->end() ) { if ( std::find(pending->begin(), pending->end(), trigger) == pending->end() ) {
Ref(trigger); Ref(trigger);
pending->push_back(trigger); pending->push_back(trigger);
total_triggers++; trigger_count->Inc();
iosource_mgr->Wakeup(Tag()); iosource_mgr->Wakeup(Tag());
} }
} }
void Manager::GetStats(Stats* stats) { void Manager::GetStats(Stats* stats) {
stats->total = total_triggers; stats->total = static_cast<unsigned long>(trigger_count->Value());
stats->pending = pending->size(); stats->pending = pending->size();
} }

View file

@ -18,6 +18,13 @@ class Val;
using ValPtr = IntrusivePtr<Val>; using ValPtr = IntrusivePtr<Val>;
namespace telemetry {
class Gauge;
class Counter;
using GaugePtr = std::shared_ptr<Gauge>;
using CounterPtr = std::shared_ptr<Counter>;
} // namespace telemetry
namespace detail { namespace detail {
class Frame; class Frame;
@ -187,7 +194,8 @@ public:
private: private:
using TriggerList = std::list<Trigger*>; using TriggerList = std::list<Trigger*>;
TriggerList* pending; TriggerList* pending;
unsigned long total_triggers = 0; telemetry::CounterPtr trigger_count;
telemetry::GaugePtr trigger_pending;
}; };
} // namespace trigger } // namespace trigger

View file

@ -340,6 +340,43 @@ void Manager::InitPostScript() {
bstate->subscriber.add_topic(broker::topic::store_events(), true); bstate->subscriber.add_topic(broker::topic::store_events(), true);
InitializeBrokerStoreForwarding(); InitializeBrokerStoreForwarding();
num_peers_metric =
telemetry_mgr->GaugeInstance("zeek", "broker_peers", {}, "Current number of peers connected via broker", "",
[]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(broker_mgr->peer_count);
return metric;
});
num_stores_metric =
telemetry_mgr->GaugeInstance("zeek", "broker_stores", {}, "Current number of stores connected via broker", "",
[]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(broker_mgr->data_stores.size());
return metric;
});
num_pending_queries_metric =
telemetry_mgr->GaugeInstance("zeek", "broker_pending_queries", {}, "Current number of pending broker queries",
"", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(broker_mgr->pending_queries.size());
return metric;
});
num_events_incoming_metric = telemetry_mgr->CounterInstance("zeek", "broker_incoming_events", {},
"Total number of incoming events via broker");
num_events_outgoing_metric = telemetry_mgr->CounterInstance("zeek", "broker_outgoing_events", {},
"Total number of outgoing events via broker");
num_logs_incoming_metric =
telemetry_mgr->CounterInstance("zeek", "broker_incoming_logs", {}, "Total number of incoming logs via broker");
num_logs_outgoing_metric =
telemetry_mgr->CounterInstance("zeek", "broker_outgoing_logs", {}, "Total number of outgoing logs via broker");
num_ids_incoming_metric =
telemetry_mgr->CounterInstance("zeek", "broker_incoming_ids", {}, "Total number of incoming ids via broker");
num_ids_outgoing_metric =
telemetry_mgr->CounterInstance("zeek", "broker_outgoing_ids", {}, "Total number of outgoing ids via broker");
} }
void Manager::InitializeBrokerStoreForwarding() { void Manager::InitializeBrokerStoreForwarding() {
@ -528,7 +565,7 @@ bool Manager::PublishEvent(string topic, std::string name, broker::vector args,
DBG_LOG(DBG_BROKER, "Publishing event: %s", RenderEvent(topic, name, args).c_str()); DBG_LOG(DBG_BROKER, "Publishing event: %s", RenderEvent(topic, name, args).c_str());
broker::zeek::Event ev(std::move(name), std::move(args), broker::to_timestamp(ts)); broker::zeek::Event ev(std::move(name), std::move(args), broker::to_timestamp(ts));
bstate->endpoint.publish(std::move(topic), ev.move_data()); bstate->endpoint.publish(std::move(topic), ev.move_data());
++statistics.num_events_outgoing; num_events_outgoing_metric->Inc();
return true; return true;
} }
@ -588,7 +625,7 @@ bool Manager::PublishIdentifier(std::string topic, std::string id) {
broker::zeek::IdentifierUpdate msg(std::move(id), std::move(data.value_)); broker::zeek::IdentifierUpdate msg(std::move(id), std::move(data.value_));
DBG_LOG(DBG_BROKER, "Publishing id-update: %s", RenderMessage(topic, msg.as_data()).c_str()); DBG_LOG(DBG_BROKER, "Publishing id-update: %s", RenderMessage(topic, msg.as_data()).c_str());
bstate->endpoint.publish(std::move(topic), msg.move_data()); bstate->endpoint.publish(std::move(topic), msg.move_data());
++statistics.num_ids_outgoing; num_ids_outgoing_metric->Inc();
return true; return true;
} }
@ -715,8 +752,10 @@ bool Manager::PublishLogWrite(EnumVal* stream, EnumVal* writer, string path, int
++lb.message_count; ++lb.message_count;
lb.msgs[topic].add(std::move(msg)); lb.msgs[topic].add(std::move(msg));
if ( lb.message_count >= log_batch_size ) if ( lb.message_count >= log_batch_size ) {
statistics.num_logs_outgoing += lb.Flush(bstate->endpoint, log_batch_size); auto outgoing_logs = static_cast<double>(lb.Flush(bstate->endpoint, log_batch_size));
num_logs_outgoing_metric->Inc(outgoing_logs);
}
return true; return true;
} }
@ -746,7 +785,8 @@ size_t Manager::FlushLogBuffers() {
for ( auto& lb : log_buffers ) for ( auto& lb : log_buffers )
rval += lb.Flush(bstate->endpoint, log_batch_size); rval += lb.Flush(bstate->endpoint, log_batch_size);
statistics.num_logs_outgoing += rval; num_logs_outgoing_metric->Inc(rval);
return rval; return rval;
} }
@ -1141,7 +1181,7 @@ void Manager::ProcessMessage(std::string_view topic, broker::zeek::Event& ev) {
ts = run_state::network_time; ts = run_state::network_time;
DBG_LOG(DBG_BROKER, "Process event: %s (%.6f) %s", c_str_safe(name).c_str(), ts, RenderMessage(args).c_str()); DBG_LOG(DBG_BROKER, "Process event: %s (%.6f) %s", c_str_safe(name).c_str(), ts, RenderMessage(args).c_str());
++statistics.num_events_incoming; num_events_incoming_metric->Inc();
auto handler = event_registry->Lookup(name); auto handler = event_registry->Lookup(name);
if ( ! handler ) if ( ! handler )
@ -1286,7 +1326,7 @@ bool Manager::ProcessMessage(std::string_view, broker::zeek::LogWrite& lw) {
return false; return false;
} }
++statistics.num_logs_incoming; num_logs_incoming_metric->Inc();
auto&& stream_id_name = lw.stream_id().name; auto&& stream_id_name = lw.stream_id().name;
// Get stream ID. // Get stream ID.
@ -1352,7 +1392,7 @@ bool Manager::ProcessMessage(std::string_view, broker::zeek::IdentifierUpdate& i
return false; return false;
} }
++statistics.num_ids_incoming; num_ids_incoming_metric->Inc();
auto id_name = c_str_safe(iu.id_name()); auto id_name = c_str_safe(iu.id_name());
auto id_value = convert_if_broker_variant_or_move(iu.id_value()); auto id_value = convert_if_broker_variant_or_move(iu.id_value());
const auto& id = zeek::detail::global_scope()->Find(id_name); const auto& id = zeek::detail::global_scope()->Find(id_name);
@ -1706,7 +1746,12 @@ const Stats& Manager::GetStatistics() {
statistics.num_stores = data_stores.size(); statistics.num_stores = data_stores.size();
statistics.num_pending_queries = pending_queries.size(); statistics.num_pending_queries = pending_queries.size();
// The other attributes are set as activity happens. statistics.num_events_incoming = static_cast<size_t>(num_events_incoming_metric->Value());
statistics.num_events_outgoing = static_cast<size_t>(num_events_outgoing_metric->Value());
statistics.num_logs_incoming = static_cast<size_t>(num_logs_incoming_metric->Value());
statistics.num_logs_outgoing = static_cast<size_t>(num_logs_outgoing_metric->Value());
statistics.num_ids_incoming = static_cast<size_t>(num_ids_incoming_metric->Value());
statistics.num_ids_outgoing = static_cast<size_t>(num_ids_outgoing_metric->Value());
return statistics; return statistics;
} }

View file

@ -27,8 +27,11 @@ using VectorTypePtr = IntrusivePtr<VectorType>;
using TableValPtr = IntrusivePtr<TableVal>; using TableValPtr = IntrusivePtr<TableVal>;
namespace telemetry { namespace telemetry {
class Manager; class Gauge;
} class Counter;
using GaugePtr = std::shared_ptr<Gauge>;
using CounterPtr = std::shared_ptr<Counter>;
} // namespace telemetry
namespace detail { namespace detail {
class Frame; class Frame;
@ -451,6 +454,16 @@ private:
std::string zeek_table_db_directory; std::string zeek_table_db_directory;
static int script_scope; static int script_scope;
telemetry::GaugePtr num_peers_metric;
telemetry::GaugePtr num_stores_metric;
telemetry::GaugePtr num_pending_queries_metric;
telemetry::CounterPtr num_events_incoming_metric;
telemetry::CounterPtr num_events_outgoing_metric;
telemetry::CounterPtr num_logs_incoming_metric;
telemetry::CounterPtr num_logs_outgoing_metric;
telemetry::CounterPtr num_ids_incoming_metric;
telemetry::CounterPtr num_ids_outgoing_metric;
}; };
} // namespace Broker } // namespace Broker

View file

@ -46,9 +46,9 @@ public:
ProtocolMap::iterator InitCounters(const std::string& protocol) { ProtocolMap::iterator InitCounters(const std::string& protocol) {
auto active_family = auto active_family =
telemetry_mgr->GaugeFamily("zeek", "active-sessions", {"protocol"}, "Active Zeek Sessions"); telemetry_mgr->GaugeFamily("zeek", "active_sessions", {"protocol"}, "Active Zeek Sessions");
auto total_family = auto total_family =
telemetry_mgr->CounterFamily("zeek", "total-sessions", {"protocol"}, "Total number of sessions"); telemetry_mgr->CounterFamily("zeek", "total_sessions", {"protocol"}, "Total number of sessions");
auto [it, inserted] = entries.insert({protocol, Protocol{active_family, total_family, protocol}}); auto [it, inserted] = entries.insert({protocol, Protocol{active_family, total_family, protocol}});
@ -75,7 +75,17 @@ private:
} // namespace detail } // namespace detail
Manager::Manager() { stats = new detail::ProtocolStats(); } Manager::Manager() {
stats = new detail::ProtocolStats();
ended_sessions_metric_family = telemetry_mgr->CounterFamily("zeek", "ended_sessions", {"reason"},
"Number of sessions ended for specific reasons");
ended_by_inactivity_metric =
ended_sessions_metric_family->GetOrAdd({{"reason", "inactivity"}}, []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.counter.value = static_cast<double>(zeek::detail::killed_by_inactivity);
return metric;
});
}
Manager::~Manager() { Manager::~Manager() {
Clear(); Clear();

View file

@ -13,6 +13,13 @@
namespace zeek { namespace zeek {
namespace telemetry {
class CounterFamily;
using CounterFamilyPtr = std::shared_ptr<CounterFamily>;
class Counter;
using CounterPtr = std::shared_ptr<Counter>;
} // namespace telemetry
namespace detail { namespace detail {
class PacketFilter; class PacketFilter;
} }
@ -82,7 +89,7 @@ public:
void Weird(const char* name, const Packet* pkt, const char* addl = "", const char* source = ""); void Weird(const char* name, const Packet* pkt, const char* addl = "", const char* source = "");
void Weird(const char* name, const IP_Hdr* ip, const char* addl = ""); void Weird(const char* name, const IP_Hdr* ip, const char* addl = "");
unsigned int CurrentSessions() { return session_map.size(); } size_t CurrentSessions() { return session_map.size(); }
private: private:
using SessionMap = std::unordered_map<detail::Key, Session*, detail::KeyHash>; using SessionMap = std::unordered_map<detail::Key, Session*, detail::KeyHash>;
@ -96,6 +103,8 @@ private:
SessionMap session_map; SessionMap session_map;
detail::ProtocolStats* stats; detail::ProtocolStats* stats;
telemetry::CounterFamilyPtr ended_sessions_metric_family;
telemetry::CounterPtr ended_by_inactivity_metric;
}; };
} // namespace session } // namespace session

View file

@ -83,7 +83,7 @@ function get_conn_stats%(%): ConnStats
r->Assign(n++, Connection::TotalConnections()); r->Assign(n++, Connection::TotalConnections());
r->Assign(n++, Connection::CurrentConnections()); r->Assign(n++, Connection::CurrentConnections());
r->Assign(n++, session_mgr->CurrentSessions()); r->Assign(n++, static_cast<uint64_t>(session_mgr->CurrentSessions()));
session::Stats s; session::Stats s;
if ( session_mgr ) if ( session_mgr )
@ -252,10 +252,10 @@ function get_dns_stats%(%): DNSStats
r->Assign(n++, static_cast<uint64_t>(dstats.successful)); r->Assign(n++, static_cast<uint64_t>(dstats.successful));
r->Assign(n++, static_cast<uint64_t>(dstats.failed)); r->Assign(n++, static_cast<uint64_t>(dstats.failed));
r->Assign(n++, static_cast<uint64_t>(dstats.pending)); r->Assign(n++, static_cast<uint64_t>(dstats.pending));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_hosts)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.hosts));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_addresses)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.addresses));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_texts)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.texts));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_total)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.total));
return std::move(r); return std::move(r);
%} %}
@ -337,7 +337,7 @@ function get_thread_stats%(%): ThreadStats
auto r = zeek::make_intrusive<zeek::RecordVal>(ThreadStats); auto r = zeek::make_intrusive<zeek::RecordVal>(ThreadStats);
int n = 0; int n = 0;
r->Assign(n++, zeek::thread_mgr->NumThreads()); r->Assign(n++, static_cast<uint64_t>(zeek::thread_mgr->NumThreads()));
return std::move(r); return std::move(r);
%} %}

View file

@ -3,7 +3,7 @@
using namespace zeek::telemetry; using namespace zeek::telemetry;
Counter::Counter(FamilyType* family, const prometheus::Labels& labels, prometheus::CollectCallbackPtr callback) noexcept Counter::Counter(FamilyType* family, const prometheus::Labels& labels, prometheus::CollectCallbackPtr callback) noexcept
: handle(family->Add(labels)), labels(labels) { : family(family), handle(family->Add(labels)), labels(labels) {
if ( callback ) { if ( callback ) {
handle.AddCollectCallback(std::move(callback)); handle.AddCollectCallback(std::move(callback));
has_callback = true; has_callback = true;

View file

@ -56,6 +56,7 @@ public:
bool CompareLabels(const prometheus::Labels& lbls) const { return labels == lbls; } bool CompareLabels(const prometheus::Labels& lbls) const { return labels == lbls; }
private: private:
FamilyType* family = nullptr;
Handle& handle; Handle& handle;
prometheus::Labels labels; prometheus::Labels labels;
bool has_callback = false; bool has_callback = false;

View file

@ -15,7 +15,7 @@ double Gauge::Value() const noexcept {
Gauge::Gauge(FamilyType* family, const prometheus::Labels& labels, prometheus::CollectCallbackPtr callback) noexcept Gauge::Gauge(FamilyType* family, const prometheus::Labels& labels, prometheus::CollectCallbackPtr callback) noexcept
: handle(family->Add(labels)), labels(labels) { : family(family), handle(family->Add(labels)), labels(labels) {
if ( callback ) { if ( callback ) {
handle.AddCollectCallback(std::move(callback)); handle.AddCollectCallback(std::move(callback));
has_callback = true; has_callback = true;

View file

@ -74,6 +74,7 @@ public:
bool CompareLabels(const prometheus::Labels& lbls) const { return labels == lbls; } bool CompareLabels(const prometheus::Labels& lbls) const { return labels == lbls; }
private: private:
FamilyType* family = nullptr;
Handle& handle; Handle& handle;
prometheus::Labels labels; prometheus::Labels labels;
bool has_callback = false; bool has_callback = false;

View file

@ -112,14 +112,14 @@ void Manager::InitPostScript() {
} }
#ifdef HAVE_PROCESS_STAT_METRICS #ifdef HAVE_PROCESS_STAT_METRICS
static auto get_stats = [this]() -> const detail::process_stats* { static auto get_stats = []() -> const detail::process_stats* {
double now = util::current_time(); double now = util::current_time();
if ( this->process_stats_last_updated < now - 0.01 ) { if ( telemetry_mgr->process_stats_last_updated < now - 0.01 ) {
this->current_process_stats = detail::get_process_stats(); telemetry_mgr->current_process_stats = detail::get_process_stats();
this->process_stats_last_updated = now; telemetry_mgr->process_stats_last_updated = now;
} }
return &this->current_process_stats; return &telemetry_mgr->current_process_stats;
}; };
rss_gauge = GaugeInstance("process", "resident_memory", {}, "Resident memory size", "bytes", rss_gauge = GaugeInstance("process", "resident_memory", {}, "Resident memory size", "bytes",
[]() -> prometheus::ClientMetric { []() -> prometheus::ClientMetric {
@ -137,13 +137,21 @@ void Manager::InitPostScript() {
return metric; return metric;
}); });
cpu_gauge = GaugeInstance("process", "cpu", {}, "Total user and system CPU time spent", "seconds", cpu_user_counter = CounterInstance("process", "cpu_user", {}, "Total user CPU time spent", "seconds",
[]() -> prometheus::ClientMetric { []() -> prometheus::ClientMetric {
auto* s = get_stats(); auto* s = get_stats();
prometheus::ClientMetric metric; prometheus::ClientMetric metric;
metric.gauge.value = s->cpu; metric.gauge.value = s->cpu_user;
return metric; return metric;
}); });
cpu_system_counter = CounterInstance("process", "cpu_system", {}, "Total system CPU time spent", "seconds",
[]() -> prometheus::ClientMetric {
auto* s = get_stats();
prometheus::ClientMetric metric;
metric.gauge.value = s->cpu_system;
return metric;
});
fds_gauge = GaugeInstance("process", "open_fds", {}, "Number of open file descriptors", "", fds_gauge = GaugeInstance("process", "open_fds", {}, "Number of open file descriptors", "",
[]() -> prometheus::ClientMetric { []() -> prometheus::ClientMetric {
@ -623,18 +631,6 @@ void Manager::WaitForPrometheusCallbacks() {
using namespace std::literals; using namespace std::literals;
using namespace zeek::telemetry; using namespace zeek::telemetry;
namespace {
template<class T>
auto toVector(zeek::Span<T> xs) {
std::vector<std::remove_const_t<T>> result;
for ( auto&& x : xs )
result.emplace_back(x);
return result;
}
} // namespace
SCENARIO("telemetry managers provide access to counter families") { SCENARIO("telemetry managers provide access to counter families") {
GIVEN("a telemetry manager") { GIVEN("a telemetry manager") {
Manager mgr; Manager mgr;

View file

@ -263,7 +263,8 @@ private:
GaugePtr rss_gauge; GaugePtr rss_gauge;
GaugePtr vms_gauge; GaugePtr vms_gauge;
GaugePtr cpu_gauge; CounterPtr cpu_user_counter;
CounterPtr cpu_system_counter;
GaugePtr fds_gauge; GaugePtr fds_gauge;
std::shared_ptr<prometheus::Registry> prometheus_registry; std::shared_ptr<prometheus::Registry> prometheus_registry;

View file

@ -34,10 +34,10 @@ process_stats get_process_stats() {
if ( task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, reinterpret_cast<task_info_t>(&info), &count) == if ( task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, reinterpret_cast<task_info_t>(&info), &count) ==
KERN_SUCCESS ) { KERN_SUCCESS ) {
// Round to milliseconds. // Round to milliseconds.
result.cpu += info.user_time.seconds; result.cpu_user += info.user_time.seconds;
result.cpu += ceil(info.user_time.microseconds / 1000.0) / 1000.0; result.cpu_user += ceil(info.user_time.microseconds / 1000.0) / 1000.0;
result.cpu += info.system_time.seconds; result.cpu_system += info.system_time.seconds;
result.cpu += ceil(info.system_time.microseconds / 1000.0) / 1000.0; result.cpu_system += ceil(info.system_time.microseconds / 1000.0) / 1000.0;
} }
} }
// Fetch open file handles. // Fetch open file handles.
@ -154,7 +154,8 @@ process_stats get_process_stats() {
result.rss = rss_pages * page_size; result.rss = rss_pages * page_size;
result.vms = vmsize_bytes; result.vms = vmsize_bytes;
result.cpu = static_cast<double>(utime_ticks + stime_ticks) / ticks_per_second; result.cpu_user = static_cast<double>(utime_ticks) / ticks_per_second;
result.cpu_system = static_cast<double>(stime_ticks) / ticks_per_second;
result.fds = count_entries_in_directory("/proc/self/fd"); result.fds = count_entries_in_directory("/proc/self/fd");
} }
@ -187,7 +188,10 @@ process_stats get_process_stats() {
if ( kp ) { if ( kp ) {
result.vms = kp->ki_size; result.vms = kp->ki_size;
result.rss = kp->ki_rssize * getpagesize(); result.rss = kp->ki_rssize * getpagesize();
result.cpu = static_cast<double>(kp->ki_runtime) / 1000000.0; result.cpu_user = static_cast<double>(kp->ki_rusage.ru_utime.tv_sec) +
(static_cast<double>(kp->ki_rusage.ru_utime.tv_usec) / 1e6);
result.cpu_system = static_cast<double>(kp->ki_rusage.ru_stime.tv_sec) +
(static_cast<double>(kp->ki_rusage.ru_stime.tv_usec) / 1e6);
struct procstat* procstat = procstat_open_sysctl(); struct procstat* procstat = procstat_open_sysctl();
struct filestat_list* files = procstat_getfiles(procstat, kp, 0); struct filestat_list* files = procstat_getfiles(procstat, kp, 0);

View file

@ -9,7 +9,8 @@ namespace zeek::telemetry::detail {
struct process_stats { struct process_stats {
int64_t rss = 0; int64_t rss = 0;
int64_t vms = 0; int64_t vms = 0;
double cpu = 0.0; double cpu_user = 0.0;
double cpu_system = 0.0;
int64_t fds = 0; int64_t fds = 0;
}; };

View file

@ -7,7 +7,7 @@
#include "zeek/IPAddr.h" #include "zeek/IPAddr.h"
#include "zeek/NetVar.h" #include "zeek/NetVar.h"
#include "zeek/RunState.h" #include "zeek/RunState.h"
#include "zeek/iosource/Manager.h" #include "zeek/telemetry/Manager.h"
namespace zeek::threading { namespace zeek::threading {
namespace detail { namespace detail {
@ -22,6 +22,9 @@ void HeartbeatTimer::Dispatch(double t, bool is_expire) {
} // namespace detail } // namespace detail
static std::vector<uint64_t> pending_bucket_brackets = {1, 10, 100,
1000, 10000, std::numeric_limits<uint64_t>::infinity()};
Manager::Manager() { Manager::Manager() {
DBG_LOG(DBG_THREADING, "Creating thread manager ..."); DBG_LOG(DBG_THREADING, "Creating thread manager ...");
@ -36,6 +39,131 @@ Manager::~Manager() {
Terminate(); Terminate();
} }
void Manager::InitPostScript() {
static auto get_message_thread_stats = []() -> const BucketedMessages* {
if ( ! thread_mgr->terminating ) {
double now = util::current_time();
if ( thread_mgr->bucketed_messages_last_updated < now - 1 ) {
thread_mgr->current_bucketed_messages.pending_in_total = 0;
thread_mgr->current_bucketed_messages.pending_out_total = 0;
for ( auto& m : thread_mgr->current_bucketed_messages.pending_in )
m.second = 0;
for ( auto& m : thread_mgr->current_bucketed_messages.pending_out )
m.second = 0;
MsgThread::Stats thread_stats;
for ( const auto& t : thread_mgr->msg_threads ) {
t->GetStats(&thread_stats);
thread_mgr->current_bucketed_messages.sent_in_total += thread_stats.sent_in;
thread_mgr->current_bucketed_messages.sent_out_total += thread_stats.sent_out;
thread_mgr->current_bucketed_messages.pending_in_total += thread_stats.pending_in;
thread_mgr->current_bucketed_messages.pending_out_total += thread_stats.pending_out;
for ( auto upper_limit : pending_bucket_brackets ) {
if ( thread_stats.pending_in < upper_limit ) {
thread_mgr->current_bucketed_messages.pending_in[upper_limit]++;
break;
}
}
for ( auto upper_limit : pending_bucket_brackets ) {
if ( thread_stats.pending_out < upper_limit ) {
thread_mgr->current_bucketed_messages.pending_out[upper_limit]++;
break;
}
}
}
thread_mgr->bucketed_messages_last_updated = now;
}
}
return &thread_mgr->current_bucketed_messages;
};
num_threads_metric =
telemetry_mgr->GaugeInstance("zeek", "msgthread_active_threads", {}, "Number of active threads", "",
[]() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value =
thread_mgr ? static_cast<double>(thread_mgr->all_threads.size()) : 0.0;
return metric;
});
total_threads_metric = telemetry_mgr->CounterInstance("zeek", "msgthread_threads", {}, "Total number of threads");
total_messages_in_metric =
telemetry_mgr->CounterInstance("zeek", "msgthread_in_messages", {}, "Number of inbound messages received", "",
[]() -> prometheus::ClientMetric {
auto* s = get_message_thread_stats();
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(s->sent_in_total);
return metric;
});
total_messages_in_metric =
telemetry_mgr->CounterInstance("zeek", "msgthread_out_messages", {}, "Number of outbound messages sent", "",
[]() -> prometheus::ClientMetric {
auto* s = get_message_thread_stats();
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(s->sent_out_total);
return metric;
});
pending_messages_in_metric =
telemetry_mgr->GaugeInstance("zeek", "msgthread_pending_in_messages", {}, "Pending number of inbound messages",
"", []() -> prometheus::ClientMetric {
auto* s = get_message_thread_stats();
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(s->pending_in_total);
return metric;
});
pending_messages_out_metric =
telemetry_mgr->GaugeInstance("zeek", "msgthread_pending_out_messages", {},
"Pending number of outbound messages", "", []() -> prometheus::ClientMetric {
auto* s = get_message_thread_stats();
prometheus::ClientMetric metric;
metric.gauge.value = static_cast<double>(s->pending_out_total);
return metric;
});
pending_message_in_buckets_fam =
telemetry_mgr->GaugeFamily("zeek", "msgthread_pending_messages_in_buckets", {"lt"},
"Number of threads with pending inbound messages split into buckets");
pending_message_out_buckets_fam =
telemetry_mgr->GaugeFamily("zeek", "msgthread_pending_messages_out_buckets", {"lt"},
"Number of threads with pending outbound messages split into buckets");
for ( auto upper_limit : pending_bucket_brackets ) {
std::string upper_limit_str;
if ( upper_limit == std::numeric_limits<uint64_t>::infinity() )
upper_limit_str = "inf";
else
upper_limit_str = std::to_string(upper_limit);
current_bucketed_messages.pending_in[upper_limit] = 0;
current_bucketed_messages.pending_out[upper_limit] = 0;
pending_message_in_buckets[upper_limit] =
pending_message_in_buckets_fam->GetOrAdd({{"lt", upper_limit_str}},
[upper_limit]() -> prometheus::ClientMetric {
auto* s = get_message_thread_stats();
prometheus::ClientMetric metric;
metric.gauge.value =
static_cast<double>(s->pending_in.at(upper_limit));
return metric;
});
pending_message_out_buckets[upper_limit] =
pending_message_out_buckets_fam->GetOrAdd({{"lt", upper_limit_str}},
[upper_limit]() -> prometheus::ClientMetric {
auto* s = get_message_thread_stats();
prometheus::ClientMetric metric;
metric.gauge.value =
static_cast<double>(s->pending_out.at(upper_limit));
return metric;
});
}
}
void Manager::Terminate() { void Manager::Terminate() {
DBG_LOG(DBG_THREADING, "Terminating thread manager ..."); DBG_LOG(DBG_THREADING, "Terminating thread manager ...");
terminating = true; terminating = true;
@ -78,6 +206,8 @@ void Manager::AddThread(BasicThread* thread) {
if ( ! heartbeat_timer_running ) if ( ! heartbeat_timer_running )
StartHeartbeatTimer(); StartHeartbeatTimer();
total_threads_metric->Inc();
} }
void Manager::AddMsgThread(MsgThread* thread) { void Manager::AddMsgThread(MsgThread* thread) {

View file

@ -1,12 +1,23 @@
#pragma once #pragma once
#include <list> #include <list>
#include <map>
#include <utility> #include <utility>
#include "zeek/Timer.h" #include "zeek/Timer.h"
#include "zeek/threading/MsgThread.h" #include "zeek/threading/MsgThread.h"
namespace zeek { namespace zeek {
namespace telemetry {
class Gauge;
using GaugePtr = std::shared_ptr<Gauge>;
class GaugeFamily;
using GaugeFamilyPtr = std::shared_ptr<GaugeFamily>;
class Counter;
using CounterPtr = std::shared_ptr<Counter>;
} // namespace telemetry
namespace threading { namespace threading {
namespace detail { namespace detail {
@ -46,6 +57,12 @@ public:
*/ */
~Manager(); ~Manager();
/**
* Performs initialization that can only happen after script parsing has
* completed.
*/
void InitPostScript();
/** /**
* Terminates the manager's processor. The method signals all threads * Terminates the manager's processor. The method signals all threads
* to terminates and wait for them to do so. It then joins them and * to terminates and wait for them to do so. It then joins them and
@ -77,7 +94,7 @@ public:
* threads that are not yet joined, including any potentially in * threads that are not yet joined, including any potentially in
* Terminating() state. * Terminating() state.
*/ */
int NumThreads() const { return all_threads.size(); } size_t NumThreads() const { return all_threads.size(); }
/** /**
* Signals a specific threads to terminate immediately. * Signals a specific threads to terminate immediately.
@ -151,6 +168,29 @@ private:
msg_stats_list stats; msg_stats_list stats;
bool heartbeat_timer_running = false; bool heartbeat_timer_running = false;
telemetry::GaugePtr num_threads_metric;
telemetry::CounterPtr total_threads_metric;
telemetry::CounterPtr total_messages_in_metric;
telemetry::CounterPtr total_messages_out_metric;
telemetry::GaugePtr pending_messages_in_metric;
telemetry::GaugePtr pending_messages_out_metric;
telemetry::GaugeFamilyPtr pending_message_in_buckets_fam;
telemetry::GaugeFamilyPtr pending_message_out_buckets_fam;
std::map<uint64_t, telemetry::GaugePtr> pending_message_in_buckets;
std::map<uint64_t, telemetry::GaugePtr> pending_message_out_buckets;
struct BucketedMessages {
uint64_t sent_in_total;
uint64_t sent_out_total;
uint64_t pending_in_total;
uint64_t pending_out_total;
std::map<uint64_t, uint64_t> pending_in;
std::map<uint64_t, uint64_t> pending_out;
};
BucketedMessages current_bucketed_messages;
double bucketed_messages_last_updated = 0.0;
}; };
} // namespace threading } // namespace threading

View file

@ -9,6 +9,7 @@
#include "zeek/Obj.h" #include "zeek/Obj.h"
#include "zeek/RunState.h" #include "zeek/RunState.h"
#include "zeek/iosource/Manager.h" #include "zeek/iosource/Manager.h"
#include "zeek/telemetry/Manager.h"
#include "zeek/threading/Manager.h" #include "zeek/threading/Manager.h"
// Set by Zeek's main signal handler. // Set by Zeek's main signal handler.

View file

@ -3,8 +3,6 @@
#include <atomic> #include <atomic>
#include "zeek/DebugLogger.h" #include "zeek/DebugLogger.h"
#include "zeek/Flare.h"
#include "zeek/iosource/IOSource.h"
#include "zeek/threading/BasicThread.h" #include "zeek/threading/BasicThread.h"
#include "zeek/threading/Queue.h" #include "zeek/threading/Queue.h"

View file

@ -802,6 +802,7 @@ SetupResult setup(int argc, char** argv, Options* zopts) {
RecordType::InitPostScript(); RecordType::InitPostScript();
telemetry_mgr->InitPostScript(); telemetry_mgr->InitPostScript();
thread_mgr->InitPostScript();
iosource_mgr->InitPostScript(); iosource_mgr->InitPostScript();
log_mgr->InitPostScript(); log_mgr->InitPostScript();
plugin_mgr->InitPostScript(); plugin_mgr->InitPostScript();

View file

@ -1,5 +1,6 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
### zeek_session_metrics |2| ### zeek_session_metrics |3|
Telemetry::COUNTER, zeek, zeek_ended_sessions_total, [reason], [inactivity], 0.0
Telemetry::COUNTER, zeek, zeek_total_sessions_total, [protocol], [tcp], 500.0 Telemetry::COUNTER, zeek, zeek_total_sessions_total, [protocol], [tcp], 500.0
Telemetry::GAUGE, zeek, zeek_active_sessions, [protocol], [tcp], 500.0 Telemetry::GAUGE, zeek, zeek_active_sessions, [protocol], [tcp], 500.0
### bt* metrics |5| ### bt* metrics |5|

View file

@ -1,5 +1,7 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
XXXXXXXXXX.XXXXXX zeek counter zeek_ended_sessions_total reason inactivity 0.0
XXXXXXXXXX.XXXXXX zeek counter zeek_total_sessions_total protocol tcp 1.0 XXXXXXXXXX.XXXXXX zeek counter zeek_total_sessions_total protocol tcp 1.0
XXXXXXXXXX.XXXXXX zeek gauge zeek_active_sessions protocol tcp 1.0 XXXXXXXXXX.XXXXXX zeek gauge zeek_active_sessions protocol tcp 1.0
XXXXXXXXXX.XXXXXX zeek counter zeek_ended_sessions_total reason inactivity 0.0
XXXXXXXXXX.XXXXXX zeek counter zeek_total_sessions_total protocol tcp 500.0 XXXXXXXXXX.XXXXXX zeek counter zeek_total_sessions_total protocol tcp 500.0
XXXXXXXXXX.XXXXXX zeek gauge zeek_active_sessions protocol tcp 500.0 XXXXXXXXXX.XXXXXX zeek gauge zeek_active_sessions protocol tcp 500.0