diff --git a/src/DNS_Mgr.cc b/src/DNS_Mgr.cc index a707d8ead0..33da7dd0ae 100644 --- a/src/DNS_Mgr.cc +++ b/src/DNS_Mgr.cc @@ -45,6 +45,7 @@ using ztd::out_ptr::out_ptr; #include "zeek/Val.h" #include "zeek/ZeekString.h" #include "zeek/iosource/Manager.h" +#include "zeek/telemetry/Manager.h" // Number of seconds we'll wait for a reply. constexpr int DNS_TIMEOUT = 5; @@ -545,6 +546,55 @@ void DNS_Mgr::InitSource() { } void DNS_Mgr::InitPostScript() { + num_requests_metric = + telemetry_mgr->CounterInstance("zeek", "dnsmgr_requests", {}, "Total number of requests through DNS_Mgr"); + successful_metric = telemetry_mgr->CounterInstance("zeek", "dnsmgr_successful_requests", {}, + "Total number of successful requests through DNS_Mgr"); + failed_metric = telemetry_mgr->CounterInstance("zeek", "dnsmgr_failed_requests", {}, + "Total number of failed requests through DNS_Mgr"); + asyncs_pending_metric = telemetry_mgr->GaugeInstance("zeek", "dnsmgr_pending_asyncs_requests", {}, + "Number of pending async requests through DNS_Mgr"); + + cached_hosts_metric = + telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "host"}}, + "Number of cached hosts in DNS_Mgr", "", []() -> prometheus::ClientMetric { + prometheus::ClientMetric metric; + metric.gauge.value = 0; + + if ( dns_mgr ) { + dns_mgr->UpdateCachedStats(false); + metric.gauge.value = static_cast(dns_mgr->last_cached_stats.hosts); + } + return metric; + }); + + cached_addresses_metric = + telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "address"}}, + "Number of cached addresses in DNS_Mgr", "", []() -> prometheus::ClientMetric { + prometheus::ClientMetric metric; + metric.gauge.value = 0; + + if ( dns_mgr ) { + dns_mgr->UpdateCachedStats(false); + metric.gauge.value = + static_cast(dns_mgr->last_cached_stats.addresses); + } + return metric; + }); + + cached_texts_metric = + telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "text"}}, + "Number of cached texts in DNS_Mgr", "", []() -> prometheus::ClientMetric { + prometheus::ClientMetric metric; + metric.gauge.value = 0; + + if ( dns_mgr ) { + dns_mgr->UpdateCachedStats(false); + metric.gauge.value = static_cast(dns_mgr->last_cached_stats.texts); + } + return metric; + }); + if ( ! doctest::is_running_in_test ) { dm_rec = id::find_type("dns_mapping"); @@ -1158,7 +1208,7 @@ void DNS_Mgr::IssueAsyncRequests() { AsyncRequest* req = asyncs_queued.front(); asyncs_queued.pop_front(); - ++num_requests; + num_requests_metric->Inc(); req->time = util::current_time(); if ( req->type == T_PTR ) @@ -1173,6 +1223,7 @@ void DNS_Mgr::IssueAsyncRequests() { dns_req->MakeRequest(channel, this); ++asyncs_pending; + asyncs_pending_metric->Inc(); } } @@ -1182,11 +1233,11 @@ void DNS_Mgr::CheckAsyncHostRequest(const std::string& host, bool timeout) { if ( i != asyncs.end() ) { if ( timeout ) { - ++failed; + failed_metric->Inc(); i->second->Timeout(); } else if ( auto addrs = LookupNameInCache(host, true, false) ) { - ++successful; + successful_metric->Inc(); i->second->Resolved(addrs); } else @@ -1195,6 +1246,7 @@ void DNS_Mgr::CheckAsyncHostRequest(const std::string& host, bool timeout) { delete i->second; asyncs.erase(i); --asyncs_pending; + asyncs_pending_metric->Dec(); } } @@ -1207,11 +1259,11 @@ void DNS_Mgr::CheckAsyncAddrRequest(const IPAddr& addr, bool timeout) { if ( i != asyncs.end() ) { if ( timeout ) { - ++failed; + failed_metric->Inc(); i->second->Timeout(); } else if ( auto name = LookupAddrInCache(addr, true, false) ) { - ++successful; + successful_metric->Inc(); i->second->Resolved(name->CheckString()); } else @@ -1220,6 +1272,7 @@ void DNS_Mgr::CheckAsyncAddrRequest(const IPAddr& addr, bool timeout) { delete i->second; asyncs.erase(i); --asyncs_pending; + asyncs_pending_metric->Dec(); } } @@ -1229,11 +1282,11 @@ void DNS_Mgr::CheckAsyncOtherRequest(const std::string& host, bool timeout, int auto i = asyncs.find(std::make_pair(request_type, host)); if ( i != asyncs.end() ) { if ( timeout ) { - ++failed; + failed_metric->Inc(); i->second->Timeout(); } else if ( auto name = LookupOtherInCache(host, request_type, true) ) { - ++successful; + successful_metric->Inc(); i->second->Resolved(name->CheckString()); } else @@ -1242,6 +1295,7 @@ void DNS_Mgr::CheckAsyncOtherRequest(const std::string& host, bool timeout, int delete i->second; asyncs.erase(i); --asyncs_pending; + asyncs_pending_metric->Dec(); } } @@ -1293,26 +1347,35 @@ void DNS_Mgr::Process() { ares_process_fd(channel, ARES_SOCKET_BAD, ARES_SOCKET_BAD); } +void DNS_Mgr::UpdateCachedStats(bool force) { + double now = util::current_time(); + if ( force || last_cached_stats_update < now - 0.01 ) { + last_cached_stats.hosts = 0; + last_cached_stats.addresses = 0; + last_cached_stats.texts = 0; + last_cached_stats.total = all_mappings.size(); + + for ( const auto& [key, mapping] : all_mappings ) { + if ( mapping->ReqType() == T_PTR ) + last_cached_stats.addresses++; + else if ( mapping->ReqType() == T_A ) + last_cached_stats.hosts++; + else + last_cached_stats.texts++; + } + + last_cached_stats_update = now; + } +} + void DNS_Mgr::GetStats(Stats* stats) { - // TODO: can this use the telemetry framework? - stats->requests = num_requests; - stats->successful = successful; - stats->failed = failed; + stats->requests = static_cast(num_requests_metric->Value()); + stats->successful = static_cast(successful_metric->Value()); + stats->failed = static_cast(failed_metric->Value()); stats->pending = asyncs_pending; - stats->cached_hosts = 0; - stats->cached_addresses = 0; - stats->cached_texts = 0; - stats->cached_total = all_mappings.size(); - - for ( const auto& [key, mapping] : all_mappings ) { - if ( mapping->ReqType() == T_PTR ) - stats->cached_addresses++; - else if ( mapping->ReqType() == T_A ) - stats->cached_hosts++; - else - stats->cached_texts++; - } + UpdateCachedStats(true); + stats->cached = last_cached_stats; } void DNS_Mgr::AsyncRequest::Resolved(const std::string& name) { diff --git a/src/DNS_Mgr.h b/src/DNS_Mgr.h index 5d0f9a84b7..7e063b28a3 100644 --- a/src/DNS_Mgr.h +++ b/src/DNS_Mgr.h @@ -42,6 +42,13 @@ using TableValPtr = IntrusivePtr; using StringValPtr = IntrusivePtr; using RecordValPtr = IntrusivePtr; +namespace telemetry { +class Gauge; +class Counter; +using GaugePtr = std::shared_ptr; +using CounterPtr = std::shared_ptr; +} // namespace telemetry + } // namespace zeek namespace zeek::detail { @@ -198,15 +205,19 @@ public: */ bool Save(); + struct CachedStats { + unsigned long hosts; + unsigned long addresses; + unsigned long texts; + unsigned long total; + }; + struct Stats { unsigned long requests; // These count only async requests. unsigned long successful; unsigned long failed; unsigned long pending; - unsigned long cached_hosts; - unsigned long cached_addresses; - unsigned long cached_texts; - unsigned long cached_total; + CachedStats cached; }; /** @@ -285,6 +296,8 @@ protected: const char* Tag() override { return "DNS_Mgr"; } double GetNextTimeout() override; + void UpdateCachedStats(bool force); + DNS_MgrMode mode; MappingMap all_mappings; @@ -293,7 +306,6 @@ protected: std::string dir; // directory in which cache_name resides bool did_init = false; - int asyncs_pending = 0; RecordTypePtr dm_rec; @@ -327,9 +339,19 @@ protected: using QueuedList = std::list; QueuedList asyncs_queued; - unsigned long num_requests = 0; - unsigned long successful = 0; - unsigned long failed = 0; + telemetry::CounterPtr num_requests_metric; + telemetry::CounterPtr successful_metric; + telemetry::CounterPtr failed_metric; + telemetry::GaugePtr asyncs_pending_metric; + + telemetry::GaugePtr cached_hosts_metric; + telemetry::GaugePtr cached_addresses_metric; + telemetry::GaugePtr cached_texts_metric; + + double last_cached_stats_update = 0; + CachedStats last_cached_stats; + + int asyncs_pending = 0; std::set socket_fds; std::set write_socket_fds; diff --git a/src/Stats.cc b/src/Stats.cc index fb7766eaac..3742c42528 100644 --- a/src/Stats.cc +++ b/src/Stats.cc @@ -173,7 +173,7 @@ void ProfileLogger::Log() { util::fmt("%.06f DNS_Mgr: requests=%lu successful=%lu failed=%lu pending=%lu " "cached_hosts=%lu cached_addrs=%lu\n", run_state::network_time, dstats.requests, dstats.successful, dstats.failed, dstats.pending, - dstats.cached_hosts, dstats.cached_addresses)); + dstats.cached.hosts, dstats.cached.addresses)); trigger::Manager::Stats tstats; trigger_mgr->GetStats(&tstats); diff --git a/src/stats.bif b/src/stats.bif index 50dcc5685f..4763eedea2 100644 --- a/src/stats.bif +++ b/src/stats.bif @@ -252,10 +252,10 @@ function get_dns_stats%(%): DNSStats r->Assign(n++, static_cast(dstats.successful)); r->Assign(n++, static_cast(dstats.failed)); r->Assign(n++, static_cast(dstats.pending)); - r->Assign(n++, static_cast(dstats.cached_hosts)); - r->Assign(n++, static_cast(dstats.cached_addresses)); - r->Assign(n++, static_cast(dstats.cached_texts)); - r->Assign(n++, static_cast(dstats.cached_total)); + r->Assign(n++, static_cast(dstats.cached.hosts)); + r->Assign(n++, static_cast(dstats.cached.addresses)); + r->Assign(n++, static_cast(dstats.cached.texts)); + r->Assign(n++, static_cast(dstats.cached.total)); return std::move(r); %}