Move dns_mgr stats to telemetry instruments

This commit is contained in:
Tim Wojtulewicz 2024-05-28 19:52:49 -07:00
parent 8b4af06484
commit d1f7999f61
4 changed files with 122 additions and 37 deletions

View file

@ -45,6 +45,7 @@ using ztd::out_ptr::out_ptr;
#include "zeek/Val.h" #include "zeek/Val.h"
#include "zeek/ZeekString.h" #include "zeek/ZeekString.h"
#include "zeek/iosource/Manager.h" #include "zeek/iosource/Manager.h"
#include "zeek/telemetry/Manager.h"
// Number of seconds we'll wait for a reply. // Number of seconds we'll wait for a reply.
constexpr int DNS_TIMEOUT = 5; constexpr int DNS_TIMEOUT = 5;
@ -545,6 +546,55 @@ void DNS_Mgr::InitSource() {
} }
void DNS_Mgr::InitPostScript() { void DNS_Mgr::InitPostScript() {
num_requests_metric =
telemetry_mgr->CounterInstance("zeek", "dnsmgr_requests", {}, "Total number of requests through DNS_Mgr");
successful_metric = telemetry_mgr->CounterInstance("zeek", "dnsmgr_successful_requests", {},
"Total number of successful requests through DNS_Mgr");
failed_metric = telemetry_mgr->CounterInstance("zeek", "dnsmgr_failed_requests", {},
"Total number of failed requests through DNS_Mgr");
asyncs_pending_metric = telemetry_mgr->GaugeInstance("zeek", "dnsmgr_pending_asyncs_requests", {},
"Number of pending async requests through DNS_Mgr");
cached_hosts_metric =
telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "host"}},
"Number of cached hosts in DNS_Mgr", "", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = 0;
if ( dns_mgr ) {
dns_mgr->UpdateCachedStats(false);
metric.gauge.value = static_cast<double>(dns_mgr->last_cached_stats.hosts);
}
return metric;
});
cached_addresses_metric =
telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "address"}},
"Number of cached addresses in DNS_Mgr", "", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = 0;
if ( dns_mgr ) {
dns_mgr->UpdateCachedStats(false);
metric.gauge.value =
static_cast<double>(dns_mgr->last_cached_stats.addresses);
}
return metric;
});
cached_texts_metric =
telemetry_mgr->GaugeInstance("zeek", "dnsmgr_cache_entries", {{"type", "text"}},
"Number of cached texts in DNS_Mgr", "", []() -> prometheus::ClientMetric {
prometheus::ClientMetric metric;
metric.gauge.value = 0;
if ( dns_mgr ) {
dns_mgr->UpdateCachedStats(false);
metric.gauge.value = static_cast<double>(dns_mgr->last_cached_stats.texts);
}
return metric;
});
if ( ! doctest::is_running_in_test ) { if ( ! doctest::is_running_in_test ) {
dm_rec = id::find_type<RecordType>("dns_mapping"); dm_rec = id::find_type<RecordType>("dns_mapping");
@ -1158,7 +1208,7 @@ void DNS_Mgr::IssueAsyncRequests() {
AsyncRequest* req = asyncs_queued.front(); AsyncRequest* req = asyncs_queued.front();
asyncs_queued.pop_front(); asyncs_queued.pop_front();
++num_requests; num_requests_metric->Inc();
req->time = util::current_time(); req->time = util::current_time();
if ( req->type == T_PTR ) if ( req->type == T_PTR )
@ -1173,6 +1223,7 @@ void DNS_Mgr::IssueAsyncRequests() {
dns_req->MakeRequest(channel, this); dns_req->MakeRequest(channel, this);
++asyncs_pending; ++asyncs_pending;
asyncs_pending_metric->Inc();
} }
} }
@ -1182,11 +1233,11 @@ void DNS_Mgr::CheckAsyncHostRequest(const std::string& host, bool timeout) {
if ( i != asyncs.end() ) { if ( i != asyncs.end() ) {
if ( timeout ) { if ( timeout ) {
++failed; failed_metric->Inc();
i->second->Timeout(); i->second->Timeout();
} }
else if ( auto addrs = LookupNameInCache(host, true, false) ) { else if ( auto addrs = LookupNameInCache(host, true, false) ) {
++successful; successful_metric->Inc();
i->second->Resolved(addrs); i->second->Resolved(addrs);
} }
else else
@ -1195,6 +1246,7 @@ void DNS_Mgr::CheckAsyncHostRequest(const std::string& host, bool timeout) {
delete i->second; delete i->second;
asyncs.erase(i); asyncs.erase(i);
--asyncs_pending; --asyncs_pending;
asyncs_pending_metric->Dec();
} }
} }
@ -1207,11 +1259,11 @@ void DNS_Mgr::CheckAsyncAddrRequest(const IPAddr& addr, bool timeout) {
if ( i != asyncs.end() ) { if ( i != asyncs.end() ) {
if ( timeout ) { if ( timeout ) {
++failed; failed_metric->Inc();
i->second->Timeout(); i->second->Timeout();
} }
else if ( auto name = LookupAddrInCache(addr, true, false) ) { else if ( auto name = LookupAddrInCache(addr, true, false) ) {
++successful; successful_metric->Inc();
i->second->Resolved(name->CheckString()); i->second->Resolved(name->CheckString());
} }
else else
@ -1220,6 +1272,7 @@ void DNS_Mgr::CheckAsyncAddrRequest(const IPAddr& addr, bool timeout) {
delete i->second; delete i->second;
asyncs.erase(i); asyncs.erase(i);
--asyncs_pending; --asyncs_pending;
asyncs_pending_metric->Dec();
} }
} }
@ -1229,11 +1282,11 @@ void DNS_Mgr::CheckAsyncOtherRequest(const std::string& host, bool timeout, int
auto i = asyncs.find(std::make_pair(request_type, host)); auto i = asyncs.find(std::make_pair(request_type, host));
if ( i != asyncs.end() ) { if ( i != asyncs.end() ) {
if ( timeout ) { if ( timeout ) {
++failed; failed_metric->Inc();
i->second->Timeout(); i->second->Timeout();
} }
else if ( auto name = LookupOtherInCache(host, request_type, true) ) { else if ( auto name = LookupOtherInCache(host, request_type, true) ) {
++successful; successful_metric->Inc();
i->second->Resolved(name->CheckString()); i->second->Resolved(name->CheckString());
} }
else else
@ -1242,6 +1295,7 @@ void DNS_Mgr::CheckAsyncOtherRequest(const std::string& host, bool timeout, int
delete i->second; delete i->second;
asyncs.erase(i); asyncs.erase(i);
--asyncs_pending; --asyncs_pending;
asyncs_pending_metric->Dec();
} }
} }
@ -1293,26 +1347,35 @@ void DNS_Mgr::Process() {
ares_process_fd(channel, ARES_SOCKET_BAD, ARES_SOCKET_BAD); ares_process_fd(channel, ARES_SOCKET_BAD, ARES_SOCKET_BAD);
} }
// NOTE(review): the lines below come from a side-by-side diff view, so several of them
// carry the old GetStats() text followed by the new UpdateCachedStats() text.
//
// UpdateCachedStats(force): recomputes last_cached_stats from all_mappings.
// The recount runs when `force` is true, or when last_cached_stats_update is older than
// util::current_time() minus 0.01 (presumably seconds -- confirm against util::current_time()).
// Each mapping is classified by ReqType(): T_PTR counts as an address, T_A as a host, and
// anything else as a text entry; `total` is all_mappings.size(). After a recount,
// last_cached_stats_update is set to the time sampled at the start of the call.
void DNS_Mgr::GetStats(Stats* stats) { void DNS_Mgr::UpdateCachedStats(bool force) {
// TODO: can this use the telemetry framework? double now = util::current_time();
stats->requests = num_requests; if ( force || last_cached_stats_update < now - 0.01 ) {
stats->successful = successful; last_cached_stats.hosts = 0;
stats->failed = failed; last_cached_stats.addresses = 0;
stats->pending = asyncs_pending; last_cached_stats.texts = 0;
last_cached_stats.total = all_mappings.size();
stats->cached_hosts = 0;
stats->cached_addresses = 0;
stats->cached_texts = 0;
stats->cached_total = all_mappings.size();
for ( const auto& [key, mapping] : all_mappings ) { for ( const auto& [key, mapping] : all_mappings ) {
if ( mapping->ReqType() == T_PTR ) if ( mapping->ReqType() == T_PTR )
stats->cached_addresses++; last_cached_stats.addresses++;
else if ( mapping->ReqType() == T_A ) else if ( mapping->ReqType() == T_A )
stats->cached_hosts++; last_cached_stats.hosts++;
else else
stats->cached_texts++; last_cached_stats.texts++;
} }
last_cached_stats_update = now;
}
}
// Fills *stats with a snapshot of the manager's statistics.
// requests/successful/failed are read from the corresponding telemetry counters via
// Value() and cast to unsigned long; pending is copied from asyncs_pending.
// The cache counts are refreshed unconditionally (UpdateCachedStats(true)) before
// last_cached_stats is copied into stats->cached.
// NOTE(review): the doubled closing brace on the last line comes from the side-by-side diff view.
void DNS_Mgr::GetStats(Stats* stats) {
stats->requests = static_cast<unsigned long>(num_requests_metric->Value());
stats->successful = static_cast<unsigned long>(successful_metric->Value());
stats->failed = static_cast<unsigned long>(failed_metric->Value());
stats->pending = asyncs_pending;
UpdateCachedStats(true);
stats->cached = last_cached_stats;
} }
void DNS_Mgr::AsyncRequest::Resolved(const std::string& name) { void DNS_Mgr::AsyncRequest::Resolved(const std::string& name) {

View file

@ -42,6 +42,13 @@ using TableValPtr = IntrusivePtr<TableVal>;
using StringValPtr = IntrusivePtr<StringVal>; using StringValPtr = IntrusivePtr<StringVal>;
using RecordValPtr = IntrusivePtr<RecordVal>; using RecordValPtr = IntrusivePtr<RecordVal>;
// Forward declarations of the telemetry instrument classes and the shared-pointer
// aliases used to hold them; only the names are introduced here, not the definitions.
namespace telemetry {
class Counter;
class Gauge;
using CounterPtr = std::shared_ptr<Counter>;
using GaugePtr = std::shared_ptr<Gauge>;
} // namespace telemetry
} // namespace zeek } // namespace zeek
namespace zeek::detail { namespace zeek::detail {
@ -198,15 +205,19 @@ public:
*/ */
bool Save(); bool Save();
// Counts of cached DNS mappings, broken down by category, plus the overall total.
// Every field has a zero default so that a default-initialized instance (for example a
// class member or a local declared without an initializer) never holds indeterminate values.
struct CachedStats {
    unsigned long hosts = 0;     // cached host entries
    unsigned long addresses = 0; // cached address entries
    unsigned long texts = 0;     // cached entries of any other kind
    unsigned long total = 0;     // total number of cached entries
};
// Snapshot of DNS_Mgr statistics as filled in by DNS_Mgr::GetStats().
// NOTE(review): these lines come from a side-by-side diff view. The flat
// cached_hosts/cached_addresses/cached_texts/cached_total members appear to be the old
// layout, and the `CachedStats cached` member the new one -- confirm against the real header.
struct Stats { struct Stats {
unsigned long requests; // These count only async requests. unsigned long requests; // These count only async requests.
unsigned long successful; unsigned long successful;
unsigned long failed; unsigned long failed;
unsigned long pending; unsigned long pending;
unsigned long cached_hosts; CachedStats cached;
unsigned long cached_addresses;
unsigned long cached_texts;
unsigned long cached_total;
}; };
/** /**
@ -285,6 +296,8 @@ protected:
const char* Tag() override { return "DNS_Mgr"; } const char* Tag() override { return "DNS_Mgr"; }
double GetNextTimeout() override; double GetNextTimeout() override;
void UpdateCachedStats(bool force);
DNS_MgrMode mode; DNS_MgrMode mode;
MappingMap all_mappings; MappingMap all_mappings;
@ -293,7 +306,6 @@ protected:
std::string dir; // directory in which cache_name resides std::string dir; // directory in which cache_name resides
bool did_init = false; bool did_init = false;
int asyncs_pending = 0;
RecordTypePtr dm_rec; RecordTypePtr dm_rec;
@ -327,9 +339,19 @@ protected:
using QueuedList = std::list<AsyncRequest*>; using QueuedList = std::list<AsyncRequest*>;
QueuedList asyncs_queued; QueuedList asyncs_queued;
unsigned long num_requests = 0; telemetry::CounterPtr num_requests_metric;
unsigned long successful = 0; telemetry::CounterPtr successful_metric;
unsigned long failed = 0; telemetry::CounterPtr failed_metric;
telemetry::GaugePtr asyncs_pending_metric;
telemetry::GaugePtr cached_hosts_metric;
telemetry::GaugePtr cached_addresses_metric;
telemetry::GaugePtr cached_texts_metric;
double last_cached_stats_update = 0;
CachedStats last_cached_stats;
int asyncs_pending = 0;
std::set<int> socket_fds; std::set<int> socket_fds;
std::set<int> write_socket_fds; std::set<int> write_socket_fds;

View file

@ -173,7 +173,7 @@ void ProfileLogger::Log() {
util::fmt("%.06f DNS_Mgr: requests=%lu successful=%lu failed=%lu pending=%lu " util::fmt("%.06f DNS_Mgr: requests=%lu successful=%lu failed=%lu pending=%lu "
"cached_hosts=%lu cached_addrs=%lu\n", "cached_hosts=%lu cached_addrs=%lu\n",
run_state::network_time, dstats.requests, dstats.successful, dstats.failed, dstats.pending, run_state::network_time, dstats.requests, dstats.successful, dstats.failed, dstats.pending,
dstats.cached_hosts, dstats.cached_addresses)); dstats.cached.hosts, dstats.cached.addresses));
trigger::Manager::Stats tstats; trigger::Manager::Stats tstats;
trigger_mgr->GetStats(&tstats); trigger_mgr->GetStats(&tstats);

View file

@ -252,10 +252,10 @@ function get_dns_stats%(%): DNSStats
r->Assign(n++, static_cast<uint64_t>(dstats.successful)); r->Assign(n++, static_cast<uint64_t>(dstats.successful));
r->Assign(n++, static_cast<uint64_t>(dstats.failed)); r->Assign(n++, static_cast<uint64_t>(dstats.failed));
r->Assign(n++, static_cast<uint64_t>(dstats.pending)); r->Assign(n++, static_cast<uint64_t>(dstats.pending));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_hosts)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.hosts));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_addresses)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.addresses));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_texts)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.texts));
r->Assign(n++, static_cast<uint64_t>(dstats.cached_total)); r->Assign(n++, static_cast<uint64_t>(dstats.cached.total));
return std::move(r); return std::move(r);
%} %}