From a6843067e9157ad0a7ca6dd1abb6d969cc770e31 Mon Sep 17 00:00:00 2001
From: Tim Wojtulewicz
Date: Mon, 5 Aug 2024 13:06:49 -0700
Subject: [PATCH] Split cpu time metric into user/system components like prof.log

The total can be calculated from the two parts via Prometheus/Grafana if desired,
so it's more informative to pass them as separate parts.
---
 src/telemetry/Manager.cc      | 34 +++++++++++++++-------------------
 src/telemetry/Manager.h       |  3 ++-
 src/telemetry/ProcessStats.cc | 16 ++++++++++------
 src/telemetry/ProcessStats.h  |  3 ++-
 4 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/src/telemetry/Manager.cc b/src/telemetry/Manager.cc
index 6ae4e06ec6..6bfcdc71a9 100644
--- a/src/telemetry/Manager.cc
+++ b/src/telemetry/Manager.cc
@@ -137,13 +137,21 @@ void Manager::InitPostScript() {
                                   return metric;
                               });
 
-    cpu_gauge = GaugeInstance("process", "cpu", {}, "Total user and system CPU time spent", "seconds",
-                              []() -> prometheus::ClientMetric {
-                                  auto* s = get_stats();
-                                  prometheus::ClientMetric metric;
-                                  metric.gauge.value = s->cpu;
-                                  return metric;
-                              });
+    cpu_user_counter = CounterInstance("process", "cpu_user", {}, "Total user CPU time spent", "seconds",
+                                       []() -> prometheus::ClientMetric {
+                                           auto* s = get_stats();
+                                           prometheus::ClientMetric metric;
+                                           metric.gauge.value = s->cpu_user;
+                                           return metric;
+                                       });
+
+    cpu_system_counter = CounterInstance("process", "cpu_system", {}, "Total system CPU time spent", "seconds",
+                                         []() -> prometheus::ClientMetric {
+                                             auto* s = get_stats();
+                                             prometheus::ClientMetric metric;
+                                             metric.gauge.value = s->cpu_system;
+                                             return metric;
+                                         });
 
     fds_gauge = GaugeInstance("process", "open_fds", {}, "Number of open file descriptors", "",
                               []() -> prometheus::ClientMetric {
@@ -623,18 +631,6 @@ void Manager::WaitForPrometheusCallbacks() {
 using namespace std::literals;
 using namespace zeek::telemetry;
 
-namespace {
-
-template
-auto toVector(zeek::Span xs) {
-    std::vector> result;
-    for ( auto&& x : xs )
-        result.emplace_back(x);
-    return result;
-}
-
-} // namespace
-
 SCENARIO("telemetry managers provide access to counter families") {
     GIVEN("a telemetry manager") {
         Manager mgr;
diff --git a/src/telemetry/Manager.h b/src/telemetry/Manager.h
index d967fe43c0..26647b7cf7 100644
--- a/src/telemetry/Manager.h
+++ b/src/telemetry/Manager.h
@@ -263,7 +263,8 @@ private:
 
     GaugePtr rss_gauge;
     GaugePtr vms_gauge;
-    GaugePtr cpu_gauge;
+    CounterPtr cpu_user_counter;
+    CounterPtr cpu_system_counter;
     GaugePtr fds_gauge;
 
     std::shared_ptr prometheus_registry;
diff --git a/src/telemetry/ProcessStats.cc b/src/telemetry/ProcessStats.cc
index f2a0447b63..476efd4487 100644
--- a/src/telemetry/ProcessStats.cc
+++ b/src/telemetry/ProcessStats.cc
@@ -34,10 +34,10 @@ process_stats get_process_stats() {
         if ( task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, reinterpret_cast(&info), &count) ==
              KERN_SUCCESS ) {
             // Round to milliseconds.
-            result.cpu += info.user_time.seconds;
-            result.cpu += ceil(info.user_time.microseconds / 1000.0) / 1000.0;
-            result.cpu += info.system_time.seconds;
-            result.cpu += ceil(info.system_time.microseconds / 1000.0) / 1000.0;
+            result.cpu_user += info.user_time.seconds;
+            result.cpu_user += ceil(info.user_time.microseconds / 1000.0) / 1000.0;
+            result.cpu_system += info.system_time.seconds;
+            result.cpu_system += ceil(info.system_time.microseconds / 1000.0) / 1000.0;
         }
     }
     // Fetch open file handles.
@@ -154,7 +154,8 @@ process_stats get_process_stats() {
 
         result.rss = rss_pages * page_size;
         result.vms = vmsize_bytes;
-        result.cpu = static_cast(utime_ticks + stime_ticks) / ticks_per_second;
+        result.cpu_user = static_cast(utime_ticks) / ticks_per_second;
+        result.cpu_system = static_cast(stime_ticks) / ticks_per_second;
         result.fds = count_entries_in_directory("/proc/self/fd");
     }
 
@@ -187,7 +188,10 @@ process_stats get_process_stats() {
     if ( kp ) {
         result.vms = kp->ki_size;
         result.rss = kp->ki_rssize * getpagesize();
-        result.cpu = static_cast(kp->ki_runtime) / 1000000.0;
+        result.cpu_user = static_cast(kp->ki_rusage.ru_utime.tv_sec) +
+                          (static_cast(kp->ki_rusage.ru_utime.tv_usec) / 1e6);
+        result.cpu_system = static_cast(kp->ki_rusage.ru_stime.tv_sec) +
+                            (static_cast(kp->ki_rusage.ru_stime.tv_usec) / 1e6);
 
         struct procstat* procstat = procstat_open_sysctl();
         struct filestat_list* files = procstat_getfiles(procstat, kp, 0);
diff --git a/src/telemetry/ProcessStats.h b/src/telemetry/ProcessStats.h
index 02581362cc..d79bb2cb5f 100644
--- a/src/telemetry/ProcessStats.h
+++ b/src/telemetry/ProcessStats.h
@@ -9,7 +9,8 @@ namespace zeek::telemetry::detail {
 struct process_stats {
     int64_t rss = 0;
     int64_t vms = 0;
-    double cpu = 0.0;
+    double cpu_user = 0.0;
+    double cpu_system = 0.0;
     int64_t fds = 0;
 };
 