diff --git a/NEWS b/NEWS index 2fd1df0e9e..341bbd4d8e 100644 --- a/NEWS +++ b/NEWS @@ -65,6 +65,22 @@ Breaking Changes come in handy for example when working with tests that compare results against log baselines that have not yet been updated. +- Telemetry centralization and Prometheus exposition are no longer enabled by + default. Previously, the manager node would open port 9911/tcp by default and + import all metrics from other nodes. For large clusters, the current implementation + introduces significant processing overhead on the manager even if the Prometheus + functionality is not used. While inconvenient, this functionality (assumed to + be used by few as of now) is now disabled by default to preserve resources. + + The script to enable centralization and the Prometheus endpoint is now + located in the ``policy/`` folder. Re-enable the old functionality with: + + @load frameworks/telemetry/prometheus + + You may experiment with increasing ``Broker::metrics_export_interval`` + (default 1s) to reduce the extra overhead and communication at the expense + of stale metrics. + - Custom source tarballs require a ``repo-info.json`` file. 
Note, should you be using official Zeek release tarballs only, or build diff --git a/scripts/base/frameworks/telemetry/__load__.zeek b/scripts/base/frameworks/telemetry/__load__.zeek index 2aad89db82..88b6dbf672 100644 --- a/scripts/base/frameworks/telemetry/__load__.zeek +++ b/scripts/base/frameworks/telemetry/__load__.zeek @@ -1,7 +1,3 @@ @load ./main @load base/frameworks/cluster - -@if ( Cluster::is_enabled() ) -@load ./cluster -@endif diff --git a/scripts/base/frameworks/telemetry/cluster.zeek b/scripts/policy/frameworks/telemetry/prometheus.zeek similarity index 76% rename from scripts/base/frameworks/telemetry/cluster.zeek rename to scripts/policy/frameworks/telemetry/prometheus.zeek index a589f0ddc5..abc947670a 100644 --- a/scripts/base/frameworks/telemetry/cluster.zeek +++ b/scripts/policy/frameworks/telemetry/prometheus.zeek @@ -1,13 +1,18 @@ ##! In a cluster configuration, open port 9911 on the manager for -##! Prometheus exposition and import all metrics from +##! Prometheus exposition and import all metrics from the ##! `zeek/cluster/metrics/...` topic. ##! ##! For customization or disabling, redef the involved Broker options again. ##! Specifically, to disable listening on port 9911, set ##! :zeek:see:`Broker::metrics_port` to `0/unknown` again. - +##! +##! Note that in large clusters, metrics import may cause significant +##! communication overhead as well as load on the manager. +##! 
@load base/frameworks/cluster +@if ( Cluster::is_enabled() ) + # Use Cluster::node as "endpoint" label redef Broker::metrics_export_endpoint_name = Cluster::node; @@ -19,3 +24,5 @@ redef Broker::metrics_import_topics = vector("zeek/cluster/metrics/"); @else redef Broker::metrics_export_topic = "zeek/cluster/metrics/"; @endif + +@endif diff --git a/scripts/site/local.zeek b/scripts/site/local.zeek index 512b1ea9cc..6935c6c4d0 100644 --- a/scripts/site/local.zeek +++ b/scripts/site/local.zeek @@ -97,6 +97,10 @@ redef digest_salt = "Please change this value."; # telemetry_histogram.log. @load frameworks/telemetry/log +# Enable metrics centralization on the manager. This opens port 9911/tcp +# on the manager node that can be readily scraped by Prometheus. +# @load frameworks/telemetry/prometheus + # Uncomment the following line to enable detection of the heartbleed attack. Enabling # this might impact performance a bit. # @load policy/protocols/ssl/heartbleed diff --git a/scripts/test-all-policy.zeek b/scripts/test-all-policy.zeek index c244518408..efa867c6bf 100644 --- a/scripts/test-all-policy.zeek +++ b/scripts/test-all-policy.zeek @@ -77,6 +77,7 @@ # @load frameworks/spicy/record-spicy-batch.zeek # @load frameworks/spicy/resource-usage.zeek @load frameworks/software/windows-version-detection.zeek +@load frameworks/telemetry/prometheus.zeek @load frameworks/telemetry/log.zeek @load integration/collective-intel/__load__.zeek @load integration/collective-intel/main.zeek diff --git a/testing/btest/Baseline/coverage.init-default/missing_loads b/testing/btest/Baseline/coverage.init-default/missing_loads index ac50729b9a..fe23c7a04a 100644 --- a/testing/btest/Baseline/coverage.init-default/missing_loads +++ b/testing/btest/Baseline/coverage.init-default/missing_loads @@ -10,5 +10,4 @@ -./frameworks/openflow/cluster.zeek -./frameworks/packet-filter/cluster.zeek -./frameworks/sumstats/cluster.zeek --./frameworks/telemetry/cluster.zeek -./init-supervisor.zeek diff --git 
a/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout b/testing/btest/Baseline/scripts.policy.frameworks.telemetry.prometheus/manager-1..stdout similarity index 100% rename from testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout rename to testing/btest/Baseline/scripts.policy.frameworks.telemetry.prometheus/manager-1..stdout diff --git a/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek b/testing/btest/scripts/policy/frameworks/telemetry/prometheus.zeek similarity index 98% rename from testing/btest/scripts/base/frameworks/telemetry/cluster.zeek rename to testing/btest/scripts/policy/frameworks/telemetry/prometheus.zeek index 56ea828b36..83fb95f06d 100644 --- a/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek +++ b/testing/btest/scripts/policy/frameworks/telemetry/prometheus.zeek @@ -29,6 +29,7 @@ redef Cluster::nodes = { @TEST-END-FILE @load policy/frameworks/cluster/experimental +@load policy/frameworks/telemetry/prometheus @load base/frameworks/telemetry @load base/utils/active-http