From 731095235821a39fb682471bf9774b41b8128775 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 24 Aug 2022 18:54:32 +0200 Subject: [PATCH 1/2] telemetry: In a cluster, open port 9911 for Prometheus by default Port 9911 has been allocated here: https://github.com/prometheus/prometheus/wiki/Default-port-allocations Logic is put into telemetry/cluster.zeek. Doing it in `cluster/nodes/` seemed like quite some extra splitting for just those few settings. --- NEWS | 4 + .../base/frameworks/telemetry/__load__.zeek | 6 ++ .../base/frameworks/telemetry/cluster.zeek | 21 +++++ .../coverage.init-default/missing_loads | 1 + .../manager-1..stdout | 2 + .../base/frameworks/telemetry/cluster.zeek | 81 +++++++++++++++++++ 6 files changed, 115 insertions(+) create mode 100644 scripts/base/frameworks/telemetry/cluster.zeek create mode 100644 testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout create mode 100644 testing/btest/scripts/base/frameworks/telemetry/cluster.zeek diff --git a/NEWS b/NEWS index 8a6c383e01..1f1b14d3ba 100644 --- a/NEWS +++ b/NEWS @@ -131,6 +131,10 @@ Changed Functionality This prevents callbacks into script-land through change handlers when parts of the environment have already been torn down. +- When running in cluster mode, the manager by default now imports metrics + from all other cluster nodes via broker topics and opens port 9911/tcp for + Prometheus metrics exposition. 
+ Removed Functionality --------------------- diff --git a/scripts/base/frameworks/telemetry/__load__.zeek b/scripts/base/frameworks/telemetry/__load__.zeek index a10fe855df..2aad89db82 100644 --- a/scripts/base/frameworks/telemetry/__load__.zeek +++ b/scripts/base/frameworks/telemetry/__load__.zeek @@ -1 +1,7 @@ @load ./main + +@load base/frameworks/cluster + +@if ( Cluster::is_enabled() ) +@load ./cluster +@endif diff --git a/scripts/base/frameworks/telemetry/cluster.zeek b/scripts/base/frameworks/telemetry/cluster.zeek new file mode 100644 index 0000000000..a589f0ddc5 --- /dev/null +++ b/scripts/base/frameworks/telemetry/cluster.zeek @@ -0,0 +1,21 @@ +##! In a cluster configuration, open port 9911 on the manager for +##! Prometheus exposition and import all metrics from the +##! `zeek/cluster/metrics/...` topic. +##! +##! For customization or disabling, redef the involved Broker options again. +##! Specifically, to disable listening on port 9911, set +##! :zeek:see:`Broker::metrics_port` to `0/unknown` again. + +@load base/frameworks/cluster + +# Use Cluster::node as "endpoint" label +redef Broker::metrics_export_endpoint_name = Cluster::node; + +# The manager opens port 9911 and imports metrics from all nodes by default. 
+@if ( Cluster::local_node_type() == Cluster::MANAGER ) +redef Broker::metrics_port = 9911/tcp; +redef Broker::metrics_import_topics = vector("zeek/cluster/metrics/"); + +@else +redef Broker::metrics_export_topic = "zeek/cluster/metrics/"; +@endif diff --git a/testing/btest/Baseline/coverage.init-default/missing_loads b/testing/btest/Baseline/coverage.init-default/missing_loads index fe23c7a04a..ac50729b9a 100644 --- a/testing/btest/Baseline/coverage.init-default/missing_loads +++ b/testing/btest/Baseline/coverage.init-default/missing_loads @@ -10,4 +10,5 @@ -./frameworks/openflow/cluster.zeek -./frameworks/packet-filter/cluster.zeek -./frameworks/sumstats/cluster.zeek +-./frameworks/telemetry/cluster.zeek -./init-supervisor.zeek diff --git a/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout b/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout new file mode 100644 index 0000000000..103742c1f1 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout @@ -0,0 +1,2 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +[endpoint="logger-1", endpoint="manager-1", endpoint="proxy-1", endpoint="worker-1"] diff --git a/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek b/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek new file mode 100644 index 0000000000..d2efba0ecd --- /dev/null +++ b/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek @@ -0,0 +1,81 @@ +# @TEST-DOC: Query the Prometheus endpoint on 9911 and smoke check that zeek_version_info{...} is contained in the response for all cluster nodes. +# +# @TEST-PORT: BROKER_PORT1 +# @TEST-PORT: BROKER_PORT2 +# @TEST-PORT: BROKER_PORT3 +# @TEST-PORT: BROKER_PORT4 +# +# @TEST-REQUIRES: which curl +# @TEST-EXEC: zeek --parse-only %INPUT +# @TEST-EXEC: btest-bg-run manager-1 ZEEKPATH=$ZEEKPATH:.. 
CLUSTER_NODE=manager-1 zeek -b %INPUT +# @TEST-EXEC: btest-bg-run logger-1 ZEEKPATH=$ZEEKPATH:.. CLUSTER_NODE=logger-1 zeek -b %INPUT +# @TEST-EXEC: btest-bg-run proxy-1 ZEEKPATH=$ZEEKPATH:.. CLUSTER_NODE=proxy-1 zeek -b %INPUT +# @TEST-EXEC: btest-bg-run worker-1 ZEEKPATH=$ZEEKPATH:.. CLUSTER_NODE=worker-1 zeek -b %INPUT +# @TEST-EXEC: btest-bg-wait 10 +# @TEST-EXEC: btest-diff manager-1/.stdout + +@load base/frameworks/cluster +@load base/frameworks/telemetry +@load base/utils/active-http + +# Query the Prometheus endpoint using ActiveHTTP for testing, oh my. +event run_test() + { + local url = fmt("http://localhost:%s/metrics", port_to_count(Broker::metrics_port)); + when [url] ( local response = ActiveHTTP::request([$url=url]) ) + { + if ( response$code != 200 ) + { + print fmt("ERROR: %s", response); + exit(1); + } + + # Grumble grumble, ActiveHTTP actually joins away the \n characters + # from the response. Not sure how that's helpful. We simply + # grep out the zeek_version_info{...} endpoint="..." pieces and + # expect one for each node to exist as a smoke test. + local version_infos = find_all(response$body, /zeek_version_info\{[^}]+\}/); + local endpoints: vector of string; + for ( info in version_infos ) + for ( ep in find_all(info, /endpoint=\"[^"]+\"/)) + endpoints += ep; + + print sort(endpoints, strcmp); + + terminate(); + } + timeout 3sec + { + # This is bad. + print "ERROR: HTTP request timeout"; + exit(1); + } + } + +global node_count = 0; + +event Cluster::node_up(name: string, id: string) + { + ++node_count; + # Run the test after all nodes are up and metrics_export_interval + # has passed at least once. + if ( Cluster::node == "manager-1" ) + if ( node_count == 3 ) + schedule 2 * Broker::metrics_export_interval { run_test() }; + } + +# If any node goes down, terminate() right away. 
+event Cluster::node_down(name: string, id: string) + { + print fmt("node_down on %s", Cluster::node); + terminate(); + } + +@TEST-START-FILE cluster-layout.zeek +redef Cluster::nodes = { + ["manager-1"] = [$node_type=Cluster::MANAGER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT1"))], + ["logger-1"] = [$node_type=Cluster::LOGGER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT2")), $manager="manager-1"], + ["proxy-1"] = [$node_type=Cluster::PROXY, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT3")), $manager="manager-1"], + ["worker-1"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT4")), $manager="manager-1", $interface="eth0"], +}; +@TEST-END-FILE From 65b31c525ddfe7808b9f669156f42381f262fc5a Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 30 Aug 2022 11:19:28 +0200 Subject: [PATCH 2/2] telemetry: Use dynamic metrics port, remove broker topic mentioning from NEWS entry --- NEWS | 5 ++--- .../manager-1..stdout | 1 + .../base/frameworks/telemetry/cluster.zeek | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 1f1b14d3ba..6e7c4fb728 100644 --- a/NEWS +++ b/NEWS @@ -131,9 +131,8 @@ Changed Functionality This prevents callbacks into script-land through change handlers when parts of the environment have already been torn down. -- When running in cluster mode, the manager by default now imports metrics - from all other cluster nodes via broker topics and opens port 9911/tcp for - Prometheus metrics exposition. +- When running in cluster mode, the manager by default now imports metrics from + all other cluster nodes and opens port 9911/tcp for Prometheus metrics exposition. 
Removed Functionality --------------------- diff --git a/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout b/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout index 103742c1f1..f0734f656c 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout +++ b/testing/btest/Baseline/scripts.base.frameworks.telemetry.cluster/manager-1..stdout @@ -1,2 +1,3 @@ ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +manager-1, original Broker::metrics_port, 9911/tcp [endpoint="logger-1", endpoint="manager-1", endpoint="proxy-1", endpoint="worker-1"] diff --git a/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek b/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek index d2efba0ecd..f71fdc1ad1 100644 --- a/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek +++ b/testing/btest/scripts/base/frameworks/telemetry/cluster.zeek @@ -4,6 +4,8 @@ # @TEST-PORT: BROKER_PORT2 # @TEST-PORT: BROKER_PORT3 # @TEST-PORT: BROKER_PORT4 +# @TEST-PORT: BROKER_PORT4 +# @TEST-PORT: BROKER_TEST_METRICS_PORT # # @TEST-REQUIRES: which curl # @TEST-EXEC: zeek --parse-only %INPUT @@ -44,7 +46,7 @@ event run_test() terminate(); } - timeout 3sec + timeout 10sec { # This is bad. print "ERROR: HTTP request timeout"; @@ -54,6 +56,17 @@ event run_test() global node_count = 0; +@if ( Cluster::node == "manager-1" ) +# Use a dynamic metrics port for testing to avoid colliding on 9911/tcp +# when running tests in parallel. 
+global orig_metrics_port = Broker::metrics_port; +redef Broker::metrics_port = to_port(getenv("BROKER_TEST_METRICS_PORT")); + +event zeek_init() + { + print Cluster::node, "original Broker::metrics_port", orig_metrics_port; + } + event Cluster::node_up(name: string, id: string) { ++node_count; @@ -63,6 +76,7 @@ event Cluster::node_up(name: string, id: string) if ( node_count == 3 ) schedule 2 * Broker::metrics_export_interval { run_test() }; } +@endif # If any node goes down, terminate() right away. event Cluster::node_down(name: string, id: string)