Add a services.json endpoint for Prometheus service discovery

This commit is contained in:
Tim Wojtulewicz 2024-03-13 15:28:48 -07:00
parent abb84db6c8
commit e93e4cc26d
10 changed files with 140 additions and 69 deletions

View file

@ -13,15 +13,15 @@ init_key2 in state2: 1
[worker-1] = [node_type=Cluster::WORKER, ip=127.0.0.1, p=5/udp, manager=manager-1]
}
{
[worker-4] = [node_type=Cluster::WORKER, ip=2.3.4.5, zone_id=, p=13/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>]
[worker-4] = [node_type=Cluster::WORKER, ip=2.3.4.5, zone_id=, p=13/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>]
}
{
[worker-4] = [node_type=Cluster::WORKER, ip=2.3.4.5, zone_id=, p=13/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>],
[worker-5] = [node_type=Cluster::WORKER, ip=3.4.5.6, zone_id=, p=15/tcp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>]
[worker-4] = [node_type=Cluster::WORKER, ip=2.3.4.5, zone_id=, p=13/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>],
[worker-5] = [node_type=Cluster::WORKER, ip=3.4.5.6, zone_id=, p=15/tcp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>]
}
{
[worker-4] = [node_type=Cluster::WORKER, ip=2.3.4.5, zone_id=, p=13/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>],
[worker-6] = [node_type=Cluster::WORKER, ip=4.5.6.7, zone_id=, p=17/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>]
[worker-4] = [node_type=Cluster::WORKER, ip=2.3.4.5, zone_id=, p=13/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>],
[worker-6] = [node_type=Cluster::WORKER, ip=4.5.6.7, zone_id=, p=17/udp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>]
}
{
[3.0, 4]

View file

@ -1,3 +1,2 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
manager-1, original Broker::metrics_port, 9911/tcp
[endpoint="logger-1", endpoint="manager-1", endpoint="proxy-1", endpoint="worker-1"]
manager-1, Telemetry::metrics_port from cluster config, 1028/tcp

View file

@ -0,0 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
endpoint="proxy-1"
endpoint="worker-1"

View file

@ -1,5 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
supervised node zeek_init()
1024, cluster_nodes!
[node_type=Cluster::WORKER, ip=127.0.0.1, zone_id=, p=0/tcp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>]
[node_type=Cluster::WORKER, ip=127.0.0.1, zone_id=, p=0/tcp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>]
supervised node zeek_done()

View file

@ -1,5 +1,5 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
supervised node zeek_init()
1024, cluster_nodes!
[node_type=Cluster::WORKER, ip=127.0.0.1, zone_id=, p=0/tcp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>]
[node_type=Cluster::WORKER, ip=127.0.0.1, zone_id=, p=0/tcp, interface=<uninitialized>, manager=<uninitialized>, time_machine=<uninitialized>, id=<uninitialized>, metrics_port=<uninitialized>]
supervised node zeek_done()

View file

@ -1,4 +1,4 @@
# @TEST-DOC: Query the Prometheus endpoint on 9911 and smoke check that zeek_version_info{...} is contained in the response for all cluster nodes.
# @TEST-DOC: Query the Prometheus endpoint and smoke check that zeek_version_info{...} is contained in the response for all cluster nodes.
# Note compilable to C++ due to globals being initialized to a record that
# has an opaque type as a field.
# @TEST-REQUIRES: test "${ZEEK_USE_CPP}" != "1"
@ -18,16 +18,35 @@
# @TEST-EXEC: btest-bg-run worker-1 ZEEKPATH=$ZEEKPATH:.. CLUSTER_NODE=worker-1 zeek -b %INPUT
# @TEST-EXEC: btest-bg-wait 10
# @TEST-EXEC: btest-diff manager-1/.stdout
# @TEST-EXEC: btest-diff manager-1/services.out
@TEST-START-FILE cluster-layout.zeek
redef Cluster::nodes = {
["manager-1"] = [$node_type=Cluster::MANAGER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT1"))],
["logger-1"] = [$node_type=Cluster::LOGGER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT2")), $manager="manager-1"],
["proxy-1"] = [$node_type=Cluster::PROXY, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT3")), $manager="manager-1"],
["worker-1"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT4")), $manager="manager-1"],
["manager-1"] = [$node_type=Cluster::MANAGER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT1")), $metrics_port=1028/tcp],
["logger-1"] = [$node_type=Cluster::LOGGER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT2")), $manager="manager-1", $metrics_port=1029/tcp],
["proxy-1"] = [$node_type=Cluster::PROXY, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT3")), $manager="manager-1", $metrics_port=1030/tcp],
["worker-1"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=to_port(getenv("BROKER_PORT4")), $manager="manager-1", $metrics_port=1031/tcp],
};
@TEST-END-FILE
@TEST-START-FILE request-services.sh
#!/bin/sh
# This script makes repeat curl requests to find all of the metrics data from the
# hosts listed in the services output from the manager, and outputs it all into a
# single file.
services_url=$1
output_file=$2
for host in $(curl -s -m 5 ${services_url} | jq '.[0].targets.[]'); do
host=$(echo ${host} | sed 's/"//g')
metrics=$(curl -s -m 5 http://${host}/metrics)
version_info=$(echo ${metrics} | grep -Eo "zeek_version_info\{[^}]+\}" | grep -o 'endpoint=\"[^"]*\"')
echo ${version_info} >> ${output_file};
done
@TEST-END-FILE
@load policy/frameworks/cluster/experimental
@load policy/frameworks/telemetry/prometheus
@load base/frameworks/telemetry
@ -38,45 +57,24 @@ redef Cluster::nodes = {
# Query the Prometheus endpoint using ActiveHTTP for testing, oh my.
event run_test()
{
local url = fmt("http://localhost:%s/metrics", port_to_count(Telemetry::metrics_port));
when [url] ( local response = ActiveHTTP::request([$url=url]) )
{
if ( response$code != 200 )
{
print fmt("ERROR: %s", response);
exit(1);
}
# Grumble grumble, ActiveHTTP actually joins away the \n characters
# from the response. Not sure how that's helpful. We simply
# grep out the zeek_version_info{...} endpoint="..." pieces and
# expect one for each node to exist as a smoke test.
local version_infos = find_all(response$body, /zeek_version_info\{[^}]+\}/, 0);
local endpoints: vector of string;
for ( info in version_infos )
for ( ep in find_all(info, /endpoint=\"[^"]+\"/))
endpoints += ep;
print sort(endpoints, strcmp);
terminate();
}
timeout 10sec
local services_url = fmt("http://localhost:%s/services.json", port_to_count(Telemetry::metrics_port));
local result = system(fmt("sh ../request-services.sh %s %s", services_url, "services.out"));
if ( result != 0 )
{
# This is bad.
print "ERROR: HTTP request timeout";
print "ERROR: Failed to request service information";
exit(1);
}
terminate();
}
# Use a dynamic metrics port for testing to avoid colliding on 9911/tcp
# when running tests in parallel.
global orig_metrics_port = Telemetry::metrics_port;
redef Telemetry::metrics_port = to_port(getenv("BROKER_TEST_METRICS_PORT"));
event zeek_init()
{
print Cluster::node, "original Telemetry::metrics_port", orig_metrics_port;
print Cluster::node, "Telemetry::metrics_port from cluster config", Telemetry::metrics_port;
}
event Cluster::Experimental::cluster_started()