From a505a7814fe8d9da8dab3eab2eaa6a12fc54804b Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Tue, 5 Jul 2022 16:48:15 -0700 Subject: [PATCH 1/6] Management framework: remove outdated comment The agent has a request_expired timeout handler at this point. --- scripts/policy/frameworks/management/agent/main.zeek | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 4111828770..20f2714b91 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -1063,10 +1063,6 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) Management::Agent::API::version); } -# XXX We may want a request timeout event handler here. It's arguably cleaner to -# send supervisor failure events back to the controller than to rely on its -# controller-agent request timeout to kick in. - event zeek_init() { local epi = Management::Agent::endpoint_info(); From f6597ffabf4003df609e6b116fa96f79cbd9ebec Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Tue, 5 Jul 2022 17:10:59 -0700 Subject: [PATCH 2/6] Management framework: await Supervisor peering before sending agent's hello Failing to do so could open a race condition in which a quickly connecting controller could send instructions whose resulting Supervisor interactions got lost. --- .../policy/frameworks/management/agent/main.zeek | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 20f2714b91..690c5ae29e 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -128,6 +128,11 @@ global restart_request_finish: function(req: Management::Request::Request); # a status response. global get_nodes_request_finish: function(req: Management::Request::Request); +# Whether we have peered with the Supervisor. We need to make sure we've peered +# prior to controller interactions, since we might otherwise send requests to +# the Supervisor that it never received. +global g_supervisor_peered = F; + # The global configuration, as deployed by the controller. global g_config: Management::Configuration; @@ -1049,6 +1054,14 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) local sni = supervisor_network_info(); if ( peer$network$address == sni$address && peer$network$bound_port == sni$bound_port ) + g_supervisor_peered = T; + + # If the Supervisor hasn't yet peered with us, don't broadcast + # notify_agent_hello. Doing so would exposes a race: we might receive + # commands from the controller that lead to requests to the Supervisor + # that it won't yet receive. It's easier to handle this here than to + # push the wait into all types of received commands. + if ( g_supervisor_peered == F ) return; # Supervisor aside, this does not (cannot?) immediately verify that the From e947e1d1c2ead83bb36243bbef649c546335b45b Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Tue, 5 Jul 2022 17:19:36 -0700 Subject: [PATCH 3/6] Management framework: additional context in a few log messages This adds request IDs in a few places that didn't mention them, and makes requests to the Supervisor that act on all current nodes explicit. --- scripts/policy/frameworks/management/agent/main.zeek | 12 ++++++++---- .../frameworks/management/controller/main.zeek | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 690c5ae29e..54b9b28362 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -359,7 +359,8 @@ function supervisor_status(node: string): Management::Request::Request local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); - Management::Log::info(fmt("tx SupervisorControl::status_request %s %s", req$id, node)); + Management::Log::info(fmt("tx SupervisorControl::status_request %s %s", + req$id, node == "" ? "" : node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::status_request, req$id, node); @@ -371,7 +372,8 @@ function supervisor_create(nc: Supervisor::NodeConfig): Management::Request::Req local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = nc$name); - Management::Log::info(fmt("tx SupervisorControl::create_request %s %s", req$id, nc$name)); + Management::Log::info(fmt("tx SupervisorControl::create_request %s %s", + req$id, nc$name)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::create_request, req$id, nc); @@ -383,7 +385,8 @@ function supervisor_destroy(node: string): Management::Request::Request local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); - Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s", req$id, node)); + Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s", + req$id, node == "" ? "" : node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_request, req$id, node); @@ -395,7 +398,8 @@ function supervisor_restart(node: string): Management::Request::Request local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); - Management::Log::info(fmt("tx SupervisorControl::restart_request %s %s", req$id, node)); + Management::Log::info(fmt("tx SupervisorControl::restart_request %s %s", + req$id, node == "" ? "" : node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::restart_request, req$id, node); diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index c88e3127d0..be3052af19 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -236,7 +236,7 @@ function add_instance(inst: Management::Instance) local req = Management::Request::create(); - Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", inst$name)); + Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request %s to %s", req$id, inst$name)); Broker::publish(Management::Agent::topic_prefix + "/" + inst$name, Management::Agent::API::agent_welcome_request, req$id); } @@ -767,7 +767,7 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c # context for it from the configuration. Tell agent. local req = Management::Request::create(); - Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", instance)); + Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request %s to %s", req$id, instance)); Broker::publish(Management::Agent::topic_prefix + "/" + instance, Management::Agent::API::agent_welcome_request, req$id); } From ffebf99badc81d76f42c8a009994dedab0e3ca29 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Tue, 12 Jul 2022 17:51:32 -0700 Subject: [PATCH 4/6] Management framework: additional logging tweaks Ensure the framework's log stream exists prior to using it in zeek_init(), and use a node-is-live message similar to those in agent and controller also in launched nodes. --- scripts/policy/frameworks/management/log.zeek | 3 ++- scripts/policy/frameworks/management/node/main.zeek | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/policy/frameworks/management/log.zeek b/scripts/policy/frameworks/management/log.zeek index 95798421a7..bc952f0971 100644 --- a/scripts/policy/frameworks/management/log.zeek +++ b/scripts/policy/frameworks/management/log.zeek @@ -122,7 +122,8 @@ function error(message: string) $role=r2s[Management::role], $message=message]); } -event zeek_init() +# Bump priority to ensure the log stream exists when other zeek_init handlers use it. +event zeek_init() &priority=5 { if ( ! Supervisor::is_supervised() ) return; diff --git a/scripts/policy/frameworks/management/node/main.zeek b/scripts/policy/frameworks/management/node/main.zeek index df65d754b9..7c3709cc0d 100644 --- a/scripts/policy/frameworks/management/node/main.zeek +++ b/scripts/policy/frameworks/management/node/main.zeek @@ -118,4 +118,6 @@ event zeek_init() Broker::peer(epi$network$address, epi$network$bound_port, Management::connect_retry); Broker::subscribe(node_topic); + + Management::Log::info(fmt("node %s is live, Broker ID %s", Cluster::node, Broker::node_id())); } From 600ae4ba6b8ba9e171991b619436f539415cfac6 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 8 Jul 2022 22:52:14 -0700 Subject: [PATCH 5/6] Management framework: undo the testsuite result mask --- .github/workflows/docker.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 980fe92604..54f06a6875 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -152,7 +152,6 @@ jobs: - name: Run testsuite run: make -C testing/external/zeek-testing-cluster - continue-on-error: true # upload-artifact balks at certain characters in artifact # filenames, so substitute them for dots. From b68f9bee61ff0cf787e872d6045337aad19a646b Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Mon, 11 Jul 2022 14:10:53 -0700 Subject: [PATCH 6/6] Management framework: bump cluster testsuite --- testing/external/commit-hash.zeek-testing-cluster | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/external/commit-hash.zeek-testing-cluster b/testing/external/commit-hash.zeek-testing-cluster index f63236b796..95d31313a2 100644 --- a/testing/external/commit-hash.zeek-testing-cluster +++ b/testing/external/commit-hash.zeek-testing-cluster @@ -1 +1 @@ -e25a763d4545f35e8442c9d3145ede483d629a1a +76f6824e0505fa855026ee52f71611e1431e931f