diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 980fe92604..54f06a6875 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -152,7 +152,6 @@ jobs: - name: Run testsuite run: make -C testing/external/zeek-testing-cluster - continue-on-error: true # upload-artifact balks at certain characters in artifact # filenames, so substitute them for dots. diff --git a/CHANGES b/CHANGES index da03e99271..456c924b56 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,14 @@ +5.1.0-dev.261 | 2022-07-14 15:46:29 -0700 + + * Management framework: fix an agent boot-time race condition plus minor tweaks (Christian Kreibich, Corelight) + + - bump cluster testsuite + - undo the testsuite result mask + - additional logging tweaks + - additional context in a few log messages + - await Supervisor peering before sending agent's hello + - remove outdated comment + 5.1.0-dev.254 | 2022-07-14 09:48:20 -0700 * Add pow function (AmazingPP) diff --git a/VERSION b/VERSION index 767fe61ad5..86f87a61bf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.1.0-dev.254 +5.1.0-dev.261 diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 4111828770..54b9b28362 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -128,6 +128,11 @@ global restart_request_finish: function(req: Management::Request::Request); # a status response. global get_nodes_request_finish: function(req: Management::Request::Request); +# Whether we have peered with the Supervisor. We need to make sure we've peered +# prior to controller interactions, since we might otherwise send requests to +# the Supervisor that it never received. +global g_supervisor_peered = F; + # The global configuration, as deployed by the controller. global g_config: Management::Configuration; @@ -354,7 +359,8 @@ function supervisor_status(node: string): Management::Request::Request local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); - Management::Log::info(fmt("tx SupervisorControl::status_request %s %s", req$id, node)); + Management::Log::info(fmt("tx SupervisorControl::status_request %s %s", + req$id, node == "" ? "" : node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::status_request, req$id, node); @@ -366,7 +372,8 @@ function supervisor_create(nc: Supervisor::NodeConfig): Management::Request::Req local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = nc$name); - Management::Log::info(fmt("tx SupervisorControl::create_request %s %s", req$id, nc$name)); + Management::Log::info(fmt("tx SupervisorControl::create_request %s %s", + req$id, nc$name)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::create_request, req$id, nc); @@ -378,7 +385,8 @@ function supervisor_destroy(node: string): Management::Request::Request local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); - Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s", req$id, node)); + Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s", + req$id, node == "" ? "" : node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_request, req$id, node); @@ -390,7 +398,8 @@ function supervisor_restart(node: string): Management::Request::Request local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); - Management::Log::info(fmt("tx SupervisorControl::restart_request %s %s", req$id, node)); + Management::Log::info(fmt("tx SupervisorControl::restart_request %s %s", + req$id, node == "" ? "" : node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::restart_request, req$id, node); @@ -1049,6 +1058,14 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) local sni = supervisor_network_info(); if ( peer$network$address == sni$address && peer$network$bound_port == sni$bound_port ) + g_supervisor_peered = T; + + # If the Supervisor hasn't yet peered with us, don't broadcast + # notify_agent_hello. Doing so would exposes a race: we might receive + # commands from the controller that lead to requests to the Supervisor + # that it won't yet receive. It's easier to handle this here than to + # push the wait into all types of received commands. + if ( g_supervisor_peered == F ) return; # Supervisor aside, this does not (cannot?) immediately verify that the @@ -1063,10 +1080,6 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) Management::Agent::API::version); } -# XXX We may want a request timeout event handler here. It's arguably cleaner to -# send supervisor failure events back to the controller than to rely on its -# controller-agent request timeout to kick in. - event zeek_init() { local epi = Management::Agent::endpoint_info(); diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index c88e3127d0..be3052af19 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -236,7 +236,7 @@ function add_instance(inst: Management::Instance) local req = Management::Request::create(); - Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", inst$name)); + Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request %s to %s", req$id, inst$name)); Broker::publish(Management::Agent::topic_prefix + "/" + inst$name, Management::Agent::API::agent_welcome_request, req$id); } @@ -767,7 +767,7 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c # context for it from the configuration. Tell agent. local req = Management::Request::create(); - Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", instance)); + Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request %s to %s", req$id, instance)); Broker::publish(Management::Agent::topic_prefix + "/" + instance, Management::Agent::API::agent_welcome_request, req$id); } diff --git a/scripts/policy/frameworks/management/log.zeek b/scripts/policy/frameworks/management/log.zeek index 95798421a7..bc952f0971 100644 --- a/scripts/policy/frameworks/management/log.zeek +++ b/scripts/policy/frameworks/management/log.zeek @@ -122,7 +122,8 @@ function error(message: string) $role=r2s[Management::role], $message=message]); } -event zeek_init() +# Bump priority to ensure the log stream exists when other zeek_init handlers use it. +event zeek_init() &priority=5 { if ( ! Supervisor::is_supervised() ) return; diff --git a/scripts/policy/frameworks/management/node/main.zeek b/scripts/policy/frameworks/management/node/main.zeek index df65d754b9..7c3709cc0d 100644 --- a/scripts/policy/frameworks/management/node/main.zeek +++ b/scripts/policy/frameworks/management/node/main.zeek @@ -118,4 +118,6 @@ event zeek_init() Broker::peer(epi$network$address, epi$network$bound_port, Management::connect_retry); Broker::subscribe(node_topic); + + Management::Log::info(fmt("node %s is live, Broker ID %s", Cluster::node, Broker::node_id())); } diff --git a/testing/external/commit-hash.zeek-testing-cluster b/testing/external/commit-hash.zeek-testing-cluster index f63236b796..06da73542e 100644 --- a/testing/external/commit-hash.zeek-testing-cluster +++ b/testing/external/commit-hash.zeek-testing-cluster @@ -1 +1 @@ -e25a763d4545f35e8442c9d3145ede483d629a1a +6e95008a28f5fe5d26d8a77384eb8355536df25e