Merge branch 'topic/christian/management-agent-boot-fix'

* topic/christian/management-agent-boot-fix:
  Management framework: bump cluster testsuite
  Management framework: undo the testsuite result mask
  Management framework: additional logging tweaks
  Management framework: additional context in a few log messages
  Management framework: await Supervisor peering before sending agent's hello
  Management framework: remove outdated comment
This commit is contained in:
Christian Kreibich 2022-07-14 15:46:29 -07:00
commit 6543c21d23
8 changed files with 40 additions and 14 deletions

View file

@ -152,7 +152,6 @@ jobs:
- name: Run testsuite - name: Run testsuite
run: make -C testing/external/zeek-testing-cluster run: make -C testing/external/zeek-testing-cluster
continue-on-error: true
# upload-artifact balks at certain characters in artifact # upload-artifact balks at certain characters in artifact
# filenames, so substitute them for dots. # filenames, so substitute them for dots.

11
CHANGES
View file

@ -1,3 +1,14 @@
5.1.0-dev.261 | 2022-07-14 15:46:29 -0700
* Management framework: fix an agent boot-time race condition plus minor tweaks (Christian Kreibich, Corelight)
- bump cluster testsuite
- undo the testsuite result mask
- additional logging tweaks
- additional context in a few log messages
- await Supervisor peering before sending agent's hello
- remove outdated comment
5.1.0-dev.254 | 2022-07-14 09:48:20 -0700 5.1.0-dev.254 | 2022-07-14 09:48:20 -0700
* Add pow function (AmazingPP) * Add pow function (AmazingPP)

View file

@ -1 +1 @@
5.1.0-dev.254 5.1.0-dev.261

View file

@ -128,6 +128,11 @@ global restart_request_finish: function(req: Management::Request::Request);
# a status response. # a status response.
global get_nodes_request_finish: function(req: Management::Request::Request); global get_nodes_request_finish: function(req: Management::Request::Request);
# Whether we have peered with the Supervisor. We need to make sure we've peered
# prior to controller interactions, since we might otherwise send requests to
# the Supervisor that it never received.
global g_supervisor_peered = F;
# The global configuration, as deployed by the controller. # The global configuration, as deployed by the controller.
global g_config: Management::Configuration; global g_config: Management::Configuration;
@ -354,7 +359,8 @@ function supervisor_status(node: string): Management::Request::Request
local req = Management::Request::create(); local req = Management::Request::create();
req$supervisor_state_agent = SupervisorState($node = node); req$supervisor_state_agent = SupervisorState($node = node);
Management::Log::info(fmt("tx SupervisorControl::status_request %s %s", req$id, node)); Management::Log::info(fmt("tx SupervisorControl::status_request %s %s",
req$id, node == "" ? "<all>" : node));
Broker::publish(SupervisorControl::topic_prefix, Broker::publish(SupervisorControl::topic_prefix,
SupervisorControl::status_request, req$id, node); SupervisorControl::status_request, req$id, node);
@ -366,7 +372,8 @@ function supervisor_create(nc: Supervisor::NodeConfig): Management::Request::Req
local req = Management::Request::create(); local req = Management::Request::create();
req$supervisor_state_agent = SupervisorState($node = nc$name); req$supervisor_state_agent = SupervisorState($node = nc$name);
Management::Log::info(fmt("tx SupervisorControl::create_request %s %s", req$id, nc$name)); Management::Log::info(fmt("tx SupervisorControl::create_request %s %s",
req$id, nc$name));
Broker::publish(SupervisorControl::topic_prefix, Broker::publish(SupervisorControl::topic_prefix,
SupervisorControl::create_request, req$id, nc); SupervisorControl::create_request, req$id, nc);
@ -378,7 +385,8 @@ function supervisor_destroy(node: string): Management::Request::Request
local req = Management::Request::create(); local req = Management::Request::create();
req$supervisor_state_agent = SupervisorState($node = node); req$supervisor_state_agent = SupervisorState($node = node);
Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s", req$id, node)); Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s",
req$id, node == "" ? "<all>" : node));
Broker::publish(SupervisorControl::topic_prefix, Broker::publish(SupervisorControl::topic_prefix,
SupervisorControl::destroy_request, req$id, node); SupervisorControl::destroy_request, req$id, node);
@ -390,7 +398,8 @@ function supervisor_restart(node: string): Management::Request::Request
local req = Management::Request::create(); local req = Management::Request::create();
req$supervisor_state_agent = SupervisorState($node = node); req$supervisor_state_agent = SupervisorState($node = node);
Management::Log::info(fmt("tx SupervisorControl::restart_request %s %s", req$id, node)); Management::Log::info(fmt("tx SupervisorControl::restart_request %s %s",
req$id, node == "" ? "<all>" : node));
Broker::publish(SupervisorControl::topic_prefix, Broker::publish(SupervisorControl::topic_prefix,
SupervisorControl::restart_request, req$id, node); SupervisorControl::restart_request, req$id, node);
@ -1049,6 +1058,14 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
local sni = supervisor_network_info(); local sni = supervisor_network_info();
if ( peer$network$address == sni$address && peer$network$bound_port == sni$bound_port ) if ( peer$network$address == sni$address && peer$network$bound_port == sni$bound_port )
g_supervisor_peered = T;
# If the Supervisor hasn't yet peered with us, don't broadcast
# notify_agent_hello. Doing so would exposes a race: we might receive
# commands from the controller that lead to requests to the Supervisor
# that it won't yet receive. It's easier to handle this here than to
# push the wait into all types of received commands.
if ( g_supervisor_peered == F )
return; return;
# Supervisor aside, this does not (cannot?) immediately verify that the # Supervisor aside, this does not (cannot?) immediately verify that the
@ -1063,10 +1080,6 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
Management::Agent::API::version); Management::Agent::API::version);
} }
# XXX We may want a request timeout event handler here. It's arguably cleaner to
# send supervisor failure events back to the controller than to rely on its
# controller-agent request timeout to kick in.
event zeek_init() event zeek_init()
{ {
local epi = Management::Agent::endpoint_info(); local epi = Management::Agent::endpoint_info();

View file

@ -236,7 +236,7 @@ function add_instance(inst: Management::Instance)
local req = Management::Request::create(); local req = Management::Request::create();
Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", inst$name)); Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request %s to %s", req$id, inst$name));
Broker::publish(Management::Agent::topic_prefix + "/" + inst$name, Broker::publish(Management::Agent::topic_prefix + "/" + inst$name,
Management::Agent::API::agent_welcome_request, req$id); Management::Agent::API::agent_welcome_request, req$id);
} }
@ -767,7 +767,7 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c
# context for it from the configuration. Tell agent. # context for it from the configuration. Tell agent.
local req = Management::Request::create(); local req = Management::Request::create();
Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", instance)); Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request %s to %s", req$id, instance));
Broker::publish(Management::Agent::topic_prefix + "/" + instance, Broker::publish(Management::Agent::topic_prefix + "/" + instance,
Management::Agent::API::agent_welcome_request, req$id); Management::Agent::API::agent_welcome_request, req$id);
} }

View file

@ -122,7 +122,8 @@ function error(message: string)
$role=r2s[Management::role], $message=message]); $role=r2s[Management::role], $message=message]);
} }
event zeek_init() # Bump priority to ensure the log stream exists when other zeek_init handlers use it.
event zeek_init() &priority=5
{ {
if ( ! Supervisor::is_supervised() ) if ( ! Supervisor::is_supervised() )
return; return;

View file

@ -118,4 +118,6 @@ event zeek_init()
Broker::peer(epi$network$address, epi$network$bound_port, Management::connect_retry); Broker::peer(epi$network$address, epi$network$bound_port, Management::connect_retry);
Broker::subscribe(node_topic); Broker::subscribe(node_topic);
Management::Log::info(fmt("node %s is live, Broker ID %s", Cluster::node, Broker::node_id()));
} }

View file

@ -1 +1 @@
e25a763d4545f35e8442c9d3145ede483d629a1a 6e95008a28f5fe5d26d8a77384eb8355536df25e