Management framework: improve handling of node run states

When agents receive a configuration, we don't currently honor requested run
states (there's no way to register a node without also running it, for
example). To better reflect actual run state, we now start nodes off in state
PENDING as we launch them via the Supervisor, and move them to RUNNING when
they check in with us via Management::Node::API::notify_node_hello.
Christian Kreibich 2022-04-15 18:27:26 -07:00
parent 497b2723d7
commit fcef7f4925

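For context, the node-side counterpart is not part of this diff: a supervised node publishes Management::Node::API::notify_node_hello to its agent once it is up, which is what triggers the PENDING-to-RUNNING transition handled at the end of the diff below. A minimal sketch of what that announcement could look like follows; the agent_topic constant, the use of Broker::peer_added as the trigger, and the assumption that the framework's node API (which declares the event) is already loaded are illustrative guesses, not taken from this commit.

# Node-side sketch (hypothetical): announce ourselves to the agent once the
# Broker peering is up, so the agent can move us from PENDING to RUNNING.

# Placeholder topic name; the real topic is defined by the Management framework.
const agent_topic = "zeek/management/agent" &redef;

event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
	{
	# Cluster::node is this node's name; the agent looks it up in its g_nodes table.
	Broker::publish(agent_topic, Management::Node::API::notify_node_hello, Cluster::node);
	}
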
@@ -132,11 +132,10 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M
 	for ( nodename in g_nodes )
 		supervisor_destroy(nodename);
 
-	g_nodes = table();
-
 	# Refresh the cluster and nodes tables
+	g_nodes = table();
 	g_cluster = table();
 
 	for ( node in config$nodes )
 		{
 		if ( node$instance == Management::Agent::name )
@@ -166,6 +165,8 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M
 	for ( nodename in g_nodes )
 		{
 		node = g_nodes[nodename];
+		node$state = Management::PENDING;
+
 		nc = Supervisor::NodeConfig($name=nodename);
 
 		if ( Management::Agent::cluster_directory != "" )
@@ -237,6 +238,11 @@ event SupervisorControl::status_response(reqid: string, result: Supervisor::Stat
 			{
 			cns$cluster_role = sns$node$cluster[node]$role;
 
+			# For cluster nodes, copy run state from g_nodes, our
+			# live node status table.
+			if ( node in g_nodes )
+				cns$state = g_nodes[node]$state;
+
 			# The supervisor's responses use 0/tcp (not 0/unknown)
 			# when indicating an unused port because its internal
 			# serialization always assumes TCP.
@@ -251,12 +257,22 @@ event SupervisorControl::status_response(reqid: string, result: Supervisor::Stat
 		if ( role == "CONTROLLER" )
 			{
 			cns$mgmt_role = Management::CONTROLLER;
+
+			# Automatically declare the controller in running state
+			# here -- we'd not have received a request that brought
+			# us here otherwise.
+			cns$state = Management::RUNNING;
+
 			# The controller always listens, so the Zeek client can connect.
 			cns$p = Management::Agent::endpoint_info()$network$bound_port;
 			}
 		else if ( role == "AGENT" )
 			{
 			cns$mgmt_role = Management::AGENT;
+
+			# Similarly to above, always declare agent running. We are. :)
+			cns$state = Management::RUNNING;
+
 			# If we have a controller address, the agent connects to it
 			# and does not listen. See zeek_init() below for similar logic.
 			if ( Management::Agent::controller$address == "0.0.0.0" )
@@ -268,13 +284,9 @@ event SupervisorControl::status_response(reqid: string, result: Supervisor::Stat
 				}
 			}
 
-		# A PID is available if a supervised node has fully launched
-		# and is therefore running.
+		# A PID is available if a supervised node has fully launched.
 		if ( sns?$pid )
-			{
 			cns$pid = sns$pid;
-			cns$state = Management::RUNNING;
-			}
 
 		node_statuses += cns;
 		}
@@ -409,6 +421,14 @@ event Management::Agent::API::agent_standby_request(reqid: string)
 	event Management::Agent::API::agent_standby_response(reqid, res);
 	}
 
+event Management::Node::API::notify_node_hello(node: string)
+	{
+	Management::Log::info(fmt("rx Management::Node::API::notify_node_hello %s", node));
+
+	if ( node in g_nodes )
+		g_nodes[node]$state = Management::RUNNING;
+	}
+
 event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
 	{
 	# This does not (cannot?) immediately verify that the new peer