Remove periodic pinging of controller by agents

This changes the agent-controller communication to remove the need for ongoing
pinging of the controller by agents not actively "in service". Instead, agents
now send the notify_agent_hello event to the controller to report only their
identity. The controller puts them into service via an agent_welcome_request/
response pair, and takes them out of service via agent_standby_request/response.
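
The controller-side counterpart of this handshake sits in the third changed
file, which this excerpt does not show. As a rough sketch of the check-in flow,
assuming illustrative names (g_instances, the per-agent topic construction, and
the use of unique_id() for request IDs are placeholders, not the commit's
actual implementation):

# Sketch: the controller registers an agent on check-in and puts it into service.
global g_instances: table[string] of ClusterController::Types::Instance;

event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
	{
	# Record (or refresh) the agent under its reported name.
	g_instances[instance] = ClusterController::Types::Instance($name=instance, $host=host);

	# Ask the agent to go into service; it confirms via agent_welcome_response.
	Broker::publish(ClusterAgent::topic_prefix + "/" + instance,
	    ClusterAgent::API::agent_welcome_request, unique_id("welcome_"));
	}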

This removes the on_change handler on the set of agents ready for service,
because not every change to this set is now a suitable time to potentially
send out the configuration. We now invoke this check explicitly in the two
situations where it's warranted: when an agent reports ready for service, and
when we've received a new configuration.
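
For the explicit check, a minimal sketch of the two call sites, assuming a
hypothetical helper (try_send_config and g_instances_ready are made-up names)
and the controller's client-facing set_configuration_request event; the handler
bodies are illustrative, not the code in this commit:

# Agents that have confirmed they are in service.
global g_instances_ready: set[string];

# Hypothetical helper: roll out the staged configuration once every agent it
# requires has reported ready.
function try_send_config()
	{
	# ... if all required instances are in g_instances_ready, publish
	# ClusterAgent::API::set_configuration_request to each agent ...
	}

# Situation 1: an agent reports ready for service.
event ClusterAgent::API::agent_welcome_response(reqid: string, result: ClusterController::Types::Result)
	{
	add g_instances_ready[result$instance];
	try_send_config();
	}

# Situation 2: the client has submitted a new configuration.
event ClusterController::API::set_configuration_request(reqid: string, config: ClusterController::Types::Configuration)
	{
	# ... stage the configuration, welcome any newly required agents ...
	try_send_config();
	}
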
Christian Kreibich 2021-12-20 16:30:58 -08:00
parent 8463f14a52
commit ac40d5c5b2
3 changed files with 208 additions and 113 deletions


@@ -8,14 +8,35 @@ export {
# Agent API events
# The controller uses this event to convey a new cluster
# configuration to the agent. Once processed, the agent
# responds with the response event.
global set_configuration_request: event(reqid: string,
config: ClusterController::Types::Configuration);
global set_configuration_response: event(reqid: string,
result: ClusterController::Types::Result);
# The controller uses this event to confirm to the agent
# that it is part of the current cluster. The agent
# acknowledges with the response event.
global agent_welcome_request: event(reqid: string);
global agent_welcome_response: event(reqid: string,
result: ClusterController::Types::Result);
# The controller sends this event to convey that the agent is not
# currently required. This status may later change, depending on
# updates from the client, so the peering can remain active. The
# agent releases any cluster-related resources when processing the
# request.
global agent_standby_request: event(reqid: string);
global agent_standby_response: event(reqid: string,
result: ClusterController::Types::Result);
# Notification events, agent -> controller
# Report agent being available.
# The agent sends this upon peering as a "check in", informing the
# controller that an agent of the given name is now available to
# communicate with.
global notify_agent_hello: event(instance: string, host: addr,
api_version: count);
@@ -30,11 +51,4 @@ export {
# Report informational message.
global notify_log: event(instance: string, msg: string, node: string &default="");
# Notification events, controller -> agent
# Confirmation from controller in response to notify_agent_hello
# that the agent is welcome.
global notify_controller_hello: event(controller: string, host: addr);
}


@@ -24,11 +24,6 @@ global g_nodes: table[string] of ClusterController::Types::Node;
# new configurations.
global g_data_cluster: table[string] of Supervisor::ClusterEndpoint;
# Whether we currently keep notifying the controller that we're here.
# We stop once the controller responds back, and resume when we've
# lost the peering.
global g_notify_controller: bool = T;
event SupervisorControl::create_response(reqid: string, result: string)
{
@@ -169,25 +164,44 @@ event ClusterAgent::API::set_configuration_request(reqid: string, config: Cluste
}
}
event ClusterAgent::API::notify_controller_hello(controller: string, host: addr)
event ClusterAgent::API::agent_welcome_request(reqid: string)
{
ClusterController::Log::info(fmt("rx ClusterAgent::API::notify_controller_hello %s %s", controller, host));
g_notify_controller = F;
ClusterController::Log::info(fmt("rx ClusterAgent::API::agent_welcome_request %s", reqid));
local res = ClusterController::Types::Result(
$reqid = reqid,
$instance = ClusterAgent::name);
ClusterController::Log::info(fmt("tx ClusterAgent::API::agent_welcome_response %s",
ClusterController::Types::result_to_string(res)));
event ClusterAgent::API::agent_welcome_response(reqid, res);
}
event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
event ClusterAgent::API::agent_standby_request(reqid: string)
{
if ( g_notify_controller )
schedule 1sec { ClusterAgent::API::notify_agent_hello(instance, host, api_version) };
ClusterController::Log::info(fmt("rx ClusterAgent::API::agent_standby_request %s", reqid));
# We shut down any existing cluster nodes via an empty configuration,
# and fall silent. We do not unpeer/disconnect (assuming we earlier
# peered/connected -- otherwise there's nothing we can do here via
# Broker anyway), mainly to keep open the possibility of running
# cluster nodes again later.
event ClusterAgent::API::set_configuration_request("", ClusterController::Types::Configuration());
local res = ClusterController::Types::Result(
$reqid = reqid,
$instance = ClusterAgent::name);
ClusterController::Log::info(fmt("tx ClusterAgent::API::agent_standby_response %s",
ClusterController::Types::result_to_string(res)));
event ClusterAgent::API::agent_standby_response(reqid, res);
}
event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
{
# This does not (cannot?) immediately verify that the new peer
# is in fact a controller, so we might send this redundantly.
# Controllers handle the hello event accordingly.
g_notify_controller = T;
# is in fact a controller, so we might send this in vain.
# Controllers register the agent upon receipt of the event.
local epi = ClusterAgent::endpoint_info();
@@ -218,6 +232,9 @@ event zeek_init()
# Auto-publish a bunch of events. Glob patterns or module-level
# auto-publish would be helpful here.
Broker::auto_publish(agent_topic, ClusterAgent::API::set_configuration_response);
Broker::auto_publish(agent_topic, ClusterAgent::API::agent_welcome_response);
Broker::auto_publish(agent_topic, ClusterAgent::API::agent_standby_response);
Broker::auto_publish(agent_topic, ClusterAgent::API::notify_agent_hello);
Broker::auto_publish(agent_topic, ClusterAgent::API::notify_change);
Broker::auto_publish(agent_topic, ClusterAgent::API::notify_error);