Remove periodic pinging of controller by agents

This changes the agent-controller communication to remove the need for ongoing
pinging of the controller by agents not actively "in service". Instead, agents
now send the notify_agent_hello event to the controller to report only their
identity. The controller puts them into service via an agent_welcome_request/
response pair, and takes them out of service via agent_standby_request/response.
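
The controller-side counterpart of this handshake sits in the third changed
file, which this excerpt does not show. As a rough sketch of the check-in flow,
assuming illustrative names (g_instances, the per-agent topic construction, and
the use of unique_id() for request IDs are placeholders, not the commit's
actual implementation):

# Sketch: the controller registers an agent on check-in and puts it into service.
global g_instances: table[string] of ClusterController::Types::Instance;

event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
	{
	# Record (or refresh) the agent under its reported name.
	g_instances[instance] = ClusterController::Types::Instance($name=instance, $host=host);

	# Ask the agent to go into service; it confirms via agent_welcome_response.
	Broker::publish(ClusterAgent::topic_prefix + "/" + instance,
	    ClusterAgent::API::agent_welcome_request, unique_id("welcome_"));
	}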

This removes the on_change handler on the set of agents ready for service,
because not every change to this set is now a suitable time to potentially
send out the configuration. We now invoke this check explicitly in the two
situations where it's warranted: when an agent reports ready for service, and
when we've received a new configuration.
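
For the explicit check, a minimal sketch of the two call sites, assuming a
hypothetical helper (try_send_config and g_instances_ready are made-up names)
and the controller's client-facing set_configuration_request event; the handler
bodies are illustrative, not the code in this commit:

# Agents that have confirmed they are in service.
global g_instances_ready: set[string];

# Hypothetical helper: roll out the staged configuration once every agent it
# requires has reported ready.
function try_send_config()
	{
	# ... if all required instances are in g_instances_ready, publish
	# ClusterAgent::API::set_configuration_request to each agent ...
	}

# Situation 1: an agent reports ready for service.
event ClusterAgent::API::agent_welcome_response(reqid: string, result: ClusterController::Types::Result)
	{
	add g_instances_ready[result$instance];
	try_send_config();
	}

# Situation 2: the client has submitted a new configuration.
event ClusterController::API::set_configuration_request(reqid: string, config: ClusterController::Types::Configuration)
	{
	# ... stage the configuration, welcome any newly required agents ...
	try_send_config();
	}
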
Christian Kreibich 2021-12-20 16:30:58 -08:00
parent 8463f14a52
commit ac40d5c5b2
3 changed files with 208 additions and 113 deletions


@@ -8,14 +8,35 @@ export {
# Agent API events
# The controller uses this event to convey a new cluster
# configuration to the agent. Once processed, the agent
# responds with the response event.
global set_configuration_request: event(reqid: string,
config: ClusterController::Types::Configuration);
global set_configuration_response: event(reqid: string,
result: ClusterController::Types::Result);
# The controller uses this event to confirm to the agent
# that it is part of the current cluster. The agent
# acknowledges with the response event.
global agent_welcome_request: event(reqid: string);
global agent_welcome_response: event(reqid: string,
result: ClusterController::Types::Result);
# The controller sends this event to convey that the agent is not
# currently required. This status may later change, depending on
# updates from the client, so the peering can remain active. The
# agent releases any cluster-related resources when processing the
# request.
global agent_standby_request: event(reqid: string);
global agent_standby_response: event(reqid: string,
result: ClusterController::Types::Result);
# Notification events, agent -> controller
# Report agent being available.
# The agent sends this upon peering as a "check in", informing the
# controller that an agent of the given name is now available to
# communicate with.
global notify_agent_hello: event(instance: string, host: addr,
api_version: count);
@@ -30,11 +51,4 @@ export {
# Report informational message.
global notify_log: event(instance: string, msg: string, node: string &default="");
# Notification events, controller -> agent
# Confirmation from controller in response to notify_agent_hello
# that the agent is welcome.
global notify_controller_hello: event(controller: string, host: addr);
}


@@ -24,11 +24,6 @@ global g_nodes: table[string] of ClusterController::Types::Node;
# new configurations.
global g_data_cluster: table[string] of Supervisor::ClusterEndpoint;
# Whether we currently keep notifying the controller that we're here.
# We stop once the controller responds back, and resume when we've
# lost the peering.
global g_notify_controller: bool = T;
event SupervisorControl::create_response(reqid: string, result: string)
{
@@ -169,25 +164,44 @@ event ClusterAgent::API::set_configuration_request(reqid: string, config: Cluste
}
}
event ClusterAgent::API::notify_controller_hello(controller: string, host: addr)
event ClusterAgent::API::agent_welcome_request(reqid: string)
{
ClusterController::Log::info(fmt("rx ClusterAgent::API::notify_controller_hello %s %s", controller, host));
g_notify_controller = F;
ClusterController::Log::info(fmt("rx ClusterAgent::API::agent_welcome_request %s", reqid));
local res = ClusterController::Types::Result(
$reqid = reqid,
$instance = ClusterAgent::name);
ClusterController::Log::info(fmt("tx ClusterAgent::API::agent_welcome_response %s",
ClusterController::Types::result_to_string(res)));
event ClusterAgent::API::agent_welcome_response(reqid, res);
}
event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
event ClusterAgent::API::agent_standby_request(reqid: string)
{
if ( g_notify_controller )
schedule 1sec { ClusterAgent::API::notify_agent_hello(instance, host, api_version) };
ClusterController::Log::info(fmt("rx ClusterAgent::API::agent_standby_request %s", reqid));
# We shut down any existing cluster nodes via an empty configuration,
# and fall silent. We do not unpeer/disconnect (assuming we earlier
# peered/connected -- otherwise there's nothing we can do here via
# Broker anyway), mainly to keep open the possibility of running
# cluster nodes again later.
event ClusterAgent::API::set_configuration_request("", ClusterController::Types::Configuration());
local res = ClusterController::Types::Result(
$reqid = reqid,
$instance = ClusterAgent::name);
ClusterController::Log::info(fmt("tx ClusterAgent::API::agent_standby_response %s",
ClusterController::Types::result_to_string(res)));
event ClusterAgent::API::agent_standby_response(reqid, res);
}
event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
{
# This does not (cannot?) immediately verify that the new peer
# is in fact a controller, so we might send this redundantly.
# Controllers handle the hello event accordingly.
g_notify_controller = T;
# is in fact a controller, so we might send this in vain.
# Controllers register the agent upon receipt of the event.
local epi = ClusterAgent::endpoint_info();
@@ -218,6 +232,9 @@ event zeek_init()
# Auto-publish a bunch of events. Glob patterns or module-level
# auto-publish would be helpful here.
Broker::auto_publish(agent_topic, ClusterAgent::API::set_configuration_response);
Broker::auto_publish(agent_topic, ClusterAgent::API::agent_welcome_response);
Broker::auto_publish(agent_topic, ClusterAgent::API::agent_standby_response);
Broker::auto_publish(agent_topic, ClusterAgent::API::notify_agent_hello);
Broker::auto_publish(agent_topic, ClusterAgent::API::notify_change);
Broker::auto_publish(agent_topic, ClusterAgent::API::notify_error);