Add ClusterController::API::notify_agents_ready event

This changes the basic agent-management model to one in which the configurations
received from the client define not just the data cluster, but also the set
of acceptable instances. Unless connectivity already exists, the controller will
establish peerings with new agents that listen, or wait for ones that connect to
the controller to check in.

Once all required agents are available, the controller triggers the new
notify_agents_ready event, an agent/controller-level "cluster-is-ready"
event. The controller also uses this event to submit a pending config update to
the now-ready instances.
This commit is contained in:
Christian Kreibich 2021-11-29 11:41:49 -08:00
parent b57be021b7
commit 09d9be3433
5 changed files with 221 additions and 171 deletions

View file

@ -30,4 +30,11 @@ export {
# Report informational message. # Report informational message.
global notify_log: event(instance: string, msg: string, node: string &default=""); global notify_log: event(instance: string, msg: string, node: string &default="");
# Notification events, controller -> agent
# Confirmation from controller in response to notify_agent_hello
# that the agent is welcome.
global notify_controller_hello: event(controller: string, host: addr);
} }

View file

@ -22,6 +22,12 @@ global g_nodes: table[string] of ClusterController::Types::Node;
# new configurations. # new configurations.
global g_data_cluster: table[string] of Supervisor::ClusterEndpoint; global g_data_cluster: table[string] of Supervisor::ClusterEndpoint;
# Whether we currently keep notifying the controller that we're here.
# We stop once the controller responds back, and resume when we've
# lost the peering.
global g_notify_controller: bool = T;
event SupervisorControl::create_response(reqid: string, result: string) event SupervisorControl::create_response(reqid: string, result: string)
{ {
local req = ClusterController::Request::lookup(reqid); local req = ClusterController::Request::lookup(reqid);
@ -161,14 +167,28 @@ event ClusterAgent::API::set_configuration_request(reqid: string, config: Cluste
} }
} }
# Confirmation from the controller that it has received our
# notify_agent_hello and accepts us. Clearing g_notify_controller stops
# the agent's periodic re-sending of the hello event.
event ClusterAgent::API::notify_controller_hello(controller: string, host: addr)
{
ClusterController::Log::info(fmt("rx ClusterAgent::API::notify_controller_hello %s %s", controller, host));
g_notify_controller = F;
}
# The agent's own hello event. While the controller has not yet confirmed
# us (g_notify_controller is T), re-raise this event every second so it
# keeps reaching the controller until it responds with
# notify_controller_hello. NOTE(review): the re-raised event presumably
# reaches the controller via an auto-publish on the controller topic —
# confirm against the agent's zeek_init setup.
event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
{
if ( g_notify_controller )
schedule 1sec { ClusterAgent::API::notify_agent_hello(instance, host, api_version) };
}
event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
{ {
# This does not (cannot?) immediately verify that the new peer # This does not (cannot?) immediately verify that the new peer
# is in fact a controller, so we might send this redundantly. # is in fact a controller, so we might send this redundantly.
# Controllers handle the hello event accordingly. # Controllers handle the hello event accordingly.
g_notify_controller = T;
local epi = ClusterAgent::endpoint_info(); local epi = ClusterAgent::endpoint_info();
# XXX deal with unexpected peers, unless we're okay with it
event ClusterAgent::API::notify_agent_hello(epi$id, event ClusterAgent::API::notify_agent_hello(epi$id,
to_addr(epi$network$address), ClusterAgent::API::version); to_addr(epi$network$address), ClusterAgent::API::version);
} }

View file

@ -5,6 +5,10 @@ module ClusterController::API;
export { export {
const version = 1; const version = 1;
# Triggered when the operational instances align with desired ones, as
# defined by the latest cluster config sent by the client.
global notify_agents_ready: event(instances: set[string]);
global get_instances_request: event(reqid: string); global get_instances_request: event(reqid: string);
global get_instances_response: event(reqid: string, global get_instances_response: event(reqid: string,
instances: vector of ClusterController::Types::Instance); instances: vector of ClusterController::Types::Instance);

View file

@ -28,13 +28,6 @@ export {
# The controller listens for messages on this topic: # The controller listens for messages on this topic:
const topic = "zeek/cluster-control/controller" &redef; const topic = "zeek/cluster-control/controller" &redef;
# The set of agents to interact with. When this is non-empty
# at startup, the controller contacts the agents; when it is
# empty, it waits for agents to connect. The key is the name of
# each instance. This should match the $name member of the
# instance records.
const instances: table[string] of ClusterController::Types::Instance = { } &redef;
# The role of this node in cluster management. Agent and # The role of this node in cluster management. Agent and
# controller both redef this. Used during logging. # controller both redef this. Used during logging.
const role = ClusterController::Types::NONE &redef; const role = ClusterController::Types::NONE &redef;

View file

@ -6,16 +6,35 @@
@load ./api @load ./api
@load ./log @load ./log
@load ./request @load ./request
@load ./util
redef ClusterController::role = ClusterController::Types::CONTROLLER; redef ClusterController::role = ClusterController::Types::CONTROLLER;
global g_config_current: ClusterController::Types::Configuration; global check_instances_ready: function(insts: set[string], tc: TableChange, inst: string);
global add_instance: function(inst: ClusterController::Types::Instance);
global drop_instance: function(inst: ClusterController::Types::Instance);
# The desired set of agents the controller interacts with, as provided by the
# most recent config update sent by the client. The key is the name of each
# instance. This should match the $name member of the instance records.
global g_instances: table[string] of ClusterController::Types::Instance = table();
# A corresponding set of instances/agents that we track in order to understand
# when all of the above instances have checked in with a notify_agent_hello
# event. (An alternative would be to use a record that adds a single state bit
# for each instance, and store that above.)
global g_instances_ready: set[string] = set() &on_change=check_instances_ready;
# The request ID of the most recent configuration update that's come in from
# a client. We track it here until we know we are ready to communicate with all
# agents required by the update.
global g_config_reqid_pending: string = ""; global g_config_reqid_pending: string = "";
function send_config_to_agents(req: ClusterController::Request::Request, function send_config_to_agents(req: ClusterController::Request::Request,
config: ClusterController::Types::Configuration) config: ClusterController::Types::Configuration)
{ {
for ( name in ClusterController::instances ) for ( name in g_instances )
{ {
local agent_topic = ClusterAgent::topic_prefix + "/" + name; local agent_topic = ClusterAgent::topic_prefix + "/" + name;
local areq = ClusterController::Request::create(); local areq = ClusterController::Request::create();
@ -26,100 +45,129 @@ function send_config_to_agents(req: ClusterController::Request::Request,
# and once all are, we respond back to the client. # and once all are, we respond back to the client.
req$set_configuration_state$requests += areq; req$set_configuration_state$requests += areq;
# XXX could also broadcast just once on the agent prefix, but # We could also broadcast just once on the agent prefix, but
# explicit request/response pairs for each agent seems cleaner. # explicit request/response pairs for each agent seems cleaner.
ClusterController::Log::info(fmt("tx ClusterAgent::API::set_configuration_request %s to %s", areq$id, name)); ClusterController::Log::info(fmt("tx ClusterAgent::API::set_configuration_request %s to %s", areq$id, name));
Broker::publish(agent_topic, ClusterAgent::API::set_configuration_request, areq$id, config); Broker::publish(agent_topic, ClusterAgent::API::set_configuration_request, areq$id, config);
} }
} }
# This is the &on_change handler for the g_instances_ready set.
# Parameters mirror Zeek's &on_change signature for sets: the changed
# set, the kind of change, and the affected element. Note that the loop
# below reuses the 'inst' parameter as its iteration variable; the
# incoming argument value is not used.
function check_instances_ready(insts: set[string], tc: TableChange, inst: string)
{
local cur_instances: set[string];
# See if the new update to the readiness set makes it match the current
# instances. If so, trigger the notify_agents_ready event.
if ( tc == TABLE_ELEMENT_NEW || tc == TABLE_ELEMENT_REMOVED )
{
# Build the set of currently configured instance names so we can
# compare it against the set of instances that have checked in.
for ( inst in g_instances )
add cur_instances[inst];
if ( cur_instances == g_instances_ready )
{
event ClusterController::API::notify_agents_ready(cur_instances);
}
}
}
# Registers a new instance with the controller. When the instance
# exposes a listen port, we initiate a Broker peering to it (retrying
# per ClusterController::connect_retry); instances without a port are
# expected to connect to us instead.
function add_instance(inst: ClusterController::Types::Instance)
{
g_instances[inst$name] = inst;
if ( inst?$listen_port )
Broker::peer(cat(inst$host), inst$listen_port,
ClusterController::connect_retry);
}
function drop_instance(inst: ClusterController::Types::Instance) function drop_instance(inst: ClusterController::Types::Instance)
{ {
if ( inst$name in ClusterController::instances ) if ( inst$name !in g_instances )
{ return;
# Send this agent a config that will terminate any data cluster
# nodes it might have. This is "fire and forget" -- there will
# not be a response.
Broker::publish(ClusterAgent::topic_prefix + "/" + inst$name,
ClusterAgent::API::set_configuration_request,
"", ClusterController::Types::Configuration());
delete ClusterController::instances[inst$name]; # Send this agent a config that will terminate any data cluster
} # nodes it might have. This is "fire and forget" -- there will
# not be a response.
Broker::publish(ClusterAgent::topic_prefix + "/" + inst$name,
ClusterAgent::API::set_configuration_request,
"", ClusterController::Types::Configuration());
# If the instance has a port, we peered with it, so now unpeer. # If the instance has a port, we peered with it, so now unpeer.
if ( inst?$listen_port ) if ( inst?$listen_port )
Broker::unpeer(cat(inst$host), inst$listen_port ); Broker::unpeer(cat(inst$host), inst$listen_port );
delete g_instances[inst$name];
if ( inst$name in g_instances_ready )
delete g_instances_ready[inst$name];
ClusterController::Log::info(fmt("dropped instance %s", inst$name)); ClusterController::Log::info(fmt("dropped instance %s", inst$name));
} }
event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count) event ClusterController::API::notify_agents_ready(instances: set[string])
{ {
# See if we already know about this agent; if not, register local insts = ClusterController::Util::set_to_vector(instances);
# it.
#
# XXX protection against rogue agents?
local inst: ClusterController::Types::Instance; ClusterController::Log::info(fmt("rx ClusterController::API:notify_agents_ready %s", join_string_vec(insts, ",")));
if ( instance in ClusterController::instances )
{
# Do nothing, unless this known agent checks in with a mismatching
# API version, in which case we kick it out.
if ( api_version != ClusterController::API::version )
drop_instance(ClusterController::instances[instance]);
# Update the instance name in the pointed-to record, in case it
# was previously named otherwise. Not being too picky here allows
# the user some leeway in spelling out the original config.
ClusterController::instances[instance]$name = instance;
return;
}
if ( api_version != ClusterController::API::version )
{
ClusterController::Log::warning(
fmt("agent %s/%s speaks incompatible agent protocol (%s, need %s), ignoring",
instance, host, api_version, ClusterController::API::version));
return;
}
ClusterController::instances[instance] = ClusterController::Types::Instance($name=instance, $host=host);
ClusterController::Log::info(fmt("instance %s/%s has checked in", instance, host));
# If we have a pending configuration request, check in on it now to see whether
# we have all agents required, and finalize the config request.
if ( g_config_reqid_pending == "" )
return;
# When all agents are ready, send them the pending config update.
local req = ClusterController::Request::lookup(g_config_reqid_pending); local req = ClusterController::Request::lookup(g_config_reqid_pending);
# If the request isn't available or doesn't have config state, we just
# clear it out and stop.
if ( ClusterController::Request::is_null(req) || ! req?$set_configuration_state ) if ( ClusterController::Request::is_null(req) || ! req?$set_configuration_state )
{ {
# Odd, just clear out pending state.
g_config_reqid_pending = ""; g_config_reqid_pending = "";
return; return;
} }
for ( inst in req$set_configuration_state$config$instances ) # All instances requested in the pending configuration update are now
{ # known to us. Send them the config. As they send their response events
if ( inst$name !in ClusterController::instances ) # we update the client's request state and eventually send the response
{ # event to it.
# The request still has instances not known to us, try again
# later.
return;
}
}
# All instances requested in the configuration are now known to us.
# Send them the config. As they send their response events we
# update the request state and eventually send the response event
# to the client.
send_config_to_agents(req, req$set_configuration_state$config); send_config_to_agents(req, req$set_configuration_state$config);
# The request object continues to exist and will be referenced by the
# responses coming in, but there's no longer a pending config update to
# track.
g_config_reqid_pending = ""; g_config_reqid_pending = "";
} }
event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
{
if ( instance !in g_instances )
{
# An unknown agent has checked in. This can happen with agents that aren't yet
# showing in a configuration received by the client. We log at debug level only.
ClusterController::Log::debug(
fmt("unknown instance %s/%s has checked in, ignoring", instance, host));
return;
}
ClusterController::Log::info(fmt("rx ClusterAgent::API::notify_agent_hello %s %s", instance, host));
local inst = g_instances[instance];
# When a known agent checks in with a mismatching API version we kick it out.
if ( api_version != ClusterController::API::version )
{
ClusterController::Log::warning(
fmt("instance %s/%s has checked in with incompatible API version %s, dropping",
instance, host, api_version));
drop_instance(inst);
return;
}
if ( instance !in g_instances_ready )
{
ClusterController::Log::info(fmt("instance %s/%s has checked in", instance, host));
add g_instances_ready[instance];
}
Broker::publish(ClusterAgent::topic_prefix + "/" + instance,
ClusterAgent::API::notify_controller_hello, ClusterController::name,
to_addr(ClusterController::network_info()$address));
}
event ClusterAgent::API::notify_change(instance: string, n: ClusterController::Types::Node, event ClusterAgent::API::notify_change(instance: string, n: ClusterController::Types::Node,
old: ClusterController::Types::State, old: ClusterController::Types::State,
@ -213,6 +261,7 @@ event ClusterController::API::set_configuration_request(reqid: string, config: C
local res: ClusterController::Types::Result; local res: ClusterController::Types::Result;
local req = ClusterController::Request::create(reqid); local req = ClusterController::Request::create(reqid);
req$set_configuration_state = ClusterController::Request::SetConfigurationState($config = config); req$set_configuration_state = ClusterController::Request::SetConfigurationState($config = config);
# At the moment there can only be one pending request. # At the moment there can only be one pending request.
@ -230,91 +279,78 @@ event ClusterController::API::set_configuration_request(reqid: string, config: C
return; return;
} }
# Compare the instance configuration to our current one. For instances # Compare the instance configuration to our current one. If it matches,
# that are supposed to be checking in with us but have not, record # we can proceed to deploying the new data cluster topology. If it does
# errors. We will fail the request in those cases. For instances we # not, we need to establish connectivity with agents we connect to, or
# don't know and are supposed to connect to, do so next. # wait until all instances that connect to us have done so. Either triggers
# a notify_agents_ready event, upon which we then deploy the data cluster.
# The current & new set of instance names.
local insts_current: set[string];
local insts_new: set[string];
# A set of current instances not contained in the new config.
# Those will need to get dropped.
local insts_to_drop: set[string];
# The opposite: new instances not yet in our current set. Those we will need
# to establish contact with (or they with us).
local insts_to_add: set[string];
# The overlap: instances in both the current and new set. For those we verify
# that we're actually dealing with the same entities, and might need to re-
# connect if not.
local insts_to_keep: set[string];
# Alternative representation of insts_to_add, directly providing the instances.
local insts_to_peer: table[string] of ClusterController::Types::Instance;
# Helpful locals.
local inst_name: string; local inst_name: string;
local inst: ClusterController::Types::Instance; local inst: ClusterController::Types::Instance;
local insts_to_add: ClusterController::Types::InstanceVec;
# A set of instances not contained in the config, that therefore needs for ( inst_name in g_instances )
# to be dropped. add insts_current[inst_name];
local insts_to_drop: set[string] = {}; for ( inst in config$instances )
add insts_new[inst$name];
for ( inst_name in ClusterController::instances ) # Populate TODO lists for instances we need to drop, check, or add.
add insts_to_drop[inst_name]; insts_to_drop = insts_current - insts_new;
insts_to_add = insts_new - insts_current;
insts_to_keep = insts_new & insts_current;
for ( inst in config$instances ) for ( inst in config$instances )
{ {
if ( inst$name in ClusterController::instances ) if ( inst$name in insts_to_add )
{ {
# Verify that it's actually the same endpoint. If not, kick out insts_to_peer[inst$name] = inst;
# the old one. XXX 0.0.0.0 is a workaround until we've resolved next;
# how agents that peer with us obtain their identity.
if ( ( inst$host != 0.0.0.0 && inst$host != ClusterController::instances[inst$name]$host ) ||
inst?$listen_port != ClusterController::instances[inst$name]?$listen_port ||
( inst?$listen_port && ClusterController::instances[inst$name]?$listen_port &&
inst$listen_port != ClusterController::instances[inst$name]$listen_port ) )
{
drop_instance(ClusterController::instances[inst$name]);
}
else
{
# We know this instance, don't drop it below.
delete insts_to_drop[inst$name];
}
} }
if ( inst$name !in ClusterController::instances ) # Focus on the keepers: check for change in identity/location.
if ( inst$name !in insts_to_keep )
next;
# XXX 0.0.0.0 is a workaround until we've resolved
# how agents that peer with us obtain their identity.
# Broker ID?
if ( ( inst$host != 0.0.0.0 && inst$host != g_instances[inst$name]$host ) ||
inst?$listen_port != g_instances[inst$name]?$listen_port ||
( inst?$listen_port && g_instances[inst$name]?$listen_port &&
inst$listen_port != g_instances[inst$name]$listen_port ) )
{ {
# It's an instance we don't currently know about. # The endpoint looks different. We drop the current one
if ( ! inst?$listen_port ) # and need to re-establish connectivity.
{ add insts_to_drop[inst$name];
# If a requested instance doesn't have a listen port and isn't known
# to us, we have no way to establish connectivity. We reject the
# request.
res = ClusterController::Types::Result($reqid=reqid, $instance=inst$name);
res$success = F;
res$error = fmt("instance %s is unknown", inst$name);
req$results += res;
}
else
{
# We'll need to connect to this instance.
insts_to_add[|insts_to_add|] = inst;
}
} }
} }
# An error at this point means that we're rejecting this request. # Process our TODO lists.
if ( |req$results| > 0 )
{
ClusterController::Log::info(fmt("tx ClusterController::API::set_configuration_response %s",
ClusterController::Request::to_string(req)));
event ClusterController::API::set_configuration_response(req$id, req$results);
ClusterController::Request::finish(req$id);
return;
}
for ( inst_name in insts_to_drop ) for ( inst_name in insts_to_drop )
drop_instance(ClusterController::instances[inst_name]); drop_instance(g_instances[inst_name]);
# We have instances to connect to, so initiate peering and stop for now. for ( inst_name in insts_to_peer )
# Processing will continue as the agents check in. That's also when they add_instance(insts_to_peer[inst_name]);
# get added to ClusterController::instances.
if ( |insts_to_add| > 0 )
{
for ( idx in insts_to_add )
{
inst = insts_to_add[idx];
Broker::peer(cat(inst$host), inst$listen_port,
ClusterController::connect_retry);
}
g_config_reqid_pending = req$id;
return;
}
# XXX validate the configuration: # XXX validate the configuration:
# - Are node instances among defined instances? # - Are node instances among defined instances?
@ -323,8 +359,15 @@ event ClusterController::API::set_configuration_request(reqid: string, config: C
# - Do node types with optional fields have required values? # - Do node types with optional fields have required values?
# ... # ...
# Response event gets sent via the agents' response event. # Track this config request globally until all of the agents required
send_config_to_agents(req, config); # for it have checked in. It gets cleared in the notify_agents_ready
# event handler.
g_config_reqid_pending = req$id;
# Special case: if the new request kept the set of instances identical,
# trigger notify_agents_ready explicitly so we transmit the new config.
if ( |insts_to_drop| == 0 && |insts_to_add| == 0 && |insts_to_keep| > 0 )
event ClusterController::API::notify_agents_ready(insts_to_keep);
} }
event ClusterController::API::get_instances_request(reqid: string) event ClusterController::API::get_instances_request(reqid: string)
@ -333,8 +376,8 @@ event ClusterController::API::get_instances_request(reqid: string)
local insts: vector of ClusterController::Types::Instance; local insts: vector of ClusterController::Types::Instance;
for ( i in ClusterController::instances ) for ( i in g_instances )
insts += ClusterController::instances[i]; insts += g_instances[i];
ClusterController::Log::info(fmt("tx ClusterController::API::get_instances_response %s", reqid)); ClusterController::Log::info(fmt("tx ClusterController::API::get_instances_response %s", reqid));
event ClusterController::API::get_instances_response(reqid, insts); event ClusterController::API::get_instances_response(reqid, insts);
@ -342,42 +385,25 @@ event ClusterController::API::get_instances_request(reqid: string)
event zeek_init() event zeek_init()
{ {
# Controller always listens -- it needs to be able to respond # The controller always listens -- it needs to be able to respond to the
# to the Zeek client. This port is also used by the agents # Zeek client. This port is also used by the agents if they connect to
# if they connect to the client. # the client. The client doesn't automatically establish or accept
# connectivity to agents: agents are defined, and communicated with,
# via configurations provided by the client.
local cni = ClusterController::network_info(); local cni = ClusterController::network_info();
Broker::listen(cat(cni$address), cni$bound_port); Broker::listen(cat(cni$address), cni$bound_port);
Broker::subscribe(ClusterAgent::topic_prefix); Broker::subscribe(ClusterAgent::topic_prefix);
Broker::subscribe(ClusterController::topic); Broker::subscribe(ClusterController::topic);
# Events sent to the client:
Broker::auto_publish(ClusterController::topic, Broker::auto_publish(ClusterController::topic,
ClusterController::API::get_instances_response); ClusterController::API::get_instances_response);
Broker::auto_publish(ClusterController::topic, Broker::auto_publish(ClusterController::topic,
ClusterController::API::set_configuration_response); ClusterController::API::set_configuration_response);
if ( |ClusterController::instances| > 0 )
{
# We peer with the agents -- otherwise, the agents peer
# with (i.e., connect to) us.
for ( i in ClusterController::instances )
{
local inst = ClusterController::instances[i];
if ( ! inst?$listen_port )
{
# XXX config error -- this must be there
next;
}
Broker::peer(cat(inst$host), inst$listen_port,
ClusterController::connect_retry);
}
}
# If ClusterController::instances is empty, agents peer with
# us and we do nothing. We'll build up state as the
# notify_agent_hello() events come in.
ClusterController::Log::info("controller is live"); ClusterController::Log::info("controller is live");
} }