Introduce cluster controller and cluster agent scripting
This is a preliminary implementation of a subset of the functionality set out in our cluster controller architecture. The controller is the central management node, existing once in any Zeek cluster. The agent is a node that runs once per instance, where an instance will commonly be a physical machine. The agent in turn manages the "data cluster", i.e. the traditional notion of a Zeek cluster with manager, worker nodes, etc.

Agent and controller live in the policy folder, and are activated when loading policy/frameworks/cluster/agent and policy/frameworks/cluster/controller, respectively. Both run in nodes forked by the supervisor. When Zeek doesn't use the supervisor, they do nothing. Otherwise, boot.zeek instructs the supervisor to create the respective node, running main.zeek. Both controller and agent have their own config.zeek with relevant knobs.

For both, controller/types.zeek provides common data types, and controller/log.zeek provides basic logging (without logger communication -- no such node might exist). A primitive request-tracking abstraction can be found in controller/request.zeek to track outstanding request events and their subsequent responses.
This commit is contained in: parent a3623bfb2d, commit c744702f94
13 changed files with 1041 additions and 0 deletions
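For orientation, here is a minimal sketch of activating these scripts; the @load paths are the ones added by this commit, while everything else is illustrative. Note that both scripts are inert unless Zeek runs under the Supervisor (e.g. started via `zeek -j`):

# On the controller machine (hypothetical local.zeek):
@load policy/frameworks/cluster/controller

# On each monitored machine (hypothetical local.zeek):
@load policy/frameworks/cluster/agent
# Optionally name the instance; otherwise "agent-<hostname>" is used:
redef ClusterAgent::name = "instance-1";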
5
scripts/policy/frameworks/cluster/agent/__load__.zeek
Normal file
@@ -0,0 +1,5 @@
# The entry point for the cluster agent. It only runs bootstrap logic for
# launching via the Supervisor. If we're not running the Supervisor, this does
# nothing.

@load ./boot
33
scripts/policy/frameworks/cluster/agent/api.zeek
Normal file
@@ -0,0 +1,33 @@
@load base/frameworks/supervisor/control
@load policy/frameworks/cluster/controller/types

module ClusterAgent::API;

export {
	const version = 1;

	# Agent API events

	global set_configuration_request: event(reqid: string,
	    config: ClusterController::Types::Configuration);
	global set_configuration_response: event(reqid: string,
	    result: ClusterController::Types::Result);

	# Notification events, agent -> controller

	# Report agent being available.
	global notify_agent_hello: event(instance: string, host: addr,
	    api_version: count);

	# Report node state changes.
	global notify_change: event(instance: string,
	    n: ClusterController::Types::Node,
	    old: ClusterController::Types::State,
	    new: ClusterController::Types::State);

	# Report operational error.
	global notify_error: event(instance: string, msg: string, node: string &default="");

	# Report informational message.
	global notify_log: event(instance: string, msg: string, node: string &default="");
}
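To illustrate the request/response flow, a hedged controller-side sketch of driving this API follows. Per the topic convention in the agent's config.zeek below, requests go to "zeek/cluster-control/agent/<name>"; the agent name "instance-1" is hypothetical:

function send_config_to_agent(config: ClusterController::Types::Configuration)
	{
	# Assumes we already peered with the agent and subscribed to
	# ClusterAgent::topic_prefix, so its auto-published response reaches us.
	local topic = ClusterAgent::topic_prefix + "/instance-1";
	Broker::publish(topic, ClusterAgent::API::set_configuration_request,
	    unique_id(""), config);
	}

event ClusterAgent::API::set_configuration_response(reqid: string,
    result: ClusterController::Types::Result)
	{
	print fmt("agent answered request %s: success=%s", reqid, result$success);
	}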
35
scripts/policy/frameworks/cluster/agent/boot.zeek
Normal file
@@ -0,0 +1,35 @@
@load ./config

# The agent needs the supervisor to listen for node management requests. We
# need to tell it to do so, and we need to do so here, in the agent
# bootstrapping code, so the redef applies prior to the fork of the agent
# process itself.
redef SupervisorControl::enable_listen = T;

event zeek_init()
	{
	if ( ! Supervisor::is_supervisor() )
		return;

	local epi = ClusterAgent::endpoint_info();
	local sn = Supervisor::NodeConfig($name=epi$id, $bare_mode=T,
	    $scripts=vector("policy/frameworks/cluster/agent/main.zeek"));

	if ( ClusterAgent::directory != "" )
		sn$directory = ClusterAgent::directory;
	if ( ClusterAgent::stdout_file_suffix != "" )
		sn$stdout_file = epi$id + "." + ClusterAgent::stdout_file_suffix;
	if ( ClusterAgent::stderr_file_suffix != "" )
		sn$stderr_file = epi$id + "." + ClusterAgent::stderr_file_suffix;

	# This helps Zeek run controller and agent with a minimal set of scripts.
	sn$env["ZEEK_CLUSTER_MGMT_NODE"] = "AGENT";

	local res = Supervisor::create(sn);

	if ( res != "" )
		{
		print(fmt("error: supervisor could not create agent node: %s", res));
		exit(1);
		}
	}
85
scripts/policy/frameworks/cluster/agent/config.zeek
Normal file
@@ -0,0 +1,85 @@
@load policy/frameworks/cluster/controller/types

module ClusterAgent;

export {
	# The name this agent uses to represent the cluster instance
	# it manages. When the environment variable isn't set and there's
	# no redef, this falls back to "agent-<hostname>".
	const name = getenv("ZEEK_AGENT_NAME") &redef;

	# Agent stdout/stderr log files to produce in Zeek's working
	# directory. If empty, no such logs will result. The actual
	# log files have the agent's name (as per above) dot-prefixed.
	const stdout_file_suffix = "agent.stdout" &redef;
	const stderr_file_suffix = "agent.stderr" &redef;

	# The address and port the agent listens on. When
	# undefined, falls back to configurable default values.
	const listen_address = getenv("ZEEK_AGENT_ADDR") &redef;
	const default_address = Broker::default_listen_address &redef;

	const listen_port = getenv("ZEEK_AGENT_PORT") &redef;
	const default_port = 2151/tcp &redef;

	# The agent communicates under the following topic prefix,
	# suffixed with "/<name>" (see above):
	const topic_prefix = "zeek/cluster-control/agent" &redef;

	# The coordinates of the controller. When defined, it means
	# agents peer with (connect to) the controller; otherwise the
	# controller knows all agents and peers with them.
	const controller: Broker::NetworkInfo = [
	    $address="0.0.0.0", $bound_port=0/unknown] &redef;

	# Agent and controller currently log only, not via the data cluster's
	# logger. (This might get added later.) For now, this means that
	# if both write to the same log file, it gets garbled. The following
	# lets you specify the working directory specifically for the agent.
	const directory = "" &redef;

	# Working directory for data cluster nodes. When relative, note
	# that this will apply from the working directory of the agent,
	# since it creates data cluster nodes.
	const cluster_directory = "" &redef;

	# The following functions return the effective network endpoint
	# information for this agent, in two related forms.
	global instance: function(): ClusterController::Types::Instance;
	global endpoint_info: function(): Broker::EndpointInfo;
}

function instance(): ClusterController::Types::Instance
	{
	local epi = endpoint_info();
	return ClusterController::Types::Instance($name=epi$id,
	    $host=to_addr(epi$network$address),
	    $listen_port=epi$network$bound_port);
	}

function endpoint_info(): Broker::EndpointInfo
	{
	local epi: Broker::EndpointInfo;
	local network: Broker::NetworkInfo;

	if ( ClusterAgent::name != "" )
		epi$id = ClusterAgent::name;
	else
		epi$id = fmt("agent-%s", gethostname());

	if ( ClusterAgent::listen_address != "" )
		network$address = ClusterAgent::listen_address;
	else if ( ClusterAgent::default_address != "" )
		network$address = ClusterAgent::default_address;
	else
		network$address = "127.0.0.1";

	if ( ClusterAgent::listen_port != "" )
		network$bound_port = to_port(ClusterAgent::listen_port);
	else
		network$bound_port = ClusterAgent::default_port;

	epi$network = network;

	return epi;
	}
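A hedged example of adjusting these knobs from a site script (all values illustrative); the same effect is possible via the ZEEK_AGENT_* environment variables:

redef ClusterAgent::name = "agent-sensor-01";
redef ClusterAgent::default_port = 3151/tcp;
redef ClusterAgent::directory = "/var/spool/zeek/agent";
# Have the agent connect out to a controller instead of listening:
redef ClusterAgent::controller = [$address="192.0.2.10", $bound_port=2150/tcp];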
223
scripts/policy/frameworks/cluster/agent/main.zeek
Normal file
@@ -0,0 +1,223 @@
@load base/frameworks/broker

@load policy/frameworks/cluster/controller/config
@load policy/frameworks/cluster/controller/log
@load policy/frameworks/cluster/controller/request

@load ./api

redef ClusterController::role = ClusterController::Types::AGENT;

# The global configuration as passed to us by the controller
global global_config: ClusterController::Types::Configuration;

# A map to make other instance info accessible
global instances: table[string] of ClusterController::Types::Instance;

# A map for the nodes we run on this instance, via this agent.
global nodes: table[string] of ClusterController::Types::Node;

# The node map employed by the supervisor to describe the cluster
# topology to newly forked nodes. We refresh it when we receive
# new configurations.
global data_cluster: table[string] of Supervisor::ClusterEndpoint;

event SupervisorControl::create_response(reqid: string, result: string)
	{
	local req = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(req) )
		return;

	local name = req$supervisor_state$node;

	if ( |result| > 0 )
		{
		local msg = fmt("failed to create node %s: %s", name, result);
		ClusterController::Log::error(msg);
		event ClusterAgent::API::notify_error(ClusterAgent::name, msg, name);
		}

	ClusterController::Request::finish(reqid);
	}

event SupervisorControl::destroy_response(reqid: string, result: bool)
	{
	local req = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(req) )
		return;

	local name = req$supervisor_state$node;

	if ( ! result )
		{
		local msg = fmt("failed to destroy node %s, %s", name, reqid);
		ClusterController::Log::error(msg);
		event ClusterAgent::API::notify_error(ClusterAgent::name, msg, name);
		}

	ClusterController::Request::finish(reqid);
	}

function supervisor_create(nc: Supervisor::NodeConfig)
	{
	local req = ClusterController::Request::create();
	req$supervisor_state = ClusterController::Request::SupervisorState($node = nc$name);
	event SupervisorControl::create_request(req$id, nc);
	ClusterController::Log::info(fmt("issued supervisor create for %s, %s", nc$name, req$id));
	}

function supervisor_destroy(node: string)
	{
	local req = ClusterController::Request::create();
	req$supervisor_state = ClusterController::Request::SupervisorState($node = node);
	event SupervisorControl::destroy_request(req$id, node);
	ClusterController::Log::info(fmt("issued supervisor destroy for %s, %s", node, req$id));
	}

event ClusterAgent::API::set_configuration_request(reqid: string, config: ClusterController::Types::Configuration)
	{
	ClusterController::Log::info(fmt("rx ClusterAgent::API::set_configuration_request %s", reqid));

	local nodename: string;
	local node: ClusterController::Types::Node;
	local nc: Supervisor::NodeConfig;
	local msg: string;

	# Adopt the global configuration provided.
	# XXX this can later handle validation and persistence
	# XXX should do this transactionally, only set when all else worked
	global_config = config;

	# Refresh the instances table:
	instances = table();
	for ( inst in config$instances )
		instances[inst$name] = inst;

	# Terminate existing nodes
	for ( nodename in nodes )
		supervisor_destroy(nodename);

	nodes = table();

	# Refresh the data cluster and nodes tables

	data_cluster = table();
	for ( node in config$nodes )
		{
		if ( node$instance == ClusterAgent::name )
			nodes[node$name] = node;

		local cep = Supervisor::ClusterEndpoint(
		    $role = node$role,
		    $host = instances[node$instance]$host,
		    $p = node$p);

		if ( node?$interface )
			cep$interface = node$interface;

		data_cluster[node$name] = cep;
		}

	# Apply the new configuration via the supervisor

	for ( nodename in nodes )
		{
		node = nodes[nodename];
		nc = Supervisor::NodeConfig($name=nodename);

		if ( ClusterAgent::cluster_directory != "" )
			nc$directory = ClusterAgent::cluster_directory;

		if ( node?$interface )
			nc$interface = node$interface;
		if ( node?$cpu_affinity )
			nc$cpu_affinity = node$cpu_affinity;
		if ( node?$scripts )
			nc$scripts = node$scripts;
		if ( node?$env )
			nc$env = node$env;

		# XXX could use options to enable per-node overrides for
		# directory, stdout, stderr, others?

		nc$cluster = data_cluster;
		supervisor_create(nc);
		}

	# XXX this currently does not fail if any of the above problems occurred,
	# mainly due to the tediousness of handling the supervisor's response
	# events asynchronously. The only indication of error will be
	# notification events to the controller.

	local res = ClusterController::Types::Result(
	    $reqid = reqid,
	    $instance = ClusterAgent::name);

	ClusterController::Log::info(fmt("tx ClusterAgent::API::set_configuration_response %s", reqid));
	event ClusterAgent::API::set_configuration_response(reqid, res);
	}

event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
	{
	# This does not (cannot?) immediately verify that the new peer
	# is in fact a controller, so we might send this redundantly.
	# Controllers handle the hello event accordingly.

	local epi = ClusterAgent::endpoint_info();
	# XXX deal with unexpected peers, unless we're okay with it
	event ClusterAgent::API::notify_agent_hello(epi$id,
	    to_addr(epi$network$address), ClusterAgent::API::version);
	}

event zeek_init()
	{
	local epi = ClusterAgent::endpoint_info();
	local agent_topic = ClusterAgent::topic_prefix + "/" + epi$id;

	# The agent needs to peer with the supervisor -- this doesn't currently
	# happen automatically. The address defaults to Broker's default, which
	# relies on ZEEK_DEFAULT_LISTEN_ADDR and so might just be "". Broker
	# internally falls back to listening on any; we pick 127.0.0.1.
	local supervisor_addr = Broker::default_listen_address;
	if ( supervisor_addr == "" )
		supervisor_addr = "127.0.0.1";

	Broker::peer(supervisor_addr, Broker::default_port, Broker::default_listen_retry);

	# Agents need to receive communication targeted at them, and any
	# responses from the supervisor.
	Broker::subscribe(agent_topic);
	Broker::subscribe(SupervisorControl::topic_prefix);

	# Auto-publish a bunch of events. Glob patterns or module-level
	# auto-publish would be helpful here.
	Broker::auto_publish(agent_topic, ClusterAgent::API::set_configuration_response);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_agent_hello);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_change);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_error);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_log);

	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::create_request);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::create_response);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_request);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_response);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::restart_request);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::restart_response);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::stop_request);

	# Establish connectivity with the controller.
	if ( ClusterAgent::controller$address != "0.0.0.0" )
		{
		# We connect to the controller.
		Broker::peer(ClusterAgent::controller$address,
		    ClusterAgent::controller$bound_port,
		    ClusterController::connect_retry);
		}
	else
		{
		# Controller connects to us; listen for it.
		Broker::listen(cat(epi$network$address), epi$network$bound_port);
		}

	ClusterController::Log::info("agent is live");
	}
5
scripts/policy/frameworks/cluster/controller/__load__.zeek
Normal file
@@ -0,0 +1,5 @@
# The entry point for the cluster controller. It only runs bootstrap logic for
# launching via the Supervisor. If we're not running the Supervisor, this does
# nothing.

@load ./boot
16
scripts/policy/frameworks/cluster/controller/api.zeek
Normal file
@@ -0,0 +1,16 @@
@load ./types

module ClusterController::API;

export {
	const version = 1;

	global get_instances_request: event(reqid: string);
	global get_instances_response: event(reqid: string,
	    instances: vector of ClusterController::Types::Instance);

	global set_configuration_request: event(reqid: string,
	    config: ClusterController::Types::Configuration);
	global set_configuration_response: event(reqid: string,
	    result: ClusterController::Types::ResultVec);
}
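As a hedged sketch of the client side of this API (the controller address is illustrative), a management client subscribes to ClusterController::topic -- where the controller auto-publishes its responses -- and fires requests at it:

event zeek_init()
	{
	Broker::subscribe(ClusterController::topic);
	Broker::peer("192.0.2.10", 2150/tcp);
	}

event Broker::peer_added(ep: Broker::EndpointInfo, msg: string)
	{
	Broker::publish(ClusterController::topic,
	    ClusterController::API::get_instances_request, unique_id(""));
	}

event ClusterController::API::get_instances_response(reqid: string,
    instances: vector of ClusterController::Types::Instance)
	{
	for ( i in instances )
		print fmt("instance: %s @ %s", instances[i]$name, instances[i]$host);
	}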
29
scripts/policy/frameworks/cluster/controller/boot.zeek
Normal file
@@ -0,0 +1,29 @@
@load ./config

event zeek_init()
	{
	if ( ! Supervisor::is_supervisor() )
		return;

	local epi = ClusterController::endpoint_info();
	local sn = Supervisor::NodeConfig($name=epi$id, $bare_mode=T,
	    $scripts=vector("policy/frameworks/cluster/controller/main.zeek"));

	if ( ClusterController::directory != "" )
		sn$directory = ClusterController::directory;
	if ( ClusterController::stdout_file != "" )
		sn$stdout_file = ClusterController::stdout_file;
	if ( ClusterController::stderr_file != "" )
		sn$stderr_file = ClusterController::stderr_file;

	# This helps Zeek run controller and agent with a minimal set of scripts.
	sn$env["ZEEK_CLUSTER_MGMT_NODE"] = "CONTROLLER";

	local res = Supervisor::create(sn);

	if ( res != "" )
		{
		print(fmt("error: supervisor could not create controller node: %s", res));
		exit(1);
		}
	}
85
scripts/policy/frameworks/cluster/controller/config.zeek
Normal file
@@ -0,0 +1,85 @@
@load policy/frameworks/cluster/agent/config

module ClusterController;

export {
	# The name of this controller in the cluster.
	# Without the environment variable and no redef, this
	# falls back to "controller-<hostname>".
	const name = getenv("ZEEK_CONTROLLER_NAME") &redef;

	# Controller stdout/stderr log files to produce in Zeek's
	# working directory. If empty, no such logs will result.
	const stdout_file = "controller.stdout" &redef;
	const stderr_file = "controller.stderr" &redef;

	# The address and port the controller listens on. When
	# undefined, falls back to the default_address, which you can
	# likewise customize.
	const listen_address = getenv("ZEEK_CONTROLLER_ADDR") &redef;
	const default_address = Broker::default_listen_address &redef;

	const listen_port = getenv("ZEEK_CONTROLLER_PORT") &redef;
	const default_port = 2150/tcp &redef;

	# A more aggressive default retry interval (vs default 30s)
	const connect_retry = 1sec &redef;

	# The controller listens for messages on this topic:
	const topic = "zeek/cluster-control/controller" &redef;

	# The set of agents to interact with. When this is non-empty
	# at startup, the controller contacts the agents; when it is
	# empty, it waits for agents to connect. The key is the name of
	# each instance. This should match the $name member of the
	# instance records.
	const instances: table[string] of ClusterController::Types::Instance = { } &redef;

	# The role of this node in cluster management. Agent and
	# controller both redef this. Used during logging.
	const role = ClusterController::Types::NONE &redef;

	# Agent and controller currently log only, not via the data cluster's
	# logger. (This might get added later.) For now, this means that
	# if both write to the same log file, it gets garbled. The following
	# lets you specify the working directory specifically for the controller.
	const directory = "" &redef;

	# The following functions return the effective network endpoint
	# information for this controller, in two related forms.
	global network_info: function(): Broker::NetworkInfo;
	global endpoint_info: function(): Broker::EndpointInfo;
}

function network_info(): Broker::NetworkInfo
	{
	local ni: Broker::NetworkInfo;

	if ( ClusterController::listen_address != "" )
		ni$address = ClusterController::listen_address;
	else if ( ClusterController::default_address != "" )
		ni$address = ClusterController::default_address;
	else
		ni$address = "127.0.0.1";

	if ( ClusterController::listen_port != "" )
		ni$bound_port = to_port(ClusterController::listen_port);
	else
		ni$bound_port = ClusterController::default_port;

	return ni;
	}

function endpoint_info(): Broker::EndpointInfo
	{
	local epi: Broker::EndpointInfo;

	if ( ClusterController::name != "" )
		epi$id = ClusterController::name;
	else
		epi$id = fmt("controller-%s", gethostname());

	epi$network = network_info();

	return epi;
	}
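For example, a controller that should connect out to two known agents could be configured as follows (names and addresses illustrative). Note that $listen_port is required here, since the controller initiates the peering:

redef ClusterController::instances = {
	["instance-1"] = ClusterController::Types::Instance($name="instance-1",
	    $host=192.0.2.11, $listen_port=2151/tcp),
	["instance-2"] = ClusterController::Types::Instance($name="instance-2",
	    $host=192.0.2.12, $listen_port=2151/tcp),
};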
109
scripts/policy/frameworks/cluster/controller/log.zeek
Normal file
@@ -0,0 +1,109 @@
@load ./config

module ClusterController::Log;

export {
	## The cluster logging stream identifier.
	redef enum Log::ID += { LOG };

	## A default logging policy hook for the stream.
	global log_policy: Log::PolicyHook;

	type Level: enum {
		DEBUG,
		INFO,
		WARNING,
		ERROR,
	};

	## The record type which contains the column fields of the cluster log.
	type Info: record {
		## The time at which a cluster message was generated.
		ts: time;
		## The name of the node that is creating the log record.
		node: string;
		## Log level of this message, converted from the above Level enum.
		level: string;
		## The role of the node, translated from ClusterController::Types::Role.
		role: string;
		## A message indicating information about cluster controller operation.
		message: string;
	} &log;

	global log_level = DEBUG &redef;

	global info: function(message: string);
	global warning: function(message: string);
	global error: function(message: string);
}

# Enum translations to strings. This avoids those enums being reported
# with full qualifications in the logs, which is too verbose.

global l2s: table[Level] of string = {
	[DEBUG] = "DEBUG",
	[INFO] = "INFO",
	[WARNING] = "WARNING",
	[ERROR] = "ERROR",
};

global r2s: table[ClusterController::Types::Role] of string = {
	[ClusterController::Types::AGENT] = "AGENT",
	[ClusterController::Types::CONTROLLER] = "CONTROLLER",
};

function debug(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(DEBUG) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[DEBUG],
	    $role=r2s[ClusterController::role], $message=message]);
	}

function info(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(INFO) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[INFO],
	    $role=r2s[ClusterController::role], $message=message]);
	}

function warning(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(WARNING) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[WARNING],
	    $role=r2s[ClusterController::role], $message=message]);
	}

function error(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(ERROR) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[ERROR],
	    $role=r2s[ClusterController::role], $message=message]);
	}

event zeek_init()
	{
	if ( ! Supervisor::is_supervised() )
		return;

	local node = Supervisor::node();

	# Defining the stream outside of the stream creation call sidesteps
	# the coverage.find-bro-logs test, which tries to inventory all logs.
	# This log isn't yet ready for that level of scrutiny.
	local stream = Log::Stream($columns=Info, $path=fmt("cluster-%s", node$name),
	    $policy=log_policy);

	Log::create_stream(ClusterController::Log::LOG, stream);
	}
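Usage is a one-liner from anywhere in agent or controller code; since the level check compares enum ordering, a redef of log_level filters lower-priority messages. A small hedged example:

# Hypothetical site tweak: drop DEBUG messages.
redef ClusterController::Log::log_level = ClusterController::Log::INFO;

event zeek_init()
	{
	# Writes an INFO entry to cluster-<nodename>.log in a supervised node:
	ClusterController::Log::info(fmt("checked in at %s", network_time()));
	}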
250
scripts/policy/frameworks/cluster/controller/main.zeek
Normal file
@@ -0,0 +1,250 @@
@load base/frameworks/broker

@load policy/frameworks/cluster/agent/config
@load policy/frameworks/cluster/agent/api

@load ./api
@load ./log
@load ./request

redef ClusterController::role = ClusterController::Types::CONTROLLER;

event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
	{
	# See if we already know about this agent; if not, register it.
	#
	# XXX protection against rogue agents?

	if ( instance in ClusterController::instances )
		{
		# Do nothing, unless this known agent checks in with a mismatching
		# API version, in which case we kick it out.
		if ( api_version != ClusterController::API::version )
			{
			local inst = ClusterController::instances[instance];
			if ( inst?$listen_port )
				{
				# We peered with this instance, unpeer.
				Broker::unpeer(cat(inst$host), inst$listen_port);
				# XXX what to do if they connected to us?
				}
			delete ClusterController::instances[instance];
			}

		# Update the instance name in the pointed-to record, in case it
		# was previously named otherwise. Not being too picky here allows
		# the user some leeway in spelling out the original config.
		ClusterController::instances[instance]$name = instance;

		return;
		}

	if ( api_version != ClusterController::API::version )
		{
		ClusterController::Log::warning(
		    fmt("agent %s/%s speaks incompatible agent protocol (%s, need %s), unpeering",
		        instance, host, api_version, ClusterController::API::version));
		}

	ClusterController::instances[instance] = ClusterController::Types::Instance($name=instance, $host=host);
	ClusterController::Log::info(fmt("instance %s/%s has checked in", instance, host));
	}

event ClusterAgent::API::notify_change(instance: string, n: ClusterController::Types::Node,
    old: ClusterController::Types::State,
    new: ClusterController::Types::State)
	{
	# XXX TODO
	}

event ClusterAgent::API::notify_error(instance: string, msg: string, node: string)
	{
	# XXX TODO
	}

event ClusterAgent::API::notify_log(instance: string, msg: string, node: string)
	{
	# XXX TODO
	}

event ClusterAgent::API::set_configuration_response(reqid: string, result: ClusterController::Types::Result)
	{
	ClusterController::Log::info(fmt("rx ClusterAgent::API::set_configuration_response %s", reqid));

	# Retrieve state for the request we just got a response to
	local areq = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(areq) )
		return;

	# Record the result and mark the request as done. This also
	# marks the request as done in the parent-level request, since
	# these records are stored by reference.
	areq$results[0] = result; # We only have a single result here atm
	areq$finished = T;

	# Update the original request from the client:
	local req = ClusterController::Request::lookup(areq$parent_id);
	if ( ClusterController::Request::is_null(req) )
		return;

	# If there are any requests to the agents still unfinished,
	# we're not done yet.
	for ( i in req$set_configuration_state$requests )
		if ( ! req$set_configuration_state$requests[i]$finished )
			return;

	# All set_configuration requests to instances are done, so respond
	# back to client. We need to compose the result, aggregating
	# the results we got from the requests to the agents. In the
	# end we have one Result per instance requested in the
	# original set_configuration_request.
	#
	# XXX we can likely generalize result aggregation in the request module.
	for ( i in req$set_configuration_state$requests )
		{
		local r = req$set_configuration_state$requests[i];

		local success = T;
		local errors: string_vec;
		local instance = "";

		for ( j in r$results )
			{
			local res = r$results[j];
			instance = res$instance;

			if ( res$success )
				next;

			success = F;
			errors += fmt("node %s failed: %s", res$node, res$error);
			}

		req$results += ClusterController::Types::Result(
		    $reqid = req$id,
		    $instance = instance,
		    $success = success,
		    $error = join_string_vec(errors, ", ")
		);

		ClusterController::Request::finish(r$id);
		}

	ClusterController::Log::info(fmt("tx ClusterController::API::set_configuration_response %s", req$id));
	event ClusterController::API::set_configuration_response(req$id, req$results);
	ClusterController::Request::finish(req$id);
	}

event ClusterController::API::set_configuration_request(reqid: string, config: ClusterController::Types::Configuration)
	{
	ClusterController::Log::info(fmt("rx ClusterController::API::set_configuration_request %s", reqid));

	local req = ClusterController::Request::create(reqid);
	req$set_configuration_state = ClusterController::Request::SetConfigurationState();

	# Compare new configuration to the current one and send updates
	# to the instances as needed.
	if ( config?$instances )
		{
		# XXX properly handle instance update: connect to new instances provided
		# when they are listening, accept connections from new instances that are
		# not
		for ( inst in config$instances )
			{
			if ( inst$name !in ClusterController::instances )
				{
				local res = ClusterController::Types::Result($reqid=reqid, $instance=inst$name);
				res$error = fmt("instance %s is unknown, skipping", inst$name);
				req$results += res;
				}
			}
		}

	# XXX validate the configuration:
	# - Are node instances among defined instances?
	# - Are all names unique?
	# - Are any node options understood?
	# - Do node types with optional fields have required values?
	# ...

	# Transmit the configuration on to the agents. They need to be aware of
	# each other's location and nodes, so the data cluster nodes can connect
	# (for example, so a worker on instance 1 can connect to a logger on
	# instance 2).
	for ( name in ClusterController::instances )
		{
		local agent_topic = ClusterAgent::topic_prefix + "/" + name;
		local areq = ClusterController::Request::create();
		areq$parent_id = reqid;

		# We track the requests sent off to each agent. As the
		# responses come in, we can check them off as completed,
		# and once all are, we respond back to the client.
		req$set_configuration_state$requests += areq;

		# XXX could also broadcast just once on the agent prefix, but
		# explicit request/response pairs for each agent seems cleaner.
		ClusterController::Log::info(fmt("tx ClusterAgent::API::set_configuration_request %s to %s",
		    areq$id, name));
		Broker::publish(agent_topic, ClusterAgent::API::set_configuration_request, areq$id, config);
		}

	# Response event gets sent via the agents' response event.
	}

event ClusterController::API::get_instances_request(reqid: string)
	{
	ClusterController::Log::info(fmt("rx ClusterController::API::get_instances_request %s", reqid));

	local insts: vector of ClusterController::Types::Instance;

	for ( i in ClusterController::instances )
		insts += ClusterController::instances[i];

	ClusterController::Log::info(fmt("tx ClusterController::API::get_instances_response %s", reqid));
	event ClusterController::API::get_instances_response(reqid, insts);
	}

event zeek_init()
	{
	# Controller always listens -- it needs to be able to respond
	# to the Zeek client. This port is also used by the agents
	# if they connect to the controller.
	local cni = ClusterController::network_info();
	Broker::listen(cat(cni$address), cni$bound_port);

	Broker::subscribe(ClusterAgent::topic_prefix);
	Broker::subscribe(ClusterController::topic);

	Broker::auto_publish(ClusterController::topic,
	    ClusterController::API::get_instances_response);
	Broker::auto_publish(ClusterController::topic,
	    ClusterController::API::set_configuration_response);

	if ( |ClusterController::instances| > 0 )
		{
		# We peer with the agents -- otherwise, the agents peer
		# with (i.e., connect to) us.
		for ( i in ClusterController::instances )
			{
			local inst = ClusterController::instances[i];

			if ( ! inst?$listen_port )
				{
				# XXX config error -- this must be there
				next;
				}

			Broker::peer(cat(inst$host), inst$listen_port,
			    ClusterController::connect_retry);
			}
		}

	# If ClusterController::instances is empty, agents peer with
	# us and we do nothing. We'll build up state as the
	# notify_agent_hello() events come in.

	ClusterController::Log::info("controller is live");
	}
86
scripts/policy/frameworks/cluster/controller/request.zeek
Normal file
@@ -0,0 +1,86 @@
@load ./types

module ClusterController::Request;

export {
	type Request: record {
		id: string;
		parent_id: string &optional;
	};

	# API-specific state. XXX we may be able to generalize after this
	# has settled a bit more.

	# State specific to the set_configuration request/response events
	type SetConfigurationState: record {
		requests: vector of Request &default=vector();
	};

	# State specific to the set_nodes request/response events
	type SetNodesState: record {
		requests: vector of Request &default=vector();
	};

	# State specific to supervisor interactions
	type SupervisorState: record {
		node: string;
	};

	# The redef is a workaround so we can use the Request type
	# while it is still being defined
	redef record Request += {
		results: ClusterController::Types::ResultVec &default=vector();
		finished: bool &default=F;

		set_configuration_state: SetConfigurationState &optional;
		set_nodes_state: SetNodesState &optional;
		supervisor_state: SupervisorState &optional;
	};

	global null_req = Request($id="", $finished=T);

	global create: function(reqid: string &default=unique_id("")): Request;
	global lookup: function(reqid: string): Request;
	global finish: function(reqid: string): bool;

	global is_null: function(request: Request): bool;
}

# XXX this needs a mechanism for expiring stale requests
global requests: table[string] of Request;

function create(reqid: string): Request
	{
	local ret = Request($id=reqid);
	requests[reqid] = ret;
	return ret;
	}

function lookup(reqid: string): Request
	{
	if ( reqid in requests )
		return requests[reqid];

	return null_req;
	}

function finish(reqid: string): bool
	{
	if ( reqid !in requests )
		return F;

	local req = requests[reqid];
	delete requests[reqid];

	req$finished = T;

	return T;
	}

function is_null(request: Request): bool
	{
	if ( request$id == "" )
		return T;

	return F;
	}
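The intended call pattern, mirroring its use in agent/main.zeek, pairs create() on the request side with lookup()/finish() in the response handler. A hedged sketch, assuming the SupervisorControl status request/response events from base/frameworks/supervisor/control:

function issue_status_request(node: string)
	{
	local req = ClusterController::Request::create();
	req$supervisor_state = ClusterController::Request::SupervisorState($node=node);
	event SupervisorControl::status_request(req$id, node);
	}

event SupervisorControl::status_response(reqid: string, result: Supervisor::Status)
	{
	local req = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(req) )
		return; # Not a request we track.

	print fmt("got status for node %s", req$supervisor_state$node);
	ClusterController::Request::finish(reqid);
	}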
80
scripts/policy/frameworks/cluster/controller/types.zeek
Normal file
@@ -0,0 +1,80 @@
# Types for the Cluster Controller framework. These are used by both agent and controller.

module ClusterController::Types;

export {
	## Management infrastructure node type. This intentionally does not
	## include the data cluster node types (worker, logger, etc) -- those
	## continue to be managed by the cluster framework.
	type Role: enum {
		NONE,
		AGENT,
		CONTROLLER,
	};

	## A Zeek-side option with value.
	type Option: record {
		name: string;	# Name of option
		value: string;	# Value of option
	};

	## Configuration describing a Zeek instance running a Cluster
	## Agent. Normally, there'll be one instance per cluster
	## system: a single physical system.
	type Instance: record {
		# Unique, human-readable instance name
		name: string;
		# IP address of system
		host: addr;
		# Agent listening port. Not needed if agents connect to controller.
		listen_port: port &optional;
	};

	## State that a Cluster Node can be in. State changes trigger an
	## API notification (see notify_change()).
	type State: enum {
		Running,	# Running and operating normally
		Stopped,	# Explicitly stopped
		Failed,		# Failed to start and is permanently halted
		Crashed,	# Crashed; will be restarted
		Unknown,	# State not known currently (e.g., because of lost connectivity)
	};

	## Configuration describing a Cluster Node process.
	type Node: record {
		name: string;	# Cluster-unique, human-readable node name
		instance: string;	# Name of instance where node is to run
		p: port;	# Port on which this node will listen
		role: Supervisor::ClusterRole;	# Role of the node.
		state: State;	# Desired, or current, run state.
		scripts: vector of string &optional;	# Additional Zeek scripts for node
		options: set[Option] &optional;	# Zeek options for node
		interface: string &optional;	# Interface to sniff
		cpu_affinity: int &optional;	# CPU/core number to pin to
		env: table[string] of string &default=table();	# Custom environment vars
	};

	# Data structure capturing a cluster's complete configuration.
	type Configuration: record {
		id: string &default=unique_id("");	# Unique identifier for a particular configuration

		## The instances in the cluster.
		## XXX we may be able to make this optional
		instances: set[Instance];

		## The set of nodes in the cluster, as distributed over the instances.
		nodes: set[Node];
	};

	# Return value for request-response API event pairs
	type Result: record {
		reqid: string;	# Request ID of operation this result refers to
		instance: string;	# Name of associated instance (for context)
		success: bool &default=T;	# True if successful
		data: any &optional;	# Addl data returned for successful operation
		error: string &default="";	# Descriptive error on failure
		node: string &optional;	# Name of associated node (for context)
	};

	type ResultVec: vector of Result;
}
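To make the record shapes concrete, a minimal single-instance configuration might be assembled like this (all names, addresses, and ports illustrative):

event zeek_init()
	{
	local config = ClusterController::Types::Configuration(
	    $instances=set(
	        ClusterController::Types::Instance($name="instance-1", $host=192.0.2.11)),
	    $nodes=set(
	        ClusterController::Types::Node($name="manager", $instance="instance-1",
	            $p=5000/tcp, $role=Supervisor::MANAGER,
	            $state=ClusterController::Types::Running),
	        ClusterController::Types::Node($name="worker-01", $instance="instance-1",
	            $p=5001/tcp, $role=Supervisor::WORKER,
	            $state=ClusterController::Types::Running, $interface="eth0")));
	# This value could now be sent via ClusterController::API::set_configuration_request.
	}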