mirror of https://github.com/zeek/zeek.git (synced 2025-10-02 14:48:21 +00:00)
Merge branch 'topic/christian/cluster-controller'
* topic/christian/cluster-controller:
  Add a cluster controller testcase for agent-controller checkin
  Add zeek-client via new submodule
  Update baselines affected by cluster controller changes
  Introduce cluster controller and cluster agent scripting
  Establish a separate init script when using the supervisor
  Add optional bare-mode boolean flag to Supervisor's node configuration
  Add support for making the supervisor listen for requests
  Add support for setting environment variables via supervisor
commit 8db985ea78

41 changed files with 1483 additions and 7 deletions

5  scripts/policy/frameworks/cluster/agent/__load__.zeek  Normal file
@@ -0,0 +1,5 @@
# The entry point for the cluster agent. It only runs bootstrap logic for
# launching via the Supervisor. If we're not running the Supervisor, this does
# nothing.

@load ./boot

33  scripts/policy/frameworks/cluster/agent/api.zeek  Normal file
@@ -0,0 +1,33 @@
@load base/frameworks/supervisor/control
@load policy/frameworks/cluster/controller/types

module ClusterAgent::API;

export {
	const version = 1;

	# Agent API events

	global set_configuration_request: event(reqid: string,
	    config: ClusterController::Types::Configuration);
	global set_configuration_response: event(reqid: string,
	    result: ClusterController::Types::Result);

	# Notification events, agent -> controller

	# Report agent being available.
	global notify_agent_hello: event(instance: string, host: addr,
	    api_version: count);

	# Report node state changes.
	global notify_change: event(instance: string,
	    n: ClusterController::Types::Node,
	    old: ClusterController::Types::State,
	    new: ClusterController::Types::State);

	# Report operational error.
	global notify_error: event(instance: string, msg: string, node: string &default="");

	# Report informational message.
	global notify_log: event(instance: string, msg: string, node: string &default="");
}
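
Usage sketch (illustrative, not part of this commit): a minimal controller-side handler consuming the hello notification declared above. The print is a stand-in for real bookkeeping, such as the instance-table updates in controller/main.zeek.

@load policy/frameworks/cluster/agent/api

event ClusterAgent::API::notify_agent_hello(instance: string, host: addr,
    api_version: count)
	{
	# Illustrative only: record or act on the agent check-in.
	print fmt("agent %s checked in from %s (API v%d)", instance, host, api_version);
	}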

35  scripts/policy/frameworks/cluster/agent/boot.zeek  Normal file
@@ -0,0 +1,35 @@
@load ./config

# The agent needs the supervisor to listen for node management requests. We
# need to tell it to do so, and we need to do so here, in the agent
# bootstrapping code, so the redef applies prior to the fork of the agent
# process itself.
redef SupervisorControl::enable_listen = T;

event zeek_init()
	{
	if ( ! Supervisor::is_supervisor() )
		return;

	local epi = ClusterAgent::endpoint_info();
	local sn = Supervisor::NodeConfig($name=epi$id, $bare_mode=T,
	    $scripts=vector("policy/frameworks/cluster/agent/main.zeek"));

	if ( ClusterAgent::directory != "" )
		sn$directory = ClusterAgent::directory;
	if ( ClusterAgent::stdout_file_suffix != "" )
		sn$stdout_file = epi$id + "." + ClusterAgent::stdout_file_suffix;
	if ( ClusterAgent::stderr_file_suffix != "" )
		sn$stderr_file = epi$id + "." + ClusterAgent::stderr_file_suffix;

	# This helps Zeek run controller and agent with a minimal set of scripts.
	sn$env["ZEEK_CLUSTER_MGMT_NODE"] = "AGENT";

	local res = Supervisor::create(sn);

	if ( res != "" )
		{
		print(fmt("error: supervisor could not create agent node: %s", res));
		exit(1);
		}
	}
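
Sketch of the environment-variable mechanism above (hypothetical script loaded into a forked node; the print is illustrative):

event zeek_init()
	{
	# ZEEK_CLUSTER_MGMT_NODE is set by the bootstrap code before the fork.
	if ( getenv("ZEEK_CLUSTER_MGMT_NODE") == "AGENT" )
		print "running under the supervisor as a cluster agent";
	}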

85  scripts/policy/frameworks/cluster/agent/config.zeek  Normal file
@@ -0,0 +1,85 @@
@load policy/frameworks/cluster/controller/types

module ClusterAgent;

export {
	# The name this agent uses to represent the cluster instance
	# it manages. When the environment variable isn't set and there's
	# no redef, this falls back to "agent-<hostname>".
	const name = getenv("ZEEK_AGENT_NAME") &redef;

	# Agent stdout/stderr log files to produce in Zeek's working
	# directory. If empty, no such logs will result. The actual
	# log files have the agent's name (as per above) dot-prefixed.
	const stdout_file_suffix = "agent.stdout" &redef;
	const stderr_file_suffix = "agent.stderr" &redef;

	# The address and port the agent listens on. When
	# undefined, falls back to configurable default values.
	const listen_address = getenv("ZEEK_AGENT_ADDR") &redef;
	const default_address = Broker::default_listen_address &redef;

	const listen_port = getenv("ZEEK_AGENT_PORT") &redef;
	const default_port = 2151/tcp &redef;

	# The agent communicates under the following topic prefix,
	# suffixed with "/<name>" (see above):
	const topic_prefix = "zeek/cluster-control/agent" &redef;

	# The coordinates of the controller. When defined, it means
	# agents peer with (connect to) the controller; otherwise the
	# controller knows all agents and peers with them.
	const controller: Broker::NetworkInfo = [
	    $address="0.0.0.0", $bound_port=0/unknown] &redef;

	# Agent and controller currently log only, not via the data cluster's
	# logger. (This might get added later.) For now, this means that
	# if both write to the same log file, it gets garbled. The following
	# lets you specify the working directory specifically for the agent.
	const directory = "" &redef;

	# Working directory for data cluster nodes. When relative, note
	# that this will apply from the working directory of the agent,
	# since it creates data cluster nodes.
	const cluster_directory = "" &redef;

	# The following functions return the effective network endpoint
	# information for this agent, in two related forms.
	global instance: function(): ClusterController::Types::Instance;
	global endpoint_info: function(): Broker::EndpointInfo;
}

function instance(): ClusterController::Types::Instance
	{
	local epi = endpoint_info();
	return ClusterController::Types::Instance($name=epi$id,
	    $host=to_addr(epi$network$address),
	    $listen_port=epi$network$bound_port);
	}

function endpoint_info(): Broker::EndpointInfo
	{
	local epi: Broker::EndpointInfo;
	local network: Broker::NetworkInfo;

	if ( ClusterAgent::name != "" )
		epi$id = ClusterAgent::name;
	else
		epi$id = fmt("agent-%s", gethostname());

	if ( ClusterAgent::listen_address != "" )
		network$address = ClusterAgent::listen_address;
	else if ( ClusterAgent::default_address != "" )
		network$address = ClusterAgent::default_address;
	else
		network$address = "127.0.0.1";

	if ( ClusterAgent::listen_port != "" )
		network$bound_port = to_port(ClusterAgent::listen_port);
	else
		network$bound_port = ClusterAgent::default_port;

	epi$network = network;

	return epi;
	}
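
Usage sketch (hypothetical local.zeek; the name, address, and port are invented): redefining these options so the agent connects out to a controller instead of listening for one.

@load policy/frameworks/cluster/agent

redef ClusterAgent::name = "instance-1";
redef ClusterAgent::controller = [$address="192.0.2.10", $bound_port=2150/tcp];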

223  scripts/policy/frameworks/cluster/agent/main.zeek  Normal file
@@ -0,0 +1,223 @@
@load base/frameworks/broker

@load policy/frameworks/cluster/controller/config
@load policy/frameworks/cluster/controller/log
@load policy/frameworks/cluster/controller/request

@load ./api

redef ClusterController::role = ClusterController::Types::AGENT;

# The global configuration as passed to us by the controller
global global_config: ClusterController::Types::Configuration;

# A map to make other instance info accessible
global instances: table[string] of ClusterController::Types::Instance;

# A map for the nodes we run on this instance, via this agent.
global nodes: table[string] of ClusterController::Types::Node;

# The node map employed by the supervisor to describe the cluster
# topology to newly forked nodes. We refresh it when we receive
# new configurations.
global data_cluster: table[string] of Supervisor::ClusterEndpoint;

event SupervisorControl::create_response(reqid: string, result: string)
	{
	local req = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(req) )
		return;

	local name = req$supervisor_state$node;

	if ( |result| > 0 )
		{
		local msg = fmt("failed to create node %s: %s", name, result);
		ClusterController::Log::error(msg);
		event ClusterAgent::API::notify_error(ClusterAgent::name, msg, name);
		}

	ClusterController::Request::finish(reqid);
	}

event SupervisorControl::destroy_response(reqid: string, result: bool)
	{
	local req = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(req) )
		return;

	local name = req$supervisor_state$node;

	if ( ! result )
		{
		local msg = fmt("failed to destroy node %s, %s", name, reqid);
		ClusterController::Log::error(msg);
		event ClusterAgent::API::notify_error(ClusterAgent::name, msg, name);
		}

	ClusterController::Request::finish(reqid);
	}

function supervisor_create(nc: Supervisor::NodeConfig)
	{
	local req = ClusterController::Request::create();
	req$supervisor_state = ClusterController::Request::SupervisorState($node = nc$name);
	event SupervisorControl::create_request(req$id, nc);
	ClusterController::Log::info(fmt("issued supervisor create for %s, %s", nc$name, req$id));
	}

function supervisor_destroy(node: string)
	{
	local req = ClusterController::Request::create();
	req$supervisor_state = ClusterController::Request::SupervisorState($node = node);
	event SupervisorControl::destroy_request(req$id, node);
	ClusterController::Log::info(fmt("issued supervisor destroy for %s, %s", node, req$id));
	}

event ClusterAgent::API::set_configuration_request(reqid: string, config: ClusterController::Types::Configuration)
	{
	ClusterController::Log::info(fmt("rx ClusterAgent::API::set_configuration_request %s", reqid));

	local nodename: string;
	local node: ClusterController::Types::Node;
	local nc: Supervisor::NodeConfig;
	local msg: string;

	# Adopt the global configuration provided.
	# XXX this can later handle validation and persistence
	# XXX should do this transactionally, only set when all else worked
	global_config = config;

	# Refresh the instances table:
	instances = table();
	for ( inst in config$instances )
		instances[inst$name] = inst;

	# Terminate existing nodes
	for ( nodename in nodes )
		supervisor_destroy(nodename);

	nodes = table();

	# Refresh the data cluster and nodes tables

	data_cluster = table();
	for ( node in config$nodes )
		{
		if ( node$instance == ClusterAgent::name )
			nodes[node$name] = node;

		local cep = Supervisor::ClusterEndpoint(
		    $role = node$role,
		    $host = instances[node$instance]$host,
		    $p = node$p);

		if ( node?$interface )
			cep$interface = node$interface;

		data_cluster[node$name] = cep;
		}

	# Apply the new configuration via the supervisor

	for ( nodename in nodes )
		{
		node = nodes[nodename];
		nc = Supervisor::NodeConfig($name=nodename);

		if ( ClusterAgent::cluster_directory != "" )
			nc$directory = ClusterAgent::cluster_directory;

		if ( node?$interface )
			nc$interface = node$interface;
		if ( node?$cpu_affinity )
			nc$cpu_affinity = node$cpu_affinity;
		if ( node?$scripts )
			nc$scripts = node$scripts;
		if ( node?$env )
			nc$env = node$env;

		# XXX could use options to enable per-node overrides for
		# directory, stdout, stderr, others?

		nc$cluster = data_cluster;
		supervisor_create(nc);
		}

	# XXX this currently does not fail if any of the above problems occurred,
	# mainly due to the tediousness of handling the supervisor's response
	# events asynchronously. The only indication of error will be
	# notification events to the controller.

	local res = ClusterController::Types::Result(
	    $reqid = reqid,
	    $instance = ClusterAgent::name);

	ClusterController::Log::info(fmt("tx ClusterAgent::API::set_configuration_response %s", reqid));
	event ClusterAgent::API::set_configuration_response(reqid, res);
	}

event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
	{
	# This does not (cannot?) immediately verify that the new peer
	# is in fact a controller, so we might send this redundantly.
	# Controllers handle the hello event accordingly.

	local epi = ClusterAgent::endpoint_info();
	# XXX deal with unexpected peers, unless we're okay with it
	event ClusterAgent::API::notify_agent_hello(epi$id,
	    to_addr(epi$network$address), ClusterAgent::API::version);
	}

event zeek_init()
	{
	local epi = ClusterAgent::endpoint_info();
	local agent_topic = ClusterAgent::topic_prefix + "/" + epi$id;

	# The agent needs to peer with the supervisor -- this doesn't currently
	# happen automatically. The address defaults to Broker's default, which
	# relies on ZEEK_DEFAULT_LISTEN_ADDR and so might just be "". Broker
	# internally falls back to listening on any; we pick 127.0.0.1.
	local supervisor_addr = Broker::default_listen_address;
	if ( supervisor_addr == "" )
		supervisor_addr = "127.0.0.1";

	Broker::peer(supervisor_addr, Broker::default_port, Broker::default_listen_retry);

	# Agents need to receive communication targeted at them, and any
	# responses from the supervisor.
	Broker::subscribe(agent_topic);
	Broker::subscribe(SupervisorControl::topic_prefix);

	# Auto-publish a bunch of events. Glob patterns or module-level
	# auto-publish would be helpful here.
	Broker::auto_publish(agent_topic, ClusterAgent::API::set_configuration_response);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_agent_hello);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_change);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_error);
	Broker::auto_publish(agent_topic, ClusterAgent::API::notify_log);

	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::create_request);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::create_response);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_request);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_response);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::restart_request);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::restart_response);
	Broker::auto_publish(SupervisorControl::topic_prefix, SupervisorControl::stop_request);

	# Establish connectivity with the controller.
	if ( ClusterAgent::controller$address != "0.0.0.0" )
		{
		# We connect to the controller.
		Broker::peer(ClusterAgent::controller$address,
		    ClusterAgent::controller$bound_port,
		    ClusterController::connect_retry);
		}
	else
		{
		# Controller connects to us; listen for it.
		Broker::listen(cat(epi$network$address), epi$network$bound_port);
		}

	ClusterController::Log::info("agent is live");
	}

5  scripts/policy/frameworks/cluster/controller/__load__.zeek  Normal file
@@ -0,0 +1,5 @@
# The entry point for the cluster controller. It only runs bootstrap logic for
# launching via the Supervisor. If we're not running the Supervisor, this does
# nothing.

@load ./boot

16  scripts/policy/frameworks/cluster/controller/api.zeek  Normal file
@@ -0,0 +1,16 @@
@load ./types

module ClusterController::API;

export {
	const version = 1;

	global get_instances_request: event(reqid: string);
	global get_instances_response: event(reqid: string,
	    instances: vector of ClusterController::Types::Instance);

	global set_configuration_request: event(reqid: string,
	    config: ClusterController::Types::Configuration);
	global set_configuration_response: event(reqid: string,
	    result: ClusterController::Types::ResultVec);
}
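
Illustrative client-style use of this API (the request-ID prefix is invented; the commit adds zeek-client as the real client). Assumes a Broker peering with the controller is already in place:

@load policy/frameworks/cluster/controller/api
@load policy/frameworks/cluster/controller/config

event ClusterController::API::get_instances_response(reqid: string,
    instances: vector of ClusterController::Types::Instance)
	{
	for ( i in instances )
		print fmt("instance %s at %s", instances[i]$name, instances[i]$host);
	}

event zeek_init()
	{
	Broker::subscribe(ClusterController::topic);
	Broker::publish(ClusterController::topic,
	    ClusterController::API::get_instances_request, unique_id("req-"));
	}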

29  scripts/policy/frameworks/cluster/controller/boot.zeek  Normal file
@@ -0,0 +1,29 @@
@load ./config

event zeek_init()
	{
	if ( ! Supervisor::is_supervisor() )
		return;

	local epi = ClusterController::endpoint_info();
	local sn = Supervisor::NodeConfig($name=epi$id, $bare_mode=T,
	    $scripts=vector("policy/frameworks/cluster/controller/main.zeek"));

	if ( ClusterController::directory != "" )
		sn$directory = ClusterController::directory;
	if ( ClusterController::stdout_file != "" )
		sn$stdout_file = ClusterController::stdout_file;
	if ( ClusterController::stderr_file != "" )
		sn$stderr_file = ClusterController::stderr_file;

	# This helps Zeek run controller and agent with a minimal set of scripts.
	sn$env["ZEEK_CLUSTER_MGMT_NODE"] = "CONTROLLER";

	local res = Supervisor::create(sn);

	if ( res != "" )
		{
		print(fmt("error: supervisor could not create controller node: %s", res));
		exit(1);
		}
	}

85  scripts/policy/frameworks/cluster/controller/config.zeek  Normal file
@@ -0,0 +1,85 @@
@load policy/frameworks/cluster/agent/config

module ClusterController;

export {
	# The name of this controller in the cluster.
	# Without the environment variable and no redef, this
	# falls back to "controller-<hostname>".
	const name = getenv("ZEEK_CONTROLLER_NAME") &redef;

	# Controller stdout/stderr log files to produce in Zeek's
	# working directory. If empty, no such logs will result.
	const stdout_file = "controller.stdout" &redef;
	const stderr_file = "controller.stderr" &redef;

	# The address and port the controller listens on. When
	# undefined, falls back to the default_address, which you can
	# likewise customize.
	const listen_address = getenv("ZEEK_CONTROLLER_ADDR") &redef;
	const default_address = Broker::default_listen_address &redef;

	const listen_port = getenv("ZEEK_CONTROLLER_PORT") &redef;
	const default_port = 2150/tcp &redef;

	# A more aggressive default retry interval (vs. the default 30s)
	const connect_retry = 1sec &redef;

	# The controller listens for messages on this topic:
	const topic = "zeek/cluster-control/controller" &redef;

	# The set of agents to interact with. When this is non-empty
	# at startup, the controller contacts the agents; when it is
	# empty, it waits for agents to connect. The key is the name of
	# each instance. This should match the $name member of the
	# instance records.
	const instances: table[string] of ClusterController::Types::Instance = { } &redef;

	# The role of this node in cluster management. Agent and
	# controller both redef this. Used during logging.
	const role = ClusterController::Types::NONE &redef;

	# Agent and controller currently log only, not via the data cluster's
	# logger. (This might get added later.) For now, this means that
	# if both write to the same log file, it gets garbled. The following
	# lets you specify the working directory specifically for the controller.
	const directory = "" &redef;

	# The following functions return the effective network endpoint
	# information for this controller, in two related forms.
	global network_info: function(): Broker::NetworkInfo;
	global endpoint_info: function(): Broker::EndpointInfo;
}

function network_info(): Broker::NetworkInfo
	{
	local ni: Broker::NetworkInfo;

	if ( ClusterController::listen_address != "" )
		ni$address = ClusterController::listen_address;
	else if ( ClusterController::default_address != "" )
		ni$address = ClusterController::default_address;
	else
		ni$address = "127.0.0.1";

	if ( ClusterController::listen_port != "" )
		ni$bound_port = to_port(ClusterController::listen_port);
	else
		ni$bound_port = ClusterController::default_port;

	return ni;
	}

function endpoint_info(): Broker::EndpointInfo
	{
	local epi: Broker::EndpointInfo;

	if ( ClusterController::name != "" )
		epi$id = ClusterController::name;
	else
		epi$id = fmt("controller-%s", gethostname());

	epi$network = network_info();

	return epi;
	}
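
Usage sketch (hypothetical local.zeek; name, address, and port are invented): pre-seeding the instances table makes the controller peer with that agent at startup rather than waiting for it to connect.

@load policy/frameworks/cluster/controller

redef ClusterController::instances = {
	["instance-1"] = ClusterController::Types::Instance(
	    $name="instance-1", $host=192.0.2.20, $listen_port=2151/tcp),
};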

109  scripts/policy/frameworks/cluster/controller/log.zeek  Normal file
@@ -0,0 +1,109 @@
@load ./config

module ClusterController::Log;

export {
	## The cluster logging stream identifier.
	redef enum Log::ID += { LOG };

	## A default logging policy hook for the stream.
	global log_policy: Log::PolicyHook;

	type Level: enum {
		DEBUG,
		INFO,
		WARNING,
		ERROR,
	};

	## The record type which contains the column fields of the cluster log.
	type Info: record {
		## The time at which a cluster message was generated.
		ts: time;
		## The name of the node that is creating the log record.
		node: string;
		## Log level of this message, converted from the above Level enum.
		level: string;
		## The role of the node, translated from ClusterController::Types::Role.
		role: string;
		## A message indicating information about cluster controller operation.
		message: string;
	} &log;

	global log_level = DEBUG &redef;

	global info: function(message: string);
	global warning: function(message: string);
	global error: function(message: string);
}

# Enum translations to strings. This avoids those enums being reported
# with full qualifications in the logs, which is too verbose.

global l2s: table[Level] of string = {
	[DEBUG] = "DEBUG",
	[INFO] = "INFO",
	[WARNING] = "WARNING",
	[ERROR] = "ERROR",
};

global r2s: table[ClusterController::Types::Role] of string = {
	[ClusterController::Types::AGENT] = "AGENT",
	[ClusterController::Types::CONTROLLER] = "CONTROLLER",
};

function debug(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(DEBUG) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[DEBUG],
	    $role=r2s[ClusterController::role], $message=message]);
	}

function info(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(INFO) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[INFO],
	    $role=r2s[ClusterController::role], $message=message]);
	}

function warning(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(WARNING) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[WARNING],
	    $role=r2s[ClusterController::role], $message=message]);
	}

function error(message: string)
	{
	if ( enum_to_int(log_level) > enum_to_int(ERROR) )
		return;

	local node = Supervisor::node();
	Log::write(LOG, [$ts=network_time(), $node=node$name, $level=l2s[ERROR],
	    $role=r2s[ClusterController::role], $message=message]);
	}

event zeek_init()
	{
	if ( ! Supervisor::is_supervised() )
		return;

	local node = Supervisor::node();

	# Defining the stream outside of the stream creation call sidesteps
	# the coverage.find-bro-logs test, which tries to inventory all logs.
	# This log isn't yet ready for that level of scrutiny.
	local stream = Log::Stream($columns=Info, $path=fmt("cluster-%s", node$name),
	    $policy=log_policy);

	Log::create_stream(ClusterController::Log::LOG, stream);
	}
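
A one-line tuning sketch for the log_level knob above (illustrative; load it on both agent and controller for it to take effect in each):

# Suppress DEBUG and INFO records; WARNING and ERROR still get written.
redef ClusterController::Log::log_level = ClusterController::Log::WARNING;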

250  scripts/policy/frameworks/cluster/controller/main.zeek  Normal file
@@ -0,0 +1,250 @@
@load base/frameworks/broker

@load policy/frameworks/cluster/agent/config
@load policy/frameworks/cluster/agent/api

@load ./api
@load ./log
@load ./request

redef ClusterController::role = ClusterController::Types::CONTROLLER;

event ClusterAgent::API::notify_agent_hello(instance: string, host: addr, api_version: count)
	{
	# See if we already know about this agent; if not, register it.
	#
	# XXX protection against rogue agents?

	if ( instance in ClusterController::instances )
		{
		# Do nothing, unless this known agent checks in with a mismatching
		# API version, in which case we kick it out.
		if ( api_version != ClusterController::API::version )
			{
			local inst = ClusterController::instances[instance];
			if ( inst?$listen_port )
				{
				# We peered with this instance, unpeer.
				Broker::unpeer(cat(inst$host), inst$listen_port);
				# XXX what to do if they connected to us?
				}
			delete ClusterController::instances[instance];
			}

		# Update the instance name in the pointed-to record, in case it
		# was previously named otherwise. Not being too picky here allows
		# the user some leeway in spelling out the original config.
		ClusterController::instances[instance]$name = instance;

		return;
		}

	if ( api_version != ClusterController::API::version )
		{
		ClusterController::Log::warning(
		    fmt("agent %s/%s speaks incompatible agent protocol (%s, need %s), unpeering",
		        instance, host, api_version, ClusterController::API::version));
		}

	ClusterController::instances[instance] = ClusterController::Types::Instance($name=instance, $host=host);
	ClusterController::Log::info(fmt("instance %s/%s has checked in", instance, host));
	}


event ClusterAgent::API::notify_change(instance: string, n: ClusterController::Types::Node,
    old: ClusterController::Types::State,
    new: ClusterController::Types::State)
	{
	# XXX TODO
	}

event ClusterAgent::API::notify_error(instance: string, msg: string, node: string)
	{
	# XXX TODO
	}

event ClusterAgent::API::notify_log(instance: string, msg: string, node: string)
	{
	# XXX TODO
	}

event ClusterAgent::API::set_configuration_response(reqid: string, result: ClusterController::Types::Result)
	{
	ClusterController::Log::info(fmt("rx ClusterAgent::API::set_configuration_response %s", reqid));

	# Retrieve state for the request we just got a response to
	local areq = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(areq) )
		return;

	# Record the result and mark the request as done. This also
	# marks the request as done in the parent-level request, since
	# these records are stored by reference.
	areq$results[0] = result; # We only have a single result here atm
	areq$finished = T;

	# Update the original request from the client:
	local req = ClusterController::Request::lookup(areq$parent_id);
	if ( ClusterController::Request::is_null(req) )
		return;

	# If there are any requests to the agents still unfinished,
	# we're not done yet.
	for ( i in req$set_configuration_state$requests )
		if ( ! req$set_configuration_state$requests[i]$finished )
			return;

	# All set_configuration requests to instances are done, so respond
	# back to client. We need to compose the result, aggregating
	# the results we got from the requests to the agents. In the
	# end we have one Result per instance requested in the
	# original set_configuration_request.
	#
	# XXX we can likely generalize result aggregation in the request module.
	for ( i in req$set_configuration_state$requests )
		{
		local r = req$set_configuration_state$requests[i];

		local success = T;
		local errors: string_vec;
		local instance = "";

		for ( j in r$results )
			{
			local res = r$results[j];
			instance = res$instance;

			if ( res$success )
				next;

			success = F;
			errors += fmt("node %s failed: %s", res$node, res$error);
			}

		req$results += ClusterController::Types::Result(
		    $reqid = req$id,
		    $instance = instance,
		    $success = success,
		    $error = join_string_vec(errors, ", ")
		    );

		ClusterController::Request::finish(r$id);
		}

	ClusterController::Log::info(fmt("tx ClusterController::API::set_configuration_response %s", req$id));
	event ClusterController::API::set_configuration_response(req$id, req$results);
	ClusterController::Request::finish(req$id);
	}

event ClusterController::API::set_configuration_request(reqid: string, config: ClusterController::Types::Configuration)
	{
	ClusterController::Log::info(fmt("rx ClusterController::API::set_configuration_request %s", reqid));

	local req = ClusterController::Request::create(reqid);
	req$set_configuration_state = ClusterController::Request::SetConfigurationState();

	# Compare new configuration to the current one and send updates
	# to the instances as needed.
	if ( config?$instances )
		{
		# XXX properly handle instance update: connect to new instances provided
		# when they are listening, accept connections from new instances that are
		# not
		for ( inst in config$instances )
			{
			if ( inst$name !in ClusterController::instances )
				{
				local res = ClusterController::Types::Result($reqid=reqid, $instance=inst$name);
				res$error = fmt("instance %s is unknown, skipping", inst$name);
				req$results += res;
				}
			}
		}

	# XXX validate the configuration:
	# - Are node instances among defined instances?
	# - Are all names unique?
	# - Are any node options understood?
	# - Do node types with optional fields have required values?
	# ...

	# Transmit the configuration on to the agents. They need to be aware of
	# each other's location and nodes, so the data cluster nodes can connect
	# (for example, so a worker on instance 1 can connect to a logger on
	# instance 2).
	for ( name in ClusterController::instances )
		{
		local agent_topic = ClusterAgent::topic_prefix + "/" + name;
		local areq = ClusterController::Request::create();
		areq$parent_id = reqid;

		# We track the requests sent off to each agent. As the
		# responses come in, we can check them off as completed,
		# and once all are, we respond back to the client.
		req$set_configuration_state$requests += areq;

		# XXX could also broadcast just once on the agent prefix, but
		# explicit request/response pairs for each agent seems cleaner.
		ClusterController::Log::info(fmt("tx ClusterAgent::API::set_configuration_request %s to %s",
		    areq$id, name));
		Broker::publish(agent_topic, ClusterAgent::API::set_configuration_request, areq$id, config);
		}

	# The response event gets sent via the agents' response events.
	}

event ClusterController::API::get_instances_request(reqid: string)
	{
	ClusterController::Log::info(fmt("rx ClusterController::API::get_instances_request %s", reqid));

	local insts: vector of ClusterController::Types::Instance;

	for ( i in ClusterController::instances )
		insts += ClusterController::instances[i];

	ClusterController::Log::info(fmt("tx ClusterController::API::get_instances_response %s", reqid));
	event ClusterController::API::get_instances_response(reqid, insts);
	}

event zeek_init()
	{
	# The controller always listens -- it needs to be able to respond
	# to the Zeek client. This port is also used by the agents
	# if they connect to the controller.
	local cni = ClusterController::network_info();
	Broker::listen(cat(cni$address), cni$bound_port);

	Broker::subscribe(ClusterAgent::topic_prefix);
	Broker::subscribe(ClusterController::topic);

	Broker::auto_publish(ClusterController::topic,
	    ClusterController::API::get_instances_response);
	Broker::auto_publish(ClusterController::topic,
	    ClusterController::API::set_configuration_response);

	if ( |ClusterController::instances| > 0 )
		{
		# We peer with the agents -- otherwise, the agents peer
		# with (i.e., connect to) us.
		for ( i in ClusterController::instances )
			{
			local inst = ClusterController::instances[i];

			if ( ! inst?$listen_port )
				{
				# XXX config error -- this must be there
				next;
				}

			Broker::peer(cat(inst$host), inst$listen_port,
			    ClusterController::connect_retry);
			}
		}

	# If ClusterController::instances is empty, agents peer with
	# us and we do nothing. We'll build up state as the
	# notify_agent_hello() events come in.

	ClusterController::Log::info("controller is live");
	}

86  scripts/policy/frameworks/cluster/controller/request.zeek  Normal file
@@ -0,0 +1,86 @@
@load ./types

module ClusterController::Request;

export {
	type Request: record {
		id: string;
		parent_id: string &optional;
	};

	# API-specific state. XXX we may be able to generalize after this
	# has settled a bit more.

	# State specific to the set_configuration request/response events
	type SetConfigurationState: record {
		requests: vector of Request &default=vector();
	};

	# State specific to the set_nodes request/response events
	type SetNodesState: record {
		requests: vector of Request &default=vector();
	};

	# State specific to supervisor interactions
	type SupervisorState: record {
		node: string;
	};

	# The redef is a workaround so we can use the Request type
	# while it is still being defined
	redef record Request += {
		results: ClusterController::Types::ResultVec &default=vector();
		finished: bool &default=F;

		set_configuration_state: SetConfigurationState &optional;
		set_nodes_state: SetNodesState &optional;
		supervisor_state: SupervisorState &optional;
	};

	global null_req = Request($id="", $finished=T);

	global create: function(reqid: string &default=unique_id("")): Request;
	global lookup: function(reqid: string): Request;
	global finish: function(reqid: string): bool;

	global is_null: function(request: Request): bool;
}

# XXX this needs a mechanism for expiring stale requests
global requests: table[string] of Request;

function create(reqid: string): Request
	{
	local ret = Request($id=reqid);
	requests[reqid] = ret;
	return ret;
	}

function lookup(reqid: string): Request
	{
	if ( reqid in requests )
		return requests[reqid];

	return null_req;
	}

function finish(reqid: string): bool
	{
	if ( reqid !in requests )
		return F;

	local req = requests[reqid];
	delete requests[reqid];

	req$finished = T;

	return T;
	}

function is_null(request: Request): bool
	{
	if ( request$id == "" )
		return T;

	return F;
	}

80  scripts/policy/frameworks/cluster/controller/types.zeek  Normal file
@@ -0,0 +1,80 @@
# Types for the Cluster Controller framework. These are used by both agent and controller.

module ClusterController::Types;

export {
	## Management infrastructure node type. This intentionally does not
	## include the data cluster node types (worker, logger, etc) -- those
	## continue to be managed by the cluster framework.
	type Role: enum {
		NONE,
		AGENT,
		CONTROLLER,
	};

	## A Zeek-side option with value.
	type Option: record {
		name: string;   # Name of option
		value: string;  # Value of option
	};

	## Configuration describing a Zeek instance running a Cluster
	## Agent. Normally, there'll be one instance per cluster
	## system: a single physical system.
	type Instance: record {
		# Unique, human-readable instance name
		name: string;
		# IP address of system
		host: addr;
		# Agent listening port. Not needed if agents connect to controller.
		listen_port: port &optional;
	};

	## State that a Cluster Node can be in. State changes trigger an
	## API notification (see notify_change()).
	type State: enum {
		Running,  # Running and operating normally
		Stopped,  # Explicitly stopped
		Failed,   # Failed to start, and permanently halted
		Crashed,  # Crashed, will be restarted
		Unknown,  # State not known currently (e.g., because of lost connectivity)
	};

	## Configuration describing a Cluster Node process.
	type Node: record {
		name: string;                    # Cluster-unique, human-readable node name
		instance: string;                # Name of instance where node is to run
		p: port;                         # Port on which this node will listen
		role: Supervisor::ClusterRole;   # Role of the node.
		state: State;                    # Desired, or current, run state.
		scripts: vector of string &optional;  # Additional Zeek scripts for node
		options: set[Option] &optional;  # Zeek options for node
		interface: string &optional;     # Interface to sniff
		cpu_affinity: int &optional;     # CPU/core number to pin to
		env: table[string] of string &default=table();  # Custom environment vars
	};

	# Data structure capturing a cluster's complete configuration.
	type Configuration: record {
		id: string &default=unique_id("");  # Unique identifier for a particular configuration

		## The instances in the cluster.
		## XXX we may be able to make this optional
		instances: set[Instance];

		## The set of nodes in the cluster, as distributed over the instances.
		nodes: set[Node];
	};

	# Return value for request-response API event pairs
	type Result: record {
		reqid: string;             # Request ID of operation this result refers to
		instance: string;          # Name of associated instance (for context)
		success: bool &default=T;  # True if successful
		data: any &optional;       # Addl data returned for successful operation
		error: string &default=""; # Descriptive error on failure
		node: string &optional;    # Name of associated node (for context)
	};

	type ResultVec: vector of Result;
}
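
To make the record relationships concrete, a sketch of a single-instance configuration built from these types (all names, ports, and the interface are invented):

@load policy/frameworks/cluster/controller/types

event zeek_init()
	{
	local inst = ClusterController::Types::Instance($name="instance-1", $host=127.0.0.1);

	local cfg = ClusterController::Types::Configuration(
	    $instances=set(inst),
	    $nodes=set(
	        ClusterController::Types::Node($name="manager", $instance="instance-1",
	            $p=5000/tcp, $role=Supervisor::MANAGER,
	            $state=ClusterController::Types::Running),
	        ClusterController::Types::Node($name="worker-01", $instance="instance-1",
	            $p=5001/tcp, $role=Supervisor::WORKER,
	            $state=ClusterController::Types::Running, $interface="eth0")));

	print cfg$id; # The identifier is auto-assigned via unique_id().
	}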