mirror of https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Merge branch 'topic/christian/management-verify-nodestarts'
* topic/christian/management-verify-nodestarts:
  Management framework: bump external cluster testsuite
  Management framework: bump zeek-client to pull in set-config rendering
  Management framework: enable stdout/stderr reporting
  Management framework: Supervisor extensions for stdout/stderr handling
  Management framework: disambiguate redef field names in agent and controller
  Management framework: move to ResultVec in agent's set_configuration response
  Management framework: tune request timeout granularity and interval
  Management framework: verify node starts when deploying a configuration
  Management framework: a bit of debug-level logging for troubleshooting
commit 65589c9bcb
18 changed files with 400 additions and 61 deletions
CHANGES | 14

@@ -1,3 +1,17 @@
+5.0.0-dev.535 | 2022-05-31 12:58:32 -0700
+
+  * Management framework updates (Christian Kreibich, Corelight)
+
+    - bump external cluster testsuite
+    - bump zeek-client to pull in set-config rendering
+    - enable stdout/stderr reporting
+    - Supervisor extensions for stdout/stderr handling
+    - disambiguate redef field names in agent and controller
+    - move to ResultVec in agent's set_configuration response
+    - tune request timeout granularity and interval
+    - verify node starts when deploying a configuration
+    - a bit of debug-level logging for troubleshooting
+
 5.0.0-dev.525 | 2022-05-31 12:53:01 -0700

   * Add Supervisor::node_status notification event (Christian Kreibich, Corelight)
VERSION | 2

@@ -1 +1 @@
-5.0.0-dev.525
+5.0.0-dev.535
@@ -1 +1 @@
-Subproject commit 6a3d1b5516e5c9343072466e3c627aa13324f2d0
+Subproject commit 2b00ec50c0e79e9f3ad18bad58b92ba32b405274
@@ -12,5 +12,6 @@
 @endif

 @if ( Supervisor::is_supervisor() )
+@load policy/frameworks/management/supervisor
 @load ./boot
 @endif
@@ -38,7 +38,7 @@ export {
     ## result: the result record.
     ##
     global set_configuration_response: event(reqid: string,
-        result: Management::Result);
+        result: Management::ResultVec);


     ## The controller sends this event to request a list of
@@ -39,10 +39,9 @@ event zeek_init()
     if ( ! mkdir(sn$directory) )
         print(fmt("warning: could not create agent state dir '%s'", sn$directory));

-    if ( Management::Agent::stdout_file != "" )
-        sn$stdout_file = Management::Agent::stdout_file;
-    if ( Management::Agent::stderr_file != "" )
-        sn$stderr_file = Management::Agent::stderr_file;
+    # We don't set sn$stdout_file/stderr_file here because the Management
+    # framework's Supervisor shim manages those output files itself. See
+    # frameworks/management/supervisor/main.zeek for details.

     # This helps identify Management framework nodes reliably.
     sn$env["ZEEK_MANAGEMENT_NODE"] = "AGENT";
@@ -9,6 +9,8 @@
 @load policy/frameworks/management
 @load policy/frameworks/management/node/api
 @load policy/frameworks/management/node/config
+@load policy/frameworks/management/supervisor/api
+@load policy/frameworks/management/supervisor/config

 @load ./api
 @load ./config
@@ -24,6 +26,13 @@ export {
         node: string;	##< Name of the node the Supervisor is acting on.
     };

+    ## Request state for set_configuration requests.
+    type SetConfigurationState: record {
+        ## Zeek cluster nodes the provided configuration requested
+        ## and which have not yet checked in with the agent.
+        nodes_pending: set[string];
+    };
+
     ## Request state for node dispatches, tracking the requested action
     ## as well as received responses.
     type NodeDispatchState: record {
@@ -36,14 +45,34 @@ export {
     };
 }

+# We need to go out of our way here to avoid colliding record field names with
+# the similar redef in the controller -- not because of real-world use, but
+# because Zeekygen loads both during documentation extraction. Suffix all
+# members with _agent to disambiguate.
 redef record Management::Request::Request += {
-    supervisor_state: SupervisorState &optional;
-    node_dispatch_state: NodeDispatchState &optional;
+    supervisor_state_agent: SupervisorState &optional;
+    set_configuration_state_agent: SetConfigurationState &optional;
+    node_dispatch_state_agent: NodeDispatchState &optional;
 };

 # Tag our logs correctly
 redef Management::role = Management::AGENT;

+# Conduct more frequent table expiration checks. This helps get more predictable
+# timing for request timeouts and only affects the agent, which is mostly idle.
+redef table_expire_interval = 2 sec;
+
+# Tweak the request timeout so it's relatively quick, and quick enough always to
+# time out strictly before the controller's request state (at 10 sec).
+redef Management::Request::timeout_interval = 5 sec;
+
+# Returns the effective agent topic for this agent.
+global agent_topic: function(): string;
+
+# Finalizes a set_configuration_request transaction: cleans up remaining state
+# and sends response event.
+global send_set_configuration_response: function(req: Management::Request::Request);
+
 # The global configuration as passed to us by the controller
 global g_config: Management::Configuration;
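The collision this comment works around is a general Zeek property: all redefs of a record type share a single field namespace, so two scripts that add a same-named field cannot load in one process -- which is exactly what Zeekygen's documentation extraction does with the agent and controller scripts. A minimal hypothetical sketch (script, module, and field names invented, not part of this commit):

    # a.zeek (hypothetical)
    module A;
    redef record Management::Request::Request += {
        supervisor_state: string &optional;
    };

    # b.zeek (hypothetical): loading this alongside a.zeek is a script
    # error, since field "supervisor_state" already exists in the record.
    module B;
    redef record Management::Request::Request += {
        supervisor_state: count &optional;
    };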
@@ -53,11 +82,21 @@ global g_instances: table[string] of Management::Instance;
 # A map for the nodes we run on this instance, via this agent.
 global g_nodes: table[string] of Management::Node;

+# The request ID of the most recent configuration update from the controller.
+# We track it here until the nodes_pending set in the corresponding request's
+# SetConfigurationState is cleared out, or the corresponding request state hits
+# a timeout.
+global g_config_reqid_pending: string = "";
+
 # The complete node map employed by the supervisor to describe the cluster
 # topology to newly forked nodes. We refresh it when we receive new
 # configurations.
 global g_cluster: table[string] of Supervisor::ClusterEndpoint;

+# The most recent output contexts we've received from the Supervisor, for
+# any of our nodes.
+global g_outputs: table[string] of Management::NodeOutputs;
+
 function agent_topic(): string
     {
@@ -65,13 +104,57 @@ function agent_topic()
     return Management::Agent::topic_prefix + "/" + epi$id;
     }

+function send_set_configuration_response(req: Management::Request::Request)
+    {
+    local node: string;
+    local res: Management::Result;
+
+    # Put together the results vector for the response event.
+    for ( node in g_nodes )
+        {
+        res = Management::Result(
+            $reqid = req$id,
+            $instance = Management::Agent::get_name(),
+            $node = node);
+
+        if ( node in req$set_configuration_state_agent$nodes_pending )
+            {
+            # This node failed.
+            res$success = F;
+
+            # Pull in any stdout/stderr context we might have.
+            if ( node in g_outputs )
+                res$data = g_outputs[node];
+            }
+
+        # Add this result to the overall response
+        req$results[|req$results|] = res;
+        }
+
+    Management::Log::info(fmt("tx Management::Agent::API::set_configuration_response %s",
+        Management::result_to_string(res)));
+    Broker::publish(agent_topic(),
+        Management::Agent::API::set_configuration_response, req$id, req$results);
+
+    Management::Request::finish(req$id);
+
+    if ( req$id == g_config_reqid_pending )
+        g_config_reqid_pending = "";
+    }
+
+event Management::Supervisor::API::notify_node_exit(node: string, outputs: Management::NodeOutputs)
+    {
+    if ( node in g_nodes )
+        g_outputs[node] = outputs;
+    }
+
 event SupervisorControl::create_response(reqid: string, result: string)
     {
     local req = Management::Request::lookup(reqid);
     if ( Management::Request::is_null(req) )
         return;

-    local name = req$supervisor_state$node;
+    local name = req$supervisor_state_agent$node;

     if ( |result| > 0 )
         {
@@ -91,7 +174,7 @@ event SupervisorControl::destroy_response(reqid: string, result: bool)
     if ( Management::Request::is_null(req) )
         return;

-    local name = req$supervisor_state$node;
+    local name = req$supervisor_state_agent$node;

     if ( ! result )
         {
@@ -108,7 +191,7 @@ event SupervisorControl::destroy_response(reqid: string, result: bool)
 function supervisor_create(nc: Supervisor::NodeConfig)
     {
     local req = Management::Request::create();
-    req$supervisor_state = SupervisorState($node = nc$name);
+    req$supervisor_state_agent = SupervisorState($node = nc$name);
     Broker::publish(SupervisorControl::topic_prefix,
         SupervisorControl::create_request, req$id, nc);
     Management::Log::info(fmt("issued supervisor create for %s, %s", nc$name, req$id));
@@ -117,7 +200,7 @@ function supervisor_create(nc: Supervisor::NodeConfig)
 function supervisor_destroy(node: string)
     {
     local req = Management::Request::create();
-    req$supervisor_state = SupervisorState($node = node);
+    req$supervisor_state_agent = SupervisorState($node = node);
     Broker::publish(SupervisorControl::topic_prefix,
         SupervisorControl::destroy_request, req$id, node);
     Management::Log::info(fmt("issued supervisor destroy for %s, %s", node, req$id));
@@ -132,9 +215,9 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M
     local nc: Supervisor::NodeConfig;
     local msg: string;

-    # Adopt the global configuration provided.
-    # XXX this can later handle validation and persistence
-    # XXX should do this transactionally, only set when all else worked
+    # Adopt the global configuration provided. The act of trying to launch
+    # the requested nodes perturbs any existing ones one way or another, so
+    # even if the launch fails it effectively is our new configuration.
     g_config = config;

     # Refresh the instances table:
@@ -150,19 +233,49 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M
     g_nodes = table();
     g_cluster = table();

+    # Special case: the config contains no nodes. We can respond right away.
+    if ( |config$nodes| == 0 )
+        {
+        g_config_reqid_pending = "";
+
+        local res = Management::Result(
+            $reqid = reqid,
+            $instance = Management::Agent::get_name());
+
+        Management::Log::info(fmt("tx Management::Agent::API::set_configuration_response %s",
+            Management::result_to_string(res)));
+        Broker::publish(agent_topic(),
+            Management::Agent::API::set_configuration_response, reqid, vector(res));
+        return;
+        }
+
+    local req = Management::Request::create(reqid);
+    req$set_configuration_state_agent = SetConfigurationState();
+
+    # Establish this request as the pending one:
+    g_config_reqid_pending = reqid;
+
     for ( node in config$nodes )
         {
+        # Filter the node set down to the ones this agent manages.
         if ( node$instance == Management::Agent::get_name() )
+            {
             g_nodes[node$name] = node;
+            add req$set_configuration_state_agent$nodes_pending[node$name];
+            }

         # The cluster and supervisor frameworks require a port for every
-        # node, using 0/unknown to signify "don't listen". We use
-        # optional values and map an absent value to 0/unknown.
+        # node, using 0/unknown to signify "don't listen". The management
+        # framework uses optional values, so here we map absent values
+        # to 0/unknown.
         local p = 0/unknown;

         if ( node?$p )
             p = node$p;

+        # Register the node in the g_cluster table. We use it below to
+        # ship the cluster topology with node configs launched via the
+        # Supervisor.
         local cep = Supervisor::ClusterEndpoint(
             $role = node$role,
             $host = g_instances[node$instance]$host,
|
||||||
# node.
|
# node.
|
||||||
nc$scripts[|nc$scripts|] = "policy/frameworks/management/node";
|
nc$scripts[|nc$scripts|] = "policy/frameworks/management/node";
|
||||||
|
|
||||||
if ( Management::Node::stdout_file != "" )
|
# We don't set nc$stdout_file/stderr_file here because the
|
||||||
nc$stdout_file = Management::Node::stdout_file;
|
# Management framework's Supervisor shim manages those output
|
||||||
if ( Management::Node::stderr_file != "" )
|
# files itself. See frameworks/management/supervisor/main.zeek
|
||||||
nc$stderr_file = Management::Node::stderr_file;
|
# for details.
|
||||||
|
|
||||||
# XXX could use options to enable per-node overrides for
|
# XXX could use options to enable per-node overrides for
|
||||||
# directory, stdout, stderr, others?
|
# directory, stdout, stderr, others?
|
||||||
|
@@ -227,22 +340,9 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M
         supervisor_create(nc);
         }

-    # XXX this currently doesn not fail if any of above problems occurred,
-    # mainly due to the tediousness of handling the supervisor's response
-    # events asynchonously. The only indication of error will be
-    # notification events to the controller.
-
-    if ( reqid != "" )
-        {
-        local res = Management::Result(
-            $reqid = reqid,
-            $instance = Management::Agent::get_name());
-
-        Management::Log::info(fmt("tx Management::Agent::API::set_configuration_response %s",
-            Management::result_to_string(res)));
-        Broker::publish(agent_topic(),
-            Management::Agent::API::set_configuration_response, reqid, res);
-        }
+    # At this point we await Management::Node::API::notify_node_hello events
+    # from the new nodes, or a timeout, whichever happens first. These
+    # trigger the set_configuration_response event back to the controller.
     }

 event SupervisorControl::status_response(reqid: string, result: Supervisor::Status)
@@ -369,8 +469,8 @@ event Management::Node::API::node_dispatch_response(reqid: string, result: Manag
     # report themselves would eventually lead to request timeout.
     if ( result?$node )
         {
-        if ( result$node in req$node_dispatch_state$requests )
-            delete req$node_dispatch_state$requests[result$node];
+        if ( result$node in req$node_dispatch_state_agent$requests )
+            delete req$node_dispatch_state_agent$requests[result$node];
         else
             {
             # An unknown or duplicate response -- do nothing.
|
||||||
|
|
||||||
# The usual special treatment for Broker values that are of type "any":
|
# The usual special treatment for Broker values that are of type "any":
|
||||||
# confirm their type here based on the requested dispatch command.
|
# confirm their type here based on the requested dispatch command.
|
||||||
switch req$node_dispatch_state$action[0]
|
switch req$node_dispatch_state_agent$action[0]
|
||||||
{
|
{
|
||||||
case "get_id_value":
|
case "get_id_value":
|
||||||
if ( result?$data )
|
if ( result?$data )
|
||||||
|
@@ -389,7 +489,7 @@ event Management::Node::API::node_dispatch_response(reqid: string, result: Manag
             break;
         default:
             Management::Log::error(fmt("unexpected dispatch command %s",
-                req$node_dispatch_state$action[0]));
+                req$node_dispatch_state_agent$action[0]));
             break;
         }

@@ -403,7 +503,7 @@ event Management::Node::API::node_dispatch_response(reqid: string, result: Manag
     # If we still have pending queries out to the agents, do nothing: we'll
     # handle this soon, or our request will time out and we respond with
     # error.
-    if ( |req$node_dispatch_state$requests| > 0 )
+    if ( |req$node_dispatch_state_agent$requests| > 0 )
         return;

     # Release the agent-nodes request state, since we now have all responses.
|
||||||
local res: Management::Result;
|
local res: Management::Result;
|
||||||
local req = Management::Request::create(reqid);
|
local req = Management::Request::create(reqid);
|
||||||
|
|
||||||
req$node_dispatch_state = NodeDispatchState($action=action);
|
req$node_dispatch_state_agent = NodeDispatchState($action=action);
|
||||||
|
|
||||||
# Build up dispatch state for tracking responses. We only dispatch to
|
# Build up dispatch state for tracking responses. We only dispatch to
|
||||||
# nodes that are in state RUNNING, as those have confirmed they're ready
|
# nodes that are in state RUNNING, as those have confirmed they're ready
|
||||||
|
@ -477,7 +577,7 @@ event Management::Agent::API::node_dispatch_request(reqid: string, action: vecto
|
||||||
for ( node in nodes_final )
|
for ( node in nodes_final )
|
||||||
{
|
{
|
||||||
if ( g_nodes[node]$state == Management::RUNNING )
|
if ( g_nodes[node]$state == Management::RUNNING )
|
||||||
add req$node_dispatch_state$requests[node];
|
add req$node_dispatch_state_agent$requests[node];
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
res = Management::Result($reqid=reqid, $node=node);
|
res = Management::Result($reqid=reqid, $node=node);
|
||||||
|
@ -488,7 +588,7 @@ event Management::Agent::API::node_dispatch_request(reqid: string, action: vecto
|
||||||
}
|
}
|
||||||
|
|
||||||
# Corner case: nothing is in state RUNNING.
|
# Corner case: nothing is in state RUNNING.
|
||||||
if ( |req$node_dispatch_state$requests| == 0 )
|
if ( |req$node_dispatch_state_agent$requests| == 0 )
|
||||||
{
|
{
|
||||||
Management::Log::info(fmt(
|
Management::Log::info(fmt(
|
||||||
"tx Management::Agent::API::node_dispatch_response %s, no nodes running",
|
"tx Management::Agent::API::node_dispatch_response %s, no nodes running",
|
||||||
|
@@ -549,8 +649,39 @@ event Management::Node::API::notify_node_hello(node: string)
     {
     Management::Log::info(fmt("rx Management::Node::API::notify_node_hello %s", node));

+    # This node is now running; update its state:
     if ( node in g_nodes )
         g_nodes[node]$state = Management::RUNNING;

+    # Look up the set_configuration request this node launch was part of (if
+    # any), and check it off. If it was the last node we expected to launch,
+    # finalize the request and respond to the controller.
+
+    local req = Management::Request::lookup(g_config_reqid_pending);
+
+    if ( Management::Request::is_null(req) || ! req?$set_configuration_state_agent )
+        return;
+
+    if ( node in req$set_configuration_state_agent$nodes_pending )
+        {
+        delete req$set_configuration_state_agent$nodes_pending[node];
+        if ( |req$set_configuration_state_agent$nodes_pending| == 0 )
+            send_set_configuration_response(req);
+        }
+    }
+
+event Management::Request::request_expired(req: Management::Request::Request)
+    {
+    local res = Management::Result($reqid=req$id,
+        $success = F,
+        $error = "request timed out");
+
+    if ( req?$set_configuration_state_agent )
+        {
+        send_set_configuration_response(req);
+        # This timeout means we no longer have a pending request.
+        g_config_reqid_pending = "";
+        }
     }

 event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
@@ -590,6 +721,7 @@ event zeek_init()
     Broker::subscribe(agent_topic());
     Broker::subscribe(SupervisorControl::topic_prefix);
     Broker::subscribe(Management::Node::node_topic);
+    Broker::subscribe(Management::Supervisor::topic_prefix);

     # Establish connectivity with the controller.
     if ( Management::Agent::controller$address != "0.0.0.0" )
@@ -34,10 +34,9 @@ event zeek_init()
     if ( ! mkdir(sn$directory) )
         print(fmt("warning: could not create controller state dir '%s'", sn$directory));

-    if ( Management::Controller::stdout_file != "" )
-        sn$stdout_file = Management::Controller::stdout_file;
-    if ( Management::Controller::stderr_file != "" )
-        sn$stderr_file = Management::Controller::stderr_file;
+    # We don't set sn$stdout_file/stderr_file here because the Management
+    # framework's Supervisor shim manages those output files itself. See
+    # frameworks/management/supervisor/main.zeek for details.

     # This helps identify Management framework nodes reliably.
     sn$env["ZEEK_MANAGEMENT_NODE"] = "CONTROLLER";
@@ -73,6 +73,10 @@ redef record Management::Request::Request += {
 # Tag our logs correctly
 redef Management::role = Management::CONTROLLER;

+# Conduct more frequent table expiration checks. This helps get more predictable
+# timing for request timeouts and only affects the controller, which is mostly idle.
+redef table_expire_interval = 2 sec;
+
 global check_instances_ready: function();
 global add_instance: function(inst: Management::Instance);
 global drop_instance: function(inst: Management::Instance);
@@ -169,6 +173,8 @@ function add_instance(inst: Management::Instance)
         Broker::publish(Management::Agent::topic_prefix + "/" + inst$name,
             Management::Agent::API::agent_welcome_request, req$id);
         }
+    else
+        Management::Log::debug(fmt("instance %s not known to us, skipping", inst$name));
     }

 function drop_instance(inst: Management::Instance)
@@ -281,6 +287,7 @@ event Management::Agent::API::notify_agent_hello(instance: string, host: addr, a
         }

     add g_instances_known[instance];
+    Management::Log::debug(fmt("instance %s now known to us", instance));

     if ( instance in g_instances && instance !in g_instances_ready )
         {
|
||||||
# XXX TODO
|
# XXX TODO
|
||||||
}
|
}
|
||||||
|
|
||||||
event Management::Agent::API::set_configuration_response(reqid: string, result: Management::Result)
|
event Management::Agent::API::set_configuration_response(reqid: string, results: Management::ResultVec)
|
||||||
{
|
{
|
||||||
Management::Log::info(fmt("rx Management::Agent::API::set_configuration_response %s", reqid));
|
Management::Log::info(fmt("rx Management::Agent::API::set_configuration_response %s", reqid));
|
||||||
|
|
||||||
|
@@ -356,8 +363,15 @@ event Management::Agent::API::set_configuration_response(reqid: string, result:
     if ( Management::Request::is_null(req) )
         return;

-    # Add this result to the overall response
-    req$results[|req$results|] = result;
+    # Add this agent's results to the overall response
+    for ( i in results )
+        {
+        # The usual "any" treatment to keep access predictable
+        if ( results[i]?$data )
+            results[i]$data = results[i]$data as Management::NodeOutputs;
+
+        req$results[|req$results|] = results[i];
+        }

     # Mark this request as done by removing it from the table of pending
     # ones. The following if-check should always be true.
@@ -485,9 +499,15 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf
     # case we need to re-establish connectivity with an agent.

     for ( inst_name in insts_to_drop )
+        {
+        Management::Log::debug(fmt("dropping instance %s", inst_name));
         drop_instance(g_instances[inst_name]);
+        }
     for ( inst_name in insts_to_peer )
+        {
+        Management::Log::debug(fmt("adding instance %s", inst_name));
         add_instance(insts_to_peer[inst_name]);
+        }

     # Updates to out instance tables are complete, now check if we're already
     # able to send the config to the agents:
@@ -32,11 +32,15 @@ export {
         finished: bool &default=F;
     };

-    ## The timeout for request state. Such state (see the :zeek:see:`Management::Request`
-    ## module) ties together request and response event pairs. The timeout causes
-    ## its cleanup in the absence of a timely response. It applies both to
-    ## state kept for client requests, as well as state in the agents for
-    ## requests to the supervisor.
+    ## The timeout interval for request state. Such state (see the
+    ## :zeek:see:`Management::Request` module) ties together request and
+    ## response event pairs. A timeout causes cleanup of request state if
+    ## regular request/response processing hasn't already done so. It
+    ## applies both to request state kept in the controller and the agent,
+    ## though the two use different timeout values: agent-side requests time
+    ## out more quickly. This allows agents to send more meaningful error
+    ## messages, while the controller's timeouts serve as a last resort to
+    ## ensure response to the client.
     const timeout_interval = 10sec &redef;

     ## A token request that serves as a null/nonexistant request.
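Because the constant is &redef, each process picks its own window; that is how the agent above lands at 5 sec while the controller keeps the 10 sec default. A minimal hypothetical tuning sketch (the value is arbitrary, chosen only to stay below the controller's 10 sec):

    redef Management::Request::timeout_interval = 7 sec;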
scripts/policy/frameworks/management/supervisor/__load__.zeek | 1 (new file)

@@ -0,0 +1 @@
+@load ./main
scripts/policy/frameworks/management/supervisor/api.zeek | 18 (new file)

@@ -0,0 +1,18 @@
+@load policy/frameworks/management/types
+
+module Management::Supervisor::API;
+
+export {
+    ## The Supervisor generates this event whenever it has received a status
+    ## update from the stem, indicating that a node exited.
+    ##
+    ## node: the name of a node previously created via
+    ##     :zeek:see:`Supervisor::create`.
+    ##
+    ## outputs: stdout/stderr context for the node. The contained strings
+    ##     span up to the 100 most recent lines in the corresponding
+    ##     stream. See :zeek:see:`Management::Supervisor::output_max_lines`
+    ##     to adjust the line limit.
+    ##
+    global notify_node_exit: event(node: string, outputs: Management::NodeOutputs);
+}
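Any Broker-peered Zeek process can consume this event by subscribing to the topic, much as the agent's zeek_init handler above does. A minimal hypothetical consumer sketch (the print output is illustrative, not part of this commit):

    @load policy/frameworks/management/supervisor/api
    @load policy/frameworks/management/supervisor/config

    event zeek_init()
        {
        Broker::subscribe(Management::Supervisor::topic_prefix);
        }

    event Management::Supervisor::API::notify_node_exit(node: string, outputs: Management::NodeOutputs)
        {
        print fmt("node %s exited; recent stderr follows", node);
        print outputs$stderr;
        }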
scripts/policy/frameworks/management/supervisor/config.zeek | 13 (new file)

@@ -0,0 +1,13 @@
+##! Configuration settings for the Management framework's supervisor extension.
+
+module Management::Supervisor;
+
+export {
+    ## The Broker topic for Management framework communication with the
+    ## Supervisor. The agent subscribes to this.
+    const topic_prefix = "zeek/management/supervisor" &redef;
+
+    ## The maximum number of stdout/stderr output lines to convey in
+    ## :zeek:see:`Management::Supervisor::API::notify_node_exit` events.
+    const output_max_lines: count = 100 &redef;
+}
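Both constants are &redef and can be tuned per deployment; a hypothetical sketch (the value is arbitrary):

    # Capture only the 25 most recent lines per stream:
    redef Management::Supervisor::output_max_lines = 25;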
scripts/policy/frameworks/management/supervisor/main.zeek | 122 (new file)

@@ -0,0 +1,122 @@
+##! This module provides functionality the Management framework places directly
+##! in the Supervisor.
+
+@load base/utils/paths
+@load base/utils/queue
+
+@load policy/frameworks/management/types
+@load policy/frameworks/management/node/config
+
+@load ./api
+@load ./config
+
+module Management::Supervisor;
+
+# stdout/stderr state for a given node.
+type NodeOutputStreams: record {
+    # Line buffers for stdout and stderr. Their length is capped
+    # to the most recent Management::Supervisor::output_max_lines.
+    stdout: Queue::Queue;
+    stderr: Queue::Queue;
+
+    #
+    stdout_file: file &optional;
+    stderr_file: file &optional;
+};
+
+# This tracks output state for the current nodes.
+global g_outputs: table[string] of NodeOutputStreams;
+
+function make_node_output_streams(node: string): NodeOutputStreams
+    {
+    local stdout = Queue::init([$max_len = Management::Supervisor::output_max_lines]);
+    local stderr = Queue::init([$max_len = Management::Supervisor::output_max_lines]);
+
+    local res = NodeOutputStreams($stdout=stdout, $stderr=stderr);
+    local status = Supervisor::status(node);
+
+    if ( node !in status$nodes )
+        return res;
+
+    local ns = status$nodes[node];
+    local directory = ".";
+
+    if ( ns$node?$directory )
+        directory = ns$node$directory;
+
+    if ( Management::Node::stdout_file != "" )
+        res$stdout_file = open(build_path(directory, Management::Node::stdout_file));
+    if ( Management::Node::stderr_file != "" )
+        res$stderr_file = open(build_path(directory, Management::Node::stderr_file));
+
+    return res;
+    }
+
+hook Supervisor::stdout_hook(node: string, msg: string)
+    {
+    if ( node !in g_outputs )
+        g_outputs[node] = make_node_output_streams(node);
+
+    # Write to the stdout file if we have one. The flush is clunky, but
+    # seems worth it: it's too confusing for errors to have happened and not
+    # yet shown up in the file. (The Supervisor's built-in file redirection
+    # does this too.)
+    if ( g_outputs[node]?$stdout_file )
+        {
+        print g_outputs[node]$stdout_file, msg;
+        flush_all();
+        }
+
+    # Update the sliding window of recent output lines.
+    Queue::put(g_outputs[node]$stdout, msg);
+
+    # Don't print this message in the Supervisor's own stdout
+    break;
+    }
+
+hook Supervisor::stderr_hook(node: string, msg: string)
+    {
+    if ( node !in g_outputs )
+        g_outputs[node] = make_node_output_streams(node);
+
+    if ( g_outputs[node]?$stderr_file )
+        {
+        print g_outputs[node]$stderr_file, msg;
+        flush_all();
+        }
+
+    Queue::put(g_outputs[node]$stderr, msg);
+
+    # Don't print this message in the Supervisor's own stdout
+    break;
+    }
+
+event Supervisor::node_status(node: string, pid: count)
+    {
+    # The node just started or restarted. If we have collected any output
+    # for its past life, send it via a notify_node_exit event.
+    if ( node in g_outputs )
+        {
+        local stdout_lines: vector of string;
+        local stderr_lines: vector of string;
+
+        Queue::get_vector(g_outputs[node]$stdout, stdout_lines);
+        Queue::get_vector(g_outputs[node]$stderr, stderr_lines);
+
+        if ( |stdout_lines| > 0 || |stderr_lines| > 0 )
+            {
+            local outputs = Management::NodeOutputs(
+                $stdout = join_string_vec(stdout_lines, "\n"),
+                $stderr = join_string_vec(stderr_lines, "\n"));
+
+            Broker::publish(topic_prefix, Management::Supervisor::API::notify_node_exit, node, outputs);
+            }
+
+        if ( g_outputs[node]?$stdout_file )
+            close(g_outputs[node]$stdout_file);
+        if ( g_outputs[node]?$stderr_file )
+            close(g_outputs[node]$stderr_file);
+        }
+
+    g_outputs[node] = make_node_output_streams(node);
+    }
@@ -104,6 +104,18 @@ export {

     type ResultVec: vector of Result;

+    ## In :zeek:see:`Management::Controller::API::set_configuration_response`
+    ## events, each :zeek:see:`Management::Result` indicates the outcome of a
+    ## requested cluster node. If a node does not launch properly (meaning
+    ## it doesn't check in with the agent on the machine it's running on),
+    ## the result will indicate failure, and its data field will be an
+    ## instance of this record, capturing the stdout and stderr output of
+    ## the failing node.
+    type NodeOutputs: record {
+        stdout: string;	##< The stdout stream of a Zeek process
+        stderr: string;	##< The stderr stream of a Zeek process
+    };
+
     ## Given a :zeek:see:`Management::Result` record,
     ## this function returns a string summarizing it.
     global result_to_string: function(res: Result): string;
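Since Result$data travels over Broker as type "any", consumers cast it back to NodeOutputs before field access, mirroring the controller handler above. A minimal hypothetical helper sketch (function name invented, not part of this commit):

    function describe_node_failure(res: Management::Result): string
        {
        if ( res$success || ! res?$data )
            return "no failure output captured";

        local outputs = res$data as Management::NodeOutputs;
        return fmt("stdout:\n%s\nstderr:\n%s", outputs$stdout, outputs$stderr);
        }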
@@ -29,6 +29,10 @@
 @load frameworks/management/node/api.zeek
 @load frameworks/management/node/config.zeek
 # @load frameworks/management/node/main.zeek
+@load frameworks/management/supervisor/__load__.zeek
+@load frameworks/management/supervisor/api.zeek
+@load frameworks/management/supervisor/config.zeek
+@load frameworks/management/supervisor/main.zeek
 @load frameworks/management/request.zeek
 @load frameworks/management/types.zeek
 @load frameworks/management/util.zeek
@@ -2,8 +2,8 @@
 ### NOTE: This file has been sorted with diff-sort.
 warning in <...>/extract-certs-pem.zeek, line 1: deprecated script loaded from <...>/__load__.zeek:15 "Remove in v5.1. Use log-certs-base64.zeek instead."
 warning in <...>/extract-certs-pem.zeek, line 1: deprecated script loaded from command line arguments "Remove in v5.1. Use log-certs-base64.zeek instead."
-warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:66 ("Remove in v5.1. OCSP logging is now enabled by default")
-warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:66 ("Remove in v5.1. OCSP logging is now enabled by default")
+warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:70 ("Remove in v5.1. OCSP logging is now enabled by default")
+warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:70 ("Remove in v5.1. OCSP logging is now enabled by default")
 warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from command line arguments ("Remove in v5.1. OCSP logging is now enabled by default")
 warning in <...>/notary.zeek, line 1: deprecated script loaded from <...>/__load__.zeek:5 ("Remove in v5.1. Please switch to other more modern approaches like SCT validation (validate-sct.zeek).")
 warning in <...>/notary.zeek, line 1: deprecated script loaded from command line arguments ("Remove in v5.1. Please switch to other more modern approaches like SCT validation (validate-sct.zeek).")
@@ -1 +1 @@
-01e1d1ad94cea81091c74e829d86815fdef0dd62
+de28fe5db9a733f8b4b53f84127bab888a184f6a