From 3ac5fdfc599c9b58bf8dc01879fc60a27e2b49f0 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Tue, 14 Jun 2022 14:40:56 -0700 Subject: [PATCH 01/21] Management framework: trivial changes and comment-only rewording --- .../frameworks/management/controller/api.zeek | 18 +++++++++--------- .../frameworks/management/controller/main.zeek | 18 +++++++++--------- .../policy/frameworks/management/request.zeek | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/api.zeek b/scripts/policy/frameworks/management/controller/api.zeek index b0897ed20a..3235de8598 100644 --- a/scripts/policy/frameworks/management/controller/api.zeek +++ b/scripts/policy/frameworks/management/controller/api.zeek @@ -1,7 +1,7 @@ ##! The event API of cluster controllers. Most endpoints consist of event pairs, -##! where the controller answers a zeek-client request event with a -##! corresponding response event. Such event pairs share the same name prefix -##! and end in "_request" and "_response", respectively. +##! where the controller answers the client's request event with a corresponding +##! response event. Such event pairs share the same name prefix and end in +##! "_request" and "_response", respectively. @load policy/frameworks/management/types @@ -9,11 +9,11 @@ module Management::Controller::API; export { ## A simple versioning scheme, used to track basic compatibility of - ## controller, agents, and zeek-client. + ## controller, agents, and the client. const version = 1; - ## zeek-client sends this event to request a list of the currently + ## The client sends this event to request a list of the currently ## peered agents/instances. ## ## reqid: a request identifier string, echoed in the response event. @@ -32,7 +32,7 @@ export { result: Management::Result); - ## zeek-client sends this event to establish a new cluster configuration, + ## The client sends this event to establish a new cluster configuration, ## including the full cluster topology. The controller processes the update ## and relays it to the agents. Once each has responded (or a timeout occurs) ## the controller sends a corresponding response event back to the client. @@ -57,7 +57,7 @@ export { result: Management::ResultVec); - ## zeek-client sends this event to retrieve the currently deployed + ## The client sends this event to retrieve the currently deployed ## cluster configuration. ## ## reqid: a request identifier string, echoed in the response event. @@ -78,7 +78,7 @@ export { result: Management::Result); - ## zeek-client sends this event to request a list of + ## The client sends this event to request a list of ## :zeek:see:`Management::NodeStatus` records that capture ## the status of Supervisor-managed nodes running on the cluster's ## instances. @@ -102,7 +102,7 @@ export { result: Management::ResultVec); - ## zeek-client sends this event to retrieve the current value of a + ## The client sends this event to retrieve the current value of a ## variable in Zeek's global namespace, referenced by the given ## identifier (i.e., variable name). 
The controller asks all agents ## to retrieve this value from each cluster node, accumulates the diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index ef6047e3bf..b9edef3fed 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -91,7 +91,7 @@ global drop_instance: function(inst: Management::Instance); # Helpers to simplify handling of config records. global null_config: function(): Management::Configuration; -global is_null_config: function(config: Management::Configuration): bool; +global config_is_null: function(config: Management::Configuration): bool; # Returns list of names of nodes in the given configuration that require a # listening port. Returns empty list if the config has no such nodes. @@ -505,7 +505,7 @@ function find_endpoint(id: string): Broker::EndpointInfo return Broker::EndpointInfo($id=""); } -function is_null_config(config: Management::Configuration): bool +function config_is_null(config: Management::Configuration): bool { return config$id == ""; } @@ -654,7 +654,6 @@ event Management::Agent::API::agent_welcome_response(reqid: string, result: Mana # An agent we've been waiting to hear back from is ready for cluster # work. Double-check we still want it, otherwise drop it. - if ( ! result$success || result$instance !in g_instances ) { Management::Log::info(fmt( @@ -897,7 +896,7 @@ event Management::Controller::API::get_configuration_request(reqid: string) local res = Management::Result($reqid=reqid); - if ( is_null_config(g_config_current) ) + if ( config_is_null(g_config_current) ) { # We don't have a live configuration yet. res$success = F; @@ -1247,11 +1246,12 @@ event zeek_init() # set_configuration request. g_config_current = null_config(); - # The controller always listens -- it needs to be able to respond to the - # Zeek client. This port is also used by the agents if they connect to - # the client. The client doesn't automatically establish or accept - # connectivity to agents: agents are defined and communicated with as - # defined via configurations defined by the client. + # The controller always listens: it needs to be able to respond to + # clients connecting to it, as well as agents if they connect to the + # controller. The controller does not automatically connect to any + # agents; instances with listening agents are conveyed to the controller + # via configurations uploaded by a client, with connections established + # upon deployment. local cni = Management::Controller::network_info(); diff --git a/scripts/policy/frameworks/management/request.zeek b/scripts/policy/frameworks/management/request.zeek index 86de3ea0fd..e291e02260 100644 --- a/scripts/policy/frameworks/management/request.zeek +++ b/scripts/policy/frameworks/management/request.zeek @@ -161,5 +161,5 @@ function to_string(request: Request): string return fmt("[request %s%s %s, results: %s]", request$id, parent_id, request$finished ? "finished" : "pending", - join_string_vec(results, ",")); + join_string_vec(results, ", ")); } From f353ac22a58d456f36dbf48a0f1a14fbe89828bd Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Wed, 15 Jun 2022 15:35:19 -0700 Subject: [PATCH 02/21] Management framework: consistency fixes to the Result record The instance and error fields are now optional instead of defaulting to empty strings, which caused minor output deviations in the client. 
Agents now ensure that any Result record they create has the instance field filled in. --- .../policy/frameworks/management/agent/main.zeek | 9 ++++++--- scripts/policy/frameworks/management/types.zeek | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 4757f5a3ec..17b14e9c4d 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -637,9 +637,11 @@ event Management::Agent::API::node_dispatch_request(reqid: string, action: vecto add req$node_dispatch_state_agent$requests[node]; else { - res = Management::Result($reqid=reqid, $node=node); - res$success = F; - res$error = fmt("cluster node %s not in runnning state", node); + res = Management::Result($reqid=reqid, + $instance = Management::Agent::get_name(), + $success = F, + $error = fmt("cluster node %s not in running state", node), + $node=node); req$results += res; } } @@ -730,6 +732,7 @@ event Management::Node::API::notify_node_hello(node: string) event Management::Request::request_expired(req: Management::Request::Request) { local res = Management::Result($reqid=req$id, + $instance = Management::Agent::get_name(), $success = F, $error = "request timed out"); diff --git a/scripts/policy/frameworks/management/types.zeek b/scripts/policy/frameworks/management/types.zeek index c4076cd8f7..7632cc0247 100644 --- a/scripts/policy/frameworks/management/types.zeek +++ b/scripts/policy/frameworks/management/types.zeek @@ -92,13 +92,18 @@ export { type NodeStatusVec: vector of NodeStatus; - ## Return value for request-response API event pairs + ## Return value for request-response API event pairs. Some responses + ## contain one, others multiple of these. The request ID allows clients + ## to string requests and responses together. Agents and the controller + ## fill in the instance and node fields whenever there's sufficient + ## context to define them. Any result produced by an agent will carry an + ## instance value, for example.
type Result: record { reqid: string; ##< Request ID of operation this result refers to - instance: string &default=""; ##< Name of associated instance (for context) success: bool &default=T; ##< True if successful + instance: string &optional; ##< Name of associated instance (for context) data: any &optional; ##< Addl data returned for successful operation - error: string &default=""; ##< Descriptive error on failure + error: string &optional; ##< Descriptive error on failure node: string &optional; ##< Name of associated node (for context) }; @@ -127,7 +132,7 @@ function result_to_string(res: Result): string if ( res$success ) result = "success"; - else if ( res$error != "" ) + else if ( res?$error ) result = fmt("error (%s)", res$error); else result = "error"; @@ -136,7 +141,7 @@ function result_to_string(res: Result): string if ( res$reqid != "" ) details[|details|] = fmt("reqid %s", res$reqid); - if ( res$instance != "" ) + if ( res?$instance ) details[|details|] = fmt("instance %s", res$instance); if ( res?$node && res$node != "" ) details[|details|] = fmt("node %s", res$node); From 0480b5f39c616cfb5980f33878a4ae0c318a86cd Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Wed, 15 Jun 2022 13:49:47 -0700 Subject: [PATCH 03/21] Management framework: rename agent "set_configuration" to "deploy" This renames the agent's functionality for setting a configuration to reflect the controller's upcoming separation of set_configuration and deployment. --- .../frameworks/management/agent/api.zeek | 21 +++---- .../frameworks/management/agent/main.zeek | 61 +++++++++---------- .../management/controller/main.zeek | 10 +-- 3 files changed, 45 insertions(+), 47 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/api.zeek b/scripts/policy/frameworks/management/agent/api.zeek index ba9c0b22bd..55c8b2a563 100644 --- a/scripts/policy/frameworks/management/agent/api.zeek +++ b/scripts/policy/frameworks/management/agent/api.zeek @@ -15,29 +15,28 @@ export { # Agent API events - ## The controller sends this event to convey a new cluster configuration - ## to the agent. Once processed, the agent responds with the response - ## event. + ## The controller sends this event to deploy a cluster configuration to + ## this instance. Once processed, the agent responds with a + ## :zeek:see:`Management::Agent::API::deploy_response` event. ## ## reqid: a request identifier string, echoed in the response event. ## - ## config: a :zeek:see:`Management::Configuration` record - ## describing the cluster topology. Note that this contains the full - ## topology, not just the part pertaining to this agent. That's because - ## the cluster framework requires full cluster visibility to establish - ## the needed peerings. + ## config: a :zeek:see:`Management::Configuration` record describing the + ## cluster topology. This contains the full topology, not just the + ## part pertaining to this instance: the cluster framework requires + ## full cluster visibility to establish needed peerings. ## - global set_configuration_request: event(reqid: string, + global deploy_request: event(reqid: string, config: Management::Configuration); ## Response to a deploy_request event. The agent sends ## this back to the controller. ## ## reqid: the request identifier used in the request event. ## ## result: the result record.
## - global set_configuration_response: event(reqid: string, + global deploy_response: event(reqid: string, result: Management::ResultVec); diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 17b14e9c4d..8ee4a9b320 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -26,8 +26,8 @@ export { node: string; ##< Name of the node the Supervisor is acting on. }; - ## Request state for set_configuration requests. - type SetConfigurationState: record { + ## Request state for deploy requests. + type DeployState: record { ## Zeek cluster nodes the provided configuration requested ## and which have not yet checked in with the agent. nodes_pending: set[string]; @@ -62,7 +62,7 @@ export { # members with _agent to disambiguate. redef record Management::Request::Request += { supervisor_state_agent: SupervisorState &optional; - set_configuration_state_agent: SetConfigurationState &optional; + deploy_state_agent: DeployState &optional; node_dispatch_state_agent: NodeDispatchState &optional; }; @@ -80,11 +80,11 @@ redef Management::Request::timeout_interval = 5 sec; # Returns the effective agent topic for this agent. global agent_topic: function(): string; -# Finalizes a set_configuration_request transaction: cleans up remaining state +# Finalizes a deploy_request transaction: cleans up remaining state # and sends response event. -global send_set_configuration_response: function(req: Management::Request::Request); +global send_deploy_response: function(req: Management::Request::Request); -# The global configuration as passed to us by the controller +# The global configuration, as deployed by the controller. global g_config: Management::Configuration; # A map to make other instance info accessible @@ -93,10 +93,9 @@ global g_instances: table[string] of Management::Instance; # A map for the nodes we run on this instance, via this agent. global g_nodes: table[string] of Management::Node; -# The request ID of the most recent configuration update from the controller. -# We track it here until the nodes_pending set in the corresponding request's -# SetConfigurationState is cleared out, or the corresponding request state hits -# a timeout. +# The request ID of the most recent config deployment from the controller. We +# track it until the nodes_pending set in the corresponding request's +# DeployState is cleared out, or the corresponding request state hits a timeout. global g_config_reqid_pending: string = ""; # The complete node map employed by the supervisor to describe the cluster @@ -115,7 +114,7 @@ function agent_topic(): string return Management::Agent::topic_prefix + "/" + epi$id; } -function send_set_configuration_response(req: Management::Request::Request) +function send_deploy_response(req: Management::Request::Request) { local node: string; local res: Management::Result; @@ -128,7 +127,7 @@ function send_set_configuration_response(req: Management::Request::Request) $instance = Management::Agent::get_name(), $node = node); - if ( node in req$set_configuration_state_agent$nodes_pending ) + if ( node in req$deploy_state_agent$nodes_pending ) { # This node failed. 
res$success = F; @@ -142,10 +141,10 @@ function send_set_configuration_response(req: Management::Request::Request) req$results[|req$results|] = res; } - Management::Log::info(fmt("tx Management::Agent::API::set_configuration_response %s", + Management::Log::info(fmt("tx Management::Agent::API::deploy_response %s", Management::result_to_string(res))); Broker::publish(agent_topic(), - Management::Agent::API::set_configuration_response, req$id, req$results); + Management::Agent::API::deploy_response, req$id, req$results); Management::Request::finish(req$id); @@ -263,14 +262,13 @@ function supervisor_destroy(node: string) Management::Log::info(fmt("issued supervisor destroy for %s, %s", node, req$id)); } -event Management::Agent::API::set_configuration_request(reqid: string, config: Management::Configuration) +event Management::Agent::API::deploy_request(reqid: string, config: Management::Configuration) { - Management::Log::info(fmt("rx Management::Agent::API::set_configuration_request %s", reqid)); + Management::Log::info(fmt("rx Management::Agent::API::deploy_request %s", reqid)); local nodename: string; local node: Management::Node; local nc: Supervisor::NodeConfig; - local msg: string; # Adopt the global configuration provided. The act of trying to launch # the requested nodes perturbs any existing ones one way or another, so @@ -299,15 +297,15 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M $reqid = reqid, $instance = Management::Agent::get_name()); - Management::Log::info(fmt("tx Management::Agent::API::set_configuration_response %s", + Management::Log::info(fmt("tx Management::Agent::API::deploy_response %s", Management::result_to_string(res))); Broker::publish(agent_topic(), - Management::Agent::API::set_configuration_response, reqid, vector(res)); + Management::Agent::API::deploy_response, reqid, vector(res)); return; } local req = Management::Request::create(reqid); - req$set_configuration_state_agent = SetConfigurationState(); + req$deploy_state_agent = DeployState(); # Establish this request as the pending one: g_config_reqid_pending = reqid; @@ -318,7 +316,7 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M if ( node$instance == Management::Agent::get_name() ) { g_nodes[node$name] = node; - add req$set_configuration_state_agent$nodes_pending[node$name]; + add req$deploy_state_agent$nodes_pending[node$name]; } # The cluster and supervisor frameworks require a port for every @@ -399,7 +397,8 @@ event Management::Agent::API::set_configuration_request(reqid: string, config: M # At this point we await Management::Node::API::notify_node_hello events # from the new nodes, or a timeout, whichever happens first. These - # trigger the set_configuration_response event back to the controller. + # update the pending nodes in the request state, and eventually trigger + # the deploy_response event back to the controller. } event SupervisorControl::status_response(reqid: string, result: Supervisor::Status) @@ -692,7 +691,7 @@ event Management::Agent::API::agent_standby_request(reqid: string) # peered/connected -- otherwise there's nothing we can do here via # Broker anyway), mainly to keep open the possibility of running # cluster nodes again later. 
- event Management::Agent::API::set_configuration_request("", Management::Configuration()); + event Management::Agent::API::deploy_request("", Management::Configuration()); local res = Management::Result( $reqid = reqid, @@ -712,20 +711,20 @@ event Management::Node::API::notify_node_hello(node: string) if ( node in g_nodes ) g_nodes[node]$state = Management::RUNNING; - # Look up the set_configuration request this node launch was part of (if + # Look up the deploy request this node launch was part of (if # any), and check it off. If it was the last node we expected to launch, # finalize the request and respond to the controller. local req = Management::Request::lookup(g_config_reqid_pending); - if ( Management::Request::is_null(req) || ! req?$set_configuration_state_agent ) + if ( Management::Request::is_null(req) || ! req?$deploy_state_agent ) return; - if ( node in req$set_configuration_state_agent$nodes_pending ) + if ( node in req$deploy_state_agent$nodes_pending ) { - delete req$set_configuration_state_agent$nodes_pending[node]; - if ( |req$set_configuration_state_agent$nodes_pending| == 0 ) - send_set_configuration_response(req); + delete req$deploy_state_agent$nodes_pending[node]; + if ( |req$deploy_state_agent$nodes_pending| == 0 ) + send_deploy_response(req); } } @@ -736,9 +735,9 @@ event Management::Request::request_expired(req: Management::Request::Request) $success = F, $error = "request timed out"); - if ( req?$set_configuration_state_agent ) + if ( req?$deploy_state_agent ) { - send_set_configuration_response(req); + send_deploy_response(req); # This timeout means we no longer have a pending request. g_config_reqid_pending = ""; } diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index b9edef3fed..5487e85606 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -172,8 +172,8 @@ function send_config_to_agents(req: Management::Request::Request, config: Manage # We could also broadcast just once on the agent prefix, but # explicit request/response pairs for each agent seems cleaner. 
- Management::Log::info(fmt("tx Management::Agent::API::set_configuration_request %s to %s", areq$id, name)); - Broker::publish(agent_topic, Management::Agent::API::set_configuration_request, areq$id, config); + Management::Log::info(fmt("tx Management::Agent::API::deploy_request %s to %s", areq$id, name)); + Broker::publish(agent_topic, Management::Agent::API::deploy_request, areq$id, config); } } @@ -685,9 +685,9 @@ event Management::Agent::API::notify_log(instance: string, msg: string, node: st # XXX TODO } -event Management::Agent::API::set_configuration_response(reqid: string, results: Management::ResultVec) +event Management::Agent::API::deploy_response(reqid: string, results: Management::ResultVec) { - Management::Log::info(fmt("rx Management::Agent::API::set_configuration_response %s", reqid)); + Management::Log::info(fmt("rx Management::Agent::API::deploy_response %s", reqid)); # Retrieve state for the request we just got a response to local areq = Management::Request::lookup(reqid); @@ -722,7 +722,7 @@ event Management::Agent::API::set_configuration_response(reqid: string, results: if ( |req$set_configuration_state$requests| > 0 ) return; - # All set_configuration requests to instances are done, so adopt the + # All deploy requests to instances are done, so adopt the # client's requested configuration as the new one and respond back to # client. g_config_current = req$set_configuration_state$config; From 77556e9f1183c746296e45ced807bad090a9bdd7 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Wed, 15 Jun 2022 13:53:40 -0700 Subject: [PATCH 04/21] Management framework: introduce deployment API in controller This separates uploading a configuration from deploying it to the instances into separate event transactions. set_configuration_request/response remains, but now only conducts validation and storage of the new configuration (upon validation success, and not yet persisted to disk). The response event indicates success or the list of validation errors. Successful upload now returns the configuration's ID in the result record's data struct. The new deploy_request/response event takes a previously uploaded configuration and deploys it to the agents. The controller now tracks uploaded and deployed configurations separately. Uploading assigns g_config_staged; deployment assigns g_config_deployed. Deployment does not affect g_config_staged. The get_config_request/response event pair now allows selecting the configuration the caller would like to retrieve. --- .../frameworks/management/controller/api.zeek | 47 ++- .../management/controller/main.zeek | 283 +++++++++++------- 2 files changed, 211 insertions(+), 119 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/api.zeek b/scripts/policy/frameworks/management/controller/api.zeek index 3235de8598..42b870b4e5 100644 --- a/scripts/policy/frameworks/management/controller/api.zeek +++ b/scripts/policy/frameworks/management/controller/api.zeek @@ -32,10 +32,12 @@ export { result: Management::Result); - ## The client sends this event to establish a new cluster configuration, - ## including the full cluster topology. The controller processes the update - ## and relays it to the agents. Once each has responded (or a timeout occurs) - ## the controller sends a corresponding response event back to the client. + ## The client sends this event to upload a new cluster configuration, + ## including the full cluster topology. The controller validates the + ## configuration and indicates the outcome in its response event. 
No + ## deployment takes place yet, and existing deployed configurations and + ## clusters remain intact. To trigger deployment of an uploaded + ## configuration, use :zeek:see:`Management::Controller::API::deploy_request`. ## ## reqid: a request identifier string, echoed in the response event. ## @@ -50,8 +52,10 @@ export { ## ## reqid: the request identifier used in the request event. ## - ## result: a vector of :zeek:see:`Management::Result` records. - ## Each member captures one agent's response. + ## result: a :zeek:see:`Management::Result` vector, indicating whether + ## the controller accepts the configuration. On success, the vector + ## holds a single result record saying so. Otherwise, it consists of + ## error results, each describing one configuration validation error. ## global set_configuration_response: event(reqid: string, result: Management::ResultVec); @@ -62,7 +66,10 @@ export { ## ## reqid: a request identifier string, echoed in the response event. ## - global get_configuration_request: event(reqid: string); + ## deployed: when true, returns the deployed configuration (if any), + ## otherwise the staged one (if any). + ## + global get_configuration_request: event(reqid: string, deployed: bool); ## Response to a get_configuration_request event. The controller sends ## this back to the client. @@ -78,6 +85,32 @@ export { result: Management::Result); + ## The client sends this event to trigger deployment of a previously + ## uploaded configuration. The controller deploys the uploaded + ## configuration to all agents involved in running the former + ## configuration or the new one. The agents terminate any previously + ## running cluster nodes and (re-)launch those defined in the new + ## configuration. Once each agent has responded (or a timeout occurs), + ## the controller sends a response event back to the client. + ## + ## reqid: a request identifier string, echoed in the response event. + ## + global deploy_request: event(reqid: string); + + ## Response to a deploy_request event. The controller sends this + ## back to the client. + ## + ## reqid: the request identifier used in the request event. + ## + ## result: a vector of :zeek:see:`Management::Result` records. + ## Each member captures the result of launching one cluster + ## node listed in the configuration, or an agent-wide error + ## when the result does not pertain to a particular node. + ## + global deploy_response: event(reqid: string, + result: Management::ResultVec); + + ## The client sends this event to request a list of ## :zeek:see:`Management::NodeStatus` records that capture ## the status of Supervisor-managed nodes running on the cluster's diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 5487e85606..4d9d11705d 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -19,10 +19,10 @@ module Management::Controller::Runtime; # tucked-on types. export { ## Request state specific to - ## :zeek:see:`Management::Controller::API::set_configuration_request` and - ## :zeek:see:`Management::Controller::API::set_configuration_response`. - type SetConfigurationState: record { - ## The cluster configuration established with this request + ## :zeek:see:`Management::Controller::API::deploy_request` and + ## :zeek:see:`Management::Controller::API::deploy_response`. + type DeployState: record { + ## The cluster configuration the controller is deploying.
config: Management::Configuration; ## Request state for every controller/agent transaction. requests: set[string] &default=set(); }; @@ -64,7 +64,7 @@ export { } redef record Management::Request::Request += { - set_configuration_state: SetConfigurationState &optional; + deploy_state: DeployState &optional; get_nodes_state: GetNodesState &optional; node_dispatch_state: NodeDispatchState &optional; test_state: TestState &optional; @@ -93,6 +93,12 @@ global drop_instance: function(inst: Management::Instance); global null_config: function(): Management::Configuration; global config_is_null: function(config: Management::Configuration): bool; +# Sends the given configuration to all g_instances members via a +# Management::Agent::API::deploy_request event. To track responses, it builds up +# deployment state in the given request for each recipient. +global config_deploy_to_agents: function(config: Management::Configuration, + req: Management::Request::Request); + # Returns list of names of nodes in the given configuration that require a # listening port. Returns empty list if the config has no such nodes. global config_nodes_lacking_ports: function(config: Management::Configuration): vector of string; @@ -110,11 +116,16 @@ global config_assign_ports: function(config: Management::Configuration); global config_validate: function(config: Management::Configuration, req: Management::Request::Request): bool; -# Rejects the given configuration with the given error message. The function -# adds a non-success result record to the given request and send the -# set_configuration_response event back to the client. It does not call finish() -# on the request. -global send_set_configuration_response_error: function(req: Management::Request::Request, error: string); +# Returns the set of node names in the given configuration that overlaps with +# the set provided. +global config_filter_nodes_by_name: function(config: Management::Configuration, + nodes: set[string]): set[string]; + +# Fails the given deployment request with the given error message. The function +# adds a non-success result record to the request and sends a deploy_response +# event back to the client. It does not call finish() on the request. +global send_deploy_response_error: function( + req: Management::Request::Request, error: string); # Given a Broker ID, this returns the endpoint info associated with it. # On error, returns a dummy record with an empty ID string. @@ -145,16 +156,22 @@ global g_instances_known: table[string] of Management::Instance = table(); # instance, and store that in g_instances.) global g_instances_ready: set[string] = set(); -# The request ID of the most recent configuration update that's come in from -# a client. We track it here until we know we are ready to communicate with all -# agents required by the update. +# The request ID of the most recent deployment request from a client. We track +# it here until we know we are ready to communicate with all agents required for +# the update. global g_config_reqid_pending: string = ""; -# The most recent configuration we have successfully deployed. This is also -# the one we send whenever the client requests it. -global g_config_current: Management::Configuration; +# The most recent configuration we have successfully deployed. When no +# deployment has happened yet, this is a "null config" as per null_config().
+global g_config_deployed: Management::Configuration; -function send_config_to_agents(req: Management::Request::Request, config: Management::Configuration) +# The configuration most recently provided by the client. When the controller +# hasn't yet received a configuration, this is a "null config" as per +# null_config(). Successful deployment doesn't invalidate or clear this +# configuration; it remains set. +global g_config_staged: Management::Configuration; + +function config_deploy_to_agents(config: Management::Configuration, req: Management::Request::Request) { for ( name in g_instances ) { @@ -168,7 +185,7 @@ function send_config_to_agents(req: Management::Request::Request, config: Manage # We track the requests sent off to each agent. As the # responses come in, we delete them. Once the requests # set is empty, we respond back to the client. - add req$set_configuration_state$requests[areq$id]; + add req$deploy_state$requests[areq$id]; # We could also broadcast just once on the agent prefix, but # explicit request/response pairs for each agent seems cleaner. @@ -244,6 +261,11 @@ function null_config(): Management::Configuration return Management::Configuration($id=""); } +function config_is_null(config: Management::Configuration): bool + { + return config$id == ""; + } + function config_nodes_lacking_ports(config: Management::Configuration): vector of string { local res: vector of string; @@ -491,6 +513,18 @@ function config_validate(config: Management::Configuration, return F; } +function config_filter_nodes_by_name(config: Management::Configuration, nodes: set[string]) + : set[string] + { + local res: set[string]; + local cluster_nodes: set[string]; + + for ( node in config$nodes ) + add cluster_nodes[node$name]; + + return nodes & cluster_nodes; + } + function find_endpoint(id: string): Broker::EndpointInfo { local peers = Broker::peers(); @@ -505,11 +539,6 @@ function find_endpoint(id: string): Broker::EndpointInfo return Broker::EndpointInfo($id=""); } -function config_is_null(config: Management::Configuration): bool - { - return config$id == ""; - } - function is_instance_connectivity_change(inst: Management::Instance): bool { # If we're not tracking this instance as part of a cluster config, it's @@ -536,18 +565,7 @@ function is_instance_connectivity_change(inst: Management::Instance): bool return F; } -function filter_config_nodes_by_name(nodes: set[string]): set[string] - { - local res: set[string]; - local cluster_nodes: set[string]; - - for ( node in g_config_current$nodes ) - add cluster_nodes[node$name]; - - return nodes & cluster_nodes; - } - -function send_set_configuration_response_error(req: Management::Request::Request, error: string) +function send_deploy_response_error(req: Management::Request::Request, error: string) { local res = Management::Result($reqid=req$id); @@ -555,8 +573,10 @@ function send_set_configuration_response_error(req: Management::Request::Request res$error = error; req$results += res; + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", + Management::Request::to_string(req))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Controller::API::deploy_response, req$id, req$results); } event Management::Controller::API::notify_agents_ready(instances: set[string]) { @@ -570,14 +590,14 @@ event Management::Controller::API::notify_agents_ready(instances: set[string]) # If there's no pending request, if it's no longer available, or if
it # doesn't have config state, don't do anything else. - if ( Management::Request::is_null(req) || ! req?$set_configuration_state ) + if ( Management::Request::is_null(req) || ! req?$deploy_state ) return; # All instances requested in the pending configuration update are now # known to us. Send them the config. As they send their response events # we update the client's request state and eventually send the response # event to the it. - send_config_to_agents(req, req$set_configuration_state$config); + config_deploy_to_agents(req$deploy_state$config, req); } event Management::Agent::API::notify_agent_hello(instance: string, id: string, connecting: bool, api_version: count) @@ -714,24 +734,26 @@ event Management::Agent::API::deploy_response(reqid: string, results: Management # Mark this request as done by removing it from the table of pending # ones. The following if-check should always be true. - if ( areq$id in req$set_configuration_state$requests ) - delete req$set_configuration_state$requests[areq$id]; + if ( areq$id in req$deploy_state$requests ) + delete req$deploy_state$requests[areq$id]; # If there are any pending requests to the agents, we're # done: we respond once every agent has responed (or we time out). - if ( |req$set_configuration_state$requests| > 0 ) + if ( |req$deploy_state$requests| > 0 ) return; - # All deploy requests to instances are done, so adopt the - # client's requested configuration as the new one and respond back to - # client. - g_config_current = req$set_configuration_state$config; + # All deploy requests to instances are done, so adopt the config + # as the new deployed one and respond back to client. + g_config_deployed = req$deploy_state$config; g_config_reqid_pending = ""; - Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", + local res = Management::Result($reqid=req$id, $data=g_config_deployed$id); + req$results += res; + + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", Management::Request::to_string(req))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Controller::API::deploy_response, req$id, req$results); Management::Request::finish(req$id); } @@ -739,24 +761,9 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf { Management::Log::info(fmt("rx Management::Controller::API::set_configuration_request %s", reqid)); - local res: Management::Result; local req = Management::Request::create(reqid); + local res = Management::Result($reqid=req$id); - req$set_configuration_state = SetConfigurationState($config = config); - - # At the moment there can only be one pending request. - if ( g_config_reqid_pending != "" ) - { - send_set_configuration_response_error(req, - fmt("request %s still pending", g_config_reqid_pending)); - - Management::Request::finish(req$id); - Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", - Management::Request::to_string(req))); - return; - } - - # If the config has problems, reject it: if ( ! 
config_validate(config, req) ) { Management::Request::finish(req$id); @@ -776,26 +783,99 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf if ( |nodes| > 0 ) { local nodes_str = join_string_vec(nodes, ", "); - send_set_configuration_response_error(req, - fmt("port auto-assignment disabled but nodes %s lack ports", nodes_str)); - Management::Request::finish(req$id); + res$success = F; + res$error = fmt("port auto-assignment disabled but nodes %s lack ports", nodes_str); + req$results += res; + Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Request::finish(req$id); return; } } - # The incoming request is now the pending one. It gets cleared when all - # agents have processed their config updates successfully, or their - # responses time out. + g_config_staged = config; + + # We return the ID of the new configuration in the response. + res$data = config$id; + req$results += res; + + Management::Log::info(fmt( + "tx Management::Controller::API::set_configuration_response %s", + Management::result_to_string(res))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::set_configuration_response, reqid, req$results); + Management::Request::finish(req$id); + } + +event Management::Controller::API::get_configuration_request(reqid: string, deployed: bool) + { + Management::Log::info(fmt("rx Management::Controller::API::get_configuration_request %s", reqid)); + + local res = Management::Result($reqid=reqid); + local config: Management::Configuration; + + if ( deployed ) + config = g_config_deployed; + else + config = g_config_staged; + + if ( config_is_null(config) ) + { + res$success = F; + res$error = fmt("no %s configuration available", + deployed ? "deployed" : "staged"); + } + else + { + res$data = config; + } + + Management::Log::info(fmt( + "tx Management::Controller::API::get_configuration_response %s", + Management::result_to_string(res))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::get_configuration_response, reqid, res); + } + +event Management::Controller::API::deploy_request(reqid: string) + { + Management::Log::info(fmt("rx Management::Controller::API::deploy_request %s", reqid)); + + local req = Management::Request::create(reqid); + + # If there's no staged configuration, there's nothing to deploy. + if ( config_is_null(g_config_staged) ) + { + send_deploy_response_error(req, "no configuration available to deploy"); + Management::Request::finish(req$id); + return; + } + + # At the moment there can only be one pending deployment. + if ( g_config_reqid_pending != "" ) + { + send_deploy_response_error(req, + fmt("earlier deployment %s still pending", g_config_reqid_pending)); + Management::Request::finish(req$id); + return; + } + + req$deploy_state = DeployState($config = g_config_staged); + + # This deployment is now pending. It clears when all agents have + # processed their config updates successfully, or any responses time + # out. g_config_reqid_pending = req$id; # Compare the instance configuration to our current one. If it matches, - # we can proceed to deploying the new cluster topology. If it does - # not, we need to establish connectivity with agents we connect to, or - # wait until all instances that connect to us have done so. 
Either triggers - # a notify_agents_ready event, upon which we then deploy the topology. + # we can proceed to deploying the new cluster topology. If it does not, + # we need to establish connectivity with agents we connect to, or wait + # until all instances that connect to us have done so. Either case + # triggers a notify_agents_ready event, upon which we deploy the config. # The current & new set of instance names. local insts_current: set[string]; @@ -823,7 +903,7 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf for ( inst_name in g_instances ) add insts_current[inst_name]; - for ( inst in config$instances ) + for ( inst in g_config_staged$instances ) add insts_new[inst$name]; # Populate TODO lists for instances we need to drop, check, or add. @@ -831,7 +911,7 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf insts_to_add = insts_new - insts_current; insts_to_keep = insts_new & insts_current; - for ( inst in config$instances ) + for ( inst in g_config_staged$instances ) { if ( inst$name in insts_to_add ) { @@ -869,16 +949,21 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf # Updates to instance tables are complete. As a corner case, if the # config contained no instances (and thus no nodes), we're now done - # since there are no agent interactions to wait for: + # since there are no agent interactions to wait for (any agents that + # need to tear down nodes are doing so asynchronously as part of the + # drop_instance() calls above). if ( |insts_new| == 0 ) { - g_config_current = req$set_configuration_state$config; + g_config_deployed = req$deploy_state$config; g_config_reqid_pending = ""; - Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", + local res = Management::Result($reqid=req$id, $data=g_config_deployed$id); + req$results += res; + + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", Management::Request::to_string(req))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Controller::API::deploy_response, req$id, req$results); Management::Request::finish(req$id); return; } @@ -890,30 +975,6 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf check_instances_ready(); } -event Management::Controller::API::get_configuration_request(reqid: string) - { - Management::Log::info(fmt("rx Management::Controller::API::get_configuration_request %s", reqid)); - - local res = Management::Result($reqid=reqid); - - if ( config_is_null(g_config_current) ) - { - # We don't have a live configuration yet. - res$success = F; - res$error = "no configuration deployed"; - } - else - { - res$data = g_config_current; - } - - Management::Log::info(fmt( - "tx Management::Controller::API::get_configuration_response %s", - Management::result_to_string(res))); - Broker::publish(Management::Controller::topic, - Management::Controller::API::get_configuration_response, reqid, res); - } - event Management::Controller::API::get_instances_request(reqid: string) { Management::Log::info(fmt("rx Management::Controller::API::get_instances_request %s", reqid)); @@ -1110,8 +1171,8 @@ event Management::Controller::API::get_id_value_request(reqid: string, id: strin # don't propagate them to the agents. 
if ( |nodes| > 0 ) { - # Requested nodes that are in the current configuration: - nodes_final = filter_config_nodes_by_name(nodes); + # Requested nodes that are in the deployed configuration: + nodes_final = config_filter_nodes_by_name(g_config_deployed, nodes); # Requested nodes that are not in current configuration: local nodes_invalid = nodes - nodes_final; @@ -1170,16 +1231,16 @@ event Management::Request::request_expired(req: Management::Request::Request) $success = F, $error = "request timed out"); - if ( req?$set_configuration_state ) + if ( req?$deploy_state ) { # This timeout means we no longer have a pending request. g_config_reqid_pending = ""; req$results += res; - Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", Management::Request::to_string(req))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Controller::API::deploy_response, req$id, req$results); } if ( req?$get_nodes_state ) @@ -1241,10 +1302,8 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) event zeek_init() { - # Initialize null config at startup. We will replace it once we have - # persistence, and again whenever we complete a client's - # set_configuration request. - g_config_current = null_config(); + g_config_deployed = null_config(); + g_config_staged = null_config(); # The controller always listens: it needs to be able to respond to # clients connecting to it, as well as agents if they connect to the From 46db4a0e7159e0124f1fa5023619a30a3179e670 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 16 Jun 2022 10:48:57 -0700 Subject: [PATCH 05/21] Management framework: introduce state machine for configs and persist them The controller now knows three states that a cluster configuration can be in: - STAGED: as uploaded by the client - READY: with needed tweaks applied, e.g. to fill in ports - DEPLOYED: as sent off to agents for deployment These states aren't exclusive; they represent checkpoints that a config goes through from upload through deployment. A deployed configuration will also exist in its STAGED and READY versions, unless a client has uploaded a new configuration, which will overwrite the STAGED and READY ones. The controller saves all of these in a table, which lets us use Broker to persist all states to disk. We use &broker_allow_complex_type, since we only ever store entire configurations. --- .../management/controller/config.zeek | 4 + .../management/controller/main.zeek | 86 ++++++++----------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/config.zeek b/scripts/policy/frameworks/management/controller/config.zeek index 01b8445a2b..eb1e0ca2d6 100644 --- a/scripts/policy/frameworks/management/controller/config.zeek +++ b/scripts/policy/frameworks/management/controller/config.zeek @@ -59,6 +59,10 @@ export { ## output gets garbled. const directory = "" &redef; + ## The name of the Broker store the controller uses to persist internal + ## state to disk. + const store_name = "controller"; + ## Returns the effective name of the controller.
global get_name: function(): string; diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 4d9d11705d..48e48ca2fd 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -18,6 +18,14 @@ module Management::Controller::Runtime; # Request record below. Without it, it fails to establish link targets for the # tucked-on types. export { + ## A cluster configuration uploaded by the client goes through multiple + ## states on its way to deployment. + type ConfigState: enum { + STAGED, ##< As provided by the client. + READY, ##< Necessary updates made, e.g. ports filled in. + DEPLOYED, ##< Sent off to the agents for deployment. + }; + ## Request state specific to ## :zeek:see:`Management::Controller::API::deploy_request` and ## :zeek:see:`Management::Controller::API::deploy_response`. @@ -89,10 +97,6 @@ global add_instance: function(inst: Management::Instance); # agent_standby_request, so it drops its current cluster nodes (if any). global drop_instance: function(inst: Management::Instance); -# Helpers to simplify handling of config records. -global null_config: function(): Management::Configuration; -global config_is_null: function(config: Management::Configuration): bool; - # Sends the given configuration to all g_instances members via a # Management::Agent::API::deploy_request event. To track responses, it builds up # deployment state in the given request for each recipient. @@ -161,15 +165,9 @@ global g_instances_ready: set[string] = set(); # the update. global g_config_reqid_pending: string = ""; -# The most recent configuration we have successfully deployed. When no -# deployment has happened yet, this is a "null config" as per null_config(). -global g_config_deployed: Management::Configuration; - -# The most recently provided configuration by the client. When the controller -# hasn't yet received a configuration, this is a "null config" as per -# null_config(). Successful deployment doesn't invalidate or clear this -# configuration, it remains set. -global g_config_staged: Management::Configuration; +# This table tracks a cluster configuration through its states to deployment. +global g_configs: table[ConfigState] of Management::Configuration + &broker_allow_complex_type &backend=Broker::SQLITE; function config_deploy_to_agents(config: Management::Configuration, req: Management::Request::Request) { @@ -256,16 +254,6 @@ function drop_instance(inst: Management::Instance) Management::Log::info(fmt("dropped instance %s", inst$name)); } -function null_config(): Management::Configuration - { - return Management::Configuration($id=""); - } - -function config_is_null(config: Management::Configuration): bool - { - return config$id == ""; - } - function config_nodes_lacking_ports(config: Management::Configuration): vector of string { local res: vector of string; @@ -744,10 +732,11 @@ event Management::Agent::API::deploy_response(reqid: string, results: Management # All deploy requests to instances are done, so adopt the config # as the new deployed one and respond back to client. 
- g_config_deployed = req$deploy_state$config; + local config = req$deploy_state$config; + g_configs[DEPLOYED] = config; g_config_reqid_pending = ""; - local res = Management::Result($reqid=req$id, $data=g_config_deployed$id); + local res = Management::Result($reqid=req$id, $data=config$id); req$results += res; Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", @@ -763,6 +752,7 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf local req = Management::Request::create(reqid); local res = Management::Result($reqid=req$id); + local config_copy: Management::Configuration; if ( ! config_validate(config, req) ) { @@ -774,9 +764,7 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf return; } - if ( Management::Controller::auto_assign_ports ) - config_assign_ports(config); - else + if ( ! Management::Controller::auto_assign_ports ) { local nodes = config_nodes_lacking_ports(config); @@ -797,7 +785,13 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf } } - g_config_staged = config; + g_configs[STAGED] = config; + config_copy = copy(config); + + if ( Management::Controller::auto_assign_ports ) + config_assign_ports(config_copy); + + g_configs[READY] = config_copy; # We return the ID of the new configuration in the response. res$data = config$id; @@ -816,22 +810,17 @@ event Management::Controller::API::get_configuration_request(reqid: string, depl Management::Log::info(fmt("rx Management::Controller::API::get_configuration_request %s", reqid)); local res = Management::Result($reqid=reqid); - local config: Management::Configuration; + local key = deployed ? DEPLOYED : STAGED; - if ( deployed ) - config = g_config_deployed; - else - config = g_config_staged; - - if ( config_is_null(config) ) + if ( key !in g_configs ) { + # Cut off enum namespacing prefix and turn rest lower-case, for readability: + res$error = fmt("no %s configuration available", to_lower(sub(cat(key), /.+::/, ""))); res$success = F; - res$error = fmt("no %s configuration available", - deployed ? "deployed" : "staged"); } else { - res$data = config; + res$data = g_configs[key]; } Management::Log::info(fmt( @@ -847,8 +836,7 @@ event Management::Controller::API::deploy_request(reqid: string) local req = Management::Request::create(reqid); - # If there's no staged configuration, there's nothing to deploy. - if ( config_is_null(g_config_staged) ) + if ( READY !in g_configs ) { send_deploy_response_error(req, "no configuration available to deploy"); Management::Request::finish(req$id); @@ -864,7 +852,7 @@ event Management::Controller::API::deploy_request(reqid: string) return; } - req$deploy_state = DeployState($config = g_config_staged); + req$deploy_state = DeployState($config = g_configs[READY]); # This deployment is now pending. It clears when all agents have # processed their config updates successfully, or any responses time @@ -903,7 +891,7 @@ event Management::Controller::API::deploy_request(reqid: string) for ( inst_name in g_instances ) add insts_current[inst_name]; - for ( inst in g_config_staged$instances ) + for ( inst in g_configs[READY]$instances ) add insts_new[inst$name]; # Populate TODO lists for instances we need to drop, check, or add. 
@@ -911,7 +899,7 @@ event Management::Controller::API::deploy_request(reqid: string) insts_to_add = insts_new - insts_current; insts_to_keep = insts_new & insts_current; - for ( inst in g_config_staged$instances ) + for ( inst in g_configs[READY]$instances ) { if ( inst$name in insts_to_add ) { @@ -954,10 +942,11 @@ event Management::Controller::API::deploy_request(reqid: string) # drop_instance() calls above). if ( |insts_new| == 0 ) { - g_config_deployed = req$deploy_state$config; + local config = req$deploy_state$config; + g_configs[DEPLOYED] = config; g_config_reqid_pending = ""; - local res = Management::Result($reqid=req$id, $data=g_config_deployed$id); + local res = Management::Result($reqid=req$id, $data=config$id); req$results += res; Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", @@ -1172,7 +1161,7 @@ event Management::Controller::API::get_id_value_request(reqid: string, id: strin if ( |nodes| > 0 ) { # Requested nodes that are in the deployed configuration: - nodes_final = config_filter_nodes_by_name(g_config_deployed, nodes); + nodes_final = config_filter_nodes_by_name(g_configs[DEPLOYED], nodes); # Requested nodes that are not in current configuration: local nodes_invalid = nodes - nodes_final; @@ -1302,9 +1291,6 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) event zeek_init() { - g_config_deployed = null_config(); - g_config_staged = null_config(); - # The controller always listens: it needs to be able to respond to # clients connecting to it, as well as agents if they connect to the # controller. The controller does not automatically connect to any From a68ee139399e73600410198738e93eb16189c51d Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 17 Jun 2022 13:45:39 -0700 Subject: [PATCH 06/21] Management framework: suppress notify_agent_hello upon Supervisor peering The agent's Broker::peer_added handler now recognizes the Supervisor and does not trigger a notify_agent_hello event upon it. It might still send such events repeatedly as other things peer with the agent. --- .../frameworks/management/agent/main.zeek | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 8ee4a9b320..bea2e53576 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -80,6 +80,9 @@ redef Management::Request::timeout_interval = 5 sec; # Returns the effective agent topic for this agent. global agent_topic: function(): string; +# Returns the effective supervisor's address and port, to peer with +global supervisor_network_info: function(): Broker::NetworkInfo; + # Finalizes a deploy_request transaction: cleans up remaining state # and sends response event. global send_deploy_response: function(req: Management::Request::Request); @@ -114,6 +117,19 @@ function agent_topic(): string return Management::Agent::topic_prefix + "/" + epi$id; } +function supervisor_network_info(): Broker::NetworkInfo + { + # The Supervisor's address defaults to Broker's default, which + # relies on ZEEK_DEFAULT_LISTEN_ADDR and so might just be "". Broker + # internally falls back to listening on any; we pick 127.0.0.1. 
+ local address = Broker::default_listen_address; + + if ( address == "" ) + address = "127.0.0.1"; + + return Broker::NetworkInfo($address=address, $bound_port=Broker::default_port); + } + function send_deploy_response(req: Management::Request::Request) { local node: string; @@ -745,10 +761,16 @@ event Management::Request::request_expired(req: Management::Request::Request) event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) { - # This does not (cannot?) immediately verify that the new peer - # is in fact a controller, so we might send this in vain. - # Controllers register the agent upon receipt of the event. + Management::Log::debug(fmt("broker peer %s added: %s", peer, msg)); + local sni = supervisor_network_info(); + + if ( peer$network$address == sni$address && peer$network$bound_port == sni$bound_port ) + return; + + # Supervisor aside, this does not (cannot?) immediately verify that the + # new peer is in fact a controller, so we might send this in vain. + # Controllers register the agent upon receipt of the event. local epi = Management::Agent::endpoint_info(); Broker::publish(agent_topic(), @@ -767,14 +789,9 @@ event zeek_init() local epi = Management::Agent::endpoint_info(); # The agent needs to peer with the supervisor -- this doesn't currently - # happen automatically. The address defaults to Broker's default, which - # relies on ZEEK_DEFAULT_LISTEN_ADDR and so might just be "". Broker - # internally falls back to listening on any; we pick 127.0.0.1. - local supervisor_addr = Broker::default_listen_address; - if ( supervisor_addr == "" ) - supervisor_addr = "127.0.0.1"; - - Broker::peer(supervisor_addr, Broker::default_port, Broker::default_listen_retry); + # happen automatically. + local sni = supervisor_network_info(); + Broker::peer(sni$address, sni$bound_port, Broker::default_listen_retry); # Agents need receive communication targeted at it, any responses # from the supervisor, and any responses from cluster nodes. From d367f1bad93195ea328d8db7e1b88a4adce218f3 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 17 Jun 2022 14:39:45 -0700 Subject: [PATCH 07/21] Management framework: agents now skip re-deployment of current config When an agent is already running the configuration it's asked to deploy, it will now recognize this and by default do nothing. The requester can force it if needed, via a new argument to the deploy_request event. --- .../frameworks/management/agent/api.zeek | 6 ++++- .../frameworks/management/agent/main.zeek | 22 ++++++++++++++++--- .../management/controller/main.zeek | 2 +- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/api.zeek b/scripts/policy/frameworks/management/agent/api.zeek index 55c8b2a563..857f79c232 100644 --- a/scripts/policy/frameworks/management/agent/api.zeek +++ b/scripts/policy/frameworks/management/agent/api.zeek @@ -26,8 +26,12 @@ export { ## part pertaining to this instance: the cluster framework requires ## full cluster visibility to establish needed peerings. ## + ## force: whether to re-deploy (i.e., restart its Zeek cluster nodes) + ## when the agent already runs this configuration. This relies on + ## the config ID to determine config equality. + ## global deploy_request: event(reqid: string, - config: Management::Configuration); + config: Management::Configuration, force: bool &default=F); ## Response to a deploy_request event. The agent sends ## this back to the controller. 
diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index bea2e53576..80629b3b31 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -278,13 +278,29 @@ function supervisor_destroy(node: string) Management::Log::info(fmt("issued supervisor destroy for %s, %s", node, req$id)); } -event Management::Agent::API::deploy_request(reqid: string, config: Management::Configuration) +event Management::Agent::API::deploy_request(reqid: string, config: Management::Configuration, force: bool) { - Management::Log::info(fmt("rx Management::Agent::API::deploy_request %s", reqid)); + Management::Log::info(fmt("rx Management::Agent::API::deploy_request %s %s", reqid, config$id)); local nodename: string; local node: Management::Node; local nc: Supervisor::NodeConfig; + local res: Management::Result; + + # Special case: we're already running this configuration. + if ( g_config$id == config$id && ! force ) + { + res = Management::Result( + $reqid = reqid, + $instance = Management::Agent::get_name()); + + Management::Log::info(fmt("already running config %s", config$id)); + Management::Log::info(fmt("tx Management::Agent::API::deploy_response %s", + Management::result_to_string(res))); + Broker::publish(agent_topic(), + Management::Agent::API::deploy_response, reqid, vector(res)); + return; + } # Adopt the global configuration provided. The act of trying to launch # the requested nodes perturbs any existing ones one way or another, so @@ -309,7 +325,7 @@ event Management::Agent::API::deploy_request(reqid: string, config: Management:: { g_config_reqid_pending = ""; - local res = Management::Result( + res = Management::Result( $reqid = reqid, $instance = Management::Agent::get_name()); diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 48e48ca2fd..77c8ea25cd 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -188,7 +188,7 @@ function config_deploy_to_agents(config: Management::Configuration, req: Managem # We could also broadcast just once on the agent prefix, but # explicit request/response pairs for each agent seems cleaner. 
Management::Log::info(fmt("tx Management::Agent::API::deploy_request %s to %s", areq$id, name)); - Broker::publish(agent_topic, Management::Agent::API::deploy_request, areq$id, config); + Broker::publish(agent_topic, Management::Agent::API::deploy_request, areq$id, config, F); } } From a2525e44ba7745c907261d2d19a39c2c321c3f29 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 17 Jun 2022 22:31:10 -0700 Subject: [PATCH 08/21] Management framework: add a helper for rendering result vectors to a string --- .../frameworks/management/controller/main.zeek | 3 ++- scripts/policy/frameworks/management/request.zeek | 10 +--------- scripts/policy/frameworks/management/types.zeek | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 77c8ea25cd..e951d3d35e 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -695,7 +695,8 @@ event Management::Agent::API::notify_log(instance: string, msg: string, node: st event Management::Agent::API::deploy_response(reqid: string, results: Management::ResultVec) { - Management::Log::info(fmt("rx Management::Agent::API::deploy_response %s", reqid)); + Management::Log::info(fmt("rx Management::Agent::API::deploy_response %s %s", + reqid, Management::result_vec_to_string(results))); # Retrieve state for the request we just got a response to local areq = Management::Request::lookup(reqid); diff --git a/scripts/policy/frameworks/management/request.zeek b/scripts/policy/frameworks/management/request.zeek index e291e02260..676c65f6bd 100644 --- a/scripts/policy/frameworks/management/request.zeek +++ b/scripts/policy/frameworks/management/request.zeek @@ -146,20 +146,12 @@ function is_null(request: Request): bool function to_string(request: Request): string { - local results: string_vec; - local res: Management::Result; local parent_id = ""; if ( request?$parent_id ) parent_id = fmt(" (via %s)", request$parent_id); - for ( idx in request$results ) - { - res = request$results[idx]; - results[|results|] = Management::result_to_string(res); - } - return fmt("[request %s%s %s, results: %s]", request$id, parent_id, request$finished ? "finished" : "pending", - join_string_vec(results, ", ")); + Management::result_vec_to_string(request$results)); } diff --git a/scripts/policy/frameworks/management/types.zeek b/scripts/policy/frameworks/management/types.zeek index 7632cc0247..f28706a313 100644 --- a/scripts/policy/frameworks/management/types.zeek +++ b/scripts/policy/frameworks/management/types.zeek @@ -124,6 +124,10 @@ export { ## Given a :zeek:see:`Management::Result` record, ## this function returns a string summarizing it. global result_to_string: function(res: Result): string; + + ## Given a vector of :zeek:see:`Management::Result` records, + ## this function returns a string summarizing them. 
+	global result_vec_to_string: function(res: ResultVec): string;
 }
 
 function result_to_string(res: Result): string
@@ -151,3 +155,13 @@ function result_to_string(res: Result): string
 
 	return result;
 	}
+
+function result_vec_to_string(res: ResultVec): string
+	{
+	local ret: vector of string;
+
+	for ( idx in res )
+		ret += result_to_string(res[idx]);
+
+	return join_string_vec(ret, ", ");
+	}

From 8bc142f73c4ce6b0218d5fa3897dc29b3edd0dc3 Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Sat, 18 Jun 2022 01:03:31 -0700
Subject: [PATCH 09/21] Management framework: add "finish" callback to requests

These callbacks are handy for stringing together codepaths separated by
event request/response transactions: when such a transaction completes,
the callback allows locating a parent request for the finished one, to
continue its processing.
---
 scripts/policy/frameworks/management/request.zeek | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/scripts/policy/frameworks/management/request.zeek b/scripts/policy/frameworks/management/request.zeek
index 676c65f6bd..f7a798ace6 100644
--- a/scripts/policy/frameworks/management/request.zeek
+++ b/scripts/policy/frameworks/management/request.zeek
@@ -32,6 +32,14 @@ export {
 		finished: bool &default=F;
 	};
 
+	# To allow a callback to refer to Requests, the Request type must
+	# exist. So redef to add it:
+	redef record Request += {
+		## A callback to invoke when this request is finished via
+		## :zeek:see:`Management::Request::finish`.
+		finish: function(req: Management::Request::Request) &optional;
+	};
+
 	## The timeout interval for request state. Such state (see the
 	## :zeek:see:`Management::Request` module) ties together request and
 	## response event pairs. A timeout causes cleanup of request state if
@@ -131,6 +139,9 @@ function finish(reqid: string): bool
 	local req = g_requests[reqid];
 	delete g_requests[reqid];
 
+	if ( req?$finish )
+		req$finish(req);
+
 	req$finished = T;
 
 	return T;

From 35ea5662231784055a32e9590251e0d4e21292f5 Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Sun, 19 Jun 2022 16:24:40 -0700
Subject: [PATCH 10/21] Management framework: rename "log_level" to "level"

"Management::Log::log_level" looks redundant.
---
 scripts/policy/frameworks/management/log.zeek | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/policy/frameworks/management/log.zeek b/scripts/policy/frameworks/management/log.zeek
index a6f8d37571..95798421a7 100644
--- a/scripts/policy/frameworks/management/log.zeek
+++ b/scripts/policy/frameworks/management/log.zeek
@@ -38,7 +38,7 @@ export {
 
 	## The log level in use for this node. This is the minimum
 	## log level required to produce output.
-	global log_level = INFO &redef;
+	global level = INFO &redef;
 
 	## A debug-level log message writer.
## @@ -84,7 +84,7 @@ global r2s: table[Management::Role] of string = { function debug(message: string) { - if ( enum_to_int(log_level) > enum_to_int(DEBUG) ) + if ( enum_to_int(level) > enum_to_int(DEBUG) ) return; local node = Supervisor::node(); @@ -94,7 +94,7 @@ function debug(message: string) function info(message: string) { - if ( enum_to_int(log_level) > enum_to_int(INFO) ) + if ( enum_to_int(level) > enum_to_int(INFO) ) return; local node = Supervisor::node(); @@ -104,7 +104,7 @@ function info(message: string) function warning(message: string) { - if ( enum_to_int(log_level) > enum_to_int(WARNING) ) + if ( enum_to_int(level) > enum_to_int(WARNING) ) return; local node = Supervisor::node(); @@ -114,7 +114,7 @@ function warning(message: string) function error(message: string) { - if ( enum_to_int(log_level) > enum_to_int(ERROR) ) + if ( enum_to_int(level) > enum_to_int(ERROR) ) return; local node = Supervisor::node(); From d7e88fc07976159401455473b97bcd794a3abdf5 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 16:55:55 -0700 Subject: [PATCH 11/21] Management framework: make helper function a local --- .../management/controller/main.zeek | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index e951d3d35e..8ad72803c7 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -125,12 +125,6 @@ global config_validate: function(config: Management::Configuration, global config_filter_nodes_by_name: function(config: Management::Configuration, nodes: set[string]): set[string]; -# Fails the given deployment request with the given error message. The function -# adds a non-success result record to the request and send a deploy_response -# event back to the client. It does not call finish() on the request. -global send_deploy_response_error: function( - req: Management::Request::Request, error: string); - # Given a Broker ID, this returns the endpoint info associated with it. # On error, returns a dummy record with an empty ID string. 
global find_endpoint: function(id: string): Broker::EndpointInfo; @@ -553,20 +547,6 @@ function is_instance_connectivity_change(inst: Management::Instance): bool return F; } -function send_deploy_response_error(req: Management::Request::Request, error: string) - { - local res = Management::Result($reqid=req$id); - - res$success = F; - res$error = error; - req$results += res; - - Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", - Management::Request::to_string(req))); - Broker::publish(Management::Controller::topic, - Management::Controller::API::deploy_response, req$id, req$results); - } - event Management::Controller::API::notify_agents_ready(instances: set[string]) { local insts = Management::Util::set_to_vector(instances); @@ -833,13 +813,24 @@ event Management::Controller::API::get_configuration_request(reqid: string, depl event Management::Controller::API::deploy_request(reqid: string) { + local send_error_response = function(req: Management::Request::Request, error: string) + { + local res = Management::Result($reqid=req$id, $success=F, $error=error); + req$results += res; + + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", + Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::deploy_response, req$id, req$results); + }; + Management::Log::info(fmt("rx Management::Controller::API::deploy_request %s", reqid)); local req = Management::Request::create(reqid); if ( READY !in g_configs ) { - send_deploy_response_error(req, "no configuration available to deploy"); + send_error_response(req, "no configuration available to deploy"); Management::Request::finish(req$id); return; } @@ -847,7 +838,7 @@ event Management::Controller::API::deploy_request(reqid: string) # At the moment there can only be one pending deployment. if ( g_config_reqid_pending != "" ) { - send_deploy_response_error(req, + send_error_response(req, fmt("earlier deployment %s still pending", g_config_reqid_pending)); Management::Request::finish(req$id); return; From 633535d8dae5eff6d2694de5ac32d69ff9ceb652 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 17:07:59 -0700 Subject: [PATCH 12/21] Management framework: tweak Supervisor event logging We now log Supervisor event interaction just like we do transmission/receipt of other Management framework events. 
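
For illustration, the tx/rx logging convention this adopts looks roughly
as follows; a minimal sketch, in which the Demo module, its ping/pong
events, and the topic string are placeholders rather than names from
this patch:

	@load policy/frameworks/management/log

	module Demo;

	export {
		global ping: event(reqid: string);
		global pong: event(reqid: string, result: string);
	}

	# Transmission side: log "tx", the full event name, and the
	# request ID immediately before publishing.
	function send_ping(reqid: string)
		{
		Management::Log::info(fmt("tx Demo::ping %s", reqid));
		Broker::publish("demo/topic", Demo::ping, reqid);
		}

	# Receipt side: log "rx" symmetrically, first thing in the handler.
	event Demo::pong(reqid: string, result: string)
		{
		Management::Log::info(fmt("rx Demo::pong %s %s", reqid, result));
		}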
--- scripts/policy/frameworks/management/agent/main.zeek | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 80629b3b31..2ab626723d 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -222,6 +222,8 @@ event Management::Supervisor::API::notify_node_exit(node: string, outputs: Manag event SupervisorControl::create_response(reqid: string, result: string) { + Management::Log::info(fmt("rx SupervisorControl::create_response %s %s", reqid, result)); + local req = Management::Request::lookup(reqid); if ( Management::Request::is_null(req) ) return; @@ -242,6 +244,8 @@ event SupervisorControl::create_response(reqid: string, result: string) event SupervisorControl::destroy_response(reqid: string, result: bool) { + Management::Log::info(fmt("rx SupervisorControl::destroy_response %s %s", reqid, result)); + local req = Management::Request::lookup(reqid); if ( Management::Request::is_null(req) ) return; @@ -264,18 +268,20 @@ function supervisor_create(nc: Supervisor::NodeConfig) { local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = nc$name); + + Management::Log::info(fmt("tx SupervisorControl::create_request %s %s", req$id, nc$name)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::create_request, req$id, nc); - Management::Log::info(fmt("issued supervisor create for %s, %s", nc$name, req$id)); } function supervisor_destroy(node: string) { local req = Management::Request::create(); req$supervisor_state_agent = SupervisorState($node = node); + + Management::Log::info(fmt("tx SupervisorControl::destroy_request %s %s", req$id, node)); Broker::publish(SupervisorControl::topic_prefix, SupervisorControl::destroy_request, req$id, node); - Management::Log::info(fmt("issued supervisor destroy for %s, %s", node, req$id)); } event Management::Agent::API::deploy_request(reqid: string, config: Management::Configuration, force: bool) @@ -435,7 +441,9 @@ event Management::Agent::API::deploy_request(reqid: string, config: Management:: event SupervisorControl::status_response(reqid: string, result: Supervisor::Status) { + Management::Log::info(fmt("rx SupervisorControl::status_response %s", reqid)); local req = Management::Request::lookup(reqid); + if ( Management::Request::is_null(req) ) return; From 7787d847396c735dff3b50c38dfa9d8e5530a67d Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 17:11:30 -0700 Subject: [PATCH 13/21] Management framework: track instances by their Broker IDs This allows us to handle loss of Broker peerings, updating instance state as we see instances go away. This also tweaks logging slightly to differentiate between an instance checking in for the first time, and checking in when the controller already knows it. --- .../management/controller/main.zeek | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 8ad72803c7..06ddcbdb39 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -154,6 +154,11 @@ global g_instances_known: table[string] of Management::Instance = table(); # instance, and store that in g_instances.) 
global g_instances_ready: set[string] = set(); +# A map from Broker ID values to instance names. When we lose a peering, this +# helps us understand whether it was an instance, and if so, update its state +# accordingly. +global g_instances_by_id: table[string] of string; + # The request ID of the most recent deployment request from a client. We track # it here until we know we are ready to communicate with all agents required for # the update. @@ -605,6 +610,12 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c if ( ei$id != "" && ei?$network ) { + if ( instance !in g_instances_known ) + Management::Log::debug(fmt("instance %s newly checked in", instance)); + else + Management::Log::debug(fmt("instance %s checked in again", instance)); + + g_instances_by_id[id] = instance; g_instances_known[instance] = Management::Instance( $name=instance, $host=to_addr(ei$network$address)); @@ -613,8 +624,6 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c # We connected to this agent, note down its port. g_instances_known[instance]$listen_port = ei$network$bound_port; } - - Management::Log::debug(fmt("instance %s now known to us", instance)); } if ( instance in g_instances && instance !in g_instances_ready ) @@ -1212,6 +1221,8 @@ event Management::Request::request_expired(req: Management::Request::Request) $success = F, $error = "request timed out"); + Management::Log::info(fmt("request %s timed out", req$id)); + if ( req?$deploy_state ) { # This timeout means we no longer have a pending request. @@ -1281,6 +1292,24 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) Management::Log::debug(fmt("broker peer %s added: %s", peer, msg)); } +event Broker::peer_lost(peer: Broker::EndpointInfo, msg: string) + { + Management::Log::debug(fmt("broker peer %s lost: %s", peer, msg)); + + if ( peer$id in g_instances_by_id ) + { + local instance = g_instances_by_id[peer$id]; + + if ( instance in g_instances_known ) + delete g_instances_known[instance]; + if ( instance in g_instances_ready ) + delete g_instances_ready[instance]; + + Management::Log::info(fmt("dropped state for instance %s", instance)); + delete g_instances_by_id[peer$id]; + } + } + event zeek_init() { # The controller always listens: it needs to be able to respond to From 3120fbc75e3b3017ebb1819745afc35fd92f83c7 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 17:34:12 -0700 Subject: [PATCH 14/21] Management framework: distinguish internally and externally requested deployments The controller's deployment request state now features a bit that indicates whether the deployment was requested by a client, or triggered internally. This affects logging and the transmission of deployment response events via Broker, which are skipped when the deployment is internal. This is in preparation of resilience features when the controller (re-)boots. --- .../management/controller/main.zeek | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 06ddcbdb39..9cf2417985 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -32,6 +32,9 @@ export { type DeployState: record { ## The cluster configuration the controller is deploying. 
config: Management::Configuration; + ## Whether this is a controller-internal deployment, or + ## triggered via a request by a remote peer/client. + is_internal: bool &default=F; ## Request state for every controller/agent transaction. requests: set[string] &default=set(); }; @@ -729,10 +732,14 @@ event Management::Agent::API::deploy_response(reqid: string, results: Management local res = Management::Result($reqid=req$id, $data=config$id); req$results += res; - Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", - Management::Request::to_string(req))); - Broker::publish(Management::Controller::topic, - Management::Controller::API::deploy_response, req$id, req$results); + if ( ! req$deploy_state$is_internal ) + { + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", + Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::deploy_response, req$id, req$results); + } + Management::Request::finish(req$id); } @@ -950,10 +957,14 @@ event Management::Controller::API::deploy_request(reqid: string) local res = Management::Result($reqid=req$id, $data=config$id); req$results += res; - Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", - Management::Request::to_string(req))); - Broker::publish(Management::Controller::topic, - Management::Controller::API::deploy_response, req$id, req$results); + if ( ! req$deploy_state$is_internal ) + { + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", + Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::deploy_response, req$id, req$results); + } + Management::Request::finish(req$id); return; } @@ -1229,10 +1240,13 @@ event Management::Request::request_expired(req: Management::Request::Request) g_config_reqid_pending = ""; req$results += res; - Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", - Management::Request::to_string(req))); - Broker::publish(Management::Controller::topic, - Management::Controller::API::deploy_response, req$id, req$results); + if ( ! req$deploy_state$is_internal ) + { + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", + Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::deploy_response, req$id, req$results); + } } if ( req?$get_nodes_state ) From c4862e7c5e5d3e14876c82138a61b86f61e161fd Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 17:42:36 -0700 Subject: [PATCH 15/21] Management framework: move most deployment handling to internal function The controller now runs most of a config deployment via an internal function, allowing it to be called from multiple places instead of just the deploy_request event handler. --- .../management/controller/main.zeek | 225 +++++++++--------- 1 file changed, 114 insertions(+), 111 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 9cf2417985..657039e785 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -555,6 +555,116 @@ function is_instance_connectivity_change(inst: Management::Instance): bool return F; } +function deploy(req: Management::Request::Request) + { + # This deployment is now pending. 
It clears when all agents have
+	# processed their config updates successfully, or any responses time
+	# out.
+	g_config_reqid_pending = req$id;
+
+	# Compare the instance configuration to our current one. If it matches,
+	# we can proceed to deploying the new cluster topology. If it does not,
+	# we need to establish connectivity with agents we connect to, or wait
+	# until all instances that connect to us have done so. Either case
+	# triggers a notify_agents_ready event, upon which we deploy the config.
+
+	# The current & new set of instance names.
+	local insts_current: set[string];
+	local insts_new: set[string];
+
+	# A set of current instances not contained in the new config.
+	# Those will need to get dropped.
+	local insts_to_drop: set[string];
+
+	# The opposite: new instances not yet in our current set. Those we will need
+	# to establish contact with (or they with us).
+	local insts_to_add: set[string];
+
+	# The overlap: instances in both the current and new set. For those we verify
+	# that we're actually dealing with the same entities, and might need to re-
+	# connect if not.
+	local insts_to_keep: set[string];
+
+	# Alternative representation of insts_to_add, directly providing the instances.
+	local insts_to_peer: table[string] of Management::Instance;
+
+	# Helpful locals.
+	local inst_name: string;
+	local inst: Management::Instance;
+
+	for ( inst_name in g_instances )
+		add insts_current[inst_name];
+	for ( inst in g_configs[READY]$instances )
+		add insts_new[inst$name];
+
+	# Populate TODO lists for instances we need to drop, check, or add.
+	insts_to_drop = insts_current - insts_new;
+	insts_to_add = insts_new - insts_current;
+	insts_to_keep = insts_new & insts_current;
+
+	for ( inst in g_configs[READY]$instances )
+		{
+		if ( inst$name in insts_to_add )
+			{
+			insts_to_peer[inst$name] = inst;
+			next;
+			}
+
+		# Focus on the keepers: check for change in identity/location.
+		if ( inst$name !in insts_to_keep )
+			next;
+
+		if ( is_instance_connectivity_change(inst) )
+			{
+			# The endpoint looks different. We drop the current one
+			# and need to re-establish connectivity with the new
+			# one.
+			add insts_to_drop[inst$name];
+			add insts_to_add[inst$name];
+			}
+		}
+
+	# Process our TODO lists. Handle drops first, then additions, in
+	# case we need to re-establish connectivity with an agent.
+
+	for ( inst_name in insts_to_drop )
+		{
+		Management::Log::debug(fmt("dropping instance %s", inst_name));
+		drop_instance(g_instances[inst_name]);
+		}
+	for ( inst_name in insts_to_peer )
+		{
+		Management::Log::debug(fmt("adding instance %s", inst_name));
+		add_instance(insts_to_peer[inst_name]);
+		}
+
+	# Updates to instance tables are complete. As a corner case, if the
+	# config contained no instances (and thus no nodes), we're now done
+	# since there are no agent interactions to wait for (any agents that
+	# need to tear down nodes are doing so asynchronously as part of the
+	# drop_instance() calls above).
+	if ( |insts_new| == 0 )
+		{
+		local config = req$deploy_state$config;
+		g_configs[DEPLOYED] = config;
+		g_config_reqid_pending = "";
+
+		local res = Management::Result($reqid=req$id, $data=config$id);
+		req$results += res;
+
+		if ( ! 
req$deploy_state$is_internal ) + { + Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", + Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::deploy_response, req$id, req$results); + } + + Management::Request::finish(req$id); + return; + } + } + event Management::Controller::API::notify_agents_ready(instances: set[string]) { local insts = Management::Util::set_to_vector(instances); @@ -861,118 +971,11 @@ event Management::Controller::API::deploy_request(reqid: string) } req$deploy_state = DeployState($config = g_configs[READY]); + deploy(req); - # This deployment is now pending. It clears when all agents have - # processed their config updates successfully, or any responses time - # out. - g_config_reqid_pending = req$id; - - # Compare the instance configuration to our current one. If it matches, - # we can proceed to deploying the new cluster topology. If it does not, - # we need to establish connectivity with agents we connect to, or wait - # until all instances that connect to us have done so. Either case - # triggers a notify_agents_ready event, upon which we deploy the config. - - # The current & new set of instance names. - local insts_current: set[string]; - local insts_new: set[string]; - - # A set of current instances not contained in the new config. - # Those will need to get dropped. - local insts_to_drop: set[string]; - - # The opposite: new instances not yet in our current set. Those we will need - # to establish contact with (or they with us). - local insts_to_add: set[string]; - - # The overlap: instances in both the current and new set. For those we verify - # that we're actually dealign with the same entities, and might need to re- - # connect if not. - local insts_to_keep: set[string]; - - # Alternative representation of insts_to_add, directly providing the instances. - local insts_to_peer: table[string] of Management::Instance; - - # Helpful locals. - local inst_name: string; - local inst: Management::Instance; - - for ( inst_name in g_instances ) - add insts_current[inst_name]; - for ( inst in g_configs[READY]$instances ) - add insts_new[inst$name]; - - # Populate TODO lists for instances we need to drop, check, or add. - insts_to_drop = insts_current - insts_new; - insts_to_add = insts_new - insts_current; - insts_to_keep = insts_new & insts_current; - - for ( inst in g_configs[READY]$instances ) - { - if ( inst$name in insts_to_add ) - { - insts_to_peer[inst$name] = inst; - next; - } - - # Focus on the keepers: check for change in identity/location. - if ( inst$name !in insts_to_keep ) - next; - - if ( is_instance_connectivity_change(inst) ) - { - # The endpoint looks different. We drop the current one - # and need to re-establish connectivity with the new - # one. - add insts_to_drop[inst$name]; - add insts_to_add[inst$name]; - } - } - - # Process our TODO lists. Handle drops first, then additions, in - # case we need to re-establish connectivity with an agent. - - for ( inst_name in insts_to_drop ) - { - Management::Log::debug(fmt("dropping instance %s", inst_name)); - drop_instance(g_instances[inst_name]); - } - for ( inst_name in insts_to_peer ) - { - Management::Log::debug(fmt("adding instance %s", inst_name)); - add_instance(insts_to_peer[inst_name]); - } - - # Updates to instance tables are complete. 
As a corner case, if the - # config contained no instances (and thus no nodes), we're now done - # since there are no agent interactions to wait for (any agents that - # need to tear down nodes are doing so asynchronously as part of the - # drop_instance() calls above). - if ( |insts_new| == 0 ) - { - local config = req$deploy_state$config; - g_configs[DEPLOYED] = config; - g_config_reqid_pending = ""; - - local res = Management::Result($reqid=req$id, $data=config$id); - req$results += res; - - if ( ! req$deploy_state$is_internal ) - { - Management::Log::info(fmt("tx Management::Controller::API::deploy_response %s", - Management::Request::to_string(req))); - Broker::publish(Management::Controller::topic, - Management::Controller::API::deploy_response, req$id, req$results); - } - - Management::Request::finish(req$id); - return; - } - - # Otherwise, check if we're able to send the config to all agents - # involved. If that's the case, this will trigger a - # Management::Controller::API::notify_agents_ready event that implements - # the distribution in the controller's own event handler, above. + # Check if we're already able to send the config to all agents involved: + # If so, this triggers Management::Controller::API::notify_agents_ready, + # which we handle above by distributing the config. check_instances_ready(); } From 1faf1ab8b7864b8bb3b5ebfd7f6737cb3c032978 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 17:43:37 -0700 Subject: [PATCH 16/21] Management framework: re-trigger deployment upon controller launch A resilience feature: when a booting controller has a previously deployed configuration (just reloaded from persistent state), it now triggers a deployment. When agents at this point run something else, this restores the controller's understanding of what's deployed, and if the agents do still run this configuration, does nothing since agents ignore deployment of a configuration they already run. --- .../policy/frameworks/management/controller/main.zeek | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 657039e785..67057799db 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -1344,4 +1344,15 @@ event zeek_init() Broker::subscribe(Management::Controller::topic); Management::Log::info(fmt("controller is live, Broker ID %s", Broker::node_id())); + + # If we have a persisted deployed configuration, we need to make sure + # it's actually running. The agents involved might be gone, running a + # different config, etc. We simply run a deployment: agents already + # running this configuration will do nothing. + if ( DEPLOYED in g_configs ) + { + local req = Management::Request::create(); + req$deploy_state = DeployState($config=g_configs[DEPLOYED], $is_internal=T); + deploy(req); + } } From a622e28eab3f12e4d9f00eb868e78d254c14fb1c Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 19 Jun 2022 17:29:50 -0700 Subject: [PATCH 17/21] Management framework: more resilient node shutdown upon deployment When agents had to terminate existing Zeek cluster nodes at the beginning of a new deployment, they so far used their internal state to look up the nodes and fired off requests to the Supervisor to shut these down. 
This has a problem: when an agent restarts unexpectedly, it has no internal state, and when it then tries to create nodes that already exist, the Supervisor complains with error messages. To avoid this, the agent now tears down all Supervised nodes other than agents and controllers. In order to do so, it first needs to query the Supervisor for the current node status, which means there are now two such status requests: one upon deployment, and one during get_nodes requests. In order to disambiguate these contexts in the SupervisorControl::status_request/response transactions, we use the finish() callback in the corresponding request state to continue execution as needed. --- .../frameworks/management/agent/main.zeek | 109 +++++++++++++----- 1 file changed, 80 insertions(+), 29 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 2ab626723d..3ec546f76e 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -23,7 +23,10 @@ module Management::Agent::Runtime; export { ## Request state specific to the agent's Supervisor interactions. type SupervisorState: record { - node: string; ##< Name of the node the Supervisor is acting on. + ## Name of the node the Supervisor is acting on, if applicable. + node: string &default=""; + ## The result of a status request. + status: Supervisor::Status &optional; }; ## Request state for deploy requests. @@ -87,6 +90,14 @@ global supervisor_network_info: function(): Broker::NetworkInfo; # and sends response event. global send_deploy_response: function(req: Management::Request::Request); +# Callback completing a deploy_request after the Supervisor has delivered +# a status response. +global deploy_request_finish: function(req: Management::Request::Request); + +# Callback completing a get_nodes_request after the Supervisor has delivered +# a status response. +global get_nodes_request_finish: function(req: Management::Request::Request); + # The global configuration, as deployed by the controller. global g_config: Management::Configuration; @@ -308,6 +319,9 @@ event Management::Agent::API::deploy_request(reqid: string, config: Management:: return; } + local req = Management::Request::create(reqid); + req$deploy_state_agent = DeployState(); + # Adopt the global configuration provided. The act of trying to launch # the requested nodes perturbs any existing ones one way or another, so # even if the launch fails it effectively is our new configuration. 
@@ -318,37 +332,59 @@ event Management::Agent::API::deploy_request(reqid: string, config: Management:: for ( inst in config$instances ) g_instances[inst$name] = inst; - # Terminate existing nodes - for ( nodename in g_nodes ) - supervisor_destroy(nodename); + local areq = Management::Request::create(); + areq$parent_id = req$id; + areq$finish = deploy_request_finish; + areq$supervisor_state_agent = SupervisorState(); + + Management::Log::info(fmt("tx SupervisorControl::status_request %s, all nodes", areq$id)); + Broker::publish(SupervisorControl::topic_prefix, + SupervisorControl::status_request, areq$id, ""); + } + +function deploy_request_finish(areq: Management::Request::Request) + { + local status = areq$supervisor_state_agent$status; + + for ( nodename in status$nodes ) + { + if ( "ZEEK_MANAGEMENT_NODE" in status$nodes[nodename]$node$env ) + next; + supervisor_destroy(status$nodes[nodename]$node$name); + } + + local req = Management::Request::lookup(areq$parent_id); + if ( Management::Request::is_null(req) ) + return; + + local res: Management::Result; + local nc: Supervisor::NodeConfig; + local node: Management::Node; # Refresh the cluster and nodes tables g_nodes = table(); g_cluster = table(); # Special case: the config contains no nodes. We can respond right away. - if ( |config$nodes| == 0 ) + if ( |g_config$nodes| == 0 ) { g_config_reqid_pending = ""; res = Management::Result( - $reqid = reqid, + $reqid = req$id, $instance = Management::Agent::get_name()); Management::Log::info(fmt("tx Management::Agent::API::deploy_response %s", Management::result_to_string(res))); Broker::publish(agent_topic(), - Management::Agent::API::deploy_response, reqid, vector(res)); + Management::Agent::API::deploy_response, req$id, vector(res)); return; } - local req = Management::Request::create(reqid); - req$deploy_state_agent = DeployState(); - # Establish this request as the pending one: - g_config_reqid_pending = reqid; + g_config_reqid_pending = req$id; - for ( node in config$nodes ) + for ( node in g_config$nodes ) { # Filter the node set down to the ones this agent manages. if ( node$instance == Management::Agent::get_name() ) @@ -446,17 +482,43 @@ event SupervisorControl::status_response(reqid: string, result: Supervisor::Stat if ( Management::Request::is_null(req) ) return; + if ( ! 
req?$supervisor_state_agent )
+		return;
+
 	req$supervisor_state_agent$status = result;
 	Management::Request::finish(reqid);
+	}
 
-	local res = Management::Result(
-	    $reqid = req$parent_id, $instance = Management::Agent::get_name());
+event Management::Agent::API::get_nodes_request(reqid: string)
+	{
+	Management::Log::info(fmt("rx Management::Agent::API::get_nodes_request %s", reqid));
+
+	local req = Management::Request::create(reqid);
+
+	local areq = Management::Request::create();
+	areq$parent_id = req$id;
+	areq$finish = get_nodes_request_finish;
+	areq$supervisor_state_agent = SupervisorState();
+
+	Broker::publish(SupervisorControl::topic_prefix,
+	    SupervisorControl::status_request, areq$id, "");
+	Management::Log::info(fmt("issued supervisor status, %s", areq$id));
+	}
+
+function get_nodes_request_finish(areq: Management::Request::Request)
+	{
+	local req = Management::Request::lookup(areq$parent_id);
+	if ( Management::Request::is_null(req) )
+		return;
+
+	local res = Management::Result($reqid=req$id,
+	    $instance=Management::Agent::get_name());
 
 	local node_statuses: Management::NodeStatusVec;
 
-	for ( node in result$nodes )
+	for ( node in areq$supervisor_state_agent$status$nodes )
 		{
-		local sns = result$nodes[node]; # Supervisor node status
+		local sns = areq$supervisor_state_agent$status$nodes[node]; # Supervisor node status
 		local cns = Management::NodeStatus(
 			    $node=node, $state=Management::PENDING);
 
@@ -527,19 +589,8 @@ event SupervisorControl::status_response(reqid: string, result: Supervisor::Stat
 	Management::Log::info(fmt("tx Management::Agent::API::get_nodes_response %s",
 	    Management::result_to_string(res)));
 	Broker::publish(agent_topic(),
-		Management::Agent::API::get_nodes_response, req$parent_id, res);
-	}
-
-event Management::Agent::API::get_nodes_request(reqid: string)
-	{
-	Management::Log::info(fmt("rx Management::Agent::API::get_nodes_request %s", reqid));
-
-	local req = Management::Request::create();
-	req$parent_id = reqid;
-
-	Broker::publish(SupervisorControl::topic_prefix,
-		SupervisorControl::status_request, req$id, "");
-	Management::Log::info(fmt("issued supervisor status, %s", req$id));
+		Management::Agent::API::get_nodes_response, req$id, res);
+	Management::Request::finish(req$id);
 	}
 
 event Management::Node::API::node_dispatch_response(reqid: string, result: Management::Result)

From 68558e28745bf7e6384406220ed7c2c008dab45e Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Tue, 21 Jun 2022 15:25:42 -0700
Subject: [PATCH 18/21] Management framework: trigger deployment when instances are ready

More resilience: when an agent restarts, it checks in with the
controller. If the controller has deployed a config, this check-in may
lead to an internal notify_agents_ready event. At that point, we now
trigger a deployment when there currently isn't already one running.
This ensures that any agents not yet running the current cluster will
start to do so, and does nothing when those agents already run it, since
they ignore the request in that case.
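
This flow leans on the request-chaining idiom from the earlier "finish"
callback patch. Reduced to a skeleton (a sketch only; ChainDemo and its
function names are illustrative, not part of this change):

	@load policy/frameworks/management/request

	module ChainDemo;

	# Child-request completion callback: locate the parent
	# transaction and resume its processing.
	function child_finish(areq: Management::Request::Request)
		{
		local req = Management::Request::lookup(areq$parent_id);
		if ( Management::Request::is_null(req) )
			return;

		# ... continue the parent's work here, then wrap it up.
		Management::Request::finish(req$id);
		}

	function start_transaction(reqid: string)
		{
		local req = Management::Request::create(reqid);

		local areq = Management::Request::create();
		areq$parent_id = req$id;
		areq$finish = child_finish;

		# A request/response round-trip (e.g. to the Supervisor)
		# goes here; the response handler's call to
		# Management::Request::finish(areq$id) invokes
		# child_finish() above.
		}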
--- .../frameworks/management/controller/main.zeek | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 67057799db..ecbdfc4e53 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -668,11 +668,24 @@ function deploy(req: Management::Request::Request) event Management::Controller::API::notify_agents_ready(instances: set[string]) { local insts = Management::Util::set_to_vector(instances); + local req: Management::Request::Request; Management::Log::info(fmt("rx Management::Controller::API:notify_agents_ready %s", join_string_vec(insts, ", "))); - local req = Management::Request::lookup(g_config_reqid_pending); + # If we're not currently deploying a configuration, but have a deployed + # configuration, trigger a deployment at this point. Some of our agents + # might have restarted, and need to get in sync with us. Agents already + # running this configuration will do nothing. + if ( g_config_reqid_pending == "" && DEPLOYED in g_configs ) + { + req = Management::Request::create(); + req$deploy_state = DeployState($config=g_configs[DEPLOYED], $is_internal=T); + Management::Log::info(fmt("no deployment in progress, triggering via %s", req$id)); + deploy(req); + } + + req = Management::Request::lookup(g_config_reqid_pending); # If there's no pending request, when it's no longer available, or it # doesn't have config state, don't do anything else. From 2c1cd1d4016a6ddbc35bc60d3b0d18c67d148892 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Wed, 22 Jun 2022 11:48:11 -0700 Subject: [PATCH 19/21] Management framework: rename set_configuration events to stage_configuration This reflects corresponding renaming of the client's set-config command to stage-config, to make it more clear what's happening. --- .../frameworks/management/controller/api.zeek | 35 ++++++++++--------- .../management/controller/main.zeek | 16 ++++----- .../policy/frameworks/management/types.zeek | 8 ++--- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/api.zeek b/scripts/policy/frameworks/management/controller/api.zeek index 42b870b4e5..9b5de9087c 100644 --- a/scripts/policy/frameworks/management/controller/api.zeek +++ b/scripts/policy/frameworks/management/controller/api.zeek @@ -32,11 +32,11 @@ export { result: Management::Result); - ## The client sends this event to upload a new cluster configuration, - ## including the full cluster topology. The controller validates the + ## Upload a configuration to the controller for later deployment. + ## The client sends this event to the controller, which validates the ## configuration and indicates the outcome in its response event. No ## deployment takes place yet, and existing deployed configurations and - ## clusters remain intact. To trigger deployment of an uploaded + ## the running Zeek cluster remain intact. To trigger deployment of an uploaded ## configuration, use :zeek:see:`Management::Controller::API::deploy_request`. ## ## reqid: a request identifier string, echoed in the response event. @@ -44,11 +44,11 @@ export { ## config: a :zeek:see:`Management::Configuration` record ## specifying the cluster configuration. 
## - global set_configuration_request: event(reqid: string, + global stage_configuration_request: event(reqid: string, config: Management::Configuration); - ## Response to a set_configuration_request event. The controller sends - ## this back to the client. + ## Response to a stage_configuration_request event. The controller sends + ## this back to the client, conveying validation results. ## ## reqid: the request identifier used in the request event. ## @@ -57,12 +57,12 @@ export { ## a single result record indicates so. Otherwise, the sequence is ## all errors, each indicating a configuration validation error. ## - global set_configuration_response: event(reqid: string, + global stage_configuration_response: event(reqid: string, result: Management::ResultVec); - ## The client sends this event to retrieve the currently deployed - ## cluster configuration. + ## The client sends this event to retrieve the controller's current + ## cluster configuration(s). ## ## reqid: a request identifier string, echoed in the response event. ## @@ -85,13 +85,16 @@ export { result: Management::Result); - ## The client sends this event to trigger deployment of a previously - ## uploaded configuration. The controller deploys the uploaded - ## configuration to all agents involved in running the former - ## configuration or the new one. The agents terminate any previously - ## running cluster nodes and (re-)launch those defined in the new - ## configuration. Once each agent has responded (or a timeout occurs), - ## the controller sends a response event back to the client. + ## Trigger deployment of a previously staged configuration. The client + ## sends this event to the controller, which deploys the configuration + ## to the agents. Agents then terminate any previously running cluster + ## nodes and (re-)launch those defined in the new configuration. Once + ## each agent has responded (or a timeout occurs), the controller sends + ## a response event back to the client, aggregating the results from the + ## agents. The controller keeps the staged configuration available for + ## download, or re-deployment. In addition, the deployed configuration + ## becomes available for download as well, with any augmentations + ## (e.g. node ports filled in by auto-assignment) reflected. ## ## reqid: a request identifier string, echoed in the response event. ## diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index ecbdfc4e53..a5c1840b18 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -866,9 +866,9 @@ event Management::Agent::API::deploy_response(reqid: string, results: Management Management::Request::finish(req$id); } -event Management::Controller::API::set_configuration_request(reqid: string, config: Management::Configuration) +event Management::Controller::API::stage_configuration_request(reqid: string, config: Management::Configuration) { - Management::Log::info(fmt("rx Management::Controller::API::set_configuration_request %s", reqid)); + Management::Log::info(fmt("rx Management::Controller::API::stage_configuration_request %s", reqid)); local req = Management::Request::create(reqid); local res = Management::Result($reqid=req$id); @@ -877,10 +877,10 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf if ( ! 
config_validate(config, req) ) { Management::Request::finish(req$id); - Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", + Management::Log::info(fmt("tx Management::Controller::API::stage_configuration_response %s", Management::Request::to_string(req))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Controller::API::stage_configuration_response, req$id, req$results); return; } @@ -896,10 +896,10 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf res$error = fmt("port auto-assignment disabled but nodes %s lack ports", nodes_str); req$results += res; - Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", + Management::Log::info(fmt("tx Management::Controller::API::stage_configuration_response %s", Management::Request::to_string(req))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, req$id, req$results); + Management::Controller::API::stage_configuration_response, req$id, req$results); Management::Request::finish(req$id); return; } @@ -918,10 +918,10 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf req$results += res; Management::Log::info(fmt( - "tx Management::Controller::API::set_configuration_response %s", + "tx Management::Controller::API::stage_configuration_response %s", Management::result_to_string(res))); Broker::publish(Management::Controller::topic, - Management::Controller::API::set_configuration_response, reqid, req$results); + Management::Controller::API::stage_configuration_response, reqid, req$results); Management::Request::finish(req$id); } diff --git a/scripts/policy/frameworks/management/types.zeek b/scripts/policy/frameworks/management/types.zeek index f28706a313..796c943754 100644 --- a/scripts/policy/frameworks/management/types.zeek +++ b/scripts/policy/frameworks/management/types.zeek @@ -109,10 +109,10 @@ export { type ResultVec: vector of Result; - ## In :zeek:see:`Management::Controller::API::set_configuration_response`, - ## events, each :zeek:see:`Management::Result` indicates the outcome of a - ## requested cluster node. If a node does not launch properly (meaning - ## it doesn't check in with the agent on thee machine it's running on), + ## In :zeek:see:`Management::Controller::API::deploy_response` events, + ## each :zeek:see:`Management::Result` indicates the outcome of a + ## launched cluster node. If a node does not launch properly (meaning + ## it doesn't check in with the agent on the machine it's running on), ## the result will indicate failure, and its data field will be an ## instance of this record, capturing the stdout and stderr output of ## the failing node. 
From 3c3d0f0c1ed39b3c0be8b4850c5a16dfdef9ef05 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Wed, 15 Jun 2022 13:31:07 -0700 Subject: [PATCH 20/21] Management framework: bump zeek-client --- auxil/zeek-client | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auxil/zeek-client b/auxil/zeek-client index 0f1ee6489e..298e57f8d1 160000 --- a/auxil/zeek-client +++ b/auxil/zeek-client @@ -1 +1 @@ -Subproject commit 0f1ee6489ecc07767cb1f37beb312eae69e5a6d8 +Subproject commit 298e57f8d1ef45fa02ad775eb4d0e72e57056527 From 661774eb6ef7029f3998e85d39a673a45dca48f6 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Mon, 13 Jun 2022 22:47:55 -0700 Subject: [PATCH 21/21] Management framework: bump external cluster testsuite --- testing/external/commit-hash.zeek-testing-cluster | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/external/commit-hash.zeek-testing-cluster b/testing/external/commit-hash.zeek-testing-cluster index c818a8cb3a..56b977eb38 100644 --- a/testing/external/commit-hash.zeek-testing-cluster +++ b/testing/external/commit-hash.zeek-testing-cluster @@ -1 +1 @@ -6cb24ebe9ad0725b28c4e4d5afa3ec1f6d04eed5 +d8bb7c9fe7d2a3b9d9526860ec74a3967ecea584