From 5592beaf31c9e9c23b0f49fd99596ca1df3fb227 Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Mon, 13 Jun 2022 13:19:04 -0700
Subject: [PATCH 1/6] Management framework: handle no-instances corner case in
 set-config correctly

When the controller receives a configuration with no instances (and thus no
nodes), it needs no roundtrip to agents and can send the response right away.
---
 .../management/controller/main.zeek          | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek
index a0100355ab..e1e02c2c81 100644
--- a/scripts/policy/frameworks/management/controller/main.zeek
+++ b/scripts/policy/frameworks/management/controller/main.zeek
@@ -668,8 +668,26 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf
 		add_instance(insts_to_peer[inst_name]);
 		}
 
-	# Updates to out instance tables are complete, now check if we're already
-	# able to send the config to the agents:
+	# Updates to instance tables are complete. As a corner case, if the
+	# config contained no instances (and thus no nodes), we're now done
+	# since there are no agent interactions to wait for:
+	if ( |insts_new| == 0 )
+		{
+		g_config_current = req$set_configuration_state$config;
+		g_config_reqid_pending = "";
+
+		Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s",
+		    Management::Request::to_string(req)));
+		Broker::publish(Management::Controller::topic,
+		    Management::Controller::API::set_configuration_response, req$id, req$results);
+		Management::Request::finish(req$id);
+		return;
+		}
+
+	# Otherwise, check if we're able to send the config to all agents
+	# involved. If that's the case, this will trigger a
+	# Management::Controller::API::notify_agents_ready event that implements
+	# the distribution in the controller's own event handler, above.
 	check_instances_ready();
 	}
 

From 0c20f16055000c7e37bd68dbf4f46dda4fb6c934 Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Mon, 13 Jun 2022 21:12:42 -0700
Subject: [PATCH 2/6] Management framework: control output-to-console in
 Supervisor

It helps during testing to be able to control whether the Supervisor process
also routes node output to the console, in addition to writing it to output
files. Since the Supervisor runs as the main process in Docker containers, its
output becomes visible in "docker logs" that way, simplifying diagnostics.
---
 .../frameworks/management/supervisor/config.zeek  | 12 ++++++++++++
 .../frameworks/management/supervisor/main.zeek    |  8 ++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/scripts/policy/frameworks/management/supervisor/config.zeek b/scripts/policy/frameworks/management/supervisor/config.zeek
index f910ee8a7d..1f69f1b569 100644
--- a/scripts/policy/frameworks/management/supervisor/config.zeek
+++ b/scripts/policy/frameworks/management/supervisor/config.zeek
@@ -7,6 +7,18 @@ export {
 	## Supervisor. The agent subscribes to this.
 	const topic_prefix = "zeek/management/supervisor" &redef;
 
+	## Whether to print the stdout sent up to the Supervisor by created
+	## nodes to the terminal. By default, this is disabled since this output
+	## already ends up in a node-specific stdout file, per
+	## :zeek:see:`Management::Node::stdout_file`.
+	const print_stdout = F &redef;
+
+	## Whether to print the stderr sent up to the Supervisor by created
+	## nodes to the terminal.
By default, this is disabled since this output + ## already ends up in a node-specific stderr file, per + ## :zeek:see:`Management::Node::stderr_file`. + const print_stderr = F &redef; + ## The maximum number of stdout/stderr output lines to convey in ## :zeek:see:`Management::Supervisor::API::notify_node_exit` events. const output_max_lines: count = 100 &redef; diff --git a/scripts/policy/frameworks/management/supervisor/main.zeek b/scripts/policy/frameworks/management/supervisor/main.zeek index f68413a5d3..8683a895f7 100644 --- a/scripts/policy/frameworks/management/supervisor/main.zeek +++ b/scripts/policy/frameworks/management/supervisor/main.zeek @@ -70,8 +70,8 @@ hook Supervisor::stdout_hook(node: string, msg: string) # Update the sliding window of recent output lines. Queue::put(g_outputs[node]$stdout, msg); - # Don't print this message in the Supervisor's own stdout - break; + if ( ! print_stdout ) + break; } hook Supervisor::stderr_hook(node: string, msg: string) @@ -87,8 +87,8 @@ hook Supervisor::stderr_hook(node: string, msg: string) Queue::put(g_outputs[node]$stderr, msg); - # Don't print this message in the Supervisor's own stdout - break; + if ( ! print_stderr ) + break; } event Supervisor::node_status(node: string, pid: count) From 620db4d4eb068f641fe5faacc030b763e0e94b1b Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Mon, 13 Jun 2022 21:15:14 -0700 Subject: [PATCH 3/6] Management framework: improvements to port auto-enumeration The numbering process now accounts for the possibility of colliding with the agent port, as well as with ports explicitly assigned in the configuration. It also avoids nondeterminism that could result from traversal of sets. --- .../management/controller/main.zeek | 89 ++++++++++++++----- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index e1e02c2c81..b46bfc014e 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -256,40 +256,89 @@ function config_assign_ports(config: Management::Configuration) # We're changing nodes in the configuration's set, so need to rebuild it: local new_nodes: set[Management::Node]; - # Workers don't need listening ports, but these do: - local roles = vector(Supervisor::MANAGER, Supervisor::LOGGER, Supervisor::PROXY); + # Workers don't need listening ports, but the others do. Define an ordering: + local roles: table[Supervisor::ClusterRole] of count = { + [Supervisor::MANAGER] = 0, + [Supervisor::LOGGER] = 1, + [Supervisor::PROXY] = 2 + }; + # We start counting from this port. Counting proceeds across instances, + # not per-instance: if the user wants auto-assignment, it seems better + # to avoid confusion with the same port being used on multiple + # instances. local p = port_to_count(Management::Controller::auto_assign_start_port); - local roles_set: set[Supervisor::ClusterRole]; - for ( i in roles ) - add roles_set[roles[i]]; + # A set that tracks the ports we've used so far. Helpful for avoiding + # collisions between manually specified and auto-enumerated ports. 
+	local ports_set: set[count];
+
+	local node: Management::Node;
+
+	# Pre-populate agent ports, if we have them:
+	for ( inst in config$instances )
+		{
+		if ( inst?$listen_port )
+			add ports_set[port_to_count(inst$listen_port)];
+		}
+
+	# Pre-populate nodes with pre-defined ports:
+	for ( node in config$nodes )
+		{
+		if ( node?$p )
+			{
+			add ports_set[port_to_count(node$p)];
+			add new_nodes[node];
+			}
+		}
 
 	# Copy any nodes to the new set that have roles we don't care about.
 	for ( node in config$nodes )
 		{
-		if ( node$role !in roles_set )
+		if ( node$role !in roles )
 			add new_nodes[node];
 		}
 
-	# Now process the ones that may need ports, in order.
-	for ( i in roles )
+	# Now process the ones that may need ports, in order. We first sort by
+	# roles; we go manager -> logger -> proxy. Next are instance names, to
+	# get locally sequential ports among the same roles, and finally by
+	# name.
+	local nodes: vector of Management::Node;
+	for ( node in config$nodes )
 		{
-		for ( node in config$nodes )
-			{
-			if ( node$role != roles[i] )
-				next;
+		if ( node?$p )
+			next;
+		if ( node$role !in roles )
+			next;
+		nodes += node;
+		}
 
-			if ( node?$p ) # Already has a port.
-				{
-				add new_nodes[node];
-				next;
-				}
+	sort(nodes, function [roles] (n1: Management::Node, n2: Management::Node): int
+		{
+		if ( roles[n1$role] < roles[n2$role] )
+			return -1;
+		if ( roles[n1$role] > roles[n2$role] )
+			return 1;
+		local instcmp = strcmp(n1$instance, n2$instance);
+		if ( instcmp != 0 )
+			return instcmp;
+		return strcmp(n1$name, n2$name);
+		});
 
-			node$p = count_to_port(p, tcp);
-			add new_nodes[node];
+	for ( i in nodes )
+		{
+		node = nodes[i];
+
+		# Find next available port ...
+		while ( p in ports_set )
 			++p;
-		}
+
+		node$p = count_to_port(p, tcp);
+		add new_nodes[node];
+		add ports_set[p];
+
+		# ... and consume it.
+		++p;
 		}
 
 	config$nodes = new_nodes;

From d6042cf516665bcecb00cc355d2b99dcb23e8406 Mon Sep 17 00:00:00 2001
From: Christian Kreibich
Date: Mon, 13 Jun 2022 21:16:09 -0700
Subject: [PATCH 4/6] Management framework: add config validation

During `set_configuration_request` handling the controller now validates
received configurations, checking for a few common gotchas around naming and
port use. Validation does not stop at the first problem it finds, so the
result is a list summarizing all identified problems.
---
 .../management/controller/main.zeek          | 163 +++++++++++++++++-
 1 file changed, 157 insertions(+), 6 deletions(-)

diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek
index b46bfc014e..ef6047e3bf 100644
--- a/scripts/policy/frameworks/management/controller/main.zeek
+++ b/scripts/policy/frameworks/management/controller/main.zeek
@@ -103,6 +103,13 @@ global config_nodes_lacking_ports: function(config: Management::Configuration):
 # port. This assumes those ports are actually available on the instances.
 global config_assign_ports: function(config: Management::Configuration);
 
+# Validate the given configuration in terms of missing, incorrect, or
+# conflicting content. Returns T if validation succeeds, F otherwise. The
+# function updates the given request's success state and adds any error
+# result(s) to it.
+global config_validate: function(config: Management::Configuration,
+	req: Management::Request::Request): bool;
+
 # Rejects the given configuration with the given error message. The function
 # adds a non-success result record to the given request and send the
 # set_configuration_response event back to the client.
It does not call finish() @@ -344,6 +351,146 @@ function config_assign_ports(config: Management::Configuration) config$nodes = new_nodes; } +function config_validate(config: Management::Configuration, + req: Management::Request::Request): bool + { + local errors: Management::ResultVec; + local make_error = function(reqid: string, error: string): Management::Result + { return Management::Result($reqid=reqid, $success=F, $error=error); }; + + # We could reject the config if it lists agents that connect to the + # controller, and the controller is currently unaware of those + # agents. We don't do this because setting the configuration is often + # the first task in scripting, and it's helpful that the controller will + # naturally wait for such agents to connect (hopefully soon). The + # alternative would be that clients need to check get-instances output + # until the expected connectivity is in place. We may revisit this in + # the future. + + # Reject if node names or instance names repeat: + + local inst_names: set[string]; + local node_names: set[string]; + local inst_names_done: set[string]; # These ensure we only report the + local node_names_done: set[string]; # same problem once. + + for ( inst in config$instances ) + { + if ( inst$name !in inst_names ) + { + add inst_names[inst$name]; + next; + } + + if ( inst$name !in inst_names_done ) + { + errors += make_error(req$id, fmt("multiple instances named '%s'", inst$name)); + add inst_names_done[inst$name]; + } + } + + for ( node in config$nodes ) + { + if ( node$name !in node_names ) + { + add node_names[node$name]; + next; + } + + if ( node$name !in node_names_done ) + { + errors += make_error(req$id, fmt("multiple nodes named '%s'", node$name)); + add node_names_done[node$name]; + } + } + + # Also reject if instance and node names overlap: + local both_names = inst_names & node_names; + + if ( |both_names| > 0 ) + errors += make_error(req$id, fmt("node and instance names collide: %s", + join_string_vec(Management::Util::set_to_vector(both_names), ", "))); + + # Reject if a node instance name isn't actually an instance. We permit + # instances that aren't mentioned by nodes. + for ( node in config$nodes ) + { + if ( node$instance !in inst_names ) + errors += make_error(req$id, fmt("node '%s' has undeclared instance name '%s'", + node$name, node$instance)); + } + + # Conflicts also apply to ports: auto-assignment avoids collisions, but + # nodes that spell out ports must not collide, neither with themselves + # nor agents. We need to track this per-instance. + local inst_ports: table[string] of set[port]; + local node_ports: table[string] of set[port]; + local node_ports_done: table[string] of set[port]; + + for ( inst in config$instances ) + { + if ( ! inst?$listen_port ) + next; + if ( inst$name !in inst_ports ) + inst_ports[inst$name] = set(); + add inst_ports[inst$name][inst$listen_port]; + } + + for ( node in config$nodes ) + { + if ( node$instance !in node_ports ) + node_ports[node$instance] = set(); + if ( node$instance !in node_ports_done ) + node_ports_done[node$instance] = set(); + + if ( ! node?$p ) + next; + + if ( node$instance in inst_ports && node$p in inst_ports[node$instance] ) + errors += make_error(req$id, fmt("node '%s' port %s conflicts with agent port on instance '%s'", + node$name, node$p, node$instance)); + + # It might not strictly be a problem if controller and cluster + # node have the same port, since they may well be on different + # machines. 
Ideally we'd verify this, but this too would require + # that the agents are already connected. Might revisit later. + if ( node$p == Management::Controller::network_info()$bound_port ) + errors += make_error(req$id, fmt("node '%s' port %s conflicts with controller port", + node$name, node$p)); + + if ( node$p !in node_ports[node$instance] ) + { + add node_ports[node$instance][node$p]; + next; + } + + if ( node$p !in node_ports_done[node$instance] ) + { + errors += make_error(req$id, fmt("multiple nodes on instance '%s' using port %s", + node$instance, node$p)); + add node_ports_done[node$instance][node$p]; + } + } + + # Possibilities for the future: + # - Are node options understood? + # - Do provided scripts exist/load? + # - Do requested interfaces exist on their instances? + + if ( |errors| == 0 ) + return T; + + # Sort errors alphabetically for deterministic results, since in the + # above we iterated nondeterministically over several sets. + sort(errors, function(r1: Management::Result, r2: Management::Result): int + { return strcmp(r1$error, r2$error); }); + + for ( i in errors ) + req$results += errors[i]; + + return F; + } + function find_endpoint(id: string): Broker::EndpointInfo { local peers = Broker::peers(); @@ -610,12 +757,16 @@ event Management::Controller::API::set_configuration_request(reqid: string, conf return; } - # XXX validate the configuration: - # - Are node instances among defined instances? - # - Are all names unique? - # - Are any node options understood? - # - Do node types with optional fields have required values? - # ... + # If the config has problems, reject it: + if ( ! config_validate(config, req) ) + { + Management::Request::finish(req$id); + Management::Log::info(fmt("tx Management::Controller::API::set_configuration_response %s", + Management::Request::to_string(req))); + Broker::publish(Management::Controller::topic, + Management::Controller::API::set_configuration_response, req$id, req$results); + return; + } if ( Management::Controller::auto_assign_ports ) config_assign_ports(config); From 7fbb008f85e37989b7eadf4a23540cb2570ac4e2 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Mon, 13 Jun 2022 22:45:43 -0700 Subject: [PATCH 5/6] Management framework: bump zeek-client --- auxil/zeek-client | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auxil/zeek-client b/auxil/zeek-client index 21296e171c..fefd20b1b5 160000 --- a/auxil/zeek-client +++ b/auxil/zeek-client @@ -1 +1 @@ -Subproject commit 21296e171cdbfe40ac7d169e319ec02b82d9e414 +Subproject commit fefd20b1b574e3708d8dbcd2e273a437e03a952b From 09e412c9413fb240aec6f9e6bec2b9a35f953ee1 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Mon, 13 Jun 2022 22:47:55 -0700 Subject: [PATCH 6/6] Management framework: bump external cluster testsuite --- testing/external/commit-hash.zeek-testing-cluster | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/external/commit-hash.zeek-testing-cluster b/testing/external/commit-hash.zeek-testing-cluster index 091d8d272e..9c19585078 100644 --- a/testing/external/commit-hash.zeek-testing-cluster +++ b/testing/external/commit-hash.zeek-testing-cluster @@ -1 +1 @@ -fa91ac8b028034bbbf22acca00a059cbc6d90829 +4aaab045ca6b4c53a815faf45accc685cddbcd73
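
A note on the console-output toggles introduced in the second patch: they are
ordinary redef-able constants, so a deployment that wants node output mirrored
to the Supervisor's console (for example to surface it in "docker logs") can
flip them in the script that loads the Management supervisor support. A minimal
sketch, assuming the constants live in the Management::Supervisor module (as
the file's topic prefix suggests) and that policy/frameworks/management/supervisor
is the load path:

# Hypothetical snippet for the Zeek process that runs the Supervisor (e.g. the
# agent's bootstrap script); module name and load path are assumptions.
@load policy/frameworks/management/supervisor

# Mirror node stdout/stderr to the Supervisor console in addition to the
# per-node stdout/stderr output files.
redef Management::Supervisor::print_stdout = T;
redef Management::Supervisor::print_stderr = T;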