From 24a495da423589927147cefe05476b283e3eec0c Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Mon, 30 May 2022 12:54:58 -0700 Subject: [PATCH] Management framework: Supervisor extensions for stdout/stderr handling This improves the framework's handling of Zeek node stdout and stderr by extending the (script-layer) Supervisor functionality. - The Supervisor _either_ directs Zeek nodes' stdout/stderr to files _or_ lets you hook into it at the script level. We'd like both: files make sense to allow inspection outside of the framework, and the framework would benefit from tapping into the streams e.g. for error context. We now provide the file redirection functionality in the Supervisor, in addition to the hook mechanism. The hook mechanism also builds up rolling windows of up to 100 lines (configurable) into stdout/stderr. - The new Mangement::Supervisor::API::notify_node_exit event notifies subscribers (agents, really) that a particular node has exited (and is possibly being restarted by the Supervisor). The event includes the name of the node, plus its recent stdout/stderr context. --- .../management/supervisor/__load__.zeek | 1 + .../frameworks/management/supervisor/api.zeek | 18 +++ .../management/supervisor/config.zeek | 13 ++ .../management/supervisor/main.zeek | 122 ++++++++++++++++++ .../policy/frameworks/management/types.zeek | 12 ++ scripts/test-all-policy.zeek | 4 + .../Baseline/coverage.bare-mode-errors/errors | 4 +- 7 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 scripts/policy/frameworks/management/supervisor/__load__.zeek create mode 100644 scripts/policy/frameworks/management/supervisor/api.zeek create mode 100644 scripts/policy/frameworks/management/supervisor/config.zeek create mode 100644 scripts/policy/frameworks/management/supervisor/main.zeek diff --git a/scripts/policy/frameworks/management/supervisor/__load__.zeek b/scripts/policy/frameworks/management/supervisor/__load__.zeek new file mode 100644 index 0000000000..a10fe855df --- /dev/null +++ b/scripts/policy/frameworks/management/supervisor/__load__.zeek @@ -0,0 +1 @@ +@load ./main diff --git a/scripts/policy/frameworks/management/supervisor/api.zeek b/scripts/policy/frameworks/management/supervisor/api.zeek new file mode 100644 index 0000000000..f02b30d7c7 --- /dev/null +++ b/scripts/policy/frameworks/management/supervisor/api.zeek @@ -0,0 +1,18 @@ +@load policy/frameworks/management/types + +module Management::Supervisor::API; + +export { + ## The Supervisor generates this event whenever it has received a status + ## update from the stem, indicating that a node exited. + ## + ## node: the name of a node previously created via + ## :zeek:see:`Supervisor::create`. + ## + ## outputs: stdout/stderr context for the node. The contained strings + ## span up to the 100 most recent lines in the corresponding + ## stream. See :zeek:see:`Management::Supervisor::output_max_lines` + ## to adjust the line limit. + ## + global notify_node_exit: event(node: string, outputs: Management::NodeOutputs); +} diff --git a/scripts/policy/frameworks/management/supervisor/config.zeek b/scripts/policy/frameworks/management/supervisor/config.zeek new file mode 100644 index 0000000000..f910ee8a7d --- /dev/null +++ b/scripts/policy/frameworks/management/supervisor/config.zeek @@ -0,0 +1,13 @@ +##! Configuration settings for the Management framework's supervisor extension. + +module Management::Supervisor; + +export { + ## The Broker topic for Management framework communication with the + ## Supervisor. The agent subscribes to this. + const topic_prefix = "zeek/management/supervisor" &redef; + + ## The maximum number of stdout/stderr output lines to convey in + ## :zeek:see:`Management::Supervisor::API::notify_node_exit` events. + const output_max_lines: count = 100 &redef; +} diff --git a/scripts/policy/frameworks/management/supervisor/main.zeek b/scripts/policy/frameworks/management/supervisor/main.zeek new file mode 100644 index 0000000000..f68413a5d3 --- /dev/null +++ b/scripts/policy/frameworks/management/supervisor/main.zeek @@ -0,0 +1,122 @@ +##! This module provides functionality the Management framework places directly +##! in the Supervisor. + +@load base/utils/paths +@load base/utils/queue + +@load policy/frameworks/management/types +@load policy/frameworks/management/node/config + +@load ./api +@load ./config + +module Management::Supervisor; + +# stdout/stderr state for a given node. +type NodeOutputStreams: record { + # Line buffers for stdout and stderr. Their length is capped + # to the most recent Management::Supervisor::output_max_lines. + stdout: Queue::Queue; + stderr: Queue::Queue; + + # + stdout_file: file &optional; + stderr_file: file &optional; +}; + +# This tracks output state for the current nodes. +global g_outputs: table[string] of NodeOutputStreams; + +function make_node_output_streams(node: string): NodeOutputStreams + { + local stdout = Queue::init([$max_len = Management::Supervisor::output_max_lines]); + local stderr = Queue::init([$max_len = Management::Supervisor::output_max_lines]); + + local res = NodeOutputStreams($stdout=stdout, $stderr=stderr); + local status = Supervisor::status(node); + + if ( node !in status$nodes ) + return res; + + local ns = status$nodes[node]; + local directory = "."; + + if ( ns$node?$directory ) + directory = ns$node$directory; + + if ( Management::Node::stdout_file != "" ) + res$stdout_file = open(build_path(directory, Management::Node::stdout_file)); + if ( Management::Node::stderr_file != "" ) + res$stderr_file = open(build_path(directory, Management::Node::stderr_file)); + + return res; + } + +hook Supervisor::stdout_hook(node: string, msg: string) + { + if ( node !in g_outputs ) + g_outputs[node] = make_node_output_streams(node); + + # Write to the stdout file if we have one. The flush is clunky, but + # seems worth it: it's too confusing for errors to have happened and not + # yet shown up in the file. (The Supervisor's built-in file redirection + # does this too.) + if ( g_outputs[node]?$stdout_file ) + { + print g_outputs[node]$stdout_file, msg; + flush_all(); + } + + # Update the sliding window of recent output lines. + Queue::put(g_outputs[node]$stdout, msg); + + # Don't print this message in the Supervisor's own stdout + break; + } + +hook Supervisor::stderr_hook(node: string, msg: string) + { + if ( node !in g_outputs ) + g_outputs[node] = make_node_output_streams(node); + + if ( g_outputs[node]?$stderr_file ) + { + print g_outputs[node]$stderr_file, msg; + flush_all(); + } + + Queue::put(g_outputs[node]$stderr, msg); + + # Don't print this message in the Supervisor's own stdout + break; + } + +event Supervisor::node_status(node: string, pid: count) + { + # The node just started or restarted. If we have collected any output + # for its past life, send it via a notify_node_exit event. + if ( node in g_outputs ) + { + local stdout_lines: vector of string; + local stderr_lines: vector of string; + + Queue::get_vector(g_outputs[node]$stdout, stdout_lines); + Queue::get_vector(g_outputs[node]$stderr, stderr_lines); + + if ( |stdout_lines| > 0 || |stderr_lines| > 0 ) + { + local outputs = Management::NodeOutputs( + $stdout = join_string_vec(stdout_lines, "\n"), + $stderr = join_string_vec(stderr_lines, "\n")); + + Broker::publish(topic_prefix, Management::Supervisor::API::notify_node_exit, node, outputs); + } + + if ( g_outputs[node]?$stdout_file ) + close(g_outputs[node]$stdout_file); + if ( g_outputs[node]?$stderr_file ) + close(g_outputs[node]$stderr_file); + } + + g_outputs[node] = make_node_output_streams(node); + } diff --git a/scripts/policy/frameworks/management/types.zeek b/scripts/policy/frameworks/management/types.zeek index db3ac8da55..c4076cd8f7 100644 --- a/scripts/policy/frameworks/management/types.zeek +++ b/scripts/policy/frameworks/management/types.zeek @@ -104,6 +104,18 @@ export { type ResultVec: vector of Result; + ## In :zeek:see:`Management::Controller::API::set_configuration_response`, + ## events, each :zeek:see:`Management::Result` indicates the outcome of a + ## requested cluster node. If a node does not launch properly (meaning + ## it doesn't check in with the agent on thee machine it's running on), + ## the result will indicate failure, and its data field will be an + ## instance of this record, capturing the stdout and stderr output of + ## the failing node. + type NodeOutputs: record { + stdout: string; ##< The stdout stream of a Zeek process + stderr: string; ##< The stderr stream of a Zeek process + }; + ## Given a :zeek:see:`Management::Result` record, ## this function returns a string summarizing it. global result_to_string: function(res: Result): string; diff --git a/scripts/test-all-policy.zeek b/scripts/test-all-policy.zeek index 2dc50cd8af..eb089a9aa2 100644 --- a/scripts/test-all-policy.zeek +++ b/scripts/test-all-policy.zeek @@ -29,6 +29,10 @@ @load frameworks/management/node/api.zeek @load frameworks/management/node/config.zeek # @load frameworks/management/node/main.zeek +@load frameworks/management/supervisor/__load__.zeek +@load frameworks/management/supervisor/api.zeek +@load frameworks/management/supervisor/config.zeek +@load frameworks/management/supervisor/main.zeek @load frameworks/management/request.zeek @load frameworks/management/types.zeek @load frameworks/management/util.zeek diff --git a/testing/btest/Baseline/coverage.bare-mode-errors/errors b/testing/btest/Baseline/coverage.bare-mode-errors/errors index 6e8c09e1bd..d179d2c6ff 100644 --- a/testing/btest/Baseline/coverage.bare-mode-errors/errors +++ b/testing/btest/Baseline/coverage.bare-mode-errors/errors @@ -2,8 +2,8 @@ ### NOTE: This file has been sorted with diff-sort. warning in <...>/extract-certs-pem.zeek, line 1: deprecated script loaded from <...>/__load__.zeek:15 "Remove in v5.1. Use log-certs-base64.zeek instead." warning in <...>/extract-certs-pem.zeek, line 1: deprecated script loaded from command line arguments "Remove in v5.1. Use log-certs-base64.zeek instead." -warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:66 ("Remove in v5.1. OCSP logging is now enabled by default") -warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:66 ("Remove in v5.1. OCSP logging is now enabled by default") +warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:70 ("Remove in v5.1. OCSP logging is now enabled by default") +warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from <...>/test-all-policy.zeek:70 ("Remove in v5.1. OCSP logging is now enabled by default") warning in <...>/log-ocsp.zeek, line 1: deprecated script loaded from command line arguments ("Remove in v5.1. OCSP logging is now enabled by default") warning in <...>/notary.zeek, line 1: deprecated script loaded from <...>/__load__.zeek:5 ("Remove in v5.1. Please switch to other more modern approaches like SCT validation (validate-sct.zeek).") warning in <...>/notary.zeek, line 1: deprecated script loaded from command line arguments ("Remove in v5.1. Please switch to other more modern approaches like SCT validation (validate-sct.zeek).")