diff --git a/scripts/base/frameworks/supervisor/api.zeek b/scripts/base/frameworks/supervisor/api.zeek index d11d8a2da5..a7593b8f3e 100644 --- a/scripts/base/frameworks/supervisor/api.zeek +++ b/scripts/base/frameworks/supervisor/api.zeek @@ -39,25 +39,25 @@ export { nodes: table[string] of NodeStatus; }; - global status: function(nodes: string &default="all"): Status; global create: function(node: NodeConfig): string; - global destroy: function(nodes: string): bool; - global restart: function(nodes: string &default="all"): bool; + global status: function(node: string &default=""): Status; + global restart: function(node: string &default=""): bool; + global destroy: function(node: string &default=""): bool; global is_supervisor: function(): bool; global is_supervised: function(): bool; - global Supervisor::stop_request: event(); - - global Supervisor::status_request: event(reqid: string, nodes: string); - global Supervisor::status_response: event(reqid: string, result: Status); - global Supervisor::create_request: event(reqid: string, node: NodeConfig); global Supervisor::create_response: event(reqid: string, result: string); - global Supervisor::destroy_request: event(reqid: string, nodes: string); + global Supervisor::status_request: event(reqid: string, node: string); + global Supervisor::status_response: event(reqid: string, result: Status); + + global Supervisor::restart_request: event(reqid: string, node: string); + global Supervisor::restart_response: event(reqid: string, result: bool); + + global Supervisor::destroy_request: event(reqid: string, node: string); global Supervisor::destroy_response: event(reqid: string, result: bool); - global Supervisor::restart_request: event(reqid: string, nodes: string); - global Supervisor::restart_response: event(reqid: string, result: bool); + global Supervisor::stop_request: event(); } diff --git a/scripts/base/frameworks/supervisor/main.zeek b/scripts/base/frameworks/supervisor/main.zeek index 0dc93a64bc..3e42331b71 100644 --- a/scripts/base/frameworks/supervisor/main.zeek +++ b/scripts/base/frameworks/supervisor/main.zeek @@ -21,9 +21,9 @@ event Supervisor::stop_request() terminate(); } -event Supervisor::status_request(reqid: string, nodes: string) +event Supervisor::status_request(reqid: string, node: string) { - local res = Supervisor::status(nodes); + local res = Supervisor::status(node); local topic = Supervisor::topic_prefix + fmt("/status_response/%s", reqid); Broker::publish(topic, Supervisor::status_response, reqid, res); } @@ -35,23 +35,23 @@ event Supervisor::create_request(reqid: string, node: NodeConfig) Broker::publish(topic, Supervisor::create_response, reqid, res); } -event Supervisor::destroy_request(reqid: string, nodes: string) +event Supervisor::destroy_request(reqid: string, node: string) { - local res = Supervisor::destroy(nodes); + local res = Supervisor::destroy(node); local topic = Supervisor::topic_prefix + fmt("/destroy_response/%s", reqid); Broker::publish(topic, Supervisor::destroy_response, reqid, res); } -event Supervisor::restart_request(reqid: string, nodes: string) +event Supervisor::restart_request(reqid: string, node: string) { - local res = Supervisor::restart(nodes); + local res = Supervisor::restart(node); local topic = Supervisor::topic_prefix + fmt("/restart_response/%s", reqid); Broker::publish(topic, Supervisor::restart_response, reqid, res); } -function Supervisor::status(nodes: string): Status +function Supervisor::status(node: string): Status { - return Supervisor::__status(nodes); + return Supervisor::__status(node); } function Supervisor::create(node: NodeConfig): string @@ -59,14 +59,14 @@ function Supervisor::create(node: NodeConfig): string return Supervisor::__create(node); } -function Supervisor::destroy(nodes: string): bool +function Supervisor::destroy(node: string): bool { - return Supervisor::__destroy(nodes); + return Supervisor::__destroy(node); } -function Supervisor::restart(nodes: string): bool +function Supervisor::restart(node: string): bool { - return Supervisor::__restart(nodes); + return Supervisor::__restart(node); } function is_supervisor(): bool diff --git a/src/Supervisor.cc b/src/Supervisor.cc index 4d9f163ccb..755e6e8032 100644 --- a/src/Supervisor.cc +++ b/src/Supervisor.cc @@ -148,7 +148,6 @@ Supervisor::~Supervisor() DBG_LOG(DBG_SUPERVISOR, "shutdown, killing stem process %d", stem_pid); - // TODO: is signal the best way to trigger shutdown of decendent processes? auto kill_res = kill(stem_pid, SIGTERM); if ( kill_res == -1 ) @@ -232,7 +231,6 @@ void Supervisor::HandleChildSignal() return; // Revive the Stem process - // TODO: Stem process needs a way to inform Supervisor not to revive stem_pid = fork(); if ( stem_pid == -1 ) @@ -280,10 +278,13 @@ void Supervisor::HandleChildSignal() DBG_LOG(DBG_SUPERVISOR, "stem process revived, new pid: %d", stem_pid); // Parent supervisor process resends node configurations to recreate - // the desired process hierarchy + // the desired process hierarchy. - // TODO: probably a preferred order in which to create nodes - // e.g. logger, manager, proxy, worker + // Note: there's probably a preferred order in which to create nodes. + // E.g. logger, manager, proxy, worker. However, fully synchronizing + // a startup order like that is slow and complicated: essentially have + // to wait for each process to start up and reach the point just after + // it starts listening (and maybe that never happens for some error case). for ( const auto& n : nodes ) { const auto& node = n.second; @@ -355,14 +356,16 @@ Stem::Stem(std::unique_ptr p) setsignal(SIGCHLD, stem_sig_handler); setsignal(SIGTERM, stem_sig_handler); - // TODO: changing the process group here so that SIGINT to the - // supervisor doesn't also get passed to the children. i.e. supervisor - // should be in charge of initiating orderly shutdown. But calling - // just setpgid() like this is technically a race-condition -- need - // to do more work of blocking SIGINT before fork(), unblocking after, - // then also calling setpgid() from parent. And just not doing that - // until more is known whether that's the right SIGINT behavior in - // the first place. + // Note: changing the process group here so that SIGINT to the supervisor + // doesn't also get passed to the children. I.e. the supervisor should be + // in charge of initiating orderly shutdown of the process tree. + // Technically calling setpgid() like this is a race-condition (if we get a + // SIGINT in between the fork() and setpgid() calls), but can treat that as + // mostly be harmless since the only affected node in the process tree at + // the point will be this Stem process and the Supervisor *should* do the + // right thing if it also sees SIGINT with the Stem already having exited + // (since that same type of situation with the Stem dying prematurely can + // happen for any arbitrary reason, not just for SIGINT). auto res = setpgid(0, 0); if ( res == -1 ) @@ -407,7 +410,6 @@ bool Stem::Wait(Supervisor::Node* node, int options) const if ( WIFEXITED(status) ) { node->exit_status = WEXITSTATUS(status); - // TODO: may be some cases where the node is intended to exit DBG_STEM("node '%s' exited with status %d", node->Name().data(), node->exit_status); } @@ -709,7 +711,6 @@ std::optional Stem::Poll() for ( auto& msg : msgs ) { - // TODO: improve message format ... std::vector msg_tokens; tokenize_string(std::move(msg), " ", &msg_tokens, 2); const auto& cmd = msg_tokens[0]; @@ -1035,16 +1036,31 @@ void Supervisor::NodeConfig::InitCluster() RecordVal* Supervisor::Status(std::string_view node_name) { - // TODO: handle node classes auto rval = new RecordVal(BifType::Record::Supervisor::Status); auto tt = BifType::Record::Supervisor::Status->FieldType("nodes"); auto node_table_val = new TableVal(tt->AsTableType()); rval->Assign(0, node_table_val); - for ( const auto& n : nodes ) + if ( node_name.empty() ) { - const auto& name = n.first; - const auto& node = n.second; + for ( const auto& n : nodes ) + { + const auto& name = n.first; + const auto& node = n.second; + auto key = make_intrusive(name); + auto val = node.ToRecord(); + node_table_val->Assign(key.get(), val.detach()); + } + } + else + { + auto it = nodes.find(node_name); + + if ( it == nodes.end() ) + return rval; + + const auto& name = it->first; + const auto& node = it->second; auto key = make_intrusive(name); auto val = node.ToRecord(); node_table_val->Assign(key.get(), val.detach()); @@ -1061,6 +1077,9 @@ std::string Supervisor::Create(const RecordVal* node_val) std::string Supervisor::Create(const Supervisor::NodeConfig& node) { + if ( node.name.empty() ) + return "node names must not be an empty string"; + if ( node.name.find(' ') != std::string::npos ) return fmt("node names must not contain spaces: '%s'", node.name.data()); @@ -1085,7 +1104,22 @@ std::string Supervisor::Create(const Supervisor::NodeConfig& node) bool Supervisor::Destroy(std::string_view node_name) { - // TODO: handle node classes + auto send_destroy_msg = [this](std::string_view name) + { + std::stringstream ss; + ss << "destroy " << name; + std::string msg = ss.str(); + safe_write(stem_pipe->OutFD(), msg.data(), msg.size() + 1); + }; + + if ( node_name.empty() ) + { + for ( const auto& n : nodes ) + send_destroy_msg(n.first); + + nodes.clear(); + return true; + } auto it = nodes.find(node_name); @@ -1093,24 +1127,31 @@ bool Supervisor::Destroy(std::string_view node_name) return false; nodes.erase(it); - - std::stringstream ss; - ss << "destroy " << node_name; - std::string msg = ss.str(); - safe_write(stem_pipe->OutFD(), msg.data(), msg.size() + 1); + send_destroy_msg(node_name); return true; } bool Supervisor::Restart(std::string_view node_name) { - // TODO: handle node classes + auto send_restart_msg = [this](std::string_view name) + { + std::stringstream ss; + ss << "restart " << name; + std::string msg = ss.str(); + safe_write(stem_pipe->OutFD(), msg.data(), msg.size() + 1); + }; + + if ( node_name.empty() ) + { + for ( const auto& n : nodes ) + send_restart_msg(n.first); + + return true; + } if ( nodes.find(node_name) == nodes.end() ) return false; - std::stringstream ss; - ss << "restart " << node_name; - std::string msg = ss.str(); - safe_write(stem_pipe->OutFD(), msg.data(), msg.size() + 1); + send_restart_msg(node_name); return true; } diff --git a/src/main.cc b/src/main.cc index 6e8a34fec4..6929067975 100644 --- a/src/main.cc +++ b/src/main.cc @@ -284,9 +284,8 @@ struct zeek_options { /** * Inherit certain options set in the original supervisor parent process * and discard the rest. - * @param node the supervised-node whose Zeek options are to be modified. */ - void filter_supervised_node_options(const zeek::Supervisor::NodeConfig& node) + void filter_supervised_node_options() { auto og = *this; *this = {}; @@ -327,6 +326,82 @@ struct zeek_options { } }; +static void init_supervised_node(zeek_options* options) + { + const auto& node_name = zeek::supervised_node->name; + + if ( zeek::supervised_node->directory ) + { + if ( chdir(zeek::supervised_node->directory->data()) ) + { + fprintf(stderr, "node '%s' failed to chdir to %s: %s\n", + node_name.data(), + zeek::supervised_node->directory->data(), + strerror(errno)); + exit(1); + } + } + + if ( zeek::supervised_node->stderr_file ) + { + auto fd = open(zeek::supervised_node->stderr_file->data(), + O_WRONLY | O_CREAT | O_TRUNC | O_APPEND | O_CLOEXEC, + 0600); + + if ( fd == -1 || dup2(fd, STDERR_FILENO) == -1 ) + { + fprintf(stderr, "node '%s' failed to create stderr file %s: %s\n", + node_name.data(), + zeek::supervised_node->stderr_file->data(), + strerror(errno)); + exit(1); + } + } + + if ( zeek::supervised_node->stdout_file ) + { + auto fd = open(zeek::supervised_node->stdout_file->data(), + O_WRONLY | O_CREAT | O_TRUNC | O_APPEND | O_CLOEXEC, + 0600); + + if ( fd == -1 || dup2(fd, STDOUT_FILENO) == -1 ) + { + fprintf(stderr, "node '%s' failed to create stdout file %s: %s\n", + node_name.data(), + zeek::supervised_node->stdout_file->data(), + strerror(errno)); + exit(1); + } + } + + if ( zeek::supervised_node->cpu_affinity ) + { + auto res = zeek::set_affinity(*zeek::supervised_node->cpu_affinity); + + if ( ! res ) + fprintf(stderr, "node '%s' failed to set CPU affinity: %s\n", + node_name.data(), strerror(errno)); + } + + options->filter_supervised_node_options(); + + if ( zeek::supervised_node->interface ) + options->interfaces.emplace_back(*zeek::supervised_node->interface); + + if ( ! zeek::supervised_node->cluster.empty() ) + { + if ( setenv("CLUSTER_NODE", node_name.data(), true) == -1 ) + { + fprintf(stderr, "node '%s' failed to setenv: %s\n", + node_name.data(), strerror(errno)); + exit(1); + } + } + + for ( const auto& s : zeek::supervised_node->scripts ) + options->scripts_to_load.emplace_back(s); + } + static std::vector to_cargs(const std::vector& args) { std::vector rval; @@ -933,82 +1008,7 @@ int main(int argc, char** argv) } if ( zeek::supervised_node ) - { - // TODO: probably all of this block could move to a new - // zeek::supervised_node->Init(options) method - const auto& node_name = zeek::supervised_node->name; - - if ( zeek::supervised_node->directory ) - { - if ( chdir(zeek::supervised_node->directory->data()) ) - { - fprintf(stderr, "node '%s' failed to chdir to %s: %s\n", - node_name.data(), - zeek::supervised_node->directory->data(), - strerror(errno)); - exit(1); - } - } - - if ( zeek::supervised_node->stderr_file ) - { - auto fd = open(zeek::supervised_node->stderr_file->data(), - O_WRONLY | O_CREAT | O_TRUNC | O_APPEND | O_CLOEXEC, - 0600); - - if ( fd == -1 || dup2(fd, STDERR_FILENO) == -1 ) - { - fprintf(stderr, "node '%s' failed to create stderr file %s: %s\n", - node_name.data(), - zeek::supervised_node->stderr_file->data(), - strerror(errno)); - exit(1); - } - } - - if ( zeek::supervised_node->stdout_file ) - { - auto fd = open(zeek::supervised_node->stdout_file->data(), - O_WRONLY | O_CREAT | O_TRUNC | O_APPEND | O_CLOEXEC, - 0600); - - if ( fd == -1 || dup2(fd, STDOUT_FILENO) == -1 ) - { - fprintf(stderr, "node '%s' failed to create stdout file %s: %s\n", - node_name.data(), - zeek::supervised_node->stdout_file->data(), - strerror(errno)); - exit(1); - } - } - - if ( zeek::supervised_node->cpu_affinity ) - { - auto res = zeek::set_affinity(*zeek::supervised_node->cpu_affinity); - - if ( ! res ) - fprintf(stderr, "node '%s' failed to set CPU affinity: %s\n", - node_name.data(), strerror(errno)); - } - - options.filter_supervised_node_options(*zeek::supervised_node); - - if ( zeek::supervised_node->interface ) - options.interfaces.emplace_back(*zeek::supervised_node->interface); - - if ( ! zeek::supervised_node->cluster.empty() ) - { - if ( setenv("CLUSTER_NODE", node_name.data(), true) == -1 ) - { - fprintf(stderr, "node '%s' failed to setenv: %s\n", - node_name.data(), strerror(errno)); - exit(1); - } - } - - for ( const auto& s : zeek::supervised_node->scripts ) - options.scripts_to_load.emplace_back(s); - } + init_supervised_node(&options); double time_start = current_time(true); diff --git a/src/supervisor.bif b/src/supervisor.bif index 41f99c6e73..6990bf8385 100644 --- a/src/supervisor.bif +++ b/src/supervisor.bif @@ -19,7 +19,7 @@ type Supervisor::Status: record; type Supervisor::NodeConfig: record; type Supervisor::NodeStatus: record; -function Supervisor::__status%(nodes: string%): Supervisor::Status +function Supervisor::__status%(node: string%): Supervisor::Status %{ if ( ! zeek::supervisor ) { @@ -27,7 +27,7 @@ function Supervisor::__status%(nodes: string%): Supervisor::Status return new RecordVal(BifType::Record::Supervisor::Status); } - return zeek::supervisor->Status(nodes->CheckString()); + return zeek::supervisor->Status(node->CheckString()); %} function Supervisor::__create%(node: Supervisor::NodeConfig%): string @@ -42,7 +42,7 @@ function Supervisor::__create%(node: Supervisor::NodeConfig%): string return new StringVal(rval); %} -function Supervisor::__destroy%(nodes: string%): bool +function Supervisor::__destroy%(node: string%): bool %{ if ( ! zeek::supervisor ) { @@ -50,11 +50,11 @@ function Supervisor::__destroy%(nodes: string%): bool return val_mgr->GetBool(false); } - auto rval = zeek::supervisor->Destroy(nodes->CheckString()); + auto rval = zeek::supervisor->Destroy(node->CheckString()); return val_mgr->GetBool(rval); %} -function Supervisor::__restart%(nodes: string%): bool +function Supervisor::__restart%(node: string%): bool %{ if ( ! zeek::supervisor ) { @@ -62,7 +62,7 @@ function Supervisor::__restart%(nodes: string%): bool return val_mgr->GetBool(false); } - auto rval = zeek::supervisor->Restart(nodes->CheckString()); + auto rval = zeek::supervisor->Restart(node->CheckString()); return val_mgr->GetBool(rval); %}