Improve supervisor debug logging

Mainly making stem process debug messages sent up to parent supervisor
process and included in its debug.log though option to print to stderr
remains in form of environment variable in case debugging breaking
change to the IPC mechanism itself.
This commit is contained in:
Jon Siwek 2020-01-14 11:24:46 -08:00
parent 0ff99c3df8
commit 80b3aef486
3 changed files with 95 additions and 40 deletions

View file

@ -6,6 +6,7 @@
#include <poll.h> #include <poll.h>
#include <csignal> #include <csignal>
#include <cstdarg>
#include <sstream> #include <sstream>
#include "Supervisor.h" #include "Supervisor.h"
@ -22,6 +23,12 @@ extern "C" {
#include "setsignal.h" #include "setsignal.h"
} }
#ifdef DEBUG
#define DBG_STEM(args...) stem->LogDebug(args);
#else
#define DBG_STEM
#endif
using namespace zeek; using namespace zeek;
namespace { namespace {
@ -54,6 +61,10 @@ struct Stem {
void ReportStatus(const Supervisor::Node& node) const; void ReportStatus(const Supervisor::Node& node) const;
void LogDebug(const char* format, ...) const __attribute__((format(printf, 2, 3)));
void LogError(const char* format, ...) const __attribute__((format(printf, 2, 3)));
std::unique_ptr<bro::Flare> signal_flare; std::unique_ptr<bro::Flare> signal_flare;
std::unique_ptr<bro::PipePair> pipe; std::unique_ptr<bro::PipePair> pipe;
std::map<std::string, Supervisor::Node> nodes; std::map<std::string, Supervisor::Node> nodes;
@ -67,7 +78,7 @@ static Stem* stem = nullptr;
static RETSIGTYPE stem_sig_handler(int signo) static RETSIGTYPE stem_sig_handler(int signo)
{ {
// TODO: signal safety // TODO: signal safety
printf("Stem received signal: %d\n", signo); DBG_STEM("Stem received signal: %d", signo);
if ( stem->shutting_down ) if ( stem->shutting_down )
return RETSIGVAL; return RETSIGVAL;
@ -158,6 +169,8 @@ Supervisor::~Supervisor()
reporter->Error("Failed to wait for stem process to exit: %s", tmp); reporter->Error("Failed to wait for stem process to exit: %s", tmp);
} }
} }
while ( ProcessMessages() != 0 );
} }
void Supervisor::ObserveChildSignal() void Supervisor::ObserveChildSignal()
@ -294,7 +307,11 @@ double Supervisor::NextTimestamp(double* local_network_time)
void Supervisor::Process() void Supervisor::Process()
{ {
HandleChildSignal(); HandleChildSignal();
ProcessMessages();
}
size_t Supervisor::ProcessMessages()
{
char buf[256]; char buf[256];
int bytes_read = read(stem_pipe->InFD(), buf, 256); int bytes_read = read(stem_pipe->InFD(), buf, 256);
@ -318,9 +335,15 @@ void Supervisor::Process()
if ( it != nodes.end() ) if ( it != nodes.end() )
it->second.pid = std::stoi(msg_tokens[2]); it->second.pid = std::stoi(msg_tokens[2]);
} }
else if ( type == "debug" )
{
// Already logged the unparsed message above.
}
else else
reporter->Error("Supervisor got unknown msg: %s", msg.data()); reporter->Error("Supervisor got unknown msg: %s", msg.data());
} }
return msgs.size();
} }
Stem::Stem(std::unique_ptr<bro::PipePair> p) Stem::Stem(std::unique_ptr<bro::PipePair> p)
@ -343,8 +366,7 @@ Stem::Stem(std::unique_ptr<bro::PipePair> p)
auto res = setpgid(0, 0); auto res = setpgid(0, 0);
if ( res == -1 ) if ( res == -1 )
fprintf(stderr, "failed to set stem process group: %s\n", LogError("failed to set stem process group: %s", strerror(errno));
strerror(errno));
} }
Stem::~Stem() Stem::~Stem()
@ -377,8 +399,8 @@ bool Stem::Wait(Supervisor::Node* node, int options) const
if ( res == -1 ) if ( res == -1 )
{ {
fprintf(stderr, "Stem failed to get node exit status %s (%d): %s\n", LogError("Stem failed to get node exit status %s (%d): %s",
node->Name().data(), node->pid, strerror(errno)); node->Name().data(), node->pid, strerror(errno));
return false; return false;
} }
@ -386,18 +408,18 @@ bool Stem::Wait(Supervisor::Node* node, int options) const
{ {
node->exit_status = WEXITSTATUS(status); node->exit_status = WEXITSTATUS(status);
// TODO: may be some cases where the node is intended to exit // TODO: may be some cases where the node is intended to exit
printf("node '%s' exited with status %d\n", DBG_STEM("node '%s' exited with status %d",
node->Name().data(), node->exit_status); node->Name().data(), node->exit_status);
} }
else if ( WIFSIGNALED(status) ) else if ( WIFSIGNALED(status) )
{ {
node->signal_number = WTERMSIG(status); node->signal_number = WTERMSIG(status);
printf("node '%s' terminated by signal %d\n", DBG_STEM("node '%s' terminated by signal %d",
node->Name().data(), node->signal_number); node->Name().data(), node->signal_number);
} }
else else
fprintf(stderr, "Stem failed to get node exit status %s (%d)\n", LogError("Stem failed to get node exit status %s (%d)",
node->Name().data(), node->pid); node->Name().data(), node->pid);
node->pid = 0; node->pid = 0;
return true; return true;
@ -408,8 +430,8 @@ void Stem::KillNode(const Supervisor::Node& node, int signal) const
auto kill_res = kill(node.pid, signal); auto kill_res = kill(node.pid, signal);
if ( kill_res == -1 ) if ( kill_res == -1 )
fprintf(stderr, "Failed to send signal to node %s: %s", LogError("Failed to send signal to node %s: %s",
node.Name().data(), strerror(errno)); node.Name().data(), strerror(errno));
} }
void Stem::Destroy(Supervisor::Node* node) const void Stem::Destroy(Supervisor::Node* node) const
@ -427,8 +449,8 @@ void Stem::Destroy(Supervisor::Node* node) const
if ( Wait(node, WNOHANG) ) if ( Wait(node, WNOHANG) )
break; break;
printf("Stem waiting to destroy node: %s (%d)\n", DBG_STEM("Stem waiting to destroy node: %s (%d)",
node->Name().data(), node->pid); node->Name().data(), node->pid);
sleep(kill_delay); sleep(kill_delay);
} }
} }
@ -482,8 +504,8 @@ bool Stem::Spawn(Supervisor::Node* node)
if ( node_pid == -1 ) if ( node_pid == -1 )
{ {
fprintf(stderr, "failed to fork Zeek node '%s': %s\n", LogError("failed to fork Zeek node '%s': %s",
node->Name().data(), strerror(errno)); node->Name().data(), strerror(errno));
return false; return false;
} }
@ -495,7 +517,7 @@ bool Stem::Spawn(Supervisor::Node* node)
node->pid = node_pid; node->pid = node_pid;
node->spawn_time = std::chrono::steady_clock::now(); node->spawn_time = std::chrono::steady_clock::now();
printf("Stem spawned node: %s (%d)\n", node->Name().data(), node->pid); DBG_STEM("Stem spawned node: %s (%d)", node->Name().data(), node->pid);
return false; return false;
} }
@ -529,7 +551,7 @@ void Stem::Shutdown(int exit_code)
if ( ! nodes.empty() ) if ( ! nodes.empty() )
{ {
KillNodes(sig); KillNodes(sig);
printf("Stem killed nodes with signal %d\n", sig); DBG_STEM("Stem killed nodes with signal %d", sig);
usleep(10); usleep(10);
Reap(); Reap();
} }
@ -539,8 +561,8 @@ void Stem::Shutdown(int exit_code)
if ( nodes_alive == 0 ) if ( nodes_alive == 0 )
exit(exit_code); exit(exit_code);
printf("Stem nodes still alive %d, sleeping for %d seconds\n", DBG_STEM("Stem nodes still alive %d, sleeping for %d seconds",
nodes_alive, kill_delay); nodes_alive, kill_delay);
auto sleep_time_left = kill_delay; auto sleep_time_left = kill_delay;
@ -567,6 +589,44 @@ void Stem::ReportStatus(const Supervisor::Node& node) const
safe_write(pipe->OutFD(), msg.data(), msg.size() + 1); safe_write(pipe->OutFD(), msg.data(), msg.size() + 1);
} }
void Stem::LogDebug(const char* format, ...) const
{
va_list args;
va_start(args, format);
auto raw_msg = fmt(format, args);
va_end(args);
if ( getenv("ZEEK_DEBUG_STEM_STDERR") )
{
// Useful when debugging a breaking change to the IPC mechanism itself.
fprintf(stderr, "%s\n", raw_msg);
return;
}
std::string msg = "debug ";
msg += raw_msg;
safe_write(pipe->OutFD(), msg.data(), msg.size() + 1);
}
void Stem::LogError(const char* format, ...) const
{
va_list args;
va_start(args, format);
std::string msg = fmt(format, args);
va_end(args);
fprintf(stderr, "%s\n", msg.data());
#ifdef DEBUG
if ( getenv("ZEEK_DEBUG_STEM_STDERR") )
// Essentially already emitted above.
return;
// Useful to also insert the error message into the debug log.
LogDebug("%s", msg.data());
#endif
}
std::optional<Supervisor::NodeConfig> Stem::Run() std::optional<Supervisor::NodeConfig> Stem::Run()
{ {
for ( ; ; ) for ( ; ; )
@ -591,7 +651,7 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
{ {
if ( errno != EINTR ) if ( errno != EINTR )
{ {
fprintf(stderr, "Stem poll() failed: %s\n", strerror(errno)); LogError("Stem poll() failed: %s", strerror(errno));
return {}; return {};
} }
} }
@ -601,7 +661,7 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
// TODO: better way to detect loss of parent than polling ? // TODO: better way to detect loss of parent than polling ?
// e.g. prctl(PR_SET_PDEATHSIG, ...) on Linux // e.g. prctl(PR_SET_PDEATHSIG, ...) on Linux
// or procctl(PROC_PDEATHSIG_CTL) on FreeBSD // or procctl(PROC_PDEATHSIG_CTL) on FreeBSD
printf("Stem suicide\n"); DBG_STEM("Stem suicide");
Shutdown(13); Shutdown(13);
} }
@ -634,13 +694,13 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
if ( bytes_read == 0 ) if ( bytes_read == 0 )
{ {
// EOF, supervisor must have exited // EOF, supervisor must have exited
printf("Stem EOF\n"); DBG_STEM("Stem EOF");
Shutdown(14); Shutdown(14);
} }
if ( bytes_read < 0 ) if ( bytes_read < 0 )
{ {
fprintf(stderr, "Stem read() failed: %s\n", strerror(errno)); LogError("Stem read() failed: %s", strerror(errno));
return {}; return {};
} }
@ -666,8 +726,7 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
if ( Spawn(&node) ) if ( Spawn(&node) )
return node.config; return node.config;
// TODO: get stem printfs going through standard Zeek debug.log DBG_STEM("Stem created node: %s (%d)", node.Name().data(), node.pid);
printf("Stem created node: %s (%d)\n", node.Name().data(), node.pid);
ReportStatus(node); ReportStatus(node);
} }
else if ( cmd == "destroy" ) else if ( cmd == "destroy" )
@ -675,7 +734,7 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
auto it = nodes.find(node_name); auto it = nodes.find(node_name);
assert(it != nodes.end()); assert(it != nodes.end());
auto& node = it->second; auto& node = it->second;
printf("Stem destroying node: %s\n", node_name.data()); DBG_STEM("Stem destroying node: %s", node_name.data());
Destroy(&node); Destroy(&node);
nodes.erase(it); nodes.erase(it);
} }
@ -684,7 +743,7 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
auto it = nodes.find(node_name); auto it = nodes.find(node_name);
assert(it != nodes.end()); assert(it != nodes.end());
auto& node = it->second; auto& node = it->second;
printf("Stem restarting node: %s\n", node_name.data()); DBG_STEM("Stem restarting node: %s", node_name.data());
Destroy(&node); Destroy(&node);
if ( Spawn(&node) ) if ( Spawn(&node) )
@ -693,7 +752,7 @@ std::optional<Supervisor::NodeConfig> Stem::Poll()
ReportStatus(node); ReportStatus(node);
} }
else else
fprintf(stderr, "unknown supervisor message: %s", cmd.data()); LogError("Stem got unknown supervisor message: %s", cmd.data());
} }
return {}; return {};

View file

@ -103,6 +103,8 @@ private:
void Process() override; void Process() override;
size_t ProcessMessages();
void HandleChildSignal(); void HandleChildSignal();
void ReapStem(); void ReapStem();

View file

@ -1067,17 +1067,11 @@ int main(int argc, char** argv)
if ( options.debug_log_streams ) if ( options.debug_log_streams )
{ {
debug_logger.EnableStreams(options.debug_log_streams->data()); debug_logger.EnableStreams(options.debug_log_streams->data());
const char* debug_log_name = nullptr;
if ( ! getenv("ZEEK_DEBUG_LOG_STDERR") ) if ( getenv("ZEEK_DEBUG_LOG_STDERR") )
{ debug_logger.OpenDebugLog(nullptr);
if ( is_supervisor(options) ) else
debug_log_name = "debug-supervisor"; debug_logger.OpenDebugLog("debug");
else
debug_log_name = "debug";
}
debug_logger.OpenDebugLog(debug_log_name);
} }
#endif #endif