Add ClusterController::API::get_nodes_request/response event pair

This allows querying the status of Zeek nodes currently running in a cluster.
The controller relays the request to all instances and accumulates their
responses.

The response back to the client contains one Result record per instance
response, each of which carries a ClusterController::Types::NodeState vector in
its $data member to convey the state of each node at that instance.

The NodeState record tracks the name of the node, its role in the controller (if
any), its role in the data cluster (if any), as well as PID and listening port,
if any.
This commit is contained in:
Christian Kreibich 2022-01-31 17:51:33 -08:00
parent 791e5545b1
commit 7db8634c8b
5 changed files with 272 additions and 12 deletions

View file

@ -21,11 +21,17 @@ type SetConfigurationState: record {
requests: set[string] &default=set();
};
# Request state specific to the get_nodes request/response events.
# It tracks the per-agent requests issued on behalf of one client
# get_nodes request that have not been answered yet.
type GetNodesState: record {
# IDs of agent requests still awaiting a response. Starts out empty;
# the get_nodes_request handler adds an ID per agent it queries, and
# the agent's get_nodes_response handler removes it again.
requests: set[string] &default=set();
};
# Dummy state for testing events. The record carries no fields; the
# request_expired handler only checks whether it is present on a request.
type TestState: record { };
# Extend the generic request record with state for the individual request
# types handled here. Every field is optional, so the presence of a field
# (tested via ?$ in the request_expired handler) tells which kind of
# request a given record belongs to.
redef record ClusterController::Request::Request += {
# State for set_configuration requests.
set_configuration_state: SetConfigurationState &optional;
# State for get_nodes requests.
get_nodes_state: GetNodesState &optional;
# State for test requests.
test_state: TestState &optional;
};
@ -457,6 +463,84 @@ event ClusterController::API::get_instances_request(reqid: string)
event ClusterController::API::get_instances_response(reqid, res);
}
# Handles an agent's reply to a get_nodes_request the controller sent it.
# The result is appended to the originating client request; once every
# queried agent has answered, the accumulated results are sent back to the
# client via ClusterController::API::get_nodes_response.
#
# reqid: the ID of the agent-level request this response belongs to.
#
# result: the agent's result; on success its $data member is expected to
#         carry a ClusterController::Types::NodeStatusVec.
event ClusterAgent::API::get_nodes_response(reqid: string, result: ClusterController::Types::Result)
	{
	ClusterController::Log::info(fmt("rx ClusterAgent::API::get_nodes_response %s", reqid));

	# Retrieve state for the request we just got a response to
	local areq = ClusterController::Request::lookup(reqid);
	if ( ClusterController::Request::is_null(areq) )
		return;

	# Release the request, which is now done.
	ClusterController::Request::finish(areq$id);

	# Find the original request from the client
	local req = ClusterController::Request::lookup(areq$parent_id);
	if ( ClusterController::Request::is_null(req) )
		return;

	# Zeek's ingestion of an any-typed val via Broker yields an opaque
	# Broker DataVal. When Zeek forwards this val via another event it stays
	# in this opaque form. To avoid forcing recipients to distinguish
	# whether the val is of the actual, intended (any-)type or a Broker
	# DataVal wrapper, we explicitly cast it back to our intended Zeek
	# type. This test case demonstrates: broker.remote_event_vector_any
	#
	# The $data member is optional (error results are built without it),
	# so only cast when it is present. Accessing an unset field would
	# raise a runtime error and abort this handler before the result is
	# recorded, leaving the client request pending until it times out.
	if ( result?$data )
		result$data = result$data as ClusterController::Types::NodeStatusVec;

	# Add this result to the overall response
	req$results[|req$results|] = result;

	# Mark this request as done by removing it from the table of pending
	# ones. The following if-check should always be true.
	if ( areq$id in req$get_nodes_state$requests )
		delete req$get_nodes_state$requests[areq$id];

	# If we still have pending queries out to the agents, do nothing: we'll
	# handle this soon, or our request will time out and we respond with
	# error.
	if ( |req$get_nodes_state$requests| > 0 )
		return;

	ClusterController::Log::info(fmt("tx ClusterController::API::get_nodes_response %s",
	    ClusterController::Request::to_string(req)));
	event ClusterController::API::get_nodes_response(req$id, req$results);
	ClusterController::Request::finish(req$id);
	}
# Handles a client's request for the status of the cluster's nodes. The
# request is fanned out to every ready instance's agent; the per-agent
# responses are accumulated by the ClusterAgent::API::get_nodes_response
# handler, which answers the client once all agents have replied.
#
# reqid: the client's request ID, echoed back in the response.
event ClusterController::API::get_nodes_request(reqid: string)
	{
	ClusterController::Log::info(fmt("rx ClusterController::API::get_nodes_request %s", reqid));

	# Special case: if we have no instances, respond right away.
	if ( |g_instances| == 0 )
		{
		ClusterController::Log::info(fmt("tx ClusterController::API::get_nodes_response %s", reqid));
		event ClusterController::API::get_nodes_response(reqid, vector(
		    ClusterController::Types::Result($reqid=reqid, $success=F,
		        $error="no instances connected")));
		return;
		}

	local req = ClusterController::Request::create(reqid);
	req$get_nodes_state = GetNodesState();

	for ( name in g_instances )
		{
		# Only query instances whose agents are ready.
		if ( name !in g_instances_ready )
			next;

		local agent_topic = ClusterAgent::topic_prefix + "/" + name;

		# Each agent gets its own request, linked to the client's
		# request via parent_id and tracked as pending until answered.
		local areq = ClusterController::Request::create();
		areq$parent_id = req$id;

		add req$get_nodes_state$requests[areq$id];

		ClusterController::Log::info(fmt("tx ClusterAgent::API::get_nodes_request %s to %s", areq$id, name));
		Broker::publish(agent_topic, ClusterAgent::API::get_nodes_request, areq$id);
		}

	# Special case: instances exist but none of them is ready, so nothing
	# was sent and no agent response will ever arrive. Respond right away
	# instead of leaving the client waiting for the request to time out.
	if ( |req$get_nodes_state$requests| == 0 )
		{
		ClusterController::Log::info(fmt("tx ClusterController::API::get_nodes_response %s", reqid));
		event ClusterController::API::get_nodes_response(reqid, vector(
		    ClusterController::Types::Result($reqid=reqid, $success=F,
		        $error="no instances ready")));
		ClusterController::Request::finish(req$id);
		}
	}
event ClusterController::Request::request_expired(req: ClusterController::Request::Request)
{
# Various handlers for timed-out request state. We use the state members
@ -480,6 +564,18 @@ event ClusterController::Request::request_expired(req: ClusterController::Reques
event ClusterController::API::set_configuration_response(req$id, req$results);
}
if ( req?$get_nodes_state )
{
res = ClusterController::Types::Result($reqid=req$id);
res$success = F;
res$error = "request timed out";
req$results += res;
ClusterController::Log::info(fmt("tx ClusterController::API::get_nodes_response %s",
ClusterController::Request::to_string(req)));
event ClusterController::API::get_nodes_response(req$id, req$results);
}
if ( req?$test_state )
{
res = ClusterController::Types::Result($reqid=req$id);
@ -525,13 +621,15 @@ event zeek_init()
Broker::subscribe(ClusterController::topic);
# Events sent to the client:
local events: vector of any = [
ClusterController::API::get_instances_response,
ClusterController::API::set_configuration_response,
ClusterController::API::get_nodes_response,
ClusterController::API::test_timeout_response
];
Broker::auto_publish(ClusterController::topic,
ClusterController::API::get_instances_response);
Broker::auto_publish(ClusterController::topic,
ClusterController::API::set_configuration_response);
Broker::auto_publish(ClusterController::topic,
ClusterController::API::test_timeout_response);
for ( i in events )
Broker::auto_publish(ClusterController::topic, events[i]);
ClusterController::Log::info("controller is live");
}