Management framework: track instances by their Broker IDs

This allows us to handle loss of Broker peerings, updating instance state as we
see instances go away. This also tweaks logging slightly to differentiate
between an instance checking in for the first time, and checking in when the
controller already knows it.
This commit is contained in:
Christian Kreibich 2022-06-19 17:11:30 -07:00
parent 633535d8da
commit 7787d84739

View file

@ -154,6 +154,11 @@ global g_instances_known: table[string] of Management::Instance = table();
# instance, and store that in g_instances.) # instance, and store that in g_instances.)
global g_instances_ready: set[string] = set(); global g_instances_ready: set[string] = set();
# A map from Broker ID values to instance names. When we lose a peering, this
# helps us understand whether it was an instance, and if so, update its state
# accordingly.
global g_instances_by_id: table[string] of string;
# The request ID of the most recent deployment request from a client. We track # The request ID of the most recent deployment request from a client. We track
# it here until we know we are ready to communicate with all agents required for # it here until we know we are ready to communicate with all agents required for
# the update. # the update.
@ -605,6 +610,12 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c
if ( ei$id != "" && ei?$network ) if ( ei$id != "" && ei?$network )
{ {
if ( instance !in g_instances_known )
Management::Log::debug(fmt("instance %s newly checked in", instance));
else
Management::Log::debug(fmt("instance %s checked in again", instance));
g_instances_by_id[id] = instance;
g_instances_known[instance] = Management::Instance( g_instances_known[instance] = Management::Instance(
$name=instance, $host=to_addr(ei$network$address)); $name=instance, $host=to_addr(ei$network$address));
@ -613,8 +624,6 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c
# We connected to this agent, note down its port. # We connected to this agent, note down its port.
g_instances_known[instance]$listen_port = ei$network$bound_port; g_instances_known[instance]$listen_port = ei$network$bound_port;
} }
Management::Log::debug(fmt("instance %s now known to us", instance));
} }
if ( instance in g_instances && instance !in g_instances_ready ) if ( instance in g_instances && instance !in g_instances_ready )
@ -1212,6 +1221,8 @@ event Management::Request::request_expired(req: Management::Request::Request)
$success = F, $success = F,
$error = "request timed out"); $error = "request timed out");
Management::Log::info(fmt("request %s timed out", req$id));
if ( req?$deploy_state ) if ( req?$deploy_state )
{ {
# This timeout means we no longer have a pending request. # This timeout means we no longer have a pending request.
@ -1281,6 +1292,24 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string)
Management::Log::debug(fmt("broker peer %s added: %s", peer, msg)); Management::Log::debug(fmt("broker peer %s added: %s", peer, msg));
} }
event Broker::peer_lost(peer: Broker::EndpointInfo, msg: string)
{
Management::Log::debug(fmt("broker peer %s lost: %s", peer, msg));
if ( peer$id in g_instances_by_id )
{
local instance = g_instances_by_id[peer$id];
if ( instance in g_instances_known )
delete g_instances_known[instance];
if ( instance in g_instances_ready )
delete g_instances_ready[instance];
Management::Log::info(fmt("dropped state for instance %s", instance));
delete g_instances_by_id[peer$id];
}
}
event zeek_init() event zeek_init()
{ {
# The controller always listens: it needs to be able to respond to # The controller always listens: it needs to be able to respond to