From aa689807facf53582b4ef20073be0a1f6008c650 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 2 Jun 2022 15:29:05 -0700 Subject: [PATCH 1/6] Management framework: comment-only tweaks and typo fixes --- .../frameworks/management/agent/main.zeek | 2 +- .../frameworks/management/controller/api.zeek | 4 +-- .../management/controller/main.zeek | 25 ++++++++++++------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index efeb680539..5b9912c89b 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -59,7 +59,7 @@ redef record Management::Request::Request += { redef Management::role = Management::AGENT; # Conduct more frequent table expiration checks. This helps get more predictable -# timing for request timeouts and only affects the controller, which is mostly idle. +# timing for request timeouts and only affects the agent, which is mostly idle. redef table_expire_interval = 2 sec; # Tweak the request timeout so it's relatively quick, and quick enough always to diff --git a/scripts/policy/frameworks/management/controller/api.zeek b/scripts/policy/frameworks/management/controller/api.zeek index b840aecab6..b0897ed20a 100644 --- a/scripts/policy/frameworks/management/controller/api.zeek +++ b/scripts/policy/frameworks/management/controller/api.zeek @@ -164,8 +164,8 @@ export { ## The controller triggers this event when the operational cluster ## instances align with the ones desired by the cluster ## configuration. It's essentially a cluster management readiness - ## event. This event is currently only used by the controller and not - ## published to other topics. + ## event. This event is currently only used internally by the controller, + ## and not published to topics. ## ## instances: the set of instance names now ready. ## diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index ba35619a6c..4759f5774c 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -74,13 +74,22 @@ redef record Management::Request::Request += { redef Management::role = Management::CONTROLLER; # Conduct more frequent table expiration checks. This helps get more predictable -# timing for request timeouts and only affects the agent, which is mostly idle. +# timing for request timeouts and only affects the controller, which is mostly idle. redef table_expire_interval = 2 sec; +# Helper that checks whether the agents that are ready match those we need to +# operate the current cluster configuration. When that is the case, triggers +# notify_agents_ready event. global check_instances_ready: function(); + +# Adds the given instance to g_instances and peers, if needed. global add_instance: function(inst: Management::Instance); + +# Drops the given instance from g_instances and sends it an +# agent_standby_request, so it drops its current cluster nodes (if any). global drop_instance: function(inst: Management::Instance); +# Helpers to simplify handling of config records. global null_config: function(): Management::Configuration; global is_null_config: function(config: Management::Configuration): bool; @@ -90,13 +99,13 @@ global is_null_config: function(config: Management::Configuration): bool; # one from our internal state. global is_instance_connectivity_change: function(inst: Management::Instance): bool; -# The set of agents the controller interacts with to manage to currently +# The set of agents the controller interacts with to manage the currently # configured cluster. This may be a subset of all the agents known to the -# controller, as tracked by the g_instances_known set. They key is the instance +# controller, tracked by the g_instances_known set. They key is the instance # name and should match the $name member of the corresponding instance record. global g_instances: table[string] of Management::Instance = table(); -# The set of instances that have checked in with the controller. This is a +# The set of instances that the controller communicates with. This may be a # superset of g_instances, since it covers any agent that has sent us a # notify_agent_hello event. global g_instances_known: set[string] = set(); @@ -139,8 +148,6 @@ function send_config_to_agents(req: Management::Request::Request, config: Manage } } -# This is the &on_change handler for the g_instances_ready set, meaning -# it runs whenever a required agent has confirmed it's ready. function check_instances_ready() { local cur_instances: set[string]; @@ -291,8 +298,8 @@ event Management::Agent::API::notify_agent_hello(instance: string, host: addr, a if ( instance in g_instances && instance !in g_instances_ready ) { - # We need this instance for our cluster and have full context for - # it from the configuration. Tell agent. + # We need this instance for the requested cluster and have full + # context for it from the configuration. Tell agent. local req = Management::Request::create(); Management::Log::info(fmt("tx Management::Agent::API::agent_welcome_request to %s", instance)); @@ -540,7 +547,7 @@ event Management::Controller::API::get_configuration_request(reqid: string) event Management::Controller::API::get_instances_request(reqid: string) { - Management::Log::info(fmt("rx Management::Controller::API::set_instances_request %s", reqid)); + Management::Log::info(fmt("rx Management::Controller::API::get_instances_request %s", reqid)); local res = Management::Result($reqid = reqid); local insts: vector of Management::Instance; From 72acf24f52b6bb86c4f8f9b91a25c2987cbe1df8 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 2 Jun 2022 18:11:56 -0700 Subject: [PATCH 2/6] Management framework: expand notify_agent_hello event arguments This swaps the host event argument for the Broker ID. The latter is more useful, since the sending agent doesn't necessarily know its IP address as visible to the controller, and the controller can pull up the full Broker context via the ID. It also adds an explicit argument to the event to indicate whether the agent connected to the controller or vice versa. This simplifies the controller's internal logic. Also minor tweaks to logging to show Broker IDs. --- scripts/policy/frameworks/management/agent/api.zeek | 11 +++++++---- scripts/policy/frameworks/management/agent/main.zeek | 5 +++-- .../policy/frameworks/management/controller/main.zeek | 9 +++++---- .../zeek.nodes.controller.stdout | 2 +- .../management/controller/agent-checkin.zeek | 6 ++---- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/api.zeek b/scripts/policy/frameworks/management/agent/api.zeek index 9fddaa44f1..ba9c0b22bd 100644 --- a/scripts/policy/frameworks/management/agent/api.zeek +++ b/scripts/policy/frameworks/management/agent/api.zeek @@ -145,17 +145,20 @@ export { ## The agent sends this event upon peering as a "check-in", informing ## the controller that an agent of the given name is now available to ## communicate with. It is a controller-level equivalent of - ## `:zeek:see:`Broker::peer_added`. + ## `:zeek:see:`Broker::peer_added` and triggered by it. ## ## instance: an instance name, really the agent's name as per ## :zeek:see:`Management::Agent::get_name`. ## - ## host: the IP address of the agent. (This may change in the future.) + ## id: the Broker ID of the agent. + ## + ## connecting: true if this agent connected to the controller, + ## false if the controller connected to the agent. ## ## api_version: the API version of this agent. ## - global notify_agent_hello: event(instance: string, host: addr, - api_version: count); + global notify_agent_hello: event(instance: string, id: string, + connecting: bool, api_version: count); # The following are not yet implemented. diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 5b9912c89b..6105edb8dd 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -694,7 +694,8 @@ event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) Broker::publish(agent_topic(), Management::Agent::API::notify_agent_hello, - epi$id, to_addr(epi$network$address), + epi$id, Broker::node_id(), + Management::Agent::controller$address != "0.0.0.0", Management::Agent::API::version); } @@ -736,5 +737,5 @@ event zeek_init() # If the controller connects to us, it also uses this port. Broker::listen(cat(epi$network$address), epi$network$bound_port); - Management::Log::info("agent is live"); + Management::Log::info(fmt("agent is live, Broker ID %s", Broker::node_id())); } diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 4759f5774c..ccee0a6a7d 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -273,9 +273,10 @@ event Management::Controller::API::notify_agents_ready(instances: set[string]) send_config_to_agents(req, req$set_configuration_state$config); } -event Management::Agent::API::notify_agent_hello(instance: string, host: addr, api_version: count) +event Management::Agent::API::notify_agent_hello(instance: string, id: string, connecting: bool, api_version: count) { - Management::Log::info(fmt("rx Management::Agent::API::notify_agent_hello %s %s", instance, host)); + Management::Log::info(fmt("rx Management::Agent::API::notify_agent_hello %s %s %s", + instance, id, connecting)); # When an agent checks in with a mismatching API version, we log the # fact and drop its state, if any. @@ -283,7 +284,7 @@ event Management::Agent::API::notify_agent_hello(instance: string, host: addr, a { Management::Log::warning( fmt("instance %s/%s has checked in with incompatible API version %s", - instance, host, api_version)); + instance, id, api_version)); if ( instance in g_instances ) drop_instance(g_instances[instance]); @@ -882,5 +883,5 @@ event zeek_init() Broker::subscribe(Management::Agent::topic_prefix); Broker::subscribe(Management::Controller::topic); - Management::Log::info("controller is live"); + Management::Log::info(fmt("controller is live, Broker ID %s", Broker::node_id())); } diff --git a/testing/btest/Baseline/scripts.policy.frameworks.management.controller.agent-checkin/zeek.nodes.controller.stdout b/testing/btest/Baseline/scripts.policy.frameworks.management.controller.agent-checkin/zeek.nodes.controller.stdout index 3ce8a1f373..2aa07185c6 100644 --- a/testing/btest/Baseline/scripts.policy.frameworks.management.controller.agent-checkin/zeek.nodes.controller.stdout +++ b/testing/btest/Baseline/scripts.policy.frameworks.management.controller.agent-checkin/zeek.nodes.controller.stdout @@ -1,2 +1,2 @@ ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. -notify_agent_hello agent 127.0.0.1 1 +notify_agent_hello agent 1 diff --git a/testing/btest/scripts/policy/frameworks/management/controller/agent-checkin.zeek b/testing/btest/scripts/policy/frameworks/management/controller/agent-checkin.zeek index 3d248848c7..be0881c6ad 100644 --- a/testing/btest/scripts/policy/frameworks/management/controller/agent-checkin.zeek +++ b/testing/btest/scripts/policy/frameworks/management/controller/agent-checkin.zeek @@ -40,14 +40,12 @@ event zeek_init() } } -event Management::Agent::API::notify_agent_hello(instance: string, host: addr, api_version: count) +event Management::Agent::API::notify_agent_hello(instance: string, id: string, connecting: bool, api_version: count) { if ( Management::role == Management::CONTROLLER ) { - # On rare occasion it can happen that we log this twice, which'll need - # investigating. For now we ensure we only do so once. if ( ! logged ) - print(fmt("notify_agent_hello %s %s %s", instance, host, api_version)); + print(fmt("notify_agent_hello %s %s", instance, api_version)); logged = T; From 0c47d45bb9171f777f558668cd25d8e0fdba113e Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 2 Jun 2022 18:14:16 -0700 Subject: [PATCH 3/6] Management framework: broaden get_instances response data to connected instances This response so far contained only the connected instances that are relevant to the current configuration, but this isn't very helpful when troubleshooting instance connectivity. It now reports all currently connected instances, with network addresses & ports as known to Broker. --- .../management/controller/main.zeek | 82 ++++++++++++++++--- 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index ccee0a6a7d..d192535a69 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -93,6 +93,10 @@ global drop_instance: function(inst: Management::Instance); global null_config: function(): Management::Configuration; global is_null_config: function(config: Management::Configuration): bool; +# Given a Broker ID, this returns the endpoint info associated with it. +# On error, returns a dummy record with an empty ID string. +global find_endpoint: function(id: string): Broker::EndpointInfo; + # Checks whether the given instance is one that we know with different # communication settings: a different peering direction, a different listening # port, etc. Used as a predicate to indicate when we need to drop the existing @@ -101,19 +105,21 @@ global is_instance_connectivity_change: function(inst: Management::Instance): bo # The set of agents the controller interacts with to manage the currently # configured cluster. This may be a subset of all the agents known to the -# controller, tracked by the g_instances_known set. They key is the instance +# controller, tracked by the g_instances_known table. They key is the instance # name and should match the $name member of the corresponding instance record. global g_instances: table[string] of Management::Instance = table(); -# The set of instances that the controller communicates with. This may be a -# superset of g_instances, since it covers any agent that has sent us a -# notify_agent_hello event. -global g_instances_known: set[string] = set(); +# The instances the controller is communicating with. This can deviate from +# g_instances, since it covers any agent that has sent us a notify_agent_hello +# event, regardless of whether it relates to the current configuration. +global g_instances_known: table[string] of Management::Instance = table(); -# A corresponding set of instances/agents that we track in order to understand -# when all of the above instances have sent agent_welcome_response events. (An -# alternative would be to use a record that adds a single state bit for each -# instance, and store that above.) +# The set of instances we need for the current configuration and that are ready +# (meaning they have responded to our agent_welcome_request message). Once they +# match g_instances, a Management::Controller::API::notify_agents_ready event +# signals readiness and triggers config deployment. (An alternative +# implementation would be via a record that adds a single state bit for each +# instance, and store that in g_instances.) global g_instances_ready: set[string] = set(); # The request ID of the most recent configuration update that's come in from @@ -210,6 +216,20 @@ function null_config(): Management::Configuration return Management::Configuration($id=""); } +function find_endpoint(id: string): Broker::EndpointInfo + { + local peers = Broker::peers(); + + for ( i in peers ) + { + if ( peers[i]$peer$id == id ) + return peers[i]$peer; + } + + # Return a dummy instance. + return Broker::EndpointInfo($id=""); + } + function is_null_config(config: Management::Configuration): bool { return config$id == ""; @@ -294,8 +314,33 @@ event Management::Agent::API::notify_agent_hello(instance: string, id: string, c return; } - add g_instances_known[instance]; - Management::Log::debug(fmt("instance %s now known to us", instance)); + local ei = find_endpoint(id); + + if ( ei$id == "" ) + { + Management::Log::warning(fmt("notify_agent_hello from %s with unknown Broker ID %s", + instance, id)); + } + + if ( ! ei?$network ) + { + Management::Log::warning(fmt("notify_agent_hello from %s lacks network state, Broker ID %s", + instance, id)); + } + + if ( ei$id != "" && ei?$network ) + { + g_instances_known[instance] = Management::Instance( + $name=instance, $host=to_addr(ei$network$address)); + + if ( ! connecting ) + { + # We connected to this agent, note down its port. + g_instances_known[instance]$listen_port = ei$network$bound_port; + } + + Management::Log::debug(fmt("instance %s now known to us", instance)); + } if ( instance in g_instances && instance !in g_instances_ready ) { @@ -551,10 +596,16 @@ event Management::Controller::API::get_instances_request(reqid: string) Management::Log::info(fmt("rx Management::Controller::API::get_instances_request %s", reqid)); local res = Management::Result($reqid = reqid); + local inst_names: vector of string; local insts: vector of Management::Instance; - for ( i in g_instances ) - insts += g_instances[i]; + for ( inst in g_instances_known ) + inst_names += inst; + + sort(inst_names, strcmp); + + for ( i in inst_names ) + insts += g_instances_known[inst_names[i]]; res$data = insts; @@ -863,6 +914,11 @@ event Management::Controller::API::test_timeout_request(reqid: string, with_stat } } +event Broker::peer_added(peer: Broker::EndpointInfo, msg: string) + { + Management::Log::debug(fmt("broker peer %s added: %s", peer, msg)); + } + event zeek_init() { # Initialize null config at startup. We will replace it once we have From c53044981ac68b755c4397f47640331af3654ac6 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 3 Jun 2022 02:06:12 -0700 Subject: [PATCH 4/6] Management framework: improve address and port handling The get-nodes command also benefits from showing the state on connected agents more broadly (as opposed to just the one for the current configuration). Also a bugfix: ensure we use an agent's IP address as seen by the controller. This avoids reporting "0.0.0.0" in some cases. --- .../management/controller/main.zeek | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index d192535a69..98a9fe19e8 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -175,10 +175,15 @@ function add_instance(inst: Management::Instance) if ( inst$name in g_instances_known ) { - # The agent has already peered with us. Send welcome to indicate - # it's part of cluster management. Once it responds, we update - # the set of ready instances and proceed as feasible with config - # deployments. + # The agent has already peered with us. This means we have its + # IP address as observed by us, so use it if this agent + # connected to us. + if ( ! inst?$listen_port ) + inst$host = g_instances_known[inst$name]$host; + + # Send welcome to indicate it's part of cluster management. Once + # it responds, we update the set of ready instances and proceed + # as feasible with config deployments. local req = Management::Request::create(); @@ -665,7 +670,7 @@ event Management::Controller::API::get_nodes_request(reqid: string) Management::Log::info(fmt("rx Management::Controller::API::get_nodes_request %s", reqid)); # Special case: if we have no instances, respond right away. - if ( |g_instances| == 0 ) + if ( |g_instances_known| == 0 ) { Management::Log::info(fmt("tx Management::Controller::API::get_nodes_response %s", reqid)); local res = Management::Result($reqid=reqid, $success=F, @@ -678,11 +683,8 @@ event Management::Controller::API::get_nodes_request(reqid: string) local req = Management::Request::create(reqid); req$get_nodes_state = GetNodesState(); - for ( name in g_instances ) + for ( name in g_instances_known ) { - if ( name !in g_instances_ready ) - next; - local agent_topic = Management::Agent::topic_prefix + "/" + name; local areq = Management::Request::create(); From 08d1f93292ebbc6a5913aa86d258903225943d07 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Thu, 2 Jun 2022 18:18:21 -0700 Subject: [PATCH 5/6] Management framework: bump external cluster testsuite --- testing/external/commit-hash.zeek-testing-cluster | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/external/commit-hash.zeek-testing-cluster b/testing/external/commit-hash.zeek-testing-cluster index e96b72d873..4100a9a60e 100644 --- a/testing/external/commit-hash.zeek-testing-cluster +++ b/testing/external/commit-hash.zeek-testing-cluster @@ -1 +1 @@ -de28fe5db9a733f8b4b53f84127bab888a184f6a +ffc21d5be60e78cab7f8a57b07d1a5842cbd1322 From 47f4342821d63d8ac45648437ac8de23420a3b55 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Fri, 3 Jun 2022 02:09:23 -0700 Subject: [PATCH 6/6] Management framework: bump zeek-client to pull in rendering tweaks --- auxil/zeek-client | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auxil/zeek-client b/auxil/zeek-client index 2b00ec50c0..71e2de5864 160000 --- a/auxil/zeek-client +++ b/auxil/zeek-client @@ -1 +1 @@ -Subproject commit 2b00ec50c0e79e9f3ad18bad58b92ba32b405274 +Subproject commit 71e2de5864e0c1a9c232d26d592f809e983e56ed