From 83c60fd8ac9cd94703a0ce93c5c86864d4f5f310 Mon Sep 17 00:00:00 2001 From: Christian Kreibich Date: Sun, 29 May 2022 22:10:03 -0700 Subject: [PATCH] Management framework: tune request timeout granularity and interval When the controller relays requests to agents, we want agents to time out more quickly than the corresponding controller requests. This allows agents to respond with more meaningful errors, while the controller's timeout acts mostly as a last resort to ensure a response to the client actually happens. This dials down the table_expire_interval to 2 seconds in both agent and controller, for more predictable timeout behavior. It also dials the agent-side request expiration interval down to 5 seconds, compared to the controller's 10 seconds. We may have to revisit this to allow custom expiration intervals per request/response message type. --- .../policy/frameworks/management/agent/main.zeek | 8 ++++++++ .../frameworks/management/controller/main.zeek | 4 ++++ scripts/policy/frameworks/management/request.zeek | 14 +++++++++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/scripts/policy/frameworks/management/agent/main.zeek b/scripts/policy/frameworks/management/agent/main.zeek index 40efd67606..5b6338f62a 100644 --- a/scripts/policy/frameworks/management/agent/main.zeek +++ b/scripts/policy/frameworks/management/agent/main.zeek @@ -52,6 +52,14 @@ redef record Management::Request::Request += { # Tag our logs correctly redef Management::role = Management::AGENT; +# Conduct more frequent table expiration checks. This helps get more predictable +# timing for request timeouts and only affects the agent, which is mostly idle. +redef table_expire_interval = 2 sec; + +# Tweak the request timeout so it's relatively quick, and quick enough always to +# time out strictly before the controller's request state (at 10 sec). +redef Management::Request::timeout_interval = 5 sec; + # Returns the effective agent topic for this agent. 
global agent_topic: function(): string; diff --git a/scripts/policy/frameworks/management/controller/main.zeek b/scripts/policy/frameworks/management/controller/main.zeek index 4a0ec7d081..f83644313e 100644 --- a/scripts/policy/frameworks/management/controller/main.zeek +++ b/scripts/policy/frameworks/management/controller/main.zeek @@ -73,6 +73,10 @@ redef record Management::Request::Request += { # Tag our logs correctly redef Management::role = Management::CONTROLLER; +# Conduct more frequent table expiration checks. This helps get more predictable +# timing for request timeouts and only affects the controller, which is mostly idle. +redef table_expire_interval = 2 sec; + global check_instances_ready: function(); global add_instance: function(inst: Management::Instance); global drop_instance: function(inst: Management::Instance); diff --git a/scripts/policy/frameworks/management/request.zeek b/scripts/policy/frameworks/management/request.zeek index 82a4de3648..86de3ea0fd 100644 --- a/scripts/policy/frameworks/management/request.zeek +++ b/scripts/policy/frameworks/management/request.zeek @@ -32,11 +32,15 @@ export { finished: bool &default=F; }; - ## The timeout for request state. Such state (see the :zeek:see:`Management::Request` - ## module) ties together request and response event pairs. The timeout causes - ## its cleanup in the absence of a timely response. It applies both to - ## state kept for client requests, as well as state in the agents for - ## requests to the supervisor. + ## The timeout interval for request state. Such state (see the + ## :zeek:see:`Management::Request` module) ties together request and + ## response event pairs. A timeout causes cleanup of request state if + ## regular request/response processing hasn't already done so. It + ## applies both to request state kept in the controller and the agent, + ## though the two use different timeout values: agent-side requests time + ## out more quickly. 
This allows agents to send more meaningful error + ## messages, while the controller's timeouts serve as a last resort to + ## ensure response to the client. const timeout_interval = 10sec &redef; ## A token request that serves as a null/nonexistant request.