mirror of
https://github.com/zeek/zeek.git
synced 2025-10-10 02:28:21 +00:00
Optimize software found cluster communication
As a followup to 3bf8c8ceb6
that added the
parse cache, add a small, short-lived cache on the workers to effectively
debounce the Software::new events sent up to the proxies.
User-Agents are highly repetitive: workers often see exact duplicate
user-agents on the same orig_h. Worse, due to NAT, virtualization, and
the proliferation of Electron-based applications, variations of the same
user-agent can be seen at the same time. For example:
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.18 Safari/537.36 Zoom/6.2.0 (1855)
When these two user-agents are seen concurrently, the software framework
will log each flip between them as a new user-agent. This can be fixed separately on
the proxy side, but a reduction of Software::new events is still needed
to reduce cluster communication overhead as well as the load on the
proxies.
With a 10-minute cache on the workers, this should greatly reduce the
number of redundant user-agents logged in software.log.
This commit is contained in:
parent
d655c64e0b
commit
e8de3de2d4
1 changed file with 8 additions and 0 deletions
|
@ -239,7 +239,10 @@ function parse(unparsed_version: string): Description
|
|||
return [$version=v, $unparsed_version=unparsed_version, $name=alternate_names[software_name]];
|
||||
}
|
||||
|
||||
# A cache for the proxies that stores the result of parsing unparsed_version.
|
||||
global parse_cache: table[string] of Description &read_expire=65secs;
|
||||
# A suppression cache for the workers to prevent sending the same information to the proxies multiple times.
|
||||
global found_cache: set[Info] &create_expire=10mins;
|
||||
|
||||
# Call parse, but cache results in the parse_cache table
|
||||
function parse_with_cache(unparsed_version: string): Description
|
||||
|
@ -523,6 +526,11 @@ function found(id: conn_id, info: Info): bool
|
|||
if ( ! info$force_log && ! addr_matches_host(info$host, asset_tracking) )
|
||||
return F;
|
||||
|
||||
# This assumes that callers do not fill in info$ts, none of the current callers do.
|
||||
if (info in found_cache)
|
||||
return T;
|
||||
add found_cache[info];
|
||||
|
||||
if ( ! info?$ts )
|
||||
info$ts = network_time();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue