From 1b696490d0c152b0c164ec82f47f3ad667e39618 Mon Sep 17 00:00:00 2001 From: Vlad Grigorescu Date: Thu, 8 Oct 2020 09:36:18 -0500 Subject: [PATCH 1/6] Whitespace fixes only [nomail] [skip ci] --- scripts/policy/misc/capture-loss.zeek | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/policy/misc/capture-loss.zeek b/scripts/policy/misc/capture-loss.zeek index 0b827db544..05fdee3011 100644 --- a/scripts/policy/misc/capture-loss.zeek +++ b/scripts/policy/misc/capture-loss.zeek @@ -1,10 +1,10 @@ ##! This script logs evidence regarding the degree to which the packet -##! capture process suffers from measurement loss. -##! The loss could be due to overload on the host or NIC performing -##! the packet capture or it could even be beyond the host. If you are -##! capturing from a switch with a SPAN port, it's very possible that +##! capture process suffers from measurement loss. +##! The loss could be due to overload on the host or NIC performing +##! the packet capture or it could even be beyond the host. If you are +##! capturing from a switch with a SPAN port, it's very possible that ##! the switch itself could be overloaded and dropping packets. -##! Reported loss is computed in terms of the number of "gap events" (ACKs +##! Reported loss is computed in terms of the number of "gap events" (ACKs ##! for a sequence number that's above a gap). @load base/frameworks/notice @@ -13,7 +13,7 @@ module CaptureLoss; export { redef enum Log::ID += { LOG }; - + global log_policy: Log::PolicyHook; redef enum Notice::Type += { @@ -21,7 +21,7 @@ export { ## threshold. Too_Much_Loss }; - + type Info: record { ## Timestamp for when the measurement occurred. ts: time &log; @@ -38,11 +38,11 @@ export { ## Percentage of ACKs seen where the data being ACKed wasn't seen. percent_lost: double &log; }; - + ## The interval at which capture loss reports are created. option watch_interval = 15mins; - - ## The percentage of missed data that is considered "too much" + + ## The percentage of missed data that is considered "too much" ## when the :zeek:enum:`CaptureLoss::Too_Much_Loss` notice should be ## generated. The value is expressed as a double between 0 and 1 with 1 ## being 100%. @@ -56,7 +56,7 @@ event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: schedule watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; return; } - + local now = network_time(); local g = get_gap_stats(); local acks = g$ack_events - last_acks; @@ -65,13 +65,13 @@ event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: local info: Info = [$ts=now, $ts_delta=now-last_ts, $peer=peer_description, - $acks=acks, $gaps=gaps, + $acks=acks, $gaps=gaps, $percent_lost=pct_lost]; - + if ( pct_lost >= too_much_loss*100 ) - NOTICE([$note=Too_Much_Loss, + NOTICE([$note=Too_Much_Loss, $msg=fmt("The capture loss script detected an estimated loss rate above %.3f%%", pct_lost)]); - + Log::write(LOG, info); schedule watch_interval { CaptureLoss::take_measurement(now, g$ack_events, g$gap_events) }; } From bb3527c95513210c7125e5a49447e5c4e76da303 Mon Sep 17 00:00:00 2001 From: Vlad Grigorescu Date: Thu, 8 Oct 2020 09:38:26 -0500 Subject: [PATCH 2/6] Documentation update, reference the threshold variable. [nomail] [skip ci] --- scripts/policy/misc/capture-loss.zeek | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/policy/misc/capture-loss.zeek b/scripts/policy/misc/capture-loss.zeek index 05fdee3011..5e0e49bc9d 100644 --- a/scripts/policy/misc/capture-loss.zeek +++ b/scripts/policy/misc/capture-loss.zeek @@ -18,7 +18,7 @@ export { redef enum Notice::Type += { ## Report if the detected capture loss exceeds the percentage - ## threshold. + ## threshold defined in :zeek:id:`CaptureLoss::too_much_loss`. Too_Much_Loss }; From cb9d419fa4846a09bafac8fa5564940abbf7483e Mon Sep 17 00:00:00 2001 From: Vlad Grigorescu Date: Thu, 8 Oct 2020 09:41:29 -0500 Subject: [PATCH 3/6] Add CaptureLoss::initial_watch_interval for a quick read on cluster health after startup. --- scripts/policy/misc/capture-loss.zeek | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/policy/misc/capture-loss.zeek b/scripts/policy/misc/capture-loss.zeek index 5e0e49bc9d..a6fff664e0 100644 --- a/scripts/policy/misc/capture-loss.zeek +++ b/scripts/policy/misc/capture-loss.zeek @@ -39,9 +39,14 @@ export { percent_lost: double &log; }; - ## The interval at which capture loss reports are created. + ## The interval at which capture loss reports are created in a + ## running cluster (that is, after the first report). option watch_interval = 15mins; + ## For faster feedback on cluster health, the first capture loss + ## report is generated this many minutes after startup. + option initial_watch_interval = 1mins; + ## The percentage of missed data that is considered "too much" ## when the :zeek:enum:`CaptureLoss::Too_Much_Loss` notice should be ## generated. The value is expressed as a double between 0 and 1 with 1 @@ -82,5 +87,5 @@ event zeek_init() &priority=5 # We only schedule the event if we are capturing packets. if ( reading_live_traffic() || reading_traces() ) - schedule watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; + schedule initial_watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; } From 59620ed75dc27e24f453d5d4d2979303537ef6d6 Mon Sep 17 00:00:00 2001 From: Vlad Grigorescu Date: Thu, 8 Oct 2020 11:26:46 -0500 Subject: [PATCH 4/6] Add CaptureLoss::Too_Little_Traffic --- scripts/policy/misc/capture-loss.zeek | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/policy/misc/capture-loss.zeek b/scripts/policy/misc/capture-loss.zeek index a6fff664e0..e836dc31ab 100644 --- a/scripts/policy/misc/capture-loss.zeek +++ b/scripts/policy/misc/capture-loss.zeek @@ -19,7 +19,10 @@ export { redef enum Notice::Type += { ## Report if the detected capture loss exceeds the percentage ## threshold defined in :zeek:id:`CaptureLoss::too_much_loss`. - Too_Much_Loss + Too_Much_Loss, + ## Report if the traffic seen by a peer within a given watch + ## interval is less than :zeek:id:`CaptureLoss::minimum_acks`. + Too_Little_Traffic, }; type Info: record { @@ -52,6 +55,11 @@ export { ## generated. The value is expressed as a double between 0 and 1 with 1 ## being 100%. option too_much_loss: double = 0.1; + + ## The minimum number of ACKs expected for a single peer in a + ## watch interval. If the number seen is less than this, + ## :zeek:enum:`CaptureLoss::Too_Little_Traffic` is raised. + option minimum_acks: count = 1; } event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: count) @@ -77,6 +85,10 @@ event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: NOTICE([$note=Too_Much_Loss, $msg=fmt("The capture loss script detected an estimated loss rate above %.3f%%", pct_lost)]); + if ( acks < minimum_acks ) + NOTICE([$note=Too_Little_Traffic, + $msg=fmt("The worker only observed %d ACKs and was expecting at least %d.", acks, minimum_acks)]); + Log::write(LOG, info); schedule watch_interval { CaptureLoss::take_measurement(now, g$ack_events, g$gap_events) }; } From c58cf698902874d824c86ec16ef4601ec38f456c Mon Sep 17 00:00:00 2001 From: Vlad Grigorescu Date: Thu, 8 Oct 2020 11:34:04 -0500 Subject: [PATCH 5/6] Add test for CaptureLoss::Too_Little_Traffic --- .../capture_loss.log | 10 ++++++++++ .../scripts.policy.misc.capture-loss/notice.log | 10 ++++++++++ testing/btest/scripts/policy/misc/capture-loss.zeek | 12 ++++++++++++ 3 files changed, 32 insertions(+) create mode 100644 testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log create mode 100644 testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log create mode 100644 testing/btest/scripts/policy/misc/capture-loss.zeek diff --git a/testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log b/testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log new file mode 100644 index 0000000000..ef5dca2f4b --- /dev/null +++ b/testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path capture_loss +#open 2020-10-08-16-33-05 +#fields ts ts_delta peer gaps acks percent_lost +#types time interval string count count double +964953086.310131 0.000000 zeek 0 0 0.0 +#close 2020-10-08-16-33-05 diff --git a/testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log b/testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log new file mode 100644 index 0000000000..7c585c0301 --- /dev/null +++ b/testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path notice +#open 2020-10-08-16-33-05 +#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p fuid file_mime_type file_desc proto note msg sub src dst p n peer_descr actions suppress_for remote_location.country_code remote_location.region remote_location.city remote_location.latitude remote_location.longitude +#types time string addr port addr port string string string enum enum string string addr addr port count string set[enum] interval string string string double double +964953086.310131 - - - - - - - - - CaptureLoss::Too_Little_Traffic The worker only observed 0 ACKs and was expecting at least 1. - - - - - - Notice::ACTION_LOG 3600.000000 - - - - - +#close 2020-10-08-16-33-05 diff --git a/testing/btest/scripts/policy/misc/capture-loss.zeek b/testing/btest/scripts/policy/misc/capture-loss.zeek new file mode 100644 index 0000000000..d51576a5b4 --- /dev/null +++ b/testing/btest/scripts/policy/misc/capture-loss.zeek @@ -0,0 +1,12 @@ +# @TEST-EXEC: zeek -b -r $TRACES/dns53.pcap %INPUT +# @TEST-EXEC: btest-diff capture_loss.log +# @TEST-EXEC: btest-diff notice.log + +@load misc/capture-loss + +module CaptureLoss; + +event zeek_init() + { + event take_measurement(network_time(), 0, 0); + } From 4d998742e2c70e7c583f3bb5424f04cf6bae0750 Mon Sep 17 00:00:00 2001 From: Vlad Grigorescu Date: Fri, 9 Oct 2020 14:43:04 -0500 Subject: [PATCH 6/6] Fix scheduling due to network_time being 0 in zeek_init Co-authored-by: Jon Siwek --- scripts/policy/misc/capture-loss.zeek | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/policy/misc/capture-loss.zeek b/scripts/policy/misc/capture-loss.zeek index e836dc31ab..645bbb51b4 100644 --- a/scripts/policy/misc/capture-loss.zeek +++ b/scripts/policy/misc/capture-loss.zeek @@ -66,7 +66,7 @@ event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: { if ( last_ts == 0 ) { - schedule watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; + schedule initial_watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; return; }