diff --git a/CHANGES b/CHANGES index d2b1aedf7f..8ee3c16f8e 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,14 @@ +3.3.0-dev.388 | 2020-10-12 17:02:20 -0700 + + * Add CaptureLoss::Too_Little_Traffic notice (Vlad Grigorescu) + + * Add CaptureLoss::initial_watch_interval for a quick read on cluster health after startup. (Vlad Grigorescu) + + * Improve capture-loss.zeek documentation. (Vlad Grigorescu) + + * Fix whitespace in capture-loss.zeek (Vlad Grigorescu) + 3.3.0-dev.381 | 2020-10-12 11:15:29 -0700 * GH-779: Add "udp-state" signature condition (Jon Siwek, Corelight) diff --git a/NEWS b/NEWS index 986f6b0736..457166c6fd 100644 --- a/NEWS +++ b/NEWS @@ -37,6 +37,23 @@ New Functionality - Added a ``udp-state`` signature condition to enforce matching against either "originator" or "responder" flow direction of UDP packets. +- Improvements to capture-loss.zeek: + + - A new option, ``CaptureLoss::initial_watch_interval``. When restarting a + Zeek cluster, one usually wants some immediate feedback as to the health of + the monitoring via capture loss. However, you previously needed to wait a + full ``CaptureLoss::watch_interval``, which defaults to 15 minutes. The + new option specifies the interval for the first-time report. So the new + default behavior provides stats after 1 minute and then every + 15 minutes afterward. + + - A new notice type, ``CaptureLoss::Too_Little_Traffic``. + If a Zeek process sees less than ``CaptureLoss::minimum_acks`` ACKs in a + given interval, this notice gets raised. This can be a useful diagnostic + if, for whatever reason, a Zeek process stops seeing traffic, but + capture-loss.zeek would have previously only reported that "0 gaps and 0 + ACKs is 0% loss". 
+ Changed Functionality --------------------- diff --git a/VERSION b/VERSION index 674616d996..85691726f1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3.0-dev.381 +3.3.0-dev.388 diff --git a/scripts/policy/misc/capture-loss.zeek b/scripts/policy/misc/capture-loss.zeek index 0b827db544..3f53de3e2a 100644 --- a/scripts/policy/misc/capture-loss.zeek +++ b/scripts/policy/misc/capture-loss.zeek @@ -1,10 +1,10 @@ ##! This script logs evidence regarding the degree to which the packet -##! capture process suffers from measurement loss. -##! The loss could be due to overload on the host or NIC performing -##! the packet capture or it could even be beyond the host. If you are -##! capturing from a switch with a SPAN port, it's very possible that +##! capture process suffers from measurement loss. +##! The loss could be due to overload on the host or NIC performing +##! the packet capture or it could even be beyond the host. If you are +##! capturing from a switch with a SPAN port, it's very possible that ##! the switch itself could be overloaded and dropping packets. -##! Reported loss is computed in terms of the number of "gap events" (ACKs +##! Reported loss is computed in terms of the number of "gap events" (ACKs ##! for a sequence number that's above a gap). @load base/frameworks/notice @@ -13,15 +13,18 @@ module CaptureLoss; export { redef enum Log::ID += { LOG }; - + global log_policy: Log::PolicyHook; redef enum Notice::Type += { ## Report if the detected capture loss exceeds the percentage - ## threshold. - Too_Much_Loss + ## threshold defined in :zeek:id:`CaptureLoss::too_much_loss`. + Too_Much_Loss, + ## Report if the traffic seen by a peer within a given watch + ## interval is less than :zeek:id:`CaptureLoss::minimum_acks`. + Too_Little_Traffic, }; - + type Info: record { ## Timestamp for when the measurement occurred. ts: time &log; @@ -38,25 +41,35 @@ export { ## Percentage of ACKs seen where the data being ACKed wasn't seen. 
percent_lost: double &log; }; - - ## The interval at which capture loss reports are created. + + ## The interval at which capture loss reports are created in a + ## running cluster (that is, after the first report). option watch_interval = 15mins; - - ## The percentage of missed data that is considered "too much" + + ## For faster feedback on cluster health, the first capture loss + ## report is generated this many minutes after startup. + option initial_watch_interval = 1mins; + + ## The percentage of missed data that is considered "too much" ## when the :zeek:enum:`CaptureLoss::Too_Much_Loss` notice should be ## generated. The value is expressed as a double between 0 and 1 with 1 ## being 100%. option too_much_loss: double = 0.1; + + ## The minimum number of ACKs expected for a single peer in a + ## watch interval. If the number seen is less than this, + ## :zeek:enum:`CaptureLoss::Too_Little_Traffic` is raised. + option minimum_acks: count = 1; } event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: count) { if ( last_ts == 0 ) { - schedule watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; + schedule initial_watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; return; } - + local now = network_time(); local g = get_gap_stats(); local acks = g$ack_events - last_acks; @@ -65,13 +78,17 @@ event CaptureLoss::take_measurement(last_ts: time, last_acks: count, last_gaps: local info: Info = [$ts=now, $ts_delta=now-last_ts, $peer=peer_description, - $acks=acks, $gaps=gaps, + $acks=acks, $gaps=gaps, $percent_lost=pct_lost]; - + if ( pct_lost >= too_much_loss*100 ) - NOTICE([$note=Too_Much_Loss, + NOTICE([$note=Too_Much_Loss, $msg=fmt("The capture loss script detected an estimated loss rate above %.3f%%", pct_lost)]); - + + if ( acks < minimum_acks ) + NOTICE([$note=Too_Little_Traffic, + $msg=fmt("Only observed %d TCP ACKs and was expecting at least %d.", acks, minimum_acks)]); + Log::write(LOG, info); 
schedule watch_interval { CaptureLoss::take_measurement(now, g$ack_events, g$gap_events) }; } @@ -82,5 +99,5 @@ event zeek_init() &priority=5 # We only schedule the event if we are capturing packets. if ( reading_live_traffic() || reading_traces() ) - schedule watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; + schedule initial_watch_interval { CaptureLoss::take_measurement(network_time(), 0, 0) }; } diff --git a/testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log b/testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log new file mode 100644 index 0000000000..ef5dca2f4b --- /dev/null +++ b/testing/btest/Baseline/scripts.policy.misc.capture-loss/capture_loss.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path capture_loss +#open 2020-10-08-16-33-05 +#fields ts ts_delta peer gaps acks percent_lost +#types time interval string count count double +964953086.310131 0.000000 zeek 0 0 0.0 +#close 2020-10-08-16-33-05 diff --git a/testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log b/testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log new file mode 100644 index 0000000000..c5fd517179 --- /dev/null +++ b/testing/btest/Baseline/scripts.policy.misc.capture-loss/notice.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path notice +#open 2020-10-12-23-36-17 +#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p fuid file_mime_type file_desc proto note msg sub src dst p n peer_descr actions suppress_for remote_location.country_code remote_location.region remote_location.city remote_location.latitude remote_location.longitude +#types time string addr port addr port string string string enum enum string string addr addr port count string set[enum] interval string string string double double +964953086.310131 - - - - - - - - - CaptureLoss::Too_Little_Traffic Only observed 0 TCP ACKs and was expecting 
at least 1. - - - - - - Notice::ACTION_LOG 3600.000000 - - - - - +#close 2020-10-12-23-36-17 diff --git a/testing/btest/scripts/policy/misc/capture-loss.zeek b/testing/btest/scripts/policy/misc/capture-loss.zeek new file mode 100644 index 0000000000..d51576a5b4 --- /dev/null +++ b/testing/btest/scripts/policy/misc/capture-loss.zeek @@ -0,0 +1,12 @@ +# @TEST-EXEC: zeek -b -r $TRACES/dns53.pcap %INPUT +# @TEST-EXEC: btest-diff capture_loss.log +# @TEST-EXEC: btest-diff notice.log + +@load misc/capture-loss + +module CaptureLoss; + +event zeek_init() + { + event take_measurement(network_time(), 0, 0); + } diff --git a/testing/external/commit-hash.zeek-testing b/testing/external/commit-hash.zeek-testing index 403f9bd9e2..142bb2075d 100644 --- a/testing/external/commit-hash.zeek-testing +++ b/testing/external/commit-hash.zeek-testing @@ -1 +1 @@ -1386fa03e0b84be1491749502d3d3cb9d45a2b95 +e9e9363814c592a4b0557f70bd7d95e3a5573d8f diff --git a/testing/external/commit-hash.zeek-testing-private b/testing/external/commit-hash.zeek-testing-private index 2e89811d14..038bf923a4 100644 --- a/testing/external/commit-hash.zeek-testing-private +++ b/testing/external/commit-hash.zeek-testing-private @@ -1 +1 @@ -e9e3249a9fe5a407ada6de61eeeb4faf1a928ec4 +ca98da7a376b8c6b3fb1c3dc2e415030f6b876bf