From a4ceb98bf803c35fd92e84f10991c555d7ceb85a Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Wed, 10 Mar 2021 14:39:03 +0000 Subject: [PATCH 1/3] Switch the TSV Zeek logs to be UTF8 by default. There is a paired zeek-testing branch for some updates there. --- NEWS | 4 ++++ scripts/base/frameworks/logging/writers/ascii.zeek | 2 +- .../Baseline/scripts.base.files.unified2.alert/unified2.log | 4 ++-- .../scripts/base/frameworks/logging/ascii-escape-binary.zeek | 2 ++ testing/external/commit-hash.zeek-testing | 2 +- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 2c4ec1f2c5..7c010db247 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,10 @@ New Functionality Changed Functionality --------------------- +- The traditional TSV Zeek logs are now valid UTF8 by default. It's possible + to revert to the previous behavior by setting ``LogAscii::enable_utf_8`` to + false. + Removed Functionality --------------------- diff --git a/scripts/base/frameworks/logging/writers/ascii.zeek b/scripts/base/frameworks/logging/writers/ascii.zeek index 61cf3f59c9..ce37a32537 100644 --- a/scripts/base/frameworks/logging/writers/ascii.zeek +++ b/scripts/base/frameworks/logging/writers/ascii.zeek @@ -38,7 +38,7 @@ export { ## written into logs. ## ## This option is also available as a per-filter ``$config`` option. - const enable_utf_8 = F &redef; + const enable_utf_8 = T &redef; ## Define the gzip level to compress the logs. If 0, then no gzip ## compression is performed. Enabling compression also changes diff --git a/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log b/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log index 6b99c4356d..17516f3b15 100644 --- a/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log +++ b/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log @@ -7,6 +7,6 @@ #open XXXX-XX-XX-XX-XX-XX #fields ts id.src_ip id.src_p id.dst_ip id.dst_p sensor_id signature_id signature generator_id generator signature_revision classification_id classification priority_id event_id packet #types time addr port addr port count count string count string count count string count count string -XXXXXXXXXX.XXXXXX 192.168.1.72 50185 74.125.225.49 80 0 2003058 ET MALWARE 180solutions (Zango) Spyware Installer Download 1 snort general alert 5 21 trojan-activity 1 2 \xd80bH\xc5\xb5x\xca9\xb7\xe4r\x08\x00E\x10\x00\\\x1a\xce@\x00@\x062\x1f\xc0\xa8\x01HJ}\xe11\xc4\x09\x00P*\xa8bv]z/\xde\x80\x18\x82+\x88,\x00\x00\x01\x01\x08\x0a\x17J\x83Q\xfe\xad\xac\x1aGET /Zango/ZangoInstaller.exe HTTP/1.0\x0d\x0a -XXXXXXXXXX.XXXXXX 192.168.1.72 49862 199.47.216.144 80 0 2012647 ET POLICY Dropbox.com Offsite File Backup in Use 1 snort general alert 3 33 policy-violation 1 3 \xd80bH\xc5\xb5x\xca9\xb7\xe4r\x08\x00E\x00\x00\xf8Q\xdf@\x00@\x06\x86p\xc0\xa8\x01H\xc7/\xd8\x90\xc2\xc6\x00P\x9cm\x97U\xf07\x084\x80\x18\x82\x18%<\x00\x00\x01\x01\x08\x0a\x17J\xd7\xde\x00\x92\x81\xc5GET /subscribe?host_int=43112345&ns_map=123456_1234524412104916591&ts=1323827344 HTTP/1.1\x0d\x0aHost: notify1.dropbox.com\x0d\x0aAccept-Encoding: identity\x0d\x0aConnection: keep-alive\x0d\x0aX-Dropbox-Locale: en_US\x0d\x0a\x0d\x0a +XXXXXXXXXX.XXXXXX 192.168.1.72 50185 74.125.225.49 80 0 2003058 ET MALWARE 180solutions (Zango) Spyware Installer Download 1 snort general alert 5 21 trojan-activity 1 2 \xd80bHŵx\xca9\xb7\xe4r\x08\x00E\x10\x00\\\x1a\xce@\x00@\x062\x1f\xc0\xa8\x01HJ}\xe11\xc4\x09\x00P*\xa8bv]z/ހ\x18\x82+\x88,\x00\x00\x01\x01\x08\x0a\x17J\x83Q\xfe\xad\xac\x1aGET /Zango/ZangoInstaller.exe HTTP/1.0\x0d\x0a +XXXXXXXXXX.XXXXXX 192.168.1.72 49862 199.47.216.144 80 0 2012647 ET POLICY Dropbox.com Offsite File Backup in Use 1 snort general alert 3 33 policy-violation 1 3 \xd80bHŵx\xca9\xb7\xe4r\x08\x00E\x00\x00\xf8Q\xdf@\x00@\x06\x86p\xc0\xa8\x01H\xc7/ؐ\xc2\xc6\x00P\x9cm\x97U\xf07\x084\x80\x18\x82\x18%<\x00\x00\x01\x01\x08\x0a\x17J\xd7\xde\x00\x92\x81\xc5GET /subscribe?host_int=43112345&ns_map=123456_1234524412104916591&ts=1323827344 HTTP/1.1\x0d\x0aHost: notify1.dropbox.com\x0d\x0aAccept-Encoding: identity\x0d\x0aConnection: keep-alive\x0d\x0aX-Dropbox-Locale: en_US\x0d\x0a\x0d\x0a #close XXXX-XX-XX-XX-XX-XX diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-escape-binary.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-escape-binary.zeek index 5535f83276..ef8461abfd 100644 --- a/testing/btest/scripts/base/frameworks/logging/ascii-escape-binary.zeek +++ b/testing/btest/scripts/base/frameworks/logging/ascii-escape-binary.zeek @@ -2,6 +2,8 @@ # @TEST-EXEC: btest-diff test.log # @TEST-EXEC: btest-diff output +redef LogAscii::enable_utf_8 = F; + module Test; export { diff --git a/testing/external/commit-hash.zeek-testing b/testing/external/commit-hash.zeek-testing index 5f05400db4..b14748e754 100644 --- a/testing/external/commit-hash.zeek-testing +++ b/testing/external/commit-hash.zeek-testing @@ -1 +1 @@ -6816110d6af23562c837298ef8b65cb8a9d5b73a +465b0bb55451934b211dda72ad388496b3a2d1d7 From 404fed69231e5bbbc6972f8b991f809aff50228b Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Mon, 30 Aug 2021 16:14:49 -0700 Subject: [PATCH 2/3] Use json_escape_utf8 for all utf8 data in ODesc --- src/Desc.cc | 57 +++++++------------ src/util.cc | 12 ++-- src/util.h | 11 +++- .../unified2.log | 4 +- testing/external/commit-hash.zeek-testing | 2 +- .../external/commit-hash.zeek-testing-private | 2 +- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/Desc.cc b/src/Desc.cc index 0def8cd8aa..b77a150cc3 100644 --- a/src/Desc.cc +++ b/src/Desc.cc @@ -251,11 +251,8 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end) if ( escape_sequences.empty() ) return 0; - escape_set::const_iterator it; - - for ( it = escape_sequences.begin(); it != escape_sequences.end(); ++it ) + for ( const auto& esc_str : escape_sequences ) { - const std::string& esc_str = *it; size_t esc_len = esc_str.length(); if ( start + esc_len > end ) @@ -289,33 +286,9 @@ std::pair ODesc::FirstEscapeLoc(const char* bytes, size_t n if ( len ) return escape_pos(bytes + i, len); - - if ( ! printable && utf8 ) - { - size_t utf_found = getNumBytesForUTF8(bytes[i]); - - if ( utf_found == 1 ) - return escape_pos(bytes + i, 1); - - if ( i + utf_found > n ) - // Don't know if this is even meant to be a utf8 encoding, - // since there's not enough bytes left to check it's a valid - // sequence, so maybe safest to just move up by one instead - // of escaping the entire remainder. - return escape_pos(bytes + i, 1); - - if ( isLegalUTF8Sequence(reinterpret_cast(bytes + i), - reinterpret_cast(bytes + i + utf_found)) ) - { - i += utf_found - 1; - continue; - } - - return escape_pos(bytes + i, 1); - } } - return escape_pos(0, 0); + return escape_pos(nullptr, 0); } void ODesc::AddBytes(const void* bytes, unsigned int n) @@ -331,17 +304,31 @@ void ODesc::AddBytes(const void* bytes, unsigned int n) while ( s < e ) { - std::pair p = FirstEscapeLoc(s, e - s); + auto [ esc_start, esc_len ] = FirstEscapeLoc(s, e - s); - if ( p.first ) + if ( esc_start != nullptr ) { - AddBytesRaw(s, p.first - s); - util::get_escaped_string(this, p.first, p.second, true); - s = p.first + p.second; + if ( utf8 ) + { + std::string result = util::json_escape_utf8(s, esc_start - s, false); + AddBytesRaw(result.c_str(), result.size()); + } + else + AddBytesRaw(s, esc_start - s); + + util::get_escaped_string(this, esc_start, esc_len, true); + s = esc_start + esc_len; } else { - AddBytesRaw(s, e - s); + if ( utf8 ) + { + std::string result = util::json_escape_utf8(s, e - s, false); + AddBytesRaw(result.c_str(), result.size()); + } + else + AddBytesRaw(s, e - s); + break; } } diff --git a/src/util.cc b/src/util.cc index 9a2b83c7bf..c9023a1a88 100644 --- a/src/util.cc +++ b/src/util.cc @@ -2439,10 +2439,14 @@ static bool check_ok_utf8(const unsigned char* start, const unsigned char* end) return true; } -string json_escape_utf8(const string& val) +string json_escape_utf8(const string& val, bool escape_printable_controls) { - auto val_data = reinterpret_cast(val.c_str()); - auto val_size = val.length(); + return json_escape_utf8(val.c_str(), val.size(), escape_printable_controls); + } + +string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls) + { + auto val_data = reinterpret_cast(val); // Reserve at least the size of the existing string to avoid resizing the string in the best-case // scenario where we don't have any multi-byte characters. We keep two versions of this string: @@ -2464,7 +2468,7 @@ string json_escape_utf8(const string& val) // Normal ASCII characters plus a few of the control characters can be inserted directly. The // rest of the control characters should be escaped as regular bytes. if ( ( ch >= 32 && ch < 127 ) || - ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' ) + ( escape_printable_controls && ( ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' ) ) ) { if ( ! found_bad ) utf_result.push_back(ch); diff --git a/src/util.h b/src/util.h index 4da4a10735..d7b61a5a9d 100644 --- a/src/util.h +++ b/src/util.h @@ -550,7 +550,16 @@ char* zeekenv(const char* name); * @param val the input string to be escaped * @return the escaped string */ -std::string json_escape_utf8(const std::string& val); +std::string json_escape_utf8(const std::string& val, bool escape_printable_controls=true); + +/** + * Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used + * by the JSON writer and BIF methods. + * @param val the character data to be escaped + * @param val_size the length of the character data + * @return the escaped string + */ +std::string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls=true); } // namespace util } // namespace zeek diff --git a/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log b/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log index 17516f3b15..6b99c4356d 100644 --- a/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log +++ b/testing/btest/Baseline/scripts.base.files.unified2.alert/unified2.log @@ -7,6 +7,6 @@ #open XXXX-XX-XX-XX-XX-XX #fields ts id.src_ip id.src_p id.dst_ip id.dst_p sensor_id signature_id signature generator_id generator signature_revision classification_id classification priority_id event_id packet #types time addr port addr port count count string count string count count string count count string -XXXXXXXXXX.XXXXXX 192.168.1.72 50185 74.125.225.49 80 0 2003058 ET MALWARE 180solutions (Zango) Spyware Installer Download 1 snort general alert 5 21 trojan-activity 1 2 \xd80bHŵx\xca9\xb7\xe4r\x08\x00E\x10\x00\\\x1a\xce@\x00@\x062\x1f\xc0\xa8\x01HJ}\xe11\xc4\x09\x00P*\xa8bv]z/ހ\x18\x82+\x88,\x00\x00\x01\x01\x08\x0a\x17J\x83Q\xfe\xad\xac\x1aGET /Zango/ZangoInstaller.exe HTTP/1.0\x0d\x0a -XXXXXXXXXX.XXXXXX 192.168.1.72 49862 199.47.216.144 80 0 2012647 ET POLICY Dropbox.com Offsite File Backup in Use 1 snort general alert 3 33 policy-violation 1 3 \xd80bHŵx\xca9\xb7\xe4r\x08\x00E\x00\x00\xf8Q\xdf@\x00@\x06\x86p\xc0\xa8\x01H\xc7/ؐ\xc2\xc6\x00P\x9cm\x97U\xf07\x084\x80\x18\x82\x18%<\x00\x00\x01\x01\x08\x0a\x17J\xd7\xde\x00\x92\x81\xc5GET /subscribe?host_int=43112345&ns_map=123456_1234524412104916591&ts=1323827344 HTTP/1.1\x0d\x0aHost: notify1.dropbox.com\x0d\x0aAccept-Encoding: identity\x0d\x0aConnection: keep-alive\x0d\x0aX-Dropbox-Locale: en_US\x0d\x0a\x0d\x0a +XXXXXXXXXX.XXXXXX 192.168.1.72 50185 74.125.225.49 80 0 2003058 ET MALWARE 180solutions (Zango) Spyware Installer Download 1 snort general alert 5 21 trojan-activity 1 2 \xd80bH\xc5\xb5x\xca9\xb7\xe4r\x08\x00E\x10\x00\\\x1a\xce@\x00@\x062\x1f\xc0\xa8\x01HJ}\xe11\xc4\x09\x00P*\xa8bv]z/\xde\x80\x18\x82+\x88,\x00\x00\x01\x01\x08\x0a\x17J\x83Q\xfe\xad\xac\x1aGET /Zango/ZangoInstaller.exe HTTP/1.0\x0d\x0a +XXXXXXXXXX.XXXXXX 192.168.1.72 49862 199.47.216.144 80 0 2012647 ET POLICY Dropbox.com Offsite File Backup in Use 1 snort general alert 3 33 policy-violation 1 3 \xd80bH\xc5\xb5x\xca9\xb7\xe4r\x08\x00E\x00\x00\xf8Q\xdf@\x00@\x06\x86p\xc0\xa8\x01H\xc7/\xd8\x90\xc2\xc6\x00P\x9cm\x97U\xf07\x084\x80\x18\x82\x18%<\x00\x00\x01\x01\x08\x0a\x17J\xd7\xde\x00\x92\x81\xc5GET /subscribe?host_int=43112345&ns_map=123456_1234524412104916591&ts=1323827344 HTTP/1.1\x0d\x0aHost: notify1.dropbox.com\x0d\x0aAccept-Encoding: identity\x0d\x0aConnection: keep-alive\x0d\x0aX-Dropbox-Locale: en_US\x0d\x0a\x0d\x0a #close XXXX-XX-XX-XX-XX-XX diff --git a/testing/external/commit-hash.zeek-testing b/testing/external/commit-hash.zeek-testing index b14748e754..18da8a263e 100644 --- a/testing/external/commit-hash.zeek-testing +++ b/testing/external/commit-hash.zeek-testing @@ -1 +1 @@ -465b0bb55451934b211dda72ad388496b3a2d1d7 +fe19edc8e08e1f4cc41a5166ee7b7bf1b6e71bc5 diff --git a/testing/external/commit-hash.zeek-testing-private b/testing/external/commit-hash.zeek-testing-private index c2bef97a74..4b84c0db8f 100644 --- a/testing/external/commit-hash.zeek-testing-private +++ b/testing/external/commit-hash.zeek-testing-private @@ -1 +1 @@ -4b88837c49ade5d9fd980d5e6cf02ec91d19a3bb +9e15189e3d4356e98bb1b155da282de2375ac80f From 58cb9163d17c518cb27698ef2ea8497ec6ff0278 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Fri, 3 Sep 2021 23:08:15 +0000 Subject: [PATCH 3/3] Fix mis-usage of string::append that leads to an overflow --- src/util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.cc b/src/util.cc index c9023a1a88..7121777446 100644 --- a/src/util.cc +++ b/src/util.cc @@ -2508,7 +2508,7 @@ string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_ { for ( unsigned int i = 0; i < char_size; i++ ) escaped_result.append(json_escape_byte(val[idx+i])); - utf_result.append(val, idx, char_size); + utf_result.append(val+idx, char_size); idx += char_size; } }