Merge remote-tracking branch 'origin/topic/seth/tsv-logs-utf8-by-default'

* origin/topic/seth/tsv-logs-utf8-by-default:
  Fix mis-usage of string::append that leads to an overflow
  Use json_escape_utf8 for all utf8 data in ODesc
  Switch the TSV Zeek logs to be UTF8 by default.
Tim Wojtulewicz 2021-09-08 12:03:51 -07:00
commit b7e264f8ef
10 changed files with 61 additions and 45 deletions

CHANGES

@@ -1,3 +1,13 @@
+4.2.0-dev.154 | 2021-09-08 12:03:51 -0700
+
+  * Fix mis-usage of string::append that leads to an overflow (Tim Wojtulewicz, Corelight)
+
+  * Use json_escape_utf8 for all utf8 data in ODesc (Tim Wojtulewicz, Corelight)
+
+  * Switch the TSV Zeek logs to be UTF8 by default. (Seth Hall, Corelight)
+
+    There is a paired zeek-testing branch for some updates there.
+
 4.2.0-dev.150 | 2021-09-08 11:44:15 -0700
 
   * fix race condition in btest output ordering (Vern Paxson, Corelight)

NEWS

@@ -12,6 +12,10 @@ New Functionality
 Changed Functionality
 ---------------------
 
+- The traditional TSV Zeek logs are now valid UTF8 by default. It's possible
+  to revert to the previous behavior by setting ``LogAscii::enable_utf_8`` to
+  false.
+
 Removed Functionality
 ---------------------
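For reference, reverting to the previous escaping behavior is a one-line redef at the Zeek script level; this is the same toggle the updated test later in this commit uses:

    # Revert TSV logs to the previous (non-UTF-8) escaping behavior.
    redef LogAscii::enable_utf_8 = F;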


@@ -1 +1 @@
-4.2.0-dev.150
+4.2.0-dev.154


@@ -38,7 +38,7 @@ export {
    ## written into logs.
    ##
    ## This option is also available as a per-filter ``$config`` option.
-   const enable_utf_8 = F &redef;
+   const enable_utf_8 = T &redef;
 
    ## Define the gzip level to compress the logs. If 0, then no gzip
    ## compression is performed. Enabling compression also changes
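The doc comment above also mentions a per-filter ``$config`` override. A minimal sketch of how that could look, assuming the per-filter key matches the option name (``enable_utf_8``) and using ``Conn::LOG`` purely as an example stream:

    event zeek_init()
        {
        # Hypothetical filter: write an extra conn log with the old,
        # non-UTF-8 escaping, overriding the option only for this filter.
        Log::add_filter(Conn::LOG, [$name="conn-no-utf8", $path="conn-no-utf8",
                                    $config=table(["enable_utf_8"] = "F")]);
        }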


@@ -251,11 +251,8 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end)
     if ( escape_sequences.empty() )
         return 0;
 
-    escape_set::const_iterator it;
-
-    for ( it = escape_sequences.begin(); it != escape_sequences.end(); ++it )
+    for ( const auto& esc_str : escape_sequences )
         {
-        const std::string& esc_str = *it;
         size_t esc_len = esc_str.length();
 
         if ( start + esc_len > end )
@@ -289,33 +286,9 @@ std::pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n
         if ( len )
             return escape_pos(bytes + i, len);
 
-        if ( ! printable && utf8 )
-            {
-            size_t utf_found = getNumBytesForUTF8(bytes[i]);
-
-            if ( utf_found == 1 )
-                return escape_pos(bytes + i, 1);
-
-            if ( i + utf_found > n )
-                // Don't know if this is even meant to be a utf8 encoding,
-                // since there's not enough bytes left to check it's a valid
-                // sequence, so maybe safest to just move up by one instead
-                // of escaping the entire remainder.
-                return escape_pos(bytes + i, 1);
-
-            if ( isLegalUTF8Sequence(reinterpret_cast<const unsigned char *>(bytes + i),
-                                     reinterpret_cast<const unsigned char *>(bytes + i + utf_found)) )
-                {
-                i += utf_found - 1;
-                continue;
-                }
-
-            return escape_pos(bytes + i, 1);
-            }
         }
 
-    return escape_pos(0, 0);
+    return escape_pos(nullptr, 0);
     }
 
 void ODesc::AddBytes(const void* bytes, unsigned int n)
@@ -331,17 +304,31 @@ void ODesc::AddBytes(const void* bytes, unsigned int n)
     while ( s < e )
         {
-        std::pair<const char*, size_t> p = FirstEscapeLoc(s, e - s);
+        auto [ esc_start, esc_len ] = FirstEscapeLoc(s, e - s);
 
-        if ( p.first )
+        if ( esc_start != nullptr )
             {
-            AddBytesRaw(s, p.first - s);
-            util::get_escaped_string(this, p.first, p.second, true);
-            s = p.first + p.second;
+            if ( utf8 )
+                {
+                std::string result = util::json_escape_utf8(s, esc_start - s, false);
+                AddBytesRaw(result.c_str(), result.size());
+                }
+            else
+                AddBytesRaw(s, esc_start - s);
+
+            util::get_escaped_string(this, esc_start, esc_len, true);
+            s = esc_start + esc_len;
             }
         else
             {
-            AddBytesRaw(s, e - s);
+            if ( utf8 )
+                {
+                std::string result = util::json_escape_utf8(s, e - s, false);
+                AddBytesRaw(result.c_str(), result.size());
+                }
+            else
+                AddBytesRaw(s, e - s);
+
             break;
             }
         }


@@ -2439,10 +2439,14 @@ static bool check_ok_utf8(const unsigned char* start, const unsigned char* end)
     return true;
     }
 
-string json_escape_utf8(const string& val)
+string json_escape_utf8(const string& val, bool escape_printable_controls)
     {
-    auto val_data = reinterpret_cast<const unsigned char*>(val.c_str());
-    auto val_size = val.length();
+    return json_escape_utf8(val.c_str(), val.size(), escape_printable_controls);
+    }
+
+string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls)
+    {
+    auto val_data = reinterpret_cast<const unsigned char*>(val);
 
     // Reserve at least the size of the existing string to avoid resizing the string in the best-case
     // scenario where we don't have any multi-byte characters. We keep two versions of this string:
@@ -2464,7 +2468,7 @@ string json_escape_utf8(const string& val)
     // Normal ASCII characters plus a few of the control characters can be inserted directly. The
     // rest of the control characters should be escaped as regular bytes.
     if ( ( ch >= 32 && ch < 127 ) ||
-         ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' )
+         ( escape_printable_controls && ( ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' ) ) )
         {
         if ( ! found_bad )
             utf_result.push_back(ch);
@@ -2504,7 +2508,7 @@ string json_escape_utf8(const string& val)
             {
             for ( unsigned int i = 0; i < char_size; i++ )
                 escaped_result.append(json_escape_byte(val[idx+i]));
-            utf_result.append(val, idx, char_size);
+            utf_result.append(val+idx, char_size);
             idx += char_size;
             }
         }


@@ -550,7 +550,16 @@ char* zeekenv(const char* name);
  * @param val the input string to be escaped
  * @return the escaped string
  */
-std::string json_escape_utf8(const std::string& val);
+std::string json_escape_utf8(const std::string& val, bool escape_printable_controls=true);
+
+/**
+ * Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used
+ * by the JSON writer and BIF methods.
+ * @param val the character data to be escaped
+ * @param val_size the length of the character data
+ * @return the escaped string
+ */
+std::string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls=true);
 
 } // namespace util
 } // namespace zeek


@@ -2,6 +2,8 @@
 # @TEST-EXEC: btest-diff test.log
 # @TEST-EXEC: btest-diff output
 
+redef LogAscii::enable_utf_8 = F;
+
 module Test;
 
 export {


@@ -1 +1 @@
-6816110d6af23562c837298ef8b65cb8a9d5b73a
+5b2a6d78f789f1271b68123875ef66eaaba6f3e8


@@ -1 +1 @@
-4b88837c49ade5d9fd980d5e6cf02ec91d19a3bb
+44cc696ed070bf7569848437ab1368d557ace4e5