Merge remote-tracking branch 'origin/topic/timw/1581-utf8-escaping'

* origin/topic/timw/1581-utf8-escaping: Return fully-escaped string if utf8 conversion fails
2025-10-02 14:48:21 +00:00 · 2021-08-19 09:39:23 -07:00 · 2021-08-19 09:39:23 -07:00 · fe22d7cbb4
commit fe22d7cbb4
parent b28fc77054 f442893c98
4 changed files with 93 additions and 30 deletions
--- a/8
+++ b/8
@ -1,3 +1,11 @@
 4.2.0-dev.78 | 2021-08-19 09:39:23 -0700
  * Return fully-escaped string if utf8 conversion fails (Tim Wojtulewicz, Corelight)
    This adds a new function for validating UTF-8 sequences by converting to
    UTF-32. This allows us to also check for various blocks of codepoints
    that we consider invalid while checking for valid sequences in general.
 4.2.0-dev.76 | 2021-08-18 08:40:41 -0700
  * Fix option length computation in Geneve analyzer. (Benjamin Bannier, Corelight)
--- a/2
+++ b/2
@ -1 +1 @@
-4.2.0-dev.76
+4.2.0-dev.78
--- a/src/util.cc
+++ b/src/util.cc
@ -2352,7 +2352,7 @@ TEST_CASE("util json_escape_utf8")
 	CHECK(json_escape_utf8("string") == "string");
 	CHECK(json_escape_utf8("string\n") == "string\n");
 	CHECK(json_escape_utf8("string\x82") == "string\\x82");
-	CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07Էo");
+	CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07\\xd4\\xb7o");
 	// These strings are duplicated from the scripts.base.frameworks.logging.ascii-json-utf8 btest
@ -2406,6 +2406,38 @@ TEST_CASE("util json_escape_utf8")
 	// Invalid 4 Octet Sequence (too short)
 	CHECK(json_escape_utf8("\xf4\x80\x8c") == "\\xf4\\x80\\x8c");
 	CHECK(json_escape_utf8("\xf0") == "\\xf0");
 	// Private Use Area (E000-F8FF) are always invalid
 	CHECK(json_escape_utf8("\xee\x8b\xa0") == "\\xee\\x8b\\xa0");
 	// Valid UTF-8 character followed by an invalid one
 	CHECK(json_escape_utf8("\xc3\xb1\xc0\x81") == "\\xc3\\xb1\\xc0\\x81");
 	}
 static bool check_ok_utf8(const unsigned char* start, const unsigned char* end)
 	{
 	// There's certain blocks of UTF-8 that we don't want, but the easiest way to find
 	// them is to convert to UTF-32 and then compare. This is annoying, but it also calls
 	// isLegalUTF8Sequence along the way so go with it.
 	std::array<UTF32, 2> output;
 	UTF32* output2 = output.data();
 	auto result = ConvertUTF8toUTF32(&start, end, &output2, output2+1, strictConversion);
 	if ( result != conversionOK )
 		return false;
 	if ( ( output[0] >= 0x0000 && output[0] <= 0x001F ) ||
 	     ( output[0] == 0x007F ) ||
 	     ( output[0] >= 0x0080 && output[0] <= 0x009F ) )
 		// Control characters
 		return false;
 	else if ( output[0] >= 0xE000 && output[0] <= 0xF8FF )
 		// Private Use Area
 		return false;
 	else if ( output[0] >= 0xFFF0 && output[0] <= 0xFFFF )
 		// Specials Characters
 		return false;
 	return true;
 	}
 string json_escape_utf8(const string& val)
@ -2414,52 +2446,75 @@ string json_escape_utf8(const string& val)
 	auto val_size = val.length();
 	// Reserve at least the size of the existing string to avoid resizing the string in the best-case
-	// scenario where we don't have any multi-byte characters.
+	// scenario where we don't have any multi-byte characters. We keep two versions of this string:
-	string result;
+	// one that has a valid utf8 string and one that has a fully-escaped version. The utf8 string gets
-	result.reserve(val_size);
+	// returned if all of the characters were valid utf8 sequences, but it will fall back to the
 	// escaped version otherwise. This uses slightly more memory but it avoids looping through all
 	// of the characters a second time in the case of a bad utf8 sequence.
 	string utf_result;
 	utf_result.reserve(val_size);
 	string escaped_result;
 	escaped_result.reserve(val_size);
-	size_t idx;
+	bool found_bad = false;
-	for ( idx = 0; idx < val_size; )
+	size_t idx = 0;
 	while ( idx < val_size )
 		{
 		const char ch = val[idx];
 		// Normal ASCII characters plus a few of the control characters can be inserted directly. The
 		// rest of the control characters should be escaped as regular bytes.
-		if ( ( ch >= 32 && ch <= 127 ) ||
+		if ( ( ch >= 32 && ch < 127 ) ||
 		       ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' )
 			{
-			result.push_back(ch);
+			if ( ! found_bad )
 				utf_result.push_back(ch);
 			escaped_result.push_back(ch);
 			++idx;
 			continue;
 			}
-		else if ( ch >= 0 && ch < 32 )
+		else if ( found_bad )
 			{
-			result.append(json_escape_byte(ch));
+			// If we already found a bad UTF8 character (see check_ok_utf8) just insert the bytes
 			// as escaped characters into the escaped result and move on.
 			escaped_result.append(json_escape_byte(ch));
 			++idx;
 			continue;
 			}
 		// If we haven't found a bad UTF-8 character yet, check to see if the next one starts a
 		// UTF-8 character. If not, we'll mark that we're on a bad result. Otherwise we'll go
 		// ahead and insert this character and continue.
 		if ( ! found_bad )
 			{
 			// Find out how long the next character should be.
 			unsigned int char_size = getNumBytesForUTF8(ch);
-		// If it says that it's a single character or it's not an valid string UTF8 sequence, insert
+			// If we don't have enough data for this character or it's an invalid sequence,
-		// the one escaped byte into the string, step forward one, and go to the next character.
+			// insert the one escaped byte into the string and go to the next character.
-		if ( char_size == 0 || idx+char_size > val_size || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 )
+			if ( idx+char_size > val_size ||
 			     ! check_ok_utf8(val_data + idx, val_data + idx + char_size) )
 				{
-			result.append(json_escape_byte(ch));
+				found_bad = true;
 				escaped_result.append(json_escape_byte(ch));
 				++idx;
 				continue;
 				}
-
+			else
-		result.append(val, idx, char_size);
+				{
 				for ( int i = 0; i < char_size; i++ )
 					escaped_result.append(json_escape_byte(val[idx+i]));
 				utf_result.append(val, idx, char_size);
 				idx += char_size;
 				}
 			}
 		}
-	// Insert any of the remaining bytes into the string as escaped bytes
+	if ( found_bad )
-	for ( ; idx < val_size; ++idx )
+		return escaped_result;
-		result.append(json_escape_byte(val[idx]));
+	else
-
+		return utf_result;
 	return result;
 	}
 } // namespace zeek::util
--- a/testing/btest/Baseline/bifs.print_raw/out
+++ b/testing/btest/Baseline/bifs.print_raw/out
@ -1,4 +1,4 @@
 ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
-"\\x07Էo"
+"\\x07\\xd4\\xb7o"
-start "\\x07Էo"137T[9, 10] finish
+start "\\x07\\xd4\\xb7o"137T[9, 10] finish
 é