Merge remote-tracking branch 'origin/topic/timw/1581-utf8-escaping'

* origin/topic/timw/1581-utf8-escaping:
  Return fully-escaped string if utf8 conversion fails
This commit is contained in:
Tim Wojtulewicz 2021-08-19 09:39:23 -07:00
commit fe22d7cbb4
4 changed files with 93 additions and 30 deletions

View file

@ -1,3 +1,11 @@
4.2.0-dev.78 | 2021-08-19 09:39:23 -0700
* Return fully-escaped string if utf8 conversion fails (Tim Wojtulewicz, Corelight)
This adds a new function for validating UTF-8 sequences by converting to
UTF-32. This allows us to also check for various blocks of codepoints
that we consider invalid while checking for valid sequences in general.
4.2.0-dev.76 | 2021-08-18 08:40:41 -0700 4.2.0-dev.76 | 2021-08-18 08:40:41 -0700
* Fix option length computation in Geneve analyzer. (Benjamin Bannier, Corelight) * Fix option length computation in Geneve analyzer. (Benjamin Bannier, Corelight)

View file

@ -1 +1 @@
4.2.0-dev.76 4.2.0-dev.78

View file

@ -2352,7 +2352,7 @@ TEST_CASE("util json_escape_utf8")
CHECK(json_escape_utf8("string") == "string"); CHECK(json_escape_utf8("string") == "string");
CHECK(json_escape_utf8("string\n") == "string\n"); CHECK(json_escape_utf8("string\n") == "string\n");
CHECK(json_escape_utf8("string\x82") == "string\\x82"); CHECK(json_escape_utf8("string\x82") == "string\\x82");
CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07Էo"); CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07\\xd4\\xb7o");
// These strings are duplicated from the scripts.base.frameworks.logging.ascii-json-utf8 btest // These strings are duplicated from the scripts.base.frameworks.logging.ascii-json-utf8 btest
@ -2406,6 +2406,38 @@ TEST_CASE("util json_escape_utf8")
// Invalid 4 Octet Sequence (too short) // Invalid 4 Octet Sequence (too short)
CHECK(json_escape_utf8("\xf4\x80\x8c") == "\\xf4\\x80\\x8c"); CHECK(json_escape_utf8("\xf4\x80\x8c") == "\\xf4\\x80\\x8c");
CHECK(json_escape_utf8("\xf0") == "\\xf0"); CHECK(json_escape_utf8("\xf0") == "\\xf0");
// Private Use Area (E000-F8FF) are always invalid
CHECK(json_escape_utf8("\xee\x8b\xa0") == "\\xee\\x8b\\xa0");
// Valid UTF-8 character followed by an invalid one
CHECK(json_escape_utf8("\xc3\xb1\xc0\x81") == "\\xc3\\xb1\\xc0\\x81");
}
// Decides whether the byte range [start, end) is acceptable for emitting as-is.
//
// The bytes are run through ConvertUTF8toUTF32 with strictConversion. The function
// returns false if that conversion does not report conversionOK, or if the first
// resulting code point lies in one of the rejected ranges below (control characters,
// the Private Use Area, or U+FFF0..U+FFFF). Otherwise it returns true.
//
// start: pointer to the first byte of the candidate sequence.
// end:   pointer one past the last byte of the candidate sequence.
static bool check_ok_utf8(const unsigned char* start, const unsigned char* end)
{
// There are certain blocks of UTF-8 that we don't want, but the easiest way to find
// them is to convert to UTF-32 and then compare. This is annoying, but it also calls
// isLegalUTF8Sequence along the way so go with it.
std::array<UTF32, 2> output;
UTF32* output2 = output.data();
// &start is the address of this function's own copy of the pointer, so the caller's
// pointer is not modified. The target limit passed is output2+1, i.e. one element.
// NOTE(review): presumably this means at most one code point is written, and callers
// are expected to pass exactly one character's worth of bytes -- confirm against
// ConvertUTF8toUTF32.
auto result = ConvertUTF8toUTF32(&start, end, &output2, output2+1, strictConversion);
if ( result != conversionOK )
return false;
// Only output[0] is inspected from here on.
if ( ( output[0] >= 0x0000 && output[0] <= 0x001F ) ||
( output[0] == 0x007F ) ||
( output[0] >= 0x0080 && output[0] <= 0x009F ) )
// Control characters: U+0000..U+001F, U+007F, and U+0080..U+009F
return false;
else if ( output[0] >= 0xE000 && output[0] <= 0xF8FF )
// Private Use Area
return false;
else if ( output[0] >= 0xFFF0 && output[0] <= 0xFFFF )
// Specials block
return false;
return true;
} }
string json_escape_utf8(const string& val) string json_escape_utf8(const string& val)
@ -2414,52 +2446,75 @@ string json_escape_utf8(const string& val)
auto val_size = val.length(); auto val_size = val.length();
// Reserve at least the size of the existing string to avoid resizing the string in the best-case // Reserve at least the size of the existing string to avoid resizing the string in the best-case
// scenario where we don't have any multi-byte characters. // scenario where we don't have any multi-byte characters. We keep two versions of this string:
string result; // one that has a valid utf8 string and one that has a fully-escaped version. The utf8 string gets
result.reserve(val_size); // returned if all of the characters were valid utf8 sequences, but it will fall back to the
// escaped version otherwise. This uses slightly more memory but it avoids looping through all
// of the characters a second time in the case of a bad utf8 sequence.
string utf_result;
utf_result.reserve(val_size);
string escaped_result;
escaped_result.reserve(val_size);
size_t idx; bool found_bad = false;
for ( idx = 0; idx < val_size; ) size_t idx = 0;
while ( idx < val_size )
{ {
const char ch = val[idx]; const char ch = val[idx];
// Normal ASCII characters plus a few of the control characters can be inserted directly. The // Normal ASCII characters plus a few of the control characters can be inserted directly. The
// rest of the control characters should be escaped as regular bytes. // rest of the control characters should be escaped as regular bytes.
if ( ( ch >= 32 && ch <= 127 ) || if ( ( ch >= 32 && ch < 127 ) ||
ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' ) ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' )
{ {
result.push_back(ch); if ( ! found_bad )
utf_result.push_back(ch);
escaped_result.push_back(ch);
++idx; ++idx;
continue; continue;
} }
else if ( ch >= 0 && ch < 32 ) else if ( found_bad )
{ {
result.append(json_escape_byte(ch)); // If we already found a bad UTF8 character (see check_ok_utf8) just insert the bytes
// as escaped characters into the escaped result and move on.
escaped_result.append(json_escape_byte(ch));
++idx; ++idx;
continue; continue;
} }
// If we haven't found a bad UTF-8 character yet, check to see if the next one starts a
// UTF-8 character. If not, we'll mark that we're on a bad result. Otherwise we'll go
// ahead and insert this character and continue.
if ( ! found_bad )
{
// Find out how long the next character should be. // Find out how long the next character should be.
unsigned int char_size = getNumBytesForUTF8(ch); unsigned int char_size = getNumBytesForUTF8(ch);
// If it says that it's a single character or it's not an valid string UTF8 sequence, insert // If we don't have enough data for this character or it's an invalid sequence,
// the one escaped byte into the string, step forward one, and go to the next character. // insert the one escaped byte into the string and go to the next character.
if ( char_size == 0 || idx+char_size > val_size || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 ) if ( idx+char_size > val_size ||
! check_ok_utf8(val_data + idx, val_data + idx + char_size) )
{ {
result.append(json_escape_byte(ch)); found_bad = true;
escaped_result.append(json_escape_byte(ch));
++idx; ++idx;
continue; continue;
} }
else
result.append(val, idx, char_size); {
for ( int i = 0; i < char_size; i++ )
escaped_result.append(json_escape_byte(val[idx+i]));
utf_result.append(val, idx, char_size);
idx += char_size; idx += char_size;
} }
}
}
// Insert any of the remaining bytes into the string as escaped bytes if ( found_bad )
for ( ; idx < val_size; ++idx ) return escaped_result;
result.append(json_escape_byte(val[idx])); else
return utf_result;
return result;
} }
} // namespace zeek::util } // namespace zeek::util

View file

@ -1,4 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. ### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
"\\x07Էo" "\\x07\\xd4\\xb7o"
start "\\x07Էo"137T[9, 10] finish start "\\x07\\xd4\\xb7o"137T[9, 10] finish
é é