From ad19f1e1bbfefde19c56ef4f6998e9aa7f818b45 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Wed, 24 Jul 2019 10:38:45 -0700 Subject: [PATCH] GHI-486: Switch over to using LLVM utf8-checking code to better validate characters --- src/util.cc | 53 +++++-------------- .../ssh.log | 12 ++++- .../frameworks/logging/ascii-json-utf8.zeek | 17 +++++- 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/src/util.cc b/src/util.cc index f111c3f2b3..0ad45f72a5 100644 --- a/src/util.cc +++ b/src/util.cc @@ -51,6 +51,7 @@ #include "Net.h" #include "Reporter.h" #include "iosource/Manager.h" +#include "ConvertUTF.h" /** * Return IP address without enclosing brackets and any leading 0x. Also @@ -1889,8 +1890,9 @@ string json_escape_utf8(const string& val) { string result; result.reserve(val.length()); + + auto val_data = reinterpret_cast(val.c_str()); - size_t char_start = 0; size_t idx; for ( idx = 0; idx < val.length(); ) { @@ -1910,50 +1912,23 @@ string json_escape_utf8(const string& val) continue; } - // The next bit is based on the table at https://en.wikipedia.org/wiki/UTF-8#Description. - // If next character is 11110xxx, this is a 4-byte UTF-8 - unsigned int char_size = 0; - if ( (val[idx] & 0xF8) == 0xF0 ) char_size = 4; - - // If next character is 1110xxxx, this is a 3-byte UTF-8 - else if ( (val[idx] & 0xF0) == 0xE0 ) char_size = 3; - - // If next character is 110xxxxx, this is a 2-byte UTF-8 - else if ( (val[idx] & 0xE0) == 0xC0 ) char_size = 2; - - // This byte isn't a continuation byte, insert it as a byte and continue. - if ( char_size == 0) + // Find out how long the next character should be. + unsigned int char_size = getNumBytesForUTF8(val[idx]); + + // If it says that it's a single character or it's not an invalid string UTF8 sequence, insert the one + // escaped byte into the string, step forward one, and go to the next character. + if ( char_size == 0 || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 ) { result.append(json_escape_byte(val[idx])); ++idx; continue; } - - // If we don't have enough bytes to get to the end of character, give up and insert all of the rest - // of them as escaped values. - if ( char_size > (val.length() - idx) ) - break; - - // Loop through the rest of the supposed character and see if this is a valid character. - size_t c_idx = idx + 1; - for ( ; c_idx < idx + char_size; c_idx++ ) - if ( (val[c_idx] & 0xC0) != 0x80 ) break; - - // if we didn't make it to the end of the character without finding an error, insert just this - // character and skip ahead. Otherwise insert all of the bytes for this character into the result. - if ( c_idx != idx + char_size ) - { - result.append(json_escape_byte(val[idx])); - ++idx; - continue; - } - else - { - for ( size_t step = 0; step < char_size; step++, idx++ ) - result.push_back(val[idx]); - } + + for ( size_t step = 0; step < char_size; step++, idx++ ) + result.push_back(val[idx]); } - + + // Insert any of the remaining bytes into the string as escaped bytes if ( idx != val.length() ) for ( ; idx < val.length(); ++idx ) result.append(json_escape_byte(val[idx])); diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log index b1a395906f..5a23d20794 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log @@ -2,11 +2,21 @@ {"s":"\b\f\n\r\t\\x00\\x15"} {"s":"ñ"} {"s":"\\xc3("} +{"s":"\\xc0\\x81"} +{"s":"\\xc1\\x81"} +{"s":"\\xc2\\xcf"} {"s":"\\xa0\\xa1"} {"s":"₡"} +{"s":"࣡"} +{"s":"\\xe0\\x80\\xa1"} {"s":"\\xe2(\\xa1"} +{"s":"\\xed\\xa0\\xa1"} {"s":"\\xe2\\x82("} {"s":"𐌼"} -{"s":"\\xf0(\\x8c\\xbc"} +{"s":"񀌼"} +{"s":"􀌼"} +{"s":"\\xf0\\x80\\x8c\\xbc"} +{"s":"\\xf2(\\x8c\\xbc"} +{"s":"\\xf4\\x90\\x8c\\xbc"} {"s":"\\xf0\\x90(\\xbc"} {"s":"\\xf0(\\x8c("} diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek index dabe86073d..c5a6689094 100644 --- a/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek +++ b/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek @@ -27,29 +27,42 @@ event zeek_init() Log::write(SSH::LOG, [$s="a"]); Log::write(SSH::LOG, [$s="\b\f\n\r\t\x00\x15"]); + # Table 3-7 in https://www.unicode.org/versions/Unicode12.0.0/ch03.pdf describes what is + # valid and invalid for the tests below + # Valid 2 Octet Sequence Log::write(SSH::LOG, [$s="\xc3\xb1"]); - + # Invalid 2 Octet Sequence Log::write(SSH::LOG, [$s="\xc3\x28"]); + Log::write(SSH::LOG, [$s="\xc0\x81"]); + Log::write(SSH::LOG, [$s="\xc1\x81"]); + Log::write(SSH::LOG, [$s="\xc2\xcf"]); # Invalid Sequence Identifier Log::write(SSH::LOG, [$s="\xa0\xa1"]); # Valid 3 Octet Sequence Log::write(SSH::LOG, [$s="\xe2\x82\xa1"]); + Log::write(SSH::LOG, [$s="\xe0\xa3\xa1"]); # Invalid 3 Octet Sequence (in 2nd Octet) + Log::write(SSH::LOG, [$s="\xe0\x80\xa1"]); Log::write(SSH::LOG, [$s="\xe2\x28\xa1"]); + Log::write(SSH::LOG, [$s="\xed\xa0\xa1"]); # Invalid 3 Octet Sequence (in 3rd Octet) Log::write(SSH::LOG, [$s="\xe2\x82\x28"]); # Valid 4 Octet Sequence Log::write(SSH::LOG, [$s="\xf0\x90\x8c\xbc"]); + Log::write(SSH::LOG, [$s="\xf1\x80\x8c\xbc"]); + Log::write(SSH::LOG, [$s="\xf4\x80\x8c\xbc"]); # Invalid 4 Octet Sequence (in 2nd Octet) - Log::write(SSH::LOG, [$s="\xf0\x28\x8c\xbc"]); + Log::write(SSH::LOG, [$s="\xf0\x80\x8c\xbc"]); + Log::write(SSH::LOG, [$s="\xf2\x28\x8c\xbc"]); + Log::write(SSH::LOG, [$s="\xf4\x90\x8c\xbc"]); # Invalid 4 Octet Sequence (in 3rd Octet) Log::write(SSH::LOG, [$s="\xf0\x90\x28\xbc"]);