mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
GHI-486: Switch over to using LLVM utf8-checking code to better validate characters
This commit is contained in:
parent
9698d8d7cc
commit
ad19f1e1bb
3 changed files with 40 additions and 42 deletions
53
src/util.cc
53
src/util.cc
|
@ -51,6 +51,7 @@
|
||||||
#include "Net.h"
|
#include "Net.h"
|
||||||
#include "Reporter.h"
|
#include "Reporter.h"
|
||||||
#include "iosource/Manager.h"
|
#include "iosource/Manager.h"
|
||||||
|
#include "ConvertUTF.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return IP address without enclosing brackets and any leading 0x. Also
|
* Return IP address without enclosing brackets and any leading 0x. Also
|
||||||
|
@ -1889,8 +1890,9 @@ string json_escape_utf8(const string& val)
|
||||||
{
|
{
|
||||||
string result;
|
string result;
|
||||||
result.reserve(val.length());
|
result.reserve(val.length());
|
||||||
|
|
||||||
|
auto val_data = reinterpret_cast<const unsigned char*>(val.c_str());
|
||||||
|
|
||||||
size_t char_start = 0;
|
|
||||||
size_t idx;
|
size_t idx;
|
||||||
for ( idx = 0; idx < val.length(); )
|
for ( idx = 0; idx < val.length(); )
|
||||||
{
|
{
|
||||||
|
@ -1910,50 +1912,23 @@ string json_escape_utf8(const string& val)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The next bit is based on the table at https://en.wikipedia.org/wiki/UTF-8#Description.
|
// Find out how long the next character should be.
|
||||||
// If next character is 11110xxx, this is a 4-byte UTF-8
|
unsigned int char_size = getNumBytesForUTF8(val[idx]);
|
||||||
unsigned int char_size = 0;
|
|
||||||
if ( (val[idx] & 0xF8) == 0xF0 ) char_size = 4;
|
// If the reported size is zero or the bytes do not form a legal UTF-8 sequence, insert the one
|
||||||
|
// escaped byte into the string, step forward one, and go to the next character.
|
||||||
// If next character is 1110xxxx, this is a 3-byte UTF-8
|
if ( char_size == 0 || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 )
|
||||||
else if ( (val[idx] & 0xF0) == 0xE0 ) char_size = 3;
|
|
||||||
|
|
||||||
// If next character is 110xxxxx, this is a 2-byte UTF-8
|
|
||||||
else if ( (val[idx] & 0xE0) == 0xC0 ) char_size = 2;
|
|
||||||
|
|
||||||
// This byte isn't a continuation byte, insert it as a byte and continue.
|
|
||||||
if ( char_size == 0)
|
|
||||||
{
|
{
|
||||||
result.append(json_escape_byte(val[idx]));
|
result.append(json_escape_byte(val[idx]));
|
||||||
++idx;
|
++idx;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we don't have enough bytes to get to the end of character, give up and insert all of the rest
|
for ( size_t step = 0; step < char_size; step++, idx++ )
|
||||||
// of them as escaped values.
|
result.push_back(val[idx]);
|
||||||
if ( char_size > (val.length() - idx) )
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Loop through the rest of the supposed character and see if this is a valid character.
|
|
||||||
size_t c_idx = idx + 1;
|
|
||||||
for ( ; c_idx < idx + char_size; c_idx++ )
|
|
||||||
if ( (val[c_idx] & 0xC0) != 0x80 ) break;
|
|
||||||
|
|
||||||
// if we didn't make it to the end of the character without finding an error, insert just this
|
|
||||||
// character and skip ahead. Otherwise insert all of the bytes for this character into the result.
|
|
||||||
if ( c_idx != idx + char_size )
|
|
||||||
{
|
|
||||||
result.append(json_escape_byte(val[idx]));
|
|
||||||
++idx;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for ( size_t step = 0; step < char_size; step++, idx++ )
|
|
||||||
result.push_back(val[idx]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Insert any of the remaining bytes into the string as escaped bytes
|
||||||
if ( idx != val.length() )
|
if ( idx != val.length() )
|
||||||
for ( ; idx < val.length(); ++idx )
|
for ( ; idx < val.length(); ++idx )
|
||||||
result.append(json_escape_byte(val[idx]));
|
result.append(json_escape_byte(val[idx]));
|
||||||
|
|
|
@ -2,11 +2,21 @@
|
||||||
{"s":"\b\f\n\r\t\\x00\\x15"}
|
{"s":"\b\f\n\r\t\\x00\\x15"}
|
||||||
{"s":"ñ"}
|
{"s":"ñ"}
|
||||||
{"s":"\\xc3("}
|
{"s":"\\xc3("}
|
||||||
|
{"s":"\\xc0\\x81"}
|
||||||
|
{"s":"\\xc1\\x81"}
|
||||||
|
{"s":"\\xc2\\xcf"}
|
||||||
{"s":"\\xa0\\xa1"}
|
{"s":"\\xa0\\xa1"}
|
||||||
{"s":"₡"}
|
{"s":"₡"}
|
||||||
|
{"s":"࣡"}
|
||||||
|
{"s":"\\xe0\\x80\\xa1"}
|
||||||
{"s":"\\xe2(\\xa1"}
|
{"s":"\\xe2(\\xa1"}
|
||||||
|
{"s":"\\xed\\xa0\\xa1"}
|
||||||
{"s":"\\xe2\\x82("}
|
{"s":"\\xe2\\x82("}
|
||||||
{"s":"𐌼"}
|
{"s":"𐌼"}
|
||||||
{"s":"\\xf0(\\x8c\\xbc"}
|
{"s":""}
|
||||||
|
{"s":""}
|
||||||
|
{"s":"\\xf0\\x80\\x8c\\xbc"}
|
||||||
|
{"s":"\\xf2(\\x8c\\xbc"}
|
||||||
|
{"s":"\\xf4\\x90\\x8c\\xbc"}
|
||||||
{"s":"\\xf0\\x90(\\xbc"}
|
{"s":"\\xf0\\x90(\\xbc"}
|
||||||
{"s":"\\xf0(\\x8c("}
|
{"s":"\\xf0(\\x8c("}
|
||||||
|
|
|
@ -27,29 +27,42 @@ event zeek_init()
|
||||||
Log::write(SSH::LOG, [$s="a"]);
|
Log::write(SSH::LOG, [$s="a"]);
|
||||||
Log::write(SSH::LOG, [$s="\b\f\n\r\t\x00\x15"]);
|
Log::write(SSH::LOG, [$s="\b\f\n\r\t\x00\x15"]);
|
||||||
|
|
||||||
|
# Table 3-7 in https://www.unicode.org/versions/Unicode12.0.0/ch03.pdf describes what is
|
||||||
|
# valid and invalid for the tests below
|
||||||
|
|
||||||
# Valid 2 Octet Sequence
|
# Valid 2 Octet Sequence
|
||||||
Log::write(SSH::LOG, [$s="\xc3\xb1"]);
|
Log::write(SSH::LOG, [$s="\xc3\xb1"]);
|
||||||
|
|
||||||
# Invalid 2 Octet Sequence
|
# Invalid 2 Octet Sequence
|
||||||
Log::write(SSH::LOG, [$s="\xc3\x28"]);
|
Log::write(SSH::LOG, [$s="\xc3\x28"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xc0\x81"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xc1\x81"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xc2\xcf"]);
|
||||||
|
|
||||||
# Invalid Sequence Identifier
|
# Invalid Sequence Identifier
|
||||||
Log::write(SSH::LOG, [$s="\xa0\xa1"]);
|
Log::write(SSH::LOG, [$s="\xa0\xa1"]);
|
||||||
|
|
||||||
# Valid 3 Octet Sequence
|
# Valid 3 Octet Sequence
|
||||||
Log::write(SSH::LOG, [$s="\xe2\x82\xa1"]);
|
Log::write(SSH::LOG, [$s="\xe2\x82\xa1"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xe0\xa3\xa1"]);
|
||||||
|
|
||||||
# Invalid 3 Octet Sequence (in 2nd Octet)
|
# Invalid 3 Octet Sequence (in 2nd Octet)
|
||||||
|
Log::write(SSH::LOG, [$s="\xe0\x80\xa1"]);
|
||||||
Log::write(SSH::LOG, [$s="\xe2\x28\xa1"]);
|
Log::write(SSH::LOG, [$s="\xe2\x28\xa1"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xed\xa0\xa1"]);
|
||||||
|
|
||||||
# Invalid 3 Octet Sequence (in 3rd Octet)
|
# Invalid 3 Octet Sequence (in 3rd Octet)
|
||||||
Log::write(SSH::LOG, [$s="\xe2\x82\x28"]);
|
Log::write(SSH::LOG, [$s="\xe2\x82\x28"]);
|
||||||
|
|
||||||
# Valid 4 Octet Sequence
|
# Valid 4 Octet Sequence
|
||||||
Log::write(SSH::LOG, [$s="\xf0\x90\x8c\xbc"]);
|
Log::write(SSH::LOG, [$s="\xf0\x90\x8c\xbc"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xf1\x80\x8c\xbc"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xf4\x80\x8c\xbc"]);
|
||||||
|
|
||||||
# Invalid 4 Octet Sequence (in 2nd Octet)
|
# Invalid 4 Octet Sequence (in 2nd Octet)
|
||||||
Log::write(SSH::LOG, [$s="\xf0\x28\x8c\xbc"]);
|
Log::write(SSH::LOG, [$s="\xf0\x80\x8c\xbc"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xf2\x28\x8c\xbc"]);
|
||||||
|
Log::write(SSH::LOG, [$s="\xf4\x90\x8c\xbc"]);
|
||||||
|
|
||||||
# Invalid 4 Octet Sequence (in 3rd Octet)
|
# Invalid 4 Octet Sequence (in 3rd Octet)
|
||||||
Log::write(SSH::LOG, [$s="\xf0\x90\x28\xbc"]);
|
Log::write(SSH::LOG, [$s="\xf0\x90\x28\xbc"]);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue