Added an additional check to confirm that anything left unescaped is a multibyte UTF-8 sequence, addressing the test case Jon brought up

This commit is contained in:
Dev Bali 2019-07-22 14:23:50 -07:00
parent 66557d3178
commit d6bcdfce52

View file

@ -259,6 +259,11 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end)
size_t check_utf8 (const char* bytes, size_t n, size_t i) size_t check_utf8 (const char* bytes, size_t n, size_t i)
{ {
// Check if this is in fact a multibyte UTF-8 sequence,
// which requires the most significant bit of the first byte to be 1
if (!(bytes[i] >> 7 & 1))
return 0;
// Checks two to four bytes from starting position i // Checks two to four bytes from starting position i
// and returns the length of the valid utf-8 sequence // and returns the length of the valid utf-8 sequence
size_t num_to_check = ((n-i+1) < 4) ? (n-i+1) : 4; size_t num_to_check = ((n-i+1) < 4) ? (n-i+1) : 4;
@ -283,7 +288,10 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
for ( size_t i = 0; i < n; ++i ) for ( size_t i = 0; i < n; ++i )
{ {
if (!isprint(bytes[i])) if (bytes[i] == '\\' )
return escape_pos(bytes + i, 1);
else if (!isprint(bytes[i]))
{ {
if (utf8) if (utf8)
{ {
@ -297,9 +305,6 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
return escape_pos(bytes + i, 1); return escape_pos(bytes + i, 1);
} }
else if (bytes[i] == '\\' )
return escape_pos(bytes + i, 1);
size_t len = StartsWithEscapeSequence(bytes + i, bytes + n); size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);
if ( len ) if ( len )