Merge remote-tracking branch 'origin/topic/timw/595-rapidjson'

Tweaks:
    - Small change to the logic for removing quotes around strings.
    - Updated NEWS & COPYING.3rdparty
    - Use of intrusive_ptr for stack-allocated StringVals
    - Little bit of refactoring (I would love to merge the two BuildJSON() functions, too, but that's a larger task)

* origin/topic/timw/595-rapidjson:
  Use the list of files from clang-tidy when searching for unit tests
  Optimize json_escape_utf8 a bit by removing repeated calls to string methods
  Expand unit test for json_escape_utf8 to include all of the strings from the ascii-json-utf8 btest
  GHI-595: Convert from nlohmann/json to rapidjson for performance reasons
  Convert type-checking macros to actual functions
This commit is contained in:
Robin Sommer 2020-01-17 13:05:59 +00:00
commit 8170baabef
12 changed files with 335 additions and 210 deletions

View file

@ -2190,54 +2190,112 @@ TEST_CASE("util json_escape_utf8")
CHECK(json_escape_utf8("string") == "string");
CHECK(json_escape_utf8("string\n") == "string\n");
CHECK(json_escape_utf8("string\x82") == "string\\x82");
CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07Էo");
// These strings are duplicated from the scripts.base.frameworks.logging.ascii-json-utf8 btest
// Valid ASCII and valid ASCII control characters
CHECK(json_escape_utf8("a") == "a");
CHECK(json_escape_utf8("\b\f\n\r\t\x00\x15") == "\b\f\n\r\t\x00\x15");
// Table 3-7 in https://www.unicode.org/versions/Unicode12.0.0/ch03.pdf describes what is
// valid and invalid for the tests below
// Valid 2 Octet Sequence
CHECK(json_escape_utf8("\xc3\xb1") == "\xc3\xb1");
// Invalid 2 Octet Sequence
CHECK(json_escape_utf8("\xc3\x28") == "\\xc3(");
CHECK(json_escape_utf8("\xc0\x81") == "\\xc0\\x81");
CHECK(json_escape_utf8("\xc1\x81") == "\\xc1\\x81");
CHECK(json_escape_utf8("\xc2\xcf") == "\\xc2\\xcf");
// Invalid Sequence Identifier
CHECK(json_escape_utf8("\xa0\xa1") == "\\xa0\\xa1");
// Valid 3 Octet Sequence
CHECK(json_escape_utf8("\xe2\x82\xa1") == "\xe2\x82\xa1");
CHECK(json_escape_utf8("\xe0\xa3\xa1") == "\xe0\xa3\xa1");
// Invalid 3 Octet Sequence (in 2nd Octet)
CHECK(json_escape_utf8("\xe0\x80\xa1") == "\\xe0\\x80\\xa1");
CHECK(json_escape_utf8("\xe2\x28\xa1") == "\\xe2(\\xa1");
CHECK(json_escape_utf8("\xed\xa0\xa1") == "\\xed\\xa0\\xa1");
// Invalid 3 Octet Sequence (in 3rd Octet)
CHECK(json_escape_utf8("\xe2\x82\x28") == "\\xe2\\x82(");
// Valid 4 Octet Sequence
CHECK(json_escape_utf8("\xf0\x90\x8c\xbc") == "\xf0\x90\x8c\xbc");
CHECK(json_escape_utf8("\xf1\x80\x8c\xbc") == "\xf1\x80\x8c\xbc");
CHECK(json_escape_utf8("\xf4\x80\x8c\xbc") == "\xf4\x80\x8c\xbc");
// Invalid 4 Octet Sequence (in 2nd Octet)
CHECK(json_escape_utf8("\xf0\x80\x8c\xbc") == "\\xf0\\x80\\x8c\\xbc");
CHECK(json_escape_utf8("\xf2\x28\x8c\xbc") == "\\xf2(\\x8c\\xbc");
CHECK(json_escape_utf8("\xf4\x90\x8c\xbc") == "\\xf4\\x90\\x8c\\xbc");
// Invalid 4 Octet Sequence (in 3rd Octet)
CHECK(json_escape_utf8("\xf0\x90\x28\xbc") == "\\xf0\\x90(\\xbc");
// Invalid 4 Octet Sequence (in 4th Octet)
CHECK(json_escape_utf8("\xf0\x28\x8c\x28") == "\\xf0(\\x8c(");
// Invalid 4 Octet Sequence (too short)
CHECK(json_escape_utf8("\xf4\x80\x8c") == "\\xf4\\x80\\x8c");
CHECK(json_escape_utf8("\xf0") == "\\xf0");
}
string json_escape_utf8(const string& val)
{
string result;
result.reserve(val.length());
auto val_data = reinterpret_cast<const unsigned char*>(val.c_str());
auto val_size = val.length();
// Reserve at least the size of the existing string to avoid resizing the string in the best-case
// scenario where we don't have any multi-byte characters.
string result;
result.reserve(val_size);
size_t idx;
for ( idx = 0; idx < val.length(); )
for ( idx = 0; idx < val_size; )
{
// Normal ASCII characters plus a few of the control characters can be inserted directly. The rest of
// the control characters should be escaped as regular bytes.
if ( ( val[idx] >= 32 && val[idx] <= 127 ) ||
val[idx] == '\b' || val[idx] == '\f' || val[idx] == '\n' || val[idx] == '\r' || val[idx] == '\t' )
const char ch = val[idx];
// Normal ASCII characters plus a few of the control characters can be inserted directly. The
// rest of the control characters should be escaped as regular bytes.
if ( ( ch >= 32 && ch <= 127 ) ||
ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' )
{
result.push_back(val[idx]);
result.push_back(ch);
++idx;
continue;
}
else if ( val[idx] >= 0 && val[idx] < 32 )
else if ( ch >= 0 && ch < 32 )
{
result.append(json_escape_byte(val[idx]));
result.append(json_escape_byte(ch));
++idx;
continue;
}
// Find out how long the next character should be.
unsigned int char_size = getNumBytesForUTF8(val[idx]);
unsigned int char_size = getNumBytesForUTF8(ch);
// If it says that it's a single character or it's not an invalid string UTF8 sequence, insert the one
// escaped byte into the string, step forward one, and go to the next character.
if ( char_size == 0 || idx+char_size > val.length() || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 )
// If it says that it's a single character or it's not an valid string UTF8 sequence, insert
// the one escaped byte into the string, step forward one, and go to the next character.
if ( char_size == 0 || idx+char_size > val_size || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 )
{
result.append(json_escape_byte(val[idx]));
result.append(json_escape_byte(ch));
++idx;
continue;
}
for ( size_t step = 0; step < char_size; step++, idx++ )
result.push_back(val[idx]);
result.append(val, idx, char_size);
idx += char_size;
}
// Insert any of the remaining bytes into the string as escaped bytes
if ( idx != val.length() )
for ( ; idx < val.length(); ++idx )
result.append(json_escape_byte(val[idx]));
for ( ; idx < val_size; ++idx )
result.append(json_escape_byte(val[idx]));
return result;
}