Merge remote-tracking branch 'origin/topic/timw/595-rapidjson'

Tweaks:
    - Small change to the logic for removing quotes around strings.
    - Updated NEWS & COPYING.3rdparty
    - Use of intrusive_ptr for stack-allocated StringVals
    - Little bit of refactoring (I would love to merge the two BuildJSON() functions, too, but that's a larger task)

* origin/topic/timw/595-rapidjson:
  Use the list of files from clang-tidy when searching for unit tests
  Optimize json_escape_utf8 a bit by removing repeated calls to string methods
  Expand unit test for json_escape_utf8 to include all of the strings from the ascii-json-utf8 btest
  GHI-595: Convert from nlohmann/json to rapidjson for performance reasons
  Convert type-checking macros to actual functions
This commit is contained in:
Robin Sommer 2020-01-17 13:05:59 +00:00
commit 8170baabef
12 changed files with 335 additions and 210 deletions

@ -1 +1 @@
Subproject commit 2b3206b7add3472ea0736f2841473e11d506a85e
Subproject commit fae32236391d9117bf996e75d56ebd01ef076bc2

View file

@ -413,15 +413,14 @@ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/
)
install(FILES
${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/json.hpp
${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/sqlite3.h
DESTINATION include/zeek/3rdparty
)
install(FILES
${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tsl-ordered-map/ordered_map.h
${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tsl-ordered-map/ordered_hash.h
DESTINATION include/zeek/3rdparty/tsl-ordered-map
${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/rapidjson/include/rapidjson/document.h
${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/rapidjson/include/rapidjson/writer.h
DESTINATION include/zeek/3rdparty/rapidjson/include/rapidjson
)
########################################################################
@ -439,9 +438,8 @@ create_clang_tidy_target()
# Scan all .cc files for TEST_CASE macros and generate CTest targets.
if (ENABLE_ZEEK_UNIT_TESTS)
file(GLOB_RECURSE all_cc_files "*.cc")
set(test_cases "")
foreach (cc_file ${all_cc_files})
foreach (cc_file ${TIDY_SRCS})
file (STRINGS ${cc_file} test_case_lines REGEX "TEST_CASE")
foreach (line ${test_case_lines})
string(REGEX REPLACE "TEST_CASE\\(\"(.+)\"\\)" "\\1" test_case "${line}")

View file

@ -698,53 +698,53 @@ extern BroType* init_type(Expr* init);
// Returns true if argument is an atomic type.
bool is_atomic_type(const BroType* t);
// True if the given type tag corresponds to an integral type.
#define IsIntegral(t) (t == TYPE_INT || t == TYPE_COUNT || t == TYPE_COUNTER)
// True if the given type tag corresponds to an arithmetic type.
#define IsArithmetic(t) (IsIntegral(t) || t == TYPE_DOUBLE)
// True if the given type tag corresponds to a boolean type.
#define IsBool(t) (t == TYPE_BOOL)
// True if the given type tag corresponds to an interval type.
#define IsInterval(t) (t == TYPE_INTERVAL)
// True if the given type tag corresponds to a record type.
#define IsRecord(t) (t == TYPE_RECORD || t == TYPE_UNION)
// True if the given type tag corresponds to a function type.
#define IsFunc(t) (t == TYPE_FUNC)
// True if the given type type is a vector.
#define IsVector(t) (t == TYPE_VECTOR)
// True if the given type type is a string.
#define IsString(t) (t == TYPE_STRING)
// True if the given type tag corresponds to type that can be assigned to.
extern int is_assignable(BroType* t);
// True if the given type tag corresponds to an integral type.
inline bool IsIntegral(TypeTag t) { return (t == TYPE_INT || t == TYPE_COUNT || t == TYPE_COUNTER); }
// True if the given type tag corresponds to an arithmetic type.
inline bool IsArithmetic(TypeTag t) { return (IsIntegral(t) || t == TYPE_DOUBLE); }
// True if the given type tag corresponds to a boolean type.
inline bool IsBool(TypeTag t) { return (t == TYPE_BOOL); }
// True if the given type tag corresponds to an interval type.
inline bool IsInterval(TypeTag t) { return (t == TYPE_INTERVAL); }
// True if the given type tag corresponds to a record type.
inline bool IsRecord(TypeTag t) { return (t == TYPE_RECORD || t == TYPE_UNION); }
// True if the given type tag corresponds to a function type.
inline bool IsFunc(TypeTag t) { return (t == TYPE_FUNC); }
// True if the given type type is a vector.
inline bool IsVector(TypeTag t) { return (t == TYPE_VECTOR); }
// True if the given type type is a string.
inline bool IsString(TypeTag t) { return (t == TYPE_STRING); }
// True if the given type tag corresponds to the error type.
#define IsErrorType(t) (t == TYPE_ERROR)
inline bool IsErrorType(TypeTag t) { return (t == TYPE_ERROR); }
// True if both tags are integral types.
#define BothIntegral(t1, t2) (IsIntegral(t1) && IsIntegral(t2))
inline bool BothIntegral(TypeTag t1, TypeTag t2) { return (IsIntegral(t1) && IsIntegral(t2)); }
// True if both tags are arithmetic types.
#define BothArithmetic(t1, t2) (IsArithmetic(t1) && IsArithmetic(t2))
inline bool BothArithmetic(TypeTag t1, TypeTag t2) { return (IsArithmetic(t1) && IsArithmetic(t2)); }
// True if either tags is an arithmetic type.
#define EitherArithmetic(t1, t2) (IsArithmetic(t1) || IsArithmetic(t2))
inline bool EitherArithmetic(TypeTag t1, TypeTag t2) { return (IsArithmetic(t1) || IsArithmetic(t2)); }
// True if both tags are boolean types.
#define BothBool(t1, t2) (IsBool(t1) && IsBool(t2))
inline bool BothBool(TypeTag t1, TypeTag t2) { return (IsBool(t1) && IsBool(t2)); }
// True if both tags are interval types.
#define BothInterval(t1, t2) (IsInterval(t1) && IsInterval(t2))
inline bool BothInterval(TypeTag t1, TypeTag t2) { return (IsInterval(t1) && IsInterval(t2)); }
// True if both tags are string types.
#define BothString(t1, t2) (IsString(t1) && IsString(t2))
inline bool BothString(TypeTag t1, TypeTag t2) { return (IsString(t1) && IsString(t2)); }
// True if either tag is the error type.
#define EitherError(t1, t2) (IsErrorType(t1) || IsErrorType(t2))
inline bool EitherError(TypeTag t1, TypeTag t2) { return (IsErrorType(t1) || IsErrorType(t2)); }

View file

@ -27,20 +27,7 @@
#include "broker/Data.h"
#include "3rdparty/json.hpp"
#include "3rdparty/tsl-ordered-map/ordered_map.h"
// Define a class for use with the json library that orders the keys in the same order that
// they were inserted. By default, the json library orders them alphabetically and we don't
// want it like that.
template<class Key, class T, class Ignore, class Allocator,
class Hash = std::hash<Key>, class KeyEqual = std::equal_to<Key>,
class AllocatorPair = typename std::allocator_traits<Allocator>::template rebind_alloc<std::pair<Key, T>>,
class ValueTypeContainer = std::vector<std::pair<Key, T>, AllocatorPair>>
using ordered_map = tsl::ordered_map<Key, T, Hash, KeyEqual, AllocatorPair, ValueTypeContainer>;
using ZeekJson = nlohmann::basic_json<ordered_map>;
#include "threading/formatters/JSON.h"
Val::Val(Func* f)
{
@ -433,46 +420,56 @@ TableVal* Val::GetRecordFields()
return rt->GetRecordFieldsVal(rv);
}
// This is a static method in this file to avoid including json.hpp in Val.h since it's huge.
static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nullptr)
// This is a file-local static function rather than a member of Val so that Val.h does not need to include rapidjson's headers, which are huge.
static void BuildJSON(threading::formatter::JSON::NullDoubleWriter& writer, Val* val, bool only_loggable=false, RE_Matcher* re=nullptr, const string& key="")
{
// If the value wasn't set, return a nullptr. This will get turned into a 'null' in the json output.
if ( ! val )
return nullptr;
if ( !key.empty() )
writer.Key(key);
ZeekJson j;
// If the value wasn't set, write a null into the stream and return.
if ( ! val )
{
writer.Null();
return;
}
rapidjson::Value j;
BroType* type = val->Type();
switch ( type->Tag() )
{
case TYPE_BOOL:
j = val->AsBool();
writer.Bool(val->AsBool());
break;
case TYPE_INT:
j = val->AsInt();
writer.Int64(val->AsInt());
break;
case TYPE_COUNT:
j = val->AsCount();
writer.Uint64(val->AsCount());
break;
case TYPE_COUNTER:
j = val->AsCounter();
writer.Uint64(val->AsCounter());
break;
case TYPE_TIME:
j = val->AsTime();
writer.Double(val->AsTime());
break;
case TYPE_DOUBLE:
j = val->AsDouble();
writer.Double(val->AsDouble());
break;
case TYPE_PORT:
{
auto* pval = val->AsPortVal();
j.emplace("port", pval->Port());
j.emplace("proto", pval->Protocol());
writer.StartObject();
writer.Key("port");
writer.Int64(pval->Port());
writer.Key("proto");
writer.String(pval->Protocol());
writer.EndObject();
break;
}
@ -484,7 +481,7 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul
ODesc d;
d.SetStyle(RAW_STYLE);
val->Describe(&d);
j = string(reinterpret_cast<const char*>(d.Bytes()), d.Len());
writer.String(reinterpret_cast<const char*>(d.Bytes()), d.Len());
break;
}
@ -496,7 +493,7 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul
ODesc d;
d.SetStyle(RAW_STYLE);
val->Describe(&d);
j = json_escape_utf8(string(reinterpret_cast<const char*>(d.Bytes()), d.Len()));
writer.String(json_escape_utf8(string(reinterpret_cast<const char*>(d.Bytes()), d.Len())));
break;
}
@ -506,9 +503,9 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul
auto* tval = val->AsTableVal();
if ( tval->Type()->IsSet() )
j = ZeekJson::array();
writer.StartArray();
else
j = ZeekJson::object();
writer.StartObject();
HashKey* k;
TableEntryVal* entry;
@ -524,102 +521,125 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul
else
entry_key = lv->Ref();
ZeekJson key_json = BuildJSON(entry_key, only_loggable, re);
if ( tval->Type()->IsSet() )
j.emplace_back(std::move(key_json));
BuildJSON(writer, entry_key, only_loggable, re);
else
{
Val* entry_value = entry->Value();
rapidjson::StringBuffer buffer;
threading::formatter::JSON::NullDoubleWriter key_writer(buffer);
BuildJSON(key_writer, entry_key, only_loggable, re);
string key_str = buffer.GetString();
string key_string;
if ( key_json.is_string() )
key_string = key_json;
else
key_string = key_json.dump();
if ( key_str.length() >= 2 &&
key_str[0] == '"' &&
key_str[key_str.length() - 1] == '"' )
// Strip quotes.
key_str = key_str.substr(1, key_str.length() - 2);
j.emplace(key_string, BuildJSON(entry_value, only_loggable, re));
BuildJSON(writer, entry->Value(), only_loggable, re, key_str);
}
Unref(entry_key);
Unref(lv);
}
if ( tval->Type()->IsSet() )
writer.EndArray();
else
writer.EndObject();
break;
}
case TYPE_RECORD:
{
j = ZeekJson::object();
writer.StartObject();
auto* rval = val->AsRecordVal();
auto rt = rval->Type()->AsRecordType();
for ( auto i = 0; i < rt->NumFields(); ++i )
{
auto field_name = rt->FieldName(i);
std::string key_string;
if ( re && re->MatchAnywhere(field_name) != 0 )
{
StringVal blank("");
StringVal fn_val(field_name);
auto key_val = fn_val.Substitute(re, &blank, 0)->AsStringVal();
key_string = key_val->ToStdString();
Unref(key_val);
}
else
key_string = field_name;
Val* value = rval->LookupWithDefault(i);
if ( value && ( ! only_loggable || rt->FieldHasAttr(i, ATTR_LOG) ) )
j.emplace(key_string, BuildJSON(value, only_loggable, re));
{
string key_str;
auto field_name = rt->FieldName(i);
if ( re && re->MatchAnywhere(field_name) != 0 )
{
auto blank = make_intrusive<StringVal>("");
auto fn_val = make_intrusive<StringVal>(field_name);
auto key_val = fn_val->Substitute(re, blank.get(), 0)->AsStringVal();
key_str = key_val->ToStdString();
Unref(key_val);
}
else
key_str = field_name;
BuildJSON(writer, value, only_loggable, re, key_str);
}
Unref(value);
}
writer.EndObject();
break;
}
case TYPE_LIST:
{
j = ZeekJson::array();
writer.StartArray();
auto* lval = val->AsListVal();
size_t size = lval->Length();
for (size_t i = 0; i < size; i++)
j.push_back(BuildJSON(lval->Index(i), only_loggable, re));
BuildJSON(writer, lval->Index(i), only_loggable, re);
writer.EndArray();
break;
}
case TYPE_VECTOR:
{
j = ZeekJson::array();
writer.StartArray();
auto* vval = val->AsVectorVal();
size_t size = vval->SizeVal()->AsCount();
for (size_t i = 0; i < size; i++)
j.push_back(BuildJSON(vval->Lookup(i), only_loggable, re));
BuildJSON(writer, vval->Lookup(i), only_loggable, re);
writer.EndArray();
break;
}
case TYPE_OPAQUE:
{
writer.StartObject();
writer.Key("opaque_type");
auto* oval = val->AsOpaqueVal();
j = { { "opaque_type", OpaqueMgr::mgr()->TypeID(oval) } };
writer.String(OpaqueMgr::mgr()->TypeID(oval));
writer.EndObject();
break;
}
default: break;
default:
writer.Null();
break;
}
return j;
}
// Renders this value as JSON text and returns it in a StringVal allocated
// with new.
//
// only_loggable: forwarded to BuildJSON(); when set, record fields are only
//                emitted if they carry ATTR_LOG.
// re:            forwarded to BuildJSON(); when non-null, the parts of record
//                field names that match it are replaced with an empty string.
//
// NOTE(review): the result is a raw pointer from new; presumably the caller
// takes ownership - confirm against call sites.
StringVal* Val::ToJSON(bool only_loggable, RE_Matcher* re)
    {
    rapidjson::StringBuffer buffer;
    threading::formatter::JSON::NullDoubleWriter writer(buffer);

    // The empty key marks this as the top-level value, so no key is written
    // in front of it.
    BuildJSON(writer, this, only_loggable, re, "");

    return new StringVal(buffer.GetString());
    }
IntervalVal::IntervalVal(double quantity, double units) :

View file

@ -12,9 +12,18 @@
#include <stdint.h>
#include "JSON.h"
#include "3rdparty/rapidjson/include/rapidjson/internal/ieee754.h"
using namespace threading::formatter;
bool JSON::NullDoubleWriter::Double(double d)
{
if ( rapidjson::internal::Double(d).IsNanOrInf() )
return rapidjson::Writer<rapidjson::StringBuffer>::Null();
return rapidjson::Writer<rapidjson::StringBuffer>::Double(d);
}
JSON::JSON(MsgThread* t, TimeFormat tf) : Formatter(t), surrounding_braces(true)
{
timestamps = tf;
@ -27,21 +36,19 @@ JSON::~JSON()
// Renders a set of fields as one JSON object and appends the text to desc.
//
// desc:       receives the serialized object.
// num_fields: number of entries in both fields and vals.
// fields:     field descriptions; each field's name becomes the JSON key.
// vals:       the values, parallel to fields.
//
// Fields whose value is not present are left out of the object. Always
// returns true.
bool JSON::Describe(ODesc* desc, int num_fields, const Field* const * fields,
                    Value** vals) const
    {
    rapidjson::StringBuffer buffer;
    NullDoubleWriter writer(buffer);

    writer.StartObject();

    for ( int i = 0; i < num_fields; i++ )
        {
        if ( vals[i]->present )
            BuildJSON(writer, vals[i], fields[i]->name);
        }

    writer.EndObject();
    desc->Add(buffer.GetString());

    return true;
    }
@ -54,14 +61,18 @@ bool JSON::Describe(ODesc* desc, Value* val, const string& name) const
return false;
}
if ( ! val->present )
if ( ! val->present || name.empty() )
return true;
ZeekJson j = BuildJSON(val, name);
if ( j.is_null() )
return false;
rapidjson::Document doc;
rapidjson::StringBuffer buffer;
NullDoubleWriter writer(buffer);
desc->Add(j.dump());
writer.StartObject();
BuildJSON(writer, val, name);
writer.EndObject();
desc->Add(buffer.GetString());
return true;
}
@ -71,43 +82,47 @@ threading::Value* JSON::ParseValue(const string& s, const string& name, TypeTag
return nullptr;
}
ZeekJson JSON::BuildJSON(Value* val, const string& name) const
void JSON::BuildJSON(NullDoubleWriter& writer, Value* val, const string& name) const
{
// If the value wasn't set, write a 'null' into the json output and return.
if ( ! val->present )
return nullptr;
{
writer.Null();
return;
}
if ( ! name.empty() )
writer.Key(name);
ZeekJson j;
switch ( val->type )
{
case TYPE_BOOL:
j = val->val.int_val != 0;
writer.Bool(val->val.int_val != 0);
break;
case TYPE_INT:
j = val->val.int_val;
writer.Int64(val->val.int_val);
break;
case TYPE_COUNT:
case TYPE_COUNTER:
j = val->val.uint_val;
writer.Uint64(val->val.uint_val);
break;
case TYPE_PORT:
j = val->val.port_val.port;
writer.Uint64(val->val.port_val.port);
break;
case TYPE_SUBNET:
j = Formatter::Render(val->val.subnet_val);
writer.String(Formatter::Render(val->val.subnet_val));
break;
case TYPE_ADDR:
j = Formatter::Render(val->val.addr_val);
writer.String(Formatter::Render(val->val.addr_val));
break;
case TYPE_DOUBLE:
case TYPE_INTERVAL:
j = val->val.double_val;
writer.Double(val->val.double_val);
break;
case TYPE_TIME:
@ -125,7 +140,7 @@ ZeekJson JSON::BuildJSON(Value* val, const string& name) const
GetThread()->Error(GetThread()->Fmt("json formatter: failure getting time: (%lf)", val->val.double_val));
// This was a failure, doesn't really matter what gets put here
// but it should probably stand out...
j = "2000-01-01T00:00:00.000000";
writer.String("2000-01-01T00:00:00.000000");
}
else
{
@ -136,17 +151,17 @@ ZeekJson JSON::BuildJSON(Value* val, const string& name) const
frac += 1;
snprintf(buffer2, sizeof(buffer2), "%s.%06.0fZ", buffer, fabs(frac) * 1000000);
j = buffer2;
writer.String(buffer2, strlen(buffer2));
}
}
else if ( timestamps == TS_EPOCH )
j = val->val.double_val;
writer.Double(val->val.double_val);
else if ( timestamps == TS_MILLIS )
{
// ElasticSearch uses milliseconds for timestamps
j = (uint64_t) (val->val.double_val * 1000);
writer.Uint64((uint64_t) (val->val.double_val * 1000));
}
break;
@ -157,36 +172,34 @@ ZeekJson JSON::BuildJSON(Value* val, const string& name) const
case TYPE_FILE:
case TYPE_FUNC:
{
j = json_escape_utf8(string(val->val.string_val.data, val->val.string_val.length));
writer.String(json_escape_utf8(string(val->val.string_val.data, val->val.string_val.length)));
break;
}
case TYPE_TABLE:
{
j = ZeekJson::array();
writer.StartArray();
for ( int idx = 0; idx < val->val.set_val.size; idx++ )
j.push_back(BuildJSON(val->val.set_val.vals[idx]));
BuildJSON(writer, val->val.set_val.vals[idx]);
writer.EndArray();
break;
}
case TYPE_VECTOR:
{
j = ZeekJson::array();
writer.StartArray();
for ( int idx = 0; idx < val->val.vector_val.size; idx++ )
j.push_back(BuildJSON(val->val.vector_val.vals[idx]));
BuildJSON(writer, val->val.vector_val.vals[idx]);
writer.EndArray();
break;
}
default:
reporter->Warning("Unhandled type in JSON::BuildJSON");
break;
}
if ( ! name.empty() && ! j.is_null() )
return { { name, j } };
return j;
}

View file

@ -2,24 +2,14 @@
#pragma once
#include "../Formatter.h"
#include "3rdparty/json.hpp"
#include "3rdparty/tsl-ordered-map/ordered_map.h"
#define RAPIDJSON_HAS_STDSTRING 1
#include "3rdparty/rapidjson/include/rapidjson/document.h"
#include "3rdparty/rapidjson/include/rapidjson/writer.h"
#include "../Formatter.h"
namespace threading { namespace formatter {
// Define a class for use with the json library that orders the keys in the same order that
// they were inserted. By default, the json library orders them alphabetically and we don't
// want it like that.
template<class Key, class T, class Ignore, class Allocator,
class Hash = std::hash<Key>, class KeyEqual = std::equal_to<Key>,
class AllocatorPair = typename std::allocator_traits<Allocator>::template rebind_alloc<std::pair<Key, T>>,
class ValueTypeContainer = std::vector<std::pair<Key, T>, AllocatorPair>>
using ordered_map = tsl::ordered_map<Key, T, Hash, KeyEqual, AllocatorPair, ValueTypeContainer>;
using ZeekJson = nlohmann::basic_json<ordered_map>;
/**
* A thread-safe class for converting values into a JSON representation
* and vice versa.
@ -40,9 +30,14 @@ public:
threading::Value** vals) const override;
threading::Value* ParseValue(const string& s, const string& name, TypeTag type, TypeTag subtype = TYPE_ERROR) const override;
private:
// A rapidjson writer over a StringBuffer with its own Double(): NaN and
// infinite values are written as a JSON null rather than as a number.
//
// NOTE(review): Double() replaces the base class's function by name; whether
// calls made through a base-class reference reach it depends on how rapidjson
// declares Writer::Double() - confirm.
class NullDoubleWriter : public rapidjson::Writer<rapidjson::StringBuffer>
    {
public:
    // Creates a writer that appends its output to buffer.
    NullDoubleWriter(rapidjson::StringBuffer& buffer)
        : rapidjson::Writer<rapidjson::StringBuffer>(buffer)
        {
        }

    // Defined out of line in the formatter's implementation file.
    bool Double(double d);
    };
ZeekJson BuildJSON(Value* val, const string& name = "") const;
private:
void BuildJSON(NullDoubleWriter& writer, Value* val, const string& name = "") const;
TimeFormat timestamps;
bool surrounding_braces;

View file

@ -2190,54 +2190,112 @@ TEST_CASE("util json_escape_utf8")
CHECK(json_escape_utf8("string") == "string");
CHECK(json_escape_utf8("string\n") == "string\n");
CHECK(json_escape_utf8("string\x82") == "string\\x82");
CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07Էo");
// These strings are duplicated from the scripts.base.frameworks.logging.ascii-json-utf8 btest
// Valid ASCII and valid ASCII control characters
CHECK(json_escape_utf8("a") == "a");
CHECK(json_escape_utf8("\b\f\n\r\t\x00\x15") == "\b\f\n\r\t\x00\x15");
// Table 3-7 in https://www.unicode.org/versions/Unicode12.0.0/ch03.pdf describes what is
// valid and invalid for the tests below
// Valid 2 Octet Sequence
CHECK(json_escape_utf8("\xc3\xb1") == "\xc3\xb1");
// Invalid 2 Octet Sequence
CHECK(json_escape_utf8("\xc3\x28") == "\\xc3(");
CHECK(json_escape_utf8("\xc0\x81") == "\\xc0\\x81");
CHECK(json_escape_utf8("\xc1\x81") == "\\xc1\\x81");
CHECK(json_escape_utf8("\xc2\xcf") == "\\xc2\\xcf");
// Invalid Sequence Identifier
CHECK(json_escape_utf8("\xa0\xa1") == "\\xa0\\xa1");
// Valid 3 Octet Sequence
CHECK(json_escape_utf8("\xe2\x82\xa1") == "\xe2\x82\xa1");
CHECK(json_escape_utf8("\xe0\xa3\xa1") == "\xe0\xa3\xa1");
// Invalid 3 Octet Sequence (in 2nd Octet)
CHECK(json_escape_utf8("\xe0\x80\xa1") == "\\xe0\\x80\\xa1");
CHECK(json_escape_utf8("\xe2\x28\xa1") == "\\xe2(\\xa1");
CHECK(json_escape_utf8("\xed\xa0\xa1") == "\\xed\\xa0\\xa1");
// Invalid 3 Octet Sequence (in 3rd Octet)
CHECK(json_escape_utf8("\xe2\x82\x28") == "\\xe2\\x82(");
// Valid 4 Octet Sequence
CHECK(json_escape_utf8("\xf0\x90\x8c\xbc") == "\xf0\x90\x8c\xbc");
CHECK(json_escape_utf8("\xf1\x80\x8c\xbc") == "\xf1\x80\x8c\xbc");
CHECK(json_escape_utf8("\xf4\x80\x8c\xbc") == "\xf4\x80\x8c\xbc");
// Invalid 4 Octet Sequence (in 2nd Octet)
CHECK(json_escape_utf8("\xf0\x80\x8c\xbc") == "\\xf0\\x80\\x8c\\xbc");
CHECK(json_escape_utf8("\xf2\x28\x8c\xbc") == "\\xf2(\\x8c\\xbc");
CHECK(json_escape_utf8("\xf4\x90\x8c\xbc") == "\\xf4\\x90\\x8c\\xbc");
// Invalid 4 Octet Sequence (in 3rd Octet)
CHECK(json_escape_utf8("\xf0\x90\x28\xbc") == "\\xf0\\x90(\\xbc");
// Invalid 4 Octet Sequence (in 4th Octet)
CHECK(json_escape_utf8("\xf0\x28\x8c\x28") == "\\xf0(\\x8c(");
// Invalid 4 Octet Sequence (too short)
CHECK(json_escape_utf8("\xf4\x80\x8c") == "\\xf4\\x80\\x8c");
CHECK(json_escape_utf8("\xf0") == "\\xf0");
}
// Returns a copy of val in which every byte that is neither printable ASCII,
// one of \b \f \n \r \t, nor part of a legal UTF-8 sequence has been replaced
// by the output of json_escape_byte() for that byte. Legal multi-byte UTF-8
// sequences are copied through unchanged.
string json_escape_utf8(const string& val)
    {
    auto val_data = reinterpret_cast<const unsigned char*>(val.c_str());
    auto val_size = val.length();

    // Reserve at least the size of the existing string to avoid resizing the string in the best-case
    // scenario where we don't have any multi-byte characters.
    string result;
    result.reserve(val_size);

    size_t idx;
    for ( idx = 0; idx < val_size; )
        {
        const char ch = val[idx];

        // Normal ASCII characters plus a few of the control characters can be inserted directly. The
        // rest of the control characters should be escaped as regular bytes.
        if ( ( ch >= 32 && ch <= 127 ) ||
             ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' )
            {
            result.push_back(ch);
            ++idx;
            continue;
            }
        else if ( ch >= 0 && ch < 32 )
            {
            result.append(json_escape_byte(ch));
            ++idx;
            continue;
            }

        // Find out how long the next character should be.
        unsigned int char_size = getNumBytesForUTF8(ch);

        // If no length could be determined, the sequence would run past the end of the input, or the
        // bytes are not a valid UTF-8 sequence, insert the one escaped byte into the string, step
        // forward one, and go to the next character.
        if ( char_size == 0 || idx+char_size > val_size || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 )
            {
            result.append(json_escape_byte(ch));
            ++idx;
            continue;
            }

        // A legal multi-byte sequence: copy all of its bytes in one go.
        result.append(val, idx, char_size);
        idx += char_size;
        }

    // Insert any of the remaining bytes into the string as escaped bytes
    for ( ; idx < val_size; ++idx )
        result.append(json_escape_byte(val[idx]));

    return result;
    }

View file

@ -118,7 +118,7 @@ std::string extract_ip_and_len(const std::string& i, int* len);
// Writes the two lowercase hexadecimal digits of byte to hex_out[0] (high
// nibble) and hex_out[1] (low nibble). No terminating NUL is written, so
// hex_out must provide room for at least two chars.
inline void bytetohex(unsigned char byte, char* hex_out)
    {
    static constexpr char hex_chars[] = "0123456789abcdef";
    hex_out[0] = hex_chars[(byte & 0xf0) >> 4];
    hex_out[1] = hex_chars[byte & 0x0f];
    }