From 50a0835b41f3bee692c767c4f6b680f1f592d68e Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Thu, 7 Nov 2019 12:19:43 -0700 Subject: [PATCH 1/5] Convert type-checking macros to actual functions --- src/Type.h | 64 +++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/Type.h b/src/Type.h index 7be4e01946..371e4385a5 100644 --- a/src/Type.h +++ b/src/Type.h @@ -698,53 +698,53 @@ extern BroType* init_type(Expr* init); // Returns true if argument is an atomic type. bool is_atomic_type(const BroType* t); -// True if the given type tag corresponds to an integral type. -#define IsIntegral(t) (t == TYPE_INT || t == TYPE_COUNT || t == TYPE_COUNTER) - -// True if the given type tag corresponds to an arithmetic type. -#define IsArithmetic(t) (IsIntegral(t) || t == TYPE_DOUBLE) - -// True if the given type tag corresponds to a boolean type. -#define IsBool(t) (t == TYPE_BOOL) - -// True if the given type tag corresponds to an interval type. -#define IsInterval(t) (t == TYPE_INTERVAL) - -// True if the given type tag corresponds to a record type. -#define IsRecord(t) (t == TYPE_RECORD || t == TYPE_UNION) - -// True if the given type tag corresponds to a function type. -#define IsFunc(t) (t == TYPE_FUNC) - -// True if the given type type is a vector. -#define IsVector(t) (t == TYPE_VECTOR) - -// True if the given type type is a string. -#define IsString(t) (t == TYPE_STRING) - // True if the given type tag corresponds to type that can be assigned to. extern int is_assignable(BroType* t); +// True if the given type tag corresponds to an integral type. +inline bool IsIntegral(TypeTag t) { return (t == TYPE_INT || t == TYPE_COUNT || t == TYPE_COUNTER); } + +// True if the given type tag corresponds to an arithmetic type. +inline bool IsArithmetic(TypeTag t) { return (IsIntegral(t) || t == TYPE_DOUBLE); } + +// True if the given type tag corresponds to a boolean type. +inline bool IsBool(TypeTag t) { return (t == TYPE_BOOL); } + +// True if the given type tag corresponds to an interval type. +inline bool IsInterval(TypeTag t) { return (t == TYPE_INTERVAL); } + +// True if the given type tag corresponds to a record type. +inline bool IsRecord(TypeTag t) { return (t == TYPE_RECORD || t == TYPE_UNION); } + +// True if the given type tag corresponds to a function type. +inline bool IsFunc(TypeTag t) { return (t == TYPE_FUNC); } + +// True if the given type type is a vector. +inline bool IsVector(TypeTag t) { return (t == TYPE_VECTOR); } + +// True if the given type type is a string. +inline bool IsString(TypeTag t) { return (t == TYPE_STRING); } + // True if the given type tag corresponds to the error type. -#define IsErrorType(t) (t == TYPE_ERROR) +inline bool IsErrorType(TypeTag t) { return (t == TYPE_ERROR); } // True if both tags are integral types. -#define BothIntegral(t1, t2) (IsIntegral(t1) && IsIntegral(t2)) +inline bool BothIntegral(TypeTag t1, TypeTag t2) { return (IsIntegral(t1) && IsIntegral(t2)); } // True if both tags are arithmetic types. -#define BothArithmetic(t1, t2) (IsArithmetic(t1) && IsArithmetic(t2)) +inline bool BothArithmetic(TypeTag t1, TypeTag t2) { return (IsArithmetic(t1) && IsArithmetic(t2)); } // True if either tags is an arithmetic type. -#define EitherArithmetic(t1, t2) (IsArithmetic(t1) || IsArithmetic(t2)) +inline bool EitherArithmetic(TypeTag t1, TypeTag t2) { return (IsArithmetic(t1) || IsArithmetic(t2)); } // True if both tags are boolean types. -#define BothBool(t1, t2) (IsBool(t1) && IsBool(t2)) +inline bool BothBool(TypeTag t1, TypeTag t2) { return (IsBool(t1) && IsBool(t2)); } // True if both tags are interval types. -#define BothInterval(t1, t2) (IsInterval(t1) && IsInterval(t2)) +inline bool BothInterval(TypeTag t1, TypeTag t2) { return (IsInterval(t1) && IsInterval(t2)); } // True if both tags are string types. -#define BothString(t1, t2) (IsString(t1) && IsString(t2)) +inline bool BothString(TypeTag t1, TypeTag t2) { return (IsString(t1) && IsString(t2)); } // True if either tag is the error type. -#define EitherError(t1, t2) (IsErrorType(t1) || IsErrorType(t2)) +inline bool EitherError(TypeTag t1, TypeTag t2) { return (IsErrorType(t1) || IsErrorType(t2)); } From 46e730842297d16d1ea9e02cb06edd95b02c81dc Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Thu, 7 Nov 2019 12:30:30 -0700 Subject: [PATCH 2/5] GHI-595: Convert from nlohmann/json to rapidjson for performance reasons --- src/3rdparty | 2 +- src/CMakeLists.txt | 9 +- src/Val.cc | 167 +++++++++++------- src/threading/formatters/JSON.cc | 99 +++++++---- src/threading/formatters/JSON.h | 26 ++- .../json.log | 10 +- .../Baseline/scripts.base.utils.json/output | 2 +- 7 files changed, 183 insertions(+), 132 deletions(-) diff --git a/src/3rdparty b/src/3rdparty index 2b3206b7ad..e2f0a1b3f1 160000 --- a/src/3rdparty +++ b/src/3rdparty @@ -1 +1 @@ -Subproject commit 2b3206b7add3472ea0736f2841473e11d506a85e +Subproject commit e2f0a1b3f1c54f9dc97a806261cc329e4dd596c8 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2c47f7dd3b..c64f9608fa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -413,15 +413,14 @@ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ ) install(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/json.hpp ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/sqlite3.h DESTINATION include/zeek/3rdparty -) + ) install(FILES - ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tsl-ordered-map/ordered_map.h - ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tsl-ordered-map/ordered_hash.h - DESTINATION include/zeek/3rdparty/tsl-ordered-map + ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/rapidjson/include/rapidjson/document.h + ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/rapidjson/include/rapidjson/writer.h + DESTINATION include/zeek/3rdparty/rapidjson/include/rapidjson ) ######################################################################## diff --git a/src/Val.cc b/src/Val.cc index 98ef6da532..b5b667aa56 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -12,6 +12,11 @@ #include #include +#define RAPIDJSON_HAS_STDSTRING 1 +#include "3rdparty/rapidjson/include/rapidjson/document.h" +#include "3rdparty/rapidjson/include/rapidjson/stringbuffer.h" +#include "3rdparty/rapidjson/include/rapidjson/writer.h" + #include "Val.h" #include "Net.h" #include "File.h" @@ -27,20 +32,17 @@ #include "broker/Data.h" -#include "3rdparty/json.hpp" -#include "3rdparty/tsl-ordered-map/ordered_map.h" +class NullDoubleWriter : public rapidjson::Writer { +public: + NullDoubleWriter(rapidjson::StringBuffer& buffer) : rapidjson::Writer(buffer) {} + bool Double(double d) + { + if ( rapidjson::internal::Double(d).IsNanOrInf() ) + return rapidjson::Writer::Null(); - -// Define a class for use with the json library that orders the keys in the same order that -// they were inserted. By default, the json library orders them alphabetically and we don't -// want it like that. -template, class KeyEqual = std::equal_to, - class AllocatorPair = typename std::allocator_traits::template rebind_alloc>, - class ValueTypeContainer = std::vector, AllocatorPair>> -using ordered_map = tsl::ordered_map; - -using ZeekJson = nlohmann::basic_json; + return rapidjson::Writer::Double(d); + } +}; Val::Val(Func* f) { @@ -433,46 +435,56 @@ TableVal* Val::GetRecordFields() return rt->GetRecordFieldsVal(rv); } -// This is a static method in this file to avoid including json.hpp in Val.h since it's huge. -static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nullptr) +// This is a static method in this file to avoid including rapidjson's headers in Val.h because they're huge. +static void BuildJSON(NullDoubleWriter& writer, Val* val, bool only_loggable=false, RE_Matcher* re=nullptr, const string& key="") { - // If the value wasn't set, return a nullptr. This will get turned into a 'null' in the json output. - if ( ! val ) - return nullptr; + if ( !key.empty() ) + writer.Key(key); - ZeekJson j; + // If the value wasn't set, write a null into the stream and return. + if ( ! val ) + { + writer.Null(); + return; + } + + rapidjson::Value j; BroType* type = val->Type(); switch ( type->Tag() ) { case TYPE_BOOL: - j = val->AsBool(); + writer.Bool(val->AsBool()); break; case TYPE_INT: - j = val->AsInt(); + writer.Int64(val->AsInt()); break; case TYPE_COUNT: - j = val->AsCount(); + writer.Uint64(val->AsCount()); break; case TYPE_COUNTER: - j = val->AsCounter(); + writer.Uint64(val->AsCounter()); break; case TYPE_TIME: - j = val->AsTime(); + writer.Double(val->AsTime()); break; case TYPE_DOUBLE: - j = val->AsDouble(); + writer.Double(val->AsDouble()); break; case TYPE_PORT: { auto* pval = val->AsPortVal(); - j.emplace("port", pval->Port()); - j.emplace("proto", pval->Protocol()); + writer.StartObject(); + writer.Key("port"); + writer.Int64(pval->Port()); + writer.Key("proto"); + writer.String(pval->Protocol()); + writer.EndObject(); break; } @@ -484,7 +496,7 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul ODesc d; d.SetStyle(RAW_STYLE); val->Describe(&d); - j = string(reinterpret_cast(d.Bytes()), d.Len()); + writer.String(reinterpret_cast(d.Bytes()), d.Len()); break; } @@ -496,7 +508,7 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul ODesc d; d.SetStyle(RAW_STYLE); val->Describe(&d); - j = json_escape_utf8(string(reinterpret_cast(d.Bytes()), d.Len())); + writer.String(json_escape_utf8(string(reinterpret_cast(d.Bytes()), d.Len()))); break; } @@ -506,9 +518,9 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul auto* tval = val->AsTableVal(); if ( tval->Type()->IsSet() ) - j = ZeekJson::array(); + writer.StartArray(); else - j = ZeekJson::object(); + writer.StartObject(); HashKey* k; TableEntryVal* entry; @@ -524,102 +536,125 @@ static ZeekJson BuildJSON(Val* val, bool only_loggable=false, RE_Matcher* re=nul else entry_key = lv->Ref(); - ZeekJson key_json = BuildJSON(entry_key, only_loggable, re); - if ( tval->Type()->IsSet() ) - j.emplace_back(std::move(key_json)); + BuildJSON(writer, entry_key, only_loggable, re); else { Val* entry_value = entry->Value(); - string key_string; - if ( key_json.is_string() ) - key_string = key_json; - else - key_string = key_json.dump(); + rapidjson::StringBuffer buffer; + NullDoubleWriter key_writer(buffer); + BuildJSON(key_writer, entry_key, only_loggable, re); + string key_str = buffer.GetString(); + if ( key_str[0] == '"') + key_str = key_str.substr(1); + if ( key_str[key_str.length()-1] == '"') + key_str = key_str.substr(0, key_str.length()-1); - j.emplace(key_string, BuildJSON(entry_value, only_loggable, re)); + BuildJSON(writer, entry_value, only_loggable, re, key_str); } Unref(entry_key); Unref(lv); } + if ( tval->Type()->IsSet() ) + writer.EndArray(); + else + writer.EndObject(); + break; } case TYPE_RECORD: { - j = ZeekJson::object(); + writer.StartObject(); + auto* rval = val->AsRecordVal(); auto rt = rval->Type()->AsRecordType(); for ( auto i = 0; i < rt->NumFields(); ++i ) { - auto field_name = rt->FieldName(i); - std::string key_string; - - if ( re && re->MatchAnywhere(field_name) != 0 ) - { - StringVal blank(""); - StringVal fn_val(field_name); - auto key_val = fn_val.Substitute(re, &blank, 0)->AsStringVal(); - key_string = key_val->ToStdString(); - Unref(key_val); - } - else - key_string = field_name; - Val* value = rval->LookupWithDefault(i); if ( value && ( ! only_loggable || rt->FieldHasAttr(i, ATTR_LOG) ) ) - j.emplace(key_string, BuildJSON(value, only_loggable, re)); + { + string key_str; + auto field_name = rt->FieldName(i); + + if ( re && re->MatchAnywhere(field_name) != 0 ) + { + StringVal blank(""); + StringVal fn_val(field_name); + auto key_val = fn_val.Substitute(re, &blank, 0)->AsStringVal(); + key_str = key_val->ToStdString(); + Unref(key_val); + } + else + key_str = field_name; + + BuildJSON(writer, value, only_loggable, re, key_str); + } Unref(value); } + writer.EndObject(); break; } case TYPE_LIST: { - j = ZeekJson::array(); + writer.StartArray(); + auto* lval = val->AsListVal(); size_t size = lval->Length(); for (size_t i = 0; i < size; i++) - j.push_back(BuildJSON(lval->Index(i), only_loggable, re)); + BuildJSON(writer, lval->Index(i), only_loggable, re); + writer.EndArray(); break; } case TYPE_VECTOR: { - j = ZeekJson::array(); + writer.StartArray(); + auto* vval = val->AsVectorVal(); size_t size = vval->SizeVal()->AsCount(); for (size_t i = 0; i < size; i++) - j.push_back(BuildJSON(vval->Lookup(i), only_loggable, re)); + BuildJSON(writer, vval->Lookup(i), only_loggable, re); + writer.EndArray(); break; } case TYPE_OPAQUE: { + writer.StartObject(); + + writer.Key("opaque_type"); auto* oval = val->AsOpaqueVal(); - j = { { "opaque_type", OpaqueMgr::mgr()->TypeID(oval) } }; + writer.String(OpaqueMgr::mgr()->TypeID(oval)); + + writer.EndObject(); break; } - default: break; + default: + writer.Null(); + break; } - - return j; } StringVal* Val::ToJSON(bool only_loggable, RE_Matcher* re) { - ZeekJson j = BuildJSON(this, only_loggable, re); - return new StringVal(j.dump()); + rapidjson::StringBuffer buffer; + NullDoubleWriter writer(buffer); + + BuildJSON(writer, this, only_loggable, re, ""); + + return new StringVal(buffer.GetString()); } IntervalVal::IntervalVal(double quantity, double units) : diff --git a/src/threading/formatters/JSON.cc b/src/threading/formatters/JSON.cc index 919ecd4a2b..6944af81cd 100644 --- a/src/threading/formatters/JSON.cc +++ b/src/threading/formatters/JSON.cc @@ -12,9 +12,18 @@ #include #include "JSON.h" +#include "3rdparty/rapidjson/include/rapidjson/internal/ieee754.h" using namespace threading::formatter; +bool JSON::NullDoubleWriter::Double(double d) + { + if ( rapidjson::internal::Double(d).IsNanOrInf() ) + return rapidjson::Writer::Null(); + + return rapidjson::Writer::Double(d); + } + JSON::JSON(MsgThread* t, TimeFormat tf) : Formatter(t), surrounding_braces(true) { timestamps = tf; @@ -27,21 +36,19 @@ JSON::~JSON() bool JSON::Describe(ODesc* desc, int num_fields, const Field* const * fields, Value** vals) const { - ZeekJson j = ZeekJson::object(); + rapidjson::StringBuffer buffer; + NullDoubleWriter writer(buffer); + + writer.StartObject(); for ( int i = 0; i < num_fields; i++ ) { if ( vals[i]->present ) - { - ZeekJson new_entry = BuildJSON(vals[i]); - if ( new_entry.is_null() ) - return false; - - j.emplace(fields[i]->name, new_entry); - } + BuildJSON(writer, vals[i], fields[i]->name); } - desc->Add(j.dump()); + writer.EndObject(); + desc->Add(buffer.GetString()); return true; } @@ -54,14 +61,18 @@ bool JSON::Describe(ODesc* desc, Value* val, const string& name) const return false; } - if ( ! val->present ) + if ( ! val->present || name.empty() ) return true; - ZeekJson j = BuildJSON(val, name); - if ( j.is_null() ) - return false; + rapidjson::Document doc; + rapidjson::StringBuffer buffer; + NullDoubleWriter writer(buffer); - desc->Add(j.dump()); + writer.StartObject(); + BuildJSON(writer, val, name); + writer.EndObject(); + + desc->Add(buffer.GetString()); return true; } @@ -71,47 +82,56 @@ threading::Value* JSON::ParseValue(const string& s, const string& name, TypeTag return nullptr; } -ZeekJson JSON::BuildJSON(Value* val, const string& name) const +void JSON::BuildJSON(NullDoubleWriter& writer, Value* val, const string& name) const { - // If the value wasn't set, return a nullptr. This will get turned into a 'null' in the json output. if ( ! val->present ) - return nullptr; + { + writer.Null(); + return; + } - ZeekJson j; switch ( val->type ) { case TYPE_BOOL: - j = val->val.int_val != 0; + if ( ! name.empty() ) writer.Key(name); + writer.Bool(val->val.int_val != 0); break; case TYPE_INT: - j = val->val.int_val; + if ( ! name.empty() ) writer.Key(name); + writer.Int64(val->val.int_val); break; case TYPE_COUNT: case TYPE_COUNTER: - j = val->val.uint_val; + if ( ! name.empty() ) writer.Key(name); + writer.Uint64(val->val.uint_val); break; case TYPE_PORT: - j = val->val.port_val.port; + if ( ! name.empty() ) writer.Key(name); + writer.Uint64(val->val.port_val.port); break; case TYPE_SUBNET: - j = Formatter::Render(val->val.subnet_val); + if ( ! name.empty() ) writer.Key(name); + writer.String(Formatter::Render(val->val.subnet_val)); break; case TYPE_ADDR: - j = Formatter::Render(val->val.addr_val); + if ( ! name.empty() ) writer.Key(name); + writer.String(Formatter::Render(val->val.addr_val)); break; case TYPE_DOUBLE: case TYPE_INTERVAL: - j = val->val.double_val; + if ( ! name.empty() ) writer.Key(name); + writer.Double(val->val.double_val); break; case TYPE_TIME: { + if ( ! name.empty() ) writer.Key(name); if ( timestamps == TS_ISO8601 ) { char buffer[40]; @@ -125,7 +145,7 @@ ZeekJson JSON::BuildJSON(Value* val, const string& name) const GetThread()->Error(GetThread()->Fmt("json formatter: failure getting time: (%lf)", val->val.double_val)); // This was a failure, doesn't really matter what gets put here // but it should probably stand out... - j = "2000-01-01T00:00:00.000000"; + writer.String("2000-01-01T00:00:00.000000"); } else { @@ -136,17 +156,17 @@ ZeekJson JSON::BuildJSON(Value* val, const string& name) const frac += 1; snprintf(buffer2, sizeof(buffer2), "%s.%06.0fZ", buffer, fabs(frac) * 1000000); - j = buffer2; + writer.String(buffer2, strlen(buffer2)); } } else if ( timestamps == TS_EPOCH ) - j = val->val.double_val; + writer.Double(val->val.double_val); else if ( timestamps == TS_MILLIS ) { // ElasticSearch uses milliseconds for timestamps - j = (uint64_t) (val->val.double_val * 1000); + writer.Uint64((uint64_t) (val->val.double_val * 1000)); } break; @@ -157,36 +177,37 @@ ZeekJson JSON::BuildJSON(Value* val, const string& name) const case TYPE_FILE: case TYPE_FUNC: { - j = json_escape_utf8(string(val->val.string_val.data, val->val.string_val.length)); + if ( ! name.empty() ) writer.Key(name); + writer.String(json_escape_utf8(string(val->val.string_val.data, val->val.string_val.length))); break; } case TYPE_TABLE: { - j = ZeekJson::array(); + if ( ! name.empty() ) writer.Key(name); + writer.StartArray(); for ( int idx = 0; idx < val->val.set_val.size; idx++ ) - j.push_back(BuildJSON(val->val.set_val.vals[idx])); + BuildJSON(writer, val->val.set_val.vals[idx]); + writer.EndArray(); break; } case TYPE_VECTOR: { - j = ZeekJson::array(); + if ( ! name.empty() ) writer.Key(name); + writer.StartArray(); for ( int idx = 0; idx < val->val.vector_val.size; idx++ ) - j.push_back(BuildJSON(val->val.vector_val.vals[idx])); + BuildJSON(writer, val->val.vector_val.vals[idx]); + writer.EndArray(); break; } default: + reporter->Warning("Unhandled type in JSON::BuildJSON"); break; } - - if ( ! name.empty() && ! j.is_null() ) - return { { name, j } }; - - return j; } diff --git a/src/threading/formatters/JSON.h b/src/threading/formatters/JSON.h index 71edadc61d..e640264a3a 100644 --- a/src/threading/formatters/JSON.h +++ b/src/threading/formatters/JSON.h @@ -2,24 +2,14 @@ #pragma once -#include "../Formatter.h" -#include "3rdparty/json.hpp" -#include "3rdparty/tsl-ordered-map/ordered_map.h" +#define RAPIDJSON_HAS_STDSTRING 1 +#include "3rdparty/rapidjson/include/rapidjson/document.h" +#include "3rdparty/rapidjson/include/rapidjson/writer.h" +#include "../Formatter.h" namespace threading { namespace formatter { -// Define a class for use with the json library that orders the keys in the same order that -// they were inserted. By default, the json library orders them alphabetically and we don't -// want it like that. -template, class KeyEqual = std::equal_to, - class AllocatorPair = typename std::allocator_traits::template rebind_alloc>, - class ValueTypeContainer = std::vector, AllocatorPair>> -using ordered_map = tsl::ordered_map; - -using ZeekJson = nlohmann::basic_json; - /** * A thread-safe class for converting values into a JSON representation * and vice versa. @@ -42,7 +32,13 @@ public: private: - ZeekJson BuildJSON(Value* val, const string& name = "") const; + class NullDoubleWriter : public rapidjson::Writer { + public: + NullDoubleWriter(rapidjson::StringBuffer& stream) : rapidjson::Writer(stream) {} + bool Double(double d); + }; + + void BuildJSON(NullDoubleWriter& writer, Value* val, const string& name = "") const; TimeFormat timestamps; bool surrounding_braces; diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-double/json.log b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-double/json.log index bb0f950b13..99d353b197 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-double/json.log +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-double/json.log @@ -9,11 +9,11 @@ {"d":0.1234} {"d":50000.0} {"d":-50000.0} -{"d":3.14e+15} -{"d":-3.14e+15} -{"d":1.79e+308} -{"d":-1.79e+308} -{"d":1.23456789e-05} +{"d":3140000000000000.0} +{"d":-3140000000000000.0} +{"d":1.79e308} +{"d":-1.79e308} +{"d":0.0000123456789} {"d":2.23e-308} {"d":-2.23e-308} {"d":null} diff --git a/testing/btest/Baseline/scripts.base.utils.json/output b/testing/btest/Baseline/scripts.base.utils.json/output index 8757a51433..0794bd9587 100644 --- a/testing/btest/Baseline/scripts.base.utils.json/output +++ b/testing/btest/Baseline/scripts.base.utils.json/output @@ -2,7 +2,7 @@ true 123 -999 3.14 --1.23456789e+308 +-1.23456789e308 9e-308 1480788576.868945 "-12.0 hrs" From ee0619f99905bfbdb7590010ed217e659bf78890 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Wed, 8 Jan 2020 12:06:52 -0700 Subject: [PATCH 3/5] Expand unit test for json_escape_utf8 to include all of the strings from the ascii-json-utf8 btest --- src/util.cc | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/util.cc b/src/util.cc index 2b46e38f7b..16ec14515a 100644 --- a/src/util.cc +++ b/src/util.cc @@ -2190,6 +2190,60 @@ TEST_CASE("util json_escape_utf8") CHECK(json_escape_utf8("string") == "string"); CHECK(json_escape_utf8("string\n") == "string\n"); CHECK(json_escape_utf8("string\x82") == "string\\x82"); + CHECK(json_escape_utf8("\x07\xd4\xb7o") == "\\x07Էo"); + + // These strings are duplicated from the scripts.base.frameworks.logging.ascii-json-utf8 btest + + // Valid ASCII and valid ASCII control characters + CHECK(json_escape_utf8("a") == "a"); + CHECK(json_escape_utf8("\b\f\n\r\t\x00\x15") == "\b\f\n\r\t\x00\x15"); + + // Table 3-7 in https://www.unicode.org/versions/Unicode12.0.0/ch03.pdf describes what is + // valid and invalid for the tests below + + // Valid 2 Octet Sequence + CHECK(json_escape_utf8("\xc3\xb1") == "\xc3\xb1"); + + // Invalid 2 Octet Sequence + CHECK(json_escape_utf8("\xc3\x28") == "\\xc3("); + CHECK(json_escape_utf8("\xc0\x81") == "\\xc0\\x81"); + CHECK(json_escape_utf8("\xc1\x81") == "\\xc1\\x81"); + CHECK(json_escape_utf8("\xc2\xcf") == "\\xc2\\xcf"); + + // Invalid Sequence Identifier + CHECK(json_escape_utf8("\xa0\xa1") == "\\xa0\\xa1"); + + // Valid 3 Octet Sequence + CHECK(json_escape_utf8("\xe2\x82\xa1") == "\xe2\x82\xa1"); + CHECK(json_escape_utf8("\xe0\xa3\xa1") == "\xe0\xa3\xa1"); + + // Invalid 3 Octet Sequence (in 2nd Octet) + CHECK(json_escape_utf8("\xe0\x80\xa1") == "\\xe0\\x80\\xa1"); + CHECK(json_escape_utf8("\xe2\x28\xa1") == "\\xe2(\\xa1"); + CHECK(json_escape_utf8("\xed\xa0\xa1") == "\\xed\\xa0\\xa1"); + + // Invalid 3 Octet Sequence (in 3rd Octet) + CHECK(json_escape_utf8("\xe2\x82\x28") == "\\xe2\\x82("); + + // Valid 4 Octet Sequence + CHECK(json_escape_utf8("\xf0\x90\x8c\xbc") == "\xf0\x90\x8c\xbc"); + CHECK(json_escape_utf8("\xf1\x80\x8c\xbc") == "\xf1\x80\x8c\xbc"); + CHECK(json_escape_utf8("\xf4\x80\x8c\xbc") == "\xf4\x80\x8c\xbc"); + + // Invalid 4 Octet Sequence (in 2nd Octet) + CHECK(json_escape_utf8("\xf0\x80\x8c\xbc") == "\\xf0\\x80\\x8c\\xbc"); + CHECK(json_escape_utf8("\xf2\x28\x8c\xbc") == "\\xf2(\\x8c\\xbc"); + CHECK(json_escape_utf8("\xf4\x90\x8c\xbc") == "\\xf4\\x90\\x8c\\xbc"); + + // Invalid 4 Octet Sequence (in 3rd Octet) + CHECK(json_escape_utf8("\xf0\x90\x28\xbc") == "\\xf0\\x90(\\xbc"); + + // Invalid 4 Octet Sequence (in 4th Octet) + CHECK(json_escape_utf8("\xf0\x28\x8c\x28") == "\\xf0(\\x8c("); + + // Invalid 4 Octet Sequence (too short) + CHECK(json_escape_utf8("\xf4\x80\x8c") == "\\xf4\\x80\\x8c"); + CHECK(json_escape_utf8("\xf0") == "\\xf0"); } string json_escape_utf8(const string& val) From 23f551876cc21ce98d9d180d95f81ae6d28c9e13 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Wed, 8 Jan 2020 12:08:32 -0700 Subject: [PATCH 4/5] Optimize json_escape_utf8 a bit by removing repeated calls to string methods --- src/util.cc | 46 +++++++++++++++++++++++++--------------------- src/util.h | 2 +- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/util.cc b/src/util.cc index 16ec14515a..d7a699ab6e 100644 --- a/src/util.cc +++ b/src/util.cc @@ -2248,50 +2248,54 @@ TEST_CASE("util json_escape_utf8") string json_escape_utf8(const string& val) { - string result; - result.reserve(val.length()); - auto val_data = reinterpret_cast(val.c_str()); + auto val_size = val.length(); + + // Reserve at least the size of the existing string to avoid resizing the string in the best-case + // scenario where we don't have any multi-byte characters. + string result; + result.reserve(val_size); size_t idx; - for ( idx = 0; idx < val.length(); ) + for ( idx = 0; idx < val_size; ) { - // Normal ASCII characters plus a few of the control characters can be inserted directly. The rest of - // the control characters should be escaped as regular bytes. - if ( ( val[idx] >= 32 && val[idx] <= 127 ) || - val[idx] == '\b' || val[idx] == '\f' || val[idx] == '\n' || val[idx] == '\r' || val[idx] == '\t' ) + char ch = val[idx]; + + // Normal ASCII characters plus a few of the control characters can be inserted directly. The + // rest of the control characters should be escaped as regular bytes. + if ( ( ch >= 32 && ch <= 127 ) || + ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' ) { - result.push_back(val[idx]); + result.push_back(ch); ++idx; continue; } - else if ( val[idx] >= 0 && val[idx] < 32 ) + else if ( ch >= 0 && ch < 32 ) { - result.append(json_escape_byte(val[idx])); + result.append(json_escape_byte(ch)); ++idx; continue; } // Find out how long the next character should be. - unsigned int char_size = getNumBytesForUTF8(val[idx]); + unsigned int char_size = getNumBytesForUTF8(ch); - // If it says that it's a single character or it's not an invalid string UTF8 sequence, insert the one - // escaped byte into the string, step forward one, and go to the next character. - if ( char_size == 0 || idx+char_size > val.length() || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 ) + // If it says that it's a single character or it's not an valid string UTF8 sequence, insert + // the one escaped byte into the string, step forward one, and go to the next character. + if ( char_size == 0 || idx+char_size > val_size || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 ) { - result.append(json_escape_byte(val[idx])); + result.append(json_escape_byte(ch)); ++idx; continue; } - for ( size_t step = 0; step < char_size; step++, idx++ ) - result.push_back(val[idx]); + result.append(val, idx, char_size); + idx += char_size; } // Insert any of the remaining bytes into the string as escaped bytes - if ( idx != val.length() ) - for ( ; idx < val.length(); ++idx ) - result.append(json_escape_byte(val[idx])); + for ( ; idx < val_size; ++idx ) + result.append(json_escape_byte(val[idx])); return result; } diff --git a/src/util.h b/src/util.h index 3665518f96..5a5a8c5159 100644 --- a/src/util.h +++ b/src/util.h @@ -118,7 +118,7 @@ std::string extract_ip_and_len(const std::string& i, int* len); inline void bytetohex(unsigned char byte, char* hex_out) { - static const char hex_chars[] = "0123456789abcdef"; + static constexpr char hex_chars[] = "0123456789abcdef"; hex_out[0] = hex_chars[(byte & 0xf0) >> 4]; hex_out[1] = hex_chars[byte & 0x0f]; } From 227d29db80f0769216de526a9fad83f6adfa5623 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Tue, 14 Jan 2020 08:29:32 -0700 Subject: [PATCH 5/5] Use the list of files from clang-tidy when searching for unit tests The previous method for searching for these files included everything from src/3rdparty, which breaks when rapidjson is included. We don't want to include that directory anyways. We already had a good list of files to scan from the previous clang-tidy and adding any that are missing is an easy task. --- src/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c64f9608fa..0a2cfa388a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -438,9 +438,8 @@ create_clang_tidy_target() # Scan all .cc files for TEST_CASE macros and generate CTest targets. if (ENABLE_ZEEK_UNIT_TESTS) - file(GLOB_RECURSE all_cc_files "*.cc") set(test_cases "") - foreach (cc_file ${all_cc_files}) + foreach (cc_file ${TIDY_SRCS}) file (STRINGS ${cc_file} test_case_lines REGEX "TEST_CASE") foreach (line ${test_case_lines}) string(REGEX REPLACE "TEST_CASE\\(\"(.+)\"\\)" "\\1" test_case "${line}")