diff --git a/CHANGES b/CHANGES index b760fee336..629da49aa1 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,11 @@ +2.6-723 | 2019-07-30 19:36:56 -0700 + + * Add LogAscii::enable_utf_8 option (Dev Bali, Corelight) + + This option allows valid UTF-8 sequences to be written directly + into the ASCII logs without any escaping. + 2.6-713 | 2019-07-30 18:12:49 +0000 * Fix memory leaks in expire_func introduced by recent changes (Jon diff --git a/NEWS b/NEWS index 1ab7eb38cf..3f415db988 100644 --- a/NEWS +++ b/NEWS @@ -162,6 +162,9 @@ New Functionality local three = make_adder(3); print three(5); # prints 8 +- Add ``LogAscii::enable_utf_8`` option to allow valid UTF-8 sequences + to be written directly into the ASCII logs without any escaping. + Changed Functionality --------------------- diff --git a/VERSION b/VERSION index be79d9bda7..33b4eff95f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6-713 +2.6-723 diff --git a/doc b/doc index bb2f06ff8e..f3b78d6369 160000 --- a/doc +++ b/doc @@ -1 +1 @@ -Subproject commit bb2f06ff8e3b6ae1f362ad4b56da4ef2ebb12d0c +Subproject commit f3b78d63696c3145144fbd8cfd82cdae15a13e98 diff --git a/scripts/base/frameworks/logging/writers/ascii.zeek b/scripts/base/frameworks/logging/writers/ascii.zeek index a32ce552e3..c06de02242 100644 --- a/scripts/base/frameworks/logging/writers/ascii.zeek +++ b/scripts/base/frameworks/logging/writers/ascii.zeek @@ -26,6 +26,12 @@ export { ## This option is also available as a per-filter ``$config`` option. const use_json = F &redef; + ## If true, valid UTF-8 sequences will pass through unescaped and be + ## written into logs. + ## + ## This option is also available as a per-filter ``$config`` option. + const enable_utf_8 = F &redef; + ## Define the gzip level to compress the logs. If 0, then no gzip ## compression is performed. Enabling compression also changes ## the log file name extension to include ".gz". 
diff --git a/src/Desc.cc b/src/Desc.cc index f10f61fa77..aa9bfdacad 100644 --- a/src/Desc.cc +++ b/src/Desc.cc @@ -10,6 +10,8 @@ #include "File.h" #include "Reporter.h" +#include "ConvertUTF.h" + #define DEFAULT_SIZE 128 #define SLOP 10 @@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f) include_stats = 0; indent_with_spaces = 0; escape = false; + utf8 = false; } ODesc::~ODesc() @@ -57,6 +60,11 @@ void ODesc::EnableEscaping() escape = true; } +void ODesc::EnableUTF8 () + { + utf8 = true; + } + void ODesc::PushIndent() { ++indent_level; @@ -258,13 +266,42 @@ pair ODesc::FirstEscapeLoc(const char* bytes, size_t n) for ( size_t i = 0; i < n; ++i ) { - if ( ! isprint(bytes[i]) || bytes[i] == '\\' ) + auto printable = isprint(bytes[i]); + + if ( ! printable && ! utf8 ) + return escape_pos(bytes + i, 1); + + if ( bytes[i] == '\\' ) return escape_pos(bytes + i, 1); size_t len = StartsWithEscapeSequence(bytes + i, bytes + n); if ( len ) return escape_pos(bytes + i, len); + + if ( ! printable && utf8 ) + { + size_t utf_found = getNumBytesForUTF8(bytes[i]); + + if ( utf_found == 1 ) + return escape_pos(bytes + i, 1); + + if ( i + utf_found > n ) + // Don't know if this is even meant to be a utf8 encoding, + // since there's not enough bytes left to check it's a valid + // sequence, so maybe safest to just move up by one instead + // of escaping the entire remainder. 
+ return escape_pos(bytes + i, 1); + + if ( isLegalUTF8Sequence(reinterpret_cast(bytes + i), + reinterpret_cast(bytes + i + utf_found)) ) + { + i += utf_found - 1; + continue; + } + + return escape_pos(bytes + i, 1); + } } return escape_pos(0, 0); diff --git a/src/Desc.h b/src/Desc.h index 8f7ae53ac4..53a2e52b61 100644 --- a/src/Desc.h +++ b/src/Desc.h @@ -54,6 +54,7 @@ public: void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; } void EnableEscaping(); + void EnableUTF8(); void AddEscapeSequence(const char* s) { escape_sequences.insert(s); } void AddEscapeSequence(const char* s, size_t n) { escape_sequences.insert(string(s, n)); } @@ -185,6 +186,7 @@ protected: unsigned int offset; // where we are in the buffer unsigned int size; // size of buffer in bytes + bool utf8; // whether valid utf-8 sequences may pass through unescaped bool escape; // escape unprintable characters in output? typedef set escape_set; escape_set escape_sequences; // additional sequences of chars to escape diff --git a/src/logging/writers/ascii/Ascii.cc b/src/logging/writers/ascii/Ascii.cc index f84bde5488..79a6c49941 100644 --- a/src/logging/writers/ascii/Ascii.cc +++ b/src/logging/writers/ascii/Ascii.cc @@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend) include_meta = false; tsv = false; use_json = false; + enable_utf_8 = false; formatter = 0; gzip_level = 0; gzfile = nullptr; @@ -36,6 +37,7 @@ void Ascii::InitConfigOptions() output_to_stdout = BifConst::LogAscii::output_to_stdout; include_meta = BifConst::LogAscii::include_meta; use_json = BifConst::LogAscii::use_json; + enable_utf_8 = BifConst::LogAscii::enable_utf_8; gzip_level = BifConst::LogAscii::gzip_level; separator.assign( @@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions() } } + else if ( strcmp(i->first, "enable_utf_8") == 0 ) + { + if ( strcmp(i->second, "T") == 0 ) + enable_utf_8 = true; + else if ( strcmp(i->second, "F") == 0 ) + enable_utf_8 = false; + else + { + Error("invalid value 
for 'enable_utf_8', must be a string and either \"T\" or \"F\""); + return false; + } + } + else if ( strcmp(i->first, "output_to_stdout") == 0 ) { if ( strcmp(i->second, "T") == 0 ) @@ -181,6 +196,10 @@ bool Ascii::InitFormatter() } else { + // Enable utf-8 if needed + if ( enable_utf_8 ) + desc.EnableUTF8(); + // Use the default "Bro logs" format. desc.EnableEscaping(); desc.AddEscapeSequence(separator); diff --git a/src/logging/writers/ascii/Ascii.h b/src/logging/writers/ascii/Ascii.h index d1a6f2d0f3..1ee9c29599 100644 --- a/src/logging/writers/ascii/Ascii.h +++ b/src/logging/writers/ascii/Ascii.h @@ -65,6 +65,7 @@ private: int gzip_level; // level > 0 enables gzip compression bool use_json; + bool enable_utf_8; string json_timestamps; threading::formatter::Formatter* formatter; diff --git a/src/logging/writers/ascii/ascii.bif b/src/logging/writers/ascii/ascii.bif index b12b14f1a0..d8263f07c0 100644 --- a/src/logging/writers/ascii/ascii.bif +++ b/src/logging/writers/ascii/ascii.bif @@ -11,5 +11,6 @@ const set_separator: string; const empty_field: string; const unset_field: string; const use_json: bool; +const enable_utf_8: bool; const json_timestamps: JSON::TimestampFormat; const gzip_level: count; diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-control-characters/test.log b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-control-characters/test.log new file mode 100644 index 0000000000..48b8c72898 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-control-characters/test.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path test +#open 2019-07-23-11-40-47 +#fields s +#types string +foo \x0a\x09\x00 bar +#close 2019-07-23-11-40-47 diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-set-separator-escape/test.log 
b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-set-separator-escape/test.log new file mode 100644 index 0000000000..db59fed0b9 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-set-separator-escape/test.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator \xc2\xae +#empty_field (empty) +#unset_field - +#path test +#open 2019-07-23-11-46-43 +#fields ss +#types set[string] +\xc2\xae +#close 2019-07-23-11-46-43 diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log new file mode 100644 index 0000000000..be0a7d94ba --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log @@ -0,0 +1,12 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path test +#open 2019-07-22-10-13-09 +#fields s +#types string +foo ® bar +दुनिया को नमस्ते +hello 𠜎 +#close 2019-07-22-10-13-09 diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-control-characters.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-control-characters.zeek new file mode 100644 index 0000000000..e98a295c0b --- /dev/null +++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-control-characters.zeek @@ -0,0 +1,21 @@ +# +# @TEST-EXEC: zeek -b %INPUT +# @TEST-EXEC: btest-diff test.log + +redef LogAscii::enable_utf_8 = T; + +module Test; +export { + redef enum Log::ID += { LOG }; + + type Log: record { + s: string; + } &log; +} + +event zeek_init() + { + local a = "foo \n\t\0 bar"; + Log::create_stream(Test::LOG, [$columns=Log]); + Log::write(Test::LOG, [$s=a]); + } \ No newline at end of file diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-set-separator-escape.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-set-separator-escape.zeek new file 
mode 100644 index 0000000000..6f6ca1875b --- /dev/null +++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-set-separator-escape.zeek @@ -0,0 +1,23 @@ +# +# @TEST-EXEC: zeek -b %INPUT +# @TEST-EXEC: btest-diff test.log + +redef LogAscii::enable_utf_8 = T; + +redef LogAscii::set_separator = "\xc2\xae"; + +module Test; + +export { + redef enum Log::ID += { LOG }; + + type Log: record { + ss: set[string]; + } &log; +} + +event zeek_init() + { + Log::create_stream(Test::LOG, [$columns=Log]); + Log::write(Test::LOG, [$ss=set("\xc2\xae")]); + } \ No newline at end of file diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek new file mode 100644 index 0000000000..ac7b1f5daf --- /dev/null +++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek @@ -0,0 +1,26 @@ +# +# @TEST-EXEC: zeek -b %INPUT +# @TEST-EXEC: btest-diff test.log + +redef LogAscii::enable_utf_8 = T; + +module Test; +export { + redef enum Log::ID += { LOG }; + + type Log: record { + s: string; + } &log; +} + +event zeek_init() +{ + local a = "foo \xc2\xae bar"; # 2 bytes + local b = "दुनिया को नमस्ते"; # Hindi characters are 3 byte utf-8 + local c = "hello 𠜎"; # A 4 byte Chinese character + + Log::create_stream(Test::LOG, [$columns=Log]); + Log::write(Test::LOG, [$s=a]); + Log::write(Test::LOG, [$s=b]); + Log::write(Test::LOG, [$s=c]); +} \ No newline at end of file