diff --git a/scripts/base/frameworks/logging/writers/ascii.zeek b/scripts/base/frameworks/logging/writers/ascii.zeek index a32ce552e3..136b8b5eb7 100644 --- a/scripts/base/frameworks/logging/writers/ascii.zeek +++ b/scripts/base/frameworks/logging/writers/ascii.zeek @@ -26,6 +26,12 @@ export { ## This option is also available as a per-filter ``$config`` option. const use_json = F &redef; + ## If true, non-ASCII UTF-8 characters will pass through and + ## be written into logs unescaped. + ## + ## This option is also available as a per-filter ``$config`` option. + const enable_utf_8 = F &redef; + ## Define the gzip level to compress the logs. If 0, then no gzip ## compression is performed. Enabling compression also changes ## the log file name extension to include ".gz". diff --git a/scripts/policy/tuning/enable-utf-8-logs.zeek b/scripts/policy/tuning/enable-utf-8-logs.zeek new file mode 100644 index 0000000000..4f8eb3615f --- /dev/null +++ b/scripts/policy/tuning/enable-utf-8-logs.zeek @@ -0,0 +1,4 @@ +##! Loading this script will enable utf-8 characters +##! 
instead of escaping them into the \xYY format + +redef LogAscii::enable_utf_8=T; diff --git a/scripts/test-all-policy.zeek b/scripts/test-all-policy.zeek index 1741d42a18..93336b80ea 100644 --- a/scripts/test-all-policy.zeek +++ b/scripts/test-all-policy.zeek @@ -112,4 +112,5 @@ @load tuning/defaults/packet-fragments.zeek @load tuning/defaults/warnings.zeek @load tuning/json-logs.zeek +@load tuning/enable-utf-8-logs.zeek @load tuning/track-all-assets.zeek diff --git a/src/Desc.cc b/src/Desc.cc index 606ff7252f..3e1f2cef6e 100644 --- a/src/Desc.cc +++ b/src/Desc.cc @@ -10,6 +10,8 @@ #include "File.h" #include "Reporter.h" +#include "ConvertUTF.h" + #define DEFAULT_SIZE 128 #define SLOP 10 @@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f) include_stats = 0; indent_with_spaces = 0; escape = false; + utf8 = false; } ODesc::~ODesc() @@ -57,6 +60,11 @@ void ODesc::EnableEscaping() escape = true; } +void ODesc::EnableUTF8 () + { + utf8 = true; + } + void ODesc::PushIndent() { ++indent_level; @@ -249,6 +257,23 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end) return 0; } +size_t check_utf8 (const char* bytes, size_t n, size_t i) + { + // Returns the length of the legal multi-byte UTF-8 sequence starting at + // bytes[i], or 0 if there is none within the n-byte buffer. ASCII bytes are + // rejected up front so that unprintable ones are still escaped by the caller. + if ( i >= n || ! (static_cast<unsigned char>(bytes[i]) & 0x80) ) + return 0; + + // Examine only the n-i bytes that remain, so bytes[n] is never read. + size_t num_to_check = ((n-i) < 4) ? (n-i) : 4; + for (size_t j = 2; j <= num_to_check; ++j) + if (isLegalUTF8Sequence(reinterpret_cast<const unsigned char*>(bytes+i), reinterpret_cast<const unsigned char*>(bytes+i+j))) + return j; + + return 0; + } + pair ODesc::FirstEscapeLoc(const char* bytes, size_t n) { typedef pair escape_pos; @@ -258,8 +283,21 @@ pair ODesc::FirstEscapeLoc(const char* bytes, size_t n) for ( size_t i = 0; i < n; ++i ) { - //if ( ! 
isprint(bytes[i]) || bytes[i] == '\\' ) - if ( bytes[i] == '\\' ) + if (!isprint(static_cast<unsigned char>(bytes[i]))) // cast: isprint() is undefined for negative char values + { + if (utf8) + { + size_t utf_found = check_utf8(bytes, n, i); + if (utf_found) + { + i += utf_found - 1; + continue; + } + } + return escape_pos(bytes + i, 1); + } + + else if (bytes[i] == '\\' ) return escape_pos(bytes + i, 1); size_t len = StartsWithEscapeSequence(bytes + i, bytes + n); diff --git a/src/Desc.h b/src/Desc.h index 8f7ae53ac4..1de7b0c249 100644 --- a/src/Desc.h +++ b/src/Desc.h @@ -54,6 +54,7 @@ public: void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; } void EnableEscaping(); + void EnableUTF8(); void AddEscapeSequence(const char* s) { escape_sequences.insert(s); } void AddEscapeSequence(const char* s, size_t n) { escape_sequences.insert(string(s, n)); } @@ -185,6 +186,7 @@ protected: unsigned int offset; // where we are in the buffer unsigned int size; // size of buffer in bytes + bool utf8; //whether to allow non ascii utf-8 characters to pass through bool escape; // escape unprintable characters in output? 
typedef set escape_set; escape_set escape_sequences; // additional sequences of chars to escape diff --git a/src/logging/writers/ascii/Ascii.cc b/src/logging/writers/ascii/Ascii.cc index f84bde5488..bc0116d1f9 100644 --- a/src/logging/writers/ascii/Ascii.cc +++ b/src/logging/writers/ascii/Ascii.cc @@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend) include_meta = false; tsv = false; use_json = false; + enable_utf_8 = false; formatter = 0; gzip_level = 0; gzfile = nullptr; @@ -36,6 +37,7 @@ void Ascii::InitConfigOptions() output_to_stdout = BifConst::LogAscii::output_to_stdout; include_meta = BifConst::LogAscii::include_meta; use_json = BifConst::LogAscii::use_json; + enable_utf_8 = BifConst::LogAscii::enable_utf_8; gzip_level = BifConst::LogAscii::gzip_level; separator.assign( @@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions() } } + else if ( strcmp(i->first, "enable_utf_8") == 0 ) + { + if ( strcmp(i->second, "T") == 0 ) + enable_utf_8 = true; + else if ( strcmp(i->second, "F") == 0 ) + enable_utf_8 = false; + else + { + Error("invalid value for 'enable_utf_8', must be a string and either \"T\" or \"F\""); + return false; + } + } + else if ( strcmp(i->first, "output_to_stdout") == 0 ) { if ( strcmp(i->second, "T") == 0 ) @@ -181,6 +196,9 @@ bool Ascii::InitFormatter() } else { + // Enable utf-8 if needed + if (enable_utf_8) + desc.EnableUTF8(); // Use the default "Bro logs" format. 
desc.EnableEscaping(); desc.AddEscapeSequence(separator); diff --git a/src/logging/writers/ascii/Ascii.h b/src/logging/writers/ascii/Ascii.h index d1a6f2d0f3..1ee9c29599 100644 --- a/src/logging/writers/ascii/Ascii.h +++ b/src/logging/writers/ascii/Ascii.h @@ -65,6 +65,7 @@ private: int gzip_level; // level > 0 enables gzip compression bool use_json; + bool enable_utf_8; string json_timestamps; threading::formatter::Formatter* formatter; diff --git a/src/logging/writers/ascii/ascii.bif b/src/logging/writers/ascii/ascii.bif index b12b14f1a0..d8263f07c0 100644 --- a/src/logging/writers/ascii/ascii.bif +++ b/src/logging/writers/ascii/ascii.bif @@ -11,5 +11,6 @@ const set_separator: string; const empty_field: string; const unset_field: string; const use_json: bool; +const enable_utf_8: bool; const json_timestamps: JSON::TimestampFormat; const gzip_level: count; diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log new file mode 100644 index 0000000000..be0a7d94ba --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log @@ -0,0 +1,12 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path test +#open 2019-07-22-10-13-09 +#fields s +#types string +foo ® bar +दुनिया को नमस्ते +hello 𠜎 +#close 2019-07-22-10-13-09 diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek new file mode 100644 index 0000000000..ac507eda3b --- /dev/null +++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek @@ -0,0 +1,26 @@ +# +# @TEST-EXEC: zeek -b %INPUT +# @TEST-EXEC: btest-diff test.log + +@load tuning/enable-utf-8-logs + +module Test; +export { + redef enum Log::ID += { LOG }; + + type Log: record { + s: string; + } &log; +} + +event zeek_init() +{ + local a = "foo \xc2\xae 
bar"; # U+00AE, a 2-byte UTF-8 sequence + local b = "दुनिया को नमस्ते"; # Hindi (Devanagari) characters are 3-byte UTF-8 sequences + local c = "hello 𠜎"; # A 4-byte Chinese character + + Log::create_stream(Test::LOG, [$columns=Log]); + Log::write(Test::LOG, [$s=a]); + Log::write(Test::LOG, [$s=b]); + Log::write(Test::LOG, [$s=c]); +} \ No newline at end of file