Merge remote-tracking branch 'origin/topic/dev/non-ascii-logging'

* origin/topic/dev/non-ascii-logging:
  Removed Policy Script for UTF-8 Logs
  Commented out UTF-8 Script in Test All Policy
  Minor Style Tweak
  Use getNumBytesForUTF8 method to determine number of bytes
  Added Jon's test cases as unit tests
  Prioritizes escaping predefined Escape Sequences over Unescaping UTF-8 Sequences
  Added additional check to confirm anything unescaping is a multibyte UTF-8 sequence, addressing the test case Jon brought up
  Added optional script and redef bool to enable utf-8 in ASCII logs
  Initial Commit, removed std::isprint check to escape

Made minor code format and logic adjustments during merge.
2025-10-02 14:48:21 +00:00 · 2019-07-30 19:36:56 -07:00 · 2019-07-30 19:36:56 -07:00 · d1770853b3
commit d1770853b3
parent bae60aee31 da5a0e800e
16 changed files with 181 additions and 3 deletions
--- a/7
+++ b/7
@ -1,4 +1,11 @@
 2.6-723 | 2019-07-30 19:36:56 -0700
  * Add LogAscii::enable_utf_8 option (Dev Bali, Corelight)
    This option allows valid UTF-8 sequences to be written directly
    into the ASCII logs without any escaping.
 2.6-713 | 2019-07-30 18:12:49 +0000
  * Fix memory leaks in expire_func introduced by recent changes (Jon
--- a/3
+++ b/3
@ -162,6 +162,9 @@ New Functionality
      local three = make_adder(3);
      print three(5); # prints 8
 - Add ``LogAscii::enable_utf_8`` option to allow valid UTF-8 sequences
  to be written directly into the ASCII logs without any escaping.
 Changed Functionality
 ---------------------
--- a/2
+++ b/2
@ -1 +1 @@
-2.6-713
+2.6-723
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit bb2f06ff8e3b6ae1f362ad4b56da4ef2ebb12d0c
+Subproject commit f3b78d63696c3145144fbd8cfd82cdae15a13e98
--- a/scripts/base/frameworks/logging/writers/ascii.zeek
+++ b/scripts/base/frameworks/logging/writers/ascii.zeek
@ -26,6 +26,12 @@ export {
 	## This option is also available as a per-filter ``$config`` option.
 	const use_json = F &redef;
 	## If true, valid UTF-8 sequences will pass through unescaped and be
 	## written into logs.
 	##
 	## This option is also available as a per-filter ``$config`` option.
 	const enable_utf_8 = F &redef;
 	## Define the gzip level to compress the logs.  If 0, then no gzip
 	## compression is performed. Enabling compression also changes
 	## the log file name extension to include ".gz".
--- a/src/Desc.cc
+++ b/src/Desc.cc
@ -10,6 +10,8 @@
 #include "File.h"
 #include "Reporter.h"
 #include "ConvertUTF.h"
 #define DEFAULT_SIZE 128
 #define SLOP 10
@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
 	include_stats = 0;
 	indent_with_spaces = 0;
 	escape = false;
 	utf8 = false;
 	}
 ODesc::~ODesc()
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
 	escape = true;
 	}
 // Turns on UTF-8 pass-through by setting the flag that lets valid UTF-8
 // sequences go into the output unescaped.
 void ODesc::EnableUTF8 ()
 	{
 	utf8 = true;
 	}
 void ODesc::PushIndent()
 	{
 	++indent_level;
@ -258,13 +266,42 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
 	// Walk the buffer and return at the first byte (or byte sequence) that
 	// has to be escaped.
 	for ( size_t i = 0; i < n; ++i )
 		{
-		if ( ! isprint(bytes[i]) || bytes[i] == '\\' )
+		auto printable = isprint(bytes[i]);
 		// NOTE(review): isprint() receives a plain char here. Where char is
 		// signed, bytes >= 0x80 become negative, which <cctype> functions do
 		// not accept; a cast to unsigned char looks safer -- confirm.
 		// Without UTF-8 pass-through, any non-printable byte is reported
 		// for escaping on its own.
 		if ( ! printable && ! utf8 )
 			return escape_pos(bytes + i, 1);
 		// A backslash is always reported, regardless of the UTF-8 setting.
 		if ( bytes[i] == '\\' )
 			return escape_pos(bytes + i, 1);
 		// Checked before the UTF-8 branch below, so a match here wins even
 		// if the same bytes would also form a valid UTF-8 sequence.
 		size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);
 		if ( len )
 			return escape_pos(bytes + i, len);
 		if ( ! printable && utf8 )
 			{
 			// Presumably the sequence length implied by the lead byte --
 			// verify against the ConvertUTF implementation.
 			size_t utf_found = getNumBytesForUTF8(bytes[i]);
 			// A length of 1 cannot be a multi-byte sequence, so the
 			// non-printable byte is reported by itself.
 			if ( utf_found == 1 )
 				return escape_pos(bytes + i, 1);
 			if ( i + utf_found > n )
 				// Don't know if this is even meant to be a utf8 encoding,
 				// since there aren't enough bytes left to check it's a valid
 				// sequence, so maybe safest to just move up by one instead
 				// of escaping the entire remainder.
 				return escape_pos(bytes + i, 1);
 			if ( isLegalUTF8Sequence(reinterpret_cast<const unsigned char *>(bytes + i),
 			                         reinterpret_cast<const unsigned char *>(bytes + i + utf_found)) )
 				{
 				// Legal sequence: leave it unescaped and skip its remaining
 				// bytes; the loop's ++i then lands on the byte after it.
 				i += utf_found - 1;
 				continue;
 				}
 			// Not a legal sequence: report just this one byte.
 			return escape_pos(bytes + i, 1);
 			}
 		}
 	// No byte triggered an early return above; return a null location
 	// with length 0.
 	return escape_pos(0, 0);
--- a/src/Desc.h
+++ b/src/Desc.h
@ -54,6 +54,7 @@ public:
 	void SetFlush(int arg_do_flush)	{ do_flush = arg_do_flush; }
 	void EnableEscaping();
 	void EnableUTF8();
 	void AddEscapeSequence(const char* s) { escape_sequences.insert(s); }
 	void AddEscapeSequence(const char* s, size_t n)
 	    { escape_sequences.insert(string(s, n)); }
@ -185,6 +186,7 @@ protected:
 	unsigned int offset;	// where we are in the buffer
 	unsigned int size;	// size of buffer in bytes
 	bool utf8; // whether valid utf-8 sequences may pass through unescaped
 	bool escape;	// escape unprintable characters in output?
 	typedef set<string> escape_set;
 	escape_set escape_sequences; // additional sequences of chars to escape
--- a/src/logging/writers/ascii/Ascii.cc
+++ b/src/logging/writers/ascii/Ascii.cc
@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend)
 	include_meta = false;
 	tsv = false;
 	use_json = false;
 	enable_utf_8 = false;
 	formatter = 0;
 	gzip_level = 0;
 	gzfile = nullptr;
@ -36,6 +37,7 @@ void Ascii::InitConfigOptions()
 	output_to_stdout = BifConst::LogAscii::output_to_stdout;
 	include_meta = BifConst::LogAscii::include_meta;
 	use_json = BifConst::LogAscii::use_json;
 	enable_utf_8 = BifConst::LogAscii::enable_utf_8;
 	gzip_level = BifConst::LogAscii::gzip_level;
 	separator.assign(
@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions()
 				}
 			}
 		else if ( strcmp(i->first, "enable_utf_8") == 0 )
 			{
 			if ( strcmp(i->second, "T") == 0 )
 				enable_utf_8 = true;
 			else if ( strcmp(i->second, "F") == 0 )
 				enable_utf_8 = false;
 			else
 				{
 				Error("invalid value for 'enable_utf_8', must be a string and either \"T\" or \"F\"");
 				return false;
 				}
 			}
 		else if ( strcmp(i->first, "output_to_stdout") == 0 )
 			{
 			if ( strcmp(i->second, "T") == 0 )
@ -181,6 +196,10 @@ bool Ascii::InitFormatter()
 		}
 	else
 		{
 		// Enable utf-8 if needed
 		if ( enable_utf_8 )
 			desc.EnableUTF8();
 		// Use the default "Bro logs" format.
 		desc.EnableEscaping();
 		desc.AddEscapeSequence(separator);
--- a/src/logging/writers/ascii/Ascii.h
+++ b/src/logging/writers/ascii/Ascii.h
@ -65,6 +65,7 @@ private:
 	int gzip_level; // level > 0 enables gzip compression
 	bool use_json;
 	bool enable_utf_8;
 	string json_timestamps;
 	threading::formatter::Formatter* formatter;
--- a/src/logging/writers/ascii/ascii.bif
+++ b/src/logging/writers/ascii/ascii.bif
@ -11,5 +11,6 @@ const set_separator: string;
 const empty_field: string;
 const unset_field: string;
 const use_json: bool;
 const enable_utf_8: bool;
 const json_timestamps: JSON::TimestampFormat;
 const gzip_level: count;
--- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-control-characters/test.log
+++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-control-characters/test.log
@ -0,0 +1,10 @@
 #separator \x09
 #set_separator	,
 #empty_field	(empty)
 #unset_field	-
 #path	test
 #open	2019-07-23-11-40-47
 #fields	s
 #types	string
 foo \x0a\x09\x00 bar
 #close	2019-07-23-11-40-47
--- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-set-separator-escape/test.log
+++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled-set-separator-escape/test.log
@ -0,0 +1,10 @@
 #separator \x09
 #set_separator	\xc2\xae
 #empty_field	(empty)
 #unset_field	-
 #path	test
 #open	2019-07-23-11-46-43
 #fields	ss
 #types	set[string]
 \xc2\xae
 #close	2019-07-23-11-46-43
--- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log
+++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log
@ -0,0 +1,12 @@
 #separator \x09
 #set_separator	,
 #empty_field	(empty)
 #unset_field	-
 #path	test
 #open	2019-07-22-10-13-09
 #fields	s
 #types	string
 foo ® bar
 दुनिया को नमस्ते
 hello 𠜎
 #close	2019-07-22-10-13-09
--- a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-control-characters.zeek
+++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-control-characters.zeek
@ -0,0 +1,21 @@
 #
 # @TEST-EXEC: zeek -b %INPUT
 # @TEST-EXEC: btest-diff test.log
 redef LogAscii::enable_utf_8 = T;
 module Test;
 export {
 	redef enum Log::ID += { LOG };
 	type Log: record {
 		s: string;
 	} &log;
 }
 event zeek_init()
 	{
 	# Register the stream, then log one record whose string embeds a
 	# newline, a tab and a NUL byte. The baseline expects all three to
 	# stay hex-escaped even though enable_utf_8 is on.
 	Log::create_stream(Test::LOG, [$columns=Log]);
 	Log::write(Test::LOG, [$s="foo \n\t\0 bar"]);
 	}
--- a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-set-separator-escape.zeek
+++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled-set-separator-escape.zeek
@ -0,0 +1,23 @@
 #
 # @TEST-EXEC: zeek -b %INPUT
 # @TEST-EXEC: btest-diff test.log
 redef LogAscii::enable_utf_8 = T;
 redef LogAscii::set_separator = "\xc2\xae";
 module Test;
 export {
 	redef enum Log::ID += { LOG };
 	type Log: record {
 		ss: set[string];
 	} &log;
 }
 event zeek_init()
 	{
 	# The single set member is byte-identical to the configured
 	# set_separator, so the baseline expects it to be written escaped
 	# rather than passed through as UTF-8.
 	local members: set[string] = set("\xc2\xae");
 	Log::create_stream(Test::LOG, [$columns=Log]);
 	Log::write(Test::LOG, [$ss=members]);
 	}
--- a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek
+++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek
@ -0,0 +1,26 @@
 #
 # @TEST-EXEC: zeek -b %INPUT
 # @TEST-EXEC: btest-diff test.log
 redef LogAscii::enable_utf_8 = T;
 module Test;
 export {
 	redef enum Log::ID += { LOG };
 	type Log: record {
 		s: string;
 	} &log;
 }
 event zeek_init()
 	{
 	Log::create_stream(Test::LOG, [$columns=Log]);
 	# One record per encoded width, written in this order: a 2-byte
 	# sequence, 3-byte Hindi characters, and a 4-byte Chinese character.
 	Log::write(Test::LOG, [$s="foo \xc2\xae bar"]);
 	Log::write(Test::LOG, [$s="दुनिया को नमस्ते"]);
 	Log::write(Test::LOG, [$s="hello 𠜎"]);
 	}
		`@ -1 +1 @@`
			`Subproject commit bb2f06ff8e3b6ae1f362ad4b56da4ef2ebb12d0c`				`Subproject commit f3b78d63696c3145144fbd8cfd82cdae15a13e98`