Added optional script and redef bool to enable utf-8 in ASCII logs

2025-10-02 14:48:21 +00:00 · 2019-07-22 10:28:38 -07:00 · 2019-07-22 10:28:38 -07:00 · 66557d3178
commit 66557d3178
parent 6927dd1213
10 changed files with 111 additions and 2 deletions
--- a/scripts/base/frameworks/logging/writers/ascii.zeek
+++ b/scripts/base/frameworks/logging/writers/ascii.zeek
@ -26,6 +26,12 @@ export {
 	## This option is also available as a per-filter ``$config`` option.
 	const use_json = F &redef;
 	## If true, valid non-ASCII UTF-8 characters are written to the log
 	## unchanged instead of being escaped into the ``\xYY`` format.
 	##
 	## This option is also available as a per-filter ``$config`` option.
 	const enable_utf_8 = F &redef;
 	## Define the gzip level to compress the logs.  If 0, then no gzip
 	## compression is performed. Enabling compression also changes
 	## the log file name extension to include ".gz".
--- a/scripts/policy/tuning/enable-utf-8-logs.zeek
+++ b/scripts/policy/tuning/enable-utf-8-logs.zeek
@ -0,0 +1,4 @@
 ##! Loading this script makes the ASCII log writer emit UTF-8 characters
 ##! as-is instead of escaping them into the \xYY format.
 redef LogAscii::enable_utf_8=T;
--- a/scripts/test-all-policy.zeek
+++ b/scripts/test-all-policy.zeek
@ -112,4 +112,5 @@
@load tuning/defaults/packet-fragments.zeek
@load tuning/defaults/warnings.zeek
@load tuning/json-logs.zeek
@load tuning/enable-utf-8-logs.zeek
@load tuning/track-all-assets.zeek
--- a/src/Desc.cc
+++ b/src/Desc.cc
@ -10,6 +10,8 @@
 #include "File.h"
 #include "Reporter.h"
 #include "ConvertUTF.h"
 #define DEFAULT_SIZE 128
 #define SLOP 10
@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
 	include_stats = 0;
 	indent_with_spaces = 0;
 	escape = false;
 	utf8 = false;
 	}
 ODesc::~ODesc()
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
 	escape = true;
 	}
 void ODesc::EnableUTF8 ()
 	{
 	utf8 = true;
 	}
 void ODesc::PushIndent()
 	{
 	++indent_level;
@ -249,6 +257,23 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end)
 	return 0;
 	}
 // Returns the length in bytes (2 to 4) of the legal multi-byte UTF-8
 // sequence that starts at bytes[i], or 0 if no such sequence starts there.
 //
 // bytes: buffer being examined.
 // n:     total number of bytes in the buffer.
 // i:     offset of the candidate lead byte within the buffer.
 //
 // Single-byte (ASCII) input is never reported as a sequence: the caller
 // only asks about non-printable bytes, and an ASCII control character must
 // still be escaped rather than waved through as "valid UTF-8".
 size_t check_utf8 (const char* bytes, size_t n, size_t i)
 	{
 	// Nothing to inspect at or beyond the end of the buffer.
 	if ( i >= n )
 		return 0;
 	const unsigned char* start = reinterpret_cast<const unsigned char*>(bytes + i);
 	// Bytes below 0x80 are plain ASCII, not part of a multi-byte sequence.
 	if ( *start < 0x80 )
 		return 0;
 	// Only n - i bytes remain from position i; never hand the validator an
 	// end pointer past the buffer, or it may read beyond the data.
 	const size_t remaining = n - i;
 	const size_t num_to_check = remaining < 4 ? remaining : 4;
 	for ( size_t j = 2; j <= num_to_check; ++j )
 		{
 		if ( isLegalUTF8Sequence(start, start + j) )
 			return j;
 		}
 	return 0;
 	}
 pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
 	{
 	typedef pair<const char*, size_t> escape_pos;
@ -258,8 +283,21 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
 	for ( size_t i = 0; i < n; ++i )
 		{
-		//if ( ! isprint(bytes[i]) || bytes[i] == '\\' )
+		if (!isprint(bytes[i])) 
-		if ( bytes[i] == '\\' )
+			{
 			if (utf8)
 				{
 				size_t utf_found = check_utf8(bytes, n, i);
 				if (utf_found)
 					{
 					i += utf_found - 1;
 					continue;
 					}
 				}
 			return escape_pos(bytes + i, 1);
 			}
 		else if (bytes[i] == '\\' )
 			return escape_pos(bytes + i, 1);
 		size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);
--- a/src/Desc.h
+++ b/src/Desc.h
@ -54,6 +54,7 @@ public:
 	void SetFlush(int arg_do_flush)	{ do_flush = arg_do_flush; }
 	void EnableEscaping();
 	// Sets the utf8 flag so that non-ASCII UTF-8 characters may pass
 	// through instead of being escaped.
 	void EnableUTF8();
 	void AddEscapeSequence(const char* s) { escape_sequences.insert(s); }
 	void AddEscapeSequence(const char* s, size_t n)
 	    { escape_sequences.insert(string(s, n)); }
@ -185,6 +186,7 @@ protected:
 	unsigned int offset;	// where we are in the buffer
 	unsigned int size;	// size of buffer in bytes
 	bool utf8; // whether to let non-ASCII UTF-8 characters pass through unescaped
 	bool escape;	// escape unprintable characters in output?
 	typedef set<string> escape_set;
 	escape_set escape_sequences; // additional sequences of chars to escape
--- a/src/logging/writers/ascii/Ascii.cc
+++ b/src/logging/writers/ascii/Ascii.cc
@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend)
 	include_meta = false;
 	tsv = false;
 	use_json = false;
 	enable_utf_8 = false;
 	formatter = 0;
 	gzip_level = 0;
 	gzfile = nullptr;
@ -36,6 +37,7 @@ void Ascii::InitConfigOptions()
 	output_to_stdout = BifConst::LogAscii::output_to_stdout;
 	include_meta = BifConst::LogAscii::include_meta;
 	use_json = BifConst::LogAscii::use_json;
 	enable_utf_8 = BifConst::LogAscii::enable_utf_8;
 	gzip_level = BifConst::LogAscii::gzip_level;
 	separator.assign(
@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions()
 				}
 			}
 		else if ( strcmp(i->first, "enable_utf_8") == 0 )
 			{
 			if ( strcmp(i->second, "T") == 0 )
 				enable_utf_8 = true;
 			else if ( strcmp(i->second, "F") == 0 )
 				enable_utf_8 = false;
 			else
 				{
 				Error("invalid value for 'enable_utf_8', must be a string and either \"T\" or \"F\"");
 				return false;
 				}
 			}
 		else if ( strcmp(i->first, "output_to_stdout") == 0 )
 			{
 			if ( strcmp(i->second, "T") == 0 )
@ -181,6 +196,9 @@ bool Ascii::InitFormatter()
 		}
 	else
 		{
 		// Enable utf-8 if needed
 		if (enable_utf_8)
 			desc.EnableUTF8();
 		// Use the default "Bro logs" format.
 		desc.EnableEscaping();
 		desc.AddEscapeSequence(separator);
--- a/src/logging/writers/ascii/Ascii.h
+++ b/src/logging/writers/ascii/Ascii.h
@ -65,6 +65,7 @@ private:
 	int gzip_level; // level > 0 enables gzip compression
 	bool use_json;
 	bool enable_utf_8;
 	string json_timestamps;
 	threading::formatter::Formatter* formatter;
--- a/src/logging/writers/ascii/ascii.bif
+++ b/src/logging/writers/ascii/ascii.bif
@ -11,5 +11,6 @@ const set_separator: string;
 const empty_field: string;
 const unset_field: string;
 const use_json: bool;
 const enable_utf_8: bool;
 const json_timestamps: JSON::TimestampFormat;
 const gzip_level: count;
--- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log
+++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-utf8-enabled/test.log
@ -0,0 +1,12 @@
 #separator \x09
 #set_separator	,
 #empty_field	(empty)
 #unset_field	-
 #path	test
 #open	2019-07-22-10-13-09
 #fields	s
 #types	string
 foo ® bar
 दुनिया को नमस्ते
 hello 𠜎
 #close	2019-07-22-10-13-09
--- a/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek
+++ b/testing/btest/scripts/base/frameworks/logging/ascii-utf8-enabled.zeek
@ -0,0 +1,26 @@
 #
 # @TEST-EXEC: zeek -b %INPUT
 # @TEST-EXEC: btest-diff test.log
@load tuning/enable-utf-8-logs
 module Test;
 export {
 	redef enum Log::ID += { LOG };
 	type Log: record {
 		s: string;
 	} &log;
 }
 # Writes one log entry per UTF-8 sequence length (2, 3 and 4 bytes) so the
 # baseline can confirm each is logged without \xYY escaping.
 event zeek_init()
 {
 	local a = "foo \xc2\xae bar"; # U+00AE (registered sign) spelled out as its 2-byte UTF-8 encoding
 	local b = "दुनिया को नमस्ते"; # Hindi (Devanagari) text; its non-space characters are 3-byte UTF-8
 	local c = "hello 𠜎"; # ends with a CJK ideograph that needs 4 bytes in UTF-8
 	Log::create_stream(Test::LOG, [$columns=Log]);
 	Log::write(Test::LOG, [$s=a]);
 	Log::write(Test::LOG, [$s=b]);
 	Log::write(Test::LOG, [$s=c]);
 }