Merge remote-tracking branch 'origin/topic/dev/non-ascii-logging'

* origin/topic/dev/non-ascii-logging:
  Removed Policy Script for UTF-8 Logs
  Commented out UTF-8 Script in Test All Policy
  Minor Style Tweak
  Use getNumBytesForUTF8 method to determine number of bytes
  Added Jon's test cases as unit tests
  Prioritizes escaping predefined Escape Sequences over Unescaping UTF-8 Sequences
  Added additional check to confirm anything unescaping is a multibyte UTF-8 sequence, addressing the test case Jon brought up
  Added optional script and redef bool to enable utf-8 in ASCII logs
  Initial Commit, removed std::isprint check to escape

Made minor code format and logic adjustments during merge.
2025-10-02 14:48:21 +00:00 · 2019-07-30 19:36:56 -07:00 · 2019-07-30 19:36:56 -07:00 · d1770853b3
commit d1770853b3
parent bae60aee31 da5a0e800e
16 changed files with 181 additions and 3 deletions
--- a/src/Desc.cc
+++ b/src/Desc.cc
@ -10,6 +10,8 @@
 #include "File.h"
 #include "Reporter.h"

+#include "ConvertUTF.h"
+
 #define DEFAULT_SIZE 128
 #define SLOP 10

@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
 	include_stats = 0;
 	indent_with_spaces = 0;
 	escape = false;
+	utf8 = false;
 	}

 ODesc::~ODesc()
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
 	escape = true;
 	}

+void ODesc::EnableUTF8 () // Turns on UTF-8 pass-through for this descriptor (off by default; the constructor sets utf8 = false).
+	{
+	utf8 = true; // Read by FirstEscapeLoc(): when set, non-printable bytes forming a legal multi-byte UTF-8 sequence are skipped instead of escaped.
+	}
+
 void ODesc::PushIndent()
 	{
 	++indent_level;
@ -258,13 +266,42 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)

 	for ( size_t i = 0; i < n; ++i )
 		{
-		if ( ! isprint(bytes[i]) || bytes[i] == '\\' )
+		auto printable = isprint(bytes[i]);
+
+		if ( ! printable && ! utf8 )
+			return escape_pos(bytes + i, 1);
+
+		if ( bytes[i] == '\\' )
 			return escape_pos(bytes + i, 1);

 		size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);

 		if ( len )
 			return escape_pos(bytes + i, len);
+
+		if ( ! printable && utf8 )
+			{
+			size_t utf_found = getNumBytesForUTF8(bytes[i]);
+
+			if ( utf_found == 1 )
+				return escape_pos(bytes + i, 1);
+
+			if ( i + utf_found > n )
+				// Don't know if this is even meant to be a utf8 encoding,
+				// since there's not enough bytes left to check it's a valid
+				// sequence, so maybe safest to just move up by one instead
+				// of escaping the entire remainder.
+				return escape_pos(bytes + i, 1);
+
+			if ( isLegalUTF8Sequence(reinterpret_cast<const unsigned char *>(bytes + i),
+			                         reinterpret_cast<const unsigned char *>(bytes + i + utf_found)) )
+				{
+				i += utf_found - 1;
+				continue;
+				}
+
+			return escape_pos(bytes + i, 1);
+			}
 		}

 	return escape_pos(0, 0);