Added optional script and redef bool to enable utf-8 in ASCII logs

2025-10-02 14:48:21 +00:00 · 2019-07-22 10:28:38 -07:00 · 2019-07-22 10:28:38 -07:00 · 66557d3178
commit 66557d3178
parent 6927dd1213
10 changed files with 111 additions and 2 deletions
--- a/src/Desc.cc
+++ b/src/Desc.cc
@ -10,6 +10,8 @@
 #include "File.h"
 #include "Reporter.h"

+#include "ConvertUTF.h"
+
 #define DEFAULT_SIZE 128
 #define SLOP 10

@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
 	include_stats = 0;
 	indent_with_spaces = 0;
 	escape = false;
+	utf8 = false;
 	}

 ODesc::~ODesc()
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
 	escape = true;
 	}

+// Switches this descriptor into UTF-8 mode. Once set, FirstEscapeLoc()
+// consults check_utf8() before reporting a non-printable byte as needing
+// an escape.
+void ODesc::EnableUTF8()
+	{
+	utf8 = true;
+	}
+
 void ODesc::PushIndent()
 	{
 	++indent_level;
@ -249,6 +257,23 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end)
 	return 0;
 	}

+// Returns the length in bytes (two to four) of the legal multi-byte UTF-8
+// sequence that begins at bytes[i], or 0 if no such sequence starts there.
+//
+// bytes: buffer being scanned.
+// n:     number of valid bytes in the buffer.
+// i:     offset of the candidate lead byte.
+size_t check_utf8 (const char* bytes, size_t n, size_t i)
+	{
+	// Nothing left to inspect; this also keeps n - i from wrapping around.
+	if ( i >= n )
+		return 0;
+
+	// Only n - i bytes remain from position i, so never hand the
+	// validator an end pointer beyond bytes + n.
+	size_t remaining = n - i;
+	size_t num_to_check = remaining < 4 ? remaining : 4;
+
+	const unsigned char* start = reinterpret_cast<const unsigned char*>(bytes + i);
+
+	// Begin at two bytes: a one-byte sequence is plain ASCII, and the
+	// caller only asks about non-printable bytes, which still have to be
+	// escaped rather than passed through.
+	for ( size_t j = 2; j <= num_to_check; ++j )
+		{
+		if ( isLegalUTF8Sequence(start, start + j) )
+			return j;
+		}
+
+	return 0;
+	}
+
 pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
 	{
 	typedef pair<const char*, size_t> escape_pos;
@ -258,8 +283,21 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)

 	for ( size_t i = 0; i < n; ++i )
 		{
-		//if ( ! isprint(bytes[i]) || bytes[i] == '\\' )
-		if ( bytes[i] == '\\' )
+		if (!isprint(bytes[i])) 
+			{
+			if (utf8)
+				{
+				size_t utf_found = check_utf8(bytes, n, i);
+				if (utf_found)
+					{
+					i += utf_found - 1;
+					continue;
+					}
+				}
+			return escape_pos(bytes + i, 1);
+			}
+
+		else if (bytes[i] == '\\' )
 			return escape_pos(bytes + i, 1);

 		size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);