From ad19f1e1bbfefde19c56ef4f6998e9aa7f818b45 Mon Sep 17 00:00:00 2001
From: Tim Wojtulewicz <tim@corelight.com>
Date: Wed, 24 Jul 2019 10:38:45 -0700
Subject: [PATCH] GHI-486: Switch over to using LLVM utf8-checking code to
 better validate characters

---
 src/util.cc                                   | 53 +++++--------------
 .../ssh.log                                   | 12 ++++-
 .../frameworks/logging/ascii-json-utf8.zeek   | 17 +++++-
 3 files changed, 40 insertions(+), 42 deletions(-)
diff --git a/src/util.cc b/src/util.cc
index f111c3f2b3..0ad45f72a5 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -51,6 +51,7 @@
 #include "Net.h"
 #include "Reporter.h"
 #include "iosource/Manager.h"
+#include "ConvertUTF.h"
 
 /**
  * Return IP address without enclosing brackets and any leading 0x.  Also
@@ -1889,8 +1890,9 @@ string json_escape_utf8(const string& val)
 	{
 	string result;
 	result.reserve(val.length());
+
+	auto val_data = reinterpret_cast<const unsigned char*>(val.c_str());
 	
-	size_t char_start = 0;
 	size_t idx;
 	for ( idx = 0; idx < val.length(); )
 		{
@@ -1910,50 +1912,23 @@ string json_escape_utf8(const string& val)
 			continue;
 			}
 
-		// The next bit is based on the table at https://en.wikipedia.org/wiki/UTF-8#Description.
-		// If next character is 11110xxx, this is a 4-byte UTF-8
-		unsigned int char_size = 0;
-		if ( (val[idx] & 0xF8) == 0xF0 ) char_size = 4;
-		
-		// If next character is 1110xxxx, this is a 3-byte UTF-8
-		else if ( (val[idx] & 0xF0) == 0xE0 ) char_size = 3;
-		
-		// If next character is 110xxxxx, this is a 2-byte UTF-8
-		else if ( (val[idx] & 0xE0) == 0xC0 ) char_size = 2;
-		
-		// This byte isn't a continuation byte, insert it as a byte and continue.
-		if ( char_size == 0)
+		// Find out how long the next character should be.
+		unsigned int char_size = getNumBytesForUTF8(val[idx]);
+
+		// If it says that it's a single character or it's not an invalid string UTF8 sequence, insert the one
+		// escaped byte into the string, step forward one, and go to the next character.
+		if ( char_size == 0 || isLegalUTF8Sequence(val_data+idx, val_data+idx+char_size) == 0 )
 			{
 			result.append(json_escape_byte(val[idx]));
 			++idx;
 			continue;
 			}
-		
-		// If we don't have enough bytes to get to the end of character, give up and insert all of the rest
-		// of them as escaped values.
-		if ( char_size > (val.length() - idx) )
-			break;
-		
-		// Loop through the rest of the supposed character and see if this is a valid character.
-		size_t c_idx = idx + 1;
-		for ( ; c_idx < idx + char_size; c_idx++ )
-			if ( (val[c_idx] & 0xC0) != 0x80 ) break;
-		
-		// if we didn't make it to the end of the character without finding an error, insert just this
-		// character and skip ahead. Otherwise insert all of the bytes for this character into the result.
-		if ( c_idx != idx + char_size )
-			{
-			result.append(json_escape_byte(val[idx]));
-			++idx;
-			continue;
-			}
-		else
-			{
-			for ( size_t step = 0; step < char_size; step++, idx++ )
-				result.push_back(val[idx]);
-			}
+
+		for ( size_t step = 0; step < char_size; step++, idx++ )
+			result.push_back(val[idx]);
 		}
-	
+
+	// Insert any of the remaining bytes into the string as escaped bytes
 	if ( idx != val.length() )
 		for ( ; idx < val.length(); ++idx )
 			result.append(json_escape_byte(val[idx]));
diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log
index b1a395906f..5a23d20794 100644
--- a/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log
+++ b/testing/btest/Baseline/scripts.base.frameworks.logging.ascii-json-utf8/ssh.log
@@ -2,11 +2,21 @@
 {"s":"\b\f\n\r\t\\x00\\x15"}
 {"s":"ñ"}
 {"s":"\\xc3("}
+{"s":"\\xc0\\x81"}
+{"s":"\\xc1\\x81"}
+{"s":"\\xc2\\xcf"}
 {"s":"\\xa0\\xa1"}
 {"s":"₡"}
+{"s":"࣡"}
+{"s":"\\xe0\\x80\\xa1"}
 {"s":"\\xe2(\\xa1"}
+{"s":"\\xed\\xa0\\xa1"}
 {"s":"\\xe2\\x82("}
 {"s":"𐌼"}
-{"s":"\\xf0(\\x8c\\xbc"}
+{"s":"񀌼"}
+{"s":"􀌼"}
+{"s":"\\xf0\\x80\\x8c\\xbc"}
+{"s":"\\xf2(\\x8c\\xbc"}
+{"s":"\\xf4\\x90\\x8c\\xbc"}
 {"s":"\\xf0\\x90(\\xbc"}
 {"s":"\\xf0(\\x8c("}
diff --git a/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek b/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek
index dabe86073d..c5a6689094 100644
--- a/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek
+++ b/testing/btest/scripts/base/frameworks/logging/ascii-json-utf8.zeek
@@ -27,29 +27,42 @@ event zeek_init()
 	Log::write(SSH::LOG, [$s="a"]);
 	Log::write(SSH::LOG, [$s="\b\f\n\r\t\x00\x15"]);
 
+	# Table 3-7 in https://www.unicode.org/versions/Unicode12.0.0/ch03.pdf describes what is
+	# valid and invalid for the tests below
+
 	# Valid 2 Octet Sequence
 	Log::write(SSH::LOG, [$s="\xc3\xb1"]);
-	
+
 	# Invalid 2 Octet Sequence
 	Log::write(SSH::LOG, [$s="\xc3\x28"]);
+	Log::write(SSH::LOG, [$s="\xc0\x81"]);
+	Log::write(SSH::LOG, [$s="\xc1\x81"]);
+	Log::write(SSH::LOG, [$s="\xc2\xcf"]);
 	
 	# Invalid Sequence Identifier
 	Log::write(SSH::LOG, [$s="\xa0\xa1"]);
 	
 	# Valid 3 Octet Sequence
 	Log::write(SSH::LOG, [$s="\xe2\x82\xa1"]);
+	Log::write(SSH::LOG, [$s="\xe0\xa3\xa1"]);
 	
 	# Invalid 3 Octet Sequence (in 2nd Octet)
+	Log::write(SSH::LOG, [$s="\xe0\x80\xa1"]);
 	Log::write(SSH::LOG, [$s="\xe2\x28\xa1"]);
+	Log::write(SSH::LOG, [$s="\xed\xa0\xa1"]);
 	
 	# Invalid 3 Octet Sequence (in 3rd Octet)
 	Log::write(SSH::LOG, [$s="\xe2\x82\x28"]);
 	
 	# Valid 4 Octet Sequence
 	Log::write(SSH::LOG, [$s="\xf0\x90\x8c\xbc"]);
+	Log::write(SSH::LOG, [$s="\xf1\x80\x8c\xbc"]);
+	Log::write(SSH::LOG, [$s="\xf4\x80\x8c\xbc"]);
 	
 	# Invalid 4 Octet Sequence (in 2nd Octet)
-	Log::write(SSH::LOG, [$s="\xf0\x28\x8c\xbc"]);
+	Log::write(SSH::LOG, [$s="\xf0\x80\x8c\xbc"]);
+	Log::write(SSH::LOG, [$s="\xf2\x28\x8c\xbc"]);
+	Log::write(SSH::LOG, [$s="\xf4\x90\x8c\xbc"]);
 	
 	# Invalid 4 Octet Sequence (in 3rd Octet)
 	Log::write(SSH::LOG, [$s="\xf0\x90\x28\xbc"]);