From 3151a9538117819d706088a016dc73ceb768360f Mon Sep 17 00:00:00 2001
From: Seth Hall <seth@icir.org>
Date: Wed, 25 May 2016 08:33:30 -0400
Subject: [PATCH 1/2] Remove the unescaped_special_URI_char HTTP weird.

This weird points out a lot of benign stuff, and it could
easily be reimplemented in a Bro script.  This commit
also makes the minor change of updating the reserved and
unreserved characters to a newer version of the URI RFC.
---
 src/analyzer/protocol/http/HTTP.cc | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/src/analyzer/protocol/http/HTTP.cc b/src/analyzer/protocol/http/HTTP.cc
index 490a9d2324..ac6eef44d9 100644
--- a/src/analyzer/protocol/http/HTTP.cc
+++ b/src/analyzer/protocol/http/HTTP.cc
@@ -1813,12 +1813,12 @@ void HTTP_Analyzer::SkipEntityData(int is_orig)
 	}
 
 int analyzer::http::is_reserved_URI_char(unsigned char ch)
-	{ // see RFC 2396 (definition of URI)
-	return strchr(";/?:@&=+$,", ch) != 0;
+	{ // see RFC 3986 (definition of URI)
+	return strchr(":/?#[]@!$&'()*+,;=", ch) != 0;
 	}
 
 int analyzer::http::is_unreserved_URI_char(unsigned char ch)
-	{ // see RFC 2396 (definition of URI)
+	{ // see RFC 3986 (definition of URI)
 	return isalnum(ch) || strchr("-_.!~*\'()", ch) != 0;
 	}
 
@@ -1835,19 +1835,6 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 	byte_vec decoded_URI = new u_char[line_end - line + 1];
 	byte_vec URI_p = decoded_URI;
 
-	// An 'unescaped_special_char' here means a character that *should*
-	// be escaped, but isn't in the URI.  A control characters that
-	// appears directly in the URI would be an example.  The RFC implies
-	// that if we do not unescape the URI that we see in the trace, every
-	// character should be a printable one -- either reserved or unreserved
-	// (or '%').
-	//
-	// Counting the number of unescaped characters and generating a weird
-	// event on URI's with unescaped characters (which are rare) will
-	// let us locate strange-looking URI's in the trace -- those URI's
-	// are often interesting.
-	int unescaped_special_char = 0;
-
 	while ( line < line_end )
 		{
 		if ( *line == '%' )
@@ -1892,12 +1879,6 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 
 		else
 			{
-			if ( ! is_reserved_URI_char(*line) &&
-			     ! is_unreserved_URI_char(*line) )
-				// Count these up as a way to compress
-				// the corresponding Weird event to a
-				// single instance.
-				++unescaped_special_char;
 			*URI_p++ = *line;
 			}
 
@@ -1906,8 +1887,5 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 
 	URI_p[0] = 0;
 
-	if ( unescaped_special_char && analyzer )
-		analyzer->Weird("unescaped_special_URI_char");
-
 	return new BroString(1, decoded_URI, URI_p - decoded_URI);
 	}

From 2f6e069c003580beb1ab105bbf1bbc5dc42fabfd Mon Sep 17 00:00:00 2001
From: Seth Hall <seth@icir.org>
Date: Wed, 25 May 2016 09:35:23 -0400
Subject: [PATCH 2/2] Add urldecoding for the unofficial %u00AE style of
 encoding.

---
 src/analyzer/protocol/http/HTTP.cc | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/analyzer/protocol/http/HTTP.cc b/src/analyzer/protocol/http/HTTP.cc
index ac6eef44d9..4d6b6977b4 100644
--- a/src/analyzer/protocol/http/HTTP.cc
+++ b/src/analyzer/protocol/http/HTTP.cc
@@ -1868,6 +1868,36 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 				++line; // place line at the last hex digit
 				}
 
+			else if ( line_end - line >= 5 &&
+			          line[0] == 'u' && 
+			          isxdigit(line[1]) && 
+			          isxdigit(line[2]) && 
+			          isxdigit(line[3]) && 
+			          isxdigit(line[4]) )
+				{
+				// Decode escaping like this: %u00AE
+				// The W3C rejected escaping this way, and
+				// there is no RFC that specifies it.
+				// Apparently there is some software doing
+				// this sort of 4 hex digit unicode encoding anyway.
+				// Likely causing an increase in its use is
+				// the third edition of the ECMAScript spec
+				// having functions for encoding and decoding
+				// data in this format.
+
+				// If the first byte is null, let's eat it.
+				// It could just be ASCII encoded into this
+				// unicode escaping structure.
+				if ( ! (line[1] == '0' && line[2] == '0' ) )
+					*URI_p++ = (decode_hex(line[1]) << 4) +
+				               decode_hex(line[2]);
+				
+				*URI_p++ = (decode_hex(line[3]) << 4) +
+				           decode_hex(line[4]);
+
+				++line; ++line; ++line; ++line;
+				}
+
 			else
 				{
 				if ( analyzer )