Merge remote-tracking branch 'origin/topic/seth/remove-unescaped_special_char-weird'

BIT-1611 #merged * origin/topic/seth/remove-unescaped_special_char-weird: Add urldecoding for the unofficial %u00AE style of encoding. Remove the unescaped_special_char HTTP weird.
2025-10-10 18:48:20 +00:00 · 2016-06-06 18:04:22 -07:00 · 2016-06-06 18:04:22 -07:00 · 17dc28b8a8
commit 17dc28b8a8
parent d59bb2e9d1 2f6e069c00
3 changed files with 53 additions and 28 deletions
--- a/19
+++ b/19
@ -1,4 +1,23 @@
 2.4-588 | 2016-06-06 17:59:34 -0700
  * Moved link-layer addresses into endpoints. The link-layer
    addresses are now part of the connection endpoints following the
    originator/responder pattern. (Jan Grashoefer)
  * Link-layer addresses are extracted for 802.11 plus RadioTap. (Jan
    Grashoefer)
  * Fix coverity error (uninitialized variable) (Johanna Amann)
  * Use ether_ntoa instead of ether_ntoa_r
    The latter is thread-safe, but a GNU addition which does not exist on
    OS-X. Since the function only is called in the main thread, it should
    not matter if it is or is not threadsafe. (Johanna Amann)
  * Fix FreeBSD/OSX compile problem due to headers (Johanna Amann)
 2.4-581 | 2016-05-30 10:58:19 -0700
  * Adding missing new script file mac-logging.bro. (Robin Sommer)
--- a/2
+++ b/2
@ -1 +1 @@
-2.4-581
+2.4-588
--- a/src/analyzer/protocol/http/HTTP.cc
+++ b/src/analyzer/protocol/http/HTTP.cc
@ -1813,12 +1813,12 @@ void HTTP_Analyzer::SkipEntityData(int is_orig)
 	}
 int analyzer::http::is_reserved_URI_char(unsigned char ch)
-	{ // see RFC 2396 (definition of URI)
+	{ // see RFC 3986 (definition of URI)
-	return strchr(";/?:@&=+$,", ch) != 0;
+	return strchr(":/?#[]@!$&'()*+,;=", ch) != 0;
 	}
 int analyzer::http::is_unreserved_URI_char(unsigned char ch)
-	{ // see RFC 2396 (definition of URI)
+	{ // see RFC 3986 (definition of URI)
 	return isalnum(ch) || strchr("-_.!~*\'()", ch) != 0;
 	}
@ -1835,19 +1835,6 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 	byte_vec decoded_URI = new u_char[line_end - line + 1];
 	byte_vec URI_p = decoded_URI;
 	// An 'unescaped_special_char' here means a character that *should*
 	// be escaped, but isn't in the URI.  A control character that
 	// appears directly in the URI would be an example.  The RFC implies
 	// that if we do not unescape the URI that we see in the trace, every
 	// character should be a printable one -- either reserved or unreserved
 	// (or '%').
 	//
 	// Counting the number of unescaped characters and generating a weird
 	// event on URI's with unescaped characters (which are rare) will
 	// let us locate strange-looking URI's in the trace -- those URI's
 	// are often interesting.
 	int unescaped_special_char = 0;
 	while ( line < line_end )
 		{
 		if ( *line == '%' )
@ -1881,6 +1868,36 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 				++line; // place line at the last hex digit
 				}
 			else if ( line_end - line >= 5 &&
 			          line[0] == 'u' &&
 			          isxdigit(line[1]) &&
 			          isxdigit(line[2]) &&
 			          isxdigit(line[3]) &&
 			          isxdigit(line[4]) )
 				{
 				// Decode escaping like this: %u00AE
 				// The W3C rejected escaping this way, and
 				// there is no RFC that specifies it.
 				// Apparently there is some software doing
 				// this sort of 4 byte unicode encoding anyway.
 				// Likely causing an increase in its use is
 				// the third edition of the ECMAScript spec
 				// having functions for encoding and decoding
 				// data in this format.
 				// If the first byte is null, let's eat it.
 				// It could just be ASCII encoded into this
 				// unicode escaping structure.
 				if ( ! (line[1] == '0' && line[2] == '0' ) )
 					*URI_p++ = (decode_hex(line[1]) << 4) +
 					            decode_hex(line[2]);
 				*URI_p++ = (decode_hex(line[3]) << 4) +
 					    decode_hex(line[4]);
 				line += 4;
 				}
 			else
 				{
 				if ( analyzer )
@ -1891,23 +1908,12 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e
 			}
 		else
 			{
 			if ( ! is_reserved_URI_char(*line) &&
 			     ! is_unreserved_URI_char(*line) )
 				// Count these up as a way to compress
 				// the corresponding Weird event to a
 				// single instance.
 				++unescaped_special_char;
 			*URI_p++ = *line;
 			}
 		++line;
 		}
 	URI_p[0] = 0;
 	if ( unescaped_special_char && analyzer )
 		analyzer->Weird("unescaped_special_URI_char");
 	return new BroString(1, decoded_URI, URI_p - decoded_URI);
 	}