From 3151a9538117819d706088a016dc73ceb768360f Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Wed, 25 May 2016 08:33:30 -0400 Subject: [PATCH 1/2] Remove the unescaped_special_char HTTP weird. This weird points out a lot of benign stuff and it would be easily reimplemented in a Bro script. This commit also makes the minor change to update the reserved and unreserved characters from a newer form of the URI RFC. --- src/analyzer/protocol/http/HTTP.cc | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/analyzer/protocol/http/HTTP.cc b/src/analyzer/protocol/http/HTTP.cc index 490a9d2324..ac6eef44d9 100644 --- a/src/analyzer/protocol/http/HTTP.cc +++ b/src/analyzer/protocol/http/HTTP.cc @@ -1813,12 +1813,12 @@ void HTTP_Analyzer::SkipEntityData(int is_orig) } int analyzer::http::is_reserved_URI_char(unsigned char ch) - { // see RFC 2396 (definition of URI) - return strchr(";/?:@&=+$,", ch) != 0; + { // see RFC 3986 (definition of URI) + return strchr(":/?#[]@!$&'()*+,;=", ch) != 0; } int analyzer::http::is_unreserved_URI_char(unsigned char ch) - { // see RFC 2396 (definition of URI) + { // see RFC 3986 (definition of URI) return isalnum(ch) || strchr("-_.!~*\'()", ch) != 0; } @@ -1835,19 +1835,6 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e byte_vec decoded_URI = new u_char[line_end - line + 1]; byte_vec URI_p = decoded_URI; - // An 'unescaped_special_char' here means a character that *should* - // be escaped, but isn't in the URI. A control characters that - // appears directly in the URI would be an example. The RFC implies - // that if we do not unescape the URI that we see in the trace, every - // character should be a printable one -- either reserved or unreserved - // (or '%'). 
- // - // Counting the number of unescaped characters and generating a weird - // event on URI's with unescaped characters (which are rare) will - // let us locate strange-looking URI's in the trace -- those URI's - // are often interesting. - int unescaped_special_char = 0; - while ( line < line_end ) { if ( *line == '%' ) @@ -1892,12 +1879,6 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e else { - if ( ! is_reserved_URI_char(*line) && - ! is_unreserved_URI_char(*line) ) - // Count these up as a way to compress - // the corresponding Weird event to a - // single instance. - ++unescaped_special_char; *URI_p++ = *line; } @@ -1906,8 +1887,5 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e URI_p[0] = 0; - if ( unescaped_special_char && analyzer ) - analyzer->Weird("unescaped_special_URI_char"); - return new BroString(1, decoded_URI, URI_p - decoded_URI); } From 2f6e069c003580beb1ab105bbf1bbc5dc42fabfd Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Wed, 25 May 2016 09:35:23 -0400 Subject: [PATCH 2/2] Add urldecoding for the unofficial %u00AE style of encoding. --- src/analyzer/protocol/http/HTTP.cc | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/analyzer/protocol/http/HTTP.cc b/src/analyzer/protocol/http/HTTP.cc index ac6eef44d9..4d6b6977b4 100644 --- a/src/analyzer/protocol/http/HTTP.cc +++ b/src/analyzer/protocol/http/HTTP.cc @@ -1868,6 +1868,36 @@ BroString* analyzer::http::unescape_URI(const u_char* line, const u_char* line_e ++line; // place line at the last hex digit } + else if ( line_end - line >= 5 && + line[0] == 'u' && + isxdigit(line[1]) && + isxdigit(line[2]) && + isxdigit(line[3]) && + isxdigit(line[4]) ) + { + // Decode escaping like this: %u00AE + // The W3C rejected escaping this way, and + // there is no RFC that specifies it. + // Apparently there is some software doing + // this sort of 4 byte unicode encoding anyway. 
+ // Likely causing an increase in its use is + // the third edition of the ECMAScript spec + // having functions for encoding and decoding + // data in this format. + + // If the first byte is null, let's eat it. + // It could just be ASCII encoded into this + // unicode escaping structure. + if ( ! (line[1] == '0' && line[2] == '0' ) ) + *URI_p++ = (decode_hex(line[1]) << 4) + + decode_hex(line[2]); + + *URI_p++ = (decode_hex(line[3]) << 4) + + decode_hex(line[4]); + + ++line; ++line; ++line; ++line; + } + else { if ( analyzer )