From 3c42350e77eff3c163bc38e4c15d2533c6c8bef8 Mon Sep 17 00:00:00 2001 From: akasza Date: Wed, 5 Nov 2014 20:44:03 -0800 Subject: [PATCH 1/3] uri parsing function --- scripts/base/utils/urls.bro | 58 +++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro index 8ef9ed7e2d..6c9eada67a 100644 --- a/scripts/base/utils/urls.bro +++ b/scripts/base/utils/urls.bro @@ -3,6 +3,16 @@ ## A regular expression for matching and extracting URLs. const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef; +type uri_record: record { + protocol: string &optional; + # this could be a domain name or an IP address + netlocation: string; + portnum: count &optional; + path: string &optional; + file_name: string &optional; + file_ext: string &optional; +}; + ## Extracts URLs discovered in arbitrary text. function find_all_urls(s: string): string_set { @@ -23,3 +33,51 @@ function find_all_urls_without_scheme(s: string): string_set return return_urls; } + +function decompose_uri(s: string): uri_record + { + local parts: string_array; + local u: uri = [$netlocation=""]; + + if (/:\/\// in s) + { + parts = split1(s, /:\/\//); + u$protocol = parts[1]; + s = parts[2]; + } + if (/\// in s) + { + parts = split1(s, /\//); + s = parts[1]; + u$path = fmt("/%s", parts[2]); + + if (|u$path| > 1) + { + local last_token: string = find_last(u$path, /\/.+/); + local full_filename = split1(last_token, /\//)[2]; + if (/\./ in full_filename) + { + u$file_name = split1(full_filename, /\./)[1]; + u$file_ext = split1(full_filename, /\./)[2]; + u$path = subst_string(u$path, fmt("%s.%s", u$file_name, u$file_ext), ""); + } + else + { + u$file_name = full_filename; + u$path = subst_string(u$path, u$file_name, ""); + } + } + } + if (/:/ in s) + { + parts = split1(s, /:/); + u$netlocation = parts[1]; + u$portnum = to_count(parts[2]); + } + else + { + u$netlocation = s; + } + + return u; + } From 69ce4d30382a90d121132bb0fcc699f6ed726b3e Mon Sep 17 00:00:00 2001 From: akasza Date: Thu, 6 Nov 2014 19:47:28 -0800 Subject: [PATCH 2/3] uri_decompose complete, need btests --- scripts/base/utils/urls.bro | 47 +++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro index 6c9eada67a..9beb424489 100644 --- a/scripts/base/utils/urls.bro +++ b/scripts/base/utils/urls.bro @@ -3,14 +3,16 @@ ## A regular expression for matching and extracting URLs. const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef; -type uri_record: record { - protocol: string &optional; +type URI: record { + scheme: string &optional; # this could be a domain name or an IP address netlocation: string; portnum: count &optional; path: string &optional; file_name: string &optional; file_ext: string &optional; + params_k: table[count] of string; + params_v: table[count] of string; }; ## Extracts URLs discovered in arbitrary text. @@ -34,11 +36,49 @@ function find_all_urls_without_scheme(s: string): string_set return return_urls; } -function decompose_uri(s: string): uri_record +function decompose_uri(s: string): URI { local parts: string_array; local u: uri = [$netlocation=""]; + if ( /\?/ in s) + { + local k: table[count] of string; + local v: table[count] of string; + u$params_k = k; + u$params_v = v; + + parts = split1(s, /\?/); + s = parts[1]; + local query: string = parts[2]; + if (/&/ in query) + { + local opv: table[count] of string = split(query, /&/); + + for (each in opv) + { + if (/=/ in opv[each]) + { + parts = split1(opv[each], /=/); + + # why does the order here matter? + u$params_k[each] = parts[1]; + u$params_v[each] = parts[2]; + } + else + { + # malformed URI + # domain.tld/path/file.ext?foo& + } + } + } + else + { + parts = split1(query, /=/); + u$params_k[0] = parts[1]; + u$params_v[0] = parts[2]; + } + } if (/:\/\// in s) { parts = split1(s, /:\/\//); @@ -78,6 +118,5 @@ function decompose_uri(s: string): uri_record { u$netlocation = s; } - return u; } From ea79c07730268cfa7ac515860ad3c5d678b93c87 Mon Sep 17 00:00:00 2001 From: akasza Date: Thu, 6 Nov 2014 19:52:03 -0800 Subject: [PATCH 3/3] uri parsing complete --- scripts/base/utils/urls.bro | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro index 9beb424489..1f42d517d6 100644 --- a/scripts/base/utils/urls.bro +++ b/scripts/base/utils/urls.bro @@ -61,15 +61,9 @@ function decompose_uri(s: string): URI { parts = split1(opv[each], /=/); - # why does the order here matter? u$params_k[each] = parts[1]; u$params_v[each] = parts[2]; } - else - { - # malformed URI - # domain.tld/path/file.ext?foo& - } } } else