diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro
index 8ef9ed7e2d..6c9eada67a 100644
--- a/scripts/base/utils/urls.bro
+++ b/scripts/base/utils/urls.bro
@@ -3,6 +3,24 @@
 ## A regular expression for matching and extracting URLs.
 const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
 
+## The components of a URI, as produced by decompose_uri.
+type uri_record: record {
+	## The scheme preceding "://" (e.g. "http"), if one was present.
+	protocol: string &optional;
+	## The host part: this could be a domain name or an IP address.
+	netlocation: string;
+	## The port following the host, if one was given.
+	portnum: count &optional;
+	## The path starting at the first "/" after the host, with any
+	## file name removed from its end.
+	path: string &optional;
+	## The last path segment up to its first ".", or the whole segment
+	## when it contains no ".". Unset when the path ends in "/".
+	file_name: string &optional;
+	## Everything after the first "." of the last path segment.
+	file_ext: string &optional;
+};
+
 ## Extracts URLs discovered in arbitrary text.
 function find_all_urls(s: string): string_set
 	{
@@ -23,3 +41,63 @@ function find_all_urls_without_scheme(s: string): string_set
 
 	return return_urls;
 	}
+
+## Splits a URI into its components.
+##
+## s: The URI to decompose, e.g. "http://example.com:8080/a/b/file.txt".
+##    The scheme, port and path are all optional.
+##
+## Returns: A record holding the pieces that were found. A query string or
+##          fragment is not split off and stays in the path/file fields.
+function decompose_uri(s: string): uri_record
+	{
+	local parts: string_array;
+	local u: uri_record = [$netlocation=""];
+
+	if ( /:\/\// in s )
+		{
+		parts = split1(s, /:\/\//);
+		u$protocol = parts[1];
+		s = parts[2];
+		}
+
+	if ( /\// in s )
+		{
+		parts = split1(s, /\//);
+		s = parts[1];
+		u$path = fmt("/%s", parts[2]);
+
+		# A path ending in "/" names a directory, so there is no file
+		# component to extract.
+		if ( |u$path| > 1 && sub_bytes(u$path, |u$path|, 1) != "/" )
+			{
+			local last_token: string = find_last(u$path, /\/.+/);
+			local full_filename = split1(last_token, /\//)[2];
+
+			if ( /\./ in full_filename )
+				{
+				parts = split1(full_filename, /\./);
+				u$file_name = parts[1];
+				u$file_ext = parts[2];
+				}
+			else
+				u$file_name = full_filename;
+
+			# Cut the file name off the end of the path by length.
+			# Substituting the name would also remove identical
+			# text occurring earlier in the path (e.g. "/foo/foo").
+			u$path = sub_bytes(u$path, 1, |u$path| - |full_filename|);
+			}
+		}
+
+	if ( /:/ in s )
+		{
+		parts = split1(s, /:/);
+		u$netlocation = parts[1];
+		u$portnum = to_count(parts[2]);
+		}
+	else
+		u$netlocation = s;
+
+	return u;
+	}