diff --git a/CHANGES b/CHANGES index 26fd059492..faf441eaae 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,9 @@ +2.3-309 | 2014-11-18 12:17:53 -0800 + + * New decompose_uri() function in base/utils/urls that splits a URI + into its pieces. (Anthony Kasza). + 2.3-305 | 2014-11-18 11:09:04 -0800 * Improve coercion of &default expressions. Addresses BIT-1288. (Jon diff --git a/VERSION b/VERSION index fd48b52d6d..748ef210f1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.3-305 +2.3-309 diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro index 8ef9ed7e2d..d4279cd0ce 100644 --- a/scripts/base/utils/urls.bro +++ b/scripts/base/utils/urls.bro @@ -3,6 +3,28 @@ ## A regular expression for matching and extracting URLs. const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef; +## A URI, as parsed by :bro:id:`decompose_uri`. +type URI: record { + ## The URI's scheme. + scheme: string &optional; + ## The location, which could be a domain name or an IP address. Left empty if not + ## specified. + netlocation: string; + ## Port number, if included in the URI. + portnum: count &optional; + ## Full path including the file name. Will be '/' if there's no path given. + path: string; + ## Full file name, including extension, if there is a file name. + file_name: string &optional; + ## The base filename, without extension, if there is a file name. + file_base: string &optional; + ## The filename's extension, if there is a file name. + file_ext: string &optional; + ## A table of all query parameters, mapping their keys to values, if there's a + ## query. + params: table[string] of string &optional; +}; + ## Extracts URLs discovered in arbitrary text. 
function find_all_urls(s: string): string_set { @@ -23,3 +45,97 @@ function find_all_urls_without_scheme(s: string): string_set return return_urls; }
+
+## Decomposes a URI into its individual components.
+##
+## s: The URI to split up.
+##
+## Returns: A :bro:type:`URI` record. *netlocation* defaults to an empty
+##          string and *path* to "/"; all other fields are set only if the
+##          corresponding piece is present in *s*.
+function decompose_uri(s: string): URI
+	{
+	local parts: string_array;
+	local u: URI = [$netlocation="", $path="/"];
+
+	if ( /\?/ in s )
+		{
+		# Parse query: everything after the first '?'. s keeps the rest.
+		u$params = table();
+
+		parts = split1(s, /\?/);
+		s = parts[1];
+		local query: string = parts[2];
+
+		if ( /&/ in query )
+			{
+			local opv: table[count] of string = split(query, /&/);
+
+			for ( each in opv )
+				{
+				# Pieces without a '=' are skipped.
+				if ( /=/ in opv[each] )
+					{
+					parts = split1(opv[each], /=/);
+					u$params[parts[1]] = parts[2];
+					}
+				}
+			}
+		else if ( /=/ in query )
+			{
+			# Single parameter. The '=' check mirrors the loop above: for
+			# an empty query ("...?") or a bare key ("...?flag") there is
+			# no separator to split on, so parts[2] would not exist.
+			parts = split1(query, /=/);
+			u$params[parts[1]] = parts[2];
+			}
+		}
+
+	if ( /:\/\// in s )
+		{
+		# Parse scheme and remove from s.
+		parts = split1(s, /:\/\//);
+		u$scheme = parts[1];
+		s = parts[2];
+		}
+
+	if ( /\// in s )
+		{
+		# Parse path and remove from s.
+		parts = split1(s, /\//);
+		s = parts[1];
+		u$path = fmt("/%s", parts[2]);
+
+		# A path ending in '/' has no file name component.
+		if ( |u$path| > 1 && u$path[|u$path| - 1] != "/" )
+			{
+			local last_token: string = find_last(u$path, /\/.+/);
+			local full_filename = split1(last_token, /\//)[2];
+
+			if ( /\./ in full_filename )
+				{
+				# Base and extension are split at the first '.'.
+				u$file_name = full_filename;
+				u$file_base = split1(full_filename, /\./)[1];
+				u$file_ext = split1(full_filename, /\./)[2];
+				}
+			else
+				{
+				u$file_name = full_filename;
+				u$file_base = full_filename;
+				}
+			}
+		}
+
+	if ( /:/ in s )
+		{
+		# Parse location and port.
+		parts = split1(s, /:/);
+		u$netlocation = parts[1];
+		u$portnum = to_count(parts[2]);
+		}
+	else
+		u$netlocation = s;
+
+	return u;
+	}
diff --git a/testing/btest/Baseline/scripts.base.utils.decompose_uri/output b/testing/btest/Baseline/scripts.base.utils.decompose_uri/output new file mode 100644 index 0000000000..c31851d8bd --- /dev/null +++ b/testing/btest/Baseline/scripts.base.utils.decompose_uri/output @@ -0,0 +1,51 @@ +https://www.bro.org:42/documentation/faq.html?k1=v1&k2=v2 + -> [scheme=https, netlocation=www.bro.org, portnum=42, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[k2] = v2,^J^I[k1] = v1^J}] + + + -> [scheme=, netlocation=, portnum=, path=/, file_name=, file_base=, file_ext=, params=] + +https:// + -> [scheme=https, netlocation=, portnum=, path=/, file_name=, file_base=, file_ext=, params=] + +https://www.bro.org + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/, file_name=, file_base=, file_ext=, params=] + +https://www.bro.org/ + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/, file_name=, file_base=, file_ext=, params=] + +https://www.bro.org:42 + -> [scheme=https, netlocation=www.bro.org, portnum=42, path=/, file_name=, file_base=, file_ext=, params=] + +https://www.bro.org:42/ + -> [scheme=https, netlocation=www.bro.org, portnum=42, path=/, file_name=, file_base=, file_ext=, params=] + +https://www.bro.org/documentation + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation, file_name=documentation, file_base=documentation, file_ext=, params=] + +https://www.bro.org/documentation/ + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation/, file_name=, file_base=, file_ext=, params=] + +https://www.bro.org/documentation/faq + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation/faq, file_name=faq, file_base=faq, file_ext=, params=] + +https://www.bro.org/documentation/faq.html + -> [scheme=https,
netlocation=www.bro.org, portnum=, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params=] + +https://www.bro.org/documentation/faq.html? + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^J}] + +https://www.bro.org/documentation/faq.html?k=v + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[k] = v^J}] + +https://www.bro.org/documentation/faq.html?k= + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[k] = ^J}] + +https://www.bro.org/documentation/faq.html?=v + -> [scheme=https, netlocation=www.bro.org, portnum=, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[] = v^J}] + +file:///documentation/faq.html?=v + -> [scheme=file, netlocation=, portnum=, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[] = v^J}] + +www.bro.org/?foo=bar + -> [scheme=, netlocation=www.bro.org, portnum=, path=/, file_name=, file_base=, file_ext=, params={^J^I[foo] = bar^J}] + diff --git a/testing/btest/scripts/base/utils/decompose_uri.bro b/testing/btest/scripts/base/utils/decompose_uri.bro new file mode 100644 index 0000000000..6ed30e7889 --- /dev/null +++ b/testing/btest/scripts/base/utils/decompose_uri.bro @@ -0,0 +1,33 @@ +# @TEST-EXEC: bro -b %INPUT > output +# @TEST-EXEC: btest-diff output + +@load base/utils/urls + +function dc(s: string) # Prints s, then the record returned by decompose_uri(s), then an empty line. + { + print fmt("%s", s); + print fmt(" -> %s", decompose_uri(s)); + print ""; + } + +event bro_init() # Runs dc() over a fixed list of sample URIs; the TEST-EXEC lines above diff the printed output. + { + dc("https://www.bro.org:42/documentation/faq.html?k1=v1&k2=v2"); + dc(""); + dc("https://"); + dc("https://www.bro.org"); + dc("https://www.bro.org/"); + dc("https://www.bro.org:42"); + dc("https://www.bro.org:42/"); +
dc("https://www.bro.org/documentation"); + dc("https://www.bro.org/documentation/"); + dc("https://www.bro.org/documentation/faq"); + dc("https://www.bro.org/documentation/faq.html"); + dc("https://www.bro.org/documentation/faq.html?"); + dc("https://www.bro.org/documentation/faq.html?k=v"); + dc("https://www.bro.org/documentation/faq.html?k="); + dc("https://www.bro.org/documentation/faq.html?=v"); + dc("file:///documentation/faq.html?=v"); + dc("www.bro.org/?foo=bar"); + } +