zeek/scripts/base/utils/urls.zeek
Frerich Raabe bb879fc2aa Fixed decompose_uri choking on URIs with empty port component
A call such as

  decompose_uri("git://git.kernel.org:/pub/scm/linux/");

would raise an error along the lines of

  error in /usr/local/zeek-3.0.0/share/zeek/base/utils/urls.zeek, line 122: bad conversion to count (to_count(parts[1]) and )

This was because an empty string got passsed to the to_count()
function.

Let's improve the behaviour and rather consider the portnum component
of the URI to be uninitialized.
2020-01-24 22:24:00 +01:00

133 lines
3.1 KiB
Text

##! Functions for URL handling.
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
## A URI, as parsed by :zeek:id:`decompose_uri`.
type URI: record {
## The URL's scheme..
scheme: string &optional;
## The location, which could be a domain name or an IP address. Left empty if not
## specified.
netlocation: string;
## Port number, if included in URI.
portnum: count &optional;
## Full including the file name. Will be '/' if there's not path given.
path: string;
## Full file name, including extension, if there is a file name.
file_name: string &optional;
## The base filename, without extension, if there is a file name.
file_base: string &optional;
## The filename's extension, if there is a file name.
file_ext: string &optional;
## A table of all query parameters, mapping their keys to values, if there's a
## query.
params: table[string] of string &optional;
};
## Extracts URLs discovered in arbitrary text.
function find_all_urls(s: string): string_set
{
return find_all(s, url_regex);
}
## Extracts URLs discovered in arbitrary text without
## the URL scheme included.
function find_all_urls_without_scheme(s: string): string_set
{
local urls = find_all_urls(s);
local return_urls: set[string] = set();
for ( url in urls )
{
local no_scheme = sub(url, /^([a-zA-Z\-]{3,5})(:\/\/)/, "");
add return_urls[no_scheme];
}
return return_urls;
}
function decompose_uri(uri: string): URI
{
local parts: string_vec;
local u = URI($netlocation="", $path="/");
local s = uri;
if ( /\?/ in s )
{
u$params = table();
parts = split_string1(s, /\?/);
s = parts[0];
local query = parts[1];
if ( /&/ in query )
{
local opv = split_string(query, /&/);
for ( each in opv )
{
if ( /=/ in opv[each] )
{
parts = split_string1(opv[each], /=/);
u$params[parts[0]] = parts[1];
}
}
}
else if ( /=/ in query )
{
parts = split_string1(query, /=/);
u$params[parts[0]] = parts[1];
}
}
if ( /:\/\// in s )
{
# Parse scheme and remove from s.
parts = split_string1(s, /:\/\//);
u$scheme = parts[0];
s = parts[1];
}
if ( /\// in s )
{
# Parse path and remove from s.
parts = split_string1(s, /\//);
s = parts[0];
u$path = fmt("/%s", parts[1]);
if ( |u$path| > 1 && u$path[|u$path| - 1] != "/" )
{
local last_token = find_last(u$path, /\/.+/);
local full_filename = split_string1(last_token, /\//)[1];
if ( /\./ in full_filename )
{
u$file_name = full_filename;
u$file_base = split_string1(full_filename, /\./)[0];
u$file_ext = split_string1(full_filename, /\./)[1];
}
else
{
u$file_name = full_filename;
u$file_base = full_filename;
}
}
}
if ( /:/ in s )
{
# Parse location and port.
parts = split_string1(s, /:/);
u$netlocation = parts[0];
if ( parts[1] != "" )
{
u$portnum = to_count(parts[1]);
}
}
else
{
u$netlocation = s;
}
return u;
}