mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
uri parsing function
This commit is contained in:
parent
9045288ad3
commit
3c42350e77
1 changed files with 58 additions and 0 deletions
|
@ -3,6 +3,16 @@
|
||||||
## A regular expression for matching and extracting URLs.
|
## A regular expression for matching and extracting URLs.
|
||||||
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
|
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
|
||||||
|
|
||||||
|
type uri_record: record {
|
||||||
|
protocol: string &optional;
|
||||||
|
# this could be a domain name or an IP address
|
||||||
|
netlocation: string;
|
||||||
|
portnum: count &optional;
|
||||||
|
path: string &optional;
|
||||||
|
file_name: string &optional;
|
||||||
|
file_ext: string &optional;
|
||||||
|
};
|
||||||
|
|
||||||
## Extracts URLs discovered in arbitrary text.
|
## Extracts URLs discovered in arbitrary text.
|
||||||
function find_all_urls(s: string): string_set
|
function find_all_urls(s: string): string_set
|
||||||
{
|
{
|
||||||
|
@ -23,3 +33,51 @@ function find_all_urls_without_scheme(s: string): string_set
|
||||||
|
|
||||||
return return_urls;
|
return return_urls;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function decompose_uri(s: string): uri_record
|
||||||
|
{
|
||||||
|
local parts: string_array;
|
||||||
|
local u: uri = [$netlocation=""];
|
||||||
|
|
||||||
|
if (/:\/\// in s)
|
||||||
|
{
|
||||||
|
parts = split1(s, /:\/\//);
|
||||||
|
u$protocol = parts[1];
|
||||||
|
s = parts[2];
|
||||||
|
}
|
||||||
|
if (/\// in s)
|
||||||
|
{
|
||||||
|
parts = split1(s, /\//);
|
||||||
|
s = parts[1];
|
||||||
|
u$path = fmt("/%s", parts[2]);
|
||||||
|
|
||||||
|
if (|u$path| > 1)
|
||||||
|
{
|
||||||
|
local last_token: string = find_last(u$path, /\/.+/);
|
||||||
|
local full_filename = split1(last_token, /\//)[2];
|
||||||
|
if (/\./ in full_filename)
|
||||||
|
{
|
||||||
|
u$file_name = split1(full_filename, /\./)[1];
|
||||||
|
u$file_ext = split1(full_filename, /\./)[2];
|
||||||
|
u$path = subst_string(u$path, fmt("%s.%s", u$file_name, u$file_ext), "");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
u$file_name = full_filename;
|
||||||
|
u$path = subst_string(u$path, u$file_name, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (/:/ in s)
|
||||||
|
{
|
||||||
|
parts = split1(s, /:/);
|
||||||
|
u$netlocation = parts[1];
|
||||||
|
u$portnum = to_count(parts[2]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
u$netlocation = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
return u;
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue