mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Merge branch 'master' of https://github.com/anthonykasza/bro
- I've changed/extended the URI record fields a bit: - path is always the full path including the full file name - if there's no path, the field will still be set to "/". - file_name is the full name including extension, and file_base and file_ext split it out. - Adding a test exercising a bunch of URLs.
This commit is contained in:
commit
9d3cfaddaa
5 changed files with 193 additions and 1 deletions
5
CHANGES
5
CHANGES
|
@ -1,4 +1,9 @@
|
|||
|
||||
2.3-309 | 2014-11-18 12:17:53 -0800
|
||||
|
||||
* New decompose_uri() function in base/utils/urls that splits a URI
|
||||
into its pieces. (Anthony Kasza).
|
||||
|
||||
2.3-305 | 2014-11-18 11:09:04 -0800
|
||||
|
||||
* Improve coercion of &default expressions. Addresses BIT-1288. (Jon
|
||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
|||
2.3-305
|
||||
2.3-309
|
||||
|
|
|
@ -3,6 +3,28 @@
|
|||
## A regular expression for matching and extracting URLs. The pattern is
## anchored at the start of the input and consists of four groups: a scheme of
## 3-5 letters or hyphens; "://" followed by everything up to the next "/",
## "?" or "#"; the path up to the next "?" or "#"; and the remaining text.
## Quotes, angle brackets and line breaks terminate every group.
|
||||
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
|
||||
|
||||
## A URI, as parsed by :bro:id:`decompose_uri`.
|
||||
type URI: record {
|
||||
## The URI's scheme, if one is included in the URI.
|
||||
scheme: string &optional;
|
||||
## The location, which could be a domain name or an IP address. Left empty if not
|
||||
## specified.
|
||||
netlocation: string;
|
||||
## Port number, if included in URI.
|
||||
portnum: count &optional;
|
||||
## Full path, including the file name. Will be '/' if there's no path given.
|
||||
path: string;
|
||||
## Full file name, including extension, if there is a file name.
|
||||
file_name: string &optional;
|
||||
## The base filename, without extension, if there is a file name.
|
||||
file_base: string &optional;
|
||||
## The filename's extension, if there is a file name.
|
||||
file_ext: string &optional;
|
||||
## A table of all query parameters, mapping their keys to values, if there's a
|
||||
## query.
|
||||
params: table[string] of string &optional;
|
||||
};
|
||||
|
||||
## Extracts URLs discovered in arbitrary text.
|
||||
function find_all_urls(s: string): string_set
|
||||
{
|
||||
|
@ -23,3 +45,84 @@ function find_all_urls_without_scheme(s: string): string_set
|
|||
|
||||
return return_urls;
|
||||
}
|
||||
|
||||
## Splits a URI into its components.
##
## s: The URI to decompose. The scheme, port, path and query are all optional.
##
## Returns: A :bro:type:`URI` record. ``netlocation`` is "" and ``path`` is "/"
##          unless the input specifies them; all other fields are only set
##          when the corresponding piece is present in the input.
function decompose_uri(s: string): URI
	{
	local parts: string_array;
	local u: URI = [$netlocation="", $path="/"];

	if ( /\?/ in s )
		{
		# Parse query and remove it from s. The table is created even if the
		# query turns out to hold no key/value pairs.
		u$params = table();

		parts = split1(s, /\?/);
		s = parts[1];
		local query: string = parts[2];

		if ( /&/ in query )
			{
			local opv: table[count] of string = split(query, /&/);

			for ( each in opv )
				{
				# Skip tokens that aren't of the form key=value.
				if ( /=/ in opv[each] )
					{
					parts = split1(opv[each], /=/);
					u$params[parts[1]] = parts[2];
					}
				}
			}
		else if ( /=/ in query )
			{
			# Single parameter. Guarded like the loop above: without a "="
			# split1() yields only one element, so parts[2] wouldn't exist.
			parts = split1(query, /=/);
			u$params[parts[1]] = parts[2];
			}
		}

	if ( /:\/\// in s )
		{
		# Parse scheme and remove from s.
		parts = split1(s, /:\/\//);
		u$scheme = parts[1];
		s = parts[2];
		}

	if ( /\// in s )
		{
		# Parse path and remove from s.
		parts = split1(s, /\//);
		s = parts[1];
		u$path = fmt("/%s", parts[2]);

		# A path ending in "/" names a directory, so there's no file name.
		if ( |u$path| > 1 && u$path[|u$path| - 1] != "/" )
			{
			local last_token: string = find_last(u$path, /\/.+/);
			local full_filename = split1(last_token, /\//)[2];

			u$file_name = full_filename;

			if ( /\./ in full_filename )
				{
				# Split at the first dot: everything after it is treated
				# as the extension.
				local name_parts = split1(full_filename, /\./);
				u$file_base = name_parts[1];
				u$file_ext = name_parts[2];
				}
			else
				u$file_base = full_filename;
			}
		}

	if ( /:/ in s )
		{
		# Parse location and port.
		parts = split1(s, /:/);
		u$netlocation = parts[1];

		# Only convert a well-formed port; to_count() reports an error for
		# empty or non-numeric input, in which case portnum stays unset.
		if ( /[0-9]+/ == parts[2] )
			u$portnum = to_count(parts[2]);
		}
	else
		u$netlocation = s;

	return u;
	}
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
https://www.bro.org:42/documentation/faq.html?k1=v1&k2=v2
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=42, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[k2] = v2,^J^I[k1] = v1^J}]
|
||||
|
||||
|
||||
-> [scheme=<uninitialized>, netlocation=, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://
|
||||
-> [scheme=https, netlocation=, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org/
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org:42
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=42, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org:42/
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=42, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org/documentation
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation, file_name=documentation, file_base=documentation, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org/documentation/
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org/documentation/faq
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/faq, file_name=faq, file_base=faq, file_ext=<uninitialized>, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org/documentation/faq.html
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params=<uninitialized>]
|
||||
|
||||
https://www.bro.org/documentation/faq.html?
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^J}]
|
||||
|
||||
https://www.bro.org/documentation/faq.html?k=v
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[k] = v^J}]
|
||||
|
||||
https://www.bro.org/documentation/faq.html?k=
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[k] = ^J}]
|
||||
|
||||
https://www.bro.org/documentation/faq.html?=v
|
||||
-> [scheme=https, netlocation=www.bro.org, portnum=<uninitialized>, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[] = v^J}]
|
||||
|
||||
file:///documentation/faq.html?=v
|
||||
-> [scheme=file, netlocation=, portnum=<uninitialized>, path=/documentation/faq.html, file_name=faq.html, file_base=faq, file_ext=html, params={^J^I[] = v^J}]
|
||||
|
||||
www.bro.org/?foo=bar
|
||||
-> [scheme=<uninitialized>, netlocation=www.bro.org, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params={^J^I[foo] = bar^J}]
|
||||
|
33
testing/btest/scripts/base/utils/decompose_uri.bro
Normal file
33
testing/btest/scripts/base/utils/decompose_uri.bro
Normal file
|
@ -0,0 +1,33 @@
|
|||
# @TEST-EXEC: bro -b %INPUT > output
|
||||
# @TEST-EXEC: btest-diff output
|
||||
|
||||
@load base/utils/urls
|
||||
|
||||
# Prints the given URI, then its decomposed form, then an empty separator line.
function dc(s: string)
	{
	print fmt("%s", s);

	local decomposed = decompose_uri(s);
	print fmt(" -> %s", decomposed);

	print "";
	}
|
||||
|
||||
event bro_init()
	{
	# URIs to decompose, listed in the order the baseline expects them.
	local uris: vector of string = vector(
		"https://www.bro.org:42/documentation/faq.html?k1=v1&k2=v2",
		"",
		"https://",
		"https://www.bro.org",
		"https://www.bro.org/",
		"https://www.bro.org:42",
		"https://www.bro.org:42/",
		"https://www.bro.org/documentation",
		"https://www.bro.org/documentation/",
		"https://www.bro.org/documentation/faq",
		"https://www.bro.org/documentation/faq.html",
		"https://www.bro.org/documentation/faq.html?",
		"https://www.bro.org/documentation/faq.html?k=v",
		"https://www.bro.org/documentation/faq.html?k=",
		"https://www.bro.org/documentation/faq.html?=v",
		"file:///documentation/faq.html?=v",
		"www.bro.org/?foo=bar"
	);

	# Vector indices are visited in ascending order, so output order is kept.
	for ( idx in uris )
		dc(uris[idx]);
	}
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue