GH-906: Fix the regex in url.zeek so that find_all_urls matches URLs more accurately

This commit is contained in:
Tim Wojtulewicz 2020-04-13 12:43:54 -07:00
parent ce9183a2ed
commit 0d31d39de9
3 changed files with 19 additions and 3 deletions

View file

@ -1,7 +1,10 @@
##! Functions for URL handling.
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
## This is the @imme_emosol regex from https://mathiasbynens.be/demo/url-regex, adapted for Zeek. It
## does not pass every test case listed on that page, but it is one of the shorter regexes there that
## still handles most of them.
const url_regex = /^(https?|ftp):\/\/(-\.)?([^[:blank:]\/?\.#-]+\.?)+(\/[^[:blank:]]*)?/ &redef;
## A URI, as parsed by :zeek:id:`decompose_uri`.
type URI: record {

View file

@ -10,3 +10,11 @@
[scheme=<uninitialized>, netlocation=dfasjdfasdfasdf, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params={
}]
{
https://example1.com,
https://example2.com
}
{
https://example2.com/?test=2,
https://example1.com/?test=1
}

View file

@ -18,3 +18,8 @@ print decompose_uri("dfasjdfasdfasdf?asd");
#print decompose_uri("mailto:foo@bar.com?subject=test!");
#print decompose_uri("http://example.com/?test=ampersand&amp;test");
#print decompose_uri("http://user:password@example.com/");
local s = "https://example1.com testing https://example2.com";
print find_all_urls(s);
local t = "https://example1.com/?test=1 testing https://example2.com/?test=2";
print find_all_urls(t);