diff --git a/CHANGES b/CHANGES index c13d062eef..a82c7a446b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,8 @@ +3.2.0-dev.530 | 2020-05-13 15:05:31 -0700 + + * GH-906: Fix the regex in urls.zeek to better match URLs for find_all_urls (Tim Wojtulewicz, Corelight) + 3.2.0-dev.526 | 2020-05-13 13:49:29 -0700 * Hash unification: address PR feedback (Johanna Amann, Corelight) diff --git a/VERSION b/VERSION index 7cfba1ec0d..560f02389d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.2.0-dev.526 +3.2.0-dev.530 diff --git a/scripts/base/utils/urls.zeek b/scripts/base/utils/urls.zeek index c8077f5424..cdc8548d52 100644 --- a/scripts/base/utils/urls.zeek +++ b/scripts/base/utils/urls.zeek @@ -1,7 +1,10 @@ ##! Functions for URL handling. ## A regular expression for matching and extracting URLs. -const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef; +## This is the @imme_emosol regex from https://mathiasbynens.be/demo/url-regex, adapted for Zeek. It +## does not pass every test case listed on that page, but it is one of the shorter expressions that +## passes most of them. +const url_regex = /^([a-zA-Z\-]{3,5}):\/\/(-\.)?([^[:blank:]\/?\.#-]+\.?)+(\/[^[:blank:]]*)?/ &redef; ## A URI, as parsed by :zeek:id:`decompose_uri`. 
type URI: record { diff --git a/testing/btest/Baseline/scripts.base.utils.urls/output b/testing/btest/Baseline/scripts.base.utils.urls/output index e10010ab10..44bc5977a4 100644 --- a/testing/btest/Baseline/scripts.base.utils.urls/output +++ b/testing/btest/Baseline/scripts.base.utils.urls/output @@ -10,3 +10,11 @@ [scheme=, netlocation=dfasjdfasdfasdf, portnum=, path=/, file_name=, file_base=, file_ext=, params={ }] +{ +https://example1.com, +https://example2.com +} +{ +https://example2.com/?test=2, +https://example1.com/?test=1 +} diff --git a/testing/btest/scripts/base/utils/urls.test b/testing/btest/scripts/base/utils/urls.test index c307ee601e..002cc0087a 100644 --- a/testing/btest/scripts/base/utils/urls.test +++ b/testing/btest/scripts/base/utils/urls.test @@ -10,11 +10,16 @@ print decompose_uri("ftp://1.2.3.4/pub/files/something.exe"); print decompose_uri("http://hyphen-example.com/index.asp?q=123"); print decompose_uri("git://git.kernel.org:/pub/scm/linux/"); -# This is mostly undefined behavior but it doesn't give any +# This is mostly undefined behavior but it doesn't give any # reporter messages at least. print decompose_uri("dfasjdfasdfasdf?asd"); # These aren't supported yet. #print decompose_uri("mailto:foo@bar.com?subject=test!"); #print decompose_uri("http://example.com/?test=ampersand&test"); -#print decompose_uri("http://user:password@example.com/"); \ No newline at end of file +#print decompose_uri("http://user:password@example.com/"); + +local s = "https://example1.com testing https://example2.com"; +print find_all_urls(s); +local t = "https://example1.com/?test=1 testing https://example2.com/?test=2"; +print find_all_urls(t);