From 0d31d39de98425cdbfac083587f56e393b398c41 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Mon, 13 Apr 2020 12:43:54 -0700 Subject: [PATCH 1/2] GH-906: Fix the regex in url.zeek to better match for find_all_urls --- scripts/base/utils/urls.zeek | 5 ++++- testing/btest/Baseline/scripts.base.utils.urls/output | 8 ++++++++ testing/btest/scripts/base/utils/urls.test | 9 +++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/scripts/base/utils/urls.zeek b/scripts/base/utils/urls.zeek index c8077f5424..58247d61a8 100644 --- a/scripts/base/utils/urls.zeek +++ b/scripts/base/utils/urls.zeek @@ -1,7 +1,10 @@ ##! Functions for URL handling. ## A regular expression for matching and extracting URLs. -const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef; +## This is the @imme_emosol regex from https://mathiasbynens.be/demo/url-regex, adapted for Zeek. It's +## not perfect for all of their test cases, but it's one of the shorter ones that covers most of the +## test cases. +const url_regex = /^(https?|ftp):\/\/(-\.)?([^[:blank:]\/?\.#-]+\.?)+(\/[^[:blank:]]*)?/ &redef; ## A URI, as parsed by :zeek:id:`decompose_uri`. type URI: record { diff --git a/testing/btest/Baseline/scripts.base.utils.urls/output b/testing/btest/Baseline/scripts.base.utils.urls/output index e10010ab10..44bc5977a4 100644 --- a/testing/btest/Baseline/scripts.base.utils.urls/output +++ b/testing/btest/Baseline/scripts.base.utils.urls/output @@ -10,3 +10,11 @@ [scheme=, netlocation=dfasjdfasdfasdf, portnum=, path=/, file_name=, file_base=, file_ext=, params={ }] +{ +https://example1.com, +https://example2.com +} +{ +https://example2.com/?test=2, +https://example1.com/?test=1 +} diff --git a/testing/btest/scripts/base/utils/urls.test b/testing/btest/scripts/base/utils/urls.test index c307ee601e..002cc0087a 100644 --- a/testing/btest/scripts/base/utils/urls.test +++ b/testing/btest/scripts/base/utils/urls.test @@ -10,11 +10,16 @@ print decompose_uri("ftp://1.2.3.4/pub/files/something.exe"); print decompose_uri("http://hyphen-example.com/index.asp?q=123"); print decompose_uri("git://git.kernel.org:/pub/scm/linux/"); -# This is mostly undefined behavior but it doesn't give any +# This is mostly undefined behavior but it doesn't give any # reporter messages at least. print decompose_uri("dfasjdfasdfasdf?asd"); # These aren't supported yet. #print decompose_uri("mailto:foo@bar.com?subject=test!"); #print decompose_uri("http://example.com/?test=ampersand&test"); -#print decompose_uri("http://user:password@example.com/"); \ No newline at end of file +#print decompose_uri("http://user:password@example.com/"); + +local s = "https://example1.com testing https://example2.com"; +print find_all_urls(s); +local t = "https://example1.com/?test=1 testing https://example2.com/?test=2"; +print find_all_urls(t); From 612c59e09978ffd8a9667098e2531764e7b86697 Mon Sep 17 00:00:00 2001 From: Tim Wojtulewicz Date: Tue, 14 Apr 2020 16:33:19 -0700 Subject: [PATCH 2/2] Restore previous url scheme capture group --- scripts/base/utils/urls.zeek | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/base/utils/urls.zeek b/scripts/base/utils/urls.zeek index 58247d61a8..cdc8548d52 100644 --- a/scripts/base/utils/urls.zeek +++ b/scripts/base/utils/urls.zeek @@ -4,7 +4,7 @@ ## This is the @imme_emosol regex from https://mathiasbynens.be/demo/url-regex, adapted for Zeek. It's ## not perfect for all of their test cases, but it's one of the shorter ones that covers most of the ## test cases. -const url_regex = /^(https?|ftp):\/\/(-\.)?([^[:blank:]\/?\.#-]+\.?)+(\/[^[:blank:]]*)?/ &redef; +const url_regex = /^([a-zA-Z\-]{3,5}):\/\/(-\.)?([^[:blank:]\/?\.#-]+\.?)+(\/[^[:blank:]]*)?/ &redef; ## A URI, as parsed by :zeek:id:`decompose_uri`. type URI: record {