mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Merge remote-tracking branch 'origin/topic/timw/906-find-all-urls-regex'
* origin/topic/timw/906-find-all-urls-regex:
  Restore previous url scheme capture group
  GH-906: Fix the regex in url.zeek to better match for find_all_urls
This commit is contained in:
commit
2aeb3d8e39
5 changed files with 24 additions and 4 deletions
4
CHANGES
4
CHANGES
|
@ -1,4 +1,8 @@
|
||||||
|
|
||||||
|
3.2.0-dev.530 | 2020-05-13 15:05:31 -0700
|
||||||
|
|
||||||
|
* GH-906: Fix the regex in url.zeek to better match for find_all_urls (Tim Wojtulewicz, Corelight)
|
||||||
|
|
||||||
3.2.0-dev.526 | 2020-05-13 13:49:29 -0700
|
3.2.0-dev.526 | 2020-05-13 13:49:29 -0700
|
||||||
|
|
||||||
* Hash unification: address PR feedback (Johanna Amann, Corelight)
|
* Hash unification: address PR feedback (Johanna Amann, Corelight)
|
||||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
||||||
3.2.0-dev.526
|
3.2.0-dev.530
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
##! Functions for URL handling.
|
##! Functions for URL handling.
|
||||||
|
|
||||||
## A regular expression for matching and extracting URLs.
|
## A regular expression for matching and extracting URLs.
|
||||||
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
|
## This is the @imme_emosol regex from https://mathiasbynens.be/demo/url-regex, adapted for Zeek. It's
|
||||||
|
## not perfect for all of their test cases, but it's one of the shorter ones that covers most of the
|
||||||
|
## test cases.
|
||||||
|
const url_regex = /^([a-zA-Z\-]{3,5}):\/\/(-\.)?([^[:blank:]\/?\.#-]+\.?)+(\/[^[:blank:]]*)?/ &redef;
|
||||||
|
|
||||||
## A URI, as parsed by :zeek:id:`decompose_uri`.
|
## A URI, as parsed by :zeek:id:`decompose_uri`.
|
||||||
type URI: record {
|
type URI: record {
|
||||||
|
|
|
@ -10,3 +10,11 @@
|
||||||
[scheme=<uninitialized>, netlocation=dfasjdfasdfasdf, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params={
|
[scheme=<uninitialized>, netlocation=dfasjdfasdfasdf, portnum=<uninitialized>, path=/, file_name=<uninitialized>, file_base=<uninitialized>, file_ext=<uninitialized>, params={
|
||||||
|
|
||||||
}]
|
}]
|
||||||
|
{
|
||||||
|
https://example1.com,
|
||||||
|
https://example2.com
|
||||||
|
}
|
||||||
|
{
|
||||||
|
https://example2.com/?test=2,
|
||||||
|
https://example1.com/?test=1
|
||||||
|
}
|
||||||
|
|
|
@ -10,11 +10,16 @@ print decompose_uri("ftp://1.2.3.4/pub/files/something.exe");
|
||||||
print decompose_uri("http://hyphen-example.com/index.asp?q=123");
|
print decompose_uri("http://hyphen-example.com/index.asp?q=123");
|
||||||
print decompose_uri("git://git.kernel.org:/pub/scm/linux/");
|
print decompose_uri("git://git.kernel.org:/pub/scm/linux/");
|
||||||
|
|
||||||
# This is mostly undefined behavior but it doesn't give any
|
# This is mostly undefined behavior but it doesn't give any
|
||||||
# reporter messages at least.
|
# reporter messages at least.
|
||||||
print decompose_uri("dfasjdfasdfasdf?asd");
|
print decompose_uri("dfasjdfasdfasdf?asd");
|
||||||
|
|
||||||
# These aren't supported yet.
|
# These aren't supported yet.
|
||||||
#print decompose_uri("mailto:foo@bar.com?subject=test!");
|
#print decompose_uri("mailto:foo@bar.com?subject=test!");
|
||||||
#print decompose_uri("http://example.com/?test=ampersand&test");
|
#print decompose_uri("http://example.com/?test=ampersand&test");
|
||||||
#print decompose_uri("http://user:password@example.com/");
|
#print decompose_uri("http://user:password@example.com/");
|
||||||
|
|
||||||
|
local s = "https://example1.com testing https://example2.com";
|
||||||
|
print find_all_urls(s);
|
||||||
|
local t = "https://example1.com/?test=1 testing https://example2.com/?test=2";
|
||||||
|
print find_all_urls(t);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue