mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00

- New utils package for URLs. - Two functions in the URLs utils. find_all_urls and find_all_urls_without_scheme.
25 lines
No EOL
713 B
Text
25 lines
No EOL
713 B
Text
##! Functions for URL handling.
|
|
|
|
## A regular expression for matching and extracting URLs.
|
|
## The pattern is made of four consecutive groups:
##
## 1. A scheme of three to five ASCII letters or hyphens.
## 2. The literal "://" followed by any run of characters that are not
##    a slash, question mark, hash, quote, CR, LF or angle bracket.
## 3. Any run of characters that are not a question mark, hash, quote,
##    CR, LF or angle bracket.
## 4. Either a run of characters that are not blanks, CR, LF, quotes or
##    angle brackets, or an optional "?" followed by a run of characters
##    that are not quotes, CR, LF or angle brackets.
##
## Declared with &redef so the pattern can be redefined.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
|
|
|
|
## Collects every substring of the given text that matches url_regex.
##
## s: The text to scan for URLs.
##
## Returns: The set of substrings of *s* matched by url_regex.
function find_all_urls(s: string): string_set
	{
	local matches = find_all(s, url_regex);
	return matches;
	}
|
|
|
|
## Collects the URLs that find_all_urls locates in the given text and
## strips the leading scheme and "://" separator from each of them.
##
## s: The text to scan for URLs.
##
## Returns: The set of URLs found in *s*, each without its scheme prefix.
function find_all_urls_without_scheme(s: string): string_set
	{
	local stripped: set[string] = set();

	# Remove the leading "<scheme>://" from every match before storing it.
	for ( found in find_all_urls(s) )
		add stripped[sub(found, /^([a-zA-Z\-]{3,5})(:\/\/)/, "")];

	return stripped;
	}