zeek/scripts/base/utils/urls.bro
Seth Hall 172a6345b8 Extracting URLs from message bodies over SMTP and sending them to Intel framework.
- New utils package for URLs.

- Two functions in the URLs utils.  find_all_urls and
  find_all_urls_without_scheme.
2012-10-31 08:46:40 -04:00

25 lines
No EOL
713 B
Text

## Functions for URL handling.
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
## Extracts URLs discovered in arbitrary text.
function find_all_urls(s: string): string_set
{
return find_all(s, url_regex);
}
## Extracts URLs discovered in arbitrary text without
## the URL scheme included.
function find_all_urls_without_scheme(s: string): string_set
{
local urls = find_all_urls(s);
local return_urls: set[string] = set();
for ( url in urls )
{
local no_scheme = sub(url, /^([a-zA-Z\-]{3,5})(:\/\/)/, "");
add return_urls[no_scheme];
}
return return_urls;
}