From 3c42350e77eff3c163bc38e4c15d2533c6c8bef8 Mon Sep 17 00:00:00 2001
From: akasza
Date: Wed, 5 Nov 2014 20:44:03 -0800
Subject: [PATCH 1/3] uri parsing function
---
scripts/base/utils/urls.bro | 58 +++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro
index 8ef9ed7e2d..6c9eada67a 100644
--- a/scripts/base/utils/urls.bro
+++ b/scripts/base/utils/urls.bro
@@ -3,6 +3,16 @@
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
+type uri_record: record {
+ protocol: string &optional;
+ # this could be a domain name or an IP address
+ netlocation: string;
+ portnum: count &optional;
+ path: string &optional;
+ file_name: string &optional;
+ file_ext: string &optional;
+};
+
## Extracts URLs discovered in arbitrary text.
function find_all_urls(s: string): string_set
{
@@ -23,3 +33,51 @@ function find_all_urls_without_scheme(s: string): string_set
return return_urls;
}
+
+function decompose_uri(s: string): uri_record
+ {
+ local parts: string_array;
+ local u: uri = [$netlocation=""];
+
+ if (/:\/\// in s)
+ {
+ parts = split1(s, /:\/\//);
+ u$protocol = parts[1]; # text before "://"
+ s = parts[2]; # continue with what follows "://"
+ }
+ if (/\// in s) # anything after the first "/" is path (and possibly a file)
+ {
+ parts = split1(s, /\//);
+ s = parts[1]; # what is left is host[:port], handled further down
+ u$path = fmt("/%s", parts[2]); # put back the "/" consumed by split1
+
+ if (|u$path| > 1) # more than a bare "/"
+ {
+ local last_token: string = find_last(u$path, /\/.+/); # presumably the final "/segment" of the path - confirm find_last semantics
+ local full_filename = split1(last_token, /\//)[2]; # drop the leading "/"
+ if (/\./ in full_filename)
+ {
+ u$file_name = split1(full_filename, /\./)[1]; # up to the first "."
+ u$file_ext = split1(full_filename, /\./)[2]; # everything after the first "."
+ u$path = subst_string(u$path, fmt("%s.%s", u$file_name, u$file_ext), ""); # remove "name.ext" from the path
+ }
+ else
+ {
+ u$file_name = full_filename; # no ".", so the whole segment is the name
+ u$path = subst_string(u$path, u$file_name, ""); # remove the name from the path
+ }
+ }
+ }
+ if (/:/ in s) # a ":" separates host from port
+ {
+ parts = split1(s, /:/);
+ u$netlocation = parts[1];
+ u$portnum = to_count(parts[2]); # port converted from string to count
+ }
+ else # no port given
+ {
+ u$netlocation = s;
+ }
+
+ return u;
+ }
From 69ce4d30382a90d121132bb0fcc699f6ed726b3e Mon Sep 17 00:00:00 2001
From: akasza
Date: Thu, 6 Nov 2014 19:47:28 -0800
Subject: [PATCH 2/3] uri_decompose complete, need btests
---
scripts/base/utils/urls.bro | 47 +++++++++++++++++++++++++++++++++----
1 file changed, 43 insertions(+), 4 deletions(-)
diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro
index 6c9eada67a..9beb424489 100644
--- a/scripts/base/utils/urls.bro
+++ b/scripts/base/utils/urls.bro
@@ -3,14 +3,16 @@
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
-type uri_record: record {
- protocol: string &optional;
+type URI: record {
+ scheme: string &optional; # replaces the old `protocol` field: the part of the URI before "://"
# this could be a domain name or an IP address
netlocation: string;
portnum: count &optional;
path: string &optional;
file_name: string &optional;
file_ext: string &optional;
+ params_k: table[count] of string; # query-string parameter names, keyed by a count index
+ params_v: table[count] of string; # query-string parameter values, stored under the same index as the matching name in params_k
};
## Extracts URLs discovered in arbitrary text.
@@ -34,12 +36,53 @@ function find_all_urls_without_scheme(s: string): string_set
return return_urls;
}
-function decompose_uri(s: string): uri_record
+function decompose_uri(s: string): URI # split a URI string into scheme, host, port, path, file and query parameters
{
local parts: string_array;
- local u: uri = [$netlocation=""];
+ local u: URI = [$netlocation=""]; # netlocation is the one mandatory scalar field, so seed it
+ if ( /\?/ in s) # a "?" introduces the query string
+ {
+ local k: table[count] of string;
+ local v: table[count] of string;
+ u$params_k = k; # start from empty tables for this URI
+ u$params_v = v;
+
+ parts = split1(s, /\?/);
+ s = parts[1]; # everything before the first "?" is parsed further down
+ local query: string = parts[2];
+ if (/&/ in query) # several parameters separated by "&"
+ {
+ local opv: table[count] of string = split(query, /&/); # one "key=value" candidate per element
+
+ for (each in opv)
+ {
+ if (/=/ in opv[each]) # entries without "=" are skipped
+ {
+ parts = split1(opv[each], /=/);
+
+ # why does the order here matter?
+ u$params_k[each] = parts[1];
+ u$params_v[each] = parts[2];
+ }
+ else
+ {
+ # malformed URI
+ # domain.tld/path/file.ext?foo&
+ }
+ }
+ }
+ else
+ {
+ if (/=/ in query) # skip a lone token without "=", as the loop above does
+ {
+ parts = split1(query, /=/);
+ u$params_k[1] = parts[1]; # index from 1 to agree with the split()-based branch above
+ u$params_v[1] = parts[2];
+ }
+ }
+ }
if (/:\/\// in s)
{
parts = split1(s, /:\/\//);
- u$protocol = parts[1];
+ u$scheme = parts[1]; # text before "://"
@@ -78,6 +121,5 @@ function decompose_uri(s: string): uri_record
{
u$netlocation = s;
}
-
return u;
}
From ea79c07730268cfa7ac515860ad3c5d678b93c87 Mon Sep 17 00:00:00 2001
From: akasza
Date: Thu, 6 Nov 2014 19:52:03 -0800
Subject: [PATCH 3/3] uri parsing complete
---
scripts/base/utils/urls.bro | 6 ------
1 file changed, 6 deletions(-)
diff --git a/scripts/base/utils/urls.bro b/scripts/base/utils/urls.bro
index 9beb424489..1f42d517d6 100644
--- a/scripts/base/utils/urls.bro
+++ b/scripts/base/utils/urls.bro
@@ -61,15 +61,9 @@ function decompose_uri(s: string): URI
{
parts = split1(opv[each], /=/);
- # why does the order here matter?
u$params_k[each] = parts[1];
u$params_v[each] = parts[2];
}
- else
- {
- # malformed URI
- # domain.tld/path/file.ext?foo&
- }
}
}
else