zeek/policy.old/http-anon-utils.bro
2011-03-01 10:51:44 -05:00

164 lines
3.5 KiB
Text

# $Id:$
@load anon
# Log file that receives diagnostics from the HTTP anonymizer (opened under
# the name "http-anon"); marked &redef so it can be redefined elsewhere.
global http_anon_log = open_log_file("http-anon") &redef;
# Optional leading spaces, an alphabetic scheme name, then "://".
const URI_proto_pat = /^ *([a-zA-Z]+)\:\/\// ;
# Same shape, restricted to the schemes treated as known:
# http, https, ftp and ssh.
const known_URI_proto_pat = /^ *(http|https|ftp|ssh)\:\/\// ;
# One or more dot-terminated labels followed by an optional final label.
# NOTE(review): the " *" sits before the "^" anchor rather than after it,
# unlike the other patterns here -- confirm this is intended.
const host_pat = / *^([\-0-9a-zA-Z]+\.)+([\_\-0-9a-zA-Z])*/ ;
# A colon followed by digits at the start of the string.
# NOTE(review): as written the pattern also requires a literal "." right
# after the digits, so a plain ":8080/" does not match -- confirm intent.
const port_pat = /^ *(\:[0-9]+\.)/ ;
# A literal "?" anywhere in the string, i.e. the URI carries a query part.
const query_pat = /\?/ ;
# Anonymize a URI piece by piece and return the reassembled string.
#
# The URI is lower-cased and then taken apart in this order:
#   1. protocol ("scheme://"), kept verbatim when it is a known scheme and
#      otherwise logged, anonymized and flagged;
#   2. host-looking prefix, anonymized with anonymize_host(), or with
#      anonymize_path() when it matches simple_filename;
#   3. port (only reported, never rewritten -- pport stays empty);
#   4. path and optional query, each run through anonymize_path().
#
# URI: the URI text to anonymize.
#
# Returns: proto + host + port + "/" + tail, or the fixed string
#          " XXXX processing error XXXX" when the host split yields fewer
#          than two pieces.
function anonymize_http_URI(URI: string): string
	{
	URI = to_lower(URI);

	# Strip off protocol.
	local proto = "";

	if ( URI_proto_pat in URI )
		{
		# split1 cuts only at the first "://".  Plain split() cut at every
		# occurrence, and because only elements 1 and 2 are used below,
		# anything after a second "://" (e.g. a URL embedded in a query
		# string) was silently discarded.
		local proto_part = split1(URI, /\:\/\//);

		# Check if we know the protocol.  If not, flag it so we
		# can update our protocol database.
		if ( known_URI_proto_pat !in URI )
			{
			print http_anon_log,
				fmt("*** protocol %s unknown ", proto_part[1]);
			proto_part[1] =
				string_cat(" (bro: unknown) ",
						anonymize_arg("proto", proto_part[1]));
			}

		proto = string_cat(proto_part[1],"://");
		URI = proto_part[2];
		}

	# Strip off domain.
	local host = "";

	if ( host_pat in URI )
		{
		# split_all keeps the separators, so element 2 is the text that
		# matched the host pattern.
		local base_parts =
			split_all(URI, / *^([\-\_0-9a-z]+\.)+[\-\_0-9a-z]*/);

		if ( |base_parts| < 2 )
			{
			print http_anon_log,
				fmt (" XXXXXXXXXXXXXXXXXXXXXX BASE %s", URI);
			return " XXXX processing error XXXX";
			}

		local matched_host = base_parts[2];

		if ( |base_parts| == 2 )
			URI = "";

		else if ( |base_parts| == 3)
			URI = base_parts[3];

		else if ( |base_parts| > 3)
			{
			# Glue every piece except the matched host back together.
			# The loop variable is unused; the counter walks the
			# indices in ascending order.
			local patch_me = "";
			local i = 1;

			for ( part in base_parts )
				{
				if ( i != 2 )
					patch_me = string_cat(patch_me,
								base_parts[i]);
				i += 1;
				}

			URI = patch_me;
			}

		# The original compared the still-empty `host` variable here, so
		# the matched text was never tested and anonymize_path() only ever
		# saw "".  Test and anonymize the matched text instead.
		# (simple_filename is defined outside this file.)
		if ( matched_host == simple_filename )
			host = anonymize_path(matched_host);
		else
			host = anonymize_host(matched_host);
		}

	# Strip off port (if it exists).
	local pport = "";

	if ( port_pat in URI )
		{
		# Ports are not handled yet; just make the gap loudly visible.
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		print "XXXXX anon.bro doing nothing with port XXXXXXXXXXX ";
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		}

	# Handle query (if exists).
	local tail = "";

	if ( URI == "/" )
		{
		# -- pass: a bare "/" leaves the tail empty.
		}

	else if ( query_pat in URI )
		{
		# Cut only at the first "?" so that a query string which itself
		# contains "?" is anonymized as a whole instead of being
		# truncated at the second "?".
		local query_part = split1(URI, /\?/);

		tail = fmt("%s?%s",
				anonymize_path(query_part[1]),
				anonymize_path(query_part[2]));
		}

	else
		tail = anonymize_path(URI);

	tail = string_cat("/", tail);

	return fmt("%s%s%s%s", proto, host, pport, tail);
	}
# Loose whole-string match for text containing an "<a href ...>" tag.
const a_href_pat = /.*\< *a *href.*\>.*/ ;
# Stricter alternative, kept for reference but disabled:
#/.*\< *a *href *= *\"[[:print:]]+\" *\>.*/;
# Matches the opening of an anchor tag up to and including the href value
# (optionally quoted, optionally backslash-escaped quotes).
# Doesn't get everything ... but works for most.
const a_href_split =
/\< *a *href *= *(\\)?(\"|\')?([0-9a-z\/._!\[\]():*;~&|$\\=+\-?%@])+(\\)?(\"|\')?/ ;
# A quoted run of URI characters, quotes included.
# Elegant ... yeah ... really .. :-/
const file_split =
/(\"|\')([0-9a-z\/._!\[\]():*;~&|$\\=+\-?%@])+(\"|\')/ ;
# The same run of URI characters without the surrounding quotes.
const file_strip_split = /([0-9a-z\/._!\[\]():*;~&|$\\=+\-?%@])+/ ;
# Extract the link targets of the "<a href=...>" tags found in a piece of
# document text and return them, anonymized, as one space-separated string.
#
# abstract: document text to scan; it is lower-cased before matching.
#
# Returns: the concatenation of " " + anonymize_http_URI(target) for every
#          quoted href target found, in the order the links appear in the
#          text; the (empty) input itself when there is nothing to scan.
function http_doc_link_list(abstract: string): string
	{
	abstract = to_lower(abstract);

	if ( abstract == "" )
		return abstract;

	local concat_key = "";
	local href_parts = split_all(abstract, a_href_split);

	# Iterating a table with "for ( x in t )" visits the indices in no
	# guaranteed order, which could emit the links out of document order.
	# Walk both tables with explicit counters instead (the same idiom
	# anonymize_http_URI uses); the loop variables are only there to
	# drive the iteration count.
	local href_idx = 1;
	local file_idx = 1;

	for ( part in href_parts )
		{
		# split_all keeps the separators, so the pieces that match the
		# pattern in full are the anchor-tag fragments.
		if ( href_parts[href_idx] == a_href_split )
			{
			local file_parts =
				split_all(href_parts[href_idx], file_split);

			file_idx = 1;

			for ( a_part in file_parts )
				{
				if ( file_parts[file_idx] == file_split )
					{
					# Element 2 is the quoted value with the
					# quotes split away.
					local file_strip_parts =
						split_all(file_parts[file_idx],
								file_strip_split);

					concat_key = fmt("%s %s", concat_key,
						anonymize_http_URI(file_strip_parts[2]));
					}

				file_idx += 1;
				}
			}

		href_idx += 1;
		}

	return concat_key;
	}