mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
164 lines
3.5 KiB
Text
164 lines
3.5 KiB
Text
# $Id:$
|
|
|
|
@load anon
|
|
|
|
# Log file that receives this script's diagnostics (unknown protocols and
# host-splitting failures are printed to it below).
global http_anon_log = open_log_file("http-anon") &redef;

# A leading "<letters>://" scheme prefix, optionally preceded by blanks.
const URI_proto_pat: pattern = /^ *([a-zA-Z]+)\:\/\//;

# The scheme prefixes that are kept as-is; any other scheme is logged and
# anonymized.
const known_URI_proto_pat: pattern = /^ *(http|https|ftp|ssh)\:\/\//;

# One or more "label." groups followed by an optional final label.
# NOTE(review): the " *" sits in front of the "^" anchor, unlike the other
# patterns here -- confirm this is intended.
const host_pat: pattern = / *^([\-0-9a-zA-Z]+\.)+([\_\-0-9a-zA-Z])*/;

# ":<digits>." at the start of the string.
# NOTE(review): as written a '.' must follow the digits, so a plain
# ":8080/..." does not match -- confirm this is intended.
const port_pat: pattern = /^ *(\:[0-9]+\.)/;

# A literal '?', i.e. the start of a query string.
const query_pat: pattern = /\?/;
|
|
|
|
# Anonymize an HTTP URI piece by piece.
#
# The URI is lower-cased and then taken apart from left to right: an
# optional "<proto>://" prefix, a leading host-like component, and the
# remaining path (optionally with a "?query").  Each piece is passed to an
# anonymize_*() helper and the results are glued back together.  The
# helpers and simple_filename are not defined in this file -- presumably
# they come from the "anon" script loaded at the top; TODO confirm.
#
# URI: the URI to anonymize.
#
# Returns: proto + host + port + "/" + tail, or the literal string
#          " XXXX processing error XXXX" when the host split yields fewer
#          than two pieces.
function anonymize_http_URI(URI: string): string
	{
	URI = to_lower(URI);

	# Strip off protocol.
	local proto = "";
	if ( URI_proto_pat in URI )
		{
		# Split at the FIRST "://" only.  Splitting at every occurrence
		# would silently drop the tail of URIs that embed another URL
		# (e.g. "http://a/x?u=http://b").
		local proto_part = split1(URI, /\:\/\//);

		# Check if we know the protocol.  If not, flag it so we
		# can update our protocol database.
		if ( known_URI_proto_pat !in URI )
			{
			print http_anon_log,
				fmt("*** protocol %s unknown ", proto_part[1]);

			proto_part[1] =
				string_cat(" (bro: unknown) ",
						anonymize_arg("proto", proto_part[1]));
			}

		proto = string_cat(proto_part[1], "://");
		URI = proto_part[2];
		}

	# Strip off domain.
	local host = "";
	if ( host_pat in URI )
		{
		local base_parts =
			split_all(URI, / *^([\-\_0-9a-z]+\.)+[\-\_0-9a-z]*/);

		if ( |base_parts| < 2 )
			{
			print http_anon_log,
				fmt (" XXXXXXXXXXXXXXXXXXXXXX BASE %s", URI);
			return " XXXX processing error XXXX";
			}

		# The second piece is the text matched by the host pattern.
		local host_part = base_parts[2];

		# Everything except the host piece becomes the new URI.
		if ( |base_parts| == 2 )
			URI = "";

		else if ( |base_parts| == 3 )
			URI = base_parts[3];

		else
			{
			local rest = "";
			local i = 1;

			# The loop variable is unused; the explicit counter
			# walks the pieces in index order.
			for ( part in base_parts )
				{
				if ( i != 2 )
					rest = string_cat(rest, base_parts[i]);
				i += 1;
				}

			URI = rest;
			}

		# A dotted name such as "index.html" also satisfies host_pat.
		# Test the matched text itself: the old code compared the
		# still-empty `host` variable, so this branch was never taken
		# for real input.
		if ( host_part == simple_filename )
			host = anonymize_path(host_part);
		else
			host = anonymize_host(host_part);
		}

	# Strip off port (if it exists).
	# Nothing is actually stripped yet: pport stays empty and a matching
	# port only triggers the banner below.
	local pport = "";
	if ( port_pat in URI )
		{
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		print "XXXXX anon.bro doing nothing with port XXXXXXXXXXX ";
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ";
		}

	# Handle query (if exists).  A bare "/" leaves the tail empty.
	local tail = "";
	if ( URI != "/" )
		{
		if ( query_pat in URI )
			{
			# Split at the FIRST '?' only, so a query string that
			# itself contains '?' is anonymized in full instead of
			# being cut off at the second '?'.
			local query_part = split1(URI, /\?/);

			tail = fmt("%s?%s",
					anonymize_path(query_part[1]),
					anonymize_path(query_part[2]));
			}
		else
			tail = anonymize_path(URI);
		}

	tail = string_cat("/", tail);

	return fmt("%s%s%s%s", proto, host, pport, tail);
	}
|
|
|
|
|
|
# Any text that contains something shaped like an "<a href ...>" tag.
const a_href_pat: pattern = /.*\< *a *href.*\>.*/;
# Alternative form, currently disabled:
#/.*\< *a *href *= *\"[[:print:]]+\" *\>.*/;

# The "<a href = <target>" part of an anchor tag, with optional backslash
# and quote characters around the target.
# Doesn't get everything ... but works for most.
const a_href_split: pattern =
	/\< *a *href *= *(\\)?(\"|\')?([0-9a-z\/._!\[\]():*;~&|$\\=+\-?%@])+(\\)?(\"|\')?/;

# A quoted target, quotes included.
# Elegant ... yeah ... really .. :-/
const file_split: pattern =
	/(\"|\')([0-9a-z\/._!\[\]():*;~&|$\\=+\-?%@])+(\"|\')/;

# The same character run without the surrounding quotes.
const file_strip_split: pattern = /([0-9a-z\/._!\[\]():*;~&|$\\=+\-?%@])+/;
|
|
|
|
# Build a list of the anonymized link targets found in a document.
#
# The text is lower-cased, cut into pieces around every a_href_split
# match, and each matching piece is cut again around its quoted target
# (file_split).  The quotes are stripped via file_strip_split and the bare
# target is run through anonymize_http_URI().
#
# abstract: the document text to scan.
#
# Returns: "" when the input is empty or nothing matches; otherwise the
#          anonymized targets, each preceded by a single space, joined in
#          the order the loops visit the pieces.
function http_doc_link_list(abstract: string): string
	{
	abstract = to_lower(abstract);

	if ( abstract == "" )
		return abstract;

	local links = "";
	local anchors = split_all(abstract, a_href_split);

	for ( a in anchors )
		{
		# Skip the text between anchors; only pieces that are
		# themselves a full a_href_split match are of interest.
		if ( anchors[a] != a_href_split )
			next;

		local quoted = split_all(anchors[a], file_split);

		for ( q in quoted )
			{
			# Likewise keep only the quoted-target pieces.
			if ( quoted[q] != file_split )
				next;

			# Piece 2 is the text matched by file_strip_split,
			# i.e. the target without its quotes.
			local bare = split_all(quoted[q], file_strip_split);

			links = fmt("%s %s", links,
					anonymize_http_URI(bare[2]));
			}
		}

	return links;
	}
|