From 0f99956417425ef20e5592781e3b6335ea4f3f37 Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Wed, 13 Mar 2013 14:36:27 -0400 Subject: [PATCH 01/73] Added Exec, Dir, and ActiveHTTP modules. --- scripts/base/init-default.bro | 3 + scripts/base/utils/active-http.bro | 120 +++++++++++++++++ scripts/base/utils/dir.bro | 51 +++++++ scripts/base/utils/exec.bro | 207 +++++++++++++++++++++++++++++ 4 files changed, 381 insertions(+) create mode 100644 scripts/base/utils/active-http.bro create mode 100644 scripts/base/utils/dir.bro create mode 100644 scripts/base/utils/exec.bro diff --git a/scripts/base/init-default.bro b/scripts/base/init-default.bro index 8b36899f10..9b62c80014 100644 --- a/scripts/base/init-default.bro +++ b/scripts/base/init-default.bro @@ -5,9 +5,12 @@ ##! you actually want. @load base/utils/site +@load base/utils/active-http @load base/utils/addrs @load base/utils/conn-ids +@load base/utils/dir @load base/utils/directions-and-hosts +@load base/utils/exec @load base/utils/files @load base/utils/numbers @load base/utils/paths diff --git a/scripts/base/utils/active-http.bro b/scripts/base/utils/active-http.bro new file mode 100644 index 0000000000..5522cc108a --- /dev/null +++ b/scripts/base/utils/active-http.bro @@ -0,0 +1,120 @@ +##! A module for performing active HTTP requests and +##! getting the reply at runtime. + +@load ./exec + +module ActiveHTTP; + +export { + ## The default timeout for HTTP requests. + const default_max_time = 1min &redef; + + ## The default HTTP method/verb to use for requests. + const default_method = "GET" &redef; + + type Response: record { + ## Numeric response code from the server. + code: count; + ## String response messgae from the server. + msg: string; + ## Full body of the response. + body: string &optional; + ## All headers returned by the server. + headers: table[string] of string &optional; + }; + + type Request: record { + ## The URL being requested. + url: string; + ## The HTTP method/verb to use for the request. 
+ method: string &default=default_method; + ## Data to send to the server in the client body. Keep in + ## mind that you will probably need to set the $method field + ## to "POST" or "PUT". + client_data: string &optional; + ## Arbitrary headers to pass to the server. Some headers + ## will be included by libCurl. + #custom_headers: table[string] of string &optional; + ## Timeout for the request. + max_time: interval &default=default_max_time; + ## Additional curl command line arguments. Be very careful + ## with this option since shell injection could take place + ## if careful handling of untrusted data is not applied. + addl_curl_args: string &optional; + }; + + ## Perform an HTTP request according to the :bro:type:`Request` record. + ## This is an asynchronous function and must be called within a "when" + ## statement. + ## + ## req: A record instance representing all options for an HTTP request. + ## + ## Returns: A record with the full response message. + global request: function(req: ActiveHTTP::Request): ActiveHTTP::Response; +} + +function request2curl(r: Request, bodyfile: string, headersfile: string): string + { + local cmd = fmt("curl -s -g -o \"%s\" -D \"%s\" -X \"%s\"", + str_shell_escape(bodyfile), + str_shell_escape(headersfile), + str_shell_escape(r$method)); + + cmd = fmt("%s -m %.0f", cmd, r$max_time); + + if ( r?$client_data ) + cmd = fmt("%s -d -", cmd); + + if ( r?$addl_curl_args ) + cmd = fmt("%s %s", cmd, r$addl_curl_args); + + cmd = fmt("%s \"%s\"", cmd, str_shell_escape(r$url)); + return cmd; + } + +function request(req: Request): ActiveHTTP::Response + { + local tmpfile = "/tmp/bro-activehttp-" + unique_id(""); + local bodyfile = fmt("%s_body", tmpfile); + local headersfile = fmt("%s_headers", tmpfile); + + local cmd = request2curl(req, bodyfile, headersfile); + local stdin_data = req?$client_data ? 
req$client_data : ""; + + local resp: Response; + resp$code = 0; + resp$msg = ""; + resp$body = ""; + resp$headers = table(); + return when ( local result = Exec::run([$cmd=cmd, $stdin=stdin_data, $read_files=set(bodyfile, headersfile)]) ) + { + # If there is no response line then nothing else will work either. + # Report the failure and hand back the empty default response instead + # of indexing a table entry that does not exist. + if ( ! (result?$files && headersfile in result$files) ) + { + Reporter::error(fmt("There was a failure when requesting \"%s\" with ActiveHTTP.", req$url)); + return resp; + } + + local headers = result$files[headersfile]; + for ( i in headers ) + { + # The reply is the first line. + if ( i == 0 ) + { + local response_line = split_n(headers[0], /[[:blank:]]+/, F, 2); + if ( |response_line| != 3 ) + return resp; + + resp$code = to_count(response_line[2]); + resp$msg = response_line[3]; + resp$body = join_string_vec(result$files[bodyfile], ""); + } + else + { + local line = headers[i]; + local h = split1(line, /:/); + if ( |h| != 2 ) + next; + resp$headers[h[1]] = sub_bytes(h[2], 0, |h[2]|-1); + } + } + return resp; + } + } diff --git a/scripts/base/utils/dir.bro b/scripts/base/utils/dir.bro new file mode 100644 index 0000000000..2ed1c8e6e9 --- /dev/null +++ b/scripts/base/utils/dir.bro @@ -0,0 +1,51 @@ +@load base/utils/exec +@load base/frameworks/reporter +@load base/utils/paths + +module Dir; + +export { + ## Register a directory to monitor with a callback that is called + ## every time a previously unseen file is seen. If a file is deleted + ## and seen to be gone, the file is available for being seen again in + ## the future. + ## + ## dir: The directory to monitor for files. + ## + ## callback: Callback that gets executed with each file name + ## that is found. Filenames are provided with the full path. + global monitor: function(dir: string, callback: function(fname: string)); + + ## The interval this module checks for files in directories when using + ## the :bro:see:`Dir::monitor` function. 
+ const polling_interval = 30sec &redef; +} + +event Dir::monitor_ev(dir: string, last_files: set[string], callback: function(fname: string)) + { + when ( local result = Exec::run([$cmd=fmt("ls \"%s\"", str_shell_escape(dir))]) ) + { + if ( result$exit_code != 0 ) + { + Reporter::warning("Requested monitoring of non-existent directory."); + return; + } + + local current_files: set[string] = set(); + local files = result$stdout; + for ( i in files ) + { + if ( files[i] !in last_files ) + callback(build_path_compressed(dir, files[i])); + add current_files[files[i]]; + } + schedule polling_interval { Dir::monitor_ev(dir, current_files, callback) }; + } + } + +function monitor(dir: string, callback: function(fname: string)) + { + event Dir::monitor_ev(dir, set(), callback); + } + + diff --git a/scripts/base/utils/exec.bro b/scripts/base/utils/exec.bro new file mode 100644 index 0000000000..fe353cf590 --- /dev/null +++ b/scripts/base/utils/exec.bro @@ -0,0 +1,207 @@ +##! A module for executing external command line programs. +##! This requires code that is still in topic branches and +##! definitely won't currently work on any released version of Bro. + +@load base/frameworks/input + +module Exec; + +export { + type Command: record { + ## The command line to execute. + ## Use care to avoid injection attacks! + cmd: string; + ## Provide standard in to the program as a + ## string. + stdin: string &default=""; + ## If additional files are required to be read + ## in as part of the output of the command they + ## can be defined here. + read_files: set[string] &optional; + }; + + type Result: record { + ## Exit code from the program. + exit_code: count &default=0; + ## Each line of standard out. + stdout: vector of string &optional; + ## Each line of standard error. + stderr: vector of string &optional; + ## If additional files were requested to be read in + ## the content of the files will be available here. 
+ files: table[string] of string_vec &optional; + }; + + ## Function for running command line programs and getting + ## output. This is an asynchronous function which is meant + ## to be run with the `when` statement. + ## + ## cmd: The command to run. Use care to avoid injection attacks! + ## + ## returns: A record representing the full results from the + ## external program execution. + global run: function(cmd: Command): Result; +} + +redef record Command += { + # The prefix name for tracking temp files. + prefix_name: string &optional; +}; + +global results: table[string] of Result = table(); +global finished_commands: set[string]; +global tmp_files: set[string] = set(); + +type OneLine: record { line: string; }; + +event Exec::stdout_line(description: Input::EventDescription, tpe: Input::Event, s: string) + { + local name = sub(description$name, /_[^_]*$/, ""); + + local result = results[name]; + if ( ! results[name]?$stdout ) + result$stdout = vector(s); + else + result$stdout[|result$stdout|] = s; + } + +event Exec::stderr_line(description: Input::EventDescription, tpe: Input::Event, s: string) + { + local name = sub(description$name, /_[^_]*$/, ""); + + local result = results[name]; + if ( ! results[name]?$stderr ) + result$stderr = vector(s); + else + result$stderr[|result$stderr|] = s; + } + +event Exec::file_line(description: Input::EventDescription, tpe: Input::Event, s: string) + { + local parts = split1(description$name, /_/); + local name = parts[1]; + local track_file = parts[2]; + + local result = results[name]; + if ( ! 
result?$files ) + result$files = table(); + + if ( track_file !in result$files ) + result$files[track_file] = vector(s); + else + result$files[track_file][|result$files[track_file]|] = s; + } + +event Exec::cleanup_and_do_callback(name: string) + { + Input::remove(fmt("%s_stdout", name)); + system(fmt("rm %s_stdout", name)); + delete tmp_files[fmt("%s_stdout", name)]; + + Input::remove(fmt("%s_stderr", name)); + system(fmt("rm %s_stderr", name)); + delete tmp_files[fmt("%s_stderr", name)]; + + Input::remove(fmt("%s_done", name)); + system(fmt("rm %s_done", name)); + delete tmp_files[fmt("%s_done", name)]; + + # Indicate to the "when" async watcher that this command is done. + add finished_commands[name]; + } + +event Exec::run_done(description: Input::EventDescription, tpe: Input::Event, s: string) + { + local name = sub(description$name, /_[^_]*$/, ""); + + if ( /^exit_code:/ in s ) + results[name]$exit_code = to_count(split1(s, /:/)[2]); + else if ( s == "done" ) + # Wait one second to allow all threads to read all of their input + # and forward it. 
+ schedule 1sec { Exec::cleanup_and_do_callback(name) }; + } + +event Exec::start_watching_files(cmd: Command) + { + Input::add_event([$source=fmt("%s_done", cmd$prefix_name), + $name=fmt("%s_done", cmd$prefix_name), + $reader=Input::READER_RAW, + $mode=Input::STREAM, + $want_record=F, + $fields=OneLine, + $ev=Exec::run_done]); + + Input::add_event([$source=fmt("%s_stdout", cmd$prefix_name), + $name=fmt("%s_stdout", cmd$prefix_name), + $reader=Input::READER_RAW, + $mode=Input::STREAM, + $want_record=F, + $fields=OneLine, + $ev=Exec::stdout_line]); + + Input::add_event([$source=fmt("%s_stderr", cmd$prefix_name), + $name=fmt("%s_stderr", cmd$prefix_name), + $reader=Input::READER_RAW, + $mode=Input::STREAM, + $want_record=F, + $fields=OneLine, + $ev=Exec::stderr_line]); + + if ( cmd?$read_files ) + { + for ( read_file in cmd$read_files ) + { + Input::add_event([$source=fmt("%s", read_file), + $name=fmt("%s_%s", cmd$prefix_name, read_file), + $reader=Input::READER_RAW, + $mode=Input::STREAM, + $want_record=F, + $fields=OneLine, + $ev=Exec::file_line]); + } + } + } + +function run(cmd: Command): Result + { + cmd$prefix_name = "/tmp/bro-exec-" + unique_id(""); + system(fmt("touch %s_done %s_stdout %s_stderr 2>/dev/null", cmd$prefix_name, cmd$prefix_name, cmd$prefix_name)); + add tmp_files[fmt("%s_done", cmd$prefix_name)]; + add tmp_files[fmt("%s_stdout", cmd$prefix_name)]; + add tmp_files[fmt("%s_stderr", cmd$prefix_name)]; + + if ( cmd?$read_files ) + { + for ( read_file in cmd$read_files ) + { + system(fmt("touch %s 2>/dev/null", read_file)); + add tmp_files[read_file]; + } + } + + piped_exec(fmt("%s 2>> %s_stderr 1>> %s_stdout; echo \"exit_code:${?}\" >> %s_done; echo \"done\" >> %s_done", + cmd$cmd, cmd$prefix_name, cmd$prefix_name, cmd$prefix_name, cmd$prefix_name), + cmd$stdin); + + results[cmd$prefix_name] = []; + + schedule 1msec { Exec::start_watching_files(cmd) }; + + return when ( cmd$prefix_name in finished_commands ) + { + delete 
finished_commands[cmd$prefix_name]; + local result = results[cmd$prefix_name]; + delete results[cmd$prefix_name]; + return result; + } + } + +event bro_done() + { + # We are punting here and just deleting any files that haven't been processed yet. + for ( fname in tmp_files ) + { + system(fmt("rm \"%s\"", str_shell_escape(fname))); + } + } \ No newline at end of file From 035b668f7398cd4b803c9ecc455ce58203de666b Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Mon, 22 Apr 2013 21:52:21 -0400 Subject: [PATCH 02/73] Updates to use new input framework mechanism to execute command line programs. --- scripts/base/utils/exec.bro | 160 ++++++++++++++---------------------- 1 file changed, 60 insertions(+), 100 deletions(-) diff --git a/scripts/base/utils/exec.bro b/scripts/base/utils/exec.bro index fe353cf590..45cd8cb287 100644 --- a/scripts/base/utils/exec.bro +++ b/scripts/base/utils/exec.bro @@ -23,6 +23,8 @@ export { type Result: record { ## Exit code from the program. exit_code: count &default=0; + ## True if the command was terminated with a signal. + signal_exit: bool &default=F; ## Each line of standard out. stdout: vector of string &optional; ## Each line of standard error. @@ -41,39 +43,45 @@ export { ## returns: A record representing the full results from the ## external program execution. global run: function(cmd: Command): Result; + + ## The system directory for temp files. + const tmp_dir = "/tmp" &redef; } redef record Command += { - # The prefix name for tracking temp files. - prefix_name: string &optional; + # The unique id for tracking executors. 
+ uid: string &optional; }; global results: table[string] of Result = table(); global finished_commands: set[string]; -global tmp_files: set[string] = set(); +global currently_tracked_files: set[string] = set(); +type OneLine: record { + s: string; + is_stderr: bool; +}; -type OneLine: record { line: string; }; +type FileLine: record { + s: string; +}; -event Exec::stdout_line(description: Input::EventDescription, tpe: Input::Event, s: string) +event Exec::line(description: Input::EventDescription, tpe: Input::Event, s: string, is_stderr: bool) { - local name = sub(description$name, /_[^_]*$/, ""); - - local result = results[name]; - if ( ! results[name]?$stdout ) - result$stdout = vector(s); + local result = results[description$name]; + if ( is_stderr ) + { + if ( ! result?$stderr ) + result$stderr = vector(s); + else + result$stderr[|result$stderr|] = s; + } else - result$stdout[|result$stdout|] = s; - } - -event Exec::stderr_line(description: Input::EventDescription, tpe: Input::Event, s: string) - { - local name = sub(description$name, /_[^_]*$/, ""); - - local result = results[name]; - if ( ! results[name]?$stderr ) - result$stderr = vector(s); - else - result$stderr[|result$stderr|] = s; + { + if ( ! 
result?$stdout ) + result$stdout = vector(s); + else + result$stdout[|result$stdout|] = s; + } } event Exec::file_line(description: Input::EventDescription, tpe: Input::Event, s: string) @@ -92,107 +100,59 @@ event Exec::file_line(description: Input::EventDescription, tpe: Input::Event, s result$files[track_file][|result$files[track_file]|] = s; } -event Exec::cleanup_and_do_callback(name: string) +event InputRaw::process_finished(name: string, source:string, exit_code:count, signal_exit:bool) { - Input::remove(fmt("%s_stdout", name)); - system(fmt("rm %s_stdout", name)); - delete tmp_files[fmt("%s_stdout", name)]; - - Input::remove(fmt("%s_stderr", name)); - system(fmt("rm %s_stderr", name)); - delete tmp_files[fmt("%s_stderr", name)]; - - Input::remove(fmt("%s_done", name)); - system(fmt("rm %s_done", name)); - delete tmp_files[fmt("%s_done", name)]; + results[name]$exit_code = exit_code; + results[name]$signal_exit = signal_exit; + Input::remove(name); # Indicate to the "when" async watcher that this command is done. add finished_commands[name]; } -event Exec::run_done(description: Input::EventDescription, tpe: Input::Event, s: string) +event Exec::start_watching_file(uid: string, read_file: string) { - local name = sub(description$name, /_[^_]*$/, ""); - - if ( /^exit_code:/ in s ) - results[name]$exit_code = to_count(split1(s, /:/)[2]); - else if ( s == "done" ) - # Wait one second to allow all threads to read all of their input - # and forward it. 
- schedule 1sec { Exec::cleanup_and_do_callback(name) }; - } - -event Exec::start_watching_files(cmd: Command) - { - Input::add_event([$source=fmt("%s_done", cmd$prefix_name), - $name=fmt("%s_done", cmd$prefix_name), + Input::add_event([$source=fmt("%s", read_file), + $name=fmt("%s_%s", uid, read_file), $reader=Input::READER_RAW, $mode=Input::STREAM, $want_record=F, - $fields=OneLine, - $ev=Exec::run_done]); - - Input::add_event([$source=fmt("%s_stdout", cmd$prefix_name), - $name=fmt("%s_stdout", cmd$prefix_name), - $reader=Input::READER_RAW, - $mode=Input::STREAM, - $want_record=F, - $fields=OneLine, - $ev=Exec::stdout_line]); - - Input::add_event([$source=fmt("%s_stderr", cmd$prefix_name), - $name=fmt("%s_stderr", cmd$prefix_name), - $reader=Input::READER_RAW, - $mode=Input::STREAM, - $want_record=F, - $fields=OneLine, - $ev=Exec::stderr_line]); - - if ( cmd?$read_files ) - { - for ( read_file in cmd$read_files ) - { - Input::add_event([$source=fmt("%s", read_file), - $name=fmt("%s_%s", cmd$prefix_name, read_file), - $reader=Input::READER_RAW, - $mode=Input::STREAM, - $want_record=F, - $fields=OneLine, - $ev=Exec::file_line]); - } - } + $fields=FileLine, + $ev=Exec::file_line]); } function run(cmd: Command): Result { - cmd$prefix_name = "/tmp/bro-exec-" + unique_id(""); - system(fmt("touch %s_done %s_stdout %s_stderr 2>/dev/null", cmd$prefix_name, cmd$prefix_name, cmd$prefix_name)); - add tmp_files[fmt("%s_done", cmd$prefix_name)]; - add tmp_files[fmt("%s_stdout", cmd$prefix_name)]; - add tmp_files[fmt("%s_stderr", cmd$prefix_name)]; + cmd$uid = unique_id(""); + results[cmd$uid] = []; if ( cmd?$read_files ) { for ( read_file in cmd$read_files ) { - system(fmt("touch %s 2>/dev/null", read_file)); - add tmp_files[read_file]; + add currently_tracked_files[read_file]; + system(fmt("touch \"%s\" 2>/dev/null", str_shell_escape(read_file))); + schedule 1msec { Exec::start_watching_file(cmd$uid, read_file) }; } } - piped_exec(fmt("%s 2>> %s_stderr 1>> %s_stdout; echo 
\"exit_code:${?}\" >> %s_done; echo \"done\" >> %s_done", - cmd$cmd, cmd$prefix_name, cmd$prefix_name, cmd$prefix_name, cmd$prefix_name), - cmd$stdin); + local config_strings: table[string] of string = { + ["stdin"] = cmd$stdin, + ["read_stderr"] = "1", + }; + Input::add_event([$name=cmd$uid, + $source=fmt("%s |", cmd$cmd), + $reader=Input::READER_RAW, + $fields=Exec::OneLine, + $ev=Exec::line, + $want_record=F, + $config=config_strings]); - results[cmd$prefix_name] = []; - - schedule 1msec { Exec::start_watching_files(cmd) }; - - return when ( cmd$prefix_name in finished_commands ) + return when ( cmd$uid in finished_commands ) { - delete finished_commands[cmd$prefix_name]; - local result = results[cmd$prefix_name]; - delete results[cmd$prefix_name]; + delete finished_commands[cmd$uid]; + local result = results[cmd$uid]; + delete results[cmd$uid]; return result; } } @@ -200,7 +160,7 @@ function run(cmd: Command): Result event bro_done() { # We are punting here and just deleting any files that haven't been processed yet. - for ( fname in tmp_files ) + for ( fname in currently_tracked_files ) { system(fmt("rm \"%s\"", str_shell_escape(fname))); } From 08348b2bc29f0d4661fbe61be355716a3ee51a25 Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Mon, 22 Apr 2013 21:53:00 -0400 Subject: [PATCH 03/73] Update to make Dir::monitor watch inodes instead of file names. 
--- scripts/base/utils/dir.bro | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/base/utils/dir.bro b/scripts/base/utils/dir.bro index 2ed1c8e6e9..b154fe000e 100644 --- a/scripts/base/utils/dir.bro +++ b/scripts/base/utils/dir.bro @@ -23,11 +23,11 @@ export { event Dir::monitor_ev(dir: string, last_files: set[string], callback: function(fname: string)) { - when ( local result = Exec::run([$cmd=fmt("ls \"%s\"", str_shell_escape(dir))]) ) + when ( local result = Exec::run([$cmd=fmt("ls -i \"%s/\"", str_shell_escape(dir))]) ) { if ( result$exit_code != 0 ) { - Reporter::warning("Requested monitoring of non-existent directory."); + Reporter::warning(fmt("Requested monitoring of non-existent directory (%s).", dir)); return; } @@ -35,9 +35,10 @@ event Dir::monitor_ev(dir: string, last_files: set[string], callback: function(f local files = result$stdout; for ( i in files ) { - if ( files[i] !in last_files ) - callback(build_path_compressed(dir, files[i])); - add current_files[files[i]]; + local parts = split1(files[i], / /); + if ( parts[1] !in last_files ) + callback(build_path_compressed(dir, parts[2])); + add current_files[parts[1]]; } schedule polling_interval { Dir::monitor_ev(dir, current_files, callback) }; } From 4d275522c7a87f8c69b1494126cc995a20b2d66b Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 23 May 2013 16:03:26 -0700 Subject: [PATCH 04/73] Add abstraction for vector of bits. A bitvector is a vector of bits with underlying block storage. Since C++ has no notion of lvalues in the context of bits, we use a small wrapper class Reference that masks the desired bit in the corresponding block. 
--- src/BitVector.cc | 455 +++++++++++++++++++++++++++++++++++++++++++++ src/BitVector.h | 324 ++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 3 files changed, 780 insertions(+) create mode 100644 src/BitVector.cc create mode 100644 src/BitVector.h diff --git a/src/BitVector.cc b/src/BitVector.cc new file mode 100644 index 0000000000..2f714a6c79 --- /dev/null +++ b/src/BitVector.cc @@ -0,0 +1,455 @@ +#include "BitVector.h" + +#include +#include + +BitVector::size_type BitVector::npos = static_cast(-1); +BitVector::block_type BitVector::bits_per_block = + std::numeric_limits::digits; + +namespace { + +uint8_t count_table[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, + 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, + 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, + 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, + 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, + 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, + 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, + 6, 7, 6, 7, 7, 8 +}; + +} // namespace + +BitVector::Reference::Reference(block_type& block, block_type i) + : block_(block), + mask_(block_type(1) << i) + { + assert(i < bits_per_block); + } + +BitVector::Reference& BitVector::Reference::flip() + { + block_ ^= mask_; + return *this; + } + +BitVector::Reference::operator bool() const + { + return (block_ & mask_) != 0; + } + +bool BitVector::Reference::operator~() const + { + return (block_ & mask_) == 0; + } + +BitVector::Reference& BitVector::Reference::operator=(bool x) + { + x ? 
block_ |= mask_ : block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator=(Reference const& other) + { + other ? block_ |= mask_ : block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator|=(bool x) + { + if (x) + block_ |= mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator&=(bool x) + { + if (! x) + block_ &= ~mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator^=(bool x) + { + if (x) + block_ ^= mask_; + return *this; + } + +BitVector::Reference& BitVector::Reference::operator-=(bool x) + { + if (x) + block_ &= ~mask_; + return *this; + } + + +BitVector::BitVector() : num_bits_(0) { } + +BitVector::BitVector(size_type size, bool value) + : bits_(bits_to_blocks(size), value ? ~block_type(0) : 0), + num_bits_(size) + { + // Filling whole blocks with ones also sets the bits past num_bits_ in + // the last block; clear them so count() and operator== only see real bits. + zero_unused_bits(); + } + +BitVector::BitVector(BitVector const& other) + : bits_(other.bits_), + num_bits_(other.num_bits_) +{ } + +BitVector BitVector::operator~() const + { + BitVector b(*this); + b.flip(); + return b; + } + +BitVector& BitVector::operator=(BitVector const& other) + { + // Copy both the block storage and the logical bit count; copying only + // the blocks would leave size() reporting the old length. + bits_ = other.bits_; + num_bits_ = other.num_bits_; + return *this; + } + +BitVector BitVector::operator<<(size_type n) const + { + BitVector b(*this); + return b <<= n; + } + +BitVector BitVector::operator>>(size_type n) const + { + BitVector b(*this); + return b >>= n; + } + +BitVector& BitVector::operator<<=(size_type n) + { + if (n >= num_bits_) + return reset(); + + if (n > 0) + { + size_type last = blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits_[0]; + assert(blocks() >= 1); + assert(div <= last); + + if (r != 0) + { + for (size_type i = last - div; i > 0; --i) + b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); + b[div] = b[0] << r; + } + else + { + for (size_type i = last-div; i > 0; --i) + b[i + div] = b[i]; + b[div] = b[0]; + } + + std::fill_n(b, div, block_type(0)); + zero_unused_bits(); + } + + return 
*this; + } + +BitVector& BitVector::operator>>=(size_type n) + { + if (n >= num_bits_) + return reset(); + + if (n > 0) + { + size_type last = blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits_[0]; + assert(blocks() >= 1); + assert(div <= last); + + if (r != 0) + { + // Walk upwards from the first source block: each destination block + // takes the high part of b[i] and the low r bits of b[i + 1]. + // Stopping before the last block keeps b[i + 1] in bounds. + for (size_type i = div; i < last; ++i) + b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); + // The topmost source block has no higher neighbour; zeros shift in. + b[last - div] = b[last] >> r; + } + else + { + for (size_type i = div; i <= last; ++i) + b[i-div] = b[i]; + } + + std::fill_n(b + (blocks() - div), div, block_type(0)); + } + return *this; + } + +BitVector& BitVector::operator&=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] &= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator|=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] |= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator^=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] ^= other.bits_[i]; + return *this; + } + +BitVector& BitVector::operator-=(BitVector const& other) + { + assert(size() >= other.size()); + for (size_type i = 0; i < blocks(); ++i) + bits_[i] &= ~other.bits_[i]; + return *this; + } + +BitVector operator&(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b &= y; + } + +BitVector operator|(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b |= y; + } + +BitVector operator^(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b ^= y; + } + +BitVector operator-(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b -= y; + } + +bool operator==(BitVector const& x, BitVector const& y) + { + return x.num_bits_ == y.num_bits_ && x.bits_ == y.bits_; + } + +bool operator!=(BitVector const& x, BitVector const& 
y) + { + return ! (x == y); + } + +bool operator<(BitVector const& x, BitVector const& y) + { + assert(x.size() == y.size()); + for (BitVector::size_type r = x.blocks(); r > 0; --r) + { + BitVector::size_type i = r - 1; + if (x.bits_[i] < y.bits_[i]) + return true; + else if (x.bits_[i] > y.bits_[i]) + return false; + } + return false; + } + +void BitVector::resize(size_type n, bool value) + { + size_type old = blocks(); + size_type required = bits_to_blocks(n); + block_type block_value = value ? ~block_type(0) : block_type(0); + + if (required != old) + bits_.resize(required, block_value); + + if (value && (n > num_bits_) && extra_bits()) + bits_[old - 1] |= (block_value << extra_bits()); + + num_bits_ = n; + zero_unused_bits(); + } + +void BitVector::clear() + { + bits_.clear(); + num_bits_ = 0; + } + +void BitVector::push_back(bool bit) + { + size_type s = size(); + resize(s + 1); + set(s, bit); + } + +void BitVector::append(block_type block) + { + size_type excess = extra_bits(); + if (excess) + { + assert(! 
bits_.empty()); + bits_.push_back(block >> (bits_per_block - excess)); + bits_[bits_.size() - 2] |= (block << excess); + } + else + { + bits_.push_back(block); + } + num_bits_ += bits_per_block; + } + +BitVector& BitVector::set(size_type i, bool bit) + { + assert(i < num_bits_); + + if (bit) + bits_[block_index(i)] |= bit_mask(i); + else + reset(i); + + return *this; + } + +BitVector& BitVector::set() + { + std::fill(bits_.begin(), bits_.end(), ~block_type(0)); + zero_unused_bits(); + return *this; + } + +BitVector& BitVector::reset(size_type i) + { + assert(i < num_bits_); + bits_[block_index(i)] &= ~bit_mask(i); + return *this; + } + +BitVector& BitVector::reset() + { + std::fill(bits_.begin(), bits_.end(), block_type(0)); + return *this; + } + +BitVector& BitVector::flip(size_type i) + { + assert(i < num_bits_); + bits_[block_index(i)] ^= bit_mask(i); + return *this; + } + +BitVector& BitVector::flip() + { + for (size_type i = 0; i < blocks(); ++i) + bits_[i] = ~bits_[i]; + zero_unused_bits(); + return *this; + } + +bool BitVector::operator[](size_type i) const + { + assert(i < num_bits_); + return (bits_[block_index(i)] & bit_mask(i)) != 0; + } + +BitVector::Reference BitVector::operator[](size_type i) + { + assert(i < num_bits_); + return Reference(bits_[block_index(i)], bit_index(i)); + } + +BitVector::size_type BitVector::count() const + { + std::vector::const_iterator first = bits_.begin(); + size_t n = 0; + size_type length = blocks(); + while (length) + { + block_type block = *first; + while (block) + { + // TODO: use __popcnt if available. 
+ n += count_table[block & ((1u << 8) - 1)]; + block >>= 8; + } + ++first; + --length; + } + return n; + } + +BitVector::size_type BitVector::blocks() const + { + return bits_.size(); + } + +BitVector::size_type BitVector::size() const + { + return num_bits_; + } + +bool BitVector::empty() const + { + return bits_.empty(); + } + +BitVector::size_type BitVector::find_first() const + { + return find_from(0); + } + +BitVector::size_type BitVector::find_next(size_type i) const + { + if (i >= (size() - 1) || size() == 0) + return npos; + ++i; + size_type bi = block_index(i); + block_type block = bits_[bi] & (~block_type(0) << bit_index(i)); + return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); + } + +BitVector::size_type BitVector::lowest_bit(block_type block) + { + block_type x = block - (block & (block - 1)); + size_type log = 0; + while (x >>= 1) + ++log; + return log; + } + +BitVector::block_type BitVector::extra_bits() const + { + return bit_index(size()); + } + +void BitVector::zero_unused_bits() + { + if (extra_bits()) + bits_.back() &= ~(~block_type(0) << extra_bits()); + } + +BitVector::size_type BitVector::find_from(size_type i) const + { + while (i < blocks() && bits_[i] == 0) + ++i; + if (i >= blocks()) + return npos; + return i * bits_per_block + lowest_bit(bits_[i]); + } diff --git a/src/BitVector.h b/src/BitVector.h new file mode 100644 index 0000000000..46d7e2df8f --- /dev/null +++ b/src/BitVector.h @@ -0,0 +1,324 @@ +#ifndef BitVector_h +#define BitVector_h + +#include +#include + +/** + * A vector of bits. + */ +class BitVector { +public: + typedef size_t block_type; + typedef size_t size_type; + static size_type npos; + static block_type bits_per_block; + +public: + /** + * An lvalue proxy for single bits. 
+ */ + class Reference { + friend class BitVector; + Reference(block_type& block, block_type i); + + public: + Reference& flip(); + operator bool() const; + bool operator~() const; + Reference& operator=(bool x); + Reference& operator=(Reference const& other); + Reference& operator|=(bool x); + Reference& operator&=(bool x); + Reference& operator^=(bool x); + Reference& operator-=(bool x); + + private: + void operator&(); + block_type& block_; + block_type const mask_; + }; + + typedef bool const_reference; + + /** + * Constructs an empty bit vector. + */ + BitVector(); + + /** + * Constructs a bit vector of a given size. + * @param size The number of bits. + * @param value The value for each bit. + */ + explicit BitVector(size_type size, bool value = false); + + /** + * Constructs a bit vector from a sequence of blocks. + */ + template + BitVector(InputIterator first, InputIterator last) + { + bits_.insert(bits_.end(), first, last); + num_bits_ = bits_.size() * bits_per_block; + } + + /** + * Copy-constructs a bit vector. + * @param other The bit vector to copy. + */ + BitVector(const BitVector& other); + + /** + * Assigns another bit vector to this instance. + * @param other The RHS of the assignment. 
+ */ + BitVector& operator=(const BitVector& other); + + // + // Bitwise operations + // + BitVector operator~() const; + BitVector operator<<(size_type n) const; + BitVector operator>>(size_type n) const; + BitVector& operator<<=(size_type n); + BitVector& operator>>=(size_type n); + BitVector& operator&=(BitVector const& other); + BitVector& operator|=(BitVector const& other); + BitVector& operator^=(BitVector const& other); + BitVector& operator-=(BitVector const& other); + friend BitVector operator&(BitVector const& x, BitVector const& y); + friend BitVector operator|(BitVector const& x, BitVector const& y); + friend BitVector operator^(BitVector const& x, BitVector const& y); + friend BitVector operator-(BitVector const& x, BitVector const& y); + + // + // Relational operators + // + friend bool operator==(BitVector const& x, BitVector const& y); + friend bool operator!=(BitVector const& x, BitVector const& y); + friend bool operator<(BitVector const& x, BitVector const& y); + + // + // Basic operations + // + /** Appends the bits in a sequence of values. + * @tparam Iterator A forward iterator. + * @param first An iterator pointing to the first element of the sequence. + * @param last An iterator pointing to one past the last element of the + * sequence. + */ + template + void append(ForwardIterator first, ForwardIterator last) + { + if (first == last) + return; + + block_type excess = extra_bits(); + typename std::iterator_traits::difference_type delta = + std::distance(first, last); + + bits_.reserve(blocks() + delta); + if (excess == 0) + { + bits_.back() |= (*first << excess); + do + { + block_type b = *first++ >> (bits_per_block - excess); + bits_.push_back(b | (first == last ? 0 : *first << excess)); + } while (first != last); + } + else + { + bits_.insert(bits_.end(), first, last); + } + num_bits_ += bits_per_block * delta; + } + + /** + * Appends the bits in a given block. + * @param block The block containing bits to append. 
+ */ + void append(block_type block); + + /** Appends a single bit to the end of the bit vector. + * @param bit The value of the bit. + */ + void push_back(bool bit); + + /** + * Clears all bits in the bitvector. + */ + void clear(); + + /** + * Resizes the bit vector to a new number of bits. + * @param n The new number of bits of the bit vector. + * @param value The bit value of new values, if the vector expands. + */ + void resize(size_type n, bool value = false); + + /** + * Sets a bit at a specific position to a given value. + * @param i The bit position. + * @param bit The value assigned to position *i*. + * @return A reference to the bit vector instance. + */ + BitVector& set(size_type i, bool bit = true); + + /** + * Sets all bits to 1. + * @return A reference to the bit vector instance. + */ + BitVector& set(); + + /** + * Resets a bit at a specific position, i.e., sets it to 0. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& reset(size_type i); + + /** + * Sets all bits to 0. + * @return A reference to the bit vector instance. + */ + BitVector& reset(); + + /** + * Toggles/flips a bit at a specific position. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& flip(size_type i); + + /** + * Computes the complement. + * @return A reference to the bit vector instance. + */ + BitVector& flip(); + + /** Retrieves a single bit. + * @param i The bit position. + * @return A mutable reference to the bit at position *i*. + */ + Reference operator[](size_type i); + + /** + * Retrieves a single bit. + * @param i The bit position. + * @return A const-reference to the bit at position *i*. + */ + const_reference operator[](size_type i) const; + + /** + * Counts the number of 1-bits in the bit vector. Also known as *population + * count* or *Hamming weight*. + * @return The number of bits set to 1. 
+ */ + size_type count() const; + + /** + * Retrieves the number of blocks of the underlying storage. + * @param The number of blocks that represent `size()` bits. + */ + size_type blocks() const; + + /** + * Retrieves the number of bits the bitvector consist of. + * @return The length of the bit vector in bits. + */ + size_type size() const; + + /** + * Checks whether the bit vector is empty. + * @return `true` iff the bitvector has zero length. + */ + bool empty() const; + + /** + * Finds the bit position of of the first 1-bit. + * @return The position of the first bit that equals to one or `npos` if no + * such bit exists. + */ + size_type find_first() const; + + /** + * Finds the next 1-bit from a given starting position. + * + * @param i The index where to start looking. + * + * @return The position of the first bit that equals to 1 after position + * *i* or `npos` if no such bit exists. + */ + size_type find_next(size_type i) const; + +private: + /** + * Computes the block index for a given bit position. + */ + static size_type block_index(size_type i) + { + return i / bits_per_block; + } + + /** + * Computes the bit index within a given block for a given bit position. + */ + static block_type bit_index(size_type i) + { + return i % bits_per_block; + } + + /** + * Computes the bitmask block to extract a bit a given bit position. + */ + static block_type bit_mask(size_type i) + { + return block_type(1) << bit_index(i); + } + + /** + * Computes the number of blocks needed to represent a given number of + * bits. + * @param bits the number of bits. + * @return The number of blocks to represent *bits* number of bits. + */ + static size_type bits_to_blocks(size_type bits) + { + return bits / bits_per_block + + static_cast(bits % bits_per_block != 0); + } + + /** + * Computes the bit position first 1-bit in a given block. + * @param block The block to inspect. + * @return The bit position where *block* has its first bit set to 1. 
+ */ + static size_type lowest_bit(block_type block); + + /** + * Computes the number of excess/unused bits in the bit vector. + */ + block_type extra_bits() const; + + /** + * If the number of bits in the vector are not not a multiple of + * bitvector::bits_per_block, then the last block exhibits unused bits which + * this function resets. + */ + void zero_unused_bits(); + + /** + * Looks for the first 1-bit starting at a given position. + * @param i The block index to start looking. + * @return The block index of the first 1-bit starting from *i* or + * `bitvector::npos` if no 1-bit exists. + */ + size_type find_from(size_type i) const; + + std::vector bits_; + size_type num_bits_; +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 447b7d9ec7..33aaab29c1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -303,6 +303,7 @@ set(bro_SRCS Base64.cc BitTorrent.cc BitTorrentTracker.cc + BitVector.cc BPF_Program.cc BroDoc.cc BroDocObj.cc From 9e32eaad6db992e60a3d669c4d8c7b5016cc8cbc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 28 May 2013 20:58:01 -0700 Subject: [PATCH 05/73] Make bitvectors serializable. 
--- src/BitVector.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++-- src/BitVector.h | 13 ++++++++--- src/SerialTypes.h | 2 ++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index 2f714a6c79..f57301d506 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -2,6 +2,7 @@ #include #include +#include "Serializer.h" BitVector::size_type BitVector::npos = static_cast(-1); BitVector::block_type BitVector::bits_per_block = @@ -62,7 +63,7 @@ BitVector::Reference& BitVector::Reference::operator=(Reference const& other) BitVector::Reference& BitVector::Reference::operator|=(bool x) { - if (x) + if (x) block_ |= mask_; return *this; } @@ -73,7 +74,7 @@ BitVector::Reference& BitVector::Reference::operator&=(bool x) block_ &= ~mask_; return *this; } - + BitVector::Reference& BitVector::Reference::operator^=(bool x) { if (x) @@ -453,3 +454,55 @@ BitVector::size_type BitVector::find_from(size_type i) const return npos; return i * bits_per_block + lowest_bit(bits_[i]); } + +bool BitVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BitVector* BitVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_BITVECTOR)); + } + +IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR); + +bool BitVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BITVECTOR, SerialObj); + + if ( ! SERIALIZE(static_cast(bits_.size())) ) + return false; + + for (size_t i = 0; i < bits_.size(); ++i) + if ( ! SERIALIZE(static_cast(bits_[i])) ) + return false; + + return SERIALIZE(static_cast(num_bits_)); + } + +bool BitVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint64 size; + if ( ! UNSERIALIZE(&size) ) + return false; + + bits_.resize(static_cast(size)); + uint64 block; + for ( size_t i = 0; i < bits_.size(); ++i ) + { + if ( ! 
UNSERIALIZE(&block) ) + return false; + bits_[i] = static_cast(block); + } + + uint64 num_bits; + if ( ! UNSERIALIZE(&num_bits) ) + return false; + num_bits_ = static_cast(num_bits); + + return true; + } diff --git a/src/BitVector.h b/src/BitVector.h index 46d7e2df8f..9900dd103e 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -3,11 +3,12 @@ #include #include +#include "SerialObj.h" /** * A vector of bits. */ -class BitVector { +class BitVector : SerialObj { public: typedef size_t block_type; typedef size_t size_type; @@ -42,7 +43,7 @@ public: typedef bool const_reference; /** - * Constructs an empty bit vector. + * Default-constructs an empty bit vector. */ BitVector(); @@ -253,6 +254,12 @@ public: */ size_type find_next(size_type i) const; + bool Serialize(SerialInfo* info) const; + static BitVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(BitVector); + private: /** * Computes the block index for a given bit position. @@ -286,7 +293,7 @@ private: */ static size_type bits_to_blocks(size_type bits) { - return bits / bits_per_block + return bits / bits_per_block + static_cast(bits % bits_per_block != 0); } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 723badab1e..c9c0c34a33 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -49,6 +49,7 @@ SERIAL_IS(STATE_ACCESS, 0x1100) SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) +SERIAL_IS(BITVECTOR, 0x1500) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -202,5 +203,6 @@ SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) +SERIAL_CONST2(BITVECTOR) #endif From d873db03cef3bb09d45e789d69607487e36b6093 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 31 May 2013 18:31:14 -0700 Subject: [PATCH 06/73] Add draft of Bloom filter type hierarchy. 
--- src/BloomFilter.h | 266 +++++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 + 2 files changed, 267 insertions(+) create mode 100644 src/BloomFilter.h diff --git a/src/BloomFilter.h b/src/BloomFilter.h new file mode 100644 index 0000000000..a767c6b8b8 --- /dev/null +++ b/src/BloomFilter.h @@ -0,0 +1,266 @@ +#ifndef BloomFilter_h +#define BloomFilter_h + +#include +#include "BitVector.h" +#include "Hash.h" +#include "H3.h" + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : SerialObj { +public: + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. + */ + explicit CounterVector(unsigned width); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector(); + +private: + BitVector bits_; + unsigned width_; +}; + +/** + * The abstract base class for hash policies. + * @tparam Codomain An integral type. 
+ */ +class HashPolicy { +public: + typedef hash_t hash_type; + virtual ~HashPolicy() { } + size_t k() const { return k; } + virtual std::vector Hash(const void* x, size_t n) const = 0; +protected: + /** + * A functor that computes a universal hash function. + * @tparam Codomain An integral type. + */ + template + class Hasher { + public: + template + Codomain operator()(const Domain& x) const + { + return h3_(&x, sizeof(x)); + } + Codomain operator()(const void* x, size_t n) const + { + return h3_(x, n); + } + private: + // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in + // Hash.h. I do not know how this value impacts the hash function behavior + // so I'll just copy it verbatim. (Matthias) + H3 h3_; + }; + + HashPolicy(size_t k) : k_(k) { } +private: + size_t k_; +}; + +/** + * The *default* hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } + virtual ~DoubleHashing() { } + + virtual std::vector Hash(const void* x, size_t n) const + { + std::vector h(k(), 0); + for (size_t i = 0; i < h.size(); ++i) + h[i] = hashers_[i](x, n); + return h; + } + +private: + std::vector< Hasher > hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of 2 hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k) : HashPolicy(k), hashers_(k) { } + virtual ~DoubleHashing() { } + + virtual std::vector Hash(const void* x, size_t n) const + { + Codomain h1 = hasher1_(x); + Codomain h2 = hasher2_(x); + std::vector h(k(), 0); + for (size_t i = 0; i < h.size(); ++i) + h[i] = h1 + i * h2; + return h; + } + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +/** + * The abstract base class for Bloom filters. + */ +class BloomFilter : SerialObj { +public: + virtual ~BloomFilter() { delete hash_; } + + /** + * Adds an element of type T to the Bloom filter. 
+ * @param x The element to add + */ + template + void Add(const T& x) + { + ++elements_; + AddImpl(hash_->Hash(x)); + } + + /** + * Retrieves the associated count of a given value. + * + * @param x The value of type `T` to check. + * + * @return The counter associated with *x*. + */ + template + size_t Count(const T& x) const + { + return CountImpl(hash_->Hash(x)); + } + + /** + * Retrieves the number of elements added to the Bloom filter. + * + * @return The number of elements in this Bloom filter. + */ + size_t Size() const + { + return elements_; + } + +protected: + typedef std::vector HashVector; + + /** + * Default-constructs a Bloom filter. + */ + BloomFilter(); + + /** + * Constructs a BloomFilter. + * @param hash The hashing policy. + */ + BloomFilter(HashPolicy* hash); + + virtual void AddImpl(const HashVector& hashes) = 0; + + virtual size_t CountImpl(const HashVector& hashes) const = 0; + + std::vector Hash(const T& x) const + { + return hash_->Hash(&x, sizeof(x)); + } + +private: + HashPolicy* hash_; // Owned by *this. + + size_t elements_; +}; + +/** + * A basic Bloom filter. + */ +class BasicBloomFilter : public BloomFilter { +public: + BasicBloomFilter(); + BasicBloomFilter(HashPolicy* hash); + +protected: + virtual void AddImpl(const HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits_.set(h[i] % h.size()); + } + + virtual size_t CountImpl(const HashVector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + if ( ! bits_[h[i] % h.size()] ) + return 0; + return 1; + } + +private: + BitVector bits_; +}; + +/** + * A counting Bloom filter. 
+ */ +class CountingBloomFilter : public BloomFilter { +public: + CountingBloomFilter(unsigned width); + CountingBloomFilter(HashPolicy* hash); + +protected: + CountingBloomFilter(); + +private: + CounterVector cells_; +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33aaab29c1..11de7772d7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -304,6 +304,7 @@ set(bro_SRCS BitTorrent.cc BitTorrentTracker.cc BitVector.cc + BloomFilter.cc BPF_Program.cc BroDoc.cc BroDocObj.cc From f529df33e0afa930e4babff66f4a5f590b5eb6d9 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 3 Jun 2013 14:00:28 -0700 Subject: [PATCH 07/73] Stabilize Bloom filter interface. --- src/BloomFilter.cc | 33 ++++++++++++++++++ src/BloomFilter.h | 85 +++++++++++++++++----------------------------- 2 files changed, 65 insertions(+), 53 deletions(-) create mode 100644 src/BloomFilter.cc diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc new file mode 100644 index 0000000000..6873815f69 --- /dev/null +++ b/src/BloomFilter.cc @@ -0,0 +1,33 @@ +#include "BloomFilter.h" + +HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const + { + HashVector h(k(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const + { + HashType h1 = hasher1_(x); + HashType h2 = hasher2_(x); + HashVector h(k(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + +void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits_.set(h[i] % h.size()); + } + +size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + if ( ! 
bits_[h[i] % h.size()] ) + return 0; + return 1; + } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index a767c6b8b8..dca4eff2bd 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -11,6 +11,9 @@ */ class CounterVector : SerialObj { public: + typedef size_t size_type; + typedef uint64 count_type; + /** * Constructs a counter vector having cells of a given width. * @@ -70,21 +73,24 @@ private: }; /** - * The abstract base class for hash policies. + * The abstract base class for hash policies that hash elements *k* times. * @tparam Codomain An integral type. */ class HashPolicy { public: - typedef hash_t hash_type; + typedef hash_t HashType; + typedef std::vector HashVector; + virtual ~HashPolicy() { } - size_t k() const { return k; } - virtual std::vector Hash(const void* x, size_t n) const = 0; + size_t k() const { return k_; } + virtual HashVector Hash(const void* x, size_t n) const = 0; + protected: /** * A functor that computes a universal hash function. * @tparam Codomain An integral type. 
*/ - template + template class Hasher { public: template @@ -104,8 +110,9 @@ protected: }; HashPolicy(size_t k) : k_(k) { } + private: - size_t k_; + const size_t k_; }; /** @@ -114,18 +121,12 @@ private: class DefaultHashing : public HashPolicy { public: DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - virtual ~DoubleHashing() { } + virtual ~DefaultHashing() { } - virtual std::vector Hash(const void* x, size_t n) const - { - std::vector h(k(), 0); - for (size_t i = 0; i < h.size(); ++i) - h[i] = hashers_[i](x, n); - return h; - } + virtual HashVector Hash(const void* x, size_t n) const; private: - std::vector< Hasher > hashers_; + std::vector< Hasher > hashers_; }; /** @@ -133,22 +134,14 @@ private: */ class DoubleHashing : public HashPolicy { public: - DoubleHashing(size_t k) : HashPolicy(k), hashers_(k) { } + DoubleHashing(size_t k) : HashPolicy(k) { } virtual ~DoubleHashing() { } - virtual std::vector Hash(const void* x, size_t n) const - { - Codomain h1 = hasher1_(x); - Codomain h2 = hasher2_(x); - std::vector h(k(), 0); - for (size_t i = 0; i < h.size(); ++i) - h[i] = h1 + i * h2; - return h; - } + virtual HashVector Hash(const void* x, size_t n) const; private: - Hasher hasher1_; - Hasher hasher2_; + Hasher hasher1_; + Hasher hasher2_; }; /** @@ -166,7 +159,7 @@ public: void Add(const T& x) { ++elements_; - AddImpl(hash_->Hash(x)); + AddImpl(hash_->Hash(&x, sizeof(x))); } /** @@ -179,7 +172,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(x)); + return CountImpl(hash_->Hash(&x, sizeof(x))); } /** @@ -193,8 +186,6 @@ public: } protected: - typedef std::vector HashVector; - /** * Default-constructs a Bloom filter. 
*/ @@ -206,17 +197,12 @@ protected: */ BloomFilter(HashPolicy* hash); - virtual void AddImpl(const HashVector& hashes) = 0; + virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashVector& hashes) const = 0; - - std::vector Hash(const T& x) const - { - return hash_->Hash(&x, sizeof(x)); - } + virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; private: - HashPolicy* hash_; // Owned by *this. + HashPolicy* hash_; // Owned by *this. size_t elements_; }; @@ -230,19 +216,9 @@ public: BasicBloomFilter(HashPolicy* hash); protected: - virtual void AddImpl(const HashVector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); - } + virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashVector& h) const - { - for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) - return 0; - return 1; - } + virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: BitVector bits_; @@ -253,12 +229,15 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width); - CountingBloomFilter(HashPolicy* hash); + CountingBloomFilter(unsigned width, HashPolicy* hash); protected: CountingBloomFilter(); + virtual void AddImpl(const HashPolicy::HashVector& h); + + virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + private: CounterVector cells_; }; From f708cd4a361ba02083380cfe0db2949e3e06cff7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 3 Jun 2013 22:55:21 -0700 Subject: [PATCH 08/73] Work on parameter estimation and serialization. 
--- src/BloomFilter.cc | 131 ++++++++++++++++++++++++++++++++++++++++++++- src/BloomFilter.h | 41 +++++++------- src/NetVar.cc | 2 + src/OpaqueVal.cc | 23 ++++++++ src/OpaqueVal.h | 16 ++++++ src/SerialTypes.h | 7 +++ 6 files changed, 198 insertions(+), 22 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6873815f69..4787bef0f0 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,23 +1,130 @@ #include "BloomFilter.h" +#include +#include "Serializer.h" + +// Backport C++11's std::round(). +namespace { +template +T round(double x) { return (x > 0.0) ? (x + 0.5) : (x - 0.5); } +} // namespace + + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! SERIALIZE(&bits_) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + return false; + // TODO: Ask Robin how to unserialize non-pointer members. + //if ( ! UNSERIALIZE(&bits_) ) + // return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + + HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const { - HashVector h(k(), 0); + HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = hashers_[i](x, n); return h; } + HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const { HashType h1 = hasher1_(x); HashType h2 = hasher2_(x); - HashVector h(k(), 0); + HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = h1 + i * h2; return h; } +bool BloomFilter::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_BLOOMFILTER)); + } + +// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? 
+//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) + +bool BloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); + // TODO: Make the hash policy serializable. + //if ( ! SERIALIZE(hash_) ) + // return false; + return SERIALIZE(static_cast(elements_)); + } + +bool BloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + // TODO: Make the hash policy serializable. + //if ( ! hash_ = HashPolicy::Unserialize(info) ) + // return false; + uint64 elements; + if ( UNSERIALIZE(&elements) ) + return false; + elements_ = static_cast(elements); + return true; + } + +size_t BasicBloomFilter::Cells(double fp, size_t capacity) + { + double ln2 = std::log(2); + return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); + } + +size_t BasicBloomFilter::K(size_t cells, size_t capacity) + { + double frac = static_cast(cells) / static_cast(capacity); + return round(frac * std::log(2)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) + : BloomFilter(hash), bits_(cells) + { + } + +IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) + +bool BasicBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + // TODO: Make the hash policy serializable. + //if ( ! SERIALIZE(&bits_) ) + // return false; + return true; + } + +bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + // TODO: Non-pointer member deserialization? 
+ return true; + } + void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) @@ -31,3 +138,23 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 0; return 1; } + + +void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + cells_.Increment(h[i] % h.size(), 1); + } + +size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const + { + CounterVector::size_type min = + std::numeric_limits::max(); + for ( size_t i = 0; i < h.size(); ++i ) + { + CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + if ( cnt < min ) + min = cnt; + } + return min; + } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index dca4eff2bd..82948f30ec 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -65,7 +65,7 @@ public: protected: DECLARE_SERIAL(CounterVector); - CounterVector(); + CounterVector() { } private: BitVector bits_; @@ -82,7 +82,7 @@ public: typedef std::vector HashVector; virtual ~HashPolicy() { } - size_t k() const { return k_; } + size_t K() const { return k_; } virtual HashVector Hash(const void* x, size_t n) const = 0; protected: @@ -130,7 +130,7 @@ private: }; /** - * The *double-hashing* policy. Uses a linear combination of 2 hash functions. + * The *double-hashing* policy. Uses a linear combination of two hash functions. */ class DoubleHashing : public HashPolicy { public: @@ -185,25 +185,20 @@ public: return elements_; } -protected: - /** - * Default-constructs a Bloom filter. - */ - BloomFilter(); + bool Serialize(SerialInfo* info) const; + static BloomFilter* Unserialize(UnserialInfo* info); - /** - * Constructs a BloomFilter. - * @param hash The hashing policy. 
- */ - BloomFilter(HashPolicy* hash); +protected: + DECLARE_SERIAL(BloomFilter); + + BloomFilter() { }; + BloomFilter(HashPolicy* hash) : hash_(hash) { } virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; private: - HashPolicy* hash_; // Owned by *this. - + HashPolicy* hash_; size_t elements_; }; @@ -212,12 +207,17 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - BasicBloomFilter(); - BasicBloomFilter(HashPolicy* hash); + static size_t Cells(double fp, size_t capacity); + static size_t K(size_t cells, size_t capacity); + + BasicBloomFilter(size_t cells, HashPolicy* hash); protected: - virtual void AddImpl(const HashPolicy::HashVector& h); + DECLARE_SERIAL(BasicBloomFilter); + BasicBloomFilter() { } + + virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: @@ -232,10 +232,11 @@ public: CountingBloomFilter(unsigned width, HashPolicy* hash); protected: + DECLARE_SERIAL(CountingBloomFilter); + CountingBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: diff --git a/src/NetVar.cc b/src/NetVar.cc index 3a23e4c9fa..d8c2192af7 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -244,6 +244,7 @@ OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* bloomfilter_type; #include "const.bif.netvar_def" #include "types.bif.netvar_def" @@ -310,6 +311,7 @@ void init_general_global_var() sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + bloomfilter_type = new OpaqueType("bloomfilter"); } void init_net_var() diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19346e52f2..a5fb65f53b 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,4 +1,6 @@ #include "OpaqueVal.h" + +#include 
"BloomFilter.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -515,3 +517,24 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } + +BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) + { + } + +IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); + +bool BloomFilterVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + // TODO: implement. + return true; + } + +bool BloomFilterVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + // TODO: implement. + return true; + } + diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 78fa5da5e9..1c9c0361cc 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -7,6 +7,8 @@ #include "Val.h" #include "digest.h" +class BloomFilter; + class HashVal : public OpaqueVal { public: virtual bool IsValid() const; @@ -107,4 +109,18 @@ private: RandTest state; }; +class BloomFilterVal : public OpaqueVal { +public: + BloomFilterVal(); + +protected: + friend class Val; + BloomFilterVal(OpaqueType* t); + + DECLARE_SERIAL(BloomFilterVal); + +private: + BloomFilter* bloom_filter_; +}; + #endif diff --git a/src/SerialTypes.h b/src/SerialTypes.h index c9c0c34a33..171113ab6a 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -50,6 +50,9 @@ SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) +SERIAL_IS(COUNTERVECTOR, 0xa000) +SERIAL_IS(BLOOMFILTER, 0xa100) +SERIAL_IS(BASICBLOOMFILTER, 0xa200) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -105,6 +108,7 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) +SERIAL_VAL(BLOOMFILTER_VAL, 20) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) @@ -204,5 +208,8 @@ SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(BITVECTOR) +SERIAL_CONST2(COUNTERVECTOR) +SERIAL_CONST2(BLOOMFILTER) +SERIAL_CONST2(BASICBLOOMFILTER) #endif From d3297dd6f3b6a50c07c90e9ad5f61c0ddf762460 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 13:32:26 -0700 Subject: [PATCH 09/73] Adhere to Bro coding style. --- src/BitVector.cc | 100 +++++++++++++++++++++++------------------------ src/BitVector.h | 40 +++++++++---------- 2 files changed, 69 insertions(+), 71 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index f57301d506..f029230609 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -33,7 +33,7 @@ BitVector::Reference::Reference(block_type& block, block_type i) assert(i < bits_per_block); } -BitVector::Reference& BitVector::Reference::flip() +BitVector::Reference& BitVector::Reference::Flip() { block_ ^= mask_; return *this; @@ -105,7 +105,7 @@ BitVector::BitVector(BitVector const& other) BitVector BitVector::operator~() const { BitVector b(*this); - b.flip(); + b.Flip(); return b; } @@ -130,15 +130,15 @@ BitVector BitVector::operator>>(size_type n) const BitVector& BitVector::operator<<=(size_type n) { if (n >= num_bits_) - return reset(); + return Reset(); if (n > 0) { - size_type last = blocks() - 1; + size_type last = Blocks() - 1; size_type div = n / bits_per_block; block_type r = bit_index(n); block_type* b = &bits_[0]; - assert(blocks() >= 1); + assert(Blocks() >= 1); assert(div <= last); if (r != 0) @@ -164,15 +164,15 @@ BitVector& BitVector::operator<<=(size_type n) BitVector& BitVector::operator>>=(size_type n) { if (n >= num_bits_) - return reset(); + return Reset(); if (n > 0) { - 
size_type last = blocks() - 1; + size_type last = Blocks() - 1; size_type div = n / bits_per_block; block_type r = bit_index(n); block_type* b = &bits_[0]; - assert(blocks() >= 1); + assert(Blocks() >= 1); assert(div <= last); if (r != 0) @@ -187,39 +187,39 @@ BitVector& BitVector::operator>>=(size_type n) b[i-div] = b[i]; } - std::fill_n(b + (blocks() - div), div, block_type(0)); + std::fill_n(b + (Blocks() - div), div, block_type(0)); } return *this; } BitVector& BitVector::operator&=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] &= other.bits_[i]; return *this; } BitVector& BitVector::operator|=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] |= other.bits_[i]; return *this; } BitVector& BitVector::operator^=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] ^= other.bits_[i]; return *this; } BitVector& BitVector::operator-=(BitVector const& other) { - assert(size() >= other.size()); - for (size_type i = 0; i < blocks(); ++i) + assert(Size() >= other.Size()); + for (size_type i = 0; i < Blocks(); ++i) bits_[i] &= ~other.bits_[i]; return *this; } @@ -260,8 +260,8 @@ bool operator!=(BitVector const& x, BitVector const& y) bool operator<(BitVector const& x, BitVector const& y) { - assert(x.size() == y.size()); - for (BitVector::size_type r = x.blocks(); r > 0; --r) + assert(x.Size() == y.Size()); + for (BitVector::size_type r = x.Blocks(); r > 0; --r) { BitVector::size_type i = r - 1; if (x.bits_[i] < y.bits_[i]) @@ -272,9 +272,9 @@ bool operator<(BitVector const& x, BitVector const& y) return false; } -void BitVector::resize(size_type n, bool value) 
+void BitVector::Resize(size_type n, bool value) { - size_type old = blocks(); + size_type old = Blocks(); size_type required = bits_to_blocks(n); block_type block_value = value ? ~block_type(0) : block_type(0); @@ -288,27 +288,27 @@ void BitVector::resize(size_type n, bool value) zero_unused_bits(); } -void BitVector::clear() +void BitVector::Clear() { bits_.clear(); num_bits_ = 0; } -void BitVector::push_back(bool bit) +void BitVector::PushBack(bool bit) { - size_type s = size(); - resize(s + 1); - set(s, bit); + size_type s = Size(); + Resize(s + 1); + Set(s, bit); } -void BitVector::append(block_type block) +void BitVector::Append(block_type block) { size_type excess = extra_bits(); if (excess) { - assert(! bits_.empty()); + assert(! Empty()); bits_.push_back(block >> (bits_per_block - excess)); - bits_[bits_.size() - 2] |= (block << excess); + bits_[Blocks() - 2] |= (block << excess); } else { @@ -317,48 +317,46 @@ void BitVector::append(block_type block) num_bits_ += bits_per_block; } -BitVector& BitVector::set(size_type i, bool bit) +BitVector& BitVector::Set(size_type i, bool bit) { assert(i < num_bits_); - if (bit) - bits_[block_index(i)] |= bit_mask(i); + bits_[block_index(i)] |= bit_mask(i); else - reset(i); - + Reset(i); return *this; } -BitVector& BitVector::set() +BitVector& BitVector::Set() { std::fill(bits_.begin(), bits_.end(), ~block_type(0)); zero_unused_bits(); return *this; } -BitVector& BitVector::reset(size_type i) +BitVector& BitVector::Reset(size_type i) { assert(i < num_bits_); bits_[block_index(i)] &= ~bit_mask(i); return *this; } -BitVector& BitVector::reset() +BitVector& BitVector::Reset() { std::fill(bits_.begin(), bits_.end(), block_type(0)); return *this; } -BitVector& BitVector::flip(size_type i) +BitVector& BitVector::Flip(size_type i) { assert(i < num_bits_); bits_[block_index(i)] ^= bit_mask(i); return *this; } -BitVector& BitVector::flip() +BitVector& BitVector::Flip() { - for (size_type i = 0; i < blocks(); ++i) + for 
(size_type i = 0; i < Blocks(); ++i) bits_[i] = ~bits_[i]; zero_unused_bits(); return *this; @@ -376,11 +374,11 @@ BitVector::Reference BitVector::operator[](size_type i) return Reference(bits_[block_index(i)], bit_index(i)); } -BitVector::size_type BitVector::count() const +BitVector::size_type BitVector::Count() const { std::vector::const_iterator first = bits_.begin(); size_t n = 0; - size_type length = blocks(); + size_type length = Blocks(); while (length) { block_type block = *first; @@ -396,29 +394,29 @@ BitVector::size_type BitVector::count() const return n; } -BitVector::size_type BitVector::blocks() const +BitVector::size_type BitVector::Blocks() const { return bits_.size(); } -BitVector::size_type BitVector::size() const +BitVector::size_type BitVector::Size() const { return num_bits_; } -bool BitVector::empty() const +bool BitVector::Empty() const { return bits_.empty(); } -BitVector::size_type BitVector::find_first() const +BitVector::size_type BitVector::FindFirst() const { return find_from(0); } -BitVector::size_type BitVector::find_next(size_type i) const +BitVector::size_type BitVector::FindNext(size_type i) const { - if (i >= (size() - 1) || size() == 0) + if (i >= (Size() - 1) || Size() == 0) return npos; ++i; size_type bi = block_index(i); @@ -437,7 +435,7 @@ BitVector::size_type BitVector::lowest_bit(block_type block) BitVector::block_type BitVector::extra_bits() const { - return bit_index(size()); + return bit_index(Size()); } void BitVector::zero_unused_bits() @@ -448,9 +446,9 @@ void BitVector::zero_unused_bits() BitVector::size_type BitVector::find_from(size_type i) const { - while (i < blocks() && bits_[i] == 0) + while (i < Blocks() && bits_[i] == 0) ++i; - if (i >= blocks()) + if (i >= Blocks()) return npos; return i * bits_per_block + lowest_bit(bits_[i]); } diff --git a/src/BitVector.h b/src/BitVector.h index 9900dd103e..8315a151f0 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -24,7 +24,7 @@ public: Reference(block_type& block, 
block_type i); public: - Reference& flip(); + Reference& Flip(); operator bool() const; bool operator~() const; Reference& operator=(bool x); @@ -110,7 +110,7 @@ public: * sequence. */ template - void append(ForwardIterator first, ForwardIterator last) + void Append(ForwardIterator first, ForwardIterator last) { if (first == last) return; @@ -119,7 +119,7 @@ public: typename std::iterator_traits::difference_type delta = std::distance(first, last); - bits_.reserve(blocks() + delta); + bits_.reserve(Blocks() + delta); if (excess == 0) { bits_.back() |= (*first << excess); @@ -140,24 +140,24 @@ public: * Appends the bits in a given block. * @param block The block containing bits to append. */ - void append(block_type block); + void Append(block_type block); /** Appends a single bit to the end of the bit vector. * @param bit The value of the bit. */ - void push_back(bool bit); + void PushBack(bool bit); /** * Clears all bits in the bitvector. */ - void clear(); + void Clear(); /** * Resizes the bit vector to a new number of bits. * @param n The new number of bits of the bit vector. * @param value The bit value of new values, if the vector expands. */ - void resize(size_type n, bool value = false); + void Resize(size_type n, bool value = false); /** * Sets a bit at a specific position to a given value. @@ -165,39 +165,39 @@ public: * @param bit The value assigned to position *i*. * @return A reference to the bit vector instance. */ - BitVector& set(size_type i, bool bit = true); + BitVector& Set(size_type i, bool bit = true); /** * Sets all bits to 1. * @return A reference to the bit vector instance. */ - BitVector& set(); + BitVector& Set(); /** * Resets a bit at a specific position, i.e., sets it to 0. * @param i The bit position. * @return A reference to the bit vector instance. */ - BitVector& reset(size_type i); + BitVector& Reset(size_type i); /** * Sets all bits to 0. * @return A reference to the bit vector instance. 
*/ - BitVector& reset(); + BitVector& Reset(); /** * Toggles/flips a bit at a specific position. * @param i The bit position. * @return A reference to the bit vector instance. */ - BitVector& flip(size_type i); + BitVector& Flip(size_type i); /** * Computes the complement. * @return A reference to the bit vector instance. */ - BitVector& flip(); + BitVector& Flip(); /** Retrieves a single bit. * @param i The bit position. @@ -217,32 +217,32 @@ public: * count* or *Hamming weight*. * @return The number of bits set to 1. */ - size_type count() const; + size_type Count() const; /** * Retrieves the number of blocks of the underlying storage. - * @param The number of blocks that represent `size()` bits. + * @param The number of blocks that represent `Size()` bits. */ - size_type blocks() const; + size_type Blocks() const; /** * Retrieves the number of bits the bitvector consist of. * @return The length of the bit vector in bits. */ - size_type size() const; + size_type Size() const; /** * Checks whether the bit vector is empty. * @return `true` iff the bitvector has zero length. */ - bool empty() const; + bool Empty() const; /** * Finds the bit position of of the first 1-bit. * @return The position of the first bit that equals to one or `npos` if no * such bit exists. */ - size_type find_first() const; + size_type FindFirst() const; /** * Finds the next 1-bit from a given starting position. @@ -252,7 +252,7 @@ public: * @return The position of the first bit that equals to 1 after position * *i* or `npos` if no such bit exists. */ - size_type find_next(size_type i) const; + size_type FindNext(size_type i) const; bool Serialize(SerialInfo* info) const; static BitVector* Unserialize(UnserialInfo* info); From a5572dd66f10ca653855483e0941da327b8422e4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 14:31:39 -0700 Subject: [PATCH 10/73] Write CounterVector implementation scaffold. 
--- src/BloomFilter.cc | 36 ++++++++++++++++++++++++++++++++++++ src/BloomFilter.h | 10 +++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 4787bef0f0..78048ee588 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -10,6 +10,42 @@ T round(double x) { return (x > 0.0) ? (x + 0.5) : (x - 0.5); } } // namespace +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 82948f30ec..b4f82efee9 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -9,7 +9,7 @@ /** * A vector of counters, each of which have a fixed number of bits. */ -class CounterVector : SerialObj { +class CounterVector : public SerialObj { public: typedef size_t size_type; typedef uint64 count_type; @@ -18,8 +18,12 @@ public: * Constructs a counter vector having cells of a given width. * * @param width The number of bits that each cell occupies. + * + * @param cells The number of cells in the bitvector. */ - explicit CounterVector(unsigned width); + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); /** * Increments a given cell. 
@@ -68,7 +72,7 @@ protected: CounterVector() { } private: - BitVector bits_; + BitVector* bits_; unsigned width_; }; From 751cf612931f021ddf7b5ee51019f20d05e0c309 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 4 Jun 2013 15:30:27 -0700 Subject: [PATCH 11/73] Add more serialization implementation. --- src/BloomFilter.cc | 93 ++++++++++++++++++++++++++++++++-------------- src/BloomFilter.h | 56 +++++++++++++++++++++++----- src/NetVar.h | 1 + src/OpaqueVal.cc | 18 ++++++--- src/OpaqueVal.h | 1 + src/SerialTypes.h | 2 + 6 files changed, 129 insertions(+), 42 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 78048ee588..64f0e1c67b 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -46,12 +46,23 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(&bits_) ) + if ( ! SERIALIZE(bits_) ) return false; return SERIALIZE(static_cast(width_)); } @@ -60,9 +71,9 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); return false; - // TODO: Ask Robin how to unserialize non-pointer members. - //if ( ! UNSERIALIZE(&bits_) ) - // return false; + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; uint64 width; if ( ! 
UNSERIALIZE(&width) ) return false; @@ -90,6 +101,18 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const return h; } + +BloomFilter::BloomFilter(size_t k) + : hash_(new hash_policy(k)) + { + } + +BloomFilter::~BloomFilter() + { + if ( hash_ ) + delete hash_; + } + bool BloomFilter::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); @@ -101,24 +124,21 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) SerialObj::Unserialize(info, SER_BLOOMFILTER)); } -// FIXME: should abstract base classes also have IMPLEMENT_SERIAL? -//IMPLEMENT_SERIAL(BloomFilter, SER_BLOOMFILTER) - bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(hash_) ) - // return false; - return SERIALIZE(static_cast(elements_)); + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE(static_cast(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - // TODO: Make the hash policy serializable. - //if ( ! hash_ = HashPolicy::Unserialize(info) ) - // return false; + uint16 k; + if ( ! 
UNSERIALIZE(&k) ) + return false; + hash_ = new hash_policy(static_cast(k)); uint64 elements; if ( UNSERIALIZE(&elements) ) return false; @@ -126,7 +146,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return true; } -size_t BasicBloomFilter::Cells(double fp, size_t capacity) +size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); @@ -138,9 +158,16 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return round(frac * std::log(2)); } -BasicBloomFilter::BasicBloomFilter(size_t cells, HashPolicy* hash) - : BloomFilter(hash), bits_(cells) +BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) + : BloomFilter(K(M(fp, capacity), capacity)) { + bits_ = new BitVector(M(fp, capacity)); + } + +BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) + : BloomFilter(K(cells, capacity)) + { + bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -148,38 +175,50 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - // TODO: Make the hash policy serializable. - //if ( ! SERIALIZE(&bits_) ) - // return false; - return true; + return SERIALIZE(bits_); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - // TODO: Non-pointer member deserialization? - return true; + bits_ = BitVector::Unserialize(info); + return bits_ == NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_.set(h[i] % h.size()); + bits_->Set(h[i] % h.size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! bits_[h[i] % h.size()] ) + if ( ! 
(*bits_)[h[i] % h.size()] ) return 0; return 1; } +IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) + +bool CountingBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + return SERIALIZE(cells_); + } + +bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + cells_ = CounterVector::Unserialize(info); + return cells_ == NULL; + } + void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_.Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % h.size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -188,7 +227,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_.Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); if ( cnt < min ) min = cnt; } diff --git a/src/BloomFilter.h b/src/BloomFilter.h index b4f82efee9..77c6bc4f56 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -151,9 +151,13 @@ private: /** * The abstract base class for Bloom filters. */ -class BloomFilter : SerialObj { +class BloomFilter : public SerialObj { public: - virtual ~BloomFilter() { delete hash_; } + // At this point we won't let the user choose the hash policy, but we might + // open up the interface in the future. + typedef DoubleHashing hash_policy; + + virtual ~BloomFilter(); /** * Adds an element of type T to the Bloom filter. 
@@ -193,10 +197,10 @@ public: static BloomFilter* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(BloomFilter); + DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter() { }; - BloomFilter(HashPolicy* hash) : hash_(hash) { } + BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; @@ -211,10 +215,42 @@ private: */ class BasicBloomFilter : public BloomFilter { public: - static size_t Cells(double fp, size_t capacity); + /** + * Computes the number of cells based a given false-positive rate and + * capacity. In the literature, this parameter often has the name *M*. + * + * @param fp The false-positive rate. + * + * @param capacity The number of exepected elements. + * + * Returns: The number cells needed to support a false-positive rate of *fp* + * with at most *capacity* elements. + */ + static size_t M(double fp, size_t capacity); + + /** + * Computes the optimal number of hash functions based on the number cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * Returns: the optimal number of hash functions for a false-positive rate of + * *fp* for at most *capacity* elements. + */ static size_t K(size_t cells, size_t capacity); - BasicBloomFilter(size_t cells, HashPolicy* hash); + /** + * Constructs a basic Bloom filter with a given false-positive rate and + * capacity. + */ + BasicBloomFilter(double fp, size_t capacity); + + /** + * Constructs a basic Bloom filter with a given number of cells and capacity. 
+ */ + BasicBloomFilter(size_t cells, size_t capacity); protected: DECLARE_SERIAL(BasicBloomFilter); @@ -225,7 +261,7 @@ protected: virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - BitVector bits_; + BitVector* bits_; }; /** @@ -233,18 +269,18 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width, HashPolicy* hash); + CountingBloomFilter(unsigned width); protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter(); + CountingBloomFilter() { } virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; private: - CounterVector cells_; + CounterVector* cells_; }; #endif diff --git a/src/NetVar.h b/src/NetVar.h index 1a20adcaf2..aa2a14ada5 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -249,6 +249,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* bloomfilter_type; // Initializes globals that don't pertain to network/event analysis. extern void init_general_global_var(); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index a5fb65f53b..b4f1290436 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,23 +518,31 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } +BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type) + { + } + BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) { } +BloomFilterVal::~BloomFilterVal() + { + if ( bloom_filter_ ) + delete bloom_filter_; + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - // TODO: implement. - return true; + return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - // TODO: implement. 
- return true; + bloom_filter_ = BloomFilter::Unserialize(info); + return bloom_filter_ == NULL; } - diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 1c9c0361cc..68b42a8a49 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -112,6 +112,7 @@ private: class BloomFilterVal : public OpaqueVal { public: BloomFilterVal(); + ~BloomFilterVal(); protected: friend class Val; diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 171113ab6a..859145f19f 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -53,6 +53,7 @@ SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0xa000) SERIAL_IS(BLOOMFILTER, 0xa100) SERIAL_IS(BASICBLOOMFILTER, 0xa200) +SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -211,5 +212,6 @@ SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) SERIAL_CONST2(BLOOMFILTER) SERIAL_CONST2(BASICBLOOMFILTER) +SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif From 880d02f7204d21fc0e69f08ac78e963042df4f16 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 16:16:55 -0700 Subject: [PATCH 12/73] Associate a Comphash with a BloomFilterVal. We also keep track of the Bloom filter's element type inside each value. The first use of the BiF bloomfilter_add will "typify" the Bloom filter and lock the Bloom filter's type to the element type. 
--- src/BloomFilter.cc | 15 ++++++++++++ src/BloomFilter.h | 3 ++- src/OpaqueVal.cc | 60 ++++++++++++++++++++++++++++++++++++++++++++-- src/OpaqueVal.h | 18 ++++++++++++-- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 64f0e1c67b..74fa6fb255 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -199,6 +199,21 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } +CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, + size_t width) + : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), + capacity)) + { + cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); + } + +CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, + size_t width) + : BloomFilter(BasicBloomFilter::K(cells, capacity)) + { + cells_ = new CounterVector(width, cells); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 77c6bc4f56..14b0ac3281 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -269,7 +269,8 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - CountingBloomFilter(unsigned width); + CountingBloomFilter(double fp, size_t capacity, size_t width); + CountingBloomFilter(size_t cells, size_t capacity, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index b4f1290436..abfd8f320f 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,31 +518,87 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } -BloomFilterVal::BloomFilterVal() : OpaqueVal(bloomfilter_type) +BloomFilterVal::BloomFilterVal(BloomFilter* bf) + : OpaqueVal(bloomfilter_type), bloom_filter_(bf) { } -BloomFilterVal::BloomFilterVal(OpaqueType* t) : OpaqueVal(t) +BloomFilterVal::BloomFilterVal(OpaqueType* t) + : OpaqueVal(t) { } +bool BloomFilterVal::Typify(BroType* type) + { + if ( 
type_ ) + return false; + type_ = type; + TypeList* tl = new TypeList(type_); + tl->Append(type_); + hash_ = new CompositeHash(tl); + Unref(tl); + return true; + } + +BroType* BloomFilterVal::Type() const + { + return type_; + } + +void BloomFilterVal::Add(const Val* val) + { + HashKey* key = hash_->ComputeHash(val, 1); + bloom_filter_->Add(key->Hash()); + } + +size_t BloomFilterVal::Count(const Val* val) const + { + HashKey* key = hash_->ComputeHash(val, 1); + return bloom_filter_->Count(key->Hash()); + } + +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, + const BloomFilterVal* second) +{ + assert(! "not yet implemented"); + return NULL; + } + BloomFilterVal::~BloomFilterVal() { + if ( type_ ) + Unref(type_); + if ( hash_ ) + delete hash_; if ( bloom_filter_ ) delete bloom_filter_; } +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type) + { + } + IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + if ( ! SERIALIZE(type_) ) + return false; return SERIALIZE(bloom_filter_); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); + type_ = BroType::Unserialize(info); + if ( ! 
type_ ) + return false; + TypeList* tl = new TypeList(type_); + tl->Append(type_); + hash_ = new CompositeHash(tl); + Unref(tl); bloom_filter_ = BloomFilter::Unserialize(info); return bloom_filter_ == NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 68b42a8a49..e97a530f3a 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -110,18 +110,32 @@ private: }; class BloomFilterVal : public OpaqueVal { + BloomFilterVal(const BloomFilterVal&); + BloomFilterVal& operator=(const BloomFilterVal&); public: - BloomFilterVal(); + static BloomFilterVal* Merge(const BloomFilterVal* first, + const BloomFilterVal* second); + + BloomFilterVal(BloomFilter* bf); ~BloomFilterVal(); + bool Typify(BroType* type); + BroType* Type() const; + + void Add(const Val* val); + size_t Count(const Val* val) const; + protected: friend class Val; + BloomFilterVal(); BloomFilterVal(OpaqueType* t); DECLARE_SERIAL(BloomFilterVal); private: - BloomFilter* bloom_filter_; + BroType* type_; + CompositeHash* hash_; + BloomFilter* bloom_filter_; }; #endif From 3d9764213191070a6b68375c0d0ae8c3193528e3 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 16:26:16 -0700 Subject: [PATCH 13/73] Add Bloom filter BiFs. --- src/bro.bif | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index d9558106a7..60fb985dda 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5730,3 +5730,92 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr } %} +# =========================================================================== +# +# Bloom Filter Functions +# +# =========================================================================== + +%%{ +#include "BloomFilter.h" +%%} + +## Initializes a Bloom filter data structure. +## +## fp: The desired false-positive rate. +## +## capacity: the maximum number of elements that guarantees a false-positive +## rate of *fp*. +## +## Returns: A Bloom filter handle. 
+function bloomfilter_init%(fp: double, capacity: count, + max: count &default=1%): opaque of bloomfilter + %{ + BloomFilter* bf; + if ( max == 1 ) + { + bf = new BasicBloomFilter(fp, capacity); + } + else + { + uint16 width = 0; + while ( max >>= 1 ) + ++width; + bf = new CountingBloomFilter(fp, capacity, width); + } + return new BloomFilterVal(bf); + %} + +## Adds an element to a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to add. +function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + if ( ! bfv->Type() || ! bfv->Typify(x->Type()) ) + reporter->Error("failed to set Bloom filter type"); + else if ( bfv->Type() != x->Type() ) + reporter->Error("incompatible Bloom filter types"); + bfv->Add(x); + return 0; + %} + +## Retrieves the counter for a given element in a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to count. +## +## Returns: the counter associated with *x* in *bf*. +function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count + %{ + BloomFilterVal* bfv = static_cast(bf); + if ( ! bfv->Type() ) + reporter->Error("cannot perform lookup on untyped Bloom filter"); + else if ( bfv->Type() != x->Type() ) + reporter->Error("incompatible Bloom filter types"); + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + %} + +## Merges two Bloom filters. +## +## bf1: The first Bloom filter handle. +## +## bf2: The second Bloom filter handle. +## +## Returns: The union of *bf1* and *bf2*. +function bloomfilter_merge%(bf1: opaque of bloomfilter, + bf2: opaque of bloomfilter%): opaque of bloomfilter + %{ + const BloomFilterVal* bfv1 = static_cast(bf1); + const BloomFilterVal* bfv2 = static_cast(bf2); + if ( ! bfv1->Type() ) + reporter->Error("The first Bloom filter has not yet been typed"); + if ( ! 
bfv2->Type() ) + reporter->Error("The second Bloom filter has not yet been typed"); + else if ( bfv1->Type() != bfv2->Type() ) + reporter->Error("incompatible Bloom filter types"); + return BloomFilterVal::Merge(bfv1, bfv2); + %} From d5126a13395f899fab12f081248336e687222ed9 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 5 Jun 2013 17:45:10 -0700 Subject: [PATCH 14/73] Fix some BiF issues. --- src/bro.bif | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 60fb985dda..08b532eaea 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5774,12 +5774,18 @@ function bloomfilter_init%(fp: double, capacity: count, function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() || ! bfv->Typify(x->Type()) ) + if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) + { reporter->Error("failed to set Bloom filter type"); + return NULL; + } else if ( bfv->Type() != x->Type() ) + { reporter->Error("incompatible Bloom filter types"); + return NULL; + } bfv->Add(x); - return 0; + return NULL; %} ## Retrieves the counter for a given element in a Bloom filter. @@ -5812,9 +5818,9 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); if ( ! bfv1->Type() ) - reporter->Error("The first Bloom filter has not yet been typed"); + reporter->Error("first Bloom filter has not yet been typed"); if ( ! bfv2->Type() ) - reporter->Error("The second Bloom filter has not yet been typed"); + reporter->Error("second Bloom filter has not yet been typed"); else if ( bfv1->Type() != bfv2->Type() ) reporter->Error("incompatible Bloom filter types"); return BloomFilterVal::Merge(bfv1, bfv2); From 012e09c5c40bdf0acd29a34bf2271417ed36d770 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 12:56:46 -0700 Subject: [PATCH 15/73] Small fixes and simplifications. 
--- src/BloomFilter.cc | 2 +- src/BloomFilter.h | 17 +++++++---------- src/OpaqueVal.cc | 1 + 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 74fa6fb255..e549553bf4 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -140,7 +140,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) return false; hash_ = new hash_policy(static_cast(k)); uint64 elements; - if ( UNSERIALIZE(&elements) ) + if ( ! UNSERIALIZE(&elements) ) return false; elements_ = static_cast(elements); return true; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 14b0ac3281..3e2bd5de90 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -94,15 +94,14 @@ protected: * A functor that computes a universal hash function. * @tparam Codomain An integral type. */ - template class Hasher { public: - template - Codomain operator()(const Domain& x) const + template + HashType operator()(const T& x) const { return h3_(&x, sizeof(x)); } - Codomain operator()(const void* x, size_t n) const + HashType operator()(const void* x, size_t n) const { return h3_(x, n); } @@ -110,7 +109,7 @@ protected: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior // so I'll just copy it verbatim. 
(Matthias) - H3 h3_; + H3 h3_; }; HashPolicy(size_t k) : k_(k) { } @@ -125,12 +124,11 @@ private: class DefaultHashing : public HashPolicy { public: DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - virtual ~DefaultHashing() { } virtual HashVector Hash(const void* x, size_t n) const; private: - std::vector< Hasher > hashers_; + std::vector hashers_; }; /** @@ -139,13 +137,12 @@ private: class DoubleHashing : public HashPolicy { public: DoubleHashing(size_t k) : HashPolicy(k) { } - virtual ~DoubleHashing() { } virtual HashVector Hash(const void* x, size_t n) const; private: - Hasher hasher1_; - Hasher hasher2_; + Hasher hasher1_; + Hasher hasher2_; }; /** diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index abfd8f320f..03a6e51ce8 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -533,6 +533,7 @@ bool BloomFilterVal::Typify(BroType* type) if ( type_ ) return false; type_ = type; + type_->Ref(); TypeList* tl = new TypeList(type_); tl->Append(type_); hash_ = new CompositeHash(tl); From f211b856c9ae35e68ea4af194e08157fdefef7e6 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:13:36 -0700 Subject: [PATCH 16/73] Catch invalid values of the false-positive rate. --- src/bro.bif | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 08b532eaea..74219dd2b7 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5751,6 +5751,11 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr function bloomfilter_init%(fp: double, capacity: count, max: count &default=1%): opaque of bloomfilter %{ + if ( fp < 0.0 || fp > 1.0 ) + { + reporter->Error("false-positive rate must take value between 0 and 1"); + return NULL; + } BloomFilter* bf; if ( max == 1 ) { From 7ce986e31f59b1f1000ec335a4efc1f0f5e0c011 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:21:27 -0700 Subject: [PATCH 17/73] Fix modding. 
--- src/BloomFilter.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index e549553bf4..7c347927c3 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -188,13 +188,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - bits_->Set(h[i] % h.size()); + bits_->Set(h[i] % bits_->Size()); } size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const { for ( size_t i = 0; i < h.size(); ++i ) - if ( ! (*bits_)[h[i] % h.size()] ) + if ( ! (*bits_)[h[i] % bits_->Size()] ) return 0; return 1; } @@ -233,7 +233,7 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % h.size(), 1); + cells_->Increment(h[i] % cells_->Size(), 1); } size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const @@ -242,7 +242,7 @@ size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - CounterVector::size_type cnt = cells_->Count(h[i] % h.size()); + CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; } From fcf1807fc8ac320a6c787360e8b78509b58b0a5a Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 13:39:00 -0700 Subject: [PATCH 18/73] Fix hasher usage and narrow interface. 
--- src/BloomFilter.cc | 4 ++-- src/BloomFilter.h | 10 +--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 7c347927c3..c684c82c0e 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -93,8 +93,8 @@ HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const { - HashType h1 = hasher1_(x); - HashType h2 = hasher2_(x); + HashType h1 = hasher1_(x, n); + HashType h2 = hasher2_(x, n); HashVector h(K(), 0); for ( size_t i = 0; i < h.size(); ++i ) h[i] = h1 + i * h2; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 3e2bd5de90..fd1cb31d61 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -96,15 +96,7 @@ protected: */ class Hasher { public: - template - HashType operator()(const T& x) const - { - return h3_(&x, sizeof(x)); - } - HashType operator()(const void* x, size_t n) const - { - return h3_(x, n); - } + HashType operator()(const void* x, size_t n) const { return h3_(x, n); } private: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior From 0d299eca57ddab9dfb17c1f6c99139c481dccb49 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 14:54:25 -0700 Subject: [PATCH 19/73] Correct computation of k hash functions. --- src/BloomFilter.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c684c82c0e..f1db71ae1d 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -3,13 +3,6 @@ #include #include "Serializer.h" -// Backport C++11's std::round(). -namespace { -template -T round(double x) { return (x > 0.0) ? 
(x + 0.5) : (x - 0.5); } -} // namespace - - CounterVector::CounterVector(size_t width, size_t cells) : bits_(new BitVector(width * cells)), width_(width) { @@ -155,7 +148,7 @@ size_t BasicBloomFilter::M(double fp, size_t capacity) size_t BasicBloomFilter::K(size_t cells, size_t capacity) { double frac = static_cast(cells) / static_cast(capacity); - return round(frac * std::log(2)); + return std::ceil(frac * std::log(2)); } BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) From e15f03d980e8bb63d00969268056b2e9592b2f85 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:02:11 -0700 Subject: [PATCH 20/73] Cleanup BiFs. --- src/bro.bif | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/bro.bif b/src/bro.bif index 5c1280645e..8bd9575498 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5026,16 +5026,11 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) - { reporter->Error("failed to set Bloom filter type"); - return NULL; - } else if ( bfv->Type() != x->Type() ) - { reporter->Error("incompatible Bloom filter types"); - return NULL; - } - bfv->Add(x); + else + bfv->Add(x); return NULL; %} @@ -5048,12 +5043,14 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## Returns: the counter associated with *x* in *bf*. function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ - BloomFilterVal* bfv = static_cast(bf); + const BloomFilterVal* bfv = static_cast(bf); if ( ! bfv->Type() ) reporter->Error("cannot perform lookup on untyped Bloom filter"); else if ( bfv->Type() != x->Type() ) reporter->Error("incompatible Bloom filter types"); - return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + else + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + return new Val(0, TYPE_COUNT); %} ## Merges two Bloom filters. 
@@ -5068,11 +5065,9 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, %{ const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); - if ( ! bfv1->Type() ) - reporter->Error("first Bloom filter has not yet been typed"); - if ( ! bfv2->Type() ) - reporter->Error("second Bloom filter has not yet been typed"); - else if ( bfv1->Type() != bfv2->Type() ) + if ( bfv1->Type() != bfv2->Type() ) reporter->Error("incompatible Bloom filter types"); - return BloomFilterVal::Merge(bfv1, bfv2); + else + return BloomFilterVal::Merge(bfv1, bfv2); + return NULL; %} From 86becdd6e467fabc475eb81baea6d3586b2d74e7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:08:24 -0700 Subject: [PATCH 21/73] Add tests. --- testing/btest/bifs/bloomfilter.bro | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 testing/btest/bifs/bloomfilter.bro diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro new file mode 100644 index 0000000000..6abbdd69f7 --- /dev/null +++ b/testing/btest/bifs/bloomfilter.bro @@ -0,0 +1,38 @@ +# @TEST-EXEC: bro -b %INPUT >output +# @TEST-EXEC: btest-diff output + +event bro_init() + { + # Basic usage with counts. + local bf_cnt = bloomfilter_init(0.1, 1000); + bloomfilter_add(bf_cnt, 42); + bloomfilter_add(bf_cnt, 84); + bloomfilter_add(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 0); + print bloomfilter_lookup(bf_cnt, 42); + print bloomfilter_lookup(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 336); + bloomfilter_add(bf_cnt, 0.5); # Type mismatch + bloomfilter_add(bf_cnt, "foo"); # Type mismatch + + # Basic usage with strings. 
+ local bf_str = bloomfilter_init(0.9, 10); + bloomfilter_add(bf_str, "foo"); + bloomfilter_add(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "foo"); + print bloomfilter_lookup(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "baz"); + print bloomfilter_lookup(bf_str, "qux"); + bloomfilter_add(bf_str, 0.5); # Type mismatch + bloomfilter_add(bf_str, 100); # Type mismatch + + # Edge cases. + local bf_edge0 = bloomfilter_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_init(0.9999999, 1); + local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + + # Invalid parameters. + local bf_bug0 = bloomfilter_init(-0.5, 42); + local bf_bug1 = bloomfilter_init(1.1, 42); + } From f2d536d2da1118b1d5feb143f751d47dc344232b Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 6 Jun 2013 15:22:04 -0700 Subject: [PATCH 22/73] Add missing initializations. --- src/BloomFilter.cc | 15 +++++++++++++++ src/BloomFilter.h | 6 +++--- src/OpaqueVal.cc | 25 +++++++++++++++++-------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index f1db71ae1d..40772fecb6 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -95,6 +95,11 @@ HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const } +BloomFilter::BloomFilter() + : hash_(NULL) + { + } + BloomFilter::BloomFilter(size_t k) : hash_(new hash_policy(k)) { @@ -151,6 +156,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +BasicBloomFilter::BasicBloomFilter() + : bits_(NULL) + { + } + BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) : BloomFilter(K(M(fp, capacity), capacity)) { @@ -192,6 +202,11 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } +CountingBloomFilter::CountingBloomFilter() + : cells_(NULL) + { + } + CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, 
size_t width) : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), diff --git a/src/BloomFilter.h b/src/BloomFilter.h index fd1cb31d61..c0101cadf8 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -188,7 +188,7 @@ public: protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); - BloomFilter() { }; + BloomFilter(); BloomFilter(size_t k); virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; @@ -244,7 +244,7 @@ public: protected: DECLARE_SERIAL(BasicBloomFilter); - BasicBloomFilter() { } + BasicBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; @@ -264,7 +264,7 @@ public: protected: DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter() { } + CountingBloomFilter(); virtual void AddImpl(const HashPolicy::HashVector& h); virtual size_t CountImpl(const HashPolicy::HashVector& h) const; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 03a6e51ce8..38ea93d000 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -518,13 +518,27 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } -BloomFilterVal::BloomFilterVal(BloomFilter* bf) - : OpaqueVal(bloomfilter_type), bloom_filter_(bf) +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type), + type_(NULL), + hash_(NULL), + bloom_filter_(NULL) { } BloomFilterVal::BloomFilterVal(OpaqueType* t) - : OpaqueVal(t) + : OpaqueVal(t), + type_(NULL), + hash_(NULL), + bloom_filter_(NULL) + { + } + +BloomFilterVal::BloomFilterVal(BloomFilter* bf) + : OpaqueVal(bloomfilter_type), + type_(NULL), + hash_(NULL), + bloom_filter_(bf) { } @@ -575,11 +589,6 @@ BloomFilterVal::~BloomFilterVal() delete bloom_filter_; } -BloomFilterVal::BloomFilterVal() - : OpaqueVal(bloomfilter_type) - { - } - IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const From c6381055380f889c4891efcf83da512597ae64d6 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: 
Mon, 10 Jun 2013 12:51:41 -0700 Subject: [PATCH 23/73] Document max parameter in bloomfilter_init. --- src/bro.bif | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/bro.bif b/src/bro.bif index 8bd9575498..9b80c90dbf 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4993,6 +4993,13 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## +## max: The maximum counter value associated with each each element in the +## Bloom filter. If greater than 1, each element in the set has a counter of +## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then +## becomes a cell of size *w* bits. Since the number number of cells is a +## function ## of *fp* and *capacity*, it is important to consider the effects +## on space when tuning this value. +## ## Returns: A Bloom filter handle. function bloomfilter_init%(fp: double, capacity: count, max: count &default=1%): opaque of bloomfilter From d25984ba45643be524788b73d7cebc1278a78810 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 12:55:03 -0700 Subject: [PATCH 24/73] Update baseline for unit tests. 
--- testing/btest/Baseline/bifs.bloomfilter/output | 8 ++++++++ testing/btest/bifs/bloomfilter.bro | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 testing/btest/Baseline/bifs.bloomfilter/output diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output new file mode 100644 index 0000000000..65aaa8b07c --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -0,0 +1,8 @@ +0 +1 +1 +0 +1 +1 +1 +1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 6abbdd69f7..769cec1200 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -21,8 +21,8 @@ event bro_init() bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); - print bloomfilter_lookup(bf_str, "qux"); + print bloomfilter_lookup(bf_str, "baz"); # FP + print bloomfilter_lookup(bf_str, "qux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch From 4c21576c120a0dcc9725308549fd57a8bf9072a1 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 20:14:34 -0700 Subject: [PATCH 25/73] Add Bloomfilter serialization test code. 
--- testing/btest/istate/opaque.bro | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index 84818a5e70..ac3b2c0874 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -12,6 +12,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements: set[string] &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_done() { local out = open("output.log"); @@ -36,6 +39,9 @@ event bro_done() print out, entropy_test_finish(entropy_handle); else print out, "entropy_test_add() failed"; + + for ( e in bloomfilter_elements ) + print bloomfilter_lookup(bloomfilter_handle, e); } @TEST-END-FILE @@ -47,6 +53,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements = { "foo", "bar", "baz" } &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_init() { local out = open("expected.log"); @@ -72,6 +81,10 @@ event bro_init() entropy_handle = entropy_test_init(); if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; + + bloomfilter_handle = bloomfilter_init(0.1, 100); + for ( e in bloomfilter_elements ) + bloomfilter_add(bloomfilter_handle, e); } @TEST-END-FILE From 22afbe42dd91e668de8c72417b6a8ff8b544dd99 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 20:15:13 -0700 Subject: [PATCH 26/73] A number of tweaks of the serialization code. 
--- src/BitVector.h | 2 +- src/BloomFilter.cc | 17 ++++++++--------- src/BloomFilter.h | 2 +- src/OpaqueVal.cc | 10 ++++++---- src/SerialTypes.h | 8 ++++---- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/BitVector.h b/src/BitVector.h index 8315a151f0..83fec44a0d 100644 --- a/src/BitVector.h +++ b/src/BitVector.h @@ -8,7 +8,7 @@ /** * A vector of bits. */ -class BitVector : SerialObj { +class BitVector : public SerialObj { public: typedef size_t block_type; typedef size_t size_type; diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 40772fecb6..1d73734236 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -55,7 +55,7 @@ IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! SERIALIZE(bits_) ) + if ( ! bits_->Serialize(info) ) return false; return SERIALIZE(static_cast(width_)); } @@ -63,14 +63,13 @@ bool CounterVector::DoSerialize(SerialInfo* info) const bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - return false; bits_ = BitVector::Unserialize(info); if ( ! bits_ ) return false; uint64 width; if ( ! UNSERIALIZE(&width) ) return false; - width_ = static_cast(width); + width_ = static_cast(width); return true; } @@ -127,7 +126,7 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); if ( ! 
SERIALIZE(static_cast(hash_->K())) ) return false; - return SERIALIZE(static_cast(elements_)); + return SERIALIZE(static_cast(elements_)); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -178,14 +177,14 @@ IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return SERIALIZE(bits_); + return bits_->Serialize(info); } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); bits_ = BitVector::Unserialize(info); - return bits_ == NULL; + return bits_ != NULL; } void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) @@ -227,15 +226,15 @@ IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const { - DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return SERIALIZE(cells_); + DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); + return cells_->Serialize(info); } bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); cells_ = CounterVector::Unserialize(info); - return cells_ == NULL; + return cells_ != NULL; } void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index c0101cadf8..4a83ba904b 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -73,7 +73,7 @@ protected: private: BitVector* bits_; - unsigned width_; + size_t width_; }; /** diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 38ea93d000..76936dfb78 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -574,7 +574,7 @@ size_t BloomFilterVal::Count(const Val* val) const BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, const BloomFilterVal* second) -{ + { assert(! 
"not yet implemented"); return NULL; } @@ -594,14 +594,15 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - if ( ! SERIALIZE(type_) ) + if ( ! type_->Serialize(info) ) return false; - return SERIALIZE(bloom_filter_); + return bloom_filter_->Serialize(info); } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); + type_ = BroType::Unserialize(info); if ( ! type_ ) return false; @@ -609,6 +610,7 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) tl->Append(type_); hash_ = new CompositeHash(tl); Unref(tl); + bloom_filter_ = BloomFilter::Unserialize(info); - return bloom_filter_ == NULL; + return bloom_filter_ != NULL; } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 859145f19f..9e4aef5b3b 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -50,10 +50,10 @@ SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) -SERIAL_IS(COUNTERVECTOR, 0xa000) -SERIAL_IS(BLOOMFILTER, 0xa100) -SERIAL_IS(BASICBLOOMFILTER, 0xa200) -SERIAL_IS(COUNTINGBLOOMFILTER, 0xa300) +SERIAL_IS(COUNTERVECTOR, 0x1600) +SERIAL_IS(BLOOMFILTER, 0x1700) +SERIAL_IS(BASICBLOOMFILTER, 0x1800) +SERIAL_IS(COUNTINGBLOOMFILTER, 0x1900) // These are the externally visible types. const SerialType SER_NONE = 0; From 14a701a237dfdd745a842a11f363b93d01926505 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 10 Jun 2013 22:24:23 -0700 Subject: [PATCH 27/73] Implement value merging. The actual BloomFilter merging still lacks, this is just the first step in the right direction from the user interface side. 
--- src/BloomFilter.cc | 27 ++++++++++++++++++++------- src/BloomFilter.h | 18 ++++++------------ src/OpaqueVal.cc | 17 ++++++++++++++--- src/OpaqueVal.h | 17 ++++++++++++++--- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 1d73734236..e55db71e46 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -124,9 +124,7 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) - return false; - return SERIALIZE(static_cast(elements_)); + return SERIALIZE(static_cast(hash_->K())); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -136,10 +134,6 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) if ( ! UNSERIALIZE(&k) ) return false; hash_ = new hash_policy(static_cast(k)); - uint64 elements; - if ( ! UNSERIALIZE(&elements) ) - return false; - elements_ = static_cast(elements); return true; } @@ -155,6 +149,17 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y) + { + BasicBloomFilter* result = new BasicBloomFilter(); + result->bits_ = new BitVector(*x->bits_ | *y->bits_); + // TODO: implement the hasher pool and make sure the new result gets the same + // number of (equal) hash functions. + //assert(x->hash_ == y->hash_); + return result; + } + BasicBloomFilter::BasicBloomFilter() : bits_(NULL) { @@ -201,6 +206,14 @@ size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const return 1; } + +CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y) +{ + assert(! 
"not yet implemented"); + return NULL; +} + CountingBloomFilter::CountingBloomFilter() : cells_(NULL) { diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 4a83ba904b..3b5d9efa71 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -155,7 +155,6 @@ public: template void Add(const T& x) { - ++elements_; AddImpl(hash_->Hash(&x, sizeof(x))); } @@ -172,16 +171,6 @@ public: return CountImpl(hash_->Hash(&x, sizeof(x))); } - /** - * Retrieves the number of elements added to the Bloom filter. - * - * @return The number of elements in this Bloom filter. - */ - size_t Size() const - { - return elements_; - } - bool Serialize(SerialInfo* info) const; static BloomFilter* Unserialize(UnserialInfo* info); @@ -196,7 +185,6 @@ protected: private: HashPolicy* hash_; - size_t elements_; }; /** @@ -230,6 +218,9 @@ public: */ static size_t K(size_t cells, size_t capacity); + static BasicBloomFilter* Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y); + /** * Constructs a basic Bloom filter with a given false-positive rate and * capacity. @@ -258,6 +249,9 @@ private: */ class CountingBloomFilter : public BloomFilter { public: + static CountingBloomFilter* Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y); + CountingBloomFilter(double fp, size_t capacity, size_t width); CountingBloomFilter(size_t cells, size_t capacity, size_t width); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 76936dfb78..9dd5c7f980 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -572,10 +572,21 @@ size_t BloomFilterVal::Count(const Val* val) const return bloom_filter_->Count(key->Hash()); } -BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* first, - const BloomFilterVal* second) +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, + const BloomFilterVal* y) { - assert(! 
"not yet implemented"); + if ( x->Type() != y->Type() ) + { + reporter->InternalError("cannot merge Bloom filters with different types"); + return NULL; + } + + BloomFilterVal* result; + if ( (result = DoMerge(x, y)) ) + return result; + else if ( (result = DoMerge(x, y)) ) + return result; + return NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index e97a530f3a..4b45cad519 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -113,10 +113,10 @@ class BloomFilterVal : public OpaqueVal { BloomFilterVal(const BloomFilterVal&); BloomFilterVal& operator=(const BloomFilterVal&); public: - static BloomFilterVal* Merge(const BloomFilterVal* first, - const BloomFilterVal* second); + static BloomFilterVal* Merge(const BloomFilterVal* x, + const BloomFilterVal* y); - BloomFilterVal(BloomFilter* bf); + explicit BloomFilterVal(BloomFilter* bf); ~BloomFilterVal(); bool Typify(BroType* type); @@ -133,6 +133,17 @@ protected: DECLARE_SERIAL(BloomFilterVal); private: + template + static BloomFilterVal* DoMerge(const BloomFilterVal* x, + const BloomFilterVal* y) + { + const T* a = dynamic_cast(x->bloom_filter_); + const T* b = dynamic_cast(y->bloom_filter_); + if ( a && b ) + return new BloomFilterVal(T::Merge(a, b)); + return NULL; + } + BroType* type_; CompositeHash* hash_; BloomFilter* bloom_filter_; From 1f90b539a8574eeadd4b20ae9f379b0fe08999be Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:06:01 -0700 Subject: [PATCH 28/73] Make H3 class adhere to Bro coding style. 
--- src/H3.h | 89 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/src/H3.h b/src/H3.h index 72d81d519f..50afda5688 100644 --- a/src/H3.h +++ b/src/H3.h @@ -65,53 +65,52 @@ template class H3 { T byte_lookup[N][H3_BYTE_RANGE]; public: - H3(); - T operator()(const void* data, size_t size, size_t offset = 0) const - { - const unsigned char *p = static_cast(data); - T result = 0; + H3() + { + T bit_lookup[N * CHAR_BIT]; - // loop optmized with Duff's Device - register unsigned n = (size + 7) / 8; - switch (size % 8) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; - case 6: result ^= byte_lookup[offset++][*p++]; - case 5: result ^= byte_lookup[offset++][*p++]; - case 4: result ^= byte_lookup[offset++][*p++]; - case 3: result ^= byte_lookup[offset++][*p++]; - case 2: result ^= byte_lookup[offset++][*p++]; - case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); - } + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) + { + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); + } - return result; - } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0? 
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + + T operator()(const void* data, size_t size, size_t offset = 0) const + { + const unsigned char *p = static_cast(data); + T result = 0; + + // loop optmized with Duff's Device + register unsigned n = (size + 7) / 8; + switch (size % 8) { + case 0: do { result ^= byte_lookup[offset++][*p++]; + case 7: result ^= byte_lookup[offset++][*p++]; + case 6: result ^= byte_lookup[offset++][*p++]; + case 5: result ^= byte_lookup[offset++][*p++]; + case 4: result ^= byte_lookup[offset++][*p++]; + case 3: result ^= byte_lookup[offset++][*p++]; + case 2: result ^= byte_lookup[offset++][*p++]; + case 1: result ^= byte_lookup[offset++][*p++]; + } while (--n > 0); + } + + return result; + } }; -template -H3::H3() -{ - T bit_lookup[N * CHAR_BIT]; - - for (size_t bit = 0; bit < N * CHAR_BIT; bit++) { - bit_lookup[bit] = 0; - for (size_t i = 0; i < sizeof(T)/2; i++) { - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); - } - } - - for (size_t byte = 0; byte < N; byte++) { - for (unsigned val = 0; val < H3_BYTE_RANGE; val++) { - byte_lookup[byte][val] = 0; - for (size_t bit = 0; bit < CHAR_BIT; bit++) { - // Does this mean byte_lookup[*][0] == 0? -RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } - } -} - #endif //H3_H From 529d12037672d34fd4d1ba5f0d291fd6214f41d4 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:07:31 -0700 Subject: [PATCH 29/73] Make H3 seed configurable. 
--- src/H3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/H3.h b/src/H3.h index 50afda5688..11b0cd79a5 100644 --- a/src/H3.h +++ b/src/H3.h @@ -65,7 +65,7 @@ template class H3 { T byte_lookup[N][H3_BYTE_RANGE]; public: - H3() + H3(T seed = bro_random()) { T bit_lookup[N * CHAR_BIT]; @@ -74,7 +74,7 @@ public: bit_lookup[bit] = 0; for ( size_t i = 0; i < sizeof(T)/2; i++ ) // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); } for ( size_t byte = 0; byte < N; byte++ ) From a6d7b7856e87c3a15ba7009ccfb7d6550d1dcfcc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 13 Jun 2013 23:12:00 -0700 Subject: [PATCH 30/73] Update H3 documentation (and minor style nits.) --- src/H3.h | 60 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/src/H3.h b/src/H3.h index 11b0cd79a5..2eda14d276 100644 --- a/src/H3.h +++ b/src/H3.h @@ -49,9 +49,9 @@ // hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed // together to get the same result as hashing the full string. // Any number of hash functions can be created by creating new instances of H3, -// with the same or different template parameters. The hash function is -// randomly generated using bro_random(); you must call init_random_seed() -// before the H3 constructor if you wish to seed it. +// with the same or different template parameters. The hash function +// constructor takes a seed as argument which defaults to a call to +// bro_random(). #ifndef H3_H @@ -62,34 +62,34 @@ // The number of values representable by a byte. 
#define H3_BYTE_RANGE (UCHAR_MAX+1) -template class H3 { - T byte_lookup[N][H3_BYTE_RANGE]; +template +class H3 { public: - H3(T seed = bro_random()) + H3(T seed = bro_random()) + { + T bit_lookup[N * CHAR_BIT]; + + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { - T bit_lookup[N * CHAR_BIT]; - - for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) - { - bit_lookup[bit] = 0; - for ( size_t i = 0; i < sizeof(T)/2; i++ ) - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); - } - - for ( size_t byte = 0; byte < N; byte++ ) - { - for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) - { - byte_lookup[byte][val] = 0; - for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) - // Does this mean byte_lookup[*][0] == 0? -RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0? 
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + T operator()(const void* data, size_t size, size_t offset = 0) const { const unsigned char *p = static_cast(data); @@ -97,7 +97,7 @@ public: // loop optmized with Duff's Device register unsigned n = (size + 7) / 8; - switch (size % 8) { + switch ( size % 8 ) { case 0: do { result ^= byte_lookup[offset++][*p++]; case 7: result ^= byte_lookup[offset++][*p++]; case 6: result ^= byte_lookup[offset++][*p++]; @@ -106,11 +106,13 @@ public: case 3: result ^= byte_lookup[offset++][*p++]; case 2: result ^= byte_lookup[offset++][*p++]; case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); + } while ( --n > 0 ); } return result; } +private: + T byte_lookup[N][H3_BYTE_RANGE]; }; #endif //H3_H From d2d8aff81456413597b09b71557b0caabdb7af3d Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 09:22:48 -0700 Subject: [PATCH 31/73] Add utility function to access first random seed. --- src/util.cc | 13 +++++++++++++ src/util.h | 5 +++++ 2 files changed, 18 insertions(+) diff --git a/src/util.cc b/src/util.cc index de9bd5b679..721ee10a7e 100644 --- a/src/util.cc +++ b/src/util.cc @@ -716,6 +716,8 @@ static bool write_random_seeds(const char* write_file, uint32 seed, static bool bro_rand_determistic = false; static unsigned int bro_rand_state = 0; +static bool first_seed_saved = false; +static unsigned int first_seed = 0; static void bro_srandom(unsigned int seed, bool deterministic) { @@ -800,6 +802,12 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file bro_srandom(seed, seeds_done); + if ( ! first_seed_saved ) + { + first_seed = seed; + first_seed_saved = true; + } + if ( ! 
hmac_key_set ) { MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key); @@ -811,6 +819,11 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file write_file); } +unsigned int initial_seed() + { + return first_seed; +} + bool have_random_seed() { return bro_rand_determistic; diff --git a/src/util.h b/src/util.h index 49bcbf318b..c3eebb04e3 100644 --- a/src/util.h +++ b/src/util.h @@ -165,6 +165,11 @@ extern void hmac_md5(size_t size, const unsigned char* bytes, extern void init_random_seed(uint32 seed, const char* load_file, const char* write_file); +// Retrieves the initial seed computed after the very first call to +// init_random_seed(). Repeated calls to init_random_seed() will not affect the +// return value of this function. +unsigned int initial_seed(); + // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); From 1576239f67ef2641135f95bdd331f3c1a54ee5ad Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 10:19:39 -0700 Subject: [PATCH 32/73] Support seeding for hashers. 
--- src/BloomFilter.cc | 11 +++++++++++ src/BloomFilter.h | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index e55db71e46..eff7eee733 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -74,6 +74,17 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) } +HashPolicy::Hasher::Hasher(size_t seed) + : h3_(seed) +{ +} + +HashPolicy::HashType +HashPolicy::Hasher::operator()(const void* x, size_t n) const + { + return h3_(x, n); + } + HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const { HashVector h(K(), 0); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 3b5d9efa71..65133621f9 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -96,7 +96,9 @@ protected: */ class Hasher { public: - HashType operator()(const void* x, size_t n) const { return h3_(x, n); } + Hasher(size_t seed); + + HashType operator()(const void* x, size_t n) const; private: // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in // Hash.h. I do not know how this value impacts the hash function behavior From 79a6a26f9f70a937551a94a5dc83b2c5dafe1414 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Fri, 14 Jun 2013 10:20:33 -0700 Subject: [PATCH 33/73] H3 does not check for zero length input. --- src/BloomFilter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index eff7eee733..6a44defc6d 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -82,7 +82,7 @@ HashPolicy::Hasher::Hasher(size_t seed) HashPolicy::HashType HashPolicy::Hasher::operator()(const void* x, size_t n) const { - return h3_(x, n); + return n == 0 ? 
0 : h3_(x, n); } HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const From 9f740642891664ee8f482285523969793d0063d0 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 14:02:14 -0700 Subject: [PATCH 34/73] Expose Bro's linear congruence PRNG as utility function. It was previously not possible to crank the wheel on the PRNG in a deterministic way without affecting the globally unique seed. The new extra utility function bro_prng takes a state in the form of a long int and returns the new PRNG state, now allowing arbitrary code parts to use the random number functionality. This commit also fixes a problem in the H3 constructor, which requires use of multiple seeds. The single seed passed in now serves as seed to crank out as many values as needed using bro_prng. --- src/H3.h | 1 + src/util.cc | 29 ++++++++++++++++++----------- src/util.h | 7 +++++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/H3.h b/src/H3.h index 2eda14d276..e2dc865147 100644 --- a/src/H3.h +++ b/src/H3.h @@ -72,6 +72,7 @@ public: for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { bit_lookup[bit] = 0; + seed = bro_prng(seed); for ( size_t i = 0; i < sizeof(T)/2; i++ ) // assume random() returns at least 16 random bits bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); diff --git a/src/util.cc b/src/util.cc index 721ee10a7e..cdd257d94f 100644 --- a/src/util.cc +++ b/src/util.cc @@ -829,22 +829,29 @@ bool have_random_seed() return bro_rand_determistic; } +long int bro_prng(long int state) + { + // Use our own simple linear congruence PRNG to make sure we are + // predictable across platforms. + static const long int m = 2147483647; + static const long int a = 16807; + const long int q = m / a; + const long int r = m % a; + + state = a * ( state % q ) - r * ( state / q ); + + if ( state <= 0 ) + state += m; + + return state; + } + long int bro_random() { if ( ! bro_rand_determistic ) return random(); // Use system PRNG. 
- // Use our own simple linear congruence PRNG to make sure we are - // predictable across platforms. - const long int m = 2147483647; - const long int a = 16807; - const long int q = m / a; - const long int r = m % a; - - bro_rand_state = a * ( bro_rand_state % q ) - r * ( bro_rand_state / q ); - - if ( bro_rand_state <= 0 ) - bro_rand_state += m; + bro_rand_state = bro_prng(bro_rand_state); return bro_rand_state; } diff --git a/src/util.h b/src/util.h index c3eebb04e3..0af401c668 100644 --- a/src/util.h +++ b/src/util.h @@ -173,9 +173,12 @@ unsigned int initial_seed(); // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); +// A simple linear congruence PRNG. It takes its state as argument and returns +// a new random value, which can serve as state for subsequent calls. +long int bro_prng(long int state); + // Replacement for the system random(), to which is normally falls back -// except when a seed has been given. In that case, we use our own -// predictable PRNG. +// except when a seed has been given. In that case, the function uses bro_prng. long int bro_random(); // Calls the system srandom() function with the given seed if not running From 532fbfb4d27ac9ee733dbcfebccbc91e652d4eb0 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:06:02 -0700 Subject: [PATCH 35/73] Factor implementation and change interface. When constructing a Bloom filter, one now has to pass a HashPolicy instance to it. This separates more clearly the concerns of hashing and Bloom filter management. This commit also changes the interface to initialize Bloom filters: there exist now two initialization functions, one for each type: (1) bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter (2) bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter The BiFs for adding elements and performing lookups remain the same. 
This essentially gives us "BiF polymorphism" at script land, where the initialization BiF constructs the most derived type while subsequent BiFs adhere to the same interface. The reason why we split up the constructor in this case is that we have not yet derived the math that computes the optimal number of hash functions for counting Bloom filters---users have to explicitly parameterize them for now. --- src/BloomFilter.cc | 159 +++++--------------------- src/BloomFilter.h | 172 ++++------------------------- src/CMakeLists.txt | 2 + src/CounterVector.cc | 75 +++++++++++++ src/CounterVector.h | 78 +++++++++++++ src/HashPolicy.cc | 72 ++++++++++++ src/HashPolicy.h | 90 +++++++++++++++ src/OpaqueVal.cc | 1 + src/bro.bif | 57 ++++++---- testing/btest/bifs/bloomfilter.bro | 20 ++-- testing/btest/istate/opaque.bro | 2 +- 11 files changed, 409 insertions(+), 319 deletions(-) create mode 100644 src/CounterVector.cc create mode 100644 src/CounterVector.h create mode 100644 src/HashPolicy.cc create mode 100644 src/HashPolicy.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 6a44defc6d..0be64c18de 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,117 +1,16 @@ #include "BloomFilter.h" #include +#include "CounterVector.h" #include "Serializer.h" -CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) - { - } - -CounterVector::~CounterVector() - { - delete bits_; - } - -bool CounterVector::Increment(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -bool CounterVector::Decrement(size_type cell, count_type value) - { - // TODO - assert(! "not yet implemented"); - return false; - } - -CounterVector::count_type CounterVector::Count(size_type cell) const - { - // TODO - assert(! 
"not yet implemented"); - return 0; - } - -CounterVector::size_type CounterVector::Size() const - { - return bits_->Blocks() / width_; - } - -bool CounterVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } - -CounterVector* CounterVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_COUNTERVECTOR)); - } - -IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) - -bool CounterVector::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! bits_->Serialize(info) ) - return false; - return SERIALIZE(static_cast(width_)); - } - -bool CounterVector::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(SerialObj); - bits_ = BitVector::Unserialize(info); - if ( ! bits_ ) - return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) - return false; - width_ = static_cast(width); - return true; - } - - -HashPolicy::Hasher::Hasher(size_t seed) - : h3_(seed) -{ -} - -HashPolicy::HashType -HashPolicy::Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 
0 : h3_(x, n); - } - -HashPolicy::HashVector DefaultHashing::Hash(const void* x, size_t n) const - { - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - - -HashPolicy::HashVector DoubleHashing::Hash(const void* x, size_t n) const - { - HashType h1 = hasher1_(x, n); - HashType h2 = hasher2_(x, n); - HashVector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - BloomFilter::BloomFilter() : hash_(NULL) { } -BloomFilter::BloomFilter(size_t k) - : hash_(new hash_policy(k)) +BloomFilter::BloomFilter(const HashPolicy* hash_policy) + : hash_(hash_policy) { } @@ -135,7 +34,11 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - return SERIALIZE(static_cast(hash_->K())); + // FIXME: Since we have a fixed hashing policy, we just serialize the + // information needed to reconstruct it. + if ( ! SERIALIZE(static_cast(hash_->K())) ) + return false; + return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -144,10 +47,15 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) uint16 k; if ( ! UNSERIALIZE(&k) ) return false; - hash_ = new hash_policy(static_cast(k)); + const char* name; + if ( ! UNSERIALIZE_STR(&name, 0) ) + return false; + // FIXME: for now Bloom filters always use the default hashing policy (DefaultHashing). + hash_ = new DefaultHashing(k, name); return true; } + size_t BasicBloomFilter::M(double fp, size_t capacity) { double ln2 = std::log(2); @@ -163,11 +71,9 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { + // TODO: Ensure that x and y use the same HashPolicy before proceeding. 
BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); - // TODO: implement the hasher pool and make sure the new result gets the same - // number of (equal) hash functions. - //assert(x->hash_ == y->hash_); return result; } @@ -176,16 +82,10 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(double fp, size_t capacity) - : BloomFilter(K(M(fp, capacity), capacity)) +BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) + : BloomFilter(hash_policy), + bits_(new BitVector(cells)) { - bits_ = new BitVector(M(fp, capacity)); - } - -BasicBloomFilter::BasicBloomFilter(size_t cells, size_t capacity) - : BloomFilter(K(cells, capacity)) - { - bits_ = new BitVector(cells); } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) @@ -203,13 +103,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -230,17 +130,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(double fp, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(BasicBloomFilter::M(fp, capacity), - capacity)) - { - cells_ = new CounterVector(width, BasicBloomFilter::M(fp, capacity)); - } - -CountingBloomFilter::CountingBloomFilter(size_t cells, size_t capacity, - size_t width) - : BloomFilter(BasicBloomFilter::K(cells, capacity)) +CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, + size_t cells, size_t width) + : BloomFilter(hash_policy) { cells_ = new CounterVector(width, cells); } @@ -261,18 +153,19 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::HashVector& h) +void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::HashVector& h) const +size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { + // TODO: Use partitioning. CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 65133621f9..189f4920b7 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,141 +3,9 @@ #include #include "BitVector.h" -#include "Hash.h" -#include "H3.h" +#include "HashPolicy.h" -/** - * A vector of counters, each of which have a fixed number of bits. - */ -class CounterVector : public SerialObj { -public: - typedef size_t size_type; - typedef uint64 count_type; - - /** - * Constructs a counter vector having cells of a given width. - * - * @param width The number of bits that each cell occupies. 
- * - * @param cells The number of cells in the bitvector. - */ - CounterVector(size_t width, size_t cells = 1024); - - ~CounterVector(); - - /** - * Increments a given cell. - * - * @param cell The cell to increment. - * - * @param value The value to add to the current counter in *cell*. - * - * @return `true` if adding *value* to the counter in *cell* succeeded. - */ - bool Increment(size_type cell, count_type value); - - /** - * Decrements a given cell. - * - * @param cell The cell to decrement. - * - * @param value The value to subtract from the current counter in *cell*. - * - * @return `true` if subtracting *value* from the counter in *cell* succeeded. - */ - bool Decrement(size_type cell, count_type value); - - /** - * Retrieves the counter of a given cell. - * - * @param cell The cell index to retrieve the count for. - * - * @return The counter associated with *cell*. - */ - count_type Count(size_type cell) const; - - /** - * Retrieves the number of cells in the storage. - * - * @return The number of cells. - */ - size_type Size() const; - - bool Serialize(SerialInfo* info) const; - static CounterVector* Unserialize(UnserialInfo* info); - -protected: - DECLARE_SERIAL(CounterVector); - - CounterVector() { } - -private: - BitVector* bits_; - size_t width_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - * @tparam Codomain An integral type. - */ -class HashPolicy { -public: - typedef hash_t HashType; - typedef std::vector HashVector; - - virtual ~HashPolicy() { } - size_t K() const { return k_; } - virtual HashVector Hash(const void* x, size_t n) const = 0; - -protected: - /** - * A functor that computes a universal hash function. - * @tparam Codomain An integral type. - */ - class Hasher { - public: - Hasher(size_t seed); - - HashType operator()(const void* x, size_t n) const; - private: - // FIXME: The hardcoded value of 36 comes from UHASH_KEY_SIZE defined in - // Hash.h. 
I do not know how this value impacts the hash function behavior - // so I'll just copy it verbatim. (Matthias) - H3 h3_; - }; - - HashPolicy(size_t k) : k_(k) { } - -private: - const size_t k_; -}; - -/** - * The *default* hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k) : HashPolicy(k), hashers_(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. - */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k) : HashPolicy(k) { } - - virtual HashVector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; +class CounterVector; /** * The abstract base class for Bloom filters. @@ -146,8 +14,6 @@ class BloomFilter : public SerialObj { public: // At this point we won't let the user choose the hash policy, but we might // open up the interface in the future. - typedef DoubleHashing hash_policy; - virtual ~BloomFilter(); /** @@ -180,13 +46,19 @@ protected: DECLARE_ABSTRACT_SERIAL(BloomFilter); BloomFilter(); - BloomFilter(size_t k); - virtual void AddImpl(const HashPolicy::HashVector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::HashVector& hashes) const = 0; + /** + * Constructs a Bloom filter. + * + * @param hash_policy The hash policy to use for this Bloom filter. + */ + BloomFilter(const HashPolicy* hash_policy); + + virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; + virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; private: - HashPolicy* hash_; + const HashPolicy* hash_; }; /** @@ -223,24 +95,18 @@ public: static BasicBloomFilter* Merge(const BasicBloomFilter* x, const BasicBloomFilter* y); - /** - * Constructs a basic Bloom filter with a given false-positive rate and - * capacity. 
- */ - BasicBloomFilter(double fp, size_t capacity); - /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(size_t cells, size_t capacity); + BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: BitVector* bits_; @@ -254,16 +120,16 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(double fp, size_t capacity, size_t width); - CountingBloomFilter(size_t cells, size_t capacity, size_t width); + CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, + size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::HashVector& h); - virtual size_t CountImpl(const HashPolicy::HashVector& h) const; + virtual void AddImpl(const HashPolicy::hash_vector& h); + virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1537bb04b0..f2c7ce6bad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -255,6 +255,7 @@ set(bro_SRCS ChunkedIO.cc CompHash.cc Conn.cc + CounterVector.cc DFA.cc DbgBreakpoint.cc DbgHelp.cc @@ -278,6 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc + HashPolicy.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/CounterVector.cc b/src/CounterVector.cc new file mode 100644 index 0000000000..8ed4c30427 --- /dev/null +++ b/src/CounterVector.cc @@ -0,0 +1,75 @@ +#include "CounterVector.h" + +#include "BitVector.h" +#include "Serializer.h" + +CounterVector::CounterVector(size_t width, size_t cells) + : bits_(new BitVector(width * cells)), 
width_(width) + { + } + +CounterVector::~CounterVector() + { + delete bits_; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + // TODO + assert(! "not yet implemented"); + return false; + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + // TODO + assert(! "not yet implemented"); + return 0; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits_->Blocks() / width_; + } + +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast( + SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + if ( ! bits_->Serialize(info) ) + return false; + return SERIALIZE(static_cast(width_)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + bits_ = BitVector::Unserialize(info); + if ( ! bits_ ) + return false; + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + width_ = static_cast(width); + return true; + } + diff --git a/src/CounterVector.h b/src/CounterVector.h new file mode 100644 index 0000000000..ecc8fe90e0 --- /dev/null +++ b/src/CounterVector.h @@ -0,0 +1,78 @@ +#ifndef CounterVector_h +#define CounterVector_h + +#include "SerialObj.h" + +class BitVector; + +/** + * A vector of counters, each of which have a fixed number of bits. + */ +class CounterVector : public SerialObj { +public: + typedef size_t size_type; + typedef uint64 count_type; + + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. 
+ * + * @param cells The number of cells in the bitvector. + */ + CounterVector(size_t width, size_t cells = 1024); + + ~CounterVector(); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + */ + bool Increment(size_type cell, count_type value); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + */ + bool Decrement(size_type cell, count_type value); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + */ + count_type Count(size_type cell) const; + + /** + * Retrieves the number of cells in the storage. + * + * @return The number of cells. + */ + size_type Size() const; + + bool Serialize(SerialInfo* info) const; + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(CounterVector); + + CounterVector() { } + +private: + BitVector* bits_; + size_t width_; +}; + +#endif diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc new file mode 100644 index 0000000000..d6fb4f3da4 --- /dev/null +++ b/src/HashPolicy.cc @@ -0,0 +1,72 @@ +#include "HashPolicy.h" + +#include "digest.h" + +Hasher::Hasher(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::hash_type Hasher::operator()(const void* x, size_t n) const + { + return n == 0 ? 
0 : h_(x, n); + } + +size_t Hasher::compute_seed(size_t seed, const std::string& extra) + { + u_char digest[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, digest); + return *reinterpret_cast(digest); + } + + +HashPolicy::HashPolicy(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHashing::DefaultHashing(size_t k, const std::string& name) + : HashPolicy(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hashers_.push_back(Hasher(i, name)); + } + +HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const + { + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hashers_[i](x, n); + return h; + } + +DoubleHashing::DoubleHashing(size_t k, const std::string& name) + : HashPolicy(k, name), + hasher1_(1, name), + hasher2_(2, name) + { + } + +HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const + { + hash_type h1 = hasher1_(x, n); + hash_type h2 = hasher2_(x, n); + hash_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/HashPolicy.h b/src/HashPolicy.h new file mode 100644 index 0000000000..4660bc0080 --- /dev/null +++ b/src/HashPolicy.h @@ -0,0 +1,90 @@ +#ifndef HashPolicy_h +#define HashPolicy_h + +#include "Hash.h" +#include "H3.h" + +/** + * A functor that computes a universal hash function. + */ +class Hasher { +public: + typedef hash_t hash_type; + + /** + * Constructs a hasher seeded by a given seed and optionally an extra + * descriptor. + * + * @param seed The seed to use. + * + * @param extra If not `NULL`, the hasher will not mix in the initial seed + * but instead use this NUL-terminated string as additional seed. 
+ */ + Hasher(size_t seed, const std::string& extra = ""); + + /** + * Computes the hash digest of contiguous data. + * + * @param x A pointer to the beginning of the byte sequence to hash. + * + * @param n The length of the sequence pointed to by *x*. + */ + hash_type operator()(const void* x, size_t n) const; + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; +}; + +/** + * The abstract base class for hash policies that hash elements *k* times. + */ +class HashPolicy { +public: + typedef Hasher::hash_type hash_type; + typedef std::vector hash_vector; + + virtual ~HashPolicy() { } + + virtual hash_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + HashPolicy(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHashing : public HashPolicy { +public: + DefaultHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const /* override */; + +private: + std::vector hashers_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions. + */ +class DoubleHashing : public HashPolicy { +public: + DoubleHashing(size_t k, const std::string& name); + + virtual hash_vector Hash(const void* x, size_t n) const; + +private: + Hasher hasher1_; + Hasher hasher2_; +}; + +#endif diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 9dd5c7f980..8b82916689 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,6 +605,7 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + assert( type_ ); if ( ! 
type_->Serialize(info) ) return false; return bloom_filter_->Serialize(info); diff --git a/src/bro.bif b/src/bro.bif index 9b80c90dbf..a89b808888 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4986,42 +4986,55 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr #include "BloomFilter.h" %%} -## Initializes a Bloom filter data structure. +## Creates a basic Bloom filter. ## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## -## max: The maximum counter value associated with each each element in the -## Bloom filter. If greater than 1, each element in the set has a counter of -## *w = ceil(log_2(max))* bits. Each bit in the underlying bit vector then -## becomes a cell of size *w* bits. Since the number number of cells is a -## function ## of *fp* and *capacity*, it is important to consider the effects -## on space when tuning this value. +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. ## ## Returns: A Bloom filter handle. 
-function bloomfilter_init%(fp: double, capacity: count, - max: count &default=1%): opaque of bloomfilter +function bloomfilter_basic_init%(fp: double, capacity: count, + name: string &default=""%): opaque of bloomfilter %{ if ( fp < 0.0 || fp > 1.0 ) { reporter->Error("false-positive rate must take value between 0 and 1"); return NULL; } - BloomFilter* bf; - if ( max == 1 ) - { - bf = new BasicBloomFilter(fp, capacity); - } - else - { - uint16 width = 0; - while ( max >>= 1 ) - ++width; - bf = new CountingBloomFilter(fp, capacity, width); - } - return new BloomFilterVal(bf); + + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); + return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + %} + +## Creates a counting Bloom filter. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying counter vector. +## +## max: The maximum counter value associated with each each element described +## by *w = ceil(log_2(max))* bits. Each bit in the underlying counter vector +## becomes a cell of size *w* bits. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the initialization will become dependent on the initial seed. +## +## Returns: A Bloom filter handle. +function bloomfilter_counting_init%(k: count, cells: count, max: count, + name: string &default=""%): opaque of bloomfilter + %{ + const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + uint16 width = 0; + while ( max >>= 1 ) + ++width; + return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); %} ## Adds an element to a Bloom filter. 
diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 769cec1200..3ff6a6668e 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -4,7 +4,7 @@ event bro_init() { # Basic usage with counts. - local bf_cnt = bloomfilter_init(0.1, 1000); + local bf_cnt = bloomfilter_basic_init(0.1, 1000); bloomfilter_add(bf_cnt, 42); bloomfilter_add(bf_cnt, 84); bloomfilter_add(bf_cnt, 168); @@ -16,23 +16,23 @@ event bro_init() bloomfilter_add(bf_cnt, "foo"); # Type mismatch # Basic usage with strings. - local bf_str = bloomfilter_init(0.9, 10); + local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "baz"); # FP - print bloomfilter_lookup(bf_str, "qux"); # FP + print bloomfilter_lookup(bf_str, "b4z"); # FP + print bloomfilter_lookup(bf_str, "quux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch # Edge cases. - local bf_edge0 = bloomfilter_init(0.000000000001, 1); - local bf_edge1 = bloomfilter_init(0.00000001, 100000000); - local bf_edge2 = bloomfilter_init(0.9999999, 1); - local bf_edge3 = bloomfilter_init(0.9999999, 100000000000); + local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_basic_init(0.9999999, 1); + local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000); # Invalid parameters. 
- local bf_bug0 = bloomfilter_init(-0.5, 42); - local bf_bug1 = bloomfilter_init(1.1, 42); + local bf_bug0 = bloomfilter_basic_init(-0.5, 42); + local bf_bug1 = bloomfilter_basic_init(1.1, 42); } diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index ac3b2c0874..b387f9d6bc 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -82,7 +82,7 @@ event bro_init() if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; - bloomfilter_handle = bloomfilter_init(0.1, 100); + bloomfilter_handle = bloomfilter_basic_init(0.1, 100); for ( e in bloomfilter_elements ) bloomfilter_add(bloomfilter_handle, e); } From 85668e7054dd22bc783a620eaf88b04f2e4bb952 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:16:44 -0700 Subject: [PATCH 36/73] Remove lingering debug code. --- src/bro.bif | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bro.bif b/src/bro.bif index a89b808888..7c81966317 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5009,7 +5009,6 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); - fprintf(stderr, "constructing Bloom filter with %llu hash fns and %llu cells\n", optimal_k, cells); return new BloomFilterVal(new BasicBloomFilter(hp, cells)); %} From e6e5f4926f5a850c773af05b51d7004fc4899a7c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 17 Jun 2013 16:26:35 -0700 Subject: [PATCH 37/73] Create hash policies through factory. 
--- src/BloomFilter.cc | 5 +---- src/HashPolicy.cc | 5 +++++ src/HashPolicy.h | 7 +++++++ src/bro.bif | 4 ++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 0be64c18de..59d411d8e2 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -34,8 +34,6 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - // FIXME: Since we have a fixed hashing policy, we just serialize the - // information needed to reconstruct it. if ( ! SERIALIZE(static_cast(hash_->K())) ) return false; return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); @@ -50,8 +48,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! UNSERIALIZE_STR(&name, 0) ) return false; - // FIXME: for now Bloom filters always use double hashing. - hash_ = new DefaultHashing(k, name); + hash_ = HashPolicy::Create(k, name); return true; } diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc index d6fb4f3da4..7ce754be3c 100644 --- a/src/HashPolicy.cc +++ b/src/HashPolicy.cc @@ -32,6 +32,11 @@ size_t Hasher::compute_seed(size_t seed, const std::string& extra) } +HashPolicy* HashPolicy::Create(size_t k, const std::string& name) + { + return new DefaultHashing(k, name); + } + HashPolicy::HashPolicy(size_t k, const std::string& name) : k_(k), name_(name) { diff --git a/src/HashPolicy.h b/src/HashPolicy.h index 4660bc0080..7bdb968bfe 100644 --- a/src/HashPolicy.h +++ b/src/HashPolicy.h @@ -42,6 +42,13 @@ private: */ class HashPolicy { public: + /** + * Constructs the hashing policy used by the implementation. This factory + * function exists because the HashingPolicy class hierachy is not yet + * serializable. 
+ */ + static HashPolicy* Create(size_t k, const std::string& name); + typedef Hasher::hash_type hash_type; typedef std::vector hash_vector; diff --git a/src/bro.bif b/src/bro.bif index 7c81966317..d0ce066139 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,7 +5008,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = new DefaultHashing(optimal_k, name->CheckString()); + const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); return new BloomFilterVal(new BasicBloomFilter(hp, cells)); %} @@ -5029,7 +5029,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = new DefaultHashing(k, name->CheckString()); + const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; From 273629de366290f411f381fe5970fc672adf465f Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 18 Jun 2013 10:23:07 -0700 Subject: [PATCH 38/73] Only serialize Bloom filter type if available. --- src/OpaqueVal.cc | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 8b82916689..5a673c4a40 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -605,9 +605,13 @@ IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - assert( type_ ); - if ( ! type_->Serialize(info) ) + + bool is_typed = type_ != NULL; + if ( ! SERIALIZE(is_typed) ) return false; + if ( is_typed && ! 
type_->Serialize(info) ) + return false; + return bloom_filter_->Serialize(info); } @@ -615,13 +619,16 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(OpaqueVal); - type_ = BroType::Unserialize(info); - if ( ! type_ ) + bool is_typed; + if ( ! UNSERIALIZE(&is_typed) ) return false; - TypeList* tl = new TypeList(type_); - tl->Append(type_); - hash_ = new CompositeHash(tl); - Unref(tl); + if ( is_typed ) + { + BroType* type = BroType::Unserialize(info); + if ( ! Typify(type) ) + return false; + Unref(type); + } bloom_filter_ = BloomFilter::Unserialize(info); return bloom_filter_ != NULL; From 5f70452a9ac816346c4e480d8de52b213630b5b7 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 18 Jun 2013 10:40:00 -0700 Subject: [PATCH 39/73] Small fixes and style tweaks. --- src/BitVector.cc | 2 +- src/BloomFilter.cc | 1 + src/OpaqueVal.h | 4 +--- src/Type.cc | 6 +++--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/BitVector.cc b/src/BitVector.cc index f029230609..64db32131f 100644 --- a/src/BitVector.cc +++ b/src/BitVector.cc @@ -473,7 +473,7 @@ bool BitVector::DoSerialize(SerialInfo* info) const if ( ! SERIALIZE(static_cast(bits_.size())) ) return false; - for (size_t i = 0; i < bits_.size(); ++i) + for ( size_t i = 0; i < bits_.size(); ++i ) if ( ! SERIALIZE(static_cast(bits_[i])) ) return false; diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 59d411d8e2..a7727630f7 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -49,6 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; hash_ = HashPolicy::Create(k, name); + delete [] name; return true; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 4b45cad519..2362fdacfc 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -139,9 +139,7 @@ private: { const T* a = dynamic_cast(x->bloom_filter_); const T* b = dynamic_cast(y->bloom_filter_); - if ( a && b ) - return new BloomFilterVal(T::Merge(a, b)); - return NULL; + return a && b ? new BloomFilterVal(T::Merge(a, b)) : NULL; } BroType* type_; diff --git a/src/Type.cc b/src/Type.cc index 6461bf2560..f19de461cd 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -1311,19 +1311,19 @@ IMPLEMENT_SERIAL(OpaqueType, SER_OPAQUE_TYPE); bool OpaqueType::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_OPAQUE_TYPE, BroType); - return SERIALIZE(name); + return SERIALIZE_STR(name.c_str(), name.size()); } bool OpaqueType::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BroType); - char const* n; + const char* n; if ( ! UNSERIALIZE_STR(&n, 0) ) return false; - name = n; delete [] n; + return true; } From 40201a180e54a560711003f2e65e14be87a7b8e9 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 9 Jul 2013 21:00:53 -0700 Subject: [PATCH 40/73] Fix for unserialization error. Because BloomFilter is a base class, with other classes derived from it, it needs special treatment. --- src/SerialTypes.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 9e4aef5b3b..85aed10bda 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -52,8 +52,6 @@ SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(BLOOMFILTER, 0x1700) -SERIAL_IS(BASICBLOOMFILTER, 0x1800) -SERIAL_IS(COUNTINGBLOOMFILTER, 0x1900) // These are the externally visible types.
const SerialType SER_NONE = 0; @@ -203,6 +201,11 @@ SERIAL_FUNC(BRO_FUNC, 2) SERIAL_FUNC(DEBUG_FUNC, 3) SERIAL_FUNC(BUILTIN_FUNC, 4) +#define SERIAL_BLOOMFILTER(name, val) SERIAL_CONST(name, val, BLOOMFILTER) +SERIAL_BLOOMFILTER(BLOOMFILTER, 1) +SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) +SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) @@ -210,8 +213,5 @@ SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) SERIAL_CONST2(BITVECTOR) SERIAL_CONST2(COUNTERVECTOR) -SERIAL_CONST2(BLOOMFILTER) -SERIAL_CONST2(BASICBLOOMFILTER) -SERIAL_CONST2(COUNTINGBLOOMFILTER) #endif From 446344ae998e8eef30a0f45a05dcea29efe4f032 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 10 Jul 2013 01:32:59 -0700 Subject: [PATCH 41/73] Add missing include for GCC. --- src/BloomFilter.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index a7727630f7..c59092b1e4 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -1,6 +1,7 @@ #include "BloomFilter.h" #include +#include #include "CounterVector.h" #include "Serializer.h" From 9b444b2617c0a910a24ea938a3064eb092f26537 Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Fri, 19 Jul 2013 13:16:12 -0400 Subject: [PATCH 42/73] Updates for the Intel Framework. - Intel importing format has changed (refer to docs). - All string matching is now case insensitive. - SMTP intel script has been updated to extract email addresses correctly. - Small fix sneaking into the smtp base script to actually extract individual email addresses in the To: field correctly. 
--- doc/intel.rst | 20 +-- scripts/base/frameworks/intel/main.bro | 141 ++++++++---------- scripts/base/protocols/smtp/main.bro | 5 +- .../frameworks/intel/conn-established.bro | 8 +- scripts/policy/frameworks/intel/dns.bro | 4 +- .../frameworks/intel/http-host-header.bro | 4 +- scripts/policy/frameworks/intel/http-url.bro | 4 +- .../frameworks/intel/http-user-agents.bro | 4 +- .../frameworks/intel/smtp-url-extraction.bro | 4 +- scripts/policy/frameworks/intel/smtp.bro | 70 ++++++--- scripts/policy/frameworks/intel/ssl.bro | 12 +- .../manager-1.intel.log | 10 +- .../broproc.intel.log | 12 +- .../manager-1.intel.log | 16 +- .../frameworks/intel/cluster-transparency.bro | 8 +- .../base/frameworks/intel/input-and-match.bro | 12 +- .../intel/read-file-dist-cluster.bro | 10 +- 17 files changed, 178 insertions(+), 166 deletions(-) diff --git a/doc/intel.rst b/doc/intel.rst index 390313461a..2a59a98974 100644 --- a/doc/intel.rst +++ b/doc/intel.rst @@ -29,9 +29,6 @@ Framework to be checked by loading this script in local.bro:: @load policy/frameworks/intel -(TODO: find some good mechanism for getting setup with good data -quickly) - Refer to the "Loading Intelligence" section below to see the format for Intelligence Framework text files, then load those text files with this line in local.bro:: @@ -61,16 +58,14 @@ data out to all of the nodes that need it. Here is an example of the intelligence data format. 
Note that all whitespace separators are literal tabs and fields containing only a -hyphen a considered to be null values.:: +hyphen are considered to be null values.:: - #fields host net str str_type meta.source meta.desc meta.url - 1.2.3.4 - - - source1 Sending phishing email http://source1.com/badhosts/1.2.3.4 - - 31.131.248.0/21 - - spamhaus-drop SBL154982 - - - - - a.b.com Intel::DOMAIN source2 Name used for data exfiltration - + #fields indicator indicator_type meta.source meta.desc meta.url + 1.2.3.4 Intel::ADDR source1 Sending phishing email http://source1.com/badhosts/1.2.3.4 + a.b.com Intel::DOMAIN source2 Name used for data exfiltration - -For more examples of built in `str_type` values, please refer to the -autogenerated documentation for the intelligence framework (TODO: -figure out how to do this link). +For more examples of built in `indicator_type` values, please refer to the +autogenerated documentation for the intelligence framework. To load the data once files are created, use the following example code to define files to load with your own file names of course:: @@ -90,8 +85,7 @@ When some bit of data is extracted (such as an email address in the "From" header in a message over SMTP), the Intelligence Framework needs to be informed that this data was discovered and it's presence should be checked within the intelligence data set. This is -accomplished through the Intel::seen (TODO: do a reference link) -function. +accomplished through the Intel::seen function. 
Typically users won't need to work with this function due to built in hook scripts that Bro ships with that will "see" data and send it into diff --git a/scripts/base/frameworks/intel/main.bro b/scripts/base/frameworks/intel/main.bro index aeb7bf4bfc..1b740f538d 100644 --- a/scripts/base/frameworks/intel/main.bro +++ b/scripts/base/frameworks/intel/main.bro @@ -10,13 +10,14 @@ module Intel; export { redef enum Log::ID += { LOG }; - ## String data needs to be further categoried since it could represent - ## and number of types of data. - type StrType: enum { + ## Enum type to represent various types of intelligence data. + type Type: enum { + ## An IP address. + ADDR, ## A complete URL without the prefix "http://". URL, - ## User-Agent string, typically HTTP or mail message body. - USER_AGENT, + ## Software name. + SOFTWARE, ## Email address. EMAIL, ## DNS domain name. @@ -44,18 +45,15 @@ export { ## Represents a piece of intelligence. type Item: record { - ## The IP address if the intelligence is about an IP address. - host: addr &optional; - ## The network if the intelligence is about a CIDR block. - net: subnet &optional; - ## The string if the intelligence is about a string. - str: string &optional; - ## The type of data that is in the string if the $str field is set. - str_type: StrType &optional; + ## The intelligence indicator. + indicator: string; + + ## The type of data that the indicator field represents. + indicator_type: Type; - ## Metadata for the item. Typically represents more deeply \ + ## Metadata for the item. Typically represents more deeply ## descriptive data for a piece of intelligence. - meta: MetaData; + meta: MetaData; }; ## Enum to represent where data came from when it was discovered. @@ -69,19 +67,22 @@ export { ## exclusive. These records *must* represent either an IP address being ## seen or a string being seen. type Seen: record { - ## The IP address if the data seen is an IP address. 
- host: addr &log &optional; ## The string if the data is about a string. - str: string &log &optional; - ## The type of data that is in the string if the $str field is set. - str_type: StrType &log &optional; + indicator: string &log &optional; + + ## The type of data that the indicator represents. + indicator_type: Type &log &optional; + + ## If the indicator type was :bro:enum:`Intel::ADDR`, then this + ## field will be present. + host: addr &optional; ## Where the data was discovered. - where: Where &log; + where: Where &log; ## If the data was discovered within a connection, the ## connection record should go into get to give context to the data. - conn: connection &optional; + conn: connection &optional; }; ## Record used for the logging framework representing a positive @@ -100,7 +101,7 @@ export { ## Where the data was seen. seen: Seen &log; ## Sources which supplied data that resulted in this match. - sources: set[string] &log; + sources: set[string] &log &default=string_set(); }; ## Intelligence data manipulation functions. @@ -135,8 +136,8 @@ const have_full_data = T &redef; # The in memory data structure for holding intelligence. type DataStore: record { - net_data: table[subnet] of set[MetaData]; - string_data: table[string, StrType] of set[MetaData]; + host_data: table[addr] of set[MetaData]; + string_data: table[string, Type] of set[MetaData]; }; global data_store: DataStore &redef; @@ -144,8 +145,8 @@ global data_store: DataStore &redef; # This is primarily for workers to do the initial quick matches and store # a minimal amount of data for the full match to happen on the manager. 
type MinDataStore: record { - net_data: set[subnet]; - string_data: set[string, StrType]; + host_data: set[addr]; + string_data: set[string, Type]; }; global min_data_store: MinDataStore &redef; @@ -157,15 +158,13 @@ event bro_init() &priority=5 function find(s: Seen): bool { - if ( s?$host && - ((have_full_data && s$host in data_store$net_data) || - (s$host in min_data_store$net_data))) + if ( s?$host ) { - return T; + return ((s$host in min_data_store$host_data) || + (have_full_data && s$host in data_store$host_data)); } - else if ( s?$str && s?$str_type && - ((have_full_data && [s$str, s$str_type] in data_store$string_data) || - ([s$str, s$str_type] in min_data_store$string_data))) + else if ( ([to_lower(s$indicator), s$indicator_type] in min_data_store$string_data) || + (have_full_data && [to_lower(s$indicator), s$indicator_type] in data_store$string_data) ) { return T; } @@ -177,8 +176,7 @@ function find(s: Seen): bool function get_items(s: Seen): set[Item] { - local item: Item; - local return_data: set[Item] = set(); + local return_data: set[Item]; if ( ! have_full_data ) { @@ -191,26 +189,23 @@ function get_items(s: Seen): set[Item] if ( s?$host ) { # See if the host is known about and it has meta values - if ( s$host in data_store$net_data ) + if ( s$host in data_store$host_data ) { - for ( m in data_store$net_data[s$host] ) + for ( m in data_store$host_data[s$host] ) { - # TODO: the lookup should be finding all and not just most specific - # and $host/$net should have the correct value. 
- item = [$host=s$host, $meta=m]; - add return_data[item]; + add return_data[Item($indicator=cat(s$host), $indicator_type=ADDR, $meta=m)]; } } } - else if ( s?$str && s?$str_type ) + else { + local lower_indicator = to_lower(s$indicator); # See if the string is known about and it has meta values - if ( [s$str, s$str_type] in data_store$string_data ) + if ( [lower_indicator, s$indicator_type] in data_store$string_data ) { - for ( m in data_store$string_data[s$str, s$str_type] ) + for ( m in data_store$string_data[lower_indicator, s$indicator_type] ) { - item = [$str=s$str, $str_type=s$str_type, $meta=m]; - add return_data[item]; + add return_data[Item($indicator=s$indicator, $indicator_type=s$indicator_type, $meta=m)]; } } } @@ -222,6 +217,12 @@ function Intel::seen(s: Seen) { if ( find(s) ) { + if ( s?$host ) + { + s$indicator = cat(s$host); + s$indicator_type = Intel::ADDR; + } + if ( have_full_data ) { local items = get_items(s); @@ -250,8 +251,7 @@ function has_meta(check: MetaData, metas: set[MetaData]): bool event Intel::match(s: Seen, items: set[Item]) &priority=5 { - local empty_set: set[string] = set(); - local info: Info = [$ts=network_time(), $seen=s, $sources=empty_set]; + local info: Info = [$ts=network_time(), $seen=s]; if ( s?$conn ) { @@ -267,52 +267,37 @@ event Intel::match(s: Seen, items: set[Item]) &priority=5 function insert(item: Item) { - if ( item?$str && !item?$str_type ) - { - event reporter_warning(network_time(), fmt("You must provide a str_type for strings or this item doesn't make sense. Item: %s", item), ""); - return; - } - # Create and fill out the meta data item. local meta = item$meta; local metas: set[MetaData]; - if ( item?$host ) + # All intelligence is case insensitive at the moment. + local lower_indicator = to_lower(item$indicator); + + if ( item$indicator_type == ADDR ) { - local host = mask_addr(item$host, is_v4_addr(item$host) ? 
32 : 128); + local host = to_addr(item$indicator); if ( have_full_data ) { - if ( host !in data_store$net_data ) - data_store$net_data[host] = set(); + if ( host !in data_store$host_data ) + data_store$host_data[host] = set(); - metas = data_store$net_data[host]; + metas = data_store$host_data[host]; } - add min_data_store$net_data[host]; + add min_data_store$host_data[host]; } - else if ( item?$net ) + else { if ( have_full_data ) { - if ( item$net !in data_store$net_data ) - data_store$net_data[item$net] = set(); + if ( [lower_indicator, item$indicator_type] !in data_store$string_data ) + data_store$string_data[lower_indicator, item$indicator_type] = set(); - metas = data_store$net_data[item$net]; + metas = data_store$string_data[lower_indicator, item$indicator_type]; } - add min_data_store$net_data[item$net]; - } - else if ( item?$str ) - { - if ( have_full_data ) - { - if ( [item$str, item$str_type] !in data_store$string_data ) - data_store$string_data[item$str, item$str_type] = set(); - - metas = data_store$string_data[item$str, item$str_type]; - } - - add min_data_store$string_data[item$str, item$str_type]; + add min_data_store$string_data[lower_indicator, item$indicator_type]; } local updated = F; diff --git a/scripts/base/protocols/smtp/main.bro b/scripts/base/protocols/smtp/main.bro index d53128b06c..0d510e645d 100644 --- a/scripts/base/protocols/smtp/main.bro +++ b/scripts/base/protocols/smtp/main.bro @@ -223,7 +223,10 @@ event mime_one_header(c: connection, h: mime_header_rec) &priority=5 { if ( ! 
c$smtp?$to ) c$smtp$to = set(); - add c$smtp$to[h$value]; + + local to_parts = split(h$value, /[[:blank:]]*,[[:blank:]]*/); + for ( i in to_parts ) + add c$smtp$to[to_parts[i]]; } else if ( h$name == "X-ORIGINATING-IP" ) diff --git a/scripts/policy/frameworks/intel/conn-established.bro b/scripts/policy/frameworks/intel/conn-established.bro index a2e67b292b..20cec43e04 100644 --- a/scripts/policy/frameworks/intel/conn-established.bro +++ b/scripts/policy/frameworks/intel/conn-established.bro @@ -3,6 +3,10 @@ event connection_established(c: connection) { - Intel::seen([$host=c$id$orig_h, $conn=c, $where=Conn::IN_ORIG]); - Intel::seen([$host=c$id$resp_h, $conn=c, $where=Conn::IN_RESP]); + if ( c$orig$state == TCP_ESTABLISHED && + c$resp$state == TCP_ESTABLISHED ) + { + Intel::seen([$host=c$id$orig_h, $conn=c, $where=Conn::IN_ORIG]); + Intel::seen([$host=c$id$resp_h, $conn=c, $where=Conn::IN_RESP]); + } } diff --git a/scripts/policy/frameworks/intel/dns.bro b/scripts/policy/frameworks/intel/dns.bro index a0dee47acf..9218586c95 100644 --- a/scripts/policy/frameworks/intel/dns.bro +++ b/scripts/policy/frameworks/intel/dns.bro @@ -3,8 +3,8 @@ event dns_request(c: connection, msg: dns_msg, query: string, qtype: count, qclass: count) { - Intel::seen([$str=query, - $str_type=Intel::DOMAIN, + Intel::seen([$indicator=query, + $indicator_type=Intel::DOMAIN, $conn=c, $where=DNS::IN_REQUEST]); } diff --git a/scripts/policy/frameworks/intel/http-host-header.bro b/scripts/policy/frameworks/intel/http-host-header.bro index f16b1628aa..3fd28b8ef9 100644 --- a/scripts/policy/frameworks/intel/http-host-header.bro +++ b/scripts/policy/frameworks/intel/http-host-header.bro @@ -4,8 +4,8 @@ event http_header(c: connection, is_orig: bool, name: string, value: string) { if ( is_orig && name == "HOST" ) - Intel::seen([$str=value, - $str_type=Intel::DOMAIN, + Intel::seen([$indicator=value, + $indicator_type=Intel::DOMAIN, $conn=c, $where=HTTP::IN_HOST_HEADER]); } diff --git 
a/scripts/policy/frameworks/intel/http-url.bro b/scripts/policy/frameworks/intel/http-url.bro index feef4f0dac..340ae3c5ab 100644 --- a/scripts/policy/frameworks/intel/http-url.bro +++ b/scripts/policy/frameworks/intel/http-url.bro @@ -5,8 +5,8 @@ event http_message_done(c: connection, is_orig: bool, stat: http_message_stat) { if ( is_orig && c?$http ) - Intel::seen([$str=HTTP::build_url(c$http), - $str_type=Intel::URL, + Intel::seen([$indicator=HTTP::build_url(c$http), + $indicator_type=Intel::URL, $conn=c, $where=HTTP::IN_URL]); } diff --git a/scripts/policy/frameworks/intel/http-user-agents.bro b/scripts/policy/frameworks/intel/http-user-agents.bro index 93445c1e43..7c4558d2a5 100644 --- a/scripts/policy/frameworks/intel/http-user-agents.bro +++ b/scripts/policy/frameworks/intel/http-user-agents.bro @@ -4,8 +4,8 @@ event http_header(c: connection, is_orig: bool, name: string, value: string) { if ( is_orig && name == "USER-AGENT" ) - Intel::seen([$str=value, - $str_type=Intel::USER_AGENT, + Intel::seen([$indicator=value, + $indicator_type=Intel::SOFTWARE, $conn=c, $where=HTTP::IN_USER_AGENT_HEADER]); } diff --git a/scripts/policy/frameworks/intel/smtp-url-extraction.bro b/scripts/policy/frameworks/intel/smtp-url-extraction.bro index 2b87f809a6..a3ba410641 100644 --- a/scripts/policy/frameworks/intel/smtp-url-extraction.bro +++ b/scripts/policy/frameworks/intel/smtp-url-extraction.bro @@ -13,8 +13,8 @@ event intel_mime_data(f: fa_file, data: string) local urls = find_all_urls_without_scheme(data); for ( url in urls ) { - Intel::seen([$str=url, - $str_type=Intel::URL, + Intel::seen([$indicator=url, + $indicator_type=Intel::URL, $conn=c, $where=SMTP::IN_MESSAGE]); } diff --git a/scripts/policy/frameworks/intel/smtp.bro b/scripts/policy/frameworks/intel/smtp.bro index 02e97ea54a..d760995e51 100644 --- a/scripts/policy/frameworks/intel/smtp.bro +++ b/scripts/policy/frameworks/intel/smtp.bro @@ -18,8 +18,8 @@ event mime_end_entity(c: connection) } if ( 
c$smtp?$user_agent ) - Intel::seen([$str=c$smtp$user_agent, - $str_type=Intel::USER_AGENT, + Intel::seen([$indicator=c$smtp$user_agent, + $indicator_type=Intel::SOFTWARE, $conn=c, $where=SMTP::IN_HEADER]); @@ -29,43 +29,69 @@ event mime_end_entity(c: connection) $where=SMTP::IN_X_ORIGINATING_IP_HEADER]); if ( c$smtp?$mailfrom ) - Intel::seen([$str=c$smtp$mailfrom, - $str_type=Intel::EMAIL, - $conn=c, - $where=SMTP::IN_MAIL_FROM]); + { + local mailfromparts = split_n(c$smtp$mailfrom, /<.+>/, T, 1); + if ( |mailfromparts| > 2 ) + { + Intel::seen([$indicator=mailfromparts[2][1:-2], + $indicator_type=Intel::EMAIL, + $conn=c, + $where=SMTP::IN_MAIL_FROM]); + } + } if ( c$smtp?$rcptto ) { for ( rcptto in c$smtp$rcptto ) { - Intel::seen([$str=rcptto, - $str_type=Intel::EMAIL, - $conn=c, - $where=SMTP::IN_RCPT_TO]); + local rcpttoparts = split_n(rcptto, /<.+>/, T, 1); + if ( |rcpttoparts| > 2 ) + { + Intel::seen([$indicator=rcpttoparts[2][1:-2], + $indicator_type=Intel::EMAIL, + $conn=c, + $where=SMTP::IN_RCPT_TO]); + } } } if ( c$smtp?$from ) - Intel::seen([$str=c$smtp$from, - $str_type=Intel::EMAIL, - $conn=c, - $where=SMTP::IN_FROM]); + { + local fromparts = split_n(c$smtp$from, /<.+>/, T, 1); + if ( |fromparts| > 2 ) + { + Intel::seen([$indicator=fromparts[2][1:-2], + $indicator_type=Intel::EMAIL, + $conn=c, + $where=SMTP::IN_FROM]); + } + } if ( c$smtp?$to ) { for ( email_to in c$smtp$to ) { - Intel::seen([$str=email_to, - $str_type=Intel::EMAIL, - $conn=c, - $where=SMTP::IN_TO]); + local toparts = split_n(email_to, /<.+>/, T, 1); + if ( |toparts| > 2 ) + { + Intel::seen([$indicator=toparts[2][1:-2], + $indicator_type=Intel::EMAIL, + $conn=c, + $where=SMTP::IN_TO]); + } } } if ( c$smtp?$reply_to ) - Intel::seen([$str=c$smtp$reply_to, - $str_type=Intel::EMAIL, - $conn=c, - $where=SMTP::IN_REPLY_TO]); + { + local replytoparts = split_n(c$smtp$reply_to, /<.+>/, T, 1); + if ( |replytoparts| > 2 ) + { + Intel::seen([$indicator=replytoparts[2][1:-2], + 
$indicator_type=Intel::EMAIL, + $conn=c, + $where=SMTP::IN_REPLY_TO]); + } + } } } diff --git a/scripts/policy/frameworks/intel/ssl.bro b/scripts/policy/frameworks/intel/ssl.bro index 3f18a11e6e..e404c39e5b 100644 --- a/scripts/policy/frameworks/intel/ssl.bro +++ b/scripts/policy/frameworks/intel/ssl.bro @@ -10,14 +10,14 @@ event x509_certificate(c: connection, is_orig: bool, cert: X509, chain_idx: coun { local email = sub(cert$subject, /^.*emailAddress=/, ""); email = sub(email, /,.*$/, ""); - Intel::seen([$str=email, - $str_type=Intel::EMAIL, + Intel::seen([$indicator=email, + $indicator_type=Intel::EMAIL, $conn=c, $where=(is_orig ? SSL::IN_CLIENT_CERT : SSL::IN_SERVER_CERT)]); } - Intel::seen([$str=sha1_hash(der_cert), - $str_type=Intel::CERT_HASH, + Intel::seen([$indicator=sha1_hash(der_cert), + $indicator_type=Intel::CERT_HASH, $conn=c, $where=(is_orig ? SSL::IN_CLIENT_CERT : SSL::IN_SERVER_CERT)]); } @@ -27,8 +27,8 @@ event ssl_extension(c: connection, is_orig: bool, code: count, val: string) { if ( is_orig && SSL::extensions[code] == "server_name" && c?$ssl && c$ssl?$server_name ) - Intel::seen([$str=c$ssl$server_name, - $str_type=Intel::DOMAIN, + Intel::seen([$indicator=c$ssl$server_name, + $indicator_type=Intel::DOMAIN, $conn=c, $where=SSL::IN_SERVER_NAME]); } diff --git a/testing/btest/Baseline/scripts.base.frameworks.intel.cluster-transparency/manager-1.intel.log b/testing/btest/Baseline/scripts.base.frameworks.intel.cluster-transparency/manager-1.intel.log index 26efc039c4..00871e7d93 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.intel.cluster-transparency/manager-1.intel.log +++ b/testing/btest/Baseline/scripts.base.frameworks.intel.cluster-transparency/manager-1.intel.log @@ -3,8 +3,8 @@ #empty_field (empty) #unset_field - #path intel -#open 2012-10-03-20-20-39 -#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p seen.host seen.str seen.str_type seen.where sources -#types time string addr port addr port addr string enum enum 
table[string] -1349295639.424940 - - - - - 123.123.123.123 - - Intel::IN_ANYWHERE worker-1 -#close 2012-10-03-20-20-49 +#open 2013-07-19-17-05-48 +#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p seen.indicator seen.indicator_type seen.where sources +#types time string addr port addr port string enum enum table[string] +1374253548.038580 - - - - - 123.123.123.123 Intel::ADDR Intel::IN_ANYWHERE worker-1 +#close 2013-07-19-17-05-57 diff --git a/testing/btest/Baseline/scripts.base.frameworks.intel.input-and-match/broproc.intel.log b/testing/btest/Baseline/scripts.base.frameworks.intel.input-and-match/broproc.intel.log index d72e9efed3..8c01ae5c27 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.intel.input-and-match/broproc.intel.log +++ b/testing/btest/Baseline/scripts.base.frameworks.intel.input-and-match/broproc.intel.log @@ -3,9 +3,9 @@ #empty_field (empty) #unset_field - #path intel -#open 2012-10-03-20-18-05 -#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p seen.host seen.str seen.str_type seen.where sources -#types time string addr port addr port addr string enum enum table[string] -1349295485.114156 - - - - - - e@mail.com Intel::EMAIL SOMEWHERE source1 -1349295485.114156 - - - - - 1.2.3.4 - - SOMEWHERE source1 -#close 2012-10-03-20-18-05 +#open 2013-07-19-17-04-26 +#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p seen.indicator seen.indicator_type seen.where sources +#types time string addr port addr port string enum enum table[string] +1374253466.857185 - - - - - e@mail.com Intel::EMAIL SOMEWHERE source1 +1374253466.857185 - - - - - 1.2.3.4 Intel::ADDR SOMEWHERE source1 +#close 2013-07-19-17-04-26 diff --git a/testing/btest/Baseline/scripts.base.frameworks.intel.read-file-dist-cluster/manager-1.intel.log b/testing/btest/Baseline/scripts.base.frameworks.intel.read-file-dist-cluster/manager-1.intel.log index 8069bad528..70d92a3604 100644 --- 
a/testing/btest/Baseline/scripts.base.frameworks.intel.read-file-dist-cluster/manager-1.intel.log +++ b/testing/btest/Baseline/scripts.base.frameworks.intel.read-file-dist-cluster/manager-1.intel.log @@ -3,11 +3,11 @@ #empty_field (empty) #unset_field - #path intel -#open 2012-10-10-15-05-23 -#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p seen.host seen.str seen.str_type seen.where sources -#types time string addr port addr port addr string enum enum table[string] -1349881523.548946 - - - - - 1.2.3.4 - - Intel::IN_A_TEST source1 -1349881523.548946 - - - - - - e@mail.com Intel::EMAIL Intel::IN_A_TEST source1 -1349881524.567896 - - - - - 1.2.3.4 - - Intel::IN_A_TEST source1 -1349881524.567896 - - - - - - e@mail.com Intel::EMAIL Intel::IN_A_TEST source1 -#close 2012-10-10-15-05-24 +#open 2013-07-19-17-06-57 +#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p seen.indicator seen.indicator_type seen.where sources +#types time string addr port addr port string enum enum table[string] +1374253617.312158 - - - - - 1.2.3.4 Intel::ADDR Intel::IN_A_TEST source1 +1374253617.312158 - - - - - e@mail.com Intel::EMAIL Intel::IN_A_TEST source1 +1374253618.332565 - - - - - 1.2.3.4 Intel::ADDR Intel::IN_A_TEST source1 +1374253618.332565 - - - - - e@mail.com Intel::EMAIL Intel::IN_A_TEST source1 +#close 2013-07-19-17-07-06 diff --git a/testing/btest/scripts/base/frameworks/intel/cluster-transparency.bro b/testing/btest/scripts/base/frameworks/intel/cluster-transparency.bro index 3810de5d4b..4d977d475d 100644 --- a/testing/btest/scripts/base/frameworks/intel/cluster-transparency.bro +++ b/testing/btest/scripts/base/frameworks/intel/cluster-transparency.bro @@ -28,7 +28,7 @@ event remote_connection_handshake_done(p: event_peer) # Insert the data once both workers are connected. 
if ( Cluster::local_node_type() == Cluster::MANAGER && Cluster::worker_count == 2 ) { - Intel::insert([$host=1.2.3.4,$meta=[$source="manager"]]); + Intel::insert([$indicator="1.2.3.4", $indicator_type=Intel::ADDR, $meta=[$source="manager"]]); } } @@ -39,7 +39,7 @@ event Intel::cluster_new_item(item: Intel::Item) if ( ! is_remote_event() ) return; - print fmt("cluster_new_item: %s inserted by %s (from peer: %s)", item$host, item$meta$source, get_event_peer()$descr); + print fmt("cluster_new_item: %s inserted by %s (from peer: %s)", item$indicator, item$meta$source, get_event_peer()$descr); if ( ! sent_data ) { @@ -47,9 +47,9 @@ event Intel::cluster_new_item(item: Intel::Item) # full cluster is constructed. sent_data = T; if ( Cluster::node == "worker-1" ) - Intel::insert([$host=123.123.123.123,$meta=[$source="worker-1"]]); + Intel::insert([$indicator="123.123.123.123", $indicator_type=Intel::ADDR, $meta=[$source="worker-1"]]); if ( Cluster::node == "worker-2" ) - Intel::insert([$host=4.3.2.1,$meta=[$source="worker-2"]]); + Intel::insert([$indicator="4.3.2.1", $indicator_type=Intel::ADDR, $meta=[$source="worker-2"]]); } # We're forcing worker-2 to do a lookup when it has three intelligence items diff --git a/testing/btest/scripts/base/frameworks/intel/input-and-match.bro b/testing/btest/scripts/base/frameworks/intel/input-and-match.bro index f77f5c0f1d..7150d30993 100644 --- a/testing/btest/scripts/base/frameworks/intel/input-and-match.bro +++ b/testing/btest/scripts/base/frameworks/intel/input-and-match.bro @@ -5,10 +5,10 @@ # @TEST-EXEC: btest-diff broproc/intel.log @TEST-START-FILE intel.dat -#fields host net str str_type meta.source meta.desc meta.url -1.2.3.4 - - - source1 this host is just plain baaad http://some-data-distributor.com/1234 -1.2.3.4 - - - source1 this host is just plain baaad http://some-data-distributor.com/1234 -- - e@mail.com Intel::EMAIL source1 Phishing email source http://some-data-distributor.com/100000 +#fields indicator indicator_type 
meta.source meta.desc meta.url +1.2.3.4 Intel::ADDR source1 this host is just plain baaad http://some-data-distributor.com/1234 +1.2.3.4 Intel::ADDR source1 this host is just plain baaad http://some-data-distributor.com/1234 +e@mail.com Intel::EMAIL source1 Phishing email source http://some-data-distributor.com/100000 @TEST-END-FILE @load frameworks/communication/listen @@ -18,8 +18,8 @@ redef enum Intel::Where += { SOMEWHERE }; event do_it() { - Intel::seen([$str="e@mail.com", - $str_type=Intel::EMAIL, + Intel::seen([$indicator="e@mail.com", + $indicator_type=Intel::EMAIL, $where=SOMEWHERE]); Intel::seen([$host=1.2.3.4, diff --git a/testing/btest/scripts/base/frameworks/intel/read-file-dist-cluster.bro b/testing/btest/scripts/base/frameworks/intel/read-file-dist-cluster.bro index 6838736249..f336fe24b3 100644 --- a/testing/btest/scripts/base/frameworks/intel/read-file-dist-cluster.bro +++ b/testing/btest/scripts/base/frameworks/intel/read-file-dist-cluster.bro @@ -19,10 +19,10 @@ redef Cluster::nodes = { @TEST-END-FILE @TEST-START-FILE intel.dat -#fields host net str str_type meta.source meta.desc meta.url -1.2.3.4 - - - source1 this host is just plain baaad http://some-data-distributor.com/1234 -1.2.3.4 - - - source1 this host is just plain baaad http://some-data-distributor.com/1234 -- - e@mail.com Intel::EMAIL source1 Phishing email source http://some-data-distributor.com/100000 +#fields indicator indicator_type meta.source meta.desc meta.url +1.2.3.4 Intel::ADDR source1 this host is just plain baaad http://some-data-distributor.com/1234 +1.2.3.4 Intel::ADDR source1 this host is just plain baaad http://some-data-distributor.com/1234 +e@mail.com Intel::EMAIL source1 Phishing email source http://some-data-distributor.com/100000 @TEST-END-FILE @load base/frameworks/control @@ -41,7 +41,7 @@ redef enum Intel::Where += { event do_it() { Intel::seen([$host=1.2.3.4, $where=Intel::IN_A_TEST]); - Intel::seen([$str="e@mail.com", $str_type=Intel::EMAIL, 
$where=Intel::IN_A_TEST]); + Intel::seen([$indicator="e@mail.com", $indicator_type=Intel::EMAIL, $where=Intel::IN_A_TEST]); } event bro_init() From 9dae9dd3e26627d50c3a3620205eee3db88b2e4b Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Fri, 19 Jul 2013 13:53:15 -0400 Subject: [PATCH 43/73] Remove the intel insertion after heuristically detecting ssh bruteforcing. --- scripts/policy/protocols/ssh/detect-bruteforcing.bro | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/policy/protocols/ssh/detect-bruteforcing.bro b/scripts/policy/protocols/ssh/detect-bruteforcing.bro index 309905e939..ada418e61f 100644 --- a/scripts/policy/protocols/ssh/detect-bruteforcing.bro +++ b/scripts/policy/protocols/ssh/detect-bruteforcing.bro @@ -58,10 +58,6 @@ event bro_init() $msg=fmt("%s appears to be guessing SSH passwords (seen in %d connections).", key$host, r$num), $src=key$host, $identifier=cat(key$host)]); - # Insert the guesser into the intel framework. - Intel::insert([$host=key$host, - $meta=[$source="local", - $desc=fmt("Bro observed %d apparently failed SSH connections.", r$num)]]); }]); } From fd2e155d1af26086d40e12d38f564b7954f4597e Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 17:34:25 +0200 Subject: [PATCH 44/73] Tweak hasher interface. 
--- src/BloomFilter.cc | 34 +++++++------- src/BloomFilter.h | 31 +++++++------ src/CMakeLists.txt | 2 +- src/HashPolicy.cc | 77 -------------------------------- src/HashPolicy.h | 97 ---------------------------------------- src/Hasher.cc | 79 ++++++++++++++++++++++++++++++++ src/Hasher.h | 109 +++++++++++++++++++++++++++++++++++++++++++++ src/bro.bif | 8 ++-- 8 files changed, 225 insertions(+), 212 deletions(-) delete mode 100644 src/HashPolicy.cc delete mode 100644 src/HashPolicy.h create mode 100644 src/Hasher.cc create mode 100644 src/Hasher.h diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index c59092b1e4..f399bddeca 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -6,19 +6,19 @@ #include "Serializer.h" BloomFilter::BloomFilter() - : hash_(NULL) + : hasher_(NULL) { } -BloomFilter::BloomFilter(const HashPolicy* hash_policy) - : hash_(hash_policy) +BloomFilter::BloomFilter(const Hasher* hasher) + : hasher_(hasher) { } BloomFilter::~BloomFilter() { - if ( hash_ ) - delete hash_; + if ( hasher_ ) + delete hasher_; } bool BloomFilter::Serialize(SerialInfo* info) const @@ -35,9 +35,9 @@ BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hash_->K())) ) + if ( ! SERIALIZE(static_cast(hasher_->K())) ) return false; - return SERIALIZE_STR(hash_->Name().c_str(), hash_->Name().size()); + return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size()); } bool BloomFilter::DoUnserialize(UnserialInfo* info) @@ -49,7 +49,7 @@ bool BloomFilter::DoUnserialize(UnserialInfo* info) const char* name; if ( ! 
UNSERIALIZE_STR(&name, 0) ) return false; - hash_ = HashPolicy::Create(k, name); + hasher_ = Hasher::Create(k, name); delete [] name; return true; } @@ -70,7 +70,7 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same HashPolicy before proceeding. + // TODO: Ensure that x and y use the same Hasher before proceeding. BasicBloomFilter* result = new BasicBloomFilter(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; @@ -81,8 +81,8 @@ BasicBloomFilter::BasicBloomFilter() { } -BasicBloomFilter::BasicBloomFilter(const HashPolicy* hash_policy, size_t cells) - : BloomFilter(hash_policy), +BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) + : BloomFilter(hasher), bits_(new BitVector(cells)) { } @@ -102,13 +102,13 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return bits_ != NULL; } -void BasicBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) bits_->Set(h[i] % bits_->Size()); } -size_t BasicBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const { for ( size_t i = 0; i < h.size(); ++i ) if ( ! 
(*bits_)[h[i] % bits_->Size()] ) @@ -129,9 +129,9 @@ CountingBloomFilter::CountingBloomFilter() { } -CountingBloomFilter::CountingBloomFilter(const HashPolicy* hash_policy, +CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hash_policy) + : BloomFilter(hasher) { cells_ = new CounterVector(width, cells); } @@ -152,13 +152,13 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } -void CountingBloomFilter::AddImpl(const HashPolicy::hash_vector& h) +void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) cells_->Increment(h[i] % cells_->Size(), 1); } -size_t CountingBloomFilter::CountImpl(const HashPolicy::hash_vector& h) const +size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const { CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 189f4920b7..92f15c6070 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -3,7 +3,7 @@ #include #include "BitVector.h" -#include "HashPolicy.h" +#include "Hasher.h" class CounterVector; @@ -12,7 +12,7 @@ class CounterVector; */ class BloomFilter : public SerialObj { public: - // At this point we won't let the user choose the hash policy, but we might + // At this point we won't let the user choose the hasher, but we might // open up the interface in the future. virtual ~BloomFilter(); @@ -23,7 +23,7 @@ public: template void Add(const T& x) { - AddImpl(hash_->Hash(&x, sizeof(x))); + AddImpl((*hasher_)(x)); } /** @@ -36,7 +36,7 @@ public: template size_t Count(const T& x) const { - return CountImpl(hash_->Hash(&x, sizeof(x))); + return CountImpl((*hasher_)(x)); } bool Serialize(SerialInfo* info) const; @@ -50,15 +50,15 @@ protected: /** * Constructs a Bloom filter. * - * @param hash_policy The hash policy to use for this Bloom filter. + * @param hasher The hasher to use for this Bloom filter. 
*/ - BloomFilter(const HashPolicy* hash_policy); + BloomFilter(const Hasher* hasher); - virtual void AddImpl(const HashPolicy::hash_vector& hashes) = 0; - virtual size_t CountImpl(const HashPolicy::hash_vector& hashes) const = 0; + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; private: - const HashPolicy* hash_; + const Hasher* hasher_; }; /** @@ -98,15 +98,15 @@ public: /** * Constructs a basic Bloom filter with a given number of cells and capacity. */ - BasicBloomFilter(const HashPolicy* hash_policy, size_t cells); + BasicBloomFilter(const Hasher* hasher, size_t cells); protected: DECLARE_SERIAL(BasicBloomFilter); BasicBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: BitVector* bits_; @@ -120,16 +120,15 @@ public: static CountingBloomFilter* Merge(const CountingBloomFilter* x, const CountingBloomFilter* y); - CountingBloomFilter(const HashPolicy* hash_policy, size_t cells, - size_t width); + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); protected: DECLARE_SERIAL(CountingBloomFilter); CountingBloomFilter(); - virtual void AddImpl(const HashPolicy::hash_vector& h); - virtual size_t CountImpl(const HashPolicy::hash_vector& h) const; + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: CounterVector* cells_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f2c7ce6bad..87a3db3b62 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -279,7 +279,7 @@ set(bro_SRCS Frame.cc Func.cc Hash.cc - HashPolicy.cc + Hasher.cc ID.cc IntSet.cc IOSource.cc diff --git a/src/HashPolicy.cc b/src/HashPolicy.cc deleted file mode 100644 index 
7ce754be3c..0000000000 --- a/src/HashPolicy.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "HashPolicy.h" - -#include "digest.h" - -Hasher::Hasher(size_t seed, const std::string& extra) - : h_(compute_seed(seed, extra)) - { - } - -Hasher::hash_type Hasher::operator()(const void* x, size_t n) const - { - return n == 0 ? 0 : h_(x, n); - } - -size_t Hasher::compute_seed(size_t seed, const std::string& extra) - { - u_char digest[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) - { - unsigned int first_seed = initial_seed(); - sha256_update(&ctx, &first_seed, sizeof(first_seed)); - } - else - { - sha256_update(&ctx, extra.c_str(), extra.size()); - } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, digest); - return *reinterpret_cast(digest); - } - - -HashPolicy* HashPolicy::Create(size_t k, const std::string& name) - { - return new DefaultHashing(k, name); - } - -HashPolicy::HashPolicy(size_t k, const std::string& name) - : k_(k), name_(name) - { - } - -DefaultHashing::DefaultHashing(size_t k, const std::string& name) - : HashPolicy(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hashers_.push_back(Hasher(i, name)); - } - -HashPolicy::hash_vector DefaultHashing::Hash(const void* x, size_t n) const - { - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hashers_[i](x, n); - return h; - } - -DoubleHashing::DoubleHashing(size_t k, const std::string& name) - : HashPolicy(k, name), - hasher1_(1, name), - hasher2_(2, name) - { - } - -HashPolicy::hash_vector DoubleHashing::Hash(const void* x, size_t n) const - { - hash_type h1 = hasher1_(x, n); - hash_type h2 = hasher2_(x, n); - hash_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } - - diff --git a/src/HashPolicy.h b/src/HashPolicy.h deleted file mode 100644 index 7bdb968bfe..0000000000 --- a/src/HashPolicy.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef HashPolicy_h -#define HashPolicy_h - -#include "Hash.h" 
-#include "H3.h" - -/** - * A functor that computes a universal hash function. - */ -class Hasher { -public: - typedef hash_t hash_type; - - /** - * Constructs a hasher seeded by a given seed and optionally an extra - * descriptor. - * - * @param seed The seed to use. - * - * @param extra If not `NULL`, the hasher will not mix in the initial seed - * but instead use this NUL-terminated string as additional seed. - */ - Hasher(size_t seed, const std::string& extra = ""); - - /** - * Computes the hash digest of contiguous data. - * - * @param x A pointer to the beginning of the byte sequence to hash. - * - * @param n The length of the sequence pointed to by *x*. - */ - hash_type operator()(const void* x, size_t n) const; - -private: - static size_t compute_seed(size_t seed, const std::string& extra); - - H3 h_; -}; - -/** - * The abstract base class for hash policies that hash elements *k* times. - */ -class HashPolicy { -public: - /** - * Constructs the hashing policy used by the implementation. This factory - * function exists because the HashingPolicy class hierachy is not yet - * serializable. - */ - static HashPolicy* Create(size_t k, const std::string& name); - - typedef Hasher::hash_type hash_type; - typedef std::vector hash_vector; - - virtual ~HashPolicy() { } - - virtual hash_vector Hash(const void* x, size_t n) const = 0; - - size_t K() const { return k_; } - const std::string& Name() const { return name_; } - -protected: - HashPolicy(size_t k, const std::string& name); - -private: - const size_t k_; - std::string name_; -}; - -/** - * The default hashing policy. Performs *k* hash function computations. - */ -class DefaultHashing : public HashPolicy { -public: - DefaultHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const /* override */; - -private: - std::vector hashers_; -}; - -/** - * The *double-hashing* policy. Uses a linear combination of two hash functions. 
- */ -class DoubleHashing : public HashPolicy { -public: - DoubleHashing(size_t k, const std::string& name); - - virtual hash_vector Hash(const void* x, size_t n) const; - -private: - Hasher hasher1_; - Hasher hasher2_; -}; - -#endif diff --git a/src/Hasher.cc b/src/Hasher.cc new file mode 100644 index 0000000000..045adcd174 --- /dev/null +++ b/src/Hasher.cc @@ -0,0 +1,79 @@ +#include "Hasher.h" + +#include "digest.h" + +Hasher::UHF::UHF(size_t seed, const std::string& extra) + : h_(compute_seed(seed, extra)) + { + } + +Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } + +size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + else + { + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. 
+ return *reinterpret_cast(buf); + } + + +Hasher* Hasher::Create(size_t k, const std::string& name) + { + return new DefaultHasher(k, name); + } + +Hasher::Hasher(size_t k, const std::string& name) + : k_(k), name_(name) + { + } + +DefaultHasher::DefaultHasher(size_t k, const std::string& name) + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } + +Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DoubleHasher::DoubleHasher(size_t k, const std::string& name) + : Hasher(k, name), + h1_(1, name), + h2_(2, name) + { + } + +Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } + + diff --git a/src/Hasher.h b/src/Hasher.h new file mode 100644 index 0000000000..8d0af6b03f --- /dev/null +++ b/src/Hasher.h @@ -0,0 +1,109 @@ +#ifndef Hasher_h +#define Hasher_h + +#include "Hash.h" +#include "H3.h" + +/** + * The abstract base class for hashers, i.e., constructs which hash elements + * *k* times. + */ +class Hasher { +public: + typedef hash_t digest; + typedef std::vector digest_vector; + + /** + * Constructs the hashing policy used by the implementation. + * + * @todo This factory function exists because the Hasher class + * hierarchy is not yet serializable. + */ + static Hasher* Create(size_t k, const std::string& name); + + virtual ~Hasher() { } + + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + size_t K() const { return k_; } + const std::string& Name() const { return name_; } + +protected: + /** + * A universal hash function family.
+ */ + class UHF { + public: + /** + * Constructs an H3 hash function seeded with a given seed and an optional + * extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this parameter replaces the initial seed + * when computing the seed for this instance, i.e., the + * hash function mixes in this string instead of the + * initial Bro seed. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + digest hash(const void* x, size_t n) const; + + private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h_; + }; + + Hasher(size_t k, const std::string& name); + +private: + const size_t k_; + std::string name_; +}; + +/** + * The default hashing policy. Performs *k* hash function computations. + */ +class DefaultHasher : public Hasher { +public: + DefaultHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + std::vector hash_functions_; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash functions.
+ */ +class DoubleHasher : public Hasher { +public: + DoubleHasher(size_t k, const std::string& name); + + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + +private: + UHF h1_; + UHF h2_; +}; + +#endif diff --git a/src/bro.bif b/src/bro.bif index d0ce066139..71f8c0716f 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5008,8 +5008,8 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const HashPolicy* hp = HashPolicy::Create(optimal_k, name->CheckString()); - return new BloomFilterVal(new BasicBloomFilter(hp, cells)); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} ## Creates a counting Bloom filter. @@ -5029,11 +5029,11 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ - const HashPolicy* hp = HashPolicy::Create(k, name->CheckString()); + const Hasher* h = Hasher::Create(k, name->CheckString()); uint16 width = 0; while ( max >>= 1 ) ++width; - return new BloomFilterVal(new CountingBloomFilter(hp, cells, width)); + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); %} ## Adds an element to a Bloom filter. From 79a2e4b5d5c28076a8db1857d3ea6a8891e1ef7c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Sun, 21 Jul 2013 22:41:48 +0200 Subject: [PATCH 45/73] Implement missing CounterVector functions. 
--- src/CounterVector.cc | 66 ++++++++++++++++++++++++++++++++++++++------ src/CounterVector.h | 15 ++++++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 8ed4c30427..a661492313 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -1,5 +1,6 @@ #include "CounterVector.h" +#include #include "BitVector.h" #include "Serializer.h" @@ -15,23 +16,66 @@ CounterVector::~CounterVector() bool CounterVector::Increment(size_type cell, count_type value) { - // TODO - assert(! "not yet implemented"); + assert(cell < Size()); + assert(value != 0); + size_t lsb = cell * width_; + if (value >= Max()) + { + bool r = false; + for (size_t i = 0; i < width_; ++i) + if (! (*bits_)[lsb + i]) + { + bits_->Set(lsb + i); + if (! r) + r = true; + } + return r; + } + bool carry = false; + for (size_t i = 0; i < width_; ++i) + { + bool b1 = (*bits_)[lsb + i]; + bool b2 = value & (1 << i); + (*bits_)[lsb + i] ^= b2 != carry; // bit1 ^ bit2 ^ carry + carry = carry ? b1 || b2 : b1 && b2; + } + if (! carry) + return true; + for (size_t i = 0; i < width_; ++i) + bits_->Set(lsb + i); return false; } bool CounterVector::Decrement(size_type cell, count_type value) { - // TODO - assert(! "not yet implemented"); - return false; + assert(cell < Size()); + size_t lsb = cell * width_; + bool success; + while (value --> 0) + { + success = false; + for (size_t i = lsb; i < lsb + width_; ++i) + if ((*bits_)[i]) + { + bits_->Reset(i); + while (i && i > lsb) + bits_->Set(--i); + success = true; + break; + } + } + return success; } CounterVector::count_type CounterVector::Count(size_type cell) const { - // TODO - assert(! 
"not yet implemented"); - return 0; + assert(cell < Size()); + size_t cnt = 0, order = 1; + size_t lsb = cell * width_; + for (size_t i = lsb; i < lsb + width_; ++i, order <<= 1) + if ((*bits_)[i]) + cnt |= order; + return cnt; } CounterVector::size_type CounterVector::Size() const @@ -39,6 +83,12 @@ CounterVector::size_type CounterVector::Size() const return bits_->Blocks() / width_; } +size_t CounterVector::Max() const + { + return std::numeric_limits::max() + >> (std::numeric_limits::digits - width_); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/CounterVector.h b/src/CounterVector.h index ecc8fe90e0..868beaca9b 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -19,6 +19,8 @@ public: * @param width The number of bits that each cell occupies. * * @param cells The number of cells in the bitvector. + * + * @pre `cells > 0 && width > 0` */ CounterVector(size_t width, size_t cells = 1024); @@ -32,6 +34,8 @@ public: * @param value The value to add to the current counter in *cell*. * * @return `true` if adding *value* to the counter in *cell* succeeded. + * + * @pre `cell < Size()` */ bool Increment(size_type cell, count_type value); @@ -43,6 +47,8 @@ public: * @param value The value to subtract from the current counter in *cell*. * * @return `true` if subtracting *value* from the counter in *cell* succeeded. + * + * @pre `cell < Size()` */ bool Decrement(size_type cell, count_type value); @@ -52,6 +58,8 @@ public: * @param cell The cell index to retrieve the count for. * * @return The counter associated with *cell*. + * + * @pre `cell < Size()` */ count_type Count(size_type cell) const; @@ -62,6 +70,13 @@ public: */ size_type Size() const; + /** + * Computes the maximum counter value. + * + * @return The maximum counter value based on the width. 
+ */ + size_t Max() const; + bool Serialize(SerialInfo* info) const; static CounterVector* Unserialize(UnserialInfo* info); From 7a0240694ec69506b0789029ba48bb56ae703206 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 14:07:47 +0200 Subject: [PATCH 46/73] Fix and test counting Bloom filter. --- src/BloomFilter.cc | 9 ++++--- src/CounterVector.cc | 5 ++-- src/CounterVector.h | 4 +-- src/bro.bif | 8 +++++- .../btest/Baseline/bifs.bloomfilter/output | 6 +++++ testing/btest/bifs/bloomfilter.bro | 26 ++++++++++++++++++- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index f399bddeca..3c7bac80f1 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -131,9 +131,9 @@ CountingBloomFilter::CountingBloomFilter() CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width) - : BloomFilter(hasher) + : BloomFilter(hasher), + cells_(new CounterVector(width, cells)) { - cells_ = new CounterVector(width, cells); } @@ -152,10 +152,12 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) return cells_ != NULL; } +// TODO: Use partitioning in add/count to allow for reusing CMS bounds. + void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) { for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % cells_->Size(), 1); + cells_->Increment(h[i] % cells_->Size()); } size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const @@ -164,7 +166,6 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const std::numeric_limits::max(); for ( size_t i = 0; i < h.size(); ++i ) { - // TODO: Use partitioning. 
CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); if ( cnt < min ) min = cnt; diff --git a/src/CounterVector.cc b/src/CounterVector.cc index a661492313..831b95386f 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -5,7 +5,8 @@ #include "Serializer.h" CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), width_(width) + : bits_(new BitVector(width * cells)), + width_(width) { } @@ -80,7 +81,7 @@ CounterVector::count_type CounterVector::Count(size_type cell) const CounterVector::size_type CounterVector::Size() const { - return bits_->Blocks() / width_; + return bits_->Size() / width_; } size_t CounterVector::Max() const diff --git a/src/CounterVector.h b/src/CounterVector.h index 868beaca9b..2d99bb44d8 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -37,7 +37,7 @@ public: * * @pre `cell < Size()` */ - bool Increment(size_type cell, count_type value); + bool Increment(size_type cell, count_type value = 1); /** * Decrements a given cell. @@ -50,7 +50,7 @@ public: * * @pre `cell < Size()` */ - bool Decrement(size_type cell, count_type value); + bool Decrement(size_type cell, count_type value = 1); /** * Retrieves the counter of a given cell. 
diff --git a/src/bro.bif b/src/bro.bif index 71f8c0716f..a33a2248dd 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -5029,8 +5029,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count, function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ + if ( max == 0 ) + { + reporter->Error("max counter value must be greater than 0"); + return NULL; + } + const Hasher* h = Hasher::Create(k, name->CheckString()); - uint16 width = 0; + uint16 width = 1; while ( max >>= 1 ) ++width; return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 65aaa8b07c..80847a81b9 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -6,3 +6,9 @@ 1 1 1 +1 +2 +3 +3 +2 +3 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3ff6a6668e..ab0bf86c22 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -1,7 +1,7 @@ # @TEST-EXEC: bro -b %INPUT >output # @TEST-EXEC: btest-diff output -event bro_init() +function test_basic_bloom_filter() { # Basic usage with counts. 
local bf_cnt = bloomfilter_basic_init(0.1, 1000); @@ -36,3 +36,27 @@ event bro_init() local bf_bug0 = bloomfilter_basic_init(-0.5, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42); } + +function test_counting_bloom_filter() + { + local bf = bloomfilter_counting_init(3, 16, 3); + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 1 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 2 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 3 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # still 3 + + bloomfilter_add(bf, "bar"); + bloomfilter_add(bf, "bar"); + print bloomfilter_lookup(bf, "bar"); # 2 + print bloomfilter_lookup(bf, "foo"); # still 3 + } + +event bro_init() + { + test_basic_bloom_filter(); + test_counting_bloom_filter(); + } From a3c61fe7eb6c43622de17df0e818def20cab7e90 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 15:39:13 +0200 Subject: [PATCH 47/73] Use half adder for bitwise addition and subtraction. --- src/CounterVector.cc | 53 +++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 831b95386f..f46fae1b98 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -20,52 +20,35 @@ bool CounterVector::Increment(size_type cell, count_type value) assert(cell < Size()); assert(value != 0); size_t lsb = cell * width_; - if (value >= Max()) - { - bool r = false; - for (size_t i = 0; i < width_; ++i) - if (! (*bits_)[lsb + i]) - { - bits_->Set(lsb + i); - if (! r) - r = true; - } - return r; - } bool carry = false; - for (size_t i = 0; i < width_; ++i) - { + for ( size_t i = 0; i < width_; ++i ) + { bool b1 = (*bits_)[lsb + i]; bool b2 = value & (1 << i); - (*bits_)[lsb + i] ^= b2 != carry; // bit1 ^ bit2 ^ carry - carry = carry ? b1 || b2 : b1 && b2; - } - if (! 
carry) - return true; - for (size_t i = 0; i < width_; ++i) - bits_->Set(lsb + i); - return false; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + if ( carry ) + for ( size_t i = 0; i < width_; ++i ) + bits_->Set(lsb + i); + return ! carry; } bool CounterVector::Decrement(size_type cell, count_type value) { assert(cell < Size()); + assert(value != 0); + value = ~value + 1; // A - B := A + ~B + 1 + bool carry = false; size_t lsb = cell * width_; - bool success; - while (value --> 0) + for ( size_t i = 0; i < width_; ++i ) { - success = false; - for (size_t i = lsb; i < lsb + width_; ++i) - if ((*bits_)[i]) - { - bits_->Reset(i); - while (i && i > lsb) - bits_->Set(--i); - success = true; - break; - } + bool b1 = bits_[lsb + i]; + bool b2 = value & (1 << i); + bits_[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); } - return success; + return carry; } CounterVector::count_type CounterVector::Count(size_type cell) const From 9c2f57a9d9d5667d05e43efd3c8541ff9d33382a Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 16:36:54 +0200 Subject: [PATCH 48/73] Make counter vectors mergeable. 
--- src/CounterVector.cc | 42 ++++++++++++++++++++++++++++++++++++++++-- src/CounterVector.h | 27 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/CounterVector.cc b/src/CounterVector.cc index f46fae1b98..75c62b208a 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -43,9 +43,9 @@ bool CounterVector::Decrement(size_type cell, count_type value) size_t lsb = cell * width_; for ( size_t i = 0; i < width_; ++i ) { - bool b1 = bits_[lsb + i]; + bool b1 = (*bits_)[lsb + i]; bool b2 = value & (1 << i); - bits_[lsb + i] = b1 ^ b2 ^ carry; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); } return carry; @@ -67,12 +67,50 @@ CounterVector::size_type CounterVector::Size() const return bits_->Size() / width_; } +size_t CounterVector::Width() const + { + return width_; + } + size_t CounterVector::Max() const { return std::numeric_limits::max() >> (std::numeric_limits::digits - width_); } +CounterVector& CounterVector::Merge(const CounterVector& other) + { + assert(Size() == other.Size()); + assert(Width() == other.Width()); + for ( size_t cell = 0; cell < Size(); ++cell ) + { + size_t lsb = cell * width_; + bool carry = false; + for ( size_t i = 0; i < width_; ++i ) + { + bool b1 = (*bits_)[lsb + i]; + bool b2 = (*other.bits_)[lsb + i]; + (*bits_)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + if ( carry ) + for ( size_t i = 0; i < width_; ++i ) + bits_->Set(lsb + i); + } + return *this; + } + +CounterVector& CounterVector::operator|=(const CounterVector& other) +{ + return Merge(other); +} + +CounterVector operator|(const CounterVector& x, const CounterVector& y) +{ + CounterVector cv(x); + return cv |= y; +} + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/CounterVector.h b/src/CounterVector.h index 2d99bb44d8..4ab221ff6b 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h 
@@ -70,6 +70,13 @@ public: */ size_type Size() const; + /** + * Retrieves the counter width. + * + * @return The number of bits per counter. + */ + size_t Width() const; + /** * Computes the maximum counter value. * @@ -77,6 +84,26 @@ public: */ size_t Max() const; + /** + * Merges another counter vector into this instance by *adding* the counters + * of each cells. + * + * @param other The counter vector to merge into this instance. + * + * @return A reference to `*this`. + * + * @pre `Size() == other.Size() && Width() == other.Width()` + */ + CounterVector& Merge(const CounterVector& other); + + /** + * An alias for ::Merge. + */ + CounterVector& operator|=(const CounterVector& other); + + friend CounterVector operator|(const CounterVector& x, + const CounterVector& y); + bool Serialize(SerialInfo* info) const; static CounterVector* Unserialize(UnserialInfo* info); From eb64f5f9616e84295bc17537e8db57ae4f089c41 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 18:03:55 +0200 Subject: [PATCH 49/73] Make hash functions equality comparable. --- src/H3.h | 12 ++++++ src/Hasher.cc | 101 +++++++++++++++++++++++++++++++------------------- src/Hasher.h | 18 +++++++++ 3 files changed, 93 insertions(+), 38 deletions(-) diff --git a/src/H3.h b/src/H3.h index e2dc865147..123dd6f374 100644 --- a/src/H3.h +++ b/src/H3.h @@ -58,6 +58,7 @@ #define H3_H #include +#include // The number of values representable by a byte. #define H3_BYTE_RANGE (UCHAR_MAX+1) @@ -112,6 +113,17 @@ public: return result; } + + friend bool operator==(const H3& x, const H3& y) + { + return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE); + } + + friend bool operator!=(const H3& x, const H3& y) + { + return ! 
(x == y); + } + private: T byte_lookup[N][H3_BYTE_RANGE]; }; diff --git a/src/Hasher.cc b/src/Hasher.cc index 045adcd174..7a8d9a67e0 100644 --- a/src/Hasher.cc +++ b/src/Hasher.cc @@ -8,56 +8,69 @@ Hasher::UHF::UHF(size_t seed, const std::string& extra) } Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const - { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h_(x, n); - } + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h_(x, n); + } size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) - { - u_char buf[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - sha256_init(&ctx); - if ( extra.empty() ) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + if ( extra.empty() ) { unsigned int first_seed = initial_seed(); sha256_update(&ctx, &first_seed, sizeof(first_seed)); } else { - sha256_update(&ctx, extra.c_str(), extra.size()); + sha256_update(&ctx, extra.c_str(), extra.size()); + } + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + // Take the first sizeof(size_t) bytes as seed. + return *reinterpret_cast(buf); } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - // Take the first sizeof(size_t) bytes as seed. 
- return *reinterpret_cast(buf); - } Hasher* Hasher::Create(size_t k, const std::string& name) - { - return new DefaultHasher(k, name); - } + { + return new DefaultHasher(k, name); + } Hasher::Hasher(size_t k, const std::string& name) - : k_(k), name_(name) + : k_(k), name_(name) { } DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) - { - for ( size_t i = 0; i < k; ++i ) - hash_functions_.push_back(UHF(i, name)); - } + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions_.push_back(UHF(i, name)); + } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const - { - digest_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hash_functions_[i](x, n); - return h; - } + { + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions_[i](x, n); + return h; + } + +DefaultHasher* DefaultHasher::Clone() const + { + return new DefaultHasher(*this); + } + +bool DefaultHasher::Equals(const Hasher* other) const /* final */ + { + if ( typeid(*this) != typeid(*other) ) + return false; + const DefaultHasher* o = static_cast(other); + return hash_functions_ == o->hash_functions_; + } DoubleHasher::DoubleHasher(size_t k, const std::string& name) : Hasher(k, name), @@ -67,13 +80,25 @@ DoubleHasher::DoubleHasher(size_t k, const std::string& name) } Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const - { - digest h1 = h1_(x, n); - digest h2 = h2_(x, n); - digest_vector h(K(), 0); - for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; - return h; - } + { + digest h1 = h1_(x, n); + digest h2 = h2_(x, n); + digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = h1 + i * h2; + return h; + } +DoubleHasher* DoubleHasher::Clone() const + { + return new DoubleHasher(*this); + } + +bool DoubleHasher::Equals(const Hasher* other) const /* final */ + { + if ( typeid(*this) != typeid(*other) ) + return false; + const DoubleHasher* o = 
static_cast(other); + return h1_ == o->h1_ && h2_ == o->h2_; + } diff --git a/src/Hasher.h b/src/Hasher.h index 8d0af6b03f..12393e7217 100644 --- a/src/Hasher.h +++ b/src/Hasher.h @@ -31,6 +31,10 @@ public: virtual digest_vector Hash(const void* x, size_t n) const = 0; + virtual Hasher* Clone() const = 0; + + virtual bool Equals(const Hasher* other) const = 0; + size_t K() const { return k_; } const std::string& Name() const { return name_; } @@ -64,6 +68,16 @@ protected: return hash(x, n); } + friend bool operator==(const UHF& x, const UHF& y) + { + return x.h_ == y.h_; + } + + friend bool operator!=(const UHF& x, const UHF& y) + { + return ! (x == y); + } + digest hash(const void* x, size_t n) const; private: @@ -87,6 +101,8 @@ public: DefaultHasher(size_t k, const std::string& name); virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DefaultHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: std::vector hash_functions_; @@ -100,6 +116,8 @@ public: DoubleHasher(size_t k, const std::string& name); virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DoubleHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: UHF h1_; From a39f980cd493e64a6bb4016c47923e8754b059dc Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Mon, 22 Jul 2013 18:11:12 +0200 Subject: [PATCH 50/73] Implement and test Bloom filter merging. 
--- src/BloomFilter.cc | 22 ++++++++++++++---- src/BloomFilter.h | 1 - src/CounterVector.cc | 6 +++++ src/CounterVector.h | 8 +++++++ src/Hasher.cc | 4 ++-- src/OpaqueVal.cc | 2 +- src/OpaqueVal.h | 21 ++++++++++++++--- .../btest/Baseline/bifs.bloomfilter/output | 7 ++++++ testing/btest/bifs/bloomfilter.bro | 23 ++++++++++++++++++- 9 files changed, 81 insertions(+), 13 deletions(-) diff --git a/src/BloomFilter.cc b/src/BloomFilter.cc index 3c7bac80f1..889c7bafe1 100644 --- a/src/BloomFilter.cc +++ b/src/BloomFilter.cc @@ -70,8 +70,13 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { - // TODO: Ensure that x and y use the same Hasher before proceeding. + if ( ! x->hasher_->Equals(y->hasher_) ) + { + reporter->InternalError("incompatible hashers during Bloom filter merge"); + return NULL; + } BasicBloomFilter* result = new BasicBloomFilter(); + result->hasher_ = x->hasher_->Clone(); result->bits_ = new BitVector(*x->bits_ | *y->bits_); return result; } @@ -119,10 +124,17 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, const CountingBloomFilter* y) -{ - assert(! "not yet implemented"); - return NULL; -} + { + if ( ! 
x->hasher_->Equals(y->hasher_) ) + { + reporter->InternalError("incompatible hashers during Bloom filter merge"); + return NULL; + } + CountingBloomFilter* result = new CountingBloomFilter(); + result->hasher_ = x->hasher_->Clone(); + result->cells_ = new CounterVector(*x->cells_ | *y->cells_); + return result; + } CountingBloomFilter::CountingBloomFilter() : cells_(NULL) diff --git a/src/BloomFilter.h b/src/BloomFilter.h index 92f15c6070..070aa2dc25 100644 --- a/src/BloomFilter.h +++ b/src/BloomFilter.h @@ -57,7 +57,6 @@ protected: virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; -private: const Hasher* hasher_; }; diff --git a/src/CounterVector.cc b/src/CounterVector.cc index 75c62b208a..cf3083de9e 100644 --- a/src/CounterVector.cc +++ b/src/CounterVector.cc @@ -10,6 +10,12 @@ CounterVector::CounterVector(size_t width, size_t cells) { } +CounterVector::CounterVector(const CounterVector& other) + : bits_(new BitVector(*other.bits_)), + width_(other.width_) + { + } + CounterVector::~CounterVector() { delete bits_; diff --git a/src/CounterVector.h b/src/CounterVector.h index 4ab221ff6b..eced5956d4 100644 --- a/src/CounterVector.h +++ b/src/CounterVector.h @@ -9,6 +9,7 @@ class BitVector; * A vector of counters, each of which have a fixed number of bits. */ class CounterVector : public SerialObj { + CounterVector& operator=(const CounterVector&); public: typedef size_t size_type; typedef uint64 count_type; @@ -24,6 +25,13 @@ public: */ CounterVector(size_t width, size_t cells = 1024); + /** + * Copy-constructs a counter vector. + * + * @param other The counter vector to copy. 
+ */ + CounterVector(const CounterVector& other); + ~CounterVector(); /** diff --git a/src/Hasher.cc b/src/Hasher.cc index 7a8d9a67e0..2a889c7e09 100644 --- a/src/Hasher.cc +++ b/src/Hasher.cc @@ -64,7 +64,7 @@ DefaultHasher* DefaultHasher::Clone() const return new DefaultHasher(*this); } -bool DefaultHasher::Equals(const Hasher* other) const /* final */ +bool DefaultHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; @@ -94,7 +94,7 @@ DoubleHasher* DoubleHasher::Clone() const return new DoubleHasher(*this); } -bool DoubleHasher::Equals(const Hasher* other) const /* final */ +bool DoubleHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 5a673c4a40..36038d679a 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,6 +1,5 @@ #include "OpaqueVal.h" -#include "BloomFilter.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -587,6 +586,7 @@ BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, else if ( (result = DoMerge(x, y)) ) return result; + reporter->InternalError("failed to merge Bloom filters"); return NULL; } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 2362fdacfc..22c3dbfade 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -3,6 +3,7 @@ #ifndef OPAQUEVAL_H #define OPAQUEVAL_H +#include "BloomFilter.h" #include "RandTest.h" #include "Val.h" #include "digest.h" @@ -137,9 +138,23 @@ private: static BloomFilterVal* DoMerge(const BloomFilterVal* x, const BloomFilterVal* y) { - const T* a = dynamic_cast(x->bloom_filter_); - const T* b = dynamic_cast(y->bloom_filter_); - return a && b ? 
new BloomFilterVal(T::Merge(a, b)) : NULL; + if ( typeid(*x->bloom_filter_) != typeid(*y->bloom_filter_) ) + { + reporter->InternalError("cannot merge different Bloom filter types"); + return NULL; + } + if ( typeid(T) != typeid(*x->bloom_filter_) ) + return NULL; + const T* a = static_cast(x->bloom_filter_); + const T* b = static_cast(y->bloom_filter_); + BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); + assert(merged); + if ( ! merged->Typify(x->Type()) ) + { + reporter->InternalError("failed to set type on merged Bloom filter"); + return NULL; + } + return merged; } BroType* type_; diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 80847a81b9..4fe2ae1ecc 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -7,8 +7,15 @@ 1 1 1 +1 +1 +1 +1 2 3 3 2 3 +3 +3 +2 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index ab0bf86c22..f69ddbda0c 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -35,11 +35,21 @@ function test_basic_bloom_filter() # Invalid parameters. 
local bf_bug0 = bloomfilter_basic_init(-0.5, 42); local bf_bug1 = bloomfilter_basic_init(1.1, 42); + + # Merging + local bf_cnt2 = bloomfilter_basic_init(0.1, 1000); + bloomfilter_add(bf_cnt2, 42); + bloomfilter_add(bf_cnt, 100); + local bf_merged = bloomfilter_merge(bf_cnt, bf_cnt2); + print bloomfilter_lookup(bf_merged, 42); + print bloomfilter_lookup(bf_merged, 84); + print bloomfilter_lookup(bf_merged, 100); + print bloomfilter_lookup(bf_merged, 168); } function test_counting_bloom_filter() { - local bf = bloomfilter_counting_init(3, 16, 3); + local bf = bloomfilter_counting_init(3, 32, 3); bloomfilter_add(bf, "foo"); print bloomfilter_lookup(bf, "foo"); # 1 bloomfilter_add(bf, "foo"); @@ -49,10 +59,21 @@ function test_counting_bloom_filter() bloomfilter_add(bf, "foo"); print bloomfilter_lookup(bf, "foo"); # still 3 + bloomfilter_add(bf, "bar"); bloomfilter_add(bf, "bar"); print bloomfilter_lookup(bf, "bar"); # 2 print bloomfilter_lookup(bf, "foo"); # still 3 + + # Merging + local bf2 = bloomfilter_counting_init(3, 32, 3); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "bar"); + local bf_merged = bloomfilter_merge(bf, bf2); + print bloomfilter_lookup(bf_merged, "foo"); + print bloomfilter_lookup(bf_merged, "bar"); + print bloomfilter_lookup(bf_merged, "baz"); } event bro_init() From 325f0c2a3f087508dc0817739b9c312bcc5873d5 Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Mon, 22 Jul 2013 14:15:35 -0500 Subject: [PATCH 51/73] Coverage test fixes and whitespace/doc tweaks. 
--- doc/scripts/DocSourcesList.cmake | 3 ++ scripts/base/utils/active-http.bro | 26 ++++++------ scripts/base/utils/exec.bro | 40 +++++++++---------- .../canonified_loaded_scripts.log | 13 +++--- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/doc/scripts/DocSourcesList.cmake b/doc/scripts/DocSourcesList.cmake index 529b03ca83..bd264bfcb4 100644 --- a/doc/scripts/DocSourcesList.cmake +++ b/doc/scripts/DocSourcesList.cmake @@ -164,9 +164,12 @@ rest_target(${psd} base/protocols/ssl/main.bro) rest_target(${psd} base/protocols/ssl/mozilla-ca-list.bro) rest_target(${psd} base/protocols/syslog/consts.bro) rest_target(${psd} base/protocols/syslog/main.bro) +rest_target(${psd} base/utils/active-http.bro) rest_target(${psd} base/utils/addrs.bro) rest_target(${psd} base/utils/conn-ids.bro) +rest_target(${psd} base/utils/dir.bro) rest_target(${psd} base/utils/directions-and-hosts.bro) +rest_target(${psd} base/utils/exec.bro) rest_target(${psd} base/utils/files.bro) rest_target(${psd} base/utils/numbers.bro) rest_target(${psd} base/utils/paths.bro) diff --git a/scripts/base/utils/active-http.bro b/scripts/base/utils/active-http.bro index 5522cc108a..3f475a378b 100644 --- a/scripts/base/utils/active-http.bro +++ b/scripts/base/utils/active-http.bro @@ -1,21 +1,21 @@ -##! A module for performing active HTTP requests and +##! A module for performing active HTTP requests and ##! getting the reply at runtime. @load ./exec module ActiveHTTP; - + export { ## The default timeout for HTTP requests. const default_max_time = 1min &redef; - + ## The default HTTP method/verb to use for requests. const default_method = "GET" &redef; - - type Response: record { + + type Response: record { ## Numeric response code from the server. code: count; - ## String response messgae from the server. + ## String response message from the server. msg: string; ## Full body of the response. body: string &optional; @@ -29,24 +29,24 @@ export { ## The HTTP method/verb to use for the request. 
method: string &default=default_method; ## Data to send to the server in the client body. Keep in - ## mind that you will probably need to set the $method field + ## mind that you will probably need to set the *method* field ## to "POST" or "PUT". client_data: string &optional; - ## Arbitrary headers to pass to the server. Some headers + ## Arbitrary headers to pass to the server. Some headers ## will be included by libCurl. #custom_headers: table[string] of string &optional; ## Timeout for the request. max_time: interval &default=default_max_time; - ## Additional curl command line arguments. Be very careful + ## Additional curl command line arguments. Be very careful ## with this option since shell injection could take place ## if careful handling of untrusted data is not applied. addl_curl_args: string &optional; }; ## Perform an HTTP request according to the :bro:type:`Request` record. - ## This is an asynchronous function and must be called within a "when" + ## This is an asynchronous function and must be called within a "when" ## statement. - ## + ## ## req: A record instance representing all options for an HTTP request. ## ## Returns: A record with the full response message. @@ -55,7 +55,7 @@ export { function request2curl(r: Request, bodyfile: string, headersfile: string): string { - local cmd = fmt("curl -s -g -o \"%s\" -D \"%s\" -X \"%s\"", + local cmd = fmt("curl -s -g -o \"%s\" -D \"%s\" -X \"%s\"", str_shell_escape(bodyfile), str_shell_escape(headersfile), str_shell_escape(r$method)); @@ -91,7 +91,7 @@ function request(req: Request): ActiveHTTP::Response # If there is no response line then nothing else will work either. if ( ! 
(result?$files && headersfile in result$files) ) Reporter::error(fmt("There was a failure when requesting \"%s\" with ActiveHTTP.", req$url)); - + local headers = result$files[headersfile]; for ( i in headers ) { diff --git a/scripts/base/utils/exec.bro b/scripts/base/utils/exec.bro index 45cd8cb287..f896a68064 100644 --- a/scripts/base/utils/exec.bro +++ b/scripts/base/utils/exec.bro @@ -1,6 +1,4 @@ ##! A module for executing external command line programs. -##! This requires code that is still in topic branches and -##! definitely won't currently work on any released version of Bro. @load base/frameworks/input @@ -8,15 +6,13 @@ module Exec; export { type Command: record { - ## The command line to execute. - ## Use care to avoid injection attacks! + ## The command line to execute. Use care to avoid injection attacks. + ## I.e. if the command uses untrusted/variable data, sanitize it. cmd: string; - ## Provide standard in to the program as a - ## string. + ## Provide standard in to the program as a string. stdin: string &default=""; - ## If additional files are required to be read - ## in as part of the output of the command they - ## can be defined here. + ## If additional files are required to be read in as part of the output + ## of the command they can be defined here. read_files: set[string] &optional; }; @@ -27,7 +23,7 @@ export { signal_exit: bool &default=F; ## Each line of standard out. stdout: vector of string &optional; - ## Each line of standard error. + ## Each line of standard error. stderr: vector of string &optional; ## If additional files were requested to be read in ## the content of the files will be available here. @@ -35,7 +31,7 @@ export { }; ## Function for running command line programs and getting - ## output. This is an asynchronous function which is meant + ## output. This is an asynchronous function which is meant ## to be run with the `when` statement. ## ## cmd: The command to run. Use care to avoid injection attacks! 
@@ -56,12 +52,12 @@ redef record Command += { global results: table[string] of Result = table(); global finished_commands: set[string]; global currently_tracked_files: set[string] = set(); -type OneLine: record { +type OneLine: record { s: string; is_stderr: bool; }; -type FileLine: record { +type FileLine: record { s: string; }; @@ -93,7 +89,7 @@ event Exec::file_line(description: Input::EventDescription, tpe: Input::Event, s local result = results[name]; if ( ! result?$files ) result$files = table(); - + if ( track_file !in result$files ) result$files[track_file] = vector(s); else @@ -136,16 +132,16 @@ function run(cmd: Command): Result } } - local config_strings: table[string] of string = { + local config_strings: table[string] of string = { ["stdin"] = cmd$stdin, ["read_stderr"] = "1", }; - Input::add_event([$name=cmd$uid, - $source=fmt("%s |", cmd$cmd), - $reader=Input::READER_RAW, - $fields=Exec::OneLine, - $ev=Exec::line, - $want_record=F, + Input::add_event([$name=cmd$uid, + $source=fmt("%s |", cmd$cmd), + $reader=Input::READER_RAW, + $fields=Exec::OneLine, + $ev=Exec::line, + $want_record=F, $config=config_strings]); return when ( cmd$uid in finished_commands ) @@ -164,4 +160,4 @@ event bro_done() { system(fmt("rm \"%s\"", str_shell_escape(fname))); } - } \ No newline at end of file + } diff --git a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index 999fd7c841..37f1c739f8 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-10-21-18-31 +#open 2013-07-22-16-01-22 #fields name #types string scripts/base/init-bare.bro @@ -90,12 +90,17 @@ scripts/base/init-bare.bro scripts/base/init-default.bro 
scripts/base/utils/site.bro scripts/base/utils/patterns.bro + scripts/base/utils/active-http.bro + scripts/base/utils/exec.bro scripts/base/utils/addrs.bro scripts/base/utils/conn-ids.bro + scripts/base/utils/dir.bro + scripts/base/frameworks/reporter/__load__.bro + scripts/base/frameworks/reporter/main.bro + scripts/base/utils/paths.bro scripts/base/utils/directions-and-hosts.bro scripts/base/utils/files.bro scripts/base/utils/numbers.bro - scripts/base/utils/paths.bro scripts/base/utils/queue.bro scripts/base/utils/strings.bro scripts/base/utils/thresholds.bro @@ -129,8 +134,6 @@ scripts/base/init-default.bro scripts/base/frameworks/intel/__load__.bro scripts/base/frameworks/intel/main.bro scripts/base/frameworks/intel/input.bro - scripts/base/frameworks/reporter/__load__.bro - scripts/base/frameworks/reporter/main.bro scripts/base/frameworks/sumstats/__load__.bro scripts/base/frameworks/sumstats/main.bro scripts/base/frameworks/sumstats/plugins/__load__.bro @@ -195,4 +198,4 @@ scripts/base/init-default.bro scripts/base/protocols/tunnels/__load__.bro scripts/base/misc/find-checksum-offloading.bro scripts/policy/misc/loaded-scripts.bro -#close 2013-07-10-21-18-31 +#close 2013-07-22-16-01-22 From 73eb87a41ef5d79f5f84d8aebe42ce9b61aadc5a Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Tue, 23 Jul 2013 14:16:39 -0500 Subject: [PATCH 52/73] Exec module changes/fixes. - Give Dir::monitor() a param for the polling interval, so different dirs can be monitored at different frequencies. - Fix race in Exec::run() when reading extra output files produced by a process -- it was possible for Exec::run() to return before all extra output files had been fully read. - Add test cases. 
--- scripts/base/utils/active-http.bro | 3 + scripts/base/utils/dir.bro | 34 +++++--- scripts/base/utils/exec.bro | 85 ++++++++++++------- .../bro..stdout | 5 ++ .../scripts.base.utils.dir/bro..stdout | 10 +++ .../scripts.base.utils.exec/bro..stdout | 7 ++ .../btest/scripts/base/utils/active-http.test | 25 ++++++ testing/btest/scripts/base/utils/dir.test | 58 +++++++++++++ testing/btest/scripts/base/utils/exec.test | 74 ++++++++++++++++ testing/scripts/httpd.py | 40 +++++++++ 10 files changed, 299 insertions(+), 42 deletions(-) create mode 100644 testing/btest/Baseline/scripts.base.utils.active-http/bro..stdout create mode 100644 testing/btest/Baseline/scripts.base.utils.dir/bro..stdout create mode 100644 testing/btest/Baseline/scripts.base.utils.exec/bro..stdout create mode 100644 testing/btest/scripts/base/utils/active-http.test create mode 100644 testing/btest/scripts/base/utils/dir.test create mode 100644 testing/btest/scripts/base/utils/exec.test create mode 100755 testing/scripts/httpd.py diff --git a/scripts/base/utils/active-http.bro b/scripts/base/utils/active-http.bro index 3f475a378b..eb9a212221 100644 --- a/scripts/base/utils/active-http.bro +++ b/scripts/base/utils/active-http.bro @@ -90,7 +90,10 @@ function request(req: Request): ActiveHTTP::Response { # If there is no response line then nothing else will work either. if ( ! (result?$files && headersfile in result$files) ) + { Reporter::error(fmt("There was a failure when requesting \"%s\" with ActiveHTTP.", req$url)); + return resp; + } local headers = result$files[headersfile]; for ( i in headers ) diff --git a/scripts/base/utils/dir.bro b/scripts/base/utils/dir.bro index b154fe000e..3329dc6306 100644 --- a/scripts/base/utils/dir.bro +++ b/scripts/base/utils/dir.bro @@ -5,6 +5,10 @@ module Dir; export { + ## The default interval this module checks for files in directories when + ## using the :bro:see:`Dir::monitor` function. 
+ const polling_interval = 30sec &redef; + ## Register a directory to monitor with a callback that is called ## every time a previously unseen file is seen. If a file is deleted ## and seen to be gone, the file is available for being seen again in @@ -14,14 +18,15 @@ export { ## ## callback: Callback that gets executed with each file name ## that is found. Filenames are provided with the full path. - global monitor: function(dir: string, callback: function(fname: string)); - - ## The interval this module checks for files in directories when using - ## the :bro:see:`Dir::monitor` function. - const polling_interval = 30sec &redef; + ## + ## poll_interval: An interval at which to check for new files. + global monitor: function(dir: string, callback: function(fname: string), + poll_interval: interval &default=polling_interval); } -event Dir::monitor_ev(dir: string, last_files: set[string], callback: function(fname: string)) +event Dir::monitor_ev(dir: string, last_files: set[string], + callback: function(fname: string), + poll_interval: interval) { when ( local result = Exec::run([$cmd=fmt("ls -i \"%s/\"", str_shell_escape(dir))]) ) { @@ -32,7 +37,11 @@ event Dir::monitor_ev(dir: string, last_files: set[string], callback: function(f } local current_files: set[string] = set(); - local files = result$stdout; + local files: vector of string = vector(); + + if ( result?$stdout ) + files = result$stdout; + for ( i in files ) { local parts = split1(files[i], / /); @@ -40,13 +49,18 @@ event Dir::monitor_ev(dir: string, last_files: set[string], callback: function(f callback(build_path_compressed(dir, parts[2])); add current_files[parts[1]]; } - schedule polling_interval { Dir::monitor_ev(dir, current_files, callback) }; + + schedule poll_interval + { + Dir::monitor_ev(dir, current_files, callback, poll_interval) + }; } } -function monitor(dir: string, callback: function(fname: string)) +function monitor(dir: string, callback: function(fname: string), + poll_interval: interval 
&default=polling_interval) { - event Dir::monitor_ev(dir, set(), callback); + event Dir::monitor_ev(dir, set(), callback, poll_interval); } diff --git a/scripts/base/utils/exec.bro b/scripts/base/utils/exec.bro index f896a68064..4ffae29303 100644 --- a/scripts/base/utils/exec.bro +++ b/scripts/base/utils/exec.bro @@ -14,6 +14,8 @@ export { ## If additional files are required to be read in as part of the output ## of the command they can be defined here. read_files: set[string] &optional; + # The unique id for tracking executors. + uid: string &default=unique_id(""); }; type Result: record { @@ -44,14 +46,11 @@ export { const tmp_dir = "/tmp" &redef; } -redef record Command += { - # The unique id for tracking executors. - uid: string &optional; -}; +# Indexed by command uid. +global results: table[string] of Result; +global pending_commands: set[string]; +global pending_files: table[string] of set[string]; -global results: table[string] of Result = table(); -global finished_commands: set[string]; -global currently_tracked_files: set[string] = set(); type OneLine: record { s: string; is_stderr: bool; @@ -96,39 +95,63 @@ event Exec::file_line(description: Input::EventDescription, tpe: Input::Event, s result$files[track_file][|result$files[track_file]|] = s; } +event Input::end_of_data(name: string, source:string) + { + local parts = split1(name, /_/); + name = parts[1]; + + if ( name !in pending_commands || |parts| < 2 ) + return; + + local track_file = parts[2]; + + Input::remove(name); + + if ( name !in pending_files ) + delete pending_commands[name]; + else + { + delete pending_files[name][track_file]; + if ( |pending_files[name]| == 0 ) + delete pending_commands[name]; + system(fmt("rm \"%s\"", str_shell_escape(track_file))); + } + } + event InputRaw::process_finished(name: string, source:string, exit_code:count, signal_exit:bool) { + if ( name !in pending_commands ) + return; + + Input::remove(name); results[name]$exit_code = exit_code; results[name]$signal_exit 
= signal_exit; - Input::remove(name); - # Indicate to the "when" async watcher that this command is done. - add finished_commands[name]; - } - -event Exec::start_watching_file(uid: string, read_file: string) - { - Input::add_event([$source=fmt("%s", read_file), - $name=fmt("%s_%s", uid, read_file), - $reader=Input::READER_RAW, - $mode=Input::STREAM, - $want_record=F, - $fields=FileLine, - $ev=Exec::file_line]); + if ( name !in pending_files || |pending_files[name]| == 0 ) + # No extra files to read, command is done. + delete pending_commands[name]; + else + for ( read_file in pending_files[name] ) + Input::add_event([$source=fmt("%s", read_file), + $name=fmt("%s_%s", name, read_file), + $reader=Input::READER_RAW, + $want_record=F, + $fields=FileLine, + $ev=Exec::file_line]); } function run(cmd: Command): Result { - cmd$uid = unique_id(""); + add pending_commands[cmd$uid]; results[cmd$uid] = []; if ( cmd?$read_files ) { for ( read_file in cmd$read_files ) { - add currently_tracked_files[read_file]; - system(fmt("touch \"%s\" 2>/dev/null", str_shell_escape(read_file))); - schedule 1msec { Exec::start_watching_file(cmd$uid, read_file) }; + if ( cmd$uid !in pending_files ) + pending_files[cmd$uid] = set(); + add pending_files[cmd$uid][read_file]; } } @@ -144,9 +167,8 @@ function run(cmd: Command): Result $want_record=F, $config=config_strings]); - return when ( cmd$uid in finished_commands ) + return when ( cmd$uid !in pending_commands ) { - delete finished_commands[cmd$uid]; local result = results[cmd$uid]; delete results[cmd$uid]; return result; @@ -155,9 +177,8 @@ function run(cmd: Command): Result event bro_done() { - # We are punting here and just deleting any files that haven't been processed yet. - for ( fname in currently_tracked_files ) - { - system(fmt("rm \"%s\"", str_shell_escape(fname))); - } + # We are punting here and just deleting any unprocessed files. 
+ for ( uid in pending_files ) + for ( fname in pending_files[uid] ) + system(fmt("rm \"%s\"", str_shell_escape(fname))); } diff --git a/testing/btest/Baseline/scripts.base.utils.active-http/bro..stdout b/testing/btest/Baseline/scripts.base.utils.active-http/bro..stdout new file mode 100644 index 0000000000..0284eb19b3 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.utils.active-http/bro..stdout @@ -0,0 +1,5 @@ +[code=200, msg=OK^M, body=It works!, headers={ +[Server] = 1.0, +[Content-type] = text/plain, +[Date] = July 22, 2013 +}] diff --git a/testing/btest/Baseline/scripts.base.utils.dir/bro..stdout b/testing/btest/Baseline/scripts.base.utils.dir/bro..stdout new file mode 100644 index 0000000000..c3103b7f64 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.utils.dir/bro..stdout @@ -0,0 +1,10 @@ +new_file1, ../testdir/bye +new_file1, ../testdir/hi +new_file1, ../testdir/howsitgoing +new_file2, ../testdir/bye +new_file2, ../testdir/hi +new_file2, ../testdir/howsitgoing +new_file1, ../testdir/bye +new_file1, ../testdir/newone +new_file2, ../testdir/bye +new_file2, ../testdir/newone diff --git a/testing/btest/Baseline/scripts.base.utils.exec/bro..stdout b/testing/btest/Baseline/scripts.base.utils.exec/bro..stdout new file mode 100644 index 0000000000..5352d15d18 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.utils.exec/bro..stdout @@ -0,0 +1,7 @@ +test1, [exit_code=0, signal_exit=F, stdout=[done, exit, stop], stderr=, files={ +[out1] = [insert text here, and here], +[out2] = [insert more text here, and there] +}] +test2, [exit_code=1, signal_exit=F, stdout=[here's something on stdout, some more stdout, last stdout], stderr=[and some stderr, more stderr, last stderr], files=] +test3, [exit_code=9, signal_exit=F, stdout=[FML], stderr=, files=] +test4, [exit_code=0, signal_exit=F, stdout=[hibye], stderr=, files=] diff --git a/testing/btest/scripts/base/utils/active-http.test b/testing/btest/scripts/base/utils/active-http.test new file mode 100644 
index 0000000000..9ac762b9b7 --- /dev/null +++ b/testing/btest/scripts/base/utils/active-http.test @@ -0,0 +1,25 @@ +# @TEST-EXEC: btest-bg-run httpd python $SCRIPTS/httpd.py --max 1 +# @TEST-EXEC: sleep 3 +# @TEST-EXEC: btest-bg-run bro bro -b %INPUT +# @TEST-EXEC: btest-bg-wait 15 +# @TEST-EXEC: btest-diff bro/.stdout + +@load base/utils/active-http + +redef exit_only_after_terminate = T; + +event bro_init() + { + local req = ActiveHTTP::Request($url="localhost:32123"); + + when ( local resp = ActiveHTTP::request(req) ) + { + print resp; + terminate(); + } + timeout 1min + { + print "HTTP request timeout"; + terminate(); + } + } diff --git a/testing/btest/scripts/base/utils/dir.test b/testing/btest/scripts/base/utils/dir.test new file mode 100644 index 0000000000..44fee3860f --- /dev/null +++ b/testing/btest/scripts/base/utils/dir.test @@ -0,0 +1,58 @@ +# @TEST-EXEC: btest-bg-run bro bro -b ../dirtest.bro +# @TEST-EXEC: btest-bg-wait 10 +# @TEST-EXEC: TEST_DIFF_CANONIFIER=$SCRIPTS/diff-sort btest-diff bro/.stdout + +@TEST-START-FILE dirtest.bro + +@load base/utils/dir + +redef exit_only_after_terminate = T; + +global c: count = 0; + +function check_terminate_condition() + { + c += 1; + + if ( c == 10 ) + terminate(); + } + +function new_file1(fname: string) + { + print "new_file1", fname; + check_terminate_condition(); + } + +function new_file2(fname: string) + { + print "new_file2", fname; + check_terminate_condition(); + } + +event change_things() + { + system("touch ../testdir/newone"); + system("rm ../testdir/bye && touch ../testdir/bye"); + } + +event bro_init() + { + Dir::monitor("../testdir", new_file1, .5sec); + Dir::monitor("../testdir", new_file2, 1sec); + schedule 1sec { change_things() }; + } + +@TEST-END-FILE + +@TEST-START-FILE testdir/hi +123 +@TEST-END-FILE + +@TEST-START-FILE testdir/howsitgoing +abc +@TEST-END-FILE + +@TEST-START-FILE testdir/bye +!@# +@TEST-END-FILE diff --git a/testing/btest/scripts/base/utils/exec.test 
b/testing/btest/scripts/base/utils/exec.test new file mode 100644 index 0000000000..8876f0f49b --- /dev/null +++ b/testing/btest/scripts/base/utils/exec.test @@ -0,0 +1,74 @@ +# @TEST-EXEC: btest-bg-run bro bro -b ../exectest.bro +# @TEST-EXEC: btest-bg-wait 10 +# @TEST-EXEC: TEST_DIFF_CANONIFIER=$SCRIPTS/diff-sort btest-diff bro/.stdout + +@TEST-START-FILE exectest.bro + +@load base/utils/exec + +redef exit_only_after_terminate = T; + +global c: count = 0; + +function check_exit_condition() + { + c += 1; + + if ( c == 4 ) + terminate(); + } + +function test_cmd(label: string, cmd: Exec::Command) + { + when ( local result = Exec::run(cmd) ) + { + print label, result; + check_exit_condition(); + } + } + +event bro_init() + { + test_cmd("test1", [$cmd="bash ../somescript.sh", + $read_files=set("out1", "out2")]); + test_cmd("test2", [$cmd="bash ../nofiles.sh"]); + test_cmd("test3", [$cmd="bash ../suicide.sh"]); + test_cmd("test4", [$cmd="bash ../stdin.sh", $stdin="hibye"]); + } + +@TEST-END-FILE + +@TEST-START-FILE somescript.sh +#! /usr/bin/env bash +echo "insert text here" > out1 +echo "and here" >> out1 +echo "insert more text here" > out2 +echo "and there" >> out2 +echo "done" +echo "exit" +echo "stop" +@TEST-END-FILE + +@TEST-START-FILE nofiles.sh +#! /usr/bin/env bash +echo "here's something on stdout" +echo "some more stdout" +echo "last stdout" +echo "and some stderr" 1>&2 +echo "more stderr" 1>&2 +echo "last stderr" 1>&2 +exit 1 +@TEST-END-FILE + +@TEST-START-FILE suicide.sh +#! /usr/bin/env bash +echo "FML" +kill -9 $$ +echo "nope" +@TEST-END-FILE + +@TEST-START-FILE stdin.sh +#! /usr/bin/env bash +read -r line +echo "$line" +@TEST-END-FILE diff --git a/testing/scripts/httpd.py b/testing/scripts/httpd.py new file mode 100755 index 0000000000..0732614bc2 --- /dev/null +++ b/testing/scripts/httpd.py @@ -0,0 +1,40 @@ +#! 
/usr/bin/env python + +import BaseHTTPServer + +class MyRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): + + def do_GET(self): + self.send_response(200) + self.send_header("Content-type", "text/plain") + self.end_headers() + self.wfile.write("It works!") + + def version_string(self): + return "1.0" + + def date_time_string(self): + return "July 22, 2013" + + +if __name__ == "__main__": + from optparse import OptionParser + p = OptionParser() + p.add_option("-a", "--addr", type="string", default="localhost", + help=("listen on given address (numeric IP or host name), " + "an empty string (the default) means INADDR_ANY")) + p.add_option("-p", "--port", type="int", default=32123, + help="listen on given TCP port number") + p.add_option("-m", "--max", type="int", default=-1, + help="max number of requests to respond to, -1 means no max") + options, args = p.parse_args() + + httpd = BaseHTTPServer.HTTPServer((options.addr, options.port), + MyRequestHandler) + if options.max == -1: + httpd.serve_forever() + else: + served_count = 0 + while served_count != options.max: + httpd.handle_request() + served_count += 1 From 474107fe40c22dec977d4e9ee3dad0edcbc02344 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 23 Jul 2013 17:16:57 -0700 Subject: [PATCH 53/73] Broifying the code. Also extending API documentation a bit more and fixing a memory leak. 
--- src/Func.cc | 4 +- src/H3.h | 4 +- src/OpaqueVal.cc | 159 ++-- src/OpaqueVal.h | 67 +- src/Type.cc | 1 + src/probabilistic/BitVector.cc | 777 ++++++++++-------- src/probabilistic/BitVector.h | 575 +++++++------ src/probabilistic/BloomFilter.cc | 229 +++--- src/probabilistic/BloomFilter.h | 229 ++++-- src/probabilistic/CounterVector.cc | 244 +++--- src/probabilistic/CounterVector.h | 208 ++--- src/probabilistic/Hasher.cc | 63 +- src/probabilistic/Hasher.h | 262 +++--- src/probabilistic/bloom-filter.bif | 122 +-- src/util.cc | 20 +- src/util.h | 8 +- .../btest/Baseline/bifs.bloomfilter/output | 6 + testing/btest/bifs/bloomfilter.bro | 2 +- 18 files changed, 1651 insertions(+), 1329 deletions(-) diff --git a/src/Func.cc b/src/Func.cc index a0d2299933..483699668f 100644 --- a/src/Func.cc +++ b/src/Func.cc @@ -560,7 +560,7 @@ void builtin_error(const char* msg, BroObj* arg) #include "reporter.bif.func_def" #include "strings.bif.func_def" -// TODO: Add a nicer mechanism to pull subdirectory bifs automatically. +// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. #include "probabilistic/bloom-filter.bif.h" void init_builtin_funcs() @@ -577,7 +577,7 @@ void init_builtin_funcs() #include "reporter.bif.func_init" #include "strings.bif.func_init" -// TODO: Add a nicer mechanism to pull subdirectory bifs automatically. +// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. 
#include "probabilistic/bloom-filter.bif.init.cc" did_builtin_init = true; diff --git a/src/H3.h b/src/H3.h index 123dd6f374..8ea5848816 100644 --- a/src/H3.h +++ b/src/H3.h @@ -100,8 +100,8 @@ public: // loop optmized with Duff's Device register unsigned n = (size + 7) / 8; switch ( size % 8 ) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; + case 0: do { result ^= byte_lookup[offset++][*p++]; + case 7: result ^= byte_lookup[offset++][*p++]; case 6: result ^= byte_lookup[offset++][*p++]; case 5: result ^= byte_lookup[offset++][*p++]; case 4: result ^= byte_lookup[offset++][*p++]; diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 04032b2cfc..efdd890f70 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,5 +1,6 @@ -#include "OpaqueVal.h" +// See the file "COPYING" in the main distribution directory for copyright. +#include "OpaqueVal.h" #include "NetVar.h" #include "Reporter.h" #include "Serializer.h" @@ -518,87 +519,89 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) } BloomFilterVal::BloomFilterVal() - : OpaqueVal(bloomfilter_type), - type_(NULL), - hash_(NULL), - bloom_filter_(NULL) + : OpaqueVal(bloomfilter_type) { + type = 0; + hash = 0; + bloom_filter = 0; } BloomFilterVal::BloomFilterVal(OpaqueType* t) - : OpaqueVal(t), - type_(NULL), - hash_(NULL), - bloom_filter_(NULL) + : OpaqueVal(t) { + type = 0; + hash = 0; + bloom_filter = 0; } BloomFilterVal::BloomFilterVal(probabilistic::BloomFilter* bf) - : OpaqueVal(bloomfilter_type), - type_(NULL), - hash_(NULL), - bloom_filter_(bf) + : OpaqueVal(bloomfilter_type) { + type = 0; + hash = 0; + bloom_filter = bf; } -bool BloomFilterVal::Typify(BroType* type) - { - if ( type_ ) - return false; - type_ = type; - type_->Ref(); - TypeList* tl = new TypeList(type_); - tl->Append(type_); - hash_ = new CompositeHash(tl); - Unref(tl); - return true; - } +bool BloomFilterVal::Typify(BroType* arg_type) + { + if ( type ) + return false; + + type = 
arg_type; + type->Ref(); + + TypeList* tl = new TypeList(type); + tl->Append(type); + hash = new CompositeHash(tl); + Unref(tl); + + return true; + } BroType* BloomFilterVal::Type() const - { - return type_; - } + { + return type; + } void BloomFilterVal::Add(const Val* val) - { - HashKey* key = hash_->ComputeHash(val, 1); - bloom_filter_->Add(key->Hash()); - } + { + HashKey* key = hash->ComputeHash(val, 1); + bloom_filter->Add(key->Hash()); + delete key; + } size_t BloomFilterVal::Count(const Val* val) const - { - HashKey* key = hash_->ComputeHash(val, 1); - return bloom_filter_->Count(key->Hash()); - } + { + HashKey* key = hash->ComputeHash(val, 1); + size_t cnt = bloom_filter->Count(key->Hash()); + delete key; + return cnt; + } BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) - { - if ( x->Type() != y->Type() ) - { - reporter->InternalError("cannot merge Bloom filters with different types"); - return NULL; - } + { + if ( ! same_type(x->Type(), y->Type()) ) + reporter->InternalError("cannot merge Bloom filters with different types"); - BloomFilterVal* result; - if ( (result = DoMerge(x, y)) ) - return result; - else if ( (result = DoMerge(x, y)) ) - return result; + BloomFilterVal* result; - reporter->InternalError("failed to merge Bloom filters"); - return NULL; - } + if ( (result = DoMerge(x, y)) ) + return result; + + else if ( (result = DoMerge(x, y)) ) + return result; + + reporter->InternalError("failed to merge Bloom filters"); + return 0; + } BloomFilterVal::~BloomFilterVal() - { - if ( type_ ) - Unref(type_); - if ( hash_ ) - delete hash_; - if ( bloom_filter_ ) - delete bloom_filter_; - } + { + Unref(type); + delete hash; + delete bloom_filter; + } IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); @@ -606,14 +609,16 @@ bool BloomFilterVal::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); - bool is_typed = type_ != NULL; - if ( ! 
SERIALIZE(is_typed) ) - return false; - if ( is_typed && ! type_->Serialize(info) ) - return false; + bool is_typed = (type != 0); - return bloom_filter_->Serialize(info); - } + if ( ! SERIALIZE(is_typed) ) + return false; + + if ( is_typed && ! type->Serialize(info) ) + return false; + + return bloom_filter->Serialize(info); + } bool BloomFilterVal::DoUnserialize(UnserialInfo* info) { @@ -621,15 +626,17 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) bool is_typed; if ( ! UNSERIALIZE(&is_typed) ) - return false; - if ( is_typed ) - { - BroType* type = BroType::Unserialize(info); - if ( ! Typify(type) ) - return false; - Unref(type); - } + return false; - bloom_filter_ = probabilistic::BloomFilter::Unserialize(info); - return bloom_filter_ != NULL; - } + if ( is_typed ) + { + BroType* type = BroType::Unserialize(info); + if ( ! Typify(type) ) + return false; + + Unref(type); + } + + bloom_filter = probabilistic::BloomFilter::Unserialize(info); + return bloom_filter != 0; + } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 5ccf73e11f..ea704cb70a 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -116,21 +116,19 @@ private: }; class BloomFilterVal : public OpaqueVal { - BloomFilterVal(const BloomFilterVal&); - BloomFilterVal& operator=(const BloomFilterVal&); public: - static BloomFilterVal* Merge(const BloomFilterVal* x, - const BloomFilterVal* y); - explicit BloomFilterVal(probabilistic::BloomFilter* bf); - ~BloomFilterVal(); + virtual ~BloomFilterVal(); - bool Typify(BroType* type); BroType* Type() const; + bool Typify(BroType* type); void Add(const Val* val); size_t Count(const Val* val) const; + static BloomFilterVal* Merge(const BloomFilterVal* x, + const BloomFilterVal* y); + protected: friend class Val; BloomFilterVal(); @@ -139,32 +137,35 @@ protected: DECLARE_SERIAL(BloomFilterVal); private: - template - static BloomFilterVal* DoMerge(const BloomFilterVal* x, - const BloomFilterVal* y) - { - if ( typeid(*x->bloom_filter_) != 
typeid(*y->bloom_filter_) ) - { - reporter->InternalError("cannot merge different Bloom filter types"); - return NULL; - } - if ( typeid(T) != typeid(*x->bloom_filter_) ) - return NULL; - const T* a = static_cast(x->bloom_filter_); - const T* b = static_cast(y->bloom_filter_); - BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); - assert(merged); - if ( ! merged->Typify(x->Type()) ) - { - reporter->InternalError("failed to set type on merged Bloom filter"); - return NULL; - } - return merged; - } + // Disable. + BloomFilterVal(const BloomFilterVal&); + BloomFilterVal& operator=(const BloomFilterVal&); - BroType* type_; - CompositeHash* hash_; - probabilistic::BloomFilter* bloom_filter_; -}; + template + static BloomFilterVal* DoMerge(const BloomFilterVal* x, + const BloomFilterVal* y) + { + if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) + reporter->InternalError("cannot merge different Bloom filter types"); + + if ( typeid(T) != typeid(*x->bloom_filter) ) + return 0; + + const T* a = static_cast(x->bloom_filter); + const T* b = static_cast(y->bloom_filter); + + BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); + assert(merged); + + if ( ! merged->Typify(x->Type()) ) + reporter->InternalError("failed to set type on merged Bloom filter"); + + return merged; + } + + BroType* type; + CompositeHash* hash; + probabilistic::BloomFilter* bloom_filter; + }; #endif diff --git a/src/Type.cc b/src/Type.cc index 57d9d0e6e5..563bc5afbd 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -1321,6 +1321,7 @@ bool OpaqueType::DoUnserialize(UnserialInfo* info) const char* n; if ( ! UNSERIALIZE_STR(&n, 0) ) return false; + name = n; delete [] n; diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 67714fe7d0..98f008b24b 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ #include "BitVector.h" #include @@ -8,505 +10,558 @@ using namespace probabilistic; BitVector::size_type BitVector::npos = static_cast(-1); BitVector::block_type BitVector::bits_per_block = - std::numeric_limits::digits; + std::numeric_limits::digits; namespace { uint8_t count_table[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, - 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, - 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, - 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, - 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, - 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, - 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, - 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, - 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, - 6, 7, 6, 7, 7, 8 + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, + 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, + 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, + 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, + 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, + 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, + 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, + 6, 7, 6, 7, 7, 8 }; } // namespace BitVector::Reference::Reference(block_type& block, block_type i) - : block_(block), - mask_(block_type(1) << i) 
- { - assert(i < bits_per_block); - } + : block(block), mask((block_type(1) << i)) + { + assert(i < bits_per_block); + } BitVector::Reference& BitVector::Reference::Flip() - { - block_ ^= mask_; - return *this; - } + { + block ^= mask; + return *this; + } BitVector::Reference::operator bool() const - { - return (block_ & mask_) != 0; - } + { + return (block & mask) != 0; + } bool BitVector::Reference::operator~() const - { - return (block_ & mask_) == 0; - } + { + return (block & mask) == 0; + } BitVector::Reference& BitVector::Reference::operator=(bool x) - { - x ? block_ |= mask_ : block_ &= ~mask_; - return *this; - } + { + if ( x ) + block |= mask; + else + block &= ~mask; -BitVector::Reference& BitVector::Reference::operator=(Reference const& other) - { - other ? block_ |= mask_ : block_ &= ~mask_; - return *this; - } + return *this; + } + +BitVector::Reference& BitVector::Reference::operator=(const Reference& other) + { + if ( other ) + block |= mask; + else + block &= ~mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator|=(bool x) - { - if (x) - block_ |= mask_; - return *this; - } + { + if ( x ) + block |= mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator&=(bool x) - { - if (! x) - block_ &= ~mask_; - return *this; - } + { + if ( ! x ) + block &= ~mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator^=(bool x) - { - if (x) - block_ ^= mask_; - return *this; - } + { + if ( x ) + block ^= mask; + + return *this; + } BitVector::Reference& BitVector::Reference::operator-=(bool x) - { - if (x) - block_ &= ~mask_; - return *this; - } + { + if ( x ) + block &= ~mask; + return *this; + } -BitVector::BitVector() : num_bits_(0) { } +BitVector::BitVector() + { + num_bits = 0; + } BitVector::BitVector(size_type size, bool value) - : bits_(bits_to_blocks(size), value ? ~block_type(0) : 0), - num_bits_(size) -{ } + : bits(bits_to_blocks(size), value ? 
~block_type(0) : 0) + { + num_bits = size; + } BitVector::BitVector(BitVector const& other) - : bits_(other.bits_), - num_bits_(other.num_bits_) -{ } + : bits(other.bits) + { + num_bits = other.num_bits; + } BitVector BitVector::operator~() const - { - BitVector b(*this); - b.Flip(); - return b; - } + { + BitVector b(*this); + b.Flip(); + return b; + } BitVector& BitVector::operator=(BitVector const& other) - { - bits_ = other.bits_; - return *this; - } + { + bits = other.bits; + return *this; + } BitVector BitVector::operator<<(size_type n) const - { - BitVector b(*this); - return b <<= n; - } + { + BitVector b(*this); + return b <<= n; + } BitVector BitVector::operator>>(size_type n) const - { - BitVector b(*this); - return b >>= n; - } + { + BitVector b(*this); + return b >>= n; + } BitVector& BitVector::operator<<=(size_type n) - { - if (n >= num_bits_) - return Reset(); + { + if ( n >= num_bits ) + return Reset(); - if (n > 0) - { - size_type last = Blocks() - 1; - size_type div = n / bits_per_block; - block_type r = bit_index(n); - block_type* b = &bits_[0]; - assert(Blocks() >= 1); - assert(div <= last); + if ( n > 0 ) + { + size_type last = Blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits[0]; - if (r != 0) - { - for (size_type i = last - div; i > 0; --i) - b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); - b[div] = b[0] << r; - } - else - { - for (size_type i = last-div; i > 0; --i) - b[i + div] = b[i]; - b[div] = b[0]; - } + assert(Blocks() >= 1); + assert(div <= last); - std::fill_n(b, div, block_type(0)); - zero_unused_bits(); - } + if ( r != 0 ) + { + for ( size_type i = last - div; i > 0; --i ) + b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); - return *this; - } + b[div] = b[0] << r; + } + + else + { + for (size_type i = last-div; i > 0; --i) + b[i + div] = b[i]; + + b[div] = b[0]; + } + + std::fill_n(b, div, block_type(0)); + zero_unused_bits(); + } + + return 
*this; + } BitVector& BitVector::operator>>=(size_type n) - { - if (n >= num_bits_) - return Reset(); + { + if ( n >= num_bits ) + return Reset(); - if (n > 0) - { - size_type last = Blocks() - 1; - size_type div = n / bits_per_block; - block_type r = bit_index(n); - block_type* b = &bits_[0]; - assert(Blocks() >= 1); - assert(div <= last); + if ( n > 0 ) + { + size_type last = Blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits[0]; - if (r != 0) - { - for (size_type i = last - div; i > 0; --i) - b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); - b[last - div] = b[last] >> r; - } - else - { - for (size_type i = div; i <= last; ++i) - b[i-div] = b[i]; - } + assert(Blocks() >= 1); + assert(div <= last); - std::fill_n(b + (Blocks() - div), div, block_type(0)); - } - return *this; - } + if ( r != 0 ) + { + for (size_type i = last - div; i > 0; --i) + b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); + + b[last - div] = b[last] >> r; + } + + else + { + for (size_type i = div; i <= last; ++i) + b[i-div] = b[i]; + } + + std::fill_n(b + (Blocks() - div), div, block_type(0)); + } + + return *this; + } BitVector& BitVector::operator&=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] &= other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] &= other.bits[i]; + + return *this; + } BitVector& BitVector::operator|=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] |= other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] |= other.bits[i]; + + return *this; + } BitVector& BitVector::operator^=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] ^= 
other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] ^= other.bits[i]; + + return *this; + } BitVector& BitVector::operator-=(BitVector const& other) - { - assert(Size() >= other.Size()); - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] &= ~other.bits_[i]; - return *this; - } + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] &= ~other.bits[i]; + + return *this; + } namespace probabilistic { BitVector operator&(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b &= y; - } + { + BitVector b(x); + return b &= y; + } BitVector operator|(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b |= y; - } + { + BitVector b(x); + return b |= y; + } BitVector operator^(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b ^= y; - } + { + BitVector b(x); + return b ^= y; + } BitVector operator-(BitVector const& x, BitVector const& y) - { - BitVector b(x); - return b -= y; - } + { + BitVector b(x); + return b -= y; + } bool operator==(BitVector const& x, BitVector const& y) - { - return x.num_bits_ == y.num_bits_ && x.bits_ == y.bits_; - } + { + return x.num_bits == y.num_bits && x.bits == y.bits; + } bool operator!=(BitVector const& x, BitVector const& y) - { - return ! (x == y); - } + { + return ! 
(x == y); + } bool operator<(BitVector const& x, BitVector const& y) - { - assert(x.Size() == y.Size()); - for (BitVector::size_type r = x.Blocks(); r > 0; --r) - { - BitVector::size_type i = r - 1; - if (x.bits_[i] < y.bits_[i]) - return true; - else if (x.bits_[i] > y.bits_[i]) - return false; - } - return false; - } + { + assert(x.Size() == y.Size()); + + for ( BitVector::size_type r = x.Blocks(); r > 0; --r ) + { + BitVector::size_type i = r - 1; + + if ( x.bits[i] < y.bits[i] ) + return true; + + else if ( x.bits[i] > y.bits[i] ) + return false; + + } + + return false; + } } void BitVector::Resize(size_type n, bool value) - { - size_type old = Blocks(); - size_type required = bits_to_blocks(n); - block_type block_value = value ? ~block_type(0) : block_type(0); + { + size_type old = Blocks(); + size_type required = bits_to_blocks(n); + block_type block_value = value ? ~block_type(0) : block_type(0); - if (required != old) - bits_.resize(required, block_value); + if ( required != old ) + bits.resize(required, block_value); - if (value && (n > num_bits_) && extra_bits()) - bits_[old - 1] |= (block_value << extra_bits()); + if ( value && (n > num_bits) && extra_bits() ) + bits[old - 1] |= (block_value << extra_bits()); - num_bits_ = n; - zero_unused_bits(); - } + num_bits = n; + zero_unused_bits(); + } void BitVector::Clear() - { - bits_.clear(); - num_bits_ = 0; - } + { + bits.clear(); + num_bits = 0; + } void BitVector::PushBack(bool bit) - { - size_type s = Size(); - Resize(s + 1); - Set(s, bit); - } + { + size_type s = Size(); + Resize(s + 1); + Set(s, bit); + } void BitVector::Append(block_type block) - { - size_type excess = extra_bits(); - if (excess) - { - assert(! Empty()); - bits_.push_back(block >> (bits_per_block - excess)); - bits_[Blocks() - 2] |= (block << excess); - } - else - { - bits_.push_back(block); - } - num_bits_ += bits_per_block; - } + { + size_type excess = extra_bits(); + + if ( excess ) + { + assert(! 
Empty()); + bits.push_back(block >> (bits_per_block - excess)); + bits[Blocks() - 2] |= (block << excess); + } + + else + { + bits.push_back(block); + } + + num_bits += bits_per_block; + } BitVector& BitVector::Set(size_type i, bool bit) - { - assert(i < num_bits_); - if (bit) - bits_[block_index(i)] |= bit_mask(i); - else - Reset(i); - return *this; - } + { + assert(i < num_bits); + + if ( bit ) + bits[block_index(i)] |= bit_mask(i); + else + Reset(i); + + return *this; + } BitVector& BitVector::Set() - { - std::fill(bits_.begin(), bits_.end(), ~block_type(0)); - zero_unused_bits(); - return *this; - } + { + std::fill(bits.begin(), bits.end(), ~block_type(0)); + zero_unused_bits(); + return *this; + } BitVector& BitVector::Reset(size_type i) - { - assert(i < num_bits_); - bits_[block_index(i)] &= ~bit_mask(i); - return *this; - } + { + assert(i < num_bits); + bits[block_index(i)] &= ~bit_mask(i); + return *this; + } BitVector& BitVector::Reset() - { - std::fill(bits_.begin(), bits_.end(), block_type(0)); - return *this; - } + { + std::fill(bits.begin(), bits.end(), block_type(0)); + return *this; + } BitVector& BitVector::Flip(size_type i) - { - assert(i < num_bits_); - bits_[block_index(i)] ^= bit_mask(i); - return *this; - } + { + assert(i < num_bits); + bits[block_index(i)] ^= bit_mask(i); + return *this; + } BitVector& BitVector::Flip() - { - for (size_type i = 0; i < Blocks(); ++i) - bits_[i] = ~bits_[i]; - zero_unused_bits(); - return *this; - } + { + for (size_type i = 0; i < Blocks(); ++i) + bits[i] = ~bits[i]; + + zero_unused_bits(); + return *this; + } bool BitVector::operator[](size_type i) const - { - assert(i < num_bits_); - return (bits_[block_index(i)] & bit_mask(i)) != 0; - } + { + assert(i < num_bits); + return (bits[block_index(i)] & bit_mask(i)) != 0; + } BitVector::Reference BitVector::operator[](size_type i) - { - assert(i < num_bits_); - return Reference(bits_[block_index(i)], bit_index(i)); - } + { + assert(i < num_bits); + return 
Reference(bits[block_index(i)], bit_index(i)); + } BitVector::size_type BitVector::Count() const - { - std::vector::const_iterator first = bits_.begin(); - size_t n = 0; - size_type length = Blocks(); - while (length) - { - block_type block = *first; - while (block) - { - // TODO: use __popcnt if available. - n += count_table[block & ((1u << 8) - 1)]; - block >>= 8; - } - ++first; - --length; - } - return n; - } + { + std::vector::const_iterator first = bits.begin(); + size_t n = 0; + size_type length = Blocks(); + + while ( length ) + { + block_type block = *first; + + while ( block ) + { + // TODO: use _popcnt if available. + n += count_table[block & ((1u << 8) - 1)]; + block >>= 8; + } + + ++first; + --length; + } + + return n; + } BitVector::size_type BitVector::Blocks() const - { - return bits_.size(); - } + { + return bits.size(); + } BitVector::size_type BitVector::Size() const - { - return num_bits_; - } + { + return num_bits; + } bool BitVector::Empty() const - { - return bits_.empty(); - } + { + return bits.empty(); + } BitVector::size_type BitVector::FindFirst() const - { - return find_from(0); - } + { + return find_from(0); + } BitVector::size_type BitVector::FindNext(size_type i) const - { - if (i >= (Size() - 1) || Size() == 0) - return npos; - ++i; - size_type bi = block_index(i); - block_type block = bits_[bi] & (~block_type(0) << bit_index(i)); - return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); - } + { + if ( i >= (Size() - 1) || Size() == 0 ) + return npos; + + ++i; + size_type bi = block_index(i); + block_type block = bits[bi] & (~block_type(0) << bit_index(i)); + return block ? 
bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); + } BitVector::size_type BitVector::lowest_bit(block_type block) - { - block_type x = block - (block & (block - 1)); - size_type log = 0; - while (x >>= 1) - ++log; - return log; - } + { + block_type x = block - (block & (block - 1)); + size_type log = 0; + + while (x >>= 1) + ++log; + + return log; + } BitVector::block_type BitVector::extra_bits() const - { - return bit_index(Size()); - } + { + return bit_index(Size()); + } void BitVector::zero_unused_bits() - { - if (extra_bits()) - bits_.back() &= ~(~block_type(0) << extra_bits()); - } + { + if ( extra_bits() ) + bits.back() &= ~(~block_type(0) << extra_bits()); + } BitVector::size_type BitVector::find_from(size_type i) const - { - while (i < Blocks() && bits_[i] == 0) - ++i; - if (i >= Blocks()) - return npos; - return i * bits_per_block + lowest_bit(bits_[i]); - } + { + while (i < Blocks() && bits[i] == 0) + ++i; + + if ( i >= Blocks() ) + return npos; + + return i * bits_per_block + lowest_bit(bits[i]); + } bool BitVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } + { + return SerialObj::Serialize(info); + } BitVector* BitVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_BITVECTOR)); - } + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_BITVECTOR)); + } IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR); bool BitVector::DoSerialize(SerialInfo* info) const - { - DO_SERIALIZE(SER_BITVECTOR, SerialObj); + { + DO_SERIALIZE(SER_BITVECTOR, SerialObj); - if ( ! SERIALIZE(static_cast(bits_.size())) ) - return false; + if ( ! SERIALIZE(static_cast(bits.size())) ) + return false; - for ( size_t i = 0; i < bits_.size(); ++i ) - if ( ! SERIALIZE(static_cast(bits_[i])) ) - return false; + for ( size_t i = 0; i < bits.size(); ++i ) + if ( ! 
SERIALIZE(static_cast(bits[i])) ) + return false; - return SERIALIZE(static_cast(num_bits_)); - } + return SERIALIZE(static_cast(num_bits)); + } bool BitVector::DoUnserialize(UnserialInfo* info) - { - DO_UNSERIALIZE(SerialObj); + { + DO_UNSERIALIZE(SerialObj); - uint64 size; - if ( ! UNSERIALIZE(&size) ) - return false; + uint64 size; + if ( ! UNSERIALIZE(&size) ) + return false; - bits_.resize(static_cast(size)); - uint64 block; - for ( size_t i = 0; i < bits_.size(); ++i ) - { - if ( ! UNSERIALIZE(&block) ) - return false; - bits_[i] = static_cast(block); - } + bits.resize(static_cast(size)); - uint64 num_bits; - if ( ! UNSERIALIZE(&num_bits) ) - return false; - num_bits_ = static_cast(num_bits); + for ( size_t i = 0; i < bits.size(); ++i ) + { + uint64 block; + if ( ! UNSERIALIZE(&block) ) + return false; - return true; - } + bits[i] = static_cast(block); + } + + uint64 nbits; + if ( ! UNSERIALIZE(&nbits) ) + return false; + + num_bits = static_cast(nbits); + + return true; + } diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index 8832c24cbe..9eefe1b633 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -1,8 +1,11 @@ -#ifndef BitVector_h -#define BitVector_h +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_BITVECTOR_H +#define PROBABILISTIC_BITVECTOR_H #include #include + #include "SerialObj.h" namespace probabilistic { @@ -12,322 +15,348 @@ namespace probabilistic { */ class BitVector : public SerialObj { public: - typedef size_t block_type; - typedef size_t size_type; - static size_type npos; - static block_type bits_per_block; + typedef size_t block_type; + typedef size_t size_type; + typedef bool const_reference; -public: - /** - * An lvalue proxy for single bits. 
- */ - class Reference { - friend class BitVector; - Reference(block_type& block, block_type i); + static size_type npos; + static block_type bits_per_block; - public: - Reference& Flip(); - operator bool() const; - bool operator~() const; - Reference& operator=(bool x); - Reference& operator=(Reference const& other); - Reference& operator|=(bool x); - Reference& operator&=(bool x); - Reference& operator^=(bool x); - Reference& operator-=(bool x); + /** + * An lvalue proxy for individual bits. + */ + class Reference { + public: + /** + * Inverts the bits' values. + */ + Reference& Flip(); - private: - void operator&(); - block_type& block_; - block_type const mask_; - }; + operator bool() const; + bool operator~() const; + Reference& operator=(bool x); + Reference& operator=(const Reference& other); + Reference& operator|=(bool x); + Reference& operator&=(bool x); + Reference& operator^=(bool x); + Reference& operator-=(bool x); - typedef bool const_reference; + private: + friend class BitVector; - /** - * Default-constructs an empty bit vector. - */ - BitVector(); + Reference(block_type& block, block_type i); + void operator&(); - /** - * Constructs a bit vector of a given size. - * @param size The number of bits. - * @param value The value for each bit. - */ - explicit BitVector(size_type size, bool value = false); + block_type& block; + const block_type mask; + }; - /** - * Constructs a bit vector from a sequence of blocks. - */ - template - BitVector(InputIterator first, InputIterator last) - { - bits_.insert(bits_.end(), first, last); - num_bits_ = bits_.size() * bits_per_block; - } + /** + * Default-constructs an empty bit vector. + */ + BitVector(); - /** - * Copy-constructs a bit vector. - * @param other The bit vector to copy. - */ - BitVector(const BitVector& other); + /** + * Constructs a bit vector of a given size. + * @param size The number of bits. + * @param value The value for each bit. 
+ */ + explicit BitVector(size_type size, bool value = false); - /** - * Assigns another bit vector to this instance. - * @param other The RHS of the assignment. - */ - BitVector& operator=(const BitVector& other); + /** + * Constructs a bit vector from a sequence of blocks. + * + * @param first Start of range + * @param last End of range. + * + */ + template + BitVector(InputIterator first, InputIterator last) + { + bits.insert(bits.end(), first, last); + num_bits = bits.size() * bits_per_block; + } - // - // Bitwise operations - // - BitVector operator~() const; - BitVector operator<<(size_type n) const; - BitVector operator>>(size_type n) const; - BitVector& operator<<=(size_type n); - BitVector& operator>>=(size_type n); - BitVector& operator&=(BitVector const& other); - BitVector& operator|=(BitVector const& other); - BitVector& operator^=(BitVector const& other); - BitVector& operator-=(BitVector const& other); - friend BitVector operator&(BitVector const& x, BitVector const& y); - friend BitVector operator|(BitVector const& x, BitVector const& y); - friend BitVector operator^(BitVector const& x, BitVector const& y); - friend BitVector operator-(BitVector const& x, BitVector const& y); + /** + * Copy-constructs a bit vector. + * @param other The bit vector to copy. + */ + BitVector(const BitVector& other); - // - // Relational operators - // - friend bool operator==(BitVector const& x, BitVector const& y); - friend bool operator!=(BitVector const& x, BitVector const& y); - friend bool operator<(BitVector const& x, BitVector const& y); + /** + * Assigns another bit vector to this instance. + * @param other The RHS of the assignment. + */ + BitVector& operator=(const BitVector& other); - // - // Basic operations - // - /** Appends the bits in a sequence of values. - * @tparam Iterator A forward iterator. - * @param first An iterator pointing to the first element of the sequence. 
- * @param last An iterator pointing to one past the last element of the - * sequence. - */ - template - void Append(ForwardIterator first, ForwardIterator last) - { - if (first == last) - return; + // + // Bitwise operations. + // + BitVector operator~() const; + BitVector operator<<(size_type n) const; + BitVector operator>>(size_type n) const; + BitVector& operator<<=(size_type n); + BitVector& operator>>=(size_type n); + BitVector& operator&=(BitVector const& other); + BitVector& operator|=(BitVector const& other); + BitVector& operator^=(BitVector const& other); + BitVector& operator-=(BitVector const& other); + friend BitVector operator&(BitVector const& x, BitVector const& y); + friend BitVector operator|(BitVector const& x, BitVector const& y); + friend BitVector operator^(BitVector const& x, BitVector const& y); + friend BitVector operator-(BitVector const& x, BitVector const& y); - block_type excess = extra_bits(); - typename std::iterator_traits::difference_type delta = - std::distance(first, last); + // + // Relational operators + // + friend bool operator==(BitVector const& x, BitVector const& y); + friend bool operator!=(BitVector const& x, BitVector const& y); + friend bool operator<(BitVector const& x, BitVector const& y); - bits_.reserve(Blocks() + delta); - if (excess == 0) - { - bits_.back() |= (*first << excess); - do - { - block_type b = *first++ >> (bits_per_block - excess); - bits_.push_back(b | (first == last ? 0 : *first << excess)); - } while (first != last); - } - else - { - bits_.insert(bits_.end(), first, last); - } - num_bits_ += bits_per_block * delta; - } + // + // Basic operations + // - /** - * Appends the bits in a given block. - * @param block The block containing bits to append. - */ - void Append(block_type block); + /** Appends the bits in a sequence of values. + * @tparam Iterator A forward iterator. + * @param first An iterator pointing to the first element of the sequence. 
+ * @param last An iterator pointing to one past the last element of the + * sequence. + */ + template + void Append(ForwardIterator first, ForwardIterator last) + { + if ( first == last ) + return; - /** Appends a single bit to the end of the bit vector. - * @param bit The value of the bit. - */ - void PushBack(bool bit); + block_type excess = extra_bits(); + typename std::iterator_traits::difference_type delta = + std::distance(first, last); - /** - * Clears all bits in the bitvector. - */ - void Clear(); + bits.reserve(Blocks() + delta); - /** - * Resizes the bit vector to a new number of bits. - * @param n The new number of bits of the bit vector. - * @param value The bit value of new values, if the vector expands. - */ - void Resize(size_type n, bool value = false); + if ( excess != 0 ) + { + bits.back() |= (*first << excess); - /** - * Sets a bit at a specific position to a given value. - * @param i The bit position. - * @param bit The value assigned to position *i*. - * @return A reference to the bit vector instance. - */ - BitVector& Set(size_type i, bool bit = true); + do { + block_type b = *first++ >> (bits_per_block - excess); + bits.push_back(b | (first == last ? 0 : *first << excess)); + } while (first != last); - /** - * Sets all bits to 1. - * @return A reference to the bit vector instance. - */ - BitVector& Set(); + } - /** - * Resets a bit at a specific position, i.e., sets it to 0. - * @param i The bit position. - * @return A reference to the bit vector instance. - */ - BitVector& Reset(size_type i); + else + bits.insert(bits.end(), first, last); - /** - * Sets all bits to 0. - * @return A reference to the bit vector instance. - */ - BitVector& Reset(); + num_bits += bits_per_block * delta; + } - /** - * Toggles/flips a bit at a specific position. - * @param i The bit position. - * @return A reference to the bit vector instance. 
- */ - BitVector& Flip(size_type i); + /** + * Appends the bits in a given block. 
+ * @param block The block containing bits to append. + */ + void Append(block_type block); - /** - * Computes the complement. - * @return A reference to the bit vector instance. - */ - BitVector& Flip(); + /** Appends a single bit to the end of the bit vector. + * @param bit The value of the bit. + */ + void PushBack(bool bit); - /** Retrieves a single bit. - * @param i The bit position. - * @return A mutable reference to the bit at position *i*. - */ - Reference operator[](size_type i); + /** + * Clears all bits in the bitvector. + */ + void Clear(); - /** - * Retrieves a single bit. - * @param i The bit position. - * @return A const-reference to the bit at position *i*. - */ - const_reference operator[](size_type i) const; + /** + * Resizes the bit vector to a new number of bits. + * @param n The new number of bits of the bit vector. + * @param value The bit value of new values, if the vector expands. + */ + void Resize(size_type n, bool value = false); - /** - * Counts the number of 1-bits in the bit vector. Also known as *population - * count* or *Hamming weight*. - * @return The number of bits set to 1. - */ - size_type Count() const; + /** + * Sets a bit at a specific position to a given value. + * @param i The bit position. + * @param bit The value assigned to position *i*. + * @return A reference to the bit vector instance. + */ + BitVector& Set(size_type i, bool bit = true); - /** - * Retrieves the number of blocks of the underlying storage. - * @param The number of blocks that represent `Size()` bits. - */ - size_type Blocks() const; + /** + * Sets all bits to 1. + * @return A reference to the bit vector instance. + */ + BitVector& Set(); - /** - * Retrieves the number of bits the bitvector consist of. - * @return The length of the bit vector in bits. - */ - size_type Size() const; + /** + * Resets a bit at a specific position, i.e., sets it to 0. + * @param i The bit position. + * @return A reference to the bit vector instance. 
+ */ + BitVector& Reset(size_type i); - /** - * Checks whether the bit vector is empty. - * @return `true` iff the bitvector has zero length. - */ - bool Empty() const; + /** + * Sets all bits to 0. + * @return A reference to the bit vector instance. + */ + BitVector& Reset(); - /** - * Finds the bit position of of the first 1-bit. - * @return The position of the first bit that equals to one or `npos` if no - * such bit exists. - */ - size_type FindFirst() const; + /** + * Toggles/flips a bit at a specific position. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& Flip(size_type i); - /** - * Finds the next 1-bit from a given starting position. - * - * @param i The index where to start looking. - * - * @return The position of the first bit that equals to 1 after position - * *i* or `npos` if no such bit exists. - */ - size_type FindNext(size_type i) const; + /** + * Computes the complement. + * @return A reference to the bit vector instance. + */ + BitVector& Flip(); - bool Serialize(SerialInfo* info) const; - static BitVector* Unserialize(UnserialInfo* info); + /** Retrieves a single bit. + * @param i The bit position. + * @return A mutable reference to the bit at position *i*. + */ + Reference operator[](size_type i); + + /** + * Retrieves a single bit. + * @param i The bit position. + * @return A const-reference to the bit at position *i*. + */ + const_reference operator[](size_type i) const; + + /** + * Counts the number of 1-bits in the bit vector. Also known as *population + * count* or *Hamming weight*. + * @return The number of bits set to 1. + */ + size_type Count() const; + + /** + * Retrieves the number of blocks of the underlying storage. + * @param The number of blocks that represent `Size()` bits. + */ + size_type Blocks() const; + + /** + * Retrieves the number of bits the bitvector consist of. + * @return The length of the bit vector in bits. 
+ */ + size_type Size() const; + + /** + * Checks whether the bit vector is empty. + * @return `true` iff the bitvector has zero length. + */ + bool Empty() const; + + /** + * Finds the bit position of the first 1-bit. + * @return The position of the first bit that equals to one or `npos` if no + * such bit exists. + */ + size_type FindFirst() const; + + /** + * Finds the next 1-bit from a given starting position. + * + * @param i The index where to start looking. + * + * @return The position of the first bit that equals to 1 after position + * *i* or `npos` if no such bit exists. + */ + size_type FindNext(size_type i) const; + + /** + * Serializes the bit vector. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserialize the bit vector. + * + * @param info The serialization information to use. + * + * @return The unserialized bit vector, or null if an error occurred. + */ + static BitVector* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(BitVector); + DECLARE_SERIAL(BitVector); private: - /** - * Computes the block index for a given bit position. - */ - static size_type block_index(size_type i) - { - return i / bits_per_block; - } + /** + * Computes the number of excess/unused bits in the bit vector. + */ + block_type extra_bits() const; - /** - * Computes the bit index within a given block for a given bit position. - */ - static block_type bit_index(size_type i) - { - return i % bits_per_block; - } + /** + * If the number of bits in the vector is not a multiple of + * bitvector::bits_per_block, then the last block exhibits unused bits which + * this function resets. + */ + void zero_unused_bits(); - /** - * Computes the bitmask block to extract a bit a given bit position. - */ - static block_type bit_mask(size_type i) - { - return block_type(1) << bit_index(i); - } + /** + * Looks for the first 1-bit starting at a given position. 
+ * @param i The block index to start looking. + * @return The block index of the first 1-bit starting from *i* or + * `bitvector::npos` if no 1-bit exists. + */ + size_type find_from(size_type i) const; - /** - * Computes the number of blocks needed to represent a given number of - * bits. - * @param bits the number of bits. - * @return The number of blocks to represent *bits* number of bits. - */ - static size_type bits_to_blocks(size_type bits) - { - return bits / bits_per_block - + static_cast(bits % bits_per_block != 0); - } + /** + * Computes the block index for a given bit position. + */ + static size_type block_index(size_type i) + { + return i / bits_per_block; + } - /** - * Computes the bit position first 1-bit in a given block. - * @param block The block to inspect. - * @return The bit position where *block* has its first bit set to 1. - */ - static size_type lowest_bit(block_type block); + /** + * Computes the bit index within a given block for a given bit position. + */ + static block_type bit_index(size_type i) + { + return i % bits_per_block; + } - /** - * Computes the number of excess/unused bits in the bit vector. - */ - block_type extra_bits() const; + /** + * Computes the bitmask block to extract a bit a given bit position. + */ + static block_type bit_mask(size_type i) + { + return block_type(1) << bit_index(i); + } - /** - * If the number of bits in the vector are not not a multiple of - * bitvector::bits_per_block, then the last block exhibits unused bits which - * this function resets. - */ - void zero_unused_bits(); + /** + * Computes the number of blocks needed to represent a given number of + * bits. + * @param bits the number of bits. + * @return The number of blocks to represent *bits* number of bits. + */ + static size_type bits_to_blocks(size_type bits) + { + return bits / bits_per_block + + static_cast(bits % bits_per_block != 0); + } - /** - * Looks for the first 1-bit starting at a given position. 
- * @param i The block index to start looking. - * @return The block index of the first 1-bit starting from *i* or - * `bitvector::npos` if no 1-bit exists. - */ - size_type find_from(size_type i) const; + /** + * Computes the bit position first 1-bit in a given block. + * @param block The block to inspect. + * @return The bit position where *block* has its first bit set to 1. + */ + static size_type lowest_bit(block_type block); - std::vector bits_; - size_type num_bits_; + std::vector bits; + size_type num_bits; }; } diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 1b86ea1441..5613dcce05 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. + #include "BloomFilter.h" #include @@ -8,181 +10,184 @@ using namespace probabilistic; BloomFilter::BloomFilter() - : hasher_(NULL) - { - } + { + hasher = 0; + } -BloomFilter::BloomFilter(const Hasher* hasher) - : hasher_(hasher) - { - } +BloomFilter::BloomFilter(const Hasher* arg_hasher) + { + hasher = arg_hasher; + } BloomFilter::~BloomFilter() - { - if ( hasher_ ) - delete hasher_; - } + { + delete hasher; + } bool BloomFilter::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } + { + return SerialObj::Serialize(info); + } BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_BLOOMFILTER)); - } + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_BLOOMFILTER)); + } bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hasher_->K())) ) - return false; - return SERIALIZE_STR(hasher_->Name().c_str(), hasher_->Name().size()); - } + + if ( ! 
SERIALIZE(static_cast(hasher->K())) ) + return false; + + return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size()); + } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); + uint16 k; if ( ! UNSERIALIZE(&k) ) - return false; - const char* name; - if ( ! UNSERIALIZE_STR(&name, 0) ) - return false; - hasher_ = Hasher::Create(k, name); + return false; + + const char* name; + if ( ! UNSERIALIZE_STR(&name, 0) ) + return false; + + hasher = Hasher::Create(k, name); + delete [] name; return true; - } - + } size_t BasicBloomFilter::M(double fp, size_t capacity) - { - double ln2 = std::log(2); - return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); - } + { + double ln2 = std::log(2); + return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); + } size_t BasicBloomFilter::K(size_t cells, size_t capacity) - { - double frac = static_cast(cells) / static_cast(capacity); - return std::ceil(frac * std::log(2)); - } + { + double frac = static_cast(cells) / static_cast(capacity); + return std::ceil(frac * std::log(2)); + } BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) - { - if ( ! x->hasher_->Equals(y->hasher_) ) - { - reporter->InternalError("incompatible hashers during Bloom filter merge"); - return NULL; - } - BasicBloomFilter* result = new BasicBloomFilter(); - result->hasher_ = x->hasher_->Clone(); - result->bits_ = new BitVector(*x->bits_ | *y->bits_); - return result; - } + { + if ( ! 
x->hasher->Equals(y->hasher) ) + reporter->InternalError("incompatible hashers during BasicBloomFilter merge"); + + BasicBloomFilter* result = new BasicBloomFilter(); + result->hasher = x->hasher->Clone(); + result->bits = new BitVector(*x->bits | *y->bits); + + return result; + } BasicBloomFilter::BasicBloomFilter() - : bits_(NULL) - { - } + { + bits = 0; + } BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) - : BloomFilter(hasher), - bits_(new BitVector(cells)) - { - } + : BloomFilter(hasher) + { + bits = new BitVector(cells); + } IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) bool BasicBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); - return bits_->Serialize(info); - } + return bits->Serialize(info); + } bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - bits_ = BitVector::Unserialize(info); - return bits_ != NULL; - } + bits = BitVector::Unserialize(info); + return (bits != 0); + } void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - bits_->Set(h[i] % bits_->Size()); - } + { + for ( size_t i = 0; i < h.size(); ++i ) + bits->Set(h[i] % bits->Size()); + } size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const - { - for ( size_t i = 0; i < h.size(); ++i ) - if ( ! (*bits_)[h[i] % bits_->Size()] ) - return 0; - return 1; - } + { + for ( size_t i = 0; i < h.size(); ++i ) + { + if ( ! (*bits)[h[i] % bits->Size()] ) + return 0; + } + return 1; + } CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y) - { - if ( ! 
x->hasher_->Equals(y->hasher_) ) - { - reporter->InternalError("incompatible hashers during Bloom filter merge"); - return NULL; - } - CountingBloomFilter* result = new CountingBloomFilter(); - result->hasher_ = x->hasher_->Clone(); - result->cells_ = new CounterVector(*x->cells_ | *y->cells_); - return result; - } + const CountingBloomFilter* y) + { + if ( ! x->hasher->Equals(y->hasher) ) + reporter->InternalError("incompatible hashers during CountingBloomFilter merge"); + + CountingBloomFilter* result = new CountingBloomFilter(); + result->hasher = x->hasher->Clone(); + result->cells = new CounterVector(*x->cells | *y->cells); + + return result; + } CountingBloomFilter::CountingBloomFilter() - : cells_(NULL) - { - } + { + cells = 0; + } CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, - size_t cells, size_t width) - : BloomFilter(hasher), - cells_(new CounterVector(width, cells)) - { - } - + size_t arg_cells, size_t width) + : BloomFilter(hasher) + { + cells = new CounterVector(width, arg_cells); + } IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); - return cells_->Serialize(info); - } + return cells->Serialize(info); + } bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BloomFilter); - cells_ = CounterVector::Unserialize(info); - return cells_ != NULL; - } + cells = CounterVector::Unserialize(info); + return (cells != 0); + } // TODO: Use partitioning in add/count to allow for reusing CMS bounds. 
- void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) - { - for ( size_t i = 0; i < h.size(); ++i ) - cells_->Increment(h[i] % cells_->Size()); - } + { + for ( size_t i = 0; i < h.size(); ++i ) + cells->Increment(h[i] % cells->Size()); + } size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const - { - CounterVector::size_type min = - std::numeric_limits::max(); - for ( size_t i = 0; i < h.size(); ++i ) - { - CounterVector::size_type cnt = cells_->Count(h[i] % cells_->Size()); - if ( cnt < min ) - min = cnt; - } - return min; - } + { + CounterVector::size_type min = + std::numeric_limits::max(); + + for ( size_t i = 0; i < h.size(); ++i ) + { + CounterVector::size_type cnt = cells->Count(h[i] % cells->Size()); + if ( cnt < min ) + min = cnt; + } + + return min; + } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 2fa849505d..4a6b01c484 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -1,5 +1,7 @@ -#ifndef BloomFilter_h -#define BloomFilter_h +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_BLOOMFILTER_H +#define PROBABILISTIC_BLOOMFILTER_H #include #include "BitVector.h" @@ -11,42 +13,65 @@ class CounterVector; /** * The abstract base class for Bloom filters. + * + * At this point we won't let the user choose the hasher, but we might open + * up the interface in the future. */ class BloomFilter : public SerialObj { public: - // At this point we won't let the user choose the hasher, but we might - // open up the interface in the future. - virtual ~BloomFilter(); + /** + * Destructor. + */ + virtual ~BloomFilter(); - /** - * Adds an element of type T to the Bloom filter. - * @param x The element to add - */ - template - void Add(const T& x) - { - AddImpl((*hasher_)(x)); - } + /** + * Adds an element of type T to the Bloom filter. 
+ * @param x The element to add + */ + template + void Add(const T& x) + { + AddImpl((*hasher)(x)); + } - /** - * Retrieves the associated count of a given value. - * - * @param x The value of type `T` to check. - * - * @return The counter associated with *x*. - */ - template - size_t Count(const T& x) const - { - return CountImpl((*hasher_)(x)); - } + /** + * Retrieves the associated count of a given value. + * + * @param x The value of type `T` to check. + * + * @return The counter associated with *x*. + */ + template + size_t Count(const T& x) const + { + return CountImpl((*hasher)(x)); + } - bool Serialize(SerialInfo* info) const; - static BloomFilter* Unserialize(UnserialInfo* info); + /** + * Serializes the Bloom filter. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserializes a Bloom filter. + * + * @param info The serialization information to use. + * + * @return The unserialized Bloom filter, or null if an error + * occurred. + */ + static BloomFilter* Unserialize(UnserialInfo* info); protected: - DECLARE_ABSTRACT_SERIAL(BloomFilter); + DECLARE_ABSTRACT_SERIAL(BloomFilter); + /** + * Default constructor. + */ BloomFilter(); /** @@ -54,12 +79,28 @@ protected: * * @param hasher The hasher to use for this Bloom filter. */ - BloomFilter(const Hasher* hasher); + BloomFilter(const Hasher* hasher); - virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; + /** + * Abstract method for implementing the *Add* operation. + * + * @param hashes A set of *k* hashes for the item to add, computed by + * the internal hasher object. + * + */ + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - const Hasher* hasher_; + /** + * Abstract method for implementing the *Count* operation. 
+ * + * @param hashes A set of *k* hashes for the item to add, computed by + * the internal hasher object. + * + * @return Returns the counter associated with the hashed element. + */ + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; + + const Hasher* hasher; }; /** @@ -67,50 +108,67 @@ protected: */ class BasicBloomFilter : public BloomFilter { public: - /** - * Computes the number of cells based a given false-positive rate and - * capacity. In the literature, this parameter often has the name *M*. - * - * @param fp The false-positive rate. - * - * @param capacity The number of exepected elements. - * - * Returns: The number cells needed to support a false-positive rate of *fp* - * with at most *capacity* elements. - */ - static size_t M(double fp, size_t capacity); + /** + * Constructs a basic Bloom filter with a given number of cells. The + * ideal number of cells can be computed with *M*. + * + * @param hasher The hasher to use. The ideal number of hash + * functions can be computed with *K*. + * + * @param cells The number of cells. + */ + BasicBloomFilter(const Hasher* hasher, size_t cells); - /** - * Computes the optimal number of hash functions based on the number cells - * and expected number of elements. - * - * @param cells The number of cells (*m*). - * - * @param capacity The maximum number of elements. - * - * Returns: the optimal number of hash functions for a false-positive rate of - * *fp* for at most *capacity* elements. - */ - static size_t K(size_t cells, size_t capacity); + /** + * Computes the number of cells based on a given false positive rate + * and capacity. In the literature, this parameter often has the name + * *M*. + * + * @param fp The false positive rate. + * + * @param capacity The expected number of elements that will be + * stored. + * + * Returns: The number cells needed to support a false positive rate + * of *fp* with at most *capacity* elements. 
+ */ + static size_t M(double fp, size_t capacity); - static BasicBloomFilter* Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y); + /** + * Computes the optimal number of hash functions based on the number cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * Returns: the optimal number of hash functions for a false-positive + * rate of *fp* for at most *capacity* elements. + */ + static size_t K(size_t cells, size_t capacity); - /** - * Constructs a basic Bloom filter with a given number of cells and capacity. - */ - BasicBloomFilter(const Hasher* hasher, size_t cells); + /** + * Merges two basic Bloom filters. + * + * @return The merged Bloom filter. + */ + static BasicBloomFilter* Merge(const BasicBloomFilter* x, + const BasicBloomFilter* y); protected: - DECLARE_SERIAL(BasicBloomFilter); + DECLARE_SERIAL(BasicBloomFilter); - BasicBloomFilter(); + /** + * Default constructor. + */ + BasicBloomFilter(); - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + // Overridden from BloomFilter. + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: - BitVector* bits_; + BitVector* bits; }; /** @@ -118,21 +176,40 @@ private: */ class CountingBloomFilter : public BloomFilter { public: - static CountingBloomFilter* Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y); + /** + * Constructs a counting Bloom filter. + * + * @param hasher The hasher to use. The ideal number of hash + * functions can be computed with *K*. + * + * @param cells The number of cells to use. + * + * @param width The maximal bit-width of counter values. 
+ */ + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); - CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); + /** + * Merges two counting Bloom filters. + * + * @return The merged Bloom filter. + */ + static CountingBloomFilter* Merge(const CountingBloomFilter* x, + const CountingBloomFilter* y); protected: - DECLARE_SERIAL(CountingBloomFilter); + DECLARE_SERIAL(CountingBloomFilter); - CountingBloomFilter(); + /** + * Default constructor. + */ + CountingBloomFilter(); - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + // Overridden from BloomFilter. + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; private: - CounterVector* cells_; + CounterVector* cells; }; } diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 943749ad46..570ed1f8ea 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ #include "CounterVector.h" #include @@ -6,154 +8,176 @@ using namespace probabilistic; -CounterVector::CounterVector(size_t width, size_t cells) - : bits_(new BitVector(width * cells)), - width_(width) - { - } +CounterVector::CounterVector(size_t arg_width, size_t cells) + { + bits = new BitVector(arg_width * cells); + width = arg_width; + } CounterVector::CounterVector(const CounterVector& other) - : bits_(new BitVector(*other.bits_)), - width_(other.width_) - { - } + { + bits = new BitVector(*other.bits); + width = other.width; + } CounterVector::~CounterVector() - { - delete bits_; - } + { + delete bits; + } bool CounterVector::Increment(size_type cell, count_type value) - { - assert(cell < Size()); - assert(value != 0); - size_t lsb = cell * width_; - bool carry = false; - for ( size_t i = 0; i < width_; ++i ) - { - bool b1 = (*bits_)[lsb + i]; - bool b2 = value & (1 << i); - (*bits_)[lsb + i] = b1 ^ b2 ^ carry; - carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); - } - if ( carry ) - for ( size_t i = 0; i < width_; ++i ) - bits_->Set(lsb + i); - return ! carry; - } + { + assert(cell < Size()); + assert(value != 0); + + size_t lsb = cell * width; + bool carry = false; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = value & (1 << i); + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + if ( carry ) + { + for ( size_t i = 0; i < width; ++i ) + bits->Set(lsb + i); + } + + return ! 
carry; + } bool CounterVector::Decrement(size_type cell, count_type value) - { - assert(cell < Size()); - assert(value != 0); - value = ~value + 1; // A - B := A + ~B + 1 - bool carry = false; - size_t lsb = cell * width_; - for ( size_t i = 0; i < width_; ++i ) - { - bool b1 = (*bits_)[lsb + i]; - bool b2 = value & (1 << i); - (*bits_)[lsb + i] = b1 ^ b2 ^ carry; - carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); - } - return carry; - } + { + assert(cell < Size()); + assert(value != 0); + + value = ~value + 1; // A - B := A + ~B + 1 + bool carry = false; + size_t lsb = cell * width; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = value & (1 << i); + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + return carry; + } CounterVector::count_type CounterVector::Count(size_type cell) const - { - assert(cell < Size()); - size_t cnt = 0, order = 1; - size_t lsb = cell * width_; - for (size_t i = lsb; i < lsb + width_; ++i, order <<= 1) - if ((*bits_)[i]) - cnt |= order; - return cnt; - } + { + assert(cell < Size()); + + size_t cnt = 0, order = 1; + size_t lsb = cell * width; + + for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 ) + if ( (*bits)[i] ) + cnt |= order; + + return cnt; + } CounterVector::size_type CounterVector::Size() const - { - return bits_->Size() / width_; - } + { + return bits->Size() / width; + } size_t CounterVector::Width() const - { - return width_; - } + { + return width; + } size_t CounterVector::Max() const - { - return std::numeric_limits::max() - >> (std::numeric_limits::digits - width_); - } + { + return std::numeric_limits::max() + >> (std::numeric_limits::digits - width); + } CounterVector& CounterVector::Merge(const CounterVector& other) - { - assert(Size() == other.Size()); - assert(Width() == other.Width()); - for ( size_t cell = 0; cell < Size(); ++cell ) - { - size_t lsb = cell * width_; - bool carry = false; - for ( size_t i = 0; i < width_; 
++i ) - { - bool b1 = (*bits_)[lsb + i]; - bool b2 = (*other.bits_)[lsb + i]; - (*bits_)[lsb + i] = b1 ^ b2 ^ carry; - carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); - } - if ( carry ) - for ( size_t i = 0; i < width_; ++i ) - bits_->Set(lsb + i); - } - return *this; - } + { + assert(Size() == other.Size()); + assert(Width() == other.Width()); + + for ( size_t cell = 0; cell < Size(); ++cell ) + { + size_t lsb = cell * width; + bool carry = false; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = (*other.bits)[lsb + i]; + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + if ( carry ) + { + for ( size_t i = 0; i < width; ++i ) + bits->Set(lsb + i); + } + } + + return *this; + } namespace probabilistic { CounterVector& CounterVector::operator|=(const CounterVector& other) -{ - return Merge(other); -} + { + return Merge(other); + } CounterVector operator|(const CounterVector& x, const CounterVector& y) -{ - CounterVector cv(x); - return cv |= y; -} + { + CounterVector cv(x); + return cv |= y; + } } bool CounterVector::Serialize(SerialInfo* info) const - { - return SerialObj::Serialize(info); - } + { + return SerialObj::Serialize(info); + } CounterVector* CounterVector::Unserialize(UnserialInfo* info) - { - return reinterpret_cast( - SerialObj::Unserialize(info, SER_COUNTERVECTOR)); - } + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) bool CounterVector::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); - if ( ! bits_->Serialize(info) ) - return false; - return SERIALIZE(static_cast(width_)); - } + + if ( ! bits->Serialize(info) ) + return false; + + return SERIALIZE(static_cast(width)); + } bool CounterVector::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - bits_ = BitVector::Unserialize(info); - if ( ! 
bits_ ) - return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) - return false; - width_ = static_cast(width); - return true; - } + bits = BitVector::Unserialize(info); + if ( ! bits ) + return false; + + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + + width = static_cast(width); + + return true; + } diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 63445ec12d..178a68e8f2 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -1,5 +1,7 @@ -#ifndef CounterVector_h -#define CounterVector_h +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_COUNTERVECTOR_H +#define PROBABILISTIC_COUNTERVECTOR_H #include "SerialObj.h" @@ -8,123 +10,143 @@ namespace probabilistic { class BitVector; /** - * A vector of counters, each of which have a fixed number of bits. + * A vector of counters, each of which has a fixed number of bits. */ class CounterVector : public SerialObj { - CounterVector& operator=(const CounterVector&); public: - typedef size_t size_type; - typedef uint64 count_type; + typedef size_t size_type; + typedef uint64 count_type; - /** - * Constructs a counter vector having cells of a given width. - * - * @param width The number of bits that each cell occupies. - * - * @param cells The number of cells in the bitvector. - * - * @pre `cells > 0 && width > 0` - */ - CounterVector(size_t width, size_t cells = 1024); + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. + * + * @param cells The number of cells in the bitvector. + * + * @pre `cells > 0 && width > 0` + */ + CounterVector(size_t width, size_t cells = 1024); /** * Copy-constructs a counter vector. * * @param other The counter vector to copy. */ - CounterVector(const CounterVector& other); + CounterVector(const CounterVector& other); - ~CounterVector(); + /** + * Destructor. 
+ */ + ~CounterVector(); - /** - * Increments a given cell. - * - * @param cell The cell to increment. - * - * @param value The value to add to the current counter in *cell*. - * - * @return `true` if adding *value* to the counter in *cell* succeeded. - * - * @pre `cell < Size()` - */ - bool Increment(size_type cell, count_type value = 1); + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + * + * @pre `cell < Size()` + */ + bool Increment(size_type cell, count_type value = 1); - /** - * Decrements a given cell. - * - * @param cell The cell to decrement. - * - * @param value The value to subtract from the current counter in *cell*. - * - * @return `true` if subtracting *value* from the counter in *cell* succeeded. - * - * @pre `cell < Size()` - */ - bool Decrement(size_type cell, count_type value = 1); + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + * + * @pre `cell < Size()` + */ + bool Decrement(size_type cell, count_type value = 1); - /** - * Retrieves the counter of a given cell. - * - * @param cell The cell index to retrieve the count for. - * - * @return The counter associated with *cell*. - * - * @pre `cell < Size()` - */ - count_type Count(size_type cell) const; + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + * + * @pre `cell < Size()` + */ + count_type Count(size_type cell) const; - /** - * Retrieves the number of cells in the storage. - * - * @return The number of cells. - */ - size_type Size() const; + /** + * Retrieves the number of cells in the storage. 
+ * + * @return The number of cells. + */ + size_type Size() const; - /** - * Retrieves the counter width. - * - * @return The number of bits per counter. - */ - size_t Width() const; + /** + * Retrieves the counter width. + * + * @return The number of bits per counter. + */ + size_t Width() const; - /** - * Computes the maximum counter value. - * - * @return The maximum counter value based on the width. - */ - size_t Max() const; + /** + * Computes the maximum counter value. + * + * @return The maximum counter value based on the width. + */ + size_t Max() const; - /** - * Merges another counter vector into this instance by *adding* the counters - * of each cells. - * - * @param other The counter vector to merge into this instance. - * - * @return A reference to `*this`. - * - * @pre `Size() == other.Size() && Width() == other.Width()` - */ - CounterVector& Merge(const CounterVector& other); + /** + * Merges another counter vector into this instance by *adding* the + * counters of each cells. + * + * @param other The counter vector to merge into this instance. + * + * @return A reference to `*this`. + * + * @pre `Size() == other.Size() && Width() == other.Width()` + */ + CounterVector& Merge(const CounterVector& other); - /** - * An alias for ::Merge. - */ - CounterVector& operator|=(const CounterVector& other); + /** + * An alias for ::Merge. + */ + CounterVector& operator|=(const CounterVector& other); - friend CounterVector operator|(const CounterVector& x, - const CounterVector& y); + /** + * Serializes the bit vector. + * + * @param info The serializaton information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; - bool Serialize(SerialInfo* info) const; - static CounterVector* Unserialize(UnserialInfo* info); + /** + * Unserialize the counter vector. + * + * @param info The serializaton information to use. + * + * @return The unserialized counter vector, or null if an error + * occured. 
+ */ + static CounterVector* Unserialize(UnserialInfo* info); protected: - DECLARE_SERIAL(CounterVector); + friend CounterVector operator|(const CounterVector& x, + const CounterVector& y); - CounterVector() { } + CounterVector() { } + + DECLARE_SERIAL(CounterVector); private: - BitVector* bits_; - size_t width_; + CounterVector& operator=(const CounterVector&); // Disable. + + BitVector* bits; + size_t width; }; } diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index c2f1110ecd..f9ce7bdd6b 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -1,66 +1,70 @@ +// See the file "COPYING" in the main distribution directory for copyright. #include #include "Hasher.h" - #include "digest.h" using namespace probabilistic; -Hasher::UHF::UHF(size_t seed, const std::string& extra) - : h_(compute_seed(seed, extra)) +UHF::UHF(size_t seed, const std::string& extra) + : h(compute_seed(seed, extra)) { } -Hasher::digest Hasher::UHF::hash(const void* x, size_t n) const +Hasher::digest UHF::hash(const void* x, size_t n) const { assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h_(x, n); + return n == 0 ? 0 : h(x, n); } -size_t Hasher::UHF::compute_seed(size_t seed, const std::string& extra) +size_t UHF::compute_seed(size_t seed, const std::string& extra) { u_char buf[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; sha256_init(&ctx); + if ( extra.empty() ) { unsigned int first_seed = initial_seed(); sha256_update(&ctx, &first_seed, sizeof(first_seed)); } - else - { - sha256_update(&ctx, extra.c_str(), extra.size()); - } - sha256_update(&ctx, &seed, sizeof(seed)); - sha256_final(&ctx, buf); - // Take the first sizeof(size_t) bytes as seed. - return *reinterpret_cast(buf); - } + else + sha256_update(&ctx, extra.c_str(), extra.size()); + + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + + // Take the first sizeof(size_t) bytes as seed. 
+ return *reinterpret_cast(buf); + } Hasher* Hasher::Create(size_t k, const std::string& name) { return new DefaultHasher(k, name); } -Hasher::Hasher(size_t k, const std::string& name) - : k_(k), name_(name) +Hasher::Hasher(size_t k, const std::string& arg_name) + : k(k) { + name = arg_name; } DefaultHasher::DefaultHasher(size_t k, const std::string& name) : Hasher(k, name) { for ( size_t i = 0; i < k; ++i ) - hash_functions_.push_back(UHF(i, name)); + hash_functions.push_back(UHF(i, name)); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const { digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) - h[i] = hash_functions_[i](x, n); + h[i] = hash_functions[i](x, n); + return h; } @@ -73,24 +77,25 @@ bool DefaultHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; + const DefaultHasher* o = static_cast(other); - return hash_functions_ == o->hash_functions_; + return hash_functions == o->hash_functions; } DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), - h1_(1, name), - h2_(2, name) + : Hasher(k, name), h1(1, name), h2(2, name) { } Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const { - digest h1 = h1_(x, n); - digest h2 = h2_(x, n); + digest d1 = h1(x, n); + digest d2 = h2(x, n); digest_vector h(K(), 0); + for ( size_t i = 0; i < h.size(); ++i ) - h[i] = h1 + i * h2; + h[i] = d1 + i * d2; + return h; } @@ -103,7 +108,7 @@ bool DoubleHasher::Equals(const Hasher* other) const { if ( typeid(*this) != typeid(*other) ) return false; - const DoubleHasher* o = static_cast(other); - return h1_ == o->h1_ && h2_ == o->h2_; - } + const DoubleHasher* o = static_cast(other); + return h1 == o->h1 && h2 == o->h2; + } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 0231343dcd..62c5d58d1f 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -1,5 +1,7 @@ -#ifndef Hasher_h -#define Hasher_h +// See the 
file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_HASHER_H +#define PROBABILISTIC_HASHER_H #include "Hash.h" #include "H3.h" @@ -7,123 +9,197 @@ namespace probabilistic { /** - * The abstract base class for hashers, i.e., constructs which hash elements - * *k* times. + * Abstract base class for hashers. A hasher creates a family of hash + * functions to hash an element *k* times. */ class Hasher { public: - typedef hash_t digest; - typedef std::vector digest_vector; + typedef hash_t digest; + typedef std::vector digest_vector; - /** - * Constructs the hashing policy used by the implementation. - * - * @todo This factory function exists because the HashingPolicy class - * hierachy is not yet serializable. - */ + /** + * Destructor. + */ + virtual ~Hasher() { } + + /** + * Computes hash values for an element. + * + * @param x The element to hash. + * + * @return Vector of *k* hash values. + */ + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + /** + * Computes the hashes for a set of bytes. + * + * @param x Pointer to first byte to hash. + * + * @param n Number of bytes to hash. + * + * @return Vector of *k* hash values. + * + */ + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + /** + * Returns a deep copy of the hasher. + */ + virtual Hasher* Clone() const = 0; + + /** + * Returns true if two hashers are identical. + */ + virtual bool Equals(const Hasher* other) const = 0; + + /** + * Returns the number *k* of hash functions the hashers applies. + */ + size_t K() const { return k; } + + /** + * Returns the hasher's name. TODO: What's this? + */ + const std::string& Name() const { return name; } + + /** + * Constructs the hasher used by the implementation. This hardcodes a + * specific hashing policy. It exists only because the HashingPolicy + * class hierachy is not yet serializable. + * + * @param k The number of hash functions to apply. 
+ * + * @param name The hasher's name. + * + * @return Returns a new hasher instance. + */ static Hasher* Create(size_t k, const std::string& name); - virtual ~Hasher() { } - - template - digest_vector operator()(const T& x) const - { - return Hash(&x, sizeof(T)); - } - - virtual digest_vector Hash(const void* x, size_t n) const = 0; - - virtual Hasher* Clone() const = 0; - - virtual bool Equals(const Hasher* other) const = 0; - - size_t K() const { return k_; } - const std::string& Name() const { return name_; } - protected: - /** - * A universal hash function family. - */ - class UHF { - public: - /** - * Constructs an H3 hash function seeded with a given seed and an optional - * extra seed to replace the initial Bro seed. - * - * @param seed The seed to use for this instance. - * - * @param extra If not empty, this parameter replaces the initial seed to - * compute the seed for t to compute the - * seed - * NUL-terminated string as additional seed. - */ - UHF(size_t seed, const std::string& extra = ""); + Hasher(size_t k, const std::string& name); - template - digest operator()(const T& x) const - { - return hash(&x, sizeof(T)); - } - - digest operator()(const void* x, size_t n) const - { - return hash(x, n); - } - - friend bool operator==(const UHF& x, const UHF& y) - { - return x.h_ == y.h_; - } - - friend bool operator!=(const UHF& x, const UHF& y) - { - return ! (x == y); - } - - digest hash(const void* x, size_t n) const; - - private: - static size_t compute_seed(size_t seed, const std::string& extra); - - H3 h_; - }; - - Hasher(size_t k, const std::string& name); - -private: - const size_t k_; - std::string name_; + private: + const size_t k; + std::string name; }; /** - * The default hashing policy. Performs *k* hash function computations. + * A universal hash function family. This is a helper class that Hasher + * implementations can use in their implementation. 
+ */ +class UHF { +public: + /** + * Constructs an H3 hash function seeded with a given seed and an + * optional extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this parameter replaces the initial + * seed to compute the seed for t to compute the seed NUL-terminated + * string as additional seed. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + Hasher::digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + /** + * Computes hash values for an element. + * + * @param x The element to hash. + * + * @return Vector of *k* hash values. + */ + Hasher::digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + /** + * Computes the hashes for a set of bytes. + * + * @param x Pointer to first byte to hash. + * + * @param n Number of bytes to hash. + * + * @return Vector of *k* hash values. + * + */ + Hasher::digest hash(const void* x, size_t n) const; + + friend bool operator==(const UHF& x, const UHF& y) + { + return x.h == y.h; + } + + friend bool operator!=(const UHF& x, const UHF& y) + { + return ! (x == y); + } + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h; +}; + + +/** + * A hasher implementing the default hashing policy. Uses *k* separate hash + * functions internally. */ class DefaultHasher : public Hasher { public: - DefaultHasher(size_t k, const std::string& name); + /** + * Constructor for a hasher with *k* hash functions. + * + * @param k The number of hash functions to use. + * + * @param name The name of the hasher. + */ + DefaultHasher(size_t k, const std::string& name); - virtual digest_vector Hash(const void* x, size_t n) const /* final */; - virtual DefaultHasher* Clone() const /* final */; - virtual bool Equals(const Hasher* other) const /* final */; + // Overridden from Hasher. 
+ virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DefaultHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: - std::vector hash_functions_; + std::vector hash_functions; }; /** - * The *double-hashing* policy. Uses a linear combination of two hash functions. + * The *double-hashing* policy. Uses a linear combination of two hash + * functions. */ class DoubleHasher : public Hasher { public: - DoubleHasher(size_t k, const std::string& name); + /** + * Constructor for a double hasher with *k* hash functions. + * + * @param k The number of hash functions to use. + * + * @param name The name of the hasher. + */ + DoubleHasher(size_t k, const std::string& name); - virtual digest_vector Hash(const void* x, size_t n) const /* final */; - virtual DoubleHasher* Clone() const /* final */; - virtual bool Equals(const Hasher* other) const /* final */; + // Overridden from Hasher. + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DoubleHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; private: - UHF h1_; - UHF h2_; + UHF h1; + UHF h2; }; } diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 3c409b1b0f..cbbff85d7d 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -31,18 +31,19 @@ module GLOBAL; ## Returns: A Bloom filter handle. 
function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter - %{ - if ( fp < 0.0 || fp > 1.0 ) - { - reporter->Error("false-positive rate must take value between 0 and 1"); - return NULL; - } + %{ + if ( fp < 0.0 || fp > 1.0 ) + { + reporter->Error("false-positive rate must take value between 0 and 1"); + return 0; + } - size_t cells = BasicBloomFilter::M(fp, capacity); - size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); - return new BloomFilterVal(new BasicBloomFilter(h, cells)); - %} + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} ## Creates a counting Bloom filter. ## @@ -59,20 +60,22 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## ## Returns: A Bloom filter handle. function bloomfilter_counting_init%(k: count, cells: count, max: count, - name: string &default=""%): opaque of bloomfilter - %{ - if ( max == 0 ) - { - reporter->Error("max counter value must be greater than 0"); - return NULL; - } + name: string &default=""%): opaque of bloomfilter + %{ + if ( max == 0 ) + { + reporter->Error("max counter value must be greater than 0"); + return 0; + } - const Hasher* h = Hasher::Create(k, name->CheckString()); - uint16 width = 1; - while ( max >>= 1 ) - ++width; - return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); - %} + const Hasher* h = Hasher::Create(k, name->CheckString()); + + uint16 width = 1; + while ( max >>= 1 ) + ++width; + + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); + %} ## Adds an element to a Bloom filter. ## @@ -80,16 +83,20 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. 
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any - %{ - BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) - reporter->Error("failed to set Bloom filter type"); - else if ( bfv->Type() != x->Type() ) - reporter->Error("incompatible Bloom filter types"); - else - bfv->Add(x); - return NULL; - %} + %{ + BloomFilterVal* bfv = static_cast(bf); + + if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) + reporter->Error("failed to set Bloom filter type"); + + else if ( ! same_type(bfv->Type(), x->Type()) ) + reporter->Error("incompatible Bloom filter types"); + + else + bfv->Add(x); + + return 0; + %} ## Retrieves the counter for a given element in a Bloom filter. ## @@ -99,16 +106,20 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count - %{ - const BloomFilterVal* bfv = static_cast(bf); - if ( ! bfv->Type() ) - reporter->Error("cannot perform lookup on untyped Bloom filter"); - else if ( bfv->Type() != x->Type() ) - reporter->Error("incompatible Bloom filter types"); - else - return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); - return new Val(0, TYPE_COUNT); - %} + %{ + const BloomFilterVal* bfv = static_cast(bf); + + if ( ! bfv->Type() ) + reporter->Error("cannot perform lookup on untyped Bloom filter"); + + else if ( ! same_type(bfv->Type(), x->Type()) ) + reporter->Error("incompatible Bloom filter types"); + + else + return new Val(static_cast(bfv->Count(x)), TYPE_COUNT); + + return new Val(0, TYPE_COUNT); + %} ## Merges two Bloom filters. ## @@ -118,13 +129,16 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## Returns: The union of *bf1* and *bf2*. 
function bloomfilter_merge%(bf1: opaque of bloomfilter, - bf2: opaque of bloomfilter%): opaque of bloomfilter - %{ - const BloomFilterVal* bfv1 = static_cast(bf1); - const BloomFilterVal* bfv2 = static_cast(bf2); - if ( bfv1->Type() != bfv2->Type() ) - reporter->Error("incompatible Bloom filter types"); - else - return BloomFilterVal::Merge(bfv1, bfv2); - return NULL; - %} + bf2: opaque of bloomfilter%): opaque of bloomfilter + %{ + const BloomFilterVal* bfv1 = static_cast(bf1); + const BloomFilterVal* bfv2 = static_cast(bf2); + + if ( ! same_type(bfv1->Type(), bfv2->Type()) ) + { + reporter->Error("incompatible Bloom filter types"); + return 0; + } + + return BloomFilterVal::Merge(bfv1, bfv2); + %} diff --git a/src/util.cc b/src/util.cc index 81ec135f98..6bea2eb7f1 100644 --- a/src/util.cc +++ b/src/util.cc @@ -803,10 +803,10 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file bro_srandom(seed, seeds_done); if ( ! first_seed_saved ) - { - first_seed = seed; - first_seed_saved = true; - } + { + first_seed = seed; + first_seed_saved = true; + } if ( ! hmac_key_set ) { @@ -820,9 +820,9 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file } unsigned int initial_seed() - { - return first_seed; -} + { + return first_seed; + } bool have_random_seed() { @@ -830,7 +830,7 @@ bool have_random_seed() } long int bro_prng(long int state) - { + { // Use our own simple linear congruence PRNG to make sure we are // predictable across platforms. static const long int m = 2147483647; @@ -844,14 +844,14 @@ long int bro_prng(long int state) state += m; return state; - } + } long int bro_random() { if ( ! bro_rand_determistic ) return random(); // Use system PRNG. 
- bro_rand_state = bro_prng(bro_rand_state); + bro_rand_state = bro_prng(bro_rand_state); return bro_rand_state; } diff --git a/src/util.h b/src/util.h index 5689253d95..aaad2d9403 100644 --- a/src/util.h +++ b/src/util.h @@ -166,15 +166,15 @@ extern void init_random_seed(uint32 seed, const char* load_file, const char* write_file); // Retrieves the initial seed computed after the very first call to -// init_random_seed(). Repeated calls to init_random_seed() will not affect the -// return value of this function. +// init_random_seed(). Repeated calls to init_random_seed() will not affect +// the return value of this function. unsigned int initial_seed(); // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); -// A simple linear congruence PRNG. It takes its state as argument and returns -// a new random value, which can serve as state for subsequent calls. +// A simple linear congruence PRNG. It takes its state as argument and +// returns a new random value, which can serve as state for subsequent calls. 
long int bro_prng(long int state); // Replacement for the system random(), to which is normally falls back diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 4fe2ae1ecc..14e1f038c0 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -1,3 +1,9 @@ +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: false-positive rate must take value between 0 and 1 +error: false-positive rate must take value between 0 and 1 0 1 1 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index f69ddbda0c..3b40f29553 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -1,4 +1,4 @@ -# @TEST-EXEC: bro -b %INPUT >output +# @TEST-EXEC: bro -b %INPUT >output 2>&1 # @TEST-EXEC: btest-diff output function test_basic_bloom_filter() From c89f61917b8b7a6ab8014fad211c879681c3ad5f Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 23 Jul 2013 18:44:22 -0700 Subject: [PATCH 54/73] Updating NEWS. --- NEWS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/NEWS b/NEWS index 1fce6b1d9d..b1a5adc12b 100644 --- a/NEWS +++ b/NEWS @@ -108,6 +108,18 @@ New Functionality shunting, and sampling; plus plugin support to customize filters dynamically. +- Bro now provides Bloom filters of two kinds: basic Bloom filters + supporting membership tests, and counting Bloom filters that track + the frequency of elements. 
The corresponding functions are: + + bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter + bloomfilter_add(bf: opaque of bloomfilter, x: any) + bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count + bloomfilter_merge(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter): opaque of bloomfilter + + See TODO for full documentation. + Changed Functionality ~~~~~~~~~~~~~~~~~~~~~ From 5383e8f75bae11bc5da30acf0b77493b90e5f71c Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 11:21:10 +0200 Subject: [PATCH 55/73] Add bloomfilter_clear() BiF. --- src/OpaqueVal.cc | 5 +++++ src/OpaqueVal.h | 1 + src/probabilistic/BloomFilter.cc | 10 ++++++++++ src/probabilistic/BloomFilter.h | 11 +++++++++++ src/probabilistic/CounterVector.cc | 5 +++++ src/probabilistic/CounterVector.h | 5 +++++ src/probabilistic/bloom-filter.bif | 16 ++++++++++++++++ 7 files changed, 53 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index efdd890f70..19a372c005 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -578,6 +578,11 @@ size_t BloomFilterVal::Count(const Val* val) const return cnt; } +void BloomFilterVal::Clear() + { + bloom_filter->Clear(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index ea704cb70a..cfb184fc77 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -125,6 +125,7 @@ public: void Add(const Val* val); size_t Count(const Val* val) const; + void Clear(); static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 5613dcce05..c78cd4193d 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -74,6 +74,11 @@ size_t BasicBloomFilter::K(size_t 
cells, size_t capacity) return std::ceil(frac * std::log(2)); } +void BasicBloomFilter::Clear() + { + bits->Clear(); + } + BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, const BasicBloomFilter* y) { @@ -191,3 +196,8 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const return min; } + +void CountingBloomFilter::Clear() + { + cells->Clear(); + } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 4a6b01c484..55bc76fca7 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -47,6 +47,11 @@ public: return CountImpl((*hasher)(x)); } + /** + * Removes all elements, i.e., resets all bits in the underlying bit vector. + */ + virtual void Clear() = 0; + /** * Serializes the Bloom filter. * @@ -147,6 +152,9 @@ public: */ static size_t K(size_t cells, size_t capacity); + // Overridden from BloomFilter. + virtual void Clear(); + /** * Merges two basic Bloom filters. * @@ -188,6 +196,9 @@ public: */ CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); + // Overridden from BloomFilter. + virtual void Clear(); + /** * Merges two counting Bloom filters. * diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 570ed1f8ea..00fa7fb8c0 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -70,6 +70,11 @@ bool CounterVector::Decrement(size_type cell, count_type value) return carry; } +void CounterVector::Clear() + { + bits->Clear(); + } + CounterVector::count_type CounterVector::Count(size_type cell) const { assert(cell < Size()); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 178a68e8f2..896f98ef1e 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -77,6 +77,11 @@ public: */ count_type Count(size_type cell) const; + /** + * Sets all counters to 0. 
+ */ + void Clear(); + /** * Retrieves the number of cells in the storage. * diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index cbbff85d7d..9df168be0e 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -121,6 +121,22 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count return new Val(0, TYPE_COUNT); %} +## Removes all elements from a Bloom filter. This function resets all bits +## in the underlying bitvector to 0 but does not change the parameterization of +## the Bloom filter, such as the element type and the hasher seed. +## +## bf: The Bloom filter handle. +function bloomfilter_clear%(bf: opaque of bloomfilter%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + + if ( bfv->Type() ) // Untyped Bloom filters are already empty. + bfv->Clear(); + + return 0; + %} + + ## Merges two Bloom filters. ## ## bf1: The first Bloom filter handle. From 5736aef440574389dda6555642ee7e938156dcf1 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 13:05:38 +0200 Subject: [PATCH 56/73] Refactor Bloom filter merging. --- src/OpaqueVal.cc | 31 ++++++++--- src/OpaqueVal.h | 22 -------- src/probabilistic/BloomFilter.cc | 92 +++++++++++++++++++++++--------- src/probabilistic/BloomFilter.h | 36 +++++++------ 4 files changed, 109 insertions(+), 72 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19a372c005..feff4f3cc0 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -584,21 +584,36 @@ void BloomFilterVal::Clear() } BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, - const BloomFilterVal* y) + const BloomFilterVal* y) { if ( !
same_type(x->Type(), y->Type()) ) + { reporter->InternalError("cannot merge Bloom filters with different types"); + return 0; + } - BloomFilterVal* result; + if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) + { + reporter->InternalError("cannot merge different Bloom filter types"); + return 0; + } - if ( (result = DoMerge(x, y)) ) - return result; + probabilistic::BloomFilter* copy = x->bloom_filter->Clone(); + bool success = copy->Merge(y->bloom_filter); + if ( ! success ) + { + reporter->InternalError("failed to merge Bloom filter"); + return 0; + } - else if ( (result = DoMerge(x, y)) ) - return result; + BloomFilterVal* merged = new BloomFilterVal(copy); + if ( ! merged->Typify(x->Type()) ) + { + reporter->InternalError("failed to set type on merged Bloom filter"); + return 0; + } - reporter->InternalError("failed to merge Bloom filters"); - return 0; + return merged; } BloomFilterVal::~BloomFilterVal() diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index cfb184fc77..360bb69803 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -142,28 +142,6 @@ private: BloomFilterVal(const BloomFilterVal&); BloomFilterVal& operator=(const BloomFilterVal&); - template - static BloomFilterVal* DoMerge(const BloomFilterVal* x, - const BloomFilterVal* y) - { - if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) - reporter->InternalError("cannot merge different Bloom filter types"); - - if ( typeid(T) != typeid(*x->bloom_filter) ) - return 0; - - const T* a = static_cast(x->bloom_filter); - const T* b = static_cast(y->bloom_filter); - - BloomFilterVal* merged = new BloomFilterVal(T::Merge(a, b)); - assert(merged); - - if ( ! 
merged->Typify(x->Type()) ) - reporter->InternalError("failed to set type on merged Bloom filter"); - - return merged; - } - BroType* type; CompositeHash* hash; probabilistic::BloomFilter* bloom_filter; diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index c78cd4193d..132cf376ec 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -79,17 +79,37 @@ void BasicBloomFilter::Clear() bits->Clear(); } -BasicBloomFilter* BasicBloomFilter::Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y) +bool BasicBloomFilter::Merge(const BloomFilter* other) { - if ( ! x->hasher->Equals(y->hasher) ) - reporter->InternalError("incompatible hashers during BasicBloomFilter merge"); + if ( typeid(*this) != typeid(*other) ) + return 0; - BasicBloomFilter* result = new BasicBloomFilter(); - result->hasher = x->hasher->Clone(); - result->bits = new BitVector(*x->bits | *y->bits); + const BasicBloomFilter* o = static_cast(other); - return result; + if ( ! hasher->Equals(o->hasher) ) + { + reporter->InternalError("incompatible hashers in BasicBloomFilter merge"); + return false; + } + else if ( bits->Size() != o->bits->Size() ) + { + reporter->InternalError("different bitvector size in BasicBloomFilter merge"); + return false; + } + + (*bits) |= *o->bits; + + return true; + } + +BasicBloomFilter* BasicBloomFilter::Clone() const + { + BasicBloomFilter* copy = new BasicBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->bits = new BitVector(*bits); + + return copy; } BasicBloomFilter::BasicBloomFilter() @@ -135,19 +155,6 @@ size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const return 1; } -CountingBloomFilter* CountingBloomFilter::Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y) - { - if ( ! 
x->hasher->Equals(y->hasher) ) - reporter->InternalError("incompatible hashers during CountingBloomFilter merge"); - - CountingBloomFilter* result = new CountingBloomFilter(); - result->hasher = x->hasher->Clone(); - result->cells = new CounterVector(*x->cells | *y->cells); - - return result; - } - CountingBloomFilter::CountingBloomFilter() { cells = 0; @@ -160,6 +167,44 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, cells = new CounterVector(width, arg_cells); } +void CountingBloomFilter::Clear() + { + cells->Clear(); + } + +bool CountingBloomFilter::Merge(const BloomFilter* other) + { + if ( typeid(*this) != typeid(*other) ) + return 0; + + const CountingBloomFilter* o = static_cast(other); + + if ( ! hasher->Equals(o->hasher) ) + { + reporter->InternalError("incompatible hashers in CountingBloomFilter merge"); + return false; + } + else if ( cells->Size() != o->cells->Size() ) + { + reporter->InternalError("different bitvector size in CountingBloomFilter merge"); + return false; + } + + (*cells) |= *o->cells; + + return true; + } + +CountingBloomFilter* CountingBloomFilter::Clone() const + { + CountingBloomFilter* copy = new CountingBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->cells = new CounterVector(*cells); + + return copy; + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -196,8 +241,3 @@ size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const return min; } - -void CountingBloomFilter::Clear() - { - cells->Clear(); - } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 55bc76fca7..2ab5b89941 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -52,6 +52,22 @@ public: */ virtual void Clear() = 0; + /** + * Merges another Bloom filter into this one. + * + * @param other The other Bloom filter. + * + * @return `true` on success.
+ */ + virtual bool Merge(const BloomFilter* other) = 0; + + /** + * Constructs a copy of this Bloom filter. + * + * @return A copy of `*this`. + */ + virtual BloomFilter* Clone() const = 0; + /** * Serializes the Bloom filter. * @@ -154,14 +170,8 @@ public: // Overridden from BloomFilter. virtual void Clear(); - - /** - * Merges two basic Bloom filters. - * - * @return The merged Bloom filter. - */ - static BasicBloomFilter* Merge(const BasicBloomFilter* x, - const BasicBloomFilter* y); + virtual bool Merge(const BloomFilter* other); + virtual BasicBloomFilter* Clone() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -198,14 +208,8 @@ public: // Overridden from BloomFilter. virtual void Clear(); - - /** - * Merges two counting Bloom filters. - * - * @return The merged Bloom filter. - */ - static CountingBloomFilter* Merge(const CountingBloomFilter* x, - const CountingBloomFilter* y); + virtual bool Merge(const BloomFilter* other); + virtual CountingBloomFilter* Clone() const; protected: DECLARE_SERIAL(CountingBloomFilter); From 5769c32f1eeb319e599996e05e0e63b30af34823 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Wed, 24 Jul 2013 13:18:19 +0200 Subject: [PATCH 57/73] Support emptiness check on Bloom filters. 
--- src/OpaqueVal.cc | 5 +++++ src/OpaqueVal.h | 1 + src/probabilistic/BitVector.cc | 8 ++++++++ src/probabilistic/BitVector.h | 6 ++++++ src/probabilistic/BloomFilter.cc | 10 ++++++++++ src/probabilistic/BloomFilter.h | 9 +++++++++ src/probabilistic/CounterVector.cc | 5 +++++ src/probabilistic/CounterVector.h | 6 ++++++ src/probabilistic/bloom-filter.bif | 3 +++ 9 files changed, 53 insertions(+) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index feff4f3cc0..a42892e2b2 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -583,6 +583,11 @@ void BloomFilterVal::Clear() bloom_filter->Clear(); } +bool BloomFilterVal::Empty() const + { + return bloom_filter->Empty(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 360bb69803..52c9583fc7 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -126,6 +126,7 @@ public: void Add(const Val* val); size_t Count(const Val* val) const; void Clear(); + bool Empty() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 98f008b24b..13cd1aa3bb 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -463,6 +463,14 @@ bool BitVector::Empty() const return bits.empty(); } +bool BitVector::AllZero() const + { + for ( size_t i = 0; i < bits.size(); ++i ) + if ( bits[i] ) + return false; + return true; + } + BitVector::size_type BitVector::FindFirst() const { return find_from(0); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index 9eefe1b633..d9c55d53c6 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -253,6 +253,12 @@ public: */ bool Empty() const; + /** + * Checks whether all bits are 0. + * @return `true` iff all bits in all blocks are 0. + */ + bool AllZero() const; + /** * Finds the bit position of of the first 1-bit. 
* @return The position of the first bit that equals to one or `npos` if no diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 132cf376ec..7f769cbf7c 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -74,6 +74,11 @@ size_t BasicBloomFilter::K(size_t cells, size_t capacity) return std::ceil(frac * std::log(2)); } +bool BasicBloomFilter::Empty() const + { + return bits->AllZero(); + } + void BasicBloomFilter::Clear() { bits->Clear(); @@ -167,6 +172,11 @@ CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, cells = new CounterVector(width, arg_cells); } +bool CountingBloomFilter::Empty() const + { + return cells->AllZero(); + } + void CountingBloomFilter::Clear() { cells->Clear(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index 2ab5b89941..b6cf18672f 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -47,6 +47,13 @@ public: return CountImpl((*hasher)(x)); } + /** + * Checks whether the Bloom filter is empty. + * + * @return `true` if the Bloom filter contains no elements. + */ + virtual bool Empty() const = 0; + /** * Removes all elements, i.e., resets all bits in the underlying bit vector. */ @@ -169,6 +176,7 @@ public: static size_t K(size_t cells, size_t capacity); // Overridden from BloomFilter. + virtual bool Empty() const; virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; @@ -207,6 +215,7 @@ public: CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); // Overridden from BloomFilter. 
+ virtual bool Empty() const; virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 00fa7fb8c0..24c9ff3638 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -70,6 +70,11 @@ bool CounterVector::Decrement(size_type cell, count_type value) return carry; } +bool CounterVector::AllZero() const + { + return bits->AllZero(); + } + void CounterVector::Clear() { bits->Clear(); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index 896f98ef1e..df6fc57ac2 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -77,6 +77,12 @@ public: */ count_type Count(size_type cell) const; + /** + * Checks whether all counters are 0. + * @return `true` iff all counters have the value 0. + */ + bool AllZero() const; + /** * Sets all counters to 0. */ diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index 9df168be0e..dd21688fdd 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -109,6 +109,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); + if ( bfv->Empty() ) + return new Val(0, TYPE_COUNT); + if ( ! bfv->Type() ) reporter->Error("cannot perform lookup on untyped Bloom filter"); From d8226169b8266b554c73b2804d480d10c4a9e456 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Wed, 24 Jul 2013 16:34:52 -0700 Subject: [PATCH 58/73] Fixing random number generation so that it returns same numbers as before. That broke a lot of tests. 
--- src/H3.h | 16 ++++++++++++++-- src/util.cc | 2 +- src/util.h | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/H3.h b/src/H3.h index 8ea5848816..321fda924b 100644 --- a/src/H3.h +++ b/src/H3.h @@ -66,17 +66,29 @@ template class H3 { public: - H3(T seed = bro_random()) + H3() + { + Init(false, 0); + } + + H3(T seed) + { + Init(true, seed); + } + + void Init(bool have_seed, T seed) { T bit_lookup[N * CHAR_BIT]; for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) { bit_lookup[bit] = 0; - seed = bro_prng(seed); for ( size_t i = 0; i < sizeof(T)/2; i++ ) + { + seed = have_seed ? bro_prng(seed) : bro_random(); // assume random() returns at least 16 random bits bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); + } } for ( size_t byte = 0; byte < N; byte++ ) diff --git a/src/util.cc b/src/util.cc index 6bea2eb7f1..23abbacc3f 100644 --- a/src/util.cc +++ b/src/util.cc @@ -829,7 +829,7 @@ bool have_random_seed() return bro_rand_determistic; } -long int bro_prng(long int state) +unsigned int bro_prng(unsigned int state) { // Use our own simple linear congruence PRNG to make sure we are // predictable across platforms. diff --git a/src/util.h b/src/util.h index aaad2d9403..05b3f032d0 100644 --- a/src/util.h +++ b/src/util.h @@ -175,7 +175,7 @@ extern bool have_random_seed(); // A simple linear congruence PRNG. It takes its state as argument and // returns a new random value, which can serve as state for subsequent calls. -long int bro_prng(long int state); +unsigned int bro_prng(unsigned int state); // Replacement for the system random(), to which is normally falls back // except when a seed has been given. In that case, the function bro_prng. From 33e6435329c9c629b47069fd48fd97139f21a2e4 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Wed, 24 Jul 2013 16:39:22 -0700 Subject: [PATCH 59/73] Updating tests. 
--- doc/scripts/DocSourcesList.cmake | 1 + .../canonified_loaded_scripts.log | 5 +++-- .../canonified_loaded_scripts.log | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/scripts/DocSourcesList.cmake b/doc/scripts/DocSourcesList.cmake index 529b03ca83..26a88027ef 100644 --- a/doc/scripts/DocSourcesList.cmake +++ b/doc/scripts/DocSourcesList.cmake @@ -17,6 +17,7 @@ rest_target(${psd} base/init-default.bro internal) rest_target(${psd} base/init-bare.bro internal) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/analyzer.bif.bro) +rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/bloom-filter.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/bro.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/const.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/event.bif.bro) diff --git a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log index b7585a1477..04316da023 100644 --- a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-05-05-20-50 +#open 2013-07-24-23-38-28 #fields name #types string scripts/base/init-bare.bro @@ -12,6 +12,7 @@ scripts/base/init-bare.bro build/scripts/base/bif/strings.bif.bro build/scripts/base/bif/bro.bif.bro build/scripts/base/bif/reporter.bif.bro + build/scripts/base/bif/bloom-filter.bif.bro build/scripts/base/bif/event.bif.bro build/scripts/base/bif/plugins/__load__.bro build/scripts/base/bif/plugins/Bro_ARP.events.bif.bro @@ -89,4 +90,4 @@ scripts/base/init-bare.bro build/scripts/base/bif/file_analysis.bif.bro scripts/policy/misc/loaded-scripts.bro scripts/base/utils/paths.bro -#close 2013-07-05-05-20-50 +#close 2013-07-24-23-38-28 diff --git 
a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index 999fd7c841..66212643f3 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-10-21-18-31 +#open 2013-07-24-23-38-33 #fields name #types string scripts/base/init-bare.bro @@ -12,6 +12,7 @@ scripts/base/init-bare.bro build/scripts/base/bif/strings.bif.bro build/scripts/base/bif/bro.bif.bro build/scripts/base/bif/reporter.bif.bro + build/scripts/base/bif/bloom-filter.bif.bro build/scripts/base/bif/event.bif.bro build/scripts/base/bif/plugins/__load__.bro build/scripts/base/bif/plugins/Bro_ARP.events.bif.bro @@ -195,4 +196,4 @@ scripts/base/init-default.bro scripts/base/protocols/tunnels/__load__.bro scripts/base/misc/find-checksum-offloading.bro scripts/policy/misc/loaded-scripts.bro -#close 2013-07-10-21-18-31 +#close 2013-07-24-23-38-33 From e482897f885e2f1039b96782d5e4bc080d74a535 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 25 Jul 2013 15:16:53 +0200 Subject: [PATCH 60/73] Add docs and use default value for hasher names. --- src/probabilistic/Hasher.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 62c5d58d1f..d266565284 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -63,7 +63,9 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. TODO: What's this? + * Returns the hasher's name. If not empty, the hasher uses this descriptor + * to seed its *k* hash functions. Otherwise the hasher mixes in the initial + * seed derived from the environment variable `$BRO_SEED`. 
*/ const std::string& Name() const { return name; } @@ -83,7 +85,7 @@ public: protected: Hasher(size_t k, const std::string& name); - private: +private: const size_t k; std::string name; }; @@ -166,7 +168,7 @@ public: * * @param name The name of the hasher. */ - DefaultHasher(size_t k, const std::string& name); + DefaultHasher(size_t k, const std::string& name = ""); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; @@ -190,7 +192,7 @@ public: * * @param name The name of the hasher. */ - DoubleHasher(size_t k, const std::string& name); + DoubleHasher(size_t k, const std::string& name = ""); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; From 2fc5ca53ff8f90aa959b2bc65626b319a1dee529 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Thu, 25 Jul 2013 17:35:35 +0200 Subject: [PATCH 61/73] Make hashers serializable. There exists still a small bug that I could not find; the unit test istate/opaque.bro fails. If someone sees why, please chime in. --- src/SerialTypes.h | 6 ++ src/probabilistic/BloomFilter.cc | 19 +----- src/probabilistic/BloomFilter.h | 3 - src/probabilistic/Hasher.cc | 99 ++++++++++++++++++++++++++---- src/probabilistic/Hasher.h | 33 +++++----- src/probabilistic/bloom-filter.bif | 4 +- 6 files changed, 117 insertions(+), 47 deletions(-) diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 85aed10bda..9933d005f0 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -52,6 +52,7 @@ SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(BLOOMFILTER, 0x1700) +SERIAL_IS(HASHER, 0x1800) // These are the externally visible types. 
const SerialType SER_NONE = 0; @@ -206,6 +207,11 @@ SERIAL_BLOOMFILTER(BLOOMFILTER, 1) SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) +#define SERIAL_HASHER(name, val) SERIAL_CONST(name, val, HASHER) +SERIAL_HASHER(HASHER, 1) +SERIAL_HASHER(DEFAULTHASHER, 2) +SERIAL_HASHER(DOUBLEHASHER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index 7f769cbf7c..d446643ed3 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -38,28 +38,15 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hasher->K())) ) - return false; - - return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size()); + return hasher->Serialize(info); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - uint16 k; - if ( ! UNSERIALIZE(&k) ) - return false; - - const char* name; - if ( ! UNSERIALIZE_STR(&name, 0) ) - return false; - - hasher = Hasher::Create(k, name); - - delete [] name; - return true; + hasher = Hasher::Unserialize(info); + return hasher != 0; } size_t BasicBloomFilter::M(double fp, size_t capacity) diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index b6cf18672f..4865ae145c 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -13,9 +13,6 @@ class CounterVector; /** * The abstract base class for Bloom filters. - * - * At this point we won't let the user choose the hasher, but we might open - * up the interface in the future. 
*/ class BloomFilter : public SerialObj { public: diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index f9ce7bdd6b..7db363142d 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -4,9 +4,56 @@ #include "Hasher.h" #include "digest.h" +#include "Serializer.h" using namespace probabilistic; +bool Hasher::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +Hasher* Hasher::Unserialize(UnserialInfo* info) + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_HASHER)); + } + +bool Hasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_HASHER, SerialObj); + + if ( ! SERIALIZE(static_cast(k)) ) + return false; + + return SERIALIZE_STR(name.c_str(), name.size()); + } + +bool Hasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint16 serial_k; + if ( ! UNSERIALIZE(&serial_k) ) + return false; + k = serial_k; + assert(k > 0); + + const char* serial_name; + if ( ! UNSERIALIZE_STR(&serial_name, 0) ) + return false; + name = serial_name; + delete [] serial_name; + + return true; + } + +Hasher::Hasher(size_t k, const std::string& arg_name) + : k(k) + { + name = arg_name; + } + + UHF::UHF(size_t seed, const std::string& extra) : h(compute_seed(seed, extra)) { @@ -40,17 +87,6 @@ size_t UHF::compute_seed(size_t seed, const std::string& extra) return *reinterpret_cast(buf); } -Hasher* Hasher::Create(size_t k, const std::string& name) - { - return new DefaultHasher(k, name); - } - -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) - { - name = arg_name; - } - DefaultHasher::DefaultHasher(size_t k, const std::string& name) : Hasher(k, name) { @@ -82,6 +118,27 @@ bool DefaultHasher::Equals(const Hasher* other) const return hash_functions == o->hash_functions; } +IMPLEMENT_SERIAL(DefaultHasher, SER_DEFAULTHASHER) + +bool DefaultHasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_DEFAULTHASHER, Hasher); + + // Nothing to do here, the 
base class has all we need serialized already. + return true; + } + +bool DefaultHasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(Hasher); + + hash_functions.clear(); + for ( size_t i = 0; i < K(); ++i ) + hash_functions.push_back(UHF(i, Name())); + + return true; + } + DoubleHasher::DoubleHasher(size_t k, const std::string& name) : Hasher(k, name), h1(1, name), h2(2, name) { @@ -112,3 +169,23 @@ bool DoubleHasher::Equals(const Hasher* other) const const DoubleHasher* o = static_cast(other); return h1 == o->h1 && h2 == o->h2; } + +IMPLEMENT_SERIAL(DoubleHasher, SER_DOUBLEHASHER) + +bool DoubleHasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_DOUBLEHASHER, Hasher); + + // Nothing to do here, the base class has all we need serialized already. + return true; + } + +bool DoubleHasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(Hasher); + + h1 = UHF(1, Name()); + h2 = UHF(2, Name()); + + return true; + } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index d266565284..7e6a8ba134 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -5,6 +5,7 @@ #include "Hash.h" #include "H3.h" +#include "SerialObj.h" namespace probabilistic { @@ -12,7 +13,7 @@ namespace probabilistic { * Abstract base class for hashers. A hasher creates a family of hash * functions to hash an element *k* times. */ -class Hasher { +class Hasher : public SerialObj { public: typedef hash_t digest; typedef std::vector digest_vector; @@ -69,24 +70,18 @@ public: */ const std::string& Name() const { return name; } - /** - * Constructs the hasher used by the implementation. This hardcodes a - * specific hashing policy. It exists only because the HashingPolicy - * class hierachy is not yet serializable. - * - * @param k The number of hash functions to apply. - * - * @param name The hasher's name. - * - * @return Returns a new hasher instance. 
- */ - static Hasher* Create(size_t k, const std::string& name); + bool Serialize(SerialInfo* info) const; + static Hasher* Unserialize(UnserialInfo* info); protected: + DECLARE_ABSTRACT_SERIAL(Hasher); + + Hasher() { } + Hasher(size_t k, const std::string& name); private: - const size_t k; + size_t k; std::string name; }; @@ -106,7 +101,7 @@ public: * seed to compute the seed for t to compute the seed NUL-terminated * string as additional seed. */ - UHF(size_t seed, const std::string& extra = ""); + UHF(size_t seed = 0, const std::string& extra = ""); template Hasher::digest operator()(const T& x) const @@ -175,7 +170,11 @@ public: virtual DefaultHasher* Clone() const /* final */; virtual bool Equals(const Hasher* other) const /* final */; + DECLARE_SERIAL(DefaultHasher); + private: + DefaultHasher() { } + std::vector hash_functions; }; @@ -199,7 +198,11 @@ public: virtual DoubleHasher* Clone() const /* final */; virtual bool Equals(const Hasher* other) const /* final */; + DECLARE_SERIAL(DoubleHasher); + private: + DoubleHasher() { } + UHF h1; UHF h2; }; diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index dd21688fdd..f03e3d149b 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -40,7 +40,7 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + const Hasher* h = new DefaultHasher(optimal_k, name->CheckString()); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} @@ -68,7 +68,7 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = Hasher::Create(k, name->CheckString()); + const Hasher* h = new DefaultHasher(k, name->CheckString()); uint16 width = 1; while ( max >>= 1 ) From febb7e83957aa14fbc14d59782b33ac3690388b3 Mon Sep 17 00:00:00 2001 
From: Robin Sommer Date: Thu, 25 Jul 2013 09:55:15 -0700 Subject: [PATCH 62/73] Convenience make target to update the three coverage tests that usually need tweaking when scripts get added/removed. --- testing/btest/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/testing/btest/Makefile b/testing/btest/Makefile index ff63bdb601..47451fbf27 100644 --- a/testing/btest/Makefile +++ b/testing/btest/Makefile @@ -24,4 +24,11 @@ cleanup: update-doc-sources: ../../doc/scripts/genDocSourcesList.sh ../../doc/scripts/DocSourcesList.cmake +# Updates the three coverage tests that usually need tweaking when +# scripts get added/removed. +update-coverage-tests: update-doc-sources + btest -qU coverage.bare-load-baseline + btest -qU coverage.default-load-baseline + @echo "Use 'git diff' to check updates look right." + .PHONY: all btest-verbose brief btest-brief coverage cleanup From 4a7046848caf6f0b97149c91902e42b770c97b3c Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 25 Jul 2013 09:45:10 -0700 Subject: [PATCH 63/73] bif files declared with bif_target() are now automatically compiled in. No more manual includes to pull them in. (It doesn't quite work fully automatically yet for some bifs that need script-level types defined, like the input and logging frameworks. They still do a manual "@load foo.bif" in their main.bro to get the order right. It's a bit tricky to fix that and would probably need splitting main.bro into two parts; not sure that's worth it.) 
--- CHANGES | 10 ++++++++++ VERSION | 2 +- aux/binpac | 2 +- cmake | 2 +- scripts/base/init-bare.bro | 2 ++ src/CMakeLists.txt | 18 +++++++++++++++++- src/Func.cc | 4 ++++ src/analyzer/Manager.cc | 1 - src/file_analysis/Manager.cc | 1 - .../canonified_loaded_scripts.log | 5 +++-- .../canonified_loaded_scripts.log | 5 +++-- 11 files changed, 42 insertions(+), 10 deletions(-) diff --git a/CHANGES b/CHANGES index 7cbbc74e4f..92d16d7776 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,14 @@ +2.1-826 | 2013-07-25 10:12:26 -0700 + + * bif files declared with bif_target() are now automatically + compiled in. No more manual includes to pull them in. (Robin + Sommer) + + * Covenience make target in testing/btest to update the three + coverage tests that usually need tweaking when scripts get + added/removed. (Robin Sommer) + 2.1-824 | 2013-07-22 14:25:14 -0400 * Fixed a scriptland state issue that manifested especially badly on proxies. (Seth Hall) diff --git a/VERSION b/VERSION index d35eaf1454..71d91b2ea8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-824 +2.1-826 diff --git a/aux/binpac b/aux/binpac index c39bd478b9..0c91feea55 160000 --- a/aux/binpac +++ b/aux/binpac @@ -1 +1 @@ -Subproject commit c39bd478b9d0ecd05b1b83aa9d09a7887893977c +Subproject commit 0c91feea55d00d3a1787203b3a43e3f9044d66e0 diff --git a/cmake b/cmake index 0187b33a29..026639f836 160000 --- a/cmake +++ b/cmake @@ -1 +1 @@ -Subproject commit 0187b33a29d5ec824f940feff60dc5d8c2fe314f +Subproject commit 026639f8368e56742c0cb5d9fb390ea64e60ec50 diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 60ed0d2fd1..cffa6d80f1 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -3050,3 +3050,5 @@ const snaplen = 8192 &redef; @load base/frameworks/input @load base/frameworks/analyzer @load base/frameworks/file-analysis + +@load base/bif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e353dd4695..4644bab80a 100644 --- a/src/CMakeLists.txt +++ 
b/src/CMakeLists.txt @@ -6,6 +6,9 @@ include_directories(BEFORE # This collects generated bif and pac files from subdirectories. set(bro_ALL_GENERATED_OUTPUTS CACHE INTERNAL "automatically generated files" FORCE) +# This collects bif inputs that we'll load automatically. +set(bro_AUTO_BIFS CACHE INTERNAL "BIFs for automatic inclusion" FORCE) + # If TRUE, use CMake's object libraries for sub-directories instead of # static libraries. This requires CMake >= 2.8.8. set(bro_HAVE_OBJECT_LIBRARIES FALSE) @@ -382,8 +385,21 @@ set(BRO_EXE bro CACHE STRING "Bro executable binary" FORCE) # Target to create all the autogenerated files. +add_custom_target(generate_outputs_stage1) +add_dependencies(generate_outputs_stage1 ${bro_ALL_GENERATED_OUTPUTS}) + +# Target to create the joint includes files that pull in the bif code. +bro_bif_create_includes(generate_outputs_stage2 ${CMAKE_CURRENT_BINARY_DIR} "${bro_AUTO_BIFS}") +add_dependencies(generate_outputs_stage2 generate_outputs_stage1) + +# Global target to trigger creation of autogenerated code. add_custom_target(generate_outputs) -add_dependencies(generate_outputs ${bro_ALL_GENERATED_OUTPUTS}) +add_dependencies(generate_outputs generate_outputs_stage2) + +# Build __load__.bro files for standard *.bif.bro. +bro_bif_create_loader(bif_loader ${CMAKE_BINARY_DIR}/scripts/base/bif) +add_dependencies(bif_loader ${bro_SUBDIRS}) +add_dependencies(bro bif_loader) # Build __load__.bro files for plugins/*.bif.bro. bro_bif_create_loader(bif_loader_plugins ${CMAKE_BINARY_DIR}/scripts/base/bif/plugins) diff --git a/src/Func.cc b/src/Func.cc index f3718fe231..7859e8d2ad 100644 --- a/src/Func.cc +++ b/src/Func.cc @@ -560,6 +560,8 @@ void builtin_error(const char* msg, BroObj* arg) #include "reporter.bif.func_def" #include "strings.bif.func_def" +#include "__all__.bif.cc" // Autogenerated for compiling in the bif_target() code. 
+ void init_builtin_funcs() { bro_resources = internal_type("bro_resources")->AsRecordType(); @@ -574,6 +576,8 @@ void init_builtin_funcs() #include "reporter.bif.func_init" #include "strings.bif.func_init" +#include "__all__.bif.init.cc" // Autogenerated for compiling in the bif_target() code. + did_builtin_init = true; } diff --git a/src/analyzer/Manager.cc b/src/analyzer/Manager.cc index 5695dec625..8b290e2341 100644 --- a/src/analyzer/Manager.cc +++ b/src/analyzer/Manager.cc @@ -103,7 +103,6 @@ void Manager::InitPreScript() void Manager::InitPostScript() { - #include "analyzer.bif.init.cc" } void Manager::DumpDebug() diff --git a/src/file_analysis/Manager.cc b/src/file_analysis/Manager.cc index ea1ed954ed..a7f7a29c18 100644 --- a/src/file_analysis/Manager.cc +++ b/src/file_analysis/Manager.cc @@ -60,7 +60,6 @@ void Manager::RegisterAnalyzerComponent(Component* component) void Manager::InitPostScript() { - #include "file_analysis.bif.init.cc" } void Manager::Terminate() diff --git a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log index b7585a1477..724de75027 100644 --- a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-05-05-20-50 +#open 2013-07-25-17-10-49 #fields name #types string scripts/base/init-bare.bro @@ -87,6 +87,7 @@ scripts/base/init-bare.bro scripts/base/frameworks/file-analysis/__load__.bro scripts/base/frameworks/file-analysis/main.bro build/scripts/base/bif/file_analysis.bif.bro + build/scripts/base/bif/__load__.bro scripts/policy/misc/loaded-scripts.bro scripts/base/utils/paths.bro -#close 2013-07-05-05-20-50 +#close 2013-07-25-17-10-49 diff --git 
a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index 999fd7c841..a3e89b4d60 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-10-21-18-31 +#open 2013-07-25-17-10-50 #fields name #types string scripts/base/init-bare.bro @@ -87,6 +87,7 @@ scripts/base/init-bare.bro scripts/base/frameworks/file-analysis/__load__.bro scripts/base/frameworks/file-analysis/main.bro build/scripts/base/bif/file_analysis.bif.bro + build/scripts/base/bif/__load__.bro scripts/base/init-default.bro scripts/base/utils/site.bro scripts/base/utils/patterns.bro @@ -195,4 +196,4 @@ scripts/base/init-default.bro scripts/base/protocols/tunnels/__load__.bro scripts/base/misc/find-checksum-offloading.bro scripts/policy/misc/loaded-scripts.bro -#close 2013-07-10-21-18-31 +#close 2013-07-25-17-10-50 From c11bf3d9226fed28dbf2676c123cadd52bd13a68 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 25 Jul 2013 11:28:30 -0700 Subject: [PATCH 64/73] Fixing serialization bug introduced during earlier merge. --- src/OpaqueVal.cc | 6 +++--- src/probabilistic/BitVector.cc | 6 +++--- src/probabilistic/CounterVector.cc | 6 +++--- .../canonified_loaded_scripts.log | 14 +++++++------- .../canonified_loaded_scripts.log | 14 +++++++------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index b70cfee086..66b3c081e7 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -656,11 +656,11 @@ bool BloomFilterVal::DoUnserialize(UnserialInfo* info) if ( is_typed ) { - BroType* type = BroType::Unserialize(info); - if ( ! Typify(type) ) + BroType* t = BroType::Unserialize(info); + if ( ! 
Typify(t) ) return false; - Unref(type); + Unref(t); } bloom_filter = probabilistic::BloomFilter::Unserialize(info); diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index c0285eced3..6e642e62c1 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -568,11 +568,11 @@ bool BitVector::DoUnserialize(UnserialInfo* info) bits[i] = static_cast(block); } - uint64 num_bits; - if ( ! UNSERIALIZE(&num_bits) ) + uint64 n; + if ( ! UNSERIALIZE(&n) ) return false; - num_bits = static_cast(num_bits); + num_bits = static_cast(n); return true; } diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index 24c9ff3638..d5635fc0f2 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -183,11 +183,11 @@ bool CounterVector::DoUnserialize(UnserialInfo* info) if ( ! bits ) return false; - uint64 width; - if ( ! UNSERIALIZE(&width) ) + uint64 w; + if ( ! UNSERIALIZE(&w) ) return false; - width = static_cast(width); + width = static_cast(w); return true; } diff --git a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log index 3236b39acd..5879c504e2 100644 --- a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-25-17-17-10 +#open 2013-07-25-17-54-33 #fields name #types string scripts/base/init-bare.bro @@ -23,28 +23,28 @@ scripts/base/init-bare.bro build/scripts/base/bif/plugins/Bro_DCE_RPC.events.bif.bro build/scripts/base/bif/plugins/Bro_DHCP.events.bif.bro build/scripts/base/bif/plugins/Bro_DNS.events.bif.bro + build/scripts/base/bif/plugins/Bro_FTP.events.bif.bro + build/scripts/base/bif/plugins/Bro_FTP.functions.bif.bro 
build/scripts/base/bif/plugins/Bro_File.events.bif.bro build/scripts/base/bif/plugins/Bro_FileHash.events.bif.bro build/scripts/base/bif/plugins/Bro_Finger.events.bif.bro - build/scripts/base/bif/plugins/Bro_FTP.events.bif.bro - build/scripts/base/bif/plugins/Bro_FTP.functions.bif.bro - build/scripts/base/bif/plugins/Bro_Gnutella.events.bif.bro build/scripts/base/bif/plugins/Bro_GTPv1.events.bif.bro + build/scripts/base/bif/plugins/Bro_Gnutella.events.bif.bro build/scripts/base/bif/plugins/Bro_HTTP.events.bif.bro build/scripts/base/bif/plugins/Bro_HTTP.functions.bif.bro build/scripts/base/bif/plugins/Bro_ICMP.events.bif.bro + build/scripts/base/bif/plugins/Bro_IRC.events.bif.bro build/scripts/base/bif/plugins/Bro_Ident.events.bif.bro build/scripts/base/bif/plugins/Bro_InterConn.events.bif.bro - build/scripts/base/bif/plugins/Bro_IRC.events.bif.bro build/scripts/base/bif/plugins/Bro_Login.events.bif.bro build/scripts/base/bif/plugins/Bro_Login.functions.bif.bro build/scripts/base/bif/plugins/Bro_MIME.events.bif.bro build/scripts/base/bif/plugins/Bro_Modbus.events.bif.bro build/scripts/base/bif/plugins/Bro_NCP.events.bif.bro + build/scripts/base/bif/plugins/Bro_NTP.events.bif.bro build/scripts/base/bif/plugins/Bro_NetBIOS.events.bif.bro build/scripts/base/bif/plugins/Bro_NetBIOS.functions.bif.bro build/scripts/base/bif/plugins/Bro_NetFlow.events.bif.bro - build/scripts/base/bif/plugins/Bro_NTP.events.bif.bro build/scripts/base/bif/plugins/Bro_PIA.events.bif.bro build/scripts/base/bif/plugins/Bro_POP3.events.bif.bro build/scripts/base/bif/plugins/Bro_RPC.events.bif.bro @@ -91,4 +91,4 @@ scripts/base/init-bare.bro build/scripts/base/bif/__load__.bro scripts/policy/misc/loaded-scripts.bro scripts/base/utils/paths.bro -#close 2013-07-25-17-17-10 +#close 2013-07-25-17-54-33 diff --git a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index 
cb4ccba850..2a820f4270 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-25-17-17-11 +#open 2013-07-25-17-54-33 #fields name #types string scripts/base/init-bare.bro @@ -23,28 +23,28 @@ scripts/base/init-bare.bro build/scripts/base/bif/plugins/Bro_DCE_RPC.events.bif.bro build/scripts/base/bif/plugins/Bro_DHCP.events.bif.bro build/scripts/base/bif/plugins/Bro_DNS.events.bif.bro + build/scripts/base/bif/plugins/Bro_FTP.events.bif.bro + build/scripts/base/bif/plugins/Bro_FTP.functions.bif.bro build/scripts/base/bif/plugins/Bro_File.events.bif.bro build/scripts/base/bif/plugins/Bro_FileHash.events.bif.bro build/scripts/base/bif/plugins/Bro_Finger.events.bif.bro - build/scripts/base/bif/plugins/Bro_FTP.events.bif.bro - build/scripts/base/bif/plugins/Bro_FTP.functions.bif.bro - build/scripts/base/bif/plugins/Bro_Gnutella.events.bif.bro build/scripts/base/bif/plugins/Bro_GTPv1.events.bif.bro + build/scripts/base/bif/plugins/Bro_Gnutella.events.bif.bro build/scripts/base/bif/plugins/Bro_HTTP.events.bif.bro build/scripts/base/bif/plugins/Bro_HTTP.functions.bif.bro build/scripts/base/bif/plugins/Bro_ICMP.events.bif.bro + build/scripts/base/bif/plugins/Bro_IRC.events.bif.bro build/scripts/base/bif/plugins/Bro_Ident.events.bif.bro build/scripts/base/bif/plugins/Bro_InterConn.events.bif.bro - build/scripts/base/bif/plugins/Bro_IRC.events.bif.bro build/scripts/base/bif/plugins/Bro_Login.events.bif.bro build/scripts/base/bif/plugins/Bro_Login.functions.bif.bro build/scripts/base/bif/plugins/Bro_MIME.events.bif.bro build/scripts/base/bif/plugins/Bro_Modbus.events.bif.bro build/scripts/base/bif/plugins/Bro_NCP.events.bif.bro + build/scripts/base/bif/plugins/Bro_NTP.events.bif.bro build/scripts/base/bif/plugins/Bro_NetBIOS.events.bif.bro 
build/scripts/base/bif/plugins/Bro_NetBIOS.functions.bif.bro build/scripts/base/bif/plugins/Bro_NetFlow.events.bif.bro - build/scripts/base/bif/plugins/Bro_NTP.events.bif.bro build/scripts/base/bif/plugins/Bro_PIA.events.bif.bro build/scripts/base/bif/plugins/Bro_POP3.events.bif.bro build/scripts/base/bif/plugins/Bro_RPC.events.bif.bro @@ -197,4 +197,4 @@ scripts/base/init-default.bro scripts/base/protocols/tunnels/__load__.bro scripts/base/misc/find-checksum-offloading.bro scripts/policy/misc/loaded-scripts.bro -#close 2013-07-25-17-17-11 +#close 2013-07-25-17-54-33 From 7dd5771384d6e45693e602efaebc18ffbabe8c47 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 25 Jul 2013 12:02:41 -0700 Subject: [PATCH 65/73] Protection about broken traces with empty pcap headers. --- CHANGES | 5 +++++ VERSION | 2 +- src/PktSrc.cc | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 3529576088..912d7d301f 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,9 @@ +2.1-888 | 2013-07-25 12:02:41 -0700 + + * Protection about broken traces with empty pcap headers. (Matt + Thompson) + 2.1-887 | 2013-07-25 11:33:27 -0700 * Support for Bloom filter. (Matthias Vallentin) diff --git a/VERSION b/VERSION index 2ced22d6f4..4f0ea7a5ac 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-887 +2.1-888 diff --git a/src/PktSrc.cc b/src/PktSrc.cc index 105dc90d30..48b382565b 100644 --- a/src/PktSrc.cc +++ b/src/PktSrc.cc @@ -77,6 +77,12 @@ int PktSrc::ExtractNextPacket() data = last_data = pcap_next(pd, &hdr); + if ( data && (hdr.len == 0 || hdr.caplen == 0) ) + { + sessions->Weird("empty_pcap_header", &hdr, data); + return 0; + } + if ( data ) next_timestamp = hdr.ts.tv_sec + double(hdr.ts.tv_usec) / 1e6; From 8d729a378bd149206326f470fa76c1d4447e038f Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 25 Jul 2013 12:08:01 -0700 Subject: [PATCH 66/73] Updating submodule(s). 
[nomail] --- aux/binpac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aux/binpac b/aux/binpac index 0c91feea55..896ddedde5 160000 --- a/aux/binpac +++ b/aux/binpac @@ -1 +1 @@ -Subproject commit 0c91feea55d00d3a1787203b3a43e3f9044d66e0 +Subproject commit 896ddedde55c48ec2163577fc258b49c418abb3e From 32f1c736f7d425b0d03deb93d5d057075737c3c1 Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Mon, 29 Jul 2013 16:40:16 -0400 Subject: [PATCH 67/73] Some script reorg and a new intel extension script. - policy/frameworks/intel/seen is the new location for the scripts that push data into the intel framework for checking. - The new policy/frameworks/intel/do_notice script adds an example mechanism for data driven notices. --- doc/intel.rst | 4 +- doc/scripts/DocSourcesList.cmake | 19 ++++---- scripts/base/frameworks/intel/main.bro | 3 -- scripts/policy/frameworks/intel/do_notice.bro | 44 +++++++++++++++++++ .../frameworks/intel/{ => seen}/__load__.bro | 0 .../intel/{ => seen}/conn-established.bro | 0 .../frameworks/intel/{ => seen}/dns.bro | 0 .../intel/{ => seen}/http-host-header.bro | 0 .../frameworks/intel/{ => seen}/http-url.bro | 0 .../intel/{ => seen}/http-user-agents.bro | 0 .../intel/{ => seen}/smtp-url-extraction.bro | 0 .../frameworks/intel/{ => seen}/smtp.bro | 0 .../frameworks/intel/{ => seen}/ssl.bro | 0 .../intel/{ => seen}/where-locations.bro | 0 scripts/test-all-policy.bro | 21 ++++----- 15 files changed, 67 insertions(+), 24 deletions(-) create mode 100644 scripts/policy/frameworks/intel/do_notice.bro rename scripts/policy/frameworks/intel/{ => seen}/__load__.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/conn-established.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/dns.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/http-host-header.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/http-url.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/http-user-agents.bro (100%) rename 
scripts/policy/frameworks/intel/{ => seen}/smtp-url-extraction.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/smtp.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/ssl.bro (100%) rename scripts/policy/frameworks/intel/{ => seen}/where-locations.bro (100%) diff --git a/doc/intel.rst b/doc/intel.rst index 2a59a98974..787524a417 100644 --- a/doc/intel.rst +++ b/doc/intel.rst @@ -27,7 +27,7 @@ Quick Start Load the package of scripts that sends data into the Intelligence Framework to be checked by loading this script in local.bro:: - @load policy/frameworks/intel + @load policy/frameworks/intel/seen Refer to the "Loading Intelligence" section below to see the format for Intelligence Framework text files, then load those text files with @@ -100,7 +100,7 @@ The full package of hook scripts that Bro ships with for sending this "seen" data into the intelligence framework can be loading by adding this line to local.bro:: - @load policy/frameworks/intel + @load policy/frameworks/intel/seen Intelligence Matches ******************** diff --git a/doc/scripts/DocSourcesList.cmake b/doc/scripts/DocSourcesList.cmake index 26a88027ef..f507172161 100644 --- a/doc/scripts/DocSourcesList.cmake +++ b/doc/scripts/DocSourcesList.cmake @@ -183,15 +183,16 @@ rest_target(${psd} policy/frameworks/control/controllee.bro) rest_target(${psd} policy/frameworks/control/controller.bro) rest_target(${psd} policy/frameworks/dpd/detect-protocols.bro) rest_target(${psd} policy/frameworks/dpd/packet-segment-logging.bro) -rest_target(${psd} policy/frameworks/intel/conn-established.bro) -rest_target(${psd} policy/frameworks/intel/dns.bro) -rest_target(${psd} policy/frameworks/intel/http-host-header.bro) -rest_target(${psd} policy/frameworks/intel/http-url.bro) -rest_target(${psd} policy/frameworks/intel/http-user-agents.bro) -rest_target(${psd} policy/frameworks/intel/smtp-url-extraction.bro) -rest_target(${psd} policy/frameworks/intel/smtp.bro) -rest_target(${psd} 
policy/frameworks/intel/ssl.bro) -rest_target(${psd} policy/frameworks/intel/where-locations.bro) +rest_target(${psd} policy/frameworks/intel/do_notice.bro) +rest_target(${psd} policy/frameworks/intel/seen/conn-established.bro) +rest_target(${psd} policy/frameworks/intel/seen/dns.bro) +rest_target(${psd} policy/frameworks/intel/seen/http-host-header.bro) +rest_target(${psd} policy/frameworks/intel/seen/http-url.bro) +rest_target(${psd} policy/frameworks/intel/seen/http-user-agents.bro) +rest_target(${psd} policy/frameworks/intel/seen/smtp-url-extraction.bro) +rest_target(${psd} policy/frameworks/intel/seen/smtp.bro) +rest_target(${psd} policy/frameworks/intel/seen/ssl.bro) +rest_target(${psd} policy/frameworks/intel/seen/where-locations.bro) rest_target(${psd} policy/frameworks/packet-filter/shunt.bro) rest_target(${psd} policy/frameworks/software/version-changes.bro) rest_target(${psd} policy/frameworks/software/vulnerable.bro) diff --git a/scripts/base/frameworks/intel/main.bro b/scripts/base/frameworks/intel/main.bro index 1b740f538d..a201a7a041 100644 --- a/scripts/base/frameworks/intel/main.bro +++ b/scripts/base/frameworks/intel/main.bro @@ -63,9 +63,6 @@ export { IN_ANYWHERE, }; - ## The $host field and combination of $str and $str_type fields are mutually - ## exclusive. These records *must* represent either an IP address being - ## seen or a string being seen. type Seen: record { ## The string if the data is about a string. indicator: string &log &optional; diff --git a/scripts/policy/frameworks/intel/do_notice.bro b/scripts/policy/frameworks/intel/do_notice.bro new file mode 100644 index 0000000000..720e29c35c --- /dev/null +++ b/scripts/policy/frameworks/intel/do_notice.bro @@ -0,0 +1,44 @@ + +@load base/frameworks/intel +@load base/frameworks/notice + +module Intel; + +export { + redef enum Notice::Type += { + ## Intel::Notice is a notice that happens when an intelligence + ## indicator is denoted to be notice-worthy. 
+ Intel::Notice + }; + + redef record Intel::MetaData += { + ## A boolean value to allow the data itself to represent + ## if the indicator that this metadata is attached to + ## is notice worthy. + do_notice: bool &default=F; + + ## Restrictions on when notices are created to only create + ## them if the do_notice field is T and the notice was + ## seen in the indicated location. + if_in: Intel::Where &optional; + }; +} + +event Intel::match(s: Seen, items: set[Item]) + { + for ( item in items ) + { + if ( item$meta$do_notice && + (! item$meta?$if_in || s$where == item$meta$if_in) ) + { + local n = Notice::Info($note=Intel::Notice, + $msg=fmt("Intel hit on %s at %s", s$indicator, s$where), + $sub=s$indicator); + + if ( s?$conn ) + n$conn = s$conn; + + NOTICE(n); + } + } + } diff --git a/scripts/policy/frameworks/intel/__load__.bro b/scripts/policy/frameworks/intel/seen/__load__.bro similarity index 100% rename from scripts/policy/frameworks/intel/__load__.bro rename to scripts/policy/frameworks/intel/seen/__load__.bro diff --git a/scripts/policy/frameworks/intel/conn-established.bro b/scripts/policy/frameworks/intel/seen/conn-established.bro similarity index 100% rename from scripts/policy/frameworks/intel/conn-established.bro rename to scripts/policy/frameworks/intel/seen/conn-established.bro diff --git a/scripts/policy/frameworks/intel/dns.bro b/scripts/policy/frameworks/intel/seen/dns.bro similarity index 100% rename from scripts/policy/frameworks/intel/dns.bro rename to scripts/policy/frameworks/intel/seen/dns.bro diff --git a/scripts/policy/frameworks/intel/http-host-header.bro b/scripts/policy/frameworks/intel/seen/http-host-header.bro similarity index 100% rename from scripts/policy/frameworks/intel/http-host-header.bro rename to scripts/policy/frameworks/intel/seen/http-host-header.bro diff --git a/scripts/policy/frameworks/intel/http-url.bro b/scripts/policy/frameworks/intel/seen/http-url.bro similarity index 100% rename from 
scripts/policy/frameworks/intel/http-url.bro rename to scripts/policy/frameworks/intel/seen/http-url.bro diff --git a/scripts/policy/frameworks/intel/http-user-agents.bro b/scripts/policy/frameworks/intel/seen/http-user-agents.bro similarity index 100% rename from scripts/policy/frameworks/intel/http-user-agents.bro rename to scripts/policy/frameworks/intel/seen/http-user-agents.bro diff --git a/scripts/policy/frameworks/intel/smtp-url-extraction.bro b/scripts/policy/frameworks/intel/seen/smtp-url-extraction.bro similarity index 100% rename from scripts/policy/frameworks/intel/smtp-url-extraction.bro rename to scripts/policy/frameworks/intel/seen/smtp-url-extraction.bro diff --git a/scripts/policy/frameworks/intel/smtp.bro b/scripts/policy/frameworks/intel/seen/smtp.bro similarity index 100% rename from scripts/policy/frameworks/intel/smtp.bro rename to scripts/policy/frameworks/intel/seen/smtp.bro diff --git a/scripts/policy/frameworks/intel/ssl.bro b/scripts/policy/frameworks/intel/seen/ssl.bro similarity index 100% rename from scripts/policy/frameworks/intel/ssl.bro rename to scripts/policy/frameworks/intel/seen/ssl.bro diff --git a/scripts/policy/frameworks/intel/where-locations.bro b/scripts/policy/frameworks/intel/seen/where-locations.bro similarity index 100% rename from scripts/policy/frameworks/intel/where-locations.bro rename to scripts/policy/frameworks/intel/seen/where-locations.bro diff --git a/scripts/test-all-policy.bro b/scripts/test-all-policy.bro index 1fd34d6f2f..809fc1d1ec 100644 --- a/scripts/test-all-policy.bro +++ b/scripts/test-all-policy.bro @@ -14,16 +14,17 @@ # @load frameworks/control/controller.bro @load frameworks/dpd/detect-protocols.bro @load frameworks/dpd/packet-segment-logging.bro -@load frameworks/intel/__load__.bro -@load frameworks/intel/conn-established.bro -@load frameworks/intel/dns.bro -@load frameworks/intel/http-host-header.bro -@load frameworks/intel/http-url.bro -@load frameworks/intel/http-user-agents.bro -@load 
frameworks/intel/smtp-url-extraction.bro -@load frameworks/intel/smtp.bro -@load frameworks/intel/ssl.bro -@load frameworks/intel/where-locations.bro +@load frameworks/intel/do_notice.bro +@load frameworks/intel/seen/__load__.bro +@load frameworks/intel/seen/conn-established.bro +@load frameworks/intel/seen/dns.bro +@load frameworks/intel/seen/http-host-header.bro +@load frameworks/intel/seen/http-url.bro +@load frameworks/intel/seen/http-user-agents.bro +@load frameworks/intel/seen/smtp-url-extraction.bro +@load frameworks/intel/seen/smtp.bro +@load frameworks/intel/seen/ssl.bro +@load frameworks/intel/seen/where-locations.bro @load frameworks/packet-filter/shunt.bro @load frameworks/software/version-changes.bro @load frameworks/software/vulnerable.bro From 64fc80d7e4a4c1a653a16bf3d3892c50982fcffa Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Thu, 25 Jul 2013 13:31:57 -0700 Subject: [PATCH 68/73] Adding a trace with a DNSKEY RR. Still had this sitting in my inbox, but seems Bro is doing everything right. --- CHANGES | 4 ++++ VERSION | 2 +- .../scripts.base.protocols.dns.dns-key/dns.log | 10 ++++++++++ testing/btest/Traces/dns-dnskey.trace | Bin 0 -> 1110 bytes .../btest/scripts/base/protocols/dns/dns-key.bro | 4 ++++ 5 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 testing/btest/Baseline/scripts.base.protocols.dns.dns-key/dns.log create mode 100644 testing/btest/Traces/dns-dnskey.trace create mode 100644 testing/btest/scripts/base/protocols/dns/dns-key.bro diff --git a/CHANGES b/CHANGES index f4b7e43a7e..0c7235bd47 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,8 @@ +2.1-895 | 2013-07-29 14:07:35 -0700 + + * Adding a test for a DNSKEY RR. (Robin Sommer) + 2.1-894 | 2013-07-29 16:44:41 -0400 * Updates for the Intel Framework. 
(Seth Hall) diff --git a/VERSION b/VERSION index 3131a2159f..9e4a84ae0a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-894 +2.1-895 diff --git a/testing/btest/Baseline/scripts.base.protocols.dns.dns-key/dns.log b/testing/btest/Baseline/scripts.base.protocols.dns.dns-key/dns.log new file mode 100644 index 0000000000..722d2c3912 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.protocols.dns.dns-key/dns.log @@ -0,0 +1,10 @@ +#separator \x09 +#set_separator , +#empty_field (empty) +#unset_field - +#path dns +#open 2013-07-25-20-29-44 +#fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p proto trans_id query qclass qclass_name qtype qtype_name rcode rcode_name AA TC RD RA Z answers TTLs rejected +#types time string addr port addr port enum count string count string count string count string bool bool bool bool count vector[string] vector[interval] bool +1359565680.761790 UWkUyAuUGXf 192.168.6.10 53209 192.168.129.36 53 udp 41477 paypal.com 1 C_INTERNET 48 DNSKEY 0 NOERROR F F T F 1 - - F +#close 2013-07-25-20-29-44 diff --git a/testing/btest/Traces/dns-dnskey.trace b/testing/btest/Traces/dns-dnskey.trace new file mode 100644 index 0000000000000000000000000000000000000000..c7a6448e7990c6717c5f8da73cb1c300bcb4f534 GIT binary patch literal 1110 zcmca|c+)~A1{MYw`2U}Qff2|l2<8mjH=mm!5Xc5$1_qw_pW+x)Crs=x;$U!PU~s++ zROui%Py4_MHm(CJ8dc8UWH4pWcg2l zAU9)JaDWG54dZb}4n{_1R%S*9#;dtA^fp+P2bQoGp4N|geL&lL*-Vp7o5g}J zOiDO>LZzj#t#jUXgDH3N+L@~@Bs=(DJ$&)9_txXvJ9M5Uv@@M)`XV5jrn9D|M{BEK zj=kgJuFnx?+O1ZkHpCv_)Yx<#=q5c-a$r2pFpt52m5G6YDI!ogN+96($re3jDJs_1%i&kO`s>{Z8M|mvw@lF;N;3?2080u$S?4>&_C<&yL@pE_lvLnHLi$pWcCLAeH59Sd_SVCU72I2Nv_F-U(Y3u zuF+j~iP2hWx@U{{ansb?GncbZ>{+uk<(5eOq;(DongKd%=l!cz_v1ZwpL^2O!*+=_ zQNB;P&ZV6-%1D@y_C%nUKX3u>mkPzOzfFq-YEE7~c%^ENUE}|3m&h%;YpkYC^3+iV LW^UFVNah9rRu9Wl literal 0 HcmV?d00001 diff --git a/testing/btest/scripts/base/protocols/dns/dns-key.bro b/testing/btest/scripts/base/protocols/dns/dns-key.bro new file mode 100644 index 0000000000..c51788c605 --- 
/dev/null +++ b/testing/btest/scripts/base/protocols/dns/dns-key.bro @@ -0,0 +1,4 @@ +# Making sure DNSKEY gets logged as such. +# +# @TEST-EXEC: bro -r $TRACES/dns-dnskey.trace +# @TEST-EXEC: btest-diff dns.log From c7676c5e695b0a4590a2fa18e96241455ff4970e Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Mon, 29 Jul 2013 14:29:45 -0700 Subject: [PATCH 69/73] The new magic submodule didn't get merged. --- magic | 1 + 1 file changed, 1 insertion(+) create mode 160000 magic diff --git a/magic b/magic new file mode 160000 index 0000000000..e87fe13a7b --- /dev/null +++ b/magic @@ -0,0 +1 @@ +Subproject commit e87fe13a7b776182ffc8c75076d42702f5c28fed From b76d1d07ca0d0175f57f83379612009c8c09400a Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Mon, 29 Jul 2013 15:06:07 -0700 Subject: [PATCH 70/73] Test updates. BIT-1044 #merged --- CHANGES | 40 +++++++++++++++++++ NEWS | 2 +- VERSION | 2 +- .../canonified_loaded_scripts.log | 5 ++- .../canonified_loaded_scripts.log | 7 ++-- .../http.ds.txt | 18 ++++----- testing/btest/coverage/bare-mode-errors.test | 5 ++- 7 files changed, 62 insertions(+), 17 deletions(-) diff --git a/CHANGES b/CHANGES index 0c7235bd47..1f64cc908a 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,44 @@ +2.1-930 | 2013-07-29 15:06:07 -0700 + + * Major file analysis overhaul in naming and appearance, along with + fixes and test updates. (Seth Hall and Jon Siwek) + + Includes: + + * Added protocol description functions that provide a super + compressed log representation. (Seth Hall) + + * Added mime types to http.log (Seth Hall) + + * Add jar files to the default MHR lookups. (Seth Hall) + + * Adding CAB files for MHR checking. (Seth Hall) + + * Improve malware hash registry script. + + - Include a link to a virustotal search in the notice sub message field. + - Give all information returned from Team Cymru in the notice message. + - Add more file types to match on to the default set. + + * Make the custom libmagic database a git submodule. 
+ + * Add an is_orig parameter to file_over_new_connection event. + + * Recorrected the module name to Files. + + * Added Files::analyzer_name to get a more readable name for a + file analyzer. + + * Improved and just overall better handled multipart mime + transfers in HTTP and SMTP. HTTP now has orig_fuids and + resp_fuids log fields since multiple "files" can be transferred + with multipart mime in a single request/response pair. SMTP has + an fuids field which has file unique IDs for all parts + transferred. FTP and IRC have a log field named fuid added + because only a single file can be transferred per irc and ftp + log line. + 2.1-895 | 2013-07-29 14:07:35 -0700 * Adding a test for a DNSKEY RR. (Robin Sommer) diff --git a/NEWS b/NEWS index c3eabf5554..de2ee1b684 100644 --- a/NEWS +++ b/NEWS @@ -80,7 +80,7 @@ New Functionality with the following user-visibible functionality (some of that was already available before, but done differently): - [TODO: This will probably change with further script updates.] + [TODO: Update with changes from 984e9793db56.] 
- A binary input reader interfaces the input framework with file analysis, allowing to inject files on disk into Bro's diff --git a/VERSION b/VERSION index 9e4a84ae0a..cacffbfffc 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-895 +2.1-930 diff --git a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log index 0caafdf107..e28efc9563 100644 --- a/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.bare-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-25-17-54-33 +#open 2013-07-29-21-31-47 #fields name #types string scripts/base/init-bare.bro @@ -90,6 +90,7 @@ scripts/base/init-bare.bro build/scripts/base/bif/file_analysis.bif.bro scripts/base/utils/site.bro scripts/base/utils/patterns.bro + build/scripts/base/bif/__load__.bro scripts/policy/misc/loaded-scripts.bro scripts/base/utils/paths.bro -#close 2013-07-25-19-59-47 +#close 2013-07-29-21-31-47 diff --git a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index deffbe364b..faf372222b 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -3,7 +3,7 @@ #empty_field (empty) #unset_field - #path loaded_scripts -#open 2013-07-23-05-48-10 +#open 2013-07-29-21-31-48 #fields name #types string scripts/base/init-bare.bro @@ -90,6 +90,7 @@ scripts/base/init-bare.bro build/scripts/base/bif/file_analysis.bif.bro scripts/base/utils/site.bro scripts/base/utils/patterns.bro + build/scripts/base/bif/__load__.bro scripts/base/init-default.bro scripts/base/utils/addrs.bro scripts/base/utils/conn-ids.bro @@ -158,7 +159,7 
@@ scripts/base/init-default.bro scripts/base/protocols/ftp/__load__.bro scripts/base/protocols/ftp/utils-commands.bro scripts/base/protocols/ftp/main.bro - scripts/base/protocols/ftp/utils.bro + scripts/base/protocols/ftp/utils.bro scripts/base/protocols/ftp/files.bro scripts/base/protocols/ftp/gridftp.bro scripts/base/protocols/ssl/__load__.bro @@ -197,4 +198,4 @@ scripts/base/init-default.bro scripts/base/files/extract/main.bro scripts/base/misc/find-checksum-offloading.bro scripts/policy/misc/loaded-scripts.bro -#close 2013-07-23-05-48-10 +#close 2013-07-29-21-31-48 diff --git a/testing/btest/Baseline/scripts.base.frameworks.logging.dataseries.wikipedia/http.ds.txt b/testing/btest/Baseline/scripts.base.frameworks.logging.dataseries.wikipedia/http.ds.txt index e919233b79..fd998057f3 100644 --- a/testing/btest/Baseline/scripts.base.frameworks.logging.dataseries.wikipedia/http.ds.txt +++ b/testing/btest/Baseline/scripts.base.frameworks.logging.dataseries.wikipedia/http.ds.txt @@ -32,10 +32,10 @@ - - - - + + + + @@ -60,13 +60,13 @@ - - - - + + + + # Extent, type='http' -ts uid id.orig_h id.orig_p id.resp_h id.resp_p trans_depth method host uri referrer user_agent request_body_len response_body_len status_code status_msg info_code info_msg filename tags username password proxied mime_type md5 extracted_request_files extracted_response_files +ts uid id.orig_h id.orig_p id.resp_h id.resp_p trans_depth method host uri referrer user_agent request_body_len response_body_len status_code status_msg info_code info_msg filename tags username password proxied orig_fuids orig_mime_types resp_fuids resp_mime_types 1300475168.784020 j4u32Pc5bif 141.142.220.118 48649 208.80.152.118 80 1 GET bits.wikimedia.org /skins-1.5/monobook/main.css http://www.wikipedia.org/ Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.15) Gecko/20110303 Ubuntu/10.04 (lucid) Firefox/3.6.15 0 0 304 Not Modified 0 1300475168.916018 VW0XPVINV8a 141.142.220.118 49997 208.80.152.3 80 1 GET 
upload.wikimedia.org /wikipedia/commons/6/63/Wikipedia-logo.png http://www.wikipedia.org/ Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.15) Gecko/20110303 Ubuntu/10.04 (lucid) Firefox/3.6.15 0 0 304 Not Modified 0 1300475168.916183 3PKsZ2Uye21 141.142.220.118 49996 208.80.152.3 80 1 GET upload.wikimedia.org /wikipedia/commons/thumb/b/bb/Wikipedia_wordmark.svg/174px-Wikipedia_wordmark.svg.png http://www.wikipedia.org/ Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.15) Gecko/20110303 Ubuntu/10.04 (lucid) Firefox/3.6.15 0 0 304 Not Modified 0 diff --git a/testing/btest/coverage/bare-mode-errors.test b/testing/btest/coverage/bare-mode-errors.test index 34ba063081..1910ef8e17 100644 --- a/testing/btest/coverage/bare-mode-errors.test +++ b/testing/btest/coverage/bare-mode-errors.test @@ -10,5 +10,8 @@ # # @TEST-EXEC: test -d $DIST/scripts # @TEST-EXEC: for script in `find $DIST/scripts/ -name \*\.bro -not -path '*/site/*'`; do echo "=== $script" >>allerrors; if echo "$script" | egrep -q 'communication/listen|controllee'; then rm -rf load_attempt .bgprocs; btest-bg-run load_attempt bro -b $script; btest-bg-wait -k 2; cat load_attempt/.stderr >>allerrors; else bro -b $script 2>>allerrors; fi done || exit 0 -# @TEST-EXEC: cat allerrors | grep -v "received termination signal" | grep -v '===' | sort | uniq > unique_errors +# @TEST-EXEC: cat allerrors | grep -v "received termination signal" | fgrep -v -f %INPUT | grep -v '===' | sort | uniq > unique_errors # @TEST-EXEC: btest-diff unique_errors + +# White-list of tests to exclude because of cyclic load dependencies. +scripts/base/protocols/ftp/utils.bro From c30fa36d14382c03d08f545002a33f21eb778cfe Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Mon, 29 Jul 2013 16:39:40 -0700 Subject: [PATCH 71/73] Updating submodule(s). 
[nomail] --- aux/binpac | 2 +- aux/bro-aux | 2 +- aux/broccoli | 2 +- aux/broctl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aux/binpac b/aux/binpac index 896ddedde5..314fa8f65f 160000 --- a/aux/binpac +++ b/aux/binpac @@ -1 +1 @@ -Subproject commit 896ddedde55c48ec2163577fc258b49c418abb3e +Subproject commit 314fa8f65fc240e960c23c3bba98623436a72b98 diff --git a/aux/bro-aux b/aux/bro-aux index a9942558c7..91d258cc8b 160000 --- a/aux/bro-aux +++ b/aux/bro-aux @@ -1 +1 @@ -Subproject commit a9942558c7d3dfd80148b8aaded64c82ade3d117 +Subproject commit 91d258cc8b2f74cd02fc93dfe61f73ec9f0dd489 diff --git a/aux/broccoli b/aux/broccoli index 889f9c6594..d59c73b6e0 160000 --- a/aux/broccoli +++ b/aux/broccoli @@ -1 +1 @@ -Subproject commit 889f9c65944ceac20ad9230efc39d33e6e1221c3 +Subproject commit d59c73b6e0966ad63bbc63a35741b5f68263e7b1 diff --git a/aux/broctl b/aux/broctl index 0cd102805e..52fd91261f 160000 --- a/aux/broctl +++ b/aux/broctl @@ -1 +1 @@ -Subproject commit 0cd102805e73343cab3f9fd4a76552e13940dad9 +Subproject commit 52fd91261f41fa1528f7b964837a364d7991889e From 43825212db25ce540c6a12905844d246f8784c05 Mon Sep 17 00:00:00 2001 From: Matthias Vallentin Date: Tue, 30 Jul 2013 12:17:53 +0200 Subject: [PATCH 72/73] Update submodules. 
--- aux/binpac | 2 +- aux/bro-aux | 2 +- aux/broccoli | 2 +- aux/broctl | 2 +- cmake | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aux/binpac b/aux/binpac index c39bd478b9..314fa8f65f 160000 --- a/aux/binpac +++ b/aux/binpac @@ -1 +1 @@ -Subproject commit c39bd478b9d0ecd05b1b83aa9d09a7887893977c +Subproject commit 314fa8f65fc240e960c23c3bba98623436a72b98 diff --git a/aux/bro-aux b/aux/bro-aux index a9942558c7..91d258cc8b 160000 --- a/aux/bro-aux +++ b/aux/bro-aux @@ -1 +1 @@ -Subproject commit a9942558c7d3dfd80148b8aaded64c82ade3d117 +Subproject commit 91d258cc8b2f74cd02fc93dfe61f73ec9f0dd489 diff --git a/aux/broccoli b/aux/broccoli index 889f9c6594..d59c73b6e0 160000 --- a/aux/broccoli +++ b/aux/broccoli @@ -1 +1 @@ -Subproject commit 889f9c65944ceac20ad9230efc39d33e6e1221c3 +Subproject commit d59c73b6e0966ad63bbc63a35741b5f68263e7b1 diff --git a/aux/broctl b/aux/broctl index 0cd102805e..52fd91261f 160000 --- a/aux/broctl +++ b/aux/broctl @@ -1 +1 @@ -Subproject commit 0cd102805e73343cab3f9fd4a76552e13940dad9 +Subproject commit 52fd91261f41fa1528f7b964837a364d7991889e diff --git a/cmake b/cmake index 0187b33a29..026639f836 160000 --- a/cmake +++ b/cmake @@ -1 +1 @@ -Subproject commit 0187b33a29d5ec824f940feff60dc5d8c2fe314f +Subproject commit 026639f8368e56742c0cb5d9fb390ea64e60ec50 From af9e181731b82167187b7a9ec8995b991920c0e1 Mon Sep 17 00:00:00 2001 From: Robin Sommer Date: Tue, 30 Jul 2013 10:29:27 -0700 Subject: [PATCH 73/73] Updating submodule(s). [nomail] --- magic | 1 + 1 file changed, 1 insertion(+) create mode 160000 magic diff --git a/magic b/magic new file mode 160000 index 0000000000..e87fe13a7b --- /dev/null +++ b/magic @@ -0,0 +1 @@ +Subproject commit e87fe13a7b776182ffc8c75076d42702f5c28fed