diff --git a/CHANGES b/CHANGES index 3d30b3d195..eb780a816f 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,54 @@ +2.1-1007 | 2013-08-01 15:41:54 -0700 + + * More function documentation. (Bernhard Amann) + +2.1-1004 | 2013-08-01 14:37:43 -0700 + + * Adding a probabilistic data structure for computing "top k" + elements. (Bernhard Amann) + + The corresponding functions are: + + topk_init(size: count): opaque of topk + topk_add(handle: opaque of topk, value: any) + topk_get_top(handle: opaque of topk, k: count) + topk_count(handle: opaque of topk, value: any): count + topk_epsilon(handle: opaque of topk, value: any): count + topk_size(handle: opaque of topk): count + topk_sum(handle: opaque of topk): count + topk_merge(handle1: opaque of topk, handle2: opaque of topk) + topk_merge_prune(handle1: opaque of topk, handle2: opaque of topk) + +2.1-971 | 2013-08-01 13:28:32 -0700 + + * Fix some build errors. (Jon Siwek) + + * Internal refactoring of how plugin components are tagged/managed. + (Jon Siwek) + + * Fix various documentation, mostly related to file analysis. (Jon + Siwek) + + * Changing the Bloom filter hashing so that it's independent of + CompositeHash. (Robin Sommer) + +2.1-951 | 2013-08-01 11:19:23 -0400 + + * Small fix to deal with a bug in the SSL log delay mechanism. + +2.1-948 | 2013-07-31 20:08:28 -0700 + + * Fix segfault caused by merging an empty bloom-filter with a + bloom-filter already containing values. (Bernhard Amann) + +2.1-945 | 2013-07-30 10:05:10 -0700 + + * Make hashers serializable. (Matthias Vallentin) + + * Add docs and use default value for hasher names. (Matthias + Vallentin) + 2.1-939 | 2013-07-29 15:42:38 -0700 * Added Exec, Dir, and ActiveHTTP modules. (Seth Hall) diff --git a/NEWS b/NEWS index c421e7d675..3e349e7db3 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,7 @@ New Functionality the frequency of elements. 
The corresponding functions are: bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_basic_init2(k: count, cells: count, name: string &default=""): opaque of bloomfilter bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter bloomfilter_add(bf: opaque of bloomfilter, x: any) bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count @@ -121,6 +122,21 @@ New Functionality See for full documentation. +- Bro now provides a probabilistic data structure for computing + "top k" elements. The corresponding functions are: + + topk_init(size: count): opaque of topk + topk_add(handle: opaque of topk, value: any) + topk_get_top(handle: opaque of topk, k: count) + topk_count(handle: opaque of topk, value: any): count + topk_epsilon(handle: opaque of topk, value: any): count + topk_size(handle: opaque of topk): count + topk_sum(handle: opaque of topk): count + topk_merge(handle1: opaque of topk, handle2: opaque of topk) + topk_merge_prune(handle1: opaque of topk, handle2: opaque of topk) + + See for full documentation. + - base/utils/exec.bro provides a module to start external processes asynchronously and retrieve their output on termination. 
base/utils/dir.bro uses it to monitor a directory for changes, and diff --git a/VERSION b/VERSION index 4ebbf81a9f..b52e411524 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-939 +2.1-1007 diff --git a/aux/bro-aux b/aux/bro-aux index 91d258cc8b..d9963983c0 160000 --- a/aux/bro-aux +++ b/aux/bro-aux @@ -1 +1 @@ -Subproject commit 91d258cc8b2f74cd02fc93dfe61f73ec9f0dd489 +Subproject commit d9963983c0b4d426b24836f8d154d014d5aecbba diff --git a/aux/btest b/aux/btest index ce366206e3..69606f8f3c 160000 --- a/aux/btest +++ b/aux/btest @@ -1 +1 @@ -Subproject commit ce366206e3407e534a786ad572c342e9f9fef26b +Subproject commit 69606f8f3cc84d694ca1da14868a5fecd4abbc96 diff --git a/doc/file-analysis.rst b/doc/file-analysis.rst index f312e06471..0a96a8efb7 100644 --- a/doc/file-analysis.rst +++ b/doc/file-analysis.rst @@ -82,9 +82,9 @@ attached, they start receiving the contents of the file as Bro extracts it from an ongoing network connection. What they do with the file contents is up to the particular file analyzer implementation, but they'll typically either report further information about the file via -events (e.g. :bro:see:`FileAnalysis::ANALYZER_MD5` will report the +events (e.g. :bro:see:`Files::ANALYZER_MD5` will report the file's MD5 checksum via :bro:see:`file_hash` once calculated) or they'll -have some side effect (e.g. :bro:see:`FileAnalysis::ANALYZER_EXTRACT` +have some side effect (e.g. :bro:see:`Files::ANALYZER_EXTRACT` will write the contents of the file out to the local file system). 
In the future there may be file analyzers that automatically attach to @@ -98,7 +98,7 @@ explicit attachment decision: { print "new file", f$id; if ( f?$mime_type && f$mime_type == "text/plain" ) - FileAnalysis::add_analyzer(f, [$tag=FileAnalysis::ANALYZER_MD5]); + Files::add_analyzer(f, Files::ANALYZER_MD5); } event file_hash(f: fa_file, kind: string, hash: string) @@ -113,26 +113,27 @@ output:: file_hash, Cx92a0ym5R8, md5, 397168fd09991a0e712254df7bc639ac Some file analyzers might have tunable parameters that need to be -specified in the call to :bro:see:`FileAnalysis::add_analyzer`: +specified in the call to :bro:see:`Files::add_analyzer`: .. code:: bro event file_new(f: fa_file) { - FileAnalysis::add_analyzer(f, [$tag=FileAnalysis::ANALYZER_EXTRACT, - $extract_filename="./myfile"]); + Files::add_analyzer(f, Files::ANALYZER_EXTRACT, + [$extract_filename="myfile"]); } In this case, the file extraction analyzer doesn't generate any further -events, but does have the side effect of writing out the file contents -to the local file system at the specified location of ``./myfile``. Of -course, for a network with more than a single file being transferred, -it's probably preferable to specify a different extraction path for each -file, unlike this example. +events, but does have the effect of writing out the file contents to the +local file system at the location resulting from the concatenation of +the path specified by :bro:see:`FileExtract::prefix` and the string +``myfile``. Of course, for a network with more than a single file being +transferred, it's probably preferable to specify a different extraction +path for each file, unlike this example. Regardless of which file analyzers end up acting on a file, general information about the file (e.g. size, time of last data transferred, -MIME type, etc.) are logged in ``file_analysis.log``. +MIME type, etc.) is logged in ``files.log``. 
Input Framework Integration =========================== @@ -150,7 +151,7 @@ a network interface it's monitoring. It only requires a call to event file_new(f: fa_file) { print "new file", f$id; - FileAnalysis::add_analyzer(f, [$tag=FileAnalysis::ANALYZER_MD5]); + Files::add_analyzer(f, Files::ANALYZER_MD5); } event file_state_remove(f: fa_file) diff --git a/doc/index.rst b/doc/index.rst index ad05f7bf82..aa33d8797d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -47,6 +47,7 @@ Script Reference scripts/index scripts/builtins scripts/proto-analyzers + scripts/file-analyzers Other Bro Components -------------------- diff --git a/doc/scripts/CMakeLists.txt b/doc/scripts/CMakeLists.txt index e7e39d0b3f..fa234e74f2 100644 --- a/doc/scripts/CMakeLists.txt +++ b/doc/scripts/CMakeLists.txt @@ -124,28 +124,34 @@ endmacro(REST_TARGET) # Schedule Bro scripts for which to generate documentation. include(DocSourcesList.cmake) -# This reST target is independent of a particular Bro script... -add_custom_command(OUTPUT proto-analyzers.rst - # delete any leftover state from previous bro runs - COMMAND "${CMAKE_COMMAND}" - ARGS -E remove_directory .state - # generate the reST documentation using bro - COMMAND BROPATH=${BROPATH}:${srcDir} BROMAGIC=${CMAKE_SOURCE_DIR}/magic/database ${CMAKE_BINARY_DIR}/src/bro - ARGS -b -Z base/init-bare.bro || (rm -rf .state *.log *.rst && exit 1) - # move generated doc into a new directory tree that - # defines the final structure of documents - COMMAND "${CMAKE_COMMAND}" - ARGS -E make_directory ${dstDir} - COMMAND "${CMAKE_COMMAND}" - ARGS -E copy proto-analyzers.rst ${dstDir} - # clean up the build directory - COMMAND rm - ARGS -rf .state *.log *.rst - DEPENDS bro - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - COMMENT "[Bro] Generating reST docs for proto-analyzers.rst" -) -list(APPEND ALL_REST_OUTPUTS proto-analyzers.rst) +# Macro for generating reST docs that are independent of any particular Bro +# script. 
+macro(INDEPENDENT_REST_TARGET reST_file) + add_custom_command(OUTPUT ${reST_file} + # delete any leftover state from previous bro runs + COMMAND "${CMAKE_COMMAND}" + ARGS -E remove_directory .state + # generate the reST documentation using bro + COMMAND BROPATH=${BROPATH}:${srcDir} BROMAGIC=${CMAKE_SOURCE_DIR}/magic/database ${CMAKE_BINARY_DIR}/src/bro + ARGS -b -Z base/init-bare.bro || (rm -rf .state *.log *.rst && exit 1) + # move generated doc into a new directory tree that + # defines the final structure of documents + COMMAND "${CMAKE_COMMAND}" + ARGS -E make_directory ${dstDir} + COMMAND "${CMAKE_COMMAND}" + ARGS -E copy ${reST_file} ${dstDir} + # clean up the build directory + COMMAND rm + ARGS -rf .state *.log *.rst + DEPENDS bro + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "[Bro] Generating reST docs for ${reST_file}" + ) + list(APPEND ALL_REST_OUTPUTS ${reST_file}) +endmacro(INDEPENDENT_REST_TARGET) + +independent_rest_target(proto-analyzers.rst) +independent_rest_target(file-analyzers.rst) # create temporary list of all docs to include in the master policy/index file file(WRITE ${MASTER_POLICY_INDEX} "${MASTER_POLICY_INDEX_TEXT}") diff --git a/doc/scripts/DocSourcesList.cmake b/doc/scripts/DocSourcesList.cmake index 97150f84aa..e405834ac6 100644 --- a/doc/scripts/DocSourcesList.cmake +++ b/doc/scripts/DocSourcesList.cmake @@ -73,6 +73,7 @@ rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/plugins/Bro_UDP.events.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/plugins/Bro_ZIP.events.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/reporter.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/strings.bif.bro) +rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/top-k.bif.bro) rest_target(${CMAKE_BINARY_DIR}/scripts base/bif/types.bif.bro) rest_target(${psd} base/files/extract/main.bro) rest_target(${psd} base/files/hash/main.bro) @@ -129,6 +130,7 @@ rest_target(${psd} base/frameworks/sumstats/plugins/min.bro) rest_target(${psd} 
base/frameworks/sumstats/plugins/sample.bro) rest_target(${psd} base/frameworks/sumstats/plugins/std-dev.bro) rest_target(${psd} base/frameworks/sumstats/plugins/sum.bro) +rest_target(${psd} base/frameworks/sumstats/plugins/topk.bro) rest_target(${psd} base/frameworks/sumstats/plugins/unique.bro) rest_target(${psd} base/frameworks/sumstats/plugins/variance.bro) rest_target(${psd} base/frameworks/tunnels/main.bro) @@ -141,6 +143,7 @@ rest_target(${psd} base/protocols/dns/consts.bro) rest_target(${psd} base/protocols/dns/main.bro) rest_target(${psd} base/protocols/ftp/files.bro) rest_target(${psd} base/protocols/ftp/gridftp.bro) +rest_target(${psd} base/protocols/ftp/info.bro) rest_target(${psd} base/protocols/ftp/main.bro) rest_target(${psd} base/protocols/ftp/utils-commands.bro) rest_target(${psd} base/protocols/ftp/utils.bro) diff --git a/scripts/base/frameworks/files/main.bro b/scripts/base/frameworks/files/main.bro index d0c381545b..c1883e037f 100644 --- a/scripts/base/frameworks/files/main.bro +++ b/scripts/base/frameworks/files/main.bro @@ -204,7 +204,7 @@ export { ## ## tag: Tag for the protocol analyzer having a callback being registered. ## - ## reg: A :bro:see:`ProtoRegistration` record. + ## reg: A :bro:see:`Files::ProtoRegistration` record. ## ## Returns: true if the protocol being registered was not previously registered. global register_protocol: function(tag: Analyzer::Tag, reg: ProtoRegistration): bool; @@ -228,11 +228,6 @@ redef record fa_file += { info: Info &optional; }; -redef record AnalyzerArgs += { - # This is used interally for the core file analyzer api. - tag: Files::Tag &optional; -}; - # Store the callbacks for protocol analyzers that have files. 
global registered_protocols: table[Analyzer::Tag] of ProtoRegistration = table(); @@ -275,14 +270,12 @@ function set_timeout_interval(f: fa_file, t: interval): bool function add_analyzer(f: fa_file, tag: Files::Tag, args: AnalyzerArgs): bool { - # This is to construct the correct args for the core API. - args$tag = tag; add f$info$analyzers[Files::analyzer_name(tag)]; if ( tag in analyzer_add_callbacks ) analyzer_add_callbacks[tag](f, args); - if ( ! __add_analyzer(f$id, args) ) + if ( ! __add_analyzer(f$id, tag, args) ) { Reporter::warning(fmt("Analyzer %s not added successfully to file %s.", tag, f$id)); return F; @@ -297,8 +290,7 @@ function register_analyzer_add_callback(tag: Files::Tag, callback: function(f: f function remove_analyzer(f: fa_file, tag: Files::Tag, args: AnalyzerArgs): bool { - args$tag = tag; - return __remove_analyzer(f$id, args); + return __remove_analyzer(f$id, tag, args); } function stop(f: fa_file): bool diff --git a/scripts/base/frameworks/packet-filter/main.bro b/scripts/base/frameworks/packet-filter/main.bro index 72b2b62f34..929b10fbe1 100644 --- a/scripts/base/frameworks/packet-filter/main.bro +++ b/scripts/base/frameworks/packet-filter/main.bro @@ -109,7 +109,7 @@ export { ## Enables the old filtering approach of "only watch common ports for ## analyzed protocols". - ## + ## ## Unless you know what you are doing, leave this set to F. 
const enable_auto_protocol_capture_filters = F &redef; diff --git a/scripts/base/frameworks/sumstats/plugins/__load__.bro b/scripts/base/frameworks/sumstats/plugins/__load__.bro index d208f2bbfa..d2f89e41c5 100644 --- a/scripts/base/frameworks/sumstats/plugins/__load__.bro +++ b/scripts/base/frameworks/sumstats/plugins/__load__.bro @@ -5,5 +5,6 @@ @load ./sample @load ./std-dev @load ./sum +@load ./topk @load ./unique -@load ./variance \ No newline at end of file +@load ./variance diff --git a/scripts/base/frameworks/sumstats/plugins/topk.bro b/scripts/base/frameworks/sumstats/plugins/topk.bro new file mode 100644 index 0000000000..58f8168f5b --- /dev/null +++ b/scripts/base/frameworks/sumstats/plugins/topk.bro @@ -0,0 +1,50 @@ +@load base/frameworks/sumstats + +module SumStats; + +export { + redef record Reducer += { + ## number of elements to keep in the top-k list + topk_size: count &default=500; + }; + + redef enum Calculation += { + TOPK + }; + + redef record ResultVal += { + topk: opaque of topk &optional; + }; + +} + +hook init_resultval_hook(r: Reducer, rv: ResultVal) + { + if ( TOPK in r$apply && ! rv?$topk ) + rv$topk = topk_init(r$topk_size); + } + +hook observe_hook(r: Reducer, val: double, obs: Observation, rv: ResultVal) + { + if ( TOPK in r$apply ) + topk_add(rv$topk, obs); + } + +hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) + { + if ( rv1?$topk ) + { + result$topk = topk_init(topk_size(rv1$topk)); + + topk_merge(result$topk, rv1$topk); + + if ( rv2?$topk ) + topk_merge(result$topk, rv2$topk); + } + + else if ( rv2?$topk ) + { + result$topk = topk_init(topk_size(rv2$topk)); + topk_merge(result$topk, rv2$topk); + } + } diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 9876ad03f7..ea79fb23a0 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -531,22 +531,19 @@ type record_field_table: table[string] of record_field; # dependent on the names remaining as they are now. 
## Set of BPF capture filters to use for capturing, indexed by a user-definable -## ID (which must be unique). If Bro is *not* configured to examine -## :bro:id:`PacketFilter::all_packets`, all packets matching at least -## one of the filters in this table (and all in :bro:id:`restrict_filters`) -## will be analyzed. +## ID (which must be unique). If Bro is *not* configured with +## :bro:id:`PacketFilter::enable_auto_protocol_capture_filters`, +## all packets matching at least one of the filters in this table (and all in +## :bro:id:`restrict_filters`) will be analyzed. ## -## .. bro:see:: PacketFilter PacketFilter::all_packets +## .. bro:see:: PacketFilter PacketFilter::enable_auto_protocol_capture_filters ## PacketFilter::unrestricted_filter restrict_filters global capture_filters: table[string] of string &redef; ## Set of BPF filters to restrict capturing, indexed by a user-definable ID (which -## must be unique). If Bro is *not* configured to examine -## :bro:id:`PacketFilter::all_packets`, only packets matching *all* of the -## filters in this table (and any in :bro:id:`capture_filters`) will be -## analyzed. +## must be unique). ## -## .. bro:see:: PacketFilter PacketFilter::all_packets +## .. bro:see:: PacketFilter PacketFilter::enable_auto_protocol_capture_filters ## PacketFilter::unrestricted_filter capture_filters global restrict_filters: table[string] of string &redef; @@ -3042,6 +3039,11 @@ module GLOBAL; ## Number of bytes per packet to capture from live interfaces. const snaplen = 8192 &redef; +## Seed for hashes computed internally for probabilistic data structures. Using +## the same value here will make the hashes compatible between independent Bro +## instances. If left unset, Bro will use a temporary local seed. +const global_hash_seed: string = "" &redef; + # Load BiFs defined by plugins. 
@load base/bif/plugins diff --git a/scripts/base/protocols/ftp/__load__.bro b/scripts/base/protocols/ftp/__load__.bro index ebb09e702c..3ddd8a2dc2 100644 --- a/scripts/base/protocols/ftp/__load__.bro +++ b/scripts/base/protocols/ftp/__load__.bro @@ -1,4 +1,5 @@ @load ./utils-commands +@load ./info @load ./main @load ./utils @load ./files diff --git a/scripts/base/protocols/ftp/files.bro b/scripts/base/protocols/ftp/files.bro index 9ed17ab2a4..b507ca32a7 100644 --- a/scripts/base/protocols/ftp/files.bro +++ b/scripts/base/protocols/ftp/files.bro @@ -1,3 +1,4 @@ +@load ./info @load ./main @load ./utils @load base/utils/conn-ids diff --git a/scripts/base/protocols/ftp/gridftp.bro b/scripts/base/protocols/ftp/gridftp.bro index 57752b1cbd..73bd656544 100644 --- a/scripts/base/protocols/ftp/gridftp.bro +++ b/scripts/base/protocols/ftp/gridftp.bro @@ -19,6 +19,7 @@ ##! sizes are not logged, but at the benefit of saving CPU cycles that ##! otherwise go to analyzing the large (and likely benign) connections. +@load ./info @load ./main @load base/protocols/conn @load base/protocols/ssl diff --git a/scripts/base/protocols/ftp/info.bro b/scripts/base/protocols/ftp/info.bro new file mode 100644 index 0000000000..f6fceb071e --- /dev/null +++ b/scripts/base/protocols/ftp/info.bro @@ -0,0 +1,72 @@ +##! Defines data structures for tracking and logging FTP sessions. + +module FTP; + +@load ./utils-commands + +export { + + ## This setting changes if passwords used in FTP sessions are + ## captured or not. + const default_capture_password = F &redef; + + ## The expected endpoints of an FTP data channel. + type ExpectedDataChannel: record { + ## Whether PASV mode is toggled for control channel. + passive: bool &log; + ## The host that will be initiating the data connection. + orig_h: addr &log; + ## The host that will be accepting the data connection. + resp_h: addr &log; + ## The port at which the acceptor is listening for the data connection. 
+ resp_p: port &log; + }; + + type Info: record { + ## Time when the command was sent. + ts: time &log; + ## Unique ID for the connection. + uid: string &log; + ## The connection's 4-tuple of endpoint addresses/ports. + id: conn_id &log; + ## User name for the current FTP session. + user: string &log &default=""; + ## Password for the current FTP session if captured. + password: string &log &optional; + ## Command given by the client. + command: string &log &optional; + ## Argument for the command if one is given. + arg: string &log &optional; + + ## Libmagic "sniffed" file type if the command indicates a file transfer. + mime_type: string &log &optional; + ## Size of the file if the command indicates a file transfer. + file_size: count &log &optional; + + ## Reply code from the server in response to the command. + reply_code: count &log &optional; + ## Reply message from the server in response to the command. + reply_msg: string &log &optional; + + ## Expected FTP data channel. + data_channel: ExpectedDataChannel &log &optional; + + ## Current working directory that this session is in. By making + ## the default value '.', we can indicate that unless something + ## more concrete is discovered that the existing but unknown + ## directory is ok to use. + cwd: string &default="."; + + ## Command that is currently waiting for a response. + cmdarg: CmdArg &optional; + ## Queue for commands that have been sent but not yet responded to + ## are tracked here. + pending_commands: PendingCmds; + + ## Indicates if the session is in active or passive mode. + passive: bool &default=F; + + ## Determines if the password will be captured for this request. + capture_password: bool &default=default_capture_password; + }; +} diff --git a/scripts/base/protocols/ftp/main.bro b/scripts/base/protocols/ftp/main.bro index 7e66b63f40..254dca7d42 100644 --- a/scripts/base/protocols/ftp/main.bro +++ b/scripts/base/protocols/ftp/main.bro @@ -3,6 +3,8 @@ ##! 
will take on the full path that the client is at along with the requested ##! file name. +@load ./info +@load ./utils @load ./utils-commands @load base/utils/paths @load base/utils/numbers @@ -20,72 +22,9 @@ export { "EPSV" } &redef; - ## This setting changes if passwords used in FTP sessions are captured or not. - const default_capture_password = F &redef; - ## User IDs that can be considered "anonymous". const guest_ids = { "anonymous", "ftp", "ftpuser", "guest" } &redef; - ## The expected endpoints of an FTP data channel. - type ExpectedDataChannel: record { - ## Whether PASV mode is toggled for control channel. - passive: bool &log; - ## The host that will be initiating the data connection. - orig_h: addr &log; - ## The host that will be accepting the data connection. - resp_h: addr &log; - ## The port at which the acceptor is listening for the data connection. - resp_p: port &log; - }; - - type Info: record { - ## Time when the command was sent. - ts: time &log; - ## Unique ID for the connection. - uid: string &log; - ## The connection's 4-tuple of endpoint addresses/ports. - id: conn_id &log; - ## User name for the current FTP session. - user: string &log &default=""; - ## Password for the current FTP session if captured. - password: string &log &optional; - ## Command given by the client. - command: string &log &optional; - ## Argument for the command if one is given. - arg: string &log &optional; - - ## Libmagic "sniffed" file type if the command indicates a file transfer. - mime_type: string &log &optional; - ## Size of the file if the command indicates a file transfer. - file_size: count &log &optional; - - ## Reply code from the server in response to the command. - reply_code: count &log &optional; - ## Reply message from the server in response to the command. - reply_msg: string &log &optional; - - ## Expected FTP data channel. - data_channel: ExpectedDataChannel &log &optional; - - ## Current working directory that this session is in. 
By making - ## the default value '.', we can indicate that unless something - ## more concrete is discovered that the existing but unknown - ## directory is ok to use. - cwd: string &default="."; - - ## Command that is currently waiting for a response. - cmdarg: CmdArg &optional; - ## Queue for commands that have been sent but not yet responded to - ## are tracked here. - pending_commands: PendingCmds; - - ## Indicates if the session is in active or passive mode. - passive: bool &default=F; - - ## Determines if the password will be captured for this request. - capture_password: bool &default=default_capture_password; - }; - ## This record is to hold a parsed FTP reply code. For example, for the ## 201 status code, the digits would be parsed as: x->2, y->0, z=>1. type ReplyCode: record { @@ -102,8 +41,6 @@ export { global log_ftp: event(rec: Info); } -@load ./utils - # Add the state tracking information variable to the connection record redef record connection += { ftp: Info &optional; diff --git a/scripts/base/protocols/ftp/utils.bro b/scripts/base/protocols/ftp/utils.bro index 8b92a37764..313280b904 100644 --- a/scripts/base/protocols/ftp/utils.bro +++ b/scripts/base/protocols/ftp/utils.bro @@ -1,7 +1,8 @@ ##! Utilities specific for FTP processing. -@load ./main +@load ./info @load base/utils/addrs +@load base/utils/paths module FTP; diff --git a/scripts/base/protocols/ssl/main.bro b/scripts/base/protocols/ssl/main.bro index 65526182ac..0d4a8435f0 100644 --- a/scripts/base/protocols/ssl/main.bro +++ b/scripts/base/protocols/ssl/main.bro @@ -67,11 +67,8 @@ export { ## (especially with large file transfers). const disable_analyzer_after_detection = T &redef; - ## The maximum amount of time a script can delay records from being logged. - const max_log_delay = 15secs &redef; - ## Delays an SSL record for a specific token: the record will not be logged - ## as longs the token exists or until :bro:id:`SSL::max_log_delay` elapses. 
+ ## as long as the token exists or until 15 seconds elapse. global delay_log: function(info: Info, token: string); ## Undelays an SSL record for a previously inserted token, allowing the @@ -90,7 +87,7 @@ redef record connection += { redef record Info += { # Adding a string "token" to this set will cause the SSL script # to delay logging the record until either the token has been removed or - # the record has been delayed for :bro:id:`SSL::max_log_delay`. + # the record has been delayed for 15 seconds. delay_tokens: set[string] &optional; }; @@ -138,7 +135,7 @@ function log_record(info: Info) { log_record(info); } - timeout SSL::max_log_delay + timeout 15secs { Reporter::info(fmt("SSL delay tokens not released in time (%s tokens remaining)", |info$delay_tokens|)); diff --git a/scripts/policy/frameworks/packet-filter/shunt.bro b/scripts/policy/frameworks/packet-filter/shunt.bro index b87369ee62..85ec189a17 100644 --- a/scripts/policy/frameworks/packet-filter/shunt.bro +++ b/scripts/policy/frameworks/packet-filter/shunt.bro @@ -34,8 +34,8 @@ export { global current_shunted_host_pairs: function(): set[conn_id]; redef enum Notice::Type += { - ## Indicative that :bro:id:`max_bpf_shunts` connections are already - ## being shunted with BPF filters and no more are allowed. + ## Indicative that :bro:id:`PacketFilter::max_bpf_shunts` connections + ## are already being shunted with BPF filters and no more are allowed. No_More_Conn_Shunts_Available, ## Limitations in BPF make shunting some connections with BPF impossible. diff --git a/scripts/policy/misc/load-balancing.bro b/scripts/policy/misc/load-balancing.bro index fe07dd64da..889d18119a 100644 --- a/scripts/policy/misc/load-balancing.bro +++ b/scripts/policy/misc/load-balancing.bro @@ -12,12 +12,12 @@ export { ## Apply BPF filters to each worker in a way that causes them to ## automatically flow balance traffic between them. 
AUTO_BPF, - ## Load balance traffic across the workers by making each one apply - ## a restrict filter to only listen to a single MAC address. This - ## is a somewhat common deployment option for sites doing network - ## based load balancing with MAC address rewriting and passing the - ## traffic to a single interface. Multiple MAC addresses will show - ## up on the same interface and need filtered to a single address. + # Load balance traffic across the workers by making each one apply + # a restrict filter to only listen to a single MAC address. This + # is a somewhat common deployment option for sites doing network + # based load balancing with MAC address rewriting and passing the + # traffic to a single interface. Multiple MAC addresses will show + # up on the same interface and need filtered to a single address. #MAC_ADDR_BPF, }; diff --git a/scripts/policy/tuning/defaults/packet-fragments.bro b/scripts/policy/tuning/defaults/packet-fragments.bro index 24b18d5917..f95c826547 100644 --- a/scripts/policy/tuning/defaults/packet-fragments.bro +++ b/scripts/policy/tuning/defaults/packet-fragments.bro @@ -1,10 +1,10 @@ -## Capture TCP fragments, but not UDP (or ICMP), since those are a lot more -## common due to high-volume, fragmenting protocols such as NFS :-(. +# Capture TCP fragments, but not UDP (or ICMP), since those are a lot more +# common due to high-volume, fragmenting protocols such as NFS :-(. -## This normally isn't used because of the default open packet filter -## but we set it anyway in case the user is using a packet filter. -## Note: This was removed because the default model now is to have a wide -## open packet filter. +# This normally isn't used because of the default open packet filter +# but we set it anyway in case the user is using a packet filter. +# Note: This was removed because the default model now is to have a wide +# open packet filter. 
#redef capture_filters += { ["frag"] = "(ip[6:2] & 0x3fff != 0) and tcp" }; ## Shorten the fragment timeout from never expiring to expiring fragments after diff --git a/src/BroDoc.cc b/src/BroDoc.cc index c04cd92eca..55dc8ce558 100644 --- a/src/BroDoc.cc +++ b/src/BroDoc.cc @@ -11,6 +11,7 @@ #include "plugin/Manager.h" #include "analyzer/Manager.h" #include "analyzer/Component.h" +#include "file_analysis/Manager.h" BroDoc::BroDoc(const std::string& rel, const std::string& abs) { @@ -479,6 +480,17 @@ static void WriteAnalyzerComponent(FILE* f, const analyzer::Component* c) fprintf(f, ":bro:enum:`Analyzer::%s`\n\n", tag.c_str()); } +static void WriteAnalyzerComponent(FILE* f, const file_analysis::Component* c) + { + EnumType* atag = file_mgr->GetTagEnumType(); + string tag = fmt("ANALYZER_%s", c->CanonicalName()); + + if ( atag->Lookup("Files", tag.c_str()) < 0 ) + reporter->InternalError("missing analyzer tag for %s", tag.c_str()); + + fprintf(f, ":bro:enum:`Files::%s`\n\n", tag.c_str()); + } + static void WritePluginComponents(FILE* f, const plugin::Plugin* p) { plugin::Plugin::component_list components = p->Components(); @@ -494,6 +506,10 @@ static void WritePluginComponents(FILE* f, const plugin::Plugin* p) WriteAnalyzerComponent(f, dynamic_cast(*it)); break; + case plugin::component::FILE_ANALYZER: + WriteAnalyzerComponent(f, + dynamic_cast(*it)); + break; case plugin::component::READER: reporter->InternalError("docs for READER component unimplemented"); case plugin::component::WRITER: @@ -537,30 +553,35 @@ static void WritePluginBifItems(FILE* f, const plugin::Plugin* p, } } -static void WriteAnalyzerTagDefn(FILE* f, EnumType* e) +static void WriteAnalyzerTagDefn(FILE* f, EnumType* e, const string& module) { + string tag_id= module + "::Tag"; e = new CommentedEnumType(e); - e->SetTypeID(copy_string("Analyzer::Tag")); + e->SetTypeID(copy_string(tag_id.c_str())); - ID* dummy_id = new ID(copy_string("Analyzer::Tag"), SCOPE_GLOBAL, true); + ID* dummy_id = new 
ID(copy_string(tag_id.c_str()), SCOPE_GLOBAL, true); dummy_id->SetType(e); dummy_id->MakeType(); list* r = new list(); - r->push_back("Unique identifiers for protocol analyzers."); + r->push_back("Unique identifiers for analyzers."); BroDocObj bdo(dummy_id, r, true); bdo.WriteReST(f); } -static bool IsAnalyzerPlugin(const plugin::Plugin* p) +static bool ComponentsMatch(const plugin::Plugin* p, plugin::component::Type t, + bool match_empty = false) { plugin::Plugin::component_list components = p->Components(); plugin::Plugin::component_list::const_iterator it; + if ( components.empty() ) + return match_empty; + for ( it = components.begin(); it != components.end(); ++it ) - if ( (*it)->Type() != plugin::component::ANALYZER ) + if ( (*it)->Type() != t ) return false; return true; @@ -573,14 +594,44 @@ void CreateProtoAnalyzerDoc(const char* filename) fprintf(f, "Protocol Analyzer Reference\n"); fprintf(f, "===========================\n\n"); - WriteAnalyzerTagDefn(f, analyzer_mgr->GetTagEnumType()); + WriteAnalyzerTagDefn(f, analyzer_mgr->GetTagEnumType(), "Analyzer"); plugin::Manager::plugin_list plugins = plugin_mgr->Plugins(); plugin::Manager::plugin_list::const_iterator it; for ( it = plugins.begin(); it != plugins.end(); ++it ) { - if ( ! IsAnalyzerPlugin(*it) ) + if ( ! 
ComponentsMatch(*it, plugin::component::ANALYZER, true) ) + continue; + + WritePluginSectionHeading(f, *it); + WritePluginComponents(f, *it); + WritePluginBifItems(f, *it, plugin::BifItem::CONSTANT, + "Options/Constants"); + WritePluginBifItems(f, *it, plugin::BifItem::GLOBAL, "Globals"); + WritePluginBifItems(f, *it, plugin::BifItem::TYPE, "Types"); + WritePluginBifItems(f, *it, plugin::BifItem::EVENT, "Events"); + WritePluginBifItems(f, *it, plugin::BifItem::FUNCTION, "Functions"); + } + + fclose(f); + } + +void CreateFileAnalyzerDoc(const char* filename) + { + FILE* f = fopen(filename, "w"); + + fprintf(f, "File Analyzer Reference\n"); + fprintf(f, "=======================\n\n"); + + WriteAnalyzerTagDefn(f, file_mgr->GetTagEnumType(), "Files"); + + plugin::Manager::plugin_list plugins = plugin_mgr->Plugins(); + plugin::Manager::plugin_list::const_iterator it; + + for ( it = plugins.begin(); it != plugins.end(); ++it ) + { + if ( ! ComponentsMatch(*it, plugin::component::FILE_ANALYZER) ) continue; WritePluginSectionHeading(f, *it); diff --git a/src/BroDoc.h b/src/BroDoc.h index 9f92f821f8..081df698d9 100644 --- a/src/BroDoc.h +++ b/src/BroDoc.h @@ -413,4 +413,10 @@ private: */ void CreateProtoAnalyzerDoc(const char* filename); +/** + * Writes out plugin index documentation for all file analyzer plugins. + * @param filename the name of the file to write. 
+ */ +void CreateFileAnalyzerDoc(const char* filename); + #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4a65ddd4d3..e64dcbb9f6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -319,6 +319,7 @@ set(bro_SRCS StateAccess.cc Stats.cc Stmt.cc + Tag.cc Timer.cc Traverse.cc Trigger.cc @@ -362,6 +363,8 @@ set(bro_SRCS 3rdparty/sqlite3.c plugin/Component.cc + plugin/ComponentManager.h + plugin/TaggedComponent.h plugin/Manager.cc plugin/Plugin.cc plugin/Macros.h diff --git a/src/DebugLogger.cc b/src/DebugLogger.cc index 380f21aa5f..dc557c4a0a 100644 --- a/src/DebugLogger.cc +++ b/src/DebugLogger.cc @@ -16,7 +16,8 @@ DebugLogger::Stream DebugLogger::streams[NUM_DBGS] = { { "notifiers", 0, false }, { "main-loop", 0, false }, { "dpd", 0, false }, { "tm", 0, false }, { "logging", 0, false }, {"input", 0, false }, - { "threading", 0, false }, { "file_analysis", 0, false } + { "threading", 0, false }, { "file_analysis", 0, false }, + { "plugins", 0, false} }; DebugLogger::DebugLogger(const char* filename) diff --git a/src/DebugLogger.h b/src/DebugLogger.h index e293b326a8..c5744642f5 100644 --- a/src/DebugLogger.h +++ b/src/DebugLogger.h @@ -27,6 +27,7 @@ enum DebugStream { DBG_INPUT, // Input streams DBG_THREADING, // Threading system DBG_FILE_ANALYSIS, // File analysis + DBG_PLUGINS, NUM_DBGS // Has to be last }; diff --git a/src/NetVar.cc b/src/NetVar.cc index 388aa46f10..cf80759173 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -238,10 +238,13 @@ TableType* record_field_table; StringVal* cmd_line_bpf_filter; +StringVal* global_hash_seed; + OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* topk_type; OpaqueType* bloomfilter_type; #include "const.bif.netvar_def" @@ -304,10 +307,13 @@ void init_general_global_var() cmd_line_bpf_filter = internal_val("cmd_line_bpf_filter")->AsStringVal(); + global_hash_seed = opt_internal_string("global_hash_seed"); + md5_type = new OpaqueType("md5"); 
sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + topk_type = new OpaqueType("topk"); bloomfilter_type = new OpaqueType("bloomfilter"); } diff --git a/src/NetVar.h b/src/NetVar.h index 7ce33d1a1a..9a14525d4a 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -242,11 +242,14 @@ extern TableType* record_field_table; extern StringVal* cmd_line_bpf_filter; +extern StringVal* global_hash_seed; + class OpaqueType; extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* topk_type; extern OpaqueType* bloomfilter_type; // Initializes globals that don't pertain to network/event analysis. diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 66b3c081e7..0c1d9d509d 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -566,14 +566,14 @@ BroType* BloomFilterVal::Type() const void BloomFilterVal::Add(const Val* val) { HashKey* key = hash->ComputeHash(val, 1); - bloom_filter->Add(key->Hash()); + bloom_filter->Add(key); delete key; } size_t BloomFilterVal::Count(const Val* val) const { HashKey* key = hash->ComputeHash(val, 1); - size_t cnt = bloom_filter->Count(key->Hash()); + size_t cnt = bloom_filter->Count(key); delete key; return cnt; } @@ -588,10 +588,17 @@ bool BloomFilterVal::Empty() const return bloom_filter->Empty(); } +string BloomFilterVal::InternalState() const + { + return bloom_filter->InternalState(); + } + BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, const BloomFilterVal* y) { - if ( ! same_type(x->Type(), y->Type()) ) + if ( x->Type() && // any one 0 is ok here + y->Type() && + ! same_type(x->Type(), y->Type()) ) { reporter->Error("cannot merge Bloom filters with different types"); return 0; @@ -613,7 +620,7 @@ BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, BloomFilterVal* merged = new BloomFilterVal(copy); - if ( ! merged->Typify(x->Type()) ) + if ( x->Type() && ! 
merged->Typify(x->Type()) ) { reporter->Error("failed to set type on merged Bloom filter"); return 0; diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 52c9583fc7..08a20b1a31 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -127,6 +127,7 @@ public: size_t Count(const Val* val) const; void Clear(); bool Empty() const; + string InternalState() const; static BloomFilterVal* Merge(const BloomFilterVal* x, const BloomFilterVal* y); diff --git a/src/RuleAction.cc b/src/RuleAction.cc index a13392ee40..ec57c96bd2 100644 --- a/src/RuleAction.cc +++ b/src/RuleAction.cc @@ -40,7 +40,7 @@ RuleActionAnalyzer::RuleActionAnalyzer(const char* arg_analyzer) string str(arg_analyzer); string::size_type pos = str.find(':'); string arg = str.substr(0, pos); - analyzer = analyzer_mgr->GetAnalyzerTag(arg.c_str()); + analyzer = analyzer_mgr->GetComponentTag(arg.c_str()); if ( ! analyzer ) reporter->Warning("unknown analyzer '%s' specified in rule", arg.c_str()); @@ -48,7 +48,7 @@ RuleActionAnalyzer::RuleActionAnalyzer(const char* arg_analyzer) if ( pos != string::npos ) { arg = str.substr(pos + 1); - child_analyzer = analyzer_mgr->GetAnalyzerTag(arg.c_str()); + child_analyzer = analyzer_mgr->GetComponentTag(arg.c_str()); if ( ! child_analyzer ) reporter->Warning("unknown analyzer '%s' specified in rule", arg.c_str()); @@ -60,11 +60,11 @@ RuleActionAnalyzer::RuleActionAnalyzer(const char* arg_analyzer) void RuleActionAnalyzer::PrintDebug() { if ( ! 
child_analyzer ) - fprintf(stderr, "|%s|\n", analyzer_mgr->GetAnalyzerName(analyzer)); + fprintf(stderr, "|%s|\n", analyzer_mgr->GetComponentName(analyzer)); else fprintf(stderr, "|%s:%s|\n", - analyzer_mgr->GetAnalyzerName(analyzer), - analyzer_mgr->GetAnalyzerName(child_analyzer)); + analyzer_mgr->GetComponentName(analyzer), + analyzer_mgr->GetComponentName(child_analyzer)); } diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 85aed10bda..5271caa2e3 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -52,6 +52,7 @@ SERIAL_IS(RE_MATCHER, 0x1400) SERIAL_IS(BITVECTOR, 0x1500) SERIAL_IS(COUNTERVECTOR, 0x1600) SERIAL_IS(BLOOMFILTER, 0x1700) +SERIAL_IS(HASHER, 0x1800) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -107,7 +108,8 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) -SERIAL_VAL(BLOOMFILTER_VAL, 20) +SERIAL_VAL(TOPK_VAL, 20) +SERIAL_VAL(BLOOMFILTER_VAL, 21) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) @@ -206,6 +208,11 @@ SERIAL_BLOOMFILTER(BLOOMFILTER, 1) SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) +#define SERIAL_HASHER(name, val) SERIAL_CONST(name, val, HASHER) +SERIAL_HASHER(HASHER, 1) +SERIAL_HASHER(DEFAULTHASHER, 2) +SERIAL_HASHER(DOUBLEHASHER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) diff --git a/src/Tag.cc b/src/Tag.cc new file mode 100644 index 0000000000..178edaa71e --- /dev/null +++ b/src/Tag.cc @@ -0,0 +1,82 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#include "Tag.h" +#include "Val.h" + +Tag::Tag(EnumType* etype, type_t arg_type, subtype_t arg_subtype) + { + assert(arg_type > 0); + + type = arg_type; + subtype = arg_subtype; + int64_t i = (int64)(type) | ((int64)subtype << 31); + Ref(etype); + val = new EnumVal(i, etype); + } + +Tag::Tag(EnumVal* arg_val) + { + assert(arg_val); + + val = arg_val; + Ref(val); + + int64 i = val->InternalInt(); + type = i & 0xffffffff; + subtype = (i >> 31) & 0xffffffff; + } + +Tag::Tag(const Tag& other) + { + type = other.type; + subtype = other.subtype; + val = other.val; + + if ( val ) + Ref(val); + } + +Tag::Tag() + { + type = 0; + subtype = 0; + val = 0; + } + +Tag::~Tag() + { + Unref(val); + val = 0; + } + +Tag& Tag::operator=(const Tag& other) + { + if ( this != &other ) + { + type = other.type; + subtype = other.subtype; + val = other.val; + + if ( val ) + Ref(val); + } + + return *this; + } + +EnumVal* Tag::AsEnumVal(EnumType* etype) const + { + if ( ! val ) + { + assert(type == 0 && subtype == 0); + Ref(etype); + val = new EnumVal(0, etype); + } + + return val; + } + +std::string Tag::AsString() const + { + return fmt("%" PRIu32 "/%" PRIu32, type, subtype); + } diff --git a/src/Tag.h b/src/Tag.h new file mode 100644 index 0000000000..2c76f253a5 --- /dev/null +++ b/src/Tag.h @@ -0,0 +1,138 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef TAG_H +#define TAG_H + +#include "config.h" +#include "util.h" +#include "Type.h" + +class EnumVal; + +/** + * Class to identify an analyzer type. + * + * Each analyzer type gets a tag consisting of a main type and subtype. The + * former is an identifier that's unique across all analyzer classes. The latter is + * passed through to the analyzer instances for their use, yet not further + * interpreted by the analyzer infrastructure; it allows an analyzer to + * branch out into a set of sub-analyzers internally. Jointly, main type and + * subtype form an analyzer "tag". 
Each unique tag corresponds to a single + * "analyzer" from the user's perspective. At the script layer, these tags + * are mapped into enums of type \c Analyzer::Tag or Files::Tag. Internally, + * the analyzer::Manager and file_analysis::Manager maintain the mapping of tag + * to analyzer (and they also assign them their main types), and + * analyzer::Component and file_analysis::Component create new tags. + * + * The Tag class supports all operations necessary to act as an index in a + * \c std::map. + */ +class Tag { +public: + /** + * Type for the analyzer's main type. + */ + typedef uint32 type_t; + + /** + * Type for the analyzer's subtype. + */ + typedef uint32 subtype_t; + + /** + * Returns the tag's main type. + */ + type_t Type() const { return type; } + + /** + * Returns the tag's subtype. + */ + subtype_t Subtype() const { return subtype; } + + /** + * Returns the numerical values for main and subtype inside a string + * suitable for printing. This is primarily for debugging. + */ + std::string AsString() const; + +protected: + /* + * Copy constructor. + */ + Tag(const Tag& other); + + /** + * Default constructor. This initializes the tag with an error value + * that will make \c operator \c bool return false. + */ + Tag(); + + /** + * Destructor. + */ + ~Tag(); + + /** + * Assignment operator. + */ + Tag& operator=(const Tag& other); + + /** + * Compares two tags for equality. + */ + bool operator==(const Tag& other) const + { + return type == other.type && subtype == other.subtype; + } + + /** + * Compares two tags for inequality. + */ + bool operator!=(const Tag& other) const + { + return type != other.type || subtype != other.subtype; + } + + /** + * Compares two tags for less-than relationship. + */ + bool operator<(const Tag& other) const + { + return type != other.type ? type < other.type : (subtype < other.subtype); + } + + /** + * Returns the script-layer enum that corresponds to this tag.
+ * The returned value does not have its ref-count increased. + * + * @param etype the script-layer enum type associated with the tag. + */ + EnumVal* AsEnumVal(EnumType* etype) const; + + /** + * Constructor. + * + * @param etype the script-layer enum type associated with the tag. + * + * @param type The main type. Note that the manager class manages the + * value space internally, so no one else should assign main types. + * + * @param subtype The sub type, which is left to an analyzer for + * interpretation. By default it's set to zero. + */ + Tag(EnumType* etype, type_t type, subtype_t subtype = 0); + + /** + * Constructor. + * + * @param val An enum value of script type \c Analyzer::Tag. + */ + Tag(EnumVal* val); + +private: + type_t type; // Main type. + subtype_t subtype; // Subtype. + mutable EnumVal* val; // Script-layer value. +}; + +#endif diff --git a/src/analyzer/Analyzer.cc b/src/analyzer/Analyzer.cc index ecd3c9f686..b8b739f3cb 100644 --- a/src/analyzer/Analyzer.cc +++ b/src/analyzer/Analyzer.cc @@ -70,12 +70,12 @@ void AnalyzerTimer::Init(Analyzer* arg_analyzer, analyzer_timer_func arg_timer, Ref(analyzer->Conn()); } -analyzer::ID Analyzer::id_counter = 0;; +analyzer::ID Analyzer::id_counter = 0; const char* Analyzer::GetAnalyzerName() const { assert(tag); - return analyzer_mgr->GetAnalyzerName(tag); + return analyzer_mgr->GetComponentName(tag); } void Analyzer::SetAnalyzerTag(const Tag& arg_tag) @@ -87,7 +87,7 @@ void Analyzer::SetAnalyzerTag(const Tag& arg_tag) bool Analyzer::IsAnalyzer(const char* name) { assert(tag); - return strcmp(analyzer_mgr->GetAnalyzerName(tag), name) == 0; + return strcmp(analyzer_mgr->GetComponentName(tag), name) == 0; } // Used in debugging output. @@ -98,7 +98,7 @@ static string fmt_analyzer(Analyzer* a) Analyzer::Analyzer(const char* name, Connection* conn) { - Tag tag = analyzer_mgr->GetAnalyzerTag(name); + Tag tag = analyzer_mgr->GetComponentTag(name); if ( !
tag ) reporter->InternalError("unknown analyzer name %s; mismatch with tag analyzer::Component?", name); @@ -494,7 +494,7 @@ Analyzer* Analyzer::FindChild(Tag arg_tag) Analyzer* Analyzer::FindChild(const char* name) { - Tag tag = analyzer_mgr->GetAnalyzerTag(name); + Tag tag = analyzer_mgr->GetComponentTag(name); return tag ? FindChild(tag) : 0; } diff --git a/src/analyzer/Component.cc b/src/analyzer/Component.cc index cbb0f40c20..66ab2213bb 100644 --- a/src/analyzer/Component.cc +++ b/src/analyzer/Component.cc @@ -8,29 +8,26 @@ using namespace analyzer; -Tag::type_t Component::type_counter = 0; - Component::Component(const char* arg_name, factory_callback arg_factory, Tag::subtype_t arg_subtype, bool arg_enabled, bool arg_partial) - : plugin::Component(plugin::component::ANALYZER) + : plugin::Component(plugin::component::ANALYZER), + plugin::TaggedComponent(arg_subtype) { name = copy_string(arg_name); canon_name = canonify_name(arg_name); factory = arg_factory; enabled = arg_enabled; partial = arg_partial; - - tag = analyzer::Tag(++type_counter, arg_subtype); } Component::Component(const Component& other) - : plugin::Component(Type()) + : plugin::Component(Type()), + plugin::TaggedComponent(other) { name = copy_string(other.name); canon_name = copy_string(other.canon_name); factory = other.factory; enabled = other.enabled; partial = other.partial; - tag = other.tag; } Component::~Component() @@ -39,11 +36,6 @@ Component::~Component() delete [] canon_name; } -analyzer::Tag Component::Tag() const - { - return tag; - } - void Component::Describe(ODesc* d) const { plugin::Component::Describe(d); @@ -63,13 +55,14 @@ void Component::Describe(ODesc* d) const Component& Component::operator=(const Component& other) { + plugin::TaggedComponent::operator=(other); + if ( &other != this ) { name = copy_string(other.name); factory = other.factory; enabled = other.enabled; partial = other.partial; - tag = other.tag; } return *this; diff --git a/src/analyzer/Component.h 
b/src/analyzer/Component.h index 9e12ed347e..9bc8b357d7 100644 --- a/src/analyzer/Component.h +++ b/src/analyzer/Component.h @@ -5,6 +5,7 @@ #include "Tag.h" #include "plugin/Component.h" +#include "plugin/TaggedComponent.h" #include "../config.h" #include "../util.h" @@ -21,7 +22,8 @@ class Analyzer; * A plugin can provide a specific protocol analyzer by registering this * analyzer component, describing the analyzer. */ -class Component : public plugin::Component { +class Component : public plugin::Component, + public plugin::TaggedComponent { public: typedef Analyzer* (*factory_callback)(Connection* conn); @@ -100,13 +102,6 @@ public: */ bool Enabled() const { return enabled; } - /** - * Returns the analyzer's tag. Note that this is automatically - * generated for each new Components, and hence unique across all of - * them. - */ - analyzer::Tag Tag() const; - /** * Enables or disables this analyzer. * @@ -128,11 +123,7 @@ private: const char* canon_name; // The analyzer's canonical name. factory_callback factory; // The analyzer's factory callback. bool partial; // True if the analyzer supports partial connections. - analyzer::Tag tag; // The automatically assigned analyzer tag. bool enabled; // True if the analyzer is enabled. - - // Global counter used to generate unique tags. 
- static analyzer::Tag::type_t type_counter; }; } diff --git a/src/analyzer/Manager.cc b/src/analyzer/Manager.cc index 8b290e2341..2359e4ec98 100644 --- a/src/analyzer/Manager.cc +++ b/src/analyzer/Manager.cc @@ -60,10 +60,8 @@ bool Manager::ConnIndex::operator<(const ConnIndex& other) const } Manager::Manager() + : plugin::ComponentManager("Analyzer") { - tag_enum_type = new EnumType("Analyzer::Tag"); - ::ID* id = install_ID("Tag", "Analyzer", true, true); - add_type(id, tag_enum_type, 0, 0); } Manager::~Manager() @@ -91,14 +89,14 @@ void Manager::InitPreScript() std::list analyzers = plugin_mgr->Components(); for ( std::list::const_iterator i = analyzers.begin(); i != analyzers.end(); i++ ) - RegisterAnalyzerComponent(*i); + RegisterComponent(*i, "ANALYZER_"); // Cache these tags. - analyzer_backdoor = GetAnalyzerTag("BACKDOOR"); - analyzer_connsize = GetAnalyzerTag("CONNSIZE"); - analyzer_interconn = GetAnalyzerTag("INTERCONN"); - analyzer_stepping = GetAnalyzerTag("STEPPINGSTONE"); - analyzer_tcpstats = GetAnalyzerTag("TCPSTATS"); + analyzer_backdoor = GetComponentTag("BACKDOOR"); + analyzer_connsize = GetComponentTag("CONNSIZE"); + analyzer_interconn = GetComponentTag("INTERCONN"); + analyzer_stepping = GetComponentTag("STEPPINGSTONE"); + analyzer_tcpstats = GetComponentTag("TCPSTATS"); } void Manager::InitPostScript() @@ -109,8 +107,9 @@ void Manager::DumpDebug() { #ifdef DEBUG DBG_LOG(DBG_ANALYZER, "Available analyzers after bro_init():"); - for ( analyzer_map_by_name::const_iterator i = analyzers_by_name.begin(); i != analyzers_by_name.end(); i++ ) - DBG_LOG(DBG_ANALYZER, " %s (%s)", i->second->Name(), IsEnabled(i->second->Tag()) ? "enabled" : "disabled"); + list all_analyzers = GetComponents(); + for ( list::const_iterator i = all_analyzers.begin(); i != all_analyzers.end(); ++i ) + DBG_LOG(DBG_ANALYZER, " %s (%s)", (*i)->Name(), IsEnabled((*i)->Tag()) ? 
"enabled" : "disabled"); DBG_LOG(DBG_ANALYZER, ""); DBG_LOG(DBG_ANALYZER, "Analyzers by port:"); @@ -120,7 +119,7 @@ void Manager::DumpDebug() string s; for ( tag_set::const_iterator j = i->second->begin(); j != i->second->end(); j++ ) - s += string(GetAnalyzerName(*j)) + " "; + s += string(GetComponentName(*j)) + " "; DBG_LOG(DBG_ANALYZER, " %d/tcp: %s", i->first, s.c_str()); } @@ -130,7 +129,7 @@ void Manager::DumpDebug() string s; for ( tag_set::const_iterator j = i->second->begin(); j != i->second->end(); j++ ) - s += string(GetAnalyzerName(*j)) + " "; + s += string(GetComponentName(*j)) + " "; DBG_LOG(DBG_ANALYZER, " %d/udp: %s", i->first, s.c_str()); } @@ -142,25 +141,6 @@ void Manager::Done() { } -void Manager::RegisterAnalyzerComponent(Component* component) - { - const char* cname = component->CanonicalName(); - - if ( Lookup(cname) ) - reporter->FatalError("Analyzer %s defined more than once", cname); - - DBG_LOG(DBG_ANALYZER, "Registering analyzer %s (tag %s)", - component->Name(), component->Tag().AsString().c_str()); - - analyzers_by_name.insert(std::make_pair(cname, component)); - analyzers_by_tag.insert(std::make_pair(component->Tag(), component)); - analyzers_by_val.insert(std::make_pair(component->Tag().AsEnumVal()->InternalInt(), component)); - - // Install enum "Analyzer::ANALYZER_*" - string id = fmt("ANALYZER_%s", cname); - tag_enum_type->AddName("Analyzer", id.c_str(), component->Tag().AsEnumVal()->InternalInt(), true); - } - bool Manager::EnableAnalyzer(Tag tag) { Component* p = Lookup(tag); @@ -217,8 +197,9 @@ void Manager::DisableAllAnalyzers() { DBG_LOG(DBG_ANALYZER, "Disabling all analyzers"); - for ( analyzer_map_by_tag::const_iterator i = analyzers_by_tag.begin(); i != analyzers_by_tag.end(); i++ ) - i->second->SetEnabled(false); + list all_analyzers = GetComponents(); + for ( list::const_iterator i = all_analyzers.begin(); i != all_analyzers.end(); ++i ) + (*i)->SetEnabled(false); } bool Manager::IsEnabled(Tag tag) @@ -270,7 +251,7 @@ 
bool Manager::RegisterAnalyzerForPort(Tag tag, TransportProto proto, uint32 port tag_set* l = LookupPort(proto, port, true); #ifdef DEBUG - const char* name = GetAnalyzerName(tag); + const char* name = GetComponentName(tag); DBG_LOG(DBG_ANALYZER, "Registering analyzer %s for port %" PRIu32 "/%d", name, port, proto); #endif @@ -283,7 +264,7 @@ bool Manager::UnregisterAnalyzerForPort(Tag tag, TransportProto proto, uint32 po tag_set* l = LookupPort(proto, port, true); #ifdef DEBUG - const char* name = GetAnalyzerName(tag); + const char* name = GetComponentName(tag); DBG_LOG(DBG_ANALYZER, "Unregistering analyzer %s for port %" PRIu32 "/%d", name, port, proto); #endif @@ -302,7 +283,7 @@ Analyzer* Manager::InstantiateAnalyzer(Tag tag, Connection* conn) return 0; if ( ! c->Factory() ) - reporter->InternalError("analyzer %s cannot be instantiated dynamically", GetAnalyzerName(tag)); + reporter->InternalError("analyzer %s cannot be instantiated dynamically", GetComponentName(tag)); Analyzer* a = c->Factory()(conn); @@ -316,59 +297,10 @@ Analyzer* Manager::InstantiateAnalyzer(Tag tag, Connection* conn) Analyzer* Manager::InstantiateAnalyzer(const char* name, Connection* conn) { - Tag tag = GetAnalyzerTag(name); + Tag tag = GetComponentTag(name); return tag ? InstantiateAnalyzer(tag, conn) : 0; } -const char* Manager::GetAnalyzerName(Tag tag) - { - static const char* error = ""; - - if ( ! tag ) - return error; - - Component* c = Lookup(tag); - - if ( ! c ) - reporter->InternalError("request for name of unknown analyzer tag %s", tag.AsString().c_str()); - - return c->CanonicalName(); - } - -const char* Manager::GetAnalyzerName(Val* val) - { - return GetAnalyzerName(Tag(val->AsEnumVal())); - } - -Tag Manager::GetAnalyzerTag(const char* name) - { - Component* c = Lookup(name); - return c ? 
c->Tag() : Tag(); - } - -EnumType* Manager::GetTagEnumType() - { - return tag_enum_type; - } - -Component* Manager::Lookup(const char* name) - { - analyzer_map_by_name::const_iterator i = analyzers_by_name.find(to_upper(name)); - return i != analyzers_by_name.end() ? i->second : 0; - } - -Component* Manager::Lookup(const Tag& tag) - { - analyzer_map_by_tag::const_iterator i = analyzers_by_tag.find(tag); - return i != analyzers_by_tag.end() ? i->second : 0; - } - -Component* Manager::Lookup(EnumVal* val) - { - analyzer_map_by_val::const_iterator i = analyzers_by_val.find(val->InternalInt()); - return i != analyzers_by_val.end() ? i->second : 0; - } - Manager::tag_set* Manager::LookupPort(TransportProto proto, uint32 port, bool add_if_not_found) { analyzer_map_by_port* m = 0; @@ -461,7 +393,7 @@ bool Manager::BuildInitialAnalyzerTree(Connection* conn) root->AddChildAnalyzer(analyzer, false); DBG_ANALYZER_ARGS(conn, "activated %s analyzer as scheduled", - analyzer_mgr->GetAnalyzerName(*i)); + analyzer_mgr->GetComponentName(*i)); } } @@ -487,7 +419,7 @@ bool Manager::BuildInitialAnalyzerTree(Connection* conn) root->AddChildAnalyzer(analyzer, false); DBG_ANALYZER_ARGS(conn, "activated %s analyzer due to port %d", - analyzer_mgr->GetAnalyzerName(*j), resp_port); + analyzer_mgr->GetComponentName(*j), resp_port); } } } @@ -613,7 +545,7 @@ void Manager::ExpireScheduledAnalyzers() conns.erase(i); DBG_LOG(DBG_ANALYZER, "Expiring expected analyzer %s for connection %s", - analyzer_mgr->GetAnalyzerName(a->analyzer), + analyzer_mgr->GetComponentName(a->analyzer), fmt_conn_id(a->conn.orig, 0, a->conn.resp, a->conn.resp_p)); delete a; @@ -655,7 +587,7 @@ void Manager::ScheduleAnalyzer(const IPAddr& orig, const IPAddr& resp, TransportProto proto, const char* analyzer, double timeout) { - Tag tag = GetAnalyzerTag(analyzer); + Tag tag = GetComponentTag(analyzer); if ( tag != Tag() ) ScheduleAnalyzer(orig, resp, resp_p, proto, tag, timeout); diff --git a/src/analyzer/Manager.h 
b/src/analyzer/Manager.h index efae629971..d151709eda 100644 --- a/src/analyzer/Manager.h +++ b/src/analyzer/Manager.h @@ -26,6 +26,7 @@ #include "Analyzer.h" #include "Component.h" #include "Tag.h" +#include "plugin/ComponentManager.h" #include "../Dict.h" #include "../net_util.h" @@ -49,7 +50,7 @@ namespace analyzer { * classes. This allows to external analyzer code to potentially use a * different C++ standard library. */ -class Manager { +class Manager : public plugin::ComponentManager { public: /** * Constructor. @@ -231,42 +232,6 @@ public: */ Analyzer* InstantiateAnalyzer(const char* name, Connection* c); - /** - * Translates an analyzer tag into corresponding analyzer name. - * - * @param tag The analyzer tag. - * - * @return The name, or an empty string if the tag is invalid. - */ - const char* GetAnalyzerName(Tag tag); - - /** - * Translates an script-level analyzer tag into corresponding - * analyzer name. - * - * @param val The analyzer tag as an script-level enum value of type - * \c Analyzer::Tag. - * - * @return The name, or an empty string if the tag is invalid. - */ - const char* GetAnalyzerName(Val* val); - - /** - * Translates an analyzer name into the corresponding tag. - * - * @param name The name. - * - * @return The tag. If the name does not correspond to a valid - * analyzer, the returned tag will evaluate to false. - */ - Tag GetAnalyzerTag(const char* name); - - /** - * Returns the enum type that corresponds to the script-level type \c - * Analyzer::Tag. - */ - EnumType* GetTagEnumType(); - /** * Given the first packet of a connection, builds its initial * analyzer tree. @@ -350,18 +315,8 @@ public: private: typedef set tag_set; - typedef map analyzer_map_by_name; - typedef map analyzer_map_by_tag; - typedef map analyzer_map_by_val; typedef map analyzer_map_by_port; - void RegisterAnalyzerComponent(Component* component); // Takes ownership. 
- - Component* Lookup(const string& name); - Component* Lookup(const char* name); - Component* Lookup(const Tag& tag); - Component* Lookup(EnumVal* val); - tag_set* LookupPort(PortVal* val, bool add_if_not_found); tag_set* LookupPort(TransportProto proto, uint32 port, bool add_if_not_found); @@ -370,9 +325,6 @@ private: analyzer_map_by_port analyzers_by_port_tcp; analyzer_map_by_port analyzers_by_port_udp; - analyzer_map_by_name analyzers_by_name; - analyzer_map_by_tag analyzers_by_tag; - analyzer_map_by_val analyzers_by_val; Tag analyzer_backdoor; Tag analyzer_connsize; @@ -380,8 +332,6 @@ private: Tag analyzer_stepping; Tag analyzer_tcpstats; - EnumType* tag_enum_type; - //// Data structures to track analyzed scheduled for future connections. // The index for a scheduled connection. diff --git a/src/analyzer/Tag.cc b/src/analyzer/Tag.cc index 2f04ff17da..3ab41daf78 100644 --- a/src/analyzer/Tag.cc +++ b/src/analyzer/Tag.cc @@ -3,90 +3,20 @@ #include "Tag.h" #include "Manager.h" -#include "../NetVar.h" +analyzer::Tag analyzer::Tag::Error; -using namespace analyzer; - -Tag Tag::Error; - -Tag::Tag(type_t arg_type, subtype_t arg_subtype) +analyzer::Tag::Tag(type_t type, subtype_t subtype) + : ::Tag(analyzer_mgr->GetTagEnumType(), type, subtype) { - assert(arg_type > 0); - - type = arg_type; - subtype = arg_subtype; - int64_t i = (int64)(type) | ((int64)subtype << 31); - - EnumType* etype = analyzer_mgr->GetTagEnumType(); - Ref(etype); - val = new EnumVal(i, etype); } -Tag::Tag(EnumVal* arg_val) +analyzer::Tag& analyzer::Tag::operator=(const analyzer::Tag& other) { - assert(arg_val); - - val = arg_val; - Ref(val); - - int64 i = val->InternalInt(); - type = i & 0xffffffff; - subtype = (i >> 31) & 0xffffffff; - } - -Tag::Tag(const Tag& other) - { - type = other.type; - subtype = other.subtype; - val = other.val; - - if ( val ) - Ref(val); - } - -Tag::Tag() - { - type = 0; - subtype = 0; - val = 0; - } - -Tag::~Tag() - { - Unref(val); - val = 0; - } - -Tag& 
Tag::operator=(const Tag& other) - { - if ( this != &other ) - { - type = other.type; - subtype = other.subtype; - val = other.val; - - if ( val ) - Ref(val); - } - + ::Tag::operator=(other); return *this; } -EnumVal* Tag::AsEnumVal() const +EnumVal* analyzer::Tag::AsEnumVal() const { - if ( ! val ) - { - assert(analyzer_mgr); - assert(type == 0 && subtype == 0); - EnumType* etype = analyzer_mgr->GetTagEnumType(); - Ref(etype); - val = new EnumVal(0, etype); - } - - return val; - } - -std::string Tag::AsString() const - { - return fmt("%" PRIu32 "/%" PRIu32, type, subtype); + return ::Tag::AsEnumVal(analyzer_mgr->GetTagEnumType()); } diff --git a/src/analyzer/Tag.h b/src/analyzer/Tag.h index edb0ade8a7..d01c8902ee 100644 --- a/src/analyzer/Tag.h +++ b/src/analyzer/Tag.h @@ -5,90 +5,46 @@ #include "config.h" #include "util.h" +#include "../Tag.h" +#include "plugin/TaggedComponent.h" +#include "plugin/ComponentManager.h" class EnumVal; -namespace file_analysis { -class Manager; -class Component; -} - namespace analyzer { class Manager; class Component; /** - * Class to identify an analyzer type. + * Class to identify a protocol analyzer type. * - * Each analyzer type gets a tag consisting of a main type and subtype. The - * former is an identifier that's unique all analyzer classes. The latter is - * passed through to the analyzer instances for their use, yet not further - * interpreted by the analyzer infrastructure; it allows an analyzer to - * branch out into a set of sub-analyzers internally. Jointly, main type and - * subtype form an analyzer "tag". Each unique tag corresponds to a single - * "analyzer" from the user's perspective. At the script layer, these tags - * are mapped into enums of type \c Analyzer::Tag. Internally, the - * analyzer::Manager maintains the mapping of tag to analyzer (and it also - * assigns them their main types), and analyzer::Component creates new - * tags. 
- * - * The Tag class supports all operations necessary to act as an index in a - * \c std::map. + * The script-layer analogue is Analyzer::Tag. */ -class Tag { +class Tag : public ::Tag { public: - /** - * Type for the analyzer's main type. - */ - typedef uint32 type_t; - - /** - * Type for the analyzer's subtype. - */ - typedef uint32 subtype_t; - /* * Copy constructor. */ - Tag(const Tag& other); + Tag(const Tag& other) : ::Tag(other) {} /** * Default constructor. This initializes the tag with an error value * that will make \c operator \c bool return false. */ - Tag(); + Tag() : ::Tag() {} /** * Destructor. */ - ~Tag(); - - /** - * Returns the tag's main type. - */ - type_t Type() const { return type; } - - /** - * Returns the tag's subtype. - */ - subtype_t Subtype() const { return subtype; } - - /** - * Returns the \c Analyzer::Tag enum that corresponds to this tag. - * The returned value is \a does not have its ref-count increased. - */ - EnumVal* AsEnumVal() const; - - /** - * Returns the numerical values for main and subtype inside a string - * suitable for printing. This is primarily for debugging. - */ - std::string AsString() const; + ~Tag() {} /** * Returns false if the tag represents an error value rather than a * legal analyzer type. + * TODO: make this conversion operator "explicit" (C++11) or use a + * "safe bool" idiom (not necessary if "explicit" is available), + * otherwise this may allow nonsense/undesired comparison operations. */ operator bool() const { return *this != Tag(); } @@ -102,7 +58,7 @@ public: */ bool operator==(const Tag& other) const { - return type == other.type && subtype == other.subtype; + return ::Tag::operator==(other); } /** @@ -110,7 +66,7 @@ public: */ bool operator!=(const Tag& other) const { - return type != other.type || subtype != other.subtype; + return ::Tag::operator!=(other); } /** @@ -118,23 +74,30 @@ public: */ bool operator<(const Tag& other) const { - return type != other.type ? 
type < other.type : (subtype < other.subtype); + return ::Tag::operator<(other); } + /** + * Returns the \c Analyzer::Tag enum that corresponds to this tag. + * The returned value does not have its ref-count increased. + * + * @return the enum value, whose type is \c analyzer_mgr->GetTagEnumType(). + */ + EnumVal* AsEnumVal() const; + static Tag Error; protected: friend class analyzer::Manager; - friend class analyzer::Component; - friend class file_analysis::Manager; - friend class file_analysis::Component; + friend class plugin::ComponentManager; + friend class plugin::TaggedComponent; /** * Constructor. * * @param type The main type. Note that the \a analyzer::Manager * manages the value space internally, so noone else should assign - * any main tyoes. + * any main types. * * @param subtype The sub type, which is left to an analyzer for * interpretation. By default it's set to zero. @@ -144,14 +107,9 @@ protected: /** * Constructor. * - * @param val An enuam value of script type \c Analyzer::Tag. + * @param val An enum value of script type \c Analyzer::Tag. */ - Tag(EnumVal* val); - -private: - type_t type; // Main type. - subtype_t subtype; // Subtype. - mutable EnumVal* val; // Analyzer::Tag value. 
+ Tag(EnumVal* val) : ::Tag(val) {} }; } diff --git a/src/analyzer/analyzer.bif b/src/analyzer/analyzer.bif index 4d70816075..ebf8083624 100644 --- a/src/analyzer/analyzer.bif +++ b/src/analyzer/analyzer.bif @@ -41,11 +41,11 @@ function Analyzer::__schedule_analyzer%(orig: addr, resp: addr, resp_p: port, function __name%(atype: Analyzer::Tag%) : string %{ - return new StringVal(analyzer_mgr->GetAnalyzerName(atype)); + return new StringVal(analyzer_mgr->GetComponentName(atype)); %} function __tag%(name: string%) : Analyzer::Tag %{ - analyzer::Tag t = analyzer_mgr->GetAnalyzerTag(name->CheckString()); + analyzer::Tag t = analyzer_mgr->GetComponentTag(name->CheckString()); return t.AsEnumVal()->Ref(); %} diff --git a/src/file_analysis/Analyzer.cc b/src/file_analysis/Analyzer.cc new file mode 100644 index 0000000000..e0b5011aa8 --- /dev/null +++ b/src/file_analysis/Analyzer.cc @@ -0,0 +1,11 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "Analyzer.h" +#include "Manager.h" + +file_analysis::Analyzer::~Analyzer() + { + DBG_LOG(DBG_FILE_ANALYSIS, "Destroy file analyzer %s", + file_mgr->GetComponentName(tag)); + Unref(args); + } diff --git a/src/file_analysis/Analyzer.h b/src/file_analysis/Analyzer.h index 0a5aa9e25c..e20e2802cf 100644 --- a/src/file_analysis/Analyzer.h +++ b/src/file_analysis/Analyzer.h @@ -5,14 +5,12 @@ #include "Val.h" #include "NetVar.h" -#include "analyzer/Tag.h" +#include "Tag.h" #include "file_analysis/file_analysis.bif.h" namespace file_analysis { -typedef int FA_Tag; - class File; /** @@ -25,11 +23,7 @@ public: * Destructor. Nothing special about it. Virtual since we definitely expect * to delete instances of derived classes via pointers to this class. */ - virtual ~Analyzer() - { - DBG_LOG(DBG_FILE_ANALYSIS, "Destroy file analyzer %d", tag); - Unref(args); - } + virtual ~Analyzer(); /** * Subclasses may override this metod to receive file data non-sequentially. 
@@ -76,7 +70,7 @@ public: /** * @return the analyzer type enum value. */ - FA_Tag Tag() const { return tag; } + file_analysis::Tag Tag() const { return tag; } /** * @return the AnalyzerArgs associated with the analyzer. @@ -88,18 +82,6 @@ public: */ File* GetFile() const { return file; } - /** - * Retrieves an analyzer tag field from full analyzer argument record. - * @param args an \c AnalyzerArgs (script-layer type) value. - * @return the analyzer tag equivalent of the 'tag' field from the - * \c AnalyzerArgs value \a args. - */ - static FA_Tag ArgsTag(const RecordVal* args) - { - using BifType::Record::Files::AnalyzerArgs; - return args->Lookup(AnalyzerArgs->FieldOffset("tag"))->AsEnum(); - } - protected: /** @@ -108,15 +90,15 @@ protected: * tunable options, if any, related to a particular analyzer type. * @param arg_file the file to which the the analyzer is being attached. */ - Analyzer(RecordVal* arg_args, File* arg_file) - : tag(file_analysis::Analyzer::ArgsTag(arg_args)), + Analyzer(file_analysis::Tag arg_tag, RecordVal* arg_args, File* arg_file) + : tag(arg_tag), args(arg_args->Ref()->AsRecordVal()), file(arg_file) {} private: - FA_Tag tag; /**< The particular analyzer type of the analyzer instance. */ + file_analysis::Tag tag; /**< The particular type of the analyzer instance. */ RecordVal* args; /**< \c AnalyzerArgs val gives tunable analyzer params. */ File* file; /**< The file to which the analyzer is attached. 
*/ }; diff --git a/src/file_analysis/AnalyzerSet.cc b/src/file_analysis/AnalyzerSet.cc index c710d8b085..f7abc01dc2 100644 --- a/src/file_analysis/AnalyzerSet.cc +++ b/src/file_analysis/AnalyzerSet.cc @@ -15,6 +15,7 @@ static void analyzer_del_func(void* v) AnalyzerSet::AnalyzerSet(File* arg_file) : file(arg_file) { TypeList* t = new TypeList(); + t->Append(file_mgr->GetTagEnumType()->Ref()); t->Append(BifType::Record::Files::AnalyzerArgs->Ref()); analyzer_hash = new CompositeHash(t); Unref(t); @@ -34,20 +35,20 @@ AnalyzerSet::~AnalyzerSet() delete analyzer_hash; } -bool AnalyzerSet::Add(RecordVal* args) +bool AnalyzerSet::Add(file_analysis::Tag tag, RecordVal* args) { - HashKey* key = GetKey(args); + HashKey* key = GetKey(tag, args); if ( analyzer_map.Lookup(key) ) { - DBG_LOG(DBG_FILE_ANALYSIS, "Instantiate analyzer %d skipped for file id" - " %s: already exists", file_analysis::Analyzer::ArgsTag(args), + DBG_LOG(DBG_FILE_ANALYSIS, "Instantiate analyzer %s skipped for file id" + " %s: already exists", file_mgr->GetComponentName(tag), file->GetID().c_str()); delete key; return true; } - file_analysis::Analyzer* a = InstantiateAnalyzer(args); + file_analysis::Analyzer* a = InstantiateAnalyzer(tag, args); if ( ! a ) { @@ -60,10 +61,10 @@ bool AnalyzerSet::Add(RecordVal* args) return true; } -bool AnalyzerSet::QueueAdd(RecordVal* args) +bool AnalyzerSet::QueueAdd(file_analysis::Tag tag, RecordVal* args) { - HashKey* key = GetKey(args); - file_analysis::Analyzer* a = InstantiateAnalyzer(args); + HashKey* key = GetKey(tag, args); + file_analysis::Analyzer* a = InstantiateAnalyzer(tag, args); if ( ! 
a ) { @@ -80,8 +81,9 @@ bool AnalyzerSet::AddMod::Perform(AnalyzerSet* set) { if ( set->analyzer_map.Lookup(key) ) { - DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %d skipped for file id" - " %s: already exists", a->Tag(), a->GetFile()->GetID().c_str()); + DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %s skipped for file id" + " %s: already exists", file_mgr->GetComponentName(a->Tag()), + a->GetFile()->GetID().c_str()); Abort(); return true; @@ -91,12 +93,12 @@ bool AnalyzerSet::AddMod::Perform(AnalyzerSet* set) return true; } -bool AnalyzerSet::Remove(const RecordVal* args) +bool AnalyzerSet::Remove(file_analysis::Tag tag, RecordVal* args) { - return Remove(file_analysis::Analyzer::ArgsTag(args), GetKey(args)); + return Remove(tag, GetKey(tag, args)); } -bool AnalyzerSet::Remove(FA_Tag tag, HashKey* key) +bool AnalyzerSet::Remove(file_analysis::Tag tag, HashKey* key) { file_analysis::Analyzer* a = (file_analysis::Analyzer*) analyzer_map.Remove(key); @@ -105,22 +107,22 @@ bool AnalyzerSet::Remove(FA_Tag tag, HashKey* key) if ( ! 
a ) { - DBG_LOG(DBG_FILE_ANALYSIS, "Skip remove analyzer %d for file id %s", - tag, file->GetID().c_str()); + DBG_LOG(DBG_FILE_ANALYSIS, "Skip remove analyzer %s for file id %s", + file_mgr->GetComponentName(tag), file->GetID().c_str()); return false; } - DBG_LOG(DBG_FILE_ANALYSIS, "Remove analyzer %d for file id %s", a->Tag(), + DBG_LOG(DBG_FILE_ANALYSIS, "Remove analyzer %s for file id %s", + file_mgr->GetComponentName(tag), file->GetID().c_str()); delete a; return true; } -bool AnalyzerSet::QueueRemove(const RecordVal* args) +bool AnalyzerSet::QueueRemove(file_analysis::Tag tag, RecordVal* args) { - HashKey* key = GetKey(args); - FA_Tag tag = file_analysis::Analyzer::ArgsTag(args); + HashKey* key = GetKey(tag, args); mod_queue.push(new RemoveMod(tag, key)); @@ -132,24 +134,28 @@ bool AnalyzerSet::RemoveMod::Perform(AnalyzerSet* set) return set->Remove(tag, key); } -HashKey* AnalyzerSet::GetKey(const RecordVal* args) const +HashKey* AnalyzerSet::GetKey(file_analysis::Tag t, RecordVal* args) const { - HashKey* key = analyzer_hash->ComputeHash(args, 1); + ListVal* lv = new ListVal(TYPE_ANY); + lv->Append(t.AsEnumVal()->Ref()); + lv->Append(args->Ref()); + HashKey* key = analyzer_hash->ComputeHash(lv, 1); + Unref(lv); if ( ! key ) reporter->InternalError("AnalyzerArgs type mismatch"); return key; } -file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(RecordVal* args) const +file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(Tag tag, + RecordVal* args) const { - FA_Tag tag = file_analysis::Analyzer::ArgsTag(args); file_analysis::Analyzer* a = file_mgr->InstantiateAnalyzer(tag, args, file); if ( ! 
a ) { reporter->Error("Failed file analyzer %s instantiation for file id %s", - file_mgr->GetAnalyzerName(tag), file->GetID().c_str()); + file_mgr->GetComponentName(tag), file->GetID().c_str()); return 0; } @@ -158,8 +164,8 @@ file_analysis::Analyzer* AnalyzerSet::InstantiateAnalyzer(RecordVal* args) const void AnalyzerSet::Insert(file_analysis::Analyzer* a, HashKey* key) { - DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %d for file id %s", a->Tag(), - file->GetID().c_str()); + DBG_LOG(DBG_FILE_ANALYSIS, "Add analyzer %s for file id %s", + file_mgr->GetComponentName(a->Tag()), file->GetID().c_str()); analyzer_map.Insert(key, a); delete key; } diff --git a/src/file_analysis/AnalyzerSet.h b/src/file_analysis/AnalyzerSet.h index 6f14149e30..42a54f4943 100644 --- a/src/file_analysis/AnalyzerSet.h +++ b/src/file_analysis/AnalyzerSet.h @@ -9,6 +9,7 @@ #include "Dict.h" #include "CompHash.h" #include "Val.h" +#include "Tag.h" namespace file_analysis { @@ -38,31 +39,35 @@ public: /** * Attach an analyzer to #file immediately. + * @param tag the analyzer tag of the file analyzer to add. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return true if analyzer was instantiated/attached, else false. */ - bool Add(RecordVal* args); + bool Add(file_analysis::Tag tag, RecordVal* args); /** * Queue the attachment of an analyzer to #file. + * @param tag the analyzer tag of the file analyzer to add. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return true if analyzer was able to be instantiated, else false. */ - bool QueueAdd(RecordVal* args); + bool QueueAdd(file_analysis::Tag tag, RecordVal* args); /** * Remove an analyzer from #file immediately. + * @param tag the analyzer tag of the file analyzer to remove. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return false if analyzer didn't exist and so wasn't removed, else true. 
*/ - bool Remove(const RecordVal* args); + bool Remove(file_analysis::Tag tag, RecordVal* args); /** * Queue the removal of an analyzer from #file. + * @param tag the analyzer tag of the file analyzer to remove. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return true if analyzer exists at time of call, else false; */ - bool QueueRemove(const RecordVal* args); + bool QueueRemove(file_analysis::Tag tag, RecordVal* args); /** * Perform all queued modifications to the current analyzer set. @@ -91,17 +96,20 @@ protected: /** * Get a hash key which represents an analyzer instance. + * @param tag the file analyzer tag. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return the hash key calculated from \a args */ - HashKey* GetKey(const RecordVal* args) const; + HashKey* GetKey(file_analysis::Tag tag, RecordVal* args) const; /** * Create an instance of a file analyzer. + * @param tag the tag of a file analyzer. * @param args an \c AnalyzerArgs value which specifies an analyzer. * @return a new file analyzer instance. */ - file_analysis::Analyzer* InstantiateAnalyzer(RecordVal* args) const; + file_analysis::Analyzer* InstantiateAnalyzer(file_analysis::Tag tag, + RecordVal* args) const; /** * Insert an analyzer instance in to the set. @@ -116,7 +124,7 @@ protected: * just used for debugging messages. * @param key the hash key which represents the analyzer's \c AnalyzerArgs. */ - bool Remove(FA_Tag tag, HashKey* key); + bool Remove(file_analysis::Tag tag, HashKey* key); private: @@ -175,14 +183,14 @@ private: * @param arg_a an analyzer instance to add to an analyzer set. * @param arg_key hash key representing the analyzer's \c AnalyzerArgs. 
*/ - RemoveMod(FA_Tag arg_tag, HashKey* arg_key) + RemoveMod(file_analysis::Tag arg_tag, HashKey* arg_key) : Modification(), tag(arg_tag), key(arg_key) {} virtual ~RemoveMod() {} virtual bool Perform(AnalyzerSet* set); virtual void Abort() { delete key; } protected: - FA_Tag tag; + file_analysis::Tag tag; HashKey* key; }; diff --git a/src/file_analysis/CMakeLists.txt b/src/file_analysis/CMakeLists.txt index f22c293cc4..846fc4bf15 100644 --- a/src/file_analysis/CMakeLists.txt +++ b/src/file_analysis/CMakeLists.txt @@ -11,9 +11,10 @@ set(file_analysis_SRCS Manager.cc File.cc FileTimer.cc - Analyzer.h + Analyzer.cc AnalyzerSet.cc Component.cc + Tag.cc ) bif_target(file_analysis.bif) diff --git a/src/file_analysis/Component.cc b/src/file_analysis/Component.cc index 99531e40f5..9c47f2c75e 100644 --- a/src/file_analysis/Component.cc +++ b/src/file_analysis/Component.cc @@ -8,26 +8,22 @@ using namespace file_analysis; -analyzer::Tag::type_t Component::type_counter = 0; - -Component::Component(const char* arg_name, factory_callback arg_factory, - analyzer::Tag::subtype_t arg_subtype) - : plugin::Component(plugin::component::FILE_ANALYZER) +Component::Component(const char* arg_name, factory_callback arg_factory) + : plugin::Component(plugin::component::FILE_ANALYZER), + plugin::TaggedComponent() { name = copy_string(arg_name); canon_name = canonify_name(arg_name); factory = arg_factory; - - tag = analyzer::Tag(++type_counter, arg_subtype); } Component::Component(const Component& other) - : plugin::Component(Type()) + : plugin::Component(Type()), + plugin::TaggedComponent(other) { name = copy_string(other.name); canon_name = copy_string(other.canon_name); factory = other.factory; - tag = other.tag; } Component::~Component() @@ -36,11 +32,6 @@ Component::~Component() delete [] canon_name; } -analyzer::Tag Component::Tag() const - { - return tag; - } - void Component::Describe(ODesc* d) const { plugin::Component::Describe(d); @@ -58,11 +49,12 @@ void Component::Describe(ODesc* 
d) const Component& Component::operator=(const Component& other) { + plugin::TaggedComponent::operator=(other); + if ( &other != this ) { name = copy_string(other.name); factory = other.factory; - tag = other.tag; } return *this; diff --git a/src/file_analysis/Component.h b/src/file_analysis/Component.h index 3cdc69efdf..4cf2dced60 100644 --- a/src/file_analysis/Component.h +++ b/src/file_analysis/Component.h @@ -3,8 +3,9 @@ #ifndef FILE_ANALYZER_PLUGIN_COMPONENT_H #define FILE_ANALYZER_PLUGIN_COMPONENT_H -#include "analyzer/Tag.h" +#include "Tag.h" #include "plugin/Component.h" +#include "plugin/TaggedComponent.h" #include "Val.h" @@ -22,7 +23,8 @@ class Analyzer; * A plugin can provide a specific file analyzer by registering this * analyzer component, describing the analyzer. */ -class Component : public plugin::Component { +class Component : public plugin::Component, + public plugin::TaggedComponent { public: typedef Analyzer* (*factory_callback)(RecordVal* args, File* file); @@ -38,15 +40,8 @@ public: * from file_analysis::Analyzer. This is typically a static \c * Instatiate() method inside the class that just allocates and * returns a new instance. - * - * @param subtype A subtype associated with this component that - * further distinguishes it. The subtype will be integrated into - * the analyzer::Tag that the manager associates with this analyzer, - * and analyzer instances can accordingly access it via analyzer::Tag(). - * If not used, leave at zero. */ - Component(const char* name, factory_callback factory, - analyzer::Tag::subtype_t subtype = 0); + Component(const char* name, factory_callback factory); /** * Copy constructor. @@ -79,13 +74,6 @@ public: */ factory_callback Factory() const { return factory; } - /** - * Returns the analyzer's tag. Note that this is automatically - * generated for each new Components, and hence unique across all of - * them. 
- */ - analyzer::Tag Tag() const; - /** * Generates a human-readable description of the component's main * parameters. This goes into the output of \c "bro -NN". @@ -98,10 +86,6 @@ private: const char* name; // The analyzer's name. const char* canon_name; // The analyzer's canonical name. factory_callback factory; // The analyzer's factory callback. - analyzer::Tag tag; // The automatically assigned analyzer tag. - - // Global counter used to generate unique tags. - static analyzer::Tag::type_t type_counter; }; } diff --git a/src/file_analysis/File.cc b/src/file_analysis/File.cc index 9e44e327e3..1197cd06f6 100644 --- a/src/file_analysis/File.cc +++ b/src/file_analysis/File.cc @@ -88,7 +88,7 @@ File::File(const string& file_id, Connection* conn, analyzer::Tag tag, if ( conn ) { // add source, connection, is_orig fields - SetSource(analyzer_mgr->GetAnalyzerName(tag)); + SetSource(analyzer_mgr->GetComponentName(tag)); val->Assign(is_orig_idx, new Val(is_orig, TYPE_BOOL)); UpdateConnectionFields(conn, is_orig); } @@ -231,14 +231,14 @@ void File::ScheduleInactivityTimer() const timer_mgr->Add(new FileTimer(network_time, id, GetTimeoutInterval())); } -bool File::AddAnalyzer(RecordVal* args) +bool File::AddAnalyzer(file_analysis::Tag tag, RecordVal* args) { - return done ? false : analyzers.QueueAdd(args); + return done ? false : analyzers.QueueAdd(tag, args); } -bool File::RemoveAnalyzer(const RecordVal* args) +bool File::RemoveAnalyzer(file_analysis::Tag tag, RecordVal* args) { - return done ? false : analyzers.QueueRemove(args); + return done ? false : analyzers.QueueRemove(tag, args); } bool File::BufferBOF(const u_char* data, uint64 len) @@ -321,7 +321,7 @@ void File::DataIn(const u_char* data, uint64 len, uint64 offset) while ( (a = analyzers.NextEntry(c)) ) { if ( ! 
a->DeliverChunk(data, len, offset) ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } analyzers.DrainModifications(); @@ -356,7 +356,7 @@ void File::DataIn(const u_char* data, uint64 len) { if ( ! a->DeliverStream(data, len) ) { - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); continue; } @@ -364,7 +364,7 @@ void File::DataIn(const u_char* data, uint64 len) LookupFieldDefaultCount(missing_bytes_idx); if ( ! a->DeliverChunk(data, len, offset) ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } analyzers.DrainModifications(); @@ -389,7 +389,7 @@ void File::EndOfFile() while ( (a = analyzers.NextEntry(c)) ) { if ( ! a->EndOfFile() ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } FileEvent(file_state_remove); @@ -411,7 +411,7 @@ void File::Gap(uint64 offset, uint64 len) while ( (a = analyzers.NextEntry(c)) ) { if ( ! a->Undelivered(offset, len) ) - analyzers.QueueRemove(a->Args()); + analyzers.QueueRemove(a->Tag(), a->Args()); } if ( FileEventAvailable(file_gap) ) diff --git a/src/file_analysis/File.h b/src/file_analysis/File.h index 794734d24b..12c1e061a8 100644 --- a/src/file_analysis/File.h +++ b/src/file_analysis/File.h @@ -10,6 +10,7 @@ #include "Conn.h" #include "Val.h" +#include "Tag.h" #include "AnalyzerSet.h" #include "BroString.h" @@ -94,17 +95,19 @@ public: /** * Queues attaching an analyzer. Only one analyzer per type can be attached * at a time unless the arguments differ. + * @param tag the analyzer tag of the file analyzer to add. * @param args an \c AnalyzerArgs value representing a file analyzer. * @return false if analyzer can't be instantiated, else true. */ - bool AddAnalyzer(RecordVal* args); + bool AddAnalyzer(file_analysis::Tag tag, RecordVal* args); /** * Queues removal of an analyzer. + * @param tag the analyzer tag of the file analyzer to remove. 
* @param args an \c AnalyzerArgs value representing a file analyzer. * @return true if analyzer was active at time of call, else false. */ - bool RemoveAnalyzer(const RecordVal* args); + bool RemoveAnalyzer(file_analysis::Tag tag, RecordVal* args); /** * Pass in non-sequential data and deliver to attached analyzers. diff --git a/src/file_analysis/FileTimer.cc b/src/file_analysis/FileTimer.cc index 575857fd15..6b1d70f136 100644 --- a/src/file_analysis/FileTimer.cc +++ b/src/file_analysis/FileTimer.cc @@ -14,7 +14,7 @@ FileTimer::FileTimer(double t, const string& id, double interval) void FileTimer::Dispatch(double t, int is_expire) { - File* file = file_mgr->Lookup(file_id); + File* file = file_mgr->LookupFile(file_id); if ( ! file ) return; diff --git a/src/file_analysis/Manager.cc b/src/file_analysis/Manager.cc index 7a92e92109..5975133356 100644 --- a/src/file_analysis/Manager.cc +++ b/src/file_analysis/Manager.cc @@ -18,10 +18,9 @@ TableVal* Manager::disabled = 0; string Manager::salt; Manager::Manager() + : plugin::ComponentManager("Files") { - tag_enum_type = new EnumType("Files::Tag"); - ::ID* id = install_ID("Tag", "Files", true, true); - add_type(id, tag_enum_type, 0, 0); } Manager::~Manager() @@ -35,27 +34,7 @@ void Manager::InitPreScript() for ( std::list::const_iterator i = analyzers.begin(); i != analyzers.end(); ++i ) - RegisterAnalyzerComponent(*i); - } - -void Manager::RegisterAnalyzerComponent(Component* component) - { - const char* cname = component->CanonicalName(); - - if ( tag_enum_type->Lookup("Files", cname) != -1 ) - reporter->FatalError("File Analyzer %s defined more than once", cname); - - DBG_LOG(DBG_FILE_ANALYSIS, "Registering analyzer %s (tag %s)", - component->Name(), component->Tag().AsString().c_str()); - - analyzers_by_name.insert(std::make_pair(cname, component)); - analyzers_by_tag.insert(std::make_pair(component->Tag(), component)); - analyzers_by_val.insert(std::make_pair( - component->Tag().AsEnumVal()->InternalInt(), 
component)); - - string id = fmt("ANALYZER_%s", cname); - tag_enum_type->AddName("Files", id.c_str(), - component->Tag().AsEnumVal()->InternalInt(), true); + RegisterComponent(*i, "ANALYZER_"); } void Manager::InitPostScript() @@ -193,7 +172,7 @@ void Manager::SetSize(uint64 size, analyzer::Tag tag, Connection* conn, bool Manager::SetTimeoutInterval(const string& file_id, double interval) const { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! file ) return false; @@ -205,24 +184,26 @@ bool Manager::SetTimeoutInterval(const string& file_id, double interval) const return true; } -bool Manager::AddAnalyzer(const string& file_id, RecordVal* args) const +bool Manager::AddAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! file ) return false; - return file->AddAnalyzer(args); + return file->AddAnalyzer(tag, args); } -bool Manager::RemoveAnalyzer(const string& file_id, const RecordVal* args) const +bool Manager::RemoveAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! file ) return false; - return file->RemoveAnalyzer(args); + return file->RemoveAnalyzer(tag, args); } File* Manager::GetFile(const string& file_id, Connection* conn, @@ -255,7 +236,7 @@ File* Manager::GetFile(const string& file_id, Connection* conn, return rval; } -File* Manager::Lookup(const string& file_id) const +File* Manager::LookupFile(const string& file_id) const { IDMap::const_iterator it = id_map.find(file_id); @@ -267,7 +248,7 @@ File* Manager::Lookup(const string& file_id) const void Manager::Timeout(const string& file_id, bool is_terminating) { - File* file = Lookup(file_id); + File* file = LookupFile(file_id); if ( ! 
file ) return; @@ -366,15 +347,13 @@ bool Manager::IsDisabled(analyzer::Tag tag) return rval; } -Analyzer* Manager::InstantiateAnalyzer(int tag, RecordVal* args, File* f) const +Analyzer* Manager::InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const { - analyzer_map_by_val::const_iterator it = analyzers_by_val.find(tag); + Component* c = Lookup(tag); - if ( it == analyzers_by_val.end() ) - reporter->InternalError("cannot instantiate unknown file analyzer: %d", - tag); - - Component* c = it->second; + if ( ! c ) + reporter->InternalError("cannot instantiate unknown file analyzer: %s", + tag.AsString().c_str()); if ( ! c->Factory() ) reporter->InternalError("file analyzer %s cannot be instantiated " @@ -382,14 +361,3 @@ Analyzer* Manager::InstantiateAnalyzer(int tag, RecordVal* args, File* f) const return c->Factory()(args, f); } - -const char* Manager::GetAnalyzerName(int tag) const - { - analyzer_map_by_val::const_iterator it = analyzers_by_val.find(tag); - - if ( it == analyzers_by_val.end() ) - reporter->InternalError("cannot get name of unknown file analyzer: %d", - tag); - - return it->second->CanonicalName(); - } diff --git a/src/file_analysis/Manager.h b/src/file_analysis/Manager.h index 84b606173d..dcf33edc99 100644 --- a/src/file_analysis/Manager.h +++ b/src/file_analysis/Manager.h @@ -18,7 +18,8 @@ #include "File.h" #include "FileTimer.h" #include "Component.h" - +#include "Tag.h" +#include "plugin/ComponentManager.h" #include "analyzer/Tag.h" #include "file_analysis/file_analysis.bif.h" @@ -28,7 +29,7 @@ namespace file_analysis { /** * Main entry point for interacting with file analysis. */ -class Manager { +class Manager : public plugin::ComponentManager { public: /** @@ -177,18 +178,22 @@ public: * analyzers of a given type can be attached per file identifier at a time * as long as the arguments differ. * @param file_id the file identifier/hash. + * @param tag the analyzer tag of the file analyzer to add. 
* @param args a \c AnalyzerArgs value which describes a file analyzer. * @return false if the analyzer failed to be instantiated, else true. */ - bool AddAnalyzer(const string& file_id, RecordVal* args) const; + bool AddAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const; /** * Queue removal of an analyzer for a given file identifier. * @param file_id the file identifier/hash. + * @param tag the analyzer tag of the file analyzer to remove. * @param args a \c AnalyzerArgs value which describes a file analyzer. * @return true if the analyzer is active at the time of call, else false. */ - bool RemoveAnalyzer(const string& file_id, const RecordVal* args) const; + bool RemoveAnalyzer(const string& file_id, file_analysis::Tag tag, + RecordVal* args) const; /** * Tells whether analysis for a file is active or ignored. @@ -204,15 +209,7 @@ public: * @param f The file analzer is to be associated with. * @return The new analyzer instance or null if tag is invalid. */ - Analyzer* InstantiateAnalyzer(int tag, RecordVal* args, File* f) const; - - /** - * Translates a script-level file analyzer tag in to corresponding file - * analyzer name. - * @param tag The enum val of a file analyzer. - * @return The human-readable name of the file analyzer. - */ - const char* GetAnalyzerName(int tag) const; + Analyzer* InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const; protected: friend class FileTimer; @@ -247,7 +244,7 @@ protected: * @return the File object mapped to \a file_id, or a null pointer if no * mapping exists. 
*/ - File* Lookup(const string& file_id) const; + File* LookupFile(const string& file_id) const; /** * Evaluate timeout policy for a file and remove the File object mapped to @@ -287,20 +284,10 @@ protected: static bool IsDisabled(analyzer::Tag tag); private: - typedef map analyzer_map_by_name; - typedef map analyzer_map_by_tag; - typedef map analyzer_map_by_val; - - void RegisterAnalyzerComponent(Component* component); IDMap id_map; /**< Map file ID to file_analysis::File records. */ IDSet ignored; /**< Ignored files. Will be finally removed on EOF. */ string current_file_id; /**< Hash of what get_file_handle event sets. */ - EnumType* tag_enum_type; /**< File analyzer tag type. */ - - analyzer_map_by_name analyzers_by_name; - analyzer_map_by_tag analyzers_by_tag; - analyzer_map_by_val analyzers_by_val; static TableVal* disabled; /**< Table of disabled analyzers. */ static string salt; /**< A salt added to file handles before hashing. */ diff --git a/src/file_analysis/Tag.cc b/src/file_analysis/Tag.cc new file mode 100644 index 0000000000..6f0774a4b4 --- /dev/null +++ b/src/file_analysis/Tag.cc @@ -0,0 +1,24 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "Tag.h" +#include "Manager.h" + +using namespace file_analysis; + +file_analysis::Tag file_analysis::Tag::Error; + +file_analysis::Tag::Tag(type_t type, subtype_t subtype) + : ::Tag(file_mgr->GetTagEnumType(), type, subtype) + { + } + +file_analysis::Tag& file_analysis::Tag::operator=(const file_analysis::Tag& other) + { + ::Tag::operator=(other); + return *this; + } + +EnumVal* file_analysis::Tag::AsEnumVal() const + { + return ::Tag::AsEnumVal(file_mgr->GetTagEnumType()); + } diff --git a/src/file_analysis/Tag.h b/src/file_analysis/Tag.h new file mode 100644 index 0000000000..aa38836403 --- /dev/null +++ b/src/file_analysis/Tag.h @@ -0,0 +1,116 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#ifndef FILE_ANALYZER_TAG_H +#define FILE_ANALYZER_TAG_H + +#include "config.h" +#include "util.h" +#include "../Tag.h" +#include "plugin/TaggedComponent.h" +#include "plugin/ComponentManager.h" + +class EnumVal; + +namespace file_analysis { + +class Component; + +/** + * Class to identify a file analyzer type. + * + * The script-layer analogue is Files::Tag. + */ +class Tag : public ::Tag { +public: + /* + * Copy constructor. + */ + Tag(const Tag& other) : ::Tag(other) {} + + /** + * Default constructor. This initializes the tag with an error value + * that will make \c operator \c bool return false. + */ + Tag() : ::Tag() {} + + /** + * Destructor. + */ + ~Tag() {} + + /** + * Returns false if the tag represents an error value rather than a + * legal analyzer type. + * TODO: make this conversion operator "explicit" (C++11) or use a + * "safe bool" idiom (not necessary if "explicit" is available), + * otherwise this may allow nonsense/undesired comparison operations. + * + */ + operator bool() const { return *this != Tag(); } + + /** + * Assignment operator. + */ + Tag& operator=(const Tag& other); + + /** + * Compares two tags for equality. + */ + bool operator==(const Tag& other) const + { + return ::Tag::operator==(other); + } + + /** + * Compares two tags for inequality. + */ + bool operator!=(const Tag& other) const + { + return ::Tag::operator!=(other); + } + + /** + * Compares two tags for less-than relationship. + */ + bool operator<(const Tag& other) const + { + return ::Tag::operator<(other); + } + + /** + * Returns the \c Files::Tag enum that corresponds to this tag. + * The returned value does not have its ref-count increased. + * + * @param etype the script-layer enum type associated with the tag. + */ + EnumVal* AsEnumVal() const; + + static Tag Error; + +protected: + friend class plugin::ComponentManager; + friend class plugin::TaggedComponent; + + /** + * Constructor. + * + * @param type The main type. 
Note that the \a file_analysis::Manager + * manages the value space internally, so noone else should assign + * main types. + * + * @param subtype The sub type, which is left to an analyzer for + * interpretation. By default it's set to zero. + */ + Tag(type_t type, subtype_t subtype = 0); + + /** + * Constructor. + * + * @param val An enum value of script type \c Files::Tag. + */ + Tag(EnumVal* val) : ::Tag(val) {} +}; + +} + +#endif diff --git a/src/file_analysis/analyzer/data_event/CMakeLists.txt b/src/file_analysis/analyzer/data_event/CMakeLists.txt index 81551feda2..49e23d49a0 100644 --- a/src/file_analysis/analyzer/data_event/CMakeLists.txt +++ b/src/file_analysis/analyzer/data_event/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) bro_plugin_begin(Bro FileDataEvent) -bro_plugin_cc(DataEvent.cc Plugin.cc) +bro_plugin_cc(DataEvent.cc Plugin.cc ../../Analyzer.cc) bro_plugin_end() diff --git a/src/file_analysis/analyzer/data_event/DataEvent.cc b/src/file_analysis/analyzer/data_event/DataEvent.cc index 1b04111c44..cf2d7e52ec 100644 --- a/src/file_analysis/analyzer/data_event/DataEvent.cc +++ b/src/file_analysis/analyzer/data_event/DataEvent.cc @@ -6,12 +6,15 @@ #include "EventRegistry.h" #include "Event.h" #include "util.h" +#include "file_analysis/Manager.h" using namespace file_analysis; DataEvent::DataEvent(RecordVal* args, File* file, EventHandlerPtr ce, EventHandlerPtr se) - : file_analysis::Analyzer(args, file), chunk_event(ce), stream_event(se) + : file_analysis::Analyzer(file_mgr->GetComponentTag("DATA_EVENT"), + args, file), + chunk_event(ce), stream_event(se) { } diff --git a/src/file_analysis/analyzer/extract/CMakeLists.txt b/src/file_analysis/analyzer/extract/CMakeLists.txt index df3fa2646d..e413196db2 100644 --- a/src/file_analysis/analyzer/extract/CMakeLists.txt +++ b/src/file_analysis/analyzer/extract/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(BEFORE 
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) bro_plugin_begin(Bro FileExtract) -bro_plugin_cc(Extract.cc Plugin.cc) +bro_plugin_cc(Extract.cc Plugin.cc ../../Analyzer.cc) bro_plugin_end() diff --git a/src/file_analysis/analyzer/extract/Extract.cc b/src/file_analysis/analyzer/extract/Extract.cc index ef37425003..28b5cf5a63 100644 --- a/src/file_analysis/analyzer/extract/Extract.cc +++ b/src/file_analysis/analyzer/extract/Extract.cc @@ -4,11 +4,13 @@ #include "Extract.h" #include "util.h" +#include "file_analysis/Manager.h" using namespace file_analysis; Extract::Extract(RecordVal* args, File* file, const string& arg_filename) - : file_analysis::Analyzer(args, file), filename(arg_filename) + : file_analysis::Analyzer(file_mgr->GetComponentTag("EXTRACT"), args, file), + filename(arg_filename) { fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0666); diff --git a/src/file_analysis/analyzer/hash/CMakeLists.txt b/src/file_analysis/analyzer/hash/CMakeLists.txt index 5734740198..0e3143ee05 100644 --- a/src/file_analysis/analyzer/hash/CMakeLists.txt +++ b/src/file_analysis/analyzer/hash/CMakeLists.txt @@ -4,6 +4,6 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) bro_plugin_begin(Bro FileHash) -bro_plugin_cc(Hash.cc Plugin.cc) +bro_plugin_cc(Hash.cc Plugin.cc ../../Analyzer.cc) bro_plugin_bif(events.bif) bro_plugin_end() diff --git a/src/file_analysis/analyzer/hash/Hash.cc b/src/file_analysis/analyzer/hash/Hash.cc index 9835f343b6..9829934301 100644 --- a/src/file_analysis/analyzer/hash/Hash.cc +++ b/src/file_analysis/analyzer/hash/Hash.cc @@ -5,11 +5,12 @@ #include "Hash.h" #include "util.h" #include "Event.h" +#include "file_analysis/Manager.h" using namespace file_analysis; Hash::Hash(RecordVal* args, File* file, HashVal* hv, const char* arg_kind) - : file_analysis::Analyzer(args, file), hash(hv), fed(false), kind(arg_kind) + : file_analysis::Analyzer(file_mgr->GetComponentTag(to_upper(arg_kind).c_str()), args, 
file), hash(hv), fed(false), kind(arg_kind) { hash->Init(); } diff --git a/src/file_analysis/file_analysis.bif b/src/file_analysis/file_analysis.bif index b6c80ac800..0e904f298f 100644 --- a/src/file_analysis/file_analysis.bif +++ b/src/file_analysis/file_analysis.bif @@ -16,21 +16,23 @@ function Files::__set_timeout_interval%(file_id: string, t: interval%): bool %} ## :bro:see:`Files::add_analyzer`. -function Files::__add_analyzer%(file_id: string, args: any%): bool +function Files::__add_analyzer%(file_id: string, tag: Files::Tag, args: any%): bool %{ using BifType::Record::Files::AnalyzerArgs; RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs); - bool result = file_mgr->AddAnalyzer(file_id->CheckString(), rv); + bool result = file_mgr->AddAnalyzer(file_id->CheckString(), + file_mgr->GetComponentTag(tag), rv); Unref(rv); return new Val(result, TYPE_BOOL); %} ## :bro:see:`Files::remove_analyzer`. -function Files::__remove_analyzer%(file_id: string, args: any%): bool +function Files::__remove_analyzer%(file_id: string, tag: Files::Tag, args: any%): bool %{ using BifType::Record::Files::AnalyzerArgs; RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs); - bool result = file_mgr->RemoveAnalyzer(file_id->CheckString(), rv); + bool result = file_mgr->RemoveAnalyzer(file_id->CheckString(), + file_mgr->GetComponentTag(tag) , rv); Unref(rv); return new Val(result, TYPE_BOOL); %} @@ -45,7 +47,7 @@ function Files::__stop%(file_id: string%): bool ## :bro:see:`Files::analyzer_name`. 
function Files::__analyzer_name%(tag: Files::Tag%) : string %{ - return new StringVal(file_mgr->GetAnalyzerName(tag->InternalInt())); + return new StringVal(file_mgr->GetComponentName(tag)); %} module GLOBAL; diff --git a/src/main.cc b/src/main.cc index 56193a935b..6a58832964 100644 --- a/src/main.cc +++ b/src/main.cc @@ -872,6 +872,7 @@ int main(int argc, char** argv) if ( generate_documentation ) { CreateProtoAnalyzerDoc("proto-analyzers.rst"); + CreateFileAnalyzerDoc("file-analyzers.rst"); std::list::iterator it; diff --git a/src/plugin/ComponentManager.h b/src/plugin/ComponentManager.h new file mode 100644 index 0000000000..16f9d80743 --- /dev/null +++ b/src/plugin/ComponentManager.h @@ -0,0 +1,248 @@ +#ifndef PLUGIN_COMPONENT_MANAGER_H +#define PLUGIN_COMPONENT_MANAGER_H + +#include +#include +#include + +#include "Type.h" +#include "ID.h" +#include "Var.h" +#include "Val.h" +#include "Reporter.h" + +namespace plugin { + +/** + * A class that manages tracking of plugin components (e.g. analyzers) and + * installs identifiers in the script-layer to identify them by a unique tag, + * (a script-layer enum value). + * + * @tparam T A ::Tag type or derivative. + * @tparam C A plugin::TaggedComponent type derivative. + */ +template +class ComponentManager { +public: + + /** + * Constructor creates a new enum type called a "Tag" to associate with + * a component. + * + * @param module The script-layer module in which to install the "Tag" ID + * representing an enum type. + */ + ComponentManager(const string& module); + + /** + * @return The script-layer module in which the component's "Tag" ID lives. + */ + const char* GetModule() const; + + /** + * @return A list of all registered components. + */ + list GetComponents() const; + + /** + * @return The enum type associated with the script-layer "Tag". + */ + EnumType* GetTagEnumType() const; + + /** + * Get a component name from its tag. + * + * @param tag A component's tag. + * @return The canonical component name. 
+ */ + const char* GetComponentName(T tag) const; + + /** + * Get a component name from it's enum value. + * + * @param val A component's enum value. + * @return The canonical component name. + */ + const char* GetComponentName(Val* val) const; + + /** + * Get a component tag from its name. + * + * @param name A component's canonical name. + * @return The component's tag, or a tag representing an error if + * no such component assoicated with the name exists. + */ + T GetComponentTag(const string& name) const; + + /** + * Get a component tag from its enum value. + * + * @param v A component's enum value. + * @return The component's tag, or a tag representing an error if + * no such component assoicated with the value exists. + */ + T GetComponentTag(Val* v) const; + +protected: + + /** + * Add a component the internal maps used to keep track of it and create + * a script-layer ID for the component's enum value. + * + * @param component A component to track. + * @param prefix The script-layer ID associated with the component's enum + * value will be a concatenation of this prefix and the component's + * canonical name. + */ + void RegisterComponent(C* component, const string& prefix = ""); + + /** + * @param name The canonical name of a component. + * @return The component associated with the name or a null pointer if no + * such component exists. + */ + C* Lookup(const string& name) const; + + /** + * @param name A component tag. + * @return The component associated with the tag or a null pointer if no + * such component exists. + */ + C* Lookup(const T& tag) const; + + /** + * @param name A component's enum value. + * @return The component associated with the value or a null pointer if no + * such component exists. + */ + C* Lookup(EnumVal* val) const; + +private: + + string module; /**< Script layer module in which component tags live. */ + EnumType* tag_enum_type; /**< Enum type of component tags. 
*/ + map components_by_name; + map components_by_tag; + map components_by_val; +}; + +template +ComponentManager::ComponentManager(const string& arg_module) + : module(arg_module) + { + tag_enum_type = new EnumType(module + "::Tag"); + ::ID* id = install_ID("Tag", module.c_str(), true, true); + add_type(id, tag_enum_type, 0, 0); + } + +template +const char* ComponentManager::GetModule() const + { + return module.c_str(); + } + +template +list ComponentManager::GetComponents() const + { + list rval; + typename map::const_iterator i; + + for ( i = components_by_tag.begin(); i != components_by_tag.end(); ++i ) + rval.push_back(i->second); + + return rval; + } + +template +EnumType* ComponentManager::GetTagEnumType() const + { + return tag_enum_type; + } + +template +const char* ComponentManager::GetComponentName(T tag) const + { + static const char* error = ""; + + if ( ! tag ) + return error; + + C* c = Lookup(tag); + + if ( ! c ) + reporter->InternalError("request for name of unknown component tag %s", + tag.AsString().c_str()); + + return c->CanonicalName(); + } + +template +const char* ComponentManager::GetComponentName(Val* val) const + { + return GetComponentName(T(val->AsEnumVal())); + } + +template +T ComponentManager::GetComponentTag(const string& name) const + { + C* c = Lookup(name); + return c ? c->Tag() : T(); + } + +template +T ComponentManager::GetComponentTag(Val* v) const + { + C* c = Lookup(v->AsEnumVal()); + return c ? c->Tag() : T(); + } + +template +C* ComponentManager::Lookup(const string& name) const + { + typename map::const_iterator i = + components_by_name.find(to_upper(name)); + return i != components_by_name.end() ? i->second : 0; + } + +template +C* ComponentManager::Lookup(const T& tag) const + { + typename map::const_iterator i = components_by_tag.find(tag); + return i != components_by_tag.end() ? 
i->second : 0; + } + +template +C* ComponentManager::Lookup(EnumVal* val) const + { + typename map::const_iterator i = + components_by_val.find(val->InternalInt()); + return i != components_by_val.end() ? i->second : 0; + } + +template +void ComponentManager::RegisterComponent(C* component, + const string& prefix) + { + const char* cname = component->CanonicalName(); + + if ( Lookup(cname) ) + reporter->FatalError("Component '%s::%s' defined more than once", + module.c_str(), cname); + + DBG_LOG(DBG_PLUGINS, "Registering component %s (tag %s)", + component->Name(), component->Tag().AsString().c_str()); + + components_by_name.insert(std::make_pair(cname, component)); + components_by_tag.insert(std::make_pair(component->Tag(), component)); + components_by_val.insert(std::make_pair( + component->Tag().AsEnumVal()->InternalInt(), component)); + + // Install an identfier for enum value + string id = fmt("%s%s", prefix.c_str(), cname); + tag_enum_type->AddName(module, id.c_str(), + component->Tag().AsEnumVal()->InternalInt(), true); + } + +} // namespace plugin + +#endif diff --git a/src/plugin/TaggedComponent.h b/src/plugin/TaggedComponent.h new file mode 100644 index 0000000000..99eab9f230 --- /dev/null +++ b/src/plugin/TaggedComponent.h @@ -0,0 +1,85 @@ +#ifndef PLUGIN_TAGGED_COMPONENT_H +#define PLUGIN_TAGGED_COMPONENT_H + +namespace plugin { + +/** + * A class which has a tag of a given type associated with it. + * + * @tparam T A ::Tag type or derivative. + */ +template +class TaggedComponent { +public: + + /** + * Constructor creates a unique tag value for this component. + * + * @param subtype A subtype associated with this component that + * further distinguishes it. The subtype will be integrated into + * the Tag that the manager associates with this component, + * and component instances can accordingly access it via Tag(). + * If not used, leave at zero. + */ + TaggedComponent(typename T::subtype_t subtype = 0); + + /** + * Copy constructor. 
+ * + * @param other Another component from which to copy its tag value. + */ + TaggedComponent(const TaggedComponent& other); + + /** + * Assignment operator. + * + * @param other A component to assign. + * @return The assigned object. + */ + TaggedComponent& operator=(const TaggedComponent& other); + + /** + * @return The component's tag. + */ + T Tag() const; + +private: + + T tag; /**< The automatically assigned analyzer tag. */ + static typename T::type_t type_counter; /**< Used to generate globally + unique tags. */ +}; + +template +TaggedComponent::TaggedComponent(typename T::subtype_t subtype) + { + tag = T(++type_counter, subtype); + } + +template +TaggedComponent::TaggedComponent(const TaggedComponent& other) + { + tag = other.tag; + } + +template +TaggedComponent& +TaggedComponent::operator =(const TaggedComponent& other) + { + if ( &other != this ) + tag = other.tag; + + return *this; + } + +template +T TaggedComponent::Tag() const + { + return tag; + } + +template typename T::type_t TaggedComponent::type_counter(0); + +} // namespace plugin + +#endif diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc index 6e642e62c1..e8c2b2f80e 100644 --- a/src/probabilistic/BitVector.cc +++ b/src/probabilistic/BitVector.cc @@ -1,10 +1,12 @@ // See the file "COPYING" in the main distribution directory for copyright. -#include "BitVector.h" - +#include #include #include + +#include "BitVector.h" #include "Serializer.h" +#include "digest.h" using namespace probabilistic; @@ -490,6 +492,21 @@ BitVector::size_type BitVector::FindNext(size_type i) const return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); } +size_t BitVector::Hash() const + { + size_t hash = 0; + + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + for ( size_type i = 0; i < Blocks(); ++i ) + sha256_update(&ctx, &bits[i], sizeof(bits[i])); + + sha256_final(&ctx, buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. 
+ } + BitVector::size_type BitVector::lowest_bit(block_type block) { block_type x = block - (block & (block - 1)); diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h index d9c55d53c6..8e24336345 100644 --- a/src/probabilistic/BitVector.h +++ b/src/probabilistic/BitVector.h @@ -276,6 +276,13 @@ public: */ size_type FindNext(size_type i) const; + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. * diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc index db768ed934..bcab6c9b54 100644 --- a/src/probabilistic/BloomFilter.cc +++ b/src/probabilistic/BloomFilter.cc @@ -9,6 +9,8 @@ #include "CounterVector.h" #include "Serializer.h" +#include "../util.h" + using namespace probabilistic; BloomFilter::BloomFilter() @@ -40,28 +42,15 @@ bool BloomFilter::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); - if ( ! SERIALIZE(static_cast(hasher->K())) ) - return false; - - return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size()); + return hasher->Serialize(info); } bool BloomFilter::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(SerialObj); - uint16 k; - if ( ! UNSERIALIZE(&k) ) - return false; - - const char* name; - if ( ! 
UNSERIALIZE_STR(&name, 0) ) - return false; - - hasher = Hasher::Create(k, name); - - delete [] name; - return true; + hasher = Hasher::Unserialize(info); + return hasher != 0; } size_t BasicBloomFilter::M(double fp, size_t capacity) @@ -120,6 +109,11 @@ BasicBloomFilter* BasicBloomFilter::Clone() const return copy; } +std::string BasicBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)bits->Hash()); + } + BasicBloomFilter::BasicBloomFilter() { bits = 0; @@ -146,14 +140,18 @@ bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) return (bits != 0); } -void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) +void BasicBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) bits->Set(h[i] % bits->Size()); } -size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t BasicBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) { if ( ! (*bits)[h[i] % bits->Size()] ) @@ -219,6 +217,11 @@ CountingBloomFilter* CountingBloomFilter::Clone() const return copy; } +string CountingBloomFilter::InternalState() const + { + return fmt("%" PRIu64, (uint64_t)cells->Hash()); + } + IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) bool CountingBloomFilter::DoSerialize(SerialInfo* info) const @@ -235,14 +238,18 @@ bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) } // TODO: Use partitioning in add/count to allow for reusing CMS bounds. 
-void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) +void CountingBloomFilter::Add(const HashKey* key) { + Hasher::digest_vector h = hasher->Hash(key); + for ( size_t i = 0; i < h.size(); ++i ) cells->Increment(h[i] % cells->Size()); } -size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const +size_t CountingBloomFilter::Count(const HashKey* key) const { + Hasher::digest_vector h = hasher->Hash(key); + CounterVector::size_type min = std::numeric_limits::max(); diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h index b6cf18672f..65dda2396d 100644 --- a/src/probabilistic/BloomFilter.h +++ b/src/probabilistic/BloomFilter.h @@ -13,9 +13,6 @@ class CounterVector; /** * The abstract base class for Bloom filters. - * - * At this point we won't let the user choose the hasher, but we might open - * up the interface in the future. */ class BloomFilter : public SerialObj { public: @@ -25,27 +22,20 @@ public: virtual ~BloomFilter(); /** - * Adds an element of type T to the Bloom filter. - * @param x The element to add + * Adds an element to the Bloom filter. + * + * @param key The key associated with the element to add. */ - template - void Add(const T& x) - { - AddImpl((*hasher)(x)); - } + virtual void Add(const HashKey* key) = 0; /** * Retrieves the associated count of a given value. * - * @param x The value of type `T` to check. + * @param key The key associated with the element to check. * - * @return The counter associated with *x*. + * @return The counter associated with *key*. */ - template - size_t Count(const T& x) const - { - return CountImpl((*hasher)(x)); - } + virtual size_t Count(const HashKey* key) const = 0; /** * Checks whether the Bloom filter is empty. @@ -75,6 +65,12 @@ public: */ virtual BloomFilter* Clone() const = 0; + /** + * Returns a string with a representation of the Bloom filter's + * internal state. This is for debugging/testing purposes only. 
+ */ + virtual string InternalState() const = 0; + /** * Serializes the Bloom filter. * @@ -109,25 +105,6 @@ protected: */ BloomFilter(const Hasher* hasher); - /** - * Abstract method for implementinng the *Add* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - */ - virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; - - /** - * Abstract method for implementing the *Count* operation. - * - * @param hashes A set of *k* hashes for the item to add, computed by - * the internal hasher object. - * - * @return Returns the counter associated with the hashed element. - */ - virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; - const Hasher* hasher; }; @@ -180,6 +157,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual BasicBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(BasicBloomFilter); @@ -190,8 +168,8 @@ protected: BasicBloomFilter(); // Overridden from BloomFilter. - virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: BitVector* bits; @@ -219,6 +197,7 @@ public: virtual void Clear(); virtual bool Merge(const BloomFilter* other); virtual CountingBloomFilter* Clone() const; + virtual string InternalState() const; protected: DECLARE_SERIAL(CountingBloomFilter); @@ -229,8 +208,8 @@ protected: CountingBloomFilter(); // Overridden from BloomFilter. 
- virtual void AddImpl(const Hasher::digest_vector& h); - virtual size_t CountImpl(const Hasher::digest_vector& h) const; + virtual void Add(const HashKey* key); + virtual size_t Count(const HashKey* key) const; private: CounterVector* cells; diff --git a/src/probabilistic/CMakeLists.txt b/src/probabilistic/CMakeLists.txt index af062b24ae..a36dfbbd6b 100644 --- a/src/probabilistic/CMakeLists.txt +++ b/src/probabilistic/CMakeLists.txt @@ -10,9 +10,11 @@ set(probabilistic_SRCS BitVector.cc BloomFilter.cc CounterVector.cc - Hasher.cc) + Hasher.cc + Topk.cc) bif_target(bloom-filter.bif) +bif_target(top-k.bif) bro_add_subdir_library(probabilistic ${probabilistic_SRCS}) add_dependencies(bro_probabilistic generate_outputs) diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc index d5635fc0f2..8a6feae5fd 100644 --- a/src/probabilistic/CounterVector.cc +++ b/src/probabilistic/CounterVector.cc @@ -153,6 +153,11 @@ CounterVector operator|(const CounterVector& x, const CounterVector& y) } +size_t CounterVector::Hash() const + { + return bits->Hash(); + } + bool CounterVector::Serialize(SerialInfo* info) const { return SerialObj::Serialize(info); diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h index df6fc57ac2..9ce522d61c 100644 --- a/src/probabilistic/CounterVector.h +++ b/src/probabilistic/CounterVector.h @@ -126,6 +126,13 @@ public: */ CounterVector& operator|=(const CounterVector& other); + /** Computes a hash value of the internal representation. + * This is mainly for debugging/testing purposes. + * + * @return The hash. + */ + size_t Hash() const; + /** * Serializes the bit vector. * diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc index f9ce7bdd6b..f5b1f4f5f7 100644 --- a/src/probabilistic/Hasher.cc +++ b/src/probabilistic/Hasher.cc @@ -1,61 +1,119 @@ // See the file "COPYING" in the main distribution directory for copyright. 
#include +#include #include "Hasher.h" +#include "NetVar.h" #include "digest.h" +#include "Serializer.h" using namespace probabilistic; -UHF::UHF(size_t seed, const std::string& extra) - : h(compute_seed(seed, extra)) - { - } - -Hasher::digest UHF::hash(const void* x, size_t n) const - { - assert(n <= UHASH_KEY_SIZE); - return n == 0 ? 0 : h(x, n); - } - -size_t UHF::compute_seed(size_t seed, const std::string& extra) +size_t Hasher::MakeSeed(const void* data, size_t size) { u_char buf[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; sha256_init(&ctx); - if ( extra.empty() ) + if ( data ) + sha256_update(&ctx, data, size); + + else if ( global_hash_seed && global_hash_seed->Len() > 0 ) + sha256_update(&ctx, global_hash_seed->Bytes(), global_hash_seed->Len()); + + else { unsigned int first_seed = initial_seed(); sha256_update(&ctx, &first_seed, sizeof(first_seed)); } - else - sha256_update(&ctx, extra.c_str(), extra.size()); - - sha256_update(&ctx, &seed, sizeof(seed)); sha256_final(&ctx, buf); - - // Take the first sizeof(size_t) bytes as seed. - return *reinterpret_cast(buf); + return *reinterpret_cast(buf); // Use the first bytes as seed. } -Hasher* Hasher::Create(size_t k, const std::string& name) +Hasher::digest_vector Hasher::Hash(const HashKey* key) const { - return new DefaultHasher(k, name); + return Hash(key->Key(), key->Size()); } -Hasher::Hasher(size_t k, const std::string& arg_name) - : k(k) +bool Hasher::Serialize(SerialInfo* info) const { - name = arg_name; + return SerialObj::Serialize(info); } -DefaultHasher::DefaultHasher(size_t k, const std::string& name) - : Hasher(k, name) +Hasher* Hasher::Unserialize(UnserialInfo* info) { - for ( size_t i = 0; i < k; ++i ) - hash_functions.push_back(UHF(i, name)); + return reinterpret_cast(SerialObj::Unserialize(info, SER_HASHER)); + } + +bool Hasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_HASHER, SerialObj); + + if ( ! 
SERIALIZE(static_cast(k)) ) + return false; + + return SERIALIZE(static_cast(seed)); + } + +bool Hasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint16 serial_k; + if ( ! UNSERIALIZE(&serial_k) ) + return false; + + k = serial_k; + assert(k > 0); + + uint64 serial_seed; + if ( ! UNSERIALIZE(&serial_seed) ) + return false; + + seed = serial_seed; + + return true; + } + +Hasher::Hasher(size_t arg_k, size_t arg_seed) + { + k = arg_k; + seed = arg_seed; + } + +UHF::UHF(size_t arg_seed) + : h(arg_seed) + { + seed = arg_seed; + } + +// This function is almost equivalent to HashKey::HashBytes except that it +// does not depend on global state and that we mix in the seed multiple +// times. +Hasher::digest UHF::hash(const void* x, size_t n) const + { + if ( n <= UHASH_KEY_SIZE ) + return n == 0 ? 0 : h(x, n); + + unsigned char d[16]; + MD5(reinterpret_cast(x), n, d); + + const unsigned char* s = reinterpret_cast(&seed); + for ( size_t i = 0; i < 16; ++i ) + d[i] ^= s[i % sizeof(seed)]; + + MD5(d, 16, d); + + return d[0]; + } + +DefaultHasher::DefaultHasher(size_t k, size_t seed) + : Hasher(k, seed) + { + for ( size_t i = 1; i <= k; ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); } Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const @@ -82,8 +140,29 @@ bool DefaultHasher::Equals(const Hasher* other) const return hash_functions == o->hash_functions; } -DoubleHasher::DoubleHasher(size_t k, const std::string& name) - : Hasher(k, name), h1(1, name), h2(2, name) +IMPLEMENT_SERIAL(DefaultHasher, SER_DEFAULTHASHER) + +bool DefaultHasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_DEFAULTHASHER, Hasher); + + // Nothing to do here, the base class has all we need serialized already. 
+ return true; + } + +bool DefaultHasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(Hasher); + + hash_functions.clear(); + for ( size_t i = 0; i < K(); ++i ) + hash_functions.push_back(UHF(Seed() + bro_prng(i))); + + return true; + } + +DoubleHasher::DoubleHasher(size_t k, size_t seed) + : Hasher(k, seed), h1(seed + bro_prng(1)), h2(seed + bro_prng(2)) { } @@ -112,3 +191,23 @@ bool DoubleHasher::Equals(const Hasher* other) const const DoubleHasher* o = static_cast(other); return h1 == o->h1 && h2 == o->h2; } + +IMPLEMENT_SERIAL(DoubleHasher, SER_DOUBLEHASHER) + +bool DoubleHasher::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_DOUBLEHASHER, Hasher); + + // Nothing to do here, the base class has all we need serialized already. + return true; + } + +bool DoubleHasher::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(Hasher); + + h1 = UHF(Seed() + bro_prng(1)); + h2 = UHF(Seed() + bro_prng(2)); + + return true; + } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h index 9f7d4ae32d..a3322f5e37 100644 --- a/src/probabilistic/Hasher.h +++ b/src/probabilistic/Hasher.h @@ -5,6 +5,7 @@ #include "Hash.h" #include "H3.h" +#include "SerialObj.h" namespace probabilistic { @@ -12,11 +13,25 @@ namespace probabilistic { * Abstract base class for hashers. A hasher creates a family of hash * functions to hash an element *k* times. */ -class Hasher { +class Hasher : public SerialObj { public: typedef hash_t digest; typedef std::vector digest_vector; + /** + * Creates a valid hasher seed from an arbitrary string. + * + * @param data A pointer to contiguous data that should be crunched into a + * seed. If 0, the function tries to find a global_hash_seed script variable + * to derive a seed from. If this variable does not exist, the function uses + * the initial seed generated at Bro startup. + * + * @param size The number of bytes of *data*. + * + * @return A seed suitable for hashers. 
+ */ + static size_t MakeSeed(const void* data, size_t size); + /** * Destructor. */ @@ -35,6 +50,15 @@ public: return Hash(&x, sizeof(T)); } + /** + * Computes hash values for an element. + * + * @param x The key of the value to hash. + * + * @return Vector of *k* hash values. + */ + digest_vector Hash(const HashKey* key) const; + /** * Computes the hashes for a set of bytes. * @@ -63,38 +87,30 @@ public: size_t K() const { return k; } /** - * Returns the hasher's name. TODO: What's this? + * Returns the seed used to construct the hasher. */ - const std::string& Name() const { return name; } + size_t Seed() const { return seed; } - /** - * Constructs the hasher used by the implementation. This hardcodes a - * specific hashing policy. It exists only because the HashingPolicy - * class hierachy is not yet serializable. - * - * @param k The number of hash functions to apply. - * - * @param name The hasher's name. Hashers with the same name should - * provide consistent results. - * - * @return Returns a new hasher instance. - */ - static Hasher* Create(size_t k, const std::string& name); + bool Serialize(SerialInfo* info) const; + static Hasher* Unserialize(UnserialInfo* info); protected: + DECLARE_ABSTRACT_SERIAL(Hasher); + + Hasher() { } + /** * Constructor. * - * @param k the number of hash functions. + * @param arg_k the number of hash functions. * - * @param name A name for the hasher. Hashers with the same name - * should provide consistent results. + * @param arg_seed The seed for the hasher. */ - Hasher(size_t k, const std::string& name); + Hasher(size_t arg_k, size_t arg_seed); private: - const size_t k; - std::string name; + size_t k; + size_t seed; }; /** @@ -107,13 +123,9 @@ public: * Constructs an H3 hash function seeded with a given seed and an * optional extra seed to replace the initial Bro seed. * - * @param seed The seed to use for this instance. 
- * - * @param extra If not empty, this parameter replaces the initial - * seed to compute the seed for t to compute the seed NUL-terminated - * string as additional seed. + * @param arg_seed The seed to use for this instance. */ - UHF(size_t seed, const std::string& extra = ""); + UHF(size_t arg_seed = 0); template Hasher::digest operator()(const T& x) const @@ -156,9 +168,10 @@ public: } private: - static size_t compute_seed(size_t seed, const std::string& extra); + static size_t compute_seed(size_t seed); H3 h; + size_t seed; }; @@ -173,16 +186,20 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DefaultHasher(size_t k, const std::string& name); + DefaultHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; virtual DefaultHasher* Clone() const /* final */; virtual bool Equals(const Hasher* other) const /* final */; + DECLARE_SERIAL(DefaultHasher); + private: + DefaultHasher() { } + std::vector hash_functions; }; @@ -197,16 +214,20 @@ public: * * @param k The number of hash functions to use. * - * @param name The name of the hasher. + * @param seed The seed for the hasher. */ - DoubleHasher(size_t k, const std::string& name); + DoubleHasher(size_t k, size_t seed); // Overridden from Hasher. virtual digest_vector Hash(const void* x, size_t n) const /* final */; virtual DoubleHasher* Clone() const /* final */; virtual bool Equals(const Hasher* other) const /* final */; + DECLARE_SERIAL(DoubleHasher); + private: + DoubleHasher() { } + UHF h1; UHF h2; }; diff --git a/src/probabilistic/Topk.cc b/src/probabilistic/Topk.cc new file mode 100644 index 0000000000..95d0ac732e --- /dev/null +++ b/src/probabilistic/Topk.cc @@ -0,0 +1,499 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#include "probabilistic/Topk.h" +#include "CompHash.h" +#include "Reporter.h" +#include "Serializer.h" +#include "NetVar.h" + +namespace probabilistic { + +IMPLEMENT_SERIAL(TopkVal, SER_TOPK_VAL); + +static void topk_element_hash_delete_func(void* val) + { + Element* e = (Element*) val; + delete e; + } + +Element::~Element() + { + Unref(value); + } + +void TopkVal::Typify(BroType* t) + { + assert(!hash && !type); + type = t->Ref(); + TypeList* tl = new TypeList(t); + tl->Append(t->Ref()); + hash = new CompositeHash(tl); + Unref(tl); + } + +HashKey* TopkVal::GetHash(Val* v) const + { + HashKey* key = hash->ComputeHash(v, 1); + assert(key); + return key; + } + +TopkVal::TopkVal(uint64 arg_size) : OpaqueVal(topk_type) + { + elementDict = new PDict(Element); + elementDict->SetDeleteFunc(topk_element_hash_delete_func); + size = arg_size; + type = 0; + numElements = 0; + pruned = false; + hash = 0; + } + +TopkVal::TopkVal() : OpaqueVal(topk_type) + { + elementDict = new PDict(Element); + elementDict->SetDeleteFunc(topk_element_hash_delete_func); + size = 0; + type = 0; + numElements = 0; + hash = 0; + } + +TopkVal::~TopkVal() + { + elementDict->Clear(); + delete elementDict; + + // now all elements are already gone - delete the buckets + std::list::iterator bi = buckets.begin(); + while ( bi != buckets.end() ) + { + delete *bi; + bi++; + } + + Unref(type); + delete hash; + } + +void TopkVal::Merge(const TopkVal* value, bool doPrune) + { + if ( type == 0 ) + { + assert(numElements == 0); + Typify(value->type); + } + + else + { + if ( ! same_type(type, value->type) ) + { + reporter->Error("Cannot merge top-k elements of differing types."); + return; + } + } + + std::list::const_iterator it = value->buckets.begin(); + while ( it != value->buckets.end() ) + { + Bucket* b = *it; + uint64_t currcount = b->count; + std::list::const_iterator eit = b->elements.begin(); + + while ( eit != b->elements.end() ) + { + Element* e = *eit; + // lookup if we already know this one... 
+ HashKey* key = GetHash(e->value); + Element* olde = (Element*) elementDict->Lookup(key); + + if ( olde == 0 ) + { + olde = new Element(); + olde->epsilon = 0; + olde->value = e->value->Ref(); + // insert at bucket position 0 + if ( buckets.size() > 0 ) + { + assert (buckets.front()-> count > 0 ); + } + + Bucket* newbucket = new Bucket(); + newbucket->count = 0; + newbucket->bucketPos = buckets.insert(buckets.begin(), newbucket); + + olde->parent = newbucket; + newbucket->elements.insert(newbucket->elements.end(), olde); + + elementDict->Insert(key, olde); + numElements++; + + } + + // now that we are sure that the old element is present - increment epsilon + olde->epsilon += e->epsilon; + + // and increment position... + IncrementCounter(olde, currcount); + delete key; + + eit++; + } + + it++; + } + + // now we have added everything. And our top-k table could be too big. + // prune everything... + + assert(size > 0); + + if ( ! doPrune ) + return; + + while ( numElements > size ) + { + pruned = true; + assert(buckets.size() > 0 ); + Bucket* b = buckets.front(); + assert(b->elements.size() > 0); + + Element* e = b->elements.front(); + HashKey* key = GetHash(e->value); + elementDict->RemoveEntry(key); + delete e; + + b->elements.pop_front(); + + if ( b->elements.size() == 0 ) + { + delete b; + buckets.pop_front(); + } + + numElements--; + } + } + +bool TopkVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_TOPK_VAL, OpaqueVal); + + bool v = true; + + v &= SERIALIZE(size); + v &= SERIALIZE(numElements); + v &= SERIALIZE(pruned); + + bool type_present = (type != 0); + v &= SERIALIZE(type_present); + + if ( type_present ) + v &= type->Serialize(info); + else + assert(numElements == 0); + + uint64_t i = 0; + std::list::const_iterator it = buckets.begin(); + while ( it != buckets.end() ) + { + Bucket* b = *it; + uint32_t elements_count = b->elements.size(); + v &= SERIALIZE(elements_count); + v &= SERIALIZE(b->count); + + std::list::const_iterator eit = 
b->elements.begin(); + while ( eit != b->elements.end() ) + { + Element* element = *eit; + v &= SERIALIZE(element->epsilon); + v &= element->value->Serialize(info); + + eit++; + i++; + } + + it++; + } + + assert(i == numElements); + + return v; + } + +bool TopkVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + + bool v = true; + + v &= UNSERIALIZE(&size); + v &= UNSERIALIZE(&numElements); + v &= UNSERIALIZE(&pruned); + + bool type_present = false; + v &= UNSERIALIZE(&type_present); + if ( type_present ) + { + BroType* deserialized_type = BroType::Unserialize(info); + + Typify(deserialized_type); + Unref(deserialized_type); + assert(type); + } + else + assert(numElements == 0); + + uint64_t i = 0; + while ( i < numElements ) + { + Bucket* b = new Bucket(); + uint32_t elements_count; + v &= UNSERIALIZE(&elements_count); + v &= UNSERIALIZE(&b->count); + b->bucketPos = buckets.insert(buckets.end(), b); + + for ( uint64_t j = 0; j < elements_count; j++ ) + { + Element* e = new Element(); + v &= UNSERIALIZE(&e->epsilon); + e->value = Val::Unserialize(info, type); + e->parent = b; + + b->elements.insert(b->elements.end(), e); + + HashKey* key = GetHash(e->value); + assert (elementDict->Lookup(key) == 0); + + elementDict->Insert(key, e); + delete key; + + i++; + } + } + + assert(i == numElements); + + return v; + } + + +VectorVal* TopkVal::GetTopK(int k) const // returns vector + { + if ( numElements == 0 ) + { + reporter->Error("Cannot return topk of empty"); + return 0; + } + + TypeList* vector_index = new TypeList(type); + vector_index->Append(type->Ref()); + VectorType* v = new VectorType(vector_index); + VectorVal* t = new VectorVal(v); + + // this does no estimation if the results is correct! + // in any case - just to make this future-proof (and I am lazy) - this can return more than k. 
+ + int read = 0; + std::list::const_iterator it = buckets.end(); + it--; + while (read < k ) + { + //printf("Bucket %llu\n", (*it)->count); + std::list::iterator eit = (*it)->elements.begin(); + while ( eit != (*it)->elements.end() ) + { + //printf("Size: %ld\n", (*it)->elements.size()); + t->Assign(read, (*eit)->value->Ref()); + read++; + eit++; + } + + if ( it == buckets.begin() ) + break; + + it--; + } + + Unref(v); + return t; + } + +// Returns the count of the bucket holding *value*; reports an error and +// returns 0 if *value* is not currently tracked. +uint64_t TopkVal::GetCount(Val* value) const + { + HashKey* key = GetHash(value); + Element* e = (Element*) elementDict->Lookup(key); + + // The key is only needed for the lookup; release it before any return + // so that the not-found path does not leak it. + delete key; + + if ( e == 0 ) + { + reporter->Error("GetCount for element that is not in top-k"); + return 0; + } + + return e->parent->count; + } + +// Returns the epsilon stored for *value*; reports an error and returns 0 +// if *value* is not currently tracked. +uint64_t TopkVal::GetEpsilon(Val* value) const + { + HashKey* key = GetHash(value); + Element* e = (Element*) elementDict->Lookup(key); + + // The key is only needed for the lookup; release it before any return + // so that the not-found path does not leak it. + delete key; + + if ( e == 0 ) + { + reporter->Error("GetEpsilon for element that is not in top-k"); + return 0; + } + + return e->epsilon; + } + +uint64_t TopkVal::GetSum() const + { + uint64_t sum = 0; + + std::list::const_iterator it = buckets.begin(); + while ( it != buckets.end() ) + { + sum += (*it)->elements.size() * (*it)->count; + + it++; + } + + if ( pruned ) + reporter->Warning("TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count"); + + return sum; + } + +void TopkVal::Encountered(Val* encountered) + { + // ok, let's see if we already know this one. + + if ( numElements == 0 ) + Typify(encountered->Type()); + else + if ( ! same_type(type, encountered->Type()) ) + { + reporter->Error("Trying to add element to topk with differing type from other elements"); + return; + } + + // Step 1 - get the hash. + HashKey* key = GetHash(encountered); + Element* e = (Element*) elementDict->Lookup(key); + + if ( e == 0 ) + { + e = new Element(); + e->epsilon = 0; + e->value = encountered->Ref(); // or no ref? + + // well, we do not know this one yet...
+ if ( numElements < size ) + { + // brilliant. just add it at position 1 + if ( buckets.size() == 0 || (*buckets.begin())->count > 1 ) + { + Bucket* b = new Bucket(); + b->count = 1; + std::list::iterator pos = buckets.insert(buckets.begin(), b); + b->bucketPos = pos; + b->elements.insert(b->elements.end(), e); + e->parent = b; + } + else + { + Bucket* b = *buckets.begin(); + assert(b->count == 1); + b->elements.insert(b->elements.end(), e); + e->parent = b; + } + + elementDict->Insert(key, e); + numElements++; + delete key; + + return; // done. it is at pos 1. + } + + else + { + // replace element with min-value + Bucket* b = *buckets.begin(); // bucket with smallest elements + + // evict oldest element with least hits. + assert(b->elements.size() > 0); + HashKey* deleteKey = GetHash((*(b->elements.begin()))->value); + b->elements.erase(b->elements.begin()); + Element* deleteElement = (Element*) elementDict->RemoveEntry(deleteKey); + assert(deleteElement); // there has to have been a minimal element... + delete deleteElement; + delete deleteKey; + + // and add the new one to the end + e->epsilon = b->count; + b->elements.insert(b->elements.end(), e); + elementDict->Insert(key, e); + e->parent = b; + + // fallthrough, increment operation has to run! + } + + } + + // ok, we now have an element in e + delete key; + IncrementCounter(e); // well, this certainly was anticlimatic. 
+ } + +// increment by count +void TopkVal::IncrementCounter(Element* e, unsigned int count) + { + Bucket* currBucket = e->parent; + uint64 currcount = currBucket->count; + + // well, let's test if there is a bucket for currcount++ + std::list::iterator bucketIter = currBucket->bucketPos; + + Bucket* nextBucket = 0; + + bucketIter++; + + while ( bucketIter != buckets.end() && (*bucketIter)->count < currcount+count ) + bucketIter++; + + if ( bucketIter != buckets.end() && (*bucketIter)->count == currcount+count ) + nextBucket = *bucketIter; + + if ( nextBucket == 0 ) + { + // the bucket for the value that we want does not exist. + // create it... + + Bucket* b = new Bucket(); + b->count = currcount+count; + + std::list::iterator nextBucketPos = buckets.insert(bucketIter, b); + b->bucketPos = nextBucketPos; // and give it the iterator we know now. + + nextBucket = b; + } + + // ok, now we have the new bucket in nextBucket. Shift the element over... + currBucket->elements.remove(e); + nextBucket->elements.insert(nextBucket->elements.end(), e); + + e->parent = nextBucket; + + // if currBucket is empty, we have to delete it now + if ( currBucket->elements.size() == 0 ) + { + buckets.remove(currBucket); + delete currBucket; + currBucket = 0; + } + } + +}; diff --git a/src/probabilistic/Topk.h b/src/probabilistic/Topk.h new file mode 100644 index 0000000000..a9a0d80818 --- /dev/null +++ b/src/probabilistic/Topk.h @@ -0,0 +1,170 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef topk_h +#define topk_h + +#include +#include "Val.h" +#include "CompHash.h" +#include "OpaqueVal.h" + +// This class implements the top-k algorithm. Or - to be more precise - an +// interpretation of it. + +namespace probabilistic { + +struct Element; + +struct Bucket { + uint64 count; + std::list elements; + + // Iterators only get invalidated for removed elements. This one + // points to us - so it is invalid when we are no longer there. 
Cute, + // isn't it? + std::list::iterator bucketPos; +}; + +struct Element { + uint64 epsilon; + Val* value; + Bucket* parent; + + ~Element(); +}; + +declare(PDict, Element); + +class TopkVal : public OpaqueVal { + +public: + /** + * Construct a TopkVal. + * + * @param size specifies how many total elements are tracked + * + * @return A newly initialized TopkVal + */ + TopkVal(uint64 size); + + /** + * Destructor. + */ + ~TopkVal(); + + /** + * Call this when a new value is encountered. Note that on the first + * call, the Bro type of the value types that are counted is set. All + * following calls to encountered have to specify the same type. + * + * @param value The encountered element + */ + void Encountered(Val* value); + + /** + * Get the first *k* elements of the result vector. At the moment, + * this does not check if it is in the right order or if we can prove + * that these are the correct top-k. Use count and epsilon for this. + * + * @param k Number of top-elements to return + * + * @returns The top-k encountered elements + */ + VectorVal* GetTopK(int k) const; + + /** + * Get the current count tracked in the top-k data structure for a + * certain val. Returns 0 if the val is unknown (and logs the error + * to reporter). + * + * @param value Bro value to get counts for + * + * @returns internal count for val, 0 if unknown + */ + uint64_t GetCount(Val* value) const; + + /** + * Get the current epsilon tracked in the top-k data structure for a + * certain val. + * + * @param value Bro value to get epsilons for + * + * @returns the epsilon. Returns 0 if the val is unknown (and logs + * the error to reporter) + */ + uint64_t GetEpsilon(Val* value) const; + + /** + * Get the size set in the constructor + * + * @returns size of the top-k structure + */ + uint64_t GetSize() const { return size; } + + /** + * Get the sum of all counts of all tracked elements. 
This is equal + * to the number of total observations up to this moment, if no + * elements were pruned from the data structure. + * + * @returns sum of all counts + */ + uint64_t GetSum() const; + + /** + * Merge another top-k data structure into this one. doPrune + * specifies if the total count of elements is limited to size after + * merging. Please note, that pruning will invalidate the results of + * getSum. + * + * @param value TopkVal to merge into this TopkVal + * + * @param doPrune prune resulting TopkVal to size after merging + */ + void Merge(const TopkVal* value, bool doPrune=false); + +protected: + /** + * Construct an empty TopkVal. Only used for deserialization + */ + TopkVal(); + +private: + /** + * Increment the counter for a specific element + * + * @param e element to increment counter for + * + * @param count increment counter by this much + */ + void IncrementCounter(Element* e, unsigned int count = 1); + + /** + * get the hashkey for a specific value + * + * @param v value to generate key for + * + * @returns HashKey for value + */ + HashKey* GetHash(Val* v) const; // this probably should go somewhere else. + + /** + * Set the type that this TopK instance tracks + * + * @param t type that is tracked + */ + void Typify(BroType* t); + + BroType* type; + CompositeHash* hash; + std::list buckets; + PDict(Element)* elementDict; + uint64 size; // how many elements are we tracking? + uint64 numElements; // how many elements do we have at the moment + bool pruned; // was this data structure pruned? + + DECLARE_SERIAL(TopkVal); +}; + +}; + +#endif diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif index c6760f6adf..6994f651dd 100644 --- a/src/probabilistic/bloom-filter.bif +++ b/src/probabilistic/bloom-filter.bif @@ -20,23 +20,20 @@ module GLOBAL; ## Creates a basic Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. 
In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## fp: The desired false-positive rate. ## ## capacity: the maximum number of elements that guarantees a false-positive ## rate of *fp*. ## ## name: A name that uniquely identifies and seeds the Bloom filter. If empty, -## the filter will remain tied to the current Bro process. +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . ## ## Returns: A Bloom filter handle. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init2 bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_basic_init%(fp: double, capacity: count, name: string &default=""%): opaque of bloomfilter %{ @@ -48,18 +45,53 @@ function bloomfilter_basic_init%(fp: double, capacity: count, size_t cells = BasicBloomFilter::M(fp, capacity); size_t optimal_k = BasicBloomFilter::K(cells, capacity); - const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(optimal_k, seed); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} + +## Creates a basic Bloom filter. This function serves as a low-level +## alternative to bloomfilter_basic_init where the user has full control over +## the number of hash functions and cells in the underlying bit vector. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying bit vector. 
+## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . +## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloomfilter_basic_init bloomfilter_counting_init bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed +function bloomfilter_basic_init2%(k: count, cells: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( k == 0 ) + { + reporter->Error("number of hash functions must be positive"); + return 0; + } + if ( cells == 0 ) + { + reporter->Error("number of cells must be positive"); + return 0; + } + + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + const Hasher* h = new DefaultHasher(k, seed); return new BloomFilterVal(new BasicBloomFilter(h, cells)); %} ## Creates a counting Bloom filter. ## -## .. note:: A Bloom filter can have a name associated with it. In the future, -## Bloom filters with the same name will be compatible across indepedent Bro -## instances, i.e., it will be possible to merge them. Currently, however, that is -## not yet supported. -## ## k: The number of hash functions to use. ## ## cells: The number of cells of the underlying counter vector. As there's no @@ -71,12 +103,14 @@ function bloomfilter_basic_init%(fp: double, capacity: count, ## becomes a cell of size *w* bits. ## ## name: A name that uniquely identifies and seeds the Bloom filter. If empty, -## the filter will remain tied to the current Bro process. +## the filter will use :bro:id:`global_hash_seed` if that's set, and otherwise use +## a local seed tied to the current Bro process. Only filters with the same seed +## can be merged with :bro:id:`bloomfilter_merge` . +## +## Returns: A Bloom filter handle. +## +## ..
bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 bloomfilter_add +## bloomfilter_lookup bloomfilter_clear bloomfilter_merge global_hash_seed function bloomfilter_counting_init%(k: count, cells: count, max: count, name: string &default=""%): opaque of bloomfilter %{ @@ -86,7 +120,10 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, return 0; } - const Hasher* h = Hasher::Create(k, name->CheckString()); + size_t seed = Hasher::MakeSeed(name->Len() > 0 ? name->Bytes() : 0, + name->Len()); + + const Hasher* h = new DefaultHasher(k, seed); uint16 width = 1; while ( max >>= 1 ) @@ -101,8 +138,9 @@ function bloomfilter_counting_init%(k: count, cells: count, max: count, ## ## x: The element to add. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init loomfilter_lookup -## bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_lookup bloomfilter_clear +## bloomfilter_merge function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -127,8 +165,9 @@ function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any ## ## Returns: the counter associated with *x* in *bf*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_clear bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_clear +## bloomfilter_merge function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count %{ const BloomFilterVal* bfv = static_cast(bf); @@ -154,8 +193,9 @@ function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count ## ## bf: The Bloom filter handle. ## -## .. 
bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_merge +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_merge function bloomfilter_clear%(bf: opaque of bloomfilter%): any %{ BloomFilterVal* bfv = static_cast(bf); @@ -178,15 +218,18 @@ function bloomfilter_clear%(bf: opaque of bloomfilter%): any ## ## Returns: The union of *bf1* and *bf2*. ## -## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init -## bloomfilter_add bloomfilter_lookup bloomfilter_clear +## .. bro:see:: bloomfilter_basic_init bloomfilter_basic_init2 +## bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear function bloomfilter_merge%(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter%): opaque of bloomfilter %{ const BloomFilterVal* bfv1 = static_cast(bf1); const BloomFilterVal* bfv2 = static_cast(bf2); - if ( ! same_type(bfv1->Type(), bfv2->Type()) ) + if ( bfv1->Type() && // any one 0 is ok here + bfv2->Type() && + ! same_type(bfv1->Type(), bfv2->Type()) ) { reporter->Error("incompatible Bloom filter types"); return 0; @@ -194,3 +237,13 @@ function bloomfilter_merge%(bf1: opaque of bloomfilter, return BloomFilterVal::Merge(bfv1, bfv2); %} + +## Returns a string with a representation of a Bloom filter's internal +## state. This is for debugging/testing purposes only. +## +## bf: The Bloom filter handle.
+function bloomfilter_internal_state%(bf: opaque of bloomfilter%): string + %{ + BloomFilterVal* bfv = static_cast(bf); + return new StringVal(bfv->InternalState()); + %} diff --git a/src/probabilistic/top-k.bif b/src/probabilistic/top-k.bif new file mode 100644 index 0000000000..9ad36cadef --- /dev/null +++ b/src/probabilistic/top-k.bif @@ -0,0 +1,184 @@ +# =========================================================================== +# +# Top-K Functions +# +# =========================================================================== + + +%%{ +#include "probabilistic/Topk.h" +%%} + +## Creates a top-k data structure which tracks *size* elements. +## +## size: number of elements to track +## +## Returns: Opaque pointer to the data structure. +## +## .. bro:see:: topk_add topk_get_top topk_count topk_epsilon +## topk_size topk_sum topk_merge topk_merge_prune +function topk_init%(size: count%): opaque of topk + %{ + probabilistic::TopkVal* v = new probabilistic::TopkVal(size); + return v; + %} + +## Add a new observed object to the data structure. +## +## .. note:: The first added object sets the type of data tracked by +## the top-k data structure. All following values have to be of the same +## type. +## +## handle: the TopK handle +## +## value: observed value +## +## .. bro:see:: topk_init topk_get_top topk_count topk_epsilon +## topk_size topk_sum topk_merge topk_merge_prune +function topk_add%(handle: opaque of topk, value: any%): any + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + h->Encountered(value); + + return 0; + %} + +## Get the first *k* elements of the top-k data structure. +## +## handle: the TopK handle +## +## k: number of elements to return +## +## Returns: vector of the first k elements +## +## .. 
bro:see:: topk_init topk_add topk_count topk_epsilon +## topk_size topk_sum topk_merge topk_merge_prune +function topk_get_top%(handle: opaque of topk, k: count%): any + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return h->GetTopK(k); + %} + +## Get an overestimated count of how often value has been encountered. +## +## .. note:: value has to be part of the currently tracked elements, otherwise +## 0 will be returned and an error message will be added to reporter. +## +## handle: the TopK handle +## +## value: Value to look up count for. +## +## Returns: Overestimated number for how often the element has been encountered +## +## .. bro:see:: topk_init topk_add topk_get_top topk_epsilon +## topk_size topk_sum topk_merge topk_merge_prune +function topk_count%(handle: opaque of topk, value: any%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->GetCount(value), TYPE_COUNT); + %} + +## Get the maximal overestimation for count. +## +## .. note:: Same restrictions as for :bro:id:`topk_count` apply. +## +## handle: the TopK handle +## +## value: Value to look up epsilon for. +## +## Returns: Number which represents the maximal overestimation for the count of this element. +## +## .. bro:see:: topk_init topk_add topk_get_top topk_count +## topk_size topk_sum topk_merge topk_merge_prune +function topk_epsilon%(handle: opaque of topk, value: any%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->GetEpsilon(value), TYPE_COUNT); + %} + +## Get the number of elements this data structure is supposed to track (given on init). +## +## .. note:: Note that the actual number of elements in the data structure can be lower +## or higher (due to non-pruned merges) than this. +## +## handle: the TopK handle +## +## Returns: size given during initialization +## +## ..
bro:see:: topk_init topk_add topk_get_top topk_count topk_epsilon +## topk_sum topk_merge topk_merge_prune +function topk_size%(handle: opaque of topk%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->GetSize(), TYPE_COUNT); + %} + +## Get the sum of all counts of all elements in the data structure. +## +## .. note:: This is equal to the number of all inserted objects if the data structure +## never has been pruned. Do not use after calling topk_merge_prune (will throw a +## warning message if used afterwards) +## +## handle: the TopK handle +## +## Returns: sum of all counts +## +## .. bro:see:: topk_init topk_add topk_get_top topk_count topk_epsilon +## topk_size topk_merge topk_merge_prune +function topk_sum%(handle: opaque of topk%): count + %{ + assert(handle); + probabilistic::TopkVal* h = (probabilistic::TopkVal*) handle; + return new Val(h->GetSum(), TYPE_COUNT); + %} + +## Merge the second topk data structure into the first. +## +## .. note:: This does not remove any elements, the resulting data structure can be +## bigger than the maximum size given on initialization. +## +## .. bro:see:: topk_init topk_add topk_get_top topk_count topk_epsilon +## topk_size topk_sum topk_merge_prune +function topk_merge%(handle1: opaque of topk, handle2: opaque of topk%): any + %{ + assert(handle1); + assert(handle2); + + probabilistic::TopkVal* h1 = (probabilistic::TopkVal*) handle1; + probabilistic::TopkVal* h2 = (probabilistic::TopkVal*) handle2; + + h1->Merge(h2); + + return 0; + %} + +## Merge the second topk data structure into the first and prunes the final data +## structure back to the size given on initialization. +## +## .. note:: Use with care and only when being aware of the restrictions this +## entails. Do not call :bro:id:`topk_size` or :bro:id:`topk_add` afterwards, +## results will probably not be what you expect. 
+## +## handle1: the TopK handle in which the second TopK structure is merged +## +## handle2: the TopK handle in which is merged into the first TopK structure +## +## .. bro:see:: topk_init topk_add topk_get_top topk_count topk_epsilon +## topk_size topk_sum topk_merge +function topk_merge_prune%(handle1: opaque of topk, handle2: opaque of topk%): any + %{ + assert(handle1); + assert(handle2); + + probabilistic::TopkVal* h1 = (probabilistic::TopkVal*) handle1; + probabilistic::TopkVal* h2 = (probabilistic::TopkVal*) handle2; + + h1->Merge(h2, true); + + return 0; + %} diff --git a/testing/btest/Baseline/bifs.bloomfilter-seed/output b/testing/btest/Baseline/bifs.bloomfilter-seed/output new file mode 100644 index 0000000000..533085900f --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter-seed/output @@ -0,0 +1,8 @@ +bf1, global_seed, 11979365913534242684 +bf2, global_seed, 12550100962110750449 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 +bf1, global_seed, 12550100962110750449 +bf2, global_seed, 945716460325754659 +bf3, my_seed, 12550100962110750449 +bf4, my_seed, 945716460325754659 diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output index 14e1f038c0..82414f0686 100644 --- a/testing/btest/Baseline/bifs.bloomfilter/output +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -12,6 +12,9 @@ error: false-positive rate must take value between 0 and 1 1 1 1 +0, no fp +1 +1 1 1 1 diff --git a/testing/btest/Baseline/bifs.topk/.stderr b/testing/btest/Baseline/bifs.topk/.stderr new file mode 100644 index 0000000000..a711333fc0 --- /dev/null +++ b/testing/btest/Baseline/bifs.topk/.stderr @@ -0,0 +1,11 @@ +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element 
that is not in top-k +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k +warning: TopkVal::GetSum() was used on a pruned data structure. Result values do not represent total element count +error: GetCount for element that is not in top-k +error: GetEpsilon for element that is not in top-k diff --git a/testing/btest/Baseline/bifs.topk/out b/testing/btest/Baseline/bifs.topk/out new file mode 100644 index 0000000000..1ce5c4b850 --- /dev/null +++ b/testing/btest/Baseline/bifs.topk/out @@ -0,0 +1,81 @@ +[b, c] +4 +0 +0 +2 +0 +2 +1 +[d, c] +5 +0 +0 +2 +1 +3 +2 +[d, e] +6 +3 +2 +3 +2 +[f, e] +7 +4 +3 +3 +2 +[f, e] +8 +4 +3 +4 +2 +[g, e] +9 +0 +0 +4 +2 +5 +4 +[c, e, d] +19 +6 +0 +5 +0 +4 +0 +[c, e] +6 +0 +5 +0 +0 +0 +[c, e] +22 +12 +0 +10 +0 +0 +0 +[c, e] +19 +6 +0 +5 +0 +4 +0 +[c, e, d] +38 +12 +0 +10 +0 +8 +0 diff --git a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log index dbbf689185..572173bd97 100644 --- a/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log +++ b/testing/btest/Baseline/coverage.default-load-baseline/canonified_loaded_scripts.log @@ -161,6 +161,7 @@ scripts/base/init-default.bro scripts/base/protocols/dns/main.bro scripts/base/protocols/ftp/__load__.bro scripts/base/protocols/ftp/utils-commands.bro + scripts/base/protocols/ftp/info.bro scripts/base/protocols/ftp/main.bro scripts/base/protocols/ftp/utils.bro scripts/base/protocols/ftp/files.bro diff --git a/testing/btest/Baseline/istate.topk/out b/testing/btest/Baseline/istate.topk/out new file mode 100644 index 0000000000..ef3d0cef30 --- /dev/null +++ b/testing/btest/Baseline/istate.topk/out @@ -0,0 +1,21 @@ +1 +2 +6 +4 +5 +1 +[c, e, d] +1 +2 +6 +4 +5 +1 +[c, e, d] +2 +4 +12 +8 +10 +2 +[c, e, d] diff --git a/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout 
b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout new file mode 100644 index 0000000000..2d076eeac7 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk-cluster/manager-1..stdout @@ -0,0 +1,9 @@ +Top entries for key counter +Num: 995, count: 100, epsilon: 0 +Num: 1, count: 99, epsilon: 0 +Num: 2, count: 98, epsilon: 0 +Num: 3, count: 97, epsilon: 0 +Num: 4, count: 96, epsilon: 0 +Top entries for key two +Num: 2, count: 4, epsilon: 0 +Num: 1, count: 3, epsilon: 0 diff --git a/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout new file mode 100644 index 0000000000..c85316eecc --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.sumstats.topk/.stdout @@ -0,0 +1,8 @@ +Top entries for key counter +Num: 1, count: 99, epsilon: 0 +Num: 2, count: 98, epsilon: 0 +Num: 3, count: 97, epsilon: 0 +Num: 4, count: 96, epsilon: 0 +Num: 5, count: 95, epsilon: 0 +Top entries for key two +Num: 1, count: 2, epsilon: 0 diff --git a/testing/btest/bifs/bloomfilter-seed.bro b/testing/btest/bifs/bloomfilter-seed.bro new file mode 100644 index 0000000000..436638e2af --- /dev/null +++ b/testing/btest/bifs/bloomfilter-seed.bro @@ -0,0 +1,40 @@ +# @TEST-EXEC: bro -b %INPUT global_hash_seed="foo" >>output +# @TEST-EXEC: bro -b %INPUT global_hash_seed="my_seed" >>output +# @TEST-EXEC: btest-diff output + +type Foo: record + { + a: count; + b: string; + }; + +function test_bloom_filter() + { + local bf1 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf1, "foo"); + bloomfilter_add(bf1, "bar"); + + local bf2 = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf2, Foo($a=1, $b="xx")); + bloomfilter_add(bf2, Foo($a=2, $b="yy")); + + local bf3 = bloomfilter_basic_init(0.9, 10, "my_seed"); + bloomfilter_add(bf3, "foo"); + bloomfilter_add(bf3, "bar"); + + local bf4 = bloomfilter_basic_init(0.9, 10, "my_seed"); + 
bloomfilter_add(bf4, Foo($a=1, $b="xx")); + bloomfilter_add(bf4, Foo($a=2, $b="yy")); + + print "bf1, global_seed", bloomfilter_internal_state(bf1); + print "bf2, global_seed", bloomfilter_internal_state(bf2); + print "bf3, my_seed", bloomfilter_internal_state(bf3); + print "bf4, my_seed", bloomfilter_internal_state(bf4); + + + } + +event bro_init() + { + test_bloom_filter(); + } diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro index 3b40f29553..95455bc74c 100644 --- a/testing/btest/bifs/bloomfilter.bro +++ b/testing/btest/bifs/bloomfilter.bro @@ -15,14 +15,21 @@ function test_basic_bloom_filter() bloomfilter_add(bf_cnt, 0.5); # Type mismatch bloomfilter_add(bf_cnt, "foo"); # Type mismatch + # Alternative constructor. + local bf_dbl = bloomfilter_basic_init2(4, 10); + bloomfilter_add(bf_dbl, 4.2); + bloomfilter_add(bf_dbl, 3.14); + print bloomfilter_lookup(bf_dbl, 4.2); + print bloomfilter_lookup(bf_dbl, 3.14); + # Basic usage with strings. local bf_str = bloomfilter_basic_init(0.9, 10); bloomfilter_add(bf_str, "foo"); bloomfilter_add(bf_str, "bar"); print bloomfilter_lookup(bf_str, "foo"); print bloomfilter_lookup(bf_str, "bar"); - print bloomfilter_lookup(bf_str, "b4z"); # FP - print bloomfilter_lookup(bf_str, "quux"); # FP + print bloomfilter_lookup(bf_str, "b4zzz"), "no fp"; # FP + print bloomfilter_lookup(bf_str, "quuux"); # FP bloomfilter_add(bf_str, 0.5); # Type mismatch bloomfilter_add(bf_str, 100); # Type mismatch @@ -45,6 +52,11 @@ function test_basic_bloom_filter() print bloomfilter_lookup(bf_merged, 84); print bloomfilter_lookup(bf_merged, 100); print bloomfilter_lookup(bf_merged, 168); + + #empty filter tests + local bf_empty = bloomfilter_basic_init(0.1, 1000); + local bf_empty_merged = bloomfilter_merge(bf_merged, bf_empty); + print bloomfilter_lookup(bf_empty_merged, 42); } function test_counting_bloom_filter() diff --git a/testing/btest/bifs/topk.bro b/testing/btest/bifs/topk.bro new file mode 100644 index 
0000000000..02d13c4195 --- /dev/null +++ b/testing/btest/bifs/topk.bro @@ -0,0 +1,154 @@ +# @TEST-EXEC: bro -b %INPUT > out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +event bro_init() + { + local k1 = topk_init(2); + + # first - peculiarity check... + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + + local s = topk_get_top(k1, 5); + print s; + print topk_sum(k1); + print topk_count(k1, "a"); + print topk_epsilon(k1, "a"); + print topk_count(k1, "b"); + print topk_epsilon(k1, "b"); + print topk_count(k1, "c"); + print topk_epsilon(k1, "c"); + + topk_add(k1, "d"); + s = topk_get_top(k1, 5); + print s; + print topk_sum(k1); + print topk_count(k1, "b"); + print topk_epsilon(k1, "b"); + print topk_count(k1, "c"); + print topk_epsilon(k1, "c"); + print topk_count(k1, "d"); + print topk_epsilon(k1, "d"); + + topk_add(k1, "e"); + s = topk_get_top(k1, 5); + print s; + print topk_sum(k1); + print topk_count(k1, "d"); + print topk_epsilon(k1, "d"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); + + topk_add(k1, "f"); + s = topk_get_top(k1, 5); + print s; + print topk_sum(k1); + print topk_count(k1, "f"); + print topk_epsilon(k1, "f"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); + + topk_add(k1, "e"); + s = topk_get_top(k1, 5); + print s; + print topk_sum(k1); + print topk_count(k1, "f"); + print topk_epsilon(k1, "f"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); + + topk_add(k1, "g"); + s = topk_get_top(k1, 5); + print s; + print topk_sum(k1); + print topk_count(k1, "f"); + print topk_epsilon(k1, "f"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "e"); + print topk_count(k1, "g"); + print topk_epsilon(k1, "g"); + + k1 = topk_init(100); + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "d"); + topk_add(k1, "d"); + 
topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "f"); + s = topk_get_top(k1, 3); + print s; + print topk_sum(k1); + print topk_count(k1, "c"); + print topk_epsilon(k1, "c"); + print topk_count(k1, "e"); + print topk_epsilon(k1, "d"); + print topk_count(k1, "d"); + print topk_epsilon(k1, "d"); + + local k3 = topk_init(2); + topk_merge_prune(k3, k1); + + s = topk_get_top(k3, 3); + print s; + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + topk_merge_prune(k3, k1); + + s = topk_get_top(k3, 3); + print s; + print topk_sum(k3); # this gives a warning and a wrong result. + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + k3 = topk_init(2); + topk_merge(k3, k1); + print s; + print topk_sum(k3); + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + topk_merge(k3, k1); + + s = topk_get_top(k3, 3); + print s; + print topk_sum(k3); + print topk_count(k3, "c"); + print topk_epsilon(k3, "c"); + print topk_count(k3, "e"); + print topk_epsilon(k3, "e"); + print topk_count(k3, "d"); + print topk_epsilon(k3, "d"); + + + + +} diff --git a/testing/btest/istate/topk.bro b/testing/btest/istate/topk.bro new file mode 100644 index 0000000000..4d599c2780 --- /dev/null +++ b/testing/btest/istate/topk.bro @@ -0,0 +1,74 @@ +# @TEST-EXEC: bro -b %INPUT runnumber=1 >out +# @TEST-EXEC: bro -b %INPUT runnumber=2 >>out +# @TEST-EXEC: bro -b %INPUT runnumber=3 >>out +# @TEST-EXEC: btest-diff out + +global runnumber: count &redef; # differentiate runs + +global k1: opaque of topk &persistent; 
+global k2: opaque of topk &persistent; + +event bro_init() + { + + k2 = topk_init(20); + + if ( runnumber == 1 ) + { + k1 = topk_init(100); + + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "f"); + } + + local s = topk_get_top(k1, 3); + print topk_count(k1, "a"); + print topk_count(k1, "b"); + print topk_count(k1, "c"); + print topk_count(k1, "d"); + print topk_count(k1, "e"); + print topk_count(k1, "f"); + + if ( runnumber == 2 ) + { + topk_add(k1, "a"); + topk_add(k1, "b"); + topk_add(k1, "b"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "c"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "d"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "e"); + topk_add(k1, "f"); + } + + print s; + + } diff --git a/testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro b/testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro new file mode 100644 index 0000000000..0ade38e86c --- /dev/null +++ b/testing/btest/scripts/base/frameworks/sumstats/topk-cluster.bro @@ -0,0 +1,110 @@ +# @TEST-SERIALIZE: comm +# +# @TEST-EXEC: btest-bg-run manager-1 BROPATH=$BROPATH:.. CLUSTER_NODE=manager-1 bro %INPUT +# @TEST-EXEC: sleep 1 +# @TEST-EXEC: btest-bg-run worker-1 BROPATH=$BROPATH:.. CLUSTER_NODE=worker-1 bro %INPUT +# @TEST-EXEC: btest-bg-run worker-2 BROPATH=$BROPATH:.. 
CLUSTER_NODE=worker-2 bro %INPUT +# @TEST-EXEC: btest-bg-wait 15 + +# @TEST-EXEC: btest-diff manager-1/.stdout +# +@TEST-START-FILE cluster-layout.bro +redef Cluster::nodes = { + ["manager-1"] = [$node_type=Cluster::MANAGER, $ip=127.0.0.1, $p=37757/tcp, $workers=set("worker-1", "worker-2")], + ["worker-1"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=37760/tcp, $manager="manager-1", $interface="eth0"], + ["worker-2"] = [$node_type=Cluster::WORKER, $ip=127.0.0.1, $p=37761/tcp, $manager="manager-1", $interface="eth1"], +}; +@TEST-END-FILE + +redef Log::default_rotation_interval = 0secs; + + +event bro_init() &priority=5 + { + local r1: SumStats::Reducer = [$stream="test.metric", + $apply=set(SumStats::TOPK)]; + SumStats::create([$epoch=5secs, + $reducers=set(r1), + $epoch_finished(data: SumStats::ResultTable) = + { + for ( key in data ) + { + local r = data[key]["test.metric"]; + + local s: vector of SumStats::Observation; + s = topk_get_top(r$topk, 5); + + print fmt("Top entries for key %s", key$str); + for ( element in s ) + { + print fmt("Num: %d, count: %d, epsilon: %d", s[element]$num, topk_count(r$topk, s[element]), topk_epsilon(r$topk, s[element])); + } + + terminate(); + } + } + ]); + + + } + +event remote_connection_closed(p: event_peer) + { + terminate(); + } + +global ready_for_data: event(); +redef Cluster::manager2worker_events += /^ready_for_data$/; + +event ready_for_data() + { + const loop_v: vector of count = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100}; + + + if ( Cluster::node == "worker-1" ) + { + + local a: count; + a = 0; + + for ( i in loop_v ) + { + a = a + 1; + for ( j in loop_v ) + { + if ( i < j 
) + SumStats::observe("test.metric", [$str="counter"], [$num=a]); + } + } + + + SumStats::observe("test.metric", [$str="two"], [$num=1]); + SumStats::observe("test.metric", [$str="two"], [$num=1]); + } + if ( Cluster::node == "worker-2" ) + { + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=2]); + SumStats::observe("test.metric", [$str="two"], [$num=1]); + + for ( i in loop_v ) + { + SumStats::observe("test.metric", [$str="counter"], [$num=995]); + } + } + } + +@if ( Cluster::local_node_type() == Cluster::MANAGER ) + +global peer_count = 0; +event remote_connection_handshake_done(p: event_peer) &priority=-5 + { + ++peer_count; + if ( peer_count == 2 ) + event ready_for_data(); + } + +@endif + diff --git a/testing/btest/scripts/base/frameworks/sumstats/topk.bro b/testing/btest/scripts/base/frameworks/sumstats/topk.bro new file mode 100644 index 0000000000..22a5af1bc7 --- /dev/null +++ b/testing/btest/scripts/base/frameworks/sumstats/topk.bro @@ -0,0 +1,48 @@ +# @TEST-EXEC: bro %INPUT +# @TEST-EXEC: btest-diff .stdout + +event bro_init() &priority=5 + { + local r1: SumStats::Reducer = [$stream="test.metric", + $apply=set(SumStats::TOPK)]; + SumStats::create([$epoch=3secs, + $reducers=set(r1), + $epoch_finished(data: SumStats::ResultTable) = + { + for ( key in data ) + { + local r = data[key]["test.metric"]; + + local s: vector of SumStats::Observation; + s = topk_get_top(r$topk, 5); + + print fmt("Top entries for key %s", key$str); + for ( element in s ) + { + print fmt("Num: %d, count: %d, epsilon: %d", s[element]$num, topk_count(r$topk, s[element]), topk_epsilon(r$topk, s[element])); + } + + } + } + ]); + + + const loop_v: vector of count = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100}; + + local a: count; + a = 0; + + for ( i in loop_v ) + { + a = a + 1; + for ( j in loop_v ) + { + if ( i < j ) + SumStats::observe("test.metric", [$str="counter"], [$num=a]); + } + } + + + SumStats::observe("test.metric", [$str="two"], [$num=1]); + SumStats::observe("test.metric", [$str="two"], [$num=1]); + }