zeek/scripts/base/frameworks/sumstats/plugins/topk.zeek
Johanna Amann 25c33d2a29 Add exact name of the Top-k algorithm.
I needed to figure out which exact algorithm we use for our
probabilistic top-k measurements. It turns out that we do not mention
this in our source tree at all so far.
2022-05-11 13:21:26 +01:00

62 lines
1.4 KiB
Text

##! Keep the top-k (i.e., most frequently occurring) observations.
##!
##! This plugin uses a probabilistic algorithm to count the top-k elements.
##! The algorithm (calles Space-Saving) is described in the paper Efficient
##! Computation of Frequent and Top-k Elements in Data Streams", by
##! Metwally et al. (2005).
@load base/frameworks/sumstats
module SumStats;
export {
redef record Reducer += {
## Number of elements to keep in the top-k list.
topk_size: count &default=500;
};
redef enum Calculation += {
## Keep a top-k list of values.
TOPK
};
redef record ResultVal += {
## A handle which can be passed to some built-in functions to get
## the top-k results.
topk: opaque of topk &optional;
};
}
hook register_observe_plugins()
{
register_observe_plugin(TOPK, function(r: Reducer, val: double, obs: Observation, rv: ResultVal)
{
topk_add(rv$topk, obs);
});
}
hook init_resultval_hook(r: Reducer, rv: ResultVal)
{
if ( TOPK in r$apply && ! rv?$topk )
rv$topk = topk_init(r$topk_size);
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$topk )
{
result$topk = topk_init(topk_size(rv1$topk));
topk_merge(result$topk, rv1$topk);
if ( rv2?$topk )
topk_merge(result$topk, rv2$topk);
}
else if ( rv2?$topk )
{
result$topk = topk_init(topk_size(rv2$topk));
topk_merge(result$topk, rv2$topk);
}
}