zeek/scripts/base/frameworks/sumstats/plugins/unique.zeek
2019-04-11 21:12:40 -05:00

86 lines
2 KiB
Text

##! Calculate the number of unique values.
@load ../main
module SumStats;
export {
redef record Reducer += {
## Maximum number of unique values to store.
unique_max: count &optional;
};
redef enum Calculation += {
## Calculate the number of unique values.
UNIQUE
};
redef record ResultVal += {
## If cardinality is being tracked, the number of unique
## values is tracked here.
unique: count &default=0;
};
}
redef record ResultVal += {
# Internal use only. This is used when multiple ResultVals
# are being merged and they need to abide the unique limit
# set in the reducer.
unique_max: count &optional;
# Internal use only. This is not meant to be publically available
# because we don't want to trust that we can inspect the values
# since we will likely move to a probabilistic data structure in the future.
# TODO: in the future this will optionally be a hyperloglog structure
unique_vals: set[Observation] &optional;
};
hook register_observe_plugins()
{
register_observe_plugin(UNIQUE, function(r: Reducer, val: double, obs: Observation, rv: ResultVal)
{
if ( ! rv?$unique_vals )
rv$unique_vals=set();
if ( r?$unique_max )
rv$unique_max=r$unique_max;
if ( ! r?$unique_max || |rv$unique_vals| <= r$unique_max )
add rv$unique_vals[obs];
rv$unique = |rv$unique_vals|;
});
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$unique_vals || rv2?$unique_vals )
{
if ( rv1?$unique_max )
result$unique_max = rv1$unique_max;
else if ( rv2?$unique_max )
result$unique_max = rv2$unique_max;
if ( rv1?$unique_vals )
result$unique_vals = copy(rv1$unique_vals);
if ( rv2?$unique_vals )
{
if ( ! result?$unique_vals )
{
result$unique_vals = copy(rv2$unique_vals);
}
else
{
for ( val2 in rv2$unique_vals )
{
if ( result?$unique_max && |result$unique_vals| >= result$unique_max )
break;
add result$unique_vals[copy(val2)];
}
}
}
result$unique = |result$unique_vals|;
}
}