Checkpoint for SumStats rename.

This commit is contained in:
Seth Hall 2013-04-15 15:12:28 -04:00
parent 8165d6077d
commit fbe967e16a
32 changed files with 626 additions and 620 deletions

View file

@ -0,0 +1,8 @@
@load ./average
@load ./max
@load ./min
@load ./sample
@load ./variance
@load ./std-dev
@load ./sum
@load ./unique

View file

@ -0,0 +1,36 @@
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Calculate the average of the values.
AVERAGE
};
redef record ResultVal += {
## For numeric data, this calculates the average of all values.
average: double &optional;
};
}
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal)
{
if ( AVERAGE in r$apply )
{
if ( ! rv?$average )
rv$average = val;
else
rv$average += (val - rv$average) / rv$num;
}
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$average && rv2?$average )
result$average = ((rv1$average*rv1$num) + (rv2$average*rv2$num))/(rv1$num+rv2$num);
else if ( rv1?$average )
result$average = rv1$average;
else if ( rv2?$average )
result$average = rv2$average;
}

View file

@ -0,0 +1,38 @@
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Find the maximum value.
MAX
};
redef record ResultVal += {
## For numeric data, this tracks the maximum value given.
max: double &optional;
};
}
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal)
{
if ( MAX in r$apply )
{
if ( ! rv?$max )
rv$max = val;
else if ( val > rv$max )
rv$max = val;
}
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$max && rv2?$max )
result$max = (rv1$max > rv2$max) ? rv1$max : rv2$max;
else if ( rv1?$max )
result$max = rv1$max;
else if ( rv2?$max )
result$max = rv2$max;
}

View file

@ -0,0 +1,36 @@
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Find the minimum value.
MIN
};
redef record ResultVal += {
## For numeric data, this tracks the minimum value given.
min: double &optional;
};
}
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal)
{
if ( MIN in r$apply )
{
if ( ! rv?$min )
rv$min = val;
else if ( val < rv$min )
rv$min = val;
}
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$min && rv2?$min )
result$min = (rv1$min < rv2$min) ? rv1$min : rv2$min;
else if ( rv1?$min )
result$min = rv1$min;
else if ( rv2?$min )
result$min = rv2$min;
}

View file

@ -0,0 +1,51 @@
@load base/frameworks/sumstats
@load base/utils/queue
module SumStats;
export {
redef record Reducer += {
## A number of sample Observations to collect.
samples: count &default=0;
};
redef record ResultVal += {
## This is the queue where samples
## are maintained. Use the
## :bro:see:`SumStats::get_samples` function
## to get a vector of the samples.
samples: Queue::Queue &optional;
};
## Get a vector of sample Observation values from a ResultVal.
global get_samples: function(rv: ResultVal): vector of Observation;
}
function get_samples(rv: ResultVal): vector of Observation
{
local s: vector of Observation = vector();
if ( rv?$samples )
Queue::get_vector(rv$samples, s);
return s;
}
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal)
{
if ( r$samples > 0 )
{
if ( ! rv?$samples )
rv$samples = Queue::init([$max_len=r$samples]);
Queue::put(rv$samples, data);
}
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
# Merge $samples
if ( rv1?$samples && rv2?$samples )
result$samples = Queue::merge(rv1$samples, rv2$samples);
else if ( rv1?$samples )
result$samples = rv1$samples;
else if ( rv2?$samples )
result$samples = rv2$samples;
}

View file

@ -0,0 +1,37 @@
@load ./variance
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Find the standard deviation of the values.
STD_DEV
};
redef record ResultVal += {
## For numeric data, this calculates the standard deviation.
std_dev: double &default=0.0;
};
}
function calc_std_dev(rv: ResultVal)
{
if ( rv?$variance )
rv$std_dev = sqrt(rv$variance);
}
# This depends on the variance plugin which uses priority -5
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal) &priority=-10
{
if ( STD_DEV in r$apply )
{
if ( rv?$variance )
calc_std_dev(rv);
}
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) &priority=-10
{
calc_std_dev(result);
}

View file

@ -0,0 +1,51 @@
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Sums the values given. For string values,
## this will be the number of strings given.
SUM
};
redef record ResultVal += {
## For numeric data, this tracks the sum of all values.
sum: double &default=0.0;
};
type threshold_function: function(key: SumStats::Key, result: SumStats::Result): count;
global sum_threshold: function(data_id: string): threshold_function;
}
function sum_threshold(data_id: string): threshold_function
{
return function(key: SumStats::Key, result: SumStats::Result): count
{
print fmt("data_id: %s", data_id);
print result;
return double_to_count(result[data_id]$sum);
};
}
hook init_resultval_hook(r: Reducer, rv: ResultVal)
{
if ( SUM in r$apply && ! rv?$sum )
rv$sum = 0;
}
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal)
{
if ( SUM in r$apply )
rv$sum += val;
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$sum || rv2?$sum )
{
result$sum = rv1?$sum ? rv1$sum : 0;
if ( rv2?$sum )
result$sum += rv2$sum;
}
}

View file

@ -0,0 +1,53 @@
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Calculate the number of unique values.
UNIQUE
};
redef record ResultVal += {
## If cardinality is being tracked, the number of unique
## items is tracked here.
unique: count &default=0;
};
}
redef record ResultVal += {
# Internal use only. This is not meant to be publically available
# because we don't want to trust that we can inspect the values
# since we will like move to a probalistic data structure in the future.
# TODO: in the future this will optionally be a hyperloglog structure
unique_vals: set[Observation] &optional;
};
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal)
{
if ( UNIQUE in r$apply )
{
if ( ! rv?$unique_vals )
rv$unique_vals=set();
add rv$unique_vals[data];
rv$unique = |rv$unique_vals|;
}
}
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
{
if ( rv1?$unique_vals || rv2?$unique_vals )
{
if ( rv1?$unique_vals )
result$unique_vals = rv1$unique_vals;
if ( rv2?$unique_vals )
if ( ! result?$unique_vals )
result$unique_vals = rv2$unique_vals;
else
for ( val2 in rv2$unique_vals )
add result$unique_vals[val2];
result$unique = |result$unique_vals|;
}
}

View file

@ -0,0 +1,69 @@
@load ./average
@load base/frameworks/sumstats
module SumStats;
export {
redef enum Calculation += {
## Find the variance of the values.
VARIANCE
};
redef record ResultVal += {
## For numeric data, this calculates the variance.
variance: double &optional;
};
}
redef record ResultVal += {
# Internal use only. Used for incrementally calculating variance.
prev_avg: double &optional;
# Internal use only. For calculating incremental variance.
var_s: double &default=0.0;
};
function calc_variance(rv: ResultVal)
{
rv$variance = (rv$num > 1) ? rv$var_s/(rv$num-1) : 0.0;
}
# Reduced priority since this depends on the average
hook add_to_reducer_hook(r: Reducer, val: double, data: Observation, rv: ResultVal) &priority=-5
{
if ( VARIANCE in r$apply )
{
if ( rv$num > 1 )
rv$var_s += ((val - rv$prev_avg) * (val - rv$average));
calc_variance(rv);
rv$prev_avg = rv$average;
}
}
# Reduced priority since this depends on the average
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal) &priority=-5
{
if ( rv1?$var_s && rv1?$average &&
rv2?$var_s && rv2?$average )
{
local rv1_avg_sq = (rv1$average - result$average);
rv1_avg_sq = rv1_avg_sq*rv1_avg_sq;
local rv2_avg_sq = (rv2$average - result$average);
rv2_avg_sq = rv2_avg_sq*rv2_avg_sq;
result$var_s = rv1$num*(rv1$var_s/rv1$num + rv1_avg_sq) + rv2$num*(rv2$var_s/rv2$num + rv2_avg_sq);
}
else if ( rv1?$var_s )
result$var_s = rv1$var_s;
else if ( rv2?$var_s )
result$var_s = rv2$var_s;
if ( rv1?$prev_avg && rv2?$prev_avg )
result$prev_avg = ((rv1$prev_avg*rv1$num) + (rv2$prev_avg*rv2$num))/(rv1$num+rv2$num);
else if ( rv1?$prev_avg )
result$prev_avg = rv1$prev_avg;
else if ( rv2?$prev_avg )
result$prev_avg = rv2$prev_avg;
calc_variance(result);
}