API updates for metrics framework.

- Removed default logging.  Now a function is available for the new
  $period_finished filter field to get the same behavior for logging
  named Metrics::write_log.

- Added index rollups for getting multiple metrics result values
  as the same time.
This commit is contained in:
Seth Hall 2012-12-18 01:08:59 -05:00
parent 69030fdff3
commit 69b7ce12d2
17 changed files with 304 additions and 162 deletions

View file

@ -60,18 +60,18 @@ global requested_results: table[string] of time = table() &create_expire=5mins;
# This variable is maintained by manager nodes as they collect and aggregate
# results.
global filter_results: table[string, string, string] of MetricTable &create_expire=5mins;
global filter_results: table[string, string, string] of MetricTable &read_expire=1min;
# This variable is maintained by manager nodes to track how many "dones" they
# collected per collection unique id. Once the number of results for a uid
# matches the number of peer nodes that results should be coming from, the
# result is written out and deleted from here.
# TODO: add an &expire_func in case not all results are received.
global done_with: table[string] of count &create_expire=5mins &default=0;
global done_with: table[string] of count &read_expire=1min &default=0;
# This variable is maintained by managers to track intermediate responses as
# they are getting a global view for a certain index.
global index_requests: table[string, string, string, Index] of ResultVal &create_expire=5mins &default=[];
global index_requests: table[string, string, string, Index] of ResultVal &read_expire=1min;
# This variable is maintained by all hosts for different purposes. Non-managers
# maintain it to know what indexes they have recently sent as intermediate
@ -163,7 +163,7 @@ event Metrics::cluster_index_request(uid: string, id: string, filter_name: strin
@if ( Cluster::local_node_type() == Cluster::MANAGER )
# Manager's handle logging.
event Metrics::log_it(filter: Filter)
event Metrics::finish_period(filter: Filter)
{
#print fmt("%.6f MANAGER: breaking %s filter for %s metric", network_time(), filter$name, filter$id);
local uid = unique_id("");
@ -174,8 +174,8 @@ event Metrics::log_it(filter: Filter)
# Request data from peers.
event Metrics::cluster_filter_request(uid, filter$id, filter$name);
# Schedule the log_it event for the next break period.
schedule filter$every { Metrics::log_it(filter) };
# Schedule the next finish_period event.
schedule filter$every { Metrics::finish_period(filter) };
}
# This is unlikely to be called often, but it's here in case there are metrics
@ -237,6 +237,8 @@ event Metrics::cluster_filter_response(uid: string, id: string, filter_name: str
++done_with[uid];
local local_data = filter_results[uid, id, filter_name];
local filter = filter_store[id, filter_name];
for ( index in data )
{
if ( index in local_data )
@ -245,18 +247,18 @@ event Metrics::cluster_filter_response(uid: string, id: string, filter_name: str
local_data[index] = data[index];
# If a filter is done being collected, thresholds for each index
# need to checked so we're doing it here to avoid doubly iterating
# need to be checked so we're doing it here to avoid doubly iterating
# over each index.
if ( Cluster::worker_count == done_with[uid] )
{
if ( check_thresholds(filter_store[id, filter_name], index, local_data[index], 1.0) )
if ( check_thresholds(filter, index, local_data[index], 1.0) )
{
threshold_crossed(filter_store[id, filter_name], index, local_data[index]);
threshold_crossed(filter, index, local_data[index]);
}
}
}
# If the data has been collected from all peers, we are done and ready to log.
# If the data has been collected from all peers, we are done and ready to finish.
if ( Cluster::worker_count == done_with[uid] )
{
local ts = network_time();
@ -267,11 +269,30 @@ event Metrics::cluster_filter_response(uid: string, id: string, filter_name: str
delete requested_results[uid];
}
write_log(ts, filter_store[id, filter_name], local_data);
if ( filter?$rollup )
{
for ( index in local_data )
{
if ( index !in rollup_store )
rollup_store[index] = table();
rollup_store[index][id, filter_name] = local_data[index];
# If all of the result vals are stored then the rollup callback can be executed.
if ( |rollup_store[index]| == |rollups[filter$rollup]$filters| )
{
rollups[filter$rollup]$callback(index, rollup_store[index]);
}
}
}
if ( filter?$period_finished )
filter$period_finished(ts, filter$id, filter$name, local_data);
# Clean up
delete filter_results[uid, id, filter_name];
delete done_with[uid];
# Not sure I need to reset the filter on the manager.
reset(filter);
}
}

View file

@ -8,10 +8,6 @@ export {
## The metrics logging stream identifier.
redef enum Log::ID += { LOG };
## The default interval used for "breaking" metrics and writing the
## current value to the logging stream.
const default_break_interval = 15mins &redef;
## This is the interval for how often threshold based notices will happen
## after they have already fired.
const threshold_crossed_restart_interval = 1hr &redef;
@ -108,63 +104,74 @@ export {
## The record type that is used for logging metrics.
type Info: record {
## Timestamp at which the metric was "broken".
ts: time &log;
ts: time &log;
## Interval between logging of this filter and the last time it was logged.
ts_delta: interval &log;
## The name of the filter being logged. Values
## can have multiple filters which represent different perspectives on
## the data so this is necessary to understand the value.
filter_name: string &log;
ts_delta: interval &log;
## What measurement the metric represents.
metric: string &log;
metric: string &log;
## What the metric value applies to.
index: Index &log;
index: Index &log;
## The simple numeric value of the metric.
result: ResultVal &log;
result: ResultVal &log;
};
## Type to store a table of metrics result values.
type MetricTable: table[Index] of ResultVal;
## Filters define how the data from a metric is aggregated and handled.
## Filters can be used to set how often the measurements are cut
## and logged or how the data within them is aggregated. It's also
## possible to disable logging and use filters solely for thresholding.
type Filter: record {
## The name for this filter so that multiple filters can be
## applied to a single metrics to get a different view of the same
## metric data being collected (different aggregation, break, etc).
## A name for the filter in case multiple filters are being
## applied to the same metric. In most cases the default
## filter name is fine and this field does not need to be set.
name: string &default="default";
## The metric that this filter applies to.
id: string &optional;
## The measurements to perform on the data.
measure: set[Calculation] &optional;
## A predicate so that you can decide per index if you would like
## to accept the data being inserted.
pred: function(index: Metrics::Index, data: Metrics::DataPoint): bool &optional;
## A function to normalize the index. This can be used to aggregate or
## normalize the entire index.
normalize_func: function(index: Metrics::Index): Index &optional;
## Global mask by to aggregate traffic measuring an attribute of hosts.
## This is a special case of the normalize_func.
aggregation_mask: count &optional;
## The interval at which this filter should be "broken" and written
## to the logging stream. The counters are also reset to zero at
## this time so any threshold based detection needs to be set to a
## number that should be expected to happen within this period.
every: interval &default=default_break_interval;
## This determines if the result of this filter is sent to the metrics
## logging stream. One use for the logging framework is as an internal
## thresholding and statistics gathering utility that is meant to
## never log but rather to generate notices and derive data.
log: bool &default=T;
every: interval;
## The measurements to perform on the data.
measure: set[Calculation] &optional;
## A predicate so that you can decide per index if you would like
## to accept the data being inserted.
pred: function(index: Metrics::Index, data: Metrics::DataPoint): bool &optional;
## A function to normalize the index. This can be used to aggregate or
## normalize the entire index.
normalize_func: function(index: Metrics::Index): Index &optional;
## Global mask by to aggregate traffic measuring an attribute of hosts.
## This is a special case of the normalize_func.
aggregation_mask: count &optional;
## A direct threshold for calling the $threshold_crossed function when
## the SUM is greater than or equal to this value.
threshold: count &optional;
## A series of thresholds for calling the $threshold_crossed function.
threshold_series: vector of count &optional;
## A predicate so that you can decide when to flexibly declare when
## a threshold crossed, and do extra work.
threshold_func: function(index: Metrics::Index, val: Metrics::ResultVal): bool &optional;
## A function callback that is called when a threshold is crossed.
## A callback with the full collection of ResultVals for this filter. This
## is defined as a redef because the function includes a :bro:type:`Filter`
## record which is self referential before the Filter type has been fully
## defined and doesn't work.
period_finished: function(ts: time, metric_name: string, filter_name: string, data: Metrics::MetricTable) &optional;
## A callback that is called when a threshold is crossed.
threshold_crossed: function(index: Metrics::Index, val: Metrics::ResultVal) &optional;
## A rollup to register this filter with.
rollup: string &optional;
## A number of sample DataPoint strings to collect for the threshold
## crossing callback.
samples: count &optional;
@ -187,7 +194,19 @@ export {
##
## increment: How much to increment the counter by.
global add_data: function(id: string, index: Metrics::Index, data: Metrics::DataPoint);
## The callback definition for rollup functions.
type RollupCallback: function(index: Metrics::Index, vals: table[string, string] of Metrics::ResultVal);
## Add a rollup function for merging multiple filters with matching
## indexes. If the metrics filters being merged don't have equivalent times
## in the $every field, an error will be generated.
##
## name: An arbitrary name for this filter rollup.
##
## vals: Each ResultVal record indexed by the appropriate metric name and filter name.
global create_index_rollup: function(name: string, rollup: RollupCallback);
## Helper function to represent a :bro:type:`Metrics::Index` value as
## a simple string.
##
@ -195,12 +214,23 @@ export {
##
## Returns: A string reprentation of the metric index.
global index2str: function(index: Metrics::Index): string;
## A helper function to use with the `period_finished` field in filters. Using
## this function is not recommended however since each metric likely has
## different data and different semantics which would be better served by writing
## a custom function that logs in more domain specific fashion.
global write_log: function(ts: time, metric_name: string, filter_name: string, data: Metrics::MetricTable);
## Event to access metrics records as they are passed to the logging framework.
global log_metrics: event(rec: Metrics::Info);
}
redef record Filter += {
# The metric that this filter applies to. The value is automatically set.
id: string &optional;
};
redef record ResultVal += {
# Internal use only. Used for incrementally calculating variance.
prev_avg: double &optional;
@ -226,9 +256,6 @@ redef record ResultVal += {
threshold_series_index: count &default=0;
};
# Type to store a table of metrics values.
type MetricTable: table[Index] of ResultVal;
# Store the filters indexed on the metric identifier.
global metric_filters: table[string] of vector of Filter = table();
@ -238,16 +265,23 @@ global filter_store: table[string, string] of Filter = table();
# This is indexed by metric id and filter name.
global store: table[string, string] of MetricTable = table() &default=table();
# This is hook for watching thresholds being crossed. It is called whenever
# This is a hook for watching thresholds being crossed. It is called whenever
# index values are updated and the new val is given as the `val` argument.
# It's only prototyped here because cluster and non-cluster has separate
# It's only prototyped here because cluster and non-cluster have separate
# implementations.
global data_added: function(filter: Filter, index: Index, val: ResultVal);
type Rollup: record {
callback: RollupCallback;
filters: set[Filter] &optional;
};
global rollups: table[string] of Rollup;
global rollup_store: table[Index] of table[string, string] of ResultVal = {};
## Event that is used to "finish" metrics and adapt the metrics
## framework for clustered or non-clustered usage.
global log_it: event(filter: Metrics::Filter);
global finish_period: event(filter: Metrics::Filter);
event bro_init() &priority=5
{
@ -279,22 +313,21 @@ function do_calculated_fields(val: ResultVal)
function merge_result_vals(rv1: ResultVal, rv2: ResultVal): ResultVal
{
local result: ResultVal;
# Merge $begin (take the earliest one)
result$begin = rv1$begin < rv2$begin ? rv1$begin : rv2$begin;
result$begin = (rv1$begin < rv2$begin) ? rv1$begin : rv2$begin;
# Merge $end (take the latest one)
result$end = rv1$end > rv2$end ? rv1$end : rv2$end;
result$end = (rv1$end > rv2$end) ? rv1$end : rv2$end;
# Merge $num
result$num = rv1$num + rv2$num;
# Merge $sum
result$sum = rv1$sum + rv2$sum;
if ( rv1?$sum || rv2?$sum )
{
result$sum = 0;
if ( rv1?$sum )
result$sum += rv1$sum;
result$sum = rv1?$sum ? rv1$sum : 0;
if ( rv2?$sum )
result$sum += rv2$sum;
}
@ -348,13 +381,15 @@ function merge_result_vals(rv1: ResultVal, rv2: ResultVal): ResultVal
# Merge $unique_vals
if ( rv1?$unique_vals || rv2?$unique_vals )
{
result$unique_vals = set();
if ( rv1?$unique_vals )
for ( val1 in rv1$unique_vals )
add result$unique_vals[val1];
result$unique_vals = rv1$unique_vals;
if ( rv2?$unique_vals )
for ( val2 in rv2$unique_vals )
add result$unique_vals[val2];
if ( ! result?$unique_vals )
result$unique_vals = rv2$unique_vals;
else
for ( val2 in rv2$unique_vals )
add result$unique_vals[val2];
}
# Merge $sample_queue
@ -376,8 +411,9 @@ function merge_result_vals(rv1: ResultVal, rv2: ResultVal): ResultVal
return result;
}
function write_log(ts: time, filter: Filter, data: MetricTable)
function write_log(ts: time, metric_name: string, filter_name: string, data: Metrics::MetricTable)
{
local filter = filter_store[metric_name, filter_name];
for ( index in data )
{
local m: Info = [$ts=ts,
@ -386,9 +422,7 @@ function write_log(ts: time, filter: Filter, data: MetricTable)
$filter_name=filter$name,
$index=index,
$result=data[index]];
if ( filter$log )
Log::write(Metrics::LOG, m);
Log::write(LOG, m);
}
}
@ -401,7 +435,7 @@ function add_filter(id: string, filter: Filter)
{
if ( filter?$normalize_func && filter?$aggregation_mask )
{
Reporter::warning(fmt("invalid Metric filter (%s): Defined $normalize_func and $aggregation_mask.", filter$name));
Reporter::warning(fmt("invalid Metric filter (%s): Defined both $normalize_func and $aggregation_mask.", filter$name));
return;
}
if ( [id, filter$name] in store )
@ -409,7 +443,33 @@ function add_filter(id: string, filter: Filter)
Reporter::warning(fmt("invalid Metric filter (%s): Filter with same name already exists.", filter$name));
return;
}
if ( filter?$rollup )
{
if ( filter$rollup !in rollups )
{
Reporter::warning(fmt("invalid Metric filter (%s): %s rollup doesn't exist.", filter$name, filter$rollup));
return;
}
else
{
local every_field = 0secs;
for ( filt in rollups )
{
if ( [id, filt] !in filter_store )
next;
if ( every_field == 0secs )
every_field = filter_store[id, filt]$every;
else if ( every_field == filter_store[id, filt]$every )
{
Reporter::warning(fmt("invalid Metric rollup for %s: Filters with differing $every fields applied to %s.", filter$name, filter$rollup));
return;
}
}
}
add rollups[filter$rollup]$filters[filter];
}
if ( ! filter?$id )
filter$id = id;
@ -419,8 +479,8 @@ function add_filter(id: string, filter: Filter)
filter_store[id, filter$name] = filter;
store[id, filter$name] = table();
schedule filter$every { Metrics::log_it(filter) };
schedule filter$every { Metrics::finish_period(filter) };
}
function add_data(id: string, index: Index, data: DataPoint)
@ -513,11 +573,11 @@ function add_data(id: string, index: Index, data: DataPoint)
result$var_s += (val - result$prev_avg)*(val - result$avg);
}
if ( STD_DEV in filter$measure )
{
#if ( result?$variance )
# result$std_dev = sqrt(result$variance);
}
#if ( STD_DEV in filter$measure )
# {
# #if ( result?$variance )
# # result$std_dev = sqrt(result$variance);
# }
if ( UNIQUE in filter$measure )
{
@ -530,8 +590,7 @@ function add_data(id: string, index: Index, data: DataPoint)
}
}
# This function checks if a threshold has been crossed and generates a
# notice if it has. It is also used as a method to implement
# This function checks if a threshold has been crossed. It is also used as a method to implement
# mid-break-interval threshold crossing detection for cluster deployments.
function check_thresholds(filter: Filter, index: Index, val: ResultVal, modify_pct: double): bool
{
@ -570,7 +629,7 @@ function check_thresholds(filter: Filter, index: Index, val: ResultVal, modify_p
return F;
}
function threshold_crossed(filter: Filter, index: Index, val: ResultVal)
{
if ( ! filter?$threshold_crossed )
@ -586,3 +645,10 @@ function threshold_crossed(filter: Filter, index: Index, val: ResultVal)
if ( filter?$threshold_series )
++val$threshold_series_index;
}
function create_index_rollup(name: string, rollup: RollupCallback)
{
local r: Rollup = [$callback=rollup];
r$filters=set();
rollups[name] = r;
}

View file

@ -2,15 +2,31 @@
module Metrics;
event Metrics::log_it(filter: Filter)
event Metrics::finish_period(filter: Filter)
{
local id = filter$id;
local name = filter$name;
local data = store[filter$id, filter$name];
if ( filter?$rollup )
{
for ( index in data )
{
if ( index !in rollup_store )
rollup_store[index] = table();
rollup_store[index][filter$id, filter$name] = data[index];
# If all of the result vals are stored then the rollup callback can be executed.
if ( |rollup_store[index]| == |rollups[filter$rollup]$filters| )
{
rollups[filter$rollup]$callback(index, rollup_store[index]);
}
}
}
if ( filter?$period_finished )
filter$period_finished(network_time(), filter$id, filter$name, data);
write_log(network_time(), filter, store[id, name]);
reset(filter);
schedule filter$every { Metrics::log_it(filter) };
schedule filter$every { Metrics::finish_period(filter) };
}