Merge branch 'topic/robin/hyperloglog-merge'

* topic/robin/hyperloglog-merge: (35 commits)
  Making the confidence configurable.
  Renaming HyperLogLog->CardinalityCounter.
  Fixing bug introduced during merging.
  add clustered leak test for hll. No issues.
  make gcc happy
  (hopefully) fix refcounting problem in hll/bloom-filter opaque vals. Thanks Robin.
  re-use same hash class for all add operations
  get hll ready for merging
  and forgot a file...
  adapt to new structure
  fix opaqueval-related memleak.
  make it compile on case-sensitive file systems and fix warnings
  make error rate configurable
  add persistence test not using predetermined random seeds.
  update cluster test to also use hll
  persistence really works.
  well, with this commit, synchronizing the data structure should work... if we had consistent hashing.
  and also serialize the other things we need
  ok, this bug was hard to find.
  serialization compiles.
  ...
This commit is contained in:
Robin Sommer 2013-08-31 10:39:40 -07:00
commit 6f9d28cc18
31 changed files with 1018 additions and 19 deletions

View file

@ -1,4 +1,5 @@
@load ./average
@load ./hll_unique
@load ./last
@load ./max
@load ./min

View file

@ -0,0 +1,62 @@
@load base/frameworks/sumstats
module SumStats;
# Public additions to the SumStats framework for approximate
# unique-value counting (the HLL_UNIQUE calculation).
export {
redef record Reducer += {
## The error margin for HLL. Passed as the first argument to
## hll_cardinality_init when the counter for a result is created.
hll_error_margin: double &default=0.01;
## The confidence for HLL. Passed as the second argument to
## hll_cardinality_init when the counter for a result is created.
hll_confidence: double &default=0.95;
};
redef enum Calculation += {
## Calculate the number of unique values.
HLL_UNIQUE
};
redef record ResultVal += {
## If cardinality is being tracked, the number of unique
## items is tracked here. The value is the result of
## hll_cardinality_estimate converted with double_to_count,
## so it is an estimate rather than an exact count.
hll_unique: count &default=0;
};
}
# Internal state attached to each ResultVal for the HLL_UNIQUE
# calculation. Declared outside the export block on purpose.
redef record ResultVal += {
# Internal use only. This is not meant to be publicly available
# because probabilistic data structures have to be examined using
# specialized bifs.
card: opaque of cardinality &optional;
# Copies of the reducer's HLL parameters, stored when the counter is
# created. The compose hook reads these to initialise the merged
# counter.
hll_error_margin: double &optional;
hll_confidence: double &optional;
};
# Registers the observation handler that implements HLL_UNIQUE.
hook register_observe_plugins()
	{
	register_observe_plugin(HLL_UNIQUE,
		function(r: Reducer, val: double, obs: Observation, rv: ResultVal)
		{
		# First observation for this result: set up the counter and keep
		# a copy of the reducer's parameters alongside it.
		if ( ! rv?$card )
			{
			local margin = r$hll_error_margin;
			local confidence = r$hll_confidence;

			rv$card = hll_cardinality_init(margin, confidence);
			rv$hll_error_margin = margin;
			rv$hll_confidence = confidence;
			rv$hll_unique = 0;
			}

		# Feed the observation into the counter, then refresh the
		# published estimate.
		hll_cardinality_add(rv$card, obs);

		local estimate = hll_cardinality_estimate(rv$card);
		rv$hll_unique = double_to_count(estimate);
		});
	}
# Merges the HLL state of rv1 and rv2 into result.
#
# The card, hll_error_margin and hll_confidence fields are all &optional
# and are only populated by the HLL_UNIQUE observe plugin, so either
# input (or both) may lack them. Accessing an unset optional field is a
# runtime error, hence the ?$ guards below.
#
# result: the ResultVal receiving the merged counter and estimate.
# rv1, rv2: the two partial results being combined.
hook compose_resultvals_hook(result: ResultVal, rv1: ResultVal, rv2: ResultVal)
	{
	# Neither side tracked cardinality: nothing for this plugin to do.
	if ( ! (rv1?$card || rv2?$card) )
		return;

	# Build the merged counter with the parameters of whichever side has
	# one (preferring rv1), and carry those parameters over to the result
	# so that it can itself be composed again later.
	local rhll: opaque of cardinality;

	if ( rv1?$card )
		{
		rhll = hll_cardinality_init(rv1$hll_error_margin, rv1$hll_confidence);
		result$hll_error_margin = rv1$hll_error_margin;
		result$hll_confidence = rv1$hll_confidence;
		}
	else
		{
		rhll = hll_cardinality_init(rv2$hll_error_margin, rv2$hll_confidence);
		result$hll_error_margin = rv2$hll_error_margin;
		result$hll_confidence = rv2$hll_confidence;
		}

	# Only merge the counters that actually exist.
	if ( rv1?$card )
		hll_cardinality_merge_into(rhll, rv1$card);

	if ( rv2?$card )
		hll_cardinality_merge_into(rhll, rv2$card);

	result$card = rhll;
	result$hll_unique = double_to_count(hll_cardinality_estimate(rhll));
	}