adapt to new structure

This commit is contained in:
Bernhard Amann 2013-07-24 12:50:01 -07:00
parent 9e0fd963e0
commit b7cdfc0e6e
9 changed files with 23 additions and 140 deletions

View file

@ -10,9 +10,12 @@ set(probabilistic_SRCS
# NOTE(review): this span is a rendered diff hunk — it contains both the
# pre-change and post-change forms of some lines (e.g. "Hasher.cc)" closing
# the list before the change vs. "Hasher.cc" + "HyperLogLog.cc)" after it,
# and the two bro_add_subdir_library() calls below).
BitVector.cc
BloomFilter.cc
CounterVector.cc
Hasher.cc)
Hasher.cc
HyperLogLog.cc)
bif_target(bloom-filter.bif)
# Save the first bif_target's generated-source list so the second
# bif_target() call does not clobber it; the library then links both.
set(BIF_OUTPUT_CC_SAVE ${BIF_OUTPUT_CC})
bif_target(hyper-loglog.bif)
bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC})
bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC_SAVE} ${BIF_OUTPUT_CC})
# Make sure the generated .bif outputs exist before this library builds.
add_dependencies(bro_probabilistic generate_outputs)

View file

@ -0,0 +1,140 @@
// See the file "COPYING" in the main distribution directory for copyright.
#include <math.h>
#include <stdint.h>
#include "HyperLogLog.h"
#include <iostream>
using namespace probabilistic;
// Smallest number of index bits b such that, at confidence level HLL_CONF,
// the estimate stays within the requested error margin (see header).
int CardinalityCounter::optimalB(double error)
	{
	// Starting point from the standard-error formula 1.04/sqrt(2^b),
	// solved for b: x = 2*(log(1.04) - log(error))/log(2).
	const double base = 2 * (log(1.04) - log(error)) / log(2);

	int b = (int) floor(base);

	for ( ; ; )
		{
		++b;

		// k = number of standard deviations covered by this choice of b.
		double k = pow(2, (b - base) / 2);

		// Accept b once k standard deviations give at least HLL_CONF
		// of the probability mass.
		if ( erf(k / sqrt(2)) >= HLL_CONF )
			return b;
		}
	}
// Build a counter with a fixed number of buckets (used when cloning).
// The error margin will be about 1.04/sqrt(size).
CardinalityCounter::CardinalityCounter(uint64_t size)
	{
	m = size;
	buckets = new uint8_t[m];

	// Bias-correction constant alpha_m; small bucket counts use
	// tabulated values, larger ones the closed-form approximation.
	switch ( m )
		{
		case 16:
			alpha_m = 0.673;
			break;
		case 32:
			alpha_m = 0.697;
			break;
		case 64:
			alpha_m = 0.709;
			break;
		default:
			alpha_m = 0.7213 / (1 + 1.079 / m);
			break;
		}

	// Every bucket starts empty ...
	uint64_t i = 0;
	while ( i < m )
		buckets[i++] = 0;

	// ... so all m buckets are currently zero.
	V = m;
	}
// Build a counter sized for the requested error margin: the number of
// buckets m = 2^b is derived from the margin via optimalB().
CardinalityCounter::CardinalityCounter(double error_margin)
	{
	int b = optimalB(error_margin);

	m = (uint64_t) pow(2, b);
	buckets = new uint8_t[m];

	// Bias-correction constant: tabulated for small m, closed form otherwise.
	alpha_m = (m == 16) ? 0.673
	        : (m == 32) ? 0.697
	        : (m == 64) ? 0.709
	        : 0.7213 / (1 + 1.079 / m);

	for ( uint64_t i = 0; i < m; ++i )
		buckets[i] = 0;

	// All buckets are empty, so the zero-bucket count starts at m.
	V = m;
	}
// Release the bucket array.
CardinalityCounter::~CardinalityCounter()
	{
	delete[] buckets;
	}
// Position (1-based) of the first 1-bit of the argument, counted from the
// low end after discarding the b index bits (m = 2^b).
//
// Precondition: hash_modified is divisible by m (addElement() guarantees
// this by subtracting hash % m first).
uint8_t CardinalityCounter::rank(uint64_t hash_modified)
	{
	// Drop the b index bits; they are always 0 here.
	hash_modified = (uint64_t)(hash_modified / m);

	if ( hash_modified == 0 )
		{
		// No 1-bit in the usable part of the hash at all. The loop
		// below would never terminate on 0 (0 % 2 == 0 forever), so
		// return the maximum possible rank: 64 usable bits minus the
		// b discarded index bits, plus 1.
		uint8_t b = 0;
		for ( uint64_t x = m; x > 1; x /= 2 )
			++b;

		return 64 - b + 1;
		}

	uint8_t answer = 0;

	// Pre-shift left once so the loop's first halving restores the value;
	// each pass strips one trailing zero bit.
	hash_modified *= 2;

	do
		{
		hash_modified = (uint64_t)(hash_modified / 2);
		answer++;
		}
	while ( hash_modified % 2 == 0 );

	return answer;
	}
// Fold one hashed element into the sketch: the low b bits select a bucket,
// the remaining bits determine the rank kept in it (bucket-wise maximum).
void CardinalityCounter::addElement(uint64_t hash)
	{
	const uint64_t index = hash % m;

	// A previously-empty bucket is about to receive a non-zero rank,
	// so the count of empty buckets drops.
	if ( buckets[index] == 0 )
		--V;

	// rank() expects its argument to be a multiple of m.
	const uint8_t r = rank(hash - index);

	if ( r > buckets[index] )
		buckets[index] = r;
	}
// Cardinality estimate of the set. Starts from the raw HyperLogLog
// estimate alpha_m * m^2 / sum(2^-buckets[i]) and applies the small- and
// large-range corrections where the raw estimate is known to be biased.
double CardinalityCounter::size()
	{
	double answer = 0;

	for ( unsigned int i = 0; i < m; i++ )
		answer += pow(2, -(int)buckets[i]);

	answer = 1 / answer;
	answer = alpha_m * m * m * answer;

	if ( answer <= 5 * (double)(m / 2) )
		{
		// Small-range correction: linear counting on the number of
		// empty buckets V. If V == 0 the formula would compute
		// log(m / 0); fall back to the raw estimate in that case.
		if ( V == 0 )
			return answer;

		return m * log((double) m / V);
		}
	else if ( answer <= pow(2, 64) / 30 )
		return answer;
	else
		// Large-range correction for hash collisions near 2^64.
		return -pow(2, 64) * log(1 - answer / pow(2, 64));
	}
// Merge c into this counter by taking the bucket-wise maximum, then
// recount the empty buckets. Both counters are assumed to have the same
// number of buckets (see header).
void CardinalityCounter::merge(CardinalityCounter* c)
	{
	const uint8_t* other = c->getBuckets();

	V = 0;

	for ( unsigned int i = 0; i < m; ++i )
		{
		if ( other[i] > buckets[i] )
			buckets[i] = other[i];

		V += (buckets[i] == 0) ? 1 : 0;
		}
	}
// Raw bucket array holding the per-bucket rank estimates (used by merge()).
uint8_t* CardinalityCounter::getBuckets()
	{
	return buckets;
	}
// Number of buckets; intended for statistical purposes only (see header).
uint64_t CardinalityCounter::getM()
	{
	return m;
	}

View file

@ -0,0 +1,125 @@
// See the file "COPYING" in the main distribution directory for copyright.
#ifndef hyperloglog_h
#define hyperloglog_h
#include <stdint.h>
#include <OpaqueVal.h>
namespace probabilistic {
/*
 * HLL_CONF is how confident the estimate given by the counter is.
 *
 * In other words, if the cardinality is estimated to be 100 with a 2% error margin and HLL_CONF is
 * 0.95, then we are 95% sure that the actual cardinality is between 98 and 102.
 */
#define HLL_CONF .95
/*
 * Probabilistic cardinality counter: estimates the number of distinct
 * elements from their 64-bit hashes using a fixed amount of memory.
 */
class CardinalityCounter {
friend class CardinalityVal;
private:
/*
 * This is the number of buckets that will be stored. The standard error is 1.04/sqrt(m), so the
 * actual cardinality will be the estimate +/- 1.04/sqrt(m) with approximately 68% probability.
 */
uint64_t m;
/*
 * These are the actual buckets that are storing an estimate of the cardinality. All these need to
 * do is count when the first 1 bit appears in the bitstring and that location is at most 65, so
 * not that many bits are needed to store it (one uint8_t per bucket suffices).
 */
uint8_t* buckets;
/*
 * There are some state constants that need to be kept track of to make the final estimate easier.
 * V is the number of values in buckets that are 0 and this is used in the small error correction.
 * alpha_m is a multiplicative constant used in the algorithm.
 */
uint64_t V;
double alpha_m;
/*
 * This function will calculate the smallest value of b that will satisfy the constraints of
 * a specified error margin and confidence level.
 *
 * The exact expression for b is as follows:
 * Define x = 2*(log(1.04*k/error)/log(2)). Then b is the ceiling of x.
 *
 * error is the error margin.
 * k is the number of standard deviations that we have to go to have a confidence level of HLL_CONF.
 */
int optimalB(double error);
/*
 * Computes where the first one appears in the element, looking at the bitstring from the end.
 * A precondition is that the argument is already divisible by m, so we just ignore the last b bits,
 * since m = 2^b and the last b bits will always be 0.
 */
uint8_t rank(uint64_t hash_modified);
public:
/*
 * This will be used when cloning. The error margin will be 1.04/sqrt(m) with approximately 68%
 * probability.
 */
CardinalityCounter(uint64_t size);
/*
 * This will initialize the cardinality counter. Based on the error_margin, the number of buckets
 * that need to be kept will be determined. Based on the max_size, the number of bits that will
 * be used from the hash function will be determined.
 *
 * We need the hash function to return integers that are uniformly distributed from 0 to 2^L-1.
 * And if that happens, the maximum cardinality that this counter can handle is approximately 2^L.
 * By default, we will assume a value of 64 bits.
 */
CardinalityCounter(double error_margin);
/*
 * Deletes the class variables (frees the bucket array).
 */
~CardinalityCounter();
/*
 * This will add an element to the counter. It's responsible for adding an element and updating
 * the value of V, if that applies.
 */
void addElement(uint64_t hash);
/*
 * Returns the size estimate of the set. First, it has the "raw" HyperLogLog estimate. And then, we
 * check if it's too "large" or "small" because the raw estimate doesn't do well in those cases.
 * Thus, we correct for those errors as specified in the paper.
 */
double size();
/*
 * Returns the buckets array that holds all of the rough cardinality estimates.
 */
uint8_t* getBuckets();
/*
 * Merges the argument cardinality counter with this one. The error margins are assumed to be the same,
 * so they have the same number of buckets. If any of the conditions are violated, then the return value
 * of size() is meaningless.
 */
void merge(CardinalityCounter* c);
/*
 * Returns the value of m. Should be used only for statistical purposes.
 */
uint64_t getM();
};
}
#endif