Merge remote-tracking branch 'origin/topic/johanna/bit-1612'

Adding a new random seed for external tests.

I added a wrapper around the siphash() function to make calling it a
little bit safer at least.

BIT-1612 #merged

* origin/topic/johanna/bit-1612:
  HLL: Fix missing typecast in test case.
  Remove the -K/-J options for setting keys.
  Add test checking the quality of HLL by adding a lot of elements.
  Fix serializing probabilistic hashers.
  Baseline updates after hash function change.
  Also switch BloomFilters from H3 to siphash.
  Change Hashing from H3 to Siphash.
  HLL: Remove unnecessary comparison.
  Hyperloglog: change calculation of Rho
This commit is contained in:
Robin Sommer 2016-07-14 16:00:03 -07:00
commit 4d84ee82da
347 changed files with 26269 additions and 26053 deletions

View file

@ -28,10 +28,9 @@ int CardinalityCounter::OptimalB(double error, double confidence) const
return answer;
}
void CardinalityCounter::Init(uint64 size)
void CardinalityCounter::Init(uint64_t size)
{
m = size;
buckets = new uint8_t[m];
// The following magic values are taken directly out of the
// description of the HyperLogLog algorithm.
@ -51,60 +50,83 @@ void CardinalityCounter::Init(uint64 size)
else
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be 16, 32, 64 or bigger than 128", size);
for ( uint64 i = 0; i < m; i++ )
buckets[i] = 0;
double calc_p = log2(m);
if ( trunc(calc_p) != calc_p )
reporter->InternalError("Invalid size %" PRIu64 ". Size either has to be a power of 2", size);
p = calc_p;
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
assert(buckets.size() == m);
V = m;
}
CardinalityCounter::CardinalityCounter(CardinalityCounter& other)
: buckets(other.buckets)
{
Init(other.GetM());
Merge(&other);
V = other.V;
alpha_m = other.alpha_m;
m = other.m;
p = other.p;
}
CardinalityCounter::CardinalityCounter(CardinalityCounter&& o)
	{
	// Move constructor: takes over the bucket vector of `o` and copies its
	// scalar state.
	buckets = std::move(o.buckets);

	m = o.m;
	p = o.p;
	V = o.V;
	alpha_m = o.alpha_m;

	// The source's buckets were moved out, so reset its bucket count.
	o.m = 0;
	}
CardinalityCounter::CardinalityCounter(double error_margin, double confidence)
	{
	// OptimalB() yields the exponent for the requested error margin and
	// confidence; the counter is then initialized with 2^exponent buckets.
	const int exponent = OptimalB(error_margin, confidence);
	const uint64 bucket_count = (uint64) pow(2, exponent);
	Init(bucket_count);

	// Init() recomputes p as log2 of the size; it has to match the exponent.
	assert(exponent == p);
	}
CardinalityCounter::CardinalityCounter(uint64 size)
// Constructs a counter with an explicit size; all setup (including the
// validity check on `size`) is delegated to Init().
CardinalityCounter::CardinalityCounter(uint64_t size)
{
Init(size);
}
CardinalityCounter::CardinalityCounter(uint64 arg_size, uint64 arg_V, double arg_alpha_m)
CardinalityCounter::CardinalityCounter(uint64_t arg_size, uint64_t arg_V, double arg_alpha_m)
{
m = arg_size;
buckets = new uint8_t[m];
buckets.reserve(m);
for ( uint64_t i = 0; i < m; i++ )
buckets.push_back(0);
alpha_m = arg_alpha_m;
V = arg_V;
p = log2(m);
}
CardinalityCounter::~CardinalityCounter()
{
delete [] buckets;
}
uint8_t CardinalityCounter::Rank(uint64 hash_modified) const
uint8_t CardinalityCounter::Rank(uint64_t hash_modified) const
{
uint8_t answer = 0;
hash_modified = (uint64)(hash_modified / m);
hash_modified *= 2;
do {
hash_modified = (uint64)(hash_modified / 2);
answer++;
} while ( hash_modified % 2 == 0);
hash_modified = hash_modified >> p;
int answer = 64 - p - CardinalityCounter::flsll(hash_modified) + 1;
assert(answer > 0 && answer < 64);
return answer;
}
void CardinalityCounter::AddElement(uint64 hash)
void CardinalityCounter::AddElement(uint64_t hash)
{
uint64 index = hash % m;
uint64_t index = hash % m;
hash = hash-index;
if( buckets[index] == 0 )
@ -118,7 +140,7 @@ void CardinalityCounter::AddElement(uint64 hash)
/**
* Estimate the size by using the "raw" HyperLogLog estimate. Then,
* check if it's too "large" or "small" because the raw estimate doesn't
* check if it's too "large" or "small" because the raw estimate doesn't
* do well in those cases.
* Thus, we correct for those errors as specified in the paper.
*
@ -149,7 +171,7 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
if ( m != c->GetM() )
return false;
uint8_t* temp = c->GetBuckets();
const vector<uint8_t> temp = c->GetBuckets();
V = 0;
@ -165,12 +187,12 @@ bool CardinalityCounter::Merge(CardinalityCounter* c)
return true;
}
uint8_t* CardinalityCounter::GetBuckets()
// Read-only accessor for the bucket vector. The result is a reference to the
// member, so binding it to a non-reference variable makes a full copy.
const vector<uint8_t> &CardinalityCounter::GetBuckets() const
{
return buckets;
}
uint64 CardinalityCounter::GetM() const
// Returns m, the bucket count this counter was set up with.
uint64_t CardinalityCounter::GetM() const
{
return m;
}
@ -192,7 +214,7 @@ bool CardinalityCounter::Serialize(SerialInfo* info) const
CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
{
uint64_t m;
uint64 V;
uint64_t V;
double alpha_m;
bool valid = true;
@ -202,13 +224,13 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
CardinalityCounter* c = new CardinalityCounter(m, V, alpha_m);
uint8_t* buckets = c->buckets;
vector<uint8_t>& buckets = c->buckets;
for ( unsigned int i = 0; i < m; i++ )
{
char c;
valid &= UNSERIALIZE(&c);
buckets[i] = (uint8)c;
buckets[i] = (uint8_t)c;
}
if ( ! valid )
@ -219,3 +241,51 @@ CardinalityCounter* CardinalityCounter::Unserialize(UnserialInfo* info)
return c;
}
/**
* The following function is copied from libc/string/flsll.c from the FreeBSD source
* tree. Original copyright message follows
*/
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Find Last Set bit
*/
int
CardinalityCounter::flsll(uint64_t mask)
	{
	// Returns the 1-based index of the most significant set bit of `mask`,
	// or 0 when no bit is set.
	if ( mask == 0 )
		return 0;

	// Shift right until only the top bit remains, counting the positions.
	int position = 1;
	while ( mask != 1 )
		{
		mask >>= 1;
		++position;
		}

	return position;
	}