mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Merge branch 'topic/robin/bloom-filter-merge'
* topic/robin/bloom-filter-merge: (50 commits) Support emptiness check on Bloom filters. Refactor Bloom filter merging. Add bloomfilter_clear() BiF. Updating NEWS. Broifying the code. Implement and test Bloom filter merging. Make hash functions equality comparable. Make counter vectors mergeable. Use half adder for bitwise addition and subtraction. Fix and test counting Bloom filter. Implement missing CounterVector functions. Tweak hasher interface. Add missing include for GCC. Fixing for unserializion error. Small fixes and style tweaks. Only serialize Bloom filter type if available. Create hash policies through factory. Remove lingering debug code. Factor implementation and change interface. Expose Bro's linear congruence PRNG as utility function. ...
This commit is contained in:
commit
599dadf30b
29 changed files with 2821 additions and 67 deletions
13
CHANGES
13
CHANGES
|
@ -1,4 +1,17 @@
|
|||
|
||||
2.1-880 | 2013-07-24 15:49:31 -0700
|
||||
|
||||
* Support for Bloom filter. (Matthias Vallentin)
|
||||
|
||||
Bro now provides the following BiFs:
|
||||
|
||||
bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_add(bf: opaque of bloomfilter, x: any)
|
||||
bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count
|
||||
bloomfilter_merge(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter): opaque of bloomfilter
|
||||
bloomfilter_clear(bf: opaque of bloomfilter)
|
||||
|
||||
2.1-824 | 2013-07-22 14:25:14 -0400
|
||||
|
||||
* Fixed a scriptland state issue that manifested especially badly on proxies. (Seth Hall)
|
||||
|
|
13
NEWS
13
NEWS
|
@ -108,6 +108,19 @@ New Functionality
|
|||
shunting, and sampling; plus plugin support to customize filters
|
||||
dynamically.
|
||||
|
||||
- Bro now provides Bloom filters of two kinds: basic Bloom filters
|
||||
supporting membership tests, and counting Bloom filters that track
|
||||
the frequency of elements. The corresponding functions are:
|
||||
|
||||
bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter
|
||||
bloomfilter_add(bf: opaque of bloomfilter, x: any)
|
||||
bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count
|
||||
bloomfilter_merge(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter): opaque of bloomfilter
|
||||
bloomfilter_clear(bf: opaque of bloomfilter)
|
||||
|
||||
See <INSERT LINK> for full documentation.
|
||||
|
||||
Changed Functionality
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
|||
2.1-824
|
||||
2.1-880
|
||||
|
|
|
@ -705,6 +705,7 @@ type entropy_test_result: record {
|
|||
@load base/bif/strings.bif
|
||||
@load base/bif/bro.bif
|
||||
@load base/bif/reporter.bif
|
||||
@load base/bif/bloom-filter.bif
|
||||
|
||||
## Deprecated. This is superseded by the new logging framework.
|
||||
global log_file_name: function(tag: string): string &redef;
|
||||
|
|
|
@ -150,6 +150,7 @@ set(bro_PLUGIN_LIBS CACHE INTERNAL "plugin libraries" FORCE)
|
|||
|
||||
add_subdirectory(analyzer)
|
||||
add_subdirectory(file_analysis)
|
||||
add_subdirectory(probabilistic)
|
||||
|
||||
set(bro_SUBDIRS
|
||||
${bro_SUBDIR_LIBS}
|
||||
|
|
|
@ -560,6 +560,9 @@ void builtin_error(const char* msg, BroObj* arg)
|
|||
#include "reporter.bif.func_def"
|
||||
#include "strings.bif.func_def"
|
||||
|
||||
// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically.
|
||||
#include "probabilistic/bloom-filter.bif.h"
|
||||
|
||||
void init_builtin_funcs()
|
||||
{
|
||||
bro_resources = internal_type("bro_resources")->AsRecordType();
|
||||
|
@ -574,6 +577,9 @@ void init_builtin_funcs()
|
|||
#include "reporter.bif.func_init"
|
||||
#include "strings.bif.func_init"
|
||||
|
||||
// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically.
|
||||
#include "probabilistic/bloom-filter.bif.init.cc"
|
||||
|
||||
did_builtin_init = true;
|
||||
}
|
||||
|
||||
|
|
80
src/H3.h
80
src/H3.h
|
@ -49,23 +49,49 @@
|
|||
// hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed
|
||||
// together to get the same result as hashing the full string.
|
||||
// Any number of hash functions can be created by creating new instances of H3,
|
||||
// with the same or different template parameters. The hash function is
|
||||
// randomly generated using bro_random(); you must call init_random_seed()
|
||||
// before the H3 constructor if you wish to seed it.
|
||||
// with the same or different template parameters. The hash function
|
||||
// constructor takes a seed as argument which defaults to a call to
|
||||
// bro_random().
|
||||
|
||||
|
||||
#ifndef H3_H
|
||||
#define H3_H
|
||||
|
||||
#include <climits>
|
||||
#include <cstring>
|
||||
|
||||
// The number of values representable by a byte.
|
||||
#define H3_BYTE_RANGE (UCHAR_MAX+1)
|
||||
|
||||
template<class T, int N> class H3 {
|
||||
T byte_lookup[N][H3_BYTE_RANGE];
|
||||
template <typename T, int N>
|
||||
class H3 {
|
||||
public:
|
||||
H3();
|
||||
/**
 * Constructs an H3 hash function by filling the per-byte lookup tables
 * from a pseudo-random sequence.
 *
 * @param seed Starting value for the bro_prng() sequence; defaults to a
 * call to bro_random().
 */
H3(T seed = bro_random())
	{
	T bit_lookup[N * CHAR_BIT];

	for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ )
		{
		bit_lookup[bit] = 0;

		for ( size_t i = 0; i < sizeof(T)/2; i++ )
			{
			// Advance the generator for every 16-bit chunk. Advancing it
			// only once per bit would replicate the same 16 bits across
			// the whole word. Only the low 16 bits of each draw are used.
			seed = bro_prng(seed);
			bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF);
			}
		}

	// Precompute, for every byte position and byte value, the XOR of the
	// bit_lookup entries selected by the bits set in that value.
	for ( size_t byte = 0; byte < N; byte++ )
		{
		for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ )
			{
			byte_lookup[byte][val] = 0;

			for ( size_t bit = 0; bit < CHAR_BIT; bit++ )
				// Does this mean byte_lookup[*][0] == 0? -RP
				if (val & (1 << bit))
					byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit];
			}
		}
	}
|
||||
|
||||
T operator()(const void* data, size_t size, size_t offset = 0) const
|
||||
{
|
||||
const unsigned char *p = static_cast<const unsigned char*>(data);
|
||||
|
@ -73,7 +99,7 @@ public:
|
|||
|
||||
// loop optimized with Duff's Device
|
||||
register unsigned n = (size + 7) / 8;
|
||||
switch (size % 8) {
|
||||
switch ( size % 8 ) {
|
||||
case 0: do { result ^= byte_lookup[offset++][*p++];
|
||||
case 7: result ^= byte_lookup[offset++][*p++];
|
||||
case 6: result ^= byte_lookup[offset++][*p++];
|
||||
|
@ -82,36 +108,24 @@ public:
|
|||
case 3: result ^= byte_lookup[offset++][*p++];
|
||||
case 2: result ^= byte_lookup[offset++][*p++];
|
||||
case 1: result ^= byte_lookup[offset++][*p++];
|
||||
} while (--n > 0);
|
||||
} while ( --n > 0 );
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
friend bool operator==(const H3& x, const H3& y)
|
||||
{
|
||||
return ! std::memcmp(x.byte_lookup, y.byte_lookup, N * H3_BYTE_RANGE);
|
||||
}
|
||||
|
||||
friend bool operator!=(const H3& x, const H3& y)
|
||||
{
|
||||
return ! (x == y);
|
||||
}
|
||||
|
||||
private:
|
||||
T byte_lookup[N][H3_BYTE_RANGE];
|
||||
};
|
||||
|
||||
template<class T, int N>
|
||||
H3<T,N>::H3()
|
||||
{
|
||||
T bit_lookup[N * CHAR_BIT];
|
||||
|
||||
for (size_t bit = 0; bit < N * CHAR_BIT; bit++) {
|
||||
bit_lookup[bit] = 0;
|
||||
for (size_t i = 0; i < sizeof(T)/2; i++) {
|
||||
// assume random() returns at least 16 random bits
|
||||
bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t byte = 0; byte < N; byte++) {
|
||||
for (unsigned val = 0; val < H3_BYTE_RANGE; val++) {
|
||||
byte_lookup[byte][val] = 0;
|
||||
for (size_t bit = 0; bit < CHAR_BIT; bit++) {
|
||||
// Does this mean byte_lookup[*][0] == 0? -RP
|
||||
if (val & (1 << bit))
|
||||
byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif //H3_H
|
||||
|
|
|
@ -242,6 +242,7 @@ OpaqueType* md5_type;
|
|||
OpaqueType* sha1_type;
|
||||
OpaqueType* sha256_type;
|
||||
OpaqueType* entropy_type;
|
||||
OpaqueType* bloomfilter_type;
|
||||
|
||||
#include "const.bif.netvar_def"
|
||||
#include "types.bif.netvar_def"
|
||||
|
@ -307,6 +308,7 @@ void init_general_global_var()
|
|||
sha1_type = new OpaqueType("sha1");
|
||||
sha256_type = new OpaqueType("sha256");
|
||||
entropy_type = new OpaqueType("entropy");
|
||||
bloomfilter_type = new OpaqueType("bloomfilter");
|
||||
}
|
||||
|
||||
void init_net_var()
|
||||
|
|
|
@ -247,6 +247,7 @@ extern OpaqueType* md5_type;
|
|||
extern OpaqueType* sha1_type;
|
||||
extern OpaqueType* sha256_type;
|
||||
extern OpaqueType* entropy_type;
|
||||
extern OpaqueType* bloomfilter_type;
|
||||
|
||||
// Initializes globals that don't pertain to network/event analysis.
|
||||
extern void init_general_global_var();
|
||||
|
|
151
src/OpaqueVal.cc
151
src/OpaqueVal.cc
|
@ -1,3 +1,5 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include "OpaqueVal.h"
|
||||
#include "NetVar.h"
|
||||
#include "Reporter.h"
|
||||
|
@ -515,3 +517,152 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info)
|
|||
|
||||
return true;
|
||||
}
|
||||
|
||||
BloomFilterVal::BloomFilterVal()
|
||||
: OpaqueVal(bloomfilter_type)
|
||||
{
|
||||
type = 0;
|
||||
hash = 0;
|
||||
bloom_filter = 0;
|
||||
}
|
||||
|
||||
BloomFilterVal::BloomFilterVal(OpaqueType* t)
|
||||
: OpaqueVal(t)
|
||||
{
|
||||
type = 0;
|
||||
hash = 0;
|
||||
bloom_filter = 0;
|
||||
}
|
||||
|
||||
BloomFilterVal::BloomFilterVal(probabilistic::BloomFilter* bf)
|
||||
: OpaqueVal(bloomfilter_type)
|
||||
{
|
||||
type = 0;
|
||||
hash = 0;
|
||||
bloom_filter = bf;
|
||||
}
|
||||
|
||||
bool BloomFilterVal::Typify(BroType* arg_type)
|
||||
{
|
||||
if ( type )
|
||||
return false;
|
||||
|
||||
type = arg_type;
|
||||
type->Ref();
|
||||
|
||||
TypeList* tl = new TypeList(type);
|
||||
tl->Append(type);
|
||||
hash = new CompositeHash(tl);
|
||||
Unref(tl);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
BroType* BloomFilterVal::Type() const
|
||||
{
|
||||
return type;
|
||||
}
|
||||
|
||||
void BloomFilterVal::Add(const Val* val)
|
||||
{
|
||||
HashKey* key = hash->ComputeHash(val, 1);
|
||||
bloom_filter->Add(key->Hash());
|
||||
delete key;
|
||||
}
|
||||
|
||||
size_t BloomFilterVal::Count(const Val* val) const
|
||||
{
|
||||
HashKey* key = hash->ComputeHash(val, 1);
|
||||
size_t cnt = bloom_filter->Count(key->Hash());
|
||||
delete key;
|
||||
return cnt;
|
||||
}
|
||||
|
||||
void BloomFilterVal::Clear()
|
||||
{
|
||||
bloom_filter->Clear();
|
||||
}
|
||||
|
||||
bool BloomFilterVal::Empty() const
|
||||
{
|
||||
return bloom_filter->Empty();
|
||||
}
|
||||
|
||||
// Merges two Bloom filter values into a newly allocated one.
//
// Both arguments must have the same element type and wrap the same concrete
// Bloom filter class. Neither argument is modified: the result is built
// from a clone of x's filter into which y's filter is merged.
//
// Returns the merged value, or 0 after reporting an error if the filters
// are incompatible or the merge fails. No allocation made here is leaked on
// the error paths.
BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x,
				      const BloomFilterVal* y)
	{
	if ( ! same_type(x->Type(), y->Type()) )
		{
		reporter->Error("cannot merge Bloom filters with different types");
		return 0;
		}

	if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) )
		{
		reporter->Error("cannot merge different Bloom filter types");
		return 0;
		}

	probabilistic::BloomFilter* copy = x->bloom_filter->Clone();

	if ( ! copy->Merge(y->bloom_filter) )
		{
		reporter->Error("failed to merge Bloom filter");
		// The clone is still owned by us at this point.
		delete copy;
		return 0;
		}

	// From here on, merged owns copy and deletes it in its destructor.
	BloomFilterVal* merged = new BloomFilterVal(copy);

	if ( ! merged->Typify(x->Type()) )
		{
		reporter->Error("failed to set type on merged Bloom filter");
		Unref(merged);
		return 0;
		}

	return merged;
	}
|
||||
|
||||
BloomFilterVal::~BloomFilterVal()
|
||||
{
|
||||
Unref(type);
|
||||
delete hash;
|
||||
delete bloom_filter;
|
||||
}
|
||||
|
||||
IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL);
|
||||
|
||||
bool BloomFilterVal::DoSerialize(SerialInfo* info) const
|
||||
{
|
||||
DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal);
|
||||
|
||||
bool is_typed = (type != 0);
|
||||
|
||||
if ( ! SERIALIZE(is_typed) )
|
||||
return false;
|
||||
|
||||
if ( is_typed && ! type->Serialize(info) )
|
||||
return false;
|
||||
|
||||
return bloom_filter->Serialize(info);
|
||||
}
|
||||
|
||||
// Restores a BloomFilterVal from a serialization stream.
//
// Reads, in order: the OpaqueVal base state, a flag telling whether an
// element type follows, the element type if flagged, and the Bloom filter
// itself. Returns false if any step fails.
bool BloomFilterVal::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(OpaqueVal);

	bool is_typed;
	if ( ! UNSERIALIZE(&is_typed) )
		return false;

	if ( is_typed )
		{
		// Use a distinct name so the member 'type' is not shadowed.
		BroType* t = BroType::Unserialize(info);

		// Typify() dereferences its argument, so bail out on failure.
		if ( ! t )
			return false;

		// Typify() takes its own reference on success, so drop ours
		// whether or not it succeeded.
		bool typified = Typify(t);
		Unref(t);

		if ( ! typified )
			return false;
		}

	bloom_filter = probabilistic::BloomFilter::Unserialize(info);
	return bloom_filter != 0;
	}
|
||||
|
|
|
@ -3,10 +3,18 @@
|
|||
#ifndef OPAQUEVAL_H
|
||||
#define OPAQUEVAL_H
|
||||
|
||||
#include <typeinfo>
|
||||
|
||||
#include "RandTest.h"
|
||||
#include "Val.h"
|
||||
#include "digest.h"
|
||||
|
||||
#include "probabilistic/BloomFilter.h"
|
||||
|
||||
namespace probabilistic {
|
||||
class BloomFilter;
|
||||
}
|
||||
|
||||
class HashVal : public OpaqueVal {
|
||||
public:
|
||||
virtual bool IsValid() const;
|
||||
|
@ -107,4 +115,37 @@ private:
|
|||
RandTest state;
|
||||
};
|
||||
|
||||
class BloomFilterVal : public OpaqueVal {
|
||||
public:
|
||||
explicit BloomFilterVal(probabilistic::BloomFilter* bf);
|
||||
virtual ~BloomFilterVal();
|
||||
|
||||
BroType* Type() const;
|
||||
bool Typify(BroType* type);
|
||||
|
||||
void Add(const Val* val);
|
||||
size_t Count(const Val* val) const;
|
||||
void Clear();
|
||||
bool Empty() const;
|
||||
|
||||
static BloomFilterVal* Merge(const BloomFilterVal* x,
|
||||
const BloomFilterVal* y);
|
||||
|
||||
protected:
|
||||
friend class Val;
|
||||
BloomFilterVal();
|
||||
BloomFilterVal(OpaqueType* t);
|
||||
|
||||
DECLARE_SERIAL(BloomFilterVal);
|
||||
|
||||
private:
|
||||
// Disable.
|
||||
BloomFilterVal(const BloomFilterVal&);
|
||||
BloomFilterVal& operator=(const BloomFilterVal&);
|
||||
|
||||
BroType* type;
|
||||
CompositeHash* hash;
|
||||
probabilistic::BloomFilter* bloom_filter;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -49,6 +49,9 @@ SERIAL_IS(STATE_ACCESS, 0x1100)
|
|||
SERIAL_IS_BO(CASE, 0x1200)
|
||||
SERIAL_IS(LOCATION, 0x1300)
|
||||
SERIAL_IS(RE_MATCHER, 0x1400)
|
||||
SERIAL_IS(BITVECTOR, 0x1500)
|
||||
SERIAL_IS(COUNTERVECTOR, 0x1600)
|
||||
SERIAL_IS(BLOOMFILTER, 0x1700)
|
||||
|
||||
// These are the externally visible types.
|
||||
const SerialType SER_NONE = 0;
|
||||
|
@ -104,6 +107,7 @@ SERIAL_VAL(MD5_VAL, 16)
|
|||
SERIAL_VAL(SHA1_VAL, 17)
|
||||
SERIAL_VAL(SHA256_VAL, 18)
|
||||
SERIAL_VAL(ENTROPY_VAL, 19)
|
||||
SERIAL_VAL(BLOOMFILTER_VAL, 20)
|
||||
|
||||
#define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR)
|
||||
SERIAL_EXPR(EXPR, 1)
|
||||
|
@ -197,10 +201,17 @@ SERIAL_FUNC(BRO_FUNC, 2)
|
|||
SERIAL_FUNC(DEBUG_FUNC, 3)
|
||||
SERIAL_FUNC(BUILTIN_FUNC, 4)
|
||||
|
||||
#define SERIAL_BLOOMFILTER(name, val) SERIAL_CONST(name, val, BLOOMFILTER)
|
||||
SERIAL_BLOOMFILTER(BLOOMFILTER, 1)
|
||||
SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2)
|
||||
SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3)
|
||||
|
||||
SERIAL_CONST2(ID)
|
||||
SERIAL_CONST2(STATE_ACCESS)
|
||||
SERIAL_CONST2(CASE)
|
||||
SERIAL_CONST2(LOCATION)
|
||||
SERIAL_CONST2(RE_MATCHER)
|
||||
SERIAL_CONST2(BITVECTOR)
|
||||
SERIAL_CONST2(COUNTERVECTOR)
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1311,19 +1311,20 @@ IMPLEMENT_SERIAL(OpaqueType, SER_OPAQUE_TYPE);
|
|||
bool OpaqueType::DoSerialize(SerialInfo* info) const
|
||||
{
|
||||
DO_SERIALIZE(SER_OPAQUE_TYPE, BroType);
|
||||
return SERIALIZE(name);
|
||||
return SERIALIZE_STR(name.c_str(), name.size());
|
||||
}
|
||||
|
||||
bool OpaqueType::DoUnserialize(UnserialInfo* info)
|
||||
{
|
||||
DO_UNSERIALIZE(BroType);
|
||||
|
||||
char const* n;
|
||||
const char* n;
|
||||
if ( ! UNSERIALIZE_STR(&n, 0) )
|
||||
return false;
|
||||
|
||||
name = n;
|
||||
delete [] n;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -4975,4 +4975,3 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr
|
|||
(enum ip_addr_anonymization_class_t) anon_class));
|
||||
}
|
||||
%}
|
||||
|
||||
|
|
578
src/probabilistic/BitVector.cc
Normal file
578
src/probabilistic/BitVector.cc
Normal file
|
@ -0,0 +1,578 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include "BitVector.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <limits>
|
||||
#include "Serializer.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
BitVector::size_type BitVector::npos = static_cast<BitVector::size_type>(-1);
|
||||
BitVector::block_type BitVector::bits_per_block =
|
||||
std::numeric_limits<BitVector::block_type>::digits;
|
||||
|
||||
namespace {
|
||||
|
||||
uint8_t count_table[] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2,
|
||||
3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3,
|
||||
3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3,
|
||||
4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,
|
||||
3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5,
|
||||
6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4,
|
||||
4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
|
||||
6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3,
|
||||
4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6,
|
||||
6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
} // namespace <anonymous>
|
||||
|
||||
BitVector::Reference::Reference(block_type& block, block_type i)
|
||||
: block(block), mask((block_type(1) << i))
|
||||
{
|
||||
assert(i < bits_per_block);
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::Flip()
|
||||
{
|
||||
block ^= mask;
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::Reference::operator bool() const
|
||||
{
|
||||
return (block & mask) != 0;
|
||||
}
|
||||
|
||||
bool BitVector::Reference::operator~() const
|
||||
{
|
||||
return (block & mask) == 0;
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::operator=(bool x)
|
||||
{
|
||||
if ( x )
|
||||
block |= mask;
|
||||
else
|
||||
block &= ~mask;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::operator=(const Reference& other)
|
||||
{
|
||||
if ( other )
|
||||
block |= mask;
|
||||
else
|
||||
block &= ~mask;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::operator|=(bool x)
|
||||
{
|
||||
if ( x )
|
||||
block |= mask;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::operator&=(bool x)
|
||||
{
|
||||
if ( ! x )
|
||||
block &= ~mask;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::operator^=(bool x)
|
||||
{
|
||||
if ( x )
|
||||
block ^= mask;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::Reference& BitVector::Reference::operator-=(bool x)
|
||||
{
|
||||
if ( x )
|
||||
block &= ~mask;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::BitVector()
|
||||
{
|
||||
num_bits = 0;
|
||||
}
|
||||
|
||||
BitVector::BitVector(size_type size, bool value)
|
||||
: bits(bits_to_blocks(size), value ? ~block_type(0) : 0)
|
||||
{
|
||||
num_bits = size;
|
||||
}
|
||||
|
||||
BitVector::BitVector(BitVector const& other)
|
||||
: bits(other.bits)
|
||||
{
|
||||
num_bits = other.num_bits;
|
||||
}
|
||||
|
||||
BitVector BitVector::operator~() const
|
||||
{
|
||||
BitVector b(*this);
|
||||
b.Flip();
|
||||
return b;
|
||||
}
|
||||
|
||||
// Assigns another bit vector to this instance.
//
// Copies both the block storage and the logical bit count. Copying only
// the blocks would leave Size() reporting the old length.
BitVector& BitVector::operator=(BitVector const& other)
	{
	bits = other.bits;
	num_bits = other.num_bits;
	return *this;
	}
|
||||
|
||||
BitVector BitVector::operator<<(size_type n) const
|
||||
{
|
||||
BitVector b(*this);
|
||||
return b <<= n;
|
||||
}
|
||||
|
||||
BitVector BitVector::operator>>(size_type n) const
|
||||
{
|
||||
BitVector b(*this);
|
||||
return b >>= n;
|
||||
}
|
||||
|
||||
BitVector& BitVector::operator<<=(size_type n)
|
||||
{
|
||||
if ( n >= num_bits )
|
||||
return Reset();
|
||||
|
||||
if ( n > 0 )
|
||||
{
|
||||
size_type last = Blocks() - 1;
|
||||
size_type div = n / bits_per_block;
|
||||
block_type r = bit_index(n);
|
||||
block_type* b = &bits[0];
|
||||
|
||||
assert(Blocks() >= 1);
|
||||
assert(div <= last);
|
||||
|
||||
if ( r != 0 )
|
||||
{
|
||||
for ( size_type i = last - div; i > 0; --i )
|
||||
b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r));
|
||||
|
||||
b[div] = b[0] << r;
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
for (size_type i = last-div; i > 0; --i)
|
||||
b[i + div] = b[i];
|
||||
|
||||
b[div] = b[0];
|
||||
}
|
||||
|
||||
std::fill_n(b, div, block_type(0));
|
||||
zero_unused_bits();
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Shifts all bits towards lower indices by n positions, filling the
// vacated high positions with zeros. Shifting by Size() or more clears
// the whole vector.
BitVector& BitVector::operator>>=(size_type n)
	{
	if ( n >= num_bits )
		return Reset();

	if ( n > 0 )
		{
		size_type last = Blocks() - 1;
		size_type div = n / bits_per_block;	// Whole blocks to shift.
		block_type r = bit_index(n);		// Remaining bits within a block.
		block_type* b = &bits[0];

		assert(Blocks() >= 1);
		assert(div <= last);

		if ( r != 0 )
			{
			// Walk upwards from the first source block, so every read
			// (b[i], b[i + 1]) happens before that block is overwritten
			// and both indices stay within [0, last].
			for ( size_type i = div; i < last; ++i )
				b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r));

			// The top block has no higher neighbour to pull bits from.
			b[last - div] = b[last] >> r;
			}

		else
			{
			for ( size_type i = div; i <= last; ++i )
				b[i - div] = b[i];
			}

		// Clear the div blocks vacated at the high end.
		std::fill_n(b + (Blocks() - div), div, block_type(0));
		}

	return *this;
	}
|
||||
|
||||
// In-place bitwise AND with another bit vector.
//
// The other vector may be shorter than this one. Only blocks present in
// both are combined; any further blocks of this vector are cleared, i.e.
// the missing bits of the shorter operand count as zero.
// NOTE(review): this matches what already happens inside the last shared
// block, provided other keeps its unused bits zeroed -- confirm this is the
// intended semantics for operands of different size.
BitVector& BitVector::operator&=(BitVector const& other)
	{
	assert(Size() >= other.Size());

	// Never index other.bits past its end.
	size_type shared = Blocks() < other.Blocks() ? Blocks() : other.Blocks();

	for ( size_type i = 0; i < shared; ++i )
		bits[i] &= other.bits[i];

	for ( size_type i = shared; i < Blocks(); ++i )
		bits[i] = block_type(0);

	return *this;
	}

// In-place bitwise OR with another bit vector.
//
// The other vector may be shorter than this one. Blocks beyond its end are
// left unchanged.
BitVector& BitVector::operator|=(BitVector const& other)
	{
	assert(Size() >= other.Size());

	// Never index other.bits past its end.
	size_type shared = Blocks() < other.Blocks() ? Blocks() : other.Blocks();

	for ( size_type i = 0; i < shared; ++i )
		bits[i] |= other.bits[i];

	return *this;
	}

// In-place bitwise XOR with another bit vector.
//
// The other vector may be shorter than this one. Blocks beyond its end are
// left unchanged.
BitVector& BitVector::operator^=(BitVector const& other)
	{
	assert(Size() >= other.Size());

	// Never index other.bits past its end.
	size_type shared = Blocks() < other.Blocks() ? Blocks() : other.Blocks();

	for ( size_type i = 0; i < shared; ++i )
		bits[i] ^= other.bits[i];

	return *this;
	}

// In-place set difference: clears every bit of this vector that is set in
// the other one.
//
// The other vector may be shorter than this one. Blocks beyond its end are
// left unchanged.
BitVector& BitVector::operator-=(BitVector const& other)
	{
	assert(Size() >= other.Size());

	// Never index other.bits past its end.
	size_type shared = Blocks() < other.Blocks() ? Blocks() : other.Blocks();

	for ( size_type i = 0; i < shared; ++i )
		bits[i] &= ~other.bits[i];

	return *this;
	}
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
BitVector operator&(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
BitVector b(x);
|
||||
return b &= y;
|
||||
}
|
||||
|
||||
BitVector operator|(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
BitVector b(x);
|
||||
return b |= y;
|
||||
}
|
||||
|
||||
BitVector operator^(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
BitVector b(x);
|
||||
return b ^= y;
|
||||
}
|
||||
|
||||
BitVector operator-(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
BitVector b(x);
|
||||
return b -= y;
|
||||
}
|
||||
|
||||
bool operator==(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
return x.num_bits == y.num_bits && x.bits == y.bits;
|
||||
}
|
||||
|
||||
bool operator!=(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
return ! (x == y);
|
||||
}
|
||||
|
||||
bool operator<(BitVector const& x, BitVector const& y)
|
||||
{
|
||||
assert(x.Size() == y.Size());
|
||||
|
||||
for ( BitVector::size_type r = x.Blocks(); r > 0; --r )
|
||||
{
|
||||
BitVector::size_type i = r - 1;
|
||||
|
||||
if ( x.bits[i] < y.bits[i] )
|
||||
return true;
|
||||
|
||||
else if ( x.bits[i] > y.bits[i] )
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void BitVector::Resize(size_type n, bool value)
|
||||
{
|
||||
size_type old = Blocks();
|
||||
size_type required = bits_to_blocks(n);
|
||||
block_type block_value = value ? ~block_type(0) : block_type(0);
|
||||
|
||||
if ( required != old )
|
||||
bits.resize(required, block_value);
|
||||
|
||||
if ( value && (n > num_bits) && extra_bits() )
|
||||
bits[old - 1] |= (block_value << extra_bits());
|
||||
|
||||
num_bits = n;
|
||||
zero_unused_bits();
|
||||
}
|
||||
|
||||
void BitVector::Clear()
|
||||
{
|
||||
bits.clear();
|
||||
num_bits = 0;
|
||||
}
|
||||
|
||||
void BitVector::PushBack(bool bit)
|
||||
{
|
||||
size_type s = Size();
|
||||
Resize(s + 1);
|
||||
Set(s, bit);
|
||||
}
|
||||
|
||||
void BitVector::Append(block_type block)
|
||||
{
|
||||
size_type excess = extra_bits();
|
||||
|
||||
if ( excess )
|
||||
{
|
||||
assert(! Empty());
|
||||
bits.push_back(block >> (bits_per_block - excess));
|
||||
bits[Blocks() - 2] |= (block << excess);
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
bits.push_back(block);
|
||||
}
|
||||
|
||||
num_bits += bits_per_block;
|
||||
}
|
||||
|
||||
BitVector& BitVector::Set(size_type i, bool bit)
|
||||
{
|
||||
assert(i < num_bits);
|
||||
|
||||
if ( bit )
|
||||
bits[block_index(i)] |= bit_mask(i);
|
||||
else
|
||||
Reset(i);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector& BitVector::Set()
|
||||
{
|
||||
std::fill(bits.begin(), bits.end(), ~block_type(0));
|
||||
zero_unused_bits();
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector& BitVector::Reset(size_type i)
|
||||
{
|
||||
assert(i < num_bits);
|
||||
bits[block_index(i)] &= ~bit_mask(i);
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector& BitVector::Reset()
|
||||
{
|
||||
std::fill(bits.begin(), bits.end(), block_type(0));
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector& BitVector::Flip(size_type i)
|
||||
{
|
||||
assert(i < num_bits);
|
||||
bits[block_index(i)] ^= bit_mask(i);
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector& BitVector::Flip()
|
||||
{
|
||||
for (size_type i = 0; i < Blocks(); ++i)
|
||||
bits[i] = ~bits[i];
|
||||
|
||||
zero_unused_bits();
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool BitVector::operator[](size_type i) const
|
||||
{
|
||||
assert(i < num_bits);
|
||||
return (bits[block_index(i)] & bit_mask(i)) != 0;
|
||||
}
|
||||
|
||||
BitVector::Reference BitVector::operator[](size_type i)
|
||||
{
|
||||
assert(i < num_bits);
|
||||
return Reference(bits[block_index(i)], bit_index(i));
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::Count() const
|
||||
{
|
||||
std::vector<block_type>::const_iterator first = bits.begin();
|
||||
size_t n = 0;
|
||||
size_type length = Blocks();
|
||||
|
||||
while ( length )
|
||||
{
|
||||
block_type block = *first;
|
||||
|
||||
while ( block )
|
||||
{
|
||||
// TODO: use _popcnt if available.
|
||||
n += count_table[block & ((1u << 8) - 1)];
|
||||
block >>= 8;
|
||||
}
|
||||
|
||||
++first;
|
||||
--length;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::Blocks() const
|
||||
{
|
||||
return bits.size();
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::Size() const
|
||||
{
|
||||
return num_bits;
|
||||
}
|
||||
|
||||
bool BitVector::Empty() const
|
||||
{
|
||||
return bits.empty();
|
||||
}
|
||||
|
||||
bool BitVector::AllZero() const
|
||||
{
|
||||
for ( size_t i = 0; i < bits.size(); ++i )
|
||||
{
|
||||
if ( bits[i] )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::FindFirst() const
|
||||
{
|
||||
return find_from(0);
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::FindNext(size_type i) const
|
||||
{
|
||||
if ( i >= (Size() - 1) || Size() == 0 )
|
||||
return npos;
|
||||
|
||||
++i;
|
||||
size_type bi = block_index(i);
|
||||
block_type block = bits[bi] & (~block_type(0) << bit_index(i));
|
||||
return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1);
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::lowest_bit(block_type block)
|
||||
{
|
||||
block_type x = block - (block & (block - 1));
|
||||
size_type log = 0;
|
||||
|
||||
while (x >>= 1)
|
||||
++log;
|
||||
|
||||
return log;
|
||||
}
|
||||
|
||||
BitVector::block_type BitVector::extra_bits() const
|
||||
{
|
||||
return bit_index(Size());
|
||||
}
|
||||
|
||||
void BitVector::zero_unused_bits()
|
||||
{
|
||||
if ( extra_bits() )
|
||||
bits.back() &= ~(~block_type(0) << extra_bits());
|
||||
}
|
||||
|
||||
BitVector::size_type BitVector::find_from(size_type i) const
|
||||
{
|
||||
while (i < Blocks() && bits[i] == 0)
|
||||
++i;
|
||||
|
||||
if ( i >= Blocks() )
|
||||
return npos;
|
||||
|
||||
return i * bits_per_block + lowest_bit(bits[i]);
|
||||
}
|
||||
|
||||
bool BitVector::Serialize(SerialInfo* info) const
|
||||
{
|
||||
return SerialObj::Serialize(info);
|
||||
}
|
||||
|
||||
BitVector* BitVector::Unserialize(UnserialInfo* info)
|
||||
{
|
||||
return reinterpret_cast<BitVector*>(SerialObj::Unserialize(info, SER_BITVECTOR));
|
||||
}
|
||||
|
||||
IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR);
|
||||
|
||||
bool BitVector::DoSerialize(SerialInfo* info) const
|
||||
{
|
||||
DO_SERIALIZE(SER_BITVECTOR, SerialObj);
|
||||
|
||||
if ( ! SERIALIZE(static_cast<uint64>(bits.size())) )
|
||||
return false;
|
||||
|
||||
for ( size_t i = 0; i < bits.size(); ++i )
|
||||
if ( ! SERIALIZE(static_cast<uint64>(bits[i])) )
|
||||
return false;
|
||||
|
||||
return SERIALIZE(static_cast<uint64>(num_bits));
|
||||
}
|
||||
|
||||
// Restores a bit vector from a serialization stream.
//
// Reads the fields in the order DoSerialize() writes them: the number of
// blocks, each block as a 64-bit value, and finally the number of bits.
// Returns false if any read fails.
bool BitVector::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(SerialObj);

	uint64 size;
	if ( ! UNSERIALIZE(&size) )
		return false;

	bits.resize(static_cast<size_t>(size));

	for ( size_t i = 0; i < bits.size(); ++i )
		{
		uint64 block;
		if ( ! UNSERIALIZE(&block) )
			return false;

		bits[i] = static_cast<block_type>(block);
		}

	// Read into a separately named temporary. A local called 'num_bits'
	// would shadow the member, and the member would never be restored.
	uint64 n;
	if ( ! UNSERIALIZE(&n) )
		return false;

	num_bits = static_cast<size_type>(n);

	return true;
	}
|
370
src/probabilistic/BitVector.h
Normal file
370
src/probabilistic/BitVector.h
Normal file
|
@ -0,0 +1,370 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#ifndef PROBABILISTIC_BITVECTOR_H
|
||||
#define PROBABILISTIC_BITVECTOR_H
|
||||
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
#include "SerialObj.h"
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
/**
|
||||
* A vector of bits.
|
||||
*/
|
||||
class BitVector : public SerialObj {
|
||||
public:
|
||||
typedef size_t block_type;
|
||||
typedef size_t size_type;
|
||||
typedef bool const_reference;
|
||||
|
||||
static size_type npos;
|
||||
static block_type bits_per_block;
|
||||
|
||||
/**
|
||||
* An lvalue proxy for individual bits.
|
||||
*/
|
||||
class Reference {
|
||||
public:
|
||||
/**
|
||||
* Inverts the bit's value.
|
||||
*/
|
||||
Reference& Flip();
|
||||
|
||||
operator bool() const;
|
||||
bool operator~() const;
|
||||
Reference& operator=(bool x);
|
||||
Reference& operator=(const Reference& other);
|
||||
Reference& operator|=(bool x);
|
||||
Reference& operator&=(bool x);
|
||||
Reference& operator^=(bool x);
|
||||
Reference& operator-=(bool x);
|
||||
|
||||
private:
|
||||
friend class BitVector;
|
||||
|
||||
Reference(block_type& block, block_type i);
|
||||
void operator&();
|
||||
|
||||
block_type& block;
|
||||
const block_type mask;
|
||||
};
|
||||
|
||||
/**
|
||||
* Default-constructs an empty bit vector.
|
||||
*/
|
||||
BitVector();
|
||||
|
||||
/**
|
||||
* Constructs a bit vector of a given size.
|
||||
* @param size The number of bits.
|
||||
* @param value The value for each bit.
|
||||
*/
|
||||
explicit BitVector(size_type size, bool value = false);
|
||||
|
||||
/**
|
||||
* Constructs a bit vector from a sequence of blocks.
|
||||
*
|
||||
* @param first Start of range
|
||||
* @param last End of range.
|
||||
*
|
||||
*/
|
||||
template <typename InputIterator>
|
||||
BitVector(InputIterator first, InputIterator last)
|
||||
{
|
||||
bits.insert(bits.end(), first, last);
|
||||
num_bits = bits.size() * bits_per_block;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy-constructs a bit vector.
|
||||
* @param other The bit vector to copy.
|
||||
*/
|
||||
BitVector(const BitVector& other);
|
||||
|
||||
/**
|
||||
* Assigns another bit vector to this instance.
|
||||
* @param other The RHS of the assignment.
|
||||
*/
|
||||
BitVector& operator=(const BitVector& other);
|
||||
|
||||
//
|
||||
// Bitwise operations.
|
||||
//
|
||||
BitVector operator~() const;
|
||||
BitVector operator<<(size_type n) const;
|
||||
BitVector operator>>(size_type n) const;
|
||||
BitVector& operator<<=(size_type n);
|
||||
BitVector& operator>>=(size_type n);
|
||||
BitVector& operator&=(BitVector const& other);
|
||||
BitVector& operator|=(BitVector const& other);
|
||||
BitVector& operator^=(BitVector const& other);
|
||||
BitVector& operator-=(BitVector const& other);
|
||||
friend BitVector operator&(BitVector const& x, BitVector const& y);
|
||||
friend BitVector operator|(BitVector const& x, BitVector const& y);
|
||||
friend BitVector operator^(BitVector const& x, BitVector const& y);
|
||||
friend BitVector operator-(BitVector const& x, BitVector const& y);
|
||||
|
||||
//
|
||||
// Relational operators
|
||||
//
|
||||
friend bool operator==(BitVector const& x, BitVector const& y);
|
||||
friend bool operator!=(BitVector const& x, BitVector const& y);
|
||||
friend bool operator<(BitVector const& x, BitVector const& y);
|
||||
|
||||
//
|
||||
// Basic operations
|
||||
//
|
||||
|
||||
/** Appends the bits in a sequence of values.
|
||||
* @tparam Iterator A forward iterator.
|
||||
* @param first An iterator pointing to the first element of the sequence.
|
||||
* @param last An iterator pointing to one past the last element of the
|
||||
* sequence.
|
||||
*/
|
||||
template <typename ForwardIterator>
|
||||
void Append(ForwardIterator first, ForwardIterator last)
|
||||
{
|
||||
if ( first == last )
|
||||
return;
|
||||
|
||||
block_type excess = extra_bits();
|
||||
typename std::iterator_traits<ForwardIterator>::difference_type delta =
|
||||
std::distance(first, last);
|
||||
|
||||
bits.reserve(Blocks() + delta);
|
||||
|
||||
if ( excess == 0 )
|
||||
{
|
||||
bits.back() |= (*first << excess);
|
||||
|
||||
do {
|
||||
block_type b = *first++ >> (bits_per_block - excess);
|
||||
bits.push_back(b | (first == last ? 0 : *first << excess));
|
||||
} while (first != last);
|
||||
|
||||
}
|
||||
|
||||
else
|
||||
bits.insert(bits.end(), first, last);
|
||||
|
||||
num_bits += bits_per_block * delta;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends the bits in a given block.
|
||||
* @param block The block containing bits to append.
|
||||
*/
|
||||
void Append(block_type block);
|
||||
|
||||
/** Appends a single bit to the end of the bit vector.
|
||||
* @param bit The value of the bit.
|
||||
*/
|
||||
void PushBack(bool bit);
|
||||
|
||||
/**
|
||||
* Clears all bits in the bitvector.
|
||||
*/
|
||||
void Clear();
|
||||
|
||||
/**
|
||||
* Resizes the bit vector to a new number of bits.
|
||||
* @param n The new number of bits of the bit vector.
|
||||
* @param value The bit value of new values, if the vector expands.
|
||||
*/
|
||||
void Resize(size_type n, bool value = false);
|
||||
|
||||
/**
|
||||
* Sets a bit at a specific position to a given value.
|
||||
* @param i The bit position.
|
||||
* @param bit The value assigned to position *i*.
|
||||
* @return A reference to the bit vector instance.
|
||||
*/
|
||||
BitVector& Set(size_type i, bool bit = true);
|
||||
|
||||
/**
|
||||
* Sets all bits to 1.
|
||||
* @return A reference to the bit vector instance.
|
||||
*/
|
||||
BitVector& Set();
|
||||
|
||||
/**
|
||||
* Resets a bit at a specific position, i.e., sets it to 0.
|
||||
* @param i The bit position.
|
||||
* @return A reference to the bit vector instance.
|
||||
*/
|
||||
BitVector& Reset(size_type i);
|
||||
|
||||
/**
|
||||
* Sets all bits to 0.
|
||||
* @return A reference to the bit vector instance.
|
||||
*/
|
||||
BitVector& Reset();
|
||||
|
||||
/**
|
||||
* Toggles/flips a bit at a specific position.
|
||||
* @param i The bit position.
|
||||
* @return A reference to the bit vector instance.
|
||||
*/
|
||||
BitVector& Flip(size_type i);
|
||||
|
||||
/**
|
||||
* Computes the complement.
|
||||
* @return A reference to the bit vector instance.
|
||||
*/
|
||||
BitVector& Flip();
|
||||
|
||||
/** Retrieves a single bit.
|
||||
* @param i The bit position.
|
||||
* @return A mutable reference to the bit at position *i*.
|
||||
*/
|
||||
Reference operator[](size_type i);
|
||||
|
||||
/**
|
||||
* Retrieves a single bit.
|
||||
* @param i The bit position.
|
||||
* @return A const-reference to the bit at position *i*.
|
||||
*/
|
||||
const_reference operator[](size_type i) const;
|
||||
|
||||
/**
|
||||
* Counts the number of 1-bits in the bit vector. Also known as *population
|
||||
* count* or *Hamming weight*.
|
||||
* @return The number of bits set to 1.
|
||||
*/
|
||||
size_type Count() const;
|
||||
|
||||
/**
|
||||
* Retrieves the number of blocks of the underlying storage.
|
||||
* @param The number of blocks that represent `Size()` bits.
|
||||
*/
|
||||
size_type Blocks() const;
|
||||
|
||||
/**
|
||||
* Retrieves the number of bits the bitvector consist of.
|
||||
* @return The length of the bit vector in bits.
|
||||
*/
|
||||
size_type Size() const;
|
||||
|
||||
/**
|
||||
* Checks whether the bit vector is empty.
|
||||
* @return `true` iff the bitvector has zero length.
|
||||
*/
|
||||
bool Empty() const;
|
||||
|
||||
/**
|
||||
* Checks whether all bits are 0.
|
||||
* @return `true` iff all bits in all blocks are 0.
|
||||
*/
|
||||
bool AllZero() const;
|
||||
|
||||
/**
|
||||
* Finds the bit position of of the first 1-bit.
|
||||
* @return The position of the first bit that equals to one or `npos` if no
|
||||
* such bit exists.
|
||||
*/
|
||||
size_type FindFirst() const;
|
||||
|
||||
/**
|
||||
* Finds the next 1-bit from a given starting position.
|
||||
*
|
||||
* @param i The index where to start looking.
|
||||
*
|
||||
* @return The position of the first bit that equals to 1 after position
|
||||
* *i* or `npos` if no such bit exists.
|
||||
*/
|
||||
size_type FindNext(size_type i) const;
|
||||
|
||||
/**
|
||||
* Serializes the bit vector.
|
||||
*
|
||||
* @param info The serializaton informationt to use.
|
||||
*
|
||||
* @return True if successful.
|
||||
*/
|
||||
bool Serialize(SerialInfo* info) const;
|
||||
|
||||
/**
|
||||
* Unserialize the bit vector.
|
||||
*
|
||||
* @param info The serializaton informationt to use.
|
||||
*
|
||||
* @return The unserialized bit vector, or null if an error occured.
|
||||
*/
|
||||
static BitVector* Unserialize(UnserialInfo* info);
|
||||
|
||||
protected:
|
||||
DECLARE_SERIAL(BitVector);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Computes the number of excess/unused bits in the bit vector.
|
||||
*/
|
||||
block_type extra_bits() const;
|
||||
|
||||
/**
|
||||
* If the number of bits in the vector are not not a multiple of
|
||||
* bitvector::bits_per_block, then the last block exhibits unused bits which
|
||||
* this function resets.
|
||||
*/
|
||||
void zero_unused_bits();
|
||||
|
||||
/**
|
||||
* Looks for the first 1-bit starting at a given position.
|
||||
* @param i The block index to start looking.
|
||||
* @return The block index of the first 1-bit starting from *i* or
|
||||
* `bitvector::npos` if no 1-bit exists.
|
||||
*/
|
||||
size_type find_from(size_type i) const;
|
||||
|
||||
/**
|
||||
* Computes the block index for a given bit position.
|
||||
*/
|
||||
static size_type block_index(size_type i)
|
||||
{
|
||||
return i / bits_per_block;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the bit index within a given block for a given bit position.
|
||||
*/
|
||||
static block_type bit_index(size_type i)
|
||||
{
|
||||
return i % bits_per_block;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the bitmask block to extract a bit a given bit position.
|
||||
*/
|
||||
static block_type bit_mask(size_type i)
|
||||
{
|
||||
return block_type(1) << bit_index(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the number of blocks needed to represent a given number of
|
||||
* bits.
|
||||
* @param bits the number of bits.
|
||||
* @return The number of blocks to represent *bits* number of bits.
|
||||
*/
|
||||
static size_type bits_to_blocks(size_type bits)
|
||||
{
|
||||
return bits / bits_per_block
|
||||
+ static_cast<size_type>(bits % bits_per_block != 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the bit position first 1-bit in a given block.
|
||||
* @param block The block to inspect.
|
||||
* @return The bit position where *block* has its first bit set to 1.
|
||||
*/
|
||||
static size_type lowest_bit(block_type block);
|
||||
|
||||
std::vector<block_type> bits;
|
||||
size_type num_bits;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
257
src/probabilistic/BloomFilter.cc
Normal file
257
src/probabilistic/BloomFilter.cc
Normal file
|
@ -0,0 +1,257 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include <typeinfo>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
#include "BloomFilter.h"
|
||||
|
||||
#include "CounterVector.h"
|
||||
#include "Serializer.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
// Default constructor: the filter starts out without a hasher.
BloomFilter::BloomFilter()
	: hasher(0)
	{
	}
|
||||
|
||||
// Constructs a filter around the given hasher; the destructor deletes it.
BloomFilter::BloomFilter(const Hasher* arg_hasher)
	: hasher(arg_hasher)
	{
	}
|
||||
|
||||
// Releases the hasher held by this filter (deleting null is a no-op).
BloomFilter::~BloomFilter()
	{
	delete hasher;
	}
|
||||
|
||||
// Serializes the filter by delegating to the generic SerialObj entry point.
bool BloomFilter::Serialize(SerialInfo* info) const
	{
	const bool ok = SerialObj::Serialize(info);
	return ok;
	}
|
||||
|
||||
// Unserializes an object tagged SER_BLOOMFILTER and hands it back as a
// BloomFilter pointer (null when SerialObj::Unserialize yields null).
BloomFilter* BloomFilter::Unserialize(UnserialInfo* info)
	{
	SerialObj* obj = SerialObj::Unserialize(info, SER_BLOOMFILTER);
	return reinterpret_cast<BloomFilter*>(obj);
	}
|
||||
|
||||
// Writes the state shared by all Bloom filters: the hasher's K() as a
// 16-bit value, followed by the hasher's name.
bool BloomFilter::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_BLOOMFILTER, SerialObj);

	// Short-circuits: the name is only written if K() was written.
	return SERIALIZE(static_cast<uint16>(hasher->K())) &&
	       SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size());
	}
|
||||
|
||||
// Reads back what DoSerialize() wrote (k, then the hasher name) and
// recreates the hasher through Hasher::Create().
bool BloomFilter::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(SerialObj);

	uint16 k;
	const char* name;

	if ( ! UNSERIALIZE(&k) )
		return false;

	if ( ! UNSERIALIZE_STR(&name, 0) )
		return false;

	hasher = Hasher::Create(k, name);

	// The string buffer is released here with delete [].
	delete [] name;

	return true;
	}
|
||||
|
||||
// Number of cells for a target false-positive rate and capacity:
// ceil(-(capacity * ln(fp)) / ln(2)^2).
size_t BasicBloomFilter::M(double fp, size_t capacity)
	{
	const double ln2 = std::log(2);
	const double cells = -(capacity * std::log(fp) / ln2 / ln2);

	return std::ceil(cells);
	}
|
||||
|
||||
// Number of hash functions for a given cell count and capacity:
// ceil((cells / capacity) * ln(2)).
size_t BasicBloomFilter::K(size_t cells, size_t capacity)
	{
	const double cells_per_element =
		static_cast<double>(cells) / static_cast<double>(capacity);

	return std::ceil(cells_per_element * std::log(2));
	}
|
||||
|
||||
bool BasicBloomFilter::Empty() const
|
||||
{
|
||||
return bits->AllZero();
|
||||
}
|
||||
|
||||
void BasicBloomFilter::Clear()
|
||||
{
|
||||
bits->Clear();
|
||||
}
|
||||
|
||||
bool BasicBloomFilter::Merge(const BloomFilter* other)
|
||||
{
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
|
||||
const BasicBloomFilter* o = static_cast<const BasicBloomFilter*>(other);
|
||||
|
||||
if ( ! hasher->Equals(o->hasher) )
|
||||
{
|
||||
reporter->Error("incompatible hashers in BasicBloomFilter merge");
|
||||
return false;
|
||||
}
|
||||
|
||||
else if ( bits->Size() != o->bits->Size() )
|
||||
{
|
||||
reporter->Error("different bitvector size in BasicBloomFilter merge");
|
||||
return false;
|
||||
}
|
||||
|
||||
(*bits) |= *o->bits;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a newly allocated copy with its own hasher clone and its own
// copy of the bit vector.
BasicBloomFilter* BasicBloomFilter::Clone() const
	{
	BasicBloomFilter* dup = new BasicBloomFilter();

	dup->hasher = hasher->Clone();
	dup->bits = new BitVector(*bits);

	return dup;
	}
|
||||
|
||||
// Default constructor: no bit vector yet; Clone() and DoUnserialize()
// install one afterwards.
BasicBloomFilter::BasicBloomFilter()
	: bits(0)
	{
	}
|
||||
|
||||
// Constructs a filter with the given hasher and a bit vector of `cells`
// bits.
BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells)
	: BloomFilter(hasher), bits(new BitVector(cells))
	{
	}
|
||||
|
||||
IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER)
|
||||
|
||||
// Serializes the BloomFilter base state, then the bit vector.
bool BasicBloomFilter::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter);

	const bool ok = bits->Serialize(info);
	return ok;
	}
|
||||
|
||||
// Restores the BloomFilter base state, then the bit vector; fails if the
// bit vector cannot be unserialized.
bool BasicBloomFilter::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(BloomFilter);

	bits = BitVector::Unserialize(info);

	if ( ! bits )
		return false;

	return true;
	}
|
||||
|
||||
// Sets, for every digest, the bit at position (digest mod number of bits).
void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h)
	{
	for ( size_t idx = 0; idx != h.size(); ++idx )
		{
		BitVector& bv = *bits;
		bv.Set(h[idx] % bv.Size());
		}
	}
|
||||
|
||||
// Returns 1 if the bit selected by every digest is set, 0 as soon as one
// of them is clear.
size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const
	{
	size_t idx = 0;

	while ( idx < h.size() )
		{
		const bool is_set = (*bits)[h[idx] % bits->Size()];

		if ( ! is_set )
			return 0;

		++idx;
		}

	return 1;
	}
|
||||
|
||||
// Default constructor: no counter vector yet; Clone() and DoUnserialize()
// install one afterwards.
CountingBloomFilter::CountingBloomFilter()
	: cells(0)
	{
	}
|
||||
|
||||
// Constructs a filter with the given hasher and `arg_cells` counters of
// `width` bits each.
CountingBloomFilter::CountingBloomFilter(const Hasher* hasher,
                                         size_t arg_cells, size_t width)
	: BloomFilter(hasher), cells(new CounterVector(width, arg_cells))
	{
	}
|
||||
|
||||
bool CountingBloomFilter::Empty() const
|
||||
{
|
||||
return cells->AllZero();
|
||||
}
|
||||
|
||||
void CountingBloomFilter::Clear()
|
||||
{
|
||||
cells->Clear();
|
||||
}
|
||||
|
||||
bool CountingBloomFilter::Merge(const BloomFilter* other)
|
||||
{
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
|
||||
const CountingBloomFilter* o = static_cast<const CountingBloomFilter*>(other);
|
||||
|
||||
if ( ! hasher->Equals(o->hasher) )
|
||||
{
|
||||
reporter->Error("incompatible hashers in CountingBloomFilter merge");
|
||||
return false;
|
||||
}
|
||||
|
||||
else if ( cells->Size() != o->cells->Size() )
|
||||
{
|
||||
reporter->Error("different bitvector size in CountingBloomFilter merge");
|
||||
return false;
|
||||
}
|
||||
|
||||
(*cells) |= *o->cells;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a newly allocated copy with its own hasher clone and its own
// copy of the counter vector.
CountingBloomFilter* CountingBloomFilter::Clone() const
	{
	CountingBloomFilter* dup = new CountingBloomFilter();

	dup->hasher = hasher->Clone();
	dup->cells = new CounterVector(*cells);

	return dup;
	}
|
||||
|
||||
IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER)
|
||||
|
||||
// Serializes the BloomFilter base state, then the counter vector.
bool CountingBloomFilter::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter);

	const bool ok = cells->Serialize(info);
	return ok;
	}
|
||||
|
||||
// Restores the BloomFilter base state, then the counter vector; fails if
// the counter vector cannot be unserialized.
bool CountingBloomFilter::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(BloomFilter);

	cells = CounterVector::Unserialize(info);

	if ( ! cells )
		return false;

	return true;
	}
|
||||
|
||||
// TODO: Use partitioning in add/count to allow for reusing CMS bounds.
|
||||
void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h)
|
||||
{
|
||||
for ( size_t i = 0; i < h.size(); ++i )
|
||||
cells->Increment(h[i] % cells->Size());
|
||||
}
|
||||
|
||||
// Returns the smallest counter among the cells selected by the digests.
// With no digests at all, the result is the maximum of
// CounterVector::size_type.
size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const
	{
	CounterVector::size_type smallest =
		std::numeric_limits<CounterVector::size_type>::max();

	for ( size_t idx = 0; idx != h.size(); ++idx )
		{
		const CounterVector::size_type current =
			cells->Count(h[idx] % cells->Size());

		smallest = current < smallest ? current : smallest;
		}

	return smallest;
	}
|
241
src/probabilistic/BloomFilter.h
Normal file
241
src/probabilistic/BloomFilter.h
Normal file
|
@ -0,0 +1,241 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#ifndef PROBABILISTIC_BLOOMFILTER_H
|
||||
#define PROBABILISTIC_BLOOMFILTER_H
|
||||
|
||||
#include <vector>
|
||||
#include "BitVector.h"
|
||||
#include "Hasher.h"
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
class CounterVector;
|
||||
|
||||
/**
|
||||
* The abstract base class for Bloom filters.
|
||||
*
|
||||
* At this point we won't let the user choose the hasher, but we might open
|
||||
* up the interface in the future.
|
||||
*/
|
||||
class BloomFilter : public SerialObj {
|
||||
public:
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~BloomFilter();
|
||||
|
||||
/**
|
||||
* Adds an element of type T to the Bloom filter.
|
||||
* @param x The element to add
|
||||
*/
|
||||
template <typename T>
|
||||
void Add(const T& x)
|
||||
{
|
||||
AddImpl((*hasher)(x));
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the associated count of a given value.
|
||||
*
|
||||
* @param x The value of type `T` to check.
|
||||
*
|
||||
* @return The counter associated with *x*.
|
||||
*/
|
||||
template <typename T>
|
||||
size_t Count(const T& x) const
|
||||
{
|
||||
return CountImpl((*hasher)(x));
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the Bloom filter is empty.
|
||||
*
|
||||
* @return `true` if the Bloom filter contains no elements.
|
||||
*/
|
||||
virtual bool Empty() const = 0;
|
||||
|
||||
/**
|
||||
* Removes all elements, i.e., resets all bits in the underlying bit vector.
|
||||
*/
|
||||
virtual void Clear() = 0;
|
||||
|
||||
/**
|
||||
* Merges another Bloom filter into a copy of this one.
|
||||
*
|
||||
* @param other The other Bloom filter.
|
||||
*
|
||||
* @return `true` on success.
|
||||
*/
|
||||
virtual bool Merge(const BloomFilter* other) = 0;
|
||||
|
||||
/**
|
||||
* Constructs a copy of this Bloom filter.
|
||||
*
|
||||
* @return A copy of `*this`.
|
||||
*/
|
||||
virtual BloomFilter* Clone() const = 0;
|
||||
|
||||
/**
|
||||
* Serializes the Bloom filter.
|
||||
*
|
||||
* @param info The serializaton information to use.
|
||||
*
|
||||
* @return True if successful.
|
||||
*/
|
||||
bool Serialize(SerialInfo* info) const;
|
||||
|
||||
/**
|
||||
* Unserializes a Bloom filter.
|
||||
*
|
||||
* @param info The serializaton information to use.
|
||||
*
|
||||
* @return The unserialized Bloom filter, or null if an error
|
||||
* occured.
|
||||
*/
|
||||
static BloomFilter* Unserialize(UnserialInfo* info);
|
||||
|
||||
protected:
|
||||
DECLARE_ABSTRACT_SERIAL(BloomFilter);
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
*/
|
||||
BloomFilter();
|
||||
|
||||
/**
|
||||
* Constructs a Bloom filter.
|
||||
*
|
||||
* @param hasher The hasher to use for this Bloom filter.
|
||||
*/
|
||||
BloomFilter(const Hasher* hasher);
|
||||
|
||||
/**
|
||||
* Abstract method for implementinng the *Add* operation.
|
||||
*
|
||||
* @param hashes A set of *k* hashes for the item to add, computed by
|
||||
* the internal hasher object.
|
||||
*
|
||||
*/
|
||||
virtual void AddImpl(const Hasher::digest_vector& hashes) = 0;
|
||||
|
||||
/**
|
||||
* Abstract method for implementing the *Count* operation.
|
||||
*
|
||||
* @param hashes A set of *k* hashes for the item to add, computed by
|
||||
* the internal hasher object.
|
||||
*
|
||||
* @return Returns the counter associated with the hashed element.
|
||||
*/
|
||||
virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0;
|
||||
|
||||
const Hasher* hasher;
|
||||
};
|
||||
|
||||
/**
|
||||
* A basic Bloom filter.
|
||||
*/
|
||||
class BasicBloomFilter : public BloomFilter {
|
||||
public:
|
||||
/**
|
||||
* Constructs a basic Bloom filter with a given number of cells. The
|
||||
* ideal number of cells can be computed with *M*.
|
||||
*
|
||||
* @param hasher The hasher to use. The ideal number of hash
|
||||
* functions can be computed with *K*.
|
||||
*
|
||||
* @param cells The number of cells.
|
||||
*/
|
||||
BasicBloomFilter(const Hasher* hasher, size_t cells);
|
||||
|
||||
/**
|
||||
* Computes the number of cells based on a given false positive rate
|
||||
* and capacity. In the literature, this parameter often has the name
|
||||
* *M*.
|
||||
*
|
||||
* @param fp The false positive rate.
|
||||
*
|
||||
* @param capacity The expected number of elements that will be
|
||||
* stored.
|
||||
*
|
||||
* Returns: The number cells needed to support a false positive rate
|
||||
* of *fp* with at most *capacity* elements.
|
||||
*/
|
||||
static size_t M(double fp, size_t capacity);
|
||||
|
||||
/**
|
||||
* Computes the optimal number of hash functions based on the number cells
|
||||
* and expected number of elements.
|
||||
*
|
||||
* @param cells The number of cells (*m*).
|
||||
*
|
||||
* @param capacity The maximum number of elements.
|
||||
*
|
||||
* Returns: the optimal number of hash functions for a false-positive
|
||||
* rate of *fp* for at most *capacity* elements.
|
||||
*/
|
||||
static size_t K(size_t cells, size_t capacity);
|
||||
|
||||
// Overridden from BloomFilter.
|
||||
virtual bool Empty() const;
|
||||
virtual void Clear();
|
||||
virtual bool Merge(const BloomFilter* other);
|
||||
virtual BasicBloomFilter* Clone() const;
|
||||
|
||||
protected:
|
||||
DECLARE_SERIAL(BasicBloomFilter);
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
*/
|
||||
BasicBloomFilter();
|
||||
|
||||
// Overridden from BloomFilter.
|
||||
virtual void AddImpl(const Hasher::digest_vector& h);
|
||||
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
|
||||
|
||||
private:
|
||||
BitVector* bits;
|
||||
};
|
||||
|
||||
/**
|
||||
* A counting Bloom filter.
|
||||
*/
|
||||
class CountingBloomFilter : public BloomFilter {
|
||||
public:
|
||||
/**
|
||||
* Constructs a counting Bloom filter.
|
||||
*
|
||||
* @param hasher The hasher to use. The ideal number of hash
|
||||
* functions can be computed with *K*.
|
||||
*
|
||||
* @param cells The number of cells to use.
|
||||
*
|
||||
* @param width The maximal bit-width of counter values.
|
||||
*/
|
||||
CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width);
|
||||
|
||||
// Overridden from BloomFilter.
|
||||
virtual bool Empty() const;
|
||||
virtual void Clear();
|
||||
virtual bool Merge(const BloomFilter* other);
|
||||
virtual CountingBloomFilter* Clone() const;
|
||||
|
||||
protected:
|
||||
DECLARE_SERIAL(CountingBloomFilter);
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
*/
|
||||
CountingBloomFilter();
|
||||
|
||||
// Overridden from BloomFilter.
|
||||
virtual void AddImpl(const Hasher::digest_vector& h);
|
||||
virtual size_t CountImpl(const Hasher::digest_vector& h) const;
|
||||
|
||||
private:
|
||||
CounterVector* cells;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
19
src/probabilistic/CMakeLists.txt
Normal file
19
src/probabilistic/CMakeLists.txt
Normal file
|
@ -0,0 +1,19 @@
|
|||
|
||||
# Build rules for the sources in this directory.
include(BroSubdir)

# Put this directory's source and binary dirs at the front of the include
# path.
include_directories(BEFORE
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}
)

# Hand-written sources that go into the library.
set(probabilistic_SRCS
    BitVector.cc
    BloomFilter.cc
    CounterVector.cc
    Hasher.cc)

# Project macro; presumably generates code from the BiF file and records it
# in BIF_OUTPUT_CC, which is used below - confirm against the macro.
bif_target(bloom-filter.bif)

bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC})

# Build the library only after the generate_outputs target.
add_dependencies(bro_probabilistic generate_outputs)
|
193
src/probabilistic/CounterVector.cc
Normal file
193
src/probabilistic/CounterVector.cc
Normal file
|
@ -0,0 +1,193 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include "CounterVector.h"
|
||||
|
||||
#include <limits>
|
||||
#include "BitVector.h"
|
||||
#include "Serializer.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
// Constructs `cells` counters of `arg_width` bits each, backed by a single
// bit vector of arg_width * cells bits.
CounterVector::CounterVector(size_t arg_width, size_t cells)
	: bits(new BitVector(arg_width * cells)), width(arg_width)
	{
	}
|
||||
|
||||
// Copy constructor: duplicates the bit vector and takes over the width.
CounterVector::CounterVector(const CounterVector& other)
	: bits(new BitVector(*other.bits)), width(other.width)
	{
	}
|
||||
|
||||
// Releases the bit vector that backs the counters.
CounterVector::~CounterVector()
	{
	delete bits;
	}
|
||||
|
||||
// Adds `value` to the counter in `cell` with a bitwise ripple-carry adder
// over the cell's `width` bits. If the addition carries out of the cell,
// the counter is pinned to its maximum (all bits set).
//
// Returns true if the addition did not overflow, false if the counter was
// saturated.
bool CounterVector::Increment(size_type cell, count_type value)
	{
	assert(cell < Size());
	assert(value != 0);

	const size_t value_bits = std::numeric_limits<count_type>::digits;

	size_t lsb = cell * width;
	bool carry = false;

	for ( size_t i = 0; i < width; ++i )
		{
		bool b1 = (*bits)[lsb + i];

		// Shift a count_type-wide one: a plain int literal would be
		// shifted out of range for i >= 31. Positions beyond the width
		// of count_type contribute zero bits.
		bool b2 = i < value_bits && (value & (count_type(1) << i)) != 0;

		(*bits)[lsb + i] = b1 ^ b2 ^ carry;
		carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) );
		}

	if ( carry )
		{
		// Overflow: saturate instead of wrapping around.
		for ( size_t i = 0; i < width; ++i )
			bits->Set(lsb + i);
		}

	return ! carry;
	}
|
||||
|
||||
// Subtracts `value` from the counter in `cell` by adding its two's
// complement with a bitwise ripple-carry adder over the cell's `width`
// bits.
//
// Returns the final carry of that addition.
bool CounterVector::Decrement(size_type cell, count_type value)
	{
	assert(cell < Size());
	assert(value != 0);

	const size_t value_bits = std::numeric_limits<count_type>::digits;

	value = ~value + 1; // A - B := A + ~B + 1
	bool carry = false;
	size_t lsb = cell * width;

	for ( size_t i = 0; i < width; ++i )
		{
		bool b1 = (*bits)[lsb + i];

		// Shift a count_type-wide one: a plain int literal would be
		// shifted out of range for i >= 31. Beyond the width of
		// count_type, the two's complement of a non-zero value
		// continues with one bits.
		bool b2 = i < value_bits
			? (value & (count_type(1) << i)) != 0
			: true;

		(*bits)[lsb + i] = b1 ^ b2 ^ carry;
		carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) );
		}

	return carry;
	}
|
||||
|
||||
bool CounterVector::AllZero() const
|
||||
{
|
||||
return bits->AllZero();
|
||||
}
|
||||
|
||||
void CounterVector::Clear()
|
||||
{
|
||||
bits->Clear();
|
||||
}
|
||||
|
||||
// Reads the counter stored in `cell`, assembling it from the cell's bits
// starting with the least significant one.
CounterVector::count_type CounterVector::Count(size_type cell) const
	{
	assert(cell < Size());

	// Accumulate in the return type: size_t may be narrower than
	// count_type and would truncate wide counters.
	count_type cnt = 0, order = 1;
	size_t lsb = cell * width;

	for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 )
		if ( (*bits)[i] )
			cnt |= order;

	return cnt;
	}
|
||||
|
||||
// Number of cells: total number of bits divided by the bits per cell.
CounterVector::size_type CounterVector::Size() const
	{
	const size_type total_bits = bits->Size();
	return total_bits / width;
	}
|
||||
|
||||
// Number of bits per counter.
size_t CounterVector::Width() const
	{
	return this->width;
	}
|
||||
|
||||
size_t CounterVector::Max() const
|
||||
{
|
||||
return std::numeric_limits<size_t>::max()
|
||||
>> (std::numeric_limits<size_t>::digits - width);
|
||||
}
|
||||
|
||||
// Adds the counters of `other` to the counters of this vector, cell by
// cell, with a bitwise ripple-carry adder. A cell whose sum carries out of
// its `width` bits is pinned to its maximum (all bits set).
//
// Both vectors must have the same size and width.
CounterVector& CounterVector::Merge(const CounterVector& other)
	{
	assert(Size() == other.Size());
	assert(Width() == other.Width());

	for ( size_t c = 0; c < Size(); ++c )
		{
		const size_t base = c * width;
		bool overflow = false;

		for ( size_t k = 0; k < width; ++k )
			{
			const bool mine = (*bits)[base + k];
			const bool theirs = (*other.bits)[base + k];

			(*bits)[base + k] = mine ^ theirs ^ overflow;
			overflow = ( mine && theirs ) || ( overflow && ( mine != theirs ) );
			}

		if ( ! overflow )
			continue;

		// Saturate the cell instead of letting it wrap around.
		for ( size_t k = 0; k < width; ++k )
			bits->Set(base + k);
		}

	return *this;
	}
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
CounterVector& CounterVector::operator|=(const CounterVector& other)
|
||||
{
|
||||
return Merge(other);
|
||||
}
|
||||
|
||||
CounterVector operator|(const CounterVector& x, const CounterVector& y)
|
||||
{
|
||||
CounterVector cv(x);
|
||||
return cv |= y;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Serializes the counter vector by delegating to the generic SerialObj
// entry point.
bool CounterVector::Serialize(SerialInfo* info) const
	{
	const bool ok = SerialObj::Serialize(info);
	return ok;
	}
|
||||
|
||||
// Unserializes an object tagged SER_COUNTERVECTOR and hands it back as a
// CounterVector pointer (null when SerialObj::Unserialize yields null).
CounterVector* CounterVector::Unserialize(UnserialInfo* info)
	{
	SerialObj* obj = SerialObj::Unserialize(info, SER_COUNTERVECTOR);
	return reinterpret_cast<CounterVector*>(obj);
	}
|
||||
|
||||
IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR)
|
||||
|
||||
// Writes the backing bit vector, followed by the counter width as a
// 64-bit value.
bool CounterVector::DoSerialize(SerialInfo* info) const
	{
	DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj);

	// Short-circuits: the width is only written if the bits were written.
	return bits->Serialize(info) &&
	       SERIALIZE(static_cast<uint64>(width));
	}
|
||||
|
||||
// Restores the counter vector from a serialization stream: first the
// backing bit vector, then the counter width.
//
// Returns true on success, false as soon as any read fails.
bool CounterVector::DoUnserialize(UnserialInfo* info)
	{
	DO_UNSERIALIZE(SerialObj);

	bits = BitVector::Unserialize(info);
	if ( ! bits )
		return false;

	uint64 width;
	if ( ! UNSERIALIZE(&width) )
		return false;

	// The local above shadows the member of the same name, so the member
	// has to be addressed explicitly; a plain "width = ..." would only
	// assign the local to itself and leave the member unset.
	this->width = static_cast<size_t>(width);

	return true;
	}
|
165
src/probabilistic/CounterVector.h
Normal file
165
src/probabilistic/CounterVector.h
Normal file
|
@ -0,0 +1,165 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#ifndef PROBABILISTIC_COUNTERVECTOR_H
|
||||
#define PROBABILISTIC_COUNTERVECTOR_H
|
||||
|
||||
#include "SerialObj.h"
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
class BitVector;
|
||||
|
||||
/**
|
||||
* A vector of counters, each of which has a fixed number of bits.
|
||||
*/
|
||||
class CounterVector : public SerialObj {
|
||||
public:
|
||||
typedef size_t size_type;
|
||||
typedef uint64 count_type;
|
||||
|
||||
/**
|
||||
* Constructs a counter vector having cells of a given width.
|
||||
*
|
||||
* @param width The number of bits that each cell occupies.
|
||||
*
|
||||
* @param cells The number of cells in the bitvector.
|
||||
*
|
||||
* @pre `cells > 0 && width > 0`
|
||||
*/
|
||||
CounterVector(size_t width, size_t cells = 1024);
|
||||
|
||||
/**
|
||||
* Copy-constructs a counter vector.
|
||||
*
|
||||
* @param other The counter vector to copy.
|
||||
*/
|
||||
CounterVector(const CounterVector& other);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~CounterVector();
|
||||
|
||||
/**
|
||||
* Increments a given cell.
|
||||
*
|
||||
* @param cell The cell to increment.
|
||||
*
|
||||
* @param value The value to add to the current counter in *cell*.
|
||||
*
|
||||
* @return `true` if adding *value* to the counter in *cell* succeeded.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
bool Increment(size_type cell, count_type value = 1);
|
||||
|
||||
/**
|
||||
* Decrements a given cell.
|
||||
*
|
||||
* @param cell The cell to decrement.
|
||||
*
|
||||
* @param value The value to subtract from the current counter in *cell*.
|
||||
*
|
||||
* @return `true` if subtracting *value* from the counter in *cell* succeeded.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
bool Decrement(size_type cell, count_type value = 1);
|
||||
|
||||
/**
|
||||
* Retrieves the counter of a given cell.
|
||||
*
|
||||
* @param cell The cell index to retrieve the count for.
|
||||
*
|
||||
* @return The counter associated with *cell*.
|
||||
*
|
||||
* @pre `cell < Size()`
|
||||
*/
|
||||
count_type Count(size_type cell) const;
|
||||
|
||||
/**
|
||||
* Checks whether all counters are 0.
|
||||
* @return `true` iff all counters have the value 0.
|
||||
*/
|
||||
bool AllZero() const;
|
||||
|
||||
/**
|
||||
* Sets all counters to 0.
|
||||
*/
|
||||
void Clear();
|
||||
|
||||
/**
|
||||
* Retrieves the number of cells in the storage.
|
||||
*
|
||||
* @return The number of cells.
|
||||
*/
|
||||
size_type Size() const;
|
||||
|
||||
/**
|
||||
* Retrieves the counter width.
|
||||
*
|
||||
* @return The number of bits per counter.
|
||||
*/
|
||||
size_t Width() const;
|
||||
|
||||
/**
|
||||
* Computes the maximum counter value.
|
||||
*
|
||||
* @return The maximum counter value based on the width.
|
||||
*/
|
||||
size_t Max() const;
|
||||
|
||||
/**
|
||||
* Merges another counter vector into this instance by *adding* the
|
||||
* counters of each cell.
|
||||
*
|
||||
* @param other The counter vector to merge into this instance.
|
||||
*
|
||||
* @return A reference to `*this`.
|
||||
*
|
||||
* @pre `Size() == other.Size() && Width() == other.Width()`
|
||||
*/
|
||||
CounterVector& Merge(const CounterVector& other);
|
||||
|
||||
/**
|
||||
* An alias for ::Merge.
|
||||
*/
|
||||
CounterVector& operator|=(const CounterVector& other);
|
||||
|
||||
/**
|
||||
* Serializes the counter vector.
|
||||
*
|
||||
* @param info The serialization information to use.
|
||||
*
|
||||
* @return True if successful.
|
||||
*/
|
||||
bool Serialize(SerialInfo* info) const;
|
||||
|
||||
/**
|
||||
* Unserialize the counter vector.
|
||||
*
|
||||
* @param info The serialization information to use.
|
||||
*
|
||||
* @return The unserialized counter vector, or null if an error
|
||||
* occurred.
|
||||
*/
|
||||
static CounterVector* Unserialize(UnserialInfo* info);
|
||||
|
||||
protected:
|
||||
friend CounterVector operator|(const CounterVector& x,
|
||||
const CounterVector& y);
|
||||
|
||||
CounterVector() { }
|
||||
|
||||
DECLARE_SERIAL(CounterVector);
|
||||
|
||||
private:
|
||||
CounterVector& operator=(const CounterVector&); // Disable.
|
||||
|
||||
BitVector* bits;
|
||||
size_t width;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
114
src/probabilistic/Hasher.cc
Normal file
114
src/probabilistic/Hasher.cc
Normal file
|
@ -0,0 +1,114 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#include <cstring>
|
||||
#include <typeinfo>
|
||||
|
||||
#include "Hasher.h"
|
||||
#include "digest.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
// Constructs the hash function. The value handed to the underlying H3
// instance is not *seed* itself but the result of compute_seed(), which
// mixes *seed* with either *extra* or Bro's initial seed.
UHF::UHF(size_t seed, const std::string& extra)
|
||||
: h(compute_seed(seed, extra))
|
||||
{
|
||||
}
|
||||
|
||||
// Hashes the *n* bytes starting at *x* with the underlying H3 function.
// Empty input always yields the digest 0. Inputs longer than
// UHASH_KEY_SIZE bytes are not supported and trip the assertion.
Hasher::digest UHF::hash(const void* x, size_t n) const
	{
	assert(n <= UHASH_KEY_SIZE);

	if ( n == 0 )
		return 0;

	return h(x, n);
	}
|
||||
|
||||
size_t UHF::compute_seed(size_t seed, const std::string& extra)
|
||||
{
|
||||
u_char buf[SHA256_DIGEST_LENGTH];
|
||||
SHA256_CTX ctx;
|
||||
sha256_init(&ctx);
|
||||
|
||||
if ( extra.empty() )
|
||||
{
|
||||
unsigned int first_seed = initial_seed();
|
||||
sha256_update(&ctx, &first_seed, sizeof(first_seed));
|
||||
}
|
||||
|
||||
else
|
||||
sha256_update(&ctx, extra.c_str(), extra.size());
|
||||
|
||||
sha256_update(&ctx, &seed, sizeof(seed));
|
||||
sha256_final(&ctx, buf);
|
||||
|
||||
// Take the first sizeof(size_t) bytes as seed.
|
||||
return *reinterpret_cast<size_t*>(buf);
|
||||
}
|
||||
|
||||
// Factory for the hasher used by the implementation. It currently always
// returns a newly allocated DefaultHasher with *k* hash functions.
// NOTE(review): the result is a raw pointer from `new`; presumably the
// caller (the Bloom filter) takes ownership -- confirm at the call sites.
Hasher* Hasher::Create(size_t k, const std::string& name)
|
||||
{
|
||||
return new DefaultHasher(k, name);
|
||||
}
|
||||
|
||||
// Records the number of hash functions and the hasher's name. Both members
// are set directly in the initializer list, in declaration order.
Hasher::Hasher(size_t k, const std::string& arg_name)
	: k(k), name(arg_name)
	{
	}
|
||||
|
||||
// Creates *k* independent hash functions. The i-th function is seeded with
// its own index i, combined with *name* (see UHF's constructor).
DefaultHasher::DefaultHasher(size_t k, const std::string& name)
	: Hasher(k, name)
	{
	size_t seed = 0;

	while ( seed != k )
		{
		UHF function(seed, name);
		hash_functions.push_back(function);
		++seed;
		}
	}
|
||||
|
||||
// Applies each of the K() hash functions to the *n* bytes at *x* and
// returns the digests in function order.
Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const
	{
	const size_t count = K();

	digest_vector digests;
	digests.reserve(count);

	for ( size_t i = 0; i != count; ++i )
		digests.push_back(hash_functions[i](x, n));

	return digests;
	}
|
||||
|
||||
// Returns a newly allocated copy of this hasher, created with the copy
// constructor. The caller receives a raw pointer from `new`.
DefaultHasher* DefaultHasher::Clone() const
|
||||
{
|
||||
return new DefaultHasher(*this);
|
||||
}
|
||||
|
||||
bool DefaultHasher::Equals(const Hasher* other) const
|
||||
{
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
|
||||
const DefaultHasher* o = static_cast<const DefaultHasher*>(other);
|
||||
return hash_functions == o->hash_functions;
|
||||
}
|
||||
|
||||
// Sets up the two underlying hash functions with the fixed seeds 1 and 2
// (each combined with *name*). *k* does not affect them; it only
// determines how many digests Hash() derives from the pair.
DoubleHasher::DoubleHasher(size_t k, const std::string& name)
|
||||
: Hasher(k, name), h1(1, name), h2(2, name)
|
||||
{
|
||||
}
|
||||
|
||||
// Derives K() digests from just two hash evaluations: the i-th digest is
// h1(x) + i * h2(x).
Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const
	{
	const digest base = h1(x, n);
	const digest step = h2(x, n);
	const size_t count = K();

	digest_vector digests;
	digests.reserve(count);

	for ( size_t i = 0; i != count; ++i )
		digests.push_back(base + i * step);

	return digests;
	}
|
||||
|
||||
// Returns a newly allocated copy of this hasher, created with the copy
// constructor. The caller receives a raw pointer from `new`.
DoubleHasher* DoubleHasher::Clone() const
|
||||
{
|
||||
return new DoubleHasher(*this);
|
||||
}
|
||||
|
||||
bool DoubleHasher::Equals(const Hasher* other) const
|
||||
{
|
||||
if ( typeid(*this) != typeid(*other) )
|
||||
return false;
|
||||
|
||||
const DoubleHasher* o = static_cast<const DoubleHasher*>(other);
|
||||
return h1 == o->h1 && h2 == o->h2;
|
||||
}
|
216
src/probabilistic/Hasher.h
Normal file
216
src/probabilistic/Hasher.h
Normal file
|
@ -0,0 +1,216 @@
|
|||
// See the file "COPYING" in the main distribution directory for copyright.
|
||||
|
||||
#ifndef PROBABILISTIC_HASHER_H
|
||||
#define PROBABILISTIC_HASHER_H
|
||||
|
||||
#include "Hash.h"
|
||||
#include "H3.h"
|
||||
|
||||
namespace probabilistic {
|
||||
|
||||
/**
|
||||
* Abstract base class for hashers. A hasher creates a family of hash
|
||||
* functions to hash an element *k* times.
|
||||
*/
|
||||
class Hasher {
|
||||
public:
|
||||
typedef hash_t digest;
|
||||
typedef std::vector<digest> digest_vector;
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~Hasher() { }
|
||||
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The element to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*/
|
||||
template <typename T>
|
||||
digest_vector operator()(const T& x) const
|
||||
{
|
||||
return Hash(&x, sizeof(T));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
* @param x Pointer to first byte to hash.
|
||||
*
|
||||
* @param n Number of bytes to hash.
|
||||
*
|
||||
* @return Vector of *k* hash values.
|
||||
*
|
||||
*/
|
||||
virtual digest_vector Hash(const void* x, size_t n) const = 0;
|
||||
|
||||
/**
|
||||
* Returns a deep copy of the hasher.
|
||||
*/
|
||||
virtual Hasher* Clone() const = 0;
|
||||
|
||||
/**
|
||||
* Returns true if two hashers are identical.
|
||||
*/
|
||||
virtual bool Equals(const Hasher* other) const = 0;
|
||||
|
||||
/**
|
||||
* Returns the number *k* of hash functions the hasher applies.
|
||||
*/
|
||||
size_t K() const { return k; }
|
||||
|
||||
/**
|
||||
* Returns the hasher's name as passed to the constructor. Hashers with the same name should provide consistent results.
|
||||
*/
|
||||
const std::string& Name() const { return name; }
|
||||
|
||||
/**
|
||||
* Constructs the hasher used by the implementation. This hardcodes a
|
||||
* specific hashing policy. It exists only because the HashingPolicy
|
||||
* class hierarchy is not yet serializable.
|
||||
*
|
||||
* @param k The number of hash functions to apply.
|
||||
*
|
||||
* @param name The hasher's name. Hashers with the same name should
|
||||
* provide consistent results.
|
||||
*
|
||||
* @return Returns a new hasher instance.
|
||||
*/
|
||||
static Hasher* Create(size_t k, const std::string& name);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param k the number of hash functions.
|
||||
*
|
||||
* @param name A name for the hasher. Hashers with the same name
|
||||
* should provide consistent results.
|
||||
*/
|
||||
Hasher(size_t k, const std::string& name);
|
||||
|
||||
private:
|
||||
const size_t k;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
/**
|
||||
* A universal hash function family. This is a helper class that Hasher
|
||||
* implementations can use in their implementation.
|
||||
*/
|
||||
class UHF {
|
||||
public:
|
||||
/**
|
||||
* Constructs an H3 hash function seeded with a given seed and an
|
||||
* optional extra seed to replace the initial Bro seed.
|
||||
*
|
||||
* @param seed The seed to use for this instance.
|
||||
*
|
||||
* @param extra If not empty, this string replaces the initial Bro
|
||||
* seed when computing the seed for this instance; it is combined
|
||||
* with *seed* to derive the value that seeds the H3 function.
|
||||
*/
|
||||
UHF(size_t seed, const std::string& extra = "");
|
||||
|
||||
template <typename T>
|
||||
Hasher::digest operator()(const T& x) const
|
||||
{
|
||||
return hash(&x, sizeof(T));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes hash values for an element.
|
||||
*
|
||||
* @param x The element to hash.
|
||||
*
|
||||
* @return The digest of the element.
|
||||
*/
|
||||
Hasher::digest operator()(const void* x, size_t n) const
|
||||
{
|
||||
return hash(x, n);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the hashes for a set of bytes.
|
||||
*
|
||||
* @param x Pointer to first byte to hash.
|
||||
*
|
||||
* @param n Number of bytes to hash.
|
||||
*
|
||||
* @return The digest of the *n* bytes starting at *x*.
|
||||
*
|
||||
*/
|
||||
Hasher::digest hash(const void* x, size_t n) const;
|
||||
|
||||
friend bool operator==(const UHF& x, const UHF& y)
|
||||
{
|
||||
return x.h == y.h;
|
||||
}
|
||||
|
||||
friend bool operator!=(const UHF& x, const UHF& y)
|
||||
{
|
||||
return ! (x == y);
|
||||
}
|
||||
|
||||
private:
|
||||
static size_t compute_seed(size_t seed, const std::string& extra);
|
||||
|
||||
H3<Hasher::digest, UHASH_KEY_SIZE> h;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* A hasher implementing the default hashing policy. Uses *k* separate hash
|
||||
* functions internally.
|
||||
*/
|
||||
class DefaultHasher : public Hasher {
|
||||
public:
|
||||
/**
|
||||
* Constructor for a hasher with *k* hash functions.
|
||||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param name The name of the hasher.
|
||||
*/
|
||||
DefaultHasher(size_t k, const std::string& name);
|
||||
|
||||
// Overridden from Hasher.
|
||||
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
||||
virtual DefaultHasher* Clone() const /* final */;
|
||||
virtual bool Equals(const Hasher* other) const /* final */;
|
||||
|
||||
private:
|
||||
std::vector<UHF> hash_functions;
|
||||
};
|
||||
|
||||
/**
|
||||
* The *double-hashing* policy. Uses a linear combination of two hash
|
||||
* functions.
|
||||
*/
|
||||
class DoubleHasher : public Hasher {
|
||||
public:
|
||||
/**
|
||||
* Constructor for a double hasher with *k* hash functions.
|
||||
*
|
||||
* @param k The number of hash functions to use.
|
||||
*
|
||||
* @param name The name of the hasher.
|
||||
*/
|
||||
DoubleHasher(size_t k, const std::string& name);
|
||||
|
||||
// Overridden from Hasher.
|
||||
virtual digest_vector Hash(const void* x, size_t n) const /* final */;
|
||||
virtual DoubleHasher* Clone() const /* final */;
|
||||
virtual bool Equals(const Hasher* other) const /* final */;
|
||||
|
||||
private:
|
||||
UHF h1;
|
||||
UHF h2;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
196
src/probabilistic/bloom-filter.bif
Normal file
196
src/probabilistic/bloom-filter.bif
Normal file
|
@ -0,0 +1,196 @@
|
|||
# ===========================================================================
|
||||
#
|
||||
# Bloom Filter Functions
|
||||
#
|
||||
# ===========================================================================
|
||||
|
||||
%%{
|
||||
|
||||
// TODO: This is currently included from the top-level src directory, hence
|
||||
// paths are relative to there. We need a better mechanisms to pull in
|
||||
// BiFs defined in sub directories.
|
||||
#include "probabilistic/BloomFilter.h"
|
||||
#include "OpaqueVal.h"
|
||||
|
||||
using namespace probabilistic;
|
||||
|
||||
%%}
|
||||
|
||||
module GLOBAL;
|
||||
|
||||
## Creates a basic Bloom filter.
##
## .. note:: A Bloom filter can have a name associated with it. In the future,
##    Bloom filters with the same name will be compatible across independent Bro
##    instances, i.e., it will be possible to merge them. Currently, however, that is
##    not yet supported.
##
## fp: The desired false-positive rate. Must lie strictly between 0 and 1;
##     other values (including NaN) are rejected with an error.
##
## capacity: The maximum number of elements that guarantees a false-positive
##           rate of *fp*.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
##       the filter will remain tied to the current Bro process.
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup
##    bloomfilter_clear bloomfilter_merge
function bloomfilter_basic_init%(fp: double, capacity: count,
				name: string &default=""%): opaque of bloomfilter
	%{
	// The boundary values do not describe a realizable filter: no finite
	// filter achieves a false-positive rate of 0, and a rate of 1 needs no
	// storage at all. The test is written as a negated conjunction so that
	// NaN is rejected as well.
	if ( ! (fp > 0.0 && fp < 1.0) )
		{
		reporter->Error("false-positive rate must take value between 0 and 1");
		return 0;
		}

	// Size the filter for the requested rate and capacity, then derive the
	// number of hash functions for that size.
	size_t cells = BasicBloomFilter::M(fp, capacity);
	size_t optimal_k = BasicBloomFilter::K(cells, capacity);
	const Hasher* h = Hasher::Create(optimal_k, name->CheckString());

	return new BloomFilterVal(new BasicBloomFilter(h, cells));
	%}
|
||||
|
||||
## Creates a counting Bloom filter.
##
## .. note:: A Bloom filter can have a name associated with it. In the future,
##    Bloom filters with the same name will be compatible across independent Bro
##    instances, i.e., it will be possible to merge them. Currently, however, that is
##    not yet supported.
##
## k: The number of hash functions to use.
##
## cells: The number of cells of the underlying counter vector. As there's no
##        single answer to what's the best parameterization for a counting Bloom
##        filter, we refer to the Bloom filter literature here for choosing an
##        appropriate value.
##
## max: The maximum counter value associated with each element. Each cell of
##      the underlying counter vector is *w = floor(log_2(max)) + 1* bits wide,
##      i.e., just wide enough to represent *max*. Must be greater than 0.
##
## name: A name that uniquely identifies and seeds the Bloom filter. If empty,
##       the filter will remain tied to the current Bro process.
##
## Returns: A Bloom filter handle.
##
## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup
##    bloomfilter_clear bloomfilter_merge
function bloomfilter_counting_init%(k: count, cells: count, max: count,
				name: string &default=""%): opaque of bloomfilter
	%{
	if ( max == 0 )
		{
		reporter->Error("max counter value must be greater than 0");
		return 0;
		}

	const Hasher* h = Hasher::Create(k, name->CheckString());

	// Count the bits needed to represent *max* by shifting it out.
	uint16 width = 0;

	for ( ; max != 0; max >>= 1 )
		++width;

	return new BloomFilterVal(new CountingBloomFilter(h, cells, width));
	%}
|
||||
|
||||
## Adds an element to a Bloom filter. The first element added determines the
## filter's element type; adding an element of a different type afterwards
## reports an error and leaves the filter unchanged.
##
## bf: The Bloom filter handle.
##
## x: The element to add.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init bloomfilter_lookup
##    bloomfilter_clear bloomfilter_merge
function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any
	%{
	BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);

	// An untyped filter adopts the type of its first element.
	if ( ! bfv->Type() && ! bfv->Typify(x->Type()) )
		{
		reporter->Error("failed to set Bloom filter type");
		return 0;
		}

	if ( ! same_type(bfv->Type(), x->Type()) )
		{
		reporter->Error("incompatible Bloom filter types");
		return 0;
		}

	bfv->Add(x);
	return 0;
	%}
|
||||
|
||||
## Retrieves the counter for a given element in a Bloom filter.
##
## bf: The Bloom filter handle.
##
## x: The element to count.
##
## Returns: the counter associated with *x* in *bf*. The result is 0 if the
##          filter is empty, or if the lookup fails because the filter is
##          untyped or *x* has a different type than the filter's elements.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
##    bloomfilter_add bloomfilter_clear bloomfilter_merge
function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count
	%{
	const BloomFilterVal* bfv = static_cast<const BloomFilterVal*>(bf);

	// An empty filter contains nothing; this is not an error.
	if ( bfv->Empty() )
		return new Val(0, TYPE_COUNT);

	if ( ! bfv->Type() )
		{
		reporter->Error("cannot perform lookup on untyped Bloom filter");
		return new Val(0, TYPE_COUNT);
		}

	if ( ! same_type(bfv->Type(), x->Type()) )
		{
		reporter->Error("incompatible Bloom filter types");
		return new Val(0, TYPE_COUNT);
		}

	return new Val(static_cast<uint64>(bfv->Count(x)), TYPE_COUNT);
	%}
|
||||
|
||||
## Removes all elements from a Bloom filter. This function resets all bits in the
## underlying bitvector back to 0 but does not change the parameterization of the
## Bloom filter, such as the element type and the hasher seed.
##
## bf: The Bloom filter handle.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
##    bloomfilter_add bloomfilter_lookup bloomfilter_merge
function bloomfilter_clear%(bf: opaque of bloomfilter%): any
	%{
	BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf);

	// Untyped Bloom filters are already empty.
	if ( ! bfv->Type() )
		return 0;

	bfv->Clear();
	return 0;
	%}
|
||||
|
||||
## Merges two Bloom filters.
##
## .. note:: Currently Bloom filters created by different Bro instances cannot
##    be merged. In the future, this will be supported as long as both filters
##    are created with the same name.
##
## bf1: The first Bloom filter handle.
##
## bf2: The second Bloom filter handle.
##
## Returns: The union of *bf1* and *bf2*. If both filters are typed and their
##          element types differ, an error is reported instead.
##
## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init
##    bloomfilter_add bloomfilter_lookup bloomfilter_clear
function bloomfilter_merge%(bf1: opaque of bloomfilter,
				bf2: opaque of bloomfilter%): opaque of bloomfilter
	%{
	const BloomFilterVal* bfv1 = static_cast<const BloomFilterVal*>(bf1);
	const BloomFilterVal* bfv2 = static_cast<const BloomFilterVal*>(bf2);

	// A filter that has never had an element added is still untyped, i.e.,
	// Type() yields null. Only compare the element types when both filters
	// have one, so that same_type() is never handed a null type.
	// NOTE(review): assumes BloomFilterVal::Merge() copes with an untyped
	// operand -- confirm against its implementation.
	if ( bfv1->Type() && bfv2->Type() &&
	     ! same_type(bfv1->Type(), bfv2->Type()) )
		{
		reporter->Error("incompatible Bloom filter types");
		return 0;
		}

	return BloomFilterVal::Merge(bfv1, bfv2);
	%}
|
42
src/util.cc
42
src/util.cc
|
@ -716,6 +716,8 @@ static bool write_random_seeds(const char* write_file, uint32 seed,
|
|||
|
||||
static bool bro_rand_determistic = false;
|
||||
static unsigned int bro_rand_state = 0;
|
||||
static bool first_seed_saved = false;
|
||||
static unsigned int first_seed = 0;
|
||||
|
||||
static void bro_srandom(unsigned int seed, bool deterministic)
|
||||
{
|
||||
|
@ -800,6 +802,12 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
|
|||
|
||||
bro_srandom(seed, seeds_done);
|
||||
|
||||
if ( ! first_seed_saved )
|
||||
{
|
||||
first_seed = seed;
|
||||
first_seed_saved = true;
|
||||
}
|
||||
|
||||
if ( ! hmac_key_set )
|
||||
{
|
||||
MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key);
|
||||
|
@ -811,27 +819,39 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file
|
|||
write_file);
|
||||
}
|
||||
|
||||
// Returns the seed recorded by the first call to init_random_seed(). Later
// calls to init_random_seed() do not overwrite the recorded value.
unsigned int initial_seed()
|
||||
{
|
||||
return first_seed;
|
||||
}
|
||||
|
||||
// Returns the bro_rand_determistic flag, the same flag that makes
// bro_random() use bro_prng() instead of the system random().
// NOTE(review): the flag is presumably set by bro_srandom() when the user
// supplied a seed -- confirm there.
bool have_random_seed()
|
||||
{
|
||||
return bro_rand_determistic;
|
||||
}
|
||||
|
||||
// Advances a simple linear congruential generator by one step and returns
// the new state: state * 16807 mod (2^31 - 1). We use our own generator to
// stay predictable across platforms. The product is evaluated in the
// decomposed form a * (state % q) - r * (state / q), which keeps every
// intermediate value within 32 bits.
long int bro_prng(long int state)
	{
	static const long int modulus = 2147483647;
	static const long int multiplier = 16807;
	const long int quot = modulus / multiplier;
	const long int rem = modulus % multiplier;

	const long int high = state / quot;
	const long int low = state % quot;
	const long int next = multiplier * low - rem * high;

	// The decomposed form can come out non-positive; fold it back into
	// the range 1..modulus.
	return next > 0 ? next : next + modulus;
	}
|
||||
|
||||
long int bro_random()
|
||||
{
|
||||
if ( ! bro_rand_determistic )
|
||||
return random(); // Use system PRNG.
|
||||
|
||||
// Use our own simple linear congruence PRNG to make sure we are
|
||||
// predictable across platforms.
|
||||
const long int m = 2147483647;
|
||||
const long int a = 16807;
|
||||
const long int q = m / a;
|
||||
const long int r = m % a;
|
||||
|
||||
bro_rand_state = a * ( bro_rand_state % q ) - r * ( bro_rand_state / q );
|
||||
|
||||
if ( bro_rand_state <= 0 )
|
||||
bro_rand_state += m;
|
||||
bro_rand_state = bro_prng(bro_rand_state);
|
||||
|
||||
return bro_rand_state;
|
||||
}
|
||||
|
|
12
src/util.h
12
src/util.h
|
@ -165,12 +165,20 @@ extern void hmac_md5(size_t size, const unsigned char* bytes,
|
|||
extern void init_random_seed(uint32 seed, const char* load_file,
|
||||
const char* write_file);
|
||||
|
||||
// Retrieves the initial seed computed after the very first call to
|
||||
// init_random_seed(). Repeated calls to init_random_seed() will not affect
|
||||
// the return value of this function.
|
||||
unsigned int initial_seed();
|
||||
|
||||
// Returns true if the user explicitly set a seed via init_random_seed();
|
||||
extern bool have_random_seed();
|
||||
|
||||
// A simple linear congruence PRNG. It takes its state as argument and
|
||||
// returns a new random value, which can serve as state for subsequent calls.
|
||||
long int bro_prng(long int state);
|
||||
|
||||
// Replacement for the system random(), to which it normally falls back
|
||||
// except when a seed has been given. In that case, it uses the function
|
||||
// bro_prng() to produce predictable values.
|
||||
long int bro_random();
|
||||
|
||||
// Calls the system srandom() function with the given seed if not running
|
||||
|
|
27
testing/btest/Baseline/bifs.bloomfilter/output
Normal file
27
testing/btest/Baseline/bifs.bloomfilter/output
Normal file
|
@ -0,0 +1,27 @@
|
|||
error: incompatible Bloom filter types
|
||||
error: incompatible Bloom filter types
|
||||
error: incompatible Bloom filter types
|
||||
error: incompatible Bloom filter types
|
||||
error: false-positive rate must take value between 0 and 1
|
||||
error: false-positive rate must take value between 0 and 1
|
||||
0
|
||||
1
|
||||
1
|
||||
0
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
2
|
||||
3
|
||||
3
|
||||
2
|
||||
3
|
||||
3
|
||||
3
|
||||
2
|
83
testing/btest/bifs/bloomfilter.bro
Normal file
83
testing/btest/bifs/bloomfilter.bro
Normal file
|
@ -0,0 +1,83 @@
|
|||
# @TEST-EXEC: bro -b %INPUT >output 2>&1
|
||||
# @TEST-EXEC: btest-diff output
|
||||
|
||||
# Exercises basic (non-counting) Bloom filters: adding and looking up
# elements, type-mismatch errors, extreme and invalid parameters, and merging.
# Everything printed here is compared against the test baseline.
function test_basic_bloom_filter()
|
||||
{
|
||||
# Basic usage with counts.
|
||||
local bf_cnt = bloomfilter_basic_init(0.1, 1000);
|
||||
bloomfilter_add(bf_cnt, 42);
|
||||
bloomfilter_add(bf_cnt, 84);
|
||||
bloomfilter_add(bf_cnt, 168);
|
||||
print bloomfilter_lookup(bf_cnt, 0);
|
||||
print bloomfilter_lookup(bf_cnt, 42);
|
||||
print bloomfilter_lookup(bf_cnt, 168);
|
||||
print bloomfilter_lookup(bf_cnt, 336);
|
||||
bloomfilter_add(bf_cnt, 0.5); # Type mismatch
|
||||
bloomfilter_add(bf_cnt, "foo"); # Type mismatch
|
||||
|
||||
# Basic usage with strings.
|
||||
local bf_str = bloomfilter_basic_init(0.9, 10);
|
||||
bloomfilter_add(bf_str, "foo");
|
||||
bloomfilter_add(bf_str, "bar");
|
||||
print bloomfilter_lookup(bf_str, "foo");
|
||||
print bloomfilter_lookup(bf_str, "bar");
|
||||
print bloomfilter_lookup(bf_str, "b4z"); # FP (false positive; never added)
|
||||
print bloomfilter_lookup(bf_str, "quux"); # FP (false positive; never added)
|
||||
bloomfilter_add(bf_str, 0.5); # Type mismatch
|
||||
bloomfilter_add(bf_str, 100); # Type mismatch
|
||||
|
||||
# Edge cases: extreme but valid false-positive rates and capacities.
|
||||
local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1);
|
||||
local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000);
|
||||
local bf_edge2 = bloomfilter_basic_init(0.9999999, 1);
|
||||
local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000);
|
||||
|
||||
# Invalid parameters: both calls report a false-positive rate error.
|
||||
local bf_bug0 = bloomfilter_basic_init(-0.5, 42);
|
||||
local bf_bug1 = bloomfilter_basic_init(1.1, 42);
|
||||
|
||||
# Merging: the result must contain the elements of both filters.
|
||||
local bf_cnt2 = bloomfilter_basic_init(0.1, 1000);
|
||||
bloomfilter_add(bf_cnt2, 42);
|
||||
bloomfilter_add(bf_cnt, 100);
|
||||
local bf_merged = bloomfilter_merge(bf_cnt, bf_cnt2);
|
||||
print bloomfilter_lookup(bf_merged, 42);
|
||||
print bloomfilter_lookup(bf_merged, 84);
|
||||
print bloomfilter_lookup(bf_merged, 100);
|
||||
print bloomfilter_lookup(bf_merged, 168);
|
||||
}
|
||||
|
||||
# Exercises counting Bloom filters created with a maximum counter value of 3:
# counters increase per add, stop at the maximum, and add up when merging.
# Everything printed here is compared against the test baseline.
function test_counting_bloom_filter()
|
||||
{
|
||||
local bf = bloomfilter_counting_init(3, 32, 3);
|
||||
bloomfilter_add(bf, "foo");
|
||||
print bloomfilter_lookup(bf, "foo"); # 1
|
||||
bloomfilter_add(bf, "foo");
|
||||
print bloomfilter_lookup(bf, "foo"); # 2
|
||||
bloomfilter_add(bf, "foo");
|
||||
print bloomfilter_lookup(bf, "foo"); # 3
|
||||
bloomfilter_add(bf, "foo");
|
||||
print bloomfilter_lookup(bf, "foo"); # still 3 (counter is at its maximum)
|
||||
|
||||
|
||||
bloomfilter_add(bf, "bar");
|
||||
bloomfilter_add(bf, "bar");
|
||||
print bloomfilter_lookup(bf, "bar"); # 2
|
||||
print bloomfilter_lookup(bf, "foo"); # still 3
|
||||
|
||||
# Merging: counters of the two filters are added up, capped at the maximum.
|
||||
local bf2 = bloomfilter_counting_init(3, 32, 3);
|
||||
bloomfilter_add(bf2, "baz");
|
||||
bloomfilter_add(bf2, "baz");
|
||||
bloomfilter_add(bf2, "bar");
|
||||
local bf_merged = bloomfilter_merge(bf, bf2);
|
||||
print bloomfilter_lookup(bf_merged, "foo");
|
||||
print bloomfilter_lookup(bf_merged, "bar");
|
||||
print bloomfilter_lookup(bf_merged, "baz");
|
||||
}
|
||||
|
||||
# Runs both test functions at startup; the basic filter tests print first.
event bro_init()
|
||||
{
|
||||
test_basic_bloom_filter();
|
||||
|
||||
test_counting_bloom_filter();
|
||||
}
|
|
@ -12,6 +12,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized;
|
|||
global sha256_handle: opaque of sha256 &persistent &synchronized;
|
||||
global entropy_handle: opaque of entropy &persistent &synchronized;
|
||||
|
||||
global bloomfilter_elements: set[string] &persistent &synchronized;
|
||||
global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized;
|
||||
|
||||
event bro_done()
|
||||
{
|
||||
local out = open("output.log");
|
||||
|
@ -36,6 +39,9 @@ event bro_done()
|
|||
print out, entropy_test_finish(entropy_handle);
|
||||
else
|
||||
print out, "entropy_test_add() failed";
|
||||
|
||||
for ( e in bloomfilter_elements )
|
||||
print bloomfilter_lookup(bloomfilter_handle, e);
|
||||
}
|
||||
|
||||
@TEST-END-FILE
|
||||
|
@ -47,6 +53,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized;
|
|||
global sha256_handle: opaque of sha256 &persistent &synchronized;
|
||||
global entropy_handle: opaque of entropy &persistent &synchronized;
|
||||
|
||||
global bloomfilter_elements = { "foo", "bar", "baz" } &persistent &synchronized;
|
||||
global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized;
|
||||
|
||||
event bro_init()
|
||||
{
|
||||
local out = open("expected.log");
|
||||
|
@ -72,6 +81,10 @@ event bro_init()
|
|||
entropy_handle = entropy_test_init();
|
||||
if ( ! entropy_test_add(entropy_handle, "f") )
|
||||
print out, "entropy_test_add() failed";
|
||||
|
||||
bloomfilter_handle = bloomfilter_basic_init(0.1, 100);
|
||||
for ( e in bloomfilter_elements )
|
||||
bloomfilter_add(bloomfilter_handle, e);
|
||||
}
|
||||
|
||||
@TEST-END-FILE
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue