diff --git a/CHANGES b/CHANGES index 7cbbc74e4f..fe4d58d7ea 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,17 @@ +2.1-880 | 2013-07-24 15:49:31 -0700 + + * Support for Bloom filter. (Matthias Vallentin) + + Bro now provides the following BiFs: + + bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter + bloomfilter_add(bf: opaque of bloomfilter, x: any) + bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count + bloomfilter_merge(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter): opaque of bloomfilter + bloomfilter_clear(bf: opaque of bloomfilter) + 2.1-824 | 2013-07-22 14:25:14 -0400 * Fixed a scriptland state issue that manifested especially badly on proxies. (Seth Hall) diff --git a/NEWS b/NEWS index 1fce6b1d9d..c3eabf5554 100644 --- a/NEWS +++ b/NEWS @@ -108,6 +108,19 @@ New Functionality shunting, and sampling; plus plugin support to customize filters dynamically. +- Bro now provides Bloom filters of two kinds: basic Bloom filters + supporting membership tests, and counting Bloom filters that track + the frequency of elements. The corresponding functions are: + + bloomfilter_basic_init(fp: double, capacity: count, name: string &default=""): opaque of bloomfilter + bloomfilter_counting_init(k: count, cells: count, max: count, name: string &default=""): opaque of bloomfilter + bloomfilter_add(bf: opaque of bloomfilter, x: any) + bloomfilter_lookup(bf: opaque of bloomfilter, x: any): count + bloomfilter_merge(bf1: opaque of bloomfilter, bf2: opaque of bloomfilter): opaque of bloomfilter + bloomfilter_clear(bf: opaque of bloomfilter) + + See for full documentation. 
+ Changed Functionality ~~~~~~~~~~~~~~~~~~~~~ diff --git a/VERSION b/VERSION index d35eaf1454..c89a17797a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1-824 +2.1-880 diff --git a/scripts/base/init-bare.bro b/scripts/base/init-bare.bro index 60ed0d2fd1..c368b9d610 100644 --- a/scripts/base/init-bare.bro +++ b/scripts/base/init-bare.bro @@ -705,6 +705,7 @@ type entropy_test_result: record { @load base/bif/strings.bif @load base/bif/bro.bif @load base/bif/reporter.bif +@load base/bif/bloom-filter.bif ## Deprecated. This is superseded by the new logging framework. global log_file_name: function(tag: string): string &redef; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e353dd4695..0c979df19f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,6 +150,7 @@ set(bro_PLUGIN_LIBS CACHE INTERNAL "plugin libraries" FORCE) add_subdirectory(analyzer) add_subdirectory(file_analysis) +add_subdirectory(probabilistic) set(bro_SUBDIRS ${bro_SUBDIR_LIBS} diff --git a/src/Func.cc b/src/Func.cc index f3718fe231..483699668f 100644 --- a/src/Func.cc +++ b/src/Func.cc @@ -560,6 +560,9 @@ void builtin_error(const char* msg, BroObj* arg) #include "reporter.bif.func_def" #include "strings.bif.func_def" +// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. +#include "probabilistic/bloom-filter.bif.h" + void init_builtin_funcs() { bro_resources = internal_type("bro_resources")->AsRecordType(); @@ -574,6 +577,9 @@ void init_builtin_funcs() #include "reporter.bif.func_init" #include "strings.bif.func_init" +// TODO: Add a nicer mechanism to pull in subdirectory bifs automatically. +#include "probabilistic/bloom-filter.bif.init.cc" + did_builtin_init = true; } diff --git a/src/H3.h b/src/H3.h index 72d81d519f..8ea5848816 100644 --- a/src/H3.h +++ b/src/H3.h @@ -49,69 +49,83 @@ // hash a substring of the data. Hashes of substrings can be bitwise-XOR'ed // together to get the same result as hashing the full string. 
// Any number of hash functions can be created by creating new instances of H3, -// with the same or different template parameters. The hash function is -// randomly generated using bro_random(); you must call init_random_seed() -// before the H3 constructor if you wish to seed it. +// with the same or different template parameters. The hash function +// constructor takes a seed as argument which defaults to a call to +// bro_random(). #ifndef H3_H #define H3_H #include +#include // The number of values representable by a byte. #define H3_BYTE_RANGE (UCHAR_MAX+1) -template class H3 { - T byte_lookup[N][H3_BYTE_RANGE]; +template +class H3 { public: - H3(); - T operator()(const void* data, size_t size, size_t offset = 0) const - { - const unsigned char *p = static_cast(data); - T result = 0; + H3(T seed = bro_random()) + { + T bit_lookup[N * CHAR_BIT]; - // loop optmized with Duff's Device - register unsigned n = (size + 7) / 8; - switch (size % 8) { - case 0: do { result ^= byte_lookup[offset++][*p++]; - case 7: result ^= byte_lookup[offset++][*p++]; - case 6: result ^= byte_lookup[offset++][*p++]; - case 5: result ^= byte_lookup[offset++][*p++]; - case 4: result ^= byte_lookup[offset++][*p++]; - case 3: result ^= byte_lookup[offset++][*p++]; - case 2: result ^= byte_lookup[offset++][*p++]; - case 1: result ^= byte_lookup[offset++][*p++]; - } while (--n > 0); - } + for ( size_t bit = 0; bit < N * CHAR_BIT; bit++ ) + { + bit_lookup[bit] = 0; + for ( size_t i = 0; i < sizeof(T)/2; i++ ) + { + // Draw a fresh PRNG value for every 16-bit chunk; reusing one value + // would repeat the same 16 bits across the whole word. + seed = bro_prng(seed); + // assume random() returns at least 16 random bits + bit_lookup[bit] = (bit_lookup[bit] << 16) | (seed & 0xFFFF); + } + } - return result; - } + for ( size_t byte = 0; byte < N; byte++ ) + { + for ( unsigned val = 0; val < H3_BYTE_RANGE; val++ ) + { + byte_lookup[byte][val] = 0; + for ( size_t bit = 0; bit < CHAR_BIT; bit++ ) + // Does this mean byte_lookup[*][0] == 0?
-RP + if (val & (1 << bit)) + byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; + } + } + } + + T operator()(const void* data, size_t size, size_t offset = 0) const + { + const unsigned char *p = static_cast(data); + T result = 0; + + // loop optimized with Duff's Device + register unsigned n = (size + 7) / 8; + switch ( size % 8 ) { + case 0: do { result ^= byte_lookup[offset++][*p++]; + case 7: result ^= byte_lookup[offset++][*p++]; + case 6: result ^= byte_lookup[offset++][*p++]; + case 5: result ^= byte_lookup[offset++][*p++]; + case 4: result ^= byte_lookup[offset++][*p++]; + case 3: result ^= byte_lookup[offset++][*p++]; + case 2: result ^= byte_lookup[offset++][*p++]; + case 1: result ^= byte_lookup[offset++][*p++]; + } while ( --n > 0 ); + } + + return result; + } + + friend bool operator==(const H3& x, const H3& y) + { + // memcmp() counts bytes, not elements: compare the entire lookup table. + return ! std::memcmp(x.byte_lookup, y.byte_lookup, sizeof(x.byte_lookup)); + } + + friend bool operator!=(const H3& x, const H3& y) + { + return ! (x == y); + } + +private: + T byte_lookup[N][H3_BYTE_RANGE]; }; -template -H3::H3() -{ - T bit_lookup[N * CHAR_BIT]; - - for (size_t bit = 0; bit < N * CHAR_BIT; bit++) { - bit_lookup[bit] = 0; - for (size_t i = 0; i < sizeof(T)/2; i++) { - // assume random() returns at least 16 random bits - bit_lookup[bit] = (bit_lookup[bit] << 16) | (bro_random() & 0xFFFF); - } - } - - for (size_t byte = 0; byte < N; byte++) { - for (unsigned val = 0; val < H3_BYTE_RANGE; val++) { - byte_lookup[byte][val] = 0; - for (size_t bit = 0; bit < CHAR_BIT; bit++) { - // Does this mean byte_lookup[*][0] == 0?
-RP - if (val & (1 << bit)) - byte_lookup[byte][val] ^= bit_lookup[byte*CHAR_BIT+bit]; - } - } - } -} - #endif //H3_H diff --git a/src/NetVar.cc b/src/NetVar.cc index 2f50ce528b..388aa46f10 100644 --- a/src/NetVar.cc +++ b/src/NetVar.cc @@ -242,6 +242,7 @@ OpaqueType* md5_type; OpaqueType* sha1_type; OpaqueType* sha256_type; OpaqueType* entropy_type; +OpaqueType* bloomfilter_type; #include "const.bif.netvar_def" #include "types.bif.netvar_def" @@ -307,6 +308,7 @@ void init_general_global_var() sha1_type = new OpaqueType("sha1"); sha256_type = new OpaqueType("sha256"); entropy_type = new OpaqueType("entropy"); + bloomfilter_type = new OpaqueType("bloomfilter"); } void init_net_var() diff --git a/src/NetVar.h b/src/NetVar.h index ac825e7845..7ce33d1a1a 100644 --- a/src/NetVar.h +++ b/src/NetVar.h @@ -247,6 +247,7 @@ extern OpaqueType* md5_type; extern OpaqueType* sha1_type; extern OpaqueType* sha256_type; extern OpaqueType* entropy_type; +extern OpaqueType* bloomfilter_type; // Initializes globals that don't pertain to network/event analysis. extern void init_general_global_var(); diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 19346e52f2..b70cfee086 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1,3 +1,5 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ #include "OpaqueVal.h" #include "NetVar.h" #include "Reporter.h" @@ -515,3 +517,154 @@ bool EntropyVal::DoUnserialize(UnserialInfo* info) return true; } + +BloomFilterVal::BloomFilterVal() + : OpaqueVal(bloomfilter_type) + { + type = 0; + hash = 0; + bloom_filter = 0; + } + +BloomFilterVal::BloomFilterVal(OpaqueType* t) + : OpaqueVal(t) + { + type = 0; + hash = 0; + bloom_filter = 0; + } + +BloomFilterVal::BloomFilterVal(probabilistic::BloomFilter* bf) + : OpaqueVal(bloomfilter_type) + { + type = 0; + hash = 0; + bloom_filter = bf; + } + +bool BloomFilterVal::Typify(BroType* arg_type) + { + if ( type ) + return false; + + type = arg_type; + type->Ref(); + + TypeList* tl = new TypeList(type); + tl->Append(type); + hash = new CompositeHash(tl); + Unref(tl); + + return true; + } + +BroType* BloomFilterVal::Type() const + { + return type; + } + +void BloomFilterVal::Add(const Val* val) + { + HashKey* key = hash->ComputeHash(val, 1); + bloom_filter->Add(key->Hash()); + delete key; + } + +size_t BloomFilterVal::Count(const Val* val) const + { + HashKey* key = hash->ComputeHash(val, 1); + size_t cnt = bloom_filter->Count(key->Hash()); + delete key; + return cnt; + } + +void BloomFilterVal::Clear() + { + bloom_filter->Clear(); + } + +bool BloomFilterVal::Empty() const + { + return bloom_filter->Empty(); + } + +BloomFilterVal* BloomFilterVal::Merge(const BloomFilterVal* x, + const BloomFilterVal* y) + { + if ( ! same_type(x->Type(), y->Type()) ) + { + reporter->Error("cannot merge Bloom filters with different types"); + return 0; + } + + if ( typeid(*x->bloom_filter) != typeid(*y->bloom_filter) ) + { + reporter->Error("cannot merge different Bloom filter types"); + return 0; + } + + probabilistic::BloomFilter* copy = x->bloom_filter->Clone(); + + if ( ! copy->Merge(y->bloom_filter) ) + { + // The clone has not been handed to a BloomFilterVal yet, so free it here. + delete copy; + reporter->Error("failed to merge Bloom filter"); + return 0; + } + + BloomFilterVal* merged = new BloomFilterVal(copy); + + if ( !
merged->Typify(x->Type()) ) + { + reporter->Error("failed to set type on merged Bloom filter"); + return 0; + } + + return merged; + } + +BloomFilterVal::~BloomFilterVal() + { + Unref(type); + delete hash; + delete bloom_filter; + } + +IMPLEMENT_SERIAL(BloomFilterVal, SER_BLOOMFILTER_VAL); + +bool BloomFilterVal::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER_VAL, OpaqueVal); + + bool is_typed = (type != 0); + + if ( ! SERIALIZE(is_typed) ) + return false; + + if ( is_typed && ! type->Serialize(info) ) + return false; + + return bloom_filter->Serialize(info); + } + +bool BloomFilterVal::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(OpaqueVal); + + bool is_typed; + if ( ! UNSERIALIZE(&is_typed) ) + return false; + + if ( is_typed ) + { + BroType* type = BroType::Unserialize(info); + if ( ! Typify(type) ) + return false; + + Unref(type); + } + + bloom_filter = probabilistic::BloomFilter::Unserialize(info); + return bloom_filter != 0; + } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 78fa5da5e9..52c9583fc7 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -3,10 +3,18 @@ #ifndef OPAQUEVAL_H #define OPAQUEVAL_H +#include + #include "RandTest.h" #include "Val.h" #include "digest.h" +#include "probabilistic/BloomFilter.h" + +namespace probabilistic { + class BloomFilter; +} + class HashVal : public OpaqueVal { public: virtual bool IsValid() const; @@ -107,4 +115,37 @@ private: RandTest state; }; +class BloomFilterVal : public OpaqueVal { +public: + explicit BloomFilterVal(probabilistic::BloomFilter* bf); + virtual ~BloomFilterVal(); + + BroType* Type() const; + bool Typify(BroType* type); + + void Add(const Val* val); + size_t Count(const Val* val) const; + void Clear(); + bool Empty() const; + + static BloomFilterVal* Merge(const BloomFilterVal* x, + const BloomFilterVal* y); + +protected: + friend class Val; + BloomFilterVal(); + BloomFilterVal(OpaqueType* t); + + DECLARE_SERIAL(BloomFilterVal); + +private: + // Disable. 
+ BloomFilterVal(const BloomFilterVal&); + BloomFilterVal& operator=(const BloomFilterVal&); + + BroType* type; + CompositeHash* hash; + probabilistic::BloomFilter* bloom_filter; + }; + #endif diff --git a/src/SerialTypes.h b/src/SerialTypes.h index 723badab1e..85aed10bda 100644 --- a/src/SerialTypes.h +++ b/src/SerialTypes.h @@ -49,6 +49,9 @@ SERIAL_IS(STATE_ACCESS, 0x1100) SERIAL_IS_BO(CASE, 0x1200) SERIAL_IS(LOCATION, 0x1300) SERIAL_IS(RE_MATCHER, 0x1400) +SERIAL_IS(BITVECTOR, 0x1500) +SERIAL_IS(COUNTERVECTOR, 0x1600) +SERIAL_IS(BLOOMFILTER, 0x1700) // These are the externally visible types. const SerialType SER_NONE = 0; @@ -104,6 +107,7 @@ SERIAL_VAL(MD5_VAL, 16) SERIAL_VAL(SHA1_VAL, 17) SERIAL_VAL(SHA256_VAL, 18) SERIAL_VAL(ENTROPY_VAL, 19) +SERIAL_VAL(BLOOMFILTER_VAL, 20) #define SERIAL_EXPR(name, val) SERIAL_CONST(name, val, EXPR) SERIAL_EXPR(EXPR, 1) @@ -197,10 +201,17 @@ SERIAL_FUNC(BRO_FUNC, 2) SERIAL_FUNC(DEBUG_FUNC, 3) SERIAL_FUNC(BUILTIN_FUNC, 4) +#define SERIAL_BLOOMFILTER(name, val) SERIAL_CONST(name, val, BLOOMFILTER) +SERIAL_BLOOMFILTER(BLOOMFILTER, 1) +SERIAL_BLOOMFILTER(BASICBLOOMFILTER, 2) +SERIAL_BLOOMFILTER(COUNTINGBLOOMFILTER, 3) + SERIAL_CONST2(ID) SERIAL_CONST2(STATE_ACCESS) SERIAL_CONST2(CASE) SERIAL_CONST2(LOCATION) SERIAL_CONST2(RE_MATCHER) +SERIAL_CONST2(BITVECTOR) +SERIAL_CONST2(COUNTERVECTOR) #endif diff --git a/src/Type.cc b/src/Type.cc index 917c6f46b3..563bc5afbd 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -1311,19 +1311,20 @@ IMPLEMENT_SERIAL(OpaqueType, SER_OPAQUE_TYPE); bool OpaqueType::DoSerialize(SerialInfo* info) const { DO_SERIALIZE(SER_OPAQUE_TYPE, BroType); - return SERIALIZE(name); + return SERIALIZE_STR(name.c_str(), name.size()); } bool OpaqueType::DoUnserialize(UnserialInfo* info) { DO_UNSERIALIZE(BroType); - char const* n; + const char* n; if ( ! 
UNSERIALIZE_STR(&n, 0) ) return false; name = n; delete [] n; + return true; } diff --git a/src/bro.bif b/src/bro.bif index efb913bbf7..68a8d5114c 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -4975,4 +4975,3 @@ function anonymize_addr%(a: addr, cl: IPAddrAnonymizationClass%): addr (enum ip_addr_anonymization_class_t) anon_class)); } %} - diff --git a/src/probabilistic/BitVector.cc b/src/probabilistic/BitVector.cc new file mode 100644 index 0000000000..c0285eced3 --- /dev/null +++ b/src/probabilistic/BitVector.cc @@ -0,0 +1,578 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "BitVector.h" + +#include +#include +#include "Serializer.h" + +using namespace probabilistic; + +BitVector::size_type BitVector::npos = static_cast(-1); +BitVector::block_type BitVector::bits_per_block = + std::numeric_limits::digits; + +namespace { + +uint8_t count_table[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, + 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, + 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, + 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, + 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, + 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, + 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, + 6, 7, 6, 7, 7, 8 +}; + +} // namespace + +BitVector::Reference::Reference(block_type& block, block_type i) + : block(block), mask((block_type(1) << i)) + { + assert(i < bits_per_block); + } + +BitVector::Reference& BitVector::Reference::Flip() + { + block ^= mask; + return *this; + } + +BitVector::Reference::operator 
bool() const + { + return (block & mask) != 0; + } + +bool BitVector::Reference::operator~() const + { + return (block & mask) == 0; + } + +BitVector::Reference& BitVector::Reference::operator=(bool x) + { + if ( x ) + block |= mask; + else + block &= ~mask; + + return *this; + } + +BitVector::Reference& BitVector::Reference::operator=(const Reference& other) + { + if ( other ) + block |= mask; + else + block &= ~mask; + + return *this; + } + +BitVector::Reference& BitVector::Reference::operator|=(bool x) + { + if ( x ) + block |= mask; + + return *this; + } + +BitVector::Reference& BitVector::Reference::operator&=(bool x) + { + if ( ! x ) + block &= ~mask; + + return *this; + } + +BitVector::Reference& BitVector::Reference::operator^=(bool x) + { + if ( x ) + block ^= mask; + + return *this; + } + +BitVector::Reference& BitVector::Reference::operator-=(bool x) + { + if ( x ) + block &= ~mask; + + return *this; + } + +BitVector::BitVector() + { + num_bits = 0; + } + +BitVector::BitVector(size_type size, bool value) + : bits(bits_to_blocks(size), value ? 
~block_type(0) : 0) + { + num_bits = size; + } + +BitVector::BitVector(BitVector const& other) + : bits(other.bits) + { + num_bits = other.num_bits; + } + +BitVector BitVector::operator~() const + { + BitVector b(*this); + b.Flip(); + return b; + } + +BitVector& BitVector::operator=(BitVector const& other) + { + bits = other.bits; + // Keep the logical length in sync with the copied blocks. + num_bits = other.num_bits; + return *this; + } + +BitVector BitVector::operator<<(size_type n) const + { + BitVector b(*this); + return b <<= n; + } + +BitVector BitVector::operator>>(size_type n) const + { + BitVector b(*this); + return b >>= n; + } + +BitVector& BitVector::operator<<=(size_type n) + { + if ( n >= num_bits ) + return Reset(); + + if ( n > 0 ) + { + size_type last = Blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits[0]; + + assert(Blocks() >= 1); + assert(div <= last); + + if ( r != 0 ) + { + for ( size_type i = last - div; i > 0; --i ) + b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); + + b[div] = b[0] << r; + } + + else + { + for (size_type i = last-div; i > 0; --i) + b[i + div] = b[i]; + + b[div] = b[0]; + } + + std::fill_n(b, div, block_type(0)); + zero_unused_bits(); + } + + return *this; + } + +BitVector& BitVector::operator>>=(size_type n) + { + if ( n >= num_bits ) + return Reset(); + + if ( n > 0 ) + { + size_type last = Blocks() - 1; + size_type div = n / bits_per_block; + block_type r = bit_index(n); + block_type* b = &bits[0]; + + assert(Blocks() >= 1); + assert(div <= last); + + if ( r != 0 ) + { + // Walk upward from the lowest source block so that i - div never underflows. + for (size_type i = div; i < last; ++i) + b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); + + b[last - div] = b[last] >> r; + } + + else + { + for (size_type i = div; i <= last; ++i) + b[i-div] = b[i]; + } + + std::fill_n(b + (Blocks() - div), div, block_type(0)); + } + + return *this; + } + +BitVector& BitVector::operator&=(BitVector const& other) + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] &=
other.bits[i]; + + return *this; + } + +BitVector& BitVector::operator|=(BitVector const& other) + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] |= other.bits[i]; + + return *this; + } + +BitVector& BitVector::operator^=(BitVector const& other) + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] ^= other.bits[i]; + + return *this; + } + +BitVector& BitVector::operator-=(BitVector const& other) + { + assert(Size() >= other.Size()); + + for ( size_type i = 0; i < Blocks(); ++i ) + bits[i] &= ~other.bits[i]; + + return *this; + } + +namespace probabilistic { + +BitVector operator&(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b &= y; + } + +BitVector operator|(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b |= y; + } + +BitVector operator^(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b ^= y; + } + +BitVector operator-(BitVector const& x, BitVector const& y) + { + BitVector b(x); + return b -= y; + } + +bool operator==(BitVector const& x, BitVector const& y) + { + return x.num_bits == y.num_bits && x.bits == y.bits; + } + +bool operator!=(BitVector const& x, BitVector const& y) + { + return ! (x == y); + } + +bool operator<(BitVector const& x, BitVector const& y) + { + assert(x.Size() == y.Size()); + + for ( BitVector::size_type r = x.Blocks(); r > 0; --r ) + { + BitVector::size_type i = r - 1; + + if ( x.bits[i] < y.bits[i] ) + return true; + + else if ( x.bits[i] > y.bits[i] ) + return false; + + } + + return false; + } + +} + +void BitVector::Resize(size_type n, bool value) + { + size_type old = Blocks(); + size_type required = bits_to_blocks(n); + block_type block_value = value ? 
~block_type(0) : block_type(0); + + if ( required != old ) + bits.resize(required, block_value); + + if ( value && (n > num_bits) && extra_bits() ) + bits[old - 1] |= (block_value << extra_bits()); + + num_bits = n; + zero_unused_bits(); + } + +void BitVector::Clear() + { + bits.clear(); + num_bits = 0; + } + +void BitVector::PushBack(bool bit) + { + size_type s = Size(); + Resize(s + 1); + Set(s, bit); + } + +void BitVector::Append(block_type block) + { + size_type excess = extra_bits(); + + if ( excess ) + { + assert(! Empty()); + bits.push_back(block >> (bits_per_block - excess)); + bits[Blocks() - 2] |= (block << excess); + } + + else + { + bits.push_back(block); + } + + num_bits += bits_per_block; + } + +BitVector& BitVector::Set(size_type i, bool bit) + { + assert(i < num_bits); + + if ( bit ) + bits[block_index(i)] |= bit_mask(i); + else + Reset(i); + + return *this; + } + +BitVector& BitVector::Set() + { + std::fill(bits.begin(), bits.end(), ~block_type(0)); + zero_unused_bits(); + return *this; + } + +BitVector& BitVector::Reset(size_type i) + { + assert(i < num_bits); + bits[block_index(i)] &= ~bit_mask(i); + return *this; + } + +BitVector& BitVector::Reset() + { + std::fill(bits.begin(), bits.end(), block_type(0)); + return *this; + } + +BitVector& BitVector::Flip(size_type i) + { + assert(i < num_bits); + bits[block_index(i)] ^= bit_mask(i); + return *this; + } + +BitVector& BitVector::Flip() + { + for (size_type i = 0; i < Blocks(); ++i) + bits[i] = ~bits[i]; + + zero_unused_bits(); + return *this; + } + +bool BitVector::operator[](size_type i) const + { + assert(i < num_bits); + return (bits[block_index(i)] & bit_mask(i)) != 0; + } + +BitVector::Reference BitVector::operator[](size_type i) + { + assert(i < num_bits); + return Reference(bits[block_index(i)], bit_index(i)); + } + +BitVector::size_type BitVector::Count() const + { + std::vector::const_iterator first = bits.begin(); + size_t n = 0; + size_type length = Blocks(); + + while ( length ) + { + 
block_type block = *first; + + while ( block ) + { + // TODO: use _popcnt if available. + n += count_table[block & ((1u << 8) - 1)]; + block >>= 8; + } + + ++first; + --length; + } + + return n; + } + +BitVector::size_type BitVector::Blocks() const + { + return bits.size(); + } + +BitVector::size_type BitVector::Size() const + { + return num_bits; + } + +bool BitVector::Empty() const + { + return bits.empty(); + } + +bool BitVector::AllZero() const + { + for ( size_t i = 0; i < bits.size(); ++i ) + { + if ( bits[i] ) + return false; + } + + return true; + } + +BitVector::size_type BitVector::FindFirst() const + { + return find_from(0); + } + +BitVector::size_type BitVector::FindNext(size_type i) const + { + if ( i >= (Size() - 1) || Size() == 0 ) + return npos; + + ++i; + size_type bi = block_index(i); + block_type block = bits[bi] & (~block_type(0) << bit_index(i)); + return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); + } + +BitVector::size_type BitVector::lowest_bit(block_type block) + { + block_type x = block - (block & (block - 1)); + size_type log = 0; + + while (x >>= 1) + ++log; + + return log; + } + +BitVector::block_type BitVector::extra_bits() const + { + return bit_index(Size()); + } + +void BitVector::zero_unused_bits() + { + if ( extra_bits() ) + bits.back() &= ~(~block_type(0) << extra_bits()); + } + +BitVector::size_type BitVector::find_from(size_type i) const + { + while (i < Blocks() && bits[i] == 0) + ++i; + + if ( i >= Blocks() ) + return npos; + + return i * bits_per_block + lowest_bit(bits[i]); + } + +bool BitVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BitVector* BitVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_BITVECTOR)); + } + +IMPLEMENT_SERIAL(BitVector, SER_BITVECTOR); + +bool BitVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BITVECTOR, SerialObj); + + if ( ! 
SERIALIZE(static_cast(bits.size())) ) + return false; + + for ( size_t i = 0; i < bits.size(); ++i ) + if ( ! SERIALIZE(static_cast(bits[i])) ) + return false; + + return SERIALIZE(static_cast(num_bits)); + } + +bool BitVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint64 size; + if ( ! UNSERIALIZE(&size) ) + return false; + + bits.resize(static_cast(size)); + + for ( size_t i = 0; i < bits.size(); ++i ) + { + uint64 block; + if ( ! UNSERIALIZE(&block) ) + return false; + + bits[i] = static_cast(block); + } + + uint64 serialized_num_bits; + if ( ! UNSERIALIZE(&serialized_num_bits) ) + return false; + + num_bits = static_cast(serialized_num_bits); + + return true; + } diff --git a/src/probabilistic/BitVector.h b/src/probabilistic/BitVector.h new file mode 100644 index 0000000000..d9c55d53c6 --- /dev/null +++ b/src/probabilistic/BitVector.h @@ -0,0 +1,370 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_BITVECTOR_H +#define PROBABILISTIC_BITVECTOR_H + +#include +#include + +#include "SerialObj.h" + +namespace probabilistic { + +/** + * A vector of bits. + */ +class BitVector : public SerialObj { +public: + typedef size_t block_type; + typedef size_t size_type; + typedef bool const_reference; + + static size_type npos; + static block_type bits_per_block; + + /** + * An lvalue proxy for individual bits. + */ + class Reference { + public: + /** + * Inverts the bits' values. + */ + Reference& Flip(); + + operator bool() const; + bool operator~() const; + Reference& operator=(bool x); + Reference& operator=(const Reference& other); + Reference& operator|=(bool x); + Reference& operator&=(bool x); + Reference& operator^=(bool x); + Reference& operator-=(bool x); + + private: + friend class BitVector; + + Reference(block_type& block, block_type i); + void operator&(); + + block_type& block; + const block_type mask; + }; + + /** + * Default-constructs an empty bit vector.
+ */ + BitVector(); + + /** + * Constructs a bit vector of a given size. + * @param size The number of bits. + * @param value The value for each bit. + */ + explicit BitVector(size_type size, bool value = false); + + /** + * Constructs a bit vector from a sequence of blocks. + * + * @param first Start of range + * @param last End of range. + * + */ + template + BitVector(InputIterator first, InputIterator last) + { + bits.insert(bits.end(), first, last); + num_bits = bits.size() * bits_per_block; + } + + /** + * Copy-constructs a bit vector. + * @param other The bit vector to copy. + */ + BitVector(const BitVector& other); + + /** + * Assigns another bit vector to this instance. + * @param other The RHS of the assignment. + */ + BitVector& operator=(const BitVector& other); + + // + // Bitwise operations. + // + BitVector operator~() const; + BitVector operator<<(size_type n) const; + BitVector operator>>(size_type n) const; + BitVector& operator<<=(size_type n); + BitVector& operator>>=(size_type n); + BitVector& operator&=(BitVector const& other); + BitVector& operator|=(BitVector const& other); + BitVector& operator^=(BitVector const& other); + BitVector& operator-=(BitVector const& other); + friend BitVector operator&(BitVector const& x, BitVector const& y); + friend BitVector operator|(BitVector const& x, BitVector const& y); + friend BitVector operator^(BitVector const& x, BitVector const& y); + friend BitVector operator-(BitVector const& x, BitVector const& y); + + // + // Relational operators + // + friend bool operator==(BitVector const& x, BitVector const& y); + friend bool operator!=(BitVector const& x, BitVector const& y); + friend bool operator<(BitVector const& x, BitVector const& y); + + // + // Basic operations + // + + /** Appends the bits in a sequence of values. + * @tparam Iterator A forward iterator. + * @param first An iterator pointing to the first element of the sequence. 
+ * @param last An iterator pointing to one past the last element of the + * sequence. + */ + template + void Append(ForwardIterator first, ForwardIterator last) + { + if ( first == last ) + return; + + block_type excess = extra_bits(); + typename std::iterator_traits::difference_type delta = + std::distance(first, last); + + bits.reserve(Blocks() + delta); + + if ( excess ) + { + bits.back() |= (*first << excess); + + do { + block_type b = *first++ >> (bits_per_block - excess); + bits.push_back(b | (first == last ? 0 : *first << excess)); + } while (first != last); + + } + + else + bits.insert(bits.end(), first, last); + + num_bits += bits_per_block * delta; + } + + /** + * Appends the bits in a given block. + * @param block The block containing bits to append. + */ + void Append(block_type block); + + /** Appends a single bit to the end of the bit vector. + * @param bit The value of the bit. + */ + void PushBack(bool bit); + + /** + * Removes all bits, leaving an empty bit vector. + */ + void Clear(); + + /** + * Resizes the bit vector to a new number of bits. + * @param n The new number of bits of the bit vector. + * @param value The bit value of new values, if the vector expands. + */ + void Resize(size_type n, bool value = false); + + /** + * Sets a bit at a specific position to a given value. + * @param i The bit position. + * @param bit The value assigned to position *i*. + * @return A reference to the bit vector instance. + */ + BitVector& Set(size_type i, bool bit = true); + + /** + * Sets all bits to 1. + * @return A reference to the bit vector instance. + */ + BitVector& Set(); + + /** + * Resets a bit at a specific position, i.e., sets it to 0. + * @param i The bit position. + * @return A reference to the bit vector instance. + */ + BitVector& Reset(size_type i); + + /** + * Sets all bits to 0. + * @return A reference to the bit vector instance. + */ + BitVector& Reset(); + + /** + * Toggles/flips a bit at a specific position. + * @param i The bit position.
+ * @return A reference to the bit vector instance. + */ + BitVector& Flip(size_type i); + + /** + * Computes the complement. + * @return A reference to the bit vector instance. + */ + BitVector& Flip(); + + /** Retrieves a single bit. + * @param i The bit position. + * @return A mutable reference to the bit at position *i*. + */ + Reference operator[](size_type i); + + /** + * Retrieves a single bit. + * @param i The bit position. + * @return A const-reference to the bit at position *i*. + */ + const_reference operator[](size_type i) const; + + /** + * Counts the number of 1-bits in the bit vector. Also known as *population + * count* or *Hamming weight*. + * @return The number of bits set to 1. + */ + size_type Count() const; + + /** + * Retrieves the number of blocks of the underlying storage. + * @return The number of blocks that represent `Size()` bits. + */ + size_type Blocks() const; + + /** + * Retrieves the number of bits the bitvector consists of. + * @return The length of the bit vector in bits. + */ + size_type Size() const; + + /** + * Checks whether the bit vector is empty. + * @return `true` iff the bitvector has zero length. + */ + bool Empty() const; + + /** + * Checks whether all bits are 0. + * @return `true` iff all bits in all blocks are 0. + */ + bool AllZero() const; + + /** + * Finds the bit position of the first 1-bit. + * @return The position of the first bit that equals one or `npos` if no + * such bit exists. + */ + size_type FindFirst() const; + + /** + * Finds the next 1-bit from a given starting position. + * + * @param i The index where to start looking. + * + * @return The position of the first bit that equals 1 after position + * *i* or `npos` if no such bit exists. + */ + size_type FindNext(size_type i) const; + + /** + * Serializes the bit vector. + * + * @param info The serialization information to use. + * + * @return True if successful.
+ */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserializes the bit vector. + * + * @param info The serialization information to use. + * + * @return The unserialized bit vector, or null if an error occurred. + */ + static BitVector* Unserialize(UnserialInfo* info); + +protected: + DECLARE_SERIAL(BitVector); + +private: + /** + * Computes the number of excess/unused bits in the bit vector. + */ + block_type extra_bits() const; + + /** + * If the number of bits in the vector is not a multiple of + * bitvector::bits_per_block, then the last block exhibits unused bits which + * this function resets. + */ + void zero_unused_bits(); + + /** + * Looks for the first 1-bit starting at a given position. + * @param i The block index to start looking. + * @return The block index of the first 1-bit starting from *i* or + * `bitvector::npos` if no 1-bit exists. + */ + size_type find_from(size_type i) const; + + /** + * Computes the block index for a given bit position. + */ + static size_type block_index(size_type i) + { + return i / bits_per_block; + } + + /** + * Computes the bit index within a given block for a given bit position. + */ + static block_type bit_index(size_type i) + { + return i % bits_per_block; + } + + /** + * Computes the bitmask block to extract the bit at a given bit position. + */ + static block_type bit_mask(size_type i) + { + return block_type(1) << bit_index(i); + } + + /** + * Computes the number of blocks needed to represent a given number of + * bits. + * @param bits The number of bits. + * @return The number of blocks to represent *bits* number of bits. + */ + static size_type bits_to_blocks(size_type bits) + { + return bits / bits_per_block + + static_cast(bits % bits_per_block != 0); + } + + /** + * Computes the bit position of the first 1-bit in a given block. + * @param block The block to inspect. + * @return The bit position where *block* has its first bit set to 1. 
+ */ + static size_type lowest_bit(block_type block); + + std::vector bits; + size_type num_bits; +}; + +} + +#endif diff --git a/src/probabilistic/BloomFilter.cc b/src/probabilistic/BloomFilter.cc new file mode 100644 index 0000000000..db768ed934 --- /dev/null +++ b/src/probabilistic/BloomFilter.cc @@ -0,0 +1,257 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include +#include +#include + +#include "BloomFilter.h" + +#include "CounterVector.h" +#include "Serializer.h" + +using namespace probabilistic; + +BloomFilter::BloomFilter() + { + hasher = 0; + } + +BloomFilter::BloomFilter(const Hasher* arg_hasher) + { + hasher = arg_hasher; + } + +BloomFilter::~BloomFilter() + { + delete hasher; + } + +bool BloomFilter::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +BloomFilter* BloomFilter::Unserialize(UnserialInfo* info) + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_BLOOMFILTER)); + } + +bool BloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BLOOMFILTER, SerialObj); + + if ( ! SERIALIZE(static_cast(hasher->K())) ) + return false; + + return SERIALIZE_STR(hasher->Name().c_str(), hasher->Name().size()); + } + +bool BloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + uint16 k; + if ( ! UNSERIALIZE(&k) ) + return false; + + const char* name; + if ( ! 
UNSERIALIZE_STR(&name, 0) ) + return false; + + hasher = Hasher::Create(k, name); + + delete [] name; + return true; + } + +size_t BasicBloomFilter::M(double fp, size_t capacity) + { + double ln2 = std::log(2); + return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); + } + +size_t BasicBloomFilter::K(size_t cells, size_t capacity) + { + double frac = static_cast(cells) / static_cast(capacity); + return std::ceil(frac * std::log(2)); + } + +bool BasicBloomFilter::Empty() const + { + return bits->AllZero(); + } + +void BasicBloomFilter::Clear() + { + bits->Clear(); + } + +bool BasicBloomFilter::Merge(const BloomFilter* other) + { + if ( typeid(*this) != typeid(*other) ) + return false; + + const BasicBloomFilter* o = static_cast(other); + + if ( ! hasher->Equals(o->hasher) ) + { + reporter->Error("incompatible hashers in BasicBloomFilter merge"); + return false; + } + + else if ( bits->Size() != o->bits->Size() ) + { + reporter->Error("different bitvector size in BasicBloomFilter merge"); + return false; + } + + (*bits) |= *o->bits; + + return true; + } + +BasicBloomFilter* BasicBloomFilter::Clone() const + { + BasicBloomFilter* copy = new BasicBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->bits = new BitVector(*bits); + + return copy; + } + +BasicBloomFilter::BasicBloomFilter() + { + bits = 0; + } + +BasicBloomFilter::BasicBloomFilter(const Hasher* hasher, size_t cells) + : BloomFilter(hasher) + { + bits = new BitVector(cells); + } + +IMPLEMENT_SERIAL(BasicBloomFilter, SER_BASICBLOOMFILTER) + +bool BasicBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_BASICBLOOMFILTER, BloomFilter); + return bits->Serialize(info); + } + +bool BasicBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + bits = BitVector::Unserialize(info); + return (bits != 0); + } + +void BasicBloomFilter::AddImpl(const Hasher::digest_vector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + bits->Set(h[i] % bits->Size()); + } + 
+size_t BasicBloomFilter::CountImpl(const Hasher::digest_vector& h) const + { + for ( size_t i = 0; i < h.size(); ++i ) + { + if ( ! (*bits)[h[i] % bits->Size()] ) + return 0; + } + + return 1; + } + +CountingBloomFilter::CountingBloomFilter() + { + cells = 0; + } + +CountingBloomFilter::CountingBloomFilter(const Hasher* hasher, + size_t arg_cells, size_t width) + : BloomFilter(hasher) + { + cells = new CounterVector(width, arg_cells); + } + +bool CountingBloomFilter::Empty() const + { + return cells->AllZero(); + } + +void CountingBloomFilter::Clear() + { + cells->Clear(); + } + +bool CountingBloomFilter::Merge(const BloomFilter* other) + { + if ( typeid(*this) != typeid(*other) ) + return false; + + const CountingBloomFilter* o = static_cast(other); + + if ( ! hasher->Equals(o->hasher) ) + { + reporter->Error("incompatible hashers in CountingBloomFilter merge"); + return false; + } + + else if ( cells->Size() != o->cells->Size() ) + { + reporter->Error("different bitvector size in CountingBloomFilter merge"); + return false; + } + + (*cells) |= *o->cells; + + return true; + } + +CountingBloomFilter* CountingBloomFilter::Clone() const + { + CountingBloomFilter* copy = new CountingBloomFilter(); + + copy->hasher = hasher->Clone(); + copy->cells = new CounterVector(*cells); + + return copy; + } + +IMPLEMENT_SERIAL(CountingBloomFilter, SER_COUNTINGBLOOMFILTER) + +bool CountingBloomFilter::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTINGBLOOMFILTER, BloomFilter); + return cells->Serialize(info); + } + +bool CountingBloomFilter::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(BloomFilter); + cells = CounterVector::Unserialize(info); + return (cells != 0); + } + +// TODO: Use partitioning in add/count to allow for reusing CMS bounds. 
+void CountingBloomFilter::AddImpl(const Hasher::digest_vector& h) + { + for ( size_t i = 0; i < h.size(); ++i ) + cells->Increment(h[i] % cells->Size()); + } + +size_t CountingBloomFilter::CountImpl(const Hasher::digest_vector& h) const + { + CounterVector::size_type min = + std::numeric_limits::max(); + + for ( size_t i = 0; i < h.size(); ++i ) + { + CounterVector::size_type cnt = cells->Count(h[i] % cells->Size()); + if ( cnt < min ) + min = cnt; + } + + return min; + } diff --git a/src/probabilistic/BloomFilter.h b/src/probabilistic/BloomFilter.h new file mode 100644 index 0000000000..b6cf18672f --- /dev/null +++ b/src/probabilistic/BloomFilter.h @@ -0,0 +1,241 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_BLOOMFILTER_H +#define PROBABILISTIC_BLOOMFILTER_H + +#include +#include "BitVector.h" +#include "Hasher.h" + +namespace probabilistic { + +class CounterVector; + +/** + * The abstract base class for Bloom filters. + * + * At this point we won't let the user choose the hasher, but we might open + * up the interface in the future. + */ +class BloomFilter : public SerialObj { +public: + /** + * Destructor. + */ + virtual ~BloomFilter(); + + /** + * Adds an element of type T to the Bloom filter. + * @param x The element to add + */ + template + void Add(const T& x) + { + AddImpl((*hasher)(x)); + } + + /** + * Retrieves the associated count of a given value. + * + * @param x The value of type `T` to check. + * + * @return The counter associated with *x*. + */ + template + size_t Count(const T& x) const + { + return CountImpl((*hasher)(x)); + } + + /** + * Checks whether the Bloom filter is empty. + * + * @return `true` if the Bloom filter contains no elements. + */ + virtual bool Empty() const = 0; + + /** + * Removes all elements, i.e., resets all bits in the underlying bit vector. + */ + virtual void Clear() = 0; + + /** + * Merges another Bloom filter into a copy of this one. 
+ * + * @param other The other Bloom filter. + * + * @return `true` on success. + */ + virtual bool Merge(const BloomFilter* other) = 0; + + /** + * Constructs a copy of this Bloom filter. + * + * @return A copy of `*this`. + */ + virtual BloomFilter* Clone() const = 0; + + /** + * Serializes the Bloom filter. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserializes a Bloom filter. + * + * @param info The serialization information to use. + * + * @return The unserialized Bloom filter, or null if an error + * occurred. + */ + static BloomFilter* Unserialize(UnserialInfo* info); + +protected: + DECLARE_ABSTRACT_SERIAL(BloomFilter); + + /** + * Default constructor. + */ + BloomFilter(); + + /** + * Constructs a Bloom filter. + * + * @param hasher The hasher to use; the filter deletes it on destruction. + */ + BloomFilter(const Hasher* hasher); + + /** + * Abstract method for implementing the *Add* operation. + * + * @param hashes A set of *k* hashes for the item to add, computed by + * the internal hasher object. + * + */ + virtual void AddImpl(const Hasher::digest_vector& hashes) = 0; + + /** + * Abstract method for implementing the *Count* operation. + * + * @param hashes A set of *k* hashes for the item to look up, computed by + * the internal hasher object. + * + * @return The counter associated with the hashed element. + */ + virtual size_t CountImpl(const Hasher::digest_vector& hashes) const = 0; + + const Hasher* hasher; +}; + +/** + * A basic Bloom filter. + */ +class BasicBloomFilter : public BloomFilter { +public: + /** + * Constructs a basic Bloom filter with a given number of cells. The + * ideal number of cells can be computed with *M*. + * + * @param hasher The hasher to use. The ideal number of hash + * functions can be computed with *K*. + * + * @param cells The number of cells. 
+ */ + BasicBloomFilter(const Hasher* hasher, size_t cells); + + /** + * Computes the number of cells based on a given false positive rate + * and capacity. In the literature, this parameter often has the name + * *M*. + * + * @param fp The false positive rate. + * + * @param capacity The expected number of elements that will be + * stored. + * + * @return The number of cells needed to support a false positive rate + * of *fp* with at most *capacity* elements. + */ + static size_t M(double fp, size_t capacity); + + /** + * Computes the optimal number of hash functions based on the number of cells + * and expected number of elements. + * + * @param cells The number of cells (*m*). + * + * @param capacity The maximum number of elements. + * + * @return The optimal number of hash functions for a filter of *cells* + * cells holding at most *capacity* elements. + */ + static size_t K(size_t cells, size_t capacity); + + // Overridden from BloomFilter. + virtual bool Empty() const; + virtual void Clear(); + virtual bool Merge(const BloomFilter* other); + virtual BasicBloomFilter* Clone() const; + +protected: + DECLARE_SERIAL(BasicBloomFilter); + + /** + * Default constructor. + */ + BasicBloomFilter(); + + // Overridden from BloomFilter. + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; + +private: + BitVector* bits; +}; + +/** + * A counting Bloom filter. + */ +class CountingBloomFilter : public BloomFilter { +public: + /** + * Constructs a counting Bloom filter. + * + * @param hasher The hasher to use. The ideal number of hash + * functions can be computed with *K*. + * + * @param cells The number of cells to use. + * + * @param width The maximal bit-width of counter values. + */ + CountingBloomFilter(const Hasher* hasher, size_t cells, size_t width); + + // Overridden from BloomFilter. 
+ virtual bool Empty() const; + virtual void Clear(); + virtual bool Merge(const BloomFilter* other); + virtual CountingBloomFilter* Clone() const; + +protected: + DECLARE_SERIAL(CountingBloomFilter); + + /** + * Default constructor. + */ + CountingBloomFilter(); + + // Overridden from BloomFilter. + virtual void AddImpl(const Hasher::digest_vector& h); + virtual size_t CountImpl(const Hasher::digest_vector& h) const; + +private: + CounterVector* cells; +}; + +} + +#endif diff --git a/src/probabilistic/CMakeLists.txt b/src/probabilistic/CMakeLists.txt new file mode 100644 index 0000000000..f82cdfaf8e --- /dev/null +++ b/src/probabilistic/CMakeLists.txt @@ -0,0 +1,19 @@ + +include(BroSubdir) + +include_directories(BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +) + +set(probabilistic_SRCS + BitVector.cc + BloomFilter.cc + CounterVector.cc + Hasher.cc) + +bif_target(bloom-filter.bif) + +bro_add_subdir_library(probabilistic ${probabilistic_SRCS} ${BIF_OUTPUT_CC}) + +add_dependencies(bro_probabilistic generate_outputs) diff --git a/src/probabilistic/CounterVector.cc b/src/probabilistic/CounterVector.cc new file mode 100644 index 0000000000..24c9ff3638 --- /dev/null +++ b/src/probabilistic/CounterVector.cc @@ -0,0 +1,193 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#include "CounterVector.h" + +#include +#include "BitVector.h" +#include "Serializer.h" + +using namespace probabilistic; + +CounterVector::CounterVector(size_t arg_width, size_t cells) + { + bits = new BitVector(arg_width * cells); + width = arg_width; + } + +CounterVector::CounterVector(const CounterVector& other) + { + bits = new BitVector(*other.bits); + width = other.width; + } + +CounterVector::~CounterVector() + { + delete bits; + } + +bool CounterVector::Increment(size_type cell, count_type value) + { + assert(cell < Size()); + assert(value != 0); + + size_t lsb = cell * width; + bool carry = false; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = value & (1 << i); + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + if ( carry ) + { + for ( size_t i = 0; i < width; ++i ) + bits->Set(lsb + i); + } + + return ! carry; + } + +bool CounterVector::Decrement(size_type cell, count_type value) + { + assert(cell < Size()); + assert(value != 0); + + value = ~value + 1; // A - B := A + ~B + 1 + bool carry = false; + size_t lsb = cell * width; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = value & (1 << i); + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + return carry; + } + +bool CounterVector::AllZero() const + { + return bits->AllZero(); + } + +void CounterVector::Clear() + { + bits->Clear(); + } + +CounterVector::count_type CounterVector::Count(size_type cell) const + { + assert(cell < Size()); + + size_t cnt = 0, order = 1; + size_t lsb = cell * width; + + for ( size_t i = lsb; i < lsb + width; ++i, order <<= 1 ) + if ( (*bits)[i] ) + cnt |= order; + + return cnt; + } + +CounterVector::size_type CounterVector::Size() const + { + return bits->Size() / width; + } + +size_t CounterVector::Width() const + { + return width; + } + +size_t CounterVector::Max() const + { + return 
std::numeric_limits::max() + >> (std::numeric_limits::digits - width); + } + +CounterVector& CounterVector::Merge(const CounterVector& other) + { + assert(Size() == other.Size()); + assert(Width() == other.Width()); + + for ( size_t cell = 0; cell < Size(); ++cell ) + { + size_t lsb = cell * width; + bool carry = false; + + for ( size_t i = 0; i < width; ++i ) + { + bool b1 = (*bits)[lsb + i]; + bool b2 = (*other.bits)[lsb + i]; + (*bits)[lsb + i] = b1 ^ b2 ^ carry; + carry = ( b1 && b2 ) || ( carry && ( b1 != b2 ) ); + } + + if ( carry ) + { + for ( size_t i = 0; i < width; ++i ) + bits->Set(lsb + i); + } + } + + return *this; + } + +namespace probabilistic { + +CounterVector& CounterVector::operator|=(const CounterVector& other) + { + return Merge(other); + } + +CounterVector operator|(const CounterVector& x, const CounterVector& y) + { + CounterVector cv(x); + return cv |= y; + } + +} + +bool CounterVector::Serialize(SerialInfo* info) const + { + return SerialObj::Serialize(info); + } + +CounterVector* CounterVector::Unserialize(UnserialInfo* info) + { + return reinterpret_cast(SerialObj::Unserialize(info, SER_COUNTERVECTOR)); + } + +IMPLEMENT_SERIAL(CounterVector, SER_COUNTERVECTOR) + +bool CounterVector::DoSerialize(SerialInfo* info) const + { + DO_SERIALIZE(SER_COUNTERVECTOR, SerialObj); + + if ( ! bits->Serialize(info) ) + return false; + + return SERIALIZE(static_cast(width)); + } + +bool CounterVector::DoUnserialize(UnserialInfo* info) + { + DO_UNSERIALIZE(SerialObj); + + bits = BitVector::Unserialize(info); + if ( ! bits ) + return false; + + uint64 width; + if ( ! UNSERIALIZE(&width) ) + return false; + + width = static_cast(width); + + return true; + } diff --git a/src/probabilistic/CounterVector.h b/src/probabilistic/CounterVector.h new file mode 100644 index 0000000000..df6fc57ac2 --- /dev/null +++ b/src/probabilistic/CounterVector.h @@ -0,0 +1,165 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#ifndef PROBABILISTIC_COUNTERVECTOR_H +#define PROBABILISTIC_COUNTERVECTOR_H + +#include "SerialObj.h" + +namespace probabilistic { + +class BitVector; + +/** + * A vector of counters, each of which has a fixed number of bits. + */ +class CounterVector : public SerialObj { +public: + typedef size_t size_type; + typedef uint64 count_type; + + /** + * Constructs a counter vector having cells of a given width. + * + * @param width The number of bits that each cell occupies. + * + * @param cells The number of cells in the bitvector. + * + * @pre `cells > 0 && width > 0` + */ + CounterVector(size_t width, size_t cells = 1024); + + /** + * Copy-constructs a counter vector. + * + * @param other The counter vector to copy. + */ + CounterVector(const CounterVector& other); + + /** + * Destructor. + */ + ~CounterVector(); + + /** + * Increments a given cell. + * + * @param cell The cell to increment. + * + * @param value The value to add to the current counter in *cell*. + * + * @return `true` if adding *value* to the counter in *cell* succeeded. + * + * @pre `cell < Size()` + */ + bool Increment(size_type cell, count_type value = 1); + + /** + * Decrements a given cell. + * + * @param cell The cell to decrement. + * + * @param value The value to subtract from the current counter in *cell*. + * + * @return `true` if subtracting *value* from the counter in *cell* succeeded. + * + * @pre `cell < Size()` + */ + bool Decrement(size_type cell, count_type value = 1); + + /** + * Retrieves the counter of a given cell. + * + * @param cell The cell index to retrieve the count for. + * + * @return The counter associated with *cell*. + * + * @pre `cell < Size()` + */ + count_type Count(size_type cell) const; + + /** + * Checks whether all counters are 0. + * @return `true` iff all counters have the value 0. + */ + bool AllZero() const; + + /** + * Sets all counters to 0. + */ + void Clear(); + + /** + * Retrieves the number of cells in the storage. 
+ * + * @return The number of cells. + */ + size_type Size() const; + + /** + * Retrieves the counter width. + * + * @return The number of bits per counter. + */ + size_t Width() const; + + /** + * Computes the maximum counter value. + * + * @return The maximum counter value based on the width. + */ + size_t Max() const; + + /** + * Merges another counter vector into this instance by *adding* the + * counters of each cell. + * + * @param other The counter vector to merge into this instance. + * + * @return A reference to `*this`. + * + * @pre `Size() == other.Size() && Width() == other.Width()` + */ + CounterVector& Merge(const CounterVector& other); + + /** + * An alias for ::Merge. + */ + CounterVector& operator|=(const CounterVector& other); + + /** + * Serializes the counter vector. + * + * @param info The serialization information to use. + * + * @return True if successful. + */ + bool Serialize(SerialInfo* info) const; + + /** + * Unserializes the counter vector. + * + * @param info The serialization information to use. + * + * @return The unserialized counter vector, or null if an error + * occurred. + */ + static CounterVector* Unserialize(UnserialInfo* info); + +protected: + friend CounterVector operator|(const CounterVector& x, + const CounterVector& y); + + CounterVector() { } + + DECLARE_SERIAL(CounterVector); + +private: + CounterVector& operator=(const CounterVector&); // Disable. + + BitVector* bits; + size_t width; +}; + +} + +#endif diff --git a/src/probabilistic/Hasher.cc b/src/probabilistic/Hasher.cc new file mode 100644 index 0000000000..f9ce7bdd6b --- /dev/null +++ b/src/probabilistic/Hasher.cc @@ -0,0 +1,114 @@ +// See the file "COPYING" in the main distribution directory for copyright. 
+ +#include + +#include "Hasher.h" +#include "digest.h" + +using namespace probabilistic; + +UHF::UHF(size_t seed, const std::string& extra) + : h(compute_seed(seed, extra)) + { + } + +Hasher::digest UHF::hash(const void* x, size_t n) const + { + assert(n <= UHASH_KEY_SIZE); + return n == 0 ? 0 : h(x, n); + } + +size_t UHF::compute_seed(size_t seed, const std::string& extra) + { + u_char buf[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + sha256_init(&ctx); + + if ( extra.empty() ) + { + unsigned int first_seed = initial_seed(); + sha256_update(&ctx, &first_seed, sizeof(first_seed)); + } + + else + sha256_update(&ctx, extra.c_str(), extra.size()); + + sha256_update(&ctx, &seed, sizeof(seed)); + sha256_final(&ctx, buf); + + // Take the first sizeof(size_t) bytes as seed. + return *reinterpret_cast(buf); + } + +Hasher* Hasher::Create(size_t k, const std::string& name) + { + return new DefaultHasher(k, name); + } + +Hasher::Hasher(size_t k, const std::string& arg_name) + : k(k) + { + name = arg_name; + } + +DefaultHasher::DefaultHasher(size_t k, const std::string& name) + : Hasher(k, name) + { + for ( size_t i = 0; i < k; ++i ) + hash_functions.push_back(UHF(i, name)); + } + +Hasher::digest_vector DefaultHasher::Hash(const void* x, size_t n) const + { + digest_vector h(K(), 0); + + for ( size_t i = 0; i < h.size(); ++i ) + h[i] = hash_functions[i](x, n); + + return h; + } + +DefaultHasher* DefaultHasher::Clone() const + { + return new DefaultHasher(*this); + } + +bool DefaultHasher::Equals(const Hasher* other) const + { + if ( typeid(*this) != typeid(*other) ) + return false; + + const DefaultHasher* o = static_cast(other); + return hash_functions == o->hash_functions; + } + +DoubleHasher::DoubleHasher(size_t k, const std::string& name) + : Hasher(k, name), h1(1, name), h2(2, name) + { + } + +Hasher::digest_vector DoubleHasher::Hash(const void* x, size_t n) const + { + digest d1 = h1(x, n); + digest d2 = h2(x, n); + digest_vector h(K(), 0); + + for ( size_t i = 0; i < 
h.size(); ++i ) + h[i] = d1 + i * d2; + + return h; + } + +DoubleHasher* DoubleHasher::Clone() const + { + return new DoubleHasher(*this); + } + +bool DoubleHasher::Equals(const Hasher* other) const + { + if ( typeid(*this) != typeid(*other) ) + return false; + + const DoubleHasher* o = static_cast(other); + return h1 == o->h1 && h2 == o->h2; + } diff --git a/src/probabilistic/Hasher.h b/src/probabilistic/Hasher.h new file mode 100644 index 0000000000..9f7d4ae32d --- /dev/null +++ b/src/probabilistic/Hasher.h @@ -0,0 +1,216 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#ifndef PROBABILISTIC_HASHER_H +#define PROBABILISTIC_HASHER_H + +#include "Hash.h" +#include "H3.h" + +namespace probabilistic { + +/** + * Abstract base class for hashers. A hasher creates a family of hash + * functions to hash an element *k* times. + */ +class Hasher { +public: + typedef hash_t digest; + typedef std::vector digest_vector; + + /** + * Destructor. + */ + virtual ~Hasher() { } + + /** + * Computes hash values for an element. + * + * @param x The element to hash. + * + * @return Vector of *k* hash values. + */ + template + digest_vector operator()(const T& x) const + { + return Hash(&x, sizeof(T)); + } + + /** + * Computes the hashes for a set of bytes. + * + * @param x Pointer to first byte to hash. + * + * @param n Number of bytes to hash. + * + * @return Vector of *k* hash values. + * + */ + virtual digest_vector Hash(const void* x, size_t n) const = 0; + + /** + * Returns a deep copy of the hasher. + */ + virtual Hasher* Clone() const = 0; + + /** + * Returns true if two hashers are identical. + */ + virtual bool Equals(const Hasher* other) const = 0; + + /** + * Returns the number *k* of hash functions the hashers applies. + */ + size_t K() const { return k; } + + /** + * Returns the hasher's name. TODO: What's this? + */ + const std::string& Name() const { return name; } + + /** + * Constructs the hasher used by the implementation. 
This hardcodes a + * specific hashing policy. It exists only because the HashingPolicy + * class hierarchy is not yet serializable. + * + * @param k The number of hash functions to apply. + * + * @param name The hasher's name. Hashers with the same name should + * provide consistent results. + * + * @return A new hasher instance. + */ + static Hasher* Create(size_t k, const std::string& name); + +protected: + /** + * Constructor. + * + * @param k The number of hash functions. + * + * @param name A name for the hasher. Hashers with the same name + * should provide consistent results. + */ + Hasher(size_t k, const std::string& name); + +private: + const size_t k; + std::string name; +}; + +/** + * A universal hash function family. This is a helper class that Hasher + * implementations can use in their implementation. + */ +class UHF { +public: + /** + * Constructs an H3 hash function seeded with a given seed and an + * optional extra seed to replace the initial Bro seed. + * + * @param seed The seed to use for this instance. + * + * @param extra If not empty, this string replaces the initial Bro + * seed as the additional input from which the seed of the hash + * function is computed. + */ + UHF(size_t seed, const std::string& extra = ""); + + template + Hasher::digest operator()(const T& x) const + { + return hash(&x, sizeof(T)); + } + + /** + * Computes the hash value for a sequence of bytes. + * + * @param x Pointer to the first of the *n* bytes to hash. + * + * @return The hash value. + */ + Hasher::digest operator()(const void* x, size_t n) const + { + return hash(x, n); + } + + /** + * Computes the hash value for a set of bytes. + * + * @param x Pointer to first byte to hash. + * + * @param n Number of bytes to hash. + * + * @return The hash value. + * + */ + Hasher::digest hash(const void* x, size_t n) const; + + friend bool operator==(const UHF& x, const UHF& y) + { + return x.h == y.h; + } + + friend bool operator!=(const UHF& x, const UHF& y) + { + return ! 
(x == y); + } + +private: + static size_t compute_seed(size_t seed, const std::string& extra); + + H3 h; +}; + + +/** + * A hasher implementing the default hashing policy. Uses *k* separate hash + * functions internally. + */ +class DefaultHasher : public Hasher { +public: + /** + * Constructor for a hasher with *k* hash functions. + * + * @param k The number of hash functions to use. + * + * @param name The name of the hasher. + */ + DefaultHasher(size_t k, const std::string& name); + + // Overridden from Hasher. + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DefaultHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; + +private: + std::vector hash_functions; +}; + +/** + * The *double-hashing* policy. Uses a linear combination of two hash + * functions. + */ +class DoubleHasher : public Hasher { +public: + /** + * Constructor for a double hasher with *k* hash functions. + * + * @param k The number of hash functions to use. + * + * @param name The name of the hasher. + */ + DoubleHasher(size_t k, const std::string& name); + + // Overridden from Hasher. + virtual digest_vector Hash(const void* x, size_t n) const /* final */; + virtual DoubleHasher* Clone() const /* final */; + virtual bool Equals(const Hasher* other) const /* final */; + +private: + UHF h1; + UHF h2; +}; + +} + +#endif diff --git a/src/probabilistic/bloom-filter.bif b/src/probabilistic/bloom-filter.bif new file mode 100644 index 0000000000..c6760f6adf --- /dev/null +++ b/src/probabilistic/bloom-filter.bif @@ -0,0 +1,196 @@ +# =========================================================================== +# +# Bloom Filter Functions +# +# =========================================================================== + +%%{ + +// TODO: This is currently included from the top-level src directory, hence +// paths are relative to there. We need a better mechanisms to pull in +// BiFs defined in sub directories. 
+#include "probabilistic/BloomFilter.h" +#include "OpaqueVal.h" + +using namespace probabilistic; + +%%} + +module GLOBAL; + +## Creates a basic Bloom filter. +## +## .. note:: A Bloom filter can have a name associated with it. In the future, +## Bloom filters with the same name will be compatible across independent Bro +## instances, i.e., it will be possible to merge them. Currently, however, that is +## not yet supported. +## +## fp: The desired false-positive rate. +## +## capacity: The maximum number of elements that guarantees a false-positive +## rate of *fp*. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will remain tied to the current Bro process. +## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloomfilter_counting_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear bloomfilter_merge +function bloomfilter_basic_init%(fp: double, capacity: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( fp < 0.0 || fp > 1.0 ) + { + reporter->Error("false-positive rate must take value between 0 and 1"); + return 0; + } + + size_t cells = BasicBloomFilter::M(fp, capacity); + size_t optimal_k = BasicBloomFilter::K(cells, capacity); + const Hasher* h = Hasher::Create(optimal_k, name->CheckString()); + + return new BloomFilterVal(new BasicBloomFilter(h, cells)); + %} + +## Creates a counting Bloom filter. +## +## .. note:: A Bloom filter can have a name associated with it. In the future, +## Bloom filters with the same name will be compatible across independent Bro +## instances, i.e., it will be possible to merge them. Currently, however, that is +## not yet supported. +## +## k: The number of hash functions to use. +## +## cells: The number of cells of the underlying counter vector. As there's no +## single answer to what's the best parameterization for a counting Bloom filter, +## we refer to the Bloom filter literature here for choosing an appropriate value. 
+## +## max: The maximum counter value associated with each element, described +## by *w = floor(log_2(max)) + 1* bits. Each bit in the underlying counter vector +## becomes a cell of size *w* bits. +## +## name: A name that uniquely identifies and seeds the Bloom filter. If empty, +## the filter will remain tied to the current Bro process. +## +## Returns: A Bloom filter handle. +## +## .. bro:see:: bloomfilter_basic_init bloomfilter_add bloomfilter_lookup +## bloomfilter_clear bloomfilter_merge +function bloomfilter_counting_init%(k: count, cells: count, max: count, + name: string &default=""%): opaque of bloomfilter + %{ + if ( max == 0 ) + { + reporter->Error("max counter value must be greater than 0"); + return 0; + } + + const Hasher* h = Hasher::Create(k, name->CheckString()); + + uint16 width = 1; + while ( max >>= 1 ) + ++width; + + return new BloomFilterVal(new CountingBloomFilter(h, cells, width)); + %} + +## Adds an element to a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to add. +## +## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init bloomfilter_lookup +## bloomfilter_clear bloomfilter_merge +function bloomfilter_add%(bf: opaque of bloomfilter, x: any%): any + %{ + BloomFilterVal* bfv = static_cast(bf); + + if ( ! bfv->Type() && ! bfv->Typify(x->Type()) ) + reporter->Error("failed to set Bloom filter type"); + + else if ( ! same_type(bfv->Type(), x->Type()) ) + reporter->Error("incompatible Bloom filter types"); + + else + bfv->Add(x); + + return 0; + %} + +## Retrieves the counter for a given element in a Bloom filter. +## +## bf: The Bloom filter handle. +## +## x: The element to count. +## +## Returns: the counter associated with *x* in *bf*. +## +## .. 
bro:see:: bloomfilter_counting_init bloomfilter_basic_init +## bloomfilter_add bloomfilter_clear bloomfilter_merge +function bloomfilter_lookup%(bf: opaque of bloomfilter, x: any%): count + %{ + const BloomFilterVal* bfv = static_cast<const BloomFilterVal*>(bf); + + if ( bfv->Empty() ) + return new Val(0, TYPE_COUNT); + + if ( ! bfv->Type() ) + reporter->Error("cannot perform lookup on untyped Bloom filter"); + + else if ( ! same_type(bfv->Type(), x->Type()) ) + reporter->Error("incompatible Bloom filter types"); + + else + return new Val(static_cast<uint64>(bfv->Count(x)), TYPE_COUNT); + + return new Val(0, TYPE_COUNT); + %} + +## Removes all elements from a Bloom filter. This function resets all bits in the +## underlying bitvector back to 0 but does not change the parameterization of the +## Bloom filter, such as the element type and the hasher seed. +## +## bf: The Bloom filter handle. +## +## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init +## bloomfilter_add bloomfilter_lookup bloomfilter_merge +function bloomfilter_clear%(bf: opaque of bloomfilter%): any + %{ + BloomFilterVal* bfv = static_cast<BloomFilterVal*>(bf); + + if ( bfv->Type() ) // Untyped Bloom filters are already empty. + bfv->Clear(); + + return 0; + %} + +## Merges two Bloom filters. +## +## .. note:: Currently Bloom filters created by different Bro instances cannot +## be merged. In the future, this will be supported as long as both filters +## are created with the same name. +## +## bf1: The first Bloom filter handle. +## +## bf2: The second Bloom filter handle. +## +## Returns: The union of *bf1* and *bf2*. If the filters have different element types, or only one of them has been typed, an error is reported instead. +## +## .. bro:see:: bloomfilter_counting_init bloomfilter_basic_init +## bloomfilter_add bloomfilter_lookup bloomfilter_clear +function bloomfilter_merge%(bf1: opaque of bloomfilter, + bf2: opaque of bloomfilter%): opaque of bloomfilter + %{ + const BloomFilterVal* bfv1 = static_cast<const BloomFilterVal*>(bf1); + const BloomFilterVal* bfv2 = static_cast<const BloomFilterVal*>(bf2); + + if ( ( bfv1->Type() == 0 ) != ( bfv2->Type() == 0 ) ||
( bfv1->Type() && ! same_type(bfv1->Type(), bfv2->Type()) ) ) // Type() is null for an untyped filter; never hand a null to same_type(). NOTE(review): relax if Merge() accepts untyped operands. + { + reporter->Error("incompatible Bloom filter types"); + return 0; + } + + return BloomFilterVal::Merge(bfv1, bfv2); + %} diff --git a/src/util.cc b/src/util.cc index cff36f0f23..6bea2eb7f1 100644 --- a/src/util.cc +++ b/src/util.cc @@ -716,6 +716,8 @@ static bool write_random_seeds(const char* write_file, uint32 seed, static bool bro_rand_determistic = false; static unsigned int bro_rand_state = 0; +static bool first_seed_saved = false; +static unsigned int first_seed = 0; static void bro_srandom(unsigned int seed, bool deterministic) { @@ -800,6 +802,12 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file bro_srandom(seed, seeds_done); + if ( ! first_seed_saved ) + { + first_seed = seed; + first_seed_saved = true; + } + if ( ! hmac_key_set ) { MD5((const u_char*) buf, sizeof(buf), shared_hmac_md5_key); @@ -811,27 +819,39 @@ void init_random_seed(uint32 seed, const char* read_file, const char* write_file write_file); } +unsigned int initial_seed() + { + return first_seed; + } + bool have_random_seed() { return bro_rand_determistic; } +long int bro_prng(long int state) + { + // Use our own simple linear congruence PRNG to make sure we are + // predictable across platforms. + static const long int m = 2147483647; + static const long int a = 16807; + const long int q = m / a; + const long int r = m % a; + + state = a * ( state % q ) - r * ( state / q ); + + if ( state <= 0 ) + state += m; + + return state; + } + long int bro_random() { if ( ! bro_rand_determistic ) return random(); // Use system PRNG. - // Use our own simple linear congruence PRNG to make sure we are - // predictable across platforms.
- const long int m = 2147483647; - const long int a = 16807; - const long int q = m / a; - const long int r = m % a; - - bro_rand_state = a * ( bro_rand_state % q ) - r * ( bro_rand_state / q ); - - if ( bro_rand_state <= 0 ) - bro_rand_state += m; + bro_rand_state = bro_prng(bro_rand_state); return bro_rand_state; } diff --git a/src/util.h b/src/util.h index cafa63b7e8..aaad2d9403 100644 --- a/src/util.h +++ b/src/util.h @@ -165,12 +165,20 @@ extern void hmac_md5(size_t size, const unsigned char* bytes, extern void init_random_seed(uint32 seed, const char* load_file, const char* write_file); +// Retrieves the initial seed computed after the very first call to +// init_random_seed(). Repeated calls to init_random_seed() will not affect +// the return value of this function. +unsigned int initial_seed(); + // Returns true if the user explicitly set a seed via init_random_seed(); extern bool have_random_seed(); +// A simple linear congruential PRNG. It takes its state as argument and +// returns a new random value, which can serve as state for subsequent calls. +long int bro_prng(long int state); + // Replacement for the system random(), to which is normally falls back -// except when a seed has been given. In that case, we use our own -// predictable PRNG. +// except when a seed has been given. In that case, it uses bro_prng().
long int bro_random(); // Calls the system srandom() function with the given seed if not running diff --git a/testing/btest/Baseline/bifs.bloomfilter/output b/testing/btest/Baseline/bifs.bloomfilter/output new file mode 100644 index 0000000000..14e1f038c0 --- /dev/null +++ b/testing/btest/Baseline/bifs.bloomfilter/output @@ -0,0 +1,27 @@ +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: incompatible Bloom filter types +error: false-positive rate must take value between 0 and 1 +error: false-positive rate must take value between 0 and 1 +0 +1 +1 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +3 +3 +2 +3 +3 +3 +2 diff --git a/testing/btest/bifs/bloomfilter.bro b/testing/btest/bifs/bloomfilter.bro new file mode 100644 index 0000000000..3b40f29553 --- /dev/null +++ b/testing/btest/bifs/bloomfilter.bro @@ -0,0 +1,83 @@ +# @TEST-EXEC: bro -b %INPUT >output 2>&1 +# @TEST-EXEC: btest-diff output + +function test_basic_bloom_filter() + { + # Basic usage with counts. + local bf_cnt = bloomfilter_basic_init(0.1, 1000); + bloomfilter_add(bf_cnt, 42); + bloomfilter_add(bf_cnt, 84); + bloomfilter_add(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 0); + print bloomfilter_lookup(bf_cnt, 42); + print bloomfilter_lookup(bf_cnt, 168); + print bloomfilter_lookup(bf_cnt, 336); + bloomfilter_add(bf_cnt, 0.5); # Type mismatch + bloomfilter_add(bf_cnt, "foo"); # Type mismatch + + # Basic usage with strings. + local bf_str = bloomfilter_basic_init(0.9, 10); + bloomfilter_add(bf_str, "foo"); + bloomfilter_add(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "foo"); + print bloomfilter_lookup(bf_str, "bar"); + print bloomfilter_lookup(bf_str, "b4z"); # False positive: never added, baseline prints 1 + print bloomfilter_lookup(bf_str, "quux"); # False positive: never added, baseline prints 1 + bloomfilter_add(bf_str, 0.5); # Type mismatch + bloomfilter_add(bf_str, 100); # Type mismatch + + # Edge cases.
+ local bf_edge0 = bloomfilter_basic_init(0.000000000001, 1); + local bf_edge1 = bloomfilter_basic_init(0.00000001, 100000000); + local bf_edge2 = bloomfilter_basic_init(0.9999999, 1); + local bf_edge3 = bloomfilter_basic_init(0.9999999, 100000000000); + + # Invalid parameters. + local bf_bug0 = bloomfilter_basic_init(-0.5, 42); + local bf_bug1 = bloomfilter_basic_init(1.1, 42); + + # Merging + local bf_cnt2 = bloomfilter_basic_init(0.1, 1000); + bloomfilter_add(bf_cnt2, 42); + bloomfilter_add(bf_cnt, 100); + local bf_merged = bloomfilter_merge(bf_cnt, bf_cnt2); + print bloomfilter_lookup(bf_merged, 42); + print bloomfilter_lookup(bf_merged, 84); + print bloomfilter_lookup(bf_merged, 100); + print bloomfilter_lookup(bf_merged, 168); + } + +function test_counting_bloom_filter() + { + local bf = bloomfilter_counting_init(3, 32, 3); + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 1 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 2 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # 3 + bloomfilter_add(bf, "foo"); + print bloomfilter_lookup(bf, "foo"); # still 3 (filter was created with max = 3) + + + bloomfilter_add(bf, "bar"); + bloomfilter_add(bf, "bar"); + print bloomfilter_lookup(bf, "bar"); # 2 + print bloomfilter_lookup(bf, "foo"); # still 3 + + # Merging + local bf2 = bloomfilter_counting_init(3, 32, 3); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "baz"); + bloomfilter_add(bf2, "bar"); + local bf_merged = bloomfilter_merge(bf, bf2); + print bloomfilter_lookup(bf_merged, "foo"); # 3 + print bloomfilter_lookup(bf_merged, "bar"); # 3 (2 from bf + 1 from bf2) + print bloomfilter_lookup(bf_merged, "baz"); # 2 + } + +event bro_init() + { + test_basic_bloom_filter(); + test_counting_bloom_filter(); + } diff --git a/testing/btest/istate/opaque.bro b/testing/btest/istate/opaque.bro index 84818a5e70..b387f9d6bc 100644 --- a/testing/btest/istate/opaque.bro +++ b/testing/btest/istate/opaque.bro @@ -12,6 +12,9 @@ global sha1_handle: opaque of sha1 &persistent
&synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements: set[string] &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_done() { local out = open("output.log"); @@ -36,6 +39,9 @@ event bro_done() print out, entropy_test_finish(entropy_handle); else print out, "entropy_test_add() failed"; + + for ( e in bloomfilter_elements ) + print bloomfilter_lookup(bloomfilter_handle, e); # NOTE(review): prints to stdout, not to "out" like the checks above; confirm this is intended. } @TEST-END-FILE @@ -47,6 +53,9 @@ global sha1_handle: opaque of sha1 &persistent &synchronized; global sha256_handle: opaque of sha256 &persistent &synchronized; global entropy_handle: opaque of entropy &persistent &synchronized; +global bloomfilter_elements = { "foo", "bar", "baz" } &persistent &synchronized; +global bloomfilter_handle: opaque of bloomfilter &persistent &synchronized; + event bro_init() { local out = open("expected.log"); @@ -72,6 +81,10 @@ event bro_init() entropy_handle = entropy_test_init(); if ( ! entropy_test_add(entropy_handle, "f") ) print out, "entropy_test_add() failed"; + + bloomfilter_handle = bloomfilter_basic_init(0.1, 100); + for ( e in bloomfilter_elements ) + bloomfilter_add(bloomfilter_handle, e); } @TEST-END-FILE