diff --git a/.gitmodules b/.gitmodules index d151b3d288..1bceead3d6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,3 +28,6 @@ [submodule "doc"] path = doc url = https://github.com/zeek/zeek-docs +[submodule "aux/paraglob"] + path = aux/paraglob + url = https://github.com/zeek/paraglob diff --git a/CHANGES b/CHANGES index d7b1944bec..810cd211fe 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,11 @@ +2.6-517 | 2019-06-24 15:20:39 -0700 + + * Add paraglob, a fairly quick data structure for matching a string against a large list of patterns. + (Zeke Medley, Corelight) + + * GH-171: support warning messages alongside deprecated attributes (Tim Wojtulewicz, Corelight) + 2.6-503 | 2019-06-21 11:17:58 -0700 * GH-417: Remove old, unmaintained p0f support. (Johanna Amann, Corelight) diff --git a/CMakeLists.txt b/CMakeLists.txt index b8db7b52f9..217e741148 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,6 +359,10 @@ include_directories(BEFORE ${CAF_INCLUDE_DIR_CORE}) include_directories(BEFORE ${CAF_INCLUDE_DIR_IO}) include_directories(BEFORE ${CAF_INCLUDE_DIR_OPENSSL}) +add_subdirectory(aux/paraglob) +set(zeekdeps ${zeekdeps} paraglob) +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/aux/paraglob) + add_subdirectory(src) add_subdirectory(scripts) add_subdirectory(man) diff --git a/NEWS b/NEWS index b6d7b41884..98f61b4a87 100644 --- a/NEWS +++ b/NEWS @@ -112,6 +112,13 @@ New Functionality v[2:4] = vector(6, 7, 8); # v is now [1, 2, 6, 7, 8, 5] print v[:4]; # prints [1, 2, 6, 7] +- Add support for paraglob, a fairly quick data structure for matching a string + against a large list of patterns. 
For example:: + + local v1 = vector("*", "d?g", "*og", "d?", "d[!wl]g"); + local p1 = paraglob_init(v1); + print paraglob_get(p1, "dog"); + Changed Functionality --------------------- diff --git a/VERSION b/VERSION index 63dc72fdfa..81d658d97e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6-503 +2.6-517 diff --git a/aux/paraglob b/aux/paraglob new file mode 160000 index 0000000000..c3bd6b88d8 --- /dev/null +++ b/aux/paraglob @@ -0,0 +1 @@ +Subproject commit c3bd6b88d8ee79752d95f6647a098f9a0b600b0e diff --git a/doc b/doc index 957e8a6ec8..843f601f92 160000 --- a/doc +++ b/doc @@ -1 +1 @@ -Subproject commit 957e8a6ec80de04e35e6bbd62e12e3970be1c382 +Subproject commit 843f601f9236bef694959d5bf336cb0e4fbaea31 diff --git a/src/IPAddr.cc b/src/IPAddr.cc index c215b463b9..76aa34f79a 100644 --- a/src/IPAddr.cc +++ b/src/IPAddr.cc @@ -281,4 +281,3 @@ string IPPrefix::AsString() const return prefix.AsString() +"/" + l; } - diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 93adc5ca06..d372f525a5 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -1024,3 +1024,88 @@ bool CardinalityVal::DoUnserialize(const broker::data& data) c = cu.release(); return true; } + +ParaglobVal::ParaglobVal(std::unique_ptr p) +: OpaqueVal(paraglob_type) + { + this->internal_paraglob = std::move(p); + } + +VectorVal* ParaglobVal::Get(StringVal* &pattern) + { + VectorVal* rval = new VectorVal(internal_type("string_vec")->AsVectorType()); + std::string string_pattern (reinterpret_cast(pattern->Bytes()), pattern->Len()); + + std::vector matches = this->internal_paraglob->get(string_pattern); + for (unsigned int i = 0; i < matches.size(); i++) + rval->Assign(i, new StringVal(matches.at(i))); + + return rval; + } + +bool ParaglobVal::operator==(const ParaglobVal& other) const + { + return *(this->internal_paraglob) == *(other.internal_paraglob); + } + +IMPLEMENT_OPAQUE_VALUE(ParaglobVal) + +broker::expected ParaglobVal::DoSerialize() const + { + broker::vector d; + std::unique_ptr> iv = 
this->internal_paraglob->serialize(); + for (uint8_t a : *(iv.get())) + d.emplace_back(static_cast(a)); + return {std::move(d)}; + } + +bool ParaglobVal::DoUnserialize(const broker::data& data) + { + auto d = caf::get_if(&data); + if ( ! d ) + return false; + + std::unique_ptr> iv (new std::vector); + iv->resize(d->size()); + + for (std::vector::size_type i = 0; i < d->size(); ++i) + { + if ( ! get_vector_idx(*d, i, iv.get()->data() + i) ) + return false; + } + + try + { + this->internal_paraglob = build_unique(std::move(iv)); + } + catch (const paraglob::underflow_error& e) + { + reporter->Error("Paraglob underflow error -> %s", e.what()); + return false; + } + catch (const paraglob::overflow_error& e) + { + reporter->Error("Paraglob overflow error -> %s", e.what()); + return false; + } + + return true; + } + +Val* ParaglobVal::DoClone(CloneState* state) + { + try { + return new ParaglobVal + (build_unique(this->internal_paraglob->serialize())); + } + catch (const paraglob::underflow_error& e) + { + reporter->Error("Paraglob underflow error while cloning -> %s", e.what()); + return nullptr; + } + catch (const paraglob::overflow_error& e) + { + reporter->Error("Paraglob overflow error while cloning -> %s", e.what()); + return nullptr; + } + } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 3e1b91f0ab..1d298bb614 100644 --- a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -3,12 +3,18 @@ #ifndef OPAQUEVAL_H #define OPAQUEVAL_H + + +#include // std::unique_ptr + #include #include + #include "RandTest.h" #include "Val.h" #include "digest.h" +#include "src/paraglob.h" class OpaqueVal; @@ -319,4 +325,20 @@ private: probabilistic::CardinalityCounter* c; }; +class ParaglobVal : public OpaqueVal { +public: + explicit ParaglobVal(std::unique_ptr p); + VectorVal* Get(StringVal* &pattern); + Val* DoClone(CloneState* state) override; + bool operator==(const ParaglobVal& other) const; + +protected: + ParaglobVal() : OpaqueVal(paraglob_type) {} + + 
DECLARE_OPAQUE_VALUE(ParaglobVal) + +private: + std::unique_ptr internal_paraglob; +}; + #endif diff --git a/src/Type.h b/src/Type.h index d22178a13e..19fad4b2ce 100644 --- a/src/Type.h +++ b/src/Type.h @@ -635,6 +635,7 @@ extern OpaqueType* topk_type; extern OpaqueType* bloomfilter_type; extern OpaqueType* x509_opaque_type; extern OpaqueType* ocsp_resp_opaque_type; +extern OpaqueType* paraglob_type; // Returns the Bro basic (non-parameterized) type with the given type. // The reference count of the type is not increased. diff --git a/src/main.cc b/src/main.cc index e69aa70909..3556a0e99d 100644 --- a/src/main.cc +++ b/src/main.cc @@ -118,6 +118,7 @@ OpaqueType* topk_type = 0; OpaqueType* bloomfilter_type = 0; OpaqueType* x509_opaque_type = 0; OpaqueType* ocsp_resp_opaque_type = 0; +OpaqueType* paraglob_type = 0; // Keep copy of command line int bro_argc; @@ -786,6 +787,7 @@ int main(int argc, char** argv) bloomfilter_type = new OpaqueType("bloomfilter"); x509_opaque_type = new OpaqueType("x509"); ocsp_resp_opaque_type = new OpaqueType("ocsp_resp"); + paraglob_type = new OpaqueType("paraglob"); // The leak-checker tends to produce some false // positives (memory which had already been diff --git a/src/util.h b/src/util.h index 388cbe3079..f89592c6d0 100644 --- a/src/util.h +++ b/src/util.h @@ -555,4 +555,13 @@ void bro_strerror_r(int bro_errno, char* buf, size_t buflen); */ char* zeekenv(const char* name); +/** + * Small convenience function. Does what std::make_unique does in C++14. Will not + * work on arrays. + */ +template +std::unique_ptr build_unique (Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + #endif diff --git a/src/zeek.bif b/src/zeek.bif index 63c5c0c64f..7460083e31 100644 --- a/src/zeek.bif +++ b/src/zeek.bif @@ -789,6 +789,74 @@ function sha256_hash_finish%(handle: opaque of sha256%): string return static_cast(handle)->Get(); %} +## Initializes and returns a new paraglob. 
+## +## v: Vector of patterns to initialize the paraglob with. +## +## Returns: A new, compiled, paraglob with the patterns in *v* +## +## .. zeek:see:: paraglob_get paraglob_equals paraglob_add +function paraglob_init%(v: any%) : opaque of paraglob + %{ + if ( v->Type()->Tag() != TYPE_VECTOR || + v->Type()->YieldType()->Tag() != TYPE_STRING ) + { + // reporter->Error will throw an exception. + reporter->Error("paraglob requires a vector of strings for initialization."); + return nullptr; + } + + std::vector patterns; + VectorVal* vv = v->AsVectorVal(); + for ( unsigned int i = 0; i < vv->Size(); ++i ) + { + const BroString* s = vv->Lookup(i)->AsString(); + patterns.push_back(std::string(reinterpret_cast(s->Bytes()), s->Len())); + } + + try + { + std::unique_ptr p (new paraglob::Paraglob(patterns)); + return new ParaglobVal(std::move(p)); + } + // Thrown if paraglob fails to add a pattern. + catch (const paraglob::add_error& e) + { + reporter->Error("Paraglob failed to add pattern: %s", e.what()); + return nullptr; + } + %} + +## Gets all the patterns inside the handle that match an input string. +## +## handle: A compiled paraglob. +## +## pat: The string to match against the patterns in the paraglob. +## +## Returns: A vector of the patterns in the handle that match the input string. +## +## .. zeek:see:: paraglob_add paraglob_equals paraglob_init +function paraglob_get%(handle: opaque of paraglob, pat: string%): string_vec + %{ + return static_cast(handle)->Get(pat); + %} + +## Compares two paraglobs for equality. +## +## p_one: A compiled paraglob. +## +## p_two: A compiled paraglob. +## +## Returns: True if both paraglobs contain the same patterns, false otherwise. +## +## .. zeek:see:: paraglob_add paraglob_get paraglob_init +function paraglob_equals%(p_one: opaque of paraglob, p_two: opaque of paraglob%) : bool + %{ + return val_mgr->GetBool( + *(static_cast(p_one)) == *(static_cast(p_two)) + ); + %} + ## Returns 32-bit digest of arbitrary input values using FNV-1a hash algorithm. ## See ``_. 
## @@ -3077,7 +3145,7 @@ function strptime%(fmt: string, d: string%) : time const time_t timeval = time_t(); struct tm t; - if ( ! localtime_r(&timeval, &t) || + if ( ! localtime_r(&timeval, &t) || ! strptime(d->CheckString(), fmt->CheckString(), &t) ) { reporter->Warning("strptime conversion failed: fmt:%s d:%s", fmt->CheckString(), d->CheckString()); diff --git a/testing/btest/Baseline/language.copy-all-opaques/out b/testing/btest/Baseline/language.copy-all-opaques/out index 68b12cecac..d4e347a67a 100644 --- a/testing/btest/Baseline/language.copy-all-opaques/out +++ b/testing/btest/Baseline/language.copy-all-opaques/out @@ -23,3 +23,6 @@ ============ Entropy [entropy=4.715374, chi_square=591.981818, mean=75.472727, monte_carlo_pi=4.0, serial_correlation=-0.11027] [entropy=4.715374, chi_square=591.981818, mean=75.472727, monte_carlo_pi=4.0, serial_correlation=-0.11027] +============ Paraglob +T +T diff --git a/testing/btest/Baseline/language.paraglob-serialization/recv.recv.out b/testing/btest/Baseline/language.paraglob-serialization/recv.recv.out new file mode 100644 index 0000000000..bd6ae96cfa --- /dev/null +++ b/testing/btest/Baseline/language.paraglob-serialization/recv.recv.out @@ -0,0 +1,12 @@ +receiver added peer: endpoint=127.0.0.1 msg=handshake successful +is_remote should be T, and is, T +receiver got ping number: 1 +[*, *ello, hello] +is_remote should be T, and is, T +receiver got ping number: 2 +[*, *ello, hello] +is_remote should be T, and is, T +receiver got ping number: 3 +[*, *ello, hello] +is_remote should be T, and is, T +[num_peers=1, num_stores=0, num_pending_queries=0, num_events_incoming=4, num_events_outgoing=3, num_logs_incoming=0, num_logs_outgoing=1, num_ids_incoming=0, num_ids_outgoing=0] diff --git a/testing/btest/Baseline/language.paraglob-serialization/send.send.out b/testing/btest/Baseline/language.paraglob-serialization/send.send.out new file mode 100644 index 0000000000..e6f0a48779 --- /dev/null +++ 
b/testing/btest/Baseline/language.paraglob-serialization/send.send.out @@ -0,0 +1,11 @@ +Starting send. +[*, *ello, hello] +is_remote should be F, and is, F +sender added peer: endpoint=127.0.0.1 msg=received handshake from remote core +is_remote should be T, and is, T +sender got pong number: 1 +is_remote should be T, and is, T +sender got pong number: 2 +is_remote should be T, and is, T +sender got pong number: 3 +sender lost peer: endpoint=127.0.0.1 msg=lost remote peer diff --git a/testing/btest/Baseline/language.paraglob/out b/testing/btest/Baseline/language.paraglob/out new file mode 100644 index 0000000000..18e6da7096 --- /dev/null +++ b/testing/btest/Baseline/language.paraglob/out @@ -0,0 +1,9 @@ +[T, T, T, T, T] +T +F +[*, *og, d?g, d[!wl]g] +[once] +[] +[*.gov*, *malware*] +[z*ro] +[*.gov*, *malware*] diff --git a/testing/btest/core/leaks/paraglob.zeek b/testing/btest/core/leaks/paraglob.zeek new file mode 100644 index 0000000000..c9e42f51e0 --- /dev/null +++ b/testing/btest/core/leaks/paraglob.zeek @@ -0,0 +1,34 @@ +# Needs perftools support. +# +# @TEST-GROUP: leaks +# +# @TEST-REQUIRES: zeek --help 2>&1 | grep -q mem-leaks +# +# @TEST-EXEC: HEAP_CHECK_DUMP_DIRECTORY=. 
HEAPCHECK=local btest-bg-run zeek zeek -m -b -r $TRACES/http/get.trace %INPUT +# @TEST-EXEC: btest-bg-wait 120 + +event new_connection (c : connection) +{ + local v1 = vector("*", "d?g", "*og", "d?", "d[!wl]g"); + local v2 = vector("once", "!o*", "once"); + local v3 = vector("https://*.google.com/*", "*malware*", "*.gov*"); + + local p1 = paraglob_init(v1); + local p2: opaque of paraglob = paraglob_init(v2); + local p3 = paraglob_init(v3); + local p_eq = paraglob_init(v1); + + # paraglob_init should not modify v1 + print (v1 == vector("*", "d?g", "*og", "d?", "d[!wl]g")); + # p_eq and p1 should be the same paraglobs + print paraglob_equals(p_eq, p1); + + print paraglob_get(p1, "dog"); + + + print paraglob_get(p2, "once"); + print paraglob_get(p3, "www.strange-malware-domain.gov"); + + local large_glob: opaque of paraglob = paraglob_init(v3); + print paraglob_get(large_glob, "www.strange-malware-domain.gov"); +} diff --git a/testing/btest/language/copy-all-opaques.zeek b/testing/btest/language/copy-all-opaques.zeek index 06b4a07471..176660bfe1 100644 --- a/testing/btest/language/copy-all-opaques.zeek +++ b/testing/btest/language/copy-all-opaques.zeek @@ -82,4 +82,12 @@ event zeek_init() local handle2 = copy(handle); print entropy_test_finish(handle); print entropy_test_finish(handle2); + + print "============ Paraglob"; + local p = paraglob_init(vector("https://*.google.com/*", "*malware*", "*.gov*")); + local p2 = copy(p); + print paraglob_equals(p, p2); + # A get operation shouldn't change the paraglob + paraglob_get(p, "whitehouse.gov"); + print paraglob_equals(p, p2); } diff --git a/testing/btest/language/paraglob-serialization.zeek b/testing/btest/language/paraglob-serialization.zeek new file mode 100644 index 0000000000..00d6c7a967 --- /dev/null +++ b/testing/btest/language/paraglob-serialization.zeek @@ -0,0 +1,102 @@ +# @TEST-PORT: BROKER_PORT +# +# @TEST-EXEC: btest-bg-run recv "zeek -B broker -b ../recv.zeek >recv.out" +# @TEST-EXEC: btest-bg-run send 
"zeek -B broker -b ../send.zeek >send.out" +# +# @TEST-EXEC: btest-bg-wait 30 +# @TEST-EXEC: btest-diff recv/recv.out +# @TEST-EXEC: btest-diff send/send.out + +@TEST-START-FILE send.zeek + +redef exit_only_after_terminate = T; + +global event_count = 0; +global p: opaque of paraglob = paraglob_init(vector("hello", "*ello", "*")); + +global ping: event(msg: opaque of paraglob, c: count); + +event zeek_init() + { + print "Starting send."; + print paraglob_get(p, "hello"); + Broker::subscribe("bro/event/my_topic"); + Broker::peer("127.0.0.1", 9999/tcp); + print "is_remote should be F, and is", is_remote_event(); + } + +function send_event() + { + ++event_count; + local e = Broker::make_event(ping, p, event_count); + Broker::publish("bro/event/my_topic", e); + } + +event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string) + { + print fmt("sender added peer: endpoint=%s msg=%s", + endpoint$network$address, msg); + send_event(); + } + +event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string) + { + print fmt("sender lost peer: endpoint=%s msg=%s", + endpoint$network$address, msg); + terminate(); + } + +event pong(msg: opaque of paraglob, n: count) + { + print "is_remote should be T, and is", is_remote_event(); + print fmt("sender got pong number: %s", n); + send_event(); + } + +@TEST-END-FILE + + +@TEST-START-FILE recv.zeek + +redef exit_only_after_terminate = T; + +const events_to_recv = 3; + +global handler: event(msg: string, c: count); +global auto_handler: event(msg: string, c: count); + +global pong: event(msg: opaque of paraglob, c: count); + +event zeek_init() + { + Broker::subscribe("bro/event/my_topic"); + Broker::listen("127.0.0.1", 9999/tcp); + } + +event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string) + { + print fmt("receiver added peer: endpoint=%s msg=%s", endpoint$network$address, msg); + } + +event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string) + { + print fmt("receiver lost peer: endpoint=%s 
msg=%s", endpoint$network$address, msg); + } + +event ping(msg: opaque of paraglob, n: count) + { + print "is_remote should be T, and is", is_remote_event(); + if ( n > events_to_recv ) + { + print get_broker_stats(); + terminate(); + return; + } + print fmt("receiver got ping number: %s", n); + print paraglob_get(msg, "hello"); + + local e = Broker::make_event(pong, msg, n); + Broker::publish("bro/event/my_topic", e); + } + +@TEST-END-FILE diff --git a/testing/btest/language/paraglob.zeek b/testing/btest/language/paraglob.zeek new file mode 100644 index 0000000000..920cd73141 --- /dev/null +++ b/testing/btest/language/paraglob.zeek @@ -0,0 +1,41 @@ +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out + +event zeek_init () +{ + local v1 = vector("*", "d?g", "*og", "d?", "d[!wl]g"); + local v2 = vector("once", "!o*", "once"); + local v3 = vector("https://*.google.com/*", "*malware*", "*.gov*"); + local v4 = vector("z*ro"); + + local p1 = paraglob_init(v1); + local p2: opaque of paraglob = paraglob_init(v2); + local p3 = paraglob_init(v3); + local p4 = paraglob_init(v4); + local p_eq = paraglob_init(v1); + + # paraglob_init should not modify v1 + print (v1 == vector("*", "d?g", "*og", "d?", "d[!wl]g")); + # p_eq and p1 should be the same paraglobs + print paraglob_equals(p_eq, p1); + print paraglob_equals(p1, p2); + + print paraglob_get(p1, "dog"); + + + print paraglob_get(p2, "once"); + print paraglob_get(p2, "nothing"); + print paraglob_get(p3, "www.strange-malware-domain.gov"); + print paraglob_get(p4, "zero\0zero"); + + # This looks like a lot, but really should complete quickly. + # Paraglob should stop addition of duplicate patterns. + local i = 1000000; + while (i > 0) { + i = i - 1; + v3 += v3[1]; + } + + local large_glob: opaque of paraglob = paraglob_init(v3); + print paraglob_get(large_glob, "www.strange-malware-domain.gov"); +}