* 'paraglob' of https://github.com/ZekeMedley/zeek:
  Add leak test to paraglob.
  Catch paraglob serialization errors in DoClone.
  Update paraglob serialization.
  Stop execution on paraglob error.
  Update paraglob submodule
  Change C++11 detection in paraglob.
  Make paraglob serializable and copyable.
  Initial paraglob integration.

I made a bunch of small changes:
 * paraglob now deals better with \0 characters,
 * I rolled back the changes to the Binary Serialization format,
 * there were some small formatting issues,
 * the error output was slightly unsafe,
 * build_unique is now in util.h,

and perhaps a few more small things.
This commit is contained in:
Johanna Amann 2019-06-24 14:05:57 -07:00
commit 5f9a9bbcbe
22 changed files with 432 additions and 4 deletions

3
.gitmodules vendored
View file

@ -28,3 +28,6 @@
[submodule "doc"]
path = doc
url = https://github.com/zeek/zeek-docs
[submodule "aux/paraglob"]
path = aux/paraglob
url = https://github.com/zeek/paraglob

View file

@ -1,4 +1,11 @@
2.6-517 | 2019-06-24 15:20:39 -0700
* Add paraglob, a fairly quick data structure for matching a string against a large list of patterns.
(Zeke Medley, Corelight)
* GH-171: support warning messages alongside deprecated attributes (Tim Wojtulewicz, Corelight)
2.6-503 | 2019-06-21 11:17:58 -0700
* GH-417: Remove old, unmaintained p0f support. (Johanna Amann, Corelight)

View file

@ -359,6 +359,10 @@ include_directories(BEFORE ${CAF_INCLUDE_DIR_CORE})
include_directories(BEFORE ${CAF_INCLUDE_DIR_IO})
include_directories(BEFORE ${CAF_INCLUDE_DIR_OPENSSL})
# Configure the bundled paraglob submodule as part of this build.
add_subdirectory(aux/paraglob)
# Append paraglob to the zeekdeps list (presumably the libraries the zeek
# target links against -- confirm where zeekdeps is consumed).
set(zeekdeps ${zeekdeps} paraglob)
# Put aux/paraglob at the front of the include search path.
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/aux/paraglob)
add_subdirectory(src)
add_subdirectory(scripts)
add_subdirectory(man)

7
NEWS
View file

@ -112,6 +112,13 @@ New Functionality
v[2:4] = vector(6, 7, 8); # v is now [1, 2, 6, 7, 8, 5]
print v[:4]; # prints [1, 2, 6, 7]
- Add support for paraglob, a fairly quick data structure for matching a string
against a large list of patterns. For example::
local v1 = vector("*", "d?g", "*og", "d?", "d[!wl]g");
local p1 = paraglob_init(v1);
print paraglob_get(p1, "dog");
Changed Functionality
---------------------

View file

@ -1 +1 @@
2.6-503
2.6-517

1
aux/paraglob Submodule

@ -0,0 +1 @@
Subproject commit c3bd6b88d8ee79752d95f6647a098f9a0b600b0e

2
doc

@ -1 +1 @@
Subproject commit 957e8a6ec80de04e35e6bbd62e12e3970be1c382
Subproject commit 843f601f9236bef694959d5bf336cb0e4fbaea31

View file

@ -281,4 +281,3 @@ string IPPrefix::AsString() const
return prefix.AsString() +"/" + l;
}

View file

@ -1024,3 +1024,88 @@ bool CardinalityVal::DoUnserialize(const broker::data& data)
c = cu.release();
return true;
}
// Wraps an already constructed paraglob in an opaque value. Ownership of *p*
// is transferred into the new ParaglobVal.
ParaglobVal::ParaglobVal(std::unique_ptr<paraglob::Paraglob> p)
	: OpaqueVal(paraglob_type), internal_paraglob(std::move(p))
	{
	}
// Queries the wrapped paraglob with the bytes of *pattern* and returns every
// entry it reports as a freshly allocated string_vec, in the order delivered.
// The query string is built from pointer + length, so embedded NUL bytes are
// passed through rather than truncating the input.
VectorVal* ParaglobVal::Get(StringVal* &pattern)
	{
	VectorVal* result = new VectorVal(internal_type("string_vec")->AsVectorType());

	const char* raw = reinterpret_cast<const char*>(pattern->Bytes());
	std::string query(raw, pattern->Len());

	std::vector<std::string> hits = internal_paraglob->get(query);

	unsigned int slot = 0;
	for ( std::string& hit : hits )
		{
		result->Assign(slot, new StringVal(hit));
		++slot;
		}

	return result;
	}
// Two ParaglobVals compare equal exactly when their wrapped paraglobs do.
// Both sides are dereferenced unconditionally.
bool ParaglobVal::operator==(const ParaglobVal& other) const
	{
	paraglob::Paraglob& mine = *internal_paraglob;
	paraglob::Paraglob& theirs = *other.internal_paraglob;
	return mine == theirs;
	}
IMPLEMENT_OPAQUE_VALUE(ParaglobVal)
// Serializes the wrapped paraglob to its byte representation and encodes it
// as a broker vector holding one unsigned 64-bit element per byte.
broker::expected<broker::data> ParaglobVal::DoSerialize() const
	{
	broker::vector encoded;
	std::unique_ptr<std::vector<uint8_t>> bytes = internal_paraglob->serialize();

	for ( auto it = bytes->begin(); it != bytes->end(); ++it )
		encoded.emplace_back(static_cast<uint64_t>(*it));

	return {std::move(encoded)};
	}
// Rebuilds the wrapped paraglob from broker data produced by DoSerialize().
//
// data: Expected to hold a broker vector whose elements are read as unsigned
//       64-bit values, one per serialized byte.
//
// Returns: true on success; false if *data* is not a broker vector, if any
//          element cannot be extracted, or if the paraglob constructor
//          reports an underflow/overflow (which is also logged).
bool ParaglobVal::DoUnserialize(const broker::data& data)
	{
	auto encoded = caf::get_if<broker::vector>(&data);

	if ( ! encoded )
		return false;

	// One output byte per encoded element.
	std::unique_ptr<std::vector<uint8_t>> bytes(new std::vector<uint8_t>);
	bytes->resize(encoded->size());

	std::vector<broker::data>::size_type idx = 0;
	while ( idx < encoded->size() )
		{
		if ( ! get_vector_idx<uint64_t>(*encoded, idx, bytes->data() + idx) )
			return false;

		++idx;
		}

	try
		{
		internal_paraglob = build_unique<paraglob::Paraglob>(std::move(bytes));
		}
	catch (const paraglob::underflow_error& e)
		{
		reporter->Error("Paraglob underflow error -> %s", e.what());
		return false;
		}
	catch (const paraglob::overflow_error& e)
		{
		reporter->Error("Paraglob overflow error -> %s", e.what());
		return false;
		}

	return true;
	}
// Produces an independent copy by serializing the wrapped paraglob and
// constructing a new one from those bytes.
//
// Returns: the new ParaglobVal, or nullptr (after logging) if rebuilding the
//          paraglob reports an underflow or overflow.
//
// NOTE(review): *state* is not used here, so the clone is not recorded in the
// CloneState -- confirm that this is intended.
Val* ParaglobVal::DoClone(CloneState* state)
	{
	try
		{
		auto copy = build_unique<paraglob::Paraglob>(internal_paraglob->serialize());
		return new ParaglobVal(std::move(copy));
		}
	catch (const paraglob::underflow_error& e)
		{
		reporter->Error("Paraglob underflow error while cloning -> %s", e.what());
		return nullptr;
		}
	catch (const paraglob::overflow_error& e)
		{
		reporter->Error("Paraglob overflow error while cloning -> %s", e.what());
		return nullptr;
		}
	}

View file

@ -3,12 +3,18 @@
#ifndef OPAQUEVAL_H
#define OPAQUEVAL_H
#include <memory> // std::unique_ptr
#include <broker/data.hh>
#include <broker/expected.hh>
#include "RandTest.h"
#include "Val.h"
#include "digest.h"
#include "src/paraglob.h"
class OpaqueVal;
@ -319,4 +325,20 @@ private:
probabilistic::CardinalityCounter* c;
};
// Opaque script-level value ("opaque of paraglob") that owns a
// paraglob::Paraglob instance.
class ParaglobVal : public OpaqueVal {
public:
// Wraps *p*; ownership of the paraglob moves into this value.
explicit ParaglobVal(std::unique_ptr<paraglob::Paraglob> p);
// Queries the paraglob with the bytes of *pattern* and returns the results
// as a new string_vec. NOTE(review): despite the parameter name, the tests
// pass the text to be matched here, not a glob -- consider renaming.
VectorVal* Get(StringVal* &pattern);
// Copies this value via a serialize/rebuild round trip; returns nullptr if
// rebuilding the paraglob fails.
Val* DoClone(CloneState* state) override;
// Equality is delegated to the wrapped paraglobs.
bool operator==(const ParaglobVal& other) const;
protected:
// Leaves internal_paraglob empty; presumably only meant for the
// unserialization path, which fills it in afterwards -- confirm.
ParaglobVal() : OpaqueVal(paraglob_type) {}
// Presumably declares the DoSerialize()/DoUnserialize() hooks implemented
// in the .cc file -- see the macro definition to confirm.
DECLARE_OPAQUE_VALUE(ParaglobVal)
private:
// The wrapped paraglob; null until set by a constructor or DoUnserialize().
std::unique_ptr<paraglob::Paraglob> internal_paraglob;
};
#endif

View file

@ -635,6 +635,7 @@ extern OpaqueType* topk_type;
extern OpaqueType* bloomfilter_type;
extern OpaqueType* x509_opaque_type;
extern OpaqueType* ocsp_resp_opaque_type;
extern OpaqueType* paraglob_type;
// Returns the Bro basic (non-parameterized) type with the given type.
// The reference count of the type is not increased.

View file

@ -118,6 +118,7 @@ OpaqueType* topk_type = 0;
OpaqueType* bloomfilter_type = 0;
OpaqueType* x509_opaque_type = 0;
OpaqueType* ocsp_resp_opaque_type = 0;
OpaqueType* paraglob_type = 0;
// Keep copy of command line
int bro_argc;
@ -786,6 +787,7 @@ int main(int argc, char** argv)
bloomfilter_type = new OpaqueType("bloomfilter");
x509_opaque_type = new OpaqueType("x509");
ocsp_resp_opaque_type = new OpaqueType("ocsp_resp");
paraglob_type = new OpaqueType("paraglob");
// The leak-checker tends to produce some false
// positives (memory which had already been

View file

@ -555,4 +555,13 @@ void bro_strerror_r(int bro_errno, char* buf, size_t buflen);
*/
char* zeekenv(const char* name);
/**
 * Heap-allocates a T, forwarding the given arguments to its constructor, and
 * returns it owned by a std::unique_ptr<T>. Stand-in for C++14's
 * std::make_unique; array types are not supported.
 */
template <typename T, typename... CtorArgs>
std::unique_ptr<T> build_unique(CtorArgs&&... ctor_args)
	{
	T* raw = new T(std::forward<CtorArgs>(ctor_args)...);
	return std::unique_ptr<T>(raw);
	}
#endif

View file

@ -789,6 +789,74 @@ function sha256_hash_finish%(handle: opaque of sha256%): string
return static_cast<HashVal*>(handle)->Get();
%}
## Initializes and returns a new paraglob.
##
## v: Vector of patterns to initialize the paraglob with. Must be a vector
##    of strings; any other type is reported as an error.
##
## Returns: A new, compiled, paraglob with the patterns in *v*
##
## .. zeek:see:: paraglob_get paraglob_equals
function paraglob_init%(v: any%) : opaque of paraglob
	%{
	if ( v->Type()->Tag() != TYPE_VECTOR ||
	     v->Type()->YieldType()->Tag() != TYPE_STRING )
		{
		// Report the misuse and hand back no value.
		reporter->Error("paraglob requires a vector of strings for initialization.");
		return nullptr;
		}

	std::vector<std::string> patterns;
	VectorVal* vv = v->AsVectorVal();
	patterns.reserve(vv->Size());

	for ( unsigned int i = 0; i < vv->Size(); ++i )
		{
		Val* elem = vv->Lookup(i);

		// NOTE(review): assumes Lookup() yields null for an unassigned
		// slot of the vector; skip those instead of dereferencing.
		if ( ! elem )
			continue;

		const BroString* s = elem->AsString();
		// Pointer + length construction keeps embedded NUL bytes intact.
		patterns.push_back(std::string(reinterpret_cast<const char*>(s->Bytes()), s->Len()));
		}

	try
		{
		std::unique_ptr<paraglob::Paraglob> p (new paraglob::Paraglob(patterns));
		return new ParaglobVal(std::move(p));
		}
	// Thrown if paraglob fails to add a pattern.
	catch (const paraglob::add_error& e)
		{
		reporter->Error("Paraglob failed to add pattern: %s", e.what());
		return nullptr;
		}
	%}
## Looks up which of the patterns stored in a paraglob match a given string.
##
## handle: A compiled paraglob.
##
## pat: The string to match against the patterns held by *handle*.
##
## Returns: A vector with the patterns from *handle* that match *pat*.
##
## .. zeek:see:: paraglob_equals paraglob_init
function paraglob_get%(handle: opaque of paraglob, pat: string%): string_vec
	%{
	ParaglobVal* pg = static_cast<ParaglobVal*>(handle);
	return pg->Get(pat);
	%}
## Compares two paraglobs for equality.
##
## p_one: A compiled paraglob.
##
## p_two: A compiled paraglob.
##
## Returns: True if both paraglobs contain the same patterns, false otherwise.
##
## .. zeek:see:: paraglob_get paraglob_init
function paraglob_equals%(p_one: opaque of paraglob, p_two: opaque of paraglob%) : bool
	%{
	ParaglobVal* lhs = static_cast<ParaglobVal*>(p_one);
	ParaglobVal* rhs = static_cast<ParaglobVal*>(p_two);
	return val_mgr->GetBool(*lhs == *rhs);
	%}
## Returns 32-bit digest of arbitrary input values using FNV-1a hash algorithm.
## See `<https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function>`_.
##

View file

@ -23,3 +23,6 @@
============ Entropy
[entropy=4.715374, chi_square=591.981818, mean=75.472727, monte_carlo_pi=4.0, serial_correlation=-0.11027]
[entropy=4.715374, chi_square=591.981818, mean=75.472727, monte_carlo_pi=4.0, serial_correlation=-0.11027]
============ Paraglob
T
T

View file

@ -0,0 +1,12 @@
receiver added peer: endpoint=127.0.0.1 msg=handshake successful
is_remote should be T, and is, T
receiver got ping number: 1
[*, *ello, hello]
is_remote should be T, and is, T
receiver got ping number: 2
[*, *ello, hello]
is_remote should be T, and is, T
receiver got ping number: 3
[*, *ello, hello]
is_remote should be T, and is, T
[num_peers=1, num_stores=0, num_pending_queries=0, num_events_incoming=4, num_events_outgoing=3, num_logs_incoming=0, num_logs_outgoing=1, num_ids_incoming=0, num_ids_outgoing=0]

View file

@ -0,0 +1,11 @@
Starting send.
[*, *ello, hello]
is_remote should be F, and is, F
sender added peer: endpoint=127.0.0.1 msg=received handshake from remote core
is_remote should be T, and is, T
sender got pong number: 1
is_remote should be T, and is, T
sender got pong number: 2
is_remote should be T, and is, T
sender got pong number: 3
sender lost peer: endpoint=127.0.0.1 msg=lost remote peer

View file

@ -0,0 +1,9 @@
[T, T, T, T, T]
T
F
[*, *og, d?g, d[!wl]g]
[once]
[]
[*.gov*, *malware*]
[z*ro]
[*.gov*, *malware*]

View file

@ -0,0 +1,34 @@
# Needs perftools support.
#
# @TEST-GROUP: leaks
#
# @TEST-REQUIRES: zeek --help 2>&1 | grep -q mem-leaks
#
# @TEST-EXEC: HEAP_CHECK_DUMP_DIRECTORY=. HEAPCHECK=local btest-bg-run zeek zeek -m -b -r $TRACES/http/get.trace %INPUT
# @TEST-EXEC: btest-bg-wait 120
# Exercises paraglob construction, comparison and lookup once per connection
# so the leak checker sees every allocation path.
event new_connection(c: connection)
	{
	local animal_globs = vector("*", "d?g", "*og", "d?", "d[!wl]g");
	local dup_globs = vector("once", "!o*", "once");
	local url_globs = vector("https://*.google.com/*", "*malware*", "*.gov*");

	local pg_animals = paraglob_init(animal_globs);
	local pg_dups: opaque of paraglob = paraglob_init(dup_globs);
	local pg_urls = paraglob_init(url_globs);
	local pg_animals_again = paraglob_init(animal_globs);

	# Building a paraglob must leave its input vector untouched.
	print (animal_globs == vector("*", "d?g", "*og", "d?", "d[!wl]g"));

	# Two paraglobs built from the same patterns must compare equal.
	print paraglob_equals(pg_animals_again, pg_animals);

	print paraglob_get(pg_animals, "dog");
	print paraglob_get(pg_dups, "once");
	print paraglob_get(pg_urls, "www.strange-malware-domain.gov");

	local pg_urls_again: opaque of paraglob = paraglob_init(url_globs);
	print paraglob_get(pg_urls_again, "www.strange-malware-domain.gov");
	}

View file

@ -82,4 +82,12 @@ event zeek_init()
local handle2 = copy(handle);
print entropy_test_finish(handle);
print entropy_test_finish(handle2);
print "============ Paraglob";
local p = paraglob_init(vector("https://*.google.com/*", "*malware*", "*.gov*"));
local p2 = copy(p);
print paraglob_equals(p, p2);
# A get operation shouldn't change the paraglob
paraglob_get(p, "whitehouse.gov");
print paraglob_equals(p, p2);
}

View file

@ -0,0 +1,102 @@
# Sends a paraglob back and forth between two Zeek processes via Broker and
# checks that it still answers lookups after each serialization round trip.
#
# @TEST-PORT: BROKER_PORT
#
# @TEST-EXEC: btest-bg-run recv "zeek -B broker -b ../recv.zeek >recv.out"
# @TEST-EXEC: btest-bg-run send "zeek -B broker -b ../send.zeek >send.out"
#
# @TEST-EXEC: btest-bg-wait 30
# @TEST-EXEC: btest-diff recv/recv.out
# @TEST-EXEC: btest-diff send/send.out

@TEST-START-FILE send.zeek

redef exit_only_after_terminate = T;

global event_count = 0;
global p: opaque of paraglob = paraglob_init(vector("hello", "*ello", "*"));

global ping: event(msg: opaque of paraglob, c: count);

event zeek_init()
	{
	print "Starting send.";
	print paraglob_get(p, "hello");
	Broker::subscribe("bro/event/my_topic");
	# Use the port btest allocated through @TEST-PORT rather than a fixed
	# one, so that tests running in parallel cannot collide.
	Broker::peer("127.0.0.1", to_port(getenv("BROKER_PORT")));
	print "is_remote should be F, and is", is_remote_event();
	}

# Publishes the next ping, carrying the paraglob and a running counter.
function send_event()
	{
	++event_count;
	local e = Broker::make_event(ping, p, event_count);
	Broker::publish("bro/event/my_topic", e);
	}

event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
	{
	print fmt("sender added peer: endpoint=%s msg=%s",
	          endpoint$network$address, msg);
	send_event();
	}

event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string)
	{
	print fmt("sender lost peer: endpoint=%s msg=%s",
	          endpoint$network$address, msg);
	terminate();
	}

event pong(msg: opaque of paraglob, n: count)
	{
	print "is_remote should be T, and is", is_remote_event();
	print fmt("sender got pong number: %s", n);
	send_event();
	}

@TEST-END-FILE


@TEST-START-FILE recv.zeek

redef exit_only_after_terminate = T;

const events_to_recv = 3;

global pong: event(msg: opaque of paraglob, c: count);

event zeek_init()
	{
	Broker::subscribe("bro/event/my_topic");
	# Listen on the port btest allocated through @TEST-PORT; the sender
	# reads the same environment variable.
	Broker::listen("127.0.0.1", to_port(getenv("BROKER_PORT")));
	}

event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
	{
	print fmt("receiver added peer: endpoint=%s msg=%s", endpoint$network$address, msg);
	}

event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string)
	{
	print fmt("receiver lost peer: endpoint=%s msg=%s", endpoint$network$address, msg);
	}

event ping(msg: opaque of paraglob, n: count)
	{
	print "is_remote should be T, and is", is_remote_event();

	# Once the expected number of pings has been answered, report the
	# broker statistics and shut down.
	if ( n > events_to_recv )
		{
		print get_broker_stats();
		terminate();
		return;
		}

	print fmt("receiver got ping number: %s", n);
	# The received paraglob must still be usable after deserialization.
	print paraglob_get(msg, "hello");

	local e = Broker::make_event(pong, msg, n);
	Broker::publish("bro/event/my_topic", e);
	}

@TEST-END-FILE

View file

@ -0,0 +1,41 @@
# @TEST-EXEC: zeek -b %INPUT >out
# @TEST-EXEC: btest-diff out
# Functional test for paraglob_init, paraglob_equals and paraglob_get; the
# printed output is compared against the baseline.
event zeek_init()
	{
	local animal_globs = vector("*", "d?g", "*og", "d?", "d[!wl]g");
	local dup_globs = vector("once", "!o*", "once");
	local url_globs = vector("https://*.google.com/*", "*malware*", "*.gov*");
	local zero_globs = vector("z*ro");

	local pg_animals = paraglob_init(animal_globs);
	local pg_dups: opaque of paraglob = paraglob_init(dup_globs);
	local pg_urls = paraglob_init(url_globs);
	local pg_zero = paraglob_init(zero_globs);
	local pg_animals_again = paraglob_init(animal_globs);

	# Building a paraglob must leave its input vector untouched.
	print (animal_globs == vector("*", "d?g", "*og", "d?", "d[!wl]g"));

	# Same patterns give equal paraglobs; different patterns do not.
	print paraglob_equals(pg_animals_again, pg_animals);
	print paraglob_equals(pg_animals, pg_dups);

	print paraglob_get(pg_animals, "dog");
	print paraglob_get(pg_dups, "once");
	print paraglob_get(pg_dups, "nothing");
	print paraglob_get(pg_urls, "www.strange-malware-domain.gov");
	# Input containing an embedded NUL byte.
	print paraglob_get(pg_zero, "zero\0zero");

	# A million duplicates of one pattern: this is expected to stay fast
	# because paraglob should not add a pattern it already holds.
	local remaining = 1000000;
	while ( remaining > 0 )
		{
		--remaining;
		url_globs += url_globs[1];
		}

	local pg_urls_large: opaque of paraglob = paraglob_init(url_globs);
	print paraglob_get(pg_urls_large, "www.strange-malware-domain.gov");
	}