diff --git a/.gitmodules b/.gitmodules index d151b3d288..1bceead3d6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,3 +28,6 @@ [submodule "doc"] path = doc url = https://github.com/zeek/zeek-docs +[submodule "aux/paraglob"] + path = aux/paraglob + url = https://github.com/zeek/paraglob diff --git a/CMakeLists.txt b/CMakeLists.txt index f5edf896c0..a51783711f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -325,6 +325,10 @@ include_directories(BEFORE ${CAF_INCLUDE_DIR_CORE}) include_directories(BEFORE ${CAF_INCLUDE_DIR_IO}) include_directories(BEFORE ${CAF_INCLUDE_DIR_OPENSSL}) +add_subdirectory(aux/paraglob) +set(zeekdeps ${zeekdeps} paraglob) +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/aux/paraglob) + add_subdirectory(src) add_subdirectory(scripts) add_subdirectory(man) diff --git a/aux/paraglob b/aux/paraglob new file mode 160000 index 0000000000..757e00b651 --- /dev/null +++ b/aux/paraglob @@ -0,0 +1 @@ +Subproject commit 757e00b6510d2b0e92510c9c26f9e3279aa442a4 diff --git a/src/OpaqueVal.cc b/src/OpaqueVal.cc index 5b6c9aa483..e1c182ca73 100644 --- a/src/OpaqueVal.cc +++ b/src/OpaqueVal.cc @@ -869,3 +869,28 @@ void CardinalityVal::Add(const Val* val) c->AddElement(key->Hash()); delete key; } + + +ParaglobVal::ParaglobVal(paraglob::Paraglob* p) +: OpaqueVal(paraglob_type) + { + this->internal_paraglob = p; + } + +VectorVal* ParaglobVal::get(StringVal* &pattern) + { + VectorVal* rval = new VectorVal(internal_type("string_vec")->AsVectorType()); + std::string string_pattern (pattern->CheckString(), pattern->Len()); + std::vector matches = this->internal_paraglob->get(string_pattern); + + for (unsigned int i = 0; i < matches.size(); i++) { + rval->Assign(i, new StringVal(matches.at(i).c_str())); + } + + return rval; + } + +bool ParaglobVal::operator==(const ParaglobVal *other) + { + return (*(this->internal_paraglob) == *(other->internal_paraglob)); + } diff --git a/src/OpaqueVal.h b/src/OpaqueVal.h index 5fe0823436..34e7ae9998 100644 --- 
a/src/OpaqueVal.h +++ b/src/OpaqueVal.h @@ -8,6 +8,7 @@ #include "RandTest.h" #include "Val.h" #include "digest.h" +#include "src/paraglob.h" namespace probabilistic { class BloomFilter; @@ -188,4 +189,14 @@ private: DECLARE_SERIAL(CardinalityVal); }; +class ParaglobVal : public OpaqueVal { +public: + explicit ParaglobVal(paraglob::Paraglob* p); + VectorVal* get(StringVal* &pattern); + bool operator==(const ParaglobVal *other); + +private: + paraglob::Paraglob* internal_paraglob; +}; + #endif diff --git a/src/Type.h b/src/Type.h index c537bb6203..6cc4f3e84a 100644 --- a/src/Type.h +++ b/src/Type.h @@ -639,6 +639,7 @@ extern OpaqueType* topk_type; extern OpaqueType* bloomfilter_type; extern OpaqueType* x509_opaque_type; extern OpaqueType* ocsp_resp_opaque_type; +extern OpaqueType* paraglob_type; // Returns the Bro basic (non-parameterized) type with the given type. // The reference count of the type is not increased. diff --git a/src/bro.bif b/src/bro.bif index 039053f4f2..972665d8fe 100644 --- a/src/bro.bif +++ b/src/bro.bif @@ -789,6 +789,63 @@ function sha256_hash_finish%(handle: opaque of sha256%): string return static_cast(handle)->Get(); %} + +## Initializes and returns a new paraglob. +## +## v: Vector of patterns to initialize the paraglob with. +## +## Returns: A new, compiled, paraglob with the patterns in *v* +## +## .. 
zeek:see:: paraglob_get paraglob_equals paraglob_add +function paraglob_init%(v: any%) : opaque of paraglob + %{ + if ( v->Type()->Tag() != TYPE_VECTOR || + v->Type()->YieldType()->Tag() != TYPE_STRING ) + { + builtin_error("paraglob requires a vector for initialization."); + return nullptr; + } + + std::vector<std::string> patterns; + VectorVal* vv = v->AsVectorVal(); + for ( unsigned int i = 0; i < vv->Size(); ++i ) + { + const BroString* s = vv->Lookup(i)->AsString(); + patterns.push_back(std::string(s->CheckString(), s->Len())); + } + + return new ParaglobVal(new paraglob::Paraglob(patterns)); + %} + +## Gets all the patterns inside the handle that match an input string. +## +## handle: A compiled paraglob. +## pat: The string to match against the patterns in *handle*. +## +## Returns: A vector of the patterns in *handle* that match *pat*. +## +## .. zeek:see:: paraglob_add paraglob_equals paraglob_init +function paraglob_get%(handle: opaque of paraglob, pat: string%): string_vec + %{ + return static_cast<ParaglobVal*>(handle)->get(pat); + %} + +## Compares two paraglobs for equality. +## +## p_one: A compiled paraglob. +## p_two: A compiled paraglob. +## +## Returns: True if both paraglobs contain the same patterns, false otherwise. +## +## .. zeek:see:: paraglob_add paraglob_get paraglob_init +function paraglob_equals%(p_one: opaque of paraglob, p_two: opaque of paraglob%) + : bool + %{ + bool eq = + (*static_cast<ParaglobVal*>(p_one) == static_cast<ParaglobVal*>(p_two)); + return val_mgr->GetBool(eq); + %} + ## Returns 32-bit digest of arbitrary input values using FNV-1a hash algorithm. ## See ``_. ## @@ -3071,7 +3128,7 @@ function strptime%(fmt: string, d: string%) : time const time_t timeval = time_t(); struct tm t; - if ( ! localtime_r(&timeval, &t) || + if ( ! localtime_r(&timeval, &t) || ! 
strptime(d->CheckString(), fmt->CheckString(), &t) ) { reporter->Warning("strptime conversion failed: fmt:%s d:%s", fmt->CheckString(), d->CheckString()); diff --git a/src/input/Manager.cc b/src/input/Manager.cc index bcd3e84bf3..34e8960193 100644 --- a/src/input/Manager.cc +++ b/src/input/Manager.cc @@ -224,7 +224,7 @@ ReaderBackend* Manager::CreateBackend(ReaderFrontend* frontend, EnumVal* tag) return backend; } -// Create a new input reader object to be used at whomevers leisure lateron. +// Create a new input reader object to be used at whomevers leisure later on. bool Manager::CreateStream(Stream* info, RecordVal* description) { RecordType* rtype = description->Type()->AsRecordType(); @@ -232,7 +232,7 @@ bool Manager::CreateStream(Stream* info, RecordVal* description) || same_type(rtype, BifType::Record::Input::EventDescription, 0) || same_type(rtype, BifType::Record::Input::AnalysisDescription, 0) ) ) { - reporter->Error("Streamdescription argument not of right type for new input stream"); + reporter->Error("Stream description argument not of right type for new input stream"); return false; } @@ -824,6 +824,7 @@ bool Manager::IsCompatibleType(BroType* t, bool atomic_only) case TYPE_INTERVAL: case TYPE_ENUM: case TYPE_STRING: + case TYPE_PATTERN: return true; case TYPE_RECORD: @@ -2074,6 +2075,12 @@ int Manager::GetValueLength(const Value* val) const } break; + case TYPE_PATTERN: + { + length += strlen(val->val.pattern_text_val) + 1; + break; + } + case TYPE_TABLE: { for ( int i = 0; i < val->val.set_val.size; i++ ) @@ -2193,6 +2200,14 @@ int Manager::CopyValue(char *data, const int startpos, const Value* val) const return length; } + case TYPE_PATTERN: + { + // include null-terminator + int length = strlen(val->val.pattern_text_val) + 1; + memcpy(data + startpos, val->val.pattern_text_val, length); + return length; + } + case TYPE_TABLE: { int length = 0; @@ -2350,6 +2365,13 @@ Val* Manager::ValueToVal(const Stream* i, const Value* val, BroType* request_typ 
return subnetval; } + case TYPE_PATTERN: + { + RE_Matcher* re = new RE_Matcher(val->val.pattern_text_val); + re->Compile(); + return new PatternVal(re); + } + case TYPE_TABLE: { // all entries have to have the same type... @@ -2492,6 +2514,13 @@ Val* Manager::ValueToVal(const Stream* i, const Value* val, bool& have_error) co return subnetval; } + case TYPE_PATTERN: + { + RE_Matcher* re = new RE_Matcher(val->val.pattern_text_val); + re->Compile(); + return new PatternVal(re); + } + case TYPE_TABLE: { TypeList* set_index; diff --git a/src/main.cc b/src/main.cc index 10026eea7e..c3c0a2ca6d 100644 --- a/src/main.cc +++ b/src/main.cc @@ -122,6 +122,7 @@ OpaqueType* topk_type = 0; OpaqueType* bloomfilter_type = 0; OpaqueType* x509_opaque_type = 0; OpaqueType* ocsp_resp_opaque_type = 0; +OpaqueType* paraglob_type = 0; // Keep copy of command line int bro_argc; @@ -809,6 +810,7 @@ int main(int argc, char** argv) bloomfilter_type = new OpaqueType("bloomfilter"); x509_opaque_type = new OpaqueType("x509"); ocsp_resp_opaque_type = new OpaqueType("ocsp_resp"); + paraglob_type = new OpaqueType("paraglob"); // The leak-checker tends to produce some false // positives (memory which had already been diff --git a/src/threading/SerialTypes.h b/src/threading/SerialTypes.h index 65bb79b659..b9a9c6c718 100644 --- a/src/threading/SerialTypes.h +++ b/src/threading/SerialTypes.h @@ -126,6 +126,7 @@ struct Value { vec_t vector_val; addr_t addr_val; subnet_t subnet_val; + const char* pattern_text_val; struct { char* data; diff --git a/src/threading/formatters/Ascii.cc b/src/threading/formatters/Ascii.cc index 147305485b..fde6fa9380 100644 --- a/src/threading/formatters/Ascii.cc +++ b/src/threading/formatters/Ascii.cc @@ -325,6 +325,28 @@ threading::Value* Ascii::ParseValue(const string& s, const string& name, TypeTag break; } + case TYPE_PATTERN: + { + string cannidate = get_unescaped_string(s); + // A string is a candidate pattern iff it begins and ends with + // a '/'. 
Whether or not the rest of the string is legal will + // be determined later when it is given to the RE engine. + if ( cannidate.size() >= 2 ) + { + if ( cannidate.front() == cannidate.back() && + cannidate.back() == '/' ) + { + // Remove the '/'s + cannidate.erase(0, 1); + cannidate.erase(cannidate.size() - 1); + val->val.pattern_text_val = copy_string(cannidate.c_str()); + break; + } + } + GetThread()->Error(GetThread()->Fmt("String '%s' contained no parseable pattern.", cannidate.c_str())); + goto parse_error; + } + case TYPE_TABLE: case TYPE_VECTOR: // First - common initialization diff --git a/testing/btest/Baseline/language.paraglob/out b/testing/btest/Baseline/language.paraglob/out new file mode 100644 index 0000000000..d375f0c6b6 --- /dev/null +++ b/testing/btest/Baseline/language.paraglob/out @@ -0,0 +1,6 @@ +[T, T, T, T, T] +T +[*, *og, d?g, d[!wl]g] +[once] +[*.gov*, *malware*] +[*.gov*, *malware*] diff --git a/testing/btest/Baseline/scripts.base.frameworks.input.bad_patterns/.stderr b/testing/btest/Baseline/scripts.base.frameworks.input.bad_patterns/.stderr new file mode 100644 index 0000000000..e0a7be2cc3 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.input.bad_patterns/.stderr @@ -0,0 +1,9 @@ +error: input.log/Input::READER_ASCII: String '/cat/sss' contained no parseable pattern. +warning: input.log/Input::READER_ASCII: Could not convert line '2 /cat/sss' of input.log to Val. Ignoring line. +error: input.log/Input::READER_ASCII: String '/foo|bar' contained no parseable pattern. +warning: input.log/Input::READER_ASCII: Could not convert line '3 /foo|bar' of input.log to Val. Ignoring line. +error: input.log/Input::READER_ASCII: String 'this is not a pattern' contained no parseable pattern. +warning: input.log/Input::READER_ASCII: Could not convert line '4 this is not a pattern' of input.log to Val. Ignoring line. +error: input.log/Input::READER_ASCII: String '/5' contained no parseable pattern. 
+warning: input.log/Input::READER_ASCII: Could not convert line '5 /5' of input.log to Val. Ignoring line. +received termination signal diff --git a/testing/btest/Baseline/scripts.base.frameworks.input.patterns/out b/testing/btest/Baseline/scripts.base.frameworks.input.patterns/out new file mode 100644 index 0000000000..9852d0d5d5 --- /dev/null +++ b/testing/btest/Baseline/scripts.base.frameworks.input.patterns/out @@ -0,0 +1,9 @@ +T +F +T +{ +[2] = [p=/^?(cat)$?/], +[4] = [p=/^?(^oob)$?/], +[1] = [p=/^?(dog)$?/], +[3] = [p=/^?(foo|bar)$?/] +} diff --git a/testing/btest/language/paraglob.zeek b/testing/btest/language/paraglob.zeek new file mode 100644 index 0000000000..b5a2dcba11 --- /dev/null +++ b/testing/btest/language/paraglob.zeek @@ -0,0 +1,34 @@ +# @TEST-EXEC: bro -b %INPUT >out +# @TEST-EXEC: btest-diff out + +event zeek_init () +{ + local v1 = vector("*", "d?g", "*og", "d?", "d[!wl]g"); + local v2 = vector("once", "!o*", "once"); + local v3 = vector("https://*.google.com/*", "*malware*", "*.gov*"); + + local p1 = paraglob_init(v1); + local p2: opaque of paraglob = paraglob_init(v2); + local p3 = paraglob_init(v3); + local p_eq = paraglob_init(v1); + + # paraglob_init should not modify v1 + print (v1 == vector("*", "d?g", "*og", "d?", "d[!wl]g")); + # p_eq and p1 should be the same paraglobs + print paraglob_equals(p1, p_eq); + + print paraglob_get(p1, "dog"); + print paraglob_get(p2, "once"); + print paraglob_get(p3, "www.strange-malware-domain.gov"); + + # This looks like a lot, but really should complete quickly. + # Paraglob should stop addition of duplicate patterns. 
+ local i = 1000000; + while (i > 0) { + i = i - 1; + v3 += v3[1]; + } + + local large_glob: opaque of paraglob = paraglob_init(v3); + print paraglob_get(large_glob, "www.strange-malware-domain.gov"); +} diff --git a/testing/btest/scripts/base/frameworks/input/bad_patterns.zeek b/testing/btest/scripts/base/frameworks/input/bad_patterns.zeek new file mode 100644 index 0000000000..23d25b516b --- /dev/null +++ b/testing/btest/scripts/base/frameworks/input/bad_patterns.zeek @@ -0,0 +1,38 @@ +# @TEST-EXEC: zeek -b %INPUT +# @TEST-EXEC: btest-diff .stderr + +@TEST-START-FILE input.log +#separator \x09 +#fields i p +#types count pattern +1 /d/og/ +2 /cat/sss +3 /foo|bar +4 this is not a pattern +5 /5 +@TEST-END-FILE + +redef exit_only_after_terminate = T; + +module A; + +type Idx: record { + i: int; +}; + +type Val: record { + p: pattern; +}; + +event kill_me() + { + terminate(); + } + +global pats: table[int] of Val = table(); + +event zeek_init() + { + Input::add_table([$source="input.log", $name="pats", $idx=Idx, $val=Val, $destination=pats]); + schedule 10msec { kill_me() }; + } diff --git a/testing/btest/scripts/base/frameworks/input/patterns.zeek b/testing/btest/scripts/base/frameworks/input/patterns.zeek new file mode 100644 index 0000000000..eeed7ac602 --- /dev/null +++ b/testing/btest/scripts/base/frameworks/input/patterns.zeek @@ -0,0 +1,47 @@ +# @TEST-EXEC: btest-bg-run zeek zeek -b %INPUT +# @TEST-EXEC: btest-bg-wait 10 + + +redef exit_only_after_terminate = T; + +@TEST-START-FILE input.log +#separator \x09 +#fields i p +#types count pattern +1 /dog/ +2 /cat/ +3 /foo|bar/ +4 /^oob/ +@TEST-END-FILE + +global outfile: file; + +module A; + +type Idx: record { + i: int; +}; + +type Val: record { + p: pattern; +}; + +global pats: table[int] of Val = table(); + +event zeek_init() + { + outfile = open("../out"); + # first read in the old stuff into the table... 
+ Input::add_table([$source="../input.log", $name="pats", $idx=Idx, $val=Val, $destination=pats]); + } + +event Input::end_of_data(name: string, source:string) + { + print outfile, (pats[3]$p in "foobar"); # T + print outfile, (pats[4]$p in "foobar"); # F + print outfile, (pats[3]$p == "foo"); # T + print outfile, pats; + Input::remove("pats"); + close(outfile); + terminate(); + }