diff --git a/CHANGES b/CHANGES index 4a65b5ccfe..e1ea3fe076 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,64 @@ +6.2.0-dev.174 | 2023-11-21 12:00:20 +0100 + + * TableVal: Replace raw subnets/pattern_matcher with unique_ptr (Arne Welzel, Corelight) + + * TablePatternMatcher: Drop Insert()/Remove(), use Clear() (Arne Welzel, Corelight) + + Also move Clear() when assigning into more generic Assign() function. + + * TableType: Simplify and inline Is...Index tests (Arne Welzel, Corelight) + + * NEWS: Add small table[pattern] section (Arne Welzel, Corelight) + + * Expr/Val: Add support for in set[pattern] (Arne Welzel, Corelight) + + * zeek.bif: Implement table_pattern_matcher_stats() bif for introspection (Arne Welzel, Corelight) + + Provide a script accessible way to introspect the DFA stats that can be + leveraged to gather runtime statistics of the underlying DFA. This + re-uses the existing MatcherStats used by ``get_matcher_stats()``. + + * DFA: Extract inner Stats struct from DFA_State_Cache (Arne Welzel, Corelight) + + This makes it possible to forward declare the class in Val.h which + otherwise seems difficult. + + * Expr: Implement string in table[pattern] of X (Arne Welzel, Corelight) + + Not sure how useful this is (and the implementation isn't optimized in + any way), but seems reasonable for consistency. + + Vern suggested that set[pattern] can already be achieved via + set_to_regex(), so left out any set[pattern] variants. + + * RE: Remove RE_DisjunctiveMatcher and re-use MatchAll() (Arne Welzel, Corelight) + + Seems we can just open code the CompileSet() usage in the TablePatternMatcher + helper without indirecting through another class. Further, add the collection + of indices into MatchAll() rather than duplicating its code in + MatchDisjunction(). Doesn't seem like MatchAll() is used widely. 
+ + * TablePatternMatcher: Use const StringValPtr& instead of const StringVal* (Arne Welzel, Corelight) + + * Val: Move TablePatternMatcher into detail namespace (Arne Welzel, Corelight) + + There's anyway only prototype in the headers, so detail seems better + than the public zeek namespace. + + * TablePatternMatcher: Use unique_ptr (Arne Welzel, Corelight) + + * IndexType: Add IsPatternIndex(), like IsSubNetIndex() (Arne Welzel, Corelight) + + * btest: Add test for pattern tables and when (Arne Welzel, Corelight) + + * Reuse CompileSet() instead of || string formatting (Arne Welzel, Corelight) + + * script optimization support for using strings to index table[pattern] values (Vern Paxson, Corelight) + + * BTests for indexing "table[pattern] of T" with strings (Vern Paxson, Corelight) + + * support for indexing "table[pattern] of T" with strings to get multi-matches (Vern Paxson, Corelight) + 6.2.0-dev.155 | 2023-11-21 10:08:14 +0100 * GH-3453: DNS: Add Ed25519 and Ed448 enum values to parser (Arne Welzel, Corelight) diff --git a/NEWS b/NEWS index d18db9e4a0..bbab361444 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,26 @@ Breaking Changes New Functionality ----------------- +- The table type was extended to allow parallel regular expression matching + when a table's index is a pattern. Indexing such tables yields a vector + containing all values of matching patterns for keys of type string. + + As an example, the following snippet outputs ``[a, a or b], [a or b]``. + + global tbl: table[pattern] of string; + tbl[/a/] = "a"; + tbl[/a|b/] = "a or b"; + tbl[/c/] = "c"; + print tbl["a"], tbl["b"]; + + Depending on the patterns and input used for matching, memory growth may + be observed over time as the underlying DFA is constructed lazily. Users are + advised to test with realistic and adversarial input data with focus on + memory growth. The DFA's state can be reset by removal/addition of a single + pattern. 
For observability, a new bif ``table_pattern_matcher_stats()`` + can be used to gather ``MatcherStats``. + + Changed Functionality --------------------- diff --git a/VERSION b/VERSION index ac6895eaee..e2b03895c4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.2.0-dev.155 +6.2.0-dev.174 diff --git a/src/DFA.h b/src/DFA.h index fd0b590025..1bf2979ec3 100644 --- a/src/DFA.h +++ b/src/DFA.h @@ -69,6 +69,17 @@ protected: using DigestStr = std::basic_string; +struct DFA_State_Cache_Stats { + // Sum of all NFA states + unsigned int nfa_states; + unsigned int dfa_states; + unsigned int computed; + unsigned int uncomputed; + unsigned int mem; + unsigned int hits; + unsigned int misses; +}; + class DFA_State_Cache { public: DFA_State_Cache(); @@ -82,17 +93,7 @@ public: int NumEntries() const { return states.size(); } - struct Stats { - // Sum of all NFA states - unsigned int nfa_states; - unsigned int dfa_states; - unsigned int computed; - unsigned int uncomputed; - unsigned int mem; - unsigned int hits; - unsigned int misses; - }; - + using Stats = DFA_State_Cache_Stats; void GetStats(Stats* s); private: diff --git a/src/Expr.cc b/src/Expr.cc index 32445ccd69..8153bf7d3f 100644 --- a/src/Expr.cc +++ b/src/Expr.cc @@ -2382,6 +2382,17 @@ IndexExpr::IndexExpr(ExprPtr arg_op1, ListExprPtr arg_op2, bool arg_is_slice, bo if ( IsError() ) return; + if ( op1->GetType()->Tag() == TYPE_TABLE ) { // Check for a table[pattern] being indexed by a string + const auto& table_type = op1->GetType()->AsTableType(); + const auto& rhs_type = op2->GetType()->AsTypeList()->GetTypes(); + if ( table_type->IsPatternIndex() && table_type->Yield() && rhs_type.size() == 1 && + IsString(rhs_type[0]->Tag()) ) { + is_pattern_table = true; + SetType(make_intrusive(op1->GetType()->Yield())); + return; + } + } + int match_type = op1->GetType()->MatchesIndex(op2->AsListExpr()); if ( match_type == DOES_NOT_MATCH_INDEX ) { @@ -2532,7 +2543,12 @@ ValPtr IndexExpr::Fold(Val* v1, Val* v2) const { return 
index_slice(vect, lv); } break; - case TYPE_TABLE: v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2}); break; + case TYPE_TABLE: + if ( is_pattern_table ) + return v1->AsTableVal()->LookupPattern({NewRef{}, v2->AsListVal()->Idx(0)->AsStringVal()}); + + v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2}); + break; case TYPE_STRING: return index_string(v1->AsString(), v2->AsListVal()); @@ -3799,6 +3815,18 @@ InExpr::InExpr(ExprPtr arg_op1, ExprPtr arg_op2) : BinaryExpr(EXPR_IN, std::move } } + // Support in table[pattern] / set[pattern] + if ( op1->GetType()->Tag() == TYPE_STRING ) { + if ( op2->GetType()->Tag() == TYPE_TABLE ) { + const auto& table_type = op2->GetType()->AsTableType(); + + if ( table_type->IsPatternIndex() ) { + SetType(base_type(TYPE_BOOL)); + return; + } + } + } + if ( op1->Tag() != EXPR_LIST ) op1 = make_intrusive(std::move(op1)); @@ -3837,8 +3865,15 @@ ValPtr InExpr::Fold(Val* v1, Val* v2) const { auto ind = v1->AsListVal()->Idx(0)->CoerceToUnsigned(); res = ind < vv2->Size() && vv2->ValAt(ind); } - else - res = (bool)v2->AsTableVal()->Find({NewRef{}, v1}); + else { + const auto& table_val = v2->AsTableVal(); + const auto& table_type = table_val->GetType(); + // Special table[pattern] / set[pattern] in expression. 
+ if ( table_type->IsPatternIndex() && v1->GetType()->Tag() == TYPE_STRING ) + res = table_val->MatchPattern({NewRef{}, v1->AsStringVal()}); + else + res = (bool)v2->AsTableVal()->Find({NewRef{}, v1}); + } return val_mgr->Bool(res); } diff --git a/src/Expr.h b/src/Expr.h index 3bec879dbb..a45a1528b8 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -1012,6 +1012,7 @@ protected: bool is_slice; bool is_inside_when; + bool is_pattern_table = false; }; // The following execute the heart of IndexExpr functionality for diff --git a/src/RE.cc b/src/RE.cc index 67144e9bd4..3a790d2f4e 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -172,6 +172,10 @@ bool Specific_RE_Matcher::CompileSet(const string_list& set, const int_list& idx dfa = new DFA_Machine(nfa, EC()); ecs = EC()->EquivClasses(); + // dfa took ownership + Unref(nfa); + nfa = nullptr; + return true; } @@ -190,6 +194,10 @@ bool Specific_RE_Matcher::MatchAll(const String* s) { return MatchAll(s->Bytes(), s->Len()); } +bool Specific_RE_Matcher::MatchSet(const String* s, std::vector& matches) { + return MatchAll(s->Bytes(), s->Len(), &matches); +} + int Specific_RE_Matcher::Match(const char* s) { return Match((const u_char*)(s), strlen(s)); } int Specific_RE_Matcher::Match(const String* s) { return Match(s->Bytes(), s->Len()); } @@ -198,7 +206,7 @@ int Specific_RE_Matcher::LongestMatch(const char* s) { return LongestMatch((cons int Specific_RE_Matcher::LongestMatch(const String* s) { return LongestMatch(s->Bytes(), s->Len()); } -bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { +bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n, std::vector* matches) { if ( ! dfa ) // An empty pattern matches "all" iff what's being // matched is empty. 
@@ -218,6 +226,11 @@ bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { if ( d ) d = d->Xtion(ecs[SYM_EOL], dfa); + if ( d && matches ) + if ( const auto* a_set = d->Accept() ) + for ( auto a : *a_set ) + matches->push_back(a); + return d && d->Accept() != nullptr; } diff --git a/src/RE.h b/src/RE.h index f68f3482bb..ee5234d42d 100644 --- a/src/RE.h +++ b/src/RE.h @@ -96,6 +96,14 @@ public: // to the matching expressions. (idx must not contain zeros). bool CompileSet(const string_list& set, const int_list& idx); + // For use with CompileSet() to collect indices of all matched + // expressions into the matches vector. The matches vector is + // populated with the indices of all matching expressions provided + // to CompileSet()'s set and idx arguments. + // + // Behaves as MatchAll(), consuming the complete input string. + bool MatchSet(const String* s, std::vector& matches); + // Returns the position in s just beyond where the first match // occurs, or 0 if there is no such position in s. Note that // if the pattern matches empty strings, matching continues @@ -125,7 +133,7 @@ protected: // appending to an existing pattern_text. 
void AddPat(const char* pat, const char* orig_fmt, const char* app_fmt); - bool MatchAll(const u_char* bv, int n); + bool MatchAll(const u_char* bv, int n, std::vector* matches = nullptr); match_type mt; bool multiline; diff --git a/src/Type.cc b/src/Type.cc index 4fd9fa9f52..2c5580d50c 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -382,13 +382,6 @@ void IndexType::DescribeReST(ODesc* d, bool roles_only) const { } } -bool IndexType::IsSubNetIndex() const { - const auto& types = indices->GetTypes(); - if ( types.size() == 1 && types[0]->Tag() == TYPE_SUBNET ) - return true; - return false; -} - detail::TraversalCode IndexType::Traverse(detail::TraversalCallback* cb) const { auto tc = cb->PreType(this); HANDLE_TC_TYPE_PRE(tc); diff --git a/src/Type.h b/src/Type.h index 6b69eb8e37..71247ab335 100644 --- a/src/Type.h +++ b/src/Type.h @@ -354,7 +354,16 @@ public: void DescribeReST(ODesc* d, bool roles_only = false) const override; // Returns true if this table is solely indexed by subnet. - bool IsSubNetIndex() const; + bool IsSubNetIndex() const { + const auto& types = indices->GetTypes(); + return types.size() == 1 && types[0]->Tag() == TYPE_SUBNET; + } + + // Returns true if this table has a single index of type pattern. + bool IsPatternIndex() const { + const auto& types = indices->GetTypes(); + return types.size() == 1 && types[0]->Tag() == TYPE_PATTERN; + } detail::TraversalCode Traverse(detail::TraversalCallback* cb) const override; diff --git a/src/Val.cc b/src/Val.cc index 397c1d777c..8d93426721 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -20,6 +20,7 @@ #include "zeek/Attr.h" #include "zeek/CompHash.h" #include "zeek/Conn.h" +#include "zeek/DFA.h" #include "zeek/Desc.h" #include "zeek/Dict.h" #include "zeek/Expr.h" @@ -1427,6 +1428,112 @@ static void find_nested_record_types(const TypePtr& t, std::set* fo } } +// Support class for returning multiple values from a table[pattern] +// when indexed with a string. 
+class detail::TablePatternMatcher { +public: + TablePatternMatcher(const TableVal* _tbl, TypePtr _yield) : tbl(_tbl) { + vtype = make_intrusive(std::move(_yield)); + } + + void Clear() { matcher.reset(); } + + VectorValPtr Lookup(const StringValPtr& s); + + // Delegate to matcher->MatchAll(). + bool MatchAll(const StringValPtr& s); + + void GetStats(detail::DFA_State_Cache_Stats* stats) const { + if ( matcher && matcher->DFA() ) + matcher->DFA()->Cache()->GetStats(stats); + else + *stats = {0}; + }; + +private: + void Build(); + + const TableVal* tbl; + VectorTypePtr vtype; + + // If matcher is nil then we know we need to build it. This gives + // us an easy way to cache matchers in the common case that these + // sorts of tables don't change their elements very often (indeed, + // they'll frequently be constructed just once), and also keeps us + // from having to re-build the matcher on every insert/delete in + // the common case that a whole bunch of those are done in a single + // batch. + std::unique_ptr matcher = nullptr; + + // Maps matcher values to corresponding yields. When building the + // matcher we insert a nil at the head to accommodate how + // disjunctive matchers use numbering starting at 1 rather than 0. + std::vector matcher_yields; +}; + +VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) { + auto results = make_intrusive(vtype); + + if ( ! matcher ) { + if ( tbl->Get()->Length() == 0 ) + return results; + + Build(); + } + + std::vector matches; + matcher->MatchSet(s->AsString(), matches); + + for ( auto m : matches ) + results->Append(matcher_yields[m]); + + return results; +} + +bool detail::TablePatternMatcher::MatchAll(const StringValPtr& s) { + if ( ! 
matcher ) { + if ( tbl->Get()->Length() == 0 ) + return false; + + Build(); + } + + return matcher->MatchAll(s->AsString()); +} + +void detail::TablePatternMatcher::Build() { + matcher_yields.clear(); + matcher_yields.push_back(nullptr); + + auto& tbl_dict = *tbl->Get(); + auto& tbl_hash = *tbl->GetTableHash(); + + zeek::detail::string_list pattern_list; + zeek::detail::int_list index_list; + + // We need to hold on to recovered hash key values so they don't + // get lost once a loop iteration goes out of scope. + std::vector hash_key_vals; + + for ( auto& iter : tbl_dict ) { + auto k = iter.GetHashKey(); + auto v = iter.value; + auto vl = tbl_hash.RecoverVals(*k); + + char* pt = const_cast(vl->AsListVal()->Idx(0)->AsPattern()->PatternText()); + pattern_list.push_back(pt); + index_list.push_back(pattern_list.size()); + matcher_yields.push_back(v->GetVal()); + + hash_key_vals.push_back(std::move(vl)); + } + + matcher = std::make_unique(detail::MATCH_EXACTLY); + + if ( ! matcher->CompileSet(pattern_list, index_list) ) + reporter->FatalError("failed compile set for disjunctive matching"); +} + TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) { bool ordered = (a != nullptr && a->Find(detail::ATTR_ORDERED) != nullptr); Init(std::move(t), ordered); @@ -1456,9 +1563,10 @@ void TableVal::Init(TableTypePtr t, bool ordered) { def_val = nullptr; if ( table_type->IsSubNetIndex() ) - subnets = new detail::PrefixTable; - else - subnets = nullptr; + subnets = std::make_unique(); + + if ( table_type->IsPatternIndex() ) + pattern_matcher = std::make_unique(this, table_type->Yield()); table_hash = new detail::CompositeHash(table_type->GetIndices()); if ( ordered ) @@ -1475,7 +1583,6 @@ TableVal::~TableVal() { delete table_hash; delete table_val; - delete subnets; delete expire_iterator; } @@ -1486,6 +1593,9 @@ void TableVal::RemoveAll() { delete table_val; table_val = new PDict; table_val->SetDeleteFunc(table_entry_val_delete_func); + + if ( pattern_matcher ) + 
pattern_matcher->Clear(); } int TableVal::Size() const { return table_val->Length(); } @@ -1598,6 +1708,9 @@ bool TableVal::Assign(ValPtr index, std::unique_ptr k, ValPtr n subnets->Insert(index.get(), new_entry_val); } + if ( pattern_matcher ) + pattern_matcher->Clear(); + // Keep old expiration time if necessary. if ( old_entry_val && attrs && attrs->Find(detail::ATTR_EXPIRE_CREATE) ) new_entry_val->SetExpireAccess(old_entry_val->ExpireAccessTime()); @@ -1925,6 +2038,27 @@ TableValPtr TableVal::LookupSubnetValues(const SubNetVal* search) { return nt; } +VectorValPtr TableVal::LookupPattern(const StringValPtr& s) { + if ( ! pattern_matcher || ! GetType()->Yield() ) + reporter->InternalError("LookupPattern called on wrong table type"); + + return pattern_matcher->Lookup(s); +} + +bool TableVal::MatchPattern(const StringValPtr& s) { + if ( ! pattern_matcher ) + reporter->InternalError("MatchPattern called on wrong table type"); + + return pattern_matcher->MatchAll(s); +} + +void TableVal::GetPatternMatcherStats(detail::DFA_State_Cache_Stats* stats) const { + if ( ! pattern_matcher ) + reporter->InternalError("GetPatternMatcherStats called on wrong table type"); + + return pattern_matcher->GetStats(stats); +} + bool TableVal::UpdateTimestamp(Val* index) { TableEntryVal* v; @@ -2105,8 +2239,14 @@ ValPtr TableVal::Remove(const Val& index, bool broker_forward, bool* iterators_i va = v->GetVal() ? v->GetVal() : IntrusivePtr{NewRef{}, this}; if ( subnets && ! subnets->Remove(&index) ) + // VP: not clear to me this should be an internal warning, + // since Zeek doesn't otherwise complain about removing + // non-existent table elements. 
reporter->InternalWarning("index not in prefix table"); + if ( pattern_matcher ) + pattern_matcher->Clear(); + delete v; Modified(); diff --git a/src/Val.h b/src/Val.h index 9ddeec842f..96bc5a037a 100644 --- a/src/Val.h +++ b/src/Val.h @@ -51,6 +51,9 @@ class Frame; class PrefixTable; class CompositeHash; class HashKey; +class TablePatternMatcher; + +struct DFA_State_Cache_Stats; class ValTrace; class ZBody; @@ -863,6 +866,20 @@ public: // Causes an internal error if called for any other kind of table. TableValPtr LookupSubnetValues(const SubNetVal* s); + // For a table[pattern], return a vector of all yields matching + // the given string. + // Causes an internal error if called for any other kind of table. + VectorValPtr LookupPattern(const StringValPtr& s); + + // For a table[pattern] or set[pattern], returns True if any of the + // patterns in the index matches the given string, else False. + // Causes an internal error if called for any other kind of table. + bool MatchPattern(const StringValPtr& s); + + // For a table[pattern], fill stats with information about + // the DFA's state for introspection. + void GetPatternMatcherStats(detail::DFA_State_Cache_Stats* stats) const; + // Sets the timestamp for the given index to network time. // Returns false if index does not exist. bool UpdateTimestamp(Val* index); @@ -922,7 +939,8 @@ public: // Returns the Prefix table used inside the table (if present). // This allows us to do more direct queries to this specialized // type that the general Table API does not allow. 
- const detail::PrefixTable* Subnets() const { return subnets; } + const detail::PrefixTable* Subnets() const { return subnets.get(); } + void Describe(ODesc* d) const override; @@ -1031,7 +1049,8 @@ protected: detail::ExprPtr expire_func; TableValTimer* timer; RobustDictIterator* expire_iterator; - detail::PrefixTable* subnets; + std::unique_ptr subnets; + std::unique_ptr pattern_matcher; ValPtr def_val; detail::ExprPtr change_func; std::string broker_store; diff --git a/src/script_opt/CPP/Exprs.cc b/src/script_opt/CPP/Exprs.cc index c943a14b25..f798b4f91c 100644 --- a/src/script_opt/CPP/Exprs.cc +++ b/src/script_opt/CPP/Exprs.cc @@ -388,7 +388,25 @@ string CPPCompile::GenIndexExpr(const Expr* e, GenType gt) { string func; if ( aggr_t->Tag() == TYPE_TABLE ) { - func = inside_when ? "when_index_table__CPP" : "index_table__CPP"; + auto ind_expr = e->GetOp2()->AsListExpr()->Exprs()[0]; + auto is_pat_str_ind = false; + + if ( aggr_t->AsTableType()->IsPatternIndex() && ind_expr->GetType()->Tag() == TYPE_STRING ) + is_pat_str_ind = true; + + if ( inside_when ) { + if ( is_pat_str_ind ) + func = "when_index_patstr__CPP"; + else + func = "when_index_table__CPP"; + } + else { + if ( is_pat_str_ind ) + func = "index_patstr_table__CPP"; + else + func = "index_table__CPP"; + } + gen = func + "(" + GenExpr(aggr, GEN_NATIVE) + ", {" + GenExpr(e->GetOp2(), GEN_VAL_PTR) + "})"; } diff --git a/src/script_opt/CPP/RuntimeOps.cc b/src/script_opt/CPP/RuntimeOps.cc index ea8a81ac86..ae2b9c7744 100644 --- a/src/script_opt/CPP/RuntimeOps.cc +++ b/src/script_opt/CPP/RuntimeOps.cc @@ -44,6 +44,10 @@ ValPtr index_table__CPP(const TableValPtr& t, vector indices) { return v; } +ValPtr index_patstr_table__CPP(const TableValPtr& t, vector indices) { + return t->LookupPattern(cast_intrusive(indices[0])); +} + ValPtr index_vec__CPP(const VectorValPtr& vec, int index) { if ( index < 0 ) index += vec->Size(); @@ -66,6 +70,13 @@ ValPtr when_index_table__CPP(const TableValPtr& t, vector indices) { 
return v; } +ValPtr when_index_patstr__CPP(const TableValPtr& t, vector indices) { + auto v = index_patstr_table__CPP(t, std::move(indices)); + if ( v && IndexExprWhen::evaluating > 0 ) + IndexExprWhen::results.emplace_back(v); + return v; +} + ValPtr when_index_vec__CPP(const VectorValPtr& vec, int index) { auto v = index_vec__CPP(vec, index); if ( v && IndexExprWhen::evaluating > 0 ) diff --git a/src/script_opt/CPP/RuntimeOps.h b/src/script_opt/CPP/RuntimeOps.h index b35dd2b213..5ef5ba0efa 100644 --- a/src/script_opt/CPP/RuntimeOps.h +++ b/src/script_opt/CPP/RuntimeOps.h @@ -32,13 +32,16 @@ extern ListValPtr index_val__CPP(std::vector indices); // Returns the value corresponding to indexing the given table/vector/string // with the given set of indices. These are functions rather than something // generated directly so that they can package up the error handling for -// the case where there's no such index. +// the case where there's no such index. "patstr" refers to indexing a +// table[pattern] of X with a string value. extern ValPtr index_table__CPP(const TableValPtr& t, std::vector indices); +extern ValPtr index_patstr_table__CPP(const TableValPtr& t, std::vector indices); extern ValPtr index_vec__CPP(const VectorValPtr& vec, int index); extern ValPtr index_string__CPP(const StringValPtr& svp, std::vector indices); // The same, but for indexing happening inside a "when" clause. 
extern ValPtr when_index_table__CPP(const TableValPtr& t, std::vector indices); +extern ValPtr when_index_patstr__CPP(const TableValPtr& t, std::vector indices); extern ValPtr when_index_vec__CPP(const VectorValPtr& vec, int index); // For vector slices, we use the existing index_slice(), but we need a diff --git a/src/script_opt/ZAM/Expr.cc b/src/script_opt/ZAM/Expr.cc index 854fb6fde5..ea5af97939 100644 --- a/src/script_opt/ZAM/Expr.cc +++ b/src/script_opt/ZAM/Expr.cc @@ -573,6 +573,19 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T int n = l->Exprs().length(); auto n2tag = n2t->Tag(); + // Whether this is an instance of indexing a table[pattern] of X + // with a string. + bool is_pat_str_ind = false; + + if ( n2tag == TYPE_TABLE && n == 1 ) { + auto& ind_types = n2t->AsTableType()->GetIndices(); + auto& ind_type0 = ind_types->GetTypes()[0]; + auto ind = l->Exprs()[0]; + + if ( ind_type0->Tag() == TYPE_PATTERN && ind->GetType()->Tag() == TYPE_STRING ) + is_pat_str_ind = true; + } + if ( n == 1 && ! in_when ) { auto ind = l->Exprs()[0]; auto var_ind = ind->Tag() == EXPR_NAME; @@ -640,7 +653,8 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T if ( n2tag == TYPE_TABLE ) { if ( n3 ) { int n3_slot = FrameSlot(n3); - auto zop = AssignmentFlavor(OP_TABLE_INDEX1_VVV, n1->GetType()->Tag()); + auto op = is_pat_str_ind ? OP_TABLE_PATSTR_INDEX1_VVV : OP_TABLE_INDEX1_VVV; + auto zop = AssignmentFlavor(op, n1->GetType()->Tag()); z = ZInstI(zop, Frame1Slot(n1, zop), n2_slot, n3_slot); z.SetType(n3->GetType()); } @@ -648,7 +662,8 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T else { ASSERT(c3); - auto zop = AssignmentFlavor(OP_TABLE_INDEX1_VVC, n1->GetType()->Tag()); + auto op = is_pat_str_ind ? 
OP_TABLE_PATSTR_INDEX1_VVC : OP_TABLE_INDEX1_VVC; + auto zop = AssignmentFlavor(op, n1->GetType()->Tag()); z = ZInstI(zop, Frame1Slot(n1, zop), n2_slot, c3); } @@ -674,7 +689,15 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T break; case TYPE_TABLE: - op = in_when ? OP_WHEN_TABLE_INDEX_VV : OP_TABLE_INDEX_VV; + if ( in_when ) { + if ( is_pat_str_ind ) + op = OP_WHEN_PATSTR_INDEX_VV; + else + op = OP_WHEN_TABLE_INDEX_VV; + } + else + op = OP_TABLE_INDEX_VV; + z = ZInstI(op, Frame1Slot(n1, op), n2_slot); z.SetType(n1->GetType()); break; diff --git a/src/script_opt/ZAM/Ops.in b/src/script_opt/ZAM/Ops.in index d9acb601cc..3ffbeb6462 100644 --- a/src/script_opt/ZAM/Ops.in +++ b/src/script_opt/ZAM/Ops.in @@ -1057,6 +1057,15 @@ macro EvalTableIndex(index) break; } +internal-op When-PatStr-Index +type VV +eval auto args = z.aux->ToListVal(frame); + auto arg0 = args->Idx(0); + auto v = frame[z.v2].table_val->LookupPattern({NewRef{}, arg0->AsStringVal()}); + if ( IndexExprWhen::evaluating > 0 ) + IndexExprWhen::results.emplace_back(v); + AssignV1(BuildVal(v, z.t)) + internal-assignment-op Table-Index1 type VVV assign-val v @@ -1068,6 +1077,17 @@ type VVC assign-val v eval EvalTableIndex(z.c.ToVal(z.t)) +# Same, but for indexing table[pattern] of X with a string. +internal-assignment-op Table-PatStr-Index1 +type VVV +assign-val v +eval auto v = frame[z.v2].table_val->LookupPattern({NewRef{}, frame[z.v3].AsString()}); + +internal-assignment-op Table-PatStr-Index1 +type VVC +assign-val v +eval auto v = frame[z.v2].table_val->LookupPattern({NewRef{}, z.c.AsString()}); + # This version is for a variable v3. 
internal-op Index-String type VVV diff --git a/src/zeek.bif b/src/zeek.bif index 4c70a6fcbf..f8358fec8c 100644 --- a/src/zeek.bif +++ b/src/zeek.bif @@ -5795,3 +5795,53 @@ function have_spicy_analyzers%(%) : bool %{ return zeek::val_mgr->Bool(USE_SPICY_ANALYZERS); %} + +%%{ +#include "zeek/DFA.h" +%%} + +## Return MatcherStats for a table[pattern] or set[pattern] value. +## +## This returns a MatcherStats object that can be used for introspection +## of the DFA used for such a table. Statistics reset whenever elements are +## added to or removed from the table as these operations result in the underlying +## DFA being rebuilt. +## +## This function iterates over all states of the DFA. Calling it at a high +## frequency is likely detrimental to performance. +## +## tbl: The table to get stats for. +## +## Returns: A record with matcher statistics. +function table_pattern_matcher_stats%(tbl: any%) : MatcherStats + %{ + static auto matcher_stats_type = zeek::id::find_type("MatcherStats"); + + const auto& type = tbl->GetType(); + if ( type->Tag() != zeek::TYPE_TABLE ) + { + zeek::emit_builtin_error("table_pattern_matcher_stats() requires a table argument"); + return nullptr; + } + + if ( ! 
type->AsTableType()->IsPatternIndex() ) + { + zeek::emit_builtin_error("table_pattern_matcher_stats() requires a single index of type pattern"); + return nullptr; + } + + zeek::detail::DFA_State_Cache::Stats stats; + tbl->AsTableVal()->GetPatternMatcherStats(&stats); + + auto result = zeek::make_intrusive(matcher_stats_type); + int n = 0; + result->Assign(n++, 1); // matchers + result->Assign(n++, stats.nfa_states); + result->Assign(n++, stats.dfa_states); + result->Assign(n++, stats.computed); + result->Assign(n++, stats.mem); + result->Assign(n++, stats.hits); + result->Assign(n++, stats.misses); + + return result; + %} diff --git a/testing/btest/Baseline/language.pattern-sets/.stderr b/testing/btest/Baseline/language.pattern-sets/.stderr new file mode 100644 index 0000000000..49d861c74c --- /dev/null +++ b/testing/btest/Baseline/language.pattern-sets/.stderr @@ -0,0 +1 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. diff --git a/testing/btest/Baseline/language.pattern-sets/out b/testing/btest/Baseline/language.pattern-sets/out new file mode 100644 index 0000000000..e3ad8c8fd1 --- /dev/null +++ b/testing/btest/Baseline/language.pattern-sets/out @@ -0,0 +1,14 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 
+in empty, F +single insert, foo in, T +single insert, foox not-in, T +multiple inserts, x not-in, T +multiple insert, foo in, T +multiple insert, bletch in, T +multiple insert, foobletch not-in, T +single delete, bletch in, T +single delete, foo in, T +two deletes, bletch not-in, T +two deletes, foo not-in, T +two deletes, bar in, T +clear_table, bar not-in, T diff --git a/testing/btest/Baseline/language.pattern-tables-stats/.stderr b/testing/btest/Baseline/language.pattern-tables-stats/.stderr new file mode 100644 index 0000000000..49d861c74c --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-stats/.stderr @@ -0,0 +1 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. diff --git a/testing/btest/Baseline/language.pattern-tables-stats/out b/testing/btest/Baseline/language.pattern-tables-stats/out new file mode 100644 index 0000000000..12fe607fbc --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-stats/out @@ -0,0 +1,21 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 
+initial stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +populated stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +[1], [], T, F +after lookup stats, [matchers=1, nfa_states=10, dfa_states=6, computed=6, mem=2368, hits=0, misses=6] +reset stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +[], [3], [1, 3], T, F +after more lookup stats, [matchers=1, nfa_states=34, dfa_states=13, computed=13, mem=7720, hits=0, misses=13] +reset stats after delete, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +[], [3], [1, 3] +after even more lookup stats, [matchers=1, nfa_states=29, dfa_states=13, computed=13, mem=7056, hits=0, misses=13] +reset after reassignment, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +set initial stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +set populated stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +T, F +set after lookup stats, [matchers=1, nfa_states=10, dfa_states=6, computed=6, mem=2368, hits=0, misses=6] +set reset stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +F, T +set after more lookup stats, [matchers=1, nfa_states=24, dfa_states=9, computed=9, mem=5336, hits=0, misses=9] +set reset stats after delete, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +set reset after reassignment, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] diff --git a/testing/btest/Baseline/language.pattern-tables-when/.stderr b/testing/btest/Baseline/language.pattern-tables-when/.stderr new file mode 100644 index 0000000000..e3f6131b1d --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-when/.stderr @@ -0,0 +1,2 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. 
Requires BTest >= 0.63. +received termination signal diff --git a/testing/btest/Baseline/language.pattern-tables-when/out b/testing/btest/Baseline/language.pattern-tables-when/out new file mode 100644 index 0000000000..7367e825e6 --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-when/out @@ -0,0 +1,8 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +schedule populate +populate_a() +gotcha a, [42] +populate_b() +gotcha b, [4242] +populate_c() +gotcha c, [4711] diff --git a/testing/btest/Baseline/language.pattern-tables/.stderr b/testing/btest/Baseline/language.pattern-tables/.stderr new file mode 100644 index 0000000000..49d861c74c --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables/.stderr @@ -0,0 +1 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. diff --git a/testing/btest/Baseline/language.pattern-tables/out b/testing/btest/Baseline/language.pattern-tables/out new file mode 100644 index 0000000000..a74c2d32eb --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables/out @@ -0,0 +1,22 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +indexing empty, 0 +single insert, match, [1] +single insert, non-match, [] +single insert, in, T +single insert, not-in, F +multiple inserts, non-match, [] +multiple inserts, single match, [3] +multiple inserts, double match, [1, 3] +multiple insert, in, T +multiple insert, not-in, F +triple match, [1, 3, 4] +embedded newline, /s operator, [6] +no embedded newline, /s vs. no /s operator, [5, 6, 7] +no embedded newline, case sensitive, /i vs. 
no /i operator, [7] +single delete, no more triple match, [1, 4] +double delete, no more double match, [4] +delete of non-existing pattern, [4] +shallow copy matches multi, [5, 6, 7] +deep copy matches multi, [5, 6, 7] +delete of entire table, [] +reassignment of table, [] diff --git a/testing/btest/language/pattern-sets.zeek b/testing/btest/language/pattern-sets.zeek new file mode 100644 index 0000000000..1b53eb890f --- /dev/null +++ b/testing/btest/language/pattern-sets.zeek @@ -0,0 +1,53 @@ +# @TEST-DOC: set[pattern] also supports parallel RE matching using in expression + +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +global ps: set[pattern]; + +event zeek_init() + { + assert "foo" !in ps; + print "in empty", "foo" in ps; + + add ps[/foo/]; + + assert "foo" in ps; + assert "foox" !in ps; + print "single insert, foo in", "foo" in ps; + print "single insert, foox not-in", "foox" !in ps; + + add ps[/bar/]; + add ps[/(foo|bletch)/]; + + assert "x" !in ps; + assert "bar" in ps; + assert "foo" in ps; + assert "bletch" in ps; + assert "foobletch" !in ps; + + print "multiple inserts, x not-in", "x" !in ps; + print "multiple insert, foo in", "foo" in ps; + print "multiple insert, bletch in", "bletch" in ps; + print "multiple insert, foobletch not-in", "foobletch" !in ps; + + # After delete of /foo/, still matches "foo" due to /(foo|bletch)/ + delete ps[/foo/]; + assert "foo" in ps; + assert "bletch" in ps; + print "single delete, bletch in", "bletch" in ps; + print "single delete, foo in", "foo" in ps; + + delete ps[/(foo|bletch)/]; + assert "foo" !in ps; + assert "bar" in ps; + assert "bletch" !in ps; + print "two deletes, bletch not-in", "bletch" !in ps; + print "two deletes, foo not-in", "foo" !in ps; + print "two deletes, bar in", "bar" in ps; + + clear_table(ps); + assert "bar" !in ps; + print "clear_table, bar not-in", "bar" !in ps; + } diff --git a/testing/btest/language/pattern-tables-stats.zeek 
b/testing/btest/language/pattern-tables-stats.zeek new file mode 100644 index 0000000000..0c25915474 --- /dev/null +++ b/testing/btest/language/pattern-tables-stats.zeek @@ -0,0 +1,56 @@ +# @TEST-DOC: Test table_pattern_matcher_stats() +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +global pt: table[pattern] of count; +global ps: set[pattern]; + +event zeek_init() + { + print "initial stats", table_pattern_matcher_stats(pt); + pt[/foo/] = 1; + print "populated stats", table_pattern_matcher_stats(pt); + + print pt["foo"], pt["foox"], "foo" in pt, "foox" in pt; + print "after lookup stats", table_pattern_matcher_stats(pt); + + pt[/bar/] = 2; + pt[/(foo|bletch)/] = 3; + print "reset stats", table_pattern_matcher_stats(pt); + + print pt["x"], pt["bletch"], sort(pt["foo"]), "foo" in pt, "x" in pt; + print "after more lookup stats", table_pattern_matcher_stats(pt); + + delete pt[/bar/]; + print "reset stats after delete", table_pattern_matcher_stats(pt); + + print pt["x"], pt["bletch"], sort(pt["foo"]); + print "after even more lookup stats", table_pattern_matcher_stats(pt); + + pt = table(); + print "reset after reassignment", table_pattern_matcher_stats(pt); + } + +event zeek_init() &priority=-10 + { + print "set initial stats", table_pattern_matcher_stats(ps); + add ps[/foo/]; + print "set populated stats", table_pattern_matcher_stats(ps); + + print "foo" in ps, "foox" in ps; + print "set after lookup stats", table_pattern_matcher_stats(ps); + + add ps[/bar/]; + add ps[/(foo|bletch)/]; + print "set reset stats", table_pattern_matcher_stats(ps); + + print "x" in ps, "bletch" in ps; + print "set after more lookup stats", table_pattern_matcher_stats(ps); + + delete ps[/bar/]; + print "set reset stats after delete", table_pattern_matcher_stats(ps); + + ps = set(); + print "set reset after reassignment", table_pattern_matcher_stats(ps); + } diff --git a/testing/btest/language/pattern-tables-when.zeek
b/testing/btest/language/pattern-tables-when.zeek
new file mode 100644
index 0000000000..d371cccf2c
--- /dev/null
+++ b/testing/btest/language/pattern-tables-when.zeek
@@ -0,0 +1,54 @@
+# @TEST-EXEC: zeek -b %INPUT >out
+# @TEST-EXEC: btest-diff out
+# @TEST-EXEC: btest-diff .stderr
+
+global pt: table[pattern] of count;
+
+redef exit_only_after_terminate = T;
+
+event populate_c()
+	{
+	print "populate_c()";
+	pt[/c/] = 4711;
+	terminate();
+	}
+
+event populate_b()
+	{
+	print "populate_b()";
+	pt[/b/] = 4242;
+	schedule 1msec { populate_c() };
+	}
+
+event populate_a()
+	{
+	print "populate_a()";
+	pt[/a/] = 42;
+	schedule 1msec { populate_b() };
+	}
+
+event hard_exit()
+	{
+	if ( ! zeek_is_terminating() )
+		exit(1);
+	}
+
+event zeek_init()
+	{
+	schedule 5sec { hard_exit() };
+
+	when ( |pt["a"]| > 0 ) {
+		print "gotcha a", pt["a"];
+	}
+
+	when ( |pt["b"]| > 0 ) {
+		print "gotcha b", pt["b"];
+	}
+
+	when ( "c" in pt ) {
+		print "gotcha c", pt["c"];
+	}
+
+	print "schedule populate";
+	schedule 1msec { populate_a() };
+	}
diff --git a/testing/btest/language/pattern-tables.zeek b/testing/btest/language/pattern-tables.zeek
new file mode 100644
index 0000000000..5331bd6315
--- /dev/null
+++ b/testing/btest/language/pattern-tables.zeek
@@ -0,0 +1,60 @@
+# @TEST-EXEC: zeek -b %INPUT >out
+# @TEST-EXEC: btest-diff out
+# @TEST-EXEC: btest-diff .stderr
+
+global pt: table[pattern] of count;
+
+event zeek_init()
+	{
+	# test_case("indexing empty", |pt["foo"] == 0|);
+	print "indexing empty", |pt["foo"]|;
+
+	pt[/foo/] = 1;
+
+	print "single insert, match", pt["foo"];
+	print "single insert, non-match", pt["foox"];
+	print "single insert, in", "foo" in pt;
+	print "single insert, not-in", "foox" in pt;
+
+	pt[/bar/] = 2;
+	pt[/(foo|bletch)/] = 3;
+
+	print "multiple inserts, non-match", pt["x"];
+	print "multiple inserts, single match", pt["bletch"];
+	print "multiple inserts, double match", sort(pt["foo"]);
+	print "multiple insert, in", "foo" in pt;
+	print "multiple insert, not-in", "x" in pt;
+
+	pt[/(foo|bletch|xyz)/] = 4;
+	print "triple match", sort(pt["foo"]);
+
+	pt[/dog.*cat/] = 5;
+	pt[/dog.*cat/s] = 6;
+	pt[/dog.*cat/i] = 7;
+	print "embedded newline, /s operator", pt["dog\ncat"];
+	print "no embedded newline, /s vs. no /s operator", sort(pt["dogmousecat"]);
+	print "no embedded newline, case sensitive, /i vs. no /i operator", sort(pt["dogmouseCat"]);
+
+	delete pt[/(foo|bletch)/];
+	print "single delete, no more triple match", pt["foo"];
+
+	delete pt[/bar/];
+	delete pt[/foo/];
+	print "double delete, no more double match", pt["foo"];
+
+	delete pt[/nosuchpattern/];
+	print "delete of non-existing pattern", pt["foo"];
+
+	local copy_pt = pt;
+	print "shallow copy matches multi", sort(copy_pt["dogmousecat"]); # Index the copy, not pt, so the alias is exercised.
+
+	local deep_copy_pt = copy(pt);
+	print "deep copy matches multi", sort(deep_copy_pt["dogmousecat"]); # Index the deep copy so its own matching is exercised.
+
+	clear_table(pt);
+	print "delete of entire table", pt["foo"];
+
+	local replacement_pt: table[pattern] of count;
+	deep_copy_pt = replacement_pt;
+	print "reassignment of table", deep_copy_pt["dogmousecat"];
+	}