From 699549eb45dc3a8d127e5a5a38e7544106e45cf0 Mon Sep 17 00:00:00 2001 From: Vern Paxson Date: Wed, 1 Nov 2023 11:23:48 +0100 Subject: [PATCH 01/18] support for indexing "table[pattern] of T" with strings to get multi-matches --- src/Expr.cc | 19 ++++++++- src/Expr.h | 1 + src/RE.cc | 44 ++++++++++++++++++++ src/RE.h | 25 +++++++++++ src/Val.cc | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ src/Val.h | 8 ++++ src/re-parse.y | 18 +++++++- src/re-scan.l | 4 ++ 8 files changed, 228 insertions(+), 2 deletions(-) diff --git a/src/Expr.cc b/src/Expr.cc index 32445ccd69..18deec804c 100644 --- a/src/Expr.cc +++ b/src/Expr.cc @@ -2382,6 +2382,18 @@ IndexExpr::IndexExpr(ExprPtr arg_op1, ListExprPtr arg_op2, bool arg_is_slice, bo if ( IsError() ) return; + if ( op1->GetType()->Tag() == TYPE_TABLE ) { // Check for a table[pattern] being indexed by a string + auto table_type = op1->GetType()->AsTableType(); + auto& it = table_type->GetIndexTypes(); + auto& rhs_type = op2->GetType()->AsTypeList()->GetTypes(); + if ( it.size() == 1 && it[0]->Tag() == TYPE_PATTERN && table_type->Yield() && rhs_type.size() == 1 && + rhs_type[0]->Tag() == TYPE_STRING ) { + is_pattern_table = true; + SetType(make_intrusive(op1->GetType()->Yield())); + return; + } + } + int match_type = op1->GetType()->MatchesIndex(op2->AsListExpr()); if ( match_type == DOES_NOT_MATCH_INDEX ) { @@ -2532,7 +2544,12 @@ ValPtr IndexExpr::Fold(Val* v1, Val* v2) const { return index_slice(vect, lv); } break; - case TYPE_TABLE: v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2}); break; + case TYPE_TABLE: + if ( is_pattern_table ) + return v1->AsTableVal()->LookupPattern(v2->AsListVal()->Idx(0)->AsStringVal()); + + v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2}); + break; case TYPE_STRING: return index_string(v1->AsString(), v2->AsListVal()); diff --git a/src/Expr.h b/src/Expr.h index 3bec879dbb..a45a1528b8 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -1012,6 +1012,7 @@ protected: bool is_slice; bool 
is_inside_when; + bool is_pattern_table = false; }; // The following execute the heart of IndexExpr functionality for diff --git a/src/RE.cc b/src/RE.cc index 67144e9bd4..87b69d29a2 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -251,6 +251,33 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n) { return 0; } +void Specific_RE_Matcher::MatchDisjunction(const String* s, std::vector& matches) { + auto bv = s->Bytes(); + auto n = s->Len(); + + ASSERT(dfa); + + DFA_State* d = dfa->StartState(); + d = d->Xtion(ecs[SYM_BOL], dfa); + + while ( d ) { + if ( --n < 0 ) + break; + + int ec = ecs[*(bv++)]; + d = d->Xtion(ec, dfa); + } + + if ( d ) + d = d->Xtion(ecs[SYM_EOL], dfa); + + if ( d ) + if ( auto a_set = d->Accept() ) + for ( auto a : *a_set ) + matches.push_back(a); +} + + void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); } inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) { @@ -425,6 +452,23 @@ void RE_Matcher::MakeSingleLine() { bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); } +RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector& REs) { + matcher = std::make_unique(detail::MATCH_EXACTLY); + + std::string disjunction; + for ( auto re : REs ) + disjunction += std::string("||") + re->PatternText(); + + matcher->SetPat(disjunction.c_str()); + auto status = matcher->Compile(); + ASSERT(status); +} + +void RE_DisjunctiveMatcher::Match(const String* s, std::vector& matches) { + matches.clear(); + return matcher->MatchDisjunction(s, matches); +} + TEST_SUITE("re_matcher") { TEST_CASE("simple_pattern") { RE_Matcher match("[0-9]+"); diff --git a/src/RE.h b/src/RE.h index f68f3482bb..8d7b28da30 100644 --- a/src/RE.h +++ b/src/RE.h @@ -36,6 +36,7 @@ extern CCL* curr_ccl; extern NFA_Machine* nfa; extern Specific_RE_Matcher* rem; extern const char* RE_parse_input; +extern int RE_accept_num; extern int clower(int); extern void synerr(const char str[]); @@ -104,6 +105,17 @@ 
public: int Match(const String* s); int Match(const u_char* bv, int n); + // A disjunction is a collection of regular expressions (that under + // the hood are matches as a single RE, not serially) for which + // the match operation returns *all* of the matches. Disjunctions + // are constructed using the internal "||" RE operator, and the + // matches are returned as indices into the position, left-to-right, + // of which REs matched. IMPORTANT: the first RE is numbered 1, not 0. + // + // Note that there's no guarantee regarding the ordering of the + // returned matches if there is more than one. + void MatchDisjunction(const String* s, std::vector& matches); + int LongestMatch(const char* s); int LongestMatch(const String* s); int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true); @@ -244,4 +256,17 @@ protected: bool is_single_line = false; }; +class RE_DisjunctiveMatcher final { +public: + // Takes a collection of individual REs and builds a disjunctive + // matcher for the set. + RE_DisjunctiveMatcher(const std::vector& REs); + + // See MatchDisjunction() above. + void Match(const String* s, std::vector& matches); + +private: + std::unique_ptr matcher; +}; + } // namespace zeek diff --git a/src/Val.cc b/src/Val.cc index 397c1d777c..d861d69386 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1427,6 +1427,93 @@ static void find_nested_record_types(const TypePtr& t, std::set* fo } } +using PatternValPtr = IntrusivePtr; + +// Support class for returning multiple values from a table[pattern] +// when indexed with a string. 
+class TablePatternMatcher { +public: + TablePatternMatcher(const TableVal* _tbl, TypePtr _yield) : tbl(_tbl) { + vtype = make_intrusive(std::move(_yield)); + } + ~TablePatternMatcher() { Clear(); } + + void Insert(ValPtr pat, ValPtr yield) { Clear(); } + void Remove(ValPtr pat) { Clear(); } + + void Clear() { + delete matcher; + matcher = nullptr; + } + + VectorValPtr Lookup(const StringVal* s); + +private: + void Build(); + + const TableVal* tbl; + VectorTypePtr vtype; + + // If matcher is nil then we know we need to build it. This gives + // us an easy way to cache matchers in the common case that these + // sorts of tables don't change their elements very often (indeed, + // they'll frequently be constructed just once), and also keeps us + // from having to re-build the matcher on every insert/delete in + // the common case that a whole bunch of those are done in a single + // batch. + RE_DisjunctiveMatcher* matcher = nullptr; + + // Maps matcher values to corresponding yields. When building the + // matcher we insert a nil at the head to accommodate how + // disjunctive matchers use numbering starting at 1 rather than 0. + std::vector matcher_yields; +}; + +VectorValPtr TablePatternMatcher::Lookup(const StringVal* s) { + auto results = make_intrusive(vtype); + + if ( ! matcher ) { + if ( tbl->Get()->Length() == 0 ) + return results; + + Build(); + } + + std::vector matches; + matcher->Match(s->AsString(), matches); + + for ( auto m : matches ) + results->Append(matcher_yields[m]); + + return results; +} + +void TablePatternMatcher::Build() { + matcher_yields.clear(); + matcher_yields.push_back(nullptr); + + auto& tbl_dict = *tbl->Get(); + auto& tbl_hash = *tbl->GetTableHash(); + std::vector patterns; + + // We need to hold on to recovered hash key values so they don't + // get lost once a loop iteration goes out of scope. 
+ std::vector hash_key_vals; + + for ( auto& iter : tbl_dict ) { + auto k = iter.GetHashKey(); + auto v = iter.value; + auto vl = tbl_hash.RecoverVals(*k); + + patterns.push_back(vl->AsListVal()->Idx(0)->AsPattern()); + matcher_yields.push_back(v->GetVal()); + + hash_key_vals.push_back(std::move(vl)); + } + + matcher = new RE_DisjunctiveMatcher(patterns); +} + TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) { bool ordered = (a != nullptr && a->Find(detail::ATTR_ORDERED) != nullptr); Init(std::move(t), ordered); @@ -1460,6 +1547,10 @@ void TableVal::Init(TableTypePtr t, bool ordered) { else subnets = nullptr; + auto& it = table_type->GetIndexTypes(); + if ( it.size() == 1 && it[0]->Tag() == TYPE_PATTERN && table_type->Yield() ) + pattern_matcher = new TablePatternMatcher(this, table_type->Yield()); + table_hash = new detail::CompositeHash(table_type->GetIndices()); if ( ordered ) table_val = new PDict(DictOrder::ORDERED); @@ -1476,6 +1567,7 @@ TableVal::~TableVal() { delete table_hash; delete table_val; delete subnets; + delete pattern_matcher; delete expire_iterator; } @@ -1486,6 +1578,9 @@ void TableVal::RemoveAll() { delete table_val; table_val = new PDict; table_val->SetDeleteFunc(table_entry_val_delete_func); + + if ( pattern_matcher ) + pattern_matcher->Clear(); } int TableVal::Size() const { return table_val->Length(); } @@ -1570,6 +1665,9 @@ bool TableVal::Assign(ValPtr index, ValPtr new_val, bool broker_forward, bool* i return false; } + if ( pattern_matcher ) + pattern_matcher->Insert(index->AsListVal()->Idx(0), new_val); + return Assign(std::move(index), std::move(k), std::move(new_val), broker_forward, iterators_invalidated); } @@ -1925,6 +2023,13 @@ TableValPtr TableVal::LookupSubnetValues(const SubNetVal* search) { return nt; } +VectorValPtr TableVal::LookupPattern(const StringVal* s) { + if ( ! 
pattern_matcher ) + reporter->InternalError("LookupPattern called on wrong table type"); + + return pattern_matcher->Lookup(s); +} + bool TableVal::UpdateTimestamp(Val* index) { TableEntryVal* v; @@ -2105,8 +2210,14 @@ ValPtr TableVal::Remove(const Val& index, bool broker_forward, bool* iterators_i va = v->GetVal() ? v->GetVal() : IntrusivePtr{NewRef{}, this}; if ( subnets && ! subnets->Remove(&index) ) + // VP: not clear to me this should be an internal warning, + // since Zeek doesn't otherwise complain about removing + // non-existent table elements. reporter->InternalWarning("index not in prefix table"); + if ( pattern_matcher ) + pattern_matcher->Remove(index.AsListVal()->Idx(0)); + delete v; Modified(); diff --git a/src/Val.h b/src/Val.h index 9ddeec842f..e0c67e91f8 100644 --- a/src/Val.h +++ b/src/Val.h @@ -718,6 +718,8 @@ protected: TableVal* table; }; +class TablePatternMatcher; + class TableVal final : public Val, public notifier::detail::Modifiable { public: explicit TableVal(TableTypePtr t, detail::AttributesPtr attrs = nullptr); @@ -863,6 +865,11 @@ public: // Causes an internal error if called for any other kind of table. TableValPtr LookupSubnetValues(const SubNetVal* s); + // For a table[pattern], return a vector of all yields matching + // the given string. + // Causes an internal error if called for any other kind of table. + VectorValPtr LookupPattern(const StringVal* s); + // Sets the timestamp for the given index to network time. // Returns false if index does not exist. 
bool UpdateTimestamp(Val* index); @@ -1032,6 +1039,7 @@ protected: TableValTimer* timer; RobustDictIterator* expire_iterator; detail::PrefixTable* subnets; + TablePatternMatcher* pattern_matcher = nullptr; ValPtr def_val; detail::ExprPtr change_func; std::string broker_store; diff --git a/src/re-parse.y b/src/re-parse.y index 2d6672df8d..2ee6bec9e2 100644 --- a/src/re-parse.y +++ b/src/re-parse.y @@ -21,6 +21,7 @@ void yyerror(const char msg[]); %} %token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE +%token TOK_DISJUNCTION %union { int int_val; @@ -32,7 +33,7 @@ void yyerror(const char msg[]); %type TOK_CHAR TOK_NUMBER %type TOK_CCE %type TOK_CCL ccl full_ccl -%type re singleton series string +%type re singleton series string disjunction %destructor { delete $$; } @@ -40,6 +41,9 @@ void yyerror(const char msg[]); flexrule : re { $1->AddAccept(1); zeek::detail::nfa = $1; } + | disjunction + { zeek::detail::nfa = $1; } + | error { return 1; } ; @@ -51,6 +55,18 @@ re : re '|' series { $$ = new zeek::detail::NFA_Machine(new zeek::detail::EpsilonState()); } ; +disjunction : disjunction TOK_DISJUNCTION re + { + $3->AddAccept(++zeek::detail::RE_accept_num); + $$ = zeek::detail::make_alternate($1, $3); + } + | TOK_DISJUNCTION re + { + $2->AddAccept(++zeek::detail::RE_accept_num); + $$ = $2; + } + ; + series : series singleton { $1->AppendMachine($2); $$ = $1; } | singleton diff --git a/src/re-scan.l b/src/re-scan.l index f382393477..7df4665640 100644 --- a/src/re-scan.l +++ b/src/re-scan.l @@ -23,6 +23,7 @@ #include "re-parse.h" const char* zeek::detail::RE_parse_input = nullptr; +int zeek::detail::RE_accept_num = 0; #define RET_CCE(func) \ BEGIN(SC_CCL); \ @@ -143,6 +144,8 @@ CCL_EXPR ("[:"[[:alpha:]]+":]") } } + "||" return TOK_DISJUNCTION; + [|*+?.(){}] return yytext[0]; . 
yylval.int_val = yytext[0]; return TOK_CHAR; \n return 0; // treat as end of pattern @@ -237,6 +240,7 @@ YY_BUFFER_STATE RE_buf; void RE_set_input(const char* str) { zeek::detail::RE_parse_input = str; + zeek::detail::RE_accept_num = 0; RE_buf = yy_scan_string(str); } From fd1094a184bad843eadc75761bdf68ec07b823d7 Mon Sep 17 00:00:00 2001 From: Vern Paxson Date: Thu, 26 Oct 2023 11:38:25 -0400 Subject: [PATCH 02/18] BTests for indexing "table[pattern] of T" with strings --- .../Baseline/language.pattern-tables/.stderr | 1 + .../Baseline/language.pattern-tables/out | 18 ++++++ testing/btest/language/pattern-tables.zeek | 56 +++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 testing/btest/Baseline/language.pattern-tables/.stderr create mode 100644 testing/btest/Baseline/language.pattern-tables/out create mode 100644 testing/btest/language/pattern-tables.zeek diff --git a/testing/btest/Baseline/language.pattern-tables/.stderr b/testing/btest/Baseline/language.pattern-tables/.stderr new file mode 100644 index 0000000000..49d861c74c --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables/.stderr @@ -0,0 +1 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. diff --git a/testing/btest/Baseline/language.pattern-tables/out b/testing/btest/Baseline/language.pattern-tables/out new file mode 100644 index 0000000000..209e9c48b2 --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables/out @@ -0,0 +1,18 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +indexing empty, 0 +single insert, match, [1] +single insert, non-match, [] +multiple inserts, non-match, [] +multiple inserts, single match, [3] +multiple inserts, double match, [1, 3] +triple match, [1, 3, 4] +embedded newline, /s operator, [6] +no embedded newline, /s vs. no /s operator, [5, 6, 7] +no embedded newline, case sensitive, /i vs. 
no /i operator, [7] +single delete, no more triple match, [1, 4] +double delete, no more double match, [4] +delete of non-existing pattern, [4] +shallow copy matches multi, [5, 6, 7] +deep copy matches multi, [5, 6, 7] +delete of entire table, [] +reassignment of table, [] diff --git a/testing/btest/language/pattern-tables.zeek b/testing/btest/language/pattern-tables.zeek new file mode 100644 index 0000000000..7ca5c42e46 --- /dev/null +++ b/testing/btest/language/pattern-tables.zeek @@ -0,0 +1,56 @@ +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +global pt: table[pattern] of count; + +event zeek_init() + { + # test_case("indexing empty", |pt["foo"] == 0|); + print "indexing empty", |pt["foo"]|; + + pt[/foo/] = 1; + + print "single insert, match", pt["foo"]; + print "single insert, non-match", pt["foox"]; + + pt[/bar/] = 2; + pt[/(foo|bletch)/] = 3; + + print "multiple inserts, non-match", pt["x"]; + print "multiple inserts, single match", pt["bletch"]; + print "multiple inserts, double match", sort(pt["foo"]); + + pt[/(foo|bletch|xyz)/] = 4; + print "triple match", sort(pt["foo"]); + + pt[/dog.*cat/] = 5; + pt[/dog.*cat/s] = 6; + pt[/dog.*cat/i] = 7; + print "embedded newline, /s operator", pt["dog\ncat"]; + print "no embedded newline, /s vs. no /s operator", sort(pt["dogmousecat"]); + print "no embedded newline, case sensitive, /i vs. 
no /i operator", sort(pt["dogmouseCat"]); + + delete pt[/(foo|bletch)/]; + print "single delete, no more triple match", pt["foo"]; + + delete pt[/bar/]; + delete pt[/foo/]; + print "double delete, no more double match", pt["foo"]; + + delete pt[/nosuchpattern/]; + print "delete of non-existing pattern", pt["foo"]; + + local copy_pt = pt; + print "shallow copy matches multi", sort(pt["dogmousecat"]); + + local deep_copy_pt = copy(pt); + print "deep copy matches multi", sort(pt["dogmousecat"]); + + clear_table(pt); + print "delete of entire table", pt["foo"]; + + local replacement_pt: table[pattern] of count; + deep_copy_pt = replacement_pt; + print "reassignment of table", deep_copy_pt["dogmousecat"]; + } From 61fcca848200f36b19877b8860fd5c0bf9b0b123 Mon Sep 17 00:00:00 2001 From: Vern Paxson Date: Wed, 1 Nov 2023 11:44:58 +0100 Subject: [PATCH 03/18] script optimization support for using strings to index table[pattern] values --- src/script_opt/CPP/Exprs.cc | 21 ++++++++++++++++++++- src/script_opt/CPP/RuntimeOps.cc | 11 +++++++++++ src/script_opt/CPP/RuntimeOps.h | 5 ++++- src/script_opt/ZAM/Expr.cc | 29 ++++++++++++++++++++++++++--- src/script_opt/ZAM/Ops.in | 20 ++++++++++++++++++++ 5 files changed, 81 insertions(+), 5 deletions(-) diff --git a/src/script_opt/CPP/Exprs.cc b/src/script_opt/CPP/Exprs.cc index c943a14b25..afef3ce3cd 100644 --- a/src/script_opt/CPP/Exprs.cc +++ b/src/script_opt/CPP/Exprs.cc @@ -388,7 +388,26 @@ string CPPCompile::GenIndexExpr(const Expr* e, GenType gt) { string func; if ( aggr_t->Tag() == TYPE_TABLE ) { - func = inside_when ? 
"when_index_table__CPP" : "index_table__CPP"; + auto ind_expr = e->GetOp2()->AsListExpr()->Exprs()[0]; + auto is_pat_str_ind = false; + + auto& indices = aggr_t->AsTableType()->GetIndices()->GetTypes(); + if ( indices.size() == 1 && indices[0]->Tag() == TYPE_PATTERN && ind_expr->GetType()->Tag() == TYPE_STRING ) + is_pat_str_ind = true; + + if ( inside_when ) { + if ( is_pat_str_ind ) + func = "when_index_patstr__CPP"; + else + func = "when_index_table__CPP"; + } + else { + if ( is_pat_str_ind ) + func = "index_patstr_table__CPP"; + else + func = "index_table__CPP"; + } + gen = func + "(" + GenExpr(aggr, GEN_NATIVE) + ", {" + GenExpr(e->GetOp2(), GEN_VAL_PTR) + "})"; } diff --git a/src/script_opt/CPP/RuntimeOps.cc b/src/script_opt/CPP/RuntimeOps.cc index ea8a81ac86..273b77b315 100644 --- a/src/script_opt/CPP/RuntimeOps.cc +++ b/src/script_opt/CPP/RuntimeOps.cc @@ -44,6 +44,10 @@ ValPtr index_table__CPP(const TableValPtr& t, vector indices) { return v; } +ValPtr index_patstr_table__CPP(const TableValPtr& t, vector indices) { + return t->LookupPattern(indices[0]->AsStringVal()); +} + ValPtr index_vec__CPP(const VectorValPtr& vec, int index) { if ( index < 0 ) index += vec->Size(); @@ -66,6 +70,13 @@ ValPtr when_index_table__CPP(const TableValPtr& t, vector indices) { return v; } +ValPtr when_index_patstr__CPP(const TableValPtr& t, vector indices) { + auto v = index_patstr_table__CPP(t, std::move(indices)); + if ( v && IndexExprWhen::evaluating > 0 ) + IndexExprWhen::results.emplace_back(v); + return v; +} + ValPtr when_index_vec__CPP(const VectorValPtr& vec, int index) { auto v = index_vec__CPP(vec, index); if ( v && IndexExprWhen::evaluating > 0 ) diff --git a/src/script_opt/CPP/RuntimeOps.h b/src/script_opt/CPP/RuntimeOps.h index b35dd2b213..5ef5ba0efa 100644 --- a/src/script_opt/CPP/RuntimeOps.h +++ b/src/script_opt/CPP/RuntimeOps.h @@ -32,13 +32,16 @@ extern ListValPtr index_val__CPP(std::vector indices); // Returns the value corresponding to indexing the given 
table/vector/string // with the given set of indices. These are functions rather than something // generated directly so that they can package up the error handling for -// the case where there's no such index. +// the case where there's no such index. "patstr" refers to indexing a +// table[pattern] of X with a string value. extern ValPtr index_table__CPP(const TableValPtr& t, std::vector indices); +extern ValPtr index_patstr_table__CPP(const TableValPtr& t, std::vector indices); extern ValPtr index_vec__CPP(const VectorValPtr& vec, int index); extern ValPtr index_string__CPP(const StringValPtr& svp, std::vector indices); // The same, but for indexing happening inside a "when" clause. extern ValPtr when_index_table__CPP(const TableValPtr& t, std::vector indices); +extern ValPtr when_index_patstr__CPP(const TableValPtr& t, std::vector indices); extern ValPtr when_index_vec__CPP(const VectorValPtr& vec, int index); // For vector slices, we use the existing index_slice(), but we need a diff --git a/src/script_opt/ZAM/Expr.cc b/src/script_opt/ZAM/Expr.cc index 854fb6fde5..ea5af97939 100644 --- a/src/script_opt/ZAM/Expr.cc +++ b/src/script_opt/ZAM/Expr.cc @@ -573,6 +573,19 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T int n = l->Exprs().length(); auto n2tag = n2t->Tag(); + // Whether this is an instance of indexing a table[pattern] of X + // with a string. + bool is_pat_str_ind = false; + + if ( n2tag == TYPE_TABLE && n == 1 ) { + auto& ind_types = n2t->AsTableType()->GetIndices(); + auto& ind_type0 = ind_types->GetTypes()[0]; + auto ind = l->Exprs()[0]; + + if ( ind_type0->Tag() == TYPE_PATTERN && ind->GetType()->Tag() == TYPE_STRING ) + is_pat_str_ind = true; + } + if ( n == 1 && ! 
in_when ) { auto ind = l->Exprs()[0]; auto var_ind = ind->Tag() == EXPR_NAME; @@ -640,7 +653,8 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T if ( n2tag == TYPE_TABLE ) { if ( n3 ) { int n3_slot = FrameSlot(n3); - auto zop = AssignmentFlavor(OP_TABLE_INDEX1_VVV, n1->GetType()->Tag()); + auto op = is_pat_str_ind ? OP_TABLE_PATSTR_INDEX1_VVV : OP_TABLE_INDEX1_VVV; + auto zop = AssignmentFlavor(op, n1->GetType()->Tag()); z = ZInstI(zop, Frame1Slot(n1, zop), n2_slot, n3_slot); z.SetType(n3->GetType()); } @@ -648,7 +662,8 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T else { ASSERT(c3); - auto zop = AssignmentFlavor(OP_TABLE_INDEX1_VVC, n1->GetType()->Tag()); + auto op = is_pat_str_ind ? OP_TABLE_PATSTR_INDEX1_VVC : OP_TABLE_INDEX1_VVC; + auto zop = AssignmentFlavor(op, n1->GetType()->Tag()); z = ZInstI(zop, Frame1Slot(n1, zop), n2_slot, c3); } @@ -674,7 +689,15 @@ const ZAMStmt ZAMCompiler::CompileIndex(const NameExpr* n1, int n2_slot, const T break; case TYPE_TABLE: - op = in_when ? 
OP_WHEN_TABLE_INDEX_VV : OP_TABLE_INDEX_VV; + if ( in_when ) { + if ( is_pat_str_ind ) + op = OP_WHEN_PATSTR_INDEX_VV; + else + op = OP_WHEN_TABLE_INDEX_VV; + } + else + op = OP_TABLE_INDEX_VV; + z = ZInstI(op, Frame1Slot(n1, op), n2_slot); z.SetType(n1->GetType()); break; diff --git a/src/script_opt/ZAM/Ops.in b/src/script_opt/ZAM/Ops.in index d9acb601cc..87b820177f 100644 --- a/src/script_opt/ZAM/Ops.in +++ b/src/script_opt/ZAM/Ops.in @@ -1057,6 +1057,15 @@ macro EvalTableIndex(index) break; } +internal-op When-PatStr-Index +type VV +eval auto args = z.aux->ToListVal(frame); + auto arg0 = args->Idx(0); + auto v = frame[z.v2].table_val->LookupPattern(arg0->AsStringVal()); + if ( IndexExprWhen::evaluating > 0 ) + IndexExprWhen::results.emplace_back(v); + AssignV1(BuildVal(v, z.t)) + internal-assignment-op Table-Index1 type VVV assign-val v @@ -1068,6 +1077,17 @@ type VVC assign-val v eval EvalTableIndex(z.c.ToVal(z.t)) +# Same, but for indexing table[pattern] of X with a string. +internal-assignment-op Table-PatStr-Index1 +type VVV +assign-val v +eval auto v = frame[z.v2].table_val->LookupPattern(frame[z.v3].AsString()); + +internal-assignment-op Table-PatStr-Index1 +type VVC +assign-val v +eval auto v = frame[z.v2].table_val->LookupPattern(z.c.AsString()); + # This version is for a variable v3. 
internal-op Index-String type VVV From b55e1a122e5e93c84f3712bb576ef179213231cd Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 11:51:25 +0100 Subject: [PATCH 04/18] Reuse CompileSet() instead of || string formatting --- src/RE.cc | 19 +++++++++++++------ src/RE.h | 1 - src/re-parse.y | 18 +----------------- src/re-scan.l | 4 ---- 4 files changed, 14 insertions(+), 28 deletions(-) diff --git a/src/RE.cc b/src/RE.cc index 87b69d29a2..70bb71b055 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -172,6 +172,10 @@ bool Specific_RE_Matcher::CompileSet(const string_list& set, const int_list& idx dfa = new DFA_Machine(nfa, EC()); ecs = EC()->EquivClasses(); + // dfa took ownership + Unref(nfa); + nfa = nullptr; + return true; } @@ -455,13 +459,16 @@ bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_ex RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector& REs) { matcher = std::make_unique(detail::MATCH_EXACTLY); - std::string disjunction; - for ( auto re : REs ) - disjunction += std::string("||") + re->PatternText(); + zeek::detail::string_list sl; + zeek::detail::int_list il; - matcher->SetPat(disjunction.c_str()); - auto status = matcher->Compile(); - ASSERT(status); + for ( const auto* re : REs ) { + sl.push_back(const_cast(re->PatternText())); + il.push_back(sl.size()); + } + + if ( ! 
matcher->CompileSet(sl, il) ) + reporter->FatalError("failed compile set for disjunctive matcher"); } void RE_DisjunctiveMatcher::Match(const String* s, std::vector& matches) { diff --git a/src/RE.h b/src/RE.h index 8d7b28da30..52b446a306 100644 --- a/src/RE.h +++ b/src/RE.h @@ -36,7 +36,6 @@ extern CCL* curr_ccl; extern NFA_Machine* nfa; extern Specific_RE_Matcher* rem; extern const char* RE_parse_input; -extern int RE_accept_num; extern int clower(int); extern void synerr(const char str[]); diff --git a/src/re-parse.y b/src/re-parse.y index 2ee6bec9e2..2d6672df8d 100644 --- a/src/re-parse.y +++ b/src/re-parse.y @@ -21,7 +21,6 @@ void yyerror(const char msg[]); %} %token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE -%token TOK_DISJUNCTION %union { int int_val; @@ -33,7 +32,7 @@ void yyerror(const char msg[]); %type TOK_CHAR TOK_NUMBER %type TOK_CCE %type TOK_CCL ccl full_ccl -%type re singleton series string disjunction +%type re singleton series string %destructor { delete $$; } @@ -41,9 +40,6 @@ void yyerror(const char msg[]); flexrule : re { $1->AddAccept(1); zeek::detail::nfa = $1; } - | disjunction - { zeek::detail::nfa = $1; } - | error { return 1; } ; @@ -55,18 +51,6 @@ re : re '|' series { $$ = new zeek::detail::NFA_Machine(new zeek::detail::EpsilonState()); } ; -disjunction : disjunction TOK_DISJUNCTION re - { - $3->AddAccept(++zeek::detail::RE_accept_num); - $$ = zeek::detail::make_alternate($1, $3); - } - | TOK_DISJUNCTION re - { - $2->AddAccept(++zeek::detail::RE_accept_num); - $$ = $2; - } - ; - series : series singleton { $1->AppendMachine($2); $$ = $1; } | singleton diff --git a/src/re-scan.l b/src/re-scan.l index 7df4665640..f382393477 100644 --- a/src/re-scan.l +++ b/src/re-scan.l @@ -23,7 +23,6 @@ #include "re-parse.h" const char* zeek::detail::RE_parse_input = nullptr; -int zeek::detail::RE_accept_num = 0; #define RET_CCE(func) \ BEGIN(SC_CCL); \ @@ -144,8 +143,6 @@ CCL_EXPR ("[:"[[:alpha:]]+":]") } } - "||" return 
TOK_DISJUNCTION; - [|*+?.(){}] return yytext[0]; . yylval.int_val = yytext[0]; return TOK_CHAR; \n return 0; // treat as end of pattern @@ -240,7 +237,6 @@ YY_BUFFER_STATE RE_buf; void RE_set_input(const char* str) { zeek::detail::RE_parse_input = str; - zeek::detail::RE_accept_num = 0; RE_buf = yy_scan_string(str); } From 074f51fc967f7b318c1780eb64e9bfda08d2e626 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 13:42:40 +0100 Subject: [PATCH 05/18] btest: Add test for pattern tables and when --- .../language.pattern-tables-when/.stderr | 2 + .../Baseline/language.pattern-tables-when/out | 6 +++ .../btest/language/pattern-tables-when.zeek | 43 +++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 testing/btest/Baseline/language.pattern-tables-when/.stderr create mode 100644 testing/btest/Baseline/language.pattern-tables-when/out create mode 100644 testing/btest/language/pattern-tables-when.zeek diff --git a/testing/btest/Baseline/language.pattern-tables-when/.stderr b/testing/btest/Baseline/language.pattern-tables-when/.stderr new file mode 100644 index 0000000000..e3f6131b1d --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-when/.stderr @@ -0,0 +1,2 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +received termination signal diff --git a/testing/btest/Baseline/language.pattern-tables-when/out b/testing/btest/Baseline/language.pattern-tables-when/out new file mode 100644 index 0000000000..d9e62df0e8 --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-when/out @@ -0,0 +1,6 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 
+schedule populate +populate_a() +gotcha a, [42] +populate_b() +gotcha b, [4242] diff --git a/testing/btest/language/pattern-tables-when.zeek b/testing/btest/language/pattern-tables-when.zeek new file mode 100644 index 0000000000..df303418c2 --- /dev/null +++ b/testing/btest/language/pattern-tables-when.zeek @@ -0,0 +1,43 @@ +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +global pt: table[pattern] of count; + +redef exit_only_after_terminate = T; + +event populate_b() + { + print "populate_b()"; + pt[/b/] = 4242; + terminate(); + } + +event populate_a() + { + print "populate_a()"; + pt[/a/] = 42; + schedule 1msec { populate_b() }; + } + +event hard_exit() + { + if ( ! zeek_is_terminating() ) + exit(1); + } + +event zeek_init() + { + schedule 5sec { hard_exit() }; + + when ( |pt["a"]| > 0 ) { + print "gotcha a", pt["a"]; + } + + when ( |pt["b"]| > 0 ) { + print "gotcha b", pt["b"]; + } + + print "schedule populate"; + schedule 1msec { populate_a() }; + } From c8bab6a0ec8d59219ccc923909d02ea5e6e1e280 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 14:07:27 +0100 Subject: [PATCH 06/18] IndexType: Add IsPatternIndex(), like IsSubNetIndex() --- src/Expr.cc | 9 ++++----- src/Type.cc | 7 +++++++ src/Type.h | 3 +++ src/Val.cc | 3 +-- src/script_opt/CPP/Exprs.cc | 3 +-- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/Expr.cc b/src/Expr.cc index 18deec804c..9f333c641e 100644 --- a/src/Expr.cc +++ b/src/Expr.cc @@ -2383,11 +2383,10 @@ IndexExpr::IndexExpr(ExprPtr arg_op1, ListExprPtr arg_op2, bool arg_is_slice, bo return; if ( op1->GetType()->Tag() == TYPE_TABLE ) { // Check for a table[pattern] being indexed by a string - auto table_type = op1->GetType()->AsTableType(); - auto& it = table_type->GetIndexTypes(); - auto& rhs_type = op2->GetType()->AsTypeList()->GetTypes(); - if ( it.size() == 1 && it[0]->Tag() == TYPE_PATTERN && table_type->Yield() && rhs_type.size() == 1 && - 
rhs_type[0]->Tag() == TYPE_STRING ) { + const auto& table_type = op1->GetType()->AsTableType(); + const auto& rhs_type = op2->GetType()->AsTypeList()->GetTypes(); + if ( table_type->IsPatternIndex() && table_type->Yield() && rhs_type.size() == 1 && + IsString(rhs_type[0]->Tag()) ) { is_pattern_table = true; SetType(make_intrusive(op1->GetType()->Yield())); return; diff --git a/src/Type.cc b/src/Type.cc index 4fd9fa9f52..a9da69e760 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -389,6 +389,13 @@ bool IndexType::IsSubNetIndex() const { return false; } +bool IndexType::IsPatternIndex() const { + const auto& types = indices->GetTypes(); + if ( types.size() == 1 && types[0]->Tag() == TYPE_PATTERN ) + return true; + return false; +} + detail::TraversalCode IndexType::Traverse(detail::TraversalCallback* cb) const { auto tc = cb->PreType(this); HANDLE_TC_TYPE_PRE(tc); diff --git a/src/Type.h b/src/Type.h index 6b69eb8e37..142fe2d331 100644 --- a/src/Type.h +++ b/src/Type.h @@ -356,6 +356,9 @@ public: // Returns true if this table is solely indexed by subnet. bool IsSubNetIndex() const; + // Returns true if this table has a single index of type pattern. 
+ bool IsPatternIndex() const; + detail::TraversalCode Traverse(detail::TraversalCallback* cb) const override; protected: diff --git a/src/Val.cc b/src/Val.cc index d861d69386..a8f6f0d2e2 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1547,8 +1547,7 @@ void TableVal::Init(TableTypePtr t, bool ordered) { else subnets = nullptr; - auto& it = table_type->GetIndexTypes(); - if ( it.size() == 1 && it[0]->Tag() == TYPE_PATTERN && table_type->Yield() ) + if ( table_type->IsPatternIndex() && table_type->Yield() ) pattern_matcher = new TablePatternMatcher(this, table_type->Yield()); table_hash = new detail::CompositeHash(table_type->GetIndices()); diff --git a/src/script_opt/CPP/Exprs.cc b/src/script_opt/CPP/Exprs.cc index afef3ce3cd..f798b4f91c 100644 --- a/src/script_opt/CPP/Exprs.cc +++ b/src/script_opt/CPP/Exprs.cc @@ -391,8 +391,7 @@ string CPPCompile::GenIndexExpr(const Expr* e, GenType gt) { auto ind_expr = e->GetOp2()->AsListExpr()->Exprs()[0]; auto is_pat_str_ind = false; - auto& indices = aggr_t->AsTableType()->GetIndices()->GetTypes(); - if ( indices.size() == 1 && indices[0]->Tag() == TYPE_PATTERN && ind_expr->GetType()->Tag() == TYPE_STRING ) + if ( aggr_t->AsTableType()->IsPatternIndex() && ind_expr->GetType()->Tag() == TYPE_STRING ) is_pat_str_ind = true; if ( inside_when ) { From 43a54739191f2a7a5e4c24359d6b354ac06aa9a8 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 14:47:10 +0100 Subject: [PATCH 07/18] TablePatternMatcher: Use unique_ptr --- src/Val.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Val.cc b/src/Val.cc index a8f6f0d2e2..58fae0e3d6 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1436,15 +1436,11 @@ public: TablePatternMatcher(const TableVal* _tbl, TypePtr _yield) : tbl(_tbl) { vtype = make_intrusive(std::move(_yield)); } - ~TablePatternMatcher() { Clear(); } void Insert(ValPtr pat, ValPtr yield) { Clear(); } void Remove(ValPtr pat) { Clear(); } - void Clear() { - delete matcher; - matcher = 
nullptr; - } + void Clear() { matcher.reset(); } VectorValPtr Lookup(const StringVal* s); @@ -1461,7 +1457,7 @@ private: // from having to re-build the matcher on every insert/delete in // the common case that a whole bunch of those are done in a single // batch. - RE_DisjunctiveMatcher* matcher = nullptr; + std::unique_ptr matcher = nullptr; // Maps matcher values to corresponding yields. When building the // matcher we insert a nil at the head to accommodate how @@ -1511,7 +1507,7 @@ void TablePatternMatcher::Build() { hash_key_vals.push_back(std::move(vl)); } - matcher = new RE_DisjunctiveMatcher(patterns); + matcher = std::make_unique(patterns); } TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) { From c426304c2795258585c1446f11d652094ccff3f5 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 15:00:05 +0100 Subject: [PATCH 08/18] Val: Move TablePatternMatcher into detail namespace There's only a prototype in the headers anyway, so detail seems better than the public zeek namespace. --- src/Val.cc | 10 ++++------ src/Val.h | 5 ++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/Val.cc b/src/Val.cc index 58fae0e3d6..73bfd2dce7 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1427,11 +1427,9 @@ static void find_nested_record_types(const TypePtr& t, std::set* fo } } -using PatternValPtr = IntrusivePtr; - // Support class for returning multiple values from a table[pattern] // when indexed with a string. -class TablePatternMatcher { +class detail::TablePatternMatcher { public: TablePatternMatcher(const TableVal* _tbl, TypePtr _yield) : tbl(_tbl) { vtype = make_intrusive(std::move(_yield)); @@ -1465,7 +1463,7 @@ private: std::vector matcher_yields; }; -VectorValPtr TablePatternMatcher::Lookup(const StringVal* s) { +VectorValPtr detail::TablePatternMatcher::Lookup(const StringVal* s) { auto results = make_intrusive(vtype); if ( ! 
matcher ) { @@ -1484,7 +1482,7 @@ VectorValPtr TablePatternMatcher::Lookup(const StringVal* s) { return results; } -void TablePatternMatcher::Build() { +void detail::TablePatternMatcher::Build() { matcher_yields.clear(); matcher_yields.push_back(nullptr); @@ -1544,7 +1542,7 @@ void TableVal::Init(TableTypePtr t, bool ordered) { subnets = nullptr; if ( table_type->IsPatternIndex() && table_type->Yield() ) - pattern_matcher = new TablePatternMatcher(this, table_type->Yield()); + pattern_matcher = new detail::TablePatternMatcher(this, table_type->Yield()); table_hash = new detail::CompositeHash(table_type->GetIndices()); if ( ordered ) diff --git a/src/Val.h b/src/Val.h index e0c67e91f8..3b5176849b 100644 --- a/src/Val.h +++ b/src/Val.h @@ -51,6 +51,7 @@ class Frame; class PrefixTable; class CompositeHash; class HashKey; +class TablePatternMatcher; class ValTrace; class ZBody; @@ -718,8 +719,6 @@ protected: TableVal* table; }; -class TablePatternMatcher; - class TableVal final : public Val, public notifier::detail::Modifiable { public: explicit TableVal(TableTypePtr t, detail::AttributesPtr attrs = nullptr); @@ -1039,7 +1038,7 @@ protected: TableValTimer* timer; RobustDictIterator* expire_iterator; detail::PrefixTable* subnets; - TablePatternMatcher* pattern_matcher = nullptr; + detail::TablePatternMatcher* pattern_matcher = nullptr; ValPtr def_val; detail::ExprPtr change_func; std::string broker_store; From 501b582bc7b33d93ef090db627561343592d1b13 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 15:23:57 +0100 Subject: [PATCH 09/18] TablePatternMatcher: Use const StringValPtr& instead of const StringVal* --- src/Expr.cc | 2 +- src/Val.cc | 6 +++--- src/Val.h | 2 +- src/script_opt/CPP/RuntimeOps.cc | 2 +- src/script_opt/ZAM/Ops.in | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Expr.cc b/src/Expr.cc index 9f333c641e..cf4e52f3cf 100644 --- a/src/Expr.cc +++ b/src/Expr.cc @@ -2545,7 +2545,7 @@ ValPtr IndexExpr::Fold(Val* v1, 
Val* v2) const { case TYPE_TABLE: if ( is_pattern_table ) - return v1->AsTableVal()->LookupPattern(v2->AsListVal()->Idx(0)->AsStringVal()); + return v1->AsTableVal()->LookupPattern({NewRef{}, v2->AsListVal()->Idx(0)->AsStringVal()}); v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2}); break; diff --git a/src/Val.cc b/src/Val.cc index 73bfd2dce7..d8bd8d1897 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1440,7 +1440,7 @@ public: void Clear() { matcher.reset(); } - VectorValPtr Lookup(const StringVal* s); + VectorValPtr Lookup(const StringValPtr& s); private: void Build(); @@ -1463,7 +1463,7 @@ private: std::vector matcher_yields; }; -VectorValPtr detail::TablePatternMatcher::Lookup(const StringVal* s) { +VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) { auto results = make_intrusive(vtype); if ( ! matcher ) { @@ -2016,7 +2016,7 @@ TableValPtr TableVal::LookupSubnetValues(const SubNetVal* search) { return nt; } -VectorValPtr TableVal::LookupPattern(const StringVal* s) { +VectorValPtr TableVal::LookupPattern(const StringValPtr& s) { if ( ! pattern_matcher ) reporter->InternalError("LookupPattern called on wrong table type"); diff --git a/src/Val.h b/src/Val.h index 3b5176849b..9f201be1b2 100644 --- a/src/Val.h +++ b/src/Val.h @@ -867,7 +867,7 @@ public: // For a table[pattern], return a vector of all yields matching // the given string. // Causes an internal error if called for any other kind of table. - VectorValPtr LookupPattern(const StringVal* s); + VectorValPtr LookupPattern(const StringValPtr& s); // Sets the timestamp for the given index to network time. // Returns false if index does not exist. 
diff --git a/src/script_opt/CPP/RuntimeOps.cc b/src/script_opt/CPP/RuntimeOps.cc index 273b77b315..ae2b9c7744 100644 --- a/src/script_opt/CPP/RuntimeOps.cc +++ b/src/script_opt/CPP/RuntimeOps.cc @@ -45,7 +45,7 @@ ValPtr index_table__CPP(const TableValPtr& t, vector indices) { } ValPtr index_patstr_table__CPP(const TableValPtr& t, vector indices) { - return t->LookupPattern(indices[0]->AsStringVal()); + return t->LookupPattern(cast_intrusive(indices[0])); } ValPtr index_vec__CPP(const VectorValPtr& vec, int index) { diff --git a/src/script_opt/ZAM/Ops.in b/src/script_opt/ZAM/Ops.in index 87b820177f..3ffbeb6462 100644 --- a/src/script_opt/ZAM/Ops.in +++ b/src/script_opt/ZAM/Ops.in @@ -1061,7 +1061,7 @@ internal-op When-PatStr-Index type VV eval auto args = z.aux->ToListVal(frame); auto arg0 = args->Idx(0); - auto v = frame[z.v2].table_val->LookupPattern(arg0->AsStringVal()); + auto v = frame[z.v2].table_val->LookupPattern({NewRef{}, arg0->AsStringVal()}); if ( IndexExprWhen::evaluating > 0 ) IndexExprWhen::results.emplace_back(v); AssignV1(BuildVal(v, z.t)) @@ -1081,12 +1081,12 @@ eval EvalTableIndex(z.c.ToVal(z.t)) internal-assignment-op Table-PatStr-Index1 type VVV assign-val v -eval auto v = frame[z.v2].table_val->LookupPattern(frame[z.v3].AsString()); +eval auto v = frame[z.v2].table_val->LookupPattern({NewRef{}, frame[z.v3].AsString()}); internal-assignment-op Table-PatStr-Index1 type VVC assign-val v -eval auto v = frame[z.v2].table_val->LookupPattern(z.c.AsString()); +eval auto v = frame[z.v2].table_val->LookupPattern({NewRef{}, z.c.AsString()}); # This version is for a variable v3. internal-op Index-String From 9ae99cdc44f940627dc6481464a851c24259b825 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 15:43:15 +0100 Subject: [PATCH 10/18] RE: Remove RE_DisjunctiveMatcher and re-use MatchAll() Seems we can just open code the CompileSet() usage in the TablePatternMatcher helper without indirecting through another class. 
Further, add the collection of indices into MatchAll() rather than duplicating its code in MatchDisjunction(). Doesn't seem like MatchAll() is used widely. --- src/RE.cc | 58 ++++++++++-------------------------------------------- src/RE.h | 34 +++++++++----------------------- src/Val.cc | 19 ++++++++++++------ 3 files changed, 32 insertions(+), 79 deletions(-) diff --git a/src/RE.cc b/src/RE.cc index 70bb71b055..3a790d2f4e 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -194,6 +194,10 @@ bool Specific_RE_Matcher::MatchAll(const String* s) { return MatchAll(s->Bytes(), s->Len()); } +bool Specific_RE_Matcher::MatchSet(const String* s, std::vector& matches) { + return MatchAll(s->Bytes(), s->Len(), &matches); +} + int Specific_RE_Matcher::Match(const char* s) { return Match((const u_char*)(s), strlen(s)); } int Specific_RE_Matcher::Match(const String* s) { return Match(s->Bytes(), s->Len()); } @@ -202,7 +206,7 @@ int Specific_RE_Matcher::LongestMatch(const char* s) { return LongestMatch((cons int Specific_RE_Matcher::LongestMatch(const String* s) { return LongestMatch(s->Bytes(), s->Len()); } -bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { +bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n, std::vector* matches) { if ( ! dfa ) // An empty pattern matches "all" iff what's being // matched is empty. 
@@ -222,6 +226,11 @@ bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { if ( d ) d = d->Xtion(ecs[SYM_EOL], dfa); + if ( d && matches ) + if ( const auto* a_set = d->Accept() ) + for ( auto a : *a_set ) + matches->push_back(a); + return d && d->Accept() != nullptr; } @@ -255,33 +264,6 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n) { return 0; } -void Specific_RE_Matcher::MatchDisjunction(const String* s, std::vector& matches) { - auto bv = s->Bytes(); - auto n = s->Len(); - - ASSERT(dfa); - - DFA_State* d = dfa->StartState(); - d = d->Xtion(ecs[SYM_BOL], dfa); - - while ( d ) { - if ( --n < 0 ) - break; - - int ec = ecs[*(bv++)]; - d = d->Xtion(ec, dfa); - } - - if ( d ) - d = d->Xtion(ecs[SYM_EOL], dfa); - - if ( d ) - if ( auto a_set = d->Accept() ) - for ( auto a : *a_set ) - matches.push_back(a); -} - - void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); } inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) { @@ -456,26 +438,6 @@ void RE_Matcher::MakeSingleLine() { bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); } -RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector& REs) { - matcher = std::make_unique(detail::MATCH_EXACTLY); - - zeek::detail::string_list sl; - zeek::detail::int_list il; - - for ( const auto* re : REs ) { - sl.push_back(const_cast(re->PatternText())); - il.push_back(sl.size()); - } - - if ( ! matcher->CompileSet(sl, il) ) - reporter->FatalError("failed compile set for disjunctive matcher"); -} - -void RE_DisjunctiveMatcher::Match(const String* s, std::vector& matches) { - matches.clear(); - return matcher->MatchDisjunction(s, matches); -} - TEST_SUITE("re_matcher") { TEST_CASE("simple_pattern") { RE_Matcher match("[0-9]+"); diff --git a/src/RE.h b/src/RE.h index 52b446a306..ee5234d42d 100644 --- a/src/RE.h +++ b/src/RE.h @@ -96,6 +96,14 @@ public: // to the matching expressions. (idx must not contain zeros). 
bool CompileSet(const string_list& set, const int_list& idx); + // For use with CompileSet() to collect indices of all matched + // expressions into the matches vector. The matches vector is + // populated with the indices of all matching expressions provided + // to CompileSet()'s set and idx arguments. + // + // Behaves as MatchAll(), consuming the complete input string. + bool MatchSet(const String* s, std::vector& matches); + // Returns the position in s just beyond where the first match // occurs, or 0 if there is no such position in s. Note that // if the pattern matches empty strings, matching continues @@ -104,17 +112,6 @@ public: int Match(const String* s); int Match(const u_char* bv, int n); - // A disjunction is a collection of regular expressions (that under - // the hood are matches as a single RE, not serially) for which - // the match operation returns *all* of the matches. Disjunctions - // are constructed using the internal "||" RE operator, and the - // matches are returned as indices into the position, left-to-right, - // of which REs matched. IMPORTANT: the first RE is numbered 1, not 0. - // - // Note that there's no guarantee regarding the ordering of the - // returned matches if there is more than one. - void MatchDisjunction(const String* s, std::vector& matches); - int LongestMatch(const char* s); int LongestMatch(const String* s); int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true); @@ -136,7 +133,7 @@ protected: // appending to an existing pattern_text. void AddPat(const char* pat, const char* orig_fmt, const char* app_fmt); - bool MatchAll(const u_char* bv, int n); + bool MatchAll(const u_char* bv, int n, std::vector* matches = nullptr); match_type mt; bool multiline; @@ -255,17 +252,4 @@ protected: bool is_single_line = false; }; -class RE_DisjunctiveMatcher final { -public: - // Takes a collection of individual REs and builds a disjunctive - // matcher for the set. 
- RE_DisjunctiveMatcher(const std::vector& REs); - - // See MatchDisjunction() above. - void Match(const String* s, std::vector& matches); - -private: - std::unique_ptr matcher; -}; - } // namespace zeek diff --git a/src/Val.cc b/src/Val.cc index d8bd8d1897..4304570df7 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1455,7 +1455,7 @@ private: // from having to re-build the matcher on every insert/delete in // the common case that a whole bunch of those are done in a single // batch. - std::unique_ptr matcher = nullptr; + std::unique_ptr matcher = nullptr; // Maps matcher values to corresponding yields. When building the // matcher we insert a nil at the head to accommodate how @@ -1473,8 +1473,8 @@ VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) { Build(); } - std::vector matches; - matcher->Match(s->AsString(), matches); + std::vector matches; + matcher->MatchSet(s->AsString(), matches); for ( auto m : matches ) results->Append(matcher_yields[m]); @@ -1488,7 +1488,9 @@ void detail::TablePatternMatcher::Build() { auto& tbl_dict = *tbl->Get(); auto& tbl_hash = *tbl->GetTableHash(); - std::vector patterns; + + zeek::detail::string_list pattern_list; + zeek::detail::int_list index_list; // We need to hold on to recovered hash key values so they don't // get lost once a loop iteration goes out of scope. @@ -1499,13 +1501,18 @@ void detail::TablePatternMatcher::Build() { auto v = iter.value; auto vl = tbl_hash.RecoverVals(*k); - patterns.push_back(vl->AsListVal()->Idx(0)->AsPattern()); + char* pt = const_cast(vl->AsListVal()->Idx(0)->AsPattern()->PatternText()); + pattern_list.push_back(pt); + index_list.push_back(pattern_list.size()); matcher_yields.push_back(v->GetVal()); hash_key_vals.push_back(std::move(vl)); } - matcher = std::make_unique(patterns); + matcher = std::make_unique(detail::MATCH_EXACTLY); + + if ( ! 
matcher->CompileSet(pattern_list, index_list) ) + reporter->FatalError("failed compile set for disjunctive matching"); } TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) { From c72d4a4427653a757438e83bdc8a07e7f57c0721 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 17:27:47 +0100 Subject: [PATCH 11/18] Expr: Implement string in table[pattern] of X Not sure how useful this is (and the implementation isn't optimized in any way), but seems reasonable for consistency. Vern suggested that set[pattern] can already be achieved via set_to_regex(), so left out any set[pattern] variants. --- src/Expr.cc | 22 +++++++++++++++++-- .../Baseline/language.pattern-tables-when/out | 2 ++ .../Baseline/language.pattern-tables/out | 4 ++++ .../btest/language/pattern-tables-when.zeek | 13 ++++++++++- testing/btest/language/pattern-tables.zeek | 4 ++++ 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/Expr.cc b/src/Expr.cc index cf4e52f3cf..95e9f0d93d 100644 --- a/src/Expr.cc +++ b/src/Expr.cc @@ -3815,6 +3815,18 @@ InExpr::InExpr(ExprPtr arg_op1, ExprPtr arg_op2) : BinaryExpr(EXPR_IN, std::move } } + // Support in table[pattern] of X + if ( op1->GetType()->Tag() == TYPE_STRING ) { + if ( op2->GetType()->Tag() == TYPE_TABLE ) { + const auto& table_type = op2->GetType()->AsTableType(); + + if ( table_type->IsPatternIndex() && table_type->Yield() ) { + SetType(base_type(TYPE_BOOL)); + return; + } + } + } + if ( op1->Tag() != EXPR_LIST ) op1 = make_intrusive(std::move(op1)); @@ -3853,8 +3865,14 @@ ValPtr InExpr::Fold(Val* v1, Val* v2) const { auto ind = v1->AsListVal()->Idx(0)->CoerceToUnsigned(); res = ind < vv2->Size() && vv2->ValAt(ind); } - else - res = (bool)v2->AsTableVal()->Find({NewRef{}, v1}); + else { + const auto& table_val = v2->AsTableVal(); + const auto& table_type = table_val->GetType(); + if ( table_type->IsPatternIndex() && table_type->Yield() && v1->GetType()->Tag() == TYPE_STRING ) + res = 
table_val->LookupPattern({NewRef{}, v1->AsStringVal()})->Size() > 0; + else + res = (bool)v2->AsTableVal()->Find({NewRef{}, v1}); + } return val_mgr->Bool(res); } diff --git a/testing/btest/Baseline/language.pattern-tables-when/out b/testing/btest/Baseline/language.pattern-tables-when/out index d9e62df0e8..7367e825e6 100644 --- a/testing/btest/Baseline/language.pattern-tables-when/out +++ b/testing/btest/Baseline/language.pattern-tables-when/out @@ -4,3 +4,5 @@ populate_a() gotcha a, [42] populate_b() gotcha b, [4242] +populate_c() +gotcha c, [4711] diff --git a/testing/btest/Baseline/language.pattern-tables/out b/testing/btest/Baseline/language.pattern-tables/out index 209e9c48b2..a74c2d32eb 100644 --- a/testing/btest/Baseline/language.pattern-tables/out +++ b/testing/btest/Baseline/language.pattern-tables/out @@ -2,9 +2,13 @@ indexing empty, 0 single insert, match, [1] single insert, non-match, [] +single insert, in, T +single insert, not-in, F multiple inserts, non-match, [] multiple inserts, single match, [3] multiple inserts, double match, [1, 3] +multiple insert, in, T +multiple insert, not-in, F triple match, [1, 3, 4] embedded newline, /s operator, [6] no embedded newline, /s vs. 
no /s operator, [5, 6, 7] diff --git a/testing/btest/language/pattern-tables-when.zeek b/testing/btest/language/pattern-tables-when.zeek index df303418c2..d371cccf2c 100644 --- a/testing/btest/language/pattern-tables-when.zeek +++ b/testing/btest/language/pattern-tables-when.zeek @@ -6,11 +6,18 @@ global pt: table[pattern] of count; redef exit_only_after_terminate = T; +event populate_c() + { + print "populate_c()"; + pt[/c/] = 4711; + terminate(); + } + event populate_b() { print "populate_b()"; pt[/b/] = 4242; - terminate(); + schedule 1msec { populate_c() }; } event populate_a() @@ -38,6 +45,10 @@ event zeek_init() print "gotcha b", pt["b"]; } + when ( "c" in pt ) { + print "gotcha c", pt["c"]; + } + print "schedule populate"; schedule 1msec { populate_a() }; } diff --git a/testing/btest/language/pattern-tables.zeek b/testing/btest/language/pattern-tables.zeek index 7ca5c42e46..5331bd6315 100644 --- a/testing/btest/language/pattern-tables.zeek +++ b/testing/btest/language/pattern-tables.zeek @@ -13,6 +13,8 @@ event zeek_init() print "single insert, match", pt["foo"]; print "single insert, non-match", pt["foox"]; + print "single insert, in", "foo" in pt; + print "single insert, not-in", "foox" in pt; pt[/bar/] = 2; pt[/(foo|bletch)/] = 3; @@ -20,6 +22,8 @@ event zeek_init() print "multiple inserts, non-match", pt["x"]; print "multiple inserts, single match", pt["bletch"]; print "multiple inserts, double match", sort(pt["foo"]); + print "multiple insert, in", "foo" in pt; + print "multiple insert, not-in", "x" in pt; pt[/(foo|bletch|xyz)/] = 4; print "triple match", sort(pt["foo"]); From 3f240e0f0ac27c0b6fc6e24677352353d77acc4d Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 8 Nov 2023 20:23:10 +0100 Subject: [PATCH 12/18] DFA: Extract inner Stats struct from DFA_State_Cache This makes it possible to forward declare the class in Val.h which otherwise seems difficult. 
--- src/DFA.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/DFA.h b/src/DFA.h index fd0b590025..1bf2979ec3 100644 --- a/src/DFA.h +++ b/src/DFA.h @@ -69,6 +69,17 @@ protected: using DigestStr = std::basic_string; +struct DFA_State_Cache_Stats { + // Sum of all NFA states + unsigned int nfa_states; + unsigned int dfa_states; + unsigned int computed; + unsigned int uncomputed; + unsigned int mem; + unsigned int hits; + unsigned int misses; +}; + class DFA_State_Cache { public: DFA_State_Cache(); @@ -82,17 +93,7 @@ public: int NumEntries() const { return states.size(); } - struct Stats { - // Sum of all NFA states - unsigned int nfa_states; - unsigned int dfa_states; - unsigned int computed; - unsigned int uncomputed; - unsigned int mem; - unsigned int hits; - unsigned int misses; - }; - + using Stats = DFA_State_Cache_Stats; void GetStats(Stats* s); private: From e39f280e3d32b1a0380233cfb172ccebbfc70903 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 8 Nov 2023 20:30:04 +0100 Subject: [PATCH 13/18] zeek.bif: Implement table_pattern_matcher_stats() bif for introspection Provide a script accessible way to introspect the DFA stats that can be leveraged to gather runtime statistics of the underlying DFA. This re-uses the existing MatcherStats used by ``get_matcher_stats()``. 
--- src/Val.cc | 15 ++++++ src/Val.h | 6 +++ src/zeek.bif | 50 +++++++++++++++++++ .../language.pattern-tables-stats/.stderr | 1 + .../language.pattern-tables-stats/out | 12 +++++ .../btest/language/pattern-tables-stats.zeek | 32 ++++++++++++ 6 files changed, 116 insertions(+) create mode 100644 testing/btest/Baseline/language.pattern-tables-stats/.stderr create mode 100644 testing/btest/Baseline/language.pattern-tables-stats/out create mode 100644 testing/btest/language/pattern-tables-stats.zeek diff --git a/src/Val.cc b/src/Val.cc index 4304570df7..83324c3ae2 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -20,6 +20,7 @@ #include "zeek/Attr.h" #include "zeek/CompHash.h" #include "zeek/Conn.h" +#include "zeek/DFA.h" #include "zeek/Desc.h" #include "zeek/Dict.h" #include "zeek/Expr.h" @@ -1442,6 +1443,13 @@ public: VectorValPtr Lookup(const StringValPtr& s); + void GetStats(detail::DFA_State_Cache_Stats* stats) const { + if ( matcher && matcher->DFA() ) + matcher->DFA()->Cache()->GetStats(stats); + else + *stats = {0}; + }; + private: void Build(); @@ -2030,6 +2038,13 @@ VectorValPtr TableVal::LookupPattern(const StringValPtr& s) { return pattern_matcher->Lookup(s); } +void TableVal::GetPatternMatcherStats(detail::DFA_State_Cache_Stats* stats) const { + if ( ! pattern_matcher ) + reporter->InternalError("GetPatternMatcherStats called on wrong table type"); + + return pattern_matcher->GetStats(stats); +} + bool TableVal::UpdateTimestamp(Val* index) { TableEntryVal* v; diff --git a/src/Val.h b/src/Val.h index 9f201be1b2..9fc026dc6f 100644 --- a/src/Val.h +++ b/src/Val.h @@ -53,6 +53,8 @@ class CompositeHash; class HashKey; class TablePatternMatcher; +struct DFA_State_Cache_Stats; + class ValTrace; class ZBody; class CPPRuntime; @@ -869,6 +871,10 @@ public: // Causes an internal error if called for any other kind of table. VectorValPtr LookupPattern(const StringValPtr& s); + // For a table[pattern], fill stats with information about + // the DFA's state for introspection. 
+ void GetPatternMatcherStats(detail::DFA_State_Cache_Stats* stats) const; + // Sets the timestamp for the given index to network time. // Returns false if index does not exist. bool UpdateTimestamp(Val* index); diff --git a/src/zeek.bif b/src/zeek.bif index 4c70a6fcbf..f8358fec8c 100644 --- a/src/zeek.bif +++ b/src/zeek.bif @@ -5795,3 +5795,53 @@ function have_spicy_analyzers%(%) : bool %{ return zeek::val_mgr->Bool(USE_SPICY_ANALYZERS); %} + +%%{ +#include "zeek/DFA.h" +%%} + +## Return MatcherStats for a table[pattern] or set[pattern] value. +## +## This returns a MatcherStats object that can be used for introspection +## of the DFA used for such a table. Statistics reset whenever elements are +## added to or removed from the table as these operations result in the underlying +## DFA being rebuilt. +## +## This function iterates over all states of the DFA. Calling it at a high +## frequency is likely detrimental to performance. +## +## tbl: The table to get stats for. +## +## Returns: A record with matcher statistics. +function table_pattern_matcher_stats%(tbl: any%) : MatcherStats + %{ + static auto matcher_stats_type = zeek::id::find_type("MatcherStats"); + + const auto& type = tbl->GetType(); + if ( type->Tag() != zeek::TYPE_TABLE ) + { + zeek::emit_builtin_error("pattern-table_stats() requires a table argument"); + return nullptr; + } + + if ( ! 
type->AsTableType()->IsPatternIndex() ) + { + zeek::emit_builtin_error("pattern_table_stats() requires a single index of type pattern"); + return nullptr; + } + + zeek::detail::DFA_State_Cache::Stats stats; + tbl->AsTableVal()->GetPatternMatcherStats(&stats); + + auto result = zeek::make_intrusive(matcher_stats_type); + int n = 0; + result->Assign(n++, 1); // matchers + result->Assign(n++, stats.nfa_states); + result->Assign(n++, stats.dfa_states); + result->Assign(n++, stats.computed); + result->Assign(n++, stats.mem); + result->Assign(n++, stats.hits); + result->Assign(n++, stats.misses); + + return result; + %} diff --git a/testing/btest/Baseline/language.pattern-tables-stats/.stderr b/testing/btest/Baseline/language.pattern-tables-stats/.stderr new file mode 100644 index 0000000000..49d861c74c --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-stats/.stderr @@ -0,0 +1 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. diff --git a/testing/btest/Baseline/language.pattern-tables-stats/out b/testing/btest/Baseline/language.pattern-tables-stats/out new file mode 100644 index 0000000000..c7646c68bc --- /dev/null +++ b/testing/btest/Baseline/language.pattern-tables-stats/out @@ -0,0 +1,12 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 
+initial stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +populated stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +[1], [], T, F +after lookup stats, [matchers=1, nfa_states=10, dfa_states=6, computed=6, mem=2368, hits=0, misses=6] +reset stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +[], [3], [1, 3], T, F +after more lookup stats, [matchers=1, nfa_states=34, dfa_states=13, computed=13, mem=7720, hits=0, misses=13] +reset stats after delete, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +[], [3], [1, 3] +after even more lookup stats, [matchers=1, nfa_states=29, dfa_states=13, computed=13, mem=7056, hits=0, misses=13] +reset after reassignment, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] diff --git a/testing/btest/language/pattern-tables-stats.zeek b/testing/btest/language/pattern-tables-stats.zeek new file mode 100644 index 0000000000..1670dee86c --- /dev/null +++ b/testing/btest/language/pattern-tables-stats.zeek @@ -0,0 +1,32 @@ +# @TEST-DOC: Test table_pattern_matcher_stats() +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +global pt: table[pattern] of count; + +event zeek_init() + { + print "initial stats", table_pattern_matcher_stats(pt); + pt[/foo/] = 1; + print "populated stats", table_pattern_matcher_stats(pt); + + print pt["foo"], pt["foox"], "foo" in pt, "foox" in pt; + print "after lookup stats", table_pattern_matcher_stats(pt); + + pt[/bar/] = 2; + pt[/(foo|bletch)/] = 3; + print "reset stats", table_pattern_matcher_stats(pt); + + print pt["x"], pt["bletch"], sort(pt["foo"]), "foo" in pt, "x" in pt; + print "after more lookup stats", table_pattern_matcher_stats(pt); + + delete pt[/bar/]; + print "reset stats after delete", table_pattern_matcher_stats(pt); + + print pt["x"], pt["bletch"], sort(pt["foo"]); + print "after even 
more lookup stats", table_pattern_matcher_stats(pt); + + pt = table(); + print "reset after reassignment", table_pattern_matcher_stats(pt); + } From c113b9b297289cdb910a225d4e6aabc204df8414 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 14 Nov 2023 11:33:29 +0100 Subject: [PATCH 14/18] Expr/Val: Add support for string in set[pattern] --- src/Expr.cc | 9 ++-- src/Val.cc | 27 ++++++++-- src/Val.h | 5 ++ .../Baseline/language.pattern-sets/.stderr | 1 + .../btest/Baseline/language.pattern-sets/out | 14 +++++ .../language.pattern-tables-stats/out | 9 ++++ testing/btest/language/pattern-sets.zeek | 53 +++++++++++++++++++ .../btest/language/pattern-tables-stats.zeek | 24 +++++++++ 8 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 testing/btest/Baseline/language.pattern-sets/.stderr create mode 100644 testing/btest/Baseline/language.pattern-sets/out create mode 100644 testing/btest/language/pattern-sets.zeek diff --git a/src/Expr.cc b/src/Expr.cc index 95e9f0d93d..8153bf7d3f 100644 --- a/src/Expr.cc +++ b/src/Expr.cc @@ -3815,12 +3815,12 @@ InExpr::InExpr(ExprPtr arg_op1, ExprPtr arg_op2) : BinaryExpr(EXPR_IN, std::move } } - // Support in table[pattern] of X + // Support string in table[pattern] / set[pattern] if ( op1->GetType()->Tag() == TYPE_STRING ) { if ( op2->GetType()->Tag() == TYPE_TABLE ) { const auto& table_type = op2->GetType()->AsTableType(); - if ( table_type->IsPatternIndex() && table_type->Yield() ) { + if ( table_type->IsPatternIndex() ) { SetType(base_type(TYPE_BOOL)); return; } @@ -3868,8 +3868,9 @@ ValPtr InExpr::Fold(Val* v1, Val* v2) const { else { const auto& table_val = v2->AsTableVal(); const auto& table_type = table_val->GetType(); - if ( table_type->IsPatternIndex() && table_type->Yield() && v1->GetType()->Tag() == TYPE_STRING ) - res = table_val->LookupPattern({NewRef{}, v1->AsStringVal()})->Size() > 0; + // Special table[pattern] / set[pattern] in expression. 
+ if ( table_type->IsPatternIndex() && v1->GetType()->Tag() == TYPE_STRING ) + res = table_val->MatchPattern({NewRef{}, v1->AsStringVal()}); else res = (bool)v2->AsTableVal()->Find({NewRef{}, v1}); } diff --git a/src/Val.cc b/src/Val.cc index 83324c3ae2..1e022a56ca 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1443,6 +1443,9 @@ public: VectorValPtr Lookup(const StringValPtr& s); + // Delegate to matcher->MatchAll(). + bool MatchAll(const StringValPtr& s); + void GetStats(detail::DFA_State_Cache_Stats* stats) const { if ( matcher && matcher->DFA() ) matcher->DFA()->Cache()->GetStats(stats); @@ -1490,6 +1493,17 @@ VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) { return results; } +bool detail::TablePatternMatcher::MatchAll(const StringValPtr& s) { + if ( ! matcher ) { + if ( tbl->Get()->Length() == 0 ) + return false; + + Build(); + } + + return matcher->MatchAll(s->AsString()); +} + void detail::TablePatternMatcher::Build() { matcher_yields.clear(); matcher_yields.push_back(nullptr); @@ -1556,7 +1570,7 @@ void TableVal::Init(TableTypePtr t, bool ordered) { else subnets = nullptr; - if ( table_type->IsPatternIndex() && table_type->Yield() ) + if ( table_type->IsPatternIndex() ) pattern_matcher = new detail::TablePatternMatcher(this, table_type->Yield()); table_hash = new detail::CompositeHash(table_type->GetIndices()); @@ -1674,7 +1688,7 @@ bool TableVal::Assign(ValPtr index, ValPtr new_val, bool broker_forward, bool* i } if ( pattern_matcher ) - pattern_matcher->Insert(index->AsListVal()->Idx(0), new_val); + pattern_matcher->Insert(index, new_val); return Assign(std::move(index), std::move(k), std::move(new_val), broker_forward, iterators_invalidated); } @@ -2032,12 +2046,19 @@ TableValPtr TableVal::LookupSubnetValues(const SubNetVal* search) { } VectorValPtr TableVal::LookupPattern(const StringValPtr& s) { - if ( ! pattern_matcher ) + if ( ! pattern_matcher || ! 
GetType()->Yield() ) reporter->InternalError("LookupPattern called on wrong table type"); return pattern_matcher->Lookup(s); } +bool TableVal::MatchPattern(const StringValPtr& s) { + if ( ! pattern_matcher ) + reporter->InternalError("LookupPattern called on wrong table type"); + + return pattern_matcher->MatchAll(s); +} + void TableVal::GetPatternMatcherStats(detail::DFA_State_Cache_Stats* stats) const { if ( ! pattern_matcher ) reporter->InternalError("GetPatternMatcherStats called on wrong table type"); diff --git a/src/Val.h b/src/Val.h index 9fc026dc6f..987b2081eb 100644 --- a/src/Val.h +++ b/src/Val.h @@ -871,6 +871,11 @@ public: // Causes an internal error if called for any other kind of table. VectorValPtr LookupPattern(const StringValPtr& s); + // For a table[pattern] or set[pattern], returns True if any of the + // patterns in the index matches the given string, else False. + // Causes an internal error if called for any other kind of table. + bool MatchPattern(const StringValPtr& s); + // For a table[pattern], fill stats with information about // the DFA's state for introspection. void GetPatternMatcherStats(detail::DFA_State_Cache_Stats* stats) const; diff --git a/testing/btest/Baseline/language.pattern-sets/.stderr b/testing/btest/Baseline/language.pattern-sets/.stderr new file mode 100644 index 0000000000..49d861c74c --- /dev/null +++ b/testing/btest/Baseline/language.pattern-sets/.stderr @@ -0,0 +1 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. diff --git a/testing/btest/Baseline/language.pattern-sets/out b/testing/btest/Baseline/language.pattern-sets/out new file mode 100644 index 0000000000..e3ad8c8fd1 --- /dev/null +++ b/testing/btest/Baseline/language.pattern-sets/out @@ -0,0 +1,14 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. 
+in empty, F +single insert, foo in, T +single insert, foox not-in, T +multiple inserts, x not-in, T +multiple insert, foo in, T +multiple insert, bletch in, T +multiple insert, foobletch not-in, T +single delete, bletch in, T +single delete, foo in, T +two deletes, bletch not-in, T +two deletes, foo not-in, T +two deletes, bar in, T +clear_table, bar not-in, T diff --git a/testing/btest/Baseline/language.pattern-tables-stats/out b/testing/btest/Baseline/language.pattern-tables-stats/out index c7646c68bc..12fe607fbc 100644 --- a/testing/btest/Baseline/language.pattern-tables-stats/out +++ b/testing/btest/Baseline/language.pattern-tables-stats/out @@ -10,3 +10,12 @@ reset stats after delete, [matchers=1, nfa_states=0, dfa_states=0, computed=0, m [], [3], [1, 3] after even more lookup stats, [matchers=1, nfa_states=29, dfa_states=13, computed=13, mem=7056, hits=0, misses=13] reset after reassignment, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +set initial stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +set populated stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +T, F +set after lookup stats, [matchers=1, nfa_states=10, dfa_states=6, computed=6, mem=2368, hits=0, misses=6] +set reset stats, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] +F, T +set after more lookup stats, [matchers=1, nfa_states=24, dfa_states=9, computed=9, mem=5336, hits=0, misses=9] +set reset stats after delete, [matchers=1, nfa_states=24, dfa_states=9, computed=9, mem=5336, hits=0, misses=9] +set reset after reassignment, [matchers=1, nfa_states=0, dfa_states=0, computed=0, mem=0, hits=0, misses=0] diff --git a/testing/btest/language/pattern-sets.zeek b/testing/btest/language/pattern-sets.zeek new file mode 100644 index 0000000000..1b53eb890f --- /dev/null +++ b/testing/btest/language/pattern-sets.zeek @@ -0,0 +1,53 @@ +# @TEST-DOC: set[pattern] also 
supports parallel RE matching using in expression + +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-EXEC: btest-diff .stderr + +global ps: set[pattern]; + +event zeek_init() + { + assert "foo" !in ps; + print "in empty", "foo" in ps; + + add ps[/foo/]; + + assert "foo" in ps; + assert "foox" !in ps; + print "single insert, foo in", "foo" in ps; + print "single insert, foox not-in", "foox" !in ps; + + add ps[/bar/]; + add ps[/(foo|bletch)/]; + + assert "x" !in ps; + assert "bar" in ps; + assert "foo" in ps; + assert "bletch" in ps; + assert "foobletch" !in ps; + + print "multiple inserts, x not-in", "x" !in ps; + print "multiple insert, foo in", "foo" in ps; + print "multiple insert, bletch in", "bletch" in ps; + print "multiple insert, foobletch not-in", "foobletch" !in ps; + + # After delete of /foo/, still matches "foo" due to /(foo|bletch)/ + delete ps[/foo/]; + assert "foo" in ps; + assert "bletch" in ps; + print "single delete, bletch in", "bletch" in ps; + print "single delete, foo in", "foo" in ps; + + delete ps[/(foo|bletch)/]; + assert "foo" !in ps; + assert "bar" in ps; + assert "bletch" !in ps; + print "two deletes, bletch not-in", "bletch" !in ps; + print "two deletes, foo not-in", "foo" !in ps; + print "two deletes, bar in", "bar" in ps; + + clear_table(ps); + assert "bar" !in ps; + print "clear_table, bar not-in", "bar" !in ps; + } diff --git a/testing/btest/language/pattern-tables-stats.zeek b/testing/btest/language/pattern-tables-stats.zeek index 1670dee86c..0c25915474 100644 --- a/testing/btest/language/pattern-tables-stats.zeek +++ b/testing/btest/language/pattern-tables-stats.zeek @@ -4,6 +4,7 @@ # @TEST-EXEC: btest-diff .stderr global pt: table[pattern] of count; +global ps: set[pattern]; event zeek_init() { @@ -30,3 +31,26 @@ event zeek_init() pt = table(); print "reset after reassignment", table_pattern_matcher_stats(pt); } + +event zeek_init() &priority=-10 + { + print "set initial stats", 
table_pattern_matcher_stats(ps); + add ps[/foo/]; + print "set populated stats", table_pattern_matcher_stats(ps); + + print "foo" in ps, "foox" in ps; + print "set after lookup stats", table_pattern_matcher_stats(ps); + + add ps[/bar/]; + add ps[/(foo|bletch)/]; + print "set reset stats", table_pattern_matcher_stats(ps); + + print "x" in ps, "bletch" in ps; + print "set after more lookup stats", table_pattern_matcher_stats(ps); + + delete pt[/bar/]; + print "set reset stats after delete", table_pattern_matcher_stats(ps); + + ps = set(); + print "set reset after reassignment", table_pattern_matcher_stats(pt); + } From 96a0312ad265caefe588f6bd812b6dadff7ec951 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 14 Nov 2023 13:08:01 +0100 Subject: [PATCH 15/18] NEWS: Add small table[pattern] section --- NEWS | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/NEWS b/NEWS index d18db9e4a0..53845a5892 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,26 @@ Breaking Changes New Functionality ----------------- +- The table type was extended to allow parallel regular expression matching + when a table's index is a pattern. Indexing such a table with a key of type + string yields a vector containing the values of all matching patterns. + + As an example, the following snippet outputs ``[a, a or b], [a or b]``. + + global tbl: table[pattern] of string; + tbl[/a/] = "a"; + tbl[/a|b/] = "a or b"; + tbl[/c/] = "c"; + print tbl["a"], tbl["b"]; + + Depending on the patterns and input data used for matching, memory growth may + be observed over time as the underlying DFA is constructed lazily. Users are + advised to test their scripts with realistic and adversarial input data with + a focus on memory growth. The DFA's state can be reset by removal/addition + of a single pattern. For observability, a new bif ``table_pattern_matcher_stats()`` + can be used to gather ``MatcherStats``.
+ + Changed Functionality --------------------- From e68194f2dfb9b25e4385eb56f8eae17c4f378586 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 21 Nov 2023 10:43:04 +0100 Subject: [PATCH 16/18] TableType: Simplify and inline Is...Index tests --- src/Type.cc | 14 -------------- src/Type.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/Type.cc b/src/Type.cc index a9da69e760..2c5580d50c 100644 --- a/src/Type.cc +++ b/src/Type.cc @@ -382,20 +382,6 @@ void IndexType::DescribeReST(ODesc* d, bool roles_only) const { } } -bool IndexType::IsSubNetIndex() const { - const auto& types = indices->GetTypes(); - if ( types.size() == 1 && types[0]->Tag() == TYPE_SUBNET ) - return true; - return false; -} - -bool IndexType::IsPatternIndex() const { - const auto& types = indices->GetTypes(); - if ( types.size() == 1 && types[0]->Tag() == TYPE_PATTERN ) - return true; - return false; -} - detail::TraversalCode IndexType::Traverse(detail::TraversalCallback* cb) const { auto tc = cb->PreType(this); HANDLE_TC_TYPE_PRE(tc); diff --git a/src/Type.h b/src/Type.h index 142fe2d331..71247ab335 100644 --- a/src/Type.h +++ b/src/Type.h @@ -354,10 +354,16 @@ public: void DescribeReST(ODesc* d, bool roles_only = false) const override; // Returns true if this table is solely indexed by subnet. - bool IsSubNetIndex() const; + bool IsSubNetIndex() const { + const auto& types = indices->GetTypes(); + return types.size() == 1 && types[0]->Tag() == TYPE_SUBNET; + } // Returns true if this table has a single index of type pattern. 
- bool IsPatternIndex() const; + bool IsPatternIndex() const { + const auto& types = indices->GetTypes(); + return types.size() == 1 && types[0]->Tag() == TYPE_PATTERN; + } detail::TraversalCode Traverse(detail::TraversalCallback* cb) const override; From 36c43d2aa385250d7389a33e0a5dd1c109581eab Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 21 Nov 2023 10:50:37 +0100 Subject: [PATCH 17/18] TablePatternMatcher: Drop Insert()/Remove(), use Clear() Also move Clear() when assigning into more generic Assign() function. --- src/Val.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/Val.cc b/src/Val.cc index 1e022a56ca..7bccc6e0b0 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1436,9 +1436,6 @@ public: vtype = make_intrusive(std::move(_yield)); } - void Insert(ValPtr pat, ValPtr yield) { Clear(); } - void Remove(ValPtr pat) { Clear(); } - void Clear() { matcher.reset(); } VectorValPtr Lookup(const StringValPtr& s); @@ -1687,9 +1684,6 @@ bool TableVal::Assign(ValPtr index, ValPtr new_val, bool broker_forward, bool* i return false; } - if ( pattern_matcher ) - pattern_matcher->Insert(index, new_val); - return Assign(std::move(index), std::move(k), std::move(new_val), broker_forward, iterators_invalidated); } @@ -1718,6 +1712,9 @@ bool TableVal::Assign(ValPtr index, std::unique_ptr k, ValPtr n subnets->Insert(index.get(), new_entry_val); } + if ( pattern_matcher ) + pattern_matcher->Clear(); + // Keep old expiration time if necessary. 
if ( old_entry_val && attrs && attrs->Find(detail::ATTR_EXPIRE_CREATE) ) new_entry_val->SetExpireAccess(old_entry_val->ExpireAccessTime()); @@ -2252,7 +2249,7 @@ ValPtr TableVal::Remove(const Val& index, bool broker_forward, bool* iterators_i reporter->InternalWarning("index not in prefix table"); if ( pattern_matcher ) - pattern_matcher->Remove(index.AsListVal()->Idx(0)); + pattern_matcher->Clear(); delete v; From cf9afd7b777c787154abf5ba60b8fa4dc951086f Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Tue, 21 Nov 2023 11:12:09 +0100 Subject: [PATCH 18/18] TableVal: Replace raw subnets/pattern_matcher with unique_ptr --- src/Val.cc | 8 ++------ src/Val.h | 7 ++++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/Val.cc b/src/Val.cc index 7bccc6e0b0..8d93426721 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1563,12 +1563,10 @@ void TableVal::Init(TableTypePtr t, bool ordered) { def_val = nullptr; if ( table_type->IsSubNetIndex() ) - subnets = new detail::PrefixTable; - else - subnets = nullptr; + subnets = std::make_unique(); if ( table_type->IsPatternIndex() ) - pattern_matcher = new detail::TablePatternMatcher(this, table_type->Yield()); + pattern_matcher = std::make_unique(this, table_type->Yield()); table_hash = new detail::CompositeHash(table_type->GetIndices()); if ( ordered ) @@ -1585,8 +1583,6 @@ TableVal::~TableVal() { delete table_hash; delete table_val; - delete subnets; - delete pattern_matcher; delete expire_iterator; } diff --git a/src/Val.h b/src/Val.h index 987b2081eb..96bc5a037a 100644 --- a/src/Val.h +++ b/src/Val.h @@ -939,7 +939,8 @@ public: // Returns the Prefix table used inside the table (if present). // This allows us to do more direct queries to this specialized // type that the general Table API does not allow. 
- const detail::PrefixTable* Subnets() const { return subnets; } + const detail::PrefixTable* Subnets() const { return subnets.get(); } + void Describe(ODesc* d) const override; @@ -1048,8 +1049,8 @@ protected: detail::ExprPtr expire_func; TableValTimer* timer; RobustDictIterator* expire_iterator; - detail::PrefixTable* subnets; - detail::TablePatternMatcher* pattern_matcher = nullptr; + std::unique_ptr subnets; + std::unique_ptr pattern_matcher; ValPtr def_val; detail::ExprPtr change_func; std::string broker_store;