From 9ae99cdc44f940627dc6481464a851c24259b825 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 1 Nov 2023 15:43:15 +0100 Subject: [PATCH] RE: Remove RE_DisjunctiveMatcher and re-use MatchAll() Seems we can just open code the CompileSet() usage in the TablePatternMatcher helper without indirecting through another class. Further, add the collection of indices into MatchAll() rather than duplicating its code in MatchDisjunction(). Doesn't seem like MatchAll() is used widely. --- src/RE.cc | 58 ++++++++++-------------------------------------------- src/RE.h | 34 +++++++++----------------------- src/Val.cc | 19 ++++++++++++------ 3 files changed, 32 insertions(+), 79 deletions(-) diff --git a/src/RE.cc b/src/RE.cc index 70bb71b055..3a790d2f4e 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -194,6 +194,10 @@ bool Specific_RE_Matcher::MatchAll(const String* s) { return MatchAll(s->Bytes(), s->Len()); } +bool Specific_RE_Matcher::MatchSet(const String* s, std::vector& matches) { + return MatchAll(s->Bytes(), s->Len(), &matches); +} + int Specific_RE_Matcher::Match(const char* s) { return Match((const u_char*)(s), strlen(s)); } int Specific_RE_Matcher::Match(const String* s) { return Match(s->Bytes(), s->Len()); } @@ -202,7 +206,7 @@ int Specific_RE_Matcher::LongestMatch(const char* s) { return LongestMatch((cons int Specific_RE_Matcher::LongestMatch(const String* s) { return LongestMatch(s->Bytes(), s->Len()); } -bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { +bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n, std::vector* matches) { if ( ! dfa ) // An empty pattern matches "all" iff what's being // matched is empty. @@ -222,6 +226,11 @@ bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { if ( d ) d = d->Xtion(ecs[SYM_EOL], dfa); + if ( d && matches ) + if ( const auto* a_set = d->Accept() ) + for ( auto a : *a_set ) + matches->push_back(a); + return d && d->Accept() != nullptr; } @@ -255,33 +264,6 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n) { return 0; } -void Specific_RE_Matcher::MatchDisjunction(const String* s, std::vector& matches) { - auto bv = s->Bytes(); - auto n = s->Len(); - - ASSERT(dfa); - - DFA_State* d = dfa->StartState(); - d = d->Xtion(ecs[SYM_BOL], dfa); - - while ( d ) { - if ( --n < 0 ) - break; - - int ec = ecs[*(bv++)]; - d = d->Xtion(ec, dfa); - } - - if ( d ) - d = d->Xtion(ecs[SYM_EOL], dfa); - - if ( d ) - if ( auto a_set = d->Accept() ) - for ( auto a : *a_set ) - matches.push_back(a); -} - - void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); } inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) { @@ -456,26 +438,6 @@ void RE_Matcher::MakeSingleLine() { bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); } -RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector& REs) { - matcher = std::make_unique(detail::MATCH_EXACTLY); - - zeek::detail::string_list sl; - zeek::detail::int_list il; - - for ( const auto* re : REs ) { - sl.push_back(const_cast(re->PatternText())); - il.push_back(sl.size()); - } - - if ( ! matcher->CompileSet(sl, il) ) - reporter->FatalError("failed compile set for disjunctive matcher"); -} - -void RE_DisjunctiveMatcher::Match(const String* s, std::vector& matches) { - matches.clear(); - return matcher->MatchDisjunction(s, matches); -} - TEST_SUITE("re_matcher") { TEST_CASE("simple_pattern") { RE_Matcher match("[0-9]+"); diff --git a/src/RE.h b/src/RE.h index 52b446a306..ee5234d42d 100644 --- a/src/RE.h +++ b/src/RE.h @@ -96,6 +96,14 @@ public: // to the matching expressions. (idx must not contain zeros). bool CompileSet(const string_list& set, const int_list& idx); + // For use with CompileSet() to collect indices of all matched + // expressions into the matches vector. The matches vector is + // populated with the indices of all matching expressions provided + // to CompileSet()'s set and idx arguments. + // + // Behaves as MatchAll(), consuming the complete input string. + bool MatchSet(const String* s, std::vector& matches); + // Returns the position in s just beyond where the first match // occurs, or 0 if there is no such position in s. Note that // if the pattern matches empty strings, matching continues @@ -104,17 +112,6 @@ public: int Match(const String* s); int Match(const u_char* bv, int n); - // A disjunction is a collection of regular expressions (that under - // the hood are matches as a single RE, not serially) for which - // the match operation returns *all* of the matches. Disjunctions - // are constructed using the internal "||" RE operator, and the - // matches are returned as indices into the position, left-to-right, - // of which REs matched. IMPORTANT: the first RE is numbered 1, not 0. - // - // Note that there's no guarantee regarding the ordering of the - // returned matches if there is more than one. - void MatchDisjunction(const String* s, std::vector& matches); - int LongestMatch(const char* s); int LongestMatch(const String* s); int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true); @@ -136,7 +133,7 @@ protected: // appending to an existing pattern_text. void AddPat(const char* pat, const char* orig_fmt, const char* app_fmt); - bool MatchAll(const u_char* bv, int n); + bool MatchAll(const u_char* bv, int n, std::vector* matches = nullptr); match_type mt; bool multiline; @@ -255,17 +252,4 @@ protected: bool is_single_line = false; }; -class RE_DisjunctiveMatcher final { -public: - // Takes a collection of individual REs and builds a disjunctive - // matcher for the set. - RE_DisjunctiveMatcher(const std::vector& REs); - - // See MatchDisjunction() above. - void Match(const String* s, std::vector& matches); - -private: - std::unique_ptr matcher; -}; - } // namespace zeek diff --git a/src/Val.cc b/src/Val.cc index d8bd8d1897..4304570df7 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1455,7 +1455,7 @@ private: // from having to re-build the matcher on every insert/delete in // the common case that a whole bunch of those are done in a single // batch. - std::unique_ptr matcher = nullptr; + std::unique_ptr matcher = nullptr; // Maps matcher values to corresponding yields. When building the // matcher we insert a nil at the head to accommodate how @@ -1473,8 +1473,8 @@ VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) { Build(); } - std::vector matches; - matcher->Match(s->AsString(), matches); + std::vector matches; + matcher->MatchSet(s->AsString(), matches); for ( auto m : matches ) results->Append(matcher_yields[m]); @@ -1488,7 +1488,9 @@ void detail::TablePatternMatcher::Build() { auto& tbl_dict = *tbl->Get(); auto& tbl_hash = *tbl->GetTableHash(); - std::vector patterns; + + zeek::detail::string_list pattern_list; + zeek::detail::int_list index_list; // We need to hold on to recovered hash key values so they don't // get lost once a loop iteration goes out of scope. @@ -1499,13 +1501,18 @@ void detail::TablePatternMatcher::Build() { auto v = iter.value; auto vl = tbl_hash.RecoverVals(*k); - patterns.push_back(vl->AsListVal()->Idx(0)->AsPattern()); + char* pt = const_cast(vl->AsListVal()->Idx(0)->AsPattern()->PatternText()); + pattern_list.push_back(pt); + index_list.push_back(pattern_list.size()); matcher_yields.push_back(v->GetVal()); hash_key_vals.push_back(std::move(vl)); } - matcher = std::make_unique(patterns); + matcher = std::make_unique(detail::MATCH_EXACTLY); + + if ( ! matcher->CompileSet(pattern_list, index_list) ) + reporter->FatalError("failed compile set for disjunctive matching"); } TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) {