diff --git a/src/RE.cc b/src/RE.cc index 70bb71b055..3a790d2f4e 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -194,6 +194,10 @@ bool Specific_RE_Matcher::MatchAll(const String* s) { return MatchAll(s->Bytes(), s->Len()); } +bool Specific_RE_Matcher::MatchSet(const String* s, std::vector& matches) { + return MatchAll(s->Bytes(), s->Len(), &matches); +} + int Specific_RE_Matcher::Match(const char* s) { return Match((const u_char*)(s), strlen(s)); } int Specific_RE_Matcher::Match(const String* s) { return Match(s->Bytes(), s->Len()); } @@ -202,7 +206,7 @@ int Specific_RE_Matcher::LongestMatch(const char* s) { return LongestMatch((cons int Specific_RE_Matcher::LongestMatch(const String* s) { return LongestMatch(s->Bytes(), s->Len()); } -bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { +bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n, std::vector* matches) { if ( ! dfa ) // An empty pattern matches "all" iff what's being // matched is empty. @@ -222,6 +226,11 @@ bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { if ( d ) d = d->Xtion(ecs[SYM_EOL], dfa); + if ( d && matches ) + if ( const auto* a_set = d->Accept() ) + for ( auto a : *a_set ) + matches->push_back(a); + return d && d->Accept() != nullptr; } @@ -255,33 +264,6 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n) { return 0; } -void Specific_RE_Matcher::MatchDisjunction(const String* s, std::vector& matches) { - auto bv = s->Bytes(); - auto n = s->Len(); - - ASSERT(dfa); - - DFA_State* d = dfa->StartState(); - d = d->Xtion(ecs[SYM_BOL], dfa); - - while ( d ) { - if ( --n < 0 ) - break; - - int ec = ecs[*(bv++)]; - d = d->Xtion(ec, dfa); - } - - if ( d ) - d = d->Xtion(ecs[SYM_EOL], dfa); - - if ( d ) - if ( auto a_set = d->Accept() ) - for ( auto a : *a_set ) - matches.push_back(a); -} - - void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); } inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) { @@ -456,26 +438,6 @@ void RE_Matcher::MakeSingleLine() { bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); } -RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector& REs) { - matcher = std::make_unique(detail::MATCH_EXACTLY); - - zeek::detail::string_list sl; - zeek::detail::int_list il; - - for ( const auto* re : REs ) { - sl.push_back(const_cast(re->PatternText())); - il.push_back(sl.size()); - } - - if ( ! matcher->CompileSet(sl, il) ) - reporter->FatalError("failed compile set for disjunctive matcher"); -} - -void RE_DisjunctiveMatcher::Match(const String* s, std::vector& matches) { - matches.clear(); - return matcher->MatchDisjunction(s, matches); -} - TEST_SUITE("re_matcher") { TEST_CASE("simple_pattern") { RE_Matcher match("[0-9]+"); diff --git a/src/RE.h b/src/RE.h index 52b446a306..ee5234d42d 100644 --- a/src/RE.h +++ b/src/RE.h @@ -96,6 +96,14 @@ public: // to the matching expressions. (idx must not contain zeros). bool CompileSet(const string_list& set, const int_list& idx); + // For use with CompileSet() to collect indices of all matched + // expressions into the matches vector. The matches vector is + // populated with the indices of all matching expressions provided + // to CompileSet()'s set and idx arguments. + // + // Behaves as MatchAll(), consuming the complete input string. + bool MatchSet(const String* s, std::vector& matches); + // Returns the position in s just beyond where the first match // occurs, or 0 if there is no such position in s. Note that // if the pattern matches empty strings, matching continues @@ -104,17 +112,6 @@ public: int Match(const String* s); int Match(const u_char* bv, int n); - // A disjunction is a collection of regular expressions (that under - // the hood are matches as a single RE, not serially) for which - // the match operation returns *all* of the matches. Disjunctions - // are constructed using the internal "||" RE operator, and the - // matches are returned as indices into the position, left-to-right, - // of which REs matched. IMPORTANT: the first RE is numbered 1, not 0. - // - // Note that there's no guarantee regarding the ordering of the - // returned matches if there is more than one. - void MatchDisjunction(const String* s, std::vector& matches); - int LongestMatch(const char* s); int LongestMatch(const String* s); int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true); @@ -136,7 +133,7 @@ protected: // appending to an existing pattern_text. void AddPat(const char* pat, const char* orig_fmt, const char* app_fmt); - bool MatchAll(const u_char* bv, int n); + bool MatchAll(const u_char* bv, int n, std::vector* matches = nullptr); match_type mt; bool multiline; @@ -255,17 +252,4 @@ protected: bool is_single_line = false; }; -class RE_DisjunctiveMatcher final { -public: - // Takes a collection of individual REs and builds a disjunctive - // matcher for the set. - RE_DisjunctiveMatcher(const std::vector& REs); - - // See MatchDisjunction() above. - void Match(const String* s, std::vector& matches); - -private: - std::unique_ptr matcher; -}; - } // namespace zeek diff --git a/src/Val.cc b/src/Val.cc index d8bd8d1897..4304570df7 100644 --- a/src/Val.cc +++ b/src/Val.cc @@ -1455,7 +1455,7 @@ private: // from having to re-build the matcher on every insert/delete in // the common case that a whole bunch of those are done in a single // batch. - std::unique_ptr matcher = nullptr; + std::unique_ptr matcher = nullptr; // Maps matcher values to corresponding yields. When building the // matcher we insert a nil at the head to accommodate how @@ -1473,8 +1473,8 @@ VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) { Build(); } - std::vector matches; - matcher->Match(s->AsString(), matches); + std::vector matches; + matcher->MatchSet(s->AsString(), matches); for ( auto m : matches ) results->Append(matcher_yields[m]); @@ -1488,7 +1488,9 @@ void detail::TablePatternMatcher::Build() { auto& tbl_dict = *tbl->Get(); auto& tbl_hash = *tbl->GetTableHash(); - std::vector patterns; + + zeek::detail::string_list pattern_list; + zeek::detail::int_list index_list; // We need to hold on to recovered hash key values so they don't // get lost once a loop iteration goes out of scope. @@ -1499,13 +1501,18 @@ void detail::TablePatternMatcher::Build() { auto v = iter.value; auto vl = tbl_hash.RecoverVals(*k); - patterns.push_back(vl->AsListVal()->Idx(0)->AsPattern()); + char* pt = const_cast(vl->AsListVal()->Idx(0)->AsPattern()->PatternText()); + pattern_list.push_back(pt); + index_list.push_back(pattern_list.size()); matcher_yields.push_back(v->GetVal()); hash_key_vals.push_back(std::move(vl)); } - matcher = std::make_unique(patterns); + matcher = std::make_unique(detail::MATCH_EXACTLY); + + if ( ! matcher->CompileSet(pattern_list, index_list) ) + reporter->FatalError("failed compile set for disjunctive matching"); } TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) {