RE: Remove RE_DisjunctiveMatcher and re-use MatchAll()

It seems we can just open-code the CompileSet() usage in the TablePatternMatcher
helper without indirecting through another class. Further, add the collection
of indices to MatchAll() rather than duplicating its code in
MatchDisjunction(). MatchAll() doesn't seem to be widely used.
This commit is contained in:
Arne Welzel 2023-11-01 15:43:15 +01:00
parent 501b582bc7
commit 9ae99cdc44
3 changed files with 32 additions and 79 deletions

View file

@ -194,6 +194,10 @@ bool Specific_RE_Matcher::MatchAll(const String* s) {
return MatchAll(s->Bytes(), s->Len());
}
// Runs MatchAll() over the complete contents of s, handing it the caller's
// vector so that the accept indices of the final DFA state get appended to
// it. This wrapper does not clear the vector itself. Returns MatchAll()'s
// result.
bool Specific_RE_Matcher::MatchSet(const String* s, std::vector<AcceptIdx>& matches) {
    auto* collected = &matches;
    return MatchAll(s->Bytes(), s->Len(), collected);
}
int Specific_RE_Matcher::Match(const char* s) { return Match((const u_char*)(s), strlen(s)); }
// Convenience overload: forwards the bytes and length of s to the
// byte-buffer overload of Match().
int Specific_RE_Matcher::Match(const String* s) {
    auto bytes = s->Bytes();
    auto len = s->Len();
    return Match(bytes, len);
}
@ -202,7 +206,7 @@ int Specific_RE_Matcher::LongestMatch(const char* s) { return LongestMatch((cons
// Convenience overload: forwards the bytes and length of s to the
// byte-buffer overload of LongestMatch(), leaving its remaining arguments
// at their defaults.
int Specific_RE_Matcher::LongestMatch(const String* s) {
    auto bytes = s->Bytes();
    auto len = s->Len();
    return LongestMatch(bytes, len);
}
bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) {
bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n, std::vector<AcceptIdx>* matches) {
if ( ! dfa )
// An empty pattern matches "all" iff what's being
// matched is empty.
@ -222,6 +226,11 @@ bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) {
if ( d )
d = d->Xtion(ecs[SYM_EOL], dfa);
if ( d && matches )
if ( const auto* a_set = d->Accept() )
for ( auto a : *a_set )
matches->push_back(a);
return d && d->Accept() != nullptr;
}
@ -255,33 +264,6 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n) {
return 0;
}
void Specific_RE_Matcher::MatchDisjunction(const String* s, std::vector<int>& matches) {
auto bv = s->Bytes();
auto n = s->Len();
ASSERT(dfa);
DFA_State* d = dfa->StartState();
d = d->Xtion(ecs[SYM_BOL], dfa);
while ( d ) {
if ( --n < 0 )
break;
int ec = ecs[*(bv++)];
d = d->Xtion(ec, dfa);
}
if ( d )
d = d->Xtion(ecs[SYM_EOL], dfa);
if ( d )
if ( auto a_set = d->Accept() )
for ( auto a : *a_set )
matches.push_back(a);
}
// Delegates to the DFA's own Dump(), writing to f. The dfa pointer is
// dereferenced unconditionally.
void Specific_RE_Matcher::Dump(FILE* f) {
    dfa->Dump(f);
}
inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) {
@ -456,26 +438,6 @@ void RE_Matcher::MakeSingleLine() {
// Compiles both underlying matchers, passing lazy through to each.
// re_exact is only compiled if re_anywhere compiled successfully; the
// result is true only when both succeed.
bool RE_Matcher::Compile(bool lazy) {
    if ( ! re_anywhere->Compile(lazy) )
        return false;

    if ( ! re_exact->Compile(lazy) )
        return false;

    return true;
}
// Builds a single exact-match matcher out of the pattern texts of all REs.
// Each pattern is paired with its 1-based position in REs, which is the
// index later reported for it. Failure to compile the set is fatal.
RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector<const RE_Matcher*>& REs)
    : matcher(std::make_unique<detail::Specific_RE_Matcher>(detail::MATCH_EXACTLY)) {
    zeek::detail::string_list texts;
    zeek::detail::int_list positions;

    for ( const auto* pattern : REs ) {
        // string_list holds non-const char*, hence the const_cast.
        texts.push_back(const_cast<char*>(pattern->PatternText()));

        // Taken after the push, so the first pattern gets index 1.
        positions.push_back(texts.size());
    }

    const bool compiled = matcher->CompileSet(texts, positions);
    if ( ! compiled )
        reporter->FatalError("failed compile set for disjunctive matcher");
}
// Empties matches, then lets the underlying matcher's MatchDisjunction()
// fill it for s.
void RE_DisjunctiveMatcher::Match(const String* s, std::vector<int>& matches) {
    matches.clear();
    matcher->MatchDisjunction(s, matches);
}
TEST_SUITE("re_matcher") {
TEST_CASE("simple_pattern") {
RE_Matcher match("[0-9]+");

View file

@ -96,6 +96,14 @@ public:
// to the matching expressions. (idx must not contain zeros).
bool CompileSet(const string_list& set, const int_list& idx);
// For use with CompileSet() to collect indices of all matched
// expressions into the matches vector. The matches vector is
// populated with the indices of all matching expressions provided
// to CompileSet()'s set and idx arguments. Indices are appended;
// any existing content of the vector is left in place.
//
// Behaves as MatchAll(), consuming the complete input string, and
// returns the result of that MatchAll() call.
bool MatchSet(const String* s, std::vector<AcceptIdx>& matches);
// Returns the position in s just beyond where the first match
// occurs, or 0 if there is no such position in s. Note that
// if the pattern matches empty strings, matching continues
@ -104,17 +112,6 @@ public:
int Match(const String* s);
int Match(const u_char* bv, int n);
// A disjunction is a collection of regular expressions (that under
// the hood are matched as a single RE, not serially) for which
// the match operation returns *all* of the matches. Disjunctions
// are constructed using the internal "||" RE operator, and the
// matches are returned as indices into the position, left-to-right,
// of which REs matched. IMPORTANT: the first RE is numbered 1, not 0.
//
// Note that there's no guarantee regarding the ordering of the
// returned matches if there is more than one.
void MatchDisjunction(const String* s, std::vector<int>& matches);
int LongestMatch(const char* s);
int LongestMatch(const String* s);
int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true);
@ -136,7 +133,7 @@ protected:
// appending to an existing pattern_text.
void AddPat(const char* pat, const char* orig_fmt, const char* app_fmt);
bool MatchAll(const u_char* bv, int n);
bool MatchAll(const u_char* bv, int n, std::vector<AcceptIdx>* matches = nullptr);
match_type mt;
bool multiline;
@ -255,17 +252,4 @@ protected:
bool is_single_line = false;
};
// Wraps a single Specific_RE_Matcher compiled from a set of REs so that one
// match operation reports every RE in the set that matches.
class RE_DisjunctiveMatcher final {
public:
// Takes a collection of individual REs and builds a disjunctive
// matcher for the set. Each RE is identified by its 1-based position
// in REs.
RE_DisjunctiveMatcher(const std::vector<const RE_Matcher*>& REs);
// Clears matches and then fills it with the indices of all REs that
// match s. See detail::Specific_RE_Matcher::MatchDisjunction() for
// the matching semantics.
void Match(const String* s, std::vector<int>& matches);
private:
// The compiled matcher for the whole set; created in the constructor.
std::unique_ptr<detail::Specific_RE_Matcher> matcher;
};
} // namespace zeek

View file

@ -1455,7 +1455,7 @@ private:
// from having to re-build the matcher on every insert/delete in
// the common case that a whole bunch of those are done in a single
// batch.
std::unique_ptr<RE_DisjunctiveMatcher> matcher = nullptr;
std::unique_ptr<detail::Specific_RE_Matcher> matcher = nullptr;
// Maps matcher values to corresponding yields. When building the
// matcher we insert a nil at the head to accommodate how
@ -1473,8 +1473,8 @@ VectorValPtr detail::TablePatternMatcher::Lookup(const StringValPtr& s) {
Build();
}
std::vector<int> matches;
matcher->Match(s->AsString(), matches);
std::vector<AcceptIdx> matches;
matcher->MatchSet(s->AsString(), matches);
for ( auto m : matches )
results->Append(matcher_yields[m]);
@ -1488,7 +1488,9 @@ void detail::TablePatternMatcher::Build() {
auto& tbl_dict = *tbl->Get();
auto& tbl_hash = *tbl->GetTableHash();
std::vector<const RE_Matcher*> patterns;
zeek::detail::string_list pattern_list;
zeek::detail::int_list index_list;
// We need to hold on to recovered hash key values so they don't
// get lost once a loop iteration goes out of scope.
@ -1499,13 +1501,18 @@ void detail::TablePatternMatcher::Build() {
auto v = iter.value;
auto vl = tbl_hash.RecoverVals(*k);
patterns.push_back(vl->AsListVal()->Idx(0)->AsPattern());
char* pt = const_cast<char*>(vl->AsListVal()->Idx(0)->AsPattern()->PatternText());
pattern_list.push_back(pt);
index_list.push_back(pattern_list.size());
matcher_yields.push_back(v->GetVal());
hash_key_vals.push_back(std::move(vl));
}
matcher = std::make_unique<RE_DisjunctiveMatcher>(patterns);
matcher = std::make_unique<detail::Specific_RE_Matcher>(detail::MATCH_EXACTLY);
if ( ! matcher->CompileSet(pattern_list, index_list) )
reporter->FatalError("failed compile set for disjunctive matching");
}
TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) {