support for indexing "table[pattern] of T" with strings to get multi-matches

This commit is contained in:
Vern Paxson 2023-11-01 11:23:48 +01:00 committed by Arne Welzel
parent a5a79d3f3a
commit 699549eb45
8 changed files with 228 additions and 2 deletions

View file

@ -2382,6 +2382,18 @@ IndexExpr::IndexExpr(ExprPtr arg_op1, ListExprPtr arg_op2, bool arg_is_slice, bo
if ( IsError() ) if ( IsError() )
return; return;
if ( op1->GetType()->Tag() == TYPE_TABLE ) { // Check for a table[pattern] being indexed by a string
auto table_type = op1->GetType()->AsTableType();
auto& it = table_type->GetIndexTypes();
auto& rhs_type = op2->GetType()->AsTypeList()->GetTypes();
if ( it.size() == 1 && it[0]->Tag() == TYPE_PATTERN && table_type->Yield() && rhs_type.size() == 1 &&
rhs_type[0]->Tag() == TYPE_STRING ) {
is_pattern_table = true;
SetType(make_intrusive<VectorType>(op1->GetType()->Yield()));
return;
}
}
int match_type = op1->GetType()->MatchesIndex(op2->AsListExpr()); int match_type = op1->GetType()->MatchesIndex(op2->AsListExpr());
if ( match_type == DOES_NOT_MATCH_INDEX ) { if ( match_type == DOES_NOT_MATCH_INDEX ) {
@ -2532,7 +2544,12 @@ ValPtr IndexExpr::Fold(Val* v1, Val* v2) const {
return index_slice(vect, lv); return index_slice(vect, lv);
} break; } break;
case TYPE_TABLE: v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2}); break; case TYPE_TABLE:
if ( is_pattern_table )
return v1->AsTableVal()->LookupPattern(v2->AsListVal()->Idx(0)->AsStringVal());
v = v1->AsTableVal()->FindOrDefault({NewRef{}, v2});
break;
case TYPE_STRING: return index_string(v1->AsString(), v2->AsListVal()); case TYPE_STRING: return index_string(v1->AsString(), v2->AsListVal());

View file

@ -1012,6 +1012,7 @@ protected:
bool is_slice; bool is_slice;
bool is_inside_when; bool is_inside_when;
// Set by the constructor when the indexed operand is a table with a single
// pattern index and a yield, and the index expression is a single string.
// Fold() then returns a vector of all matching yields rather than doing
// a regular element lookup.
bool is_pattern_table = false;
}; };
// The following execute the heart of IndexExpr functionality for // The following execute the heart of IndexExpr functionality for

View file

@ -251,6 +251,33 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n) {
return 0; return 0;
} }
void Specific_RE_Matcher::MatchDisjunction(const String* s, std::vector<int>& matches) {
auto bv = s->Bytes();
auto n = s->Len();
ASSERT(dfa);
DFA_State* d = dfa->StartState();
d = d->Xtion(ecs[SYM_BOL], dfa);
while ( d ) {
if ( --n < 0 )
break;
int ec = ecs[*(bv++)];
d = d->Xtion(ec, dfa);
}
if ( d )
d = d->Xtion(ecs[SYM_EOL], dfa);
if ( d )
if ( auto a_set = d->Accept() )
for ( auto a : *a_set )
matches.push_back(a);
}
void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); } void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); }
inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) { inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) {
@ -425,6 +452,23 @@ void RE_Matcher::MakeSingleLine() {
bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); } bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); }
// Builds one exact-match matcher for the whole set by joining the text of
// every member with the internal "||" operator, giving a pattern of the
// form "||<re1>||<re2>...".
RE_DisjunctiveMatcher::RE_DisjunctiveMatcher(const std::vector<const RE_Matcher*>& REs) {
    // NOTE(review): a member whose own text contains "||" would presumably
    // be tokenized as a disjunction separator too -- confirm against the
    // scanner rules.
    std::string combined;
    for ( const auto* member : REs ) {
        combined += "||";
        combined += member->PatternText();
    }

    matcher = std::make_unique<detail::Specific_RE_Matcher>(detail::MATCH_EXACTLY);
    matcher->SetPat(combined.c_str());

    // NOTE(review): the result is only checked via ASSERT; verify what
    // happens on a compile failure in builds where ASSERT is disabled.
    auto status = matcher->Compile();
    ASSERT(status);
}
// Replaces the contents of `matches` with the accept numbers of every
// member RE that matches `s`.
void RE_DisjunctiveMatcher::Match(const String* s, std::vector<int>& matches) {
    // The underlying matcher only appends, so drop any stale results first.
    matches.clear();
    matcher->MatchDisjunction(s, matches);
}
TEST_SUITE("re_matcher") { TEST_SUITE("re_matcher") {
TEST_CASE("simple_pattern") { TEST_CASE("simple_pattern") {
RE_Matcher match("[0-9]+"); RE_Matcher match("[0-9]+");

View file

@ -36,6 +36,7 @@ extern CCL* curr_ccl;
extern NFA_Machine* nfa; extern NFA_Machine* nfa;
extern Specific_RE_Matcher* rem; extern Specific_RE_Matcher* rem;
extern const char* RE_parse_input; extern const char* RE_parse_input;
// Counter used by the RE parser to number the members of a disjunction:
// it is incremented for each member and the new value becomes that
// member's accept number. RE_set_input() resets it to 0.
extern int RE_accept_num;
extern int clower(int); extern int clower(int);
extern void synerr(const char str[]); extern void synerr(const char str[]);
@ -104,6 +105,17 @@ public:
int Match(const String* s); int Match(const String* s);
int Match(const u_char* bv, int n); int Match(const u_char* bv, int n);
// A disjunction is a collection of regular expressions (that under
// the hood are matched as a single RE, not serially) for which
// the match operation returns *all* of the matches. Disjunctions
// are constructed using the internal "||" RE operator, and each
// match is reported as the left-to-right position of the RE that
// matched. IMPORTANT: the first RE is numbered 1, not 0.
//
// Matches are appended to "matches"; the vector is not cleared first.
//
// Note that there's no guarantee regarding the ordering of the
// returned matches if there is more than one.
void MatchDisjunction(const String* s, std::vector<int>& matches);
int LongestMatch(const char* s); int LongestMatch(const char* s);
int LongestMatch(const String* s); int LongestMatch(const String* s);
int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true); int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true);
@ -244,4 +256,17 @@ protected:
bool is_single_line = false; bool is_single_line = false;
}; };
// Matches a string against a whole collection of regular expressions in
// one pass and reports every member that matches.
class RE_DisjunctiveMatcher final {
public:
    // Takes a collection of individual REs and builds a disjunctive
    // matcher for the set. Declared explicit so that a vector of patterns
    // is never silently converted into a matcher.
    explicit RE_DisjunctiveMatcher(const std::vector<const RE_Matcher*>& REs);

    // Replaces the contents of "matches" with the positions of the REs
    // that match "s". Positions refer to the order of the REs passed to
    // the constructor and start at 1, not 0. The ordering of the returned
    // positions is not guaranteed.
    void Match(const String* s, std::vector<int>& matches);

private:
    // The single compiled matcher covering all member REs.
    std::unique_ptr<detail::Specific_RE_Matcher> matcher;
};
} // namespace zeek } // namespace zeek

View file

@ -1427,6 +1427,93 @@ static void find_nested_record_types(const TypePtr& t, std::set<RecordType*>* fo
} }
} }
using PatternValPtr = IntrusivePtr<PatternVal>;
// Support class for returning multiple values from a table[pattern]
// when indexed with a string.
class TablePatternMatcher {
public:
TablePatternMatcher(const TableVal* _tbl, TypePtr _yield) : tbl(_tbl) {
vtype = make_intrusive<VectorType>(std::move(_yield));
}
~TablePatternMatcher() { Clear(); }
void Insert(ValPtr pat, ValPtr yield) { Clear(); }
void Remove(ValPtr pat) { Clear(); }
void Clear() {
delete matcher;
matcher = nullptr;
}
VectorValPtr Lookup(const StringVal* s);
private:
void Build();
const TableVal* tbl;
VectorTypePtr vtype;
// If matcher is nil then we know we need to build it. This gives
// us an easy way to cache matchers in the common case that these
// sorts of tables don't change their elements very often (indeed,
// they'll frequently be constructed just once), and also keeps us
// from having to re-build the matcher on every insert/delete in
// the common case that a whole bunch of those are done in a single
// batch.
RE_DisjunctiveMatcher* matcher = nullptr;
// Maps matcher values to corresponding yields. When building the
// matcher we insert a nil at the head to accommodate how
// disjunctive matchers use numbering starting at 1 rather than 0.
std::vector<ValPtr> matcher_yields;
};
VectorValPtr TablePatternMatcher::Lookup(const StringVal* s) {
auto results = make_intrusive<VectorVal>(vtype);
if ( ! matcher ) {
if ( tbl->Get()->Length() == 0 )
return results;
Build();
}
std::vector<int> matches;
matcher->Match(s->AsString(), matches);
for ( auto m : matches )
results->Append(matcher_yields[m]);
return results;
}
void TablePatternMatcher::Build() {
matcher_yields.clear();
matcher_yields.push_back(nullptr);
auto& tbl_dict = *tbl->Get();
auto& tbl_hash = *tbl->GetTableHash();
std::vector<const RE_Matcher*> patterns;
// We need to hold on to recovered hash key values so they don't
// get lost once a loop iteration goes out of scope.
std::vector<ListValPtr> hash_key_vals;
for ( auto& iter : tbl_dict ) {
auto k = iter.GetHashKey();
auto v = iter.value;
auto vl = tbl_hash.RecoverVals(*k);
patterns.push_back(vl->AsListVal()->Idx(0)->AsPattern());
matcher_yields.push_back(v->GetVal());
hash_key_vals.push_back(std::move(vl));
}
matcher = new RE_DisjunctiveMatcher(patterns);
}
TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) { TableVal::TableVal(TableTypePtr t, detail::AttributesPtr a) : Val(t) {
bool ordered = (a != nullptr && a->Find(detail::ATTR_ORDERED) != nullptr); bool ordered = (a != nullptr && a->Find(detail::ATTR_ORDERED) != nullptr);
Init(std::move(t), ordered); Init(std::move(t), ordered);
@ -1460,6 +1547,10 @@ void TableVal::Init(TableTypePtr t, bool ordered) {
else else
subnets = nullptr; subnets = nullptr;
auto& it = table_type->GetIndexTypes();
if ( it.size() == 1 && it[0]->Tag() == TYPE_PATTERN && table_type->Yield() )
pattern_matcher = new TablePatternMatcher(this, table_type->Yield());
table_hash = new detail::CompositeHash(table_type->GetIndices()); table_hash = new detail::CompositeHash(table_type->GetIndices());
if ( ordered ) if ( ordered )
table_val = new PDict<TableEntryVal>(DictOrder::ORDERED); table_val = new PDict<TableEntryVal>(DictOrder::ORDERED);
@ -1476,6 +1567,7 @@ TableVal::~TableVal() {
delete table_hash; delete table_hash;
delete table_val; delete table_val;
delete subnets; delete subnets;
delete pattern_matcher;
delete expire_iterator; delete expire_iterator;
} }
@ -1486,6 +1578,9 @@ void TableVal::RemoveAll() {
delete table_val; delete table_val;
table_val = new PDict<TableEntryVal>; table_val = new PDict<TableEntryVal>;
table_val->SetDeleteFunc(table_entry_val_delete_func); table_val->SetDeleteFunc(table_entry_val_delete_func);
if ( pattern_matcher )
pattern_matcher->Clear();
} }
int TableVal::Size() const { return table_val->Length(); } int TableVal::Size() const { return table_val->Length(); }
@ -1570,6 +1665,9 @@ bool TableVal::Assign(ValPtr index, ValPtr new_val, bool broker_forward, bool* i
return false; return false;
} }
if ( pattern_matcher )
pattern_matcher->Insert(index->AsListVal()->Idx(0), new_val);
return Assign(std::move(index), std::move(k), std::move(new_val), broker_forward, iterators_invalidated); return Assign(std::move(index), std::move(k), std::move(new_val), broker_forward, iterators_invalidated);
} }
@ -1925,6 +2023,13 @@ TableValPtr TableVal::LookupSubnetValues(const SubNetVal* search) {
return nt; return nt;
} }
// Returns a vector of the yields of all entries whose pattern index
// matches "s". Only valid for tables that have a pattern matcher; calling
// it on any other table is reported as an internal error.
VectorValPtr TableVal::LookupPattern(const StringVal* s) {
    const bool has_matcher = pattern_matcher != nullptr;

    if ( ! has_matcher )
        reporter->InternalError("LookupPattern called on wrong table type");

    return pattern_matcher->Lookup(s);
}
bool TableVal::UpdateTimestamp(Val* index) { bool TableVal::UpdateTimestamp(Val* index) {
TableEntryVal* v; TableEntryVal* v;
@ -2105,8 +2210,14 @@ ValPtr TableVal::Remove(const Val& index, bool broker_forward, bool* iterators_i
va = v->GetVal() ? v->GetVal() : IntrusivePtr{NewRef{}, this}; va = v->GetVal() ? v->GetVal() : IntrusivePtr{NewRef{}, this};
if ( subnets && ! subnets->Remove(&index) ) if ( subnets && ! subnets->Remove(&index) )
// VP: not clear to me this should be an internal warning,
// since Zeek doesn't otherwise complain about removing
// non-existent table elements.
reporter->InternalWarning("index not in prefix table"); reporter->InternalWarning("index not in prefix table");
if ( pattern_matcher )
pattern_matcher->Remove(index.AsListVal()->Idx(0));
delete v; delete v;
Modified(); Modified();

View file

@ -718,6 +718,8 @@ protected:
TableVal* table; TableVal* table;
}; };
class TablePatternMatcher;
class TableVal final : public Val, public notifier::detail::Modifiable { class TableVal final : public Val, public notifier::detail::Modifiable {
public: public:
explicit TableVal(TableTypePtr t, detail::AttributesPtr attrs = nullptr); explicit TableVal(TableTypePtr t, detail::AttributesPtr attrs = nullptr);
@ -863,6 +865,11 @@ public:
// Causes an internal error if called for any other kind of table. // Causes an internal error if called for any other kind of table.
TableValPtr LookupSubnetValues(const SubNetVal* s); TableValPtr LookupSubnetValues(const SubNetVal* s);
// For a table[pattern], return a vector of all yields whose pattern
// index matches the given string. The vector is empty if no pattern
// matches.
// Causes an internal error if called for any other kind of table.
VectorValPtr LookupPattern(const StringVal* s);
// Sets the timestamp for the given index to network time. // Sets the timestamp for the given index to network time.
// Returns false if index does not exist. // Returns false if index does not exist.
bool UpdateTimestamp(Val* index); bool UpdateTimestamp(Val* index);
@ -1032,6 +1039,7 @@ protected:
TableValTimer* timer; TableValTimer* timer;
RobustDictIterator<TableEntryVal>* expire_iterator; RobustDictIterator<TableEntryVal>* expire_iterator;
detail::PrefixTable* subnets; detail::PrefixTable* subnets;
// Helper for string lookups in a table[pattern]. Created in Init() only
// for tables with a single pattern index and a yield (nil otherwise),
// and deleted in the destructor.
TablePatternMatcher* pattern_matcher = nullptr;
ValPtr def_val; ValPtr def_val;
detail::ExprPtr change_func; detail::ExprPtr change_func;
std::string broker_store; std::string broker_store;

View file

@ -21,6 +21,7 @@ void yyerror(const char msg[]);
%} %}
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE %token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
%token TOK_DISJUNCTION
%union { %union {
int int_val; int int_val;
@ -32,7 +33,7 @@ void yyerror(const char msg[]);
%type <int_val> TOK_CHAR TOK_NUMBER %type <int_val> TOK_CHAR TOK_NUMBER
%type <cce_val> TOK_CCE %type <cce_val> TOK_CCE
%type <ccl_val> TOK_CCL ccl full_ccl %type <ccl_val> TOK_CCL ccl full_ccl
%type <mach_val> re singleton series string %type <mach_val> re singleton series string disjunction
%destructor { delete $$; } <mach_val> %destructor { delete $$; } <mach_val>
@ -40,6 +41,9 @@ void yyerror(const char msg[]);
flexrule : re flexrule : re
{ $1->AddAccept(1); zeek::detail::nfa = $1; } { $1->AddAccept(1); zeek::detail::nfa = $1; }
| disjunction
{ zeek::detail::nfa = $1; }
| error | error
{ return 1; } { return 1; }
; ;
@ -51,6 +55,18 @@ re : re '|' series
{ $$ = new zeek::detail::NFA_Machine(new zeek::detail::EpsilonState()); } { $$ = new zeek::detail::NFA_Machine(new zeek::detail::EpsilonState()); }
; ;
/* A disjunction is a sequence of REs, each one introduced by
 * TOK_DISJUNCTION. Every member receives its own accept number, taken
 * from RE_accept_num after incrementing it, and the members are combined
 * into a single machine via make_alternate(). */
disjunction : disjunction TOK_DISJUNCTION re
{
$3->AddAccept(++zeek::detail::RE_accept_num);
$$ = zeek::detail::make_alternate($1, $3);
}
| TOK_DISJUNCTION re
{
$2->AddAccept(++zeek::detail::RE_accept_num);
$$ = $2;
}
;
series : series singleton series : series singleton
{ $1->AppendMachine($2); $$ = $1; } { $1->AppendMachine($2); $$ = $1; }
| singleton | singleton

View file

@ -23,6 +23,7 @@
#include "re-parse.h" #include "re-parse.h"
const char* zeek::detail::RE_parse_input = nullptr; const char* zeek::detail::RE_parse_input = nullptr;
int zeek::detail::RE_accept_num = 0;
#define RET_CCE(func) \ #define RET_CCE(func) \
BEGIN(SC_CCL); \ BEGIN(SC_CCL); \
@ -143,6 +144,8 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
} }
} }
"||" return TOK_DISJUNCTION;
[|*+?.(){}] return yytext[0]; [|*+?.(){}] return yytext[0];
. yylval.int_val = yytext[0]; return TOK_CHAR; . yylval.int_val = yytext[0]; return TOK_CHAR;
\n return 0; // treat as end of pattern \n return 0; // treat as end of pattern
@ -237,6 +240,7 @@ YY_BUFFER_STATE RE_buf;
void RE_set_input(const char* str) void RE_set_input(const char* str)
{ {
zeek::detail::RE_parse_input = str; zeek::detail::RE_parse_input = str;
zeek::detail::RE_accept_num = 0;
RE_buf = yy_scan_string(str); RE_buf = yy_scan_string(str);
} }