Merge remote-tracking branch 'origin/topic/timw/1129-slash-s-patterns'

* origin/topic/timw/1129-slash-s-patterns:
  Add support for /s modifier to RE matcher and parser
  Code cleanup in RE_Matcher code
  Add basic unit tests for RE_Matcher
  Add /s modifier to parser for patterns
This commit is contained in:
Tim Wojtulewicz 2022-08-02 11:33:22 -07:00
commit 2cba2415fd
13 changed files with 235 additions and 60 deletions

15
CHANGES
View file

@ -1,3 +1,18 @@
5.1.0-dev.309 | 2022-08-02 11:33:22 -0700
* Add support for /s modifier to RE matcher and parser (Tim Wojtulewicz, Corelight)
* Code cleanup in RE_Matcher code (Tim Wojtulewicz, Corelight)
- Use std::string in Specific_RE_Matcher instead of char*
- Change a couple of ints-as-bools to bools
* Add basic unit tests for RE_Matcher (Tim Wojtulewicz, Corelight)
* Add /s modifier to parser for patterns (Tim Wojtulewicz, Corelight)
* Update gen-zam submodule [nomail] (Tim Wojtulewicz, Corelight)
5.1.0-dev.303 | 2022-08-01 09:56:45 -0700 5.1.0-dev.303 | 2022-08-01 09:56:45 -0700
* GH-1344: Give better warning when using a type that doesn't exist (Tim Wojtulewicz, Corelight) * GH-1344: Give better warning when using a type that doesn't exist (Tim Wojtulewicz, Corelight)

4
NEWS
View file

@ -24,6 +24,10 @@ Breaking Changes
New Functionality New Functionality
----------------- -----------------
- Added support for the /s regular expression modifier. Using this modifier in
patterns in Zeek scripts will cause the '.' character to also match newline
characters.
Changed Functionality Changed Functionality
--------------------- ---------------------

View file

@ -1 +1 @@
5.1.0-dev.303 5.1.0-dev.309

188
src/RE.cc
View file

@ -7,6 +7,7 @@
#include <cstdlib> #include <cstdlib>
#include <utility> #include <utility>
#include "zeek/3rdparty/doctest.h"
#include "zeek/CCL.h" #include "zeek/CCL.h"
#include "zeek/DFA.h" #include "zeek/DFA.h"
#include "zeek/EquivClass.h" #include "zeek/EquivClass.h"
@ -16,7 +17,8 @@
zeek::detail::CCL* zeek::detail::curr_ccl = nullptr; zeek::detail::CCL* zeek::detail::curr_ccl = nullptr;
zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr; zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr;
zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr; zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr;
int zeek::detail::case_insensitive = 0; bool zeek::detail::case_insensitive = false;
bool zeek::detail::re_single_line = false;
extern int RE_parse(void); extern int RE_parse(void);
extern void RE_set_input(const char* str); extern void RE_set_input(const char* str);
@ -27,13 +29,11 @@ namespace zeek
namespace detail namespace detail
{ {
Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, int arg_multiline) Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, bool arg_multiline)
: equiv_class(NUM_SYM) : mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM)
{ {
mt = arg_mt;
multiline = arg_multiline;
any_ccl = nullptr; any_ccl = nullptr;
pattern_text = nullptr; single_line_ccl = nullptr;
dfa = nullptr; dfa = nullptr;
ecs = nullptr; ecs = nullptr;
accepted = new AcceptingSet(); accepted = new AcceptingSet();
@ -45,14 +45,25 @@ Specific_RE_Matcher::~Specific_RE_Matcher()
delete ccl_list[i]; delete ccl_list[i];
Unref(dfa); Unref(dfa);
delete[] pattern_text;
delete accepted; delete accepted;
} }
CCL* Specific_RE_Matcher::AnyCCL() CCL* Specific_RE_Matcher::AnyCCL(bool single_line_mode)
{ {
if ( single_line_mode )
{
if ( ! single_line_ccl )
{
single_line_ccl = new CCL();
single_line_ccl->Negate();
EC()->CCL_Use(single_line_ccl);
}
return single_line_ccl;
}
if ( ! any_ccl ) if ( ! any_ccl )
{ // Create the '.' character class. {
any_ccl = new CCL(); any_ccl = new CCL();
if ( ! multiline ) if ( ! multiline )
any_ccl->Add('\n'); any_ccl->Add('\n');
@ -89,51 +100,38 @@ void Specific_RE_Matcher::AddExactPat(const char* new_pat)
void Specific_RE_Matcher::AddPat(const char* new_pat, const char* orig_fmt, const char* app_fmt) void Specific_RE_Matcher::AddPat(const char* new_pat, const char* orig_fmt, const char* app_fmt)
{ {
int n = strlen(new_pat); if ( ! pattern_text.empty() )
pattern_text = util::fmt(app_fmt, pattern_text.c_str(), new_pat);
if ( pattern_text )
n += strlen(pattern_text) + strlen(app_fmt);
else else
n += strlen(orig_fmt); pattern_text = util::fmt(orig_fmt, new_pat);
char* s = new char[n + 5 /* slop */];
if ( pattern_text )
sprintf(s, app_fmt, pattern_text, new_pat);
else
sprintf(s, orig_fmt, new_pat);
delete[] pattern_text;
pattern_text = s;
} }
void Specific_RE_Matcher::MakeCaseInsensitive() void Specific_RE_Matcher::MakeCaseInsensitive()
{ {
const char fmt[] = "(?i:%s)"; const char fmt[] = "(?i:%s)";
int n = strlen(pattern_text) + strlen(fmt); pattern_text = util::fmt(fmt, pattern_text.c_str());
}
char* s = new char[n + 5 /* slop */]; void Specific_RE_Matcher::MakeSingleLine()
{
snprintf(s, n + 5, fmt, pattern_text); const char fmt[] = "(?s:%s)";
pattern_text = util::fmt(fmt, pattern_text.c_str());
delete[] pattern_text;
pattern_text = s;
} }
bool Specific_RE_Matcher::Compile(bool lazy) bool Specific_RE_Matcher::Compile(bool lazy)
{ {
if ( ! pattern_text ) if ( pattern_text.empty() )
return false; return false;
rem = this; rem = this;
RE_set_input(pattern_text); RE_set_input(pattern_text.c_str());
int parse_status = RE_parse(); int parse_status = RE_parse();
RE_done_with_scan(); RE_done_with_scan();
if ( parse_status ) if ( parse_status )
{ {
reporter->Error("error compiling pattern /%s/", pattern_text); reporter->Error("error compiling pattern /%s/", pattern_text.c_str());
Unref(nfa); Unref(nfa);
nfa = nullptr; nfa = nullptr;
return false; return false;
@ -416,13 +414,10 @@ static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, c
const char* text1 = re1->PatternText(); const char* text1 = re1->PatternText();
const char* text2 = re2->PatternText(); const char* text2 = re2->PatternText();
int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */; size_t n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
char* merge_text = new char[n]; std::string merge_text = util::fmt("(%s)%s(%s)", text1, merge_op, text2);
snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2); RE_Matcher* merge = new RE_Matcher(merge_text.c_str());
RE_Matcher* merge = new RE_Matcher(merge_text);
delete[] merge_text;
merge->Compile(); merge->Compile();
@ -483,9 +478,122 @@ void RE_Matcher::MakeCaseInsensitive()
is_case_insensitive = true; is_case_insensitive = true;
} }
// Puts this matcher into single-line (/s) mode: records the mode on this
// object and forwards the request to both underlying matchers, each of which
// rewrites its own pattern text.
void RE_Matcher::MakeSingleLine()
	{
	is_single_line = true;

	// The two matchers hold independent pattern text, so the order in which
	// they are updated does not matter.
	re_exact->MakeSingleLine();
	re_anywhere->MakeSingleLine();
	}
bool RE_Matcher::Compile(bool lazy) bool RE_Matcher::Compile(bool lazy)
{ {
return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
} }
TEST_SUITE("re_matcher")
{
TEST_CASE("simple_pattern")
{
RE_Matcher match("[0-9]+");
match.Compile();
CHECK(strcmp(match.OrigText(), "[0-9]+") == 0);
CHECK(strcmp(match.PatternText(), "^?([0-9]+)$?") == 0);
CHECK(strcmp(match.AnywherePatternText(), "^?(.|\\n)*([0-9]+)") == 0);
CHECK(match.MatchExactly("12345"));
CHECK_FALSE(match.MatchExactly("a12345"));
// The documentation for MatchAnywhere says that it returns the
// "index just beyond where the first match occurs", which I would
// think means *after* the match. This is returning the position
// where the match starts though.
CHECK(match.MatchAnywhere("a1234bcd") == 2);
CHECK(match.MatchAnywhere("abcd") == 0);
}
TEST_CASE("case_insensitive_mode")
{
RE_Matcher match("[a-z]+");
match.MakeCaseInsensitive();
match.Compile();
CHECK(strcmp(match.PatternText(), "(?i:^?([a-z]+)$?)") == 0);
CHECK(match.MatchExactly("abcDEF"));
}
TEST_CASE("multi_pattern")
{
RE_Matcher match("[0-9]+");
match.AddPat("[a-z]+");
match.Compile();
CHECK(strcmp(match.PatternText(), "(^?([0-9]+)$?)|(^?([a-z]+)$?)") == 0);
CHECK(match.MatchExactly("abc"));
CHECK(match.MatchExactly("123"));
CHECK_FALSE(match.MatchExactly("abc123"));
}
TEST_CASE("modes_multi_pattern")
{
RE_Matcher match("[a-m]+");
match.MakeCaseInsensitive();
match.AddPat("[n-z]+");
match.Compile();
CHECK(strcmp(match.PatternText(), "((?i:^?([a-m]+)$?))|(^?([n-z]+)$?)") == 0);
CHECK(match.MatchExactly("aBc"));
CHECK(match.MatchExactly("nop"));
CHECK_FALSE(match.MatchExactly("NoP"));
}
// Exercises the /s ("single line") modifier: once MakeSingleLine() has been
// called, the '.' metacharacter is expected to match '\n' as well.
TEST_CASE("single_line_mode")
	{
	// A bare ".*" in single-line mode should span a newline.
	RE_Matcher match(".*");
	match.MakeSingleLine();
	match.Compile();

	CHECK(strcmp(match.PatternText(), "(?s:^?(.*)$?)") == 0);
	CHECK(match.MatchExactly("abc\ndef"));

	// A pattern with literal text on both sides of ".*".
	RE_Matcher match2("fOO.*bAR");
	match2.MakeSingleLine();
	match2.Compile();

	CHECK(strcmp(match2.PatternText(), "(?s:^?(fOO.*bAR)$?)") == 0);
	// Assert on match2 itself rather than on the catch-all ".*" matcher
	// above, so that this check actually exercises the fOO.*bAR pattern.
	CHECK(match2.MatchExactly("fOOab\ncdbAR"));

	// A single '.' must match both an ordinary character and a newline.
	RE_Matcher match3("b.r");
	match3.MakeSingleLine();
	match3.Compile();

	CHECK(match3.MatchExactly("bar"));
	CHECK(match3.MatchExactly("b\nr"));

	// The modifier is applied before a second pattern is added; the first
	// alternative must still match across a newline.
	RE_Matcher match4("a.c");
	match4.MakeSingleLine();
	match4.AddPat("def");
	match4.Compile();

	CHECK(match4.MatchExactly("abc"));
	CHECK(match4.MatchExactly("a\nc"));
	}
TEST_CASE("disjunction")
{
RE_Matcher match1("a.c");
match1.MakeSingleLine();
match1.Compile();
RE_Matcher match2("def");
match2.Compile();
auto dj = detail::RE_Matcher_disjunction(&match1, &match2);
CHECK(dj->MatchExactly("abc"));
CHECK(dj->MatchExactly("a.c"));
CHECK(dj->MatchExactly("a\nc"));
CHECK(dj->MatchExactly("def"));
delete dj;
}
}
} // namespace zeek } // namespace zeek

View file

@ -32,7 +32,8 @@ class DFA_State;
class Specific_RE_Matcher; class Specific_RE_Matcher;
class CCL; class CCL;
extern int case_insensitive; extern bool case_insensitive;
extern bool re_single_line;
extern CCL* curr_ccl; extern CCL* curr_ccl;
extern NFA_Machine* nfa; extern NFA_Machine* nfa;
extern Specific_RE_Matcher* rem; extern Specific_RE_Matcher* rem;
@ -59,14 +60,15 @@ enum match_type
class Specific_RE_Matcher class Specific_RE_Matcher
{ {
public: public:
explicit Specific_RE_Matcher(match_type mt, int multiline = 0); explicit Specific_RE_Matcher(match_type mt, bool multiline = false);
~Specific_RE_Matcher(); ~Specific_RE_Matcher();
void AddPat(const char* pat); void AddPat(const char* pat);
void MakeCaseInsensitive(); void MakeCaseInsensitive();
void MakeSingleLine();
void SetPat(const char* pat) { pattern_text = util::copy_string(pat); } void SetPat(const char* pat) { pattern_text = pat; }
bool Compile(bool lazy = false); bool Compile(bool lazy = false);
@ -90,7 +92,7 @@ public:
return nullptr; return nullptr;
} }
CCL* LookupCCL(int index) { return ccl_list[index]; } CCL* LookupCCL(int index) { return ccl_list[index]; }
CCL* AnyCCL(); CCL* AnyCCL(bool single_line_mode = false);
void ConvertCCLs(); void ConvertCCLs();
@ -117,7 +119,7 @@ public:
EquivClass* EC() { return &equiv_class; } EquivClass* EC() { return &equiv_class; }
const char* PatternText() const { return pattern_text; } const char* PatternText() const { return pattern_text.c_str(); }
DFA_Machine* DFA() const { return dfa; } DFA_Machine* DFA() const { return dfa; }
@ -135,17 +137,21 @@ protected:
bool MatchAll(const u_char* bv, int n); bool MatchAll(const u_char* bv, int n);
match_type mt; match_type mt;
int multiline; bool multiline;
char* pattern_text;
std::string pattern_text;
std::map<std::string, std::string> defs; std::map<std::string, std::string> defs;
std::map<std::string, CCL*> ccl_dict; std::map<std::string, CCL*> ccl_dict;
std::vector<char> modifiers;
PList<CCL> ccl_list; PList<CCL> ccl_list;
EquivClass equiv_class; EquivClass equiv_class;
int* ecs; int* ecs;
DFA_Machine* dfa; DFA_Machine* dfa;
CCL* any_ccl;
AcceptingSet* accepted; AcceptingSet* accepted;
CCL* any_ccl;
CCL* single_line_ccl;
}; };
class RE_Match_State class RE_Match_State
@ -205,6 +211,9 @@ public:
void MakeCaseInsensitive(); void MakeCaseInsensitive();
bool IsCaseInsensitive() const { return is_case_insensitive; } bool IsCaseInsensitive() const { return is_case_insensitive; }
void MakeSingleLine();
bool IsSingleLine() const { return is_single_line; }
bool Compile(bool lazy = false); bool Compile(bool lazy = false);
// Returns true if s exactly matches the pattern, false otherwise. // Returns true if s exactly matches the pattern, false otherwise.
@ -240,6 +249,7 @@ protected:
detail::Specific_RE_Matcher* re_exact; detail::Specific_RE_Matcher* re_exact;
bool is_case_insensitive = false; bool is_case_insensitive = false;
bool is_single_line = false;
}; };
} // namespace zeek } // namespace zeek

View file

@ -526,7 +526,7 @@ void RuleMatcher::BuildPatternSets(RuleHdrTest::pattern_set_list* dst, const str
if ( group_exprs.length() > sig_max_group_size || i == exprs.length() ) if ( group_exprs.length() > sig_max_group_size || i == exprs.length() )
{ {
RuleHdrTest::PatternSet* set = new RuleHdrTest::PatternSet; RuleHdrTest::PatternSet* set = new RuleHdrTest::PatternSet;
set->re = new Specific_RE_Matcher(MATCH_EXACTLY, 1); set->re = new Specific_RE_Matcher(MATCH_EXACTLY, true);
set->re->CompileSet(group_exprs, group_ids); set->re->CompileSet(group_exprs, group_ids);
set->patterns = group_exprs; set->patterns = group_exprs;
set->ids = group_ids; set->ids = group_ids;

View file

@ -145,7 +145,7 @@ TeredoAnalyzer::TeredoAnalyzer() : zeek::packet_analysis::Analyzer("TEREDO")
// } // }
pattern_re = std::make_unique<zeek::detail::Specific_RE_Matcher>(zeek::detail::MATCH_EXACTLY, pattern_re = std::make_unique<zeek::detail::Specific_RE_Matcher>(zeek::detail::MATCH_EXACTLY,
1); true);
pattern_re->AddPat("^(\\x00\\x00)|(\\x00\\x01)|([\\x60-\\x6f].{7}((\\x20\\x01\\x00\\x00)).{28})" pattern_re->AddPat("^(\\x00\\x00)|(\\x00\\x01)|([\\x60-\\x6f].{7}((\\x20\\x01\\x00\\x00)).{28})"
"|([\\x60-\\x6f].{23}((\\x20\\x01\\x00\\x00))).{12}"); "|([\\x60-\\x6f].{23}((\\x20\\x01\\x00\\x00))).{12}");
pattern_re->Compile(); pattern_re->Compile();

View file

@ -54,7 +54,7 @@
%left '$' '[' ']' '(' ')' TOK_HAS_FIELD TOK_HAS_ATTR %left '$' '[' ']' '(' ')' TOK_HAS_FIELD TOK_HAS_ATTR
%nonassoc TOK_AS TOK_IS %nonassoc TOK_AS TOK_IS
%type <b> opt_no_test opt_no_test_block TOK_PATTERN_END opt_deep when_flavor %type <b> opt_no_test opt_no_test_block opt_deep when_flavor
%type <str> TOK_ID TOK_PATTERN_TEXT %type <str> TOK_ID TOK_PATTERN_TEXT
%type <id> local_id global_id def_global_id event_id global_or_event_id resolve_id begin_lambda case_type %type <id> local_id global_id def_global_id event_id global_or_event_id resolve_id begin_lambda case_type
%type <id_l> local_id_list case_type_list %type <id_l> local_id_list case_type_list
@ -77,6 +77,7 @@
%type <capture> capture %type <capture> capture
%type <captures> capture_list opt_captures when_captures %type <captures> capture_list opt_captures when_captures
%type <when_clause> when_head when_start when_clause %type <when_clause> when_head when_start when_clause
%type <re_modes> TOK_PATTERN_END
%{ %{
#include <cstdlib> #include <cstdlib>
@ -324,6 +325,11 @@ static StmtPtr build_local(ID* id, Type* t, InitClass ic, Expr* e,
zeek::FuncType::Capture* capture; zeek::FuncType::Capture* capture;
zeek::FuncType::CaptureList* captures; zeek::FuncType::CaptureList* captures;
zeek::detail::WhenInfo* when_clause; zeek::detail::WhenInfo* when_clause;
struct
{
bool ignore_case;
bool single_line;
} re_modes;
} }
%% %%
@ -912,9 +918,12 @@ expr:
auto* re = new RE_Matcher($3); auto* re = new RE_Matcher($3);
delete [] $3; delete [] $3;
if ( $4 ) if ( $4.ignore_case )
re->MakeCaseInsensitive(); re->MakeCaseInsensitive();
if ( $4.single_line )
re->MakeSingleLine();
re->Compile(); re->Compile();
$$ = new ConstExpr(make_intrusive<PatternVal>(re)); $$ = new ConstExpr(make_intrusive<PatternVal>(re));
} }

View file

@ -20,7 +20,7 @@ namespace zeek::detail {
void yyerror(const char msg[]); void yyerror(const char msg[]);
%} %}
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE %token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
%union { %union {
int int_val; int int_val;
@ -112,7 +112,8 @@ singleton : singleton '*'
| '.' | '.'
{ {
$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(zeek::detail::rem->AnyCCL())); $$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(
zeek::detail::rem->AnyCCL(zeek::detail::re_single_line)));
} }
| full_ccl | full_ccl
@ -132,7 +133,10 @@ singleton : singleton '*'
{ $$ = $2; } { $$ = $2; }
| TOK_CASE_INSENSITIVE re ')' | TOK_CASE_INSENSITIVE re ')'
{ $$ = $2; zeek::detail::case_insensitive = 0; } { $$ = $2; zeek::detail::case_insensitive = false; }
| TOK_SINGLE_LINE re ')'
{ $$ = $2; zeek::detail::re_single_line = false; }
| TOK_CHAR | TOK_CHAR
{ {

View file

@ -115,7 +115,8 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
} }
} }
"(?i:" zeek::detail::case_insensitive = 1; return TOK_CASE_INSENSITIVE; "(?i:" zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE;
"(?s:" zeek::detail::re_single_line = true; return TOK_SINGLE_LINE;
[a-zA-Z] { [a-zA-Z] {
if ( zeek::detail::case_insensitive ) if ( zeek::detail::case_insensitive )

View file

@ -562,13 +562,28 @@ F RET_CONST(zeek::val_mgr->False()->Ref())
<RE>"/" { <RE>"/" {
BEGIN(INITIAL); BEGIN(INITIAL);
yylval.b = false; yylval.re_modes.ignore_case = false;
yylval.re_modes.single_line = false;
return TOK_PATTERN_END; return TOK_PATTERN_END;
} }
<RE>"/i" { <RE>(\/[is]{0,2}) {
BEGIN(INITIAL); BEGIN(INITIAL);
yylval.b = true;
if ( strlen(yytext) == 2 )
{
yylval.re_modes.ignore_case = (yytext[1] == 'i');
yylval.re_modes.single_line = (yytext[1] == 's');
}
else
{
if ( yytext[1] == yytext[2] )
zeek::reporter->Error("pattern has duplicate mode %c", yytext[1]);
yylval.re_modes.ignore_case = (yytext[1] == 'i' || yytext[2] == 'i');
yylval.re_modes.single_line = (yytext[1] == 's' || yytext[2] == 's');
}
return TOK_PATTERN_END; return TOK_PATTERN_END;
} }

View file

@ -33,3 +33,7 @@ case-sensitive pattern (PASS)
(?i:...) pattern construct (PASS) (?i:...) pattern construct (PASS)
(?i:...) pattern construct (FAIL) (?i:...) pattern construct (FAIL)
(?i:...) pattern construct (PASS) (?i:...) pattern construct (PASS)
/s missing (PASS)
/s pattern modifier (PASS)
/s pattern disjunction (PASS)
/s pattern concatenation (PASS)

View file

@ -65,4 +65,9 @@ event zeek_init()
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" ); test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" ); test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
test_case( "/s missing", /fOO.*bAR/ != "fOOab\ncdbAR");
test_case( "/s pattern modifier", /fOO.*bAR/s == "fOOab\ncdbAR");
test_case( "/s pattern disjunction", /b.r/s | /bez/ == "b\nr" );
test_case( "/s pattern concatenation", /b.r/s & /bez/ == "b\nrbez" );
} }