diff --git a/NEWS b/NEWS index bd056bc781..99fd3b494a 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,10 @@ Breaking Changes New Functionality ----------------- +- Added support for the /s regular expression modifier. Using this modifier in + patterns in Zeek scripts will cause the '.' character to also match newline + characters. + Changed Functionality --------------------- diff --git a/src/RE.cc b/src/RE.cc index 13d21e59e5..e6389de943 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -18,6 +18,7 @@ zeek::detail::CCL* zeek::detail::curr_ccl = nullptr; zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr; zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr; bool zeek::detail::case_insensitive = false; +bool zeek::detail::re_single_line = false; /* true only while the RE scanner/parser is inside a "(?s:" group */ extern int RE_parse(void); extern void RE_set_input(const char* str); @@ -32,6 +33,7 @@ Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, bool arg_multiline) : mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM) { any_ccl = nullptr; + single_line_ccl = nullptr; dfa = nullptr; ecs = nullptr; accepted = new AcceptingSet(); @@ -46,10 +48,22 @@ Specific_RE_Matcher::~Specific_RE_Matcher() delete accepted; } -CCL* Specific_RE_Matcher::AnyCCL() +CCL* Specific_RE_Matcher::AnyCCL(bool single_line_mode) { + if ( single_line_mode ) /* '.' inside a (?s:...) group gets its own class, built on first use */ + { + if ( ! single_line_ccl ) + { + single_line_ccl = new CCL(); + single_line_ccl->Negate(); /* nothing was Add()ed first, so this presumably leaves a class covering every symbol, '\n' included -- confirm against CCL::Negate() */ + EC()->CCL_Use(single_line_ccl); + } + + return single_line_ccl; + } + if ( ! any_ccl ) - { // Create the '.' character class. + { /* Create the default '.' character class. */ any_ccl = new CCL(); if ( ! 
multiline ) any_ccl->Add('\n'); @@ -98,6 +112,12 @@ void Specific_RE_Matcher::MakeCaseInsensitive() pattern_text = util::fmt(fmt, pattern_text.c_str()); } +void Specific_RE_Matcher::MakeSingleLine() /* wraps the stored pattern text in a (?s:...) group */ + { + const char fmt[] = "(?s:%s)"; + pattern_text = util::fmt(fmt, pattern_text.c_str()); + } + bool Specific_RE_Matcher::Compile(bool lazy) { if ( pattern_text.empty() ) @@ -394,13 +414,8 @@ static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, c const char* text1 = re1->PatternText(); const char* text2 = re2->PatternText(); - int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */; - - char* merge_text = new char[n]; - snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2); - - RE_Matcher* merge = new RE_Matcher(merge_text); - delete[] merge_text; + std::string merge_text = util::fmt("(%s)%s(%s)", text1, merge_op, text2); /* formatted without a caller-side buffer, so no length computation is required */ + RE_Matcher* merge = new RE_Matcher(merge_text.c_str()); merge->Compile(); @@ -461,6 +476,14 @@ void RE_Matcher::MakeCaseInsensitive() is_case_insensitive = true; } +void RE_Matcher::MakeSingleLine() /* applies single-line mode to both underlying matchers and records it */ + { + re_anywhere->MakeSingleLine(); + re_exact->MakeSingleLine(); + + is_single_line = true; + } + bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); @@ -523,15 +546,51 @@ TEST_SUITE("re_matcher") CHECK(match.MatchExactly("aBc")); CHECK(match.MatchExactly("nop")); CHECK_FALSE(match.MatchExactly("NoP")); + } - // TODO: this part isn't working at all. There's something about the second call - // to Compile() that's breaking something. 
- // match.MakeCaseInsensitive(); - // match.Compile(); - // CHECK(strcmp(match.PatternText(), "(?i:((?i:^?([a-m]+)$?))|(^?([n-z]+)$?))") == 0); - // CHECK(match.MatchExactly("aBc")); - // CHECK(match.MatchExactly("nop")); - // CHECK(match.MatchExactly("NoP")); + TEST_CASE("single_line_mode") + { + RE_Matcher match(".*"); + match.MakeSingleLine(); + match.Compile(); + + CHECK(strcmp(match.PatternText(), "(?s:^?(.*)$?)") == 0); + CHECK(match.MatchExactly("abc\ndef")); + + RE_Matcher match2("fOO.*bAR"); + match2.MakeSingleLine(); + match2.Compile(); + + CHECK(strcmp(match2.PatternText(), "(?s:^?(fOO.*bAR)$?)") == 0); + CHECK(match2.MatchExactly("fOOab\ncdbAR")); /* must target match2: the ".*" matcher above would accept any input */ + + RE_Matcher match3("b.r"); + match3.MakeSingleLine(); + match3.Compile(); + CHECK(match3.MatchExactly("bar")); + CHECK(match3.MatchExactly("b\nr")); + + RE_Matcher match4("a.c"); + match4.MakeSingleLine(); + match4.AddPat("def"); + match4.Compile(); + CHECK(match4.MatchExactly("abc")); + CHECK(match4.MatchExactly("a\nc")); + } + + TEST_CASE("disjunction") + { + RE_Matcher match1("a.c"); + match1.MakeSingleLine(); + match1.Compile(); + RE_Matcher match2("def"); + match2.Compile(); + auto dj = detail::RE_Matcher_disjunction(&match1, &match2); + CHECK(dj->MatchExactly("abc")); + CHECK(dj->MatchExactly("a.c")); + CHECK(dj->MatchExactly("a\nc")); + CHECK(dj->MatchExactly("def")); + delete dj; } } diff --git a/src/RE.h b/src/RE.h index c71b8dd746..28c343f655 100644 --- a/src/RE.h +++ b/src/RE.h @@ -33,6 +33,7 @@ class Specific_RE_Matcher; class CCL; extern bool case_insensitive; +extern bool re_single_line; extern CCL* curr_ccl; extern NFA_Machine* nfa; extern Specific_RE_Matcher* rem; @@ -65,7 +66,7 @@ public: void AddPat(const char* pat); void MakeCaseInsensitive(); - void SetSingleLineMode(); + void MakeSingleLine(); void SetPat(const char* pat) { pattern_text = pat; } @@ -91,7 +92,7 @@ public: return nullptr; } CCL* LookupCCL(int index) { return ccl_list[index]; } - CCL* AnyCCL(); + CCL* AnyCCL(bool 
single_line_mode = false); void ConvertCCLs(); @@ -147,8 +148,10 @@ protected: EquivClass equiv_class; int* ecs; DFA_Machine* dfa; - CCL* any_ccl; AcceptingSet* accepted; + + CCL* any_ccl; + CCL* single_line_ccl; /* '.' class for single-line mode; stays nullptr until AnyCCL(true) first builds it */ }; class RE_Match_State @@ -208,6 +211,9 @@ public: void MakeCaseInsensitive(); bool IsCaseInsensitive() const { return is_case_insensitive; } + void MakeSingleLine(); /* switches both underlying matchers to single-line mode; IsSingleLine() reports it afterwards */ + bool IsSingleLine() const { return is_single_line; } + bool Compile(bool lazy = false); // Returns true if s exactly matches the pattern, false otherwise. @@ -243,6 +249,7 @@ protected: detail::Specific_RE_Matcher* re_exact; bool is_case_insensitive = false; + bool is_single_line = false; /* set by MakeSingleLine() */ }; } // namespace zeek diff --git a/src/parse.y b/src/parse.y index 637e40c318..1336d177cf 100644 --- a/src/parse.y +++ b/src/parse.y @@ -922,8 +922,7 @@ expr: re->MakeCaseInsensitive(); if ( $4.single_line ) - { - } + re->MakeSingleLine(); /* the pattern literal carried the /s modifier */ re->Compile(); $$ = new ConstExpr(make_intrusive(re)); diff --git a/src/re-parse.y b/src/re-parse.y index 5d8d2e9a63..7a38820f90 100644 --- a/src/re-parse.y +++ b/src/re-parse.y @@ -20,7 +20,7 @@ namespace zeek::detail { void yyerror(const char msg[]); %} -%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE +%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE %union { int int_val; @@ -112,7 +112,8 @@ singleton : singleton '*' | '.' 
{ - $$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(zeek::detail::rem->AnyCCL())); + $$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State( + zeek::detail::rem->AnyCCL(zeek::detail::re_single_line))); /* picks the newline-matching '.' class while a "(?s:" group is open */ } | full_ccl @@ -134,6 +135,9 @@ singleton : singleton '*' | TOK_CASE_INSENSITIVE re ')' { $$ = $2; zeek::detail::case_insensitive = false; } + | TOK_SINGLE_LINE re ')' + { $$ = $2; zeek::detail::re_single_line = false; } /* NOTE(review): the reset is unconditional, so closing a nested (?s:...) group also ends the mode for the rest of the enclosing group -- confirm this is acceptable */ + | TOK_CHAR { auto sym = $1; diff --git a/src/re-scan.l b/src/re-scan.l index 494dc5b486..5bffd812a7 100644 --- a/src/re-scan.l +++ b/src/re-scan.l @@ -116,6 +116,7 @@ CCL_EXPR ("[:"[[:alpha:]]+":]") } "(?i:" zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE; + "(?s:" zeek::detail::re_single_line = true; return TOK_SINGLE_LINE; /* flag stays set until the parser reduces the matching ')' */ [a-zA-Z] { if ( zeek::detail::case_insensitive ) diff --git a/src/scan.l b/src/scan.l index 508c21cbfe..169af2932b 100644 --- a/src/scan.l +++ b/src/scan.l @@ -570,7 +570,7 @@ F RET_CONST(zeek::val_mgr->False()->Ref()) (\/[is]{0,2}) { BEGIN(INITIAL); - if (strlen(yytext) == 2) + if ( strlen(yytext) == 2 ) /* closing '/' followed by exactly one modifier letter */ { yylval.re_modes.ignore_case = (yytext[1] == 'i'); yylval.re_modes.single_line = (yytext[1] == 's'); diff --git a/testing/btest/Baseline/language.pattern/out b/testing/btest/Baseline/language.pattern/out index 8bbf981c12..5c1441502b 100644 --- a/testing/btest/Baseline/language.pattern/out +++ b/testing/btest/Baseline/language.pattern/out @@ -33,3 +33,7 @@ case-sensitive pattern (PASS) (?i:...) pattern construct (PASS) (?i:...) pattern construct (FAIL) (?i:...) pattern construct (PASS) +/s missing (PASS) +/s pattern modifier (PASS) +/s pattern disjunction (PASS) +/s pattern concatenation (PASS) diff --git a/testing/btest/language/pattern.zeek b/testing/btest/language/pattern.zeek index 05a84e713c..2567afc884 100644 --- a/testing/btest/language/pattern.zeek +++ b/testing/btest/language/pattern.zeek @@ -65,4 +65,9 @@ event zeek_init() test_case( "(?i:...) 
pattern construct", /foo|(?i:bar)/ in "xFOoy" ); test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" ); + test_case( "/s missing", /fOO.*bAR/ != "fOOab\ncdbAR" ); + test_case( "/s pattern modifier", /fOO.*bAR/s == "fOOab\ncdbAR" ); + test_case( "/s pattern disjunction", /b.r/s | /bez/ == "b\nr" ); + test_case( "/s pattern concatenation", /b.r/s & /bez/ == "b\nrbez" ); + }