Merge remote-tracking branch 'origin/topic/timw/1129-slash-s-patterns'

* origin/topic/timw/1129-slash-s-patterns:
  Add support for /s modifier to RE matcher and parser
  Code cleanup in RE_Matcher code
  Add basic unit tests for RE_Matcher
  Add /s modifier to parser for patterns
2025-10-02 14:48:21 +00:00 · 2022-08-02 11:33:22 -07:00 · 2022-08-02 11:33:22 -07:00 · 2cba2415fd
commit 2cba2415fd
parent 14e48733ac 18126c2d50
13 changed files with 235 additions and 60 deletions
--- a/15
+++ b/15
@ -1,3 +1,18 @@
 5.1.0-dev.309 | 2022-08-02 11:33:22 -0700
  * Add support for /s modifier to RE matcher and parser (Tim Wojtulewicz, Corelight)
  * Code cleanup in RE_Matcher code (Tim Wojtulewicz, Corelight)
    - Use std::string in Specific_RE_Matcher instead of char*
    - Change a couple of ints-as-bools to bools
  * Add basic unit tests for RE_Matcher (Tim Wojtulewicz, Corelight)
  * Add /s modifier to parser for patterns (Tim Wojtulewicz, Corelight)
  * Update gen-zam submodule [nomail] (Tim Wojtulewicz, Corelight)
 5.1.0-dev.303 | 2022-08-01 09:56:45 -0700
  * GH-1344: Give better warning when using a type that doesn't exist (Tim Wojtulewicz, Corelight)
--- a/4
+++ b/4
@ -24,6 +24,10 @@ Breaking Changes
 New Functionality
 -----------------
 - Added support for the /s regular expression modifier. Using this modifier in
  patterns in Zeek scripts will cause the '.' character to also match newline
  characters.
 Changed Functionality
 ---------------------
--- a/2
+++ b/2
@ -1 +1 @@
-5.1.0-dev.303
+5.1.0-dev.309
--- a/src/RE.cc
+++ b/src/RE.cc
@ -7,6 +7,7 @@
 #include <cstdlib>
 #include <utility>
 #include "zeek/3rdparty/doctest.h"
 #include "zeek/CCL.h"
 #include "zeek/DFA.h"
 #include "zeek/EquivClass.h"
@ -16,7 +17,8 @@
 zeek::detail::CCL* zeek::detail::curr_ccl = nullptr;
 zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr;
 zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr;
-int zeek::detail::case_insensitive = 0;
+bool zeek::detail::case_insensitive = false;
 bool zeek::detail::re_single_line = false;
 extern int RE_parse(void);
 extern void RE_set_input(const char* str);
@ -27,13 +29,11 @@ namespace zeek
 namespace detail
 	{
-Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, int arg_multiline)
+Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, bool arg_multiline)
-	: equiv_class(NUM_SYM)
+	: mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM)
 	{
 	mt = arg_mt;
 	multiline = arg_multiline;
 	any_ccl = nullptr;
-	pattern_text = nullptr;
+	single_line_ccl = nullptr;
 	dfa = nullptr;
 	ecs = nullptr;
 	accepted = new AcceptingSet();
@ -45,14 +45,25 @@ Specific_RE_Matcher::~Specific_RE_Matcher()
 		delete ccl_list[i];
 	Unref(dfa);
 	delete[] pattern_text;
 	delete accepted;
 	}
-CCL* Specific_RE_Matcher::AnyCCL()
+CCL* Specific_RE_Matcher::AnyCCL(bool single_line_mode)
 	{
 	if ( single_line_mode )
 		{
 		if ( ! single_line_ccl )
 			{
 			single_line_ccl = new CCL();
 			single_line_ccl->Negate();
 			EC()->CCL_Use(single_line_ccl);
 			}
 		return single_line_ccl;
 		}
 	if ( ! any_ccl )
-		{ // Create the '.' character class.
+		{
 		any_ccl = new CCL();
 		if ( ! multiline )
 			any_ccl->Add('\n');
@ -89,51 +100,38 @@ void Specific_RE_Matcher::AddExactPat(const char* new_pat)
 void Specific_RE_Matcher::AddPat(const char* new_pat, const char* orig_fmt, const char* app_fmt)
 	{
-	int n = strlen(new_pat);
+	if ( ! pattern_text.empty() )
-
+		pattern_text = util::fmt(app_fmt, pattern_text.c_str(), new_pat);
 	if ( pattern_text )
 		n += strlen(pattern_text) + strlen(app_fmt);
 	else
-		n += strlen(orig_fmt);
+		pattern_text = util::fmt(orig_fmt, new_pat);
 	char* s = new char[n + 5 /* slop */];
 	if ( pattern_text )
 		sprintf(s, app_fmt, pattern_text, new_pat);
 	else
 		sprintf(s, orig_fmt, new_pat);
 	delete[] pattern_text;
 	pattern_text = s;
 	}
 void Specific_RE_Matcher::MakeCaseInsensitive()
 	{
 	const char fmt[] = "(?i:%s)";
-	int n = strlen(pattern_text) + strlen(fmt);
+	pattern_text = util::fmt(fmt, pattern_text.c_str());
 	}
-	char* s = new char[n + 5 /* slop */];
+void Specific_RE_Matcher::MakeSingleLine()
-
+	{
-	snprintf(s, n + 5, fmt, pattern_text);
+	const char fmt[] = "(?s:%s)";
-
+	pattern_text = util::fmt(fmt, pattern_text.c_str());
 	delete[] pattern_text;
 	pattern_text = s;
 	}
 bool Specific_RE_Matcher::Compile(bool lazy)
 	{
-	if ( ! pattern_text )
+	if ( pattern_text.empty() )
 		return false;
 	rem = this;
-	RE_set_input(pattern_text);
+	RE_set_input(pattern_text.c_str());
 	int parse_status = RE_parse();
 	RE_done_with_scan();
 	if ( parse_status )
 		{
-		reporter->Error("error compiling pattern /%s/", pattern_text);
+		reporter->Error("error compiling pattern /%s/", pattern_text.c_str());
 		Unref(nfa);
 		nfa = nullptr;
 		return false;
@ -416,13 +414,10 @@ static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, c
 	const char* text1 = re1->PatternText();
 	const char* text2 = re2->PatternText();
-	int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
+	size_t n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
-	char* merge_text = new char[n];
+	std::string merge_text = util::fmt("(%s)%s(%s)", text1, merge_op, text2);
-	snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2);
+	RE_Matcher* merge = new RE_Matcher(merge_text.c_str());
 	RE_Matcher* merge = new RE_Matcher(merge_text);
 	delete[] merge_text;
 	merge->Compile();
@ -483,9 +478,122 @@ void RE_Matcher::MakeCaseInsensitive()
 	is_case_insensitive = true;
 	}
 // Switches this matcher into single-line (/s) mode: forwards the request to
 // both underlying matchers (the "anywhere" and the "exact" variant) and
 // records the mode so that IsSingleLine() reports it.
 void RE_Matcher::MakeSingleLine()
 	{
 	re_anywhere->MakeSingleLine();
 	re_exact->MakeSingleLine();
 	is_single_line = true;
 	}
 // Compiles both underlying matchers, passing `lazy` through to each.
 // Returns true only if both compilations succeed.
 bool RE_Matcher::Compile(bool lazy)
 	{
 	// The exact matcher is only compiled once the anywhere matcher succeeded.
 	if ( ! re_anywhere->Compile(lazy) )
 		return false;
 	return re_exact->Compile(lazy);
 	}
 TEST_SUITE("re_matcher")
 	{
 	// A single pattern: checks the stored pattern texts and both match modes.
 	TEST_CASE("simple_pattern")
 		{
 		RE_Matcher match("[0-9]+");
 		match.Compile();
 		// The original text is kept verbatim; the exact and anywhere variants
 		// are derived from it by wrapping it as asserted below.
 		CHECK(strcmp(match.OrigText(), "[0-9]+") == 0);
 		CHECK(strcmp(match.PatternText(), "^?([0-9]+)$?") == 0);
 		CHECK(strcmp(match.AnywherePatternText(), "^?(.|\\n)*([0-9]+)") == 0);
 		// Exact matching has to consume the whole input.
 		CHECK(match.MatchExactly("12345"));
 		CHECK_FALSE(match.MatchExactly("a12345"));
 		// The documentation for MatchAnywhere says that it returns the
 		// "index just beyond where the first match occurs", which I would
 		// think means *after* the match. This is returning the position
 		// where the match starts though.
 		// NOTE(review): 2 is also the length of "a1", the shortest prefix of
 		// the input that satisfies the anywhere-pattern asserted above, so the
 		// result may in fact be the index just past the earliest match --
 		// confirm against the MatchAnywhere implementation.
 		CHECK(match.MatchAnywhere("a1234bcd") == 2);
 		// Input without any digit: the expected result is 0.
 		CHECK(match.MatchAnywhere("abcd") == 0);
 		}
 	// Case-insensitive mode: the pattern text gets wrapped in "(?i:...)" and
 	// a lower-case character class then also accepts upper-case input.
 	TEST_CASE("case_insensitive_mode")
 		{
 		RE_Matcher match("[a-z]+");
 		match.MakeCaseInsensitive();
 		match.Compile();
 		CHECK(strcmp(match.PatternText(), "(?i:^?([a-z]+)$?)") == 0);
 		CHECK(match.MatchExactly("abcDEF"));
 		}
 	// Adding a second pattern yields an alternation of the two wrapped patterns.
 	TEST_CASE("multi_pattern")
 		{
 		RE_Matcher match("[0-9]+");
 		match.AddPat("[a-z]+");
 		match.Compile();
 		CHECK(strcmp(match.PatternText(), "(^?([0-9]+)$?)|(^?([a-z]+)$?)") == 0);
 		// Either alternative matches on its own ...
 		CHECK(match.MatchExactly("abc"));
 		CHECK(match.MatchExactly("123"));
 		// ... but an input mixing both satisfies neither alternative exactly.
 		CHECK_FALSE(match.MatchExactly("abc123"));
 		}
 	// A mode applied before AddPat() only covers the pattern text present at
 	// that time: the "(?i:...)" wrapper encloses the first alternative only.
 	TEST_CASE("modes_multi_pattern")
 		{
 		RE_Matcher match("[a-m]+");
 		match.MakeCaseInsensitive();
 		match.AddPat("[n-z]+");
 		match.Compile();
 		CHECK(strcmp(match.PatternText(), "((?i:^?([a-m]+)$?))|(^?([n-z]+)$?)") == 0);
 		// First alternative is case-insensitive.
 		CHECK(match.MatchExactly("aBc"));
 		// Second alternative stays case-sensitive.
 		CHECK(match.MatchExactly("nop"));
 		CHECK_FALSE(match.MatchExactly("NoP"));
 		}
 	// Single-line (/s) mode: the pattern text gets wrapped in "(?s:...)" and
 	// '.' is then expected to match newline characters as well.
 	TEST_CASE("single_line_mode")
 		{
 		// Bare wildcard spanning a newline.
 		RE_Matcher match(".*");
 		match.MakeSingleLine();
 		match.Compile();
 		CHECK(strcmp(match.PatternText(), "(?s:^?(.*)$?)") == 0);
 		CHECK(match.MatchExactly("abc\ndef"));
 		// Literal text on both sides of a wildcard that has to cross a newline.
 		RE_Matcher match2("fOO.*bAR");
 		match2.MakeSingleLine();
 		match2.Compile();
 		CHECK(strcmp(match2.PatternText(), "(?s:^?(fOO.*bAR)$?)") == 0);
 		// This must exercise match2: asserting on `match` (".*" with /s) would
 		// pass for this input regardless and leave "fOO.*bAR" untested.
 		CHECK(match2.MatchExactly("fOOab\ncdbAR"));
 		// A single '.' matches an ordinary character as well as a newline.
 		RE_Matcher match3("b.r");
 		match3.MakeSingleLine();
 		match3.Compile();
 		CHECK(match3.MatchExactly("bar"));
 		CHECK(match3.MatchExactly("b\nr"));
 		// The mode is applied before a further pattern is added; the first
 		// alternative has to keep its single-line behavior.
 		RE_Matcher match4("a.c");
 		match4.MakeSingleLine();
 		match4.AddPat("def");
 		match4.Compile();
 		CHECK(match4.MatchExactly("abc"));
 		CHECK(match4.MatchExactly("a\nc"));
 		}
 	// Disjunction of a single-line matcher with a plain one: the merged
 	// matcher has to accept inputs of either operand, including the newline
 	// case that only the /s operand allows.
 	TEST_CASE("disjunction")
 		{
 		RE_Matcher match1("a.c");
 		match1.MakeSingleLine();
 		match1.Compile();
 		RE_Matcher match2("def");
 		match2.Compile();
 		// The result is released manually with `delete` at the end of the test.
 		auto dj = detail::RE_Matcher_disjunction(&match1, &match2);
 		CHECK(dj->MatchExactly("abc"));
 		CHECK(dj->MatchExactly("a.c"));
 		CHECK(dj->MatchExactly("a\nc"));
 		CHECK(dj->MatchExactly("def"));
 		delete dj;
 		}
 	}
 	} // namespace zeek
--- a/src/RE.h
+++ b/src/RE.h
@ -32,7 +32,8 @@ class DFA_State;
 class Specific_RE_Matcher;
 class CCL;
-extern int case_insensitive;
+extern bool case_insensitive;
 extern bool re_single_line;
 extern CCL* curr_ccl;
 extern NFA_Machine* nfa;
 extern Specific_RE_Matcher* rem;
@ -59,14 +60,15 @@ enum match_type
 class Specific_RE_Matcher
 	{
 public:
-	explicit Specific_RE_Matcher(match_type mt, int multiline = 0);
+	explicit Specific_RE_Matcher(match_type mt, bool multiline = false);
 	~Specific_RE_Matcher();
 	void AddPat(const char* pat);
 	void MakeCaseInsensitive();
 	void MakeSingleLine();
-	void SetPat(const char* pat) { pattern_text = util::copy_string(pat); }
+	void SetPat(const char* pat) { pattern_text = pat; }
 	bool Compile(bool lazy = false);
@ -90,7 +92,7 @@ public:
 		return nullptr;
 		}
 	CCL* LookupCCL(int index) { return ccl_list[index]; }
-	CCL* AnyCCL();
+	CCL* AnyCCL(bool single_line_mode = false);
 	void ConvertCCLs();
@ -117,7 +119,7 @@ public:
 	EquivClass* EC() { return &equiv_class; }
-	const char* PatternText() const { return pattern_text; }
+	const char* PatternText() const { return pattern_text.c_str(); }
 	DFA_Machine* DFA() const { return dfa; }
@ -135,17 +137,21 @@ protected:
 	bool MatchAll(const u_char* bv, int n);
 	match_type mt;
-	int multiline;
+	bool multiline;
-	char* pattern_text;
+
 	std::string pattern_text;
 	std::map<std::string, std::string> defs;
 	std::map<std::string, CCL*> ccl_dict;
 	std::vector<char> modifiers;
 	PList<CCL> ccl_list;
 	EquivClass equiv_class;
 	int* ecs;
 	DFA_Machine* dfa;
 	CCL* any_ccl;
 	AcceptingSet* accepted;
 	CCL* any_ccl;
 	CCL* single_line_ccl;
 	};
 class RE_Match_State
@ -205,6 +211,9 @@ public:
 	void MakeCaseInsensitive();
 	bool IsCaseInsensitive() const { return is_case_insensitive; }
 	void MakeSingleLine();
 	bool IsSingleLine() const { return is_single_line; }
 	bool Compile(bool lazy = false);
 	// Returns true if s exactly matches the pattern, false otherwise.
@ -240,6 +249,7 @@ protected:
 	detail::Specific_RE_Matcher* re_exact;
 	bool is_case_insensitive = false;
 	bool is_single_line = false;
 	};
 	} // namespace zeek
--- a/src/RuleMatcher.cc
+++ b/src/RuleMatcher.cc
@ -526,7 +526,7 @@ void RuleMatcher::BuildPatternSets(RuleHdrTest::pattern_set_list* dst, const str
 		if ( group_exprs.length() > sig_max_group_size || i == exprs.length() )
 			{
 			RuleHdrTest::PatternSet* set = new RuleHdrTest::PatternSet;
-			set->re = new Specific_RE_Matcher(MATCH_EXACTLY, 1);
+			set->re = new Specific_RE_Matcher(MATCH_EXACTLY, true);
 			set->re->CompileSet(group_exprs, group_ids);
 			set->patterns = group_exprs;
 			set->ids = group_ids;
--- a/src/packet_analysis/protocol/teredo/Teredo.cc
+++ b/src/packet_analysis/protocol/teredo/Teredo.cc
@ -145,7 +145,7 @@ TeredoAnalyzer::TeredoAnalyzer() : zeek::packet_analysis::Analyzer("TEREDO")
 	// 	}
 	pattern_re = std::make_unique<zeek::detail::Specific_RE_Matcher>(zeek::detail::MATCH_EXACTLY,
-	                                                                 1);
+	                                                                 true);
 	pattern_re->AddPat("^(\\x00\\x00)|(\\x00\\x01)|([\\x60-\\x6f].{7}((\\x20\\x01\\x00\\x00)).{28})"
 	                   "|([\\x60-\\x6f].{23}((\\x20\\x01\\x00\\x00))).{12}");
 	pattern_re->Compile();
--- a/src/parse.y
+++ b/src/parse.y
@ -54,7 +54,7 @@
 %left '$' '[' ']' '(' ')' TOK_HAS_FIELD TOK_HAS_ATTR
 %nonassoc TOK_AS TOK_IS
-%type <b> opt_no_test opt_no_test_block TOK_PATTERN_END opt_deep when_flavor
+%type <b> opt_no_test opt_no_test_block opt_deep when_flavor
 %type <str> TOK_ID TOK_PATTERN_TEXT
 %type <id> local_id global_id def_global_id event_id global_or_event_id resolve_id begin_lambda case_type
 %type <id_l> local_id_list case_type_list
@ -77,6 +77,7 @@
 %type <capture> capture
 %type <captures> capture_list opt_captures when_captures
 %type <when_clause> when_head when_start when_clause
 %type <re_modes> TOK_PATTERN_END
 %{
 #include <cstdlib>
@ -324,6 +325,11 @@ static StmtPtr build_local(ID* id, Type* t, InitClass ic, Expr* e,
 	zeek::FuncType::Capture* capture;
 	zeek::FuncType::CaptureList* captures;
 	zeek::detail::WhenInfo* when_clause;
 	struct
 		{
 		bool ignore_case;
 		bool single_line;
 		} re_modes;
 }
 %%
@ -912,9 +918,12 @@ expr:
 			auto* re = new RE_Matcher($3);
 			delete [] $3;
-			if ( $4 )
+			if ( $4.ignore_case )
 				re->MakeCaseInsensitive();
 			if ( $4.single_line )
 				re->MakeSingleLine();
 			re->Compile();
 			$$ = new ConstExpr(make_intrusive<PatternVal>(re));
 			}
--- a/src/re-parse.y
+++ b/src/re-parse.y
@ -20,7 +20,7 @@ namespace zeek::detail {
 void yyerror(const char msg[]);
 %}
-%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE
+%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
 %union {
 	int int_val;
@ -112,7 +112,8 @@ singleton	:  singleton '*'
 		|  '.'
 			{
-			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(zeek::detail::rem->AnyCCL()));
+			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(
                zeek::detail::rem->AnyCCL(zeek::detail::re_single_line)));
 			}
 		|  full_ccl
@ -132,7 +133,10 @@ singleton	:  singleton '*'
 			{ $$ = $2; }
 		|  TOK_CASE_INSENSITIVE re ')'
-			{ $$ = $2; zeek::detail::case_insensitive = 0; }
+			{ $$ = $2; zeek::detail::case_insensitive = false; }
 		|  TOK_SINGLE_LINE re ')'
 			{ $$ = $2; zeek::detail::re_single_line = false; }
 		|  TOK_CHAR
 			{
--- a/src/re-scan.l
+++ b/src/re-scan.l
@ -115,7 +115,8 @@ CCL_EXPR	("[:"[[:alpha:]]+":]")
 				}
 			}
-	"(?i:"		zeek::detail::case_insensitive = 1; return TOK_CASE_INSENSITIVE;
+	"(?i:"		zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE;
 	"(?s:"		zeek::detail::re_single_line = true; return TOK_SINGLE_LINE;
 	[a-zA-Z]	{
 			if ( zeek::detail::case_insensitive )
--- a/src/scan.l
+++ b/src/scan.l
@ -562,13 +562,28 @@ F	RET_CONST(zeek::val_mgr->False()->Ref())
 <RE>"/"	{
 	BEGIN(INITIAL);
-	yylval.b = false;
+	yylval.re_modes.ignore_case = false;
 	yylval.re_modes.single_line = false;
 	return TOK_PATTERN_END;
 	}
-<RE>"/i"	{
+<RE>(\/[is]{0,2})	{
 	BEGIN(INITIAL);
-	yylval.b = true;
+
 	if ( strlen(yytext) == 2 )
 		{
 		yylval.re_modes.ignore_case = (yytext[1] == 'i');
 		yylval.re_modes.single_line = (yytext[1] == 's');
 		}
 	else
 		{
 		if ( yytext[1] == yytext[2] )
 			zeek::reporter->Error("pattern has duplicate mode %c", yytext[1]);
 		yylval.re_modes.ignore_case = (yytext[1] == 'i' || yytext[2] == 'i');
 		yylval.re_modes.single_line = (yytext[1] == 's' || yytext[2] == 's');
 		}
 	return TOK_PATTERN_END;
 	}
--- a/testing/btest/Baseline/language.pattern/out
+++ b/testing/btest/Baseline/language.pattern/out
@ -33,3 +33,7 @@ case-sensitive pattern (PASS)
 (?i:...) pattern construct (PASS)
 (?i:...) pattern construct (FAIL)
 (?i:...) pattern construct (PASS)
 /s missing (PASS)
 /s pattern modifier (PASS)
 /s pattern disjunction (PASS)
 /s pattern concatenation (PASS)
--- a/testing/btest/language/pattern.zeek
+++ b/testing/btest/language/pattern.zeek
@ -65,4 +65,9 @@ event zeek_init()
 	test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
 	test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
 	test_case( "/s missing", /fOO.*bAR/ != "fOOab\ncdbAR");
 	test_case( "/s pattern modifier", /fOO.*bAR/s == "fOOab\ncdbAR");
 	test_case( "/s pattern disjunction", /b.r/s | /bez/ == "b\nr" );
 	test_case( "/s pattern concatenation", /b.r/s & /bez/ == "b\nrbez" );
 }