Add support for /s modifier to RE matcher and parser

This commit is contained in:
Tim Wojtulewicz 2022-07-25 16:24:19 -07:00
parent f67f6e4507
commit 18126c2d50
9 changed files with 109 additions and 24 deletions

4
NEWS
View file

@ -24,6 +24,10 @@ Breaking Changes
New Functionality New Functionality
----------------- -----------------
- Added support for the /s regular expression modifier. Using this modifier in
patterns in Zeek scripts will cause the '.' character to also match newline
characters.
Changed Functionality Changed Functionality
--------------------- ---------------------

View file

@ -18,6 +18,7 @@ zeek::detail::CCL* zeek::detail::curr_ccl = nullptr;
zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr; zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr;
zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr; zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr;
bool zeek::detail::case_insensitive = false; bool zeek::detail::case_insensitive = false;
bool zeek::detail::re_single_line = false;
extern int RE_parse(void); extern int RE_parse(void);
extern void RE_set_input(const char* str); extern void RE_set_input(const char* str);
@ -32,6 +33,7 @@ Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, bool arg_multiline)
: mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM) : mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM)
{ {
any_ccl = nullptr; any_ccl = nullptr;
single_line_ccl = nullptr;
dfa = nullptr; dfa = nullptr;
ecs = nullptr; ecs = nullptr;
accepted = new AcceptingSet(); accepted = new AcceptingSet();
@ -46,10 +48,22 @@ Specific_RE_Matcher::~Specific_RE_Matcher()
delete accepted; delete accepted;
} }
CCL* Specific_RE_Matcher::AnyCCL() CCL* Specific_RE_Matcher::AnyCCL(bool single_line_mode)
{ {
if ( single_line_mode )
{
if ( ! single_line_ccl )
{
single_line_ccl = new CCL();
single_line_ccl->Negate();
EC()->CCL_Use(single_line_ccl);
}
return single_line_ccl;
}
if ( ! any_ccl ) if ( ! any_ccl )
{ // Create the '.' character class. {
any_ccl = new CCL(); any_ccl = new CCL();
if ( ! multiline ) if ( ! multiline )
any_ccl->Add('\n'); any_ccl->Add('\n');
@ -98,6 +112,12 @@ void Specific_RE_Matcher::MakeCaseInsensitive()
pattern_text = util::fmt(fmt, pattern_text.c_str()); pattern_text = util::fmt(fmt, pattern_text.c_str());
} }
// Rewrites the stored pattern text so that it sits inside a "(?s:...)"
// group. The RE scanner recognizes that prefix as TOK_SINGLE_LINE, and the
// parser then builds every '.' inside the group from the single-line
// character class.
void Specific_RE_Matcher::MakeSingleLine()
	{
	pattern_text = util::fmt("(?s:%s)", pattern_text.c_str());
	}
bool Specific_RE_Matcher::Compile(bool lazy) bool Specific_RE_Matcher::Compile(bool lazy)
{ {
if ( pattern_text.empty() ) if ( pattern_text.empty() )
@ -394,13 +414,10 @@ static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, c
const char* text1 = re1->PatternText(); const char* text1 = re1->PatternText();
const char* text2 = re2->PatternText(); const char* text2 = re2->PatternText();
int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */; size_t n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
char* merge_text = new char[n]; std::string merge_text = util::fmt("(%s)%s(%s)", text1, merge_op, text2);
snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2); RE_Matcher* merge = new RE_Matcher(merge_text.c_str());
RE_Matcher* merge = new RE_Matcher(merge_text);
delete[] merge_text;
merge->Compile(); merge->Compile();
@ -461,6 +478,14 @@ void RE_Matcher::MakeCaseInsensitive()
is_case_insensitive = true; is_case_insensitive = true;
} }
// Switches this matcher to single-line mode: records the mode so that
// IsSingleLine() reports it, then applies it to both underlying matchers.
void RE_Matcher::MakeSingleLine()
	{
	is_single_line = true;

	re_anywhere->MakeSingleLine();
	re_exact->MakeSingleLine();
	}
bool RE_Matcher::Compile(bool lazy) bool RE_Matcher::Compile(bool lazy)
{ {
return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
@ -523,15 +548,51 @@ TEST_SUITE("re_matcher")
CHECK(match.MatchExactly("aBc")); CHECK(match.MatchExactly("aBc"));
CHECK(match.MatchExactly("nop")); CHECK(match.MatchExactly("nop"));
CHECK_FALSE(match.MatchExactly("NoP")); CHECK_FALSE(match.MatchExactly("NoP"));
}
// Exercises the "(?s:...)" single-line construct: once MakeSingleLine() has
// been applied, '.' is expected to match '\n' in addition to every other
// character.
TEST_CASE("single_line_mode")
	{
	// A bare ".*" in single-line mode must span an embedded newline.
	RE_Matcher match(".*");
	match.MakeSingleLine();
	match.Compile();

	CHECK(strcmp(match.PatternText(), "(?s:^?(.*)$?)") == 0);
	CHECK(match.MatchExactly("abc\ndef"));

	// Literal text on both sides of ".*". The assertion has to go through
	// match2: `match` is ".*" and would accept this input regardless of
	// whether match2 works, which would make the check meaningless.
	RE_Matcher match2("fOO.*bAR");
	match2.MakeSingleLine();
	match2.Compile();

	CHECK(strcmp(match2.PatternText(), "(?s:^?(fOO.*bAR)$?)") == 0);
	CHECK(match2.MatchExactly("fOOab\ncdbAR"));

	// A single '.' accepts both an ordinary character and a newline.
	RE_Matcher match3("b.r");
	match3.MakeSingleLine();
	match3.Compile();

	CHECK(match3.MatchExactly("bar"));
	CHECK(match3.MatchExactly("b\nr"));

	// Adding a further pattern after MakeSingleLine() must leave the
	// single-line behavior of the original pattern intact.
	RE_Matcher match4("a.c");
	match4.MakeSingleLine();
	match4.AddPat("def");
	match4.Compile();

	CHECK(match4.MatchExactly("abc"));
	CHECK(match4.MatchExactly("a\nc"));
	}
// Merges a single-line pattern with a plain one via RE_Matcher_disjunction
// and checks that the merged matcher accepts inputs for either operand,
// including a newline in place of the '.' of the single-line operand.
TEST_CASE("disjunction")
	{
	RE_Matcher dotted("a.c");
	dotted.MakeSingleLine();
	dotted.Compile();

	RE_Matcher literal("def");
	literal.Compile();

	auto* merged = detail::RE_Matcher_disjunction(&dotted, &literal);

	// Inputs for the single-line operand.
	CHECK(merged->MatchExactly("abc"));
	CHECK(merged->MatchExactly("a.c"));
	CHECK(merged->MatchExactly("a\nc"));

	// Input for the plain operand.
	CHECK(merged->MatchExactly("def"));

	// The test deletes the merged matcher itself once it is done with it.
	delete merged;
	}
} }

View file

@ -33,6 +33,7 @@ class Specific_RE_Matcher;
class CCL; class CCL;
extern bool case_insensitive; extern bool case_insensitive;
extern bool re_single_line;
extern CCL* curr_ccl; extern CCL* curr_ccl;
extern NFA_Machine* nfa; extern NFA_Machine* nfa;
extern Specific_RE_Matcher* rem; extern Specific_RE_Matcher* rem;
@ -65,7 +66,7 @@ public:
void AddPat(const char* pat); void AddPat(const char* pat);
void MakeCaseInsensitive(); void MakeCaseInsensitive();
void SetSingleLineMode(); void MakeSingleLine();
void SetPat(const char* pat) { pattern_text = pat; } void SetPat(const char* pat) { pattern_text = pat; }
@ -91,7 +92,7 @@ public:
return nullptr; return nullptr;
} }
CCL* LookupCCL(int index) { return ccl_list[index]; } CCL* LookupCCL(int index) { return ccl_list[index]; }
CCL* AnyCCL(); CCL* AnyCCL(bool single_line_mode = false);
void ConvertCCLs(); void ConvertCCLs();
@ -147,8 +148,10 @@ protected:
EquivClass equiv_class; EquivClass equiv_class;
int* ecs; int* ecs;
DFA_Machine* dfa; DFA_Machine* dfa;
CCL* any_ccl;
AcceptingSet* accepted; AcceptingSet* accepted;
CCL* any_ccl;
CCL* single_line_ccl;
}; };
class RE_Match_State class RE_Match_State
@ -208,6 +211,9 @@ public:
void MakeCaseInsensitive(); void MakeCaseInsensitive();
bool IsCaseInsensitive() const { return is_case_insensitive; } bool IsCaseInsensitive() const { return is_case_insensitive; }
void MakeSingleLine();
bool IsSingleLine() const { return is_single_line; }
bool Compile(bool lazy = false); bool Compile(bool lazy = false);
// Returns true if s exactly matches the pattern, false otherwise. // Returns true if s exactly matches the pattern, false otherwise.
@ -243,6 +249,7 @@ protected:
detail::Specific_RE_Matcher* re_exact; detail::Specific_RE_Matcher* re_exact;
bool is_case_insensitive = false; bool is_case_insensitive = false;
bool is_single_line = false;
}; };
} // namespace zeek } // namespace zeek

View file

@ -922,8 +922,7 @@ expr:
re->MakeCaseInsensitive(); re->MakeCaseInsensitive();
if ( $4.single_line ) if ( $4.single_line )
{ re->MakeSingleLine();
}
re->Compile(); re->Compile();
$$ = new ConstExpr(make_intrusive<PatternVal>(re)); $$ = new ConstExpr(make_intrusive<PatternVal>(re));

View file

@ -20,7 +20,7 @@ namespace zeek::detail {
void yyerror(const char msg[]); void yyerror(const char msg[]);
%} %}
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE %token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
%union { %union {
int int_val; int int_val;
@ -112,7 +112,8 @@ singleton : singleton '*'
| '.' | '.'
{ {
$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(zeek::detail::rem->AnyCCL())); $$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(
zeek::detail::rem->AnyCCL(zeek::detail::re_single_line)));
} }
| full_ccl | full_ccl
@ -134,6 +135,9 @@ singleton : singleton '*'
| TOK_CASE_INSENSITIVE re ')' | TOK_CASE_INSENSITIVE re ')'
{ $$ = $2; zeek::detail::case_insensitive = false; } { $$ = $2; zeek::detail::case_insensitive = false; }
| TOK_SINGLE_LINE re ')'
{ $$ = $2; zeek::detail::re_single_line = false; }
| TOK_CHAR | TOK_CHAR
{ {
auto sym = $1; auto sym = $1;

View file

@ -116,6 +116,7 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
} }
"(?i:" zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE; "(?i:" zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE;
"(?s:" zeek::detail::re_single_line = true; return TOK_SINGLE_LINE;
[a-zA-Z] { [a-zA-Z] {
if ( zeek::detail::case_insensitive ) if ( zeek::detail::case_insensitive )

View file

@ -570,7 +570,7 @@ F RET_CONST(zeek::val_mgr->False()->Ref())
<RE>(\/[is]{0,2}) { <RE>(\/[is]{0,2}) {
BEGIN(INITIAL); BEGIN(INITIAL);
if (strlen(yytext) == 2) if ( strlen(yytext) == 2 )
{ {
yylval.re_modes.ignore_case = (yytext[1] == 'i'); yylval.re_modes.ignore_case = (yytext[1] == 'i');
yylval.re_modes.single_line = (yytext[1] == 's'); yylval.re_modes.single_line = (yytext[1] == 's');

View file

@ -33,3 +33,7 @@ case-sensitive pattern (PASS)
(?i:...) pattern construct (PASS) (?i:...) pattern construct (PASS)
(?i:...) pattern construct (FAIL) (?i:...) pattern construct (FAIL)
(?i:...) pattern construct (PASS) (?i:...) pattern construct (PASS)
/s missing (PASS)
/s pattern modifier (PASS)
/s pattern disjunction (PASS)
/s pattern concatenation (PASS)

View file

@ -65,4 +65,9 @@ event zeek_init()
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" ); test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" ); test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
test_case( "/s missing", /fOO.*bAR/ != "fOOab\ncdbAR");
test_case( "/s pattern modifier", /fOO.*bAR/s == "fOOab\ncdbAR");
test_case( "/s pattern disjunction", /b.r/s | /bez/ == "b\nr" );
test_case( "/s pattern concatenation", /b.r/s & /bez/ == "b\nrbez" );
} }