Add support for /s modifier to RE matcher and parser

This commit is contained in:
Tim Wojtulewicz 2022-07-25 16:24:19 -07:00
parent f67f6e4507
commit 18126c2d50
9 changed files with 109 additions and 24 deletions

4
NEWS
View file

@ -24,6 +24,10 @@ Breaking Changes
New Functionality
-----------------
- Added support for the /s regular expression modifier. Using this modifier in
patterns in Zeek scripts will cause the '.' character to also match newline
characters.
Changed Functionality
---------------------

View file

@ -18,6 +18,7 @@ zeek::detail::CCL* zeek::detail::curr_ccl = nullptr;
zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr;
zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr;
bool zeek::detail::case_insensitive = false;
bool zeek::detail::re_single_line = false;
extern int RE_parse(void);
extern void RE_set_input(const char* str);
@ -32,6 +33,7 @@ Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, bool arg_multiline)
: mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM)
{
any_ccl = nullptr;
single_line_ccl = nullptr;
dfa = nullptr;
ecs = nullptr;
accepted = new AcceptingSet();
@ -46,10 +48,22 @@ Specific_RE_Matcher::~Specific_RE_Matcher()
delete accepted;
}
CCL* Specific_RE_Matcher::AnyCCL()
CCL* Specific_RE_Matcher::AnyCCL(bool single_line_mode)
{
if ( single_line_mode )
{
if ( ! single_line_ccl )
{
single_line_ccl = new CCL();
single_line_ccl->Negate();
EC()->CCL_Use(single_line_ccl);
}
return single_line_ccl;
}
if ( ! any_ccl )
{ // Create the '.' character class.
{
any_ccl = new CCL();
if ( ! multiline )
any_ccl->Add('\n');
@ -98,6 +112,12 @@ void Specific_RE_Matcher::MakeCaseInsensitive()
pattern_text = util::fmt(fmt, pattern_text.c_str());
}
// Rewrites the stored pattern so that it is enclosed in a "(?s:...)" group.
// The scanner turns that prefix into TOK_SINGLE_LINE and sets
// re_single_line, which makes the parser build '.' from
// AnyCCL(true) for everything inside the group.
void Specific_RE_Matcher::MakeSingleLine()
	{
	pattern_text = util::fmt("(?s:%s)", pattern_text.c_str());
	}
bool Specific_RE_Matcher::Compile(bool lazy)
{
if ( pattern_text.empty() )
@ -394,13 +414,10 @@ static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, c
const char* text1 = re1->PatternText();
const char* text2 = re2->PatternText();
int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
size_t n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
char* merge_text = new char[n];
snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2);
RE_Matcher* merge = new RE_Matcher(merge_text);
delete[] merge_text;
std::string merge_text = util::fmt("(%s)%s(%s)", text1, merge_op, text2);
RE_Matcher* merge = new RE_Matcher(merge_text.c_str());
merge->Compile();
@ -461,6 +478,14 @@ void RE_Matcher::MakeCaseInsensitive()
is_case_insensitive = true;
}
// Switches this matcher to single-line (/s) mode. Both underlying matchers
// (the "anywhere" and the "exact" one) get their pattern text rewritten, and
// the mode is recorded so IsSingleLine() reports it.
void RE_Matcher::MakeSingleLine()
	{
	is_single_line = true;

	re_exact->MakeSingleLine();
	re_anywhere->MakeSingleLine();
	}
bool RE_Matcher::Compile(bool lazy)
{
return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
@ -523,15 +548,51 @@ TEST_SUITE("re_matcher")
CHECK(match.MatchExactly("aBc"));
CHECK(match.MatchExactly("nop"));
CHECK_FALSE(match.MatchExactly("NoP"));
}
// TODO: this part isn't working at all. There's something about the second call
// to Compile() that's breaking something.
// match.MakeCaseInsensitive();
// match.Compile();
// CHECK(strcmp(match.PatternText(), "(?i:((?i:^?([a-m]+)$?))|(^?([n-z]+)$?))") == 0);
// CHECK(match.MatchExactly("aBc"));
// CHECK(match.MatchExactly("nop"));
// CHECK(match.MatchExactly("NoP"));
// Exercises the /s ("single line") modifier: after MakeSingleLine() the
// pattern text is wrapped in "(?s:...)" and '.' must also match '\n'.
TEST_CASE("single_line_mode")
	{
	// A bare ".*" has to consume the embedded newline.
	RE_Matcher match(".*");
	match.MakeSingleLine();
	match.Compile();
	CHECK(strcmp(match.PatternText(), "(?s:^?(.*)$?)") == 0);
	CHECK(match.MatchExactly("abc\ndef"));

	// ".*" between literal text. The match must be checked on match2:
	// checking `match` (".*") here would succeed for any input and would
	// not exercise this pattern at all.
	RE_Matcher match2("fOO.*bAR");
	match2.MakeSingleLine();
	match2.Compile();
	CHECK(strcmp(match2.PatternText(), "(?s:^?(fOO.*bAR)$?)") == 0);
	CHECK(match2.MatchExactly("fOOab\ncdbAR"));

	// A single '.' matches both an ordinary character and a newline.
	RE_Matcher match3("b.r");
	match3.MakeSingleLine();
	match3.Compile();
	CHECK(match3.MatchExactly("bar"));
	CHECK(match3.MatchExactly("b\nr"));

	// Adding a second pattern after MakeSingleLine() must not affect how
	// '.' in the first pattern matches.
	RE_Matcher match4("a.c");
	match4.MakeSingleLine();
	match4.AddPat("def");
	match4.Compile();
	CHECK(match4.MatchExactly("abc"));
	CHECK(match4.MatchExactly("a\nc"));
	}
// Builds the disjunction of a single-line pattern and a plain pattern and
// checks that inputs for either side match, including a newline where the
// single-line side has '.'.
TEST_CASE("disjunction")
	{
	// Left operand: '.' in single-line mode.
	RE_Matcher single_line_re("a.c");
	single_line_re.MakeSingleLine();
	single_line_re.Compile();

	// Right operand: plain literal without any modifier.
	RE_Matcher literal_re("def");
	literal_re.Compile();

	auto merged = detail::RE_Matcher_disjunction(&single_line_re, &literal_re);

	CHECK(merged->MatchExactly("abc"));
	CHECK(merged->MatchExactly("a.c"));
	CHECK(merged->MatchExactly("a\nc"));
	CHECK(merged->MatchExactly("def"));

	// The test deletes the returned matcher itself.
	delete merged;
	}
}

View file

@ -33,6 +33,7 @@ class Specific_RE_Matcher;
class CCL;
extern bool case_insensitive;
extern bool re_single_line;
extern CCL* curr_ccl;
extern NFA_Machine* nfa;
extern Specific_RE_Matcher* rem;
@ -65,7 +66,7 @@ public:
void AddPat(const char* pat);
void MakeCaseInsensitive();
void SetSingleLineMode();
void MakeSingleLine();
void SetPat(const char* pat) { pattern_text = pat; }
@ -91,7 +92,7 @@ public:
return nullptr;
}
CCL* LookupCCL(int index) { return ccl_list[index]; }
CCL* AnyCCL();
CCL* AnyCCL(bool single_line_mode = false);
void ConvertCCLs();
@ -147,8 +148,10 @@ protected:
EquivClass equiv_class;
int* ecs;
DFA_Machine* dfa;
CCL* any_ccl;
AcceptingSet* accepted;
CCL* any_ccl;
CCL* single_line_ccl;
};
class RE_Match_State
@ -208,6 +211,9 @@ public:
void MakeCaseInsensitive();
bool IsCaseInsensitive() const { return is_case_insensitive; }
void MakeSingleLine();
bool IsSingleLine() const { return is_single_line; }
bool Compile(bool lazy = false);
// Returns true if s exactly matches the pattern, false otherwise.
@ -243,6 +249,7 @@ protected:
detail::Specific_RE_Matcher* re_exact;
bool is_case_insensitive = false;
bool is_single_line = false;
};
} // namespace zeek

View file

@ -922,8 +922,7 @@ expr:
re->MakeCaseInsensitive();
if ( $4.single_line )
{
}
re->MakeSingleLine();
re->Compile();
$$ = new ConstExpr(make_intrusive<PatternVal>(re));

View file

@ -20,7 +20,7 @@ namespace zeek::detail {
void yyerror(const char msg[]);
%}
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
%union {
int int_val;
@ -112,7 +112,8 @@ singleton : singleton '*'
| '.'
{
$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(zeek::detail::rem->AnyCCL()));
$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(
zeek::detail::rem->AnyCCL(zeek::detail::re_single_line)));
}
| full_ccl
@ -134,6 +135,9 @@ singleton : singleton '*'
| TOK_CASE_INSENSITIVE re ')'
{ $$ = $2; zeek::detail::case_insensitive = false; }
| TOK_SINGLE_LINE re ')'
{ $$ = $2; zeek::detail::re_single_line = false; }
| TOK_CHAR
{
auto sym = $1;

View file

@ -116,6 +116,7 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
}
"(?i:" zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE;
"(?s:" zeek::detail::re_single_line = true; return TOK_SINGLE_LINE;
[a-zA-Z] {
if ( zeek::detail::case_insensitive )

View file

@ -33,3 +33,7 @@ case-sensitive pattern (PASS)
(?i:...) pattern construct (PASS)
(?i:...) pattern construct (FAIL)
(?i:...) pattern construct (PASS)
/s missing (PASS)
/s pattern modifier (PASS)
/s pattern disjunction (PASS)
/s pattern concatenation (PASS)

View file

@ -65,4 +65,9 @@ event zeek_init()
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
test_case( "/s missing", /fOO.*bAR/ != "fOOab\ncdbAR");
test_case( "/s pattern modifier", /fOO.*bAR/s == "fOOab\ncdbAR");
test_case( "/s pattern disjunction", /b.r/s | /bez/ == "b\nr" );
test_case( "/s pattern concatenation", /b.r/s & /bez/ == "b\nrbez" );
}