mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Add support for /s modifier to RE matcher and parser
This commit is contained in:
parent
f67f6e4507
commit
18126c2d50
9 changed files with 109 additions and 24 deletions
4
NEWS
4
NEWS
|
@ -24,6 +24,10 @@ Breaking Changes
|
|||
New Functionality
|
||||
-----------------
|
||||
|
||||
- Added support for the /s regular expression modifier. Using this modifier in
|
||||
patterns in Zeek scripts will cause the '.' character to also match newline
|
||||
characters.
|
||||
|
||||
Changed Functionality
|
||||
---------------------
|
||||
|
||||
|
|
93
src/RE.cc
93
src/RE.cc
|
@ -18,6 +18,7 @@ zeek::detail::CCL* zeek::detail::curr_ccl = nullptr;
|
|||
zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr;
|
||||
zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr;
|
||||
bool zeek::detail::case_insensitive = false;
|
||||
bool zeek::detail::re_single_line = false;
|
||||
|
||||
extern int RE_parse(void);
|
||||
extern void RE_set_input(const char* str);
|
||||
|
@ -32,6 +33,7 @@ Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, bool arg_multiline)
|
|||
: mt(arg_mt), multiline(arg_multiline), equiv_class(NUM_SYM)
|
||||
{
|
||||
any_ccl = nullptr;
|
||||
single_line_ccl = nullptr;
|
||||
dfa = nullptr;
|
||||
ecs = nullptr;
|
||||
accepted = new AcceptingSet();
|
||||
|
@ -46,10 +48,22 @@ Specific_RE_Matcher::~Specific_RE_Matcher()
|
|||
delete accepted;
|
||||
}
|
||||
|
||||
CCL* Specific_RE_Matcher::AnyCCL()
|
||||
CCL* Specific_RE_Matcher::AnyCCL(bool single_line_mode)
|
||||
{
|
||||
if ( single_line_mode )
|
||||
{
|
||||
if ( ! single_line_ccl )
|
||||
{
|
||||
single_line_ccl = new CCL();
|
||||
single_line_ccl->Negate();
|
||||
EC()->CCL_Use(single_line_ccl);
|
||||
}
|
||||
|
||||
return single_line_ccl;
|
||||
}
|
||||
|
||||
if ( ! any_ccl )
|
||||
{ // Create the '.' character class.
|
||||
{
|
||||
any_ccl = new CCL();
|
||||
if ( ! multiline )
|
||||
any_ccl->Add('\n');
|
||||
|
@ -98,6 +112,12 @@ void Specific_RE_Matcher::MakeCaseInsensitive()
|
|||
pattern_text = util::fmt(fmt, pattern_text.c_str());
|
||||
}
|
||||
|
||||
void Specific_RE_Matcher::MakeSingleLine()
|
||||
{
|
||||
const char fmt[] = "(?s:%s)";
|
||||
pattern_text = util::fmt(fmt, pattern_text.c_str());
|
||||
}
|
||||
|
||||
bool Specific_RE_Matcher::Compile(bool lazy)
|
||||
{
|
||||
if ( pattern_text.empty() )
|
||||
|
@ -394,13 +414,10 @@ static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, c
|
|||
const char* text1 = re1->PatternText();
|
||||
const char* text2 = re2->PatternText();
|
||||
|
||||
int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
|
||||
size_t n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */;
|
||||
|
||||
char* merge_text = new char[n];
|
||||
snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2);
|
||||
|
||||
RE_Matcher* merge = new RE_Matcher(merge_text);
|
||||
delete[] merge_text;
|
||||
std::string merge_text = util::fmt("(%s)%s(%s)", text1, merge_op, text2);
|
||||
RE_Matcher* merge = new RE_Matcher(merge_text.c_str());
|
||||
|
||||
merge->Compile();
|
||||
|
||||
|
@ -461,6 +478,14 @@ void RE_Matcher::MakeCaseInsensitive()
|
|||
is_case_insensitive = true;
|
||||
}
|
||||
|
||||
void RE_Matcher::MakeSingleLine()
|
||||
{
|
||||
re_anywhere->MakeSingleLine();
|
||||
re_exact->MakeSingleLine();
|
||||
|
||||
is_single_line = true;
|
||||
}
|
||||
|
||||
bool RE_Matcher::Compile(bool lazy)
|
||||
{
|
||||
return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
|
||||
|
@ -523,15 +548,51 @@ TEST_SUITE("re_matcher")
|
|||
CHECK(match.MatchExactly("aBc"));
|
||||
CHECK(match.MatchExactly("nop"));
|
||||
CHECK_FALSE(match.MatchExactly("NoP"));
|
||||
}
|
||||
|
||||
// TODO: this part isn't working at all. There's something about the second call
|
||||
// to Compile() that's breaking something.
|
||||
// match.MakeCaseInsensitive();
|
||||
// match.Compile();
|
||||
// CHECK(strcmp(match.PatternText(), "(?i:((?i:^?([a-m]+)$?))|(^?([n-z]+)$?))") == 0);
|
||||
// CHECK(match.MatchExactly("aBc"));
|
||||
// CHECK(match.MatchExactly("nop"));
|
||||
// CHECK(match.MatchExactly("NoP"));
|
||||
// Verifies that MakeSingleLine() wraps the pattern text in "(?s:...)" and
// that '.' then also matches newline characters.
TEST_CASE("single_line_mode")
	{
	// Bare ".*": check the rewritten pattern text and a match across '\n'.
	RE_Matcher match(".*");
	match.MakeSingleLine();
	match.Compile();

	CHECK(strcmp(match.PatternText(), "(?s:^?(.*)$?)") == 0);
	CHECK(match.MatchExactly("abc\ndef"));

	// ".*" surrounded by literal text.
	RE_Matcher match2("fOO.*bAR");
	match2.MakeSingleLine();
	match2.Compile();

	CHECK(strcmp(match2.PatternText(), "(?s:^?(fOO.*bAR)$?)") == 0);
	// This must test match2, not match: ".*" in single-line mode accepts
	// any input, so a check against match could never fail and would leave
	// the "fOO.*bAR" pattern untested.
	CHECK(match2.MatchExactly("fOOab\ncdbAR"));

	// A single '.' must accept both an ordinary character and a newline.
	RE_Matcher match3("b.r");
	match3.MakeSingleLine();
	match3.Compile();
	CHECK(match3.MatchExactly("bar"));
	CHECK(match3.MatchExactly("b\nr"));

	// A pattern added after single-line mode was enabled; the first
	// pattern is expected to keep matching newlines with '.'.
	RE_Matcher match4("a.c");
	match4.MakeSingleLine();
	match4.AddPat("def");
	match4.Compile();
	CHECK(match4.MatchExactly("abc"));
	CHECK(match4.MatchExactly("a\nc"));
	}
|
||||
|
||||
// Verifies that a disjunction of a single-line (/s) pattern with an ordinary
// pattern still lets '.' in the /s operand match a newline.
TEST_CASE("disjunction")
	{
	// Left operand: '.' in single-line mode.
	RE_Matcher single_line_re("a.c");
	single_line_re.MakeSingleLine();
	single_line_re.Compile();

	// Right operand: plain literal pattern without any modifier.
	RE_Matcher plain_re("def");
	plain_re.Compile();

	auto merged = detail::RE_Matcher_disjunction(&single_line_re, &plain_re);

	// Inputs accepted through the left operand.
	CHECK(merged->MatchExactly("abc"));
	CHECK(merged->MatchExactly("a.c"));
	CHECK(merged->MatchExactly("a\nc"));

	// Input accepted through the right operand.
	CHECK(merged->MatchExactly("def"));

	delete merged;
	}
|
||||
}
|
||||
|
||||
|
|
13
src/RE.h
13
src/RE.h
|
@ -33,6 +33,7 @@ class Specific_RE_Matcher;
|
|||
class CCL;
|
||||
|
||||
extern bool case_insensitive;
|
||||
extern bool re_single_line;
|
||||
extern CCL* curr_ccl;
|
||||
extern NFA_Machine* nfa;
|
||||
extern Specific_RE_Matcher* rem;
|
||||
|
@ -65,7 +66,7 @@ public:
|
|||
void AddPat(const char* pat);
|
||||
|
||||
void MakeCaseInsensitive();
|
||||
void SetSingleLineMode();
|
||||
void MakeSingleLine();
|
||||
|
||||
void SetPat(const char* pat) { pattern_text = pat; }
|
||||
|
||||
|
@ -91,7 +92,7 @@ public:
|
|||
return nullptr;
|
||||
}
|
||||
CCL* LookupCCL(int index) { return ccl_list[index]; }
|
||||
CCL* AnyCCL();
|
||||
CCL* AnyCCL(bool single_line_mode = false);
|
||||
|
||||
void ConvertCCLs();
|
||||
|
||||
|
@ -147,8 +148,10 @@ protected:
|
|||
EquivClass equiv_class;
|
||||
int* ecs;
|
||||
DFA_Machine* dfa;
|
||||
CCL* any_ccl;
|
||||
AcceptingSet* accepted;
|
||||
|
||||
CCL* any_ccl;
|
||||
CCL* single_line_ccl;
|
||||
};
|
||||
|
||||
class RE_Match_State
|
||||
|
@ -208,6 +211,9 @@ public:
|
|||
void MakeCaseInsensitive();
|
||||
bool IsCaseInsensitive() const { return is_case_insensitive; }
|
||||
|
||||
void MakeSingleLine();
|
||||
bool IsSingleLine() const { return is_single_line; }
|
||||
|
||||
bool Compile(bool lazy = false);
|
||||
|
||||
// Returns true if s exactly matches the pattern, false otherwise.
|
||||
|
@ -243,6 +249,7 @@ protected:
|
|||
detail::Specific_RE_Matcher* re_exact;
|
||||
|
||||
bool is_case_insensitive = false;
|
||||
bool is_single_line = false;
|
||||
};
|
||||
|
||||
} // namespace zeek
|
||||
|
|
|
@ -922,8 +922,7 @@ expr:
|
|||
re->MakeCaseInsensitive();
|
||||
|
||||
if ( $4.single_line )
|
||||
{
|
||||
}
|
||||
re->MakeSingleLine();
|
||||
|
||||
re->Compile();
|
||||
$$ = new ConstExpr(make_intrusive<PatternVal>(re));
|
||||
|
|
|
@ -20,7 +20,7 @@ namespace zeek::detail {
|
|||
void yyerror(const char msg[]);
|
||||
%}
|
||||
|
||||
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE
|
||||
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
|
||||
|
||||
%union {
|
||||
int int_val;
|
||||
|
@ -112,7 +112,8 @@ singleton : singleton '*'
|
|||
|
||||
| '.'
|
||||
{
|
||||
$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(zeek::detail::rem->AnyCCL()));
|
||||
$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(
|
||||
zeek::detail::rem->AnyCCL(zeek::detail::re_single_line)));
|
||||
}
|
||||
|
||||
| full_ccl
|
||||
|
@ -134,6 +135,9 @@ singleton : singleton '*'
|
|||
| TOK_CASE_INSENSITIVE re ')'
|
||||
{ $$ = $2; zeek::detail::case_insensitive = false; }
|
||||
|
||||
| TOK_SINGLE_LINE re ')'
|
||||
{ $$ = $2; zeek::detail::re_single_line = false; }
|
||||
|
||||
| TOK_CHAR
|
||||
{
|
||||
auto sym = $1;
|
||||
|
|
|
@ -116,6 +116,7 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
|
|||
}
|
||||
|
||||
"(?i:" zeek::detail::case_insensitive = true; return TOK_CASE_INSENSITIVE;
|
||||
"(?s:" zeek::detail::re_single_line = true; return TOK_SINGLE_LINE;
|
||||
|
||||
[a-zA-Z] {
|
||||
if ( zeek::detail::case_insensitive )
|
||||
|
|
|
@ -570,7 +570,7 @@ F RET_CONST(zeek::val_mgr->False()->Ref())
|
|||
<RE>(\/[is]{0,2}) {
|
||||
BEGIN(INITIAL);
|
||||
|
||||
if (strlen(yytext) == 2)
|
||||
if ( strlen(yytext) == 2 )
|
||||
{
|
||||
yylval.re_modes.ignore_case = (yytext[1] == 'i');
|
||||
yylval.re_modes.single_line = (yytext[1] == 's');
|
||||
|
|
|
@ -33,3 +33,7 @@ case-sensitive pattern (PASS)
|
|||
(?i:...) pattern construct (PASS)
|
||||
(?i:...) pattern construct (FAIL)
|
||||
(?i:...) pattern construct (PASS)
|
||||
/s missing (PASS)
|
||||
/s pattern modifier (PASS)
|
||||
/s pattern disjunction (PASS)
|
||||
/s pattern concatenation (PASS)
|
||||
|
|
|
@ -65,4 +65,9 @@ event zeek_init()
|
|||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
|
||||
|
||||
test_case( "/s missing", /fOO.*bAR/ != "fOOab\ncdbAR");
|
||||
test_case( "/s pattern modifier", /fOO.*bAR/s == "fOOab\ncdbAR");
|
||||
test_case( "/s pattern disjunction", /b.r/s | /bez/ == "b\nr" );
|
||||
test_case( "/s pattern concatenation", /b.r/s & /bez/ == "b\nrbez" );
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue