zeek/src/re-parse.y
2025-03-04 09:33:30 -07:00

262 lines
5.3 KiB
Text

// re-parse.y - parser for regular-expression patterns; builds the NFA for a pattern
%{
#include <cstdlib>
#include "zeek/RE.h"
#include "zeek/CCL.h"
#include "zeek/NFA.h"
#include "zeek/EquivClass.h"
#include "zeek/Reporter.h"
namespace zeek::detail {

// Number of character codes walked when expanding a character-class
// expression (see the ccl_expr rule).
constexpr int csize = 256;

// Raised by synerr() whenever a pattern fails one of the grammar's checks.
bool re_syntax_error = false;

// ASCII-only case-conversion helpers; defined in the epilogue of this file.
int clower(int sym);
int cupper(int sym);

} // namespace zeek::detail

// Error callback invoked by the generated parser; defined in the epilogue.
void yyerror(const char msg[]);
%}
// Terminal symbols. TOK_CASE_INSENSITIVE and TOK_SINGLE_LINE each open a
// group that the grammar closes with ')' (see the singleton rule, which
// clears the corresponding flag at that point).
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE TOK_SINGLE_LINE
// Semantic value carried by tokens and non-terminals.
%union {
// Character code (TOK_CHAR) or repetition count (TOK_NUMBER).
int int_val;
// Character-class predicate; the grammar calls it with a character code.
cce_func cce_val;
// Character class under construction or already built.
zeek::detail::CCL* ccl_val;
// NFA fragment built for a sub-pattern.
zeek::detail::NFA_Machine* mach_val;
}
%type <int_val> TOK_CHAR TOK_NUMBER
%type <cce_val> TOK_CCE
%type <ccl_val> TOK_CCL ccl full_ccl
%type <mach_val> re singleton series string
// Machines the parser discards (e.g. during error recovery) are deleted.
// No destructor is declared for the other value types.
%destructor { delete $$; } <mach_val>
%%
// Start symbol: one complete pattern. On success the finished machine is
// given an accept marker and published through zeek::detail::nfa; on a
// syntax error the parse is abandoned with a failure status.
flexrule	:  re
			{
			$1->AddAccept(1);
			zeek::detail::nfa = $1;
			}

		|  error
			{
			// Leave through the parser's own abort path instead of a bare
			// return, so its internal cleanup still runs. The parser
			// still reports failure to its caller.
			YYABORT;
			}
		;
// Alternation: one or more series separated by '|'. An empty pattern is
// represented by a machine holding a single epsilon state.
re		:  re '|' series
			{ $$ = zeek::detail::make_alternate($1, $3); }

		|  series
			{ $$ = $1; }

		|  /* empty */
			{
			auto* start = new zeek::detail::EpsilonState();
			$$ = new zeek::detail::NFA_Machine(start);
			}
		;
// Concatenation: each further singleton is appended to the machine built
// so far, and that same machine is the result.
series		:  series singleton
			{
			$$ = $1;
			$$->AppendMachine($2);
			}

		|  singleton
			{ $$ = $1; }
		;
// A single pattern element, optionally followed by repetition operators.
// Every alternative assigns its result explicitly rather than relying on
// the generated parser having pre-loaded the result slot.
singleton	:  singleton '*'
			{ $1->MakeClosure(); $$ = $1; }

		|  singleton '+'
			{ $1->MakePositiveClosure(); $$ = $1; }

		|  singleton '?'
			{ $1->MakeOptional(); $$ = $1; }

		// Bounded repetition {n,m}.
		|  singleton '{' TOK_NUMBER ',' TOK_NUMBER '}'
			{
			// Unless replaced below, the operand machine is the result;
			// this also covers the error path.
			$$ = $1;

			if ( $3 > $5 || $3 < 0 )
				zeek::detail::synerr("bad iteration values");
			else if ( $3 == 0 && $5 == 0 ) {
				// {0,0}: swap the operand for an epsilon-only machine.
				$$ = new zeek::detail::NFA_Machine(new zeek::detail::EpsilonState());
				Unref($1);
			}
			else if ( $3 == 0 ) {
				// {0,m}: build 1..m repetitions, then make them optional.
				$1->MakeRepl(1, $5);
				$1->MakeOptional();
			}
			else
				$1->MakeRepl($3, $5);
			}

		// Open-ended repetition {n,}.
		|  singleton '{' TOK_NUMBER ',' '}'
			{
			if ( $3 < 0 )
				zeek::detail::synerr("iteration value must be positive");
			else if ( $3 == 0 )
				$1->MakeClosure();
			else
				$1->MakeRepl($3, NO_UPPER_BOUND);

			$$ = $1;
			}

		// Exact repetition {n}.
		|  singleton '{' TOK_NUMBER '}'
			{
			// Unless replaced below, the operand machine is the result;
			// this also covers the error path.
			$$ = $1;

			if ( $3 < 0 )
				zeek::detail::synerr("iteration value must be positive");
			else if ( $3 == 0 ) {
				// {0}: swap the operand for an epsilon-only machine.
				Unref($1);
				$$ = new zeek::detail::NFA_Machine(new zeek::detail::EpsilonState());
			}
			else
				$1->LinkCopies($3-1);
			}

		|  '.'
			{
			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(
				zeek::detail::rem->AnyCCL(zeek::detail::re_single_line)));
			}

		|  full_ccl
			{
			$1->Sort();
			zeek::detail::rem->EC()->CCL_Use($1);
			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State($1));
			}

		|  TOK_CCL
			{ $$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State($1)); }

		|  '"' string '"'
			{ $$ = $2; }

		|  '(' re ')'
			{ $$ = $2; }

		// The flag is cleared once the group it applies to is closed.
		|  TOK_CASE_INSENSITIVE re ')'
			{ $$ = $2; zeek::detail::case_insensitive = false; }

		|  TOK_SINGLE_LINE re ')'
			{ $$ = $2; zeek::detail::re_single_line = false; }

		|  TOK_CHAR
			{
			auto sym = $1;

			if ( sym < 0 || ( sym >= NUM_SYM && sym != SYM_EPSILON ) ) {
				zeek::reporter->Error("bad symbol %d (compiling pattern /%s/)", sym,
				                      zeek::detail::RE_parse_input);
				// Abort through the parser's cleanup path so that machines
				// still on the parse stack are released by the declared
				// destructor; a bare return would skip that and leak them.
				YYABORT;
			}

			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(sym, zeek::detail::rem->EC()));
			}

		|  '^'
			{
			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(SYM_BOL, zeek::detail::rem->EC()));
			$$->MarkBOL();
			}

		|  '$'
			{
			$$ = new zeek::detail::NFA_Machine(new zeek::detail::NFA_State(SYM_EOL, zeek::detail::rem->EC()));
			$$->MarkEOL();
			}
		;
// A bracketed character class; a leading '^' negates the collected class.
full_ccl	:  '[' ccl ']'
			{ $$ = $2; }

		|  '[' '^' ccl ']'
			{
			$$ = $3;
			$$->Negate();
			}
		;
// Contents of a bracketed character class, accumulated left to right into
// one CCL object. The empty alternative starts from zeek::detail::curr_ccl.
// Every alternative assigns its result explicitly.
ccl		:  ccl TOK_CHAR '-' TOK_CHAR
			{
			$$ = $1;

			if ( $2 > $4 )
				zeek::detail::synerr("negative range in character class");

			else if ( zeek::detail::case_insensitive && (isalpha($2) || isalpha($4)) ) {
				// isupper() only promises zero or non-zero, so both results
				// are reduced to booleans before being compared.
				if ( isalpha($2) && isalpha($4) &&
				     (isupper($2) != 0) == (isupper($4) != 0) ) {
					// Compatible range, do both versions
					int l2 = tolower($2);
					int l4 = tolower($4);

					for ( int i = l2; i <= l4; ++i ) {
						$1->Add(i);
						$1->Add(toupper(i));
					}
				}
				else
					zeek::detail::synerr("ambiguous case-insensitive character class");
			}

			else {
				for ( int i = $2; i <= $4; ++i )
					$1->Add(i);
			}
			}

		|  ccl TOK_CHAR
			{
			$$ = $1;

			if ( zeek::detail::case_insensitive && isalpha($2) ) {
				$1->Add(zeek::detail::clower($2));
				$1->Add(zeek::detail::cupper($2));
			}
			else
				$1->Add($2);
			}

		|  ccl ccl_expr
			{ $$ = $1; }

		|  /* empty */
			{ $$ = zeek::detail::curr_ccl; }
		;
// A character-class expression: every ASCII code accepted by the token's
// predicate is added to zeek::detail::curr_ccl.
ccl_expr:	   TOK_CCE
			{
			for ( int ch = 0; ch < zeek::detail::csize; ++ch ) {
				if ( ! isascii(ch) )
					continue;

				if ( $1(ch) )
					zeek::detail::curr_ccl->Add(ch);
			}
			}
		;
// Body of a double-quoted string: characters are appended one state at a
// time, starting from a machine that holds a single epsilon state.
string		:  string TOK_CHAR
			{
			// Characters are taken literally here even when
			// case-insensitivity is set, which gives patterns a way to
			// "escape" out of insensitivity if needed.
			auto* literal = new zeek::detail::NFA_State($2, zeek::detail::rem->EC());
			$$ = $1;
			$$->AppendState(literal);
			}

		|  /* empty */
			{
			auto* start = new zeek::detail::EpsilonState();
			$$ = new zeek::detail::NFA_Machine(start);
			}
		;
%%
namespace zeek::detail {
// Returns the upper-case form of sym when sym is an ASCII lower-case
// letter; any other value is returned unchanged.
int cupper(int sym)
	{
	if ( isascii(sym) && islower(sym) )
		return toupper(sym);

	return sym;
	}
// Returns the lower-case form of sym when sym is an ASCII upper-case
// letter; any other value is returned unchanged.
int clower(int sym)
	{
	if ( isascii(sym) && isupper(sym) )
		return tolower(sym);

	return sym;
	}
void synerr(const char str[])
{
zeek::detail::re_syntax_error = true;
zeek::reporter->Error("%s (compiling pattern /%s/)", str, RE_parse_input);
}
} // namespace zeek::detail
// Error callback required by the generated parser. It deliberately emits
// nothing: the grammar's own error alternative makes the parse fail, and
// pattern-specific diagnostics are produced by synerr().
void yyerror(const char msg[])
	{
	static_cast<void>(msg);
	}