mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Merge remote-tracking branch 'origin/topic/vern/case-insensitive-patterns'
* origin/topic/vern/case-insensitive-patterns:
  - use PCRE syntax instead of the beautiful new (?i ...) syntax
  - nitlet in NEWS entry
  - test suite update for case-insensitive patterns
  - document use of double quotes to escape case-insensitivity
  - bug fix for recent memory leak patch
  - documentation updates for case-insensitive patterns
  - d'oh there's isalpha. I looked earlier for isletter :-P
  - fix for handling [:(lower|upper):] in case-insensitive patterns
  - implemented /re/i for case-insensitive patterns
This commit is contained in:
commit
463e540c9b
14 changed files with 235 additions and 40 deletions
4
CHANGES
4
CHANGES
|
@ -1,4 +1,8 @@
|
|||
|
||||
2.5-740 | 2018-07-16 16:01:31 -0500
|
||||
|
||||
* Add support for case-insensitive patterns (Vern Paxson, Corelight)
|
||||
|
||||
2.5-730 | 2018-07-16 10:39:33 -0500
|
||||
|
||||
* de-restrict pattern-oriented BiFs to no longer require only running at init
|
||||
|
|
15
NEWS
15
NEWS
|
@ -255,6 +255,21 @@ New Functionality
|
|||
semi-present in previous versions of Bro, but required constants as
|
||||
its operands; now you can use any pattern-valued expressions.
|
||||
|
||||
- You can now specify that a pattern matches in a case-insensitive
|
||||
fashion by adding 'i' to the end of its specification. So for example
|
||||
/fOO/i == "Foo" yields T, as does /fOO/i in "xFoObar". Characters
|
||||
enclosed in quotes however keep their casing, so /"fOO"/i in "xFoObar"
|
||||
yields F, though it yields T for "xfOObar".
|
||||
|
||||
You can achieve the same functionality for a subpattern enclosed in
|
||||
parentheses by adding "?i:" right after the open parenthesis. So for example
|
||||
"/foo|(?i:bar)/" will match "BaR", but not "FoO".
|
||||
|
||||
For both ways of specifying case-insensitivity, characters enclosed in
|
||||
double quotes maintain their case-sensitivity. So for example /"foo"/i
|
||||
will not match "Foo", but it will match "foo".
|
||||
|
||||
|
||||
Changed Functionality
|
||||
---------------------
|
||||
|
||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
|||
2.5-730
|
||||
2.5-740
|
||||
|
|
|
@ -198,9 +198,9 @@ Here is a more detailed description of each type:
|
|||
|
||||
.. bro:type:: pattern
|
||||
|
||||
A type representing regular-expression patterns which can be used
|
||||
A type representing regular-expression patterns that can be used
|
||||
for fast text-searching operations. Pattern constants are created
|
||||
by enclosing text within forward slashes (/) and is the same syntax
|
||||
by enclosing text within forward slashes (``/``) and use the same syntax
|
||||
as the patterns supported by the `flex lexical analyzer
|
||||
<http://westes.github.io/flex/manual/Patterns.html>`_. The speed of
|
||||
regular expression matching does not depend on the complexity or
|
||||
|
@ -244,13 +244,25 @@ Here is a more detailed description of each type:
|
|||
|
||||
yields true, like in the similar example above. You can also
|
||||
create the conjunction (concatenation) of patterns using the ``&``
|
||||
operator. For example:
|
||||
operator. For example::
|
||||
|
||||
/foo/ & /bar/ in "foobar"
|
||||
|
||||
will yield true because the pattern /(foo)(bar)/ appears in
|
||||
the string "foobar".
|
||||
|
||||
When specifying a pattern, you can add a final ``i`` specifier to
|
||||
mark it as case-insensitive. For example, ``/foo|bar/i`` will match
|
||||
"foo", "Foo", "BaR", etc.
|
||||
|
||||
You can also introduce a case-insensitive sub-pattern by enclosing it
|
||||
in ``(?i:<pattern>)``. So, for example, ``/foo|(?i:bar)/`` will
|
||||
match "foo" and "BaR", but *not* "Foo".
|
||||
|
||||
For both ways of specifying case-insensitivity, characters enclosed
|
||||
in double quotes maintain their case-sensitivity. So for example
|
||||
``/"foo"/i`` will not match "Foo", but it will match "foo".
|
||||
|
||||
.. bro:type:: port
|
||||
|
||||
A type representing transport-level port numbers (besides TCP and
|
||||
|
|
24
src/RE.cc
24
src/RE.cc
|
@ -102,6 +102,19 @@ void Specific_RE_Matcher::AddPat(const char* new_pat,
|
|||
pattern_text = s;
|
||||
}
|
||||
|
||||
void Specific_RE_Matcher::MakeCaseInsensitive()
|
||||
{
|
||||
const char fmt[] = "(?i:%s)";
|
||||
int n = strlen(pattern_text) + strlen(fmt);
|
||||
|
||||
char* s = new char[n + 5 /* slop */];
|
||||
|
||||
safe_snprintf(s, n + 5, fmt, pattern_text);
|
||||
|
||||
delete [] pattern_text;
|
||||
pattern_text = s;
|
||||
}
|
||||
|
||||
int Specific_RE_Matcher::Compile(int lazy)
|
||||
{
|
||||
if ( ! pattern_text )
|
||||
|
@ -155,9 +168,10 @@ int Specific_RE_Matcher::CompileSet(const string_list& set, const int_list& idx)
|
|||
|
||||
if ( set_nfa != nfa )
|
||||
Unref(set_nfa);
|
||||
Unref(nfa);
|
||||
nfa = 0;
|
||||
else
|
||||
Unref(nfa);
|
||||
|
||||
nfa = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -444,6 +458,12 @@ void RE_Matcher::AddPat(const char* new_pat)
|
|||
re_exact->AddPat(new_pat);
|
||||
}
|
||||
|
||||
void RE_Matcher::MakeCaseInsensitive()
|
||||
{
|
||||
re_anywhere->MakeCaseInsensitive();
|
||||
re_exact->MakeCaseInsensitive();
|
||||
}
|
||||
|
||||
int RE_Matcher::Compile(int lazy)
|
||||
{
|
||||
return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
|
||||
|
|
5
src/RE.h
5
src/RE.h
|
@ -54,6 +54,8 @@ public:
|
|||
|
||||
void AddPat(const char* pat);
|
||||
|
||||
void MakeCaseInsensitive();
|
||||
|
||||
void SetPat(const char* pat) { pattern_text = copy_string(pat); }
|
||||
|
||||
int Compile(int lazy = 0);
|
||||
|
@ -178,6 +180,9 @@ public:
|
|||
|
||||
void AddPat(const char* pat);
|
||||
|
||||
// Makes the matcher as specified to date case-insensitive.
|
||||
void MakeCaseInsensitive();
|
||||
|
||||
int Compile(int lazy = 0);
|
||||
|
||||
// Returns true if s exactly matches the pattern, false otherwise.
|
||||
|
|
|
@ -23,7 +23,6 @@ extern void add_input_file_at_front(const char* file);
|
|||
extern void add_to_name_list(char* s, char delim, name_list& nl);
|
||||
|
||||
extern void begin_RE();
|
||||
extern void end_RE();
|
||||
|
||||
extern void do_atif(Expr* expr);
|
||||
extern void do_atifdef(const char* id);
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
%token TOK_DOUBLE TOK_ELSE TOK_ENUM TOK_EVENT TOK_EXPORT TOK_FALLTHROUGH
|
||||
%token TOK_FILE TOK_FOR TOK_FUNCTION TOK_GLOBAL TOK_HOOK TOK_ID TOK_IF TOK_INT
|
||||
%token TOK_INTERVAL TOK_LIST TOK_LOCAL TOK_MODULE
|
||||
%token TOK_NEXT TOK_OF TOK_OPAQUE TOK_PATTERN TOK_PATTERN_TEXT
|
||||
%token TOK_NEXT TOK_OF TOK_OPAQUE TOK_PATTERN TOK_PATTERN_END TOK_PATTERN_TEXT
|
||||
%token TOK_PORT TOK_PRINT TOK_RECORD TOK_REDEF
|
||||
%token TOK_REMOVE_FROM TOK_RETURN TOK_SCHEDULE TOK_SET
|
||||
%token TOK_STRING TOK_SUBNET TOK_SWITCH TOK_TABLE
|
||||
|
@ -52,7 +52,7 @@
|
|||
%left '$' '[' ']' '(' ')' TOK_HAS_FIELD TOK_HAS_ATTR
|
||||
%nonassoc TOK_AS TOK_IS
|
||||
|
||||
%type <b> opt_no_test opt_no_test_block opt_deprecated
|
||||
%type <b> opt_no_test opt_no_test_block opt_deprecated TOK_PATTERN_END
|
||||
%type <str> TOK_ID TOK_PATTERN_TEXT
|
||||
%type <id> local_id global_id def_global_id event_id global_or_event_id resolve_id begin_func case_type
|
||||
%type <id_l> local_id_list case_type_list
|
||||
|
@ -723,13 +723,16 @@ expr:
|
|||
$$ = new ConstExpr($1);
|
||||
}
|
||||
|
||||
| '/' { begin_RE(); } TOK_PATTERN_TEXT { end_RE(); } '/'
|
||||
| '/' { begin_RE(); } TOK_PATTERN_TEXT TOK_PATTERN_END
|
||||
{
|
||||
set_location(@3);
|
||||
|
||||
RE_Matcher* re = new RE_Matcher($3);
|
||||
delete [] $3;
|
||||
|
||||
if ( $4 )
|
||||
re->MakeCaseInsensitive();
|
||||
|
||||
re->Compile();
|
||||
$$ = new ConstExpr(new PatternVal(re));
|
||||
}
|
||||
|
|
|
@ -11,11 +11,12 @@
|
|||
int csize = 256;
|
||||
int syntax_error = 0;
|
||||
|
||||
int cupper(int sym);
|
||||
int clower(int sym);
|
||||
void yyerror(const char msg[]);
|
||||
%}
|
||||
|
||||
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE
|
||||
%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE
|
||||
|
||||
%union {
|
||||
int int_val;
|
||||
|
@ -126,12 +127,11 @@ singleton : singleton '*'
|
|||
| '(' re ')'
|
||||
{ $$ = $2; }
|
||||
|
||||
| TOK_CASE_INSENSITIVE re ')'
|
||||
{ $$ = $2; case_insensitive = 0; }
|
||||
|
||||
| TOK_CHAR
|
||||
{
|
||||
if ( case_insensitive && $1 >= 'A' && $1 <= 'Z' )
|
||||
$1 = clower($1);
|
||||
$$ = new NFA_Machine(new NFA_State($1, rem->EC()));
|
||||
}
|
||||
{ $$ = new NFA_Machine(new NFA_State($1, rem->EC())); }
|
||||
|
||||
| '^'
|
||||
{
|
||||
|
@ -158,17 +158,29 @@ full_ccl : '[' ccl ']'
|
|||
|
||||
ccl : ccl TOK_CHAR '-' TOK_CHAR
|
||||
{
|
||||
if ( case_insensitive )
|
||||
{
|
||||
if ( $2 >= 'A' && $2 <= 'Z' )
|
||||
$2 = clower($2);
|
||||
if ( $4 >= 'A' && $4 <= 'Z' )
|
||||
$4 = clower($4);
|
||||
}
|
||||
|
||||
if ( $2 > $4 )
|
||||
synerr("negative range in character class");
|
||||
|
||||
else if ( case_insensitive &&
|
||||
(isalpha($2) || isalpha($4)) )
|
||||
{
|
||||
if ( isalpha($2) && isalpha($4) &&
|
||||
isupper($2) == isupper($4) )
|
||||
{ // Compatible range, do both versions
|
||||
int l2 = tolower($2);
|
||||
int l4 = tolower($4);
|
||||
|
||||
for ( int i = l2; i<= l4; ++i )
|
||||
{
|
||||
$1->Add(i);
|
||||
$1->Add(toupper(i));
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
synerr("ambiguous case-insensitive character class");
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
for ( int i = $2; i <= $4; ++i )
|
||||
|
@ -178,10 +190,13 @@ ccl : ccl TOK_CHAR '-' TOK_CHAR
|
|||
|
||||
| ccl TOK_CHAR
|
||||
{
|
||||
if ( case_insensitive && $2 >= 'A' && $2 <= 'Z' )
|
||||
$2 = clower($2);
|
||||
|
||||
$1->Add($2);
|
||||
if ( case_insensitive && isalpha($2) )
|
||||
{
|
||||
$1->Add(clower($2));
|
||||
$1->Add(cupper($2));
|
||||
}
|
||||
else
|
||||
$1->Add($2);
|
||||
}
|
||||
|
||||
| ccl ccl_expr
|
||||
|
@ -200,9 +215,10 @@ ccl_expr: TOK_CCE
|
|||
|
||||
string : string TOK_CHAR
|
||||
{
|
||||
if ( case_insensitive && $2 >= 'A' && $2 <= 'Z' )
|
||||
$2 = clower($2);
|
||||
|
||||
// Even if case-insensitivity is set,
|
||||
// leave this alone; that provides a way
|
||||
// of "escaping" out of insensitivity
|
||||
// if needed.
|
||||
$1->AppendState(new NFA_State($2, rem->EC()));
|
||||
}
|
||||
|
||||
|
@ -211,6 +227,11 @@ string : string TOK_CHAR
|
|||
;
|
||||
%%
|
||||
|
||||
int cupper(int sym)
|
||||
{
|
||||
return (isascii(sym) && islower(sym)) ? toupper(sym) : sym;
|
||||
}
|
||||
|
||||
int clower(int sym)
|
||||
{
|
||||
return (isascii(sym) && isupper(sym)) ? tolower(sym) : sym;
|
||||
|
|
|
@ -114,6 +114,25 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
|
|||
}
|
||||
}
|
||||
|
||||
"(?i:" case_insensitive = 1; return TOK_CASE_INSENSITIVE;
|
||||
|
||||
[a-zA-Z] {
|
||||
if ( case_insensitive )
|
||||
{
|
||||
char c = yytext[0]; // unput trashes yytext!
|
||||
// Push back the character inside a CCL,
|
||||
// so the parser can then expand it.
|
||||
unput(']');
|
||||
unput(c);
|
||||
unput('[');
|
||||
}
|
||||
else
|
||||
{
|
||||
yylval.int_val = yytext[0];
|
||||
return TOK_CHAR;
|
||||
}
|
||||
}
|
||||
|
||||
[|*+?.(){}] return yytext[0];
|
||||
. yylval.int_val = yytext[0]; return TOK_CHAR;
|
||||
\n return 0; // treat as end of pattern
|
||||
|
@ -149,15 +168,22 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
|
|||
"[:cntrl:]" RET_CCE(my_iscntrl)
|
||||
"[:digit:]" RET_CCE(my_isdigit)
|
||||
"[:graph:]" RET_CCE(my_isgraph)
|
||||
"[:lower:]" RET_CCE(my_islower)
|
||||
"[:print:]" RET_CCE(my_isprint)
|
||||
"[:punct:]" RET_CCE(my_ispunct)
|
||||
"[:space:]" RET_CCE(my_isspace)
|
||||
"[:xdigit:]" RET_CCE(my_isxdigit)
|
||||
|
||||
"[:lower:]" {
|
||||
BEGIN(SC_CCL);
|
||||
yylval.cce_val =
|
||||
case_insensitive ? my_isalpha : my_islower;
|
||||
return TOK_CCE;
|
||||
}
|
||||
|
||||
"[:upper:]" {
|
||||
BEGIN(SC_CCL);
|
||||
yylval.cce_val =
|
||||
case_insensitive ? my_isupper : my_islower;
|
||||
case_insensitive ? my_isalpha : my_isupper;
|
||||
return TOK_CCE;
|
||||
}
|
||||
|
||||
|
|
19
src/scan.l
19
src/scan.l
|
@ -554,7 +554,19 @@ F RET_CONST(new Val(false, TYPE_BOOL))
|
|||
return TOK_PATTERN_TEXT;
|
||||
}
|
||||
|
||||
<RE>[/\\\n] return yytext[0];
|
||||
<RE>"/" {
|
||||
BEGIN(INITIAL);
|
||||
yylval.b = false;
|
||||
return TOK_PATTERN_END;
|
||||
}
|
||||
|
||||
<RE>"/i" {
|
||||
BEGIN(INITIAL);
|
||||
yylval.b = true;
|
||||
return TOK_PATTERN_END;
|
||||
}
|
||||
|
||||
<RE>[\\\n] return yytext[0]; // should cause a parse error
|
||||
|
||||
<*>. reporter->Error("unrecognized character - %s", yytext);
|
||||
|
||||
|
@ -698,11 +710,6 @@ void begin_RE()
|
|||
BEGIN(RE);
|
||||
}
|
||||
|
||||
void end_RE()
|
||||
{
|
||||
BEGIN(INITIAL);
|
||||
}
|
||||
|
||||
class LocalNameFinder : public TraversalCallback {
|
||||
public:
|
||||
LocalNameFinder()
|
||||
|
|
|
@ -10,3 +10,25 @@ in operator (PASS)
|
|||
& operator (FAIL)
|
||||
| operator (PASS)
|
||||
| operator (FAIL)
|
||||
/i pattern modifier (PASS)
|
||||
/i pattern modifier (PASS)
|
||||
/i double-quote escape (FAIL)
|
||||
/i double-quote escape (PASS)
|
||||
case-sensitive pattern (FAIL)
|
||||
case-sensitive pattern (FAIL)
|
||||
case-sensitive pattern (PASS)
|
||||
/i pattern disjunction (PASS)
|
||||
/i pattern disjunction (FAIL)
|
||||
/i pattern disjunction (PASS)
|
||||
/i pattern disjunction (PASS)
|
||||
/i pattern concatenation (PASS)
|
||||
/i pattern concatenation (FAIL)
|
||||
/i pattern concatenation (FAIL)
|
||||
/i pattern concatenation (PASS)
|
||||
/i pattern concatenation (PASS)
|
||||
/i pattern concatenation (FAIL)
|
||||
/i pattern character class (FAIL)
|
||||
/i pattern character class (PASS)
|
||||
(?i:...) pattern construct (PASS)
|
||||
(?i:...) pattern construct (FAIL)
|
||||
(?i:...) pattern construct (PASS)
|
||||
|
|
|
@ -35,4 +35,33 @@ event new_connection(c: connection)
|
|||
test_case( "& operator", p2 & p1 in "baroob" );
|
||||
test_case( "| operator", p1 | p2 in "lazybarlazy" );
|
||||
test_case( "| operator", p3 | p4 in "xoob" );
|
||||
|
||||
test_case( "/i pattern modifier", /fOO/i in "xFoObar" );
|
||||
test_case( "/i pattern modifier", /fOO/i == "Foo" );
|
||||
|
||||
test_case( "/i double-quote escape", /"fOO"/i in "xFoObar" );
|
||||
test_case( "/i double-quote escape", /"fOO"/i in "xfOObar" );
|
||||
|
||||
test_case( "case-sensitive pattern", /fOO/ in "xFoObar" );
|
||||
test_case( "case-sensitive pattern", /fOO/ == "Foo" );
|
||||
test_case( "case-sensitive pattern", /fOO/ == "fOO" );
|
||||
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bez" );
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bEz" );
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bar" );
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bAr" );
|
||||
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "barbez" );
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "barbEz" );
|
||||
test_case( "/i pattern concatenation", /BAR/i & /bez/ == "barbEz" );
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "bArbez" );
|
||||
test_case( "/i pattern concatenation", /BAR/i & /bez/ == "bArbez" );
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "bArbEz" );
|
||||
|
||||
test_case( "/i pattern character class", /ba[0a-c99S-Z0]/i & /bEz/ == "bArbEz" );
|
||||
test_case( "/i pattern character class", /ba[0a-c99M-S0]/i & /bEz/ == "bArbEz" );
|
||||
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xBAry" );
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
|
||||
}
|
||||
|
|
|
@ -22,15 +22,47 @@ event bro_init()
|
|||
|
||||
test_case( "equality operator", "foo" == p1 );
|
||||
test_case( "equality operator (order of operands)", p1 == "foo" );
|
||||
|
||||
test_case( "inequality operator", "foobar" != p1 );
|
||||
test_case( "inequality operator (order of operands)", p1 != "foobar" );
|
||||
|
||||
test_case( "in operator", p1 in "foobar" );
|
||||
test_case( "in operator", p2 in "foobar" );
|
||||
test_case( "!in operator", p3 !in "foobar" );
|
||||
|
||||
test_case( "& operator", p1 & p2 in "baroob" );
|
||||
test_case( "& operator", p2 & p1 in "baroob" );
|
||||
|
||||
test_case( "| operator", p1 | p2 in "lazybarlazy" );
|
||||
test_case( "| operator", p3 | p4 in "xoob" );
|
||||
|
||||
}
|
||||
test_case( "/i pattern modifier", /fOO/i in "xFoObar" );
|
||||
test_case( "/i pattern modifier", /fOO/i == "Foo" );
|
||||
|
||||
test_case( "/i double-quote escape", /"fOO"/i in "xFoObar" );
|
||||
test_case( "/i double-quote escape", /"fOO"/i in "xfOObar" );
|
||||
|
||||
test_case( "case-sensitive pattern", /fOO/ in "xFoObar" );
|
||||
test_case( "case-sensitive pattern", /fOO/ == "Foo" );
|
||||
test_case( "case-sensitive pattern", /fOO/ == "fOO" );
|
||||
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bez" );
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bEz" );
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bar" );
|
||||
test_case( "/i pattern disjunction", /bar/i | /bez/ == "bAr" );
|
||||
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "barbez" );
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "barbEz" );
|
||||
test_case( "/i pattern concatenation", /BAR/i & /bez/ == "barbEz" );
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "bArbez" );
|
||||
test_case( "/i pattern concatenation", /BAR/i & /bez/ == "bArbez" );
|
||||
test_case( "/i pattern concatenation", /bar/i & /bez/ == "bArbEz" );
|
||||
|
||||
test_case( "/i pattern character class", /ba[0a-c99S-Z0]/i & /bEz/ == "bArbEz" );
|
||||
test_case( "/i pattern character class", /ba[0a-c99M-S0]/i & /bEz/ == "bArbEz" );
|
||||
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xBAry" );
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ in "xFOoy" );
|
||||
test_case( "(?i:...) pattern construct", /foo|(?i:bar)/ | /foo/i in "xFOoy" );
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue