diff --git a/NEWS b/NEWS
index f594207f58..d075d4637b 100644
--- a/NEWS
+++ b/NEWS
@@ -255,6 +255,12 @@ New Functionality
   semi-present in previous versions of Bro, but required constants as
   as its operands; now you can use any pattern-valued expressions.
 
+- You can now specify that a pattern should be matched in a case-insensitive
+  fashion by adding 'i' to the end of its specification. So for example
+  /fOO/i == "Foo" yields T, as does /fOO/i in "xFoObar". Characters
+  enclosed in quotes however keep their casing, so /"fOO"/i in "xFoObar"
+  yields F, though it yields T for "xfOObar".
+
 Changed Functionality
 ---------------------
 
diff --git a/src/RE.cc b/src/RE.cc
index 4d26ce2423..4e29fa8e92 100644
--- a/src/RE.cc
+++ b/src/RE.cc
@@ -102,6 +102,26 @@ void Specific_RE_Matcher::AddPat(const char* new_pat,
 	pattern_text = s;
 	}
 
+// Wraps the current pattern text in a "(+i ...)" group, which the
+// pattern scanner recognizes as a request for case-insensitive
+// matching.  Does nothing if no pattern text has been set.
+void Specific_RE_Matcher::MakeCaseInsensitive()
+	{
+	if ( ! pattern_text )
+		// Nothing to wrap, and strlen() must not see a null pointer.
+		return;
+
+	const char fmt[] = "(+i %s)";
+	int n = strlen(pattern_text) + strlen(fmt);
+
+	char* s = new char[n + 5 /* slop */];
+
+	safe_snprintf(s, n + 5, fmt, pattern_text);
+
+	delete [] pattern_text;
+	pattern_text = s;
+	}
+
 int Specific_RE_Matcher::Compile(int lazy)
 	{
 	if ( ! pattern_text )
@@ -444,6 +464,13 @@ void RE_Matcher::AddPat(const char* new_pat)
 	re_exact->AddPat(new_pat);
 	}
 
+// Applies case-insensitivity to both underlying matchers.
+void RE_Matcher::MakeCaseInsensitive()
+	{
+	re_anywhere->MakeCaseInsensitive();
+	re_exact->MakeCaseInsensitive();
+	}
+
 int RE_Matcher::Compile(int lazy)
 	{
 	return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
diff --git a/src/RE.h b/src/RE.h
index 056c0d2183..06b0699864 100644
--- a/src/RE.h
+++ b/src/RE.h
@@ -54,6 +54,9 @@ public:
 
 	void AddPat(const char* pat);
 
+	// Makes the pattern text set so far match case-insensitively.
+	void MakeCaseInsensitive();
+
 	void SetPat(const char* pat)	{ pattern_text = copy_string(pat); }
 
 	int Compile(int lazy = 0);
@@ -178,6 +181,9 @@ public:
 
 	void AddPat(const char* pat);
 
+	// Makes the matcher as specified to date case-insensitive.
+	void MakeCaseInsensitive();
+
 	int Compile(int lazy = 0);
 
 	// Returns true if s exactly matches the pattern, false otherwise.
diff --git a/src/input.h b/src/input.h
index 3d0caa459a..230a10073a 100644
--- a/src/input.h
+++ b/src/input.h
@@ -23,7 +23,6 @@ extern void add_input_file_at_front(const char* file);
 extern void add_to_name_list(char* s, char delim, name_list& nl);
 
 extern void begin_RE();
-extern void end_RE();
 
 extern void do_atif(Expr* expr);
 extern void do_atifdef(const char* id);
diff --git a/src/parse.y b/src/parse.y
index 34d6f31373..25b6c17873 100644
--- a/src/parse.y
+++ b/src/parse.y
@@ -14,7 +14,7 @@
 %token TOK_DOUBLE TOK_ELSE TOK_ENUM TOK_EVENT TOK_EXPORT TOK_FALLTHROUGH
 %token TOK_FILE TOK_FOR TOK_FUNCTION TOK_GLOBAL TOK_HOOK TOK_ID TOK_IF TOK_INT
 %token TOK_INTERVAL TOK_LIST TOK_LOCAL TOK_MODULE
-%token TOK_NEXT TOK_OF TOK_OPAQUE TOK_PATTERN TOK_PATTERN_TEXT
+%token TOK_NEXT TOK_OF TOK_OPAQUE TOK_PATTERN TOK_PATTERN_END TOK_PATTERN_TEXT
 %token TOK_PORT TOK_PRINT TOK_RECORD TOK_REDEF
 %token TOK_REMOVE_FROM TOK_RETURN TOK_SCHEDULE TOK_SET
 %token TOK_STRING TOK_SUBNET TOK_SWITCH TOK_TABLE
@@ -52,7 +52,7 @@
 %left '$' '[' ']' '(' ')' TOK_HAS_FIELD TOK_HAS_ATTR
 %nonassoc TOK_AS TOK_IS
 
-%type <b> opt_no_test opt_no_test_block opt_deprecated
+%type <b> opt_no_test opt_no_test_block opt_deprecated TOK_PATTERN_END
 %type TOK_ID TOK_PATTERN_TEXT
 %type local_id global_id def_global_id event_id global_or_event_id resolve_id begin_func case_type
 %type local_id_list case_type_list
@@ -723,13 +723,18 @@ expr:
 			$$ = new ConstExpr($1);
 			}
 
-	|	'/' { begin_RE(); } TOK_PATTERN_TEXT { end_RE(); } '/'
+	|	'/' { begin_RE(); } TOK_PATTERN_TEXT TOK_PATTERN_END
 			{
 			set_location(@3);
 
 			RE_Matcher* re = new RE_Matcher($3);
 			delete [] $3;
 
+			// TOK_PATTERN_END carries true when the closing
+			// delimiter was "/i".
+			if ( $4 )
+				re->MakeCaseInsensitive();
+
 			re->Compile();
 			$$ = new ConstExpr(new PatternVal(re));
 			}
diff --git a/src/re-parse.y b/src/re-parse.y
index 3847c06f29..6834836f28 100644
--- a/src/re-parse.y
+++ b/src/re-parse.y
@@ -11,11 +11,13 @@ int csize = 256;
 int syntax_error = 0;
 
+int is_letter(int sym);
+int cupper(int sym);
 int clower(int sym);
 
 void yyerror(const char msg[]);
 %}
 
-%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE
+%token TOK_CHAR TOK_NUMBER TOK_CCL TOK_CCE TOK_CASE_INSENSITIVE
 
 %union {
 	int int_val;
@@ -126,12 +128,11 @@ singleton : singleton '*'
 		|  '(' re ')'
 			{ $$ = $2; }
 
+		|  TOK_CASE_INSENSITIVE re ')'
+			{ $$ = $2; case_insensitive = 0; }
+
 		|  TOK_CHAR
-			{
-			if ( case_insensitive && $1 >= 'A' && $1 <= 'Z' )
-				$1 = clower($1);
-			$$ = new NFA_Machine(new NFA_State($1, rem->EC()));
-			}
+			{ $$ = new NFA_Machine(new NFA_State($1, rem->EC())); }
 
 		|  '^'
 			{
@@ -158,17 +159,29 @@ full_ccl : '[' ccl ']'
 
 ccl		:  ccl TOK_CHAR '-' TOK_CHAR
 			{
-			if ( case_insensitive )
-				{
-				if ( $2 >= 'A' && $2 <= 'Z' )
-					$2 = clower($2);
-				if ( $4 >= 'A' && $4 <= 'Z' )
-					$4 = clower($4);
-				}
-
 			if ( $2 > $4 )
 				synerr("negative range in character class");
 
+			else if ( case_insensitive &&
+				  (is_letter($2) || is_letter($4)) )
+				{
+				if ( is_letter($2) && is_letter($4) &&
+				     (isupper($2) != 0) == (isupper($4) != 0) )
+					{ // Compatible range, do both versions
+					int l2 = clower($2);
+					int l4 = clower($4);
+
+					for ( int i = l2; i <= l4; ++i )
+						{
+						$1->Add(i);
+						$1->Add(cupper(i));
+						}
+					}
+
+				else
+					synerr("ambiguous case-insensitive character class");
+				}
+
 			else
 				{
 				for ( int i = $2; i <= $4; ++i )
@@ -178,10 +191,13 @@ ccl : ccl TOK_CHAR '-' TOK_CHAR
 
 		|  ccl TOK_CHAR
 			{
-			if ( case_insensitive && $2 >= 'A' && $2 <= 'Z' )
-				$2 = clower($2);
-
-			$1->Add($2);
+			if ( case_insensitive && is_letter($2) )
+				{
+				$1->Add(clower($2));
+				$1->Add(cupper($2));
+				}
+			else
+				$1->Add($2);
 			}
 
 		|  ccl ccl_expr
@@ -200,9 +216,10 @@ ccl_expr: TOK_CCE
 
 string		:  string TOK_CHAR
 			{
-			if ( case_insensitive && $2 >= 'A' && $2 <= 'Z' )
-				$2 = clower($2);
-
+			// Even if case-insensitivity is set,
+			// leave this alone; that provides a way
+			// of "escaping" out of insensitivity
+			// if needed.
 			$1->AppendState(new NFA_State($2, rem->EC()));
 			}
 
@@ -211,6 +228,19 @@ string : string TOK_CHAR
 		;
 %%
 
+// Returns nonzero if sym is an ASCII letter.
+int is_letter(int sym)
+	{
+	return isascii(sym) && (islower(sym) || isupper(sym));
+	}
+
+// Returns the upper-case form of an ASCII lower-case letter; any
+// other value is returned unchanged.
+int cupper(int sym)
+	{
+	return (isascii(sym) && islower(sym)) ? toupper(sym) : sym;
+	}
+
 int clower(int sym)
 	{
 	return (isascii(sym) && isupper(sym)) ? tolower(sym) : sym;
diff --git a/src/re-scan.l b/src/re-scan.l
index 8bd00c8bba..70bafd5649 100644
--- a/src/re-scan.l
+++ b/src/re-scan.l
@@ -114,6 +114,25 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
 			}
 		}
 
+	"(+i"[ \t]*	case_insensitive = 1; return TOK_CASE_INSENSITIVE;
+
+	[a-zA-Z]	{
+			if ( case_insensitive )
+				{
+				char c = yytext[0];	// unput trashes yytext!
+				// Push back the character inside a CCL,
+				// so the parser can then expand it.
+				unput(']');
+				unput(c);
+				unput('[');
+				}
+			else
+				{
+				yylval.int_val = yytext[0];
+				return TOK_CHAR;
+				}
+			}
+
 	[|*+?.(){}]	return yytext[0];
 	.		yylval.int_val = yytext[0]; return TOK_CHAR;
 	\n		return 0;	// treat as end of pattern
@@ -157,7 +176,7 @@ CCL_EXPR ("[:"[[:alpha:]]+":]")
 	"[:upper:]"	{
 			BEGIN(SC_CCL);
 			yylval.cce_val =
-				case_insensitive ? my_isupper : my_islower;
+				case_insensitive ? my_islower : my_isupper;
 			return TOK_CCE;
 			}
 
diff --git a/src/scan.l b/src/scan.l
index 3bbf6ec999..24e0547bfc 100644
--- a/src/scan.l
+++ b/src/scan.l
@@ -554,7 +554,21 @@ F RET_CONST(new Val(false, TYPE_BOOL))
 	return TOK_PATTERN_TEXT;
 	}
 
-<RE>[/\\\n]	return yytext[0];
+<RE>"/"	{
+	// Plain closing delimiter: the pattern stays case-sensitive.
+	BEGIN(INITIAL);
+	yylval.b = false;
+	return TOK_PATTERN_END;
+	}
+
+<RE>"/i"	{
+	// Closing delimiter with the 'i' flag: case-insensitive pattern.
+	BEGIN(INITIAL);
+	yylval.b = true;
+	return TOK_PATTERN_END;
+	}
+
+<RE>[\\\n]	return yytext[0];	// should cause a parse error
 
 <*>.	reporter->Error("unrecognized character - %s", yytext);
 