From 312547ab0aec5564fb4d0d428ab00ac1391dba1d Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Thu, 8 Apr 2021 20:15:54 -0700 Subject: [PATCH] GH-1497: Support CRLF line-endings in Zeek scripts and signature files --- .gitattributes | 1 + auxil/btest | 2 +- src/rule-scan.l | 6 ++-- src/scan.l | 18 +++++----- .../btest/Baseline/language.crlf-parsing/out | 5 +++ testing/btest/language/crlf-parsing.zeek | 35 +++++++++++++++++++ 6 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 testing/btest/Baseline/language.crlf-parsing/out create mode 100644 testing/btest/language/crlf-parsing.zeek diff --git a/.gitattributes b/.gitattributes index 35b35112d1..1300f8abd0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,3 +3,4 @@ *.bif linguist-language=C++ *.l linguist-language=Lex testing/btest/Baseline/** linguist-detectable=false +testing/btest/language/crlf-parsing.zeek text eol=crlf diff --git a/auxil/btest b/auxil/btest index 487d1d03ba..327a7e2e8e 160000 --- a/auxil/btest +++ b/auxil/btest @@ -1 +1 @@ -Subproject commit 487d1d03bac4b51049bc109c862ca547257533cf +Subproject commit 327a7e2e8e838858bdbcf85acb790f61d639639b diff --git a/src/rule-scan.l b/src/rule-scan.l index 5874aac1a8..2b424d2ee5 100644 --- a/src/rule-scan.l +++ b/src/rule-scan.l @@ -24,12 +24,12 @@ WS [ \t]+ D [0-9]+ H [0-9a-fA-F]+ HEX {H} -STRING \"([^\n\"]|\\\")*\" +STRING \"([^\r\n\"]|\\\")*\" IDCOMPONENT [a-zA-Z_][0-9a-zA-Z_-]* ID {IDCOMPONENT}(::{IDCOMPONENT})* IP6 ("["({HEX}:){7}{HEX}"]")|("["0x{HEX}({HEX}|:)*"::"({HEX}|:)*"]")|("["({HEX}|:)*"::"({HEX}|:)*"]")|("["({HEX}|:)*"::"({HEX}|:)*({D}"."){3}{D}"]") RE \/(\\\/)?([^/]|[^\\]\\\/)*\/i? -META \.[^ \t]+{WS}[^\n]+ +META \.[^ \t]+{WS}[^\r\n]+ PIDCOMPONENT [A-Za-z_][A-Za-z_0-9]* PID {PIDCOMPONENT}(::{PIDCOMPONENT})* @@ -41,7 +41,7 @@ PID {PIDCOMPONENT}(::{PIDCOMPONENT})* #.* /* eat comments */ {WS} /* eat white space */ {META} /* eat any meta-data/comments */ - \n ++rules_line_number; + \r?\n ++rules_line_number; } {IP6} { diff --git a/src/scan.l b/src/scan.l index 4a7c9f363a..03c2f46a64 100644 --- a/src/scan.l +++ b/src/scan.l @@ -139,12 +139,12 @@ HEX [0-9a-fA-F]+ IDCOMPONENT [A-Za-z_][A-Za-z_0-9]* ID {IDCOMPONENT}(::{IDCOMPONENT})* IP6 ("["({HEX}:){7}{HEX}"]")|("["0x{HEX}({HEX}|:)*"::"({HEX}|:)*"]")|("["({HEX}|:)*"::"({HEX}|:)*"]")|("["({HEX}:){6}({D}"."){3}{D}"]")|("["({HEX}|:)*"::"({HEX}|:)*({D}"."){3}{D}"]") -FILE [^ \t\n]+ -PREFIX [^ \t\n]+ +FILE [^ \t\r\n]+ +PREFIX [^ \t\r\n]+ FLOAT (({D}*"."?{D})|({D}"."?{D}*))([eE][-+]?{D})? H [A-Za-z0-9][A-Za-z0-9\-]* HTLD [A-Za-z][A-Za-z0-9\-]* -ESCSEQ (\\([^\n]|[0-7]+|x[[:xdigit:]]+)) +ESCSEQ (\\([^\r\n]|[0-7]+|x[[:xdigit:]]+)) %% @@ -175,7 +175,7 @@ ESCSEQ (\\([^\n]|[0-7]+|x[[:xdigit:]]+)) {WS} /* eat whitespace */ -\n { +\r?\n { ++line_number; ++yylloc.first_line; ++yylloc.last_line; @@ -448,7 +448,7 @@ when return TOK_WHEN; @ifndef ++current_depth; @else return TOK_ATELSE; @endif return TOK_ATENDIF; -[^@\n]+ /* eat */ +[^@\r\n]+ /* eat */ . /* eat */ T RET_CONST(zeek::val_mgr->True()->Ref()) @@ -513,7 +513,7 @@ F RET_CONST(zeek::val_mgr->False()->Ref()) ({H}".")+{HTLD} RET_CONST(zeek::detail::dns_mgr->LookupHost(yytext).release()) -\"([^\\\n\"]|{ESCSEQ})*\" { +\"([^\\\r\\\n\"]|{ESCSEQ})*\" { const char* text = yytext; int len = strlen(text) + 1; int i = 0; @@ -546,7 +546,7 @@ F RET_CONST(zeek::val_mgr->False()->Ref()) RET_CONST(new zeek::StringVal(new zeek::String(1, (zeek::byte_vec) s, i-1))) } -([^/\\\n]|{ESCSEQ})+ { +([^/\\\r\\\n]|{ESCSEQ})+ { yylval.str = zeek::util::copy_string(yytext); return TOK_PATTERN_TEXT; } @@ -563,7 +563,9 @@ F RET_CONST(zeek::val_mgr->False()->Ref()) return TOK_PATTERN_END; } -[\\\n] return yytext[0]; // should cause a parse error +\r?\n { + zeek::reporter->Error("patterns must not span multiple lines"); +} <*>. zeek::reporter->Error("unrecognized character: '%s'", zeek::util::get_escaped_string(yytext, false).data()); diff --git a/testing/btest/Baseline/language.crlf-parsing/out b/testing/btest/Baseline/language.crlf-parsing/out new file mode 100644 index 0000000000..d514b55a05 --- /dev/null +++ b/testing/btest/Baseline/language.crlf-parsing/out @@ -0,0 +1,5 @@ +### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63. +first hello +hello T +last hello +zeek_init diff --git a/testing/btest/language/crlf-parsing.zeek b/testing/btest/language/crlf-parsing.zeek new file mode 100644 index 0000000000..213809278a --- /dev/null +++ b/testing/btest/language/crlf-parsing.zeek @@ -0,0 +1,35 @@ +# @TEST-EXEC: zeek -b %INPUT >out +# @TEST-EXEC: btest-diff out +# @TEST-DOC: Checks that CRLF line endings work in zeek/signature files +# Note the test file itself uses CRLFs and .gitattributes has an entry +# to ensure preservation of the CRLFs. + +@TEST-START-FILE test.sig +signature blah + { + ip-proto == tcp + src-port == 21 + payload /.*/ + event "matched" + } +@TEST-END-FILE + +@TEST-START-FILE test.zeek +event zeek_init() + { + print "zeek_init"; + } +@TEST-END-FILE + +@load test.zeek +@load-sigs test.sig + +print "first hello"; + +@if ( T ) + print "hello T"; +@else + print "hello F"; +@endif + +print "last hello";