diff --git a/CHANGES b/CHANGES index ff53b6f259..171ba6b95f 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,12 @@ +6.2.0-dev.141 | 2023-11-17 13:01:19 +0100 + + * GH-3455: strings.bif/do_split_string: Pass bol and eol to MatchPrefix() (Arne Welzel, Corelight) + + This allows better control of BOL and EOL. MatchPrefix() / LongestMatch() + always start with BOL. + + * RE_Matcher: Add MatchPrefix with bol/eol control (Arne Welzel, Corelight) + 6.2.0-dev.137 | 2023-11-11 17:45:51 +0100 * Bind scan_path to the scope; avoid heap allocation (Dominik Charousset, Corelight) diff --git a/NEWS b/NEWS index 048f355060..f438bb1fe8 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,10 @@ New Functionality Changed Functionality --------------------- +- The ``split_string`` family of functions now respects the beginning-of-line ^ and + end-of-line $ anchors. Previously, an anchored pattern would be matched anywhere + in the input string. + Removed Functionality --------------------- diff --git a/VERSION b/VERSION index bae830d77f..29a1e753f5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.2.0-dev.137 +6.2.0-dev.141 diff --git a/src/RE.cc b/src/RE.cc index b0038a333a..db5dad6d07 100644 --- a/src/RE.cc +++ b/src/RE.cc @@ -318,7 +318,7 @@ bool RE_Match_State::Match(const u_char* bv, int n, bool bol, bool eol, bool cle return accepted_matches.size() != old_matches; } -int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) { +int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n, bool bol, bool eol) { if ( ! dfa ) // An empty pattern matches anything. return 0; @@ -327,11 +327,13 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) { int last_accept = -1; DFA_State* d = dfa->StartState(); - d = d->Xtion(ecs[SYM_BOL], dfa); - if ( ! d ) - return -1; + if ( bol ) { + d = d->Xtion(ecs[SYM_BOL], dfa); + if ( ! 
d ) + return -1; + } + - if ( d->Accept() ) + if ( d->Accept() ) // initial state or bol match (e.g., / */ or /^ ?/) last_accept = 0; for ( int i = 0; i < n; ++i ) { @@ -345,7 +347,7 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) { last_accept = i + 1; } - if ( d ) { + if ( d && eol ) { d = d->Xtion(ecs[SYM_EOL], dfa); if ( d && d->Accept() ) return n; diff --git a/src/RE.h b/src/RE.h index ff6c084c92..f68f3482bb 100644 --- a/src/RE.h +++ b/src/RE.h @@ -106,7 +106,7 @@ public: int LongestMatch(const char* s); int LongestMatch(const String* s); - int LongestMatch(const u_char* bv, int n); + int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true); EquivClass* EC() { return &equiv_class; } @@ -220,6 +220,11 @@ public: int MatchPrefix(const String* s) { return re_exact->LongestMatch(s); } int MatchPrefix(const u_char* s, int n) { return re_exact->LongestMatch(s, n); } + // MatchPrefix() version allowing control of bol and eol. + // This can be useful when searching for a pattern with an + // anchor within a larger string. + int MatchPrefix(const u_char* s, int n, bool bol, bool eol) { return re_exact->LongestMatch(s, n, bol, eol); } + bool Match(const u_char* s, int n) { return re_anywhere->Match(s, n); } const char* PatternText() const { return re_exact->PatternText(); } diff --git a/src/strings.bif b/src/strings.bif index 0c280c58ac..56a0370c92 100644 --- a/src/strings.bif +++ b/src/strings.bif @@ -280,15 +280,23 @@ static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val, int num_sep = 0; int offset = 0; + bool bol = true; + const bool eol = true; + while ( n >= 0 ) { offset = 0; // Find next match offset. int end_of_match = 0; - while ( n > 0 && - (end_of_match = re->MatchPrefix(s + offset, n)) <= 0 ) + while ( n > 0 ) { - // Move on to next byte. 
+ end_of_match = re->MatchPrefix(s + offset, n, bol, eol); + if ( end_of_match > 0 ) + break; + + // Move on to next byte, use BOL only on the first byte such that + // a BOL anchored pattern won't be matched anywhere else. + bol = false; ++offset; --n; } diff --git a/testing/btest/Baseline/bifs.split_string/out b/testing/btest/Baseline/bifs.split_string/out index 0f5377c2e2..4f7ef7e15c 100644 --- a/testing/btest/Baseline/bifs.split_string/out +++ b/testing/btest/Baseline/bifs.split_string/out @@ -31,3 +31,9 @@ A C = D +test, ^est, [test] +test, tes$, [test] +test, ^test$, [, test, ] +aa bb cc, / ?/, [aa, bb, cc] +aa bb cc, / ?/, [aa, , bb, , cc] +aa bb cc, / +/, [aa, bb, cc] diff --git a/testing/btest/bifs/split_string.zeek b/testing/btest/bifs/split_string.zeek index 11f0c5e011..e092f73768 100644 --- a/testing/btest/bifs/split_string.zeek +++ b/testing/btest/bifs/split_string.zeek @@ -34,3 +34,46 @@ event zeek_init() pat = /=/; print_string_vector(split_string_all(a, pat)); } + +event zeek_init() &priority=-5 + { + # Anchor testing. + local r = split_string_n("test", /^est/, T, 1); + assert |r| == 1; + assert r[0] == "test"; + print "test", "^est", r; + + r = split_string_n("test", /tes$/, T, 1); + assert |r| == 1; + assert r[0] == "test"; + print "test", "tes$", r; + + r = split_string_n("test", /^test$/, T, 1); + assert |r| == 3; + assert r[0] == ""; + assert r[1] == "test"; + assert r[2] == ""; + print "test", "^test$", r; + + r = split_string_n("aa bb cc", / ?/, F, 0); + assert |r| == 3; + assert r[0] == "aa"; + assert r[1] == "bb"; + assert r[2] == "cc"; + print "aa bb cc", "/ ?/", r; + + r = split_string_n("aa bb cc", / ?/, T, 0); + assert |r| == 5; + assert r[0] == "aa"; + assert r[1] == " "; + assert r[2] == "bb"; + assert r[3] == " "; + print "aa bb cc", "/ ?/", r; + + r = split_string_n("aa bb cc", / +/, F, 0); + assert |r| == 3; + assert r[0] == "aa"; + assert r[1] == "bb"; + assert r[2] == "cc"; + print "aa bb cc", "/ +/", r; + }