From 98d9089a8ffe8c1894d5facdc740211e30d42d12 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Wed, 15 Nov 2023 20:22:10 +0100 Subject: [PATCH] strings.bif/do_split_string: Pass bol and eol to MatchPrefix() This allows better control of BOL and EOL. MatchPrefix() / LongestMatch() always start with BOL. Closes #3455 --- src/strings.bif | 14 +++++-- testing/btest/Baseline/bifs.split_string/out | 6 +++ testing/btest/bifs/split_string.zeek | 43 ++++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/strings.bif b/src/strings.bif index 0c280c58ac..56a0370c92 100644 --- a/src/strings.bif +++ b/src/strings.bif @@ -280,15 +280,23 @@ static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val, int num_sep = 0; int offset = 0; + bool bol = true; + const bool eol = true; + while ( n >= 0 ) { offset = 0; // Find next match offset. int end_of_match = 0; - while ( n > 0 && - (end_of_match = re->MatchPrefix(s + offset, n)) <= 0 ) + while ( n > 0 ) { - // Move on to next byte. + end_of_match = re->MatchPrefix(s + offset, n, bol, eol); + if ( end_of_match > 0 ) + break; + + // Move on to next byte, use BOL only on the first byte such that + // a BOL anchored pattern won't be matched anywhere else. 
+ bol = false; ++offset; --n; } diff --git a/testing/btest/Baseline/bifs.split_string/out b/testing/btest/Baseline/bifs.split_string/out index 0f5377c2e2..4f7ef7e15c 100644 --- a/testing/btest/Baseline/bifs.split_string/out +++ b/testing/btest/Baseline/bifs.split_string/out @@ -31,3 +31,9 @@ A C = D +test, ^est, [test] +test, tes$, [test] +test, ^test$, [, test, ] +aa bb cc, / ?/, [aa, bb, cc] +aa bb cc, / ?/, [aa, , bb, , cc] +aa bb cc, / +/, [aa, bb, cc] diff --git a/testing/btest/bifs/split_string.zeek b/testing/btest/bifs/split_string.zeek index 11f0c5e011..e092f73768 100644 --- a/testing/btest/bifs/split_string.zeek +++ b/testing/btest/bifs/split_string.zeek @@ -34,3 +34,46 @@ event zeek_init() pat = /=/; print_string_vector(split_string_all(a, pat)); } + +event zeek_init() &priority=-5 + { + # Anchor testing. + local r = split_string_n("test", /^est/, T, 1); + assert |r| == 1; + assert r[0] == "test"; + print "test", "^est", r; + + r = split_string_n("test", /tes$/, T, 1); + assert |r| == 1; + assert r[0] == "test"; + print "test", "tes$", r; + + r = split_string_n("test", /^test$/, T, 1); + assert |r| == 3; + assert r[0] == ""; + assert r[1] == "test"; + assert r[2] == ""; + print "test", "^test$", r; + + r = split_string_n("aa bb cc", / ?/, F, 0); + assert |r| == 3; + assert r[0] == "aa"; + assert r[1] == "bb"; + assert r[2] == "cc"; + print "aa bb cc", "/ ?/", r; + + r = split_string_n("aa bb cc", / ?/, T, 0); + assert |r| == 5; + assert r[0] == "aa"; + assert r[1] == " "; + assert r[2] == "bb"; + assert r[3] == " "; + print "aa bb cc", "/ ?/", r; + + r = split_string_n("aa bb cc", / +/, F, 0); + assert |r| == 3; + assert r[0] == "aa"; + assert r[1] == "bb"; + assert r[2] == "cc"; + print "aa bb cc", "/ +/", r; + }