strings.bif/do_split_string: Pass bol and eol to MatchPrefix()

This allows better control of BOL (beginning-of-line) and EOL (end-of-line)
anchoring. MatchPrefix() / LongestMatch() always start with BOL.

Closes #3455
This commit is contained in:
Arne Welzel 2023-11-15 20:22:10 +01:00
parent a3bd3e4c50
commit 98d9089a8f
3 changed files with 60 additions and 3 deletions

View file

@ -280,15 +280,23 @@ static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val,
int num_sep = 0;
int offset = 0;
bool bol = true;
const bool eol = true;
while ( n >= 0 )
{
offset = 0;
// Find next match offset.
int end_of_match = 0;
while ( n > 0 &&
(end_of_match = re->MatchPrefix(s + offset, n)) <= 0 )
while ( n > 0 )
{
// Move on to next byte.
end_of_match = re->MatchPrefix(s + offset, n, bol, eol);
if ( end_of_match > 0 )
break;
// Move on to next byte. Use BOL only on the first byte so that
// a BOL-anchored pattern won't be matched anywhere else.
bol = false;
++offset;
--n;
}

View file

@ -31,3 +31,9 @@ A
C
=
D
test, ^est, [test]
test, tes$, [test]
test, ^test$, [, test, ]
aa bb cc, / ?/, [aa, bb, cc]
aa bb cc, / ?/, [aa, , bb, , cc]
aa bb cc, / +/, [aa, bb, cc]

View file

@ -34,3 +34,46 @@ event zeek_init()
pat = /=/;
print_string_vector(split_string_all(a, pat));
}
# Regression checks for how split_string_n() treats the ^ and $ anchors and
# patterns with optional or repeated separators. Every case asserts the
# complete expected result and then prints it for the test baseline.
event zeek_init() &priority=-5
	{
	# Anchor testing.

	# "est" occurs in "test" only at offset 1, so a ^-anchored pattern
	# must not match there and the input comes back unsplit.
	local r = split_string_n("test", /^est/, T, 1);
	assert |r| == 1;
	assert r[0] == "test";
	print "test", "^est", r;

	# "tes" is not at the end of the input, so the $-anchored pattern
	# must not match and the input comes back unsplit.
	r = split_string_n("test", /tes$/, T, 1);
	assert |r| == 1;
	assert r[0] == "test";
	print "test", "tes$", r;

	# The whole input matches. With the third argument set to T the
	# matched separator is part of the result, surrounded by the empty
	# strings found before and after it.
	r = split_string_n("test", /^test$/, T, 1);
	assert |r| == 3;
	assert r[0] == "";
	assert r[1] == "test";
	assert r[2] == "";
	print "test", "^test$", r;

	# / ?/ can also match the empty string, yet the expected result is
	# split only at the actual spaces. Third argument F: separators are
	# left out of the result.
	# NOTE(review): a fourth argument of 0 appears to mean "no limit on
	# the number of splits" -- confirm against the BIF documentation.
	r = split_string_n("aa bb cc", / ?/, F, 0);
	assert |r| == 3;
	assert r[0] == "aa";
	assert r[1] == "bb";
	assert r[2] == "cc";
	print "aa bb cc", "/ ?/", r;

	# Same pattern with the third argument T: the separators show up as
	# their own elements, so all five elements are verified.
	r = split_string_n("aa bb cc", / ?/, T, 0);
	assert |r| == 5;
	assert r[0] == "aa";
	assert r[1] == " ";
	assert r[2] == "bb";
	assert r[3] == " ";
	assert r[4] == "cc";
	print "aa bb cc", "/ ?/", r;

	# One-or-more spaces as the separator, separators left out.
	r = split_string_n("aa bb cc", / +/, F, 0);
	assert |r| == 3;
	assert r[0] == "aa";
	assert r[1] == "bb";
	assert r[2] == "cc";
	print "aa bb cc", "/ +/", r;
	}