Merge remote-tracking branch 'origin/topic/awelzel/3455-do-split-string-2'

* origin/topic/awelzel/3455-do-split-string-2:
  strings.bif/do_split_string: Pass bol and eol to MatchPrefix()
  RE_Matcher: Add MatchPrefix with bol/eol control
This commit is contained in:
Arne Welzel 2023-11-17 13:01:19 +01:00
commit d9b8154c4e
8 changed files with 88 additions and 11 deletions

View file

@ -1,3 +1,12 @@
6.2.0-dev.141 | 2023-11-17 13:01:19 +0100
* GH-3455: strings.bif/do_split_string: Pass bol and eol to MatchPrefix() (Arne Welzel, Corelight)
This allows better control of BOL and EOL. MatchPrefix() / LongestMatch()
always start with BOL.
* RE_Matcher: Add MatchPrefix with bol/eol control (Arne Welzel, Corelight)
6.2.0-dev.137 | 2023-11-11 17:45:51 +0100
* Bind scan_path to the scope; avoid heap allocation (Dominik Charousset, Corelight)

4
NEWS
View file

@ -19,6 +19,10 @@ New Functionality
Changed Functionality
---------------------
- The ``split_string`` family of functions now respects the beginning-of-line
``^`` and end-of-line ``$`` anchors. Previously, an anchored pattern would be
matched anywhere in the input string.
Removed Functionality
---------------------

View file

@ -1 +1 @@
6.2.0-dev.137
6.2.0-dev.141

View file

@ -318,7 +318,7 @@ bool RE_Match_State::Match(const u_char* bv, int n, bool bol, bool eol, bool cle
return accepted_matches.size() != old_matches;
}
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n, bool bol, bool eol) {
if ( ! dfa )
// An empty pattern matches anything.
return 0;
@ -327,11 +327,13 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
int last_accept = -1;
DFA_State* d = dfa->StartState();
if ( bol ) {
d = d->Xtion(ecs[SYM_BOL], dfa);
if ( ! d )
return -1;
}
if ( d->Accept() )
if ( d->Accept() ) // initial state or bol match (e.g, / */ or /^ ?/)
last_accept = 0;
for ( int i = 0; i < n; ++i ) {
@ -345,7 +347,7 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
last_accept = i + 1;
}
if ( d ) {
if ( d && eol ) {
d = d->Xtion(ecs[SYM_EOL], dfa);
if ( d && d->Accept() )
return n;

View file

@ -106,7 +106,7 @@ public:
int LongestMatch(const char* s);
int LongestMatch(const String* s);
int LongestMatch(const u_char* bv, int n);
int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true);
EquivClass* EC() { return &equiv_class; }
@ -220,6 +220,11 @@ public:
// Forwards to re_exact->LongestMatch() for the given String and returns
// its result unchanged.
int MatchPrefix(const String* s) { return re_exact->LongestMatch(s); }
// Same as above for a raw buffer of n bytes. Uses LongestMatch()'s default
// bol/eol arguments.
int MatchPrefix(const u_char* s, int n) { return re_exact->LongestMatch(s, n); }
// MatchPrefix() version allowing control of bol and eol.
// This can be useful when searching for a pattern with an
// anchor within a larger string.
//
// bol: passed through to LongestMatch(); when true, the matcher takes the
//      beginning-of-line transition before consuming any input byte.
// eol: passed through to LongestMatch(); when true, the matcher takes the
//      end-of-line transition after the last of the n bytes.
int MatchPrefix(const u_char* s, int n, bool bol, bool eol) { return re_exact->LongestMatch(s, n, bol, eol); }
// Forwards to re_anywhere->Match() for the n bytes at s.
bool Match(const u_char* s, int n) { return re_anywhere->Match(s, n); }
// Returns the pattern text as reported by re_exact.
const char* PatternText() const { return re_exact->PatternText(); }

View file

@ -280,15 +280,23 @@ static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val,
int num_sep = 0;
int offset = 0;
bool bol = true;
const bool eol = true;
while ( n >= 0 )
{
offset = 0;
// Find next match offset.
int end_of_match = 0;
while ( n > 0 &&
(end_of_match = re->MatchPrefix(s + offset, n)) <= 0 )
while ( n > 0 )
{
// Move on to next byte.
end_of_match = re->MatchPrefix(s + offset, n, bol, eol);
if ( end_of_match > 0 )
break;
// Move on to the next byte. Use BOL only on the first byte so that
// a BOL-anchored pattern won't be matched anywhere else.
bol = false;
++offset;
--n;
}

View file

@ -31,3 +31,9 @@ A
C
=
D
test, ^est, [test]
test, tes$, [test]
test, ^test$, [, test, ]
aa bb cc, / ?/, [aa, bb, cc]
aa bb cc, / ?/, [aa, , bb, , cc]
aa bb cc, / +/, [aa, bb, cc]

View file

@ -34,3 +34,46 @@ event zeek_init()
pat = /=/;
print_string_vector(split_string_all(a, pat));
}
# Regression tests for anchor handling in the split_string family (GH-3455).
# Runs at a lower priority than the other zeek_init handler so that its
# output is printed after the existing test output.
event zeek_init() &priority=-5
	{
	# Anchor testing.

	# A BOL-anchored pattern must not match in the middle of the string.
	local r = split_string_n("test", /^est/, T, 1);
	assert |r| == 1;
	assert r[0] == "test";
	print "test", "^est", r;

	# An EOL-anchored pattern must not match before the end of the string.
	r = split_string_n("test", /tes$/, T, 1);
	assert |r| == 1;
	assert r[0] == "test";
	print "test", "tes$", r;

	# A fully anchored pattern matching the whole input yields empty
	# strings on both sides of the separator.
	r = split_string_n("test", /^test$/, T, 1);
	assert |r| == 3;
	assert r[0] == "";
	assert r[1] == "test";
	assert r[2] == "";
	print "test", "^test$", r;

	# A pattern that can match the empty string only splits on non-empty
	# matches.
	r = split_string_n("aa bb cc", / ?/, F, 0);
	assert |r| == 3;
	assert r[0] == "aa";
	assert r[1] == "bb";
	assert r[2] == "cc";
	print "aa bb cc", "/ ?/", r;

	# Same as above, but with separators included in the result.
	r = split_string_n("aa bb cc", / ?/, T, 0);
	assert |r| == 5;
	assert r[0] == "aa";
	assert r[1] == " ";
	assert r[2] == "bb";
	assert r[3] == " ";
	assert r[4] == "cc";
	print "aa bb cc", "/ ?/", r;

	r = split_string_n("aa bb cc", / +/, F, 0);
	assert |r| == 3;
	assert r[0] == "aa";
	assert r[1] == "bb";
	assert r[2] == "cc";
	print "aa bb cc", "/ +/", r;
	}