mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Merge remote-tracking branch 'origin/topic/awelzel/3455-do-split-string-2'
* origin/topic/awelzel/3455-do-split-string-2: strings.bif/do_split_string: Pass bol and eol to MatchPrefix() RE_Matcher: Add MatchPrefix with bol/eol control
This commit is contained in:
commit
d9b8154c4e
8 changed files with 88 additions and 11 deletions
9
CHANGES
9
CHANGES
|
@ -1,3 +1,12 @@
|
|||
6.2.0-dev.141 | 2023-11-17 13:01:19 +0100
|
||||
|
||||
* GH-3455: strings.bif/do_split_string: Pass bol and eol to MatchPrefix() (Arne Welzel, Corelight)
|
||||
|
||||
This allows better control of BOL and EOL. MatchPrefix() / LongestMatch()
|
||||
always start with BOL.
|
||||
|
||||
* RE_Matcher: Add MatchPrefix with bol/eol control (Arne Welzel, Corelight)
|
||||
|
||||
6.2.0-dev.137 | 2023-11-11 17:45:51 +0100
|
||||
|
||||
* Bind scan_path to the scope; avoid heap allocation (Dominik Charousset, Corelight)
|
||||
|
|
4
NEWS
4
NEWS
|
@ -19,6 +19,10 @@ New Functionality
|
|||
Changed Functionality
|
||||
---------------------
|
||||
|
||||
- The ``split_string`` family of functions now respects the beginning-of-line ^ and
|
||||
end-of-line $ anchors. Previously, an anchored pattern would be matched anywhere
|
||||
in the input string.
|
||||
|
||||
Removed Functionality
|
||||
---------------------
|
||||
|
||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
|||
6.2.0-dev.137
|
||||
6.2.0-dev.141
|
||||
|
|
14
src/RE.cc
14
src/RE.cc
|
@ -318,7 +318,7 @@ bool RE_Match_State::Match(const u_char* bv, int n, bool bol, bool eol, bool cle
|
|||
return accepted_matches.size() != old_matches;
|
||||
}
|
||||
|
||||
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
|
||||
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n, bool bol, bool eol) {
|
||||
if ( ! dfa )
|
||||
// An empty pattern matches anything.
|
||||
return 0;
|
||||
|
@ -327,11 +327,13 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
|
|||
int last_accept = -1;
|
||||
DFA_State* d = dfa->StartState();
|
||||
|
||||
d = d->Xtion(ecs[SYM_BOL], dfa);
|
||||
if ( ! d )
|
||||
return -1;
|
||||
if ( bol ) {
|
||||
d = d->Xtion(ecs[SYM_BOL], dfa);
|
||||
if ( ! d )
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ( d->Accept() )
|
||||
if ( d->Accept() ) // initial state or bol match (e.g, / */ or /^ ?/)
|
||||
last_accept = 0;
|
||||
|
||||
for ( int i = 0; i < n; ++i ) {
|
||||
|
@ -345,7 +347,7 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
|
|||
last_accept = i + 1;
|
||||
}
|
||||
|
||||
if ( d ) {
|
||||
if ( d && eol ) {
|
||||
d = d->Xtion(ecs[SYM_EOL], dfa);
|
||||
if ( d && d->Accept() )
|
||||
return n;
|
||||
|
|
7
src/RE.h
7
src/RE.h
|
@ -106,7 +106,7 @@ public:
|
|||
|
||||
int LongestMatch(const char* s);
|
||||
int LongestMatch(const String* s);
|
||||
int LongestMatch(const u_char* bv, int n);
|
||||
int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true);
|
||||
|
||||
EquivClass* EC() { return &equiv_class; }
|
||||
|
||||
|
@ -220,6 +220,11 @@ public:
|
|||
int MatchPrefix(const String* s) { return re_exact->LongestMatch(s); }
|
||||
int MatchPrefix(const u_char* s, int n) { return re_exact->LongestMatch(s, n); }
|
||||
|
||||
// MatchPrefix() version allowing control of bol and eol.
|
||||
// This can be useful when searching for a pattern with an
|
||||
// anchor within a larger string.
|
||||
int MatchPrefix(const u_char* s, int n, bool bol, bool eol) { return re_exact->LongestMatch(s, n, bol, eol); }
|
||||
|
||||
bool Match(const u_char* s, int n) { return re_anywhere->Match(s, n); }
|
||||
|
||||
const char* PatternText() const { return re_exact->PatternText(); }
|
||||
|
|
|
@ -280,15 +280,23 @@ static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val,
|
|||
int num_sep = 0;
|
||||
|
||||
int offset = 0;
|
||||
bool bol = true;
|
||||
const bool eol = true;
|
||||
|
||||
while ( n >= 0 )
|
||||
{
|
||||
offset = 0;
|
||||
// Find next match offset.
|
||||
int end_of_match = 0;
|
||||
while ( n > 0 &&
|
||||
(end_of_match = re->MatchPrefix(s + offset, n)) <= 0 )
|
||||
while ( n > 0 )
|
||||
{
|
||||
// Move on to next byte.
|
||||
end_of_match = re->MatchPrefix(s + offset, n, bol, eol);
|
||||
if ( end_of_match > 0 )
|
||||
break;
|
||||
|
||||
// Move on to next byte, use BOL only on the first byte such that
|
||||
// a BOL anchored pattern won't be matched anywhere else.
|
||||
bol = false;
|
||||
++offset;
|
||||
--n;
|
||||
}
|
||||
|
|
|
@ -31,3 +31,9 @@ A
|
|||
C
|
||||
=
|
||||
D
|
||||
test, ^est, [test]
|
||||
test, tes$, [test]
|
||||
test, ^test$, [, test, ]
|
||||
aa bb cc, / ?/, [aa, bb, cc]
|
||||
aa bb cc, / ?/, [aa, , bb, , cc]
|
||||
aa bb cc, / +/, [aa, bb, cc]
|
||||
|
|
|
@ -34,3 +34,46 @@ event zeek_init()
|
|||
pat = /=/;
|
||||
print_string_vector(split_string_all(a, pat));
|
||||
}
|
||||
|
||||
event zeek_init() &priority=-5
|
||||
{
|
||||
# Anchor testing.
|
||||
local r = split_string_n("test", /^est/, T, 1);
|
||||
assert |r| == 1;
|
||||
assert r[0] == "test";
|
||||
print "test", "^est", r;
|
||||
|
||||
r = split_string_n("test", /tes$/, T, 1);
|
||||
assert |r| == 1;
|
||||
assert r[0] == "test";
|
||||
print "test", "tes$", r;
|
||||
|
||||
r = split_string_n("test", /^test$/, T, 1);
|
||||
assert |r| == 3;
|
||||
assert r[0] == "";
|
||||
assert r[1] == "test";
|
||||
assert r[2] == "";
|
||||
print "test", "^test$", r;
|
||||
|
||||
r = split_string_n("aa bb cc", / ?/, F, 0);
|
||||
assert |r| == 3;
|
||||
assert r[0] == "aa";
|
||||
assert r[1] == "bb";
|
||||
assert r[2] == "cc";
|
||||
print "aa bb cc", "/ ?/", r;
|
||||
|
||||
r = split_string_n("aa bb cc", / ?/, T, 0);
|
||||
assert |r| == 5;
|
||||
assert r[0] == "aa";
|
||||
assert r[1] == " ";
|
||||
assert r[2] == "bb";
|
||||
assert r[3] == " ";
|
||||
print "aa bb cc", "/ ?/", r;
|
||||
|
||||
r = split_string_n("aa bb cc", / +/, F, 0);
|
||||
assert |r| == 3;
|
||||
assert r[0] == "aa";
|
||||
assert r[1] == "bb";
|
||||
assert r[2] == "cc";
|
||||
print "aa bb cc", "/ +/", r;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue