mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Merge remote-tracking branch 'origin/topic/awelzel/3455-do-split-string-2'
* origin/topic/awelzel/3455-do-split-string-2:
  - strings.bif/do_split_string: Pass bol and eol to MatchPrefix()
  - RE_Matcher: Add MatchPrefix with bol/eol control
This commit is contained in:
commit
d9b8154c4e
8 changed files with 88 additions and 11 deletions
9
CHANGES
9
CHANGES
|
@ -1,3 +1,12 @@
|
||||||
|
6.2.0-dev.141 | 2023-11-17 13:01:19 +0100
|
||||||
|
|
||||||
|
* GH-3455: strings.bif/do_split_string: Pass bol and eol to MatchPrefix() (Arne Welzel, Corelight)
|
||||||
|
|
||||||
|
This allows better control of BOL and EOL. MatchPrefix() / LongestMatch()
|
||||||
|
previously always started with BOL.
|
||||||
|
|
||||||
|
* RE_Matcher: Add MatchPrefix with bol/eol control (Arne Welzel, Corelight)
|
||||||
|
|
||||||
6.2.0-dev.137 | 2023-11-11 17:45:51 +0100
|
6.2.0-dev.137 | 2023-11-11 17:45:51 +0100
|
||||||
|
|
||||||
* Bind scan_path to the scope; avoid heap allocation (Dominik Charousset, Corelight)
|
* Bind scan_path to the scope; avoid heap allocation (Dominik Charousset, Corelight)
|
||||||
|
|
4
NEWS
4
NEWS
|
@ -19,6 +19,10 @@ New Functionality
|
||||||
Changed Functionality
|
Changed Functionality
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
- The ``split_string`` family of functions now respects the beginning-of-line ``^`` and
|
||||||
|
end-of-line $ anchors. Previously, an anchored pattern would be matched anywhere
|
||||||
|
in the input string.
|
||||||
|
|
||||||
Removed Functionality
|
Removed Functionality
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
||||||
6.2.0-dev.137
|
6.2.0-dev.141
|
||||||
|
|
14
src/RE.cc
14
src/RE.cc
|
@ -318,7 +318,7 @@ bool RE_Match_State::Match(const u_char* bv, int n, bool bol, bool eol, bool cle
|
||||||
return accepted_matches.size() != old_matches;
|
return accepted_matches.size() != old_matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
|
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n, bool bol, bool eol) {
|
||||||
if ( ! dfa )
|
if ( ! dfa )
|
||||||
// An empty pattern matches anything.
|
// An empty pattern matches anything.
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -327,11 +327,13 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
|
||||||
int last_accept = -1;
|
int last_accept = -1;
|
||||||
DFA_State* d = dfa->StartState();
|
DFA_State* d = dfa->StartState();
|
||||||
|
|
||||||
d = d->Xtion(ecs[SYM_BOL], dfa);
|
if ( bol ) {
|
||||||
if ( ! d )
|
d = d->Xtion(ecs[SYM_BOL], dfa);
|
||||||
return -1;
|
if ( ! d )
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
if ( d->Accept() )
|
if ( d->Accept() ) // initial state or bol match (e.g, / */ or /^ ?/)
|
||||||
last_accept = 0;
|
last_accept = 0;
|
||||||
|
|
||||||
for ( int i = 0; i < n; ++i ) {
|
for ( int i = 0; i < n; ++i ) {
|
||||||
|
@ -345,7 +347,7 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) {
|
||||||
last_accept = i + 1;
|
last_accept = i + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( d ) {
|
if ( d && eol ) {
|
||||||
d = d->Xtion(ecs[SYM_EOL], dfa);
|
d = d->Xtion(ecs[SYM_EOL], dfa);
|
||||||
if ( d && d->Accept() )
|
if ( d && d->Accept() )
|
||||||
return n;
|
return n;
|
||||||
|
|
7
src/RE.h
7
src/RE.h
|
@ -106,7 +106,7 @@ public:
|
||||||
|
|
||||||
int LongestMatch(const char* s);
|
int LongestMatch(const char* s);
|
||||||
int LongestMatch(const String* s);
|
int LongestMatch(const String* s);
|
||||||
int LongestMatch(const u_char* bv, int n);
|
int LongestMatch(const u_char* bv, int n, bool bol = true, bool eol = true);
|
||||||
|
|
||||||
EquivClass* EC() { return &equiv_class; }
|
EquivClass* EC() { return &equiv_class; }
|
||||||
|
|
||||||
|
@ -220,6 +220,11 @@ public:
|
||||||
int MatchPrefix(const String* s) { return re_exact->LongestMatch(s); }
|
int MatchPrefix(const String* s) { return re_exact->LongestMatch(s); }
|
||||||
int MatchPrefix(const u_char* s, int n) { return re_exact->LongestMatch(s, n); }
|
int MatchPrefix(const u_char* s, int n) { return re_exact->LongestMatch(s, n); }
|
||||||
|
|
||||||
|
// MatchPrefix() version allowing control of bol and eol.
|
||||||
|
// This can be useful when searching for a pattern with an
|
||||||
|
// anchor within a larger string.
|
||||||
|
int MatchPrefix(const u_char* s, int n, bool bol, bool eol) { return re_exact->LongestMatch(s, n, bol, eol); }
|
||||||
|
|
||||||
bool Match(const u_char* s, int n) { return re_anywhere->Match(s, n); }
|
bool Match(const u_char* s, int n) { return re_anywhere->Match(s, n); }
|
||||||
|
|
||||||
const char* PatternText() const { return re_exact->PatternText(); }
|
const char* PatternText() const { return re_exact->PatternText(); }
|
||||||
|
|
|
@ -280,15 +280,23 @@ static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val,
|
||||||
int num_sep = 0;
|
int num_sep = 0;
|
||||||
|
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
bool bol = true;
|
||||||
|
const bool eol = true;
|
||||||
|
|
||||||
while ( n >= 0 )
|
while ( n >= 0 )
|
||||||
{
|
{
|
||||||
offset = 0;
|
offset = 0;
|
||||||
// Find next match offset.
|
// Find next match offset.
|
||||||
int end_of_match = 0;
|
int end_of_match = 0;
|
||||||
while ( n > 0 &&
|
while ( n > 0 )
|
||||||
(end_of_match = re->MatchPrefix(s + offset, n)) <= 0 )
|
|
||||||
{
|
{
|
||||||
// Move on to next byte.
|
end_of_match = re->MatchPrefix(s + offset, n, bol, eol);
|
||||||
|
if ( end_of_match > 0 )
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Move on to next byte; use BOL only on the first byte so that
|
||||||
|
// a BOL anchored pattern won't be matched anywhere else.
|
||||||
|
bol = false;
|
||||||
++offset;
|
++offset;
|
||||||
--n;
|
--n;
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,3 +31,9 @@ A
|
||||||
C
|
C
|
||||||
=
|
=
|
||||||
D
|
D
|
||||||
|
test, ^est, [test]
|
||||||
|
test, tes$, [test]
|
||||||
|
test, ^test$, [, test, ]
|
||||||
|
aa bb cc, / ?/, [aa, bb, cc]
|
||||||
|
aa bb cc, / ?/, [aa, , bb, , cc]
|
||||||
|
aa bb cc, / +/, [aa, bb, cc]
|
||||||
|
|
|
@ -34,3 +34,46 @@ event zeek_init()
|
||||||
pat = /=/;
|
pat = /=/;
|
||||||
print_string_vector(split_string_all(a, pat));
|
print_string_vector(split_string_all(a, pat));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
event zeek_init() &priority=-5
|
||||||
|
{
|
||||||
|
# Anchor testing.
|
||||||
|
local r = split_string_n("test", /^est/, T, 1);
|
||||||
|
assert |r| == 1;
|
||||||
|
assert r[0] == "test";
|
||||||
|
print "test", "^est", r;
|
||||||
|
|
||||||
|
r = split_string_n("test", /tes$/, T, 1);
|
||||||
|
assert |r| == 1;
|
||||||
|
assert r[0] == "test";
|
||||||
|
print "test", "tes$", r;
|
||||||
|
|
||||||
|
r = split_string_n("test", /^test$/, T, 1);
|
||||||
|
assert |r| == 3;
|
||||||
|
assert r[0] == "";
|
||||||
|
assert r[1] == "test";
|
||||||
|
assert r[2] == "";
|
||||||
|
print "test", "^test$", r;
|
||||||
|
|
||||||
|
r = split_string_n("aa bb cc", / ?/, F, 0);
|
||||||
|
assert |r| == 3;
|
||||||
|
assert r[0] == "aa";
|
||||||
|
assert r[1] == "bb";
|
||||||
|
assert r[2] == "cc";
|
||||||
|
print "aa bb cc", "/ ?/", r;
|
||||||
|
|
||||||
|
r = split_string_n("aa bb cc", / ?/, T, 0);
|
||||||
|
assert |r| == 5;
|
||||||
|
assert r[0] == "aa";
|
||||||
|
assert r[1] == " ";
|
||||||
|
assert r[2] == "bb";
|
||||||
|
assert r[3] == " ";
|
||||||
|
print "aa bb cc", "/ ?/", r;
|
||||||
|
|
||||||
|
r = split_string_n("aa bb cc", / +/, F, 0);
|
||||||
|
assert |r| == 3;
|
||||||
|
assert r[0] == "aa";
|
||||||
|
assert r[1] == "bb";
|
||||||
|
assert r[2] == "cc";
|
||||||
|
print "aa bb cc", "/ +/", r;
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue