mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Merge remote-tracking branch 'origin/topic/timw/178-string-functions'
* origin/topic/timw/178-string-functions: GH-178: Add new string bif methods based on python string utilities
This commit is contained in:
commit
b89935107d
6 changed files with 469 additions and 3 deletions
3
CHANGES
3
CHANGES
|
@ -1,3 +1,6 @@
|
|||
3.3.0-dev.109 | 2020-08-14 10:00:07 -0700
|
||||
|
||||
* GH-178: Add new string bif methods based on python string utilities (Tim Wojtulewicz, Corelight)
|
||||
|
||||
3.3.0-dev.107 | 2020-08-12 20:01:37 +0000
|
||||
|
||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
|||
3.3.0-dev.107
|
||||
3.3.0-dev.109
|
||||
|
|
2
doc
2
doc
|
@ -1 +1 @@
|
|||
Subproject commit c303feaa43d8b645e44a54a87e559747024eb148
|
||||
Subproject commit ba3403531777d3f07438e7d45a8430eed599fad6
|
317
src/strings.bif
317
src/strings.bif
|
@ -5,6 +5,7 @@
|
|||
%%{ // C segment
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
using namespace std;
|
||||
|
||||
#include "SmithWaterman.h"
|
||||
|
@ -1150,7 +1151,321 @@ function hexdump%(data_str: string%) : string
|
|||
##
|
||||
function reverse%(str: string%) : string
	%{
	// Work on a std::string copy of the input and build the result from its
	// reverse iterators instead of reversing in place.
	const string original = str->ToStdString();
	const string flipped(original.rbegin(), original.rend());

	return zeek::make_intrusive<zeek::StringVal>(flipped.length(), (const char*)flipped.c_str());
	%}
|
||||
|
||||
## Returns the number of times a substring occurs within a string
|
||||
##
|
||||
## str: The string to search in.
|
||||
## substr: The string to search for.
|
||||
##
|
||||
## Returns: The number of times the substring occurred.
|
||||
##
|
||||
function count_substr%(str: string, sub: string%) : count
|
||||
%{
|
||||
string s = str->ToStdString();
|
||||
string sub_s = sub->ToStdString();
|
||||
|
||||
size_t count = 0;
|
||||
size_t pos = s.find(sub_s);
|
||||
while ( pos != string::npos )
|
||||
{
|
||||
++count;
|
||||
pos = s.find(sub_s, pos + sub_s.size());
|
||||
}
|
||||
|
||||
return zeek::val_mgr->Count(count);
|
||||
%}
|
||||
|
||||
%%{
|
||||
|
||||
// Shared implementation of find_str() and rfind_str().
//
// Searches for `sub` within the byte range [start, end] of `str`. Both bounds
// are zero-based offsets into `str` and `end` is inclusive. A negative `end`,
// or one that lies at or beyond the end of the string, means "search up to the
// end of the string". With `rfind` set, the last match inside the range is
// returned instead of the first.
//
// Returns the offset of the match relative to the start of `str`, or -1 if
// there is no match or the range cannot contain one. A range whose end lies
// before its start is additionally reported as an error.
int64_t do_find_str(zeek::StringVal* str, zeek::StringVal* sub, uint64_t start, int64_t end, bool rfind)
	{
	const uint64_t str_len = str->Len();
	const uint64_t sub_len = sub->Len();

	// Don't bother if the start is after the end of the string.
	if ( start > str_len )
		return -1;

	// Also don't bother (and return an error) if the end is before the start.
	// Negative values mean "no end given" and are never an error; checking the
	// sign first avoids comparing a signed value against an unsigned one.
	if ( end >= 0 && static_cast<uint64_t>(end) < start )
		{
		reporter->Error("find_str: end position must be greater than start position");
		return -1;
		}

	// Convert the inclusive end offset into an exclusive one so that the
	// length of the range is simply stop - start.
	uint64_t stop = str_len;
	if ( end >= 0 && static_cast<uint64_t>(end) < str_len )
		stop = static_cast<uint64_t>(end) + 1;

	const uint64_t range_len = stop - start;

	// One last sanity check, don't bother doing string operations at all if the range is shorter than
	// the length of the search string.
	if ( range_len < sub_len )
		return -1;

	// substr() takes a length as its second argument, not an end offset.
	// Passing the end offset would let the search run past the requested range.
	const string window = str->ToStdString().substr(start, range_len);
	const string needle = sub->ToStdString();

	const size_t pos = rfind ? window.rfind(needle) : window.find(needle);
	if ( pos == string::npos )
		return -1;

	// Translate the offset within the window back into an offset within str.
	return static_cast<int64_t>(pos + start);
	}
|
||||
|
||||
%%}
|
||||
|
||||
## Finds a string within another string, starting from the beginning. This works by taking a substring within
|
||||
## the provided indexes and searching for the sub argument. This means that ranges shorter than the string in
|
||||
## the sub argument will always return a failure.
|
||||
##
|
||||
## str: The string to search in.
|
||||
## sub: The string to search for.
|
||||
## start: An optional position for the start of the substring.
|
||||
## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
|
||||
## means a search until the end of the string.
|
||||
##
|
||||
## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
|
||||
## starting position is after the ending position.
|
||||
##
|
||||
function find_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
	%{
	// Forward search: ask the shared helper for the lowest matching position.
	const bool search_backwards = false;
	const int64_t position = do_find_str(str, sub, start, end, search_backwards);

	return zeek::val_mgr->Int(position);
	%}
|
||||
|
||||
## The same as find_str(), but returns the highest index matching the substring instead of the smallest.
|
||||
##
|
||||
## str: The string to search in.
|
||||
## sub: The string to search for.
|
||||
## start: An optional position for the start of the substring.
|
||||
## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
|
||||
## means a search from the end of the string.
|
||||
##
|
||||
## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
|
||||
## starting position is after the ending position.
|
||||
##
|
||||
function rfind_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
	%{
	// The result is -1 when nothing matches, so this BIF must be declared as
	// returning int (like find_str) to agree with the Int value created here.
	return zeek::val_mgr->Int(do_find_str(str, sub, start, end, true));
	%}
|
||||
|
||||
## Returns whether a string starts with a substring.
##
## str: The string to check.
## sub: The prefix to look for at the very beginning of *str*.
##
## Returns: T if *str* begins with *sub* (which is always the case for an
##          empty *sub*), F otherwise.
##
function starts_with%(str: string, sub: string%) : bool
	%{
	// A prefix longer than the string itself can never match.
	if ( sub->Len() > str->Len() )
		return zeek::val_mgr->False();

	const string s = str->ToStdString();
	const string sub_s = sub->ToStdString();

	// Only compare the leading bytes. Using find() here would keep scanning
	// the remainder of the string whenever it does not start with sub.
	return zeek::val_mgr->Bool(s.compare(0, sub_s.size(), sub_s) == 0);
	%}
|
||||
|
||||
## Returns whether a string ends with a substring.
##
## str: The string to check.
## sub: The suffix to look for at the very end of *str*.
##
## Returns: T if *str* ends with *sub* (which is always the case for an empty
##          *sub*), F otherwise.
##
function ends_with%(str: string, sub: string%) : bool
	%{
	// A suffix longer than the string itself can never match. This check also
	// keeps the subtraction below from wrapping around.
	if ( sub->Len() > str->Len() )
		return zeek::val_mgr->False();

	const string s = str->ToStdString();
	const string sub_s = sub->ToStdString();

	// Only compare the trailing bytes. Using rfind() here would walk backwards
	// through the whole string whenever it does not end with sub.
	const size_t tail_start = s.size() - sub_s.size();
	return zeek::val_mgr->Bool(s.compare(tail_start, sub_s.size(), sub_s) == 0);
	%}
|
||||
|
||||
## Returns whether an entire string consists only of digits.
##
## str: The string to check.
##
## Returns: T if every byte of *str* is a decimal digit, F otherwise. An empty
##          string yields T since it contains no non-digit byte.
##
function is_num%(str: string%) : bool
	%{
	// Python's version of this method (which this is based on) just checks to see if every
	// character in the string is a numeric value. If something more than this is desired, we
	// could use something like std::from_chars or std::strto{ul,f} to check it.

	// The loop is bounded by Len(), so walk the raw bytes directly; no
	// NUL-terminated C string is needed for this.
	const auto* bytes = str->Bytes();

	for ( int i = 0; i < str->Len(); i++ )
		// The <cctype> classifiers are only defined for values representable
		// as unsigned char (or EOF), so never pass a possibly negative char.
		if ( ! std::isdigit(static_cast<unsigned char>(bytes[i])) )
			return zeek::val_mgr->False();

	return zeek::val_mgr->True();
	%}
|
||||
|
||||
## Returns whether an entire string is alphabetic characters.
##
## str: The string to check.
##
## Returns: T if every byte of *str* is an alphabetic character, F otherwise.
##          An empty string yields T since it contains no other byte.
##
function is_alpha%(str: string%) : bool
	%{
	// The loop is bounded by Len(), so walk the raw bytes directly; no
	// NUL-terminated C string is needed for this.
	const auto* bytes = str->Bytes();

	for ( int i = 0; i < str->Len(); i++ )
		// The <cctype> classifiers are only defined for values representable
		// as unsigned char (or EOF), so never pass a possibly negative char.
		if ( ! std::isalpha(static_cast<unsigned char>(bytes[i])) )
			return zeek::val_mgr->False();

	return zeek::val_mgr->True();
	%}
|
||||
|
||||
## Returns whether an entire string is alphanumeric characters.
##
## str: The string to check.
##
## Returns: T if every byte of *str* is a letter or a decimal digit, F
##          otherwise. An empty string yields T since it contains no other byte.
##
function is_alnum%(str: string%) : bool
	%{
	// The loop is bounded by Len(), so walk the raw bytes directly; no
	// NUL-terminated C string is needed for this.
	const auto* bytes = str->Bytes();

	for ( int i = 0; i < str->Len(); i++ )
		// The <cctype> classifiers are only defined for values representable
		// as unsigned char (or EOF), so never pass a possibly negative char.
		if ( ! std::isalnum(static_cast<unsigned char>(bytes[i])) )
			return zeek::val_mgr->False();

	return zeek::val_mgr->True();
	%}
|
||||
|
||||
## Returns a left-justified version of the string, padded to a specific length with a specified character.
|
||||
##
|
||||
## str: The string to left-justify.
|
||||
## width: The length of the returned string. If this value is less than or equal to the length of str, a
|
||||
## copy of str is returned.
|
||||
## fill: The character used to fill in any extra characters in the resulting string. If a string longer than
|
||||
## one character is passed, an error is reported. This defaults to the space character.
|
||||
##
|
||||
## Returns: A left-justified version of a string, padded with characters to a specific length.
|
||||
##
|
||||
function ljust%(str: string, width: count, fill: string &default=" "%) : string
	%{
	// The padding must be exactly one character; anything else is an error.
	const bool single_char_fill = ( fill->Len() == 1 );
	if ( ! single_char_fill )
		{
		reporter->Error("Fill string passed to ljust() must be a single character");
		return nullptr;
		}

	string padded = str->ToStdString();
	const auto current_len = padded.size();

	// Already at least as wide as requested: hand back another reference to
	// the input value rather than building a new one.
	if ( current_len >= width )
		return zeek::StringValPtr(zeek::NewRef{}, str);

	// Add the missing characters on the right-hand side.
	padded.append(width - current_len, fill->CheckString()[0]);

	return zeek::make_intrusive<zeek::StringVal>(padded);
	%}
|
||||
|
||||
%%{
|
||||
|
||||
// Shared implementation of rjust() and zfill(): pads `str` on the left with
// `fill` until it is `width` bytes long.
//
// `width` is taken as an unsigned 64-bit value, matching the script-level
// `count` the callers pass in. Narrowing it to int made large widths wrap to
// small or negative numbers, which either skipped the padding silently or
// turned into an enormous insertion count.
//
// Returns a new reference to `str` itself when it is already at least `width`
// bytes long, otherwise a newly created padded string.
static zeek::StringValPtr do_rjust(zeek::StringVal* str, uint64_t width, char fill)
	{
	string new_s = str->ToStdString();

	// Nothing to pad.
	if ( width <= new_s.size() )
		return { zeek::NewRef{}, str };

	// Prepend exactly the number of missing characters.
	new_s.insert(0, width - new_s.size(), fill);
	return zeek::make_intrusive<zeek::StringVal>(new_s);
	}
|
||||
|
||||
%%}
|
||||
|
||||
## Returns a right-justified version of the string, padded to a specific length with a specified character.
|
||||
##
|
||||
## str: The string to right-justify.
|
||||
## width: The length of the returned string. If this value is less than or equal to the length of str, a
|
||||
## copy of str is returned.
|
||||
## fill: The character used to fill in any extra characters in the resulting string. If a string longer than
|
||||
## one character is passed, an error is reported. This defaults to the space character.
|
||||
##
|
||||
## Returns: A right-justified version of a string, padded with characters to a specific length.
|
||||
##
|
||||
function rjust%(str: string, width: count, fill: string &default=" "%) : string
	%{
	// The padding must be exactly one character; anything else is an error.
	const bool single_char_fill = ( fill->Len() == 1 );
	if ( ! single_char_fill )
		{
		reporter->Error("Fill string passed to rjust() must be a single character");
		return nullptr;
		}

	// Delegate the actual left-padding to the helper shared with zfill().
	const char pad = fill->CheckString()[0];
	return do_rjust(str, width, pad);
	%}
|
||||
|
||||
## Swaps the case of every alphabetic character in a string. For example, the string "aBc" will be returned as "AbC".
|
||||
##
|
||||
## str: The string to swap cases in.
|
||||
##
|
||||
## Returns: A copy of the str with the case of each character swapped.
|
||||
##
|
||||
function swap_case%(str: string%) : string
	%{
	string s = str->ToStdString();

	for ( char& c : s )
		{
		// The <cctype> functions are only defined for values representable as
		// unsigned char (or EOF); a plain char may be negative for bytes
		// >= 0x80, so convert before classifying or mapping.
		const auto uc = static_cast<unsigned char>(c);

		if ( std::islower(uc) )
			c = static_cast<char>(std::toupper(uc));
		else if ( std::isupper(uc) )
			c = static_cast<char>(std::tolower(uc));
		}

	return zeek::make_intrusive<zeek::StringVal>(s);
	%}
|
||||
|
||||
## Converts a string to Title Case. This changes the first character of each sequence of non-space characters
|
||||
## in the string to be capitalized. See https://docs.python.org/2/library/stdtypes.html#str.title for more info.
|
||||
##
|
||||
## str: The string to convert.
|
||||
##
|
||||
## Returns: A title-cased version of the string.
|
||||
##
|
||||
function to_title%(str: string%) : string
	%{
	string s = str->ToStdString();

	// Find the start of the first word. A string made up of nothing but
	// spaces (or an empty one) has nothing to capitalize, so hand back
	// another reference to the input itself.
	size_t pos = s.find_first_not_of(' ');
	if ( pos == string::npos )
		return zeek::StringValPtr(zeek::NewRef{}, str);

	while ( pos != string::npos )
		{
		// pos is the first character of a word. std::toupper is only defined
		// for values representable as unsigned char, so convert first.
		s[pos] = static_cast<char>(std::toupper(static_cast<unsigned char>(s[pos])));

		// Skip to the space that ends this word ...
		pos = s.find(' ', pos + 1);
		if ( pos == string::npos )
			break;

		// ... and on to the first character of the next word, if there is one.
		pos = s.find_first_not_of(' ', pos + 1);
		}

	return zeek::make_intrusive<zeek::StringVal>(s);
	%}
|
||||
|
||||
## Returns a copy of a string filled on the left side with zeroes. This is effectively rjust(str, width, "0").
|
||||
function zfill%(str: string, width: count%) : string
	%{
	// Zero-filling is right-justification with '0' as the padding character.
	const char zero = '0';
	return do_rjust(str, width, zero);
	%}
|
||||
|
||||
## Similar to lstrip(), except does the removal repeatedly if the pattern repeats at the start of the string.
|
||||
function remove_prefix%(str: string, sub: string%) : string
|
||||
%{
|
||||
// This could just use repeated calls to lstrip(), except for a couple of reasons:
|
||||
// 1) lstrip() creates a StringVal at the end, and that would mean repeated recreation of objects
|
||||
// 2) lstrip() searches for any character in the string, not the string as a whole.
|
||||
string s = str->ToStdString();
|
||||
string sub_s = sub->ToStdString();
|
||||
|
||||
size_t pos = s.find(sub_s);
|
||||
if ( pos != 0 )
|
||||
return zeek::IntrusivePtr<zeek::StringVal>(NewRef{}, str);
|
||||
|
||||
pos = s.find(sub_s, pos+1);
|
||||
size_t next_pos = sub_s.size();
|
||||
while ( pos == next_pos && next_pos < s.size() )
|
||||
{
|
||||
next_pos += sub_s.size();
|
||||
pos = s.find(sub_s, pos+1);
|
||||
}
|
||||
|
||||
return zeek::make_intrusive<zeek::StringVal>(s.substr(next_pos));
|
||||
%}
|
||||
|
||||
## Similar to rstrip(), except does the removal repeatedly if the pattern repeats at the end of the string.
|
||||
function remove_suffix%(str: string, sub: string%) : string
|
||||
%{
|
||||
// See the note in removeprefix for why this doesn't just call rstrip.
|
||||
string s = str->ToStdString();
|
||||
string sub_s = sub->ToStdString();
|
||||
|
||||
size_t pos = s.rfind(sub_s);
|
||||
size_t next_pos = s.size() - sub_s.size();
|
||||
|
||||
while ( pos == next_pos )
|
||||
{
|
||||
next_pos -= sub_s.size();
|
||||
pos = s.rfind(sub_s, pos-1);
|
||||
}
|
||||
|
||||
return zeek::make_intrusive<zeek::StringVal>(s.substr(0, next_pos + sub_s.size()));
|
||||
%}
|
||||
|
|
66
testing/btest/Baseline/bifs.string_utils/out
Normal file
66
testing/btest/Baseline/bifs.string_utils/out
Normal file
|
@ -0,0 +1,66 @@
|
|||
Justification (input string 'abc')
|
||||
----------------------------------
|
||||
ljust: 'abc'
|
||||
ljust: 'abc'
|
||||
ljust: 'abc '
|
||||
ljust: 'abc--'
|
||||
rjust: 'abc'
|
||||
rjust: 'abc'
|
||||
rjust: ' abc'
|
||||
rjust: '--abc'
|
||||
zfill: 'abc'
|
||||
zfill: 'abc'
|
||||
zfill: '00abc'
|
||||
|
||||
Content checking
|
||||
----------------
|
||||
is_num abc : 0
|
||||
is_num 123 : 1
|
||||
is_alpha ab : 1
|
||||
is_alpha 1a : 0
|
||||
is_alpha a1 : 0
|
||||
is_alnum ab : 1
|
||||
is_alnum 1a : 1
|
||||
is_alnum a1 : 1
|
||||
is_alnum 12 : 1
|
||||
is_alnum ##12: 0
|
||||
|
||||
String counting (input str 'aabbaa')
|
||||
------------------------------------
|
||||
count_substr aa: 2
|
||||
count_substr bb: 1
|
||||
count_substr cc: 0
|
||||
|
||||
Starts/endswith
|
||||
---------------
|
||||
starts_with bro: 1
|
||||
starts_with ids: 0
|
||||
ends_with ids: 1
|
||||
ends_with bro: 0
|
||||
|
||||
Transformations
|
||||
---------------
|
||||
swap_case 'aBc': AbC
|
||||
to_title 'bro is a very neat ids': 'Bro Is A Very Neat Ids'
|
||||
to_title ' ': ' '
|
||||
to_title ' a c ': ' A C '
|
||||
remove_prefix 'ananab'/'an' : ab
|
||||
remove_prefix 'anatnab'/'an': atnab
|
||||
remove_suffix 'banana'/'na' : ba
|
||||
remove_suffix 'bantana'/'na': banta
|
||||
|
||||
find_str/rfind_str (input string 'abcdefghi')
|
||||
-----------------------------------------------------
|
||||
find_str: 0
|
||||
find_str: -1
|
||||
find_str: -1
|
||||
find_str: 4
|
||||
find_str: 4
|
||||
find_str: -1
|
||||
find_str: 0
|
||||
find_str: -1
|
||||
find_str: -1
|
||||
find_str: 4
|
||||
find_str: 4
|
||||
find_str: -1
|
||||
|
82
testing/btest/bifs/string_utils.zeek
Normal file
82
testing/btest/bifs/string_utils.zeek
Normal file
|
@ -0,0 +1,82 @@
|
|||
# @TEST-EXEC: zeek -b %INPUT >out
|
||||
# @TEST-EXEC: btest-diff out
|
||||
|
||||
event zeek_init()
|
||||
{
|
||||
print "Justification (input string 'abc')";
|
||||
print "----------------------------------";
|
||||
local s1 : string = "abc";
|
||||
print fmt("ljust: '%s'", ljust(s1, 2, " ")); # 'abc'
|
||||
print fmt("ljust: '%s'", ljust(s1, 3, " ")); # 'abc'
|
||||
print fmt("ljust: '%s'", ljust(s1, 5)); # 'abc '
|
||||
print fmt("ljust: '%s'", ljust(s1, 5, "-")); # 'abc--'
|
||||
print fmt("ljust: '%s'", ljust(s1, 2, "--")); # This should return an error
|
||||
print fmt("rjust: '%s'", rjust(s1, 2, " ")); # 'abc'
|
||||
print fmt("rjust: '%s'", rjust(s1, 3, " ")); # 'abc'
|
||||
print fmt("rjust: '%s'", rjust(s1, 5)); # ' abc'
|
||||
print fmt("rjust: '%s'", rjust(s1, 5, "-")); # '--abc'
|
||||
print fmt("rjust: '%s'", rjust(s1, 2, "--")); # This should return an error
|
||||
print fmt("zfill: '%s'", zfill(s1, 2)); # 'abc'
|
||||
print fmt("zfill: '%s'", zfill(s1, 3)); # 'abc'
|
||||
print fmt("zfill: '%s'", zfill(s1, 5)); # '00abc'
|
||||
print "";
|
||||
|
||||
print "Content checking";
|
||||
print "----------------";
|
||||
print fmt("is_num abc : %d", is_num("abc"));
|
||||
print fmt("is_num 123 : %d", is_num("123"));
|
||||
print fmt("is_alpha ab : %d", is_alpha("ab"));
|
||||
print fmt("is_alpha 1a : %d", is_alpha("1a"));
|
||||
print fmt("is_alpha a1 : %d", is_alpha("a1"));
|
||||
print fmt("is_alnum ab : %d", is_alnum("ab"));
|
||||
print fmt("is_alnum 1a : %d", is_alnum("1a"));
|
||||
print fmt("is_alnum a1 : %d", is_alnum("a1"));
|
||||
print fmt("is_alnum 12 : %d", is_alnum("12"));
|
||||
print fmt("is_alnum ##12: %d", is_alnum("##12"));
|
||||
print "";
|
||||
|
||||
print "String counting (input str 'aabbaa')";
|
||||
print "------------------------------------";
|
||||
local s2 : string = "aabbaa";
|
||||
print fmt("count_substr aa: %d", count_substr(s2, "aa"));
|
||||
print fmt("count_substr bb: %d", count_substr(s2, "bb"));
|
||||
print fmt("count_substr cc: %d", count_substr(s2, "cc"));
|
||||
print "";
|
||||
|
||||
print "Starts/endswith";
|
||||
print "---------------";
|
||||
local s3: string = "abcdefghi";
|
||||
print fmt("starts_with bro: %d", starts_with(s3, "abc"));
|
||||
print fmt("starts_with ids: %d", starts_with(s3, "ghi"));
|
||||
print fmt("ends_with ids: %d", ends_with(s3, "ghi"));
|
||||
print fmt("ends_with bro: %d", ends_with(s3, "abc"));
|
||||
print "";
|
||||
|
||||
print "Transformations";
|
||||
print "---------------";
|
||||
print fmt("swap_case 'aBc': %s", swap_case("aBc"));
|
||||
print fmt("to_title 'bro is a very neat ids': '%s'", to_title("bro is a very neat ids"));
|
||||
print fmt("to_title ' ': '%s'", to_title(" "));
|
||||
print fmt("to_title ' a c ': '%s'", to_title(" a c "));
|
||||
print fmt("remove_prefix 'ananab'/'an' : %s", remove_prefix("ananab", "an"));
|
||||
print fmt("remove_prefix 'anatnab'/'an': %s", remove_prefix("anatnab", "an"));
|
||||
print fmt("remove_suffix 'banana'/'na' : %s", remove_suffix("banana", "na"));
|
||||
print fmt("remove_suffix 'bantana'/'na': %s", remove_suffix("bantana", "na"));
|
||||
print "";
|
||||
|
||||
print fmt("find_str/rfind_str (input string '%s')", s3);
|
||||
print "-----------------------------------------------------";
|
||||
print fmt("find_str: %d", find_str(s3, "abcd"));
|
||||
print fmt("find_str: %d", find_str(s3, "abcd", 1));
|
||||
print fmt("find_str: %d", find_str(s3, "abcd", 0, 2));
|
||||
print fmt("find_str: %d", find_str(s3, "efg"));
|
||||
print fmt("find_str: %d", find_str(s3, "efg", 2, 6));
|
||||
print fmt("find_str: %d", find_str(s3, "efg", 6, 2));
|
||||
print fmt("find_str: %d", rfind_str(s3, "abcd"));
|
||||
print fmt("find_str: %d", rfind_str(s3, "abcd", 1));
|
||||
print fmt("find_str: %d", rfind_str(s3, "abcd", 0, 2));
|
||||
print fmt("find_str: %d", rfind_str(s3, "efg"));
|
||||
print fmt("find_str: %d", rfind_str(s3, "efg", 2, 6));
|
||||
print fmt("find_str: %d", rfind_str(s3, "efg", 6, 2));
|
||||
print "";
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue