Merge remote-tracking branch 'origin/topic/timw/178-string-functions'

* origin/topic/timw/178-string-functions:
  GH-178: Add new string bif methods based on python string utilities
This commit is contained in:
Tim Wojtulewicz 2020-08-14 10:00:07 -07:00
commit b89935107d
6 changed files with 469 additions and 3 deletions

View file

@ -1,3 +1,6 @@
3.3.0-dev.109 | 2020-08-14 10:00:07 -0700
* GH-178: Add new string bif methods based on python string utilities (Tim Wojtulewicz, Corelight)
3.3.0-dev.107 | 2020-08-12 20:01:37 +0000

View file

@ -1 +1 @@
3.3.0-dev.107
3.3.0-dev.109

2
doc

@ -1 +1 @@
Subproject commit c303feaa43d8b645e44a54a87e559747024eb148
Subproject commit ba3403531777d3f07438e7d45a8430eed599fad6

View file

@ -5,6 +5,7 @@
%%{ // C segment
#include <vector>
#include <algorithm>
#include <cctype>
using namespace std;
#include "SmithWaterman.h"
@ -1150,7 +1151,321 @@ function hexdump%(data_str: string%) : string
##
function reverse%(str: string%) : string
	%{
	// Work on a std::string copy of the value so it can be reversed in place.
	// (Only one declaration of `s` may exist; the older Bytes()/Len()-based
	// construction is superseded by ToStdString().)
	string s = str->ToStdString();
	reverse(s.begin(), s.end());
	return zeek::make_intrusive<zeek::StringVal>(s.length(), (const char*)s.c_str());
	%}
## Returns the number of non-overlapping times a substring occurs within a string.
##
## str: The string to search in.
## sub: The string to search for.
##
## Returns: The number of times the substring occurred. An empty *sub* is
##          reported as zero occurrences.
##
function count_substr%(str: string, sub: string%) : count
	%{
	string s = str->ToStdString();
	string sub_s = sub->ToStdString();

	// An empty needle matches at every offset without ever advancing the
	// search position, so the loop below would never terminate for it.
	if ( sub_s.empty() )
		return zeek::val_mgr->Count(0);

	size_t count = 0;
	size_t pos = s.find(sub_s);

	while ( pos != string::npos )
		{
		++count;
		// Resume after the current match so occurrences never overlap.
		pos = s.find(sub_s, pos + sub_s.size());
		}

	return zeek::val_mgr->Count(count);
	%}
%%{
// Shared implementation of find_str() and rfind_str().
//
// Searches for `sub` inside the window of `str` that begins at byte offset
// `start` and runs through byte offset `end` (inclusive). A negative `end`, or
// one at or beyond the end of the string, extends the window to the end of
// the string. When `rfind` is true the last match in the window is reported,
// otherwise the first one.
//
// Returns the offset of the match measured from the beginning of `str`, or -1
// if there is no match or the window is invalid. An `end` that lies before
// `start` additionally reports an error.
int64_t do_find_str(zeek::StringVal* str, zeek::StringVal* sub, uint64_t start, int64_t end, bool rfind)
	{
	const uint64_t str_len = str->Len();
	const uint64_t sub_len = sub->Len();

	// Don't bother if the start is after the end of the string.
	if ( start > str_len )
		return -1;

	// Also don't bother (and return an error) if the end is before the start.
	// Negative values mean "until the end" and are never an error; checking the
	// sign first keeps the comparison with the unsigned start well-defined.
	if ( end >= 0 && static_cast<uint64_t>(end) < start )
		{
		reporter->Error("find_str: end position must be greater than start position");
		return -1;
		}

	// One-past-the-last offset of the search window.
	uint64_t stop = str_len;
	if ( end >= 0 && static_cast<uint64_t>(end) < str_len )
		stop = static_cast<uint64_t>(end) + 1;

	// start <= stop is guaranteed by the checks above, so this cannot wrap.
	const uint64_t window_len = stop - start;

	// One last sanity check, don't bother doing string operations at all if
	// the range is shorter than the length of the search string.
	if ( window_len < sub_len )
		return -1;

	// std::string::substr() takes a *length* as its second argument, not an
	// end position, so hand it the size of the window.
	const string window = str->ToStdString().substr(start, window_len);
	const string needle = sub->ToStdString();

	const size_t pos = rfind ? window.rfind(needle) : window.find(needle);

	if ( pos == string::npos )
		return -1;

	// Translate the window-relative position back into an offset into str.
	return static_cast<int64_t>(pos + start);
	}
%%}
## Finds a string within another string, starting from the beginning. This works by taking a substring within
## the provided indexes and searching for the sub argument. This means that ranges shorter than the string in
## the sub argument will always return a failure.
##
## str: The string to search in.
## substr: The string to search for.
## start: An optional position for the start of the substring.
## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
## means a search until the end of the string.
##
## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
## starting position is after the ending position.
##
function find_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
%{
return zeek::val_mgr->Int(do_find_str(str, sub, start, end, false));
%}
## The same as find(), but returns the highest index matching the substring instead of the smallest.
##
## str: The string to search in.
## substr: The string to search for.
## start: An optional position for the start of the substring.
## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
## means a search from the end of the string.
##
## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
## starting position is after the ending position.
##
function rfind_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : count
%{
return zeek::val_mgr->Int(do_find_str(str, sub, start, end, true));
%}
## Returns whether a string starts with a substring.
##
## str: The string to check.
## sub: The prefix to look for.
##
## Returns: True if *str* begins with *sub*. An empty *sub* always matches.
##
function starts_with%(str: string, sub: string%) : bool
	%{
	// A prefix can never be longer than the string itself.
	if ( sub->Len() > str->Len() )
		return zeek::val_mgr->False();

	string s = str->ToStdString();
	string sub_s = sub->ToStdString();

	// Compare only the leading bytes instead of scanning all of s for an
	// occurrence of sub_s and then checking whether it happened to be at 0.
	return zeek::val_mgr->Bool(s.compare(0, sub_s.size(), sub_s) == 0);
	%}
## Returns whether a string ends with a substring.
##
## str: The string to check.
## sub: The suffix to look for.
##
## Returns: True if *str* ends with *sub*. An empty *sub* always matches.
##
function ends_with%(str: string, sub: string%) : bool
	%{
	// A suffix can never be longer than the string itself; this also keeps
	// the offset computation below from wrapping around.
	if ( sub->Len() > str->Len() )
		return zeek::val_mgr->Bool(false);

	string s = str->ToStdString();
	string sub_s = sub->ToStdString();

	// Compare only the trailing bytes rather than searching the whole string
	// backwards for the last occurrence of sub_s.
	const size_t tail_start = s.size() - sub_s.size();
	return zeek::val_mgr->Bool(s.compare(tail_start, sub_s.size(), sub_s) == 0);
	%}
## Returns whether an entire string consists only of digits.
##
## str: The string to check.
##
## Returns: True if every byte of *str* is a decimal digit. An empty string
##          yields true, since it contains no non-digit byte.
##
function is_num%(str: string%) : bool
	%{
	// Python's version of this method (which this is based on) just checks to see if every
	// character in the string is a numeric value. If something more than this is desired, we
	// could use something like std::from_chars or std::strto{ul,f} to check it.

	// Walk the raw byte buffer: Len() is the length of exactly that buffer.
	const auto* bytes = str->Bytes();

	for ( int i = 0; i < str->Len(); i++ )
		// The <cctype> classifiers require a value representable as
		// unsigned char; passing a negative char is undefined behavior.
		if ( ! std::isdigit(static_cast<unsigned char>(bytes[i])) )
			return zeek::val_mgr->False();

	return zeek::val_mgr->True();
	%}
## Returns whether an entire string is alphabetic characters.
##
## str: The string to check.
##
## Returns: True if every byte of *str* is an alphabetic character. An empty
##          string yields true, since it contains no non-alphabetic byte.
##
function is_alpha%(str: string%) : bool
	%{
	// Walk the raw byte buffer: Len() is the length of exactly that buffer.
	const auto* bytes = str->Bytes();

	for ( int i = 0; i < str->Len(); i++ )
		// The <cctype> classifiers require a value representable as
		// unsigned char; passing a negative char is undefined behavior.
		if ( ! std::isalpha(static_cast<unsigned char>(bytes[i])) )
			return zeek::val_mgr->False();

	return zeek::val_mgr->True();
	%}
## Returns whether an entire string is alphanumeric characters.
##
## str: The string to check.
##
## Returns: True if every byte of *str* is a letter or a digit. An empty
##          string yields true, since it contains no other byte.
##
function is_alnum%(str: string%) : bool
	%{
	// Walk the raw byte buffer: Len() is the length of exactly that buffer.
	const auto* bytes = str->Bytes();

	for ( int i = 0; i < str->Len(); i++ )
		// The <cctype> classifiers require a value representable as
		// unsigned char; passing a negative char is undefined behavior.
		if ( ! std::isalnum(static_cast<unsigned char>(bytes[i])) )
			return zeek::val_mgr->False();

	return zeek::val_mgr->True();
	%}
## Returns a left-justified version of the string, padded to a specific length with a specified character.
##
## str: The string to left-justify.
## count: The length of the returned string. If this value is less than or equal to the length of str, a
## copy of str is returned.
## fill: The character used to fill in any extra characters in the resulting string. If a string longer than
## one character is passed, an error is reported. This defaults to the space character.
##
## Returns: A left-justified version of a string, padded with characters to a specific length.
##
function ljust%(str: string, width: count, fill: string &default=" "%) : string
%{
if ( fill->Len() != 1 )
{
reporter->Error("Fill string passed to ljust() must be a single character");
return nullptr;
}
string new_s = str->ToStdString();
if ( width <= new_s.size() )
return zeek::StringValPtr(zeek::NewRef{}, str);
new_s.insert(new_s.size(), width - new_s.size(), fill->CheckString()[0]);
return zeek::make_intrusive<zeek::StringVal>(new_s);
%}
%%{
// Shared implementation of rjust() and zfill(): pads `str` on the left with
// `fill` until it is `width` characters long. If `str` already has at least
// `width` characters, a new reference to `str` itself is returned.
//
// `width` is taken as an unsigned 64-bit value because the callers pass a
// script-level count; a narrower signed parameter would truncate large widths
// and could turn them negative, which the unsigned comparison and subtraction
// below would then misread as an enormous padding length.
static zeek::StringValPtr do_rjust(zeek::StringVal* str, uint64_t width, char fill)
	{
	string new_s = str->ToStdString();

	if ( width <= new_s.size() )
		return { zeek::NewRef{}, str };

	// Prepend the missing number of fill characters.
	new_s.insert(0, width - new_s.size(), fill);
	return zeek::make_intrusive<zeek::StringVal>(new_s);
	}
%%}
## Returns a right-justified version of the string, padded to a specific length with a specified character.
##
## str: The string to right-justify.
## count: The length of the returned string. If this value is less than or equal to the length of str, a
## copy of str is returned.
## fill: The character used to fill in any extra characters in the resulting string. If a string longer than
## one character is passed, an error is reported. This defaults to the space character.
##
## Returns: A right-justified version of a string, padded with characters to a specific length.
##
function rjust%(str: string, width: count, fill: string &default=" "%) : string
%{
if ( fill->Len() != 1 )
{
reporter->Error("Fill string passed to rjust() must be a single character");
return nullptr;
}
return do_rjust(str, width, fill->CheckString()[0]);
%}
## Swaps the case of every alphabetic character in a string. For example, the
## string "aBc" will be returned as "AbC".
##
## str: The string to swap cases in.
##
## Returns: A copy of the str with the case of each character swapped.
##
function swap_case%(str: string%) : string
	%{
	string s = str->ToStdString();

	for ( auto& c : s )
		{
		// The <cctype> functions require a value representable as unsigned
		// char; passing a negative char is undefined behavior.
		const auto uc = static_cast<unsigned char>(c);

		if ( std::islower(uc) )
			c = static_cast<char>(std::toupper(uc));
		else if ( std::isupper(uc) )
			c = static_cast<char>(std::tolower(uc));
		}

	return zeek::make_intrusive<zeek::StringVal>(s);
	%}
## Converts a string to Title Case. This changes the first character of each sequence of non-space characters
## in the string to be capitalized. See https://docs.python.org/2/library/stdtypes.html#str.title for more info.
##
## str: The string to convert.
##
## Returns: A title-cased version of the string.
##
function to_title%(str: string%) : string
%{
string s = str->ToStdString();
size_t pos = s.find_first_not_of(' ');
if ( pos == string::npos )
return zeek::IntrusivePtr<zeek::StringVal>(NewRef{}, str);
while ( pos != string::npos )
{
s[pos] = std::toupper(s[pos]);
pos = s.find(' ', pos+1);
if ( pos == string::npos )
break;
pos = s.find_first_not_of(' ', pos+1);
}
return zeek::make_intrusive<zeek::StringVal>(s);
%}
## Returns a copy of a string filled on the left side with zeroes. This is effectively rjust(str, width, "0").
function zfill%(str: string, width: count%) : string
%{
return do_rjust(str, width, '0');
%}
## Similar to lstrip(), except does the removal repeatedly if the pattern
## repeats at the start of the string. The pattern is matched as a whole, not
## as a set of characters.
##
## str: The string to remove the prefix from.
## sub: The prefix to remove.
##
## Returns: *str* with every leading repetition of *sub* removed. If *str*
##          does not start with *sub*, or *sub* is empty, *str* is returned
##          unchanged.
##
function remove_prefix%(str: string, sub: string%) : string
	%{
	// This could just use repeated calls to lstrip(), except for a couple of reasons:
	// 1) lstrip() creates a StringVal at the end, and that would mean repeated recreation of objects
	// 2) lstrip() searches for any character in the string, not the string as a whole.
	string s = str->ToStdString();
	string sub_s = sub->ToStdString();

	// Number of leading bytes covered by back-to-back copies of sub_s.
	size_t offset = 0;

	if ( ! sub_s.empty() )
		{
		// Test exactly at the current offset. Searching for the "next"
		// occurrence instead can land on an overlapping match (e.g. "aa"
		// at index 1 of "aaaa") and end the stripping too early.
		while ( s.compare(offset, sub_s.size(), sub_s) == 0 )
			offset += sub_s.size();
		}

	// Nothing stripped: hand back a new reference to the input value.
	if ( offset == 0 )
		return zeek::StringValPtr(zeek::NewRef{}, str);

	return zeek::make_intrusive<zeek::StringVal>(s.substr(offset));
	%}
## Similar to rstrip(), except does the removal repeatedly if the pattern
## repeats at the end of the string. The pattern is matched as a whole, not as
## a set of characters.
##
## str: The string to remove the suffix from.
## sub: The suffix to remove.
##
## Returns: *str* with every trailing repetition of *sub* removed. If *str*
##          does not end with *sub*, or *sub* is empty, the result equals *str*.
##
function remove_suffix%(str: string, sub: string%) : string
	%{
	// This does not simply call rstrip(): rstrip() removes any of a set of
	// characters, whereas here the suffix has to match as a whole string.
	string s = str->ToStdString();
	string sub_s = sub->ToStdString();

	// Number of leading bytes of s that survive the stripping.
	size_t keep = s.size();

	// An empty suffix would match forever without shrinking `keep`.
	if ( ! sub_s.empty() )
		{
		// Test exactly at the current tail. The `keep >= size` guard stops
		// the unsigned subtraction from wrapping once fewer bytes than one
		// suffix remain. Searching backwards for the "previous" occurrence
		// instead could land on an overlapping match and stop too early.
		while ( keep >= sub_s.size() &&
		        s.compare(keep - sub_s.size(), sub_s.size(), sub_s) == 0 )
			keep -= sub_s.size();
		}

	return zeek::make_intrusive<zeek::StringVal>(s.substr(0, keep));
	%}

View file

@ -0,0 +1,66 @@
Justification (input string 'abc')
----------------------------------
ljust: 'abc'
ljust: 'abc'
ljust: 'abc '
ljust: 'abc--'
rjust: 'abc'
rjust: 'abc'
rjust: ' abc'
rjust: '--abc'
zfill: 'abc'
zfill: 'abc'
zfill: '00abc'
Content checking
----------------
is_num abc : 0
is_num 123 : 1
is_alpha ab : 1
is_alpha 1a : 0
is_alpha a1 : 0
is_alnum ab : 1
is_alnum 1a : 1
is_alnum a1 : 1
is_alnum 12 : 1
is_alnum ##12: 0
String counting (input str 'aabbaa')
------------------------------------
count_substr aa: 2
count_substr bb: 1
count_substr cc: 0
Starts/endswith
---------------
starts_with bro: 1
starts_with ids: 0
ends_with ids: 1
ends_with bro: 0
Transformations
---------------
swap_case 'aBc': AbC
to_title 'bro is a very neat ids': 'Bro Is A Very Neat Ids'
to_title ' ': ' '
to_title ' a c ': ' A C '
remove_prefix 'ananab'/'an' : ab
remove_prefix 'anatnab'/'an': atnab
remove_suffix 'banana'/'na' : ba
remove_suffix 'bantana'/'na': banta
find_str/rfind_str (input string 'abcdefghi')
-----------------------------------------------------
find_str: 0
find_str: -1
find_str: -1
find_str: 4
find_str: 4
find_str: -1
find_str: 0
find_str: -1
find_str: -1
find_str: 4
find_str: 4
find_str: -1

View file

@ -0,0 +1,82 @@
# @TEST-EXEC: zeek -b %INPUT >out
# @TEST-EXEC: btest-diff out
# Exercises the string utility functions and prints one line per call. The
# printed text is what gets compared by btest-diff, so every label and format
# string below is part of the expected output.
event zeek_init()
{
# Padding: ljust/rjust with widths below, equal to and above the input length,
# using both the default and an explicit fill, plus a two-character fill that
# is expected to be rejected.
print "Justification (input string 'abc')";
print "----------------------------------";
local s1 : string = "abc";
print fmt("ljust: '%s'", ljust(s1, 2, " ")); # 'abc'
print fmt("ljust: '%s'", ljust(s1, 3, " ")); # 'abc'
print fmt("ljust: '%s'", ljust(s1, 5)); # 'abc '
print fmt("ljust: '%s'", ljust(s1, 5, "-")); # 'abc--'
print fmt("ljust: '%s'", ljust(s1, 2, "--")); # This should return an error
print fmt("rjust: '%s'", rjust(s1, 2, " ")); # 'abc'
print fmt("rjust: '%s'", rjust(s1, 3, " ")); # 'abc'
print fmt("rjust: '%s'", rjust(s1, 5)); # ' abc'
print fmt("rjust: '%s'", rjust(s1, 5, "-")); # '--abc'
print fmt("rjust: '%s'", rjust(s1, 2, "--")); # This should return an error
print fmt("zfill: '%s'", zfill(s1, 2)); # 'abc'
print fmt("zfill: '%s'", zfill(s1, 3)); # 'abc'
print fmt("zfill: '%s'", zfill(s1, 5)); # '00abc'
print "";
# Character-class predicates; the boolean results are printed via %d.
print "Content checking";
print "----------------";
print fmt("is_num abc : %d", is_num("abc"));
print fmt("is_num 123 : %d", is_num("123"));
print fmt("is_alpha ab : %d", is_alpha("ab"));
print fmt("is_alpha 1a : %d", is_alpha("1a"));
print fmt("is_alpha a1 : %d", is_alpha("a1"));
print fmt("is_alnum ab : %d", is_alnum("ab"));
print fmt("is_alnum 1a : %d", is_alnum("1a"));
print fmt("is_alnum a1 : %d", is_alnum("a1"));
print fmt("is_alnum 12 : %d", is_alnum("12"));
print fmt("is_alnum ##12: %d", is_alnum("##12"));
print "";
# Substring counting: a repeated match, a single match and no match.
print "String counting (input str 'aabbaa')";
print "------------------------------------";
local s2 : string = "aabbaa";
print fmt("count_substr aa: %d", count_substr(s2, "aa"));
print fmt("count_substr bb: %d", count_substr(s2, "bb"));
print fmt("count_substr cc: %d", count_substr(s2, "cc"));
print "";
# Prefix/suffix checks. Note that the "bro"/"ids" labels do not name the
# arguments actually passed, which are "abc" and "ghi" against s3.
print "Starts/endswith";
print "---------------";
local s3: string = "abcdefghi";
print fmt("starts_with bro: %d", starts_with(s3, "abc"));
print fmt("starts_with ids: %d", starts_with(s3, "ghi"));
print fmt("ends_with ids: %d", ends_with(s3, "ghi"));
print fmt("ends_with bro: %d", ends_with(s3, "abc"));
print "";
# Case conversion and repeated prefix/suffix removal.
print "Transformations";
print "---------------";
print fmt("swap_case 'aBc': %s", swap_case("aBc"));
print fmt("to_title 'bro is a very neat ids': '%s'", to_title("bro is a very neat ids"));
print fmt("to_title ' ': '%s'", to_title(" "));
print fmt("to_title ' a c ': '%s'", to_title(" a c "));
print fmt("remove_prefix 'ananab'/'an' : %s", remove_prefix("ananab", "an"));
print fmt("remove_prefix 'anatnab'/'an': %s", remove_prefix("anatnab", "an"));
print fmt("remove_suffix 'banana'/'na' : %s", remove_suffix("banana", "na"));
print fmt("remove_suffix 'bantana'/'na': %s", remove_suffix("bantana", "na"));
print "";
# Forward and reverse searches over s3 with default, restricted and inverted
# (start after end) ranges. The rfind_str() results are printed under the
# same "find_str:" label as the find_str() ones.
print fmt("find_str/rfind_str (input string '%s')", s3);
print "-----------------------------------------------------";
print fmt("find_str: %d", find_str(s3, "abcd"));
print fmt("find_str: %d", find_str(s3, "abcd", 1));
print fmt("find_str: %d", find_str(s3, "abcd", 0, 2));
print fmt("find_str: %d", find_str(s3, "efg"));
print fmt("find_str: %d", find_str(s3, "efg", 2, 6));
print fmt("find_str: %d", find_str(s3, "efg", 6, 2));
print fmt("find_str: %d", rfind_str(s3, "abcd"));
print fmt("find_str: %d", rfind_str(s3, "abcd", 1));
print fmt("find_str: %d", rfind_str(s3, "abcd", 0, 2));
print fmt("find_str: %d", rfind_str(s3, "efg"));
print fmt("find_str: %d", rfind_str(s3, "efg", 2, 6));
print fmt("find_str: %d", rfind_str(s3, "efg", 6, 2));
print "";
}