diff --git a/CHANGES b/CHANGES
index a944d89460..d4fcef86e8 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,6 @@
+3.3.0-dev.109 | 2020-08-14 10:00:07 -0700
+
+  * GH-178: Add new string bif methods based on python string utilities (Tim Wojtulewicz, Corelight)
 
 3.3.0-dev.107 | 2020-08-12 20:01:37 +0000
 
diff --git a/VERSION b/VERSION
index 82f6583a08..b23bd4dfb6 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.3.0-dev.107
+3.3.0-dev.109
diff --git a/doc b/doc
index c303feaa43..ba34035317 160000
--- a/doc
+++ b/doc
@@ -1 +1 @@
-Subproject commit c303feaa43d8b645e44a54a87e559747024eb148
+Subproject commit ba3403531777d3f07438e7d45a8430eed599fad6
diff --git a/src/strings.bif b/src/strings.bif
index 0567bdf709..aeec22dc4b 100644
--- a/src/strings.bif
+++ b/src/strings.bif
@@ -5,6 +5,7 @@
 %%{ // C segment
 #include <vector>
 #include <algorithm>
+#include <cctype>
 using namespace std;
 
 #include "SmithWaterman.h"
@@ -1150,7 +1151,382 @@ function hexdump%(data_str: string%) : string
 ##
 function reverse%(str: string%) : string
 	%{
-	string s = string((const char*)str->Bytes(), str->Len());
+	string s = str->ToStdString();
 	reverse(s.begin(), s.end());
 	return zeek::make_intrusive<zeek::StringVal>(s.length(), (const char*)s.c_str());
 	%}
+
+## Returns the number of non-overlapping occurrences of a substring within a string.
+##
+## str: The string to search in.
+## sub: The string to search for. An empty string is reported as occurring
+##      ``|str| + 1`` times, mirroring Python's ``str.count``.
+##
+## Returns: The number of times the substring occurred.
+##
+function count_substr%(str: string, sub: string%) : count
+	%{
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// An empty needle matches without advancing the search position, so the loop
+	// below would never terminate; answer that case directly.
+	if ( sub_s.empty() )
+		return zeek::val_mgr->Count(s.size() + 1);
+
+	size_t count = 0;
+	size_t pos = s.find(sub_s);
+	while ( pos != string::npos )
+		{
+		++count;
+		pos = s.find(sub_s, pos + sub_s.size());
+		}
+
+	return zeek::val_mgr->Count(count);
+	%}
+
+%%{
+
+// Shared implementation of find_str() and rfind_str(). Searches for sub within the
+// inclusive index range [start, end] of str, where a negative end means "through the
+// end of the string". Returns the match position relative to the beginning of str,
+// or -1 if there is no match or the range is invalid.
+static int64_t do_find_str(zeek::StringVal* str, zeek::StringVal* sub, uint64_t start, int64_t end, bool rfind)
+	{
+	// Don't bother if the start is after the end of the string.
+	if ( start > str->Len() )
+		return -1;
+
+	// Also don't bother (and return an error) if the end is before the start.
+	if ( end >= 0 && static_cast<uint64_t>(end) < start )
+		{
+		reporter->Error("find_str: end position must be greater than start position");
+		return -1;
+		}
+
+	size_t end_pos = str->Len();
+	if ( end >= 0 && end < str->Len() )
+		end_pos = end;
+
+	// std::string::substr() takes a length rather than an end index, and clamps a
+	// length that runs past the end of the string.
+	size_t range_len = end_pos - start + 1;
+
+	// One last sanity check, don't bother doing string operations at all if the range is shorter than
+	// the length of the search string.
+	if ( range_len < sub->Len() )
+		return -1;
+
+	string s = str->ToStdString().substr(start, range_len);
+	string sub_s = sub->ToStdString();
+	size_t pos = rfind ? s.rfind(sub_s) : s.find(sub_s);
+
+	if ( pos == string::npos )
+		return -1;
+
+	return pos + start;
+	}
+
+%%}
+
+## Finds a string within another string, starting from the beginning. This works by taking a substring within
+## the provided indexes and searching for the sub argument. This means that ranges shorter than the string in
+## the sub argument will always return a failure.
+##
+## str: The string to search in.
+## sub: The string to search for.
+## start: An optional position for the start of the substring.
+## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
+##      means a search until the end of the string.
+##
+## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
+##          starting position is after the ending position.
+##
+function find_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
+	%{
+	return zeek::val_mgr->Int(do_find_str(str, sub, start, end, false));
+	%}
+
+## The same as find_str(), but returns the highest index matching the substring instead of the smallest.
+##
+## str: The string to search in.
+## sub: The string to search for.
+## start: An optional position for the start of the substring.
+## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
+##      means a search from the end of the string.
+##
+## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
+##          starting position is after the ending position.
+##
+function rfind_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
+	%{
+	return zeek::val_mgr->Int(do_find_str(str, sub, start, end, true));
+	%}
+
+## Returns whether a string starts with a substring.
+##
+## str: The string to check.
+## sub: The prefix to look for.
+##
+## Returns: T if *str* begins with *sub*, F otherwise.
+##
+function starts_with%(str: string, sub: string%) : bool
+	%{
+	// Anchoring the reverse search at index 0 tests only the prefix position,
+	// instead of scanning the whole string when there is no match.
+	return zeek::val_mgr->Bool(str->ToStdString().rfind(sub->ToStdString(), 0) == 0);
+	%}
+
+## Returns whether a string ends with a substring.
+##
+## str: The string to check.
+## sub: The suffix to look for.
+##
+## Returns: T if *str* ends with *sub*, F otherwise.
+##
+function ends_with%(str: string, sub: string%) : bool
+	%{
+	if ( sub->Len() > str->Len() )
+		return zeek::val_mgr->Bool(false);
+
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+	return zeek::val_mgr->Bool(s.rfind(sub_s) == (s.size() - sub_s.size()));
+	%}
+
+## Returns whether an entire string consists only of digits.
+##
+## str: The string to check.
+##
+## Returns: T if every character of *str* is a digit (also T for an empty string), F otherwise.
+##
+function is_num%(str: string%) : bool
+	%{
+	// Python's version of this method (which this is based on) just checks to see if every
+	// character in the string is a numeric value. If something more than this is desired, we
+	// could use something like std::from_chars or std::strto{ul,f} to check it.
+	// The <cctype> classifiers are undefined for negative char values, so go through unsigned char.
+	const char* s = str->CheckString();
+	for ( int i = 0; i < str->Len(); i++ )
+		if ( ! std::isdigit(static_cast<unsigned char>(s[i])) )
+			return zeek::val_mgr->False();
+
+	return zeek::val_mgr->True();
+	%}
+
+## Returns whether an entire string is alphabetic characters.
+##
+## str: The string to check.
+##
+## Returns: T if every character of *str* is alphabetic (also T for an empty string), F otherwise.
+##
+function is_alpha%(str: string%) : bool
+	%{
+	// The <cctype> classifiers are undefined for negative char values, so go through unsigned char.
+	const char* s = str->CheckString();
+	for ( int i = 0; i < str->Len(); i++ )
+		if ( ! std::isalpha(static_cast<unsigned char>(s[i])) )
+			return zeek::val_mgr->False();
+
+	return zeek::val_mgr->True();
+	%}
+
+## Returns whether an entire string is alphanumeric characters.
+##
+## str: The string to check.
+##
+## Returns: T if every character of *str* is alphanumeric (also T for an empty string), F otherwise.
+##
+function is_alnum%(str: string%) : bool
+	%{
+	// The <cctype> classifiers are undefined for negative char values, so go through unsigned char.
+	const char* s = str->CheckString();
+	for ( int i = 0; i < str->Len(); i++ )
+		if ( ! std::isalnum(static_cast<unsigned char>(s[i])) )
+			return zeek::val_mgr->False();
+
+	return zeek::val_mgr->True();
+	%}
+
+## Returns a left-justified version of the string, padded to a specific length with a specified character.
+##
+## str: The string to left-justify.
+## width: The length of the returned string. If this value is less than or equal to the length of str, a
+##        copy of str is returned.
+## fill: The character used to fill in any extra characters in the resulting string. If the string passed is
+##       not exactly one character long, an error is reported. This defaults to the space character.
+##
+## Returns: A left-justified version of a string, padded with characters to a specific length.
+##
+function ljust%(str: string, width: count, fill: string &default=" "%) : string
+	%{
+	if ( fill->Len() != 1 )
+		{
+		reporter->Error("Fill string passed to ljust() must be a single character");
+		return nullptr;
+		}
+
+	string new_s = str->ToStdString();
+
+	if ( width <= new_s.size() )
+		return zeek::StringValPtr(zeek::NewRef{}, str);
+
+	new_s.insert(new_s.size(), width - new_s.size(), fill->CheckString()[0]);
+	return zeek::make_intrusive<zeek::StringVal>(new_s);
+	%}
+
+%%{
+
+// Shared implementation of rjust() and zfill(): pads str on the left with fill up to width
+// characters. The width is unsigned to match the BIFs' count argument without narrowing.
+static zeek::StringValPtr do_rjust(zeek::StringVal* str, uint64_t width, char fill)
+	{
+	string new_s = str->ToStdString();
+
+	if ( width <= new_s.size() )
+		return { zeek::NewRef{}, str };
+
+	new_s.insert(0, width - new_s.size(), fill);
+	return zeek::make_intrusive<zeek::StringVal>(new_s);
+	}
+
+%%}
+
+## Returns a right-justified version of the string, padded to a specific length with a specified character.
+##
+## str: The string to right-justify.
+## width: The length of the returned string. If this value is less than or equal to the length of str, a
+##        copy of str is returned.
+## fill: The character used to fill in any extra characters in the resulting string. If the string passed is
+##       not exactly one character long, an error is reported. This defaults to the space character.
+##
+## Returns: A right-justified version of a string, padded with characters to a specific length.
+##
+function rjust%(str: string, width: count, fill: string &default=" "%) : string
+	%{
+	if ( fill->Len() != 1 )
+		{
+		reporter->Error("Fill string passed to rjust() must be a single character");
+		return nullptr;
+		}
+
+	return do_rjust(str, width, fill->CheckString()[0]);
+	%}
+
+## Swaps the case of every alphabetic character in a string. For example, the string "aBc" is returned as "AbC".
+##
+## str: The string to swap cases in.
+##
+## Returns: A copy of the str with the case of each character swapped.
+##
+function swap_case%(str: string%) : string
+	%{
+	string s = str->ToStdString();
+	for ( size_t i = 0; i < s.size(); i++ )
+		{
+		// The <cctype> functions are undefined for negative char values.
+		unsigned char c = s[i];
+		if ( std::islower(c) )
+			s[i] = std::toupper(c);
+		else if ( std::isupper(c) )
+			s[i] = std::tolower(c);
+		}
+
+	return zeek::make_intrusive<zeek::StringVal>(s);
+	%}
+
+## Converts a string to Title Case. This changes the first character of each sequence of non-space characters
+## in the string to be capitalized. See https://docs.python.org/2/library/stdtypes.html#str.title for more info.
+##
+## str: The string to convert.
+##
+## Returns: A title-cased version of the string.
+##
+function to_title%(str: string%) : string
+	%{
+	string s = str->ToStdString();
+	size_t pos = s.find_first_not_of(' ');
+	if ( pos == string::npos )
+		return zeek::StringValPtr(zeek::NewRef{}, str);
+
+	while ( pos != string::npos )
+		{
+		s[pos] = std::toupper(static_cast<unsigned char>(s[pos]));
+		pos = s.find(' ', pos+1);
+		if ( pos == string::npos )
+			break;
+
+		pos = s.find_first_not_of(' ', pos+1);
+		}
+
+	return zeek::make_intrusive<zeek::StringVal>(s);
+	%}
+
+## Returns a copy of a string filled on the left side with zeroes. This is effectively rjust(str, width, "0").
+##
+## str: The string to pad.
+## width: The length of the returned string. If this is not greater than the length of str, str is returned.
+##
+## Returns: A copy of *str* padded on the left with "0" characters up to *width* characters.
+##
+function zfill%(str: string, width: count%) : string
+	%{
+	return do_rjust(str, width, '0');
+	%}
+
+## Similar to lstrip(), except does the removal repeatedly if the pattern repeats at the start of the string.
+##
+## str: The string to remove the prefix from.
+## sub: The prefix to remove, matched as a whole string. An empty prefix removes nothing.
+##
+## Returns: *str* with every leading repetition of *sub* removed.
+##
+function remove_prefix%(str: string, sub: string%) : string
+	%{
+	// This could just use repeated calls to lstrip(), except for a couple of reasons:
+	// 1) lstrip() creates a StringVal at the end, and that would mean repeated recreation of objects
+	// 2) lstrip() searches for any character in the string, not the string as a whole.
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// An empty prefix can never be consumed; bail out rather than loop forever.
+	if ( sub_s.empty() )
+		return zeek::StringValPtr(zeek::NewRef{}, str);
+
+	// Compare only at the current offset, so that a match overlapping the previous one
+	// (e.g. "aa" at index 1 of "aaaab") cannot end the stripping early.
+	size_t next_pos = 0;
+	while ( s.compare(next_pos, sub_s.size(), sub_s) == 0 )
+		next_pos += sub_s.size();
+
+	if ( next_pos == 0 )
+		return zeek::StringValPtr(zeek::NewRef{}, str);
+
+	return zeek::make_intrusive<zeek::StringVal>(s.substr(next_pos));
+	%}
+
+## Similar to rstrip(), except does the removal repeatedly if the pattern repeats at the end of the string.
+##
+## str: The string to remove the suffix from.
+## sub: The suffix to remove, matched as a whole string. An empty suffix removes nothing.
+##
+## Returns: *str* with every trailing repetition of *sub* removed.
+##
+function remove_suffix%(str: string, sub: string%) : string
+	%{
+	// See the note in remove_prefix for why this doesn't just call rstrip.
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// An empty suffix can never be consumed; bail out rather than loop forever.
+	if ( sub_s.empty() )
+		return zeek::StringValPtr(zeek::NewRef{}, str);
+
+	// Walk backwards one whole suffix at a time, comparing only at the current end, so that
+	// overlapping matches cannot end the stripping early and no unsigned arithmetic wraps around.
+	size_t end_pos = s.size();
+	while ( end_pos >= sub_s.size() && s.compare(end_pos - sub_s.size(), sub_s.size(), sub_s) == 0 )
+		end_pos -= sub_s.size();
+
+	return zeek::make_intrusive<zeek::StringVal>(s.substr(0, end_pos));
+	%}
diff --git a/testing/btest/Baseline/bifs.string_utils/out b/testing/btest/Baseline/bifs.string_utils/out
new file mode 100644
index 0000000000..4639ad0bc8
--- /dev/null
+++ b/testing/btest/Baseline/bifs.string_utils/out
@@ -0,0 +1,69 @@
+Justification (input string 'abc')
+----------------------------------
+ljust: 'abc'
+ljust: 'abc'
+ljust: 'abc  '
+ljust: 'abc--'
+rjust: 'abc'
+rjust: 'abc'
+rjust: '  abc'
+rjust: '--abc'
+zfill: 'abc'
+zfill: 'abc'
+zfill: '00abc'
+
+Content checking
+----------------
+is_num abc : 0
+is_num 123 : 1
+is_alpha ab : 1
+is_alpha 1a : 0
+is_alpha a1 : 0
+is_alnum ab : 1
+is_alnum 1a : 1
+is_alnum a1 : 1
+is_alnum 12 : 1
+is_alnum ##12: 0
+
+String counting (input str 'aabbaa')
+------------------------------------
+count_substr aa: 2
+count_substr bb: 1
+count_substr cc: 0
+count_substr '': 7
+
+Starts/endswith
+---------------
+starts_with bro: 1
+starts_with ids: 0
+ends_with ids: 1
+ends_with bro: 0
+
+Transformations
+---------------
+swap_case 'aBc': AbC
+to_title 'bro is a very neat ids': 'Bro Is A Very Neat Ids'
+to_title ' ': ' '
+to_title ' a c ': ' A C '
+remove_prefix 'ananab'/'an' : ab
+remove_prefix 'anatnab'/'an': atnab
+remove_prefix 'aaaab'/'aa' : b
+remove_suffix 'banana'/'na' : ba
+remove_suffix 'bantana'/'na': banta
+remove_suffix 'baaaa'/'aa' : b
+
+find_str/rfind_str (input string 'abcdefghi')
+-----------------------------------------------------
+find_str: 0
+find_str: -1
+find_str: -1
+find_str: 4
+find_str: 4
+find_str: -1
+find_str: 0
+find_str: -1
+find_str: -1
+find_str: 4
+find_str: 4
+find_str: -1
+
diff --git a/testing/btest/bifs/string_utils.zeek b/testing/btest/bifs/string_utils.zeek
new file mode 100644
index 0000000000..85a0a97daf
--- /dev/null
+++ b/testing/btest/bifs/string_utils.zeek
@@ -0,0 +1,85 @@
+# @TEST-EXEC: zeek -b %INPUT >out
+# @TEST-EXEC: btest-diff out
+
+event zeek_init()
+	{
+	print "Justification (input string 'abc')";
+	print "----------------------------------";
+	local s1 : string = "abc";
+	print fmt("ljust: '%s'", ljust(s1, 2, " ")); # 'abc'
+	print fmt("ljust: '%s'", ljust(s1, 3, " ")); # 'abc'
+	print fmt("ljust: '%s'", ljust(s1, 5)); # 'abc  '
+	print fmt("ljust: '%s'", ljust(s1, 5, "-")); # 'abc--'
+	print fmt("ljust: '%s'", ljust(s1, 2, "--")); # This should return an error
+	print fmt("rjust: '%s'", rjust(s1, 2, " ")); # 'abc'
+	print fmt("rjust: '%s'", rjust(s1, 3, " ")); # 'abc'
+	print fmt("rjust: '%s'", rjust(s1, 5)); # '  abc'
+	print fmt("rjust: '%s'", rjust(s1, 5, "-")); # '--abc'
+	print fmt("rjust: '%s'", rjust(s1, 2, "--")); # This should return an error
+	print fmt("zfill: '%s'", zfill(s1, 2)); # 'abc'
+	print fmt("zfill: '%s'", zfill(s1, 3)); # 'abc'
+	print fmt("zfill: '%s'", zfill(s1, 5)); # '00abc'
+	print "";
+
+	print "Content checking";
+	print "----------------";
+	print fmt("is_num abc : %d", is_num("abc"));
+	print fmt("is_num 123 : %d", is_num("123"));
+	print fmt("is_alpha ab : %d", is_alpha("ab"));
+	print fmt("is_alpha 1a : %d", is_alpha("1a"));
+	print fmt("is_alpha a1 : %d", is_alpha("a1"));
+	print fmt("is_alnum ab : %d", is_alnum("ab"));
+	print fmt("is_alnum 1a : %d", is_alnum("1a"));
+	print fmt("is_alnum a1 : %d", is_alnum("a1"));
+	print fmt("is_alnum 12 : %d", is_alnum("12"));
+	print fmt("is_alnum ##12: %d", is_alnum("##12"));
+	print "";
+
+	print "String counting (input str 'aabbaa')";
+	print "------------------------------------";
+	local s2 : string = "aabbaa";
+	print fmt("count_substr aa: %d", count_substr(s2, "aa"));
+	print fmt("count_substr bb: %d", count_substr(s2, "bb"));
+	print fmt("count_substr cc: %d", count_substr(s2, "cc"));
+	print fmt("count_substr '': %d", count_substr(s2, ""));
+	print "";
+
+	print "Starts/endswith";
+	print "---------------";
+	local s3: string = "abcdefghi";
+	print fmt("starts_with bro: %d", starts_with(s3, "abc"));
+	print fmt("starts_with ids: %d", starts_with(s3, "ghi"));
+	print fmt("ends_with ids: %d", ends_with(s3, "ghi"));
+	print fmt("ends_with bro: %d", ends_with(s3, "abc"));
+	print "";
+
+	print "Transformations";
+	print "---------------";
+	print fmt("swap_case 'aBc': %s", swap_case("aBc"));
+	print fmt("to_title 'bro is a very neat ids': '%s'", to_title("bro is a very neat ids"));
+	print fmt("to_title ' ': '%s'", to_title(" "));
+	print fmt("to_title ' a c ': '%s'", to_title(" a c "));
+	print fmt("remove_prefix 'ananab'/'an' : %s", remove_prefix("ananab", "an"));
+	print fmt("remove_prefix 'anatnab'/'an': %s", remove_prefix("anatnab", "an"));
+	print fmt("remove_prefix 'aaaab'/'aa' : %s", remove_prefix("aaaab", "aa"));
+	print fmt("remove_suffix 'banana'/'na' : %s", remove_suffix("banana", "na"));
+	print fmt("remove_suffix 'bantana'/'na': %s", remove_suffix("bantana", "na"));
+	print fmt("remove_suffix 'baaaa'/'aa' : %s", remove_suffix("baaaa", "aa"));
+	print "";
+
+	print fmt("find_str/rfind_str (input string '%s')", s3);
+	print "-----------------------------------------------------";
+	print fmt("find_str: %d", find_str(s3, "abcd"));
+	print fmt("find_str: %d", find_str(s3, "abcd", 1));
+	print fmt("find_str: %d", find_str(s3, "abcd", 0, 2));
+	print fmt("find_str: %d", find_str(s3, "efg"));
+	print fmt("find_str: %d", find_str(s3, "efg", 2, 6));
+	print fmt("find_str: %d", find_str(s3, "efg", 6, 2));
+	print fmt("find_str: %d", rfind_str(s3, "abcd"));
+	print fmt("find_str: %d", rfind_str(s3, "abcd", 1));
+	print fmt("find_str: %d", rfind_str(s3, "abcd", 0, 2));
+	print fmt("find_str: %d", rfind_str(s3, "efg"));
+	print fmt("find_str: %d", rfind_str(s3, "efg", 2, 6));
+	print fmt("find_str: %d", rfind_str(s3, "efg", 6, 2));
+	print "";
+	}