Merge remote-tracking branch 'origin/topic/timw/178-string-functions'

* origin/topic/timw/178-string-functions: GH-178: Add new string bif methods based on python string utilities
2025-10-02 14:48:21 +00:00 · 2020-08-14 10:00:07 -07:00 · 2020-08-14 10:00:07 -07:00 · b89935107d
commit b89935107d
parent 7f267d3e87 5d764d6678
6 changed files with 469 additions and 3 deletions
--- a/3
+++ b/3
@ -1,3 +1,6 @@
+3.3.0-dev.109 | 2020-08-14 10:00:07 -0700
+
+  * GH-178: Add new string bif methods based on python string utilities (Tim Wojtulewicz, Corelight)

 3.3.0-dev.107 | 2020-08-12 20:01:37 +0000

--- a/2
+++ b/2
@ -1 +1 @@
-3.3.0-dev.107
+3.3.0-dev.109
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit c303feaa43d8b645e44a54a87e559747024eb148
+Subproject commit ba3403531777d3f07438e7d45a8430eed599fad6
--- a/src/strings.bif
+++ b/src/strings.bif
@ -5,6 +5,7 @@
 %%{ // C segment
 #include <vector>
 #include <algorithm>
+#include <cctype>
 using namespace std;

 #include "SmithWaterman.h"
@ -1150,7 +1151,321 @@ function hexdump%(data_str: string%) : string
 ##
 function reverse%(str: string%) : string
 	%{
-	string s = string((const char*)str->Bytes(), str->Len());
+	string s = str->ToStdString();
 	reverse(s.begin(), s.end());
 	return zeek::make_intrusive<zeek::StringVal>(s.length(), (const char*)s.c_str());
 	%}
+
+## Returns the number of times a substring occurs within a string
+##
+## str: The string to search in.
+## substr: The string to search for.
+##
+## Returns: The number of times the substring occurred.
+##
+function count_substr%(str: string, sub: string%) : count
+	%{
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	size_t count = 0;
+	size_t pos = s.find(sub_s);
+	while ( pos != string::npos )
+		{
+		++count;
+		pos = s.find(sub_s, pos + sub_s.size());
+		}
+
+	return zeek::val_mgr->Count(count);
+	%}
+
+%%{
+
+// Shared implementation of find_str()/rfind_str().
+//
+// Searches for `sub` inside the slice of `str` that begins at byte offset `start`
+// and runs up to and including byte offset `end`. A negative `end`, or one at or
+// past the end of the string, means "until the end of the string". When `rfind`
+// is true the last match inside the slice is reported, otherwise the first.
+//
+// Returns the match offset relative to the beginning of `str`, or -1 if there is
+// no match or the arguments are unusable. An error is reported when `end` lies
+// before `start`.
+int64_t do_find_str(zeek::StringVal* str, zeek::StringVal* sub, uint64_t start, int64_t end, bool rfind)
+	{
+	const uint64_t str_len = str->Len();
+	const uint64_t sub_len = sub->Len();
+
+	// Don't bother if the start is after the end of the string.
+	if ( start > str_len )
+		return -1;
+
+	// Also don't bother (and return an error) if the end is before the start.
+	// Negative values never trigger this; they mean "search to the end".
+	if ( end >= 0 && static_cast<uint64_t>(end) < start )
+		{
+		reporter->Error("find_str: end position must be greater than start position");
+		return -1;
+		}
+
+	uint64_t end_pos = str_len;
+	if ( end >= 0 && static_cast<uint64_t>(end) < str_len )
+		end_pos = end;
+
+	// Number of bytes in the inclusive range [start, end_pos].
+	const uint64_t range_len = end_pos - start + 1;
+
+	// One last sanity check, don't bother doing string operations at all if the range is shorter than
+	// the length of the search string.
+	if ( range_len < sub_len )
+		return -1;
+
+	// std::string::substr() takes (offset, count), not (offset, end), so pass the
+	// length of the range. A count reaching past the end is clamped by substr().
+	string s = str->ToStdString().substr(start, range_len);
+	const string needle = sub->ToStdString();
+
+	const size_t pos = rfind ? s.rfind(needle) : s.find(needle);
+
+	if ( pos == string::npos )
+		return -1;
+
+	// Translate the slice-relative offset back into an offset into `str`.
+	return static_cast<int64_t>(pos + start);
+	}
+
+%%}
+
+## Locates the first occurrence of one string inside another. The search is carried out on the substring
+## of str selected by the start and end arguments, so a range that is shorter than the sub argument can
+## never produce a match.
+##
+## str: The string to search in.
+## sub: The string to search for.
+## start: An optional position for the start of the substring.
+## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
+##      means a search until the end of the string.
+##
+## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
+## starting position is after the ending position.
+##
+function find_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
+	%{
+	// false selects a forward search, i.e. the lowest matching index.
+	const int64_t found_at = do_find_str(str, sub, start, end, false);
+	return zeek::val_mgr->Int(found_at);
+	%}
+
+## The same as find_str(), but returns the highest index matching the substring instead of the smallest.
+##
+## str: The string to search in.
+## sub: The string to search for.
+## start: An optional position for the start of the substring.
+## end: An optional position for the end of the substring. A value less than zero (such as the default -1)
+##      means the searched range extends to the end of the string.
+##
+## Returns: The position of the substring. Returns -1 if the string wasn't found. Prints an error if the
+## starting position is after the ending position.
+##
+function rfind_str%(str: string, sub: string, start: count &default=0, end: int &default=-1%) : int
+	%{
+	// The result is a signed Int because -1 signals "not found"; the declared
+	// return type therefore has to be int, matching find_str().
+	return zeek::val_mgr->Int(do_find_str(str, sub, start, end, true));
+	%}
+
+## Returns whether a string starts with a substring.
+##
+## str: The string to check.
+## sub: The prefix to look for. An empty prefix always matches.
+##
+## Returns: True if str begins with sub, false otherwise.
+##
+function starts_with%(str: string, sub: string%) : bool
+	%{
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// Compare only the leading bytes instead of searching the whole string for
+	// sub and then checking whether the first hit happens to be at offset 0.
+	// compare() clamps the length, so a prefix longer than s simply mismatches.
+	return zeek::val_mgr->Bool(s.compare(0, sub_s.size(), sub_s) == 0);
+	%}
+
+## Returns whether a string ends with a substring.
+##
+## str: The string to check.
+## sub: The suffix to look for. An empty suffix always matches.
+##
+## Returns: True if str ends with sub, false otherwise.
+##
+function ends_with%(str: string, sub: string%) : bool
+	%{
+	// A suffix longer than the string can never match. This guard also keeps
+	// the offset computation below from underflowing.
+	if ( sub->Len() > str->Len() )
+		return zeek::val_mgr->Bool(false);
+
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// Compare just the trailing bytes rather than searching backwards through
+	// the whole string for the last occurrence of sub.
+	const size_t offset = s.size() - sub_s.size();
+	return zeek::val_mgr->Bool(s.compare(offset, sub_s.size(), sub_s) == 0);
+	%}
+
+## Returns whether an entire string consists only of digits.
+##
+## str: The string to check.
+##
+## Returns: True if every byte of str is a decimal digit. An empty string yields
+##          true, since it contains no non-digit byte.
+##
+function is_num%(str: string%) : bool
+	%{
+	// Python's version of this method (which this is based on) just checks to see if every
+	// character in the string is a numeric value. If something more than this is desired, we
+	// could use something like std::from_chars or std::strto{ul,f} to check it.
+	//
+	// Walk the raw bytes rather than CheckString().
+	// NOTE(review): CheckString() is understood to hand back a placeholder for
+	// strings with embedded NULs, which would not be Len() bytes long - confirm.
+	const auto* bytes = str->Bytes();
+	for ( int i = 0; i < str->Len(); i++ )
+		// <cctype> functions require a value representable as unsigned char;
+		// passing a negative plain char is undefined behaviour.
+		if ( ! std::isdigit(static_cast<unsigned char>(bytes[i])) )
+			return zeek::val_mgr->False();
+
+	return zeek::val_mgr->True();
+	%}
+
+## Returns whether an entire string is alphabetic characters.
+##
+## str: The string to check.
+##
+## Returns: True if every byte of str is alphabetic according to std::isalpha.
+##          An empty string yields true, since it contains no other byte.
+##
+function is_alpha%(str: string%) : bool
+	%{
+	// Walk the raw bytes rather than CheckString().
+	// NOTE(review): CheckString() is understood to hand back a placeholder for
+	// strings with embedded NULs, which would not be Len() bytes long - confirm.
+	const auto* bytes = str->Bytes();
+	for ( int i = 0; i < str->Len(); i++ )
+		// <cctype> functions require a value representable as unsigned char;
+		// passing a negative plain char is undefined behaviour.
+		if ( ! std::isalpha(static_cast<unsigned char>(bytes[i])) )
+			return zeek::val_mgr->False();
+
+	return zeek::val_mgr->True();
+	%}
+
+## Returns whether an entire string is alphanumeric characters.
+##
+## str: The string to check.
+##
+## Returns: True if every byte of str is a letter or digit according to
+##          std::isalnum. An empty string yields true, since it contains no
+##          other byte.
+##
+function is_alnum%(str: string%) : bool
+	%{
+	// Walk the raw bytes rather than CheckString().
+	// NOTE(review): CheckString() is understood to hand back a placeholder for
+	// strings with embedded NULs, which would not be Len() bytes long - confirm.
+	const auto* bytes = str->Bytes();
+	for ( int i = 0; i < str->Len(); i++ )
+		// <cctype> functions require a value representable as unsigned char;
+		// passing a negative plain char is undefined behaviour.
+		if ( ! std::isalnum(static_cast<unsigned char>(bytes[i])) )
+			return zeek::val_mgr->False();
+
+	return zeek::val_mgr->True();
+	%}
+
+## Returns a left-justified version of the string, padded to a specific length with a specified character.
+##
+## str: The string to left-justify.
+## count: The length of the returned string. If this value is less than or equal to the length of str, a
+## copy of str is returned.
+## fill: The character used to fill in any extra characters in the resulting string. If a string longer than
+## one character is passed, an error is reported. This defaults to the space character.
+##
+## Returns: A left-justified version of a string, padded with characters to a specific length.
+##
+function ljust%(str: string, width: count, fill: string &default=" "%) : string
+	%{
+	if ( fill->Len() != 1 )
+		{
+		reporter->Error("Fill string passed to ljust() must be a single character");
+		return nullptr;
+		}
+
+	string new_s = str->ToStdString();
+
+	if ( width <= new_s.size() )
+		return zeek::StringValPtr(zeek::NewRef{}, str);
+
+	new_s.insert(new_s.size(), width - new_s.size(), fill->CheckString()[0]);
+	return zeek::make_intrusive<zeek::StringVal>(new_s);
+	%}
+
+%%{
+
+// Shared implementation of rjust() and zfill(): pads `str` on the left with
+// `fill` until it is `width` bytes long. If `str` is already at least `width`
+// bytes long, a new reference to `str` itself is returned.
+//
+// `width` is a 64-bit unsigned value because the BIF callers pass a Zeek
+// `count`; a plain int would silently truncate large widths.
+static zeek::StringValPtr do_rjust(zeek::StringVal* str, uint64_t width, char fill)
+	{
+	string new_s = str->ToStdString();
+
+	if ( width <= new_s.size() )
+		return { zeek::NewRef{}, str };
+
+	const size_t pad_len = width - new_s.size();
+	new_s.insert(0, pad_len, fill);
+	return zeek::make_intrusive<zeek::StringVal>(new_s);
+	}
+
+%%}
+
+## Returns a right-justified version of the string, padded to a specific length with a specified character.
+##
+## str: The string to right-justify.
+## count: The length of the returned string. If this value is less than or equal to the length of str, a
+## copy of str is returned.
+## fill: The character used to fill in any extra characters in the resulting string. If a string longer than
+## one character is passed, an error is reported. This defaults to the space character.
+##
+## Returns: A right-justified version of a string, padded with characters to a specific length.
+##
+function rjust%(str: string, width: count, fill: string &default=" "%) : string
+	%{
+	if ( fill->Len() != 1 )
+		{
+		reporter->Error("Fill string passed to rjust() must be a single character");
+		return nullptr;
+		}
+
+	return do_rjust(str, width, fill->CheckString()[0]);
+	%}
+
+## Swaps the case of every alphabetic character in a string. For example, the string "aBc" will be
+## returned as "AbC".
+##
+## str: The string to swap cases in.
+##
+## Returns: A copy of the str with the case of each character swapped.
+##
+function swap_case%(str: string%) : string
+	%{
+	string s = str->ToStdString();
+	for ( size_t i = 0; i < s.size(); i++ )
+		{
+		// <cctype> functions require a value representable as unsigned char;
+		// passing a negative plain char (any byte >= 0x80) is undefined behaviour.
+		const unsigned char c = static_cast<unsigned char>(s[i]);
+
+		if ( std::islower(c) )
+			s[i] = static_cast<char>(std::toupper(c));
+		else if ( std::isupper(c) )
+			s[i] = static_cast<char>(std::tolower(c));
+		}
+
+	return zeek::make_intrusive<zeek::StringVal>(s);
+	%}
+
+## Converts a string to Title Case. This changes the first character of each sequence of non-space characters
+## in the string to be capitalized. See https://docs.python.org/2/library/stdtypes.html#str.title for more info.
+## Only the space character separates words, and the remaining characters of each word are left unchanged.
+##
+## str: The string to convert.
+##
+## Returns: A title-cased version of the string.
+##
+function to_title%(str: string%) : string
+	%{
+	string s = str->ToStdString();
+	size_t pos = s.find_first_not_of(' ');
+
+	// Empty or all spaces: nothing to capitalize, return the input value itself.
+	if ( pos == string::npos )
+		return zeek::IntrusivePtr<zeek::StringVal>(zeek::NewRef{}, str);
+
+	while ( pos != string::npos )
+		{
+		// pos is the first byte of a word. Go through unsigned char because
+		// std::toupper() is undefined for negative plain char values.
+		s[pos] = static_cast<char>(std::toupper(static_cast<unsigned char>(s[pos])));
+
+		// Skip to the end of this word ...
+		pos = s.find(' ', pos+1);
+		if ( pos == string::npos )
+			break;
+
+		// ... and then past the run of spaces to the start of the next one.
+		pos = s.find_first_not_of(' ', pos+1);
+		}
+
+	return zeek::make_intrusive<zeek::StringVal>(s);
+	%}
+
+## Returns a copy of a string filled on the left side with zeroes. This is effectively rjust(str, width, "0").
+##
+## str: The string to pad.
+## width: The length of the returned string. If this value is less than or equal to the length of str,
+##        str itself is returned.
+##
+## Returns: The string padded on the left with "0" characters up to the given width.
+##
+function zfill%(str: string, width: count%) : string
+	%{
+	return do_rjust(str, width, '0');
+	%}
+
+## Similar to lstrip(), except does the removal repeatedly if the pattern repeats at the start of the string.
+##
+## str: The string to remove the prefix from.
+## sub: The prefix to remove. It is matched as a whole string, not as a set of characters. An empty
+##      prefix removes nothing.
+##
+## Returns: str with every leading repetition of sub removed. If str does not start with sub, str itself
+##          is returned.
+##
+function remove_prefix%(str: string, sub: string%) : string
+	%{
+	// This could just use repeated calls to lstrip(), except for a couple of reasons:
+	// 1) lstrip() creates a StringVal at the end, and that would mean repeated recreation of objects
+	// 2) lstrip() searches for any character in the string, not the string as a whole.
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// Check for the prefix exactly at the current offset and step over it one
+	// whole repetition at a time. Searching from "previous hit + 1" instead
+	// would find overlapping occurrences of self-overlapping patterns (e.g.
+	// "aa" in "aaaa") and stop too early. The empty check keeps the loop from
+	// spinning on a zero-length match.
+	size_t next_pos = 0;
+	if ( ! sub_s.empty() )
+		{
+		while ( s.compare(next_pos, sub_s.size(), sub_s) == 0 )
+			next_pos += sub_s.size();
+		}
+
+	// Nothing was removed: hand back another reference to the input value.
+	if ( next_pos == 0 )
+		return zeek::IntrusivePtr<zeek::StringVal>(zeek::NewRef{}, str);
+
+	return zeek::make_intrusive<zeek::StringVal>(s.substr(next_pos));
+	%}
+
+## Similar to rstrip(), except does the removal repeatedly if the pattern repeats at the end of the string.
+##
+## str: The string to remove the suffix from.
+## sub: The suffix to remove. It is matched as a whole string, not as a set of characters. An empty
+##      suffix removes nothing.
+##
+## Returns: A copy of str with every trailing repetition of sub removed.
+##
+function remove_suffix%(str: string, sub: string%) : string
+	%{
+	// This doesn't just call rstrip() because rstrip() strips any of a set of
+	// characters, whereas the suffix here has to match as a whole string.
+	string s = str->ToStdString();
+	string sub_s = sub->ToStdString();
+
+	// end_pos is the length of the part of s that is kept. Check for the
+	// suffix exactly in front of end_pos and step back one whole repetition at
+	// a time. The size check prevents unsigned underflow when fewer than
+	// sub_s.size() bytes remain, and the empty check keeps the loop from
+	// spinning on a zero-length match.
+	size_t end_pos = s.size();
+	if ( ! sub_s.empty() )
+		{
+		while ( end_pos >= sub_s.size() &&
+		        s.compare(end_pos - sub_s.size(), sub_s.size(), sub_s) == 0 )
+			end_pos -= sub_s.size();
+		}
+
+	return zeek::make_intrusive<zeek::StringVal>(s.substr(0, end_pos));
+	%}
--- a/testing/btest/Baseline/bifs.string_utils/out
+++ b/testing/btest/Baseline/bifs.string_utils/out
@ -0,0 +1,66 @@
+Justification (input string 'abc')
+----------------------------------
+ljust: 'abc'
+ljust: 'abc'
+ljust: 'abc  '
+ljust: 'abc--'
+rjust: 'abc'
+rjust: 'abc'
+rjust: '  abc'
+rjust: '--abc'
+zfill: 'abc'
+zfill: 'abc'
+zfill: '00abc'
+
+Content checking
+----------------
+is_num abc   : 0
+is_num 123   : 1
+is_alpha ab  : 1
+is_alpha 1a  : 0
+is_alpha a1  : 0
+is_alnum ab  : 1
+is_alnum 1a  : 1
+is_alnum a1  : 1
+is_alnum 12  : 1
+is_alnum ##12: 0
+
+String counting (input str 'aabbaa')
+------------------------------------
+count_substr aa: 2
+count_substr bb: 1
+count_substr cc: 0
+
+Starts/endswith
+---------------
+starts_with bro: 1
+starts_with ids: 0
+ends_with ids: 1
+ends_with bro: 0
+
+Transformations
+---------------
+swap_case 'aBc': AbC
+to_title 'bro is a very neat ids': 'Bro Is A Very Neat Ids'
+to_title '   ': '   '
+to_title '  a   c  ': '  A   C  '
+remove_prefix 'ananab'/'an' : ab
+remove_prefix 'anatnab'/'an': atnab
+remove_suffix 'banana'/'na' : ba
+remove_suffix 'bantana'/'na': banta
+
+find_str/rfind_str (input string 'abcdefghi')
+-----------------------------------------------------
+find_str: 0
+find_str: -1
+find_str: -1
+find_str: 4
+find_str: 4
+find_str: -1
+find_str: 0
+find_str: -1
+find_str: -1
+find_str: 4
+find_str: 4
+find_str: -1
+
--- a/testing/btest/bifs/string_utils.zeek
+++ b/testing/btest/bifs/string_utils.zeek
@ -0,0 +1,82 @@
+# @TEST-EXEC: zeek -b %INPUT >out
+# @TEST-EXEC: btest-diff out
+
+# Exercises the string utility BiFs. Everything printed here goes to the file
+# "out", which btest-diff compares against the stored baseline.
+event zeek_init()
+	{
+	print "Justification (input string 'abc')";
+	print "----------------------------------";
+	local s1 : string = "abc";
+	print fmt("ljust: '%s'", ljust(s1, 2, " "));   # 'abc'
+	print fmt("ljust: '%s'", ljust(s1, 3, " "));   # 'abc'
+	print fmt("ljust: '%s'", ljust(s1, 5));        # 'abc  '
+	print fmt("ljust: '%s'", ljust(s1, 5, "-"));   # 'abc--'
+	# A two-character fill is rejected; the baseline has no output line for this call.
+	print fmt("ljust: '%s'", ljust(s1, 2, "--"));  # This should return an error
+	print fmt("rjust: '%s'", rjust(s1, 2, " "));   # 'abc'
+	print fmt("rjust: '%s'", rjust(s1, 3, " "));   # 'abc'
+	print fmt("rjust: '%s'", rjust(s1, 5));        # '  abc'
+	print fmt("rjust: '%s'", rjust(s1, 5, "-"));   # '--abc'
+	# A two-character fill is rejected; the baseline has no output line for this call.
+	print fmt("rjust: '%s'", rjust(s1, 2, "--"));  # This should return an error
+	print fmt("zfill: '%s'", zfill(s1, 2));        # 'abc'
+	print fmt("zfill: '%s'", zfill(s1, 3));        # 'abc'
+	print fmt("zfill: '%s'", zfill(s1, 5));        # '00abc'
+	print "";
+
+	print "Content checking";
+	print "----------------";
+	print fmt("is_num abc   : %d", is_num("abc"));
+	print fmt("is_num 123   : %d", is_num("123"));
+	print fmt("is_alpha ab  : %d", is_alpha("ab"));
+	print fmt("is_alpha 1a  : %d", is_alpha("1a"));
+	print fmt("is_alpha a1  : %d", is_alpha("a1"));
+	print fmt("is_alnum ab  : %d", is_alnum("ab"));
+	print fmt("is_alnum 1a  : %d", is_alnum("1a"));
+	print fmt("is_alnum a1  : %d", is_alnum("a1"));
+	print fmt("is_alnum 12  : %d", is_alnum("12"));
+	print fmt("is_alnum ##12: %d", is_alnum("##12"));
+	print "";
+
+	print "String counting (input str 'aabbaa')";
+	print "------------------------------------";
+	local s2 : string = "aabbaa";
+	print fmt("count_substr aa: %d", count_substr(s2, "aa"));
+	print fmt("count_substr bb: %d", count_substr(s2, "bb"));
+	print fmt("count_substr cc: %d", count_substr(s2, "cc"));
+	print "";
+
+	print "Starts/endswith";
+	print "---------------";
+	local s3: string = "abcdefghi";
+	# The labels mention "bro"/"ids", but the strings actually checked against
+	# s3 are "abc" and "ghi".
+	print fmt("starts_with bro: %d", starts_with(s3, "abc"));
+	print fmt("starts_with ids: %d", starts_with(s3, "ghi"));
+	print fmt("ends_with ids: %d", ends_with(s3, "ghi"));
+	print fmt("ends_with bro: %d", ends_with(s3, "abc"));
+	print "";
+
+	print "Transformations";
+	print "---------------";
+	print fmt("swap_case 'aBc': %s", swap_case("aBc"));
+	print fmt("to_title 'bro is a very neat ids': '%s'", to_title("bro is a very neat ids"));
+	print fmt("to_title '   ': '%s'", to_title("   "));
+	print fmt("to_title '  a   c  ': '%s'", to_title("  a   c  "));
+	print fmt("remove_prefix 'ananab'/'an' : %s", remove_prefix("ananab", "an"));
+	print fmt("remove_prefix 'anatnab'/'an': %s", remove_prefix("anatnab", "an"));
+	print fmt("remove_suffix 'banana'/'na' : %s", remove_suffix("banana", "na"));
+	print fmt("remove_suffix 'bantana'/'na': %s", remove_suffix("bantana", "na"));
+	print "";
+
+	print fmt("find_str/rfind_str (input string '%s')", s3);
+	print "-----------------------------------------------------";
+	print fmt("find_str: %d", find_str(s3, "abcd"));
+	print fmt("find_str: %d", find_str(s3, "abcd", 1));
+	print fmt("find_str: %d", find_str(s3, "abcd", 0, 2));
+	print fmt("find_str: %d", find_str(s3, "efg"));
+	print fmt("find_str: %d", find_str(s3, "efg", 2, 6));
+	# start > end: reports an error and yields -1.
+	print fmt("find_str: %d", find_str(s3, "efg", 6, 2));
+	# The rfind_str() results below are printed under the same "find_str: " label.
+	print fmt("find_str: %d", rfind_str(s3, "abcd"));
+	print fmt("find_str: %d", rfind_str(s3, "abcd", 1));
+	print fmt("find_str: %d", rfind_str(s3, "abcd", 0, 2));
+	print fmt("find_str: %d", rfind_str(s3, "efg"));
+	print fmt("find_str: %d", rfind_str(s3, "efg", 2, 6));
+	# start > end: reports an error and yields -1.
+	print fmt("find_str: %d", rfind_str(s3, "efg", 6, 2));
+	print "";
+	}