From 5bf18fdb7f1d54d290728ce02b95e1579b3a65f0 Mon Sep 17 00:00:00 2001
From: Seth Hall <seth@icir.org>
Date: Wed, 13 Oct 2010 14:32:27 -0400
Subject: [PATCH] Modified all of the string functions that used the
 CheckString function.  All now use Bytes and Len to extract the bytes from
 string arguments.  The result of this is that these functions no longer fail
 when presented with strings containing NUL bytes.

Signed-off-by: Seth Hall <seth@icir.org>
---
 src/strings.bif | 180 +++++++++++++++++++++---------------------------
 1 file changed, 79 insertions(+), 101 deletions(-)

diff --git a/src/strings.bif b/src/strings.bif
index 44b0c57eb6..6044813476 100644
--- a/src/strings.bif
+++ b/src/strings.bif
@@ -138,27 +138,27 @@ function sort_string_array%(a: string_array%): string_array
 
 function edit%(arg_s: string, arg_edit_char: string%): string
 	%{
-	const char* s = arg_s->AsString()->CheckString();
-	const char* edit_s = arg_edit_char->AsString()->CheckString();
-
-	if ( strlen(edit_s) != 1 )
+	if ( arg_edit_char->Len() != 1 )
 		builtin_run_time("not exactly one edit character", @ARG@[1]);
+	
+	const u_char* s = arg_s->Bytes();
+	const u_char* edit_s = arg_edit_char->Bytes();
 
-	char edit_c = *edit_s;
+	u_char edit_c = *edit_s;
 
-	int n = strlen(s) + 1;
-	char* new_s = new char[n];
+	int n = arg_s->Len();
+	u_char* new_s = new u_char[n+1];
 	int ind = 0;
 
-	for ( ; *s; ++s )
+	for ( int i=0; i<n; ++i )
 		{
-		if ( *s == edit_c )
+		if ( s[i] == edit_c )
 			{ // Delete last character
 			if ( --ind < 0 )
 				ind = 0;
 			}
 		else
-			new_s[ind++] = *s;
+			new_s[ind++] = s[i];
 		}
 
 	new_s[ind] = '\0';
@@ -198,7 +198,6 @@ static int match_prefix(int s_len, const char* s, int t_len, const char* t)
 Val* do_split(StringVal* str_val, RE_Matcher* re, TableVal* other_sep,
 		int incl_sep, int max_num_sep)
 	{
-	const BroString* str = str_val->AsString();
 	TableVal* a = new TableVal(internal_type("string_array")->AsTableType());
 	ListVal* other_strings = 0;
 
@@ -209,66 +208,54 @@ Val* do_split(StringVal* str_val, RE_Matcher* re, TableVal* other_sep,
 	// the future we expect to change this by giving RE_Matcher a
 	// const char* segment.
 
-	const char* s = str->CheckString();
-	int len = strlen(s);
-	const char* end_of_s = s + len;
+	const u_char* s = str_val->Bytes();
+	int n = str_val->Len();
+	const u_char* end_of_s = s + n;
 	int num = 0;
 	int num_sep = 0;
+	
+	int offset = 0;
 
-	while ( 1 )
+	while ( n > 0 )
 		{
-		int offset = 0;
-		const char* t;
-
-		if ( max_num_sep > 0 && num_sep >= max_num_sep )
-			t = end_of_s;
-		else
+		offset = 0;
+		// Find next match offset.
+		int end_of_match;
+		while ( n > 0 &&
+		        (end_of_match = re->MatchPrefix(&s[offset], n)) <= 0 )
 			{
-			for ( t = s; t < end_of_s; ++t )
-				{
-				offset = re->MatchPrefix(t);
-
-				if ( other_strings )
-					{
-					val_list* vl = other_strings->Vals();
-					loop_over_list(*vl, i)
-						{
-						const BroString* sub =
-							(*vl)[i]->AsString();
-						if ( sub->Len() > offset &&
-						     match_prefix(end_of_s - t,
-								t, sub->Len(),
-								(const char*) (sub->Bytes())) )
-							{
-							offset = sub->Len();
-							}
-						}
-					}
-
-				if ( offset > 0 )
-					break;
-				}
+			// Move on to next character.
+			++offset;
+			--n;
 			}
-
+		
 		Val* ind = new Val(++num, TYPE_COUNT);
-		a->Assign(ind, new StringVal(t - s, s));
+		a->Assign(ind, new StringVal(offset, (const char*) s));
 		Unref(ind);
 
-		if ( t >= end_of_s )
+		// No more separators will be needed if this is the end of string.
+		if ( n <= 0 )
 			break;
 
-		++num_sep;
-
 		if ( incl_sep )
 			{ // including the part that matches the pattern
 			ind = new Val(++num, TYPE_COUNT);
-			a->Assign(ind, new StringVal(offset, t));
+			a->Assign(ind, new StringVal(end_of_match, (const char*) s+offset));
 			Unref(ind);
 			}
-
-		s = t + offset;
+		
+		++num_sep;
+		if ( max_num_sep && num_sep >= max_num_sep )
+			break;
+		
+		offset += end_of_match;
+		n -= end_of_match;
+		s += offset;
+		
 		if ( s > end_of_s )
+			{
 			internal_error("RegMatch in split goes beyond the string");
+			}
 		}
 
 	if ( other_strings )
@@ -476,42 +463,38 @@ function subst_string%(s: string, from: string, to: string%): string
 
 function to_lower%(str: string%): string
 	%{
-	const char* s = str->CheckString();
-	int n = strlen(s) + 1;
+	const u_char* s = str->Bytes();
+	int n = str->Len();
 	char* lower_s = new char[n];
+	char* ls = lower_s;
 
-	char* ls;
-	for ( ls = lower_s; *s; ++s )
+	for (int i=0; i<n; ++i)
 		{
-		if ( isascii(*s) && isupper(*s) )
-			*ls++ = tolower(*s);
+		if ( isascii(s[i]) && isupper(s[i]) )
+			*ls++ = tolower(s[i]);
 		else
-			*ls++ = *s;
+			*ls++ = s[i];
 		}
 
-	*ls = '\0';
-
-	return new StringVal(new BroString(1, byte_vec(lower_s), n-1));
+	return new StringVal(new BroString(1, byte_vec(lower_s), n));
 	%}
 
 function to_upper%(str: string%): string
 	%{
-	const char* s = str->CheckString();
-	int n = strlen(s) + 1;
+	const u_char* s = str->Bytes();
+	int n = str->Len();
 	char* upper_s = new char[n];
-
-	char* us;
-	for ( us = upper_s; *s; ++s )
+	char* us = upper_s;
+	
+	for (int i=0; i<n; ++i)
 		{
-		if ( isascii(*s) && islower(*s) )
-			*us++ = toupper(*s);
+		if ( isascii(s[i]) && islower(s[i]) )
+			*us++ = toupper(s[i]);
 		else
-			*us++ = *s;
+			*us++ = s[i];
 		}
 
-	*us = '\0';
-
-	return new StringVal(new BroString(1, byte_vec(upper_s), n-1));
+	return new StringVal(new BroString(1, byte_vec(upper_s), n));
 	%}
 
 function clean%(str: string%): string
@@ -604,40 +587,34 @@ function str_split%(s: string, idx: index_vec%): string_vec
 
 function strip%(str: string%): string
 	%{
-	const char* s = str->CheckString();
+	const u_char* s = str->Bytes();
+	int n = str->Len();
 
-	int n = strlen(s) + 1;
-	char* strip_s = new char[n];
-
-	if ( n == 1 )
+	if ( n == 0 )
 		// Empty string.
-		return new StringVal(new BroString(1, byte_vec(strip_s), 0));
+		return new StringVal(new BroString(s, n, 1));
 
-	while ( isspace(*s) )
-		++s;
-
-	strncpy(strip_s, s, n);
-
-	char* s2 = strip_s;
-	char* e = &s2[strlen(s2) - 1];
-
-	while ( e > s2 && isspace(*e) )
+	const u_char* sp = s;
+	// Move a pointer to the end of the string
+	const u_char* e = &sp[n-1];
+	while ( e > sp && isspace(*e) )
 		--e;
 
-	e[1] = '\0';	// safe even if e hasn't changed, due to n = strlen + 1
+	// Move the pointer for the beginning of the string
+	while ( isspace(*sp) )
+		++sp;
 
-	return new StringVal(new BroString(1, byte_vec(s2), (e-s2)+1));
+	return new StringVal(new BroString(sp, e-sp+1, 1));
 	%}
 
 function string_fill%(len: int, source: string%): string
 	%{
-	const char* src = source->CheckString();
-
-	int sn = strlen(src);
+	const u_char* src = source->Bytes();
+	int n = source->Len();
 	char* dst = new char[len];
 
-	for ( int i = 0; i < len; i += sn )
-		::memcpy((dst + i), src, min(sn, len - i));
+	for ( int i = 0; i < len; i += n )
+		::memcpy((dst + i), src, min(n, len - i));
 
 	dst[len - 1] = 0;
 
@@ -650,11 +627,12 @@ function string_fill%(len: int, source: string%): string
 #
 function str_shell_escape%(source: string%): string
 	%{
-	unsigned j = 0;
-	const char* src = source->CheckString();
-	char* dst = new char[strlen(src) * 2 + 1];
+	uint j = 0;
+	const u_char* src = source->Bytes();
+	uint n = source->Len();
+	byte_vec dst = new u_char[n * 2 + 1];
 
-	for ( unsigned i = 0; i < strlen(src); ++i )
+	for ( uint i = 0; i < n; ++i )
 		{
 		switch ( src[i] ) {
 		case '`': case '"': case '\\': case '$':
@@ -672,7 +650,7 @@ function str_shell_escape%(source: string%): string
 		}
 
 	dst[j] = '\0';
-	return new StringVal(new BroString(1, byte_vec(dst), j));
+	return new StringVal(new BroString(1, dst, j));
 	%}
 
 # Returns all occurrences of the given pattern in the given string (an empty