##! Definitions of built-in functions related to string processing and
##! manipulation.

%%{ // C segment
#include
#include
#include
// NOTE(review): the three include targets above are missing in this copy of
// the file - confirm against the canonical source.

#include "zeek/SmithWaterman.h"

using namespace std;
%%}

## Calculates the Levenshtein distance between the two strings. See `Wikipedia
## <https://en.wikipedia.org/wiki/Levenshtein_distance>`__ for more information.
##
## s1: The first string.
##
## s2: The second string.
##
## Returns: The Levenshtein distance of two strings as a count.
##
function levenshtein_distance%(s1: string, s2: string%): count
    %{
    unsigned int n = s1->Len();
    unsigned int m = s2->Len();

    // If either string is empty, the distance is the length of the other.
    if ( ! n )
        return zeek::val_mgr->Count(m);

    if ( ! m )
        return zeek::val_mgr->Count(n);

    // (n+1) x (m+1) table; d[i][j] holds the distance between the first i
    // bytes of s1 and the first j bytes of s2.
    // NOTE(review): the template arguments of this declaration appear to be
    // missing in this copy of the file - confirm against the canonical source.
    vector > d(n + 1, vector(m + 1));

    d[0][0] = 0;

    // First column / first row: distance to the empty prefix.
    for ( unsigned int i = 1; i <= n; ++i )
        d[i][0] = i;

    for ( unsigned int i = 1; i <= m; ++i )
        d[0][i] = i;

    for ( unsigned int i = 1; i <= n; ++i )
        {
        for ( unsigned int j = 1; j <= m; ++j )
            // Cheapest of: step from d[i-1][j], step from d[i][j-1], or the
            // diagonal d[i-1][j-1] plus 1 if the two bytes differ.
            d[i][j] = min(min(d[i-1][j] + 1, d[i][j-1] + 1),
                          d[i-1][j-1] + (s1->Bytes()[i-1] == s2->Bytes()[j-1] ? 0 : 1));
        }

    return zeek::val_mgr->Count(d[n][m]);
    %}

## Concatenates all arguments into a single string. The function takes a
## variable number of arguments of type string and stitches them together.
## If any argument is not a string, an error is reported and the empty string
## is returned.
##
## Returns: The concatenation of all (string) arguments.
##
## .. zeek:see:: cat cat_sep
##    fmt
##    join_string_vec
function string_cat%(...%): string
    %{
    // First pass: validate the argument types and total up the byte length.
    int n = 0;
    for ( const auto& a : @ARG@ )
        {
        if ( a->GetType()->Tag() != TYPE_STRING )
            {
            zeek::reporter->Error("string_cat() requires string arguments");
            return val_mgr->EmptyString();
            }

        n += a->AsString()->Len();
        }

    // The result String is constructed over the buffer before it is filled;
    // b is then advanced as the write cursor.
    u_char* b = new u_char[n+1];
    zeek::String* s = new zeek::String(1, b, n);

    // Second pass: copy each argument's bytes into place.
    for ( const auto& a : @ARG@ )
        {
        // Note: this inner `s` shadows the result String declared above.
        const zeek::String* s = a->AsString();
        memcpy(b, s->Bytes(), s->Len());
        b += s->Len();
        }
    *b = 0;

    return zeek::make_intrusive(s);
    %}

## Joins all values in the given vector of strings with a separator placed
## between each element.
##
## sep: The separator to place between each element.
##
## vec: The :zeek:type:`string_vec` (``vector of string``).
##
## Returns: The concatenation of all elements in *vec*, with *sep* placed
##          between each element.
##
## .. zeek:see:: cat cat_sep string_cat
##    fmt
function join_string_vec%(vec: string_vec, sep: string%): string
    %{
    // Accumulate the output in a raw-style descriptor.
    ODesc d;
    d.SetStyle(RAW_STYLE);

    zeek::VectorVal *v = vec->AsVectorVal();

    for ( unsigned i = 0; i < v->Size(); ++i )
        {
        // The separator is emitted before every slot after the first,
        // including slots that turn out to be empty below.
        if ( i > 0 )
            d.AddN(reinterpret_cast(sep->Bytes()), sep->Len());

        auto e = v->ValAt(i);

        // If the element is empty, skip it.
        if ( ! e )
            continue;

        e->Describe(&d);
        }

    // Build the result from the descriptor's buffer.
    zeek::String* s = new zeek::String(1, d.TakeBytes(), d.Len());
    s->SetUseFreeToDelete(true);

    return zeek::make_intrusive(s);
    %}

## Joins all values in the given set of strings with a separator placed
## between each element.
##
## ss: The :zeek:type:`string_set` (``set[string]``).
##
## sep: The separator to place between each element.
##
## Returns: The concatenation of all elements in *ss*, with *sep* placed
##          between each element. An empty set yields the empty string.
##
## .. zeek:see:: cat cat_sep string_cat
##    fmt
##    join_string_vec
function join_string_set%(ss: string_set, sep: string%): string
    %{
    ODesc d;
    d.SetStyle(RAW_STYLE);

    // Reject anything that is not a set ...
    if ( ! ss->GetType()->IsSet () )
        {
        zeek::emit_builtin_error("join_string_set() requires a string set argument");
        return val_mgr->EmptyString();
        }

    // ... or whose index is not exactly one string.
    const auto& it = ss->GetType()->AsTableType()->GetIndexTypes();
    if ( it.size() != 1 || it[0]->Tag() != TYPE_STRING )
        {
        zeek::emit_builtin_error("join_string_set() requires a string set argument");
        return val_mgr->EmptyString();
        }

    // i counts the elements written so far; it only decides whether a
    // separator is needed.
    int i = 0;
    TableVal* tv = ss->AsTableVal();
    const PDict* loop_vals = tv->AsTable();
    if ( ! loop_vals->Length() )
        return val_mgr->EmptyString();

    for ( const auto& iter : *loop_vals )
        {
        if ( i > 0 )
            d.AddN(reinterpret_cast(sep->Bytes()), sep->Len());

        // Not sure this is fast - I guess we don't have access to the
        // values used for the keys directly anymore.
        auto k = iter.GetHashKey();
        auto ind_lv = tv->RecreateIndex(*k);
        ind_lv->Describe(&d);
        ++i;
        }

    zeek::String* str = new zeek::String(1, d.TakeBytes(), d.Len());
    str->SetUseFreeToDelete(true);

    return zeek::make_intrusive(str);
    %}

## Returns an edited version of a string that applies a special
## "backspace character" (usually ``\x08`` for backspace or ``\x7f`` for DEL).
## For example, ``edit("hello there", "e")`` returns ``"llo t"``.
##
## arg_s: The string to edit.
##
## arg_edit_char: A string of exactly one character that represents the
##                "backspace character". If it is longer than one character Zeek
##                generates a run-time error and uses the first character in
##                the string.
##
## Returns: An edited version of *arg_s* where *arg_edit_char* triggers the
##          deletion of the last character.
##
## .. zeek:see:: clean
##               to_string_literal
##               escape_string
##               strip
function edit%(arg_s: string, arg_edit_char: string%): string
    %{
    // The error is reported but processing continues with the first byte.
    if ( arg_edit_char->Len() != 1 )
        zeek::emit_builtin_error("not exactly one edit character", @ARG@[1]);

    const u_char* s = arg_s->Bytes();
    const u_char* edit_s = arg_edit_char->Bytes();

    u_char edit_c = *edit_s;

    int n = arg_s->Len();
    // The output can never be longer than the input (plus the NUL).
    u_char* new_s = new u_char[n+1];
    int ind = 0;

    for ( int i = 0; i < n; ++i )
        {
        if ( s[i] == edit_c )
            { // Delete last character
            // Clamp at zero: a backspace on empty output is a no-op.
            if ( --ind < 0 )
                ind = 0;
            }
        else
            new_s[ind++] = s[i];
        }

    new_s[ind] = '\0';

    return zeek::make_intrusive(new zeek::String(1, byte_vec(new_s), ind));
    %}

## Get a substring from a string, given a starting position and length.
##
## s: The string to obtain a substring from.
##
## start: The starting position of the substring in *s*, where 1 is the first
##        character. As a special case, 0 also represents the first character.
##
## n: The number of characters to extract, beginning at *start*.
##
## Returns: A substring of *s* of length *n* from position *start*.
function sub_bytes%(s: string, start: count, n: int%): string
    %{
    // Positions 0 and 1 both map to the first byte.
    if ( start > 0 )
        --start;	// make it 0-based

    zeek::String* ss = s->AsString()->GetSubstring(start, n);

    // A null result from GetSubstring is replaced by the empty string.
    if ( ! ss )
        ss = new zeek::String("");

    return zeek::make_intrusive(ss);
    %}

%%{
// Returns 1 if the first t_len bytes of t are a prefix of s (whose length is
// s_len), and 0 otherwise.
static int match_prefix(int s_len, const char* s, int t_len, const char* t)
    {
    for ( int i = 0; i < t_len; ++i )
        {
        if ( i >= s_len || s[i] != t[i] )
            return 0;
        }

    return 1;
    }

// Splits str_val at matches of re and returns the pieces as a string_vec.
//
// incl_sep:    if non-zero, each matched separator is also stored, right
//              after the piece that precedes it.
// max_num_sep: if non-zero, the maximum number of separators to split at; the
//              remainder of the string becomes the final piece. Zero means no
//              limit.
static zeek::VectorValPtr do_split_string(zeek::StringVal* str_val,
                                          zeek::RE_Matcher* re, int incl_sep,
                                          int max_num_sep)
    {
    // string_vec is used early in the version script - do not use the NetVar.
    auto rval = zeek::make_intrusive(zeek::id::find_type("string_vec"));
    const u_char* s = str_val->Bytes();
    int n = str_val->Len();
    const u_char* end_of_s = s + n;
    int num = 0;
    int num_sep = 0;

    int offset = 0;
    bool bol = true;
    const bool eol = true;
    // Invariant: s points at the start of the current piece and n is the
    // number of bytes left from s + offset.
    while ( n >= 0 )
        {
        offset = 0;
        // Find next match offset.
        int end_of_match = 0;
        while ( n > 0 )
            {
            end_of_match = re->MatchPrefix(s + offset, n, bol, eol);

            // Only non-empty matches count as separators.
            if ( end_of_match > 0 )
                break;

            // Move on to next byte, use BOL only on the byte such that
            // a BOL anchored pattern won't be matched anywhere else.
            bol = false;
            ++offset;
            --n;
            }

        // Separator budget used up: the rest of the string is one piece.
        if ( max_num_sep && num_sep >= max_num_sep )
            {
            offset = end_of_s - s;
            n=0;
            }

        rval->Assign(num++, zeek::make_intrusive(offset, (const char*) s));

        // No more separators will be needed if this is the end of string.
        if ( n <= 0 )
            break;

        if ( incl_sep )
            { // including the part that matches the pattern
            rval->Assign(num++, zeek::make_intrusive(end_of_match, (const char*) s+offset));
            }

        if ( max_num_sep && num_sep >= max_num_sep )
            break;

        ++num_sep;

        // n was already reduced by `offset` in the scan loop above; now drop
        // the separator's length too and move s past piece and separator.
        n -= end_of_match;
        s += offset + end_of_match;;
        if ( s > end_of_s )
            zeek::reporter->InternalError("RegMatch in split goes beyond the string");
        }

    return rval;
    }

// Table-based variant of do_split_string(): the pieces are stored in a
// string_array table under counts starting at 1. Unlike do_split_string(),
// MatchPrefix is called here without the bol/eol arguments. The result is
// returned as a raw pointer allocated with new.
zeek::Val* do_split(zeek::StringVal* str_val, zeek::RE_Matcher* re, int incl_sep, int max_num_sep)
    {
    auto* a = new zeek::TableVal(zeek::id::string_array);
    const u_char* s = str_val->Bytes();
    int n = str_val->Len();
    const u_char* end_of_s = s + n;
    int num = 0;
    int num_sep = 0;

    int offset = 0;
    while ( n >= 0 )
        {
        offset = 0;
        // Find next match offset.
        int end_of_match = 0;
        while ( n > 0 &&
                (end_of_match = re->MatchPrefix(s + offset, n)) <= 0 )
            {
            // Move on to next byte.
            ++offset;
            --n;
            }

        // Separator budget used up: the rest of the string is one piece.
        if ( max_num_sep && num_sep >= max_num_sep )
            {
            offset = end_of_s - s;
            n=0;
            }

        auto ind = zeek::val_mgr->Count(++num);
        a->Assign(std::move(ind), zeek::make_intrusive(offset, (const char*) s));

        // No more separators will be needed if this is the end of string.
        if ( n <= 0 )
            break;

        if ( incl_sep )
            { // including the part that matches the pattern
            ind = zeek::val_mgr->Count(++num);
            a->Assign(std::move(ind), zeek::make_intrusive(end_of_match, (const char*) s+offset));
            }

        if ( max_num_sep && num_sep >= max_num_sep )
            break;

        ++num_sep;

        n -= end_of_match;
        s += offset + end_of_match;;
        if ( s > end_of_s )
            zeek::reporter->InternalError("RegMatch in split goes beyond the string");
        }

    return a;
    }
%%}

## Splits a string into an array of strings according to a pattern.
##
## str: The string to split.
##
## re: The pattern describing the element separator in *str*.
##
## Returns: An array of strings where each element corresponds to a substring
##          in *str* separated by *re*.
##
## .. zeek:see:: split_string1 split_string_all split_string_n
##
function split_string%(str: string, re: pattern%): string_vec
    %{
    // Separators dropped, no limit on the number of splits.
    return do_split_string(str, re, 0, 0);
    %}

## Splits a string *once* into a two-element array of strings according to a
## pattern. This function is the same as :zeek:id:`split_string`, but *str* is
## only split once (if possible) at the earliest position and an array of two
## strings is returned.
##
## str: The string to split.
##
## re: The pattern describing the separator to split *str* in two pieces.
##
## Returns: An array of strings with two elements in which the first represents
##          the substring in *str* up to the first occurrence of *re*, and the
##          second everything after *re*. An array of one string is returned
##          when *str* cannot be split.
##
## .. zeek:see:: split_string split_string_all split_string_n
function split_string1%(str: string, re: pattern%): string_vec
    %{
    // Separators dropped, at most one split.
    return do_split_string(str, re, 0, 1);
    %}

## Splits a string into an array of strings according to a pattern. This
## function is the same as :zeek:id:`split_string`, except that the separators
## are returned as well. For example, ``split_string_all("a-b--cd", /(\-)+/)``
## returns ``{"a", "-", "b", "--", "cd"}``: odd-indexed elements do match the
## pattern and even-indexed ones do not.
##
## str: The string to split.
##
## re: The pattern describing the element separator in *str*.
##
## Returns: An array of strings where each two successive elements correspond
##          to a substring in *str* of the part not matching *re* (even-indexed)
##          and the part that matches *re* (odd-indexed).
##
## .. zeek:see:: split_string split_string1 split_string_n
function split_string_all%(str: string, re: pattern%): string_vec
    %{
    // Separators kept, no limit on the number of splits.
    return do_split_string(str, re, 1, 0);
    %}

## Splits a string a given number of times into an array of strings according
## to a pattern. This function is similar to :zeek:id:`split_string1` and
## :zeek:id:`split_string_all`, but with customizable behavior with respect to
## including separators in the result and the number of times to split.
##
## str: The string to split.
##
## re: The pattern describing the element separator in *str*.
##
## incl_sep: A flag indicating whether to include the separator matches in the
##           result (as in :zeek:id:`split_string_all`).
##
## max_num_sep: The number of times to split *str*. A value of 0 places no
##              limit on the number of splits.
##
## Returns: An array of strings where, if *incl_sep* is true, each two
##          successive elements correspond to a substring in *str* of the part
##          not matching *re* (even-indexed) and the part that matches *re*
##          (odd-indexed).
##
## .. zeek:see:: split_string split_string1 split_string_all
function split_string_n%(str: string, re: pattern,
                         incl_sep: bool, max_num_sep: count%): string_vec
    %{
    return do_split_string(str, re, incl_sep, max_num_sep);
    %}

## Substitutes a given replacement string for the first occurrence of a pattern
## in a given string.
##
## str: The string to perform the substitution in.
##
## re: The pattern being replaced with *repl*.
##
## repl: The string that replaces *re*.
##
## Returns: A copy of *str* with the first occurrence of *re* replaced with
##          *repl*.
##
## .. zeek:see:: gsub subst_string
function sub%(str: string, re: pattern, repl: string%): string
    %{
    // Final argument false: replace the first occurrence only.
    return str->Replace(re, *repl->AsString(), false);
    %}

## Substitutes a given replacement string for all occurrences of a pattern
## in a given string.
##
## str: The string to perform the substitution in.
##
## re: The pattern being replaced with *repl*.
##
## repl: The string that replaces *re*.
##
## Returns: A copy of *str* with all occurrences of *re* replaced with *repl*.
##
## .. zeek:see:: sub subst_string
function gsub%(str: string, re: pattern, repl: string%): string
    %{
    // Final argument true: replace every occurrence.
    return str->Replace(re, *repl->AsString(), true);
    %}

## Lexicographically compares two strings.
##
## s1: The first string.
##
## s2: The second string.
##
## Returns: An integer greater than, equal to, or less than 0 according as
##          *s1* is greater than, equal to, or less than *s2*.
function strcmp%(s1: string, s2: string%): int
    %{
    return zeek::val_mgr->Int(Bstr_cmp(s1->AsString(), s2->AsString()));
    %}

## Locates the first occurrence of one string in another.
##
## big: The string to look in.
##
## little: The (smaller) string to find inside *big*.
##
## Returns: The location of *little* in *big*, or 0 if *little* is not found in
##          *big*.
##
## .. zeek:see:: find_all find_last
function strstr%(big: string, little: string%): count
    %{
    // The +1 converts the index returned by FindSubstring into a 1-based
    // location. Presumably FindSubstring yields -1 on a miss, which becomes
    // the documented 0 - confirm against String::FindSubstring.
    return zeek::val_mgr->Count(
        1 + big->AsString()->FindSubstring(little->AsString()));
    %}

## Substitutes each (non-overlapping) appearance of a string in another.
##
## s: The string in which to perform the substitution.
##
## from: The string to look for which is replaced with *to*. If *from* is
##       empty, *s* is returned unchanged.
##
## to: The string that replaces all occurrences of *from* in *s*.
##
## Returns: A copy of *s* where each occurrence of *from* is replaced with *to*.
##
## .. zeek:see:: sub gsub
function subst_string%(s: string, from: string, to: string%): string
    %{
    const int little_len = from->Len();
    // Nothing to search for: hand back a new reference to s itself.
    if ( little_len == 0 )
        return IntrusivePtr{zeek::NewRef{}, s};

    int big_len = s->Len();
    const u_char* big = s->Bytes();
    // dc is reused as a scratch chunk; vs collects the chunks (untouched
    // stretches of s and copies of `to`) that are concatenated at the end.
    data_chunk_t dc;
    vector vs;

    while ( big_len >= little_len )
        {
        int j = zeek::util::strstr_n(big_len, big, little_len, from->Bytes());

        // A negative result ends the search.
        if ( j < 0 )
            break;

        // Keep the bytes preceding the match, if any.
        if ( j > 0 )
            {
            dc.length = j; dc.data = (const char*) big;
            vs.push_back(dc);
            }

        // Emit the replacement in place of the match.
        dc.length = to->Len();
        dc.data = (const char*) (to->Bytes());
        vs.push_back(dc);

        // Continue right after the match, so occurrences never overlap.
        j += little_len;
        big += j;
        big_len -= j;
        }

    // Whatever is left after the last match is copied verbatim.
    if ( big_len > 0 )
        {
        dc.length = big_len; dc.data = (const char*) big;
        vs.push_back(dc);
        }

    return zeek::make_intrusive(concatenate(vs));
    %}

## Replaces all uppercase letters in a string with their lowercase counterpart.
##
## str: The string to convert to lowercase letters.
##
## Returns: A copy of the given string with the uppercase letters (as indicated
##          by ``isascii`` and ``isupper``) folded to lowercase
##          (via ``tolower``).
##
## .. zeek:see:: to_upper is_ascii
function to_lower%(str: string%): string
    %{
    const u_char* s = str->Bytes();
    int n = str->Len();
    // One extra byte for the trailing NUL.
    u_char* lower_s = new u_char[n + 1];
    u_char* ls = lower_s;

    for ( int i = 0; i < n; ++i)
        {
        // Only ASCII uppercase letters are folded; all other bytes are
        // copied through unchanged.
        if ( isascii(s[i]) && isupper(s[i]) )
            *ls++ = tolower(s[i]);
        else
            *ls++ = s[i];
        }

    *ls++ = '\0';

    return zeek::make_intrusive(new zeek::String(1, lower_s, n));
    %}

## Replaces all lowercase letters in a string with their uppercase counterpart.
##
## str: The string to convert to uppercase letters.
##
## Returns: A copy of the given string with the lowercase letters (as indicated
##          by ``isascii`` and ``islower``) folded to uppercase
##          (via ``toupper``).
##
## .. zeek:see:: to_lower is_ascii
function to_upper%(str: string%): string
    %{
    const u_char* s = str->Bytes();
    int n = str->Len();
    // One extra byte for the trailing NUL.
    u_char* upper_s = new u_char[n + 1];
    u_char* us = upper_s;

    for ( int i = 0; i < n; ++i)
        {
        // Only ASCII lowercase letters are folded; all other bytes are
        // copied through unchanged.
        if ( isascii(s[i]) && islower(s[i]) )
            *us++ = toupper(s[i]);
        else
            *us++ = s[i];
        }

    *us++ = '\0';

    return zeek::make_intrusive(new zeek::String(1, upper_s, n));
    %}

## Replaces non-printable characters in a string with escaped sequences. The
## mappings are:
##
##     - values not in *[32, 126]* to ``\xXX``
##
## If the string does not yet have a trailing NUL, one is added internally.
##
## In contrast to :zeek:id:`escape_string`, this encoding is *not* fully reversible.
##
## str: The string to escape.
##
## Returns: The escaped string.
##
## .. zeek:see:: to_string_literal escape_string
function clean%(str: string%): string
    %{
    // Render() is called with its default arguments; the rendered buffer is
    // handed to the new String.
    char* s = str->AsString()->Render();
    return zeek::make_intrusive(new zeek::String(1, byte_vec(s), strlen(s)));
    %}

## Replaces non-printable characters in a string with escaped sequences. The
## mappings are:
##
##     - values not in *[32, 126]* to ``\xXX``
##     - ``\`` to ``\\``
##     - ``'`` and ``""`` to ``\'`` and ``\"``, respectively.
##
## str: The string to escape.
##
## Returns: The escaped string.
##
## .. zeek:see:: clean escape_string
function to_string_literal%(str: string%): string
    %{
    // Same as clean(), but rendered with the ZEEK_STRING_LITERAL format.
    char* s = str->AsString()->Render(zeek::String::ZEEK_STRING_LITERAL);
    return zeek::make_intrusive(new zeek::String(1, byte_vec(s), strlen(s)));
    %}

## Determines whether a given string contains only ASCII characters.
## The empty string is ASCII.
##
## str: The string to examine.
##
## Returns: False if any byte value of *str* is greater than 127, and true
##          otherwise.
##
## .. zeek:see:: to_upper to_lower
function is_ascii%(str: string%): bool
    %{
    int n = str->Len();
    const u_char* s = str->Bytes();

    // Stop at the first byte with the high bit set.
    for ( int i = 0; i < n; ++i )
        if ( s[i] > 127 )
            return zeek::val_mgr->False();

    return zeek::val_mgr->True();
    %}

## Replaces non-printable characters in a string with escaped sequences. The
## mappings are:
##
##     - values not in *[32, 126]* to ``\xXX``
##     - ``\`` to ``\\``
##
## In contrast to :zeek:id:`clean`, this encoding is fully reversible.
##
## s: The string to escape.
##
## Returns: The escaped string.
##
## .. zeek:see:: clean to_string_literal
function escape_string%(s: string%): string
    %{
    char* escstr = s->AsString()->Render(zeek::String::ESC_HEX | zeek::String::ESC_ESC);
    // The result value is built from escstr, after which the rendered
    // buffer is released here.
    auto val = zeek::make_intrusive(escstr);
    delete [] escstr;
    return std::move(val);
    %}

## Returns an ASCII hexadecimal representation of a string.
##
## s: The string to convert to hex.
##
## Returns: A copy of *s* where each byte is replaced with its two-digit
##          lowercase hexadecimal representation.
function string_to_ascii_hex%(s: string%): string %{ char* x = new char[s->Len() * 2 + 1]; const u_char* sp = s->Bytes(); for ( int i = 0; i < s->Len(); ++i ) snprintf(x + i * 2, 3, "%02x", sp[i]); return zeek::make_intrusive(new zeek::String(1, (u_char*) x, s->Len() * 2)); %} ## Uses the Smith-Waterman algorithm to find similar/overlapping substrings. ## See `Wikipedia `__. ## ## s1: The first string. ## ## s2: The second string. ## ## params: Parameters for the Smith-Waterman algorithm. ## ## Returns: The result of the Smith-Waterman algorithm calculation. function str_smith_waterman%(s1: string, s2: string, params: sw_params%) : sw_substring_vec %{ zeek::detail::SWParams sw_params( params->AsRecordVal()->GetFieldAs(0), zeek::detail::SWVariant(params->AsRecordVal()->GetFieldAs(1))); auto* subseq = zeek::detail::smith_waterman(s1->AsString(), s2->AsString(), sw_params); auto result = zeek::VectorValPtr{zeek::AdoptRef{}, zeek::detail::Substring::VecToPolicy(subseq)}; zeek::util::delete_each(subseq); delete subseq; return std::move(result); %} ## Splits a string into substrings with the help of an index vector of cutting ## points. ## ## s: The string to split. ## ## idx: The index vector (``vector of count``) with the cutting points ## ## Returns: A zero-indexed vector of strings. ## ## .. zeek:see:: split_string split_string1 split_string_all split_string_n function str_split_indices%(s: string, idx: index_vec%): string_vec %{ auto idx_v = idx->As(); auto n = idx_v->Size(); zeek::String::IdxVec indices(n); unsigned int i; for ( i = 0; i < n; i++ ) indices[i] = idx_v->CountAt(i); zeek::String::Vec* result = s->AsString()->Split(indices); auto result_v = zeek::make_intrusive(zeek::id::string_vec); if ( result ) { i = 0; for ( zeek::String::VecIt it = result->begin(); it != result->end(); ++it, ++i ) result_v->Assign(i, zeek::make_intrusive(*it)); // StringVal now possesses string. 
delete result; } return std::move(result_v); %} ## Strips whitespace at both ends of a string. ## ## str: The string to strip the whitespace from. ## ## Returns: A copy of *str* with leading and trailing whitespace removed. ## ## .. zeek:see:: sub gsub lstrip rstrip function strip%(str: string%): string %{ const u_char* s = str->Bytes(); int n = str->Len(); if ( n == 0 ) // Empty string. return zeek::make_intrusive(new zeek::String(s, n, 1)); const u_char* sp = s; // Move a pointer from the end of the string. const u_char* e = sp + n - 1; while ( e > sp && isspace(*e) ) --e; // Move the pointer for the beginning of the string. while ( isspace(*sp) && sp <= e ) ++sp; return zeek::make_intrusive(new zeek::String(sp, (e - sp + 1), 1)); %} %%{ static bool should_strip(u_char c, const zeek::String* strip_chars) { auto strip_bytes = strip_chars->Bytes(); for ( auto i = 0; i < strip_chars->Len(); ++i ) if ( c == strip_bytes[i] ) return true; return false; } %%} ## Removes all combinations of characters in the *chars* argument ## starting at the beginning of the string until first mismatch. ## ## str: The string to strip characters from. ## ## chars: A string consisting of the characters to be removed. ## Defaults to all whitespace characters. ## ## Returns: A copy of *str* with the characters in *chars* removed from ## the beginning. ## ## .. zeek:see:: sub gsub strip rstrip function lstrip%(str: string, chars: string &default=" \t\n\r\v\f"%): string %{ const u_char* s = str->Bytes(); int n = str->Len(); // empty input string if ( n == 0 ) return zeek::make_intrusive(new zeek::String(s, n, 1)); int i; auto bs_chars = chars->AsString(); for ( i = 0; i < n; ++i ) if ( ! should_strip(s[i], bs_chars) ) break; return zeek::make_intrusive(new zeek::String(s + i, n - i, 1)); %} ## Removes all combinations of characters in the *chars* argument ## starting at the end of the string until first mismatch. ## ## str: The string to strip characters from. 
## ## chars: A string consisting of the characters to be removed. ## Defaults to all whitespace characters. ## ## Returns: A copy of *str* with the characters in *chars* removed from ## the end. ## ## .. zeek:see:: sub gsub strip lstrip function rstrip%(str: string, chars: string &default=" \t\n\r\v\f"%): string %{ const u_char* s = str->Bytes(); int n = str->Len(); // empty input string if ( n == 0 ) return zeek::make_intrusive(new zeek::String(s, n, 1)); int n_to_remove; auto bs_chars = chars->AsString(); for ( n_to_remove = 0; n_to_remove < n; ++n_to_remove ) if ( ! should_strip(s[n - n_to_remove - 1], bs_chars) ) break; return zeek::make_intrusive(new zeek::String(s, n - n_to_remove, 1)); %} ## Generates a string of a given size and fills it with repetitions of a source ## string. ## ## len: The length of the output string. ## ## source: The string to concatenate repeatedly until *len* has been reached. ## ## Returns: A string of length *len* filled with *source*. function string_fill%(len: int, source: string%): string %{ const u_char* src = source->Bytes(); int64_t n = source->Len(); char* dst = new char[len]; for ( int i = 0; i < len; i += n ) ::memcpy((dst + i), src, min(n, len - i)); dst[len - 1] = 0; return zeek::make_intrusive(new zeek::String(1, byte_vec(dst), len)); %} ## Takes a string and escapes characters that would allow execution of ## commands at the shell level. Must be used before including strings in ## :zeek:id:`system` or similar calls. ## ## source: The string to escape. ## ## Returns: A shell-escaped version of *source*. Specifically, this ## backslash-escapes characters whose literal value is not otherwise ## preserved by enclosure in double-quotes (dollar-sign, backquote, ## backslash, and double-quote itself), and then encloses that ## backslash-escaped string in double-quotes to ultimately preserve ## the literal value of all input characters. ## ## .. 
zeek:see:: system safe_shell_quote function safe_shell_quote%(source: string%): string %{ unsigned j = 0; const u_char* src = source->Bytes(); unsigned n = source->Len(); byte_vec dst = new u_char[n * 2 + 1 + 2]; dst[j++] = '"'; for ( unsigned i = 0; i < n; ++i ) { switch ( src[i] ) { case '`': case '"': case '\\': case '$': dst[j++] = '\\'; break; default: break; } dst[j++] = src[i]; } dst[j++] = '"'; dst[j] = '\0'; return zeek::make_intrusive(new zeek::String(1, dst, j)); %} %%{ static bool exceeds_max_string_length(int str_len, int max_size, zeek::detail::Frame* frame) { bool using_constant = false; if ( max_size < 0 ) { static auto max_find_all_string_length = zeek::id::find_val("max_find_all_string_length"); max_size = max_find_all_string_length->Get(); using_constant = true; } if ( max_size > 0 && str_len > max_size ) { zeek::ODesc desc; frame->GetCallLocation()->Describe(&desc); std::string addl = zeek::util::fmt("%s: length %d exceeded %d", desc.Description(), str_len, max_size); if ( using_constant ) addl.append("(from constant max_find_all_string_length"); zeek::reporter->Weird("max_find_all_string_length_exceeded", addl.c_str()); return true; } return false; } %%} ## Finds all occurrences of a pattern in a string. ## ## str: The string to inspect. ## ## re: The pattern to look for in *str*. ## ## max_str_size: The maximum string size allowed as input. If set to -1, this will use the ## :zeek:see:`max_find_all_string_length` global constant. If set to 0, this ## check is disabled. If the length of `str` is greater than this size, an ## empty set is returned. ## ## Returns: The set of strings in *str* that match *re*, or the empty set. ## ## .. 
zeek:see: find_all_ordered find_last strstr function find_all%(str: string, re: pattern, max_str_size: int &default=-1%) : string_set %{ auto a = zeek::make_intrusive(zeek::id::string_set); if ( exceeds_max_string_length(str->Len(), max_str_size, frame) ) return std::move(a); const u_char* s = str->Bytes(); const u_char* e = s + str->Len(); for ( const u_char* t = s; t < e; ++t ) { int n = re->MatchPrefix(t, e - t); if ( n >= 0 ) { auto idx = zeek::make_intrusive(n, (const char*) t); a->Assign(std::move(idx), 0); t += n - 1; } } return std::move(a); %} ## Finds all occurrences of a pattern in a string. The order in which ## occurrences are found is preserved and the return value may contain ## duplicate elements. ## ## str: The string to inspect. ## ## re: The pattern to look for in *str*. ## ## max_str_size: The maximum string size allowed as input. If set to -1, this will use the ## :zeek:see:`max_find_all_string_length` global constant. If set to 0, this ## check is disabled. If the length of `str` is greater than this size, an ## empty set is returned. ## ## Returns: All strings in *str* that match *re*, or an empty vector. ## ## .. zeek:see: find_all find_last strstr function find_all_ordered%(str: string, re: pattern, max_str_size: int &default=-1%) : string_vec %{ auto a = zeek::make_intrusive(zeek::id::string_vec); if ( exceeds_max_string_length(str->Len(), max_str_size, frame) ) return std::move(a); const u_char* s = str->Bytes(); const u_char* e = s + str->Len(); for ( const u_char* t = s; t < e; ++t ) { int n = re->MatchPrefix(t, e - t); if ( n >= 0 ) { auto idx = zeek::make_intrusive(n, (const char*) t); a->Assign(a->Size(), std::move(idx)); t += n - 1; } } return std::move(a); %} ## Finds the last occurrence of a pattern in a string. This function returns ## the match that starts at the largest index in the string, which is not ## necessarily the longest match. For example, a pattern of ``/.*/`` will ## return the final character in the string. 
##
## str: The string to inspect.
##
## re: The pattern to look for in *str*.
##
## Returns: The last string in *str* that matches *re*, or the empty string.
##
## .. zeek:see:: find_all find_all_ordered strstr
function find_last%(str: string, re: pattern%) : string
    %{
    const u_char* s = str->Bytes();
    const u_char* e = s + str->Len();

    // Walk start positions from the last byte backwards; the first position
    // with a non-negative MatchPrefix result wins.
    for ( const u_char* t = e - 1; t >= s; --t )
        {
        int n = re->MatchPrefix(t, e - t);
        if ( n >= 0 )
            return zeek::make_intrusive(n, (const char*) t);
        }

    return zeek::val_mgr->EmptyString();
    %}

## Returns a hex dump for given input data. The hex dump renders 16 bytes per
## line, with hex on the left and ASCII (where printable)
## on the right.
##
## data_str: The string to dump in hex format.
##
## Returns: The hex dump of the given string.
##
## .. zeek:see:: string_to_ascii_hex bytestring_to_hexstr
##
## .. note:: Based on Netdude's hex editor code.
##
function hexdump%(data_str: string%) : string
    %{

// The width of a line of text in the hex-mode view, consisting
// of offset, hex view and ASCII view:
//
// 32 +    16 characters per 8 bytes, twice
// (2*7) + Single space between bytes, twice
// 4 +     Two spaces between 8-byte sets and ASCII
// 1 +     For newline
// 17 +    For ASCII display, with spacer column
// 6       For 5-digit offset counter, including spacer
//
#define HEX_LINE_WIDTH                74

#define HEX_LINE_START                 6
#define HEX_LINE_END                  53
#define HEX_LINE_START_ASCII          56
#define HEX_LINE_START_RIGHT_ASCII    65
#define HEX_LINE_LEFT_MIDDLE          28
#define HEX_LINE_RIGHT_MIDDLE         31
#define HEX_BLOCK_LEN                 23
#define HEX_LINE_BYTES                16
#define NULL_CHAR                     '.'
#define NONPRINT_CHAR                 '.'

    const u_char* data = data_str->Bytes();
    unsigned data_size = data_str->Len();

    if ( ! data )
        return zeek::val_mgr->EmptyString();

    // One output line per 16 input bytes, plus one for a partial (or empty)
    // final line.
    int num_lines = (data_size / 16) + 1;
    int len = num_lines * HEX_LINE_WIDTH;
    u_char* hex_data = new u_char[len + 1];
    // NOTE(review): plain `new` does not normally return null, so this check
    // looks unreachable - confirm before relying on it.
    if ( ! hex_data )
        return zeek::val_mgr->EmptyString();

    // Pre-fill with spaces so gaps between columns need no explicit writes.
    memset(hex_data, ' ', len);

    // hex_data_ptr writes the offset/hex columns, ascii_ptr the ASCII column
    // of the same output line.
    u_char* hex_data_ptr = hex_data;
    u_char* ascii_ptr = hex_data_ptr + 50;
    // x is the byte's position within the current line (0..15).
    int x = 0, y = 0;

    for ( const u_char* data_ptr = data; data_ptr < data + data_size;
          ++data_ptr )
        {
        if ( x == 0 )
            {
            // Start of a line: write the 4-digit hex offset. The buffer
            // holds four digits plus NUL, so snprintf truncates larger
            // offsets to their leading four digits.
            char offset[5];
            snprintf(offset, sizeof(offset),
                     "%.4tx", data_ptr - data);
            memcpy(hex_data_ptr, offset, 4);
            hex_data_ptr += 6;
            ascii_ptr = hex_data_ptr + 50;
            }

        char hex_byte[3];
        snprintf(hex_byte, sizeof(hex_byte),
                 "%.2x", (u_char) *data_ptr);

        int val = (u_char) *data_ptr;

        u_char ascii_byte = val;

        // If unprintable, use special characters:
        if ( val < 0x20 || val >= 0x7f )
            {
            if ( val == 0 )
                ascii_byte = NULL_CHAR;
            else
                ascii_byte = NONPRINT_CHAR;
            }

        *hex_data_ptr++ = hex_byte[0];
        *hex_data_ptr++ = hex_byte[1];
        *hex_data_ptr++ = ' ';
        *ascii_ptr++ = ascii_byte;

        // Extra spacer between the two 8-byte halves, in both columns.
        if ( x == 7 )
            {
            *hex_data_ptr++ = ' ';
            *ascii_ptr++ = ' ';
            }

        ++x;

        if ( x == 16 )
            {
            // Line complete: end it and start the next line right after
            // the newline.
            x = 0;
            *ascii_ptr++ = '\n';
            hex_data_ptr = ascii_ptr;
            }
        }

    // Terminate the string, but ensure it ends with a newline.
    if ( ascii_ptr[-1] != '\n' )
        *ascii_ptr++ = '\n';
    *ascii_ptr = 0;

    // The result value is built from the NUL-terminated buffer, which is
    // then released here.
    auto result = zeek::make_intrusive((const char*) hex_data);
    delete [] hex_data;

    return std::move(result);
    %}

## Returns a reversed copy of the string
##
## str: The string to reverse.
##
## Returns: A reversed copy of *str*
##
function reverse%(str: string%) : string
    %{
    // Reverse a std::string copy in place, then build the result from its
    // bytes and length.
    string s = str->ToStdString();
    reverse(s.begin(), s.end());
    return zeek::make_intrusive(s.length(), (const char*)s.c_str());
    %}

## Returns the number of times a substring occurs within a string
##
## str: The string to search in.
## sub: The string to search for.
##
## Returns: The number of times the substring occurred.
## function count_substr%(str: string, sub: string%) : count %{ auto s = str->ToStdStringView(); auto sub_s = sub->ToStdStringView(); size_t count = 0; size_t pos = s.find(sub_s); while ( pos != string::npos ) { ++count; pos = s.find(sub_s, pos + sub_s.size()); } return zeek::val_mgr->Count(count); %} %%{ static int64_t do_find_str(zeek::StringVal* str, zeek::StringVal* sub, int64_t start, int64_t end, bool rfind, bool case_sensitive) { // Don't bother if the start is after the end of the string. if ( start > str->Len() ) return -1; // Also don't bother (and return an error) if the end is before the start. if ( (end != -1 ) && end < start ) { zeek::reporter->Error("find_str: end position must be greater than start position"); return -1; } int64_t end_pos = str->Len(); if ( end >= 0 && end < str->Len() ) end_pos = end; // One last sanity check, don't bother doing string operations at all if the range is shorter than // the length of the search string. if ( (end_pos - start + 1) < sub->Len() ) return -1; string s = str->ToStdString().substr(start, end_pos); string sb = sub->ToStdString(); size_t pos = string::npos; if ( ! case_sensitive ) { transform(s.begin(), s.end(), s.begin(), ::tolower); transform(sb.begin(), sb.end(), sb.begin(), ::tolower); } if ( rfind ) pos = s.rfind(sb); else pos = s.find(sb); if ( pos == string::npos ) return -1; return pos + start; } %%} ## Finds a string within another string, starting from the beginning. This works ## by taking a substring within the provided indexes and searching for the sub ## argument. This means that ranges shorter than the string in the sub argument ## will always return a failure. ## ## str: The string to search in. ## substr: The string to search for. ## start: An optional position for the start of the substring. ## end: An optional position for the end of the substring. A value less than ## zero (such as the default -1) means a search until the end of the ## string. 
## case_sensitive: Set to false to perform a case-insensitive search. ## (default: T). Note that case-insensitive searches use the ## ``tolower`` libc function, which is locale-sensitive. ## ## Returns: The position of the substring. Returns -1 if the string wasn't ## found. Prints an error if the starting position is after the ending ## position. function find_str%(str: string, sub: string, start: count &default=0, end: int &default=-1, case_sensitive: bool &default=T%) : int %{ return zeek::val_mgr->Int(do_find_str(str, sub, start, end, false, case_sensitive)); %} ## The same as :zeek:see:`find_str`, but returns the highest index matching ## the substring instead of the smallest. ## ## str: The string to search in. ## substr: The string to search for. ## start: An optional position for the start of the substring. ## end: An optional position for the end of the substring. A value less than ## zero (such as the default -1) means a search from the end of the string. ## case_sensitive: Set to false to perform a case-insensitive search. ## (default: T). Note that case-insensitive searches use the ## ``tolower`` libc function, which is locale-sensitive. ## ## Returns: The position of the substring. Returns -1 if the string wasn't ## found. Prints an error if the starting position is after the ending ## position. function rfind_str%(str: string, sub: string, start: count &default=0, end: int &default=-1, case_sensitive: bool &default=T%) : int %{ return zeek::val_mgr->Int(do_find_str(str, sub, start, end, true, case_sensitive)); %} ## Returns whether a string starts with a substring. ## function starts_with%(str: string, sub: string%) : bool %{ if ( sub->Len() > str->Len() ) return zeek::val_mgr->Bool(false); auto sub_s = sub->ToStdStringView(); auto s = str->ToStdStringView(); auto start_s = std::string_view{s.data(), sub_s.size()}; return zeek::val_mgr->Bool(start_s == sub_s); %} ## Returns whether a string ends with a substring. 
## function ends_with%(str: string, sub: string%) : bool %{ if ( sub->Len() > str->Len() ) return zeek::val_mgr->Bool(false); auto sub_s = sub->ToStdStringView(); auto s = str->ToStdStringView(); // Create a string_view that only looks at the end of the string being searched // with the same number of characters as the search string. This avoids possible // pathological searches of big strings if the search string doesn't exist. auto end_s = std::string_view{s.data() + s.size() - sub_s.size(), sub_s.size()}; return zeek::val_mgr->Bool(end_s == sub_s); %} ## Returns whether a string consists entirely of digits. ## The empty string is not numeric. ## function is_num%(str: string%) : bool %{ // Python's version of this method (which this is based on) just checks to see if every // character in the string is a numeric value. If something more than this is desired, we // could use something like std::from_chars or std::strto{ul,f} to check it. if ( str->Len() == 0 ) return zeek::val_mgr->False(); const char* s = str->CheckString(); for ( int i = 0; i < str->Len(); i++ ) if ( ! std::isdigit(s[i]) ) return zeek::val_mgr->False(); return zeek::val_mgr->True(); %} ## Returns whether a string consists entirely of alphabetic characters. ## The empty string is not alphabetic. ## function is_alpha%(str: string%) : bool %{ if ( str->Len() == 0 ) return zeek::val_mgr->False(); const char* s = str->CheckString(); for ( int i = 0; i < str->Len(); i++ ) if ( ! std::isalpha(s[i]) ) return zeek::val_mgr->False(); return zeek::val_mgr->True(); %} ## Returns whether a string consists entirely of alphanumeric characters. ## The empty string is not alphanumeric. ## function is_alnum%(str: string%) : bool %{ if ( str->Len() == 0 ) return zeek::val_mgr->False(); const char* s = str->CheckString(); for ( int i = 0; i < str->Len(); i++ ) if ( ! 
std::isalnum(s[i]) ) return zeek::val_mgr->False(); return zeek::val_mgr->True(); %} ## Returns a left-justified version of the string, padded to a specific length ## with a specified character. ## ## str: The string to left-justify. ## count: The length of the returned string. If this value is less than or ## equal to the length of str, a copy of str is returned. ## fill: The character used to fill in any extra characters in the resulting ## string. If a string longer than one character is passed, an error is ## reported. This defaults to the space character. ## ## Returns: A left-justified version of a string, padded with characters to a ## specific length. ## function ljust%(str: string, width: count, fill: string &default=" "%) : string %{ if ( fill->Len() != 1 ) { reporter->Error("Fill string passed to ljust() must be a single character"); return nullptr; } string new_s = str->ToStdString(); if ( width <= new_s.size() ) return zeek::StringValPtr(zeek::NewRef{}, str); new_s.insert(new_s.size(), width - new_s.size(), fill->CheckString()[0]); return zeek::make_intrusive(new_s); %} %%{ static zeek::StringValPtr do_rjust(zeek::StringVal* str, uint64_t width, char fill) { string new_s = str->ToStdString(); if ( width <= new_s.size() ) return { zeek::NewRef{}, str }; new_s.insert(0, width - new_s.size(), fill); return zeek::make_intrusive(new_s); } %%} ## Returns a right-justified version of the string, padded to a specific length ## with a specified character. ## ## str: The string to right-justify. ## count: The length of the returned string. If this value is less than or ## equal to the length of str, a copy of str is returned. ## fill: The character used to fill in any extra characters in the resulting ## string. If a string longer than one character is passed, an error is ## reported. This defaults to the space character. ## ## Returns: A right-justified version of a string, padded with characters to a ## specific length. 
## function rjust%(str: string, width: count, fill: string &default=" "%) : string %{ if ( fill->Len() != 1 ) { reporter->Error("Fill string passed to rjust() must be a single character"); return nullptr; } return do_rjust(str, width, fill->CheckString()[0]); %} ## Swaps the case of every alphabetic character in a string. For example, the string "aBc" be returned as "AbC". ## ## str: The string to swap cases in. ## ## Returns: A copy of the str with the case of each character swapped. ## function swap_case%(str: string%) : string %{ string s = str->ToStdString(); for ( size_t i = 0; i < s.size(); i++ ) { if ( std::islower(s[i]) ) s[i] = toupper(s[i]); else if ( std::isupper(s[i]) ) s[i] = tolower(s[i]); } return zeek::make_intrusive(s); %} ## Converts a string to Title Case. This changes the first character of each sequence of non-space characters ## in the string to be capitalized. See https://docs.python.org/3/library/stdtypes.html#str.title for more info. ## ## str: The string to convert. ## ## Returns: A title-cased version of the string. ## function to_title%(str: string%) : string %{ string s = str->ToStdString(); size_t pos = s.find_first_not_of(' '); if ( pos == string::npos ) return zeek::IntrusivePtr(NewRef{}, str); while ( pos != string::npos ) { s[pos] = std::toupper(s[pos]); pos = s.find(' ', pos+1); if ( pos == string::npos ) break; pos = s.find_first_not_of(' ', pos+1); } return zeek::make_intrusive(s); %} ## Returns a copy of a string filled on the left side with zeroes. This is effectively rjust(str, width, "0"). function zfill%(str: string, width: count%) : string %{ return do_rjust(str, width, '0'); %} ## Similar to lstrip(), except does the removal repeatedly if the pattern repeats at the start of the string. 
function remove_prefix%(str: string, sub: string%) : string %{ // This could just use repeated calls to lstrip(), except for a couple of reasons: // 1) lstrip() creates a StringVal at the end, and that would mean repeated recreation of objects // 2) lstrip() searches for any character in the string, not the string as a whole. auto s = str->ToStdStringView(); auto sub_s = sub->ToStdStringView(); size_t pos = s.find(sub_s); if ( pos != 0 ) return zeek::IntrusivePtr(NewRef{}, str); pos = s.find(sub_s, pos+1); size_t next_pos = sub_s.size(); while ( pos == next_pos && next_pos < s.size() ) { next_pos += sub_s.size(); pos = s.find(sub_s, pos+1); } return zeek::make_intrusive(s.substr(next_pos)); %} ## Similar to rstrip(), except does the removal repeatedly if the pattern repeats at the end of the string. function remove_suffix%(str: string, sub: string%) : string %{ // See the note in removeprefix for why this doesn't just call rstrip. auto s = str->ToStdStringView(); auto sub_s = sub->ToStdStringView(); size_t pos = s.rfind(sub_s); size_t next_pos = s.size() - sub_s.size(); if ( pos != next_pos ) return zeek::IntrusivePtr(NewRef{}, str); while ( pos == next_pos ) { next_pos -= sub_s.size(); pos = s.rfind(sub_s, pos-1); } return zeek::make_intrusive(s.substr(0, next_pos + sub_s.size())); %}