zeek/src/strings.bif

824 lines
18 KiB
Text

# Definitions of Bro built-in functions related to strings.
%%{ // C segment
#include <vector>
#include <algorithm>
using namespace std;
#include "SmithWaterman.h"
%%}
# Concatenates all arguments (each is accessed as a string) into a single
# new string, in argument order.
function string_cat%(...%): string
	%{
	// Pass 1: add up the byte lengths so one buffer can hold everything.
	int total_len = 0;
	loop_over_list(@ARG@, k)
		total_len += @ARG@[k]->AsString()->Len();

	// One extra byte for the trailing NUL written after the copy loop.
	u_char* buf = new u_char[total_len + 1];
	BroString* result = new BroString(1, buf, total_len);

	// Pass 2: append each argument's bytes at the write cursor.
	u_char* cursor = buf;
	loop_over_list(@ARG@, m)
		{
		const BroString* piece = @ARG@[m]->AsString();
		const int piece_len = piece->Len();

		memcpy(cursor, piece->Bytes(), piece_len);
		cursor += piece_len;
		}

	*cursor = 0;

	return new StringVal(result);
	%}
%%{
// Collects the strings stored in tbl under the count indices
// start..end (inclusive) into vs, in index order. vs is cleared first.
//
// Returns 1 if every index was present. Returns 0 as soon as an index is
// missing; vs then holds only the entries found before the gap.
//
// The BroString pointers placed in vs are the ones returned by the table
// values' AsString(); nothing is copied here.
int string_array_to_vs(TableVal* tbl, int start, int end,
			vector<const BroString*>& vs)
	{
	vs.clear();

	for ( int i = start; i <= end; ++i )
		{
		Val* ind = new Val(i, TYPE_COUNT);
		Val* v = tbl->Lookup(ind);

		// The index value is only needed for the lookup itself.
		// Release it before any early return so a missing entry
		// does not leak it, and use Unref() like the rest of this
		// file does for index values.
		Unref(ind);

		if ( ! v )
			return 0;

		vs.push_back(v->AsString());

#if 0
		char* str = v->AsString()->Render();
		DEBUG_MSG("string_array[%d] = \"%s\"\n", i, str);
		delete [] str;
#endif
		}

	return 1;
	}
// Stores copies of the strings in vs into tbl: vs[0] goes to index start,
// vs[1] to start+1, and so on up to index end (inclusive). The caller must
// make sure vs holds at least (end - start + 1) entries. Always returns 1.
int vs_to_string_array(vector<const BroString*>& vs, TableVal* tbl,
			int start, int end)
	{
	int pos = 0;

	for ( int idx = start; idx <= end; ++idx )
		{
		const BroString* elem = vs[pos++];

		Val* key = new Val(idx, TYPE_COUNT);
		StringVal* copy =
			new StringVal(elem->Len(), (const char *) elem->Bytes());

		tbl->Assign(key, copy);

		// The index value is released here after Assign().
		Unref(key);
		}

	return 1;
	}
// Concatenates the strings found in tbl at indices start..end (inclusive)
// and returns the result as a new BroString. If an index is missing,
// only the strings before the gap are concatenated (the status returned
// by string_array_to_vs() is deliberately not checked).
BroString* cat_string_array_n(TableVal* tbl, int start, int end)
	{
	vector<const BroString*> pieces;

	(void) string_array_to_vs(tbl, start, end, pieces);

	return concatenate(pieces);
	}
%%}
# Concatenates the elements of a string array, from index 1 up to the
# number of entries in the array.
function cat_string_array%(a: string_array%): string
	%{
	const int last = a->AsTable()->Length();
	BroString* joined = cat_string_array_n(a->AsTableVal(), 1, last);

	return new StringVal(joined);
	%}
# Concatenates the elements of a string array whose indices lie in
# start..end (inclusive).
function cat_string_array_n%(a: string_array, start: count, end: count%): string
	%{
	BroString* joined = cat_string_array_n(a->AsTableVal(), start, end);

	return new StringVal(joined);
	%}
# Joins the elements of a string array (indices 1..n, where n is the
# number of entries) into one string, placing sep between consecutive
# elements. If one of the indices 1..n is missing, the function gives up
# and returns a null value.
function join_string_array%(sep: string, a: string_array%): string
	%{
	vector<const BroString*> vs;
	TableVal* tbl = a->AsTableVal();
	int n = a->AsTable()->Length();

	for ( int i = 1; i <= n; ++i )
		{
		Val* ind = new Val(i, TYPE_COUNT);
		Val* v = tbl->Lookup(ind);

		// Release the index right after the lookup so that the
		// early return below does not leak it.
		Unref(ind);

		if ( ! v )
			return 0;

		vs.push_back(v->AsString());

		// Separator goes between elements only, not after the last.
		if ( i < n )
			vs.push_back(sep->AsString());
		}

	return new StringVal(concatenate(vs));
	%}
# Returns a new string array holding the elements of a (indices 1..n) in
# ascending order according to Bstr_cmp. The input array is not modified.
function sort_string_array%(a: string_array%): string_array
	%{
	TableVal* tbl = a->AsTableVal();
	int n = a->AsTable()->Length();

	vector<const BroString*> vs;
	string_array_to_vs(tbl, 1, n, vs);

	// string_array_to_vs() stops at the first missing index, so vs can
	// hold fewer than n entries (e.g. when the indices are not
	// contiguous). Work with what was actually collected; using n below
	// would read past the end of vs.
	const int num_found = int(vs.size());

	// Insertion sort. Bstr_cmp yields a three-way int result (it is
	// compared against 0 here), so it cannot be handed to std::sort
	// as-is. Equal elements keep their relative order.
	for ( int i = 1; i < num_found; ++i )
		{
		const BroString* x = vs[i];
		int j = i;

		while ( j > 0 && Bstr_cmp(vs[j-1], x) > 0 )
			{
			vs[j] = vs[j-1];
			--j;
			}

		vs[j] = x;
		}

	TableVal* b = new TableVal(internal_type("string_array")->AsTableType());
	vs_to_string_array(vs, b, 1, num_found);

	return b;
	%}
# Joins the described form of every element of a vector into one string,
# with sep inserted between consecutive elements.
function join_string_vec%(vec: string_vec, sep: string%): string
	%{
	VectorVal* elems = vec->AsVectorVal();
	ODesc out;

	unsigned pos = 0;
	while ( pos < elems->Size() )
		{
		// Separator precedes every element except the first.
		if ( pos != 0 )
			out.Add(sep->CheckString(), 0);

		// Elements are looked up with 1-based positions.
		++pos;
		elems->Lookup(pos)->Describe(&out);
		}

	// Hand the accumulated bytes over to a BroString. The buffer
	// comes from ODesc, hence the free()-based release flag.
	BroString* joined = new BroString(1, out.TakeBytes(), out.Len());
	joined->SetUseFreeToDelete(true);

	return new StringVal(joined);
	%}
# Applies "backspace"-style editing to arg_s: every occurrence of the
# single character given in arg_edit_char removes the character kept
# immediately before it (if any) and is itself dropped. Reports a builtin
# error if arg_edit_char is not exactly one character long.
function edit%(arg_s: string, arg_edit_char: string%): string
	%{
	if ( arg_edit_char->Len() != 1 )
		builtin_error("not exactly one edit character", @ARG@[1]);

	const u_char edit_c = arg_edit_char->Bytes()[0];

	const int in_len = arg_s->Len();
	const u_char* in = arg_s->Bytes();

	// The output can never be longer than the input; +1 for the NUL.
	u_char* out = new u_char[in_len + 1];
	int out_len = 0;

	for ( const u_char* p = in; p != in + in_len; ++p )
		{
		if ( *p != edit_c )
			{
			out[out_len++] = *p;
			continue;
			}

		// Edit character: drop the last kept character, if there is one.
		if ( out_len > 0 )
			--out_len;
		}

	out[out_len] = '\0';

	return new StringVal(new BroString(1, byte_vec(out), out_len));
	%}
# Returns the length of s in bytes, as a count.
function byte_len%(s: string%): count
%{
// Wrap the string's length in a Val tagged as TYPE_COUNT.
return new Val(s->Len(), TYPE_COUNT);
%}
# Returns the substring of s that begins at position start and spans n
# bytes (as interpreted by BroString::GetSubstring). start is 1-based; a
# start of 0 is treated like 1. If no substring can be produced, an empty
# string is returned.
function sub_bytes%(s: string, start: count, n: int%): string
	%{
	// Convert the 1-based script position into a 0-based offset.
	if ( start > 0 )
		start -= 1;

	BroString* piece = s->AsString()->GetSubstring(start, n);

	return new StringVal(piece ? piece : new BroString(""));
	%}
%%{
// Returns 1 if the first t_len bytes of t also start s (which is s_len
// bytes long), 0 otherwise. An empty t (t_len <= 0) always matches.
static int match_prefix(int s_len, const char* s, int t_len, const char* t)
	{
	int pos = 0;

	while ( pos < t_len )
		{
		// s ran out before t did.
		if ( pos >= s_len )
			return 0;

		if ( s[pos] != t[pos] )
			return 0;

		++pos;
		}

	return 1;
	}
// Splits str_val at the places where re matches and returns the pieces
// in a newly created string_array table indexed 1, 2, 3, ...
//
// incl_sep: if non-zero, the text of each matched separator is stored as
// an element of its own, between the pieces it separates.
// max_num_sep: if non-zero, at most this many separators are honored;
// whatever follows the last honored separator becomes the final element.
// other_sep: if non-empty it is converted to a list and released again at
// the end, but the splitting loop never consults it.
// NOTE(review): other_sep looks like an unfinished feature -- confirm.
//
// A position only counts as a separator when MatchPrefix() returns a
// value greater than zero there.
Val* do_split(StringVal* str_val, RE_Matcher* re, TableVal* other_sep,
int incl_sep, int max_num_sep)
{
TableVal* a = new TableVal(internal_type("string_array")->AsTableType());
ListVal* other_strings = 0;
if ( other_sep && other_sep->Size() > 0 )
other_strings = other_sep->ConvertToPureList();
// s always points at the start of the piece currently being collected;
// n is the number of bytes not yet scanned.
const u_char* s = str_val->Bytes();
int n = str_val->Len();
const u_char* end_of_s = s + n;
int num = 0;
int num_sep = 0;
int offset = 0;
// ">= 0" rather than "> 0": when the input ends with a separator, one
// more pass with n == 0 emits the trailing empty piece.
while ( n >= 0 )
{
offset = 0;
// Find next match offset.
int end_of_match = 0;
while ( n > 0 &&
(end_of_match = re->MatchPrefix(s + offset, n)) <= 0 )
{
// Move on to next byte.
++offset;
--n;
}
// Separator budget used up: the current piece is everything that
// is left of the input, and scanning stops.
if ( max_num_sep && num_sep >= max_num_sep )
{
offset = end_of_s - s;
n=0;
}
// Store the piece s[0 .. offset-1] under the next index.
Val* ind = new Val(++num, TYPE_COUNT);
a->Assign(ind, new StringVal(offset, (const char*) s));
Unref(ind);
// No more separators will be needed if this is the end of string.
if ( n <= 0 )
break;
if ( incl_sep )
{ // including the part that matches the pattern
ind = new Val(++num, TYPE_COUNT);
a->Assign(ind, new StringVal(end_of_match, (const char*) s+offset));
Unref(ind);
}
if ( max_num_sep && num_sep >= max_num_sep )
break;
++num_sep;
// Skip over the piece and the separator that followed it.
n -= end_of_match;
s += offset + end_of_match;;
if ( s > end_of_s )
reporter->InternalError("RegMatch in split goes beyond the string");
}
if ( other_strings )
delete other_strings;
return a;
}
// Returns a new string in which text of str_val matching re has been
// replaced by repl.
//
// do_all: if zero, only the first match is replaced; otherwise every
// match found while scanning left to right is replaced (the scan resumes
// right after each match, so matches do not overlap).
//
// A position only counts as a match when MatchPrefix() returns a value
// greater than zero there; all other bytes are copied through unchanged.
//
// Works in two passes: the first records the matched ranges and computes
// the result size, the second assembles the result.
Val* do_sub(StringVal* str_val, RE_Matcher* re, StringVal* repl, int do_all)
{
const u_char* s = str_val->Bytes();
int offset = 0;
int n = str_val->Len();
// cut_points is a set of pairs of indices in str that should
// be removed/replaced. A pair <x,y> means "delete starting
// at offset x, up to but not including offset y".
List(ptr_compat_int) cut_points; // where RE matches pieces of str
int size = 0; // size of result
while ( n > 0 )
{
// Find next match offset.
int end_of_match;
while ( n > 0 &&
(end_of_match = re->MatchPrefix(&s[offset], n)) <= 0 )
{
// This character is going to be copied to the result.
++size;
// Move on to next character.
++offset;
--n;
}
// Ran off the end of the input without finding another match.
if ( n <= 0 )
break;
// s[offset .. offset+end_of_match-1] matches re.
cut_points.append(offset);
cut_points.append(offset + end_of_match);
offset += end_of_match;
n -= end_of_match;
if ( ! do_all )
{
// We've now done the first substitution - finished.
// Include the remainder of the string in the result.
size += n;
break;
}
}
// size now reflects amount of space copied. Factor in amount
// of space for replacement text.
int num_cut_points = cut_points.length() / 2;
size += num_cut_points * repl->Len();
// And a final NUL for good health.
++size;
byte_vec result = new u_char[size];
byte_vec r = result;
// Copy it all over.
int start_offset = 0;
for ( int i = 0; i < cut_points.length(); i += 2 /* loop over pairs */ )
{
// Copy the unmatched text between the previous cut and this one.
int num_to_copy = cut_points[i] - start_offset;
memcpy(r, s + start_offset, num_to_copy);
r += num_to_copy;
start_offset = cut_points[i+1];
// Now add in replacement text.
memcpy(r, repl->Bytes(), repl->Len());
r += repl->Len();
}
// Copy final trailing characters.
int num_to_copy = str_val->Len() - start_offset;
memcpy(r, s + start_offset, num_to_copy);
r += num_to_copy;
// Final NUL. No need to increment r, since the length
// computed from it in the next statement does not include
// the NUL.
r[0] = '\0';
return new StringVal(new BroString(1, result, r - result));
}
%%}
# Similar to split in awk: splits str at every match of re and returns
# the pieces between the matches. The matched separators themselves are
# not part of the result.
function split%(str: string, re: pattern%): string_array
	%{
	TableVal* const no_other_sep = 0;
	const int without_separators = 0;
	const int unlimited = 0;

	return do_split(str, re, no_other_sep, without_separators, unlimited);
	%}
# split1(str, pattern): table[count] of string
#
# Same as split, except that str is only split (if possible) at the
# earliest position and an array of two strings is returned.
# An array of one string is returned when str cannot be split.
function split1%(str: string, re: pattern%): string_array
	%{
	TableVal* const no_other_sep = 0;
	const int without_separators = 0;
	const int first_separator_only = 1;

	return do_split(str, re, no_other_sep, without_separators,
			first_separator_only);
	%}
# Same as split, except that the returned array also contains the parts
# of the string that match the pattern. For example,
# split_all("a-b--cd", /(\-)+/) returns {"a", "-", "b", "--", "cd"}:
# odd-indexed elements do not match the pattern and even-indexed ones do.
function split_all%(str: string, re: pattern%): string_array
	%{
	TableVal* const no_other_sep = 0;
	const int with_separators = 1;
	const int unlimited = 0;

	return do_split(str, re, no_other_sep, with_separators, unlimited);
	%}
# Same as split, with two extra controls: incl_sep selects whether the
# matched separators are included in the result, and max_num_sep limits
# how many separators are honored (0 means no limit).
function split_n%(str: string, re: pattern,
		incl_sep: bool, max_num_sep: count%): string_array
	%{
	TableVal* const no_other_sep = 0;

	return do_split(str, re, no_other_sep, incl_sep, max_num_sep);
	%}
# Same as split_n, but additionally hands the string set other to the
# splitting routine as its set of other separators.
function split_complete%(str: string,
		re: pattern, other: string_set,
		incl_sep: bool, max_num_sep: count%): string_array
	%{
	TableVal* other_sep = other->AsTableVal();

	return do_split(str, re, other_sep, incl_sep, max_num_sep);
	%}
# Replaces the first part of str that matches re with repl and returns
# the resulting string.
function sub%(str: string, re: pattern, repl: string%): string
	%{
	const int first_match_only = 0;

	return do_sub(str, re, repl, first_match_only);
	%}
# Replaces every part of str that matches re with repl and returns the
# resulting string.
function gsub%(str: string, re: pattern, repl: string%): string
	%{
	const int every_match = 1;

	return do_sub(str, re, repl, every_match);
	%}
# Compares two strings with Bstr_cmp and returns its result as an int.
function strcmp%(s1: string, s2: string%): int
	%{
	const BroString* lhs = s1->AsString();
	const BroString* rhs = s2->AsString();

	return new Val(Bstr_cmp(lhs, rhs), TYPE_INT);
	%}
# Returns the 1-based position of $little within $big, i.e. one more than
# the offset reported by FindSubstring. Returns 0 if $little is not found
# in $big.
function strstr%(big: string, little: string%): count
	%{
	const BroString* haystack = big->AsString();
	const BroString* needle = little->AsString();

	return new Val(1 + haystack->FindSubstring(needle), TYPE_COUNT);
	%}
# Substitutes each (non-overlapping) appearance of $from in $s with $to,
# and returns the resulting string. If $from is empty, $s itself is
# returned (with an added reference).
function subst_string%(s: string, from: string, to: string%): string
	%{
	const int little_len = from->Len();

	if ( little_len == 0 )
		return s->Ref();

	// Unprocessed tail of the input.
	const u_char* rest = s->Bytes();
	int rest_len = s->Len();

	// The result is assembled from chunks that point into $s and $to.
	vector<data_chunk_t> chunks;
	data_chunk_t chunk;

	for ( ; rest_len >= little_len; )
		{
		int hit = strstr_n(rest_len, rest, little_len, from->Bytes());

		if ( hit < 0 )
			break;

		// Text in front of the occurrence, if any.
		if ( hit > 0 )
			{
			chunk.length = hit;
			chunk.data = (const char*) rest;
			chunks.push_back(chunk);
			}

		// The replacement takes the place of the occurrence.
		chunk.length = to->Len();
		chunk.data = (const char*) (to->Bytes());
		chunks.push_back(chunk);

		// Continue right behind the occurrence.
		const int consumed = hit + little_len;
		rest += consumed;
		rest_len -= consumed;
		}

	// Whatever is left contains no further occurrence.
	if ( rest_len > 0 )
		{
		chunk.length = rest_len;
		chunk.data = (const char*) rest;
		chunks.push_back(chunk);
		}

	return new StringVal(concatenate(chunks));
	%}
# Returns a copy of str in which every ASCII upper-case letter has been
# replaced by its lower-case counterpart. All other bytes (including
# non-ASCII ones) are copied unchanged.
function to_lower%(str: string%): string
	%{
	const u_char* s = str->Bytes();
	int n = str->Len();

	// The buffer is handed over to the BroString below instead of going
	// through StringVal(len, ptr): that constructor copies its input
	// (elsewhere in this file it is given pointers into other strings),
	// so the temporary buffer used to be leaked on every call.
	// +1 for the terminating NUL the BroString is told about.
	u_char* lower_s = new u_char[n + 1];

	for ( int i = 0; i < n; ++i )
		{
		if ( isascii(s[i]) && isupper(s[i]) )
			lower_s[i] = tolower(s[i]);
		else
			lower_s[i] = s[i];
		}

	lower_s[n] = '\0';

	return new StringVal(new BroString(1, byte_vec(lower_s), n));
	%}
# Returns a copy of str in which every ASCII lower-case letter has been
# replaced by its upper-case counterpart. All other bytes (including
# non-ASCII ones) are copied unchanged.
function to_upper%(str: string%): string
	%{
	const u_char* s = str->Bytes();
	int n = str->Len();

	// The buffer is handed over to the BroString below instead of going
	// through StringVal(len, ptr): that constructor copies its input
	// (elsewhere in this file it is given pointers into other strings),
	// so the temporary buffer used to be leaked on every call.
	// +1 for the terminating NUL the BroString is told about.
	u_char* upper_s = new u_char[n + 1];

	for ( int i = 0; i < n; ++i )
		{
		if ( isascii(s[i]) && islower(s[i]) )
			upper_s[i] = toupper(s[i]);
		else
			upper_s[i] = s[i];
		}

	upper_s[n] = '\0';

	return new StringVal(new BroString(1, byte_vec(upper_s), n));
	%}
# Returns the rendered form of str (BroString::Render() with its default
# format) as a new string.
function clean%(str: string%): string
	%{
	char* rendered = str->AsString()->Render();

	// The rendered buffer is passed on to the new BroString.
	BroString* owned = new BroString(1, byte_vec(rendered), strlen(rendered));

	return new StringVal(owned);
	%}
# Returns str rendered in the BRO_STRING_LITERAL format as a new string.
function to_string_literal%(str: string%): string
	%{
	char* rendered = str->AsString()->Render(BroString::BRO_STRING_LITERAL);

	// The rendered buffer is passed on to the new BroString.
	BroString* owned = new BroString(1, byte_vec(rendered), strlen(rendered));

	return new StringVal(owned);
	%}
# Returns true if every byte of str has a value of 127 or less (which is
# trivially the case for the empty string), false otherwise.
function is_ascii%(str: string%): bool
	%{
	const u_char* p = str->Bytes();
	const u_char* const end = p + str->Len();

	for ( ; p != end; ++p )
		{
		// Any byte with the high bit set disqualifies the string.
		if ( *p > 127 )
			return new Val(0, TYPE_BOOL);
		}

	return new Val(1, TYPE_BOOL);
	%}
# Makes a printable version of a string, using BroString::Render() with
# its default format.
function escape_string%(s: string%): string
	%{
	char* rendered = s->AsString()->Render();

	// StringVal gets its own copy here, so the rendered buffer is
	// released right afterwards.
	StringVal* escaped = new StringVal(rendered);
	delete [] rendered;

	return escaped;
	%}
# Returns an ASCII hexadecimal representation of a string: every input
# byte becomes two lower-case hex digits, so the result is twice as long
# as the input.
function string_to_ascii_hex%(s: string%): string
	%{
	static const char hex_digits[] = "0123456789abcdef";

	const int n = s->Len();
	const u_char* sp = s->Bytes();

	char* x = new char[n * 2 + 1];

	for ( int i = 0; i < n; ++i )
		{
		x[i * 2] = hex_digits[sp[i] >> 4];
		x[i * 2 + 1] = hex_digits[sp[i] & 0x0f];
		}

	// Terminate explicitly. Previously the NUL was only written as a
	// side effect of sprintf(), so for an empty input the buffer that
	// is handed over as NUL-terminated was left uninitialized.
	x[n * 2] = '\0';

	return new StringVal(new BroString(1, (u_char*) x, n * 2));
	%}
# Runs smith_waterman() on s1 and s2 and returns the substrings it finds
# as a vector. The first two fields of params supply the arguments for
# the SWParams object: field 0 is passed as a count, field 1 is
# converted to an SWVariant.
function str_smith_waterman%(s1: string, s2: string, params: sw_params%)
		: sw_substring_vec
	%{
	RecordVal* cfg = params->AsRecordVal();

	SWParams sw_params(cfg->Lookup(0)->AsCount(),
				SWVariant(cfg->Lookup(1)->AsCount()));

	BroSubstring::Vec* matches =
		smith_waterman(s1->AsString(), s2->AsString(), sw_params);

	VectorVal* result = BroSubstring::VecToPolicy(matches);

	// The intermediate substring objects and their container are
	// released once the result vector has been built.
	delete_each(matches);
	delete matches;

	return result;
	%}
# Splits s at the positions listed in idx (as interpreted by
# BroString::Split) and returns the pieces as a vector of strings. If the
# split yields nothing, an empty vector is returned.
function str_split%(s: string, idx: index_vec%): string_vec
	%{
	// Convert the script-level index vector into a BroString::IdxVec.
	vector<Val*>* positions = idx->AsVector();
	BroString::IdxVec indices(positions->size());

	for ( unsigned int k = 0; k < positions->size(); k++ )
		indices[k] = (*positions)[k]->AsCount();

	BroString::Vec* pieces = s->AsString()->Split(indices);

	VectorVal* result_v =
		new VectorVal(new VectorType(base_type(TYPE_STRING)));

	if ( ! pieces )
		return result_v;

	// The result vector is filled starting at position 1.
	unsigned int slot = 1;
	for ( BroString::VecIt it = pieces->begin(); it != pieces->end(); ++it )
		result_v->Assign(slot++, new StringVal(*it), 0);

	// StringVal now possesses string.
	delete pieces;

	return result_v;
	%}
# Returns a copy of str with leading and trailing whitespace (as
# classified by isspace) removed. A string that consists only of
# whitespace yields the empty string.
function strip%(str: string%): string
	%{
	const u_char* s = str->Bytes();
	int n = str->Len();

	if ( n == 0 )
		// Empty string.
		return new StringVal(new BroString(s, n, 1));

	const u_char* sp = s;

	// Move a pointer from the end of the string.
	const u_char* e = sp + n - 1;
	while ( e > sp && isspace(*e) )
		--e;

	// Move the pointer for the beginning of the string. The bounds
	// check comes first so that *sp is never read once sp has moved
	// past e (previously the byte behind the last examined character
	// was read before the bound was tested).
	while ( sp <= e && isspace(*sp) )
		++sp;

	return new StringVal(new BroString(sp, (e - sp + 1), 1));
	%}
# Returns a string of length len that is filled with repetitions of
# source. The last of the len bytes is always set to NUL, so at most
# len-1 bytes of repeated source text are visible.
#
# A non-positive len reports a builtin error and yields an empty string.
# An empty source yields a string of len NUL bytes.
function string_fill%(len: int, source: string%): string
	%{
	if ( len <= 0 )
		{
		// Previously this wrote to dst[-1].
		builtin_error("string_fill: length must be positive", @ARG@[0]);
		return new StringVal("");
		}

	const u_char* src = source->Bytes();
	const int64_t n = source->Len();

	// One byte more than len: the BroString below is told that the
	// data is NUL-terminated, and the other users of that constructor
	// in this file all keep a NUL at index <length>.
	char* dst = new char[len + 1];

	if ( n <= 0 )
		// Nothing to repeat; stepping by n == 0 would never terminate.
		::memset(dst, 0, len);
	else
		{
		for ( int64_t i = 0; i < len; i += n )
			::memcpy(dst + i, src, min(n, int64_t(len - i)));
		}

	dst[len - 1] = 0;
	dst[len] = 0;

	return new StringVal(new BroString(1, byte_vec(dst), len));
	%}
# Takes a string and escapes characters that would allow execution of
# commands at the shell level. Must be used before including strings in
# system() or similar calls.
#
# Only backquote, double quote, backslash and dollar sign are escaped
# (each is prefixed with a backslash); all other characters, including
# other shell metacharacters, are passed through unchanged.
function str_shell_escape%(source: string%): string
	%{
	const unsigned in_len = source->Len();
	const u_char* in = source->Bytes();

	// Worst case every byte needs a backslash in front; +1 for the NUL.
	byte_vec out = new u_char[in_len * 2 + 1];
	unsigned out_len = 0;

	for ( unsigned k = 0; k < in_len; ++k )
		{
		const u_char c = in[k];

		if ( c == '`' || c == '"' || c == '\\' || c == '$' )
			out[out_len++] = '\\';

		out[out_len++] = c;
		}

	out[out_len] = '\0';

	return new StringVal(new BroString(1, out, out_len));
	%}
# Returns all occurrences of the given pattern in the given string (an
# empty set if none). The string is scanned left to right; after a
# non-empty match the scan resumes right behind it.
function find_all%(str: string, re: pattern%) : string_set
	%{
	TableVal* a = new TableVal(internal_type("string_set")->AsTableType());

	const u_char* s = str->Bytes();
	const u_char* e = s + str->Len();

	for ( const u_char* t = s; t < e; ++t )
		{
		int n = re->MatchPrefix(t, e - t);

		// Negative result: nothing matches at this position.
		if ( n < 0 )
			continue;

		// Release the index after Assign(), as done for all other
		// table indices in this file; it used to be leaked here.
		Val* idx = new StringVal(n, (const char*) t);
		a->Assign(idx, 0);
		Unref(idx);

		// Skip the remainder of the match (the loop's ++t accounts
		// for its first byte). For a zero-length match there is
		// nothing to skip: applying "n - 1" there would step back by
		// one, cancel the ++t and loop forever on the same position.
		if ( n > 0 )
			t += n - 1;
		}

	return a;
	%}
# Returns the last occurrence of the given pattern in the given string.
# If not found, returns an empty string. Note that this function returns
# the match that starts at the largest index in the string, which is
# not necessarily the longest match. For example, a pattern of /.*/
# will return the final character in the string.
function find_last%(str: string, re: pattern%) : string
	%{
	const u_char* begin = str->Bytes();
	const u_char* end = begin + str->Len();

	// Try every start position, from the last byte back to the first.
	const u_char* t = end;
	while ( t > begin )
		{
		--t;

		int n = re->MatchPrefix(t, end - t);

		if ( n >= 0 )
			return new StringVal(n, (const char*) t);
		}

	return new StringVal("");
	%}
# Returns a hex dump for given input data. The hex dump renders
# 16 bytes per line, with hex on the left and ASCII (where printable)
# on the right. Non-printable bytes are shown as '.' in the ASCII column.
# The result always ends with a newline. Based on Netdude's hex editor
# code.
#
function hexdump%(data_str: string%) : string
	%{
// The width of a line of text in the hex-mode view, consisting
// of offset, hex view and ASCII view:
//
// 32 + 16 characters per 8 bytes, twice
// (2*7) + Single space between bytes, twice
// 4 + Two spaces between 8-byte sets and ASCII
// 1 + For newline
// 17 + For ASCII display, with spacer column
// 6 For 5-digit offset counter, including spacer
//
#define HEX_LINE_WIDTH 74
#define HEX_LINE_START 6
#define HEX_LINE_END 53
#define HEX_LINE_START_ASCII 56
#define HEX_LINE_START_RIGHT_ASCII 65
#define HEX_LINE_LEFT_MIDDLE 28
#define HEX_LINE_RIGHT_MIDDLE 31
#define HEX_BLOCK_LEN 23
#define HEX_LINE_BYTES 16
#define NULL_CHAR '.'
#define NONPRINT_CHAR '.'

	const u_char* data = data_str->Bytes();
	unsigned data_size = data_str->Len();

	if ( ! data )
		return new StringVal("");

	// One output line per started group of 16 bytes, plus one spare.
	int num_lines = (data_size / 16) + 1;
	int len = num_lines * HEX_LINE_WIDTH;

	// Pre-fill with blanks so that only the non-blank characters
	// have to be written below. (No null check: plain new does not
	// return null.)
	u_char* hex_data = new u_char[len + 1];
	memset(hex_data, ' ', len);

	// hex_data_ptr writes the offset/hex columns, ascii_ptr the ASCII
	// column of the current line.
	u_char* hex_data_ptr = hex_data;
	u_char* ascii_ptr = hex_data_ptr + 50;

	// x is the position of the current byte within its line (0..15).
	int x = 0;

	for ( const u_char* data_ptr = data; data_ptr < data + data_size;
	      ++data_ptr )
		{
		if ( x == 0 )
			{
			// Start of a line: emit the offset of its first byte.
			// The pointer difference is converted explicitly,
			// since "%x" expects an unsigned int argument.
			char offset[5];
			safe_snprintf(offset, sizeof(offset),
					"%.4x", (unsigned int) (data_ptr - data));
			memcpy(hex_data_ptr, offset, 4);
			hex_data_ptr += 6;
			ascii_ptr = hex_data_ptr + 50;
			}

		char hex_byte[3];
		safe_snprintf(hex_byte, sizeof(hex_byte),
				"%.2x", (u_char) *data_ptr);

		int val = (u_char) *data_ptr;
		u_char ascii_byte = val;

		// If unprintable, use special characters:
		if ( val < 0x20 || val >= 0x7f )
			{
			if ( val == 0 )
				ascii_byte = NULL_CHAR;
			else
				ascii_byte = NONPRINT_CHAR;
			}

		*hex_data_ptr++ = hex_byte[0];
		*hex_data_ptr++ = hex_byte[1];
		*hex_data_ptr++ = ' ';
		*ascii_ptr++ = ascii_byte;

		// Extra gap after the eighth byte, in both columns.
		if ( x == 7 )
			{
			*hex_data_ptr++ = ' ';
			*ascii_ptr++ = ' ';
			}

		++x;

		if ( x == 16 )
			{
			// Line complete: the next line starts right after
			// the newline that ends the ASCII column.
			x = 0;
			*ascii_ptr++ = '\n';
			hex_data_ptr = ascii_ptr;
			}
		}

	// Terminate the string, but ensure it ends with a newline.
	if ( ascii_ptr[-1] != '\n' )
		*ascii_ptr++ = '\n';
	*ascii_ptr = 0;

	StringVal* result = new StringVal((const char*) hex_data);
	delete [] hex_data;

	return result;
	%}