mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
641 lines
16 KiB
C++
641 lines
16 KiB
C++
// See the file "COPYING" in the main distribution directory for copyright.
|
|
|
|
#include "zeek/ZeekString.h"
|
|
|
|
#include <algorithm>
|
|
#include <cctype>
|
|
#include <iostream>
|
|
#include <sstream> // Needed for unit testing
|
|
|
|
#include "zeek/ID.h"
|
|
#include "zeek/Reporter.h"
|
|
#include "zeek/Val.h"
|
|
#include "zeek/util.h"
|
|
|
|
#include "zeek/3rdparty/doctest.h"
|
|
|
|
#ifdef DEBUG
|
|
#define DEBUG_STR(msg) DBG_LOG(zeek::DBG_STRING, msg)
|
|
#else
|
|
#define DEBUG_STR(msg)
|
|
#endif
|
|
|
|
using namespace std::string_literals;
|
|
|
|
namespace zeek {
|
|
|
|
// This constructor forces the user to specify arg_final_NUL. When str
|
|
// is a *normal* NUL-terminated string, make arg_n == strlen(str) and
|
|
// arg_final_NUL == 1; when str is a sequence of n bytes, make
|
|
// arg_final_NUL == 0.
|
|
|
|
String::String(bool arg_final_NUL, byte_vec str, int arg_n) {
|
|
b = str;
|
|
n = arg_n;
|
|
final_NUL = arg_final_NUL;
|
|
use_free_to_delete = false;
|
|
}
|
|
|
|
String::String(String&& other) noexcept
|
|
: b(other.b), n(other.n), final_NUL(other.final_NUL), use_free_to_delete(other.use_free_to_delete) {
|
|
other.b = nullptr;
|
|
other.Reset();
|
|
}
|
|
|
|
String::String(const u_char* str, int arg_n, bool add_NUL) : String() { Set(str, arg_n, add_NUL); }
|
|
|
|
String::String(std::string_view str) : String() { Set(str); }
|
|
|
|
String::String(const String& bs) : String() { *this = bs; }
|
|
|
|
String::String() {
|
|
b = nullptr;
|
|
n = 0;
|
|
final_NUL = false;
|
|
use_free_to_delete = false;
|
|
}
|
|
|
|
void String::Reset() {
|
|
if ( use_free_to_delete )
|
|
free(b);
|
|
else
|
|
delete[] b;
|
|
|
|
b = nullptr;
|
|
n = 0;
|
|
final_NUL = false;
|
|
use_free_to_delete = false;
|
|
}
|
|
|
|
const String& String::operator=(const String& bs) {
|
|
if ( this == &bs )
|
|
return *this;
|
|
|
|
Reset();
|
|
n = bs.n;
|
|
b = new u_char[n + 1];
|
|
|
|
memcpy(b, bs.b, n);
|
|
b[n] = '\0';
|
|
|
|
final_NUL = true;
|
|
use_free_to_delete = false;
|
|
return *this;
|
|
}
|
|
|
|
String& String::operator=(String&& other) noexcept {
|
|
b = other.b;
|
|
n = other.n;
|
|
final_NUL = other.final_NUL;
|
|
use_free_to_delete = other.use_free_to_delete;
|
|
|
|
other.b = nullptr;
|
|
other.Reset();
|
|
|
|
return *this;
|
|
}
|
|
|
|
bool String::operator==(const String& bs) const { return Bstr_eq(this, &bs); }
|
|
|
|
bool String::operator<(const String& bs) const { return Bstr_cmp(this, &bs) < 0; }
|
|
|
|
bool String::operator==(std::string_view s) const {
|
|
if ( static_cast<size_t>(n) != s.size() )
|
|
return false;
|
|
|
|
if ( b == nullptr ) {
|
|
return s.size() == 0;
|
|
}
|
|
|
|
return (memcmp(b, s.data(), n) == 0);
|
|
}
|
|
|
|
bool String::operator!=(std::string_view s) const { return ! (*this == s); }
|
|
|
|
void String::Adopt(byte_vec bytes, int len) {
|
|
Reset();
|
|
|
|
b = bytes;
|
|
|
|
// Check if the string ends with a NUL. If so, mark it as having
|
|
// a final NUL and adjust the length accordingly.
|
|
final_NUL = (b[len - 1] == '\0');
|
|
n = len - final_NUL;
|
|
}
|
|
|
|
void String::Set(const u_char* str, int len, bool add_NUL) {
|
|
Reset();
|
|
|
|
n = len;
|
|
b = new u_char[add_NUL ? n + 1 : n];
|
|
memcpy(b, str, n);
|
|
final_NUL = add_NUL;
|
|
|
|
if ( add_NUL )
|
|
b[n] = 0;
|
|
|
|
use_free_to_delete = false;
|
|
}
|
|
|
|
void String::Set(std::string_view str) {
|
|
Reset();
|
|
|
|
if ( ! str.empty() ) {
|
|
n = str.size();
|
|
b = new u_char[n + 1];
|
|
memcpy(b, str.data(), n);
|
|
b[n] = 0;
|
|
final_NUL = true;
|
|
use_free_to_delete = false;
|
|
}
|
|
}
|
|
|
|
void String::Set(const String& str) { *this = str; }
|
|
|
|
std::pair<const char*, size_t> String::CheckStringWithSize() const {
|
|
void* nulTerm;
|
|
if ( n == 0 )
|
|
return {"", 0};
|
|
|
|
nulTerm = memchr(b, '\0', n + final_NUL);
|
|
if ( nulTerm != &b[n] ) {
|
|
// Either an embedded NUL, or no final NUL.
|
|
char* exp_s = Render();
|
|
|
|
if ( nulTerm )
|
|
reporter->Error("string with embedded NUL: \"%s\"", exp_s);
|
|
else
|
|
reporter->Error("string without NUL terminator: \"%s\"", exp_s);
|
|
|
|
delete[] exp_s;
|
|
static constexpr const char result[] = "<string-with-NUL>";
|
|
return {result, std::size(result) - 1};
|
|
}
|
|
|
|
return {(const char*)b, n};
|
|
}
|
|
|
|
const char* String::CheckString() const { return CheckStringWithSize().first; }
|
|
|
|
std::string String::ToStdString() const { return {(char*)Bytes(), static_cast<size_t>(Len())}; }
|
|
|
|
std::string_view String::ToStdStringView() const { return {(char*)Bytes(), static_cast<size_t>(Len())}; }
|
|
|
|
char* String::Render(int format, int* len) const {
|
|
// Maximum character expansion is as \xHH, so a factor of 4.
|
|
char* s = new char[n * 4 + 1]; // +1 is for final '\0'
|
|
char* sp = s;
|
|
int tmp_len;
|
|
|
|
for ( int i = 0; i < n; ++i ) {
|
|
if ( b[i] == '\\' && (format & ESC_ESC) ) {
|
|
*sp++ = '\\';
|
|
*sp++ = '\\';
|
|
}
|
|
|
|
else if ( (b[i] == '\'' || b[i] == '"') && (format & ESC_QUOT) ) {
|
|
*sp++ = '\\';
|
|
*sp++ = b[i];
|
|
}
|
|
|
|
else if ( (b[i] < ' ' || b[i] > 126) && (format & ESC_HEX) ) {
|
|
char hex_fmt[16];
|
|
|
|
*sp++ = '\\';
|
|
*sp++ = 'x';
|
|
snprintf(hex_fmt, 16, "%02x", b[i]);
|
|
*sp++ = hex_fmt[0];
|
|
*sp++ = hex_fmt[1];
|
|
}
|
|
|
|
else if ( (b[i] < ' ' || b[i] > 126) && (format & ESC_DOT) ) {
|
|
*sp++ = '.';
|
|
}
|
|
|
|
else {
|
|
*sp++ = b[i];
|
|
}
|
|
}
|
|
|
|
*sp++ = '\0'; // NUL-terminate.
|
|
tmp_len = sp - s;
|
|
|
|
if ( (format & ESC_SER) ) {
|
|
char* result = new char[tmp_len + 16];
|
|
snprintf(result, tmp_len + 16, "%u ", tmp_len - 1);
|
|
tmp_len += strlen(result);
|
|
memcpy(result + strlen(result), s, sp - s);
|
|
delete[] s;
|
|
s = result;
|
|
}
|
|
|
|
if ( len )
|
|
*len = tmp_len;
|
|
|
|
return s;
|
|
}
|
|
|
|
std::ostream& String::Render(std::ostream& os, int format) const {
|
|
char* tmp = Render(format);
|
|
os << tmp;
|
|
delete[] tmp;
|
|
return os;
|
|
}
|
|
|
|
std::istream& String::Read(std::istream& is, int format) {
|
|
if ( (format & String::ESC_SER) ) {
|
|
int len;
|
|
is >> len; // Get the length of the string
|
|
|
|
char c;
|
|
is.read(&c, 1); // Eat single whitespace
|
|
|
|
char* buf = new char[len + 1];
|
|
is.read(buf, len);
|
|
buf[len] = '\0'; // NUL-terminate just for safety
|
|
|
|
Adopt((u_char*)buf, len + 1);
|
|
}
|
|
else {
|
|
std::string str;
|
|
is >> str;
|
|
Set(str);
|
|
}
|
|
|
|
return is;
|
|
}
|
|
|
|
void String::ToUpper() {
|
|
for ( int i = 0; i < n; ++i )
|
|
b[i] = std::toupper(b[i]);
|
|
}
|
|
|
|
String* String::GetSubstring(int start, int len) const {
|
|
// This code used to live in zeek.bif's sub_bytes() routine.
|
|
if ( start < 0 || start > n )
|
|
return nullptr;
|
|
|
|
if ( len < 0 || len > n - start )
|
|
len = n - start;
|
|
|
|
return new String(&b[start], len, true);
|
|
}
|
|
|
|
int String::FindSubstring(const String* s) const { return util::strstr_n(n, b, s->Len(), s->Bytes()); }
|
|
|
|
String::Vec* String::Split(const String::IdxVec& indices) const {
|
|
if ( indices.empty() )
|
|
return nullptr;
|
|
|
|
// Copy input, ensuring space for "0":
|
|
IdxVec idx(1 + indices.size());
|
|
|
|
idx[0] = 0;
|
|
idx.insert(idx.end(), indices.begin(), indices.end());
|
|
|
|
// Sanity checks.
|
|
std::ranges::transform(idx.begin(), idx.end(), idx.begin(), [this](int v) {
|
|
if ( v >= n || v < 0 )
|
|
return 0;
|
|
return v;
|
|
});
|
|
|
|
// Sort it:
|
|
std::ranges::sort(idx);
|
|
|
|
// Shuffle vector so duplicate entries are used only once. "ret" here is the first
|
|
// element after the last unique element. "last" should be the end of the vector.
|
|
auto [ret, last] = std::ranges::unique(idx);
|
|
|
|
// Each element in idx is now the start index of a new
|
|
// substring, and we know that all indices are within [0, n].
|
|
//
|
|
Vec* result = new Vec();
|
|
result->reserve(std::distance(idx.begin(), ret));
|
|
|
|
size_t i = 0;
|
|
for ( IdxVecIt it = idx.begin(); it != ret; ++it, ++i ) {
|
|
int len = (it + 1 == last) ? -1 : idx[i + 1] - idx[i];
|
|
result->push_back(GetSubstring(idx[i], len));
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool StringLenCmp::operator()(String* const& bst1, String* const& bst2) {
|
|
return _increasing ? (bst1->Len() < bst2->Len()) : (bst1->Len() > bst2->Len());
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& os, const String& bs) {
|
|
char* tmp = bs.Render(String::EXPANDED_STRING);
|
|
os << tmp;
|
|
delete[] tmp;
|
|
return os;
|
|
}
|
|
|
|
int Bstr_eq(const String* s1, const String* s2) {
|
|
if ( s1->Len() != s2->Len() )
|
|
return 0;
|
|
|
|
if ( ! s1->Bytes() || ! s2->Bytes() )
|
|
// memcmp() arguments should never be null, so help avoid that
|
|
return s1->Bytes() == s2->Bytes();
|
|
|
|
return memcmp(s1->Bytes(), s2->Bytes(), s1->Len()) == 0;
|
|
}
|
|
|
|
int Bstr_cmp(const String* s1, const String* s2) {
|
|
int n = std::min(s1->Len(), s2->Len());
|
|
// memcmp() arguments should never be null, so help avoid that
|
|
// (assuming that we only ever have null pointers when lengths are zero).
|
|
int cmp = n == 0 ? 0 : memcmp(s1->Bytes(), s2->Bytes(), n);
|
|
|
|
if ( cmp || s1->Len() == s2->Len() )
|
|
return cmp;
|
|
|
|
// Compared equal, but one was shorter than the other. Treat
|
|
// it as less than the other.
|
|
if ( s1->Len() < s2->Len() )
|
|
return -1;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
String* concatenate(std::vector<data_chunk_t>& v) {
|
|
int n = v.size();
|
|
int len = 0;
|
|
int i;
|
|
for ( i = 0; i < n; ++i )
|
|
len += v[i].length;
|
|
|
|
char* data = new char[len + 1];
|
|
|
|
char* b = data;
|
|
for ( i = 0; i < n; ++i ) {
|
|
memcpy(b, v[i].data, v[i].length);
|
|
b += v[i].length;
|
|
}
|
|
|
|
*b = '\0';
|
|
|
|
return new String(true, (byte_vec)data, len);
|
|
}
|
|
|
|
String* concatenate(String::CVec& v) {
|
|
int n = v.size();
|
|
int len = 0;
|
|
int i;
|
|
for ( i = 0; i < n; ++i )
|
|
len += v[i]->Len();
|
|
|
|
char* data = new char[len + 1];
|
|
|
|
char* b = data;
|
|
for ( i = 0; i < n; ++i ) {
|
|
memcpy(b, v[i]->Bytes(), v[i]->Len());
|
|
b += v[i]->Len();
|
|
}
|
|
*b = '\0';
|
|
|
|
return new String(true, (byte_vec)data, len);
|
|
}
|
|
|
|
String* concatenate(String::Vec& v) {
|
|
String::CVec cv;
|
|
std::ranges::copy(v, std::back_inserter<String::CVec>(cv));
|
|
return concatenate(cv);
|
|
}
|
|
|
|
void delete_strings(std::vector<const String*>& v) {
|
|
for ( auto& elem : v )
|
|
delete elem;
|
|
v.clear();
|
|
}
|
|
|
|
} // namespace zeek
|
|
|
|
TEST_SUITE_BEGIN("ZeekString");
|
|
|
|
TEST_CASE("construction") {
|
|
zeek::String s1{};
|
|
CHECK_EQ(s1.Len(), 0);
|
|
CHECK_EQ(s1.Bytes(), nullptr);
|
|
CHECK_EQ(s1, "");
|
|
|
|
std::string text = "abcdef";
|
|
zeek::byte_vec text2 = new u_char[7];
|
|
memcpy(text2, text.c_str(), 7);
|
|
|
|
zeek::String s2{text2, 6, false};
|
|
CHECK_EQ(s2.Len(), 6);
|
|
|
|
zeek::String s3{text2, 6, true};
|
|
CHECK_EQ(s3.Len(), 6);
|
|
|
|
zeek::String s4{"abcdef"};
|
|
CHECK_EQ(s4.Len(), 6);
|
|
|
|
zeek::String s5{std::string("abcdef")};
|
|
CHECK_EQ(s5.Len(), 6);
|
|
|
|
// Test the copy constructor.
|
|
// coverity[copy_instead_of_move]
|
|
zeek::String s6{s5};
|
|
CHECK_EQ(s6.Len(), 6);
|
|
|
|
zeek::String s7{true, text2, 6};
|
|
CHECK_EQ(s7.Len(), 6);
|
|
CHECK_EQ(s7.Bytes(), text2);
|
|
|
|
zeek::byte_vec text3 = new u_char[7];
|
|
memcpy(text3, text.c_str(), 7);
|
|
zeek::String s8{false, text3, 6};
|
|
CHECK_EQ(std::string(s8.CheckString()), "<string-with-NUL>");
|
|
|
|
zeek::byte_vec text4 = new u_char[7];
|
|
memcpy(text4, text.c_str(), 7);
|
|
text4[2] = '\0';
|
|
zeek::String s9{false, text4, 6};
|
|
CHECK_EQ(std::string(s9.CheckString()), "<string-with-NUL>");
|
|
|
|
zeek::byte_vec text5 = (zeek::byte_vec)malloc(7);
|
|
memcpy(text5, text.c_str(), 7);
|
|
zeek::String s10{true, text5, 6};
|
|
s10.SetUseFreeToDelete(1);
|
|
CHECK_EQ(s10.Bytes(), text5);
|
|
|
|
// Test the move constructor.
|
|
zeek::String s11{std::move(s5)};
|
|
CHECK_EQ(s11.Len(), 6);
|
|
}
|
|
|
|
TEST_CASE("set/assignment/comparison") {
|
|
zeek::String s{"abc"};
|
|
CHECK_EQ(s, "abc");
|
|
|
|
s.Set("def");
|
|
CHECK_EQ(s, "def");
|
|
|
|
s.Set(std::string("ghi"));
|
|
CHECK_EQ(s, "ghi");
|
|
|
|
zeek::String s2{"abc"};
|
|
s.Set(s2);
|
|
CHECK_EQ(s, "abc");
|
|
|
|
zeek::String s3{"def"};
|
|
s = s3;
|
|
CHECK_EQ(s, "def");
|
|
CHECK_EQ(s, s3);
|
|
CHECK(s2 < s3);
|
|
|
|
s.Set("ghi");
|
|
CHECK_FALSE(s < s2);
|
|
|
|
std::string text = "abcdef";
|
|
zeek::byte_vec text2 = new u_char[7];
|
|
memcpy(text2, text.c_str(), 7);
|
|
s.Adopt(text2, 7);
|
|
|
|
CHECK_EQ(s, "abcdef");
|
|
CHECK_FALSE(s == s2);
|
|
|
|
// This is a clearly invalid string and we probably shouldn't allow it to be
|
|
// constructed, but this test covers one if statement in Bstr_eq.
|
|
zeek::String s4(false, nullptr, 3);
|
|
CHECK_FALSE(s4 == s2);
|
|
|
|
zeek::String s5{};
|
|
CHECK_LT(s5, s);
|
|
CHECK_FALSE(s < s5);
|
|
|
|
zeek::String s6 = std::move(s3);
|
|
CHECK_EQ(s6, "def");
|
|
}
|
|
|
|
TEST_CASE("searching/modification") {
|
|
zeek::String s{"this is a test"};
|
|
auto* ss = s.GetSubstring(5, 4);
|
|
CHECK_EQ(*ss, "is a");
|
|
delete ss;
|
|
|
|
auto* ss2 = s.GetSubstring(-1, 4);
|
|
CHECK_EQ(ss2, nullptr);
|
|
ss2 = s.GetSubstring(s.Len() + 5, 4);
|
|
CHECK_EQ(ss2, nullptr);
|
|
|
|
zeek::String s2{"test"};
|
|
CHECK_EQ(s.FindSubstring(&s2), 10);
|
|
|
|
s2.ToUpper();
|
|
CHECK_EQ(s2, "TEST");
|
|
|
|
zeek::String::IdxVec indexes;
|
|
zeek::String::Vec* splits = s.Split(indexes);
|
|
CHECK_EQ(splits, nullptr);
|
|
|
|
indexes.insert(indexes.end(), {4, 7, 9, -1, 30});
|
|
splits = s.Split(indexes);
|
|
CHECK_EQ(splits->size(), 4);
|
|
CHECK_EQ(*(splits->at(0)), "this");
|
|
CHECK_EQ(*(splits->at(1)), " is");
|
|
CHECK_EQ(*(splits->at(2)), " a");
|
|
CHECK_EQ(*(splits->at(3)), " test");
|
|
|
|
zeek::String* s3 = concatenate(*splits);
|
|
CHECK_EQ(s.Len(), s3->Len());
|
|
CHECK_EQ(s, *s3);
|
|
delete s3;
|
|
|
|
for ( auto& spl : *splits )
|
|
delete spl;
|
|
delete splits;
|
|
}
|
|
|
|
TEST_CASE("rendering") {
|
|
zeek::String s1("\\abcd\'\"");
|
|
auto* r = s1.Render(zeek::String::ESC_ESC);
|
|
CHECK_EQ(std::string(r), "\\\\abcd\'\"");
|
|
delete[] r;
|
|
|
|
r = s1.Render(zeek::String::ESC_QUOT);
|
|
CHECK_EQ(std::string(r), "\\abcd\\\'\\\"");
|
|
delete[] r;
|
|
|
|
r = s1.Render(zeek::String::ESC_ESC | zeek::String::ESC_QUOT | zeek::String::ESC_SER);
|
|
CHECK_EQ(std::string(r), "10 \\\\abcd\\\'\\\"");
|
|
delete[] r;
|
|
|
|
zeek::byte_vec text = new u_char[6];
|
|
text[0] = 3;
|
|
text[1] = 4;
|
|
text[2] = 5;
|
|
text[3] = 6;
|
|
text[4] = '\\';
|
|
text[5] = '\'';
|
|
zeek::String s2(false, text, 6);
|
|
|
|
r = s2.Render(zeek::String::ESC_HEX);
|
|
CHECK_EQ(std::string(r), "\\x03\\x04\\x05\\x06\\\'");
|
|
delete[] r;
|
|
|
|
int test_length = 0;
|
|
r = s2.Render(zeek::String::ESC_DOT, &test_length);
|
|
CHECK_EQ(std::string(r), "....\\\'");
|
|
CHECK_EQ(test_length, 7);
|
|
delete[] r;
|
|
|
|
r = s2.Render(zeek::String::ZEEK_STRING_LITERAL);
|
|
CHECK_EQ(std::string(r), "\\x03\\x04\\x05\\x06\\\\\\\'");
|
|
delete[] r;
|
|
|
|
std::ostringstream os1;
|
|
// This uses ESC_HEX, so it should be the same as the test above
|
|
os1 << s2;
|
|
CHECK_EQ(os1.str(), "\\x03\\x04\\x05\\x06\\\'");
|
|
|
|
std::ostringstream os2;
|
|
s2.Render(os2, zeek::String::ESC_HEX);
|
|
CHECK_EQ(os2.str(), "\\x03\\x04\\x05\\x06\\\'");
|
|
}
|
|
|
|
TEST_CASE("read") {
|
|
std::string text1("5 abcde");
|
|
std::istringstream iss1(text1);
|
|
zeek::String s1{};
|
|
s1.Read(iss1);
|
|
CHECK_EQ(s1, "abcde");
|
|
|
|
std::string text2("abcde");
|
|
std::istringstream iss2(text2);
|
|
zeek::String s2{};
|
|
// Setting to something else disables reading the serialization format
|
|
s2.Read(iss2, zeek::String::ESC_HEX);
|
|
CHECK_EQ(s2, text2);
|
|
}
|
|
|
|
TEST_CASE("misc") {
|
|
std::vector<const zeek::String*> sv = {new zeek::String{}, new zeek::String{}};
|
|
CHECK_EQ(sv.size(), 2);
|
|
zeek::delete_strings(sv);
|
|
CHECK_EQ(sv.size(), 0);
|
|
|
|
std::vector<zeek::data_chunk_t> dv = {{5, "abcde"}, {6, "fghijk"}};
|
|
auto* s = zeek::concatenate(dv);
|
|
CHECK_EQ(*s, "abcdefghijk");
|
|
delete s;
|
|
|
|
std::vector<zeek::String*> sv2 = {new zeek::String{"abcde"}, new zeek::String{"fghi"}};
|
|
std::ranges::sort(sv2, zeek::StringLenCmp(true));
|
|
CHECK_EQ(*(sv2.front()), "fghi");
|
|
CHECK_EQ(*(sv2.back()), "abcde");
|
|
|
|
std::ranges::sort(sv2, zeek::StringLenCmp(false));
|
|
CHECK_EQ(*(sv2.front()), "abcde");
|
|
CHECK_EQ(*(sv2.back()), "fghi");
|
|
|
|
for ( auto* entry : sv2 )
|
|
delete entry;
|
|
}
|
|
|
|
TEST_SUITE_END();
|