mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
Merge remote-tracking branch 'origin/topic/seth/tsv-logs-utf8-by-default'
* origin/topic/seth/tsv-logs-utf8-by-default:
  - Fix mis-usage of string::append that leads to an overflow
  - Use json_escape_utf8 for all utf8 data in ODesc
  - Switch the TSV Zeek logs to be UTF8 by default.
This commit is contained in:
commit
b7e264f8ef
10 changed files with 61 additions and 45 deletions
10
CHANGES
10
CHANGES
|
@ -1,3 +1,13 @@
|
||||||
|
4.2.0-dev.154 | 2021-09-08 12:03:51 -0700
|
||||||
|
|
||||||
|
* Fix mis-usage of string::append that leads to an overflow (Tim Wojtulewicz, Corelight)
|
||||||
|
|
||||||
|
* Use json_escape_utf8 for all utf8 data in ODesc (Tim Wojtulewicz, Corelight)
|
||||||
|
|
||||||
|
* Switch the TSV Zeek logs to be UTF8 by default. (Seth Hall, Corelight)
|
||||||
|
|
||||||
|
There is a paired zeek-testing branch for some updates there.
|
||||||
|
|
||||||
4.2.0-dev.150 | 2021-09-08 11:44:15 -0700
|
4.2.0-dev.150 | 2021-09-08 11:44:15 -0700
|
||||||
|
|
||||||
* fix race condition in btest output ordering (Vern Paxson, Corelight)
|
* fix race condition in btest output ordering (Vern Paxson, Corelight)
|
||||||
|
|
4
NEWS
4
NEWS
|
@ -12,6 +12,10 @@ New Functionality
|
||||||
Changed Functionality
|
Changed Functionality
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
- The traditional TSV Zeek logs are now valid UTF8 by default. It's possible
|
||||||
|
to revert to the previous behavior by setting ``LogAscii::enable_utf_8`` to
|
||||||
|
false.
|
||||||
|
|
||||||
Removed Functionality
|
Removed Functionality
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
||||||
4.2.0-dev.150
|
4.2.0-dev.154
|
||||||
|
|
|
@ -38,7 +38,7 @@ export {
|
||||||
## written into logs.
|
## written into logs.
|
||||||
##
|
##
|
||||||
## This option is also available as a per-filter ``$config`` option.
|
## This option is also available as a per-filter ``$config`` option.
|
||||||
const enable_utf_8 = F &redef;
|
const enable_utf_8 = T &redef;
|
||||||
|
|
||||||
## Define the gzip level to compress the logs. If 0, then no gzip
|
## Define the gzip level to compress the logs. If 0, then no gzip
|
||||||
## compression is performed. Enabling compression also changes
|
## compression is performed. Enabling compression also changes
|
||||||
|
|
57
src/Desc.cc
57
src/Desc.cc
|
@ -251,11 +251,8 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end)
|
||||||
if ( escape_sequences.empty() )
|
if ( escape_sequences.empty() )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
escape_set::const_iterator it;
|
for ( const auto& esc_str : escape_sequences )
|
||||||
|
|
||||||
for ( it = escape_sequences.begin(); it != escape_sequences.end(); ++it )
|
|
||||||
{
|
{
|
||||||
const std::string& esc_str = *it;
|
|
||||||
size_t esc_len = esc_str.length();
|
size_t esc_len = esc_str.length();
|
||||||
|
|
||||||
if ( start + esc_len > end )
|
if ( start + esc_len > end )
|
||||||
|
@ -289,33 +286,9 @@ std::pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n
|
||||||
|
|
||||||
if ( len )
|
if ( len )
|
||||||
return escape_pos(bytes + i, len);
|
return escape_pos(bytes + i, len);
|
||||||
|
|
||||||
if ( ! printable && utf8 )
|
|
||||||
{
|
|
||||||
size_t utf_found = getNumBytesForUTF8(bytes[i]);
|
|
||||||
|
|
||||||
if ( utf_found == 1 )
|
|
||||||
return escape_pos(bytes + i, 1);
|
|
||||||
|
|
||||||
if ( i + utf_found > n )
|
|
||||||
// Don't know if this is even meant to be a utf8 encoding,
|
|
||||||
// since there's not enough bytes left to check it's a valid
|
|
||||||
// sequence, so maybe safest to just move up by one instead
|
|
||||||
// of escaping the entire remainder.
|
|
||||||
return escape_pos(bytes + i, 1);
|
|
||||||
|
|
||||||
if ( isLegalUTF8Sequence(reinterpret_cast<const unsigned char *>(bytes + i),
|
|
||||||
reinterpret_cast<const unsigned char *>(bytes + i + utf_found)) )
|
|
||||||
{
|
|
||||||
i += utf_found - 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
return escape_pos(bytes + i, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return escape_pos(0, 0);
|
return escape_pos(nullptr, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ODesc::AddBytes(const void* bytes, unsigned int n)
|
void ODesc::AddBytes(const void* bytes, unsigned int n)
|
||||||
|
@ -331,17 +304,31 @@ void ODesc::AddBytes(const void* bytes, unsigned int n)
|
||||||
|
|
||||||
while ( s < e )
|
while ( s < e )
|
||||||
{
|
{
|
||||||
std::pair<const char*, size_t> p = FirstEscapeLoc(s, e - s);
|
auto [ esc_start, esc_len ] = FirstEscapeLoc(s, e - s);
|
||||||
|
|
||||||
if ( p.first )
|
if ( esc_start != nullptr )
|
||||||
{
|
{
|
||||||
AddBytesRaw(s, p.first - s);
|
if ( utf8 )
|
||||||
util::get_escaped_string(this, p.first, p.second, true);
|
{
|
||||||
s = p.first + p.second;
|
std::string result = util::json_escape_utf8(s, esc_start - s, false);
|
||||||
|
AddBytesRaw(result.c_str(), result.size());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
AddBytesRaw(s, esc_start - s);
|
||||||
|
|
||||||
|
util::get_escaped_string(this, esc_start, esc_len, true);
|
||||||
|
s = esc_start + esc_len;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
AddBytesRaw(s, e - s);
|
if ( utf8 )
|
||||||
|
{
|
||||||
|
std::string result = util::json_escape_utf8(s, e - s, false);
|
||||||
|
AddBytesRaw(result.c_str(), result.size());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
AddBytesRaw(s, e - s);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
14
src/util.cc
14
src/util.cc
|
@ -2439,10 +2439,14 @@ static bool check_ok_utf8(const unsigned char* start, const unsigned char* end)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
string json_escape_utf8(const string& val)
|
string json_escape_utf8(const string& val, bool escape_printable_controls)
|
||||||
{
|
{
|
||||||
auto val_data = reinterpret_cast<const unsigned char*>(val.c_str());
|
return json_escape_utf8(val.c_str(), val.size(), escape_printable_controls);
|
||||||
auto val_size = val.length();
|
}
|
||||||
|
|
||||||
|
string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls)
|
||||||
|
{
|
||||||
|
auto val_data = reinterpret_cast<const unsigned char*>(val);
|
||||||
|
|
||||||
// Reserve at least the size of the existing string to avoid resizing the string in the best-case
|
// Reserve at least the size of the existing string to avoid resizing the string in the best-case
|
||||||
// scenario where we don't have any multi-byte characters. We keep two versions of this string:
|
// scenario where we don't have any multi-byte characters. We keep two versions of this string:
|
||||||
|
@ -2464,7 +2468,7 @@ string json_escape_utf8(const string& val)
|
||||||
// Normal ASCII characters plus a few of the control characters can be inserted directly. The
|
// Normal ASCII characters plus a few of the control characters can be inserted directly. The
|
||||||
// rest of the control characters should be escaped as regular bytes.
|
// rest of the control characters should be escaped as regular bytes.
|
||||||
if ( ( ch >= 32 && ch < 127 ) ||
|
if ( ( ch >= 32 && ch < 127 ) ||
|
||||||
ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' )
|
( escape_printable_controls && ( ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t' ) ) )
|
||||||
{
|
{
|
||||||
if ( ! found_bad )
|
if ( ! found_bad )
|
||||||
utf_result.push_back(ch);
|
utf_result.push_back(ch);
|
||||||
|
@ -2504,7 +2508,7 @@ string json_escape_utf8(const string& val)
|
||||||
{
|
{
|
||||||
for ( unsigned int i = 0; i < char_size; i++ )
|
for ( unsigned int i = 0; i < char_size; i++ )
|
||||||
escaped_result.append(json_escape_byte(val[idx+i]));
|
escaped_result.append(json_escape_byte(val[idx+i]));
|
||||||
utf_result.append(val, idx, char_size);
|
utf_result.append(val+idx, char_size);
|
||||||
idx += char_size;
|
idx += char_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
11
src/util.h
11
src/util.h
|
@ -550,7 +550,16 @@ char* zeekenv(const char* name);
|
||||||
* @param val the input string to be escaped
|
* @param val the input string to be escaped
|
||||||
* @return the escaped string
|
* @return the escaped string
|
||||||
*/
|
*/
|
||||||
std::string json_escape_utf8(const std::string& val);
|
std::string json_escape_utf8(const std::string& val, bool escape_printable_controls=true);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Escapes bytes in a string that are not valid UTF8 characters with \xYY format. Used
|
||||||
|
* by the JSON writer and BIF methods.
|
||||||
|
* @param val the character data to be escaped
|
||||||
|
* @param val_size the length of the character data
|
||||||
|
* @return the escaped string
|
||||||
|
*/
|
||||||
|
std::string json_escape_utf8(const char* val, size_t val_size, bool escape_printable_controls=true);
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
} // namespace zeek
|
} // namespace zeek
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
# @TEST-EXEC: btest-diff test.log
|
# @TEST-EXEC: btest-diff test.log
|
||||||
# @TEST-EXEC: btest-diff output
|
# @TEST-EXEC: btest-diff output
|
||||||
|
|
||||||
|
redef LogAscii::enable_utf_8 = F;
|
||||||
|
|
||||||
module Test;
|
module Test;
|
||||||
|
|
||||||
export {
|
export {
|
||||||
|
|
2
testing/external/commit-hash.zeek-testing
vendored
2
testing/external/commit-hash.zeek-testing
vendored
|
@ -1 +1 @@
|
||||||
6816110d6af23562c837298ef8b65cb8a9d5b73a
|
5b2a6d78f789f1271b68123875ef66eaaba6f3e8
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
4b88837c49ade5d9fd980d5e6cf02ec91d19a3bb
|
44cc696ed070bf7569848437ab1368d557ace4e5
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue