mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 14:48:21 +00:00
Merge remote-tracking branch 'origin/topic/dev/non-ascii-logging'
* origin/topic/dev/non-ascii-logging: Removed Policy Script for UTF-8 Logs; Commented out UTF-8 Script in Test All Policy; Minor Style Tweak; Use getNumBytesForUTF8 method to determine number of bytes; Added Jon's test cases as unit tests; Prioritizes escaping predefined Escape Sequences over Unescaping UTF-8 Sequences; Added additional check to confirm anything being unescaped is a multibyte UTF-8 sequence, addressing the test case Jon brought up; Added optional script and redef bool to enable UTF-8 in ASCII logs; Initial Commit, removed std::isprint check to escape. Made minor code format and logic adjustments during merge.
This commit is contained in:
commit
d1770853b3
16 changed files with 181 additions and 3 deletions
7
CHANGES
7
CHANGES
|
@ -1,4 +1,11 @@
|
||||||
|
|
||||||
|
2.6-723 | 2019-07-30 19:36:56 -0700
|
||||||
|
|
||||||
|
* Add LogAscii::enable_utf_8 option (Dev Bali, Corelight)
|
||||||
|
|
||||||
|
This option allows valid UTF-8 sequences to be written directly
|
||||||
|
into the ASCII logs without any escaping.
|
||||||
|
|
||||||
2.6-713 | 2019-07-30 18:12:49 +0000
|
2.6-713 | 2019-07-30 18:12:49 +0000
|
||||||
|
|
||||||
* Fix memory leaks in expire_func introduced by recent changes (Jon
|
* Fix memory leaks in expire_func introduced by recent changes (Jon
|
||||||
|
|
3
NEWS
3
NEWS
|
@ -162,6 +162,9 @@ New Functionality
|
||||||
local three = make_adder(3);
|
local three = make_adder(3);
|
||||||
print three(5); # prints 8
|
print three(5); # prints 8
|
||||||
|
|
||||||
|
- Add ``LogAscii::enable_utf_8`` option to allow valid UTF-8 sequences
|
||||||
|
to be written directly into the ASCII logs without any escaping.
|
||||||
|
|
||||||
Changed Functionality
|
Changed Functionality
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
||||||
2.6-713
|
2.6-723
|
||||||
|
|
2
doc
2
doc
|
@ -1 +1 @@
|
||||||
Subproject commit bb2f06ff8e3b6ae1f362ad4b56da4ef2ebb12d0c
|
Subproject commit f3b78d63696c3145144fbd8cfd82cdae15a13e98
|
|
@ -26,6 +26,12 @@ export {
|
||||||
## This option is also available as a per-filter ``$config`` option.
|
## This option is also available as a per-filter ``$config`` option.
|
||||||
const use_json = F &redef;
|
const use_json = F &redef;
|
||||||
|
|
||||||
|
## If true, valid UTF-8 sequences will pass through unescaped and be
|
||||||
|
## written into logs.
|
||||||
|
##
|
||||||
|
## This option is also available as a per-filter ``$config`` option.
|
||||||
|
const enable_utf_8 = F &redef;
|
||||||
|
|
||||||
## Define the gzip level to compress the logs. If 0, then no gzip
|
## Define the gzip level to compress the logs. If 0, then no gzip
|
||||||
## compression is performed. Enabling compression also changes
|
## compression is performed. Enabling compression also changes
|
||||||
## the log file name extension to include ".gz".
|
## the log file name extension to include ".gz".
|
||||||
|
|
39
src/Desc.cc
39
src/Desc.cc
|
@ -10,6 +10,8 @@
|
||||||
#include "File.h"
|
#include "File.h"
|
||||||
#include "Reporter.h"
|
#include "Reporter.h"
|
||||||
|
|
||||||
|
#include "ConvertUTF.h"
|
||||||
|
|
||||||
#define DEFAULT_SIZE 128
|
#define DEFAULT_SIZE 128
|
||||||
#define SLOP 10
|
#define SLOP 10
|
||||||
|
|
||||||
|
@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
|
||||||
include_stats = 0;
|
include_stats = 0;
|
||||||
indent_with_spaces = 0;
|
indent_with_spaces = 0;
|
||||||
escape = false;
|
escape = false;
|
||||||
|
utf8 = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
ODesc::~ODesc()
|
ODesc::~ODesc()
|
||||||
|
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
|
||||||
escape = true;
|
escape = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sets the utf8 flag on this ODesc. The flag is declared as "whether valid
// utf-8 sequences may pass through unescaped" and is consulted by
// FirstEscapeLoc() when deciding whether a non-printable byte must be escaped.
void ODesc::EnableUTF8 ()
|
||||||
|
{
|
||||||
|
utf8 = true;
|
||||||
|
}
|
||||||
|
|
||||||
void ODesc::PushIndent()
|
void ODesc::PushIndent()
|
||||||
{
|
{
|
||||||
++indent_level;
|
++indent_level;
|
||||||
|
@ -258,13 +266,42 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
|
||||||
|
|
||||||
for ( size_t i = 0; i < n; ++i )
|
for ( size_t i = 0; i < n; ++i )
|
||||||
{
|
{
|
||||||
if ( ! isprint(bytes[i]) || bytes[i] == '\\' )
|
auto printable = isprint(bytes[i]);
|
||||||
|
|
||||||
|
if ( ! printable && ! utf8 )
|
||||||
|
return escape_pos(bytes + i, 1);
|
||||||
|
|
||||||
|
if ( bytes[i] == '\\' )
|
||||||
return escape_pos(bytes + i, 1);
|
return escape_pos(bytes + i, 1);
|
||||||
|
|
||||||
size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);
|
size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);
|
||||||
|
|
||||||
if ( len )
|
if ( len )
|
||||||
return escape_pos(bytes + i, len);
|
return escape_pos(bytes + i, len);
|
||||||
|
|
||||||
|
if ( ! printable && utf8 )
|
||||||
|
{
|
||||||
|
size_t utf_found = getNumBytesForUTF8(bytes[i]);
|
||||||
|
|
||||||
|
if ( utf_found == 1 )
|
||||||
|
return escape_pos(bytes + i, 1);
|
||||||
|
|
||||||
|
if ( i + utf_found > n )
|
||||||
|
// Don't know if this is even meant to be a utf8 encoding,
|
||||||
|
// since there's not enough bytes left to check it's a valid
|
||||||
|
// sequence, so maybe safest to just move up by one instead
|
||||||
|
// of escaping the entire remainder.
|
||||||
|
return escape_pos(bytes + i, 1);
|
||||||
|
|
||||||
|
if ( isLegalUTF8Sequence(reinterpret_cast<const unsigned char *>(bytes + i),
|
||||||
|
reinterpret_cast<const unsigned char *>(bytes + i + utf_found)) )
|
||||||
|
{
|
||||||
|
i += utf_found - 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
return escape_pos(bytes + i, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return escape_pos(0, 0);
|
return escape_pos(0, 0);
|
||||||
|
|
|
@ -54,6 +54,7 @@ public:
|
||||||
void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; }
|
void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; }
|
||||||
|
|
||||||
void EnableEscaping();
|
void EnableEscaping();
|
||||||
|
void EnableUTF8();
|
||||||
void AddEscapeSequence(const char* s) { escape_sequences.insert(s); }
|
void AddEscapeSequence(const char* s) { escape_sequences.insert(s); }
|
||||||
void AddEscapeSequence(const char* s, size_t n)
|
void AddEscapeSequence(const char* s, size_t n)
|
||||||
{ escape_sequences.insert(string(s, n)); }
|
{ escape_sequences.insert(string(s, n)); }
|
||||||
|
@ -185,6 +186,7 @@ protected:
|
||||||
unsigned int offset; // where we are in the buffer
|
unsigned int offset; // where we are in the buffer
|
||||||
unsigned int size; // size of buffer in bytes
|
unsigned int size; // size of buffer in bytes
|
||||||
|
|
||||||
|
bool utf8; // whether valid utf-8 sequences may pass through unescaped
|
||||||
bool escape; // escape unprintable characters in output?
|
bool escape; // escape unprintable characters in output?
|
||||||
typedef set<string> escape_set;
|
typedef set<string> escape_set;
|
||||||
escape_set escape_sequences; // additional sequences of chars to escape
|
escape_set escape_sequences; // additional sequences of chars to escape
|
||||||
|
|
|
@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend)
|
||||||
include_meta = false;
|
include_meta = false;
|
||||||
tsv = false;
|
tsv = false;
|
||||||
use_json = false;
|
use_json = false;
|
||||||
|
enable_utf_8 = false;
|
||||||
formatter = 0;
|
formatter = 0;
|
||||||
gzip_level = 0;
|
gzip_level = 0;
|
||||||
gzfile = nullptr;
|
gzfile = nullptr;
|
||||||
|
@ -36,6 +37,7 @@ void Ascii::InitConfigOptions()
|
||||||
output_to_stdout = BifConst::LogAscii::output_to_stdout;
|
output_to_stdout = BifConst::LogAscii::output_to_stdout;
|
||||||
include_meta = BifConst::LogAscii::include_meta;
|
include_meta = BifConst::LogAscii::include_meta;
|
||||||
use_json = BifConst::LogAscii::use_json;
|
use_json = BifConst::LogAscii::use_json;
|
||||||
|
enable_utf_8 = BifConst::LogAscii::enable_utf_8;
|
||||||
gzip_level = BifConst::LogAscii::gzip_level;
|
gzip_level = BifConst::LogAscii::gzip_level;
|
||||||
|
|
||||||
separator.assign(
|
separator.assign(
|
||||||
|
@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
else if ( strcmp(i->first, "enable_utf_8") == 0 )
|
||||||
|
{
|
||||||
|
if ( strcmp(i->second, "T") == 0 )
|
||||||
|
enable_utf_8 = true;
|
||||||
|
else if ( strcmp(i->second, "F") == 0 )
|
||||||
|
enable_utf_8 = false;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Error("invalid value for 'enable_utf_8', must be a string and either \"T\" or \"F\"");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
else if ( strcmp(i->first, "output_to_stdout") == 0 )
|
else if ( strcmp(i->first, "output_to_stdout") == 0 )
|
||||||
{
|
{
|
||||||
if ( strcmp(i->second, "T") == 0 )
|
if ( strcmp(i->second, "T") == 0 )
|
||||||
|
@ -181,6 +196,10 @@ bool Ascii::InitFormatter()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// Enable utf-8 if needed
|
||||||
|
if ( enable_utf_8 )
|
||||||
|
desc.EnableUTF8();
|
||||||
|
|
||||||
// Use the default "Bro logs" format.
|
// Use the default "Bro logs" format.
|
||||||
desc.EnableEscaping();
|
desc.EnableEscaping();
|
||||||
desc.AddEscapeSequence(separator);
|
desc.AddEscapeSequence(separator);
|
||||||
|
|
|
@ -65,6 +65,7 @@ private:
|
||||||
|
|
||||||
int gzip_level; // level > 0 enables gzip compression
|
int gzip_level; // level > 0 enables gzip compression
|
||||||
bool use_json;
|
bool use_json;
|
||||||
|
bool enable_utf_8;
|
||||||
string json_timestamps;
|
string json_timestamps;
|
||||||
|
|
||||||
threading::formatter::Formatter* formatter;
|
threading::formatter::Formatter* formatter;
|
||||||
|
|
|
@ -11,5 +11,6 @@ const set_separator: string;
|
||||||
const empty_field: string;
|
const empty_field: string;
|
||||||
const unset_field: string;
|
const unset_field: string;
|
||||||
const use_json: bool;
|
const use_json: bool;
|
||||||
|
const enable_utf_8: bool;
|
||||||
const json_timestamps: JSON::TimestampFormat;
|
const json_timestamps: JSON::TimestampFormat;
|
||||||
const gzip_level: count;
|
const gzip_level: count;
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
#separator \x09
|
||||||
|
#set_separator ,
|
||||||
|
#empty_field (empty)
|
||||||
|
#unset_field -
|
||||||
|
#path test
|
||||||
|
#open 2019-07-23-11-40-47
|
||||||
|
#fields s
|
||||||
|
#types string
|
||||||
|
foo \x0a\x09\x00 bar
|
||||||
|
#close 2019-07-23-11-40-47
|
|
@ -0,0 +1,10 @@
|
||||||
|
#separator \x09
|
||||||
|
#set_separator \xc2\xae
|
||||||
|
#empty_field (empty)
|
||||||
|
#unset_field -
|
||||||
|
#path test
|
||||||
|
#open 2019-07-23-11-46-43
|
||||||
|
#fields ss
|
||||||
|
#types set[string]
|
||||||
|
\xc2\xae
|
||||||
|
#close 2019-07-23-11-46-43
|
|
@ -0,0 +1,12 @@
|
||||||
|
#separator \x09
|
||||||
|
#set_separator ,
|
||||||
|
#empty_field (empty)
|
||||||
|
#unset_field -
|
||||||
|
#path test
|
||||||
|
#open 2019-07-22-10-13-09
|
||||||
|
#fields s
|
||||||
|
#types string
|
||||||
|
foo ® bar
|
||||||
|
दुनिया को नमस्ते
|
||||||
|
hello 𠜎
|
||||||
|
#close 2019-07-22-10-13-09
|
|
@ -0,0 +1,21 @@
|
||||||
|
#
|
||||||
|
# @TEST-EXEC: zeek -b %INPUT
|
||||||
|
# @TEST-EXEC: btest-diff test.log
|
||||||
|
|
||||||
|
redef LogAscii::enable_utf_8 = T;
|
||||||
|
|
||||||
|
module Test;
|
||||||
|
export {
|
||||||
|
redef enum Log::ID += { LOG };
|
||||||
|
|
||||||
|
type Log: record {
|
||||||
|
s: string;
|
||||||
|
} &log;
|
||||||
|
}
|
||||||
|
|
||||||
|
event zeek_init()
|
||||||
|
{
|
||||||
|
local a = "foo \n\t\0 bar";
|
||||||
|
Log::create_stream(Test::LOG, [$columns=Log]);
|
||||||
|
Log::write(Test::LOG, [$s=a]);
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
#
|
||||||
|
# @TEST-EXEC: zeek -b %INPUT
|
||||||
|
# @TEST-EXEC: btest-diff test.log
|
||||||
|
|
||||||
|
redef LogAscii::enable_utf_8 = T;
|
||||||
|
|
||||||
|
redef LogAscii::set_separator = "\xc2\xae";
|
||||||
|
|
||||||
|
module Test;
|
||||||
|
|
||||||
|
export {
|
||||||
|
redef enum Log::ID += { LOG };
|
||||||
|
|
||||||
|
type Log: record {
|
||||||
|
ss: set[string];
|
||||||
|
} &log;
|
||||||
|
}
|
||||||
|
|
||||||
|
event zeek_init()
|
||||||
|
{
|
||||||
|
Log::create_stream(Test::LOG, [$columns=Log]);
|
||||||
|
Log::write(Test::LOG, [$ss=set("\xc2\xae")]);
|
||||||
|
}
|
|
@ -0,0 +1,26 @@
|
||||||
|
#
|
||||||
|
# @TEST-EXEC: zeek -b %INPUT
|
||||||
|
# @TEST-EXEC: btest-diff test.log
|
||||||
|
|
||||||
|
redef LogAscii::enable_utf_8 = T;
|
||||||
|
|
||||||
|
module Test;
|
||||||
|
export {
|
||||||
|
redef enum Log::ID += { LOG };
|
||||||
|
|
||||||
|
type Log: record {
|
||||||
|
s: string;
|
||||||
|
} &log;
|
||||||
|
}
|
||||||
|
|
||||||
|
event zeek_init()
|
||||||
|
{
|
||||||
|
local a = "foo \xc2\xae bar"; # 2 bytes
|
||||||
|
local b = "दुनिया को नमस्ते"; # Hindi characters are 3 byte utf-8
|
||||||
|
local c = "hello 𠜎"; # A 4 byte Chinese character
|
||||||
|
|
||||||
|
Log::create_stream(Test::LOG, [$columns=Log]);
|
||||||
|
Log::write(Test::LOG, [$s=a]);
|
||||||
|
Log::write(Test::LOG, [$s=b]);
|
||||||
|
Log::write(Test::LOG, [$s=c]);
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue