Merge remote-tracking branch 'origin/topic/dev/non-ascii-logging'

* origin/topic/dev/non-ascii-logging:
  Removed Policy Script for UTF-8 Logs
  Commented out UTF-8 Script in Test All Policy
  Minor Style Tweak
  Use getNumBytesForUTF8 method to determine number of bytes
  Added Jon's test cases as unit tests
  Prioritizes escaping predefined Escape Sequences over Unescaping UTF-8 Sequences
  Added additional check to confirm anything unescaping is a multibyte UTF-8 sequence, addressing the test case Jon brought up
  Added optional script and redef bool to enable utf-8 in ASCII logs
  Initial Commit, removed std::isprint check to escape

Made minor code format and logic adjustments during merge.
This commit is contained in:
Jon Siwek 2019-07-30 19:36:56 -07:00
commit d1770853b3
16 changed files with 181 additions and 3 deletions

View file

@ -1,4 +1,11 @@
2.6-723 | 2019-07-30 19:36:56 -0700
* Add LogAscii::enable_utf_8 option (Dev Bali, Corelight)
This option allows valid UTF-8 sequences to be written directly
into the ASCII logs without any escaping.
2.6-713 | 2019-07-30 18:12:49 +0000 2.6-713 | 2019-07-30 18:12:49 +0000
* Fix memory leaks in expire_func introduced by recent changes (Jon * Fix memory leaks in expire_func introduced by recent changes (Jon

3
NEWS
View file

@ -162,6 +162,9 @@ New Functionality
local three = make_adder(3); local three = make_adder(3);
print three(5); # prints 8 print three(5); # prints 8
- Add ``LogAscii::enable_utf_8`` option to allow valid UTF-8 sequences
to be written directly into the ASCII logs without any escaping.
Changed Functionality Changed Functionality
--------------------- ---------------------

View file

@ -1 +1 @@
2.6-713 2.6-723

2
doc

@ -1 +1 @@
Subproject commit bb2f06ff8e3b6ae1f362ad4b56da4ef2ebb12d0c Subproject commit f3b78d63696c3145144fbd8cfd82cdae15a13e98

View file

@ -26,6 +26,12 @@ export {
## This option is also available as a per-filter ``$config`` option. ## This option is also available as a per-filter ``$config`` option.
const use_json = F &redef; const use_json = F &redef;
## If true, valid UTF-8 sequences will pass through unescaped and be
## written into logs.
##
## This option is also available as a per-filter ``$config`` option.
const enable_utf_8 = F &redef;
## Define the gzip level to compress the logs. If 0, then no gzip ## Define the gzip level to compress the logs. If 0, then no gzip
## compression is performed. Enabling compression also changes ## compression is performed. Enabling compression also changes
## the log file name extension to include ".gz". ## the log file name extension to include ".gz".

View file

@ -10,6 +10,8 @@
#include "File.h" #include "File.h"
#include "Reporter.h" #include "Reporter.h"
#include "ConvertUTF.h"
#define DEFAULT_SIZE 128 #define DEFAULT_SIZE 128
#define SLOP 10 #define SLOP 10
@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
include_stats = 0; include_stats = 0;
indent_with_spaces = 0; indent_with_spaces = 0;
escape = false; escape = false;
utf8 = false;
} }
ODesc::~ODesc() ODesc::~ODesc()
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
escape = true; escape = true;
} }
// Switches this descriptor into UTF-8 pass-through mode by setting the
// utf8 flag. FirstEscapeLoc() consults that flag: when it is set, a
// non-printable byte that begins a legal multibyte UTF-8 sequence is
// skipped over instead of being reported as a position to escape.
void ODesc::EnableUTF8()
	{
	utf8 = true;
	}
void ODesc::PushIndent() void ODesc::PushIndent()
{ {
++indent_level; ++indent_level;
@ -258,13 +266,42 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
for ( size_t i = 0; i < n; ++i ) for ( size_t i = 0; i < n; ++i )
{ {
if ( ! isprint(bytes[i]) || bytes[i] == '\\' ) auto printable = isprint(bytes[i]);
if ( ! printable && ! utf8 )
return escape_pos(bytes + i, 1);
if ( bytes[i] == '\\' )
return escape_pos(bytes + i, 1); return escape_pos(bytes + i, 1);
size_t len = StartsWithEscapeSequence(bytes + i, bytes + n); size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);
if ( len ) if ( len )
return escape_pos(bytes + i, len); return escape_pos(bytes + i, len);
if ( ! printable && utf8 )
{
size_t utf_found = getNumBytesForUTF8(bytes[i]);
if ( utf_found == 1 )
return escape_pos(bytes + i, 1);
if ( i + utf_found > n )
// Don't know if this is even meant to be a utf8 encoding,
// since there's not enough bytes left to check it's a valid
// sequence, so maybe safest to just move up by one instead
// of escaping the entire remainder.
return escape_pos(bytes + i, 1);
if ( isLegalUTF8Sequence(reinterpret_cast<const unsigned char *>(bytes + i),
reinterpret_cast<const unsigned char *>(bytes + i + utf_found)) )
{
i += utf_found - 1;
continue;
}
return escape_pos(bytes + i, 1);
}
} }
return escape_pos(0, 0); return escape_pos(0, 0);

View file

@ -54,6 +54,7 @@ public:
void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; } void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; }
void EnableEscaping(); void EnableEscaping();
void EnableUTF8();
void AddEscapeSequence(const char* s) { escape_sequences.insert(s); } void AddEscapeSequence(const char* s) { escape_sequences.insert(s); }
void AddEscapeSequence(const char* s, size_t n) void AddEscapeSequence(const char* s, size_t n)
{ escape_sequences.insert(string(s, n)); } { escape_sequences.insert(string(s, n)); }
@ -185,6 +186,7 @@ protected:
unsigned int offset; // where we are in the buffer unsigned int offset; // where we are in the buffer
unsigned int size; // size of buffer in bytes unsigned int size; // size of buffer in bytes
bool utf8; // whether valid utf-8 sequences may pass through unescaped
bool escape; // escape unprintable characters in output? bool escape; // escape unprintable characters in output?
typedef set<string> escape_set; typedef set<string> escape_set;
escape_set escape_sequences; // additional sequences of chars to escape escape_set escape_sequences; // additional sequences of chars to escape

View file

@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend)
include_meta = false; include_meta = false;
tsv = false; tsv = false;
use_json = false; use_json = false;
enable_utf_8 = false;
formatter = 0; formatter = 0;
gzip_level = 0; gzip_level = 0;
gzfile = nullptr; gzfile = nullptr;
@ -36,6 +37,7 @@ void Ascii::InitConfigOptions()
output_to_stdout = BifConst::LogAscii::output_to_stdout; output_to_stdout = BifConst::LogAscii::output_to_stdout;
include_meta = BifConst::LogAscii::include_meta; include_meta = BifConst::LogAscii::include_meta;
use_json = BifConst::LogAscii::use_json; use_json = BifConst::LogAscii::use_json;
enable_utf_8 = BifConst::LogAscii::enable_utf_8;
gzip_level = BifConst::LogAscii::gzip_level; gzip_level = BifConst::LogAscii::gzip_level;
separator.assign( separator.assign(
@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions()
} }
} }
else if ( strcmp(i->first, "enable_utf_8") == 0 )
{
if ( strcmp(i->second, "T") == 0 )
enable_utf_8 = true;
else if ( strcmp(i->second, "F") == 0 )
enable_utf_8 = false;
else
{
Error("invalid value for 'enable_utf_8', must be a string and either \"T\" or \"F\"");
return false;
}
}
else if ( strcmp(i->first, "output_to_stdout") == 0 ) else if ( strcmp(i->first, "output_to_stdout") == 0 )
{ {
if ( strcmp(i->second, "T") == 0 ) if ( strcmp(i->second, "T") == 0 )
@ -181,6 +196,10 @@ bool Ascii::InitFormatter()
} }
else else
{ {
// Enable utf-8 if needed
if ( enable_utf_8 )
desc.EnableUTF8();
// Use the default "Bro logs" format. // Use the default "Bro logs" format.
desc.EnableEscaping(); desc.EnableEscaping();
desc.AddEscapeSequence(separator); desc.AddEscapeSequence(separator);

View file

@ -65,6 +65,7 @@ private:
int gzip_level; // level > 0 enables gzip compression int gzip_level; // level > 0 enables gzip compression
bool use_json; bool use_json;
bool enable_utf_8;
string json_timestamps; string json_timestamps;
threading::formatter::Formatter* formatter; threading::formatter::Formatter* formatter;

View file

@ -11,5 +11,6 @@ const set_separator: string;
const empty_field: string; const empty_field: string;
const unset_field: string; const unset_field: string;
const use_json: bool; const use_json: bool;
const enable_utf_8: bool;
const json_timestamps: JSON::TimestampFormat; const json_timestamps: JSON::TimestampFormat;
const gzip_level: count; const gzip_level: count;

View file

@ -0,0 +1,10 @@
#separator \x09
#set_separator ,
#empty_field (empty)
#unset_field -
#path test
#open 2019-07-23-11-40-47
#fields s
#types string
foo \x0a\x09\x00 bar
#close 2019-07-23-11-40-47

View file

@ -0,0 +1,10 @@
#separator \x09
#set_separator \xc2\xae
#empty_field (empty)
#unset_field -
#path test
#open 2019-07-23-11-46-43
#fields ss
#types set[string]
\xc2\xae
#close 2019-07-23-11-46-43

View file

@ -0,0 +1,12 @@
#separator \x09
#set_separator ,
#empty_field (empty)
#unset_field -
#path test
#open 2019-07-22-10-13-09
#fields s
#types string
foo ® bar
दुनिया को नमस्ते
hello 𠜎
#close 2019-07-22-10-13-09

View file

@ -0,0 +1,21 @@
#
# @TEST-EXEC: zeek -b %INPUT
# @TEST-EXEC: btest-diff test.log
#
# Logs a single string that contains a newline, a tab and a NUL byte while
# LogAscii::enable_utf_8 is turned on, then diffs the produced test.log
# against the stored baseline.
# NOTE(review): presumably the intent is to show that these control bytes
# are still escaped even with UTF-8 pass-through enabled -- confirm against
# the baseline file.
redef LogAscii::enable_utf_8 = T;
module Test;
export {
redef enum Log::ID += { LOG };
# Single-column log record; the one string field carries the test input.
type Log: record {
s: string;
} &log;
}
event zeek_init()
{
# Plain ASCII text with three non-printable single-byte characters in
# the middle: "\n", "\t" and "\0".
local a = "foo \n\t\0 bar";
Log::create_stream(Test::LOG, [$columns=Log]);
Log::write(Test::LOG, [$s=a]);
}

View file

@ -0,0 +1,23 @@
#
# @TEST-EXEC: zeek -b %INPUT
# @TEST-EXEC: btest-diff test.log
#
# Redefines the set separator to the two bytes \xc2\xae and then logs a set
# whose only element is exactly those same two bytes, with
# LogAscii::enable_utf_8 turned on. The produced test.log is diffed against
# the stored baseline.
# NOTE(review): presumably this checks that a value colliding with the
# separator is escaped rather than passed through as UTF-8 -- confirm
# against the baseline file.
redef LogAscii::enable_utf_8 = T;
redef LogAscii::set_separator = "\xc2\xae";
module Test;
export {
redef enum Log::ID += { LOG };
# Single-column log record holding a set of strings.
type Log: record {
ss: set[string];
} &log;
}
event zeek_init()
{
Log::create_stream(Test::LOG, [$columns=Log]);
# The set element is byte-for-byte identical to the set separator above.
Log::write(Test::LOG, [$ss=set("\xc2\xae")]);
}

View file

@ -0,0 +1,26 @@
#
# @TEST-EXEC: zeek -b %INPUT
# @TEST-EXEC: btest-diff test.log
#
# Logs three strings containing multibyte UTF-8 text (2-, 3- and 4-byte
# sequences) with LogAscii::enable_utf_8 turned on, then diffs the produced
# test.log against the stored baseline.
redef LogAscii::enable_utf_8 = T;
module Test;
export {
redef enum Log::ID += { LOG };
# Single-column log record; each write below fills the one string field.
type Log: record {
s: string;
} &log;
}
event zeek_init()
{
local a = "foo \xc2\xae bar"; # 2 bytes
local b = "दुनिया को नमस्ते"; # Hindi characters are 3 byte utf-8
local c = "hello 𠜎"; # A 4 byte Chinese character
Log::create_stream(Test::LOG, [$columns=Log]);
# One log line per string, written in the order a, b, c.
Log::write(Test::LOG, [$s=a]);
Log::write(Test::LOG, [$s=b]);
Log::write(Test::LOG, [$s=c]);
}