Added optional script and redef bool to enable utf-8 in ASCII logs

This commit is contained in:
Dev Bali 2019-07-22 10:28:38 -07:00
parent 6927dd1213
commit 66557d3178
10 changed files with 111 additions and 2 deletions

View file

@ -26,6 +26,12 @@ export {
## This option is also available as a per-filter ``$config`` option. ## This option is also available as a per-filter ``$config`` option.
const use_json = F &redef; const use_json = F &redef;
## If true, non-ASCII UTF-8 characters are passed through and
## written to logs as-is instead of being escaped.
##
## This option is also available as a per-filter ``$config`` option.
const enable_utf_8 = F &redef;
## Define the gzip level to compress the logs. If 0, then no gzip ## Define the gzip level to compress the logs. If 0, then no gzip
## compression is performed. Enabling compression also changes ## compression is performed. Enabling compression also changes
## the log file name extension to include ".gz". ## the log file name extension to include ".gz".

View file

@ -0,0 +1,4 @@
##! Loading this script makes the ASCII log writer output UTF-8 characters
##! as-is instead of escaping them into the \xYY format.
redef LogAscii::enable_utf_8=T;

View file

@ -112,4 +112,5 @@
@load tuning/defaults/packet-fragments.zeek @load tuning/defaults/packet-fragments.zeek
@load tuning/defaults/warnings.zeek @load tuning/defaults/warnings.zeek
@load tuning/json-logs.zeek @load tuning/json-logs.zeek
@load tuning/enable-utf-8-logs.zeek
@load tuning/track-all-assets.zeek @load tuning/track-all-assets.zeek

View file

@ -10,6 +10,8 @@
#include "File.h" #include "File.h"
#include "Reporter.h" #include "Reporter.h"
#include "ConvertUTF.h"
#define DEFAULT_SIZE 128 #define DEFAULT_SIZE 128
#define SLOP 10 #define SLOP 10
@ -39,6 +41,7 @@ ODesc::ODesc(desc_type t, BroFile* arg_f)
include_stats = 0; include_stats = 0;
indent_with_spaces = 0; indent_with_spaces = 0;
escape = false; escape = false;
utf8 = false;
} }
ODesc::~ODesc() ODesc::~ODesc()
@ -57,6 +60,11 @@ void ODesc::EnableEscaping()
escape = true; escape = true;
} }
void ODesc::EnableUTF8 ()
{
utf8 = true;
}
void ODesc::PushIndent() void ODesc::PushIndent()
{ {
++indent_level; ++indent_level;
@ -249,6 +257,23 @@ size_t ODesc::StartsWithEscapeSequence(const char* start, const char* end)
return 0; return 0;
} }
// Determines whether a multi-byte UTF-8 sequence starts at bytes[i].
//
// bytes: start of the buffer being examined.
// n:     total number of bytes in the buffer.
// i:     offset of the candidate lead byte; must be < n to find anything.
//
// Returns the length (2 to 4) of the valid UTF-8 sequence beginning at
// bytes[i], or 0 if there is none. Single-byte (ASCII) values are never
// reported as a match: the caller only asks about non-printable bytes, and
// non-printable ASCII (control characters, separators such as tab) must
// still go through the regular escaping path.
size_t check_utf8 (const char* bytes, size_t n, size_t i)
	{
	// Nothing left to inspect; also guards the subtraction below.
	if ( i >= n )
		return 0;

	const unsigned char* start = reinterpret_cast<const unsigned char*>(bytes + i);

	// ASCII lead byte: not a multi-byte sequence.
	if ( *start < 0x80 )
		return 0;

	// Never look past the end of the buffer: exactly n - i bytes remain,
	// and a UTF-8 sequence is at most 4 bytes long.
	const size_t remaining = n - i;
	const size_t num_to_check = remaining < 4 ? remaining : 4;

	// Try the candidate lengths in increasing order. isLegalUTF8Sequence()
	// (ConvertUTF.h) is expected to look only at [start, start + j) --
	// NOTE(review): confirm against the bundled ConvertUTF implementation.
	for ( size_t j = 2; j <= num_to_check; ++j )
		{
		if ( isLegalUTF8Sequence(start, start + j) )
			return j;
		}

	return 0;
	}
pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n) pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
{ {
typedef pair<const char*, size_t> escape_pos; typedef pair<const char*, size_t> escape_pos;
@ -258,8 +283,21 @@ pair<const char*, size_t> ODesc::FirstEscapeLoc(const char* bytes, size_t n)
for ( size_t i = 0; i < n; ++i ) for ( size_t i = 0; i < n; ++i )
{ {
//if ( ! isprint(bytes[i]) || bytes[i] == '\\' ) if (!isprint(bytes[i]))
if ( bytes[i] == '\\' ) {
if (utf8)
{
size_t utf_found = check_utf8(bytes, n, i);
if (utf_found)
{
i += utf_found - 1;
continue;
}
}
return escape_pos(bytes + i, 1);
}
else if (bytes[i] == '\\' )
return escape_pos(bytes + i, 1); return escape_pos(bytes + i, 1);
size_t len = StartsWithEscapeSequence(bytes + i, bytes + n); size_t len = StartsWithEscapeSequence(bytes + i, bytes + n);

View file

@ -54,6 +54,7 @@ public:
void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; } void SetFlush(int arg_do_flush) { do_flush = arg_do_flush; }
void EnableEscaping(); void EnableEscaping();
void EnableUTF8();
void AddEscapeSequence(const char* s) { escape_sequences.insert(s); } void AddEscapeSequence(const char* s) { escape_sequences.insert(s); }
void AddEscapeSequence(const char* s, size_t n) void AddEscapeSequence(const char* s, size_t n)
{ escape_sequences.insert(string(s, n)); } { escape_sequences.insert(string(s, n)); }
@ -185,6 +186,7 @@ protected:
unsigned int offset; // where we are in the buffer unsigned int offset; // where we are in the buffer
unsigned int size; // size of buffer in bytes unsigned int size; // size of buffer in bytes
bool utf8; //whether to allow non ascii utf-8 characters to pass through
bool escape; // escape unprintable characters in output? bool escape; // escape unprintable characters in output?
typedef set<string> escape_set; typedef set<string> escape_set;
escape_set escape_sequences; // additional sequences of chars to escape escape_set escape_sequences; // additional sequences of chars to escape

View file

@ -23,6 +23,7 @@ Ascii::Ascii(WriterFrontend* frontend) : WriterBackend(frontend)
include_meta = false; include_meta = false;
tsv = false; tsv = false;
use_json = false; use_json = false;
enable_utf_8 = false;
formatter = 0; formatter = 0;
gzip_level = 0; gzip_level = 0;
gzfile = nullptr; gzfile = nullptr;
@ -36,6 +37,7 @@ void Ascii::InitConfigOptions()
output_to_stdout = BifConst::LogAscii::output_to_stdout; output_to_stdout = BifConst::LogAscii::output_to_stdout;
include_meta = BifConst::LogAscii::include_meta; include_meta = BifConst::LogAscii::include_meta;
use_json = BifConst::LogAscii::use_json; use_json = BifConst::LogAscii::use_json;
enable_utf_8 = BifConst::LogAscii::enable_utf_8;
gzip_level = BifConst::LogAscii::gzip_level; gzip_level = BifConst::LogAscii::gzip_level;
separator.assign( separator.assign(
@ -115,6 +117,19 @@ bool Ascii::InitFilterOptions()
} }
} }
else if ( strcmp(i->first, "enable_utf_8") == 0 )
{
if ( strcmp(i->second, "T") == 0 )
enable_utf_8 = true;
else if ( strcmp(i->second, "F") == 0 )
enable_utf_8 = false;
else
{
Error("invalid value for 'enable_utf_8', must be a string and either \"T\" or \"F\"");
return false;
}
}
else if ( strcmp(i->first, "output_to_stdout") == 0 ) else if ( strcmp(i->first, "output_to_stdout") == 0 )
{ {
if ( strcmp(i->second, "T") == 0 ) if ( strcmp(i->second, "T") == 0 )
@ -181,6 +196,9 @@ bool Ascii::InitFormatter()
} }
else else
{ {
// Enable utf-8 if needed
if (enable_utf_8)
desc.EnableUTF8();
// Use the default "Bro logs" format. // Use the default "Bro logs" format.
desc.EnableEscaping(); desc.EnableEscaping();
desc.AddEscapeSequence(separator); desc.AddEscapeSequence(separator);

View file

@ -65,6 +65,7 @@ private:
int gzip_level; // level > 0 enables gzip compression int gzip_level; // level > 0 enables gzip compression
bool use_json; bool use_json;
bool enable_utf_8;
string json_timestamps; string json_timestamps;
threading::formatter::Formatter* formatter; threading::formatter::Formatter* formatter;

View file

@ -11,5 +11,6 @@ const set_separator: string;
const empty_field: string; const empty_field: string;
const unset_field: string; const unset_field: string;
const use_json: bool; const use_json: bool;
const enable_utf_8: bool;
const json_timestamps: JSON::TimestampFormat; const json_timestamps: JSON::TimestampFormat;
const gzip_level: count; const gzip_level: count;

View file

@ -0,0 +1,12 @@
#separator \x09
#set_separator ,
#empty_field (empty)
#unset_field -
#path test
#open 2019-07-22-10-13-09
#fields s
#types string
foo ® bar
दुनिया को नमस्ते
hello 𠜎
#close 2019-07-22-10-13-09

View file

@ -0,0 +1,26 @@
#
# @TEST-EXEC: zeek -b %INPUT
# @TEST-EXEC: btest-diff test.log
# Loads the enable-utf-8-logs tuning script, writes three strings that
# contain 2-, 3- and 4-byte UTF-8 sequences to a log stream, and lets
# btest-diff check the resulting test.log.
@load tuning/enable-utf-8-logs
module Test;
export {
# Log stream ID used by this test.
redef enum Log::ID += { LOG };
# Record with a single logged string column.
type Log: record {
s: string;
} &log;
}
event zeek_init()
{
local a = "foo \xc2\xae bar"; # bytes C2 AE given as hex escapes: a 2-byte UTF-8 sequence
local b = "दुनिया को नमस्ते"; # Hindi characters are 3-byte UTF-8 sequences
local c = "hello 𠜎"; # a 4-byte UTF-8 sequence (Chinese character)
# Create the stream, then write one entry per test string.
Log::create_stream(Test::LOG, [$columns=Log]);
Log::write(Test::LOG, [$s=a]);
Log::write(Test::LOG, [$s=b]);
Log::write(Test::LOG, [$s=c]);
}