From 1c58a2d86b628b444dce96ae2908891093cecaf1 Mon Sep 17 00:00:00 2001 From: Jon Siwek Date: Tue, 9 Mar 2021 18:22:54 -0800 Subject: [PATCH] GH-1432: Use buffered IO for file extraction This can improve performance significantly: ~3.5x faster when tested on a large file passing data to the file analysis framework in small chunks of 20 bytes. --- src/file_analysis/analyzer/extract/Extract.cc | 68 ++++++++++++++++--- src/file_analysis/analyzer/extract/Extract.h | 3 +- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/src/file_analysis/analyzer/extract/Extract.cc b/src/file_analysis/analyzer/extract/Extract.cc index 001c49ba01..f66c02e80d 100644 --- a/src/file_analysis/analyzer/extract/Extract.cc +++ b/src/file_analysis/analyzer/extract/Extract.cc @@ -17,12 +17,21 @@ Extract::Extract(RecordValPtr args, file_analysis::File* file, std::move(args), file), filename(arg_filename), limit(arg_limit), depth(0) { - fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_APPEND, 0666); + char buf[128]; + file_stream = fopen(filename.data(), "w"); - if ( fd < 0 ) + if ( file_stream ) + { + // Try to ensure full buffering. 
+ if ( setvbuf(file_stream, nullptr, _IOFBF, BUFSIZ) ) + { + util::zeek_strerror_r(errno, buf, sizeof(buf)); + reporter->Warning("cannot set buffering mode for %s: %s", + filename.data(), buf); + } + } + else { - fd = 0; - char buf[128]; util::zeek_strerror_r(errno, buf, sizeof(buf)); reporter->Error("cannot open %s: %s", filename.c_str(), buf); } @@ -30,8 +39,12 @@ Extract::Extract(RecordValPtr args, file_analysis::File* file, Extract::~Extract() { - if ( fd ) - util::safe_close(fd); + if ( file_stream && fclose(file_stream) ) + { + char buf[128]; + util::zeek_strerror_r(errno, buf, sizeof(buf)); + reporter->Error("cannot close %s: %s", filename.data(), buf); + } } static const ValPtr& get_extract_field_val(const RecordValPtr& args, @@ -86,7 +99,7 @@ static bool check_limit_exceeded(uint64_t lim, uint64_t depth, uint64_t len, uin bool Extract::DeliverStream(const u_char* data, uint64_t len) { - if ( ! fd ) + if ( ! file_stream ) return false; uint64_t towrite = 0; @@ -106,21 +119,58 @@ bool Extract::DeliverStream(const u_char* data, uint64_t len) limit_exceeded = check_limit_exceeded(limit, depth, len, &towrite); } + char buf[128]; + if ( towrite > 0 ) { - util::safe_write(fd, reinterpret_cast<const char*>(data), towrite); + if ( fwrite(data, towrite, 1, file_stream) != 1 ) + { + util::zeek_strerror_r(errno, buf, sizeof(buf)); + reporter->Error("failed to write to extracted file %s: %s", + filename.data(), buf); + fclose(file_stream); + file_stream = nullptr; + return false; + } + depth += towrite; } + // Assume we may not try to write anything more for a while due to reaching + // the extraction limit and the file analysis File still proceeding to + // do other analysis without destructing/closing this one until the very end, + // so flush anything currently buffered. + if ( limit_exceeded && fflush(file_stream) ) + { + util::zeek_strerror_r(errno, buf, sizeof(buf)); + reporter->Warning("cannot fflush extracted file %s: %s", + filename.data(), buf); + } + return ( ! 
limit_exceeded ); } bool Extract::Undelivered(uint64_t offset, uint64_t len) { + if ( ! file_stream ) + return false; + if ( depth == offset ) { char* tmp = new char[len](); - util::safe_write(fd, tmp, len); + + if ( fwrite(tmp, len, 1, file_stream) != 1 ) + { + char buf[128]; + util::zeek_strerror_r(errno, buf, sizeof(buf)); + reporter->Error("failed to write to extracted file %s: %s", + filename.data(), buf); + fclose(file_stream); + file_stream = nullptr; + delete [] tmp; + return false; + } + delete [] tmp; depth += len; } diff --git a/src/file_analysis/analyzer/extract/Extract.h b/src/file_analysis/analyzer/extract/Extract.h index 8a02ba8289..3da1bf3a30 100644 --- a/src/file_analysis/analyzer/extract/Extract.h +++ b/src/file_analysis/analyzer/extract/Extract.h @@ -3,6 +3,7 @@ #pragma once #include <string> +#include <cstdio> #include "zeek/Val.h" #include "zeek/file_analysis/File.h" @@ -72,7 +73,7 @@ protected: private: std::string filename; - int fd; + FILE* file_stream; uint64_t limit; uint64_t depth; };