From 7f350587b05bc9b8983f763a080c4a1f03cf074d Mon Sep 17 00:00:00 2001 From: Justin Azoff Date: Fri, 18 Apr 2025 14:01:47 -0400 Subject: [PATCH] speed up file analysis, remove IncrementByteCount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid creating and recreating count objects for each chunk of file analyzed. This replaces counts inside of records with c++ uint64_ts. On a pcap containing a 100GB file download this gives a 9% speedup Benchmark 1 (3 runs): zeek-master/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files measurement mean ± σ min … max outliers delta wall_time 102s ± 1.23s 101s … 103s 0 ( 0%) 0% peak_rss 108MB ± 632KB 107MB … 109MB 0 ( 0%) 0% cpu_cycles 381G ± 862M 380G … 382G 0 ( 0%) 0% instructions 663G ± 5.16M 663G … 663G 0 ( 0%) 0% cache_references 1.03G ± 109M 927M … 1.15G 0 ( 0%) 0% cache_misses 12.3M ± 587K 11.7M … 12.9M 0 ( 0%) 0% branch_misses 1.23G ± 2.10M 1.22G … 1.23G 0 ( 0%) 0% Benchmark 2 (3 runs): zeek-file_analysis_speedup/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files measurement mean ± σ min … max outliers delta wall_time 92.9s ± 1.85s 91.8s … 95.1s 0 ( 0%) ⚡- 9.0% ± 3.5% peak_rss 108MB ± 393KB 108MB … 109MB 0 ( 0%) + 0.1% ± 1.1% cpu_cycles 341G ± 695M 341G … 342G 0 ( 0%) ⚡- 10.4% ± 0.5% instructions 605G ± 626M 605G … 606G 0 ( 0%) ⚡- 8.7% ± 0.2% cache_references 831M ± 16.9M 813M … 846M 0 ( 0%) ⚡- 19.6% ± 17.2% cache_misses 12.4M ± 1.48M 11.4M … 14.1M 0 ( 0%) + 0.3% ± 20.8% branch_misses 1.02G ± 3.45M 1.02G … 1.02G 0 ( 0%) ⚡- 16.8% ± 0.5% --- src/Val.h | 6 +++++- src/file_analysis/File.cc | 31 ++++++++++++++++++------------- src/file_analysis/File.h | 3 +++ 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/Val.h b/src/Val.h index e5055beec2..6e2f4d7b60 100644 --- a/src/Val.h +++ b/src/Val.h @@ -1310,11 +1310,15 @@ public: return cast_intrusive(GetField(field)); } + // Returns true if the slot for the given field is initialized. + // This helper can be used to guard GetFieldAs() accesses. + bool HasRawField(int field) const { return record_val[field].has_value(); } + // The following return the given field converted to a particular // underlying value. We provide these to enable efficient // access to record fields (without requiring an intermediary Val). // It is up to the caller to ensure that the field exists in the - // record (using HasField(), if necessary). + // record (using HasRawField(), if necessary). template, bool> = true> auto GetFieldAs(int field) const -> std::invoke_result_t { if constexpr ( std::is_same_v || std::is_same_v || std::is_same_v ) diff --git a/src/file_analysis/File.cc b/src/file_analysis/File.cc index 9bb69efeb8..73d98fdbd9 100644 --- a/src/file_analysis/File.cc +++ b/src/file_analysis/File.cc @@ -86,6 +86,9 @@ File::File(const std::string& file_id, const std::string& source_name, Connectio reassembly_enabled(false), postpone_timeout(false), done(false), + seen_bytes(0), + missing_bytes(0), + overflow_bytes(0), analyzers(this) { StaticInit(); @@ -147,6 +150,9 @@ void File::RaiseFileOverNewConnection(Connection* conn, bool is_orig) { } uint64_t File::LookupFieldDefaultCount(int idx) const { + if ( val->HasRawField(idx) ) + return val->GetFieldAs(idx); + auto v = val->GetFieldOrDefault(idx); return v->AsCount(); } @@ -192,23 +198,19 @@ bool File::SetExtractionLimit(RecordValPtr args, uint64_t bytes) { return true; } -void File::IncrementByteCount(uint64_t size, int field_idx) { - uint64_t old = LookupFieldDefaultCount(field_idx); - val->Assign(field_idx, old + size); -} - void File::SetTotalBytes(uint64_t size) { DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Total bytes %" PRIu64, id.c_str(), size); val->Assign(total_bytes_idx, size); } bool File::IsComplete() const { - const auto& total = val->GetField(total_bytes_idx); - - if ( ! total ) + // If total_bytes hasn't been initialized yet, file is certainly not complete. + if ( ! val->HasRawField(total_bytes_idx) ) return false; - if ( stream_offset >= total->AsCount() ) + auto total = val->GetFieldAs(total_bytes_idx); + + if ( stream_offset >= total ) return true; return false; @@ -372,7 +374,7 @@ void File::DeliverStream(const u_char* data, uint64_t len) { } stream_offset += len; - IncrementByteCount(len, seen_bytes_idx); + seen_bytes += len; } void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) { @@ -388,7 +390,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) { if ( reassembly_max_buffer > 0 && reassembly_max_buffer < file_reassembler->TotalSize() ) { uint64_t current_offset = stream_offset; uint64_t gap_bytes = file_reassembler->Flush(); - IncrementByteCount(gap_bytes, overflow_bytes_idx); + overflow_bytes += gap_bytes; if ( FileEventAvailable(file_reassembly_overflow) ) { FileEvent(file_reassembly_overflow, {val, val_mgr->Count(current_offset), val_mgr->Count(gap_bytes)}); @@ -411,7 +413,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) { } else { // We can't reassemble so we throw out the data for streaming. - IncrementByteCount(len, overflow_bytes_idx); + overflow_bytes += len; } DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " chunk bytes in at offset %" PRIu64 "; %s [%s%s]", id.c_str(), len, @@ -513,7 +515,7 @@ void File::Gap(uint64_t offset, uint64_t len) { analyzers.DrainModifications(); stream_offset += len; - IncrementByteCount(len, missing_bytes_idx); + missing_bytes += len; } bool File::FileEventAvailable(EventHandlerPtr h) { return h && ! file_mgr->IsIgnored(id); } @@ -526,6 +528,9 @@ void File::FileEvent(EventHandlerPtr h) { } void File::FileEvent(EventHandlerPtr h, Args args) { + val->Assign(seen_bytes_idx, seen_bytes); + val->Assign(missing_bytes_idx, missing_bytes); + val->Assign(overflow_bytes_idx, overflow_bytes); event_mgr.Enqueue(h, std::move(args)); if ( h == file_new || h == file_over_new_connection || h == file_sniff || h == file_timeout || diff --git a/src/file_analysis/File.h b/src/file_analysis/File.h index caaef35ae9..96096555dd 100644 --- a/src/file_analysis/File.h +++ b/src/file_analysis/File.h @@ -325,6 +325,9 @@ protected: bool reassembly_enabled; /**< Whether file stream reassembly is needed. */ bool postpone_timeout; /**< Whether postponing timeout is requested. */ bool done; /**< If this object is about to be deleted. */ + uint64_t seen_bytes; /**< Number of bytes processed for this file. */ + uint64_t missing_bytes; /**< Number of bytes missed for this file. */ + uint64_t overflow_bytes; /**< Number of bytes not delivered. */ detail::AnalyzerSet analyzers; /**< A set of attached file analyzers. */ std::list done_analyzers; /**< Analyzers we're done with, remembered here until they can be safely deleted. */