speed up file analysis, remove IncrementByteCount

Avoid creating and recreating count objects for each analyzed chunk of a file.
This replaces the byte counts stored inside the file record with plain C++
uint64_t members, which are written back to the record only when a file event
is raised (see the sketch after the benchmark below).

On a pcap containing a 100GB file download, this gives a 9% speedup:

    Benchmark 1 (3 runs): zeek-master/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files
      measurement          mean ± σ            min … max           outliers         delta
      wall_time           102s  ± 1.23s      101s  …  103s           0 ( 0%)        0%
      peak_rss            108MB ±  632KB     107MB …  109MB          0 ( 0%)        0%
      cpu_cycles          381G  ±  862M      380G  …  382G           0 ( 0%)        0%
      instructions        663G  ± 5.16M      663G  …  663G           0 ( 0%)        0%
      cache_references   1.03G  ±  109M      927M  … 1.15G           0 ( 0%)        0%
      cache_misses       12.3M  ±  587K     11.7M  … 12.9M           0 ( 0%)        0%
      branch_misses      1.23G  ± 2.10M     1.22G  … 1.23G           0 ( 0%)        0%
    Benchmark 2 (3 runs): zeek-file_analysis_speedup/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          92.9s  ± 1.85s     91.8s  … 95.1s           0 ( 0%)        -  9.0% ±  3.5%
      peak_rss            108MB ±  393KB     108MB …  109MB          0 ( 0%)          +  0.1% ±  1.1%
      cpu_cycles          341G  ±  695M      341G  …  342G           0 ( 0%)        - 10.4% ±  0.5%
      instructions        605G  ±  626M      605G  …  606G           0 ( 0%)        -  8.7% ±  0.2%
      cache_references    831M  ± 16.9M      813M  …  846M           0 ( 0%)        - 19.6% ± 17.2%
      cache_misses       12.4M  ± 1.48M     11.4M  … 14.1M           0 ( 0%)          +  0.3% ± 20.8%
      branch_misses      1.02G  ± 3.45M     1.02G  … 1.02G           0 ( 0%)        - 16.8% ±  0.5%
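
The change in miniature (a standalone illustrative sketch; FakeRecordVal,
increment_via_record, and FileCounters are hypothetical stand-ins, not Zeek's
actual classes):

    #include <cstdint>
    #include <memory>

    // Stand-in for zeek::RecordVal, simplified to make the cost visible:
    // Assign() heap-allocates a refcounted object on every call.
    struct FakeRecordVal {
        std::shared_ptr<uint64_t> fields[8];
        void Assign(int idx, uint64_t v) { fields[idx] = std::make_shared<uint64_t>(v); }
        uint64_t GetOrDefault(int idx) const { return fields[idx] ? *fields[idx] : 0; }
    };

    // Before: every delivered chunk round-trips through the record, paying a
    // field lookup plus a fresh allocation each time.
    void increment_via_record(FakeRecordVal& val, int idx, uint64_t size) {
        uint64_t old = val.GetOrDefault(idx);
        val.Assign(idx, old + size); // one allocation per chunk
    }

    // After: the hot path is a plain integer add; the record is written only
    // when an event is about to expose it to the script layer.
    struct FileCounters {
        uint64_t seen_bytes = 0;

        void DeliverStream(uint64_t len) { seen_bytes += len; } // hot path

        void FileEvent(FakeRecordVal& val, int seen_idx) { // cold path
            val.Assign(seen_idx, seen_bytes);
        }
    };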
Authored and committed by Justin Azoff, 2025-04-18 14:01:47 -04:00
parent 20ada619c5
commit 7f350587b0
3 changed files with 26 additions and 14 deletions

src/file_analysis/File.cc

@@ -86,6 +86,9 @@ File::File(const std::string& file_id, const std::string& source_name, Connectio
       reassembly_enabled(false),
       postpone_timeout(false),
       done(false),
+      seen_bytes(0),
+      missing_bytes(0),
+      overflow_bytes(0),
       analyzers(this) {
     StaticInit();
@@ -147,6 +150,9 @@ void File::RaiseFileOverNewConnection(Connection* conn, bool is_orig) {
 }
 
 uint64_t File::LookupFieldDefaultCount(int idx) const {
+    if ( val->HasRawField(idx) )
+        return val->GetFieldAs<zeek::CountVal>(idx);
+
     auto v = val->GetFieldOrDefault(idx);
     return v->AsCount();
 }
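
An aside on the fast path just added above (a reading of the code, not text
from the commit): when the field is already set, HasRawField() plus
GetFieldAs<zeek::CountVal>() pulls the integer straight out of the record,
whereas the old single path through GetFieldOrDefault() always handed back a
refcounted ValPtr. The same function, annotated:

    // LookupFieldDefaultCount() with the cost of each path spelled out
    // (comments are interpretation, not part of the diff):
    uint64_t File::LookupFieldDefaultCount(int idx) const {
        if ( val->HasRawField(idx) )
            // Fast path: the field is set, so read the count directly;
            // no temporary Val is created.
            return val->GetFieldAs<zeek::CountVal>(idx);

        // Slow path: GetFieldOrDefault() materializes a refcounted ValPtr
        // holding the field's default value.
        auto v = val->GetFieldOrDefault(idx);
        return v->AsCount();
    }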
@@ -192,23 +198,19 @@ bool File::SetExtractionLimit(RecordValPtr args, uint64_t bytes) {
     return true;
 }
 
-void File::IncrementByteCount(uint64_t size, int field_idx) {
-    uint64_t old = LookupFieldDefaultCount(field_idx);
-    val->Assign(field_idx, old + size);
-}
-
 void File::SetTotalBytes(uint64_t size) {
     DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Total bytes %" PRIu64, id.c_str(), size);
     val->Assign(total_bytes_idx, size);
 }
 
 bool File::IsComplete() const {
-    const auto& total = val->GetField(total_bytes_idx);
-    if ( ! total )
+    // If total_bytes hasn't been initialized yet, file is certainly not complete.
+    if ( ! val->HasRawField(total_bytes_idx) )
         return false;
 
-    if ( stream_offset >= total->AsCount() )
+    auto total = val->GetFieldAs<zeek::CountVal>(total_bytes_idx);
+    if ( stream_offset >= total )
         return true;
 
     return false;
 }
@@ -372,7 +374,7 @@ void File::DeliverStream(const u_char* data, uint64_t len) {
     }
 
     stream_offset += len;
-    IncrementByteCount(len, seen_bytes_idx);
+    seen_bytes += len;
 }
 
 void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
@@ -388,7 +390,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
     if ( reassembly_max_buffer > 0 && reassembly_max_buffer < file_reassembler->TotalSize() ) {
         uint64_t current_offset = stream_offset;
         uint64_t gap_bytes = file_reassembler->Flush();
-        IncrementByteCount(gap_bytes, overflow_bytes_idx);
+        overflow_bytes += gap_bytes;
 
         if ( FileEventAvailable(file_reassembly_overflow) ) {
             FileEvent(file_reassembly_overflow, {val, val_mgr->Count(current_offset), val_mgr->Count(gap_bytes)});
@@ -411,7 +413,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
     }
     else {
         // We can't reassemble so we throw out the data for streaming.
-        IncrementByteCount(len, overflow_bytes_idx);
+        overflow_bytes += len;
     }
 
     DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " chunk bytes in at offset %" PRIu64 "; %s [%s%s]", id.c_str(), len,
@@ -513,7 +515,7 @@ void File::Gap(uint64_t offset, uint64_t len) {
     analyzers.DrainModifications();
 
     stream_offset += len;
-    IncrementByteCount(len, missing_bytes_idx);
+    missing_bytes += len;
 }
 
 bool File::FileEventAvailable(EventHandlerPtr h) { return h && ! file_mgr->IsIgnored(id); }
@@ -526,6 +528,9 @@ void File::FileEvent(EventHandlerPtr h) {
 }
 
 void File::FileEvent(EventHandlerPtr h, Args args) {
+    val->Assign(seen_bytes_idx, seen_bytes);
+    val->Assign(missing_bytes_idx, missing_bytes);
+    val->Assign(overflow_bytes_idx, overflow_bytes);
     event_mgr.Enqueue(h, std::move(args));
 
     if ( h == file_new || h == file_over_new_connection || h == file_sniff || h == file_timeout ||
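
Why flushing only here suffices (an inference from this hunk, not a claim made
by the commit): script-land can observe f$seen_bytes, f$missing_bytes, and
f$overflow_bytes only while handling an event that carries the file record,
and those events funnel through this FileEvent() overload, so the record is
consistent at every observable point while per-chunk updates stay plain
integer adds. A usage view of the staleness window, reusing the sketch types
from after the benchmark (seen_idx is hypothetical):

    int main() {
        FileCounters fc;
        FakeRecordVal rec;
        const int seen_idx = 0; // hypothetical field index

        fc.DeliverStream(4096);      // C++-side counter is now 4096
        // rec.GetOrDefault(seen_idx) would still read 0: not yet published.
        fc.FileEvent(rec, seen_idx); // flush at the event boundary
        // rec.GetOrDefault(seen_idx) now reads 4096.
        return 0;
    }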

src/file_analysis/File.h

@@ -325,6 +325,9 @@ protected:
     bool reassembly_enabled; /**< Whether file stream reassembly is needed. */
     bool postpone_timeout;   /**< Whether postponing timeout is requested. */
     bool done;               /**< If this object is about to be deleted. */
+    uint64_t seen_bytes;     /**< Number of bytes processed for this file. */
+    uint64_t missing_bytes;  /**< Number of bytes missed for this file. */
+    uint64_t overflow_bytes; /**< Number of bytes not delivered. */
     detail::AnalyzerSet analyzers; /**< A set of attached file analyzers. */
     std::list<Analyzer*> done_analyzers; /**< Analyzers we're done with, remembered here until they
                                               can be safely deleted. */