speed up file analysis, remove IncrementByteCount

Avoid creating and recreating count objects for each analyzed chunk of a file.
This replaces the byte counts stored inside the file record with plain C++
uint64_t members, which are written back to the record only when a file event
is raised (see the sketch after the benchmark below).

On a pcap containing a 100GB file download, this gives a 9% speedup:

    Benchmark 1 (3 runs): zeek-master/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files
      measurement          mean ± σ            min … max           outliers         delta
      wall_time           102s  ± 1.23s      101s  …  103s           0 ( 0%)        0%
      peak_rss            108MB ±  632KB     107MB …  109MB          0 ( 0%)        0%
      cpu_cycles          381G  ±  862M      380G  …  382G           0 ( 0%)        0%
      instructions        663G  ± 5.16M      663G  …  663G           0 ( 0%)        0%
      cache_references   1.03G  ±  109M      927M  … 1.15G           0 ( 0%)        0%
      cache_misses       12.3M  ±  587K     11.7M  … 12.9M           0 ( 0%)        0%
      branch_misses      1.23G  ± 2.10M     1.22G  … 1.23G           0 ( 0%)        0%
    Benchmark 2 (3 runs): zeek-file_analysis_speedup/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          92.9s  ± 1.85s     91.8s  … 95.1s           0 ( 0%)        -  9.0% ±  3.5%
      peak_rss            108MB ±  393KB     108MB …  109MB          0 ( 0%)          +  0.1% ±  1.1%
      cpu_cycles          341G  ±  695M      341G  …  342G           0 ( 0%)        - 10.4% ±  0.5%
      instructions        605G  ±  626M      605G  …  606G           0 ( 0%)        -  8.7% ±  0.2%
      cache_references    831M  ± 16.9M      813M  …  846M           0 ( 0%)        - 19.6% ± 17.2%
      cache_misses       12.4M  ± 1.48M     11.4M  … 14.1M           0 ( 0%)          +  0.3% ± 20.8%
      branch_misses      1.02G  ± 3.45M     1.02G  … 1.02G           0 ( 0%)        - 16.8% ±  0.5%
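
The change in miniature (a standalone illustrative sketch; FakeRecordVal,
increment_via_record, and FileCounters are hypothetical stand-ins, not Zeek's
actual classes):

    #include <cstdint>
    #include <memory>

    // Stand-in for zeek::RecordVal, simplified to make the cost visible:
    // Assign() heap-allocates a refcounted object on every call.
    struct FakeRecordVal {
        std::shared_ptr<uint64_t> fields[8];
        void Assign(int idx, uint64_t v) { fields[idx] = std::make_shared<uint64_t>(v); }
        uint64_t GetOrDefault(int idx) const { return fields[idx] ? *fields[idx] : 0; }
    };

    // Before: every delivered chunk round-trips through the record, paying a
    // field lookup plus a fresh allocation each time.
    void increment_via_record(FakeRecordVal& val, int idx, uint64_t size) {
        uint64_t old = val.GetOrDefault(idx);
        val.Assign(idx, old + size); // one allocation per chunk
    }

    // After: the hot path is a plain integer add; the record is written only
    // when an event is about to expose it to the script layer.
    struct FileCounters {
        uint64_t seen_bytes = 0;

        void DeliverStream(uint64_t len) { seen_bytes += len; } // hot path

        void FileEvent(FakeRecordVal& val, int seen_idx) { // cold path
            val.Assign(seen_idx, seen_bytes);
        }
    };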
Authored and committed by Justin Azoff, 2025-04-18 14:01:47 -04:00
parent 20ada619c5
commit 7f350587b0
3 changed files with 26 additions and 14 deletions

src/file_analysis/File.cc

@@ -86,6 +86,9 @@ File::File(const std::string& file_id, const std::string& source_name, Connectio
       reassembly_enabled(false),
       postpone_timeout(false),
       done(false),
+      seen_bytes(0),
+      missing_bytes(0),
+      overflow_bytes(0),
       analyzers(this) {
     StaticInit();
@@ -147,6 +150,9 @@ void File::RaiseFileOverNewConnection(Connection* conn, bool is_orig) {
 }
 
 uint64_t File::LookupFieldDefaultCount(int idx) const {
+    if ( val->HasRawField(idx) )
+        return val->GetFieldAs<zeek::CountVal>(idx);
+
     auto v = val->GetFieldOrDefault(idx);
     return v->AsCount();
 }
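
An aside on the fast path just added above (a reading of the code, not text
from the commit): when the field is already set, HasRawField() plus
GetFieldAs<zeek::CountVal>() pulls the integer straight out of the record,
whereas the old single path through GetFieldOrDefault() always handed back a
refcounted ValPtr. The same function, annotated:

    // LookupFieldDefaultCount() with the cost of each path spelled out
    // (comments are interpretation, not part of the diff):
    uint64_t File::LookupFieldDefaultCount(int idx) const {
        if ( val->HasRawField(idx) )
            // Fast path: the field is set, so read the count directly;
            // no temporary Val is created.
            return val->GetFieldAs<zeek::CountVal>(idx);

        // Slow path: GetFieldOrDefault() materializes a refcounted ValPtr
        // holding the field's default value.
        auto v = val->GetFieldOrDefault(idx);
        return v->AsCount();
    }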
@@ -192,23 +198,19 @@ bool File::SetExtractionLimit(RecordValPtr args, uint64_t bytes) {
     return true;
 }
 
-void File::IncrementByteCount(uint64_t size, int field_idx) {
-    uint64_t old = LookupFieldDefaultCount(field_idx);
-    val->Assign(field_idx, old + size);
-}
-
 void File::SetTotalBytes(uint64_t size) {
     DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Total bytes %" PRIu64, id.c_str(), size);
     val->Assign(total_bytes_idx, size);
 }
 
 bool File::IsComplete() const {
-    const auto& total = val->GetField(total_bytes_idx);
-    if ( ! total )
+    // If total_bytes hasn't been initialized yet, file is certainly not complete.
+    if ( ! val->HasRawField(total_bytes_idx) )
         return false;
 
-    if ( stream_offset >= total->AsCount() )
+    auto total = val->GetFieldAs<zeek::CountVal>(total_bytes_idx);
+    if ( stream_offset >= total )
         return true;
 
     return false;
 }
@@ -372,7 +374,7 @@ void File::DeliverStream(const u_char* data, uint64_t len) {
     }
 
     stream_offset += len;
-    IncrementByteCount(len, seen_bytes_idx);
+    seen_bytes += len;
 }
 
 void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
@@ -388,7 +390,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
     if ( reassembly_max_buffer > 0 && reassembly_max_buffer < file_reassembler->TotalSize() ) {
         uint64_t current_offset = stream_offset;
         uint64_t gap_bytes = file_reassembler->Flush();
-        IncrementByteCount(gap_bytes, overflow_bytes_idx);
+        overflow_bytes += gap_bytes;
 
         if ( FileEventAvailable(file_reassembly_overflow) ) {
             FileEvent(file_reassembly_overflow, {val, val_mgr->Count(current_offset), val_mgr->Count(gap_bytes)});
@@ -411,7 +413,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
     }
     else {
         // We can't reassemble so we throw out the data for streaming.
-        IncrementByteCount(len, overflow_bytes_idx);
+        overflow_bytes += len;
     }
 
     DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " chunk bytes in at offset %" PRIu64 "; %s [%s%s]", id.c_str(), len,
@@ -513,7 +515,7 @@ void File::Gap(uint64_t offset, uint64_t len) {
     analyzers.DrainModifications();
 
     stream_offset += len;
-    IncrementByteCount(len, missing_bytes_idx);
+    missing_bytes += len;
 }
 
 bool File::FileEventAvailable(EventHandlerPtr h) { return h && ! file_mgr->IsIgnored(id); }
@@ -526,6 +528,9 @@ void File::FileEvent(EventHandlerPtr h) {
 }
 
 void File::FileEvent(EventHandlerPtr h, Args args) {
+    val->Assign(seen_bytes_idx, seen_bytes);
+    val->Assign(missing_bytes_idx, missing_bytes);
+    val->Assign(overflow_bytes_idx, overflow_bytes);
     event_mgr.Enqueue(h, std::move(args));
 
     if ( h == file_new || h == file_over_new_connection || h == file_sniff || h == file_timeout ||
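
Why flushing only here suffices (an inference from this hunk, not a claim made
by the commit): script-land can observe f$seen_bytes, f$missing_bytes, and
f$overflow_bytes only while handling an event that carries the file record,
and those events funnel through this FileEvent() overload, so the record is
consistent at every observable point while per-chunk updates stay plain
integer adds. A usage view of the staleness window, reusing the sketch types
from after the benchmark (seen_idx is hypothetical):

    int main() {
        FileCounters fc;
        FakeRecordVal rec;
        const int seen_idx = 0; // hypothetical field index

        fc.DeliverStream(4096);      // C++-side counter is now 4096
        // rec.GetOrDefault(seen_idx) would still read 0: not yet published.
        fc.FileEvent(rec, seen_idx); // flush at the event boundary
        // rec.GetOrDefault(seen_idx) now reads 4096.
        return 0;
    }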

src/file_analysis/File.h

@@ -325,6 +325,9 @@ protected:
     bool reassembly_enabled; /**< Whether file stream reassembly is needed. */
     bool postpone_timeout;   /**< Whether postponing timeout is requested. */
     bool done;               /**< If this object is about to be deleted. */
+    uint64_t seen_bytes;     /**< Number of bytes processed for this file. */
+    uint64_t missing_bytes;  /**< Number of bytes missed for this file. */
+    uint64_t overflow_bytes; /**< Number of bytes not delivered. */
     detail::AnalyzerSet analyzers; /**< A set of attached file analyzers. */
     std::list<Analyzer*> done_analyzers; /**< Analyzers we're done with, remembered here until they
                                               can be safely deleted. */