speed up file analysis, remove IncrementByteCount

Avoid creating and recreating count objects for each chunk of the file
analyzed. This replaces the counts stored inside records with C++ `uint64_t` members.

On a pcap containing a 100 GB file download, this gives a 9% speedup.

    Benchmark 1 (3 runs): zeek-master/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files
      measurement          mean ± σ            min … max           outliers         delta
      wall_time           102s  ± 1.23s      101s  …  103s           0 ( 0%)        0%
      peak_rss            108MB ±  632KB     107MB …  109MB          0 ( 0%)        0%
      cpu_cycles          381G  ±  862M      380G  …  382G           0 ( 0%)        0%
      instructions        663G  ± 5.16M      663G  …  663G           0 ( 0%)        0%
      cache_references   1.03G  ±  109M      927M  … 1.15G           0 ( 0%)        0%
      cache_misses       12.3M  ±  587K     11.7M  … 12.9M           0 ( 0%)        0%
      branch_misses      1.23G  ± 2.10M     1.22G  … 1.23G           0 ( 0%)        0%
    Benchmark 2 (3 runs): zeek-file_analysis_speedup/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          92.9s  ± 1.85s     91.8s  … 95.1s           0 ( 0%)        -  9.0% ±  3.5%
      peak_rss            108MB ±  393KB     108MB …  109MB          0 ( 0%)          +  0.1% ±  1.1%
      cpu_cycles          341G  ±  695M      341G  …  342G           0 ( 0%)        - 10.4% ±  0.5%
      instructions        605G  ±  626M      605G  …  606G           0 ( 0%)        -  8.7% ±  0.2%
      cache_references    831M  ± 16.9M      813M  …  846M           0 ( 0%)        - 19.6% ± 17.2%
      cache_misses       12.4M  ± 1.48M     11.4M  … 14.1M           0 ( 0%)          +  0.3% ± 20.8%
      branch_misses      1.02G  ± 3.45M     1.02G  … 1.02G           0 ( 0%)        - 16.8% ±  0.5%
This commit is contained in:
Justin Azoff 2025-04-18 14:01:47 -04:00 committed by Justin Azoff
parent 20ada619c5
commit 7f350587b0
3 changed files with 26 additions and 14 deletions

View file

@ -1310,11 +1310,15 @@ public:
return cast_intrusive<T>(GetField(field));
}
// Returns true if the slot for the given field is initialized.
// This helper can be used to guard GetFieldAs() accesses.
bool HasRawField(int field) const { return record_val[field].has_value(); }
// The following return the given field converted to a particular
// underlying value. We provide these to enable efficient
// access to record fields (without requiring an intermediary Val).
// It is up to the caller to ensure that the field exists in the
// record (using HasField(), if necessary).
// record (using HasRawField(), if necessary).
template<typename T, typename std::enable_if_t<is_zeek_val_v<T>, bool> = true>
auto GetFieldAs(int field) const -> std::invoke_result_t<decltype(&T::Get), T> {
if constexpr ( std::is_same_v<T, BoolVal> || std::is_same_v<T, IntVal> || std::is_same_v<T, EnumVal> )

View file

@ -86,6 +86,9 @@ File::File(const std::string& file_id, const std::string& source_name, Connectio
reassembly_enabled(false),
postpone_timeout(false),
done(false),
seen_bytes(0),
missing_bytes(0),
overflow_bytes(0),
analyzers(this) {
StaticInit();
@ -147,6 +150,9 @@ void File::RaiseFileOverNewConnection(Connection* conn, bool is_orig) {
}
// Returns record field `idx` of this file's record val as a count (uint64_t).
// Fast path: if the slot is already initialized, read it directly via
// GetFieldAs() without materializing an intermediary Val; otherwise fall
// back to GetFieldOrDefault(), which yields the field's default value.
uint64_t File::LookupFieldDefaultCount(int idx) const {
if ( val->HasRawField(idx) )
return val->GetFieldAs<zeek::CountVal>(idx);
auto v = val->GetFieldOrDefault(idx);
return v->AsCount();
}
@ -192,23 +198,19 @@ bool File::SetExtractionLimit(RecordValPtr args, uint64_t bytes) {
return true;
}
// Adds `size` to the count stored in record field `field_idx` via a
// read-modify-write through the record Val. NOTE(review): this is the
// function the commit removes — each call re-created a CountVal per chunk,
// which the plain uint64_t members replace.
void File::IncrementByteCount(uint64_t size, int field_idx) {
uint64_t old = LookupFieldDefaultCount(field_idx);
val->Assign(field_idx, old + size);
}
// Records the file's total size by assigning `size` into the record's
// total_bytes field (also logged to the file-analysis debug stream).
void File::SetTotalBytes(uint64_t size) {
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Total bytes %" PRIu64, id.c_str(), size);
val->Assign(total_bytes_idx, size);
}
bool File::IsComplete() const {
const auto& total = val->GetField(total_bytes_idx);
if ( ! total )
// If total_bytes hasn't been initialized yet, file is certainly not complete.
if ( ! val->HasRawField(total_bytes_idx) )
return false;
if ( stream_offset >= total->AsCount() )
auto total = val->GetFieldAs<zeek::CountVal>(total_bytes_idx);
if ( stream_offset >= total )
return true;
return false;
@ -372,7 +374,7 @@ void File::DeliverStream(const u_char* data, uint64_t len) {
}
stream_offset += len;
IncrementByteCount(len, seen_bytes_idx);
seen_bytes += len;
}
void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
@ -388,7 +390,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
if ( reassembly_max_buffer > 0 && reassembly_max_buffer < file_reassembler->TotalSize() ) {
uint64_t current_offset = stream_offset;
uint64_t gap_bytes = file_reassembler->Flush();
IncrementByteCount(gap_bytes, overflow_bytes_idx);
overflow_bytes += gap_bytes;
if ( FileEventAvailable(file_reassembly_overflow) ) {
FileEvent(file_reassembly_overflow, {val, val_mgr->Count(current_offset), val_mgr->Count(gap_bytes)});
@ -411,7 +413,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
}
else {
// We can't reassemble so we throw out the data for streaming.
IncrementByteCount(len, overflow_bytes_idx);
overflow_bytes += len;
}
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " chunk bytes in at offset %" PRIu64 "; %s [%s%s]", id.c_str(), len,
@ -513,7 +515,7 @@ void File::Gap(uint64_t offset, uint64_t len) {
analyzers.DrainModifications();
stream_offset += len;
IncrementByteCount(len, missing_bytes_idx);
missing_bytes += len;
}
bool File::FileEventAvailable(EventHandlerPtr h) { return h && ! file_mgr->IsIgnored(id); }
@ -526,6 +528,9 @@ void File::FileEvent(EventHandlerPtr h) {
}
void File::FileEvent(EventHandlerPtr h, Args args) {
val->Assign(seen_bytes_idx, seen_bytes);
val->Assign(missing_bytes_idx, missing_bytes);
val->Assign(overflow_bytes_idx, overflow_bytes);
event_mgr.Enqueue(h, std::move(args));
if ( h == file_new || h == file_over_new_connection || h == file_sniff || h == file_timeout ||

View file

@ -325,6 +325,9 @@ protected:
bool reassembly_enabled; /**< Whether file stream reassembly is needed. */
bool postpone_timeout; /**< Whether postponing timeout is requested. */
bool done; /**< If this object is about to be deleted. */
uint64_t seen_bytes; /**< Number of bytes processed for this file. */
uint64_t missing_bytes; /**< Number of bytes missed for this file. */
uint64_t overflow_bytes; /**< Number of bytes not delivered. */
detail::AnalyzerSet analyzers; /**< A set of attached file analyzers. */
std::list<Analyzer*> done_analyzers; /**< Analyzers we're done with, remembered here until they
can be safely deleted. */