mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
speed up file analysis, remove IncrementByteCount
Avoid creating and recreating count objects for each chunk of file analyzed. This replaces counts inside of records with c++ uint64_ts. On a pcap containing a 100GB file download this gives a 9% speedup Benchmark 1 (3 runs): zeek-master/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files measurement mean ± σ min … max outliers delta wall_time 102s ± 1.23s 101s … 103s 0 ( 0%) 0% peak_rss 108MB ± 632KB 107MB … 109MB 0 ( 0%) 0% cpu_cycles 381G ± 862M 380G … 382G 0 ( 0%) 0% instructions 663G ± 5.16M 663G … 663G 0 ( 0%) 0% cache_references 1.03G ± 109M 927M … 1.15G 0 ( 0%) 0% cache_misses 12.3M ± 587K 11.7M … 12.9M 0 ( 0%) 0% branch_misses 1.23G ± 2.10M 1.22G … 1.23G 0 ( 0%) 0% Benchmark 2 (3 runs): zeek-file_analysis_speedup/bin/zeek -Cr http_100g_zeroes.pcap tuning/json-logs frameworks/files/hash-all-files measurement mean ± σ min … max outliers delta wall_time 92.9s ± 1.85s 91.8s … 95.1s 0 ( 0%) ⚡- 9.0% ± 3.5% peak_rss 108MB ± 393KB 108MB … 109MB 0 ( 0%) + 0.1% ± 1.1% cpu_cycles 341G ± 695M 341G … 342G 0 ( 0%) ⚡- 10.4% ± 0.5% instructions 605G ± 626M 605G … 606G 0 ( 0%) ⚡- 8.7% ± 0.2% cache_references 831M ± 16.9M 813M … 846M 0 ( 0%) ⚡- 19.6% ± 17.2% cache_misses 12.4M ± 1.48M 11.4M … 14.1M 0 ( 0%) + 0.3% ± 20.8% branch_misses 1.02G ± 3.45M 1.02G … 1.02G 0 ( 0%) ⚡- 16.8% ± 0.5%
This commit is contained in:
parent
20ada619c5
commit
7f350587b0
3 changed files with 26 additions and 14 deletions
|
@ -1310,11 +1310,15 @@ public:
|
|||
return cast_intrusive<T>(GetField(field));
|
||||
}
|
||||
|
||||
// Returns true if the slot for the given field is initialized.
|
||||
// This helper can be used to guard GetFieldAs() accesses.
|
||||
bool HasRawField(int field) const { return record_val[field].has_value(); }
|
||||
|
||||
// The following return the given field converted to a particular
|
||||
// underlying value. We provide these to enable efficient
|
||||
// access to record fields (without requiring an intermediary Val).
|
||||
// It is up to the caller to ensure that the field exists in the
|
||||
// record (using HasField(), if necessary).
|
||||
// record (using HasRawField(), if necessary).
|
||||
template<typename T, typename std::enable_if_t<is_zeek_val_v<T>, bool> = true>
|
||||
auto GetFieldAs(int field) const -> std::invoke_result_t<decltype(&T::Get), T> {
|
||||
if constexpr ( std::is_same_v<T, BoolVal> || std::is_same_v<T, IntVal> || std::is_same_v<T, EnumVal> )
|
||||
|
|
|
@ -86,6 +86,9 @@ File::File(const std::string& file_id, const std::string& source_name, Connectio
|
|||
reassembly_enabled(false),
|
||||
postpone_timeout(false),
|
||||
done(false),
|
||||
seen_bytes(0),
|
||||
missing_bytes(0),
|
||||
overflow_bytes(0),
|
||||
analyzers(this) {
|
||||
StaticInit();
|
||||
|
||||
|
@ -147,6 +150,9 @@ void File::RaiseFileOverNewConnection(Connection* conn, bool is_orig) {
|
|||
}
|
||||
|
||||
uint64_t File::LookupFieldDefaultCount(int idx) const {
|
||||
if ( val->HasRawField(idx) )
|
||||
return val->GetFieldAs<zeek::CountVal>(idx);
|
||||
|
||||
auto v = val->GetFieldOrDefault(idx);
|
||||
return v->AsCount();
|
||||
}
|
||||
|
@ -192,23 +198,19 @@ bool File::SetExtractionLimit(RecordValPtr args, uint64_t bytes) {
|
|||
return true;
|
||||
}
|
||||
|
||||
void File::IncrementByteCount(uint64_t size, int field_idx) {
|
||||
uint64_t old = LookupFieldDefaultCount(field_idx);
|
||||
val->Assign(field_idx, old + size);
|
||||
}
|
||||
|
||||
// Record the file's total size in the `total_bytes` field of the file record.
void File::SetTotalBytes(uint64_t size) {
    DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Total bytes %" PRIu64, id.c_str(), size);
    val->Assign(total_bytes_idx, size);
}
|
||||
|
||||
bool File::IsComplete() const {
|
||||
const auto& total = val->GetField(total_bytes_idx);
|
||||
|
||||
if ( ! total )
|
||||
// If total_bytes hasn't been initialized yet, file is certainly not complete.
|
||||
if ( ! val->HasRawField(total_bytes_idx) )
|
||||
return false;
|
||||
|
||||
if ( stream_offset >= total->AsCount() )
|
||||
auto total = val->GetFieldAs<zeek::CountVal>(total_bytes_idx);
|
||||
|
||||
if ( stream_offset >= total )
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -372,7 +374,7 @@ void File::DeliverStream(const u_char* data, uint64_t len) {
|
|||
}
|
||||
|
||||
stream_offset += len;
|
||||
IncrementByteCount(len, seen_bytes_idx);
|
||||
seen_bytes += len;
|
||||
}
|
||||
|
||||
void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
|
||||
|
@ -388,7 +390,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
|
|||
if ( reassembly_max_buffer > 0 && reassembly_max_buffer < file_reassembler->TotalSize() ) {
|
||||
uint64_t current_offset = stream_offset;
|
||||
uint64_t gap_bytes = file_reassembler->Flush();
|
||||
IncrementByteCount(gap_bytes, overflow_bytes_idx);
|
||||
overflow_bytes += gap_bytes;
|
||||
|
||||
if ( FileEventAvailable(file_reassembly_overflow) ) {
|
||||
FileEvent(file_reassembly_overflow, {val, val_mgr->Count(current_offset), val_mgr->Count(gap_bytes)});
|
||||
|
@ -411,7 +413,7 @@ void File::DeliverChunk(const u_char* data, uint64_t len, uint64_t offset) {
|
|||
}
|
||||
else {
|
||||
// We can't reassemble so we throw out the data for streaming.
|
||||
IncrementByteCount(len, overflow_bytes_idx);
|
||||
overflow_bytes += len;
|
||||
}
|
||||
|
||||
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " chunk bytes in at offset %" PRIu64 "; %s [%s%s]", id.c_str(), len,
|
||||
|
@ -513,7 +515,7 @@ void File::Gap(uint64_t offset, uint64_t len) {
|
|||
analyzers.DrainModifications();
|
||||
|
||||
stream_offset += len;
|
||||
IncrementByteCount(len, missing_bytes_idx);
|
||||
missing_bytes += len;
|
||||
}
|
||||
|
||||
// An event is worth raising only when a handler exists for it and this
// file is not on the file manager's ignore list.
bool File::FileEventAvailable(EventHandlerPtr h) {
    return h && ! file_mgr->IsIgnored(id);
}
|
||||
|
@ -526,6 +528,9 @@ void File::FileEvent(EventHandlerPtr h) {
|
|||
}
|
||||
|
||||
void File::FileEvent(EventHandlerPtr h, Args args) {
|
||||
val->Assign(seen_bytes_idx, seen_bytes);
|
||||
val->Assign(missing_bytes_idx, missing_bytes);
|
||||
val->Assign(overflow_bytes_idx, overflow_bytes);
|
||||
event_mgr.Enqueue(h, std::move(args));
|
||||
|
||||
if ( h == file_new || h == file_over_new_connection || h == file_sniff || h == file_timeout ||
|
||||
|
|
|
@ -325,6 +325,9 @@ protected:
|
|||
bool reassembly_enabled; /**< Whether file stream reassembly is needed. */
|
||||
bool postpone_timeout; /**< Whether postponing timeout is requested. */
|
||||
bool done; /**< If this object is about to be deleted. */
|
||||
uint64_t seen_bytes; /**< Number of bytes processed for this file. */
|
||||
uint64_t missing_bytes; /**< Number of bytes missed for this file. */
|
||||
uint64_t overflow_bytes; /**< Number of bytes not delivered. */
|
||||
detail::AnalyzerSet analyzers; /**< A set of attached file analyzers. */
|
||||
std::list<Analyzer*> done_analyzers; /**< Analyzers we're done with, remembered here until they
|
||||
can be safely deleted. */
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue