From e068ad8a535797730879137a8af69864a44793d1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 11:31:10 +0100 Subject: [PATCH 1/8] util: don't modify the input string in tokenize_string() This saves one full copy of the input string and avoids moving memory around at O(n^2) in the erase() call in each loop iteration. --- src/util.cc | 11 ++++++----- src/util.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/util.cc b/src/util.cc index a9b1707297..b820aabb3a 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1501,26 +1501,27 @@ TEST_CASE("util tokenize_string") CHECK(v2.size() == 1); } -vector* tokenize_string(string input, const string& delim, +vector* tokenize_string(const string &input, const string& delim, vector* rval, int limit) { if ( ! rval ) rval = new vector(); + size_t pos = 0; size_t n; auto found = 0; - while ( (n = input.find(delim)) != string::npos ) + while ( (n = input.find(delim, pos)) != string::npos ) { ++found; - rval->push_back(input.substr(0, n)); - input.erase(0, n + 1); + rval->emplace_back(input.substr(pos, n - pos)); + pos = n + 1; if ( limit && found == limit ) break; } - rval->push_back(input); + rval->emplace_back(input.substr(pos)); return rval; } diff --git a/src/util.h b/src/util.h index 3065652bea..40c45b274e 100644 --- a/src/util.h +++ b/src/util.h @@ -145,7 +145,7 @@ inline std::string get_escaped_string(const std::string& str, bool escape_all) return get_escaped_string(str.data(), str.length(), escape_all); } -std::vector* tokenize_string(std::string input, +std::vector* tokenize_string(const std::string &input, const std::string& delim, std::vector* rval = 0, int limit = 0); From f1566bda1403d9c2f3a35d3a07c0c9c168c3b2f3 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 13:10:04 +0100 Subject: [PATCH 2/8] util: pass std::string_view to tokenize_string() This saves some overhead because some callers pass a plain C string here which needed to be copied to a temporary std::string. --- src/util.cc | 2 +- src/util.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index b820aabb3a..544b547b29 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1501,7 +1501,7 @@ TEST_CASE("util tokenize_string") CHECK(v2.size() == 1); } -vector* tokenize_string(const string &input, const string& delim, +vector* tokenize_string(const std::string_view input, const std::string_view delim, vector* rval, int limit) { if ( ! rval ) diff --git a/src/util.h b/src/util.h index 40c45b274e..9442357e0b 100644 --- a/src/util.h +++ b/src/util.h @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -145,8 +146,8 @@ inline std::string get_escaped_string(const std::string& str, bool escape_all) return get_escaped_string(str.data(), str.length(), escape_all); } -std::vector* tokenize_string(const std::string &input, - const std::string& delim, +std::vector* tokenize_string(std::string_view input, + std::string_view delim, std::vector* rval = 0, int limit = 0); extern char* copy_string(const char* s); From 0589f295fa10805af8643312be5031d0b107cc5b Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 13:25:34 +0100 Subject: [PATCH 3/8] util: pass std::string_view to normalize_path() Reduce overhead in some callers. --- src/util.cc | 4 ++-- src/util.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index 544b547b29..54b1041501 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1552,13 +1552,13 @@ TEST_CASE("util normalize_path") CHECK(normalize_path("zeek/../..") == ".."); } -string normalize_path(const string& path) +string normalize_path(const std::string_view path) { size_t n; vector components, final_components; string new_path; - if ( path[0] == '/' ) + if ( !path.empty() && path[0] == '/' ) new_path = "/"; tokenize_string(path, "/", &components); diff --git a/src/util.h b/src/util.h index 9442357e0b..b2b346f727 100644 --- a/src/util.h +++ b/src/util.h @@ -344,7 +344,7 @@ std::string flatten_script_name(const std::string& name, * @param path A filesystem path. * @return A canonical/shortened version of \a path. */ -std::string normalize_path(const std::string& path); +std::string normalize_path(std::string_view path); /** * Strip the ZEEKPATH component from a path. From 5c0c336c6b379c5af710bc8d7b93d7e044eab119 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 12:03:37 +0100 Subject: [PATCH 4/8] util: skip "." completely in normalize_path() Don't copy "." segments to the final_components list only to remove it afterwards. --- src/util.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index 54b1041501..5c8fe05013 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1567,11 +1567,11 @@ string normalize_path(const std::string_view path) for ( it = components.begin(); it != components.end(); ++it ) { if ( *it == "" ) continue; + if ( *it == "." && it != components.begin() ) continue; + final_components.push_back(*it); - if ( *it == "." && it != components.begin() ) - final_components.pop_back(); - else if ( *it == ".." ) + if ( *it == ".." ) { auto cur_idx = final_components.size() - 1; From 53c4e300241e5411d2803f4b818433594f64302d Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 12:05:47 +0100 Subject: [PATCH 5/8] util: reserve space in normalize_path() Pessimistic reservations to ensure that it does not need to be reallocated. --- src/util.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/util.cc b/src/util.cc index 5c8fe05013..1c73b4e89f 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1557,11 +1557,13 @@ string normalize_path(const std::string_view path) size_t n; vector components, final_components; string new_path; + new_path.reserve(path.size()); if ( !path.empty() && path[0] == '/' ) new_path = "/"; tokenize_string(path, "/", &components); + final_components.reserve(components.size()); vector::const_iterator it; for ( it = components.begin(); it != components.end(); ++it ) From 37cbd98e340db27d404990e1d8781d254d677b6c Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 12:24:57 +0100 Subject: [PATCH 6/8] util: use "auto" in normalize_path() --- src/util.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index 1c73b4e89f..341c8412e7 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1565,8 +1565,7 @@ string normalize_path(const std::string_view path) tokenize_string(path, "/", &components); final_components.reserve(components.size()); - vector::const_iterator it; - for ( it = components.begin(); it != components.end(); ++it ) + for ( auto it = components.begin(); it != components.end(); ++it ) { if ( *it == "" ) continue; if ( *it == "." && it != components.begin() ) continue; @@ -1600,7 +1599,7 @@ string normalize_path(const std::string_view path) } } - for ( it = final_components.begin(); it != final_components.end(); ++it ) + for ( auto it = final_components.begin(); it != final_components.end(); ++it ) { new_path.append(*it); new_path.append("/"); From 763afe6f5f8462c69647b44ab9b2d2dba2e18d96 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 12:24:16 +0100 Subject: [PATCH 7/8] util: store std::string_view in "final_components" vector Don't copy those path segments - instead, use std::string_view to store references into the existing std::strings. This saves a good amount of allocation overhead. --- src/util.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util.cc b/src/util.cc index 341c8412e7..db91af4f54 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1555,7 +1555,8 @@ TEST_CASE("util normalize_path") string normalize_path(const std::string_view path) { size_t n; - vector components, final_components; + vector components; + vector final_components; string new_path; new_path.reserve(path.size()); From 26da10ca058901c438b4b6c46e3f8dee5073f69f Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 31 Jan 2020 12:25:48 +0100 Subject: [PATCH 8/8] util: add a tokenize_string() overload which returns string_views Additionally, it uses a single "char" as delimiter, which is also faster. This patch speeds up Zeek startup by 10%. --- src/util.cc | 28 ++++++++++++++++++++++------ src/util.h | 2 ++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/util.cc b/src/util.cc index db91af4f54..d566c588a9 100644 --- a/src/util.cc +++ b/src/util.cc @@ -837,8 +837,7 @@ bool ensure_intermediate_dirs(const char* dirname) bool absolute = dirname[0] == '/'; string path = normalize_path(dirname); - vector path_components; - tokenize_string(path, "/", &path_components); + const auto path_components = tokenize_string(path, '/'); string current_dir; @@ -1525,6 +1524,25 @@ vector* tokenize_string(const std::string_view input, const std::string_ return rval; } +vector tokenize_string(const std::string_view input, const char delim) noexcept + { + vector rval; + + size_t pos = 0; + size_t n; + auto found = 0; + + while ( (n = input.find(delim, pos)) != string::npos ) + { + ++found; + rval.push_back(input.substr(pos, n - pos)); + pos = n + 1; + } + + rval.push_back(input.substr(pos)); + return rval; + } + TEST_CASE("util normalize_path") { CHECK(normalize_path("/1/2/3") == "/1/2/3"); @@ -1555,7 +1573,6 @@ TEST_CASE("util normalize_path") string normalize_path(const std::string_view path) { size_t n; - vector components; vector final_components; string new_path; new_path.reserve(path.size()); @@ -1563,7 +1580,7 @@ string normalize_path(const std::string_view path) if ( !path.empty() && path[0] == '/' ) new_path = "/"; - tokenize_string(path, "/", &components); + const auto components = tokenize_string(path, '/'); final_components.reserve(components.size()); for ( auto it = components.begin(); it != components.end(); ++it ) @@ -1616,8 +1633,7 @@ string without_bropath_component(const string& path) { string rval = normalize_path(path); - vector paths; - tokenize_string(bro_path(), ":", &paths); + const auto paths = tokenize_string(bro_path(), ':'); for ( size_t i = 0; i < paths.size(); ++i ) { diff --git a/src/util.h b/src/util.h index b2b346f727..bd2447c23f 100644 --- a/src/util.h +++ b/src/util.h @@ -150,6 +150,8 @@ std::vector* tokenize_string(std::string_view input, std::string_view delim, std::vector* rval = 0, int limit = 0); +std::vector tokenize_string(const std::string_view input, const char delim) noexcept; + extern char* copy_string(const char* s); extern int streq(const char* s1, const char* s2);