diff --git a/CHANGES b/CHANGES index e8be148c8f..ce9f1852b2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,14 @@ +3.1.0-dev.519 | 2020-01-31 13:19:09 -0800 + + * util: optimize tokenize_string() and normalize_path() (Max Kellermann) + + This patch can speed up Zeek startup by 10-25%. + + Adds a new tokenize_string() overload which returns string_views and + changes existing tokenize_string() and normalize_path() to use string_view + arguments. + 3.1.0-dev.510 | 2020-01-31 11:20:28 -0800 * Remove extra fmt() in a reporter->Error() call (Jon Siwek, Corelight) diff --git a/VERSION b/VERSION index 277b45b4e9..19a258a590 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.1.0-dev.510 +3.1.0-dev.519 diff --git a/src/util.cc b/src/util.cc index 36d346a882..28f95a002e 100644 --- a/src/util.cc +++ b/src/util.cc @@ -838,8 +838,7 @@ bool ensure_intermediate_dirs(const char* dirname) bool absolute = dirname[0] == '/'; string path = normalize_path(dirname); - vector path_components; - tokenize_string(path, "/", &path_components); + const auto path_components = tokenize_string(path, '/'); string current_dir; @@ -1500,28 +1499,50 @@ TEST_CASE("util tokenize_string") v2.clear(); tokenize_string("/wrong/delim", ",", &v2); CHECK(v2.size() == 1); + + auto svs = tokenize_string("one,two,three,four,", ','); + std::vector expect{"one", "two", "three", "four", ""}; + CHECK(svs == expect); } -vector* tokenize_string(string input, const string& delim, +vector* tokenize_string(const std::string_view input, const std::string_view delim, vector* rval, int limit) { if ( ! rval ) rval = new vector(); + size_t pos = 0; size_t n; auto found = 0; - while ( (n = input.find(delim)) != string::npos ) + while ( (n = input.find(delim, pos)) != string::npos ) { ++found; - rval->push_back(input.substr(0, n)); - input.erase(0, n + 1); + rval->emplace_back(input.substr(pos, n - pos)); + pos = n + 1; if ( limit && found == limit ) break; } - rval->push_back(input); + rval->emplace_back(input.substr(pos)); + return rval; + } + +vector tokenize_string(const std::string_view input, const char delim) noexcept + { + vector rval; + + size_t pos = 0; + size_t n; + + while ( (n = input.find(delim, pos)) != string::npos ) + { + rval.emplace_back(input.substr(pos, n - pos)); + pos = n + 1; + } + + rval.emplace_back(input.substr(pos)); return rval; } @@ -1552,26 +1573,27 @@ TEST_CASE("util normalize_path") CHECK(normalize_path("zeek/../..") == ".."); } -string normalize_path(const string& path) +string normalize_path(const std::string_view path) { size_t n; - vector components, final_components; + vector final_components; string new_path; + new_path.reserve(path.size()); - if ( path[0] == '/' ) + if ( ! path.empty() && path[0] == '/' ) new_path = "/"; - tokenize_string(path, "/", &components); + const auto components = tokenize_string(path, '/'); + final_components.reserve(components.size()); - vector::const_iterator it; - for ( it = components.begin(); it != components.end(); ++it ) + for ( auto it = components.begin(); it != components.end(); ++it ) { if ( *it == "" ) continue; + if ( *it == "." && it != components.begin() ) continue; + final_components.push_back(*it); - if ( *it == "." && it != components.begin() ) - final_components.pop_back(); - else if ( *it == ".." ) + if ( *it == ".." ) { auto cur_idx = final_components.size() - 1; @@ -1598,7 +1620,7 @@ string normalize_path(const string& path) } } - for ( it = final_components.begin(); it != final_components.end(); ++it ) + for ( auto it = final_components.begin(); it != final_components.end(); ++it ) { new_path.append(*it); new_path.append("/"); @@ -1614,8 +1636,7 @@ string without_bropath_component(const string& path) { string rval = normalize_path(path); - vector paths; - tokenize_string(bro_path(), ":", &paths); + const auto paths = tokenize_string(bro_path(), ':'); for ( size_t i = 0; i < paths.size(); ++i ) { diff --git a/src/util.h b/src/util.h index 3065652bea..bd2447c23f 100644 --- a/src/util.h +++ b/src/util.h @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -145,10 +146,12 @@ inline std::string get_escaped_string(const std::string& str, bool escape_all) return get_escaped_string(str.data(), str.length(), escape_all); } -std::vector* tokenize_string(std::string input, - const std::string& delim, +std::vector* tokenize_string(std::string_view input, + std::string_view delim, std::vector* rval = 0, int limit = 0); +std::vector tokenize_string(const std::string_view input, const char delim) noexcept; + extern char* copy_string(const char* s); extern int streq(const char* s1, const char* s2); @@ -343,7 +346,7 @@ std::string flatten_script_name(const std::string& name, * @param path A filesystem path. * @return A canonical/shortened version of \a path. */ -std::string normalize_path(const std::string& path); +std::string normalize_path(std::string_view path); /** * Strip the ZEEKPATH component from a path.