Merge branch 'optimize_normalize_path' of https://github.com/MaxKellermann/zeek

- Minor changes in merge: extended unit test, prefer emplace_back(),
  remove unused "found" count in new function

* 'optimize_normalize_path' of https://github.com/MaxKellermann/zeek:
  util: add a tokenize_string() overload which returns string_views
  util: store std::string_view in "final_components" vector
  util: use "auto" in normalize_path()
  util: reserve space in normalize_path()
  util: skip "." completely in normalize_path()
  util: pass std::string_view to normalize_path()
  util: pass std::string_view to tokenize_string()
  util: don't modify the input string in tokenize_string()
This commit is contained in:
Jon Siwek 2020-01-31 13:19:09 -08:00
commit d39bb42b14
4 changed files with 57 additions and 23 deletions

10
CHANGES
View file

@ -1,4 +1,14 @@
3.1.0-dev.519 | 2020-01-31 13:19:09 -0800
* util: optimize tokenize_string() and normalize_path() (Max Kellermann)
This patch can speed up Zeek startup by 10-25%.
Adds a new tokenize_string() overload which returns string_views and
changes existing tokenize_string() and normalize_path() to use string_view
arguments.
3.1.0-dev.510 | 2020-01-31 11:20:28 -0800
* Remove extra fmt() in a reporter->Error() call (Jon Siwek, Corelight)

View file

@ -1 +1 @@
3.1.0-dev.510 3.1.0-dev.519

View file

@ -838,8 +838,7 @@ bool ensure_intermediate_dirs(const char* dirname)
bool absolute = dirname[0] == '/'; bool absolute = dirname[0] == '/';
string path = normalize_path(dirname); string path = normalize_path(dirname);
vector<string> path_components; const auto path_components = tokenize_string(path, '/');
tokenize_string(path, "/", &path_components);
string current_dir; string current_dir;
@ -1500,28 +1499,50 @@ TEST_CASE("util tokenize_string")
v2.clear(); v2.clear();
tokenize_string("/wrong/delim", ",", &v2); tokenize_string("/wrong/delim", ",", &v2);
CHECK(v2.size() == 1); CHECK(v2.size() == 1);
auto svs = tokenize_string("one,two,three,four,", ',');
std::vector<std::string_view> expect{"one", "two", "three", "four", ""};
CHECK(svs == expect);
} }
vector<string>* tokenize_string(string input, const string& delim, vector<string>* tokenize_string(const std::string_view input, const std::string_view delim,
vector<string>* rval, int limit) vector<string>* rval, int limit)
{ {
if ( ! rval ) if ( ! rval )
rval = new vector<string>(); rval = new vector<string>();
size_t pos = 0;
size_t n; size_t n;
auto found = 0; auto found = 0;
while ( (n = input.find(delim)) != string::npos ) while ( (n = input.find(delim, pos)) != string::npos )
{ {
++found; ++found;
rval->push_back(input.substr(0, n)); rval->emplace_back(input.substr(pos, n - pos));
input.erase(0, n + 1); pos = n + 1;
if ( limit && found == limit ) if ( limit && found == limit )
break; break;
} }
rval->push_back(input); rval->emplace_back(input.substr(pos));
return rval;
}
vector<std::string_view> tokenize_string(const std::string_view input, const char delim) noexcept
{
vector<std::string_view> rval;
size_t pos = 0;
size_t n;
while ( (n = input.find(delim, pos)) != string::npos )
{
rval.emplace_back(input.substr(pos, n - pos));
pos = n + 1;
}
rval.emplace_back(input.substr(pos));
return rval; return rval;
} }
@ -1552,26 +1573,27 @@ TEST_CASE("util normalize_path")
CHECK(normalize_path("zeek/../..") == ".."); CHECK(normalize_path("zeek/../..") == "..");
} }
string normalize_path(const string& path) string normalize_path(const std::string_view path)
{ {
size_t n; size_t n;
vector<string> components, final_components; vector<std::string_view> final_components;
string new_path; string new_path;
new_path.reserve(path.size());
if ( path[0] == '/' ) if ( ! path.empty() && path[0] == '/' )
new_path = "/"; new_path = "/";
tokenize_string(path, "/", &components); const auto components = tokenize_string(path, '/');
final_components.reserve(components.size());
vector<string>::const_iterator it; for ( auto it = components.begin(); it != components.end(); ++it )
for ( it = components.begin(); it != components.end(); ++it )
{ {
if ( *it == "" ) continue; if ( *it == "" ) continue;
if ( *it == "." && it != components.begin() ) continue;
final_components.push_back(*it); final_components.push_back(*it);
if ( *it == "." && it != components.begin() ) if ( *it == ".." )
final_components.pop_back();
else if ( *it == ".." )
{ {
auto cur_idx = final_components.size() - 1; auto cur_idx = final_components.size() - 1;
@ -1598,7 +1620,7 @@ string normalize_path(const string& path)
} }
} }
for ( it = final_components.begin(); it != final_components.end(); ++it ) for ( auto it = final_components.begin(); it != final_components.end(); ++it )
{ {
new_path.append(*it); new_path.append(*it);
new_path.append("/"); new_path.append("/");
@ -1614,8 +1636,7 @@ string without_bropath_component(const string& path)
{ {
string rval = normalize_path(path); string rval = normalize_path(path);
vector<string> paths; const auto paths = tokenize_string(bro_path(), ':');
tokenize_string(bro_path(), ":", &paths);
for ( size_t i = 0; i < paths.size(); ++i ) for ( size_t i = 0; i < paths.size(); ++i )
{ {

View file

@ -25,6 +25,7 @@
#include <cstdint> #include <cstdint>
#include <string> #include <string>
#include <string_view>
#include <array> #include <array>
#include <vector> #include <vector>
#include <stdio.h> #include <stdio.h>
@ -145,10 +146,12 @@ inline std::string get_escaped_string(const std::string& str, bool escape_all)
return get_escaped_string(str.data(), str.length(), escape_all); return get_escaped_string(str.data(), str.length(), escape_all);
} }
std::vector<std::string>* tokenize_string(std::string input, std::vector<std::string>* tokenize_string(std::string_view input,
const std::string& delim, std::string_view delim,
std::vector<std::string>* rval = 0, int limit = 0); std::vector<std::string>* rval = 0, int limit = 0);
std::vector<std::string_view> tokenize_string(const std::string_view input, const char delim) noexcept;
extern char* copy_string(const char* s); extern char* copy_string(const char* s);
extern int streq(const char* s1, const char* s2); extern int streq(const char* s1, const char* s2);
@ -343,7 +346,7 @@ std::string flatten_script_name(const std::string& name,
* @param path A filesystem path. * @param path A filesystem path.
* @return A canonical/shortened version of \a path. * @return A canonical/shortened version of \a path.
*/ */
std::string normalize_path(const std::string& path); std::string normalize_path(std::string_view path);
/** /**
* Strip the ZEEKPATH component from a path. * Strip the ZEEKPATH component from a path.