classes for evaluating function/hook/event usage

2025-10-05 08:08:19 +00:00 · 2022-05-04 22:27:49 -07:00 · 2022-05-04 22:27:49 -07:00 · 91f1ecd1ff
commit 91f1ecd1ff
parent a0fc8ca5e4
3 changed files with 304 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -398,6 +398,7 @@ set(MAIN_SRCS
    script_opt/ScriptOpt.cc
    script_opt/Stmt.cc
    script_opt/TempVar.cc
    script_opt/UsageAnalyzer.cc
    script_opt/UseDefs.cc
    script_opt/ZAM/AM-Opt.cc
--- a/src/script_opt/UsageAnalyzer.cc
+++ b/src/script_opt/UsageAnalyzer.cc
@ -0,0 +1,221 @@
 // See the file "COPYING" in the main distribution directory for copyright.
 #include "zeek/module_util.h"
 #include "zeek/script_opt/IDOptInfo.h"
 #include "zeek/script_opt/UsageAnalyzer.h"
 namespace zeek::detail
 	{
 // Names of the identifiers passed to register_new_event(), i.e. events
 // that were not previously known before their declaration in the scripts.
 // FindSeeds() consults this set: an event handler whose name appears here
 // is not treated as a seed solely on account of being an event handler.
 std::unordered_set<std::string> script_events;
 // Records the name of "id" in script_events, flagging it as an event
 // that first became known through its declaration in a script.
 void register_new_event(const IDPtr& id)
 	{
 	const std::string event_name = id->Name();
 	script_events.insert(event_name);
 	}
 // Runs the whole usage analysis: computes the seed identifiers, expands
 // them to the full set of reachable identifiers, and then issues a warning
 // for every event handler, hook and script function that is not reachable.
 //
 // "funcs" is accepted for interface reasons; this constructor works from
 // the global scope and does not consult it.
 UsageAnalyzer::UsageAnalyzer(std::vector<FuncInfo>& funcs)
 	{
 	// Setting a scope cues ID::Traverse to delve into function values.
 	current_scope = global_scope();
 	FindSeeds(reachables);
 	FullyExpandReachables();
 	// At this point, we've done the complete reachability analysis.
 	// Report out on unreachables.  We do this in two steps: first,
 	// unreachable events/hooks, and then unreachable functions.  We
 	// split the two because we don't want to ding a function as being
 	// unreachable if there's an (unreachable) event-or-hook that calls
 	// it, since presumably the real problem is the latter being an
 	// orphan, rather than the function.
 	auto& globals = global_scope()->Vars();
 	for ( auto& gpair : globals )
 		{
 		auto id = gpair.second.get();
 		auto& t = id->GetType();
 		if ( t->Tag() != TYPE_FUNC )
 			continue;
 		// Plain functions are handled by the second pass below.
 		if ( t->AsFuncType()->Flavor() == FUNC_FLAVOR_FUNCTION )
 			continue;
 		if ( reachables.count(id) > 0 )
 			continue;
 		auto flavor = t->AsFuncType()->FlavorString();
 		auto loc = id->GetLocationInfo();
 		reporter->Warning("%s %s (%s:%d): cannot be invoked", flavor.c_str(), id->Name(), loc->filename, loc->first_line);
 		// Don't ding any functions that are reachable via this
 		// identifier.  This will also suppress flagging other events
 		// and hooks, depending on order-of-traversal.  That seems
 		// fine, as the key is to find the root of such issues.
 		reachables.insert(id);
 		// Expanding only populates new_reachables, so the findings
 		// have to be merged into reachables explicitly; otherwise
 		// the second pass (which looks solely at reachables) would
 		// still flag them.  Iterate until nothing new turns up.
 		IDSet frontier;
 		frontier.insert(id);
 		while ( ExpandReachables(frontier) )
 			{
 			// Copy, since ExpandReachables() clears new_reachables
 			// before filling it again.
 			frontier = new_reachables;
 			reachables.insert(frontier.begin(), frontier.end());
 			}
 		}
 	// Now make a second pass, focusing solely on functions.
 	for ( auto& gpair : globals )
 		{
 		auto& id = gpair.second;
 		if ( reachables.count(id.get()) > 0 )
 			continue;
 		// Only identifiers bound to script-level functions qualify.
 		auto f = GetFuncIfAny(id);
 		if ( ! f )
 			continue;
 		auto loc = id->GetLocationInfo();
 		reporter->Warning("function %s (%s:%d): cannot be called", id->Name(), loc->filename, loc->first_line);
 		// Unlike for events/hooks above, we don't add the function to
 		// the reachables.  This is because an orphan function is a
 		// somewhat more significant potential error than an orphan
 		// event handler or hook, as the latter can arise from simple
 		// typos (because there will be a declaration elsewhere that
 		// they're supposed to match), whereas orphan functions in
 		// general will not.
 		}
 	}
 // Adds to "seeds" every global identifier that is taken as used without
 // further evidence: ones marked &is_used or &deprecated, event handlers
 // whose name is not in script_events, and (for everything that is not an
 // event handler with a script body) exported or GLOBAL-module identifiers.
 void UsageAnalyzer::FindSeeds(IDSet& seeds) const
 	{
 	for ( auto& entry : global_scope()->Vars() )
 		{
 		auto& global_id = entry.second;
 		const bool explicitly_kept = global_id->GetAttr(ATTR_IS_USED) || global_id->GetAttr(ATTR_DEPRECATED);
 		if ( explicitly_kept )
 			{
 			seeds.insert(global_id.get());
 			continue;
 			}
 		auto script_func = GetFuncIfAny(global_id);
 		const bool is_event_handler = script_func && global_id->GetType<FuncType>()->Flavor() == FUNC_FLAVOR_EVENT;
 		if ( is_event_handler )
 			{
 			// Events registered via register_new_event() are not
 			// seeds; all other event handlers are.
 			const bool script_declared = script_events.count(script_func->Name()) != 0;
 			if ( ! script_declared )
 				seeds.insert(global_id.get());
 			continue;
 			}
 		// Anything exported, or living in the GLOBAL module, is
 		// presumed to be intended for use even if the scripts at
 		// hand never touch it.
 		const bool publicly_visible = global_id->IsExport() || global_id->ModuleName() == "GLOBAL";
 		if ( publicly_visible )
 			seeds.insert(global_id.get());
 		}
 	}
 // Returns the script-level function bound to "id", or nullptr if "id"
 // is not of function type, has no value, or its function is not of
 // kind Func::SCRIPT_FUNC.
 const Func* UsageAnalyzer::GetFuncIfAny(const ID* id) const
 	{
 	if ( id->GetType()->Tag() != TYPE_FUNC )
 		return nullptr;
 	auto func_val = cast_intrusive<FuncVal>(id->GetVal());
 	if ( ! func_val )
 		// Function-typed identifier without a value.
 		return nullptr;
 	auto candidate = func_val->Get();
 	if ( candidate->GetKind() != Func::SCRIPT_FUNC )
 		return nullptr;
 	return candidate;
 	}
 void UsageAnalyzer::FullyExpandReachables()
 	{
 	// We use the following structure to avoid having to copy
 	// the initial set of reachables, which can be quite large.
 	if ( ExpandReachables(reachables) )
 		{
 		auto r = new_reachables;
 		reachables.insert(r.begin(), r.end());
 		while ( ExpandReachables(r) )
 			{
 			r = new_reachables;
 			reachables.insert(r.begin(), r.end());
 			}
 		}
 	}
 bool UsageAnalyzer::ExpandReachables(const IDSet& curr_r)
 	{
 	new_reachables.clear();
 	for ( auto r : curr_r )
 		Expand(r);
 	return ! new_reachables.empty();
 	}
 // Traverses "id" with this object as the traversal callback, after
 // resetting the per-traversal record of visited identifiers.
 void UsageAnalyzer::Expand(const ID* id)
 	{
 	// Exported globals that refer to functions which themselves generate
 	// events pose a subtle problem.  Values of identifiers are not
 	// traversed (there is no Traverse infrastructure for Val classes),
 	// so such an identifier can first show up in a seeding context,
 	// where it can't be associated with its function, and then show up
 	// again when that function is actually analyzed.
 	//
 	// Special-casing the seeding phase is tempting, but becomes hard
 	// when the global doesn't refer to the function directly and
 	// instead incorporates a type with an attribute that uses the
 	// function.  Hence identifiers may be re-visited across traversals,
 	// and are suppressed only once within a single analysis traversal
 	// (which saves a good deal of computation).
 	analyzed_IDs.clear();
 	id->Traverse(this);
 	}
 // Traversal hook for identifiers.  Records script functions that are not
 // yet in "reachables" as newly reachable, and traverses the identifier's
 // type, attributes and initialization expressions.
 TraversalCode UsageAnalyzer::PreID(const ID* id)
 	{
 	// insert() reports whether the identifier was newly added; if it
 	// was already present, this traversal has analyzed it before and
 	// there's no need to repeat the work.
 	const bool first_visit = analyzed_IDs.insert(id).second;
 	if ( ! first_visit )
 		return TC_ABORTSTMT;
 	// A script function we haven't classified as reachable yet.
 	if ( GetFuncIfAny(id) && reachables.count(id) == 0 )
 		new_reachables.insert(id);
 	id->GetType()->Traverse(this);
 	if ( auto& id_attrs = id->GetAttrs() )
 		id_attrs->Traverse(this);
 	// Function calls or lambdas inside initialization expressions can
 	// link to yet other identifiers.
 	for ( auto& init_expr : id->GetOptInfo()->GetInitExprs() )
 		{
 		if ( ! init_expr )
 			continue;
 		init_expr->Traverse(this);
 		}
 	return TC_CONTINUE;
 	}
 // Traversal hook for types.  Each type is traversed at most once over
 // the lifetime of this analyzer.
 TraversalCode UsageAnalyzer::PreType(const Type* t)
 	{
 	// insert() only succeeds for a type not seen before; for one
 	// already recorded we save processing by skipping re-traversal.
 	const bool newly_seen = analyzed_types.insert(t).second;
 	return newly_seen ? TC_CONTINUE : TC_ABORTSTMT;
 	}
 	} // namespace zeek::detail
--- a/src/script_opt/UsageAnalyzer.h
+++ b/src/script_opt/UsageAnalyzer.h
@ -0,0 +1,82 @@
 // See the file "COPYING" in the main distribution directory for copyright.
 // Classes for analyzing the usage of functions, hooks & events in order
 // to locate any that cannot actually be invoked.
 #pragma once
 #include "zeek/Traverse.h"
 #include "zeek/script_opt/ScriptOpt.h"
 namespace zeek::detail
 	{
 // Traversal callback that determines which functions, hooks and event
 // handlers are reachable from a set of seed identifiers.  Constructing
 // an instance performs the analysis and reports whatever is unreachable.
 class UsageAnalyzer : public TraversalCallback
 	{
 public:
 	// "funcs" contains the entire set of ASTs.  The constructor is
 	// explicit so that a vector of FuncInfo never converts silently
 	// into an analyzer (and thereby triggers a full analysis).
 	explicit UsageAnalyzer(std::vector<FuncInfo>& funcs);
 private:
 	using IDSet = std::unordered_set<const ID*>;
 	// Finds the set of identifiers that serve as a starting point of
 	// what's-known-to-be-used.  An identifier qualifies as such if it is
 	// (1) marked as either &is_used or &deprecated (the latter as a way
 	// to flag identifiers that in fact are not used and will be removed
 	// in the future), or (2) an event handler with a script body whose
 	// event was not registered via register_new_event(), i.e. was not
 	// newly introduced by a script, or (3) any other identifier that's
 	// either in the GLOBAL module or exported from its module (so
 	// clearly meant for use by other scripts).
 	void FindSeeds(IDSet& seeds) const;
 	// Given an identifier, return its corresponding script function,
 	// or nil if that's not applicable.
 	const Func* GetFuncIfAny(const ID* id) const;
 	const Func* GetFuncIfAny(const IDPtr& id) const { return GetFuncIfAny(id.get()); }
 	// Iteratively follows reachability across the set of reachable
 	// identifiers (starting with the seeds) until there's no more to reap.
 	void FullyExpandReachables();
 	// Populates new_reachables with identifiers newly reachable (directly)
 	// from curr_r.  Returns true if any were found.
 	bool ExpandReachables(const IDSet& curr_r);
 	// For a given identifier, populates new_reachables with new
 	// identifiers directly reachable from it.
 	void Expand(const ID* id);
 	// Hooks into AST traversal to find reachable functions/hooks/events.
 	TraversalCode PreID(const ID* id) override;
 	// We traverse types, too, as their attributes can include lambdas
 	// that we need to incorporate.
 	TraversalCode PreType(const Type* t) override;
 	// The identifiers we've currently determined are (ultimately)
 	// reachable from the seeds.
 	IDSet reachables;
 	// Newly-reachable identifiers-of-interest.  This is a member variable
 	// rather than a parameter to ExpandReachables() because the coupling
 	// to populating it is indirect, via AST traversal.
 	IDSet new_reachables;
 	// The following are used to avoid redundant computation.  Note that
 	// they differ in that the first is per-traversal, while the second
 	// is global across all our analyses.  See Expand() for a discussion
 	// of why the first needs to be per-traversal.
 	// All of the identifiers we've analyzed during the current traversal.
 	IDSet analyzed_IDs;
 	// All of the types we've analyzed to date.
 	std::unordered_set<const Type*> analyzed_types;
 	};
 // Marks a given identifier as referring to a script-level event (one
 // not previously known before its declaration in a script).  The usage
 // analysis does not treat handlers of such events as seeds merely for
 // being event handlers.
 extern void register_new_event(const IDPtr& id);
 	} // namespace zeek::detail