zeek/testing/btest/opt/AST-side-effects.zeek
Arne Welzel 93a3a11d36 Merge remote-tracking branch 'origin/topic/vern/script-opt-keep-asserts'
* origin/topic/vern/script-opt-keep-asserts:
  ZAM documentation updates for asserts and event handler run-time errors
  BTest updates for ZAM support of (optionally) keeping "assert" statements
  command-line options for controlling script optimization: keeping asserts, avoiding event handler coalescence
  ZAM support for option to not coalesce event handlers
  ZAM support for keeping "assert" statements
  internal support for script optimization options for keeping asserts, not consolidating event handlers
  ZAM operations to support asserts
  simplified "assert" by not trying to catch messages that themselves have errors

Fixed some TEST-REQUIRES "${ZEEK_ZAM}" == "1" to use "=" instead to
be /bin/sh compatible.
2024-12-05 21:45:07 +01:00

405 lines
14 KiB
Text

# @TEST-DOC: Stress tests for the AST optimizer dealing with side effects.
# @TEST-REQUIRES: test "${ZEEK_ZAM}" = "1"
#
# See below for an explanation of this convoluted invocation line.
# @TEST-EXEC: zeek -b -O ZAM -O dump-xform --optimize-func='AST_opt_test_.*' %INPUT >output
# @TEST-EXEC: TEST_DIFF_CANONIFIER=$SCRIPTS/diff-remove-abspath btest-diff output
# This is a subtle & extensive test of the AST optimizer used by ZAM, in
# particular its CSE = Common Subexpression Elimination functionality, whereby
# it reuses previously computed values rather than recomputing them. This test
# in particular focuses on that functionality in the context of side-effects
# that arise due to either implicit or explicit function calls. The implicit
# ones (such as table &default functions) are the trickiest, because they
# can in principle access or modify state that leads to *other* implicit
# function calls.
#
# Unlike nearly all Zeek BTests, the heart of the output is intermediary
# information produced by "-O dump-xform", not the final output. The dump
# implicitly reveals where the AST optimizer reuses expressions previously
# computed (into temporaries) and where it refrains from doing so because
# in principle the re-use is unsafe due to possible modifications to the
# original expression. Many of the tests print the same set of values twice in
# a row, often to see whether the first "print" requires correctly re-computing
# an expression or inheriting its value from a previous computation. The
# second "print" generally ensures that given the lack of potential
# side-effects, the values from the first "print" are re-used, although for
# some tests they need to instead be re-computed.
#
# If the dumped intermediary information agrees but there's a difference in
# the final output as the Zeek script executes, that reflects a bug subsequent
# to the AST optimizer (typically, this will be in ZAM's low-level optimizer).
#
# Changes to the AST processing used by script optimization can introduce
# benign differences to the intermediary information, such as introducing
# or removing some temporary variables. These require hand inspection to
# determine whether they reflect a problem.
#
# We divide the tests up into a number of different event handlers. This
# keeps the AST for each group of tests from getting too complex (especially
# if later tests wind up reusing values from much earlier tests, which can
# be hard to manually verify for correctness). We enable ZAM optimization
# for *only* those handlers to avoid complications in the output unrelated
# to these tests. We use distinct event handlers, rather than multiple
# handlers for the same event, to avoid coalescence of the handlers. Doing
# this explicitly rather than using "-O no-inline" is preferrable because
# it means the testing still includes any issues that the inliner might
# introduce.
#
# Any time you make a change to the final output the script produces, confirm
# that running Zeek *without* script optimization produces that same output.
########################################################################
# A global that in some contexts is directly modified as a side-effect.
global g1 = 4;
# A global that is never directly modified as a side-effect. However, some
# side effects have Unknown consequences - for those, the optimizer should
# assume that any expression derived from a global value needs to re-computed.
global g2 = 44;
# A table with a safe &default. However, in some contexts the tests will
# introduce other tables with the same type signature whose &default's have
# side effects. Due to the complexity of potential aliasing among Zeek
# container objects of like types, those side effects should be presumed
# to potentially modify this global.
global tbl_c_of_c = table([2] = 3, [3] = 5) &default=456;
# A function with no side effects.
function benign(a: count): count
{
return a + 3;
}
# Calls a BiF that should be known to the optimizer as having no side effects.
function safe_bif(): string
{
return fmt("%s", g1 * 7);
}
# Calls a BiF that the optimizer doesn't know as having no side effects,
# and which should therefore be treated as having Unknown side effects.
function dangerous_bif(): count
{
local l = tbl_c_of_c[2];
clear_table(tbl_c_of_c);
return l;
}
event AST_opt_test_1()
{
# For the following, the optimizer should reuse "g1 * 2" and
# "tbl_c_of_c[2]" for the later print's ...
print g1 * 2, tbl_c_of_c[2];
print benign(4);
print g1 * 2, tbl_c_of_c[2];
print safe_bif();
print g1 * 2, tbl_c_of_c[2];
# ... but *not* after this call. In addition, even though the call
# doesn't modify "g1", it should still be reloaded because
# the optimizer only knows that the BiF is unsafe = it has Unknown
# effects.
print dangerous_bif();
print g1 * 2, tbl_c_of_c[2];
print g1 * 2, tbl_c_of_c[2];
}
########################################################################
# A function that modifies our global-of-interest.
function mod_g1(a: addr): count
{
return ++g1;
}
global tbl_addr_of_count = table([1.2.3.4] = 1001, [2.3.4.5] = 10002) &default=mod_g1;
event AST_opt_test_2()
{
# The optimizer should reload "g1" after each tbl_addr_of_count
# reference, but reuse "g2 * 3" and "tbl_c_of_c[2]", since those
# can't be affected by mod_g1().
print g1 * 2, g2 * 3, tbl_c_of_c[2];
print tbl_addr_of_count[1.2.3.4];
print g1 * 2, g2 * 3, tbl_c_of_c[2];
print tbl_addr_of_count[127.0.0.1];
print g1 * 2, g2 * 3, tbl_c_of_c[2];
}
########################################################################
# A global that controls whether the following functions change an entry
# in one of our global tables.
global mess_is_active = F;
# We use a separate function for actually changing the global to make sure
# that the AST analysis follows the effects of function calls.
function do_the_messing()
{
tbl_c_of_c[2] = 999;
}
# An &on_change handler that potentially alters the value of the table.
function mess_with_tbl_c_of_c(tbl: table[bool] of count, tc: TableChange, ind: bool, val: count)
{
if ( mess_is_active )
do_the_messing();
}
global tcc_mod = table([F] = 44, [T] = 55) &on_change=mess_with_tbl_c_of_c;
event AST_opt_test_3()
{
# The optimizer should re-use "tbl_c_of_c[2]" in each of the secondary
# print's, but it should recompute it after each modification to
# "tcc_mod", even though the first one won't lead to a potential change.
print tbl_c_of_c[2];
print tbl_c_of_c[2];
tcc_mod[F] = 33;
print tbl_c_of_c[2];
print tbl_c_of_c[2];
mess_is_active = T;
tcc_mod[T] = 66;
print tbl_c_of_c[2];
print tbl_c_of_c[2];
}
########################################################################
# Analogous to tbl_c_of_c but has a function call for its &default.
global tbl_c_of_c2 = table([1] = 4, [4] = 8) &default=benign;
event AST_opt_test_4()
{
# In the following, for the duplicated prints the optimizer should
# reuse the previous values. That includes for the accesses to
# local_tbl_c_of_c3 (which associates a more complex &default function
# to "table[count] of count" types).
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], tbl_c_of_c2[10];
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], tbl_c_of_c2[10];
local local_tbl_c_of_c3 = table([4] = 1, [12] = 0)
&default=function(c: count): count { return benign(c+7) - 2; };
# We print at this separately to make sure it occurs prior to
# potentially computing the other elements in the print.
print local_tbl_c_of_c3[12];
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], local_tbl_c_of_c3[12];
# Same with separate printing here.
print local_tbl_c_of_c3[10];
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], local_tbl_c_of_c3[10];
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], local_tbl_c_of_c3[10];
# This BiF should lead to recomputing of all values, including
# the local local_tbl_c_of_c3.
print dangerous_bif();
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], local_tbl_c_of_c3[12];
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], local_tbl_c_of_c3[12], local_tbl_c_of_c3[10];
}
########################################################################
# Used to introduce another global as having side effects, this time in
# the context of a local table.
global my_exponential = 2.0;
event AST_opt_test_5()
{
# A similar test, but this time the second local has side effects,
# and aliases type-wise with the first local, so expressions with
# the first local should be recomputed for every access.
#
# The notion of aggregate aliases is important because in general
# with static analysis it can be extremely difficult to tell whether
# two instances of an aggregate, each with the same type, might in
# fact refer to the same underlying aggregate value. The optimizer
# thus needs to play it safe and refrain from only focusing on the
# instance that directly has the attribute.
local side_effect_free = table([0] = 0.5, [1] = 1.5);
print side_effect_free[1];
print side_effect_free[1];
local has_side_effects = table([0] = -0.5, [1] = -1.5)
&default_insert=function(c: count): double
{
my_exponential = my_exponential * my_exponential;
return my_exponential;
};
print side_effect_free[1], has_side_effects[2];
print side_effect_free[1], has_side_effects[2];
}
########################################################################
global tbl_c_of_b = table([100] = F, [101] = T);
global tbl_s_of_s = table(["1"] = "4", ["4"] = "8") &default="no-side-effects";
global tbl_s_of_s2 = table(["1"] = "4", ["4"] = "8") &default="ultimately-side-effects";
event AST_opt_test_6()
{
# Without the final statement of this handler, the second one of these
# prints would re-use all of the values. However, it instead can only
# reuse the first three, as explained below.
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], tbl_c_of_b[100], tbl_s_of_s["4"];
print g1 * 2, tbl_c_of_c[2], tbl_c_of_c2[1], tbl_c_of_b[100], tbl_s_of_s["4"];
# This should force recomputation of tbl_c_of_b above (because its
# type aliases with one that has side effects on loads, so loads
# must not be optimized away) AND THEREFORE also tbl_s_of_s (because
# those loads affect an alias of its type).
print table([11] = F)
&default=function(c: count): bool
{ tbl_s_of_s2["2"] = "two"; return |tbl_s_of_s2| < 9; };
}
########################################################################
# Here we test &default side-effects of record constructors. We also use
# this as an opportunity to stress-test propagation of side effects
# when one side effect induces another, and whether AST analysis correctly
# accounts for potential type aliasing.
# Reads a local table - but that should be enough to raise the specter of
# any "table[count] of bool" access side-effects.
function local_table_read(): count
{
local l = table([99] = T);
return l[99] ? 3 : 2;
}
# The same, but modifies the table instead.
function local_table_mod(): count
{
local l = table([99] = T);
l[99] = F;
return |l|;
}
# Tickles "table[count] of bool" access side-effects.
type R1: record {
a: count &default = local_table_read();
b: string;
};
# Similar, but for modification side-effects. NOTE: type-compatible with R1.
type R2: record {
a: count &default = local_table_mod();
b: string;
};
# Just like R2 but NOT type-compatible with R1.
type R3: record {
a: count &default = local_table_mod();
b: string;
c: int &default = +5;
};
event AST_opt_test_7()
{
# In this test, it's not the mere presence of "R1" that affects
# earlier optimization (like in the previous test), but the execution
# of its constructor. So the following should proceed with the second
# completely re-using from the first.
print tbl_s_of_s["4"];
print tbl_s_of_s["4"];
# Here constructing R1 potentially invokes the &default for $a,
# which reads from a "table[count] of bool", which the previous
# test (AST_opt_test_6) has set up as potentially affecting values
# of type "table[string] of string" such as tbl_s_of_s.
local force_tbl_c_of_c_str_reload = R1($b="match-on-type-of-tbl_c_of_c");
print force_tbl_c_of_c_str_reload;
print tbl_s_of_s["4"];
print tbl_s_of_s["4"];
# Here's a similar sequence but using a record that *modifies*
# a "table[count] of bool" - which ordinarily should NOT thwart
# optimization since there's no potential &on_change attribute for
# such tables ... HOWEVER R2 is type-compatible with R1, so it
# inherits R1's side-effects, thus this WILL require a reload of
# tbl_s_of_s["4"].
local problem_r2 = R2($b="hello");
print tbl_s_of_s["4"], problem_r2$b;
# ... THIS however won't, because R3 is not type-compatible with R1,
# even though it has the same attributes as R2.
local problem_r3 = R3($b="hello again");
print tbl_s_of_s["4"], problem_r3$b;
}
########################################################################
event AST_opt_test_8()
{
# This last one is a hum-dinger. The two locals we define introduce
# mutually recursive side-effects, type-wise: the first sets up that
# a "table[count, count] of count" can access a "table[double, double]
# of double", and the latter establishes the converse. First, this
# setup requires that the profiler doesn't wind up in infinite
# recursion trying to follow dependencies. Second, it should both
# spot those *and* turn them into Unknown side-effects ... which
# means that reuse of expression involving the global g2 - which is
# never altered anywhere - that crosses an access to such a table
# should be recomputed, rather than reusing the previous
# result, because "all bets are off" for Unknown side-effects.
# The second of these should reuse the previous temporary ...
print g2 * 5;
print g2 * 5;
local l1 = table([1, 3] = 4, [2, 5] = 6)
&default = function(i1: count, i2: count): count
{
local my_tbl = table([1.0, 3.0] = 1e4);
return double_to_count(my_tbl[1.0, 3.0]);
};
local l2 = table([1.0, 3.0] = 4.0, [2.0, 5.0] = 6.0)
&default = function(d1: double, d2: double): double
{
local my_tbl = table([1, 3] = 1000);
return my_tbl[1, 3];
};
# ... as should both of these, since we haven't yet done an *access*
# to one of the tables.
print g2 * 5;
print g2 * 5;
# Here's an access.
print l1[3, 8];
# The first of these should recompute, the second should re-use.
print g2 * 5;
print g2 * 5;
# Here's an access to the other.
print l2[2.0, 5.0];
# Again, the first of these should recompute, the second should re-use.
print g2 * 5;
print g2 * 5;
}
########################################################################
event zeek_init()
{
event AST_opt_test_1();
event AST_opt_test_2();
event AST_opt_test_3();
event AST_opt_test_4();
event AST_opt_test_5();
event AST_opt_test_6();
event AST_opt_test_7();
event AST_opt_test_8();
}