diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index cee9ed9970..d22d370914 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -34,15 +34,6 @@ jobs:
with:
submodules: "recursive"
- # Only reset the submodule pointer for scheduled builds. The reason to do
- # this is to pick up any merge commits or anything that may have been
- # missed in a merge, but not have any actual content. We don't want to do
- # it otherwise because PRs should just use the submodule they're pointing
- # at.
- - name: Switch doc submodule to master
- if: github.event_name == 'schedule'
- run: cd doc && git checkout master
-
- name: Fetch Dependencies
run: |
sudo apt-get update
@@ -119,9 +110,6 @@ jobs:
cd doc
- echo "*** Running pre-commit ***"
- pre-commit run -a --show-diff-on-failure --color=always
-
echo "*** Generating Sphinx Docs ***"
make > make.out 2>&1
make_status=$?
@@ -132,7 +120,7 @@ jobs:
grep -q WARNING make.out && exit 1
rm make.out
- - name: Push zeek-docs Changes
+ - name: Push docs Changes
if: github.event_name == 'schedule'
run: |
cd doc
@@ -142,16 +130,6 @@ jobs:
# with a check that detects whether there's anything staged.
git diff-index --cached --quiet HEAD || { git commit -m "Generate docs" && git push; }
- - name: Update zeek-docs Submodule
- if: github.event_name == 'schedule'
- run: |
- git config --global user.name zeek-bot
- git config --global user.email info@zeek.org
- git add doc
- git status
- # Similar logic here: proceed only if there's a change in the submodule.
- git diff-index --cached --quiet HEAD || { git commit -m 'Update doc submodule [nomail] [skip ci]' && git push; }
-
- name: Send email
# Only send notifications for scheduled runs. Runs from pull requests
# show failures in the GitHub UI.
diff --git a/.gitignore b/.gitignore
index 27d0bc390b..d1586f6fc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,9 @@
build*
!ci/windows/build.cmd
+# Don't ignore things in the docs directory
+!doc/**
+
tmp
*.gcov
diff --git a/.gitmodules b/.gitmodules
index b9cfdcb91b..01d43a6468 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,9 +16,6 @@
[submodule "auxil/netcontrol-connectors"]
path = auxil/netcontrol-connectors
url = https://github.com/zeek/zeek-netcontrol
-[submodule "doc"]
- path = doc
- url = https://github.com/zeek/zeek-docs
[submodule "auxil/paraglob"]
path = auxil/paraglob
url = https://github.com/zeek/paraglob
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a557470b29..d77a7008be 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,7 +10,7 @@ repos:
language: python
files: '\.(h|c|cpp|cc|spicy|evt)$'
types: [file]
- exclude: '^(testing/btest/(Baseline|plugins|spicy|scripts)/.*|testing/builtin-plugins/.*|src/3rdparty/.*)$'
+ exclude: '^(testing/btest/(Baseline|plugins|spicy|scripts)/.*|testing/builtin-plugins/.*|src/3rdparty/.*|doc/.*)$'
- id: btest-command-commented
name: Check that all BTest command lines are commented out
@@ -56,4 +56,4 @@ repos:
rev: v0.26.0
hooks:
- id: spicy-format
- exclude: '^testing/.*'
+ exclude: '^(testing/.*|doc/devel/spicy/autogen/.*)'
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 0000000000..fde3c16ef8
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,16 @@
+version: 2
+
+formats:
+ - htmlzip
+
+build:
+ os: ubuntu-24.04
+ tools:
+ python: "3.13"
+
+python:
+ install:
+ - requirements: doc/requirements.txt
+
+sphinx:
+ configuration: doc/conf.py
diff --git a/doc b/doc
deleted file mode 160000
index 2731def915..0000000000
--- a/doc
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 2731def9159247e6da8a3191783c89683363689c
diff --git a/doc/.gitignore b/doc/.gitignore
new file mode 100644
index 0000000000..6f97ca1afc
--- /dev/null
+++ b/doc/.gitignore
@@ -0,0 +1,2 @@
+build
+*.pyc
diff --git a/doc/.typos.toml b/doc/.typos.toml
new file mode 100644
index 0000000000..f54d93bcad
--- /dev/null
+++ b/doc/.typos.toml
@@ -0,0 +1,66 @@
+[default]
+extend-ignore-re = [
+ # seh too close to she
+ "registered SEH to support IDL",
+ # ALLO is a valid FTP command
+ "\"ALLO\".*[0-9]{3}",
+ "des-ede3-cbc-Env-OID",
+ # On purpose
+ "\"THE NETBIOS NAM\"",
+ # NFS stuff.
+ "commited :zeek:type:`NFS3::stable_how_t`",
+ "\\/fo\\(o",
+ " nd\\. /dev/null; done
+ @
+ @echo Checking whether docs for Spicy integration are up-to-date
+ @./devel/spicy/autogen-spicy-docs spicy-tftp
+ @
+ @git diff --quiet devel/spicy/autogen/ \
+ || (echo "Spicy docs are not up-to-date, rerun './devel/spicy/autogen-spicy-docs'." && exit 1)
+
+.PHONY : all doc builddir clean html livehtml
diff --git a/doc/README b/doc/README
new file mode 100644
index 0000000000..8204c11a4e
--- /dev/null
+++ b/doc/README
@@ -0,0 +1,132 @@
+.. _zeek-docs: https://github.com/zeek/zeek-docs
+.. _Read the Docs: https://docs.readthedocs.io/en/stable/index.html
+.. _Zeek repo: https://github.com/zeek/zeek
+.. _Sphinx: https://www.sphinx-doc.org/en/master
+.. _pip: https://pypi.org/project/pip
+
+Zeek Documentation
+==================
+
+The documentation repo at zeek-docs_
+contains version-specific Zeek documentation source files that are ultimately
+used as the basis for content hosted at https://docs.zeek.org.
+
+Markup Format, Style, and Conventions
+-------------------------------------
+
+For general guidance on the basics of how the documentation is written,
+consult this Zeek wiki:
+
+https://github.com/zeek/zeek/wiki/Documentation-Style-and-Conventions
+
+Source-Tree Organization
+------------------------
+
+The zeek-docs_ repo containing this README file is the root of a Sphinx_ source
+tree and can be modified to add more documentation, style sheets, JavaScript,
+etc. The Sphinx config file is ``conf.py``. The typical way new documents get
+integrated is from them being referenced directly in ``index.rst`` or
+indirectly from something in the ``toctree`` (Table of Contents Tree) specified
+in that main index.
+
+There is also a custom Sphinx domain implemented in ``ext/zeek.py`` which adds
+some reStructuredText (reST) directives and roles that aid in generating useful
+index entries and cross-references. This primarily supports integration with
+the script-reference sections, some of which are auto-generated by Zeek's
+Doxygen-like feature, named "Zeekygen". The bulk of auto-generated content
+lives under the ``scripts/`` directory or has a file name starting with
+"autogenerated", so if you find yourself wanting to change those, you should
+actually look at doing those changes within the `Zeek repo`_ itself rather
+than here, so see the next section for how Zeekygen docs can be (re)generated.
+
+Generating Zeekygen Reference Docs
+----------------------------------
+
+All Zeekygen-generated docs get committed into Git, so if you don't have to
+perform any changes on it and just want to preview what's already existing,
+you can skip down to the next :ref:`Local Previewing <local-doc-preview>` section.
+
+The Zeekygen documentation-generation feature is a part of Zeek itself, so
+you'll want to obtain the `Zeek repo`_ from Git, read the :doc:`INSTALL
+</install>` file directions to install required dependencies, and build Zeek::
+
+ git clone --recursive https://github.com/zeek/zeek
+ cd zeek
+ # Read INSTALL file and get dependencies here
+ ./configure && make -j $(nproc)
+ # Make desired edits to scripts/, src/, etc.
+ ./ci/update-zeekygen-docs.sh
+
+The last command runs a script to generate documentation, which will end up in
+the ``doc/`` subdirectory. Note that ``doc/`` is just a Git submodule of
+this zeek-docs_ repository, so you can run ``git status`` there to find exactly
+what changed.
+
+Also note that the documentation-generation script is run automatically
+on a daily basis to incorporate any documentation changes that people make
+in Zeek itself without them having to necessarily be aware of the full
+documentation process. The GitHub Action that does that daily task is
+located in the Zeek repo's ``.github/workflows/generate-docs.yml`` file.
+
+.. _local-doc-preview:
+
+Local Previewing (How To Build)
+-------------------------------
+
+First make sure you have the required dependencies used for building docs:
+
+* Python interpreter >= 3.9
+* Sphinx: https://www.sphinx-doc.org/en/master/
+* Read the Docs Sphinx Theme: https://github.com/rtfd/sphinx_rtd_theme
+* GitPython: https://github.com/gitpython-developers/GitPython
+
+If you have pip_, you may just use the command ``pip3 install -r
+requirements.txt`` to install all the dependencies using the
+``requirements.txt`` from zeek-docs_.
+
+Now run ``make`` within the zeek-docs_ repository's top-level to locally render
+its reST files into HTML. After the build completes, HTML documentation is
+symlinked in ``build/html`` and you can open the ``index.html`` found there in
+your web browser.
+
+There's also a ``make livehtml`` (requires ``pip3 install sphinx-autobuild``)
+target in the top-level Makefile that is useful for editing the reST files and
+seeing changes rendered out live to a separate browser.
+
+Hosting
+-------
+
+Documentation is hosted by `Read the Docs`_ (RTD), so you can generally read
+about how it works there. The web-interface is accessible via
+https://readthedocs.org/projects/zeek-docs.
+
+How zeek-docs_ is configured to use RTD is a combination of some custom
+settings in its ``.readthedocs.yml`` file and others only accessible through
+RTD's web-interface (e.g. domain and subproject settings). Most config
+settings are likely understandable just by browsing the web-interface and
+RTD's guides, but a few particular points to mention:
+
+* There is an associated, always-failing project at
+ https://readthedocs.org/projects/zeek. It's always-failing because
+ RTD redirects only activate when pages 404 and this project exists so that
+ all attempts to use https://zeek.rtfd.io or https://zeek.readthedocs.io
+ get redirected to https://docs.zeek.org. Those would have been the project
+ URLs if ownership of the RTD 'zeek' project was had from the start, but
+ it was only obtained later, after documentation already started development
+ in the 'zeek-docs' RTD project slug.
+
+* Over time, page redirects have accrued into ``redirects.yml`` as a way to
+ help document what they are and why they happened and also as a potential
+ way to automate addition/reinstantiation of a large number of redirects,
+ but typically redirects can be manually added via the RTD web interface
+ first and then noted in ``redirects.yml``
+
+* There are RTD subprojects for things like Broker, Package Manager,
+ and Spicy. The use of subprojects simply allows access to their RTD
+ docs via the custom domain of https://docs.zeek.org
+
+* RTD will auto-build any newly-pushed commits to zeek-docs_ (i.e. a webhook is
+ configured), but if a tag is changed to point somewhere different, you'll
+ typically have to go into the RTD web interface, "Edit" the associated
+ version under "Versions", "wipe" the existing docs, and then manually trigger
+ a rebuild of that version tag under "Builds".
diff --git a/doc/README.rst b/doc/README.rst
new file mode 100644
index 0000000000..8204c11a4e
--- /dev/null
+++ b/doc/README.rst
@@ -0,0 +1,132 @@
+.. _zeek-docs: https://github.com/zeek/zeek-docs
+.. _Read the Docs: https://docs.readthedocs.io/en/stable/index.html
+.. _Zeek repo: https://github.com/zeek/zeek
+.. _Sphinx: https://www.sphinx-doc.org/en/master
+.. _pip: https://pypi.org/project/pip
+
+Zeek Documentation
+==================
+
+The documentation repo at zeek-docs_
+contains version-specific Zeek documentation source files that are ultimately
+used as the basis for content hosted at https://docs.zeek.org.
+
+Markup Format, Style, and Conventions
+-------------------------------------
+
+For general guidance on the basics of how the documentation is written,
+consult this Zeek wiki:
+
+https://github.com/zeek/zeek/wiki/Documentation-Style-and-Conventions
+
+Source-Tree Organization
+------------------------
+
+The zeek-docs_ repo containing this README file is the root of a Sphinx_ source
+tree and can be modified to add more documentation, style sheets, JavaScript,
+etc. The Sphinx config file is ``conf.py``. The typical way new documents get
+integrated is from them being referenced directly in ``index.rst`` or
+indirectly from something in the ``toctree`` (Table of Contents Tree) specified
+in that main index.
+
+There is also a custom Sphinx domain implemented in ``ext/zeek.py`` which adds
+some reStructuredText (reST) directives and roles that aid in generating useful
+index entries and cross-references. This primarily supports integration with
+the script-reference sections, some of which are auto-generated by Zeek's
+Doxygen-like feature, named "Zeekygen". The bulk of auto-generated content
+lives under the ``scripts/`` directory or has a file name starting with
+"autogenerated", so if you find yourself wanting to change those, you should
+actually look at doing those changes within the `Zeek repo`_ itself rather
+than here, so see the next section for how Zeekygen docs can be (re)generated.
+
+Generating Zeekygen Reference Docs
+----------------------------------
+
+All Zeekygen-generated docs get committed into Git, so if you don't have to
+perform any changes on it and just want to preview what's already existing,
+you can skip down to the next :ref:`Local Previewing <local-doc-preview>` section.
+
+The Zeekygen documentation-generation feature is a part of Zeek itself, so
+you'll want to obtain the `Zeek repo`_ from Git, read the :doc:`INSTALL
+</install>` file directions to install required dependencies, and build Zeek::
+
+ git clone --recursive https://github.com/zeek/zeek
+ cd zeek
+ # Read INSTALL file and get dependencies here
+ ./configure && make -j $(nproc)
+ # Make desired edits to scripts/, src/, etc.
+ ./ci/update-zeekygen-docs.sh
+
+The last command runs a script to generate documentation, which will end up in
+the ``doc/`` subdirectory. Note that ``doc/`` is just a Git submodule of
+this zeek-docs_ repository, so you can run ``git status`` there to find exactly
+what changed.
+
+Also note that the documentation-generation script is run automatically
+on a daily basis to incorporate any documentation changes that people make
+in Zeek itself without them having to necessarily be aware of the full
+documentation process. The GitHub Action that does that daily task is
+located in the Zeek repo's ``.github/workflows/generate-docs.yml`` file.
+
+.. _local-doc-preview:
+
+Local Previewing (How To Build)
+-------------------------------
+
+First make sure you have the required dependencies used for building docs:
+
+* Python interpreter >= 3.9
+* Sphinx: https://www.sphinx-doc.org/en/master/
+* Read the Docs Sphinx Theme: https://github.com/rtfd/sphinx_rtd_theme
+* GitPython: https://github.com/gitpython-developers/GitPython
+
+If you have pip_, you may just use the command ``pip3 install -r
+requirements.txt`` to install all the dependencies using the
+``requirements.txt`` from zeek-docs_.
+
+Now run ``make`` within the zeek-docs_ repository's top-level to locally render
+its reST files into HTML. After the build completes, HTML documentation is
+symlinked in ``build/html`` and you can open the ``index.html`` found there in
+your web browser.
+
+There's also a ``make livehtml`` (requires ``pip3 install sphinx-autobuild``)
+target in the top-level Makefile that is useful for editing the reST files and
+seeing changes rendered out live to a separate browser.
+
+Hosting
+-------
+
+Documentation is hosted by `Read the Docs`_ (RTD), so you can generally read
+about how it works there. The web-interface is accessible via
+https://readthedocs.org/projects/zeek-docs.
+
+How zeek-docs_ is configured to use RTD is a combination of some custom
+settings in its ``.readthedocs.yml`` file and others only accessible through
+RTD's web-interface (e.g. domain and subproject settings). Most config
+settings are likely understandable just by browsing the web-interface and
+RTD's guides, but a few particular points to mention:
+
+* There is an associated, always-failing project at
+ https://readthedocs.org/projects/zeek. It's always-failing because
+ RTD redirects only activate when pages 404 and this project exists so that
+ all attempts to use https://zeek.rtfd.io or https://zeek.readthedocs.io
+ get redirected to https://docs.zeek.org. Those would have been the project
+ URLs if ownership of the RTD 'zeek' project was had from the start, but
+ it was only obtained later, after documentation already started development
+ in the 'zeek-docs' RTD project slug.
+
+* Over time, page redirects have accrued into ``redirects.yml`` as a way to
+ help document what they are and why they happened and also as a potential
+ way to automate addition/reinstantiation of a large number of redirects,
+ but typically redirects can be manually added via the RTD web interface
+ first and then noted in ``redirects.yml``
+
+* There are RTD subprojects for things like Broker, Package Manager,
+ and Spicy. The use of subprojects simply allows access to their RTD
+ docs via the custom domain of https://docs.zeek.org
+
+* RTD will auto-build any newly-pushed commits to zeek-docs_ (i.e. a webhook is
+ configured), but if a tag is changed to point somewhere different, you'll
+ typically have to go into the RTD web interface, "Edit" the associated
+ version under "Versions", "wipe" the existing docs, and then manually trigger
+ a rebuild of that version tag under "Builds".
diff --git a/doc/_static/theme_overrides.css b/doc/_static/theme_overrides.css
new file mode 100644
index 0000000000..cf1fa824f8
--- /dev/null
+++ b/doc/_static/theme_overrides.css
@@ -0,0 +1,32 @@
+/* override table width restrictions */
+@media screen and (min-width: 767px) {
+
+ .wy-table-responsive table td {
+ /* !important prevents the common CSS stylesheets from overriding
+ this as on RTD they are loaded after this stylesheet */
+ white-space: normal !important;
+ }
+
+ .wy-table-responsive {
+ overflow: visible !important;
+ }
+}
+
+h1, h2, h3, h4, h5, h6 {
+ color: #294488;
+ font-family: 'Open Sans',Helvetica,Arial,Lucida,sans-serif!important;
+}
+
+a {
+ color: #2ea3f2;
+}
+
+body {
+ font-family: "Open Sans",Arial,sans-serif;
+ color: #666;
+}
+
+div.highlight pre strong {
+ font-weight: 800;
+ background-color: #ffffcc;
+}
diff --git a/doc/_templates/breadcrumbs.html b/doc/_templates/breadcrumbs.html
new file mode 100644
index 0000000000..8a4aa54ae4
--- /dev/null
+++ b/doc/_templates/breadcrumbs.html
@@ -0,0 +1,15 @@
+{% extends "!breadcrumbs.html" %}
+
+{% block breadcrumbs_aside %}
+
+{% if pagename != "search" %}
+ {% if display_github %}
+ {% if github_version == "master" %}
+ {{ _('Edit on GitHub') }}
+ {% endif %}
+ {% elif show_source and has_source and sourcename %}
+ {{ _('View page source') }}
+ {% endif %}
+{% endif %}
+
+{% endblock %}
diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html
new file mode 100644
index 0000000000..3a5449e99a
--- /dev/null
+++ b/doc/_templates/layout.html
@@ -0,0 +1,14 @@
+{% extends "!layout.html" %}
+
+{% if READTHEDOCS and current_version %}
+ {% if current_version == "latest" or current_version == "stable"
+ or current_version == "master" or current_version == "current"
+ or current_version == "lts" or current_version == "LTS" %}
+ {% set current_version = current_version ~ " (" ~ version ~ ")" %}
+ {% endif %}
+{% endif %}
+
+{% block menu %}
+ {{ super() }}
+ Index
+{% endblock %}
diff --git a/doc/about.rst b/doc/about.rst
new file mode 100644
index 0000000000..85b3ddd5a8
--- /dev/null
+++ b/doc/about.rst
@@ -0,0 +1,256 @@
+==========
+About Zeek
+==========
+
+What Is Zeek?
+=============
+
+Zeek is a passive, open-source network traffic analyzer. Many operators use
+Zeek as a network security monitor (NSM) to support investigations of
+suspicious or malicious activity. Zeek also supports a wide range of traffic
+analysis tasks beyond the security domain, including performance measurement
+and troubleshooting.
+
+The first benefit a new user derives from Zeek is the extensive set of logs
+describing network activity. These logs include not only a comprehensive record
+of every connection seen on the wire, but also application-layer transcripts.
+These include all HTTP sessions with their requested URIs, key headers, MIME
+types, and server responses; DNS requests with replies; SSL certificates; key
+content of SMTP sessions; and much more. By default, Zeek writes all this
+information into well-structured tab-separated or JSON log files suitable for
+post-processing with external software. Users can also choose to have external
+databases or SIEM products consume, store, process, and present the data for
+querying.
+
+In addition to the logs, Zeek comes with built-in functionality for a range of
+analysis and detection tasks, including extracting files from HTTP sessions,
+detecting malware by interfacing to external registries, reporting vulnerable
+versions of software seen on the network, identifying popular web applications,
+detecting SSH brute-forcing, validating SSL certificate chains, and much more.
+
+In addition to shipping such powerful functionality “out of the box,” Zeek is a
+fully customizable and extensible platform for traffic analysis. Zeek provides
+users a domain-specific, Turing-complete scripting language for expressing
+arbitrary analysis tasks. Think of the Zeek language as a “domain-specific
+Python” (or Perl): just like Python, the system comes with a large set of
+pre-built functionality (the “standard library”), yet users can also put Zeek
+to use in novel ways by writing custom code. Indeed, all of Zeek’s default
+analyses, including logging, are done via scripts; no specific analysis is
+hard-coded into the core of the system.
+
+Zeek runs on commodity hardware and hence provides a low-cost alternative to
+expensive proprietary solutions. In many ways Zeek exceeds the capabilities of
+other network monitoring tools, which typically remain limited to a small set
+of hard-coded analysis tasks. Zeek is not a classic signature-based intrusion
+detection system (IDS); while it supports such standard functionality as well,
+Zeek’s scripting language facilitates a much broader spectrum of very different
+approaches to finding malicious activity. These include semantic misuse
+detection, anomaly detection, and behavioral analysis.
+
+A large variety of sites deploy Zeek to protect their infrastructure, including
+many universities, research labs, supercomputing centers, open-science
+communities, major corporations, and government agencies. Zeek specifically
+targets high-speed, high-volume network monitoring, and an increasing number of
+sites are now using the system to monitor their 10GE networks, with some
+already moving on to 100GE links.
+
+Zeek accommodates high-performance settings by supporting scalable
+load-balancing. Large sites typically run “Zeek Clusters” in which a high-speed
+front end load balancer distributes the traffic across an appropriate number of
+back end PCs, all running dedicated Zeek instances on their individual traffic
+slices. A central manager system coordinates the process, synchronizing state
+across the back ends and providing the operators with a central management
+interface for configuration and access to aggregated logs. Zeek’s integrated
+management framework, ZeekControl, supports such cluster setups out-of-the-box.
+
+Zeek’s cluster features support single-system and multi-system setups. That's
+part of Zeek’s scalability advantages. For example, administrators can scale
+Zeek within one system for as long as possible, and then transparently add more
+systems when necessary.
+
+In brief, Zeek is optimized for interpreting network traffic and generating
+logs based on that traffic. It is not optimized for byte matching, and users
+seeking signature detection approaches would be better served by trying
+intrusion detection systems such as Suricata. Zeek is also not a protocol
+analyzer in the sense of Wireshark, seeking to depict every element of network
+traffic at the frame level, or a system for storing traffic in packet capture
+(PCAP) form. Rather, Zeek sits at the “happy medium” representing compact yet
+high fidelity network logs, generating better understanding of network traffic
+and usage.
+
+Why Zeek?
+=========
+
+Zeek offers many advantages for security and network teams who want to better
+understand how their infrastructure is being used.
+
+Security teams generally depend upon four sorts of data sources when trying to
+detect and respond to suspicious and malicious activity. These include *third
+party* sources such as law enforcement, peers, and commercial or nonprofit
+threat intelligence organizations; *network data*; *infrastructure and
+application data*, including logs from cloud environments; and *endpoint data*.
+Zeek is primarily a platform for collecting and analyzing the second form of
+data -- network data. All four are important elements of any security team’s
+program, however.
+
+When looking at data derived from the network, there are four types of data
+available to analysts. As defined by the `network security monitoring paradigm
+`_, these
+four data types are *full content*, *transaction data*, *extracted content*,
+and *alert data*. Using these data types, one can record traffic, summarize
+traffic, extract traffic (or perhaps more accurately, extract content
+in the form of files), and judge traffic, respectively.
+
+It’s critical to collect and analyze the four types of network security
+monitoring data. The question becomes one of determining the best way to
+accomplish this goal. Thankfully, Zeek as a NSM platform enables collection of
+at least two, and in some ways three, of these data forms, namely transaction
+data, extracted content, and alert data.
+
+Zeek is best known for its transaction data. By default, when run and told to
+watch a network interface, Zeek will generate a collection of compact,
+high-fidelity, richly-annotated set of transaction logs. These logs describe
+the protocols and activity seen on the wire, in a judgement-free,
+policy-neutral manner. This documentation will spend a considerable amount of
+time describing the most common Zeek log files such that readers will become
+comfortable with the format and learn to apply them to their environment.
+
+Zeek can also easily carve files from network traffic, thanks to its file
+extraction capabilities. Analysts can then send those files to execution
+sandboxes or other file examination tools for additional investigation. Zeek
+has some capability to perform classical byte-centric intrusion detection, but
+that job is best suited for packages like the open source Snort or Suricata
+engines. Zeek has other capabilities however that are capable of providing
+judgements in the form of alerts, through its notice mechanism.
+
+Zeek is not optimized for writing traffic to disk in the spirit of a full
+content data collection, and that task is best handled by software written to
+fulfill that requirement.
+
+Beyond the forms of network data that Zeek can natively collect and generate,
+Zeek has advantages that appeared in the `What Is Zeek?`_ section. These
+include its built-in functionality for a range of analysis and detection
+tasks, and its status as a fully customizable and extensible platform for
+traffic analysis. Zeek is also attractive because of its ability to run on
+commodity hardware, giving users of all types the ability to at least try Zeek
+in a low-cost manner.
+
+History
+=======
+
+Zeek has a rich history stretching back to the 1990s. `Vern Paxson
+<http://www.icir.org/vern/>`_ designed and implemented the initial version in
+1995 as a researcher at the `Lawrence Berkeley National Laboratory (LBNL)
+<https://www.lbl.gov>`_. The original software was called “Bro,” as an
+“Orwellian reminder that monitoring comes hand in hand with the potential
+for privacy violations”.
+
+LBNL first deployed Zeek in 1996, and the USENIX Security Symposium published
+Vern’s original paper on Zeek in 1998, and awarded it the Best Paper Award that
+year. He published a refined version of the paper in 1999 as `Bro: A System for
+Detecting Network Intruders in Real-Time
+`_.
+
+In 2003, the `National Science Foundation (NSF) <https://www.nsf.gov>`_ began
+supporting research and advanced development on Bro at the `International
+Computer Science Institute (ICSI) <https://www.icsi.berkeley.edu>`_. (Vern
+still leads the ICSI `Networking and Security group `_.)
+
+Over the years, a growing team of ICSI researchers and students kept adding
+novel functions to Zeek, while LBNL continued its support with funding from the
+`Department of Energy (DOE) <https://www.energy.gov>`_. Much of Zeek’s
+capabilities originate in academic research projects, with results often
+published at top-tier conferences. A key to Zeek’s success was the project’s
+ability to bridge the gap between academia and operations. This relationship
+helped ground research on Zeek in real-world challenges.
+
+With a growing operational user community, the research-centric development
+model eventually became a bottleneck to the system’s evolution. Research
+grants did not support the more mundane parts of software development and
+maintenance. However, those elements were crucial for the end-user experience.
+As a result, deploying Zeek required overcoming a steep learning curve.
+
+In 2010, NSF sought to address this challenge by awarding ICSI a grant from its
+Software Development for Cyberinfrastructure fund. The `National Center for
+Supercomputing Applications (NCSA) <http://www.ncsa.illinois.edu>`_ joined the
+team as a core partner, and the Zeek project began to overhaul many of the
+user-visible parts of the system for the 2.0 release in 2012.
+
+After Zeek 2.0, the project enjoyed tremendous growth in new deployments across
+a diverse range of settings, and the ongoing collaboration between ICSI (co-PI
+Robin Sommer) and NCSA (co-PI Adam Slagell) brought a number of important
+features. In 2012, Zeek added native IPv6 support, long before many enterprise
+networking monitoring tools. In 2013, NSF renewed its support with a second
+grant that established the Bro Center of Expertise at ICSI and NCSA, promoting
+Zeek as a comprehensive, low-cost security capability for research and
+education communities. To facilitate both debugging and education,
+`try.zeek.org <https://try.zeek.org>`_ (formerly try.bro.org) was launched in
+2014. This provided an interactive way for users to test a script with their
+own packet captures against a variety of Zeek versions and easily share
+sample code with others. For Zeek clusters and external communication,
+the Broker communication framework was added. Last, but not least, the
+Zeek package manager was created in 2016, funded by an additional grant
+from the Mozilla Foundation.
+
+In the fall of 2018, the project leadership team decided to change the name of
+the software from Bro to Zeek. The leadership team desired a name that better
+reflected the values of the community while avoiding the negative connotations
+of so-called “bro culture” outside the computing world. The project released
+version 3.0 in the fall of 2019, the first release bearing the name Zeek. The
+year 2020 saw a renewed focus on community and growing the Zeek community, with
+increased interaction via social media, webinars, Slack channels, and related
+outreach efforts.
+
+For a history of the project from 1995 to 2015, see Vern Paxson’s talk from
+BroCon 2015, `Reflecting on Twenty Years of Bro
+`_.
+
+For background on the decision to rename Bro to Zeek, see Vern Paxson’s talk
+from BroCon 2018, `Renaming Bro
+`_.
+
+Architecture
+============
+
+.. image:: /images/architecture.png
+ :align: center
+ :scale: 75%
+
+At a very high level, Zeek is architecturally layered into two major
+components. Its *event engine* (or *core*) reduces the incoming packet stream
+into a series of higher-level *events*. These events reflect network activity
+in policy-neutral terms, i.e., they describe *what* has been seen, but not
+*why*, or whether it is significant.
+
+For example, every HTTP request on the wire turns into a corresponding
+:zeek:see:`http_request` event that carries with it the involved IP addresses
+and ports, the URI being requested, and the HTTP version in use. The event
+however does not convey any further *interpretation*, such as whether that URI
+corresponds to a known malware site.
+
+The event engine component comprises a number of subcomponents, including in
+particular the packet processing pipeline consisting of: input sources,
+packet analysis, session analysis, and file analysis. Input sources ingest
+incoming network traffic from network interfaces. Packet analysis processes
+lower-level protocols, starting all the way down at the link layer. Session
+analysis handles application-layer protocols, such as HTTP, FTP, etc. File
+analysis dissects the content of files transferred over sessions. The event
+engine provides a plugin architecture for adding any of these from outside
+of the core Zeek code base, allowing to expand Zeek’s capabilities as
+needed.
+
+Semantics related to the events are derived by Zeek’s second main component,
+the *script interpreter*, which executes a set of *event handlers* written in
+Zeek’s custom scripting language. These scripts can express a site’s
+security policy, such as what actions to take when the monitor detects
+different types of activity.
+
+More generally scripts can derive any desired properties and statistics from
+the input traffic. In fact, all of Zeek’s default output comes from scripts
+included in the distribution. Zeek’s language comes with extensive
+domain-specific types and support functionality. Crucially, Zeek’s language
+allows scripts to maintain state over time, enabling them to track and
+correlate the evolution of what they observe across connection and host
+boundaries. Zeek scripts can generate real-time alerts and also execute
+arbitrary external programs on demand. One might use this functionality to
+trigger an active response to an attack.
diff --git a/doc/acknowledgements.rst b/doc/acknowledgements.rst
new file mode 100644
index 0000000000..7a0747e638
--- /dev/null
+++ b/doc/acknowledgements.rst
@@ -0,0 +1,22 @@
+================
+Acknowledgements
+================
+
+Thanks to everyone who contributed to making Zeek's documentation
+(alphabetically):
+
+* Johanna Amann
+* Richard Bejtlich
+* Michael Dopheide
+* Amber Graner
+* Jan Grashöfer
+* Christian Kreibich
+* Terry Leach
+* Aashish Sharma
+* Jon Siwek
+* Stephen Smoot
+* Robin Sommer
+* Aaron Soto
+* Nick Turley
+* Fatema Bannat Wala
+* Tim Wojtulewicz
diff --git a/doc/building-from-source.rst b/doc/building-from-source.rst
new file mode 100644
index 0000000000..21f940406f
--- /dev/null
+++ b/doc/building-from-source.rst
@@ -0,0 +1,392 @@
+
+.. _CMake: https://www.cmake.org
+.. _SWIG: https://www.swig.org
+.. _Xcode: https://developer.apple.com/xcode/
+.. _MacPorts: https://www.macports.org
+.. _Fink: https://www.finkproject.org
+.. _Homebrew: https://brew.sh
+.. _downloads page: https://zeek.org/get-zeek
+.. _devtoolset: https://developers.redhat.com/products/developertoolset/hello-world
+.. _zkg package manager: https://docs.zeek.org/projects/package-manager/en/stable/
+.. _crosstool-NG: https://crosstool-ng.github.io/
+.. _CMake toolchain: https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html
+.. _contribute: https://github.com/zeek/zeek/wiki/Contribution-Guide
+.. _Chocolatey: https://chocolatey.org
+.. _Npcap: https://npcap.com/
+
+.. _building-from-source:
+
+====================
+Building from Source
+====================
+
+Building Zeek from source provides the most control over your build and is the
+preferred approach for advanced users. We support a wide range of operating
+systems and distributions. Our `support policy
+`_ is informed by
+what we can run in our CI pipelines with reasonable effort, with the current
+status captured in our `support matrix
+`_.
+
+Required Dependencies
+---------------------
+
+Building Zeek from source requires the following dependencies, including
+development headers for libraries:
+
+ * Bash (for ZeekControl and BTest)
+ * BIND8 library or greater (if not covered by system's libresolv)
+ * Bison 3.3 or greater (https://www.gnu.org/software/bison/)
+ * C/C++ compiler with C++17 support (GCC 8+ or Clang 9+)
+ * CMake 3.15 or greater (https://www.cmake.org)
+ * Flex (lexical analyzer generator) 2.6 or greater (https://github.com/westes/flex)
+ * Libpcap (https://www.tcpdump.org)
+ * Make
+ * OpenSSL (https://www.openssl.org)
+ * Python 3.9 or greater (https://www.python.org/)
+ * SWIG (https://www.swig.org)
+ * ZeroMQ (https://zeromq.org)
+ * Zlib (https://zlib.net/)
+
+To install these, you can use:
+
+* RPM/RedHat-based Linux:
+
+ .. code-block:: console
+
+ sudo dnf install bison cmake cppzmq-devel gcc gcc-c++ flex libpcap-devel make openssl-devel python3 python3-devel swig zlib-devel
+
+ On pre-``dnf`` systems, use ``yum`` instead. Additionally, on RHEL/CentOS 7,
+ you can install and activate a devtoolset_ to get access to recent GCC
+ versions. You will also have to install and activate CMake 3. For example:
+
+ .. code-block:: console
+
+ sudo yum install cmake3 devtoolset-7
+ scl enable devtoolset-7 bash
+
+* DEB/Debian-based Linux:
+
+ .. code-block:: console
+
+ sudo apt-get install bison cmake cppzmq-dev gcc g++ flex libfl-dev libpcap-dev libssl-dev make python3 python3-dev swig zlib1g-dev
+
+ If your platform doesn't offer ``cppzmq-dev``, try ``libzmq3-dev``
+ instead. Zeek's build will fall back to an in-tree version of C++
+ bindings to ZeroMQ in that case.
+
+* FreeBSD:
+
+ Most required dependencies should come with a minimal FreeBSD install
+ except for the following.
+
+ .. code-block:: console
+
+ sudo pkg install -y base64 bash bison cmake cppzmq git python3 swig
+ pyver=`python3 -c 'import sys; print(f"py{sys.version_info[0]}{sys.version_info[1]}")'`
+ sudo pkg install -y $pyver-sqlite3
+
+* macOS:
+
+ Compiling source code on Macs requires first installing either Xcode_
+ or the "Command Line Tools" (which is a much smaller download). To check
+ if either is installed, run the ``xcode-select -p`` command. If you see
+ an error message, then neither is installed and you can then run
+ ``xcode-select --install`` which will prompt you to either get Xcode (by
+ clicking "Get Xcode") or to install the command line tools (by
+ clicking "Install").
+
+ macOS comes with all required dependencies except for CMake_, SWIG_,
+ Bison, Flex, and OpenSSL (OpenSSL headers were removed in macOS 10.11,
+ therefore OpenSSL must be installed manually for macOS versions 10.11
+ or newer).
+
+ Distributions of these dependencies can likely be obtained from your
+ preferred macOS package management system (e.g. Homebrew_,
+ MacPorts_, or Fink_). Specifically for Homebrew, the ``bison``, ``cmake``,
+ ``cppzmq``, ``flex``, ``swig``, and ``openssl`` packages
+ provide the required dependencies. For MacPorts, use the ``bison``, ``cmake``,
+ ``cppzmq``, ``flex``, ``swig``, ``swig-python``, and ``openssl`` packages.
+
+* Windows
+
+ Windows support is experimental. These instructions are meant as a starting
+ point for development on that platform, and might have issues or be missing
+ steps. Notify the Zeek team if any such problems arise.
+
+ Compiling on Windows requires the installation of a development environment.
+ Zeek currently builds on Visual Studio 2019, and you can either install the
+ full version including the UI tools or you can install the command-line tools
+ and build from a shell. The instructions below describe how to install the
+ command-line tools, but are not necessary if you install the full VS2019
+ package. You will need to install Chocolatey_ in order to install the
+ dependencies as instructed below. It's possible to install them from other
+ sources (msys2, cygwin, etc), which we leave to the reader.
+
+ Cloning the repository will also require Developer Mode to be enabled in
+ Windows. This is due to the existence of a number of symbolic links in the
+ repository. Without Developer Mode, ``git`` on Windows will ignore these
+ links and builds will fail. There are a couple of different ways to enable
+ it, and the settings may differ depending on the version of Windows.
+
+ .. code-block:: console
+
+ choco install -y --no-progress visualstudio2019buildtools --version=16.11.11.0
+ choco install -y --no-progress visualstudio2019-workload-vctools --version=1.0.0 --package-parameters '--add Microsoft.VisualStudio.Component.VC.ATLMFC'
+ choco install -y --no-progress sed
+ choco install -y --no-progress winflexbison3
+ choco install -y --no-progress msysgit
+ choco install -y --no-progress python
+ choco install -y --no-progress openssl --version=3.1.1
+
+ Once the dependencies are installed, you will need to add the Git installation
+ to your PATH (``C:\Program Files\Git\bin`` by default). This is needed for the
+ ``sh`` command to be available during the build. Once all of the dependencies
+ are in place, you will need to open a shell (PowerShell or cmd) and add the
+ development environment to it. The following command is for running on an
+ x86_64 host.
+
+ .. code-block:: console
+
+ C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat x86_amd64
+
+ Now you can build via cmake:
+
+ .. code-block:: console
+
+ mkdir build
+ cd build
+ cmake.exe .. -DCMAKE_BUILD_TYPE=release -DENABLE_ZEEK_UNIT_TESTS=yes -DENABLE_CLUSTER_BACKEND_ZEROMQ=no -DVCPKG_TARGET_TRIPLET="x64-windows-static" -G Ninja
+ cmake.exe --build .
+
+ All of this is duplicated in the CI configuration for Windows which lives in
+ the ``ci/windows`` directory, and can be used as a reference for running the
+ commands by hand.
+
+ Note: By default, Windows links against the standard libpcap library from
+ vcpkg. This version of libpcap does not support packet capture on Windows,
+ unlike other platforms. In order to capture packets from live interfaces on
+ Windows, you will need to link against the Npcap_ library. This library is free
+ for personal use, but requires a paid license for commercial use or
+ redistribution. To link against Npcap, download the SDK from their website,
+unzip it, and then pass ``-DPCAP_ROOT_DIR="<path to Npcap SDK>"`` to the
+ initial CMake invocation for Zeek.
+
+ Note also that the ZeroMQ cluster backend is not yet supported on Windows.
+
+Optional Dependencies
+---------------------
+
+Zeek can make use of some optional libraries and tools if they are found at
+build time:
+
+ * libmaxminddb (for geolocating IP addresses)
+ * sendmail (enables Zeek and ZeekControl to send mail)
+ * curl (used by a Zeek script that implements active HTTP)
+ * gperftools (tcmalloc is used to improve memory and CPU usage)
+ * jemalloc (https://github.com/jemalloc/jemalloc)
+ * PF_RING (Linux only, see :ref:`pf-ring-config`)
+ * krb5 libraries and headers
+ * ipsumdump (for trace-summary; https://github.com/kohler/ipsumdump)
+ * hiredis (for the Redis storage backend)
+
+Geolocation is probably the most interesting and can be installed on most
+platforms by following the instructions for :ref:`address geolocation and AS
+lookups <geolocation>`.
+
+The `zkg package manager`_, included in the Zeek installation, requires
+two external Python modules:
+
+ * GitPython: https://pypi.org/project/GitPython/
+ * semantic-version: https://pypi.org/project/semantic-version/
+
+These install easily via pip (``pip3 install GitPython
+semantic-version``) and also ship with some distributions:
+
+* RPM/RedHat-based Linux:
+
+ .. code-block:: console
+
+ sudo yum install python3-GitPython python3-semantic_version
+
+* DEB/Debian-based Linux:
+
+ .. code-block:: console
+
+ sudo apt-get install python3-git python3-semantic-version
+
+``zkg`` also requires a ``git`` installation, which the above system packages
+pull in as a dependency. If you install via pip, remember that you also need
+``git`` itself.
+
+Retrieving the Sources
+----------------------
+
+Zeek releases are bundled into source packages for convenience and are
+available on the `downloads page`_. The source code can be manually downloaded
+from the link in the ``.tar.gz`` format to the target system for installation.
+
+If you plan to `contribute`_ to Zeek or just want to try out the latest
+features under development, you should obtain Zeek's source code through its
+Git repositories hosted at https://github.com/zeek:
+
+.. code-block:: console
+
+ git clone --recurse-submodules https://github.com/zeek/zeek
+
+.. note:: If you choose to clone the ``zeek`` repository
+ non-recursively for a "minimal Zeek experience", be aware that
+ compiling it depends on several of the other submodules as well, so
+ you'll likely have to build/install those independently first.
+
+Configuring and Building
+------------------------
+
+The typical way to build and install from source is as follows:
+
+.. code-block:: console
+
+ ./configure
+ make
+ make install
+
+If the ``configure`` script fails, then it is most likely because it either
+couldn't find a required dependency or it couldn't find a sufficiently new
+version of a dependency. Assuming that you already installed all required
+dependencies, then you may need to use one of the ``--with-*`` options
+that can be given to the ``configure`` script to help it locate a dependency.
+To find out what all different options ``./configure`` supports, run
+``./configure --help``.
+
+The default installation path is ``/usr/local/zeek``, which would typically
+require root privileges when doing the ``make install``. A different
+installation path can be chosen by specifying the ``configure`` script
+``--prefix`` option. Note that ``/usr``, ``/opt/bro/``, and ``/opt/zeek`` are
+the standard prefixes for binary Zeek packages to be installed, so those are
+typically not good choices unless you are creating such a package.
+
+OpenBSD users, please see our `FAQ <https://zeek.org/faq>`_ if you are having
+problems installing Zeek.
+
+Depending on the Zeek package you downloaded, there may be auxiliary
+tools and libraries available in the ``auxil/`` directory. Some of them
+will be automatically built and installed along with Zeek. There are
+``--disable-*`` options that can be given to the configure script to
+turn off unwanted auxiliary projects that would otherwise be installed
+automatically. Finally, use ``make install-aux`` to install some of
+the other programs that are in the ``auxil/zeek-aux`` directory.
+
+Finally, if you want to build the Zeek documentation (not required, because
+all of the documentation for the latest Zeek release is available at
+https://docs.zeek.org), there are instructions in ``doc/README`` in the source
+distribution.
+
+Cross Compiling
+---------------
+
+Prerequisites
+~~~~~~~~~~~~~
+
+You need three things on the host system:
+
+1. The Zeek source tree.
+2. A cross-compilation toolchain, such as one built via crosstool-NG_.
+3. Pre-built Zeek dependencies from the target system. This usually
+ includes libpcap, zlib, OpenSSL, and Python development headers
+ and libraries.
+
+Configuration and Compiling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You first need to compile a few build tools native to the host system
+for use during the later cross-compile build. In the root of your
+Zeek source tree:
+
+.. code-block:: console
+
+ ./configure --builddir=../zeek-buildtools
+ ( cd ../zeek-buildtools && make binpac bifcl )
+
+Next configure Zeek to use your cross-compilation toolchain (this example
+uses a Raspberry Pi as the target system):
+
+.. code-block:: console
+
+ ./configure --toolchain=/home/jon/x-tools/RaspberryPi-toolchain.cmake --with-binpac=$(pwd)/../zeek-buildtools/auxil/binpac/src/binpac --with-bifcl=$(pwd)/../zeek-buildtools/src/bifcl
+
+Here, the :file:`RaspberryPi-toolchain.cmake` file specifies a `CMake
+toolchain`_. In the toolchain file, you need to point the toolchain and
+compiler at the cross-compilation toolchain. It might look something the
+following:
+
+.. code-block:: cmake
+
+ # Operating System on which CMake is targeting.
+ set(CMAKE_SYSTEM_NAME Linux)
+
+ # The CMAKE_STAGING_PREFIX option may not work.
+ # Given that Zeek is configured:
+ #
+   # ``./configure --prefix=<prefix>``
+ #
+ # The options are:
+ #
+ # (1) ``make install`` and then copy over the --prefix dir from host to
+ # target system.
+ #
+   # (2) ``DESTDIR=<staging dir> make install`` and then copy over the
+ # contents of that staging directory.
+
+ set(toolchain /home/jon/x-tools/arm-rpi-linux-gnueabihf)
+ set(CMAKE_C_COMPILER ${toolchain}/bin/arm-rpi-linux-gnueabihf-gcc)
+ set(CMAKE_CXX_COMPILER ${toolchain}/bin/arm-rpi-linux-gnueabihf-g++)
+
+ # The cross-compiler/linker will use these paths to locate dependencies.
+ set(CMAKE_FIND_ROOT_PATH
+ /home/jon/x-tools/zeek-rpi-deps
+ ${toolchain}/arm-rpi-linux-gnueabihf/sysroot
+ )
+
+ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+If that configuration succeeds, you are ready to build:
+
+.. code-block:: console
+
+ make
+
+And if that works, install on your host system:
+
+.. code-block:: console
+
+ make install
+
+Once installed, you can copy/move the files from the installation prefix on the
+host system to the target system and start running Zeek as usual.
+
+Configuring the Run-Time Environment
+====================================
+
+You may want to adjust your :envvar:`PATH` environment variable
+according to the platform/shell/package you're using since
+neither :file:`/usr/local/zeek/bin/` nor :file:`/opt/zeek/bin/`
+will reside in the default :envvar:`PATH`. For example:
+
+Bourne-Shell Syntax:
+
+.. code-block:: console
+
+ export PATH=/usr/local/zeek/bin:$PATH
+
+C-Shell Syntax:
+
+.. code-block:: console
+
+ setenv PATH /usr/local/zeek/bin:$PATH
+
+Or substitute ``/opt/zeek/bin`` instead if you installed from a binary package.
+
+Zeek supports several environment variables to adjust its behavior. Take a look
+at the ``zeek --help`` output for details.
diff --git a/doc/cluster-setup.rst b/doc/cluster-setup.rst
new file mode 100644
index 0000000000..0bb5cc0790
--- /dev/null
+++ b/doc/cluster-setup.rst
@@ -0,0 +1,507 @@
+
+.. _ZeekControl documentation: https://github.com/zeek/zeekctl
+
+==================
+Zeek Cluster Setup
+==================
+
+.. TODO: integrate BoZ revisions
+
+A *Zeek Cluster* is a set of systems jointly analyzing the traffic of
+a network link in a coordinated fashion. You can operate such a setup from
+a central manager system easily using ZeekControl because it
+hides much of the complexity of the multi-machine installation.
+
+Cluster Architecture
+====================
+
+Zeek is not multithreaded, so once the limitations of a single processor core
+are reached the only option currently is to spread the workload across many
+cores, or even many physical computers. The cluster deployment scenario for
+Zeek is the current solution to build these larger systems. The tools and
+scripts that accompany Zeek provide the structure to easily manage many Zeek
+processes examining packets and doing correlation activities but acting as
+a singular, cohesive entity. This section describes the Zeek cluster
+architecture. For information on how to configure a Zeek cluster,
+see the `ZeekControl documentation`_.
+
+Architecture
+------------
+
+The figure below illustrates the main components of a Zeek cluster.
+
+.. image:: /images/deployment.png
+
+For more specific information on the way Zeek processes are connected,
+how they function, and how they communicate with each other, see the
+:ref:`Broker Framework Documentation <broker-framework>`.
+
+Tap
+***
+The tap is a mechanism that splits the packet stream in order to make a copy
+available for inspection. Examples include the monitoring port on a switch
+and an optical splitter on fiber networks.
+
+Frontend
+********
+The frontend is a discrete hardware device or on-host technique that splits
+traffic into many streams or flows. The Zeek binary does not do this job.
+There are numerous ways to accomplish this task, some of which are described
+below in `Frontend Options`_.
+
+Manager
+*******
+The manager is a Zeek process that has two primary jobs. It receives log
+messages and notices from the rest of the nodes in the cluster using the Zeek
+communications protocol (note that if you use a separate logger node, then the
+logger receives all logs instead of the manager). The result
+is a single log instead of many discrete logs that you have to
+combine in some manner with post-processing.
+The manager also supports other functionality and analysis which
+requires a centralized, global view of events or data.
+
+Logger
+******
+A logger is an optional Zeek process that receives log messages from the
+rest of the nodes in the cluster using the Zeek communications protocol.
+The purpose of having a logger receive logs instead of the manager is
+to reduce the load on the manager. If no logger is needed, then the
+manager will receive logs instead.
+
+Proxy
+*****
+A proxy is a Zeek process that may be used to offload data storage or
+any arbitrary workload. A cluster may contain multiple proxy nodes.
+The default scripts that come with Zeek make minimal use of proxies, so
+a single one may be sufficient, but customized use of them to partition
+data or workloads provides greater cluster scalability potential than
+just doing similar tasks on a single, centralized Manager node.
+
+Zeek processes acting as proxies don't tend to be extremely hard on CPU
+or memory and users frequently run proxy processes on the same physical
+host as the manager.
+
+Worker
+******
+The worker is the Zeek process that sniffs network traffic and does protocol
+analysis on the reassembled traffic streams. Most of the work of an active
+cluster takes place on the workers and as such, the workers typically
+represent the bulk of the Zeek processes that are running in a cluster.
+The fastest memory and CPU core speed you can afford is recommended
+since all of the protocol parsing and most analysis will take place here.
+There are no particular requirements for the disks in workers since almost all
+logging is done remotely to the manager, and normally very little is written
+to disk.
+
+Frontend Options
+----------------
+
+There are many options for setting up a frontend flow distributor. In many
+cases it is beneficial to do multiple stages of flow distribution
+on the network and on the host.
+
+Discrete hardware flow balancers
+********************************
+
+cPacket
+^^^^^^^
+
+If you are monitoring one or more 10G physical interfaces, the recommended
+solution is to use either a cFlow or cVu device from cPacket because they
+are used successfully at a number of sites. These devices will perform
+layer-2 load balancing by rewriting the destination Ethernet MAC address
+to cause each packet associated with a particular flow to have the same
+destination MAC. The packets can then be passed directly to a monitoring
+host where each worker has a BPF filter to limit its visibility to only that
+stream of flows, or onward to a commodity switch to split the traffic out to
+multiple 1G interfaces for the workers. This greatly reduces
+costs since workers can use relatively inexpensive 1G interfaces.
+
+On host flow balancing
+**********************
+
+PF_RING
+^^^^^^^
+
+The PF_RING software for Linux has a "clustering" feature which will do
+flow-based load balancing across a number of processes that are sniffing the
+same interface. This allows you to easily take advantage of multiple
+cores in a single physical host because Zeek's main event loop is single
+threaded and can't natively utilize all of the cores. If you want to use
+PF_RING, see the documentation on :ref:`how to configure Zeek with PF_RING
+<pf-ring-config>`.
+
+
+AF_PACKET
+^^^^^^^^^
+
+On Linux, Zeek supports `AF_PACKET sockets `_ natively.
+Currently, this is provided by including the `external Zeek::AF_Packet plugin `_
+in default builds of Zeek for Linux. Additional information can be found in
+the project's README file.
+
+To check the availability of the ``af_packet`` packet source, print its information using ``zeek -N``::
+
+ zeek -N Zeek::AF_Packet
+ Zeek::AF_Packet - Packet acquisition via AF_Packet (dynamic, version 3.2.0)
+
+On FreeBSD, MacOSX, or if Zeek was built with ``--disable-af-packet``, the
+plugin won't be available.
+
+Single worker mode
+""""""""""""""""""
+
+For the most basic usage, prefix the interface with ``af_packet::`` when invoking Zeek::
+
+ zeek -i af_packet::eth0
+
+Generally, running Zeek this way requires a privileged user with CAP_NET_RAW
+and CAP_NET_ADMIN capabilities. Linux supports file-based capabilities: A
+process executing an executable with capabilities will receive these.
+Using this mechanism allows running Zeek as an unprivileged user once the file
+capabilities have been added::
+
+ sudo setcap cap_net_raw,cap_net_admin=+eip /path/to/zeek
+
+Offloading and ethtool tuning
+"""""""""""""""""""""""""""""
+
+While not specific to AF_PACKET, it is recommended to disable any offloading
+features provided by the network card or Linux networking stack when running
+Zeek. This allows Zeek to see network packets as they arrive on the wire.
+See this `blog post `_
+for more background.
+
+Toggling these features can be done with the ``ethtool -K`` command, for example::
+
+ IFACE=eth0
+ for offload in rx tx sg tso ufo gso gro lro; do
+ ethtool -K $IFACE $offload off
+ done
+
+Detailed statistics about the interface can be gathered via ``ethtool -S``.
+
+For more details around the involved offloads consult the
+`ethtool manpage `_.
+
+Load balancing
+""""""""""""""
+
+The more interesting use-case is to use AF_PACKET to run multiple Zeek workers
+and have their packet sockets join what is called a fanout group.
+In such a setup, the network traffic is load-balanced across Zeek workers.
+By default load balancing is based on symmetric flow hashes [#]_.
+
+For example, running two Zeek workers listening on the same network interface,
+each worker analyzing approximately half of the network traffic, can be done
+as follows::
+
+ zeek -i af_packet::eth0 &
+ zeek -i af_packet::eth0 &
+
+The fanout group is identified by an id and configurable using the
+``AF_Packet::fanout_id`` constant which defaults to 23. In the example
+above, both Zeek workers join the same fanout group.
+
+
+.. note::
+
+ As a caveat, within the same Linux network namespace, two Zeek processes can
+ not use the same fanout group id for listening on different network interfaces.
+ If this is a setup you're planning on running, configure the fanout group
+ ids explicitly.
+ For illustration purposes, the following starts two Zeek workers each using
+ a different network interface and fanout group id::
+
+ zeek -i af_packet::eth0 AF_Packet::fanout_id=23 &
+ zeek -i af_packet::eth1 AF_Packet::fanout_id=24 &
+
+.. warning::
+
+ Zeek workers crashing or restarting due to running out of memory can,
+ for a short period of time, disturb load balancing due to their packet
+ sockets being removed and later rejoining the fanout group.
+ This may be visible in Zeek logs as gaps and/or duplicated connection
+ entries produced by different Zeek workers.
+
+See :ref:`cluster-configuration` for instructions on how to configure AF_PACKET
+with ZeekControl.
+
+
+Netmap
+^^^^^^
+
+`Netmap `_ is a framework for fast
+packet I/O that is natively supported on FreeBSD since version 10.
+On Linux it can be installed as an out-of-tree kernel module.
+
+FreeBSD
+"""""""
+FreeBSD's libpcap library supports netmap natively. This allows prefixing
+interface names with ``netmap:`` to instruct libpcap to open the interface
+in netmap mode. For example, a single Zeek worker can leverage netmap
+transparently using Zeek's default packet source as follows::
+
+ zeek -i netmap:em0
+
+.. warning::
+
+ Above command will put the em0 interface into kernel-bypass mode. Network
+ packets will pass directly to Zeek without being interpreted by the kernel.
+ If em0 is your primary network interface, this effectively disables
+ networking, including SSH connectivity.
+
+If your network card supports multiple rings, individual Zeek workers can be
+attached to these as well (this assumes the NIC does proper flow hashing in hardware)::
+
+ zeek -i netmap:em0-0
+ zeek -i netmap:em0-1
+
+For software load balancing support, the FreeBSD source tree includes the
+``lb`` tool to distribute packets into netmap pipes doing flow hashing
+in user-space.
+
+To compile and install ``lb``, ensure ``/usr/src`` is available on your
+FreeBSD system, then run the following commands::
+
+ cd /usr/src/tools/tools/netmap/
+ make
+ # Installs lb into /usr/local/bin
+ cp /usr/obj/usr/src/`uname -m`.`uname -m`/tools/tools/netmap/lb /usr/local/bin/
+
+
+To load-balance packets arriving on em0 into 4 different netmap pipes named
+``zeek}0`` through ``zeek}3``, run ``lb`` as follows::
+
+ lb -i em0 -p zeek:4
+ 410.154166 main [634] interface is em0
+ 411.377220 main [741] successfully opened netmap:em0
+ 411.377243 main [812] opening pipe named netmap:zeek{0/xT@1
+ 411.379200 main [829] successfully opened pipe #1 netmap:zeek{0/xT@1 (tx slots: 1024)
+ 411.379242 main [838] zerocopy enabled
+ ...
+
+Now, Zeek workers can attach to these four netmap pipes. When starting Zeek
+workers manually, the respective invocations would be as follows. The ``/x``
+suffix specifies exclusive mode to prevent two Zeek processes consuming packets
+from the same netmap pipe::
+
+ zeek -i netmap:zeek}0/x
+ zeek -i netmap:zeek}1/x
+ zeek -i netmap:zeek}2/x
+ zeek -i netmap:zeek}3/x
+
+For packet-level debugging, you can attach ``tcpdump`` to any of the netmap
+pipes in read monitor mode even while Zeek workers are consuming from them::
+
+ tcpdump -i netmap:zeek}1/r
+
+In case libpcap's netmap support is insufficient, the external
+`Zeek netmap plugin `_ can be installed.
+
+.. warning::
+
+ When using the zeek-netmap plugin on FreeBSD, the interface specification given to Zeek
+ needs to change from ``netmap:zeek}0/x`` to ``netmap::zeek}0/x`` - a single colon more.
+ In the first case, Zeek uses the default libpcap packet source and passes ``netmap:zeek}0``
+ as interface name. In the second case, ``netmap::`` is interpreted by Zeek and
+ the netmap packet source is instantiated. The ``zeek}0/x`` part is used as
+ interface name.
+
+Linux
+"""""
+
+While netmap isn't included in the Linux kernel, it can be installed as
+an out-of-tree kernel module.
+See the project's `GitHub repository `_
+for detailed instructions. This includes the ``lb`` tool for load balancing.
+
+On Linux, the external `zeek-netmap `_
+packet source plugin is required, or the system's libpcap library as used by
+Zeek needs to be recompiled with native netmap support. With the netmap kernel
+module loaded and the Zeek plugin installed, running a Zeek worker as follows
+will leverage netmap on Linux::
+
+ zeek -i netmap::eth1
+
+For using ``lb`` or libpcap with netmap support, refer to the commands shown
+in the FreeBSD section - these are essentially the same.
+
+
+.. _cluster-configuration:
+
+Cluster Configuration
+=====================
+
+A *Zeek Cluster* is a set of systems jointly analyzing the traffic of
+a network link in a coordinated fashion. You can operate such a setup from
+a central manager system easily using ZeekControl because it
+hides much of the complexity of the multi-machine installation.
+
+This section gives examples of how to setup common cluster configurations
+using ZeekControl. For a full reference on ZeekControl, see the
+`ZeekControl documentation`_.
+
+Preparing to Setup a Cluster
+----------------------------
+
+We refer to the user account used to set up the cluster
+as the "Zeek user". When setting up a cluster the Zeek user must be set up
+on all hosts, and this user must have ssh access from the manager to all
+machines in the cluster, and it must work without being prompted for a
+password/passphrase (for example, using ssh public key authentication).
+Also, on the worker nodes this user must have access to the target
+network interface in promiscuous mode.
+
+Additional storage must be available on all hosts under the same path,
+which we will call the cluster's prefix path. We refer to this directory
+as ``<prefix>``. If you build Zeek from source, then ``<prefix>`` is
+the directory specified with the ``--prefix`` configure option,
+or ``/usr/local/zeek`` by default. The Zeek user must be able to either
+create this directory or, where it already exists, must have write
+permission inside this directory on all hosts.
+
+When trying to decide how to configure the Zeek nodes, keep in mind that
+there can be multiple Zeek instances running on the same host. For example,
+it's possible to run a proxy and the manager on the same host. However, it is
+recommended to run workers on a different machine than the manager because
+workers can consume a lot of CPU resources. The maximum recommended
+number of workers to run on a machine should be one or two less than
+the number of CPU cores available on that machine. Using a load-balancing
+method (such as PF_RING) along with CPU pinning can decrease the load on
+the worker machines. Also, in order to reduce the load on the manager
+process, it is recommended to have a logger in your configuration. If a
+logger is defined in your cluster configuration, then it will receive logs
+instead of the manager process.
+
+Basic Cluster Configuration
+---------------------------
+
+With all prerequisites in place, perform the following steps to setup
+a Zeek cluster (do this as the Zeek user on the manager host only):
+
+- Edit the ZeekControl configuration file, ``<prefix>/etc/zeekctl.cfg``,
+ and change the value of any options to be more suitable for
+ your environment. You will most likely want to change the value of
+ the ``MailTo`` and ``LogRotationInterval`` options. A complete
+ reference of all ZeekControl options can be found in the
+ `ZeekControl documentation`_.
+
+- Edit the ZeekControl node configuration file, ``<prefix>/etc/node.cfg``
+ to define where logger, manager, proxies, and workers are to run. For a
+ cluster configuration, you must comment-out (or remove) the standalone node
+ in that file, and either uncomment or add node entries for each node
+ in your cluster (logger, manager, proxy, and workers). For example, if you
+ wanted to run five Zeek nodes (two workers, one proxy, a logger, and a
+ manager) on a cluster consisting of three machines, your cluster
+ configuration would look like this::
+
+ [logger]
+ type=logger
+ host=10.0.0.10
+
+ [manager]
+ type=manager
+ host=10.0.0.10
+
+ [proxy-1]
+ type=proxy
+ host=10.0.0.10
+
+ [worker-1]
+ type=worker
+ host=10.0.0.11
+ interface=eth0
+
+ [worker-2]
+ type=worker
+ host=10.0.0.12
+ interface=eth0
+
+ For a complete reference of all options that are allowed in the ``node.cfg``
+ file, see the `ZeekControl documentation`_.
+
+- Edit the network configuration file ``<prefix>/etc/networks.cfg``. This
+ file lists all of the networks which the cluster should consider as local
+ to the monitored environment.
+
+- Install Zeek on all machines in the cluster using ZeekControl::
+
+ > zeekctl install
+
+- See the `ZeekControl documentation`_
+ for information on setting up a cron job on the manager host that can
+ monitor the cluster.
+
+AF_PACKET Cluster Configuration
+-------------------------------
+
+Since version 5.2, Zeek includes AF_PACKET as a native packet source. This
+provides an easy and efficient capture mechanism for Linux users.
+
+Adapt the worker section in ZeekControl's ``node.cfg`` file with the
+following entries, assuming four worker processes listening on ``eth0`` ::
+
+ [worker-1]
+ type=worker
+ host=10.0.0.11
+ interface=eth0
+ lb_method=af_packet
+ lb_procs=4
+
+The specific options are ``lb_method=af_packet`` and ``lb_procs=4``.
+If listening on two or more interfaces on the same host is a requirement,
+remember to set a unique ``fanout_id`` using the node option ``af_packet_fanout_id``::
+
+ [worker-1-eth0]
+ type=worker
+ host=10.0.0.11
+ interface=eth0
+ lb_method=af_packet
+ lb_procs=4
+ af_packet_fanout_id=20
+
+ [worker-1-eth1]
+ type=worker
+ host=10.0.0.11
+ interface=eth1
+ lb_method=af_packet
+ lb_procs=4
+ af_packet_fanout_id=21
+
+Pinning the worker processes to individual CPU cores can improve performance.
+Use the node's option ``pin_cpus=4,5,6,7``, listing as many CPU numbers as
+processes at appropriate offsets.
+
+.. _pf-ring-config:
+
+PF_RING Cluster Configuration
+-----------------------------
+
+`PF_RING `_ allows speeding up the
+packet capture process by installing a new type of socket in Linux systems.
+It supports 10Gbit hardware packet filtering using standard network adapters,
+and user-space DNA (Direct NIC Access) for fast packet capture/transmission.
+
+.. note::
+
+   Unless you have determined that you specifically require PF_RING, consider using
+ AF_PACKET first and test if it fulfills your requirements. AF_PACKET has
+ been integrated into Zeek since version 5.2. It's a bit easier to get
+   started with as it does not require an out-of-tree Linux kernel module.
+
+Head over to :ref:`cluster-pf-ring` for more details.
+
+.. toctree::
+ :hidden:
+
+ cluster/pf_ring
+
+
+.. [#] Some Linux kernel versions between 3.10 and 4.7 might exhibit
+ a bug that prevents the required symmetric hashing. The script available
+ in the GitHub project `can-i-use-afpacket-fanout `_
+ can be used to verify whether ``PACKET_FANOUT`` works as expected.
+
+ This issue has been fixed in all stable kernels for at least 5 years.
+ You're unlikely to be affected.
diff --git a/doc/cluster/pf_ring.rst b/doc/cluster/pf_ring.rst
new file mode 100644
index 0000000000..aa124491c2
--- /dev/null
+++ b/doc/cluster/pf_ring.rst
@@ -0,0 +1,141 @@
+.. _cluster-pf-ring:
+
+===================
+PF_RING Setup Guide
+===================
+
+Installing PF_RING
+******************
+
+1. Download and install PF_RING for your system following the instructions
+   `here <https://www.ntop.org/guides/pf_ring/get_started/>`_. The following
+ commands will install the PF_RING libraries and kernel module (replace
+ the version number 5.6.2 in this example with the version that you
+ downloaded)::
+
+ cd /usr/src
+ tar xvzf PF_RING-5.6.2.tar.gz
+ cd PF_RING-5.6.2/userland/lib
+ ./configure --prefix=/opt/pfring
+ make install
+
+ cd ../libpcap
+ ./configure --prefix=/opt/pfring
+ make install
+
+ cd ../tcpdump-4.1.1
+ ./configure --prefix=/opt/pfring
+ make install
+
+ cd ../../kernel
+ make
+ make install
+
+ modprobe pf_ring enable_tx_capture=0 min_num_slots=32768
+
+ Refer to the documentation for your Linux distribution on how to load the
+ pf_ring module at boot time. You will need to install the PF_RING
+ library files and kernel module on all of the workers in your cluster.
+
+2. Download the Zeek source code.
+
+3. Configure and install Zeek using the following commands::
+
+ ./configure --with-pcap=/opt/pfring
+ make
+ make install
+
+4. Make sure Zeek is correctly linked to the PF_RING libpcap libraries::
+
+ ldd /usr/local/zeek/bin/zeek | grep pcap
+ libpcap.so.1 => /opt/pfring/lib/libpcap.so.1 (0x00007fa6d7d24000)
+
+5. Configure ZeekControl to use PF_RING (explained below).
+
+6. Run "zeekctl install" on the manager. This command will install Zeek and
+ required scripts to all machines in your cluster.
+
+Using PF_RING
+*************
+
+In order to use PF_RING, you need to specify the correct configuration
+options for your worker nodes in ZeekControl's node configuration file.
+Edit the ``node.cfg`` file and specify ``lb_method=pf_ring`` for each of
+your worker nodes. Next, use the ``lb_procs`` node option to specify how
+many Zeek processes you'd like that worker node to run, and optionally pin
+those processes to certain CPU cores with the ``pin_cpus`` option (CPU
+numbering starts at zero). The correct ``pin_cpus`` setting to use is
+dependent on your CPU architecture (Intel and AMD systems enumerate
+processors in different ways). Using the wrong ``pin_cpus`` setting
+can cause poor performance. Here is what a worker node entry should
+look like when using PF_RING and CPU pinning::
+
+ [worker-1]
+ type=worker
+ host=10.0.0.50
+ interface=eth0
+ lb_method=pf_ring
+ lb_procs=10
+ pin_cpus=2,3,4,5,6,7,8,9,10,11
+
+
+Using PF_RING+DNA with symmetric RSS
+************************************
+
+You must have a PF_RING+DNA license in order to do this. You can sniff
+each packet only once.
+
+1. Load the DNA NIC driver (i.e. ixgbe) on each worker host.
+
+2. Run "ethtool -L dna0 combined 10" (this will establish 10 RSS queues
+ on your NIC) on each worker host. You must make sure that you set the
+ number of RSS queues to the same as the number you specify for the
+ lb_procs option in the node.cfg file.
+
+3. On the manager, configure your worker(s) in node.cfg::
+
+ [worker-1]
+ type=worker
+ host=10.0.0.50
+ interface=dna0
+ lb_method=pf_ring
+ lb_procs=10
+
+
+Using PF_RING+DNA with pfdnacluster_master
+******************************************
+
+You must have a PF_RING+DNA license and a libzero license in order to do
+this. You can load balance between multiple applications and sniff the
+same packets multiple times with different tools.
+
+1. Load the DNA NIC driver (i.e. ixgbe) on each worker host.
+
+2. Run "ethtool -L dna0 1" (this will establish 1 RSS queues on your NIC)
+ on each worker host.
+
+3. Run the pfdnacluster_master command on each worker host. For example::
+
+ pfdnacluster_master -c 21 -i dna0 -n 10
+
+ Make sure that your cluster ID (21 in this example) matches the interface
+ name you specify in the node.cfg file. Also make sure that the number
+ of processes you're balancing across (10 in this example) matches
+ the lb_procs option in the node.cfg file.
+
+4. If you are load balancing to other processes, you can use the
+ pfringfirstappinstance variable in zeekctl.cfg to set the first
+ application instance that Zeek should use. For example, if you are running
+ pfdnacluster_master with "-n 10,4" you would set
+ pfringfirstappinstance=4. Unfortunately that's still a global setting
+ in zeekctl.cfg at the moment but we may change that to something you can
+ set in node.cfg eventually.
+
+5. On the manager, configure your worker(s) in node.cfg::
+
+ [worker-1]
+ type=worker
+ host=10.0.0.50
+ interface=dnacluster:21
+ lb_method=pf_ring
+ lb_procs=10
diff --git a/doc/components/index.rst b/doc/components/index.rst
new file mode 100644
index 0000000000..45092f3baa
--- /dev/null
+++ b/doc/components/index.rst
@@ -0,0 +1,33 @@
+
+=============
+Subcomponents
+=============
+
+To find documentation for the various subcomponents of Zeek, see their
+respective GitHub repositories or documentation:
+
+* `Spicy <https://github.com/zeek/spicy>`__
+  - C++ parser generator for dissecting protocols & files.
+* `BinPAC <https://github.com/zeek/binpac>`__
+  - A protocol parser generator
+* `ZeekControl <https://github.com/zeek/zeekctl>`__
+  - Interactive Zeek management shell
+* `Zeek-Aux <https://github.com/zeek/zeek-aux>`__
+  - Small auxiliary tools for Zeek
+* `BTest <https://github.com/zeek/btest>`__
+  - A system testing framework
+* `Capstats <https://github.com/zeek/capstats>`__
+  - Command-line packet statistic tool
+* `PySubnetTree <https://github.com/zeek/pysubnettree>`__
+  - Python module for CIDR lookups
+* `trace-summary <https://github.com/zeek/trace-summary>`__
+  - Script for generating break-downs of network traffic
+* `Broker <https://github.com/zeek/broker>`__
+  - Zeek's Messaging Library
+  - `(Docs) <https://docs.zeek.org/projects/broker>`__
+* `Package Manager <https://github.com/zeek/package-manager>`__
+  - A package manager for Zeek
+  - `(Docs) <https://docs.zeek.org/projects/package-manager/en/stable/>`__
+* `Paraglob <https://github.com/zeek/paraglob>`__
+  - A pattern matching data structure for Zeek.
+  - `(Docs) <https://github.com/zeek/paraglob/blob/master/README.md>`__
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000000..69de553ac5
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,305 @@
+#
+# Zeek documentation build configuration file, created by sphinx-quickstart
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import os
+import sys
+
+extensions = []
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath("ext"))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions += [
+ "zeek",
+ "sphinx.ext.todo",
+ "zeek_pygments",
+ "spicy-pygments",
+ "literal-emph",
+ "sphinx.ext.extlinks",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The suffix of source filenames.
+source_suffix = ".rst"
+
+# The encoding of source files.
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = "index"
+
+# General information about the project.
+project = "Zeek"
+copyright = "by the Zeek Project"
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+#
+
+version = "source"
+
+try:
+ # Use the actual Zeek version if available
+ with open("../VERSION") as f:
+ version = f.readline().strip()
+except:
+ try:
+ import re
+
+ import git
+
+ repo = git.Repo(os.path.abspath("."))
+ version = "git/master"
+
+ version_tag_re = r"v\d+\.\d+(\.\d+)?"
+ version_tags = [
+ t
+ for t in repo.tags
+ if t.commit == repo.head.commit and re.match(version_tag_re, str(t))
+ ]
+ # Note: sorting by tag date doesn't necessarily give correct
+ # order in terms of version numbers, but doubtful that will ever be
+ # a problem (if we ever do re-tag an old version number on a given
+ # commit such that it is incorrectly found as the most recent version,
+ # we can just re-tag all the other version numbers on that same commit)
+ version_tags = sorted(version_tags, key=lambda t: t.tag.tagged_date)
+
+ if version_tags:
+ version = str(version_tags[-1])
+
+ except:
+ pass
+
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# In terms of the actual hyperlink URL, a more ideal/stable way to reference
+# source code on GitHub would be by commit hash, but that can be tricky to
+# update in a way that produces stable Sphinx/reST configuration: don't want
+# to update the commit-hash for every Zeek commit unless it actually produces
+# new content, and also don't want to accidentally make it easy for people to
+# insert unreachable commits when manually running
+# `zeek/ci/update-zeekygen-docs.sh`.
+#
+# We only have a few versions of docs that actually matter: `master` and
+# `release/.*`, and the tip of those branches will always be in sync with
+# auto-generated content by simply having `zeek/ci/update-zeekygen-docs.sh`
+# change this to `release/.*` when needed.
+zeek_code_version = "master"
+zeek_code_url = f"https://github.com/zeek/zeek/blob/{zeek_code_version}"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+# language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+# today = ''
+# Else, today_fmt is used as the format for a strftime call.
+today_fmt = "%B %d, %Y"
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = [".#*", "script-reference/autogenerated-*"]
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+show_authors = True
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+highlight_language = "none"
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+html_theme = "sphinx_rtd_theme"
+
+# Set canonical URL from the Read the Docs Domain
+html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "")
+
+# Tell Jinja2 templates the build is running on Read the Docs
+if os.environ.get("READTHEDOCS", "") == "True":
+ if "html_context" not in globals():
+ html_context = {}
+ html_context["READTHEDOCS"] = True
+
+html_last_updated_fmt = "%B %d, %Y"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+html_theme_options = {
+ "analytics_id": "UA-144186885-1",
+ "collapse_navigation": False,
+ "style_external_links": True,
+}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# " v Documentation".
+html_title = f"Book of Zeek ({release})"
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+html_logo = "images/zeek-logo-sidebar.png"
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+html_favicon = "images/zeek-favicon.ico"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+
+def setup(app):
+ app.add_css_file("theme_overrides.css")
+ from sphinx.highlighting import lexers
+ from zeek_pygments import ZeekLexer
+
+ lexers["zeek"] = ZeekLexer()
+ app.add_config_value("zeek-code-url", zeek_code_url, "env")
+
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+# html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+# html_sidebars = {
+#'**': ['localtoc.html', 'sourcelink.html', 'searchbox.html'],
+# }
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+# html_domain_indices = True
+
+# If false, no index is generated.
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "zeek-docs"
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+# latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+# latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ("index", "Zeek.tex", "Zeek Documentation", "The Zeek Project", "manual"),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+# latex_show_urls = False
+
+# Additional stuff for the LaTeX preamble.
+# latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+# latex_appendices = []
+
+# If false, no module index is generated.
+# latex_domain_indices = True
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [("index", "zeek", "Zeek Documentation", ["The Zeek Project"], 1)]
+
+# -- Options for todo plugin --------------------------------------------
+todo_include_todos = True
+
+extlinks = {
+ "slacklink": ("https://zeek.org/slack%s", None),
+ "discourselink": ("https://community.zeek.org/%s", None),
+ "spicylink": ("https://docs.zeek.org/projects/spicy/en/latest/%s", None),
+}
+extlinks_detect_hardcoded_links = True
diff --git a/doc/customizations.rst b/doc/customizations.rst
new file mode 100644
index 0000000000..fc6b15e826
--- /dev/null
+++ b/doc/customizations.rst
@@ -0,0 +1,318 @@
+.. _popular-customizations:
+
+======================
+Popular Customizations
+======================
+
+This page outlines customizations and additions that are popular
+among Zeek users.
+
+.. note::
+
+ This page lists externally-maintained Zeek packages. The Zeek team does not
+ provide support or maintenance for these packages. If you find bugs or have
+ feature requests, please reach out to the respective package maintainers directly.
+
+ You may also post in the :slacklink:`Zeek Slack <>` #packages
+ channel or :discourselink:`forum <>` to get help from the broader
+ Zeek community.
+
+
+Log Enrichment
+==============
+
+Community ID
+------------
+
+.. versionadded:: 6.0
+
+Zeek includes native `Community ID Flow Hashing`_ support. This functionality
+has previously been provided through the `zeek-community-id`_ package.
+
+.. note::
+
+ At this point, the external `zeek-community-id`_ package is still
+ available to support Zeek deployments running older versions. However,
+ the scripts provided by the package cause conflicts with those provided in
+ Zeek 6.0 - do not load both.
+
+Loading the
+:doc:`/scripts/policy/protocols/conn/community-id-logging.zeek`
+and
+:doc:`/scripts/policy/frameworks/notice/community-id.zeek`
+scripts adds an additional ``community_id`` field to the
+:zeek:see:`Conn::Info` and :zeek:see:`Notice::Info` record.
+
+.. code-block:: console
+
+ $ zeek -r ./traces/get.trace protocols/conn/community-id-logging LogAscii::use_json=T
+ $ jq < conn.log
+ {
+ "ts": 1362692526.869344,
+ "uid": "CoqLmg1Ds5TE61szq1",
+ "id.orig_h": "141.142.228.5",
+ "id.orig_p": 59856,
+ "id.resp_h": "192.150.187.43",
+ "id.resp_p": 80,
+ "proto": "tcp",
+ ...
+ "community_id": "1:yvyB8h+3dnggTZW0UEITWCst97w="
+ }
+
+
+The Community ID Flow Hash of a :zeek:see:`conn_id` instance can be computed
+with the :zeek:see:`community_id_v1` builtin function directly on the command-line
+or used in custom scripts.
+
+.. code-block:: console
+
+ $ zeek -e 'print community_id_v1([$orig_h=141.142.228.5, $orig_p=59856/tcp, $resp_h=192.150.187.43, $resp_p=80/tcp])'
+ 1:yvyB8h+3dnggTZW0UEITWCst97w=
+
+.. _Community ID Flow Hashing: https://github.com/corelight/community-id-spec
+.. _zeek-community-id: https://github.com/corelight/zeek-community-id/
+
+.. _geolocation:
+
+Address geolocation and AS lookups
+----------------------------------
+
+.. _libmaxminddb: https://github.com/maxmind/libmaxminddb
+
+Zeek supports IP address geolocation as well as AS (autonomous system)
+lookups. This requires two things:
+
+ * Compilation of Zeek with the `libmaxminddb`_ library and development
+ headers. If you're using our :ref:`Docker images ` or
+ :ref:`binary packages `, there's nothing to do: they ship
+ with GeoIP support.
+ * Installation of corresponding MaxMind database files on your
+ system.
+
+To check whether your Zeek supports geolocation, run ``zeek-config --have-geoip``
+(available since Zeek 6.2) or simply try an address lookup. The following
+indicates that your Zeek lacks support:
+
+.. code-block:: console
+
+ $ zeek -e 'lookup_location(1.2.3.4)'
+   error in <command line>, line 1: Zeek was not configured for GeoIP support (lookup_location(1.2.3.4))
+
+Read on for more details about building Zeek with GeoIP support, and how to
+configure access to the database files.
+
+Building Zeek with libmaxminddb
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you build Zeek yourself, you need to install libmaxminddb prior to
+configuring your build.
+
+* RPM/RedHat-based Linux:
+
+ .. code-block:: console
+
+ sudo yum install libmaxminddb-devel
+
+* DEB/Debian-based Linux:
+
+ .. code-block:: console
+
+ sudo apt-get install libmaxminddb-dev
+
+* FreeBSD:
+
+ .. code-block:: console
+
+ sudo pkg install libmaxminddb
+
+* Mac OS X:
+
+ You need to install from your preferred package management system
+ (e.g. Homebrew, MacPorts, or Fink). For Homebrew, the name of the package
+ that you need is libmaxminddb.
+
+The ``configure`` script's output indicates whether it successfully located
+libmaxminddb. If your system's MaxMind library resides in a non-standard path,
+you may need to specify it via ``./configure --with-geoip=<path>``.
+
+Installing and configuring GeoIP databases
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+MaxMind's databases ship as individual files that you can `download
+<https://www.maxmind.com/en/accounts/current/geoip/downloads>`_ from their
+website after `signing up <https://www.maxmind.com/en/geolite2/signup>`_ for an
+account. Some Linux distributions also offer free databases in their package
+managers.
+
+There are three types of databases: city-level geolocation, country-level
+geolocation, and mapping of IP addresses to autonomous systems (AS number and
+organization). Download these and decide on a place to put them on your
+file system. If you use automated tooling or system packages for the
+installation, that path may be chosen for you, such as ``/usr/share/GeoIP``.
+
+Zeek provides three ways to configure access to the databases:
+
+* Specifying the path and filenames via script variables. Use the
+ :zeek:see:`mmdb_dir` variable, unset by default, to point to the directory
+ containing the database(s). By default Zeek looks for databases called
+ ``GeoLite2-City.mmdb``, ``GeoLite2-Country.mmdb``, and
+ ``GeoLite2-ASN.mmdb``. Starting with Zeek 6.2 you can adjust these names by
+ redefining the :zeek:see:`mmdb_city_db`, :zeek:see:`mmdb_country_db`, and
+ :zeek:see:`mmdb_asn_db` variables.
+* Relying on Zeek's pre-configured search paths and filenames. The
+ :zeek:see:`mmdb_dir_fallbacks` variable contains default
+ search paths that Zeek will try in turn when :zeek:see:`mmdb_dir` is not
+ set. Prior to Zeek 6.2 these paths were hardcoded; they're now redefinable.
+ For geolocation, Zeek first attempts the city-level databases due to their
+  greater precision, and falls back to the country-level one. You can adjust the
+ database filenames via :zeek:see:`mmdb_city_db` and related variables, as
+ covered above.
+* Opening databases explicitly via scripting. The
+ :zeek:see:`mmdb_open_location_db` and :zeek:see:`mmdb_open_asn_db`
+ functions take full paths to database files. Zeek only ever uses one
+ geolocation and one ASN database, and these loads override any databases
+ previously loaded. These loads can occur at any point.
+
+Querying the databases
+^^^^^^^^^^^^^^^^^^^^^^
+
+Two built-in functions provide GeoIP functionality:
+
+.. code-block:: zeek
+
+ function lookup_location(a:addr): geo_location
+ function lookup_autonomous_system(a:addr): geo_autonomous_system
+
+:zeek:see:`lookup_location` returns a :zeek:see:`geo_location` record with
+country/region/etc fields, while :zeek:see:`lookup_autonomous_system` returns a
+:zeek:see:`geo_autonomous_system` record indicating the AS number and
+organization. Depending on the queried IP address some fields may be
+uninitialized, so you should guard access with an ``a?$b`` :ref:`existence test
+<existence-test>`.
+
+Zeek tests the database files for staleness. If it detects that a database has
+been updated, it will automatically reload it. Zeek does not automatically add
+GeoIP intelligence to its logs, but several add-on scripts and packages provide
+such functionality. These include:
+
+* The :ref:`notice framework ` lets you configure notice types
+ that you'd like to augment with location information. See
+ :zeek:see:`Notice::lookup_location_types` and
+ :zeek:see:`Notice::ACTION_ADD_GEODATA` for details.
+* The :doc:`/scripts/policy/protocols/smtp/detect-suspicious-orig.zeek` and
+ :doc:`/scripts/policy/protocols/ssh/geo-data.zeek` policy scripts.
+* Several `Zeek packages <https://packages.zeek.org/>`_.
+
+Testing
+^^^^^^^
+
+Before using the GeoIP functionality it is a good idea to verify that
+everything is setup correctly. You can quickly check if the GeoIP
+functionality works by running commands like these:
+
+.. code-block:: console
+
+ zeek -e "print lookup_location(8.8.8.8);"
+
+If you see an error message similar to "Failed to open GeoIP location database",
+then your database configuration is broken. You may need to rename or move your
+GeoIP database files.
+
+Example
+^^^^^^^
+
+The following shows every FTP connection from hosts in Ohio, US:
+
+.. code-block:: zeek
+
+ event ftp_reply(c: connection, code: count, msg: string, cont_resp: bool)
+ {
+ local client = c$id$orig_h;
+ local loc = lookup_location(client);
+
+ if (loc?$region && loc$region == "OH" && loc?$country_code && loc$country_code == "US")
+ {
+ local city = loc?$city ? loc$city : "";
+
+ print fmt("FTP Connection from:%s (%s,%s,%s)", client, city,
+ loc$region, loc$country_code);
+ }
+ }
+
+
+Log Writers
+===========
+
+Kafka
+-----
+
+For exporting logs to `Apache Kafka`_ in a streaming fashion, the externally-maintained
+`zeek-kafka`_ package is a popular choice and easy to configure. It relies on `librdkafka`_.
+
+.. code-block:: zeek
+
+ redef Log::default_writer = Log::WRITER_KAFKAWRITER;
+
+ redef Kafka::kafka_conf += {
+ ["metadata.broker.list"] = "192.168.0.1:9092"
+ };
+
+.. _Apache Kafka: https://kafka.apache.org/
+.. _zeek-kafka: https://github.com/SeisoLLC/zeek-kafka/
+.. _librdkafka: https://github.com/confluentinc/librdkafka
+
+
+Logging
+=======
+
+JSON Streaming Logs
+-------------------
+
+The externally-maintained `json-streaming-logs`_ package tailors Zeek
+for use with log shippers like `Filebeat`_ or `fluentd`_. It configures
+additional log files prefixed with ``json_streaming_``, adds ``_path``
+and ``_write_ts`` fields to log records and configures log rotation
+appropriately.
+
+If you do not use a logging archive and want to stream all logs away
+from the system where Zeek is running without leveraging Kafka, this
+package helps you with that.
+
+.. _json-streaming-logs: https://github.com/corelight/json-streaming-logs
+.. _Filebeat: https://www.elastic.co/beats/filebeat
+.. _fluentd: https://www.fluentd.org/
+
+
+Long Connections
+----------------
+
+Zeek logs connection entries into the :file:`conn.log` only upon termination
+or due to expiration of inactivity timeouts. Depending on the protocol and
+chosen timeout values this can significantly delay the appearance of a log
+entry for a given connection. The delay may be up to an hour for lingering
+SSH connections or connections where the final FIN or RST packets were missed.
+
+The `zeek-long-connections`_ package alleviates this by creating a :file:`conn_long.log`
+log with the same format as :file:`conn.log`, but containing entries for connections
+that have been existing for configurable intervals.
+By default, the first entry for a connection is logged after 10mins. Depending on
+the environment, this can be lowered as even a 10 minute delay may be significant
+for detection purposes in streaming setups.
+
+.. _zeek-long-connections: https://github.com/corelight/zeek-long-connections
+
+
+Profiling and Debugging
+=======================
+
+jemalloc profiling
+------------------
+
+For investigation of memory leaks or state-growth issues within Zeek,
+jemalloc's profiling is invaluable. A package providing a bit of support
+for configuring jemalloc's profiling facilities is `zeek-jemalloc-profiling`_.
+
+Some general information about memory profiling exists in the :ref:`Troubleshooting `
+section.
+
+.. _zeek-jemalloc-profiling: https://github.com/JustinAzoff/zeek-jemalloc-profiling
diff --git a/doc/devel/cluster-backend-zeromq.rst b/doc/devel/cluster-backend-zeromq.rst
new file mode 100644
index 0000000000..c07c523786
--- /dev/null
+++ b/doc/devel/cluster-backend-zeromq.rst
@@ -0,0 +1,120 @@
+.. _cluster_backend_zeromq:
+
+======================
+ZeroMQ Cluster Backend
+======================
+
+.. versionadded:: 7.1
+
+*Experimental*
+
+Quickstart
+==========
+
+To switch a Zeek cluster with a static cluster layout over to use ZeroMQ
+as cluster backend, add the following snippet to ``local.zeek``:
+
+.. code-block:: zeek
+
+ @load frameworks/cluster/backend/zeromq/connect
+
+
+Note that the function :zeek:see:`Broker::publish` will be non-functional
+and a warning emitted when used - use :zeek:see:`Cluster::publish` instead.
+
+By default, a configuration based on hard-coded endpoints and cluster layout
+information is created. For more customization, refer to the module documentation
+at :doc:`cluster/backend/zeromq/main.zeek </scripts/policy/frameworks/cluster/backend/zeromq/main.zeek>`.
+
+
+Architecture
+============
+
+Publish-Subscribe of Zeek Events
+--------------------------------
+
+The `ZeroMQ <https://zeromq.org/>`_ based cluster backend uses a central
+XPUB/XSUB broker for publish-subscribe functionality. Zeek events published
+via :zeek:see:`Cluster::publish` are distributed by this central broker to
+interested nodes.
+
+.. figure:: /images/cluster/zeromq-pubsub.png
+
+
+As depicted in the figure above, each cluster node connects to the central
+broker twice, once via its XPUB socket and once via its XSUB socket. This
+results in two TCP connections from every cluster node to the central broker.
+This setup allows every node in the cluster to see messages from all other
+nodes, avoiding the need for cluster topology awareness.
+
+.. note::
+
+   Scalability of the central broker may become a concern in production
+   setups, but for small clusters on a single node, it may be fast enough.
+
+On a cluster node, the XPUB socket provides notifications about subscriptions
+created by other nodes: For every subscription created by any node in
+the cluster, the :zeek:see:`Cluster::Backend::ZeroMQ::subscription` event is
+raised locally on every other node (unless another node had created the same
+subscription previously).
+
+This mechanism is used to discover the existence of other cluster nodes by
+matching the topics with the prefix for node specific subscriptions as produced
+by :zeek:see:`Cluster::nodeid_topic`.
+
+As of now, the implementation of the central broker calls ZeroMQ's
+``zmq::proxy()`` function to forward messages between the XPUB and
+XSUB socket.
+
+While the diagram above indicates the central broker being deployed separately
+from Zeek cluster nodes, by default the manager node will start and run this
+broker using a separate thread. There's nothing that would prevent you from
+running a long-running central broker independently of the Zeek cluster nodes, however.
+
+The serialization of Zeek events is done by the selected
+:zeek:see:`Cluster::event_serializer` and is independent of ZeroMQ.
+The central broker needs no knowledge about the chosen format, it is
+only shuffling messages between nodes.
+
+
+Logging
+-------
+
+While remote events always pass through the central broker, nodes connect and
+send log writes directly to logger nodes in a cluster. The ZeroMQ cluster backend
+leverages ZeroMQ's pipeline pattern for this functionality. That is, logger nodes
+(including the manager if configured using :zeek:see:`Cluster::manager_is_logger`)
+open a ZeroMQ PULL socket to receive log writes. All other nodes connect their
+PUSH socket to all available PULL sockets. These connections are separate from
+the publish-subscribe setup outlined above.
+
+When sending log-writes over a PUSH socket, load balancing is done by ZeroMQ.
+Individual cluster nodes do not have control over the decision which logger
+node receives log writes at any given time.
+
+.. figure:: /images/cluster/zeromq-logging.png
+
+While the previous paragraph used "log writes", a single message to a logger
+node actually contains a batch of log writes. The options :zeek:see:`Log::flush_interval`
+and :zeek:see:`Log::write_buffer_size` control the frequency and maximum size
+of these batches.
+
+The serialization format used to encode such batches is controlled by the
+selected :zeek:see:`Cluster::log_serializer` and is independent of ZeroMQ.
+
+With the default serializer (:zeek:see:`Cluster::LOG_SERIALIZER_ZEEK_BIN_V1`),
+every log batch on the wire has a header prepended that describes it. This allows
+interpretation of log writes even by non-Zeek processes. This opens the possibility
+to implement non-Zeek logger processes as long as the chosen serializer format
+is understood by the receiving process. In the future, a JSON lines serialization
+may be provided, allowing easier interpretation than a proprietary binary format.
+
+
+Summary
+-------
+
+Combining the diagrams above, the connections between the different socket
+types in a Zeek cluster look something like the following.
+
+.. figure:: /images/cluster/zeromq-cluster.png
+
diff --git a/doc/devel/contributors.rst b/doc/devel/contributors.rst
new file mode 100644
index 0000000000..f1ec537a16
--- /dev/null
+++ b/doc/devel/contributors.rst
@@ -0,0 +1,111 @@
+
+===================
+Contributor's Guide
+===================
+
+See below for a selection of some of the more common contribution guidelines
+maintained directly in the `Zeek wiki
+<https://github.com/zeek/zeek/wiki>`_.
+
+General Contribution Process
+============================
+
+See https://github.com/zeek/zeek/wiki/Contribution-Guide
+
+Coding Style and Conventions
+============================
+
+See https://github.com/zeek/zeek/wiki/Coding-Style-and-Conventions
+
+General Documentation Structure/Process
+=======================================
+
+See the :doc:`README </README>` file of https://github.com/zeek/zeek-docs
+
+Documentation Style and Conventions
+===================================
+
+See https://github.com/zeek/zeek/wiki/Documentation-Style-and-Conventions
+
+Checking for Memory Errors and Leaks
+====================================
+
+See https://github.com/zeek/zeek/wiki/Checking-for-Memory-Errors-and-Leaks
+
+Maintaining long-lived forks of Zeek
+====================================
+
+Consistent formatting of the Zeek codebase is enforced automatically by
+configurations tracked in the repository. Upstream updates to these
+configurations can lead to formatting changes which could cause merge conflicts
+for long-lived forks.
+
+Currently the following configuration files in the root directory are used:
+
+- ``.pre-commit-config.yaml``: Configuration for `pre-commit <https://pre-commit.com>`_.
+ We use pre-commit to manage and orchestrate formatters and linters.
+- ``.clang-format``: Configuration for `clang-format
+ <https://clang.llvm.org/docs/ClangFormat.html>`_ for formatting C++ files.
+- ``.style.yapf``: Configuration for `YAPF <https://github.com/google/yapf>`_
+ for formatting Python files.
+- ``.cmake-format.json``: Configuration for `cmake-format
+ <https://github.com/cheshirekow/cmake_format>`_ for formatting CMake files.
+
+With these configuration files present ``pre-commit run --all-files`` will
+install all needed formatters and reformat all files in the repository
+according to the current configuration.
+
+.. rubric:: Workflow: Zeek ``master`` branch regularly merged into fork
+
+If Zeek's master branch is regularly merged into the fork, merge conflicts can
+be resolved once and their resolution is tracked in the repository. Similarly,
+we can explicitly reformat the fork once and then merge the upstream branch.
+
+.. code-block:: sh
+
+ ## Get and stage latest versions of configuration files from master.
+ git checkout master -- .pre-commit-config.yaml .clang-format .style.yapf .cmake-format.json
+
+ ## Reformat fork according to new configuration.
+ pre-commit run -a
+
+ ## Record reformatted state of fork.
+ git add -u && git commit -m 'Reformat'
+
+ # Merge in master, resolve merge conflicts as usual.
+ git merge master
+
+.. rubric:: Workflow: Fork regularly rebased onto Zeek ``master`` branch
+
+If the target for a rebase has been reformatted individual diff hunks might not
+apply cleanly anymore. There are different approaches to work around that. The
+approach with the least conflicts is likely to first reformat the fork
+according to upstream style without pulling in changes, and only after that
+rebase on upstream and resolve potential semantic conflicts.
+
+.. code-block:: sh
+
+ # Create a commit updating the configuration files.
+ git checkout master -- .pre-commit-config.yaml .clang-format .style.yapf .cmake-format.json
+ git commit -m 'Bump formatter configurations'
+
+ # With a fork branched from upstream at commit FORK_COMMIT, rebase the
+ # config update commit 'Bump formatter configurations' to the start of the
+ # fork, but do not yet rebase on master (interactively move the last patch
+ # to the start of the list of patches).
+ git rebase -i FORK_COMMIT
+
+ # Reformat all commits according to configs at the base. We use the '--exec'
+ # flag of 'git rebase' to execute pre-commit after applying each patch. If
+ # 'git rebase' detects uncommitted changes it stops automatic progress so
+ # one can inspect and apply the changes.
+ git rebase -i FORK_COMMIT --exec 'pre-commit run --all-files'
+ # When this stops, inspect changes and stage them.
+ git add -u
+ # Continue rebasing. This prompts for a commit message and amends the last
+ # patch.
+ git rebase --continue
+
+ # The fork is now formatted according to upstream style. Rebase on master,
+ # and drop the 'Bump formatter configurations' patch from the list of patches.
+ git rebase -i master
diff --git a/doc/devel/index.rst b/doc/devel/index.rst
new file mode 100644
index 0000000000..7ffe4baf86
--- /dev/null
+++ b/doc/devel/index.rst
@@ -0,0 +1,21 @@
+
+================
+Developer Guides
+================
+
+In addition to documentation found or mentioned below, some developer-oriented
+content is maintained directly in the `Zeek wiki
+<https://github.com/zeek/zeek/wiki>`_ due to the nature of
+the content (e.g. the author finds it to be more dynamic, informal, meta,
+transient, etc. compared to other documentation).
+
+.. toctree::
+ :maxdepth: 2
+
+ plugins
+ spicy/index
+ websocket-api
+ Documentation Guide
+ contributors
+ maintainers
+ cluster-backend-zeromq
diff --git a/doc/devel/maintainers.rst b/doc/devel/maintainers.rst
new file mode 100644
index 0000000000..0bc179bcc9
--- /dev/null
+++ b/doc/devel/maintainers.rst
@@ -0,0 +1,13 @@
+
+==================
+Maintainer's Guide
+==================
+
+Some notable guidelines for maintainers are linked below for convenience, but
+they are generally maintained directly in the `Zeek wiki
+<https://github.com/zeek/zeek/wiki>`_.
+
+Release Process
+===============
+
+See https://github.com/zeek/zeek/wiki/Release-Process
diff --git a/doc/devel/plugins.rst b/doc/devel/plugins.rst
new file mode 100644
index 0000000000..5381c1ef24
--- /dev/null
+++ b/doc/devel/plugins.rst
@@ -0,0 +1,505 @@
+.. _zkg package manager: https://docs.zeek.org/projects/package-manager/en/stable/
+
+.. _writing-plugins:
+
+===============
+Writing Plugins
+===============
+
+Zeek provides a plugin API that enables extending
+the system dynamically, without modifying the core code base. That way,
+custom code remains self-contained and can be maintained, compiled,
+and installed independently. Currently, plugins can add the following
+functionality to Zeek:
+
+ - Zeek scripts.
+
+ - Builtin functions/events/types for the scripting language.
+
+ - Protocol analyzers.
+
+ - File analyzers.
+
+ - Packet sources and packet dumpers.
+
+ - Logging framework backends.
+
+ - Input framework readers.
+
+A plugin's functionality is available to the user just as if Zeek had
+the corresponding code built-in. Indeed, internally many of Zeek's
+pieces are structured as plugins as well, they are just statically
+compiled into the binary rather than loaded dynamically at runtime.
+
+.. note::
+
+ Plugins and Zeek packages are related but separate concepts. Both extend
+ Zeek's functionality without modifying Zeek's source code. A plugin achieves
+ this via compiled, native code that Zeek links into its core at runtime. A Zeek
+ package, on the other hand, is a modular addition to Zeek, managed via the
+ `zkg package manager`_, that may or may not include a plugin. More commonly,
+ packages consist of script-layer additions to Zeek's functionality. Packages
+ also feature more elaborate metadata, enabling dependencies on other packages,
+ Zeek versions, etc.
+
+Quick Start
+===========
+
+Writing a basic plugin is quite straight-forward as long as one
+follows a few conventions. In the following, we create a simple example
+plugin that adds a new Built-In Function (BIF) to Zeek: we'll add
+``rot13(s: string) : string``, a function that rotates every letter
+in a string by 13 places.
+
+Generally, a plugin comes in the form of a directory following a
+certain structure. To get started, Zeek's distribution provides a
+helper script ``auxil/zeek-aux/plugin-support/init-plugin`` that creates
+a skeleton plugin that can then be customized. Let's use that::
+
+ # init-plugin ./rot13-plugin Demo Rot13
+
+As you can see, the script takes three arguments. The first is a
+directory inside which the plugin skeleton will be created. The second
+is the namespace the plugin will live in, and the third is a descriptive
+name for the plugin itself relative to the namespace. Zeek uses the
+combination of namespace and name to identify a plugin. The namespace
+serves to avoid naming conflicts between plugins written by independent
+developers; pick, e.g., the name of your organisation. The namespaces
+``Bro`` (legacy) and ``Zeek`` are reserved for functionality distributed
+by the Zeek Project. In
+our example, the plugin will be called ``Demo::Rot13``.
+
+The ``init-plugin`` script puts a number of files in place. The full
+layout is described later. For now, all we need is
+``src/rot13.bif``. It's initially empty, but we'll add our new BIF
+there as follows::
+
+ # cat src/rot13.bif
+ %%{
+ #include <ctype.h>
+ #include <string.h>
+ #include "zeek/util.h"
+ #include "zeek/ZeekString.h"
+ #include "zeek/Val.h"
+ %%}
+
+ module Demo;
+
+ function rot13%(s: string%) : string
+ %{
+ char* rot13 = util::copy_string(s->CheckString());
+
+ for ( char* p = rot13; *p; p++ )
+ {
+ char b = islower(*p) ? 'a' : 'A';
+ char d = *p - b + 13;
+
+ if ( d >= 13 && d <= 38 )
+ *p = d % 26 + b;
+ }
+
+ zeek::String* zs = new zeek::String(1, reinterpret_cast<u_char*>(rot13),
+ strlen(rot13));
+ return make_intrusive<zeek::StringVal>(zs);
+ %}
+
+The syntax of this file is just like any other ``*.bif`` file; we
+won't go into it here.
+
+Now we are ready to compile our plugin. The configure script will just
+need to be able to find the location of either a Zeek installation-tree or
+a Zeek source-tree.
+
+When building a plugin against a Zeek installation-tree, simply have the
+installation's associated ``zeek-config`` in your :envvar:`PATH` and the
+configure script will detect it and use it to obtain all the information
+it needs::
+
+ # which zeek-config
+ /usr/local/zeek/bin/zeek-config
+ # cd rot13-plugin
+ # ./configure && make
+ [... cmake output ...]
+
+When building a plugin against a Zeek source-tree (which itself needs
+to have first been built), the configure script has to explicitly be
+told its location::
+
+ # cd rot13-plugin
+ # ./configure --zeek-dist=/path/to/zeek/dist && make
+ [... cmake output ...]
+
+This builds the plugin in a subdirectory ``build/``. In fact, that
+subdirectory *becomes* the plugin: when ``make`` finishes, ``build/``
+has everything it needs for Zeek to recognize it as a dynamic plugin.
+
+Let's try that. Once we point Zeek to the ``build/`` directory, it will
+pull in our new plugin automatically, as we can check with the ``-N``
+option::
+
+ # export ZEEK_PLUGIN_PATH=/path/to/rot13-plugin/build
+ # zeek -N
+ [...]
+ Demo::Rot13 - (dynamic, version 0.1.0)
+ [...]
+
+That looks quite good, except for the dummy description that we should
+replace with something nicer so that users will know what our plugin
+is about. We do this by editing the ``config.description`` line in
+``src/Plugin.cc``, like this::
+
+ [...]
+ plugin::Configuration Plugin::Configure()
+ {
+ plugin::Configuration config;
+ config.name = "Demo::Rot13";
+ config.description = "Caesar cipher rotating a string's letters by 13 places.";
+ config.version.major = 0;
+ config.version.minor = 1;
+ config.version.patch = 0;
+ return config;
+ }
+ [...]
+
+Now rebuild and verify that the description is visible::
+
+ # make
+ [...]
+ # zeek -N | grep Rot13
+ Demo::Rot13 - Caesar cipher rotating a string's letters by 13 places. (dynamic, version 0.1.0)
+
+Zeek can also show us what exactly the plugin provides with the
+more verbose option ``-NN``::
+
+ # zeek -NN
+ [...]
+ Demo::Rot13 - Caesar cipher rotating a string's letters by 13 places. (dynamic, version 0.1.0)
+ [Function] Demo::rot13
+ [...]
+
+There's our function. Now let's use it::
+
+ # zeek -e 'print Demo::rot13("Hello")'
+ Uryyb
+
+It works. We next install the plugin along with Zeek itself, so that it
+will find it directly without needing the ``ZEEK_PLUGIN_PATH``
+environment variable. If we first unset the variable, the function
+will no longer be available::
+
+ # unset ZEEK_PLUGIN_PATH
+ # zeek -e 'print Demo::rot13("Hello")'
+ error in <command line>, line 1: unknown identifier Demo::rot13, at or near "Demo::rot13"
+
+Once we install it, it works again::
+
+ # make install
+ # zeek -e 'print Demo::rot13("Hello")'
+ Uryyb
+
+The installed version went into
+``<zeek-install-prefix>/lib/zeek/plugins/Demo_Rot13``.
+
+One can distribute the plugin independently of Zeek for others to use.
+To distribute in source form, just remove the ``build/`` directory
+(``make distclean`` does that) and then tar up the whole ``rot13-plugin/``
+directory. Others then follow the same process as above after
+unpacking.
+
+To distribute the plugin in binary form, the build process
+conveniently creates a corresponding tarball in ``build/dist/``. In
+this case, it's called ``Demo_Rot13-0.1.0.tar.gz``, with the version
+number coming out of the ``VERSION`` file that ``init-plugin`` put
+into place. The binary tarball has everything needed to run the
+plugin, but no further source files. Optionally, one can include
+further files by specifying them in the plugin's ``CMakeLists.txt``
+through the ``zeek_plugin_dist_files`` macro; the skeleton does that
+for ``README``, ``VERSION``, ``CHANGES``, and ``COPYING``. To use the
+plugin through the binary tarball, just unpack it into
+``<zeek-install-prefix>/lib/zeek/plugins/``. Alternatively, if you unpack
+it in another location, then you need to point ``ZEEK_PLUGIN_PATH`` there.
+
+Before distributing your plugin, you should edit some of the meta
+files that ``init-plugin`` puts in place. Edit ``README`` and
+``VERSION``, and update ``CHANGES`` when you make changes. Also put a
+license file in place as ``COPYING``; if BSD is fine, you will find a
+template in ``COPYING.edit-me``.
+
+Plugin Directory Layout
+=======================
+
+A plugin's directory needs to follow a set of conventions so that Zeek
+(1) recognizes it as a plugin, and (2) knows what to load. While
+``init-plugin`` takes care of most of this, the following is the full
+story. We'll use ``<base>`` to represent a plugin's top-level
+directory. With the skeleton, ``<base>`` corresponds to ``build/``.
+
+``<base>/__zeek_plugin__``
+ A file that marks a directory as containing a Zeek plugin. The file
+ must exist, and its content must consist of a single line with the
+ qualified name of the plugin (e.g., "Demo::Rot13").
+
+``<base>/lib/<plugin-name>.<os>-<arch>.so``
+ The shared library containing the plugin's compiled code. Zeek will
+ load this in dynamically at run-time if OS and architecture match
+ the current platform.
+
+``scripts/``
+ A directory with the plugin's custom Zeek scripts. When the plugin
+ gets activated, this directory will be automatically added to
+ ``ZEEKPATH``, so that any scripts/modules inside can be
+ "@load"ed.
+
+``scripts``/__load__.zeek
+ A Zeek script that will be loaded when the plugin gets activated.
+ When this script executes, any BIF elements that the plugin
+ defines will already be available. See below for more information
+ on activating plugins.
+
+``scripts``/__preload__.zeek
+ A Zeek script that will be loaded when the plugin gets activated,
+ but before any BIF elements become available. See below for more
+ information on activating plugins.
+
+``lib/bif/``
+ Directory with auto-generated Zeek scripts that declare the plugin's
+ BIF elements. The files here are produced by ``bifcl``.
+
+Any other files in ``<base>`` are ignored by Zeek.
+
+By convention, a plugin should put its custom scripts into sub folders
+of ``scripts/``, i.e., ``scripts///',
+ # format='html')
+ # signode += rawnode
+
+ else:
+ signode += addnodes.desc_name("", sig)
+
+ return sig
+
+
+class ZeekNamespace(ZeekGeneric):
+ def add_target_and_index(self, name, sig, signode):
+ targetname = self.get_obj_name() + "-" + name
+
+ if targetname not in self.state.document.ids:
+ signode["names"].append(targetname)
+ signode["ids"].append(targetname)
+ signode["first"] = not self.names
+ self.state.document.note_explicit_target(signode)
+
+ objects = self.env.domaindata["zeek"]["objects"]
+ key = (self.get_obj_name(), name)
+ objects[key] = self.env.docname
+ self.update_type_map(name)
+
+ indextext = self.get_index_text(name)
+ self.indexnode["entries"].append(
+ make_index_tuple("single", indextext, targetname, targetname)
+ )
+ self.indexnode["entries"].append(
+ make_index_tuple("single", f"namespaces; {sig}", targetname, targetname)
+ )
+
+ def get_index_text(self, name):
+ return _("%s (namespace); %s") % (name, self.env.docname)
+
+ def handle_signature(self, sig, signode):
+ signode += addnodes.desc_name("", sig)
+ return sig
+
+
+class ZeekEnum(ZeekGeneric):
+ def add_target_and_index(self, name, sig, signode):
+ targetname = self.get_obj_name() + "-" + name
+
+ if targetname not in self.state.document.ids:
+ self.process_signode(name, sig, signode, targetname)
+
+ objects = self.env.domaindata["zeek"]["objects"]
+ key = (self.get_obj_name(), name)
+ objects[key] = self.env.docname
+ self.update_type_map(name)
+
+ # indextext = self.get_index_text(name)
+ # self.indexnode['entries'].append(make_index_tuple('single', indextext,
+ # targetname, targetname))
+ m = sig.split()
+
+ if len(m) < 2:
+ logger.warning(
+ "%s: zeek:enum directive missing argument(s)", self.env.docname
+ )
+ return
+
+ if m[1] == "Notice::Type":
+ if "notices" not in self.env.domaindata["zeek"]:
+ self.env.domaindata["zeek"]["notices"] = []
+ self.env.domaindata["zeek"]["notices"].append(
+ (m[0], self.env.docname, targetname)
+ )
+
+ self.indexnode["entries"].append(
+ make_index_tuple(
+ "single", f"{m[1]} (enum values); {m[0]}", targetname, targetname
+ )
+ )
+
+ def handle_signature(self, sig, signode):
+ m = sig.split()
+ name = m[0]
+ signode += addnodes.desc_name("", name)
+ return name
+
+
+class ZeekParamField(docfields.GroupedField):
+ has_arg = True
+ is_typed = True
+
+
+class ZeekIdentifier(ZeekGeneric):
+ zeek_param_field = ZeekParamField("param", label="Parameters", can_collapse=True)
+ field_type_map = {"param": (zeek_param_field, False)}
+
+ def get_index_text(self, name):
+ return name
+
+ def get_field_type_map(self):
+ return self.field_type_map
+
+
+class ZeekNative(ZeekGeneric):
+ def handle_signature(self, sig, signode):
+ # The run() method is overridden to drop signode anyway in favor of
+ # simply adding the index and a target nodes and leaving up
+ # to the .rst document to explicitly add things that need to
+ # be presented in the final rendering (e.g. a section header)
+ self.native_name = sig
+ return sig
+
+ def process_signode(self, name, sig, signode, targetname):
+ pass
+
+ def run(self):
+ ns = super().run()
+ index_node = ns[0]
+
+ target_id = self.get_obj_name() + "-" + self.native_name
+ target_node = nodes.target("", "", ids=[target_id])
+ self.state.document.note_explicit_target(target_node)
+
+ # Replace the description node from Sphinx with a simple target node
+ return [index_node, target_node]
+
+
+class ZeekKeyword(ZeekNative):
+ def get_index_text(self, name):
+ if name and name[0] == "@":
+ return _("%s (directive)") % (name)
+ else:
+ return _("%s (keyword)") % (name)
+
+
+class ZeekAttribute(ZeekNative):
+ def get_index_text(self, name):
+ return _("%s (attribute)") % (name)
+
+
+class ZeekType(ZeekGeneric):
+ """
+ Put the type that's currently documented into env.ref_context
+ for usage with the ZeekField directive.
+ """
+
+ def before_content(self):
+ self.env.ref_context["zeek:type"] = self.arguments[0]
+
+ def after_content(self):
+ self.env.ref_context.pop("zeek:type", None)
+
+
+class ZeekField(ZeekGeneric):
+ def handle_signature(self, sig, signode):
+ """
+ The signature for .. zeek:field: currently looks like the following:
+
+ .. zeek:field:: ts :zeek:type:`time` :zeek:attr:`&log` :zeek:attr:`&optional`
+ """
+ parts = sig.split(" ", 2)
+ name, type_str = parts[0:2]
+ record_type = self.env.ref_context["zeek:type"]
+ fullname = "$".join([record_type, name])
+ attrs_str = ""
+ if len(parts) == 3:
+ attrs_str = parts[2]
+
+ type_nodes, _ = self.state.inline_text(type_str, -1)
+
+ signode += addnodes.desc_name(name, name)
+ signode += addnodes.desc_sig_punctuation("", ":")
+ signode += addnodes.desc_sig_space()
+ signode += type_nodes
+
+ if attrs_str:
+ attr_nodes, _ = self.state.inline_text(attrs_str, -1)
+ signode += addnodes.desc_sig_space()
+ signode += attr_nodes
+
+ signode["class"] = record_type
+ signode["fullname"] = fullname
+
+ return fullname
+
+ def run(self):
+ idx, signode = super().run()
+
+ record_type = self.env.ref_context["zeek:type"]
+
+ fields = self.env.domaindata["zeek"].setdefault("fields", {})
+ rfields = fields.setdefault(record_type, collections.OrderedDict())
+ rfields[signode[0]["fullname"]] = {
+ "idx": idx,
+ "signode": signode,
+ }
+
+ return []
+
+
+class ZeekNativeType(ZeekNative):
+ def get_obj_name(self):
+ # As opposed to using 'native-type', just imitate 'type'.
+ return "type"
+
+
+class ZeekFieldXRefRole(XRefRole):
+ def process_link(self, env, refnode, has_explicit_title, title, target):
+ title, target = super().process_link(
+ env, refnode, has_explicit_title, title, target
+ )
+
+ parts = title.split("$")
+ if len(parts) == 2 and parts[0] and parts[1]:
+ # If a field is in Type$field, form, strip Type.
+ title = parts[1]
+
+ return title, target
+
+
+class ZeekNotices(Index):
+ """
+ Index subclass to provide the Zeek notices index.
+ """
+
+ name = "noticeindex"
+ localname = _("Zeek Notice Index")
+ shortname = _("notices")
+
+ def generate(self, docnames=None):
+ content = {}
+
+ if "notices" not in self.domain.env.domaindata["zeek"]:
+ return content, False
+
+ for n in self.domain.env.domaindata["zeek"]["notices"]:
+ modname = n[0].split("::")[0]
+ entries = content.setdefault(modname, [])
+ entries.append([n[0], 0, n[1], n[2], "", "", ""])
+
+ content = sorted(content.items())
+
+ return content, False
+
+
+class ZeekDomain(Domain):
+ """Zeek domain."""
+
+ name = "zeek"
+ label = "Zeek"
+
+ object_types = {
+ "type": ObjType(_("type"), "type"),
+ "native-type": ObjType(_("type"), "type"),
+ "namespace": ObjType(_("namespace"), "namespace"),
+ "id": ObjType(_("id"), "id"),
+ "keyword": ObjType(_("keyword"), "keyword"),
+ "enum": ObjType(_("enum"), "enum"),
+ "attr": ObjType(_("attr"), "attr"),
+ "field": ObjType(_("field"), "field"),
+ }
+
+ directives = {
+ "type": ZeekType,
+ "native-type": ZeekNativeType,
+ "namespace": ZeekNamespace,
+ "id": ZeekIdentifier,
+ "keyword": ZeekKeyword,
+ "enum": ZeekEnum,
+ "attr": ZeekAttribute,
+ "field": ZeekField,
+ }
+
+ roles = {
+ "type": XRefRole(),
+ "namespace": XRefRole(),
+ "id": XRefRole(),
+ "keyword": XRefRole(),
+ "enum": XRefRole(),
+ "attr": XRefRole(),
+ "see": XRefRole(),
+ "field": ZeekFieldXRefRole(),
+ }
+
+ indices = [
+ ZeekNotices,
+ ]
+
+ initial_data = {
+ "objects": {}, # fullname -> docname, objtype
+ }
+
+ def clear_doc(self, docname):
+ to_delete = []
+
+ for (typ, name), doc in self.data["objects"].items():
+ if doc == docname:
+ to_delete.append((typ, name))
+
+ for typ, name in to_delete:
+ del self.data["objects"][typ, name]
+
+ def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
+ objects = self.data["objects"]
+
+ if typ == "see":
+ if target not in self.data["idtypes"]:
+ logger.warning(
+ '%s: unknown target for ":zeek:see:`%s`"', fromdocname, target
+ )
+ return []
+
+ objtype = self.data["idtypes"][target]
+ return make_refnode(
+ builder,
+ fromdocname,
+ objects[objtype, target],
+ objtype + "-" + target,
+ contnode,
+ target + " " + objtype,
+ )
+ elif typ == "field" and "$" not in target:
+ # :zeek:field:`x` without a record type ends up just x, no ref.
+ return []
+ else:
+ objtypes = self.objtypes_for_role(typ)
+
+ for objtype in objtypes:
+ if (objtype, target) in objects:
+ return make_refnode(
+ builder,
+ fromdocname,
+ objects[objtype, target],
+ objtype + "-" + target,
+ contnode,
+ target + " " + objtype,
+ )
+ else:
+ logger.warning(
+ '%s: unknown target for ":zeek:%s:`%s`"',
+ fromdocname,
+ typ,
+ target,
+ )
+
+ def get_objects(self):
+ for (typ, name), docname in self.data["objects"].items():
+ yield name, name, typ, docname, typ + "-" + name, 1
+
+ def merge_domaindata(self, docnames, otherdata):
+ """
+ Merge domaindata in multiprocess mode.
+
+ I'm quite unclear how the objects dict works out okay in single
+ process mode. For example, the file_entropy() event is defined
+ in scripts/base/bif/plugins/Zeek_FileEntropy.events.bif.zeek.rst
+ *and* in script-reference/autogenerated-file-analyzer-index.rst.
+ The current documentation refers to the first one for :zeek:see:.
+ It seems in single process mode the reading sorts filenames and
+ just uses the last highest sorting one. That ends-up being the one
+ in scripts/base.
+
+ In [4]: "script-reference/autogenerated" < "scripts/base"
+ Out[4]: True
+
+ """
+ for target, data in otherdata.items():
+ if target == "version":
+ continue
+ elif hasattr(data, "items"):
+ target_data = self.env.domaindata["zeek"].setdefault(target, {})
+
+ # Iterate manually over the elements for debugging
+ for k, v in data.items():
+ if k not in target_data:
+ target_data[k] = v
+ else:
+ # The > comparison below updates the objects domaindata
+ # to filenames that sort higher. See comment above.
+ if isinstance(v, str):
+ if v > target_data[k]:
+ target_data[k] = v
+ else:
+ # Otherwise assume it's a dict and we can merge
+ # using update()
+ target_data[k].update(v)
+
+ elif hasattr(data, "extend"):
+ # notices are a list
+ target_data = self.env.domaindata["zeek"].setdefault(target, [])
+ target_data.extend(data)
+ else:
+ raise NotImplementedError(target, type(data))
diff --git a/doc/ext/zeek_pygments.py b/doc/ext/zeek_pygments.py
new file mode 100644
index 0000000000..aaa9f449fe
--- /dev/null
+++ b/doc/ext/zeek_pygments.py
@@ -0,0 +1,247 @@
+from pygments.lexer import RegexLexer, bygroups, include, words
+from pygments.token import (
+ Comment,
+ Keyword,
+ Literal,
+ Name,
+ Number,
+ Operator,
+ Punctuation,
+ String,
+ Text,
+)
+
+
+def setup(Sphinx):
+ return {
+ "parallel_read_safe": True,
+ }
+
+
+class ZeekLexer(RegexLexer):
+ """
+ For `Zeek <https://zeek.org/>`_ scripts.
+
+ .. versionadded:: 2.5
+ """
+
+ name = "Zeek"
+ aliases = ["zeek"]
+ filenames = ["*.zeek"]
+
+ _hex = r"[0-9a-fA-F]"
+ _float = r"((\d*\.?\d+)|(\d+\.?\d*))([eE][-+]?\d+)?"
+ _h = r"[A-Za-z0-9][-A-Za-z0-9]*"
+
+ tokens = {
+ "root": [
+ include("whitespace"),
+ include("comments"),
+ include("directives"),
+ include("attributes"),
+ include("types"),
+ include("keywords"),
+ include("literals"),
+ include("operators"),
+ include("punctuation"),
+ (
+ r"\b((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)(?=\s*\()",
+ Name.Function,
+ ),
+ include("identifiers"),
+ ],
+ "whitespace": [
+ (r"\n", Text),
+ (r"\s+", Text),
+ (r"\\\n", Text),
+ ],
+ "comments": [
+ (r"#.*$", Comment),
+ ],
+ "directives": [
+ (r"(@(load-plugin|load-sigs|load|unload))\b.*$", Comment.Preproc),
+ (
+ r"(@(DEBUG|DIR|FILENAME|deprecated|if|ifdef|ifndef|else|endif))\b",
+ Comment.Preproc,
+ ),
+ (r"(@prefixes)\s*(\+?=).*$", Comment.Preproc),
+ ],
+ "attributes": [
+ (
+ words(
+ (
+ "redef",
+ "priority",
+ "log",
+ "optional",
+ "default",
+ "add_func",
+ "delete_func",
+ "expire_func",
+ "read_expire",
+ "write_expire",
+ "create_expire",
+ "synchronized",
+ "persistent",
+ "rotate_interval",
+ "rotate_size",
+ "encrypt",
+ "raw_output",
+ "mergeable",
+ "error_handler",
+ "broker_allow_complex_type",
+ "is_assigned",
+ "is_used",
+ "type_column",
+ "deprecated",
+ "on_change",
+ "backend",
+ "broker_store",
+ ),
+ prefix=r"&",
+ suffix=r"\b",
+ ),
+ Keyword.Pseudo,
+ ),
+ ],
+ "types": [
+ (
+ words(
+ (
+ "any",
+ "enum",
+ "record",
+ "set",
+ "table",
+ "vector",
+ "function",
+ "hook",
+ "event",
+ "addr",
+ "bool",
+ "count",
+ "double",
+ "file",
+ "int",
+ "interval",
+ "pattern",
+ "port",
+ "string",
+ "subnet",
+ "time",
+ ),
+ prefix=r"\b",
+ suffix=r"\b",
+ ),
+ Keyword.Type,
+ ),
+ (
+ r"\b(opaque)(\s+)(of)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)\b",
+ bygroups(Keyword.Type, Text, Operator.Word, Text, Keyword.Type),
+ ),
+ (
+ r"\b(type)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)(\s*)(:)(\s*)\b(record|enum)\b",
+ bygroups(Keyword, Text, Name.Class, Text, Operator, Text, Keyword.Type),
+ ),
+ (
+ r"\b(type)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)(\s*)(:)",
+ bygroups(Keyword, Text, Name, Text, Operator),
+ ),
+ (
+ r"\b(redef)(\s+)(record|enum)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)\b",
+ bygroups(Keyword, Text, Keyword.Type, Text, Name.Class),
+ ),
+ ],
+ "keywords": [
+ (
+ words(
+ (
+ "redef",
+ "export",
+ "if",
+ "else",
+ "for",
+ "while",
+ "return",
+ "break",
+ "next",
+ "continue",
+ "fallthrough",
+ "switch",
+ "default",
+ "case",
+ "add",
+ "delete",
+ "copy",
+ "when",
+ "timeout",
+ "schedule",
+ ),
+ prefix=r"\b",
+ suffix=r"\b",
+ ),
+ Keyword,
+ ),
+ (r"\b(print)\b", Keyword),
+ (r"\b(global|local|const|option)\b", Keyword.Declaration),
+ (
+ r"\b(module)(\s+)(([A-Za-z_][A-Za-z_0-9]*)(?:::([A-Za-z_][A-Za-z_0-9]*))*)\b",
+ bygroups(Keyword.Namespace, Text, Name.Namespace),
+ ),
+ ],
+ "literals": [
+ (r'"', String, "string"),
+ # Not the greatest match for patterns, but generally helps
+ # disambiguate between start of a pattern and just a division
+ # operator.
+ (r"/(?=.*/)", String.Regex, "regex"),
+ (r"\b(T|F)\b", Keyword.Constant),
+ # Port
+ (r"\b\d{1,5}/(udp|tcp|icmp|unknown)\b", Number),
+ # IPv4 Address
+ (
+ r"\b(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\b",
+ Number,
+ ),
+ # IPv6 Address (not 100% correct: that takes more effort)
+ (
+ r"\[([0-9a-fA-F]{0,4}:){2,7}([0-9a-fA-F]{0,4})?((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2}))?\]",
+ Number,
+ ),
+ # Numeric
+ (r"\b0[xX]" + _hex + r"+\b", Number.Hex),
+ (r"\b" + _float + r"\s*(day|hr|min|sec|msec|usec)s?\b", Literal.Date),
+ (r"\b" + _float + r"\b", Number.Float),
+ (r"\b(\d+)\b", Number.Integer),
+ # Hostnames
+ (_h + r"(\." + _h + r")+", String),
+ ],
+ "operators": [
+ (r"[!%*/+<=>~|&^-]", Operator),
+ (r"([-+=&|]{2}|[+=!><-]=)", Operator),
+ (r"\b(in|as|is|of)\b", Operator.Word),
+ (r"\??\$", Operator),
+ # Technically, colons are often used for punctuation/separation.
+ # E.g. field name/type separation.
+ (r"[?:]", Operator),
+ ],
+ "punctuation": [
+ (r"\?\$", Punctuation),
+ (r"[{}()\[\],;:.]", Punctuation),
+ ],
+ "identifiers": [
+ (r"([a-zA-Z_]\w*)(::)", bygroups(Name, Punctuation)),
+ (r"[a-zA-Z_]\w*", Name),
+ ],
+ "string": [
+ (r"\\.", String.Escape),
+ (r"%-?[0-9]*(\.[0-9]+)?[DTdxsefg]", String.Escape),
+ (r'"', String, "#pop"),
+ (r".", String),
+ ],
+ "regex": [
+ (r"\\.", String.Escape),
+ (r"/", String.Regex, "#pop"),
+ (r".", String.Regex),
+ ],
+ }
diff --git a/doc/frameworks/broker.rst b/doc/frameworks/broker.rst
new file mode 100644
index 0000000000..a70b042e5a
--- /dev/null
+++ b/doc/frameworks/broker.rst
@@ -0,0 +1,644 @@
+.. _CAF: https://github.com/actor-framework/actor-framework
+
+.. _broker-framework:
+
+==============================
+Broker Communication Framework
+==============================
+
+.. rst-class:: opening
+
+ Zeek uses the `Broker Library
+   <https://github.com/zeek/broker>`_ to exchange information with
+ other Zeek processes. Broker itself uses CAF_ (C++ Actor Framework)
+ internally for connecting nodes and exchanging arbitrary data over
+ networks. Broker then introduces, on top of CAF, a topic-based
+ publish/subscribe communication pattern using a data model that is
+ compatible to Zeek's. Broker itself can be utilized outside the
+ context of Zeek, with Zeek itself making use of only a few predefined
+ Broker message formats that represent Zeek events, log entries, etc.
+
+   In summary, Zeek's Broker framework provides basic facilities for
+ connecting broker-enabled peers (e.g. Zeek instances) to each other
+ and exchanging messages (e.g. events and logs).
+
+Cluster Layout / API
+====================
+
+Layout / Topology
+-----------------
+
+In a Zeek cluster setup, every Zeek process is assigned a cluster role.
+Such a process is then called a Zeek node, a cluster node, or just named
+after the role of the process (the manager, the loggers, ...). A basic Zeek
+cluster uses four different node types, enumerated in the script-level
+variable :zeek:see:`Cluster::NodeType`.
+
+- Manager
+- Logger
+- Worker
+- Proxy
+
+In small Zeek deployments, all nodes may run on a single host. In large
+Zeek deployments, nodes may be distributed across multiple physical
+systems for scaling.
+
+Currently, a Zeek cluster contains a single Manager node. Further, connectivity
+between nodes is determined statically based on their type:
+
+- Every node connects to all loggers and the manager.
+
+- Each worker connects to all proxies.
+
+
+.. figure:: broker/cluster-layout.png
+
+Some general suggestions as to the purpose/utilization of each node type:
+
+- Workers: are a good first choice for doing the brunt of any work you need
+ done. They should be spending a lot of time performing the actual job
+ of parsing/analyzing incoming data from packets, so you might choose
+ to look at them as doing a "first pass" analysis and then deciding how
+ the results should be shared with other nodes in the cluster.
+
+- Proxies: serve as intermediaries for data storage and work/calculation
+ offloading. Good for helping offload work or data in a scalable and
+ distributed way. Since any given worker is connected to all
+ proxies and can agree on an "arbitrary key -> proxy node" mapping
+ (more on that later), you can partition work or data amongst them in a
+ uniform manner. e.g. you might choose to use proxies as a method of
+ sharing non-persistent state or as a "second pass" analysis for any
+ work that you don't want interfering with the workers' capacity to
+ keep up with capturing and parsing packets. Note that the default scripts
+ that come with Zeek make minimal use of proxies, so if you are coming
+ from a previous ZeekControl deployment, you may want to try reducing down
+ to a single proxy node. If you come to have custom/community scripts
+ that utilize proxies, that would be the time to start considering scaling
+ up the number of proxies to meet demands.
+
+- Manager: this node will be good at performing decisions that require a
+ global view of things since it is in a centralized location, connected
+ to everything. However, that also makes it easy to overload, so try
+ to use it sparingly and only for tasks that must be done in a
+ centralized or authoritative location. Optionally, for some
+ deployments, the Manager can also serve as the sole Logger.
+
+- Loggers: these nodes should simply be spending their time writing out
+ logs to disk and not used for much else. In the default cluster
+ configuration, logs get distributed among available loggers in a
+ round-robin fashion, providing failover capability should any given
+ logger temporarily go offline.
+
+Data Management/Sharing Strategies
+==================================
+
+There's maybe no single, best approach or pattern to use when you need a
+Zeek script to store or share long-term state and data. The two
+approaches that were previously used were either using the ``&synchronized``
+attribute on tables/sets or by explicitly sending events to specific
+nodes on which you wanted data to be stored. The former is no longer
+possible, though there are several new possibilities that the new
+Broker/Cluster framework offer, namely distributed data store and data
+partitioning APIs.
+
+Data Stores
+-----------
+
+Broker provides a distributed key-value store interface with optional
+choice of using a persistent backend. For more detail, see
+:ref:`this example <data_store_example>`.
+
+Some ideas/considerations/scenarios when deciding whether to use
+a data store for your use-case:
+
+* If you need the full data set locally in order to achieve low-latency
+ queries using data store "clones" can provide that.
+
+* If you need data that persists across restarts of Zeek processes, then
+ data stores can also provide that.
+
+* If the data you want to store is complex (tables, sets, records) or
+ you expect to read, modify, and store back, then data stores may not
+ be able to provide simple, race-free methods of performing the pattern
+ of logic that you want.
+
+* If the data set you want to store is excessively large, that's still
+ problematic even for stores that use a persistent backend as they are
+ implemented in a way that requires a full snapshot of the store's
+ contents to fit in memory (this limitation may change in the future).
+
+Data Partitioning
+-----------------
+
+New data partitioning strategies are available using the API in
+:doc:`/scripts/base/frameworks/cluster/pools.zeek`. Using that API, developers
+of custom Zeek scripts can define a custom pool of nodes that best fits the
+needs of their script.
+
+One example strategy is to use Highest Random Weight (HRW) hashing to
+partition data tables amongst the pool of all proxy nodes. e.g. using
+:zeek:see:`Cluster::publish_hrw`. This could allow clusters to
+be scaled more easily than the approach of "the entire data set gets
+synchronized to all nodes" as the solution to memory limitations becomes
+"just add another proxy node". It may also take away some of the
+messaging load that used to be required to synchronize data sets across
+all nodes.
+
+The tradeoff of this approach is that nodes that leave the pool (due to
+crashing, etc.) cause a temporary gap in the total data set until
+workers start hashing keys to a new proxy node that is still alive,
+causing data to now be located and updated there.
+
+If the developer of a script expects its workload to be particularly
+intensive, wants to ensure that their operations get exclusive
+access to nodes, or otherwise set constraints on the number of nodes within
+a pool utilized by their script, then the :zeek:see:`Cluster::PoolSpec`
+structure will allow them to do that while still allowing users of that script
+to override the default suggestions made by the original developer.
+
+Broker Framework Examples
+=========================
+
+The broker framework provides basic facilities for connecting Zeek instances
+to each other and exchanging messages, like events or logs.
+
+See :doc:`/scripts/base/frameworks/broker/main.zeek` for an overview
+of the main Broker API.
+
+.. _broker_topic_naming:
+
+Topic Naming Conventions
+------------------------
+
+All Broker-based messaging involves two components: the information you
+want to send (e.g. an event w/ its arguments) along with an associated
+topic name string. The topic strings are used as a filtering mechanism:
+Broker uses a publish/subscribe communication pattern where peers
+advertise interest in topic **prefixes** and only receive messages which
+match one of their prefix subscriptions.
+
+Broker itself supports arbitrary topic strings, however Zeek generally
+follows certain conventions in choosing these topics to help avoid
+conflicts and generally make them easier to remember.
+
+As a reminder of how topic subscriptions work, subscribers advertise
+interest in a topic **prefix** and then receive any messages published by a
+peer to a topic name that starts with that prefix. E.g. Alice
+subscribes to the "alice/dogs" prefix, then would receive the following
+message topics published by Bob:
+
+- topic "alice/dogs/corgi"
+- topic "alice/dogs"
+- topic "alice/dogsarecool/oratleastilikethem"
+
+Alice would **not** receive the following message topics published by Bob:
+
+- topic "alice/cats/siamese"
+- topic "alice/cats"
+- topic "alice/dog"
+- topic "alice"
+
+Note that the topics aren't required to form a slash-delimited hierarchy,
+the subscription matching is purely a byte-per-byte prefix comparison.
+
+However, Zeek scripts generally will follow a topic naming hierarchy and
+any given script will make the topic names it uses apparent via some
+redef'able constant in its export section. Generally topics that Zeek
+scripts use will be along the lines of :samp:`zeek/{module_name}/{specifics}`
+with :samp:`{module_name}` being the script's module name (in all-undercase).
+For example, you might expect an imaginary ``Pretend`` framework to
+publish/subscribe using topic names like ``zeek/pretend/my_cool_event``.
+For scripts that use Broker as a means of cluster-aware analysis,
+it's usually sufficient for them to make use of the topics declared
+by the cluster framework. For scripts that are meant to establish
+communication flows unrelated to Zeek cluster, new topics are declared
+(examples being the NetControl and Control frameworks).
+
+For cluster operation, see :doc:`/scripts/base/frameworks/cluster/main.zeek`
+for a list of topics that are useful for steering published events to
+the various node classes. E.g. you have the ability to broadcast
+to all nodes of a given class (e.g. just workers) or just send to a
+specific node within a class.
+
+The topic names that logs get published under are a bit nuanced. In the
+default cluster configuration, they are round-robin published to
+explicit topic names that identify a single logger. In standalone Zeek
+processes, logs get published to the topic indicated by
+:zeek:see:`Broker::default_log_topic_prefix`.
+
+For those writing their own scripts which need new topic names, a
+suggestion would be to avoid prefixing any new topics/prefixes with
+``zeek/`` as any changes in scripts shipping with Zeek will use that prefix
+and it's better to not risk unintended conflicts. Again, it's
+often less confusing to just re-use existing topic names instead
+of introducing new topic names. The typical use case is writing
+a cluster-enabled script, which usually just needs to route events
+based upon node classes, and that already has usable topics in the
+cluster framework.
+
+Connecting to Peers
+-------------------
+
+Zeek can accept incoming connections by calling :zeek:see:`Broker::listen`.
+
+.. literalinclude:: broker/connecting-listener.zeek
+ :caption: connecting-listener.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+Zeek can initiate outgoing connections by calling :zeek:see:`Broker::peer`.
+
+.. literalinclude:: broker/connecting-connector.zeek
+ :caption: connecting-connector.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+In either case, connection status updates are monitored via the
+:zeek:see:`Broker::peer_added` and :zeek:see:`Broker::peer_lost` events.
+
+Remote Events
+-------------
+
+To receive remote events, you need to first subscribe to a "topic" to which
+the events are being sent. A topic is just a string chosen by the sender,
+and named in a way that helps organize events into various categories.
+See the :ref:`topic naming conventions section <broker_topic_naming>` for
+more on how topics work and are chosen.
+
+Use the :zeek:see:`Broker::subscribe` function to subscribe to topics and
+define any event handlers for events that peers will send.
+
+.. literalinclude:: broker/events-listener.zeek
+ :caption: events-listener.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+To send an event, call the :zeek:see:`Broker::publish` function which you can
+supply directly with the event and its arguments or give it the return value of
+:zeek:see:`Broker::make_event` in case you need to send the same event/args
+multiple times. When publishing events like this, local event handlers for
+the event are not called, even if a matching subscription exists.
+
+.. literalinclude:: broker/events-connector.zeek
+ :caption: events-connector.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+Note that the subscription model is prefix-based, meaning that if you subscribe
+to the ``zeek/events`` topic prefix you would receive events that are published
+to topic names ``zeek/events/foo`` and ``zeek/events/bar`` but not
+``zeek/misc``.
+
+.. note::
+
+ In prior Zeek versions, ``Broker::auto_publish`` was available to automatically
+ send events to peers whenever the events were called locally via the normal
+ event invocation syntax. When auto-publishing events, local event handlers for
+ the event were called in addition to sending the event to any subscribed peers.
+
+ ``Broker::auto_publish`` was removed due to its
+   implicit nature.
+
+
+Remote Logging
+--------------
+
+.. literalinclude:: broker/testlog.zeek
+ :caption: testlog.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+To toggle remote logs, redef :zeek:see:`Log::enable_remote_logging`.
+Use the :zeek:see:`Broker::subscribe` function to advertise interest
+in logs written by peers. The topic names that Zeek uses are determined by
+:zeek:see:`Broker::log_topic`.
+
+.. literalinclude:: broker/logs-listener.zeek
+ :caption: logs-listener.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+.. literalinclude:: broker/logs-connector.zeek
+ :caption: logs-connector.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+Note that logging events are only raised locally on the node that performs
+the :zeek:see:`Log::write` and not automatically published to peers.
+
+.. _data_store_example:
+
+Distributed Data Stores
+-----------------------
+
+See :doc:`/scripts/base/frameworks/broker/store.zeek` for an overview
+of the Broker data store API.
+
+There are two flavors of key-value data store interfaces: master and clone.
+
+A master data store can be cloned from remote peers which may then
+perform lightweight, local queries against the clone, which
+automatically stays synchronized with the master store. Clones cannot
+modify their content directly, instead they send modifications to the
+centralized master store which applies them and then broadcasts them to
+all clones.
+
+Master stores get to choose what type of storage backend to
+use. E.g. In-memory versus SQLite for persistence.
+
+Data stores also support expiration on a per-key basis using an amount of
+time relative to the entry's last modification time.
+
+.. literalinclude:: broker/stores-listener.zeek
+ :caption: stores-listener.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+.. literalinclude:: broker/stores-connector.zeek
+ :caption: stores-connector.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+Note that all data store queries must be made within Zeek's asynchronous
+``when`` statements and must specify a timeout block.
+
+
+SQLite Data Store Tuning
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+When leveraging the SQLite backend for persistence, SQLite's default journaling
+and consistency settings are used. Concretely, ``journal_mode`` is set to
+``DELETE`` and ``synchronous`` to ``FULL``. This in turn is not optimal for
+`high INSERT or UPDATE rates <https://www.sqlite.org/faq.html#q19>`_
+due to SQLite waiting for the required IO to complete until data is safely
+on disk. This can also have a non-negligible system effect when the
+SQLite database is located on the same device as other IO critical processes.
+
+Starting with Zeek 5.2, it is possible to tune and relax these settings by
+providing an appropriate :zeek:see:`Broker::BackendOptions` and
+:zeek:see:`Broker::SQLiteOptions` instance to
+:zeek:see:`Broker::create_master`. The following example changes the
+data store to use `Write-Ahead Logging <https://www.sqlite.org/wal.html>`_
+which should perform significantly faster than the default.
+
+
+.. literalinclude:: broker/store-sqlite-tuning.zeek
+ :caption: store-sqlite-tuning.zeek
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+If your use-case turns out to require more and lower-level tuning around
+SQLite options, please get in contact or open a feature request on GitHub.
+
+
+Cluster Framework Examples
+==========================
+
+This section contains a few brief examples of various communication
+patterns one might use when developing Zeek scripts that are to operate in
+the context of a cluster.
+
+.. _event-namespacing-pitfall:
+
+A Reminder About Events and Module Namespaces
+---------------------------------------------
+
+For simplicity, the following examples do not use any modules/namespaces.
+If you choose to use them within your own code, it's important to
+remember that the ``event`` and ``schedule`` dispatching statements
+should always use the fully-qualified event name.
+
+For example, this will likely not work as expected:
+
+.. code-block:: zeek
+
+ module MyModule;
+
+ export {
+ global my_event: event();
+ }
+
+ event my_event()
+ {
+ print "got my event";
+ }
+
+ event zeek_init()
+ {
+ event my_event();
+ schedule 10sec { my_event() };
+ }
+
+This code runs without errors, however, the local ``my_event`` handler
+will never be called, nor will any remote handlers. Instead, at
+minimum you would need to change the ``zeek_init()`` handler:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ event MyModule::my_event();
+ schedule 10sec { MyModule::my_event() };
+ }
+
+Though, an easy rule of thumb to remember would be to always use the
+explicit module namespace scoping and you can't go wrong:
+
+.. code-block:: zeek
+
+ module MyModule;
+
+ export {
+ global MyModule::my_event: event();
+ }
+
+ event MyModule::my_event()
+ {
+ print "got my event";
+ }
+
+ event zeek_init()
+ {
+ event MyModule::my_event();
+ schedule 10sec { MyModule::my_event() };
+ }
+
+Event types that reside in the default namespace (such as :zeek:id:`zeek_init` or
+:zeek:id:`connection_established`) require no qualification, even when scheduled from
+inside a module. Don't force qualification of such events by prefixing with
+``GLOBAL::``.
+
+Note that other identifiers in Zeek do not have this inconsistency
+related to module namespacing, it's just events that require
+explicitness.
+
+Manager Sending Events To Workers
+---------------------------------
+
+This is fairly straightforward, we just need a topic name which we know
+all workers are subscribed to, combined with the event we want to send them.
+
+.. code-block:: zeek
+
+ event manager_to_workers(s: string)
+ {
+ print "got event from manager", s;
+ }
+
+ event some_event_handled_on_manager()
+ {
+ Broker::publish(Cluster::worker_topic, manager_to_workers,
+ "hello v0");
+
+ # If you know this event is only handled on the manager, you don't
+ # need any of the following conditions, they're just here as an
+ # example of how you can further discriminate based on node identity.
+
+ # Can check based on the name of the node.
+ if ( Cluster::node == "manager" )
+ Broker::publish(Cluster::worker_topic, manager_to_workers,
+ "hello v1");
+
+ # Can check based on the type of the node.
+ if ( Cluster::local_node_type() == Cluster::MANAGER )
+ Broker::publish(Cluster::worker_topic, manager_to_workers,
+ "hello v2");
+
+ # The run-time overhead of the above conditions can even be
+ # eliminated by using the following conditional directives.
+ # It's evaluated once per node at parse-time and, if false,
+ # any code within is just ignored / treated as not existing at all.
+ @if ( Cluster::local_node_type() == Cluster::MANAGER )
+ Broker::publish(Cluster::worker_topic, manager_to_workers,
+ "hello v3");
+ @endif
+ }
+
+Worker Sending Events To Manager
+--------------------------------
+
+This should look almost identical to the previous case of sending an event
+from the manager to workers, except it simply changes the topic name to
+one which the manager is subscribed.
+
+.. code-block:: zeek
+
+ event worker_to_manager(worker_name: string)
+ {
+ print "got event from worker", worker_name;
+ }
+
+ event some_event_handled_on_worker()
+ {
+ Broker::publish(Cluster::manager_topic, worker_to_manager,
+ Cluster::node);
+ }
+
+Worker Sending Events To All Workers
+------------------------------------
+
+Since workers are not directly connected to each other in the cluster
+topology, this type of communication is a bit different than what we
+did before since we have to manually relay the event via some node that *is*
+connected to all workers. The manager or a proxy satisfies that requirement:
+
+.. code-block:: zeek
+
+ event worker_to_workers(worker_name: string)
+ {
+ @if ( Cluster::local_node_type() == Cluster::MANAGER ||
+ Cluster::local_node_type() == Cluster::PROXY )
+ Broker::publish(Cluster::worker_topic, worker_to_workers,
+ worker_name);
+ @else
+ print "got event from worker", worker_name;
+ @endif
+ }
+
+ event some_event_handled_on_worker()
+ {
+ # We know the manager is connected to all workers, so we could
+ # choose to relay the event across it.
+ Broker::publish(Cluster::manager_topic, worker_to_workers,
+ Cluster::node + " (via manager)");
+
+ # We also know that any given proxy is connected to all workers,
+ # though now we have a choice of which proxy to use. If we
+ # want to distribute the work associated with relaying uniformly,
+ # we can use a round-robin strategy. The key used here is simply
+ # used by the cluster framework internally to keep track of
+ # which node is up next in the round-robin.
+ local pt = Cluster::rr_topic(Cluster::proxy_pool, "example_key");
+ Broker::publish(pt, worker_to_workers,
+ Cluster::node + " (via a proxy)");
+ }
+
+Worker Distributing Events Uniformly Across Proxies
+---------------------------------------------------
+
+If you want to offload some data/work from a worker to your proxies,
+we can make use of a `Highest Random Weight (HRW) hashing
+`_ distribution strategy
+to uniformly map an arbitrary key space across all available proxies.
+
+.. code-block:: zeek
+
+ event worker_to_proxies(worker_name: string)
+ {
+ print "got event from worker", worker_name;
+ }
+
+ global my_counter = 0;
+
+ event some_event_handled_on_worker()
+ {
+ # The key here is used to choose which proxy shall receive
+ # the event. Different keys may map to different nodes, but
+ # any given key always maps to the same node provided the
+ # pool of nodes remains consistent. If a proxy goes offline,
+ # that key maps to a different node until the original comes
+ # back up.
+ Cluster::publish_hrw(Cluster::proxy_pool,
+ cat("example_key", ++my_counter),
+ worker_to_proxies, Cluster::node);
+ }
+
+Broker-backed Zeek Tables for Data Synchronization and Persistence
+==================================================================
+
+Starting with Zeek 3.2, it is possible to "bind" a Zeek table to a backing
+Broker store. Changes to the Zeek table are sent to the Broker store. Similarly,
+changes of the Broker store are applied to the Zeek table.
+
+This feature allows easy distribution of table contents across a cluster.
+It also offers persistence for tables (when using a persistent Broker store
+backend like SQLite).
+
+To give a short example, to distribute a table over a cluster you can use
+the :zeek:attr:`&backend` attribute.
+
+.. code-block:: zeek
+
+ global t: table[string] of count &backend=Broker::MEMORY;
+
+The :zeek:attr:`&backend` attribute creates a master data store on the
+manager and a clone data store on all other nodes in the cluster. This
+in essence means that the table exists twice in each Zeek process. One
+copy of the table is contained in a Broker data store (either a master
+or a clone depending on the node), which distributes the
+data across the cluster---and, depending on the backend, might also
+make the data persistent. Since Broker data stores are only accessible
+via asynchronous operations, and accessing them might not always be
+immediate, a second copy of the table, which is immediately
+accessible, is held inside the Zeek core. This is the copy that you
+see and interact with on the Zeek side.
diff --git a/doc/frameworks/broker/cluster-layout.png b/doc/frameworks/broker/cluster-layout.png
new file mode 100644
index 0000000000..3813bfbfda
Binary files /dev/null and b/doc/frameworks/broker/cluster-layout.png differ
diff --git a/doc/frameworks/broker/cluster-layout.xml b/doc/frameworks/broker/cluster-layout.xml
new file mode 100644
index 0000000000..4269c6723f
--- /dev/null
+++ b/doc/frameworks/broker/cluster-layout.xml
@@ -0,0 +1,2 @@
+
+7VxLc6M4EP41Po4LSUjAcZJJZg+7VVOVrd3ZowwKZgYjFyaxvb9+hZEwEhA/eITs2JdYrZYE0ve1ultyZuh+tfua0vXyDx6weAatYDdDX2ZQfDxL/Mkl+0Li2G4hCNMoKETgKHiK/mVSKNuFL1HANppixnmcRWtd6PMkYX6myWia8q2u9sxjfdQ1DVlN8OTTuC79OwqypZQCyzpW/MaicCmHdrGsWFD/Z5jyl0SON4Po+fApqldU9SX1N0sa8G1FhB5m6D7lPCu+rXb3LM7nVk1b0e6xpbZ87pQl2VkNGEXAcTwrgD71XPQJoKKLVxq/MPUOhyfN9mp2WCAmSxYTnog/d4dXZnmnQJSW2SqWX2O6YPFdOSv3PObpsdkmo2n2OV8wQ/YYxXkPlipLiGBRZkmgWvgx3Wwi/89llBQVshkoSpVGP1iW7WWZvmRciHiaLXnIExr/zvlattpkKf/J1FOK1fOIgz6TskahIdd95kn2SFdRnIP8L5YGNKFSLEcCUJYrHVqHj5An/KEyj4dH3kXZ9/yt51iW/pFzcHxWqVpfZ7n0G/6S+qxtcSWVaBqyrEXHK3TyNa50LNHzlfEVy9K9UEhZTLPoVScMlbwLSz3ZVKwY3VcU1jxKsk2l52+5QCgoE6KYJg0IcqCOY1MfQTFp1Ra2bVdbiC/FM6hS5WWOogM7zmWKfWPKL80UNAmmAHIZUwB0RmdKjSgrAYCQpTNIYjG7d4v8W5iVq1VlUByLTT9H+3YZZexpTQ9rthV+h06fI68OVFD7al7l81Xky4pTLGsANW0BtWBQRZOBADOnO9hpHIVJzliBVzFDbwD4laUZ270JPVWL9BVHyrpuj86NctmWFbdGQasJrBW4XIYG2GA2Cxhs1jRRQFinfLf/BJsQUkjEuFX9qQEncLyFZZ0DnOdnRnx/msCBUAcOwHXgOLgOHDwEcJo80zpwYh4Ky5LbnI+BE7Ig+CwDI4IIOFWcePoeZJN3hIlXg4mERIN7dlv7HjYXD7/b4t+CVQ1QXxzvrm3T6Qac0uGuuNvFg4sV+14tKEe87rT35JrDM1xzMIhrXvOlbYLmnmMjDCxgI9dBjsYED84RJB7GmCAHQ7334h1lh29Fth6YE9GLhRwxAlbOerkj2/Y8790Rr01sYBFjmGKaasO0Rxka9S5z/JsC4jbPDtw8u3e12kbUOKZjh29Ge1yjrTImNaNtDWW07enYaAKN/Agi1xlirLwN1RExOhrT1JIbh0biUMmVObIcjS9zizRSpjk52UimQ2ffWBqJpc8t+2Eqe2PYMKn8GjGQbTAM4OsYZqO+GdaSArWMiAVoh2SdE6DOjZwjRyU1plVoS1yDtfC8je56bl4VsgxzmlAnK3J1jlnOdWRFjt4RgEZHPZEV4mHJqixphZx63FBhYnu0ezU57lxs42ZyNMXcBdL7ctO8Oi7tcWBonu/a5lFDCwwvNvVQD5e9nsFDLrPsJp5OOeij4GxANE30dgFCRvRrGbdkTuirMHgo/d5tnNsNpsfNFI9q+Grb+phQhSNZQqLvo90tYYewEoFuQGmwZxXM1C3avJNjORGbNo17IMjM6J2yaeRCG9VR37BpdR4YUSS2+7WB9WPBpuT0lqc/i6PCD5qdXrRzwsxOuz6bana6TBKPcYAMG5BxC9ff4xDRmtumfzFooH5OEu0WlzeP4wwblt/uoU/nlGhOiJn5nmYObaSEN1Fucpnwdq/jKnb1jgA09rO+cmjuwDk0XOPmFTHCIKHk4EedgJwBzJFiSdfMqoGB8KTOUAfKqjmXmfoWhNXzpu8UabaA8PBI/WFsJOMHjPN0bOYrumLsVPx6Kv48s/5UPFp7z57jURWQdgX5W0dfo2TrhjSkw5xGDJM6s/pZT2M1p2WyejVYI0VW4NRRU0+b4qVnChem0zqp9x6dNd0/as2mfdy7nv+PbNqYv8ZoSrP+wmH7G1Z4oGTamCcfI13hhMZ9rauvcNrmTTUT8n1dMDN+EmP
sBd19x/ovam8sGzU5dpELNUc5UrT8WemZX5ccO8e9Gomc5W1P5Wp4BqfOJWf5EwTl4pgd9UVO0is5RfH471oK9eP/xEEP/wE=
\ No newline at end of file
diff --git a/doc/frameworks/broker/connecting-connector.zeek b/doc/frameworks/broker/connecting-connector.zeek
new file mode 100644
index 0000000000..f1f8cbf872
--- /dev/null
+++ b/doc/frameworks/broker/connecting-connector.zeek
@@ -0,0 +1,12 @@
+redef exit_only_after_terminate = T;
+
+event zeek_init()
+ {
+ Broker::peer("127.0.0.1");
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added", endpoint;
+ terminate();
+ }
diff --git a/doc/frameworks/broker/connecting-listener.zeek b/doc/frameworks/broker/connecting-listener.zeek
new file mode 100644
index 0000000000..7802229996
--- /dev/null
+++ b/doc/frameworks/broker/connecting-listener.zeek
@@ -0,0 +1,17 @@
+redef exit_only_after_terminate = T;
+
+event zeek_init()
+ {
+ Broker::listen("127.0.0.1");
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added", endpoint;
+ }
+
+event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer lost", endpoint;
+ terminate();
+ }
diff --git a/doc/frameworks/broker/events-connector.zeek b/doc/frameworks/broker/events-connector.zeek
new file mode 100644
index 0000000000..fb4bec92ef
--- /dev/null
+++ b/doc/frameworks/broker/events-connector.zeek
@@ -0,0 +1,26 @@
+redef exit_only_after_terminate = T;
+global my_event: event(msg: string, c: count);
+
+event zeek_init()
+ {
+ Broker::peer("127.0.0.1");
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added", endpoint;
+ Broker::publish("zeek/event/my_event", my_event, "hi", 0);
+ Broker::publish("zeek/event/my_event", my_event, "...", 1);
+ local e = Broker::make_event(my_event, "bye", 2);
+ Broker::publish("zeek/event/my_event", e);
+ }
+
+event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ terminate();
+ }
+
+event my_event(msg: string, c: count)
+ {
+ print "got my_event", msg, c;
+ }
diff --git a/doc/frameworks/broker/events-listener.zeek b/doc/frameworks/broker/events-listener.zeek
new file mode 100644
index 0000000000..374dc5db11
--- /dev/null
+++ b/doc/frameworks/broker/events-listener.zeek
@@ -0,0 +1,24 @@
+redef exit_only_after_terminate = T;
+global msg_count = 0;
+global my_event: event(msg: string, c: count);
+global my_auto_event: event(msg: string, c: count);
+
+event zeek_init()
+ {
+ Broker::subscribe("zeek/event/");
+ Broker::listen("127.0.0.1");
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added", endpoint;
+ }
+
+event my_event(msg: string, c: count)
+ {
+ ++msg_count;
+ print "got my_event", msg, c;
+
+ if ( msg_count == 5 )
+ terminate();
+ }
diff --git a/doc/frameworks/broker/logs-connector.zeek b/doc/frameworks/broker/logs-connector.zeek
new file mode 100644
index 0000000000..47d912e294
--- /dev/null
+++ b/doc/frameworks/broker/logs-connector.zeek
@@ -0,0 +1,36 @@
+@load ./testlog
+
+redef exit_only_after_terminate = T;
+global n = 0;
+
+event zeek_init()
+ {
+ Broker::peer("127.0.0.1");
+ }
+
+event do_write()
+ {
+ if ( n == 6 )
+ return;
+
+ Log::write(Test::LOG, [$msg = "ping", $num = n]);
+ ++n;
+ event do_write();
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added", endpoint;
+ event do_write();
+ }
+
+event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ terminate();
+ }
+
+event Test::log_test(rec: Test::Info)
+ {
+ print "wrote log", rec;
+ Broker::publish("zeek/logs/forward/test", Test::log_test, rec);
+ }
diff --git a/doc/frameworks/broker/logs-listener.zeek b/doc/frameworks/broker/logs-listener.zeek
new file mode 100644
index 0000000000..654551b940
--- /dev/null
+++ b/doc/frameworks/broker/logs-listener.zeek
@@ -0,0 +1,22 @@
+@load ./testlog
+
+redef exit_only_after_terminate = T;
+
+event zeek_init()
+ {
+ Broker::subscribe("zeek/logs");
+ Broker::listen("127.0.0.1");
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added", endpoint;
+ }
+
+event Test::log_test(rec: Test::Info)
+ {
+ print "got log event", rec;
+
+ if ( rec$num == 5 )
+ terminate();
+ }
diff --git a/doc/frameworks/broker/store-sqlite-tuning.zeek b/doc/frameworks/broker/store-sqlite-tuning.zeek
new file mode 100644
index 0000000000..4c59456013
--- /dev/null
+++ b/doc/frameworks/broker/store-sqlite-tuning.zeek
@@ -0,0 +1,19 @@
+global h: opaque of Broker::Store;
+
+event zeek_init()
+ {
+ # Use WAL mode.
+ local sqlite_options=Broker::SQLiteOptions(
+ $synchronous=Broker::SQLITE_SYNCHRONOUS_NORMAL,
+ $journal_mode=Broker::SQLITE_JOURNAL_MODE_WAL,
+ );
+ local options = Broker::BackendOptions($sqlite=sqlite_options);
+ h = Broker::create_master("persistent-store", Broker::SQLITE, options);
+
+ local c = 1000;
+ while (c > 0)
+ {
+ Broker::put(h, cat(c), rand(10000));
+ --c;
+ }
+ }
diff --git a/doc/frameworks/broker/stores-connector.zeek b/doc/frameworks/broker/stores-connector.zeek
new file mode 100644
index 0000000000..4c09d9b950
--- /dev/null
+++ b/doc/frameworks/broker/stores-connector.zeek
@@ -0,0 +1,29 @@
+redef exit_only_after_terminate = T;
+
+global h: opaque of Broker::Store;
+
+global ready: event();
+
+event Broker::peer_lost(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ terminate();
+ }
+
+event zeek_init()
+ {
+ h = Broker::create_master("mystore");
+
+ local myset: set[string] = {"a", "b", "c"};
+ local myvec: vector of string = {"alpha", "beta", "gamma"};
+ Broker::put(h, "one", 110);
+ Broker::put(h, "two", 223);
+ Broker::put(h, "myset", myset);
+ Broker::put(h, "myvec", myvec);
+ Broker::increment(h, "one");
+ Broker::decrement(h, "two");
+ Broker::insert_into_set(h, "myset", "d");
+ Broker::remove_from(h, "myset", "b");
+ Broker::push(h, "myvec", "delta");
+
+ Broker::peer("127.0.0.1");
+ }
diff --git a/doc/frameworks/broker/stores-listener.zeek b/doc/frameworks/broker/stores-listener.zeek
new file mode 100644
index 0000000000..059444226f
--- /dev/null
+++ b/doc/frameworks/broker/stores-listener.zeek
@@ -0,0 +1,79 @@
+redef exit_only_after_terminate = T;
+
+global h: opaque of Broker::Store;
+global expected_key_count = 4;
+global key_count = 0;
+
+# Lookup a value in the store based on an arbitrary key string.
+function do_lookup(key: string)
+ {
+ when ( local res = Broker::get(h, key) )
+ {
+ ++key_count;
+ print "lookup", key, res;
+
+ # End after we iterated over looking up each key in the store twice.
+ if ( key_count == expected_key_count * 2 )
+ terminate();
+ }
+ # All data store queries must specify a timeout
+ timeout 3sec
+ { print "timeout", key; }
+ }
+
+event check_keys()
+ {
+ # Here we just query for the list of keys in the store, and show how to
+ # look up each one's value.
+ when ( local res = Broker::keys(h) )
+ {
+ print "clone keys", res;
+
+ if ( res?$result )
+ {
+ # Since we know that the keys we are storing are all strings,
+ # we can conveniently cast the result of Broker::keys to
+            # a native Zeek type, namely 'set[string]'.
+ for ( k in res$result as string_set )
+ do_lookup(k);
+
+ # Alternatively, we can use a generic iterator to iterate
+ # over the results (which we know is of the 'set' type because
+ # that's what Broker::keys() always returns). If the keys
+ # we stored were not all of the same type, then you would
+ # likely want to use this method of inspecting the store's keys.
+ local i = Broker::set_iterator(res$result);
+
+ while ( ! Broker::set_iterator_last(i) )
+ {
+ do_lookup(Broker::set_iterator_value(i) as string);
+ Broker::set_iterator_next(i);
+ }
+ }
+ }
+ # All data store queries must specify a timeout.
+ # You also might see timeouts on connecting/initializing a clone since
+ # it hasn't had time to get fully set up yet.
+ timeout 1sec
+ {
+ print "timeout";
+ schedule 1sec { check_keys() };
+ }
+ }
+
+event Broker::peer_added(endpoint: Broker::EndpointInfo, msg: string)
+ {
+ print "peer added";
+ # We could create a clone early, like in zeek_init and it will periodically
+ # try to synchronize with its master once it connects, however, we just
+ # create it now since we know the peer w/ the master store has just
+ # connected.
+ h = Broker::create_clone("mystore");
+
+ event check_keys();
+ }
+
+event zeek_init()
+ {
+ Broker::listen("127.0.0.1");
+ }
diff --git a/doc/frameworks/broker/testlog.zeek b/doc/frameworks/broker/testlog.zeek
new file mode 100644
index 0000000000..f52f177cf1
--- /dev/null
+++ b/doc/frameworks/broker/testlog.zeek
@@ -0,0 +1,17 @@
+module Test;
+
+export {
+ redef enum Log::ID += { LOG };
+
+ type Info: record {
+ msg: string &log;
+ num: count &log;
+ };
+
+ global log_test: event(rec: Test::Info);
+}
+
+event zeek_init() &priority=5
+ {
+ Log::create_stream(Test::LOG, [$columns=Test::Info, $ev=log_test, $path="test"]);
+ }
diff --git a/doc/frameworks/cluster.rst b/doc/frameworks/cluster.rst
new file mode 100644
index 0000000000..ed9766f45c
--- /dev/null
+++ b/doc/frameworks/cluster.rst
@@ -0,0 +1,630 @@
+
+.. _cluster-framework:
+
+=================
+Cluster Framework
+=================
+
+The basic premise of Zeek clusterization is to break down network traffic into
+smaller pieces, while preserving the affinity of individual network sessions to
+a single analysis process. Cluster architecture thus allows Zeek to distribute
+that analysis across many dozens or hundreds of worker processes, allowing the
+monitoring system to scale up to line speeds of 100G or more.
+
+.. figure:: /images/cluster-diagram.png
+
+ Figure 1: Block diagram of cluster setup showing multiple network feeds to a
+ traffic aggregator. This device sends traffic to workers after symmetric
+ hashing/load-balancing. Traffic is then fed to the Zeek cluster using
+ load-balancing network cards.
+
+Zeek's Cluster Components
+=========================
+
+By distributing network traffic across hosts and processes, overall traffic
+finally reaches a volume that can be effectively analyzed by a single worker
+process. Zeek then acts as a distributed network security monitor to perform
+analysis across many dozens or hundreds of workers, all acting on a small
+fraction of the overall traffic volume. The analysis of the worker process is
+further facilitated by nodes such as manager and proxies, ultimately logging
+the alerts and or relevant network logs. A Zeek cluster therefore consists of
+four main components: a manager, workers, proxies, and a logger.
+
+Manager
+-------
+
+The manager is a Zeek process that has two primary jobs. It normally receives
+log messages and notices from the rest of the nodes in the cluster using the
+Zeek communications protocol. It combines the individual logs that each worker
+produces, so that the result is a set of joint logs instead of many discrete
+logs that you would have to combine in some manner with post-processing. (Note
+that if you use a separate logger node, then the logger receives all logs
+instead of the manager.) The manager also supports other functionality and
+analysis which requires a centralized, global view of events or data.
+
+Worker
+------
+
+The worker is the Zeek process that sniffs network traffic and does protocol
+analysis on the reassembled traffic streams. Most of the work of an active
+cluster takes place on the workers. Workers typically represent the bulk of the
+Zeek processes that are running in a cluster. The fastest memory and CPU core
+speed you can afford is recommended since all of the protocol parsing and most
+analysis will take place here. There are no particular requirements for the
+disks in workers since almost all logging is done remotely to the manager (or
+dedicated logger). Normally, very little is written to disk.
+
+Proxy
+-----
+
+A proxy is a Zeek process that may be used to offload data storage or any
+arbitrary workload. A cluster may contain multiple proxy nodes.
+Zeek's default scripts make only minimal use of proxies.
+Custom scripts or third-party packages may exercise proxies more heavily
+to partition data or workloads, providing greater cluster scalability potential.
+The number of required proxy nodes in a cluster depends on the deployed scripts,
+cluster size and traffic characteristics. For small clusters with four or less workers,
+a single proxy node is usually sufficient. For larger clusters, you may want to
+closely monitor :ref:`CPU and memory usage <framework-telemetry>` of proxy
+nodes and increase their number as needed.
+
+Zeek processes acting as proxies don’t tend to be extremely hard on CPU or
+memory, and users frequently run proxy processes on the same physical host as
+the manager.
+
+Logger
+------
+
+A logger is an optional Zeek process that receives log messages from the rest
+of the nodes in the cluster using the Zeek communications protocol. The purpose
+of having a logger to receive logs instead of the manager is to reduce the load
+on the manager. If no logger is needed, then the manager will receive logs
+instead.
+
+Running a Zeek Cluster
+======================
+
+Zeek Cluster Setup
+------------------
+
+This :ref:`link <cluster-configuration>` describes the cluster setup in great
+detail.
+
+General Usage and Deployment
+----------------------------
+
+The biggest advantage to using a Zeek cluster is that most of its inner
+workings are transparent to the user. Clusterization is a clever trick to
+divide-and-conquer ever increasing network traffic volume.
+
+As a practitioner one must know how to set up a cluster by defining components
+such as the manager, proxies, loggers and workers in the
+:samp:`{<prefix>}/etc/node.cfg` file on the manager.
+
+Edit the ZeekControl node configuration file, :samp:`{<prefix>}/etc/node.cfg`,
+to define where the logger, manager, proxies, and workers will run. For a
+cluster configuration, comment-out (or remove) the standalone node in that
+file, and either uncomment or add node entries for each node in your cluster
+(logger, manager, proxy, and workers).
+
+For example, to run five Zeek nodes (two workers, one proxy, a logger, and a
+manager) on a cluster consisting of three machines, the cluster configuration
+would look like this::
+
+ [logger]
+ type=logger
+ host=10.0.0.10
+
+ [manager]
+ type=manager
+ host=10.0.0.10
+
+ [proxy-1]
+ type=proxy
+ host=10.0.0.10
+
+ [worker-1]
+ type=worker
+ host=10.0.0.11
+ interface=eth0
+
+ [worker-2]
+ type=worker
+ host=10.0.0.12
+ interface=eth0
+
+
+To set up a cluster we need a network-aggregator/load balancing device which
+can aggregate inputs from network sources, such as taps or span ports. This
+device also performs the critical function of ensuring each TCP session is
+distributed to a single link. This function is provided through symmetric
+hashing.
+
+Once the tap aggregator is set, output from each port is sent to a “Zeek node”
+which is typically built on commodity hardware. Zeek clusters have evolved from
+running the manager, workers and proxies on individual servers, to most often
+now running a “cluster-in-a-box” setup, where a powerful multi-core box with
+dedicated cores hosts the workers, proxies logger and manager. We’ve seen
+instances of 90 workers running on a single physical server.
+
+At present the preferred way to run a cluster is to use a load-balancing
+network card such as Myricom NICs or Intel cards with PF_RING or AF_PACKET
+support. The NIC (and associated software) further divides the traffic to
+multiple Zeek worker processes running on the ‘Zeek node’.
+
+While the Zeek cluster allows us to monitor traffic at scale, an optional
+add-on technology called “shunting” is helpful to reduce the volume that needs
+to be processed. Shunting can detect specific large data flows based on
+predetermined characteristics and communicate with the network tap via an API
+to stop sending those flows to Zeek for analysis. This allows Zeek to maintain
+awareness and logs of these shunted large flows while dramatically reducing the
+analysis load necessary to process traffic.
+
+The following link gives more specific information on how to set up
+clusterization using one of the above approaches: :ref:`cluster-configuration`.
+
+Developing Scripts/Heuristics
+=============================
+
+This section is for developers who are interested in writing
+packages/scripts/heuristics and want to take advantage of clusterization.
+
+In order to make your scripts/packages “clusterized,” one must understand the
+purpose of each of the cluster components (manager, workers, proxies and
+logger) and how/where the data is generated and how to move data/information
+across the different nodes in the cluster.
+
+* **Workers**: Workers are a good first choice for doing the brunt of any work.
+ They should be spending a lot of time parsing or analyzing incoming data from
+ packets. You might choose them to do a “first pass” analysis and then decide
+ how the results should be shared with other nodes in the cluster.
+
+* **Proxies**: Proxies serve as intermediaries for data storage and computation
+ offloading. Proxies help offload work or data in a scalable and distributed
+ way. Since any given worker is connected to all proxies and can agree on an
+ “arbitrary key -> proxy node” mapping (discussed later), you can partition
+ work or data amongst them in a uniform manner. You might choose to use
+ proxies as a method to share non-persistent state or as a “second pass”
+ analysis for any work that you don’t want interfering with the workers’
+ capacity to keep up with capturing and parsing packets. The default scripts
+ that come with Zeek make minimal use of proxies. If you are migrating from a
+ previous ZeekControl deployment, you may want to implement a single proxy
+ node. If you have custom or community scripts that utilize proxies,
+  consider scaling up the number of proxies to meet demand.
+
+* **Manager**: A manager will make decisions that require a global view, as it
+ is in a centralized location and connected to everything. However, that
+ connectivity also makes it easy to overload it. Try to use a manager
+ sparingly and only for tasks that must be done in a centralized or
+ authoritative location. Optionally, for some deployments, the manager can
+ also serve as the sole logger.
+
+* **Loggers**: Loggers should simply write logs to disk. In the default cluster
+ configuration, log content gets distributed among available loggers in a
+ round-robin fashion, providing failover capability should any given logger
+ temporarily go offline.
+
+The Need to Move Data and Events Across Different Nodes
+-------------------------------------------------------
+
+Imagine you have a list of IP addresses that you want to distribute across all
+workers to keep in a watch list, such as the Intel framework. You may also want
+to aggregate results across workers to see if that count crosses a threshold,
+such as using scan detection. Finally, you might want to extract URLs from
+emails and then redistribute the extracted URLs to all workers to be able to
+find which of these extracted URLs got clicked on. All these examples tend to
+introduce challenges in a Zeek cluster setup due to data centrality issues. In
+other words, the very advantageous divide-and-conquer approach of
+clusterization also introduces complexity in Zeek scripts. However, with the
+introduction of the Broker communication framework and additional helper
+functions, data centrality complexities can be addressed efficiently. One must
+rely on clusterization techniques provided by Zeek scripting, the Broker API,
+and clusterization components.
+
+When clustering your scripts, the fundamental work to move data or events in
+the context of a cluster falls primarily on a few high-level abstractions of
+communication patterns:
+
+ 1. Manager-to-worker
+ 2. Worker-to-manager
+ 3. Worker-to-proxy
+ 4. Worker-to-manager-to-worker
+ 5. Manager-to-worker-to-manager
+
+All the communication between workers, proxies and manager is established by
+Zeek via the Broker framework. The Broker framework provides basic facilities
+for connecting Zeek instances to each other and exchanging messages, events or
+data.
+
+Cluster Topics
+--------------
+
+All Broker-based messaging involves two components: the information you want to
+send, such as an event with its arguments, along with an associated topic name
+string. The topic strings are used as a filtering mechanism: Broker uses a
+publish-subscribe communication pattern where peers advertise interest in topic
+prefixes and only receive messages which match one of their prefix
+subscriptions. Broker itself supports arbitrary topic strings. However, Zeek
+generally follows certain conventions in choosing these topics to help avoid
+conflicts and generally make them easier to remember.
+
+To communicate between workers, proxies and manager one needs to know the topic
+name to which all workers, proxies and manager are subscribed to. These are:
+
+ 1. :zeek:see:`Cluster::worker_topic` - to which all workers are subscribed
+ 2. :zeek:see:`Cluster::proxy_topic` - to which all proxies are subscribed
+ 3. :zeek:see:`Cluster::manager_topic` - to which manager is subscribed
+
+
+The following table illustrates all the topics and communication events for
+clusterization, along with potential use cases:
+
+.. list-table::
+ :header-rows: 1
+
+ * - Event
+ - Topic
+ - Use cases
+
+ * - Manager to worker
+ - :zeek:see:`Cluster::worker_topic`
+ - * Read input file on manager
+ * Distribute data and events from manager to workers
+
+ * - Worker to manager
+ - :zeek:see:`Cluster::manager_topic`
+ - * Find characteristics of a “scan” eg. SYN-only pkts
+ * Send data to manager for aggregation
+
+ * - Worker or manager to proxy
+ - :zeek:see:`Cluster::proxy_topic`
+ - * Run operation on all proxies
+ * Disseminate notice suppression
+
+ * - Worker to manager to worker
+ - :zeek:see:`Cluster::manager_topic` + :zeek:see:`Cluster::worker_topic`
+ - * Find URLs in emails
+ * Send to manager
+ * Distribute to workers to check against HTTP GET requests
+
+ * - Manager to worker to manager
+ - :zeek:see:`Cluster::worker_topic` + :zeek:see:`Cluster::manager_topic`
+ - * Read input file on manager
+ * Distribute data to workers
+ * Workers to report counts of connections to manager
+ * Aggregate the counts on manager
+
+Cluster Pools
+-------------
+
+In addition to topics, Zeek nodes can join a :zeek:see:`Cluster::Pool`.
+Using :zeek:see:`Cluster::publish_hrw` and :zeek:see:`Cluster::publish_rr`,
+pools allow publishing events to individual proxies without prior knowledge
+of a cluster's shape and size.
+
+A popular pool is the :zeek:see:`Cluster::proxy_pool`. It comprises all
+the proxies of a cluster. Examples of its use are listed in the following table.
+
+
+.. list-table::
+ :header-rows: 1
+
+ * - Event
+ - Pool
+ - Use cases
+
+ * - Workers to individual proxy processes
+ - :zeek:see:`Cluster::proxy_pool`
+ - * Aggregation based on Highest Random Weight (eg. DNS query types, see the :ref:`section below ` for details.)
+ * Aggregation of Software versions for a given host
+ * Offloading tasks in round-robin fashion across proxies
+
+
+Publishing Events Across the Cluster
+------------------------------------
+
+Broker, as well as Zeek’s higher-level cluster framework, provide a set of
+functions to publish events, including:
+
+.. list-table::
+ :header-rows: 1
+
+ * - Function
+ - Description
+ - Use
+
+ * - :zeek:see:`Cluster::publish`
+ - Publishes an event at a given topic
+ - Standard function to send an event to all nodes subscribed to a given
+ topic.
+
+ * - :zeek:see:`Cluster::publish_hrw`
+ - Publishes an event to a node within a pool according to
+ Highest Random Weight (HRW) hashing strategy; see details below
+ - Use this in cases of any aggregation needs - eg. scan detection or
+ anything that needs a counter going.
+
+ * - :zeek:see:`Cluster::publish_rr`
+ - Publishes an event to a node within a pool according to Round-Robin
+ distribution strategy.
+ - Generally used inside Zeek for multiple logger nodes.
+
+ * - :zeek:see:`Broker::publish`
+ - Publishes an event at a given topic
+ - Standard function to send an event to all nodes subscribed to a given
+ topic.
+
+ Starting with Zeek 7.1, this function should only be used in
+ Broker-specific scripts. Use :zeek:see:`Cluster::publish` otherwise.
+
+
+.. note::
+
+ The ``Cluster::publish`` function was added in Zeek 7.1. In contrast to
+   ``Broker::publish``, it publishes events even when a non-Broker cluster
+   backend is in use. Going forward, ``Cluster::publish`` should be preferred
+ over ``Broker::publish``, unless the script is specific to the Broker backend,
+ e.g. when interacting with an external application using native Python
+ bindings for Broker.
+
+
+An example sending an event from worker to manager:
+
+.. code-block:: zeek
+
+ event worker_to_manager(worker_name: string)
+ {
+ print "got event from worker", worker_name;
+ }
+
+ event some_event_handled_on_worker()
+ {
+ Broker::publish(Cluster::manager_topic, worker_to_manager,
+ Cluster::node);
+ }
+
+More details and code snippets and documentation on Broker communication
+frameworks are available at :ref:`broker-framework`.
+
+
+.. _cluster-framework-proxies-uniform:
+
+Distributing Events Uniformly Across Proxies
+--------------------------------------------
+
+If you want to offload some data/work from a worker to your proxies, we can
+make use of a `Highest Random Weight (HRW) hashing
+`_ distribution strategy to
+uniformly map an arbitrary key space across all available proxies through
+:zeek:see:`Cluster::publish_hrw`. This function publishes an event to one node
+within a pool according to a Highest Random Weight hashing strategy. By
+assigning :zeek:see:`Cluster::proxy_pool` to this event, one can utilize
+proxies to handle it. Note that :zeek:see:`Cluster::publish_hrw` requires a
+unique key as an input to the hashing function to uniformly distribute keys
+among available nodes. Often this key is a source or destination IP address. If
+you are using :zeek:see:`Cluster::publish_hrw` for an aggregate function, such
+as counts unique across the workers, make sure to appropriately select the
+hashing key.
+
+The following example illustrates this issue. Assume that we are counting the
+number of scanner IPs from each ``/24`` subnet. If the key were the source IP,
+then depending on the hashing, different IP addresses from the same ``/24``
+might end up on different proxies for the aggregation function. In this case
+one might instead want to use a more inclusive hashing key, such as the subnet
+(``/24``) itself. To illustrate the issue, in the notice log below, you see
+that 3 scanners each from ``52.100.165.0/24`` went to ``proxy-1`` and
+``proxy-2``. Ideally we want a single count of 6 scanners instead.
+
+::
+
+ 1600212249.061779 Scan::Subnet 52.100.165.0/24 has 3 spf IPs originating from it 52.100.165.249 52.100.165.237 52.100.165.246 - 52.100.165.246 - - proxy-2 Notice::ACTION_LOG 3600.000000 F
+
+ 1600212293.581745 Scan::Subnet 52.100.165.0/24 has 3 spf IPs originating from it 52.100.165.247 52.100.165.244 52.100.165.205 - 52.100.165.205 - - proxy-1 Notice::ACTION_LOG 3600.000000
+
+Instead, we can ensure the hash key is ``52.100.165.0/24`` instead of the
+original IP, as the hash for ``52.100.165.0/24`` will be the same for all
+addresses belonging to this subnet. Then the data will reach only one proxy.
+To that end, we can use the ``mask_address`` function to extract subnet
+information for a given IP address to use as a key in the hash function:
+
+.. code-block:: zeek
+
+ local spf = mask_address(orig);
+
+ @if ( Cluster::is_enabled())
+ Cluster::publish_hrw(Cluster::proxy_pool, spf, smtpsink::aggregate_stats, c) ;
+ @else
+ event smtpsink::aggregate_stats(c);
+ @endif
+
+Carefully select the key for :zeek:see:`Cluster::publish_hrw`. If done right,
+this feature will bring tremendous benefits in code scalability, especially
+when working with aggregate and threshold functions.
+
+.. note::
+
+ In scripting for clusterization, using the correct module names and
+ namespaces is crucial as both events and data are transmitted to different
+ systems. In order to make sure the contexts are correct, all functions,
+ events and datasets should be scoped within their respective namespaces and
+ modules. An easy rule of thumb is to always use the explicit module namespace
+ scoping. See :ref:`event-namespacing-pitfall` for further explanation and
+ examples.
+
+Clusterization of Zeek scripts can be an intimidating task for beginners.
+However, with reliance on the new Broker framework, clusterization has become
+simpler and straightforward. Consider the following:
+
+1. Communication overhead: Be sure not to generate unnecessary communication
+ overhead. For example, scan detection is one of the worst cases for
+ distributed analysis. One needs to count connections from a given IP address
+ across all workers and then aggregate them on a proxy or manager. All the
+ connections have to reach an aggregate function before Zeek can determine if
+ a given source is a scanner or not. This happens because each worker only
+ has a limited picture of the activity generated by a given remote IP.
+
+2. Communication optimizations: Once a given remote IP is identified as
+ desired, make sure a manager reports that to the worker, and workers stop
+ sending any further data for that IP to the manager. This is especially
+ useful in scan detection where it takes only a few connections to identify
+ scans, while a given scanner might send millions of probes eventually. If
+ done right, workers will only send the first N connections, and stop after
+ that, thus saving a lot of communication overheads. However, it makes sense
+   to stop workers from sending any further connection information.
+
+3. Clusterization also requires timely state synchronization across the
+ workers, to make sure that all workers have a common view of a particular
+ heuristic.
+
+4. When writing scripts for clusterization make sure your detection runs in
+ both cluster and standalone setup.
+
+A Cluster Script Walkthrough
+----------------------------
+
+Let's say we want to count how many connections a remote IP is making to a host
+in our network on port 3389 UDP. Due to the distributed nature of Zeek
+clusters, connections are distributed across the workers based on a 5-tuple
+hash (source IP, source port, destination IP, destination port, and protocol).
+To get a central view of a connection between a given IP pair, one must deploy
+a clusterized scripting approach. The following example highlights how to go
+about doing so.
+
+In this use case, we intend to create an aggregation function.
+:zeek:see:`Cluster::publish_hrw` appears to be the appropriate function, since
+it allows offloading a lot of work to proxies, thus leaving workers and manager
+to process traffic.
+
+In order to make sure all the connections between two hosts go to a single
+specific proxy, we need to make sure the key for the hashing function
+accommodates this constraint. We will use ``orig_h+resp_h`` as the key. We
+create a new data-type called ``pair`` as seen in code below. This allows us
+to use the ``orig+resp`` as a unique key across the code, including in the
+candidate table. Further, we create a new data type called ``stats`` to keep
+track of additional data associated with a connection pair.
+
+.. code-block:: zeek
+
+ module DoS;
+
+ export {
+
+ redef enum Notice::Type += {
+ Threshold,
+ Victim_3389UDP,
+ };
+
+ type pair: record {
+ orig: addr;
+ resp: addr;
+ };
+
+ type stats: record {
+ orig: addr;
+ resp: addr ;
+ orig_bytes: count &default=0;
+ resp_bytes: count &default=0;
+ conns: count &default=0;
+ };
+
+ global dos_candidates: table [pair] of stats &create_expire=1 day;
+
+ global DoS::aggregate_stats:event(s: stats);
+ }
+
+We choose the :zeek:see:`connection_state_remove` event as the primary event to
+tap into. :zeek:see:`connection_state_remove` is generated when a connection’s
+internal state is about to be removed from memory. It's appropriate for this
+case, as all the information about the connection is now included in the
+:zeek:see:`connection` record ``c``. One disadvantage of using
+:zeek:see:`connection_state_remove` is that the event is fired at the very end
+of the connection, after the expiration timeouts are over. Thus, there are
+delays, and any operation which happens on the data is “after-the-fact” that
+connection is over. While this could be a problem in approaches such as
+proactive blocking and early detection heuristics, in this case of aggregation
+it is not an issue.
+
+The thing to pay attention to in the code snippet below is the
+:zeek:see:`@if`-:zeek:see:`@else`-:zeek:see:`@endif` directives which
+differentiate between clusterized and standalone operation of the script. With
+the :zeek:see:`@if` construct, the specified expression must evaluate to type
+bool. If the value is true, then the following script lines (up to the next
+:zeek:see:`@else` or :zeek:see:`@endif`) are available to be executed. In this
+case we check if :zeek:see:`Cluster::is_enabled`. If so, we call
+:zeek:see:`Cluster::publish_hrw` along with the key (``hash_pair``) and the
+aggregate function followed by parameters, which is the stats record in this
+case. If the cluster isn’t running that aggregate function, it is directly
+called.
+
+.. code-block:: zeek
+
+ event connection_state_remove(c: connection)
+ {
+ local service = c$id$resp_p;
+ local resp = c$id$resp_h;
+
+ if ( service != 3389/udp )
+ return;
+
+ if ( resp !in Site::local_nets )
+ return;
+
+ local s: stats;
+ s$orig = c$id$orig_h;
+ s$resp = c$id$resp_h;
+ s$orig_bytes = c$conn$orig_ip_bytes;
+ s$resp_bytes = c$conn$resp_ip_bytes;
+
+ local hash_pair: pair;
+ hash_pair$orig = c$id$orig_h;
+ hash_pair$resp = resp;
+
+ @if ( Cluster::is_enabled() )
+ Cluster::publish_hrw(Cluster::proxy_pool, hash_pair, DoS::aggregate_stats, s);
+ @else
+ event DoS::aggregate_stats(s);
+ @endif
+ }
+
+Since ``hash_pair`` makes the key unique, irrespective of what worker this
+specific connection has gone to, it will end up on one specific proxy only.
+
+.. code-block:: zeek
+
+ event DoS::aggregate_stats(s: stats)
+ {
+ local p: pair ;
+ p$orig = s$orig;
+ p$resp = s$resp ;
+
+ if ( p !in dos_candidates )
+ {
+ local tmp_s: stats;
+ tmp_s$orig = s$orig;
+ tmp_s$resp = s$resp;
+ tmp_s$orig_bytes = 0;
+ tmp_s$resp_bytes= 0;
+ tmp_s$conns = 0;
+
+ dos_candidates[p] = tmp_s;
+ }
+
+ dos_candidates[p]$conns += 1;
+ dos_candidates[p]$orig_bytes += s$orig_bytes;
+ dos_candidates[p]$resp_bytes += s$resp_bytes;
+
+ local n = dos_candidates[p]$conns;
+
+ local thresh = check_ip_threshold(dos_threshold, ip_pair_threshold_idx, p, n);
+
+ if ( thresh )
+ {
+ local msg = fmt("%s pair has reached %s threshold %s",
+ p, n, dos_candidates[p]);
+ NOTICE([$note=DoS::Threshold, $src=p$orig, $msg=msg]);
+
+ if ( dos_candidates[p]$resp_bytes > 0 )
+            NOTICE([$note=DoS::Victim_3389UDP, $src=p$orig, $msg=msg,
+ $identifier=cat(p$resp), $suppress_for=1 hrs]);
+ }
+ }
diff --git a/doc/frameworks/configuration.rst b/doc/frameworks/configuration.rst
new file mode 100644
index 0000000000..91492b231b
--- /dev/null
+++ b/doc/frameworks/configuration.rst
@@ -0,0 +1,356 @@
+
+.. _framework-configuration:
+
+=======================
+Configuration Framework
+=======================
+
+Zeek includes a configuration framework that allows updating script options at
+runtime. This functionality consists of an :zeek:see:`option` declaration in
+the Zeek language, configuration files that enable changing the value of
+options at runtime, option-change callbacks to process updates in your Zeek
+scripts, a couple of script-level functions to manage config settings directly,
+and a log file (:file:`config.log`) that contains information about every
+option value change according to :zeek:see:`Config::Info`.
+
+Introduction
+============
+
+The configuration framework provides an alternative to using Zeek script
+constants to store various Zeek settings.
+
+While traditional constants work well when a value is not expected to change at
+runtime, they cannot be used for values that need to be modified occasionally.
+While a :zeek:see:`redef` allows a re-definition of an already defined constant
+in Zeek, these redefinitions can only be performed when Zeek first starts.
+Afterwards, constants can no longer be modified.
+
+However, it is clearly desirable to be able to change at runtime many of the
+configuration options that Zeek offers. Restarting Zeek can be time-consuming
+and causes it to lose all connection state and knowledge that it accumulated.
+Zeek’s configuration framework solves this problem.
+
+Declaring Options
+=================
+
+The :zeek:see:`option` keyword allows variables to be declared as configuration
+options:
+
+.. code-block:: zeek
+
+ module Test;
+
+ export {
+ option my_networks: set[subnet] = {};
+ option enable_feature = F;
+ option hostname = "testsystem";
+ option timeout_after = 1min;
+ option my_ports: vector of port = {};
+ }
+
+Options combine aspects of global variables and constants. Like global
+variables, options cannot be declared inside a function, hook, or event
+handler. Like constants, options must be initialized when declared (the type
+can often be inferred from the initializer but may need to be specified when
+ambiguous). The value of an option can change at runtime, but options cannot be
+assigned a new value using normal assignments.
+
+The initial value of an option can be redefined with a :zeek:see:`redef`
+declaration just like for global variables and constants. However, there is no
+need to specify the :zeek:see:`&redef` attribute in the declaration of an
+option. For example, given the above option declarations, here are possible
+redefs that work anyway:
+
+.. code-block:: zeek
+
+ redef Test::enable_feature = T;
+ redef Test::my_networks += { 10.1.0.0/16, 10.2.0.0/16 };
+
+Changing Options
+================
+
+The configuration framework facilitates reading in new option values from
+external files at runtime. Configuration files contain a mapping between option
+names and their values. Each line contains one option assignment, formatted as
+follows::
+
+ [option name][tab/spaces][new value]
+
+Lines starting with ``#`` are comments and ignored.
+
+You register configuration files by adding them to
+:zeek:see:`Config::config_files`, a set of filenames. Simply say something like
+the following in :file:`local.zeek`:
+
+.. code-block:: zeek
+
+ redef Config::config_files += { "/path/to/config.dat" };
+
+Zeek will then monitor the specified file continuously for changes. For
+example, editing a line containing::
+
+ Test::enable_feature T
+
+to the config file while Zeek is running will cause it to automatically update
+the option’s value in the scripting layer. The next time your code accesses the
+option, it will see the new value.
+
+.. note::
+
+ The config framework is clusterized. In a cluster configuration, only the
+ manager node watches the specified configuration files, and relays option
+ updates across the cluster.
+
+Config File Formatting
+----------------------
+
+The formatting of config option values in the config file is not the same as in
+Zeek’s scripting language. Keep an eye on the :file:`reporter.log` for warnings
+from the config reader in case of incorrectly formatted values, which it’ll
+generally ignore when encountered. The following table summarizes supported
+types and their value representations:
+
+.. list-table::
+ :header-rows: 1
+
+ * - Data Type
+ - Sample Config File Entry
+ - Comments
+
+ * - :zeek:see:`addr`
+ - ``1.2.3.4``
+ - Plain IPv4 or IPv6 address, as in Zeek. No ``/32`` or similar netmasks.
+
+ * - :zeek:see:`bool`
+ - ``T``
+ - ``T`` or ``1`` for true, ``F`` or ``0`` for false
+
+ * - :zeek:see:`count`
+ - ``42``
+ - Plain, nonnegative integer.
+
+ * - :zeek:see:`double`
+ - ``-42.5``
+ - Plain double number.
+
+ * - :zeek:see:`enum`
+ - ``Enum::FOO_A``
+ - Plain enum string.
+
+ * - :zeek:see:`int`
+ - ``-1``
+ - Plain integer.
+
+ * - :zeek:see:`interval`
+ - ``3600.0``
+ - Always in epoch seconds, with optional fraction of seconds. Never
+ includes a time unit.
+
+ * - :zeek:see:`pattern`
+ - ``/(foo|bar)/``
+ - The regex pattern, within forward-slash characters.
+
+ * - :zeek:see:`port`
+ - ``42/tcp``
+ - Port number with protocol, as in Zeek. When the protocol part is missing,
+ Zeek interprets it as ``/unknown``.
+
+ * - :zeek:see:`set`
+ - ``80/tcp,53/udp``
+ - The set members, formatted as per their own type, separated by commas.
+ For an empty set, use an empty string: just follow the option name with
+ whitespace.
+
+ Sets with multiple index types (e.g. ``set[addr,string]``) are currently
+ not supported in config files.
+
+ * - :zeek:see:`string`
+ - ``Don’t bite, Zeek``
+ - Plain string, no quotation marks. Given quotation marks become part of
+ the string. Everything after the whitespace separator delineating the
+       option name becomes the string. Spaces and special characters are fine.
+ Backslash characters (e.g. ``\n``) have no special meaning.
+
+ * - :zeek:see:`subnet`
+ - ``1.2.3.4/16``
+ - Plain subnet, as in Zeek.
+
+ * - :zeek:see:`time`
+ - ``1608164505.5``
+ - Always in epoch seconds, with optional fraction of seconds. Never
+ includes a time unit.
+
+ * - :zeek:see:`vector`
+ - ``1,2,3,4``
+     - The vector members, formatted as per their own type, separated by commas.
+ For an empty vector, use an empty string: just follow the option name
+ with whitespace.
+
+This leaves a few data types unsupported, notably tables and records. If you
+require these, build up an instance of the corresponding type manually (perhaps
+from a separate input framework file) and then call
+:zeek:see:`Config::set_value` to update the option:
+
+.. code-block:: zeek
+
+ module Test;
+
+ export {
+ option host_port: table[addr] of port = {};
+ }
+
+ event zeek_init() {
+ local t: table[addr] of port = { [10.0.0.2] = 123/tcp };
+ Config::set_value("Test::host_port", t);
+ }
+
+
+Regardless of whether an option change is triggered by a config file or via
+explicit :zeek:see:`Config::set_value` calls, Zeek always logs the change to
+:file:`config.log`. A sample entry::
+
+ #fields ts id old_value new_value location
+ #types time string string string string
+ 1608167352.498872 Test::a_count 42 3 config.txt
+
+Mentioning options repeatedly in the config files leads to multiple update
+events; the last entry “wins”. Mentioning options that do not correspond to
+existing options in the script layer is safe, but triggers warnings in
+:file:`reporter.log`::
+
+ warning: config.txt/Input::READER_CONFIG: Option 'an_unknown' does not exist. Ignoring line.
+
+Internally, the framework uses the Zeek input framework to learn about config
+changes. If you inspect the configuration framework scripts, you will notice
+that the scripts simply catch input framework events and call
+:zeek:see:`Config::set_value` to set the relevant option to the new value. If
+you want to change an option in your scripts at runtime, you can likewise call
+:zeek:see:`Config::set_value` directly from a script (in a cluster
+configuration, this only needs to happen on the manager, as the change will be
+automatically sent to all other nodes in the cluster).
+
+.. note::
+
+ The input framework is usually very strict about the syntax of input files, but
+ that is not the case for configuration files. These require no header lines,
+ and both tabs and spaces are accepted as separators. A custom input reader,
+ specifically for reading config files, facilitates this.
+
+.. tip::
+
+ The gory details of option-parsing reside in ``Ascii::ParseValue()`` in
+ :file:`src/threading/formatters/Ascii.cc` and ``Value::ValueToVal`` in
+ :file:`src/threading/SerialTypes.cc` in the Zeek core.
+
+Change Handlers
+===============
+
+A change handler is a user-defined function that Zeek calls each time an option
+value changes. This allows you to react programmatically to option changes. The
+following example shows how to register a change handler for an option that has
+a data type of :zeek:see:`addr` (for other data types, the return type and
+second parameter data type must be adjusted accordingly):
+
+.. code-block:: zeek
+
+ module Test;
+
+ export {
+ option testaddr = 127.0.0.1;
+ }
+
+ # Note: the data type of 2nd parameter and return type must match
+ function change_addr(id: string, new_value: addr): addr
+ {
+ print fmt("Value of %s changed from %s to %s", id, testaddr, new_value);
+ return new_value;
+ }
+
+ event zeek_init()
+ {
+ Option::set_change_handler("Test::testaddr", change_addr);
+ }
+
+Immediately before Zeek changes the specified option value, it invokes any
+registered change handlers. The value returned by the change handler is the
+value Zeek assigns to the option. This allows, for example, checking of values
+to reject invalid input (the original value can be returned to override the
+change).
+
+.. note::
+
+ :zeek:see:`Option::set_change_handler` expects the name of the option to
+ invoke the change handler for, not the option itself. Also, that name
+ includes the module name, even when registering from within the module.
+
+It is possible to define multiple change handlers for a single option. In this
+case, the change handlers are chained together: the value returned by the first
+change handler is the “new value” seen by the next change handler, and so on.
+The built-in function :zeek:see:`Option::set_change_handler` takes an optional
+third argument that can specify a priority for the handlers.
+
+A change handler function can optionally have a third argument of type string.
+When a config file triggers a change, then the third argument is the pathname
+of the config file. When the :zeek:see:`Config::set_value` function triggers a
+change, then the third argument of the change handler is the value passed to
+the optional third argument of the :zeek:see:`Config::set_value` function.
+
+.. tip::
+
+ Change handlers are also used internally by the configuration framework. If
+ you look at the script-level source code of the config framework, you can see
+ that change handlers log the option changes to :file:`config.log`.
+
+When Change Handlers Trigger
+----------------------------
+
+Change handlers often implement logic that manages additional internal state.
+For example, depending on a performance toggle option, you might initialize or
+clean up a caching structure. In such scenarios you need to know exactly when
+and whether a handler gets invoked. The following hold:
+
+* When no config files get registered in :zeek:see:`Config::config_files`,
+ change handlers do not run.
+* When none of the registered config files exist on disk, change handlers do
+  not run.
+
+That is, change handlers are tied to config files, and don’t automatically run
+with the option’s default values.
+
+* When a config file exists on disk at Zeek startup, change handlers run with
+ the file’s config values.
+* When the config file contains the same value the option already defaults to,
+ its change handlers are invoked anyway.
+* :zeek:see:`zeek_init` handlers run before any change handlers — i.e., they
+ run with the options’ default values.
+* Since the config framework relies on the input framework, the input
+ framework’s inherent asynchrony applies: you can’t assume when exactly an
+ option change manifests in the code.
+
+If your change handler needs to run consistently at startup and when options
+change, you can call the handler manually from :zeek:see:`zeek_init` when you
+register it. That way, initialization code always runs for the option’s default
+value, and also for any new values.
+
+.. code-block:: zeek
+
+ module Test;
+
+ export {
+ option use_cache = T;
+ }
+
+ function use_cache_hdlr(id: string, new_value: bool): bool
+ {
+ if ( new_value ) {
+ # Ensure caching structures are set up properly
+ }
+
+ return new_value;
+ }
+
+ event zeek_init()
+ {
+ use_cache_hdlr("Test::use_cache", use_cache);
+ Option::set_change_handler("Test::use_cache", use_cache_hdlr);
+ }
diff --git a/doc/frameworks/denylist.jsonl b/doc/frameworks/denylist.jsonl
new file mode 100644
index 0000000000..1ea6a1851e
--- /dev/null
+++ b/doc/frameworks/denylist.jsonl
@@ -0,0 +1,3 @@
+{"ip": "192.168.17.1", "timestamp": 1333252748, "reason": "Malware host"}
+{"ip": "192.168.27.2", "timestamp": 1330235733, "reason": "Botnet server"}
+{"ip": "192.168.250.3", "timestamp": 1333145108, "reason": "Virus detected"}
diff --git a/doc/frameworks/file-analysis.rst b/doc/frameworks/file-analysis.rst
new file mode 100644
index 0000000000..03f152af49
--- /dev/null
+++ b/doc/frameworks/file-analysis.rst
@@ -0,0 +1,283 @@
+
+.. _file-analysis-framework:
+
+=======================
+File Analysis Framework
+=======================
+
+.. TODO: integrate BoZ revisions
+
+.. rst-class:: opening
+
+ In the past, writing Zeek scripts with the intent of analyzing file
+ content could be cumbersome because of the fact that the content
+ would be presented in different ways, via events, at the
+ script-layer depending on which network protocol was involved in the
+ file transfer. Scripts written to analyze files over one protocol
+ would have to be copied and modified to fit other protocols. The
+ file analysis framework (FAF) instead provides a generalized
+ presentation of file-related information. The information regarding
+ the protocol involved in transporting a file over the network is
+ still available, but it no longer has to dictate how one organizes
+ their scripting logic to handle it. A goal of the FAF is to
+ provide analysis specifically for files that is analogous to the
+ analysis Zeek provides for network connections.
+
+Supported Protocols
+===================
+
+Zeek ships with file analysis for the following protocols:
+:ref:`FTP <plugin-zeek-ftp>`,
+:ref:`HTTP <plugin-zeek-http>`,
+:ref:`IRC <plugin-zeek-irc>`,
+:ref:`Kerberos <plugin-zeek-krb>`,
+:ref:`MIME <plugin-zeek-mime>`,
+:ref:`RDP <plugin-zeek-rdp>`,
+:ref:`SMTP <plugin-zeek-smtp>`, and
+:ref:`SSL/TLS/DTLS <plugin-zeek-ssl>`.
+Protocol analyzers are regular :ref:`Zeek plugins <writing-plugins>`, so users
+are welcome to provide additional ones in separate Zeek packages.
+
+File Lifecycle Events
+=====================
+
+The key events that may occur during the lifetime of a file are:
+:zeek:see:`file_new`, :zeek:see:`file_over_new_connection`,
+:zeek:see:`file_sniff`, :zeek:see:`file_timeout`, :zeek:see:`file_gap`, and
+:zeek:see:`file_state_remove`. Handling any of these events provides
+some information about the file such as which network
+:zeek:see:`connection` and protocol are transporting the file, how many
+bytes have been transferred so far, and its MIME type.
+
+Here's a simple example:
+
+.. literalinclude:: file_analysis_01.zeek
+ :caption:
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+.. code-block:: console
+
+ $ zeek -r http/get.trace file_analysis_01.zeek
+ file_state_remove
+ FakNcS1Jfe01uljb3
+ CHhAvVGS1DHFjwGM9
+ [orig_h=141.142.228.5, orig_p=59856/tcp, resp_h=192.150.187.43, resp_p=80/tcp]
+ HTTP
+ connection_state_remove
+ CHhAvVGS1DHFjwGM9
+ [orig_h=141.142.228.5, orig_p=59856/tcp, resp_h=192.150.187.43, resp_p=80/tcp]
+ HTTP
+
+This doesn't perform any interesting analysis yet, but does highlight
+the similarity between analysis of connections and files. Connections
+are identified by the usual 5-tuple or a convenient UID string while
+files are identified just by a string of the same format as the
+connection UID. So there are unique ways to identify both files and
+connections, and files hold references to the connection (or connections)
+that transported them.
+
+File Type Identification
+========================
+
+Zeek ships with its own library of content signatures to determine the type of a
+file, conveyed as MIME types in the :zeek:see:`file_sniff` event. You can find
+those signatures in the Zeek distribution's ``scripts/base/frameworks/files/magic/``
+directory. (Despite the name, Zeek does *not* rely on libmagic for content analysis.)
+
+Adding Analysis
+===============
+
+Zeek supports customized file analysis via *file analyzers* that users can
+attach to observed files. You can attach analyzers selectively to individual
+files, or register them for auto-attachment under certain conditions. Once
+attached, file analyzers start receiving the contents of files as Zeek parses
+them from ongoing network connections.
+
+Zeek comes with the following built-in analyzers:
+
+ * :ref:`plugin-zeek-filedataevent` to access file content via
+ events (as data streams or content chunks),
+ * :ref:`plugin-zeek-fileentropy` to compute various entropy for a file,
+ * :ref:`plugin-zeek-fileextract` to extract files to disk,
+ * :ref:`plugin-zeek-filehash` to produce common hash values for files,
+ * :ref:`plugin-zeek-pe` to parse executables in PE format, and
+ * :ref:`plugin-zeek-x509` to extract information about x509 certificates.
+
+Like protocol parsers, file analyzers are regular :ref:`Zeek plugins
+<writing-plugins>`. Users are free to contribute additional ones via Zeek
+packages.
+
+Per-file analyzer registration
+------------------------------
+
+To attach an analyzer to a specific file, call :zeek:see:`Files::add_analyzer`
+with the analyzer's component tag (such as :zeek:see:`Files::ANALYZER_MD5`;
+consult the above analyzers for details). Some file analyzers support parameters
+that you can provide to this function via a :zeek:see:`Files::AnalyzerArgs`
+record, while others introduce additional event types and tunable script-layer
+settings.
+
+You can add multiple analyzers to a file, and add the same analyzer type
+multiple times, assuming you use varying :zeek:see:`Files::AnalyzerArgs`
+parameterization. You may remove these selectively from files via calls to
+:zeek:see:`Files::remove_analyzer`. You may also enable and disable file
+analyzers globally by calling :zeek:see:`Files::enable_analyzer` and
+:zeek:see:`Files::disable_analyzer`, respectively.
+
+Generic analyzer registration
+-----------------------------
+
+The framework provides mechanisms for automatically attaching analyzers to
+files. For example, the :zeek:see:`Files::register_for_mime_types` function
+ensures that Zeek automatically attaches a given analyzer to all files of a
+given MIME type. For fully customized auto-attachment logic take a look at
+:zeek:see:`Files::register_analyzer_add_callback`, and refer to
+:doc:`base/frameworks/files/main.zeek </scripts/base/frameworks/files/main.zeek>`
+for additional APIs and data structures.
+
+Regardless of which file analyzers end up acting on a file, general
+information about the file (e.g. size, time of last data transferred,
+MIME type, etc.) is logged in :file:`files.log`.
+
+Protocol-specific state
+-----------------------
+
+Some protocol analyzers redefine the ``fa_file`` record to add additional
+state. For example, ``base/protocols/http/entities.zeek``, which Zeek loads by
+default as part of the HTTP analyzer, makes the transaction's
+:zeek:see:`HTTP::Info` record available via ``f$http`` to provide HTTP
+context. As always, make sure to test the presence of optional fields via the
+``a?$b`` :ref:`record field operator <record-field-operators>` before accessing
+them.
+
+Examples
+--------
+
+File hashing
+^^^^^^^^^^^^
+
+The following script uses the MD5 file analyzer to calculate the hashes of plain
+text files:
+
+.. literalinclude:: file_analysis_02.zeek
+ :caption:
+ :language: zeek
+ :tab-width: 4
+
+.. code-block:: console
+
+ $ zeek -r http/get.trace file_analysis_02.zeek
+ new file, FakNcS1Jfe01uljb3
+ file_hash, FakNcS1Jfe01uljb3, md5, 397168fd09991a0e712254df7bc639ac
+
+File extraction
+^^^^^^^^^^^^^^^
+
+The following example sets up extraction of observed files to disk:
+
+.. code-block:: zeek
+
+ global idx: count = 0;
+
+ event file_new(f: fa_file)
+ {
+ Files::add_analyzer(f, Files::ANALYZER_EXTRACT,
+ [$extract_filename=fmt("file-%04d", ++idx)]);
+ }
+
+The file extraction analyzer now writes the content of each observed file to a
+separate file on disk. The output file name results from concatenating the
+:zeek:see:`FileExtract::prefix` (normally ``./extract_files/``) and the
+enumerated ``file-NNNN`` strings.
+
+In a production setting you'll likely want to include additional information in
+the output, for example from state attached to the provided file record. The
+Zeek distribution ships with a starting point for such approaches: the
+``policy/frameworks/files/extract-all-files.zeek`` script. For additional
+configurability, take a look at the `file-extraction
+<https://github.com/hosom/file-extraction>`_ Zeek package.
+
+Script-level content analysis
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``FileDataEvent`` analyzer provides script-layer access to file content for
+customized analysis. Since observed files can be very large, Zeek cannot buffer
+these files and provide their entire content to the script layer once
+complete. Instead, the ``FileDataEvent`` analyzer reflects the incremental
+nature of file content as Zeek observes it, and supports two types of events to
+allow you to process it: user-provided *stream events* receive new file content
+as supplied by connection-oriented protocols, while *chunk events* receive
+observed data as provided by protocols that do not feature stream semantics.
+
+The following example manually computes the SHA256 hash of each observed file by
+building up hash state and feeding streamed file content into the hash
+computation. When Zeek removes a file's state (because it has fully observed it,
+or perhaps because its state is timing out), it prints the resulting hash to the
+console:
+
+.. code-block:: zeek
+
+ global hashstate: table[string] of opaque of sha256;
+
+ event file_stream(f: fa_file, data: string)
+ {
+ if ( f$id !in hashstate )
+ hashstate[f$id] = sha256_hash_init();
+
+ sha256_hash_update(hashstate[f$id], data);
+ }
+
+ event file_new(f: fa_file)
+ {
+ Files::add_analyzer(f, Files::ANALYZER_DATA_EVENT, [$stream_event=file_stream]);
+ }
+
+ event file_state_remove(f: fa_file)
+ {
+ if ( f$id in hashstate )
+ {
+ print(sha256_hash_finish(hashstate[f$id]));
+ delete hashstate[f$id];
+ }
+ }
+
+Be careful with this approach, as it can quickly prove expensive to route all
+file content through the script layer. Make sure to add the analyzer only for
+relevant files, and consider removing it via :zeek:see:`Files::remove_analyzer`
+when you no longer require content analysis. For performance-critical
+applications a new file analyzer plugin could be a better approach.
+
+Input Framework Integration
+===========================
+
+The FAF comes with a simple way to integrate with the :doc:`Input
+Framework <input>`, so that Zeek can analyze files from external sources
+in the same way it analyzes files that it sees coming over traffic from
+a network interface it's monitoring. It only requires a call to
+:zeek:see:`Input::add_analysis`:
+
+.. literalinclude:: file_analysis_03.zeek
+ :caption:
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+Note that the "source" field of :zeek:see:`fa_file` corresponds to the
+"name" field of :zeek:see:`Input::AnalysisDescription` since that is what
+the input framework uses to uniquely identify an input stream.
+
+Example output of the above script may be:
+
+.. code-block:: console
+
+ $ echo "Hello world" > myfile
+ $ zeek file_analysis_03.zeek
+ new file, FZedLu4Ajcvge02jA8
+ file_hash, FZedLu4Ajcvge02jA8, md5, f0ef7081e1539ac00ef5b761b4fb01b3
+ file_state_remove
+
+Nothing that special, but it at least verifies the MD5 file analyzer
+saw all the bytes of the input file and calculated the checksum
+correctly!
diff --git a/doc/frameworks/file_analysis_01.zeek b/doc/frameworks/file_analysis_01.zeek
new file mode 100644
index 0000000000..a48f8184ad
--- /dev/null
+++ b/doc/frameworks/file_analysis_01.zeek
@@ -0,0 +1,20 @@
+event connection_state_remove(c: connection)
+ {
+ print "connection_state_remove";
+ print c$uid;
+ print c$id;
+ for ( s in c$service )
+ print s;
+ }
+
+event file_state_remove(f: fa_file)
+ {
+ print "file_state_remove";
+ print f$id;
+ for ( cid in f$conns )
+ {
+ print f$conns[cid]$uid;
+ print cid;
+ }
+ print f$source;
+ }
diff --git a/doc/frameworks/file_analysis_02.zeek b/doc/frameworks/file_analysis_02.zeek
new file mode 100644
index 0000000000..fd4f0e775e
--- /dev/null
+++ b/doc/frameworks/file_analysis_02.zeek
@@ -0,0 +1,12 @@
+event file_sniff(f: fa_file, meta: fa_metadata)
+ {
+ if ( ! meta?$mime_type ) return;
+ print "new file", f$id;
+ if ( meta$mime_type == "text/plain" )
+ Files::add_analyzer(f, Files::ANALYZER_MD5);
+ }
+
+event file_hash(f: fa_file, kind: string, hash: string)
+ {
+ print "file_hash", f$id, kind, hash;
+ }
diff --git a/doc/frameworks/file_analysis_03.zeek b/doc/frameworks/file_analysis_03.zeek
new file mode 100644
index 0000000000..3f8aa35d31
--- /dev/null
+++ b/doc/frameworks/file_analysis_03.zeek
@@ -0,0 +1,25 @@
+redef exit_only_after_terminate = T;
+
+event file_new(f: fa_file)
+ {
+ print "new file", f$id;
+ Files::add_analyzer(f, Files::ANALYZER_MD5);
+ }
+
+event file_state_remove(f: fa_file)
+ {
+ print "file_state_remove";
+ Input::remove(f$source);
+ terminate();
+ }
+
+event file_hash(f: fa_file, kind: string, hash: string)
+ {
+ print "file_hash", f$id, kind, hash;
+ }
+
+event zeek_init()
+ {
+ local source: string = "./myfile";
+ Input::add_analysis([$source=source, $name=source]);
+ }
diff --git a/doc/frameworks/index.rst b/doc/frameworks/index.rst
new file mode 100644
index 0000000000..a9ef76f884
--- /dev/null
+++ b/doc/frameworks/index.rst
@@ -0,0 +1,38 @@
+
+==========
+Frameworks
+==========
+
+Zeek includes several software frameworks that provide commonly used
+functionality to the scripting layer. Among other things, these frameworks
+enhance Zeek’s ability to ingest data, structure and filter its outputs, adapt
+settings at runtime, and interact with other components in your network. Most
+frameworks include functionality implemented in Zeek’s core, with
+corresponding data structures and APIs exposed to the script layer.
+
+Some frameworks target relatively specific use cases, while others run in
+nearly every Zeek installation. The logging framework, for example, provides
+the machinery behind all of the Zeek logs covered earlier. Frameworks also
+build on each other, so it’s well worth knowing their capabilities. The next
+sections cover them in detail.
+
+.. toctree::
+ :maxdepth: 1
+
+ broker
+ cluster
+ configuration
+ file-analysis
+ input
+ intel
+ logging
+ management
+ netcontrol
+ notice
+ packet-analysis
+ signatures
+ storage
+ sumstats
+ supervisor
+ telemetry
+ tls-decryption
diff --git a/doc/frameworks/input.rst b/doc/frameworks/input.rst
new file mode 100644
index 0000000000..994eec01eb
--- /dev/null
+++ b/doc/frameworks/input.rst
@@ -0,0 +1,640 @@
+
+.. _framework-input:
+
+===============
+Input Framework
+===============
+
+Zeek features a flexible input framework that allows users to import arbitrary
+data into Zeek. Data is either read into Zeek tables or directly converted to
+events for scripts to handle as they see fit. A modular reader architecture
+allows reading from files, databases, or other data sources.
+
+This chapter gives an overview of how to use the input framework, with
+examples. For more complex scenarios take a look at the test cases in
+:file:`testing/btest/scripts/base/frameworks/input/` in the Zeek distribution.
+
+.. note::
+
+ The input framework has no awareness of Zeek’s cluster architecture. Zeek
+ supports all of the mechanisms covered below on any cluster node. The config
+ and intelligence frameworks both leverage the input framework, adding logic
+ that applies the input framework on the manager node, distributing ingested
+ information across the cluster via events.
+
+Reading Data into Tables
+========================
+
+Probably the most interesting use-case of the input framework is to read data
+into a Zeek table. By default, the input framework reads the data in the same
+format as it is written by Zeek’s logging framework: a tab-separated ASCII
+file.
+
+We will show the ways to read files into Zeek with a simple example. For this
+example we assume that we want to import data from a denylist that contains
+server IP addresses as well as the timestamp and the reason for the block.
+
+An example input file could look like this (note that all fields must be
+tab-separated)::
+
+ #fields ip timestamp reason
+ 192.168.17.1 1333252748 Malware host
+ 192.168.27.2 1330235733 Botnet server
+ 192.168.250.3 1333145108 Virus detected
+
+To read a file into a Zeek table, two record types have to be defined. One
+contains the types and names of the columns that should constitute the table
+keys, and the second contains the types and names of the columns that should
+constitute the table values.
+
+In our case, we want to be able to look up IPs. Hence, our key record only
+contains the server IP. All other elements should be stored as the table
+content.
+
+.. code-block:: zeek
+
+ type Idx: record {
+ ip: addr;
+ };
+
+ type Val: record {
+ timestamp: time;
+ reason: string;
+ };
+
+Note that the names of the fields in the record definitions must correspond to
+the column names listed in the ``#fields`` line of the input file, in this case
+``ip``, ``timestamp``, and ``reason``. Also note that the ordering of the
+columns does not matter, because each column is identified by name.
+
+The input file is read into the table with a call of the
+:zeek:see:`Input::add_table` function:
+
+.. code-block:: zeek
+
+ global denylist: table[addr] of Val = table();
+
+ event zeek_init() {
+ Input::add_table([$source="denylist.file", $name="denylist",
+ $idx=Idx, $val=Val, $destination=denylist]);
+ Input::remove("denylist");
+ }
+
+With these three lines we first create an empty table that should receive the
+denylist data and then instruct the input framework to open an input stream
+named “denylist” to read the data into the table. The third line removes the
+input stream again, because we do not need it any more after the data has been
+read.
+
+Note that while the key and content records may use :zeek:attr:`&optional`
+fields, omitting columns (usually via the "-" character) requires care. Since
+the key record's columns expand into a list of values for indexing into the
+receiving table (note how in the above example ``denylist`` is indexed via a
+plain ``addr``) and all of those values must be present for indexing, you cannot
+in practice omit these values. For content records, omitting is meaningful, but
+only permitted for columns with the :zeek:attr:`&optional` attribute. The
+framework skips offending input lines with a warning.
+
+.. note::
+
+ Prior to version 4.1 Zeek accepted such inputs, unsafely. When transitioning
+ from such versions to Zeek 4.1 or newer, users with omitted fields in their
+ input data may observe discrepancies in the loaded data sets.
+
+Asynchronous processing
+-----------------------
+
+Since some data files might be rather large, the input framework works
+asynchronously. A new thread is created for each new input stream. This thread
+opens the input data file, converts the data into an internal format and sends
+it back to the main Zeek thread. Because of this, the data is not immediately
+accessible. Depending on the size of the data source it might take from a few
+milliseconds up to a few seconds until all data is present in the table. Please
+note that this means that when Zeek is running without an input source or on
+very short captured files, it might terminate before the data is present in the
+table (because Zeek already handled all packets before the import thread
+finished).
+
+Subsequent calls to an input source are queued until the previous action has
+been completed. Because of this it is, for example, possible to call
+:zeek:see:`Input::add_table` and :zeek:see:`Input::remove` in two subsequent
+lines: the remove action will remain queued until the first read has been
+completed.
+
+Once the input framework finishes reading from a data source, it fires the
+:zeek:see:`Input::end_of_data` event. Once this event has been received all
+data from the input file is available in the table.
+
+.. code-block:: zeek
+
+ event Input::end_of_data(name: string, source: string) {
+ # now all data is in the table
+ print denylist;
+ }
+
+The table can be used while the data is still being read — it just might not
+contain all lines from the input file before the event has fired. After the
+table has been populated it can be used like any other Zeek table and denylist
+entries can easily be tested:
+
+.. code-block:: zeek
+
+ if ( 192.168.18.12 in denylist )
+ # take action
+
+
+Sets instead of tables
+----------------------
+
+For some use cases the key/value notion that drives tabular data does not
+apply, for example when the main purpose of the data is to test for membership
+in a set. The input framework supports this approach by using sets as the
+destination data type, and omitting ``$val`` in :zeek:see:`Input::add_table`:
+
+.. code-block:: zeek
+
+ type Idx: record {
+ ip: addr;
+ };
+
+ global denylist: set[addr] = set();
+
+ event zeek_init() {
+ Input::add_table([$source="denylist.file", $name="denylist",
+ $idx=Idx, $destination=denylist]);
+ Input::remove("denylist");
+ }
+
+Re-reading and streaming data
+-----------------------------
+
+For some data sources (such as many denylists), the input data changes
+continually. The input framework supports additional techniques to manage such
+ever-changing input.
+
+The first, very basic method is an explicit refresh of an input stream. When an
+input stream is open (meaning it has not yet been removed by a call to
+:zeek:see:`Input::remove`), the function :zeek:see:`Input::force_update` can be
+called. This will trigger a complete refresh of the table: any changed elements
+from the file will be updated, new ones added, and any elements no longer in
+the input data get removed. After the update is finished the
+:zeek:see:`Input::end_of_data` event will be raised.
+
+In our example the call would look as follows:
+
+.. code-block:: zeek
+
+ Input::force_update("denylist");
+
+Alternatively, the input framework can automatically refresh the table contents
+when it detects a change to the input file. To use this feature you need to
+specify a non-default read mode by setting the mode option of the
+:zeek:see:`Input::add_table` call. Valid values are :zeek:see:`Input::MANUAL`
+(the default), :zeek:see:`Input::REREAD`, and :zeek:see:`Input::STREAM`. For
+example, setting the value of the mode option in the previous example would
+look like this:
+
+.. code-block:: zeek
+
+ Input::add_table([$source="denylist.file", $name="denylist",
+ $idx=Idx, $val=Val, $destination=denylist,
+ $mode=Input::REREAD]);
+
+When using the reread mode (i.e., ``$mode=Input::REREAD``), Zeek continually
+checks if the input file has been changed. If the file has been changed, it is
+re-read and the data in the Zeek table is updated to reflect the current state.
+Each time a change has been detected and all the new data has been read into
+the table, the :zeek:see:`Input::end_of_data` event is raised.
+
+When using the streaming mode (i.e., ``$mode=Input::STREAM``), Zeek
+assumes that the input is an append-only file to which new data is
+continually appended. Zeek also checks to see if the file being
+followed has been renamed or rotated. The file is closed and reopened
+when tail detects that the filename being read from has a new inode
+number. Zeek continually checks for new data at the end of the file
+and will add the new data to the table. If newer lines in the file
+have the same table index as previous lines, they will overwrite
+the values in the output table. Because of the nature of streaming
+reads (data is continually added to the table), the
+:zeek:see:`Input::end_of_data` event is never raised when using
+streaming reads.
+
+.. tip::
+
+ Change detection happens via periodic “heartbeat” events, defaulting to a
+ frequency of once per second as defined by the global
+ :zeek:see:`Threading::heartbeat_interval` constant. The reader considers the
+ input file changed when the file’s inode or modification time has changed
+ since the last check.
+
+Receiving change events
+-----------------------
+
+When re-reading files, it might be interesting to know exactly which lines in
+the source files have changed. For this reason, the input framework can raise
+an event each time when a data item is added to, removed from, or changed in a
+table.
+
+The event definition looks like this (note that you can change the name of this
+event in your own Zeek script):
+
+.. code-block:: zeek
+
+ event entry(description: Input::TableDescription, tpe: Input::Event,
+ left: Idx, right: Val) {
+ # do something here...
+ print fmt("%s = %s", left, right);
+ }
+
+The event must be specified in ``$ev`` in the :zeek:see:`Input::add_table`
+call:
+
+.. code-block:: zeek
+
+ Input::add_table([$source="denylist.file", $name="denylist",
+ $idx=Idx, $val=Val, $destination=denylist,
+ $mode=Input::REREAD, $ev=entry]);
+
+The description argument of the event contains the arguments that were
+originally supplied to the :zeek:see:`Input::add_table` call. Hence, the name
+of the stream can, for example, be accessed with ``description$name``. The
+``tpe`` argument of the event is an enum containing the type of the change that
+occurred.
+
+If a line that was not previously present in the table has been added, then the
+value of ``tpe`` will be :zeek:see:`Input::EVENT_NEW`. In this case left
+contains the index of the added table entry and right contains the values of
+the added entry.
+
+If a table entry that already was present is altered during the re-reading or
+streaming read of a file, then the value of ``tpe`` will be
+:zeek:see:`Input::EVENT_CHANGED`. In this case ``left`` contains the index of
+the changed table entry and ``right`` contains the values of the entry before
+the change. The reason for this is that the table already has been updated when
+the event is raised. The current value in the table can be ascertained by
+looking up the current table value. Hence it is possible to compare the new and
+the old values of the table.
+
+If a table element is removed because it was no longer present during a
+re-read, then the value of ``tpe`` will be :zeek:see:`Input::EVENT_REMOVED`. In
+this case ``left`` contains the index and ``right`` the values of the removed
+element.
+
+Filtering data during import
+----------------------------
+
+The input framework also allows a user to filter the data during the import. To
+this end, predicate functions are used. A predicate function is called before a
+new element is added/changed/removed from a table. The predicate can either
+accept or veto the change by returning true for an accepted change and false
+for a rejected change. Furthermore, it can alter the data before it is written
+to the table.
+
+The following example filter will reject adding entries to the table when they
+were generated over a month ago. It will accept all changes and all removals of
+values that are already present in the table.
+
+.. code-block:: zeek
+
+ Input::add_table([$source="denylist.file", $name="denylist",
+ $idx=Idx, $val=Val, $destination=denylist,
+ $mode=Input::REREAD,
+ $pred(tpe: Input::Event, left: Idx, right: Val) = {
+ if ( tpe != Input::EVENT_NEW ) {
+ return T;
+ }
+ return (current_time() - right$timestamp) < 30day;
+ }]);
+
+To change elements while they are being imported, the predicate function can
+manipulate ``left`` and ``right``. Note that predicate functions are called
+before the change is committed to the table. Hence, when a table element is
+changed (``tpe`` is :zeek:see:`Input::EVENT_CHANGED`), ``left`` and ``right``
+contain the new values, but the destination (``denylist`` in our example) still
+contains the old values. This allows predicate functions to examine the changes
+between the old and the new version before deciding if they should be allowed.
+
+Broken input data
+-----------------
+
+The input framework notifies you of problems during data ingestion in two ways.
+First, reporter messages, ending up in :file:`reporter.log`, indicate the type of
+problem and the file in which the problem occurred::
+
+ #fields ts level message location
+ 0.000000 Reporter::WARNING denylist.file/Input::READER_ASCII: Did not find requested field ip in input data file denylist.file. (empty)
+
+Second, the :zeek:see:`Input::TableDescription` and
+:zeek:see:`Input::EventDescription` records feature an ``$error_ev`` member to
+trigger events indicating the same message and severity levels as shown above.
+The use of these events mirrors that of change events.
+
+For both approaches, the framework suppresses repeated messages regarding the
+same file, so mistakes in large data files do not trigger a message flood.
+
+Finally, the ASCII reader allows coarse control over the robustness in case of
+problems during data ingestion. Concretely, the
+:zeek:see:`InputAscii::fail_on_invalid_lines` and
+:zeek:see:`InputAscii::fail_on_file_problem` flags indicate whether problems
+should merely trigger warnings or lead to processing failure. Both default to
+warnings.
+
+Reading Data to Events
+======================
+
+The second data ingestion mode of the input framework directly generates Zeek
+events from ingested data instead of inserting them to a table. Event streams
+work very similarly to the table streams discussed above, and most of the
+features discussed (such as predicates for filtering) also work for event
+streams. To read the denylist of the previous example into an event stream, we
+use the :zeek:see:`Input::add_event` function:
+
+.. code-block:: zeek
+
+ type Val: record {
+ ip: addr;
+ timestamp: time;
+ reason: string;
+ };
+
+ event denylistentry(description: Input::EventDescription,
+ tpe: Input::Event, data: Val) {
+ # do something here...
+ print "data:", data;
+ }
+
+ event zeek_init() {
+ Input::add_event([$source="denylist.file", $name="denylist",
+ $fields=Val, $ev=denylistentry]);
+ }
+
+Event streams differ from table streams in two ways:
+
+* An event stream needs no separate index and value declarations — instead, all
+ source data types are provided in a single record definition.
+* Since the framework perceives a continuous stream of events, it has no
+ concept of a data baseline (e.g. a table) to compare the incoming data to.
+ Therefore the change event type (an :zeek:see:`Input::Event` instance,
+ ``tpe`` in the above) is currently always :zeek:see:`Input::EVENT_NEW`.
+
+These aside, event streams work exactly the same as table streams and support
+most of the options that are also supported for table streams.
+
+Data Readers
+============
+
+The input framework supports different kinds of readers for different kinds of
+source data files. At the moment, the framework defaults to ingesting ASCII
+files formatted in the Zeek log file format (tab-separated values with a
+``#fields`` header line). Several other readers are included in Zeek, and Zeek
+packages/plugins can provide additional ones.
+
+Reader selection proceeds as follows. The :zeek:see:`Input::default_reader`
+variable defines the default reader: :zeek:see:`Input::READER_ASCII`. When you
+call :zeek:see:`Input::add_table` or :zeek:see:`Input::add_event` this reader
+gets used automatically. You can override the default by assigning the
+``$reader`` member in the description record passed into these calls. See test
+cases in :file:`testing/btest/scripts/base/frameworks/input/` for examples.
+
+The ASCII Reader
+----------------
+
+The ASCII reader, enabled by default or by selecting
+:zeek:see:`Input::READER_ASCII`, understands Zeek’s TSV log format. It actually
+understands the full set of directives in the preamble of those log files, e.g.
+to define the column separator. This is rarely used, and most commonly input
+files merely start with a tab-separated row that names the ``#fields`` in the
+input file, as shown earlier.
+
+.. warning::
+
+ The ASCII reader has no notion of file locking, including UNIX’s advisory
+ locking. For large files, this means the framework might process a file
+ that’s still written to. The reader handles resulting errors robustly (e.g.
+ via the reporter log, as described earlier), but nevertheless will encounter
+ errors. In order to avoid these problems it’s best to produce a new input
+ file on the side, and then atomically rename it to the filename monitored by
+ the framework.
+
+There’s currently no JSON ingestion mode for this reader, but see the section
+about using the :ref:`raw reader <input-raw-reader>` together with the
+builtin :zeek:see:`from_json` function.
+
+The Benchmark Reader
+--------------------
+
+The benchmark reader, selected via :zeek:see:`Input::READER_BENCHMARK`, helps
+the Zeek developers optimize the speed of the input framework. It can generate
+arbitrary amounts of semi-random data in all Zeek data types supported by the
+input framework.
+
+The Binary Reader
+-----------------
+
+This reader, selected via :zeek:see:`Input::READER_BINARY`, is intended for
+use with file analysis input streams to ingest file content (and is the default
+type of reader for those streams).
+
+.. _input-raw-reader:
+
+The Raw Reader
+--------------
+
+The raw reader, selected via :zeek:see:`Input::READER_RAW`, reads a file that
+is split by a specified record separator (newline by default). The contents are
+returned line-by-line as strings; it can, for example, be used to read
+configuration files and the like and is probably only useful in the event mode
+and not for reading data to tables.
+
+Reading JSON Lines
+~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 6.0
+
+
+While the ASCII reader does not currently support JSON natively, it is
+possible to use the raw reader together with the builtin :zeek:see:`from_json`
+function to read files in JSON lines format and instantiate Zeek record
+values based on the input.
+
+The following example shows how this can be done, holding two state tables
+in order to allow for removal updates of the read data.
+
+.. literalinclude:: denylist.jsonl
+ :caption:
+ :language: json
+ :linenos:
+ :tab-width: 4
+
+.. literalinclude:: input_json_1.zeek
+ :caption: Loading denylist.jsonl, converting to Zeek types, populating a table.
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+If your input data is already in, or can be easily converted into, JSON Lines format
+the above approach can be used to load it into Zeek.
+
+.. _input-sqlite-reader:
+
+The SQLite Reader
+-----------------
+
+The SQLite input reader, selected via :zeek:see:`Input::READER_SQLITE`,
+provides a way to access SQLite databases from Zeek. SQLite is a simple,
+file-based, widely used SQL database system. Due to the transactional nature of
+SQLite, databases can be used by several applications simultaneously. Hence
+they can, for example, be used to make constantly evolving datasets available
+to Zeek on a continuous basis.
+
+Reading Data from SQLite Databases
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Like with Zeek’s logging support, reading data from SQLite databases is built
+into Zeek without any extra configuration needed. Just like text-based input
+readers, the SQLite reader can read data — in this case the result of SQL
+queries — into tables or events.
+
+Reading Data into Tables
+************************
+
+To read data from a SQLite database, we first have to provide Zeek with the
+information how the resulting data will be structured. For this example, we
+expect that we have a SQLite database, which contains host IP addresses and the
+user accounts that are allowed to log into a specific machine.
+
+The SQLite commands to create the schema are as follows::
+
+ create table machines_to_users (
+ host text unique not null,
+ users text not null);
+
+ insert into machines_to_users values (
+ '192.168.17.1', 'johanna,matthias,seth');
+ insert into machines_to_users values (
+ '192.168.17.2', 'johanna');
+ insert into machines_to_users values (
+ '192.168.17.3', 'seth,matthias');
+
+After creating a file called hosts.sqlite with this content, we can read the
+resulting table into Zeek:
+
+.. code-block:: zeek
+
+ type Idx: record {
+ host: addr;
+ };
+
+ type Val: record {
+ users: set[string];
+ };
+
+ global hostslist: table[addr] of Val = table();
+
+ event zeek_init()
+ {
+ Input::add_table([$source="/var/db/hosts",
+ $name="hosts",
+ $idx=Idx,
+ $val=Val,
+ $destination=hostslist,
+ $reader=Input::READER_SQLITE,
+ $config=table(["query"] = "select * from machines_to_users;")
+ ]);
+
+ Input::remove("hosts");
+ }
+
+ event Input::end_of_data(name: string, source: string)
+ {
+ if ( name != "hosts" )
+ return;
+
+ # now all data is in the table
+ print "Hosts list has been successfully imported";
+
+ # List the users of one host.
+ print hostslist[192.168.17.1]$users;
+ }
+
+The ``hostslist`` table can now be used to check host logins against an
+available user list.
+
+Turning Data into Events
+************************
+
+The second mode is to use the SQLite reader to output the input data as events.
+Typically there are two reasons to do this. First, the structure of the input
+data is too complicated for a direct table import. In this case, the data can
+be read into an event which can then create the necessary data structures in
+Zeek in scriptland. Second, the dataset is too big to hold in memory. In this
+case, event-driven ingestion can perform checks on-demand.
+
+As an example, let’s consider a large database with malware hashes. Live
+database queries allow us to cross-check sporadically occurring downloads
+against this evolving database. The SQLite commands to create the schema are as
+follows::
+
+ create table malware_hashes (
+ hash text unique not null,
+ description text not null);
+
+ insert into malware_hashes values ('86f7e437faa5a7fce15d1ddcb9eaeaea377667b8', 'malware a');
+ insert into malware_hashes values ('e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98', 'malware b');
+ insert into malware_hashes values ('84a516841ba77a5b4648de2cd0dfcb30ea46dbb4', 'malware c');
+ insert into malware_hashes values ('3c363836cf4e16666669a25da280a1865c2d2874', 'malware d');
+ insert into malware_hashes values ('58e6b3a414a1e090dfc6029add0f3555ccba127f', 'malware e');
+ insert into malware_hashes values ('4a0a19218e082a343a1b17e5333409af9d98f0f5', 'malware f');
+ insert into malware_hashes values ('54fd1711209fb1c0781092374132c66e79e2241b', 'malware g');
+ insert into malware_hashes values ('27d5482eebd075de44389774fce28c69f45c8a75', 'malware h');
+ insert into malware_hashes values ('73f45106968ff8dc51fba105fa91306af1ff6666', 'ftp-trace');
+
+The following code uses the file-analysis framework to get the sha1 hashes of
+files that are transmitted over the network. For each hash, a SQL-query runs
+against SQLite. If the query returns a result, we output the matching hash.
+
+.. code-block:: zeek
+
+ @load frameworks/files/hash-all-files
+
+ type Val: record {
+ hash: string;
+ description: string;
+ };
+
+ event line(description: Input::EventDescription, tpe: Input::Event, r: Val)
+ {
+ print fmt("malware-hit with hash %s, description %s", r$hash, r$description);
+ }
+
+ global malware_source = "/var/db/malware";
+
+ event file_hash(f: fa_file, kind: string, hash: string)
+ {
+
+ # check all sha1 hashes
+ if ( kind=="sha1" )
+ {
+ Input::add_event(
+ [
+ $source=malware_source,
+ $name=hash,
+ $fields=Val,
+ $ev=line,
+ $want_record=T,
+ $config=table(
+ ["query"] = fmt("select * from malware_hashes where hash='%s';", hash)
+ ),
+ $reader=Input::READER_SQLITE
+ ]);
+ }
+ }
+
+ event Input::end_of_data(name: string, source:string)
+ {
+ if ( source == malware_source )
+ Input::remove(name);
+ }
+
+If you run this script against the trace in
+:file:`testing/btest/Traces/ftp/ipv4.trace`, you will get one hit.
diff --git a/doc/frameworks/input_json_1.zeek b/doc/frameworks/input_json_1.zeek
new file mode 100644
index 0000000000..8a829e4375
--- /dev/null
+++ b/doc/frameworks/input_json_1.zeek
@@ -0,0 +1,56 @@
+## Read a denylist.jsonl file in JSON Lines format
+module Denylist;
+
+type JsonLine: record {
+ s: string;
+};
+
+type Entry: record {
+ ip: addr;
+ timestamp: time;
+ reason: string;
+};
+
+global staged_denies: table[addr] of Entry;
+global active_denies: table[addr] of Entry;
+
+event Input::end_of_data(name: string, source: string)
+ {
+ if ( name != "denylist" )
+ return;
+
+ # Switch active and staging tables when input file has been read.
+ active_denies = staged_denies;
+ staged_denies = table();
+
+ print network_time(), "end_of_data() active:", table_keys(active_denies);
+ }
+
+
+event Denylist::json_line(description: Input::EventDescription, tpe: Input::Event, l: string)
+ {
+ local parse_result = from_json(l, Entry);
+
+ # Parsing of JSON may fail, so ignore anything invalid.
+ if ( ! parse_result$valid )
+ return;
+
+ # Cast parsed value as Entry...
+ local entry = parse_result$v as Entry;
+
+ # ...and populate staging table.
+ staged_denies[entry$ip] = entry;
+ }
+
+event zeek_init()
+ {
+ Input::add_event([
+ $source="denylist.jsonl",
+ $name="denylist",
+ $reader=Input::READER_RAW,
+ $mode=Input::REREAD,
+ $fields=JsonLine,
+ $ev=Denylist::json_line,
+ $want_record=F,
+ ]);
+ }
diff --git a/doc/frameworks/intel.rst b/doc/frameworks/intel.rst
new file mode 100644
index 0000000000..dd7500f4c7
--- /dev/null
+++ b/doc/frameworks/intel.rst
@@ -0,0 +1,270 @@
+
+======================
+Intelligence Framework
+======================
+
+Introduction
+============
+
+The goals of Zeek’s Intelligence Framework are to consume intelligence data,
+make it available for matching, and provide infrastructure to improve
+performance and memory utilization.
+
+Data in the Intelligence Framework is an atomic piece of intelligence such as
+an IP address or an e-mail address. This atomic data will be packed with
+metadata such as a freeform source field, a freeform descriptive field, and a
+URL which might lead to more information about the specific item. The metadata
+in the default scripts has been deliberately kept to a minimum.
+
+Quick Start
+===========
+
+First we need to define the intelligence data to match. Let's look for the
+domain ``www.reddit.com``. For the details of the file format see the
+:ref:`Loading Intelligence ` section below.
+
+::
+
+ #fields indicator indicator_type meta.source
+ www.reddit.com Intel::DOMAIN my_special_source
+
+Now we need to tell Zeek about the data. Add this line to your local.zeek to
+load an intelligence file:
+
+.. code-block:: zeek
+
+ redef Intel::read_files += { "/somewhere/yourdata.txt" };
+
+In a cluster, the text files only need to reside on the manager.
+
+Add the following line to :file:`local.zeek` to load the scripts that send
+“seen” data into the Intelligence Framework to be checked against the loaded
+intelligence data:
+
+.. code-block:: zeek
+
+ @load frameworks/intel/seen
+
+Intelligence data matches will be logged to the :file:`intel.log` file. A match
+on ``www.reddit.com`` might look like this::
+
+ {
+ "ts":1320279566.452687,
+ "uid":"C4llPsinsviGyNY45",
+ "id.orig_h":"192.168.2.76",
+ "id.orig_p":52026,
+ "id.resp_h":"132.235.215.119",
+ "id.resp_p":80,
+ "seen.indicator":"www.reddit.com",
+ "seen.indicator_type":"Intel::DOMAIN",
+ "seen.where":"HTTP::IN_HOST_HEADER",
+ "seen.node":"zeek",
+ "matched":[
+ "Intel::DOMAIN"
+ ],
+ "sources":[
+ "my_special_source"
+ ]}
+
+You can explore this example on `try.zeek.org
+<https://try.zeek.org>`_.
+
+Architecture
+============
+
+The Intelligence Framework can be thought of as containing three separate
+portions. The first part involves loading intelligence data. The second is a
+mechanism for indicating to the intelligence framework that a piece of data
+which needs to be checked has been seen. The third handles when a positive
+match has been discovered.
+
+.. image:: /images/intel-architecture.png
+ :align: center
+
+The figure above depicts how these portions work together: loading intelligence
+*inserts* the data into an in-memory data store that is managed by the
+intelligence framework. During traffic analysis, scripts report the *seen* data
+to the framework to check against the loaded items.
+
+.. _loading-intelligence:
+
+Loading Intelligence
+--------------------
+
+By default, intelligence data is loaded through plain text files using the
+Input Framework. In clusters the manager is the only node that needs the
+intelligence data. The intelligence framework has distribution mechanisms which
+will push data out to all of the nodes that need it.
+
+Here is an example of the intelligence data format. All fields must be
+separated by a single tab character and fields containing only a hyphen are
+considered to be null values. Note that there may be additional fields
+depending on the loaded extensions. One example is the
+:doc:`/scripts/policy/frameworks/intel/do_notice.zeek` script as described
+below.
+
+::
+
+ #fields indicator indicator_type meta.source meta.desc meta.url
+ 1.2.3.4 Intel::ADDR source1 Sending phishing email http://source1.com/badhosts/1.2.3.4
+ a.b.com Intel::DOMAIN source2 Name used for data exfiltration -
+
+For a list of all built-in ``indicator_type`` values, please refer to the
+documentation of :zeek:see:`Intel::Type`.
+
+To load the data once the files are created, add the following to your
+``local.zeek`` to specify which intel files to load (with your own file names
+of course):
+
+.. code-block:: zeek
+
+ redef Intel::read_files += {
+ "/somewhere/feed1.txt",
+ "/somewhere/feed2.txt",
+ };
+
+Remember, the files only need to be present on the file system of the manager
+node on cluster deployments.
+
+The intel framework is very flexible so that intelligence matching can be
+extended in numerous ways. For example, the
+:doc:`/scripts/policy/frameworks/intel/do_notice.zeek`
+script implements a
+simple mechanism to raise a Zeek notice (of type :zeek:see:`Intel::Notice`) for
+user-specified intelligence matches. To use this feature, add the following
+line to ``local.zeek``:
+
+.. code-block:: zeek
+
+ @load frameworks/intel/do_notice
+
+The script adds additional metadata fields. In particular, if the ``do_notice``
+field of type bool is set to ``T`` for an intelligence item, Zeek will create a
+notice when the item is matched.
+
+Seen Data
+---------
+
+When some bit of data is extracted from network traffic (such as an email
+address in the “From” header in a SMTP message), the Intelligence Framework
+needs to be informed that this data was discovered so that its presence will be
+checked within the loaded intelligence data. This is accomplished through the
+:zeek:see:`Intel::seen` function.
+
+Zeek includes a default set of scripts that will send data to the intelligence
+framework. To load all of the scripts included with Zeek for sending “seen”
+data to the intelligence framework, just add this line to ``local.zeek``:
+
+.. code-block:: zeek
+
+ @load frameworks/intel/seen
+
+Alternatively, specific scripts in that directory can be loaded. Keep in mind
+that as more data is sent to the intelligence framework, the CPU load consumed
+by Zeek will increase depending on how many times the :zeek:see:`Intel::seen`
+function is being called. The effect of this condition depends on the nature
+and volume of the traffic Zeek monitors.
+
+Zeek's intelligence framework can only match loaded items if corresponding
+occurrences are reported as *seen*. For example, the scripts included with Zeek
+will only report IP addresses from established TCP connections to the
+intelligence framework. Thus, neither UDP traffic nor one-sided traffic will
+trigger intelligence hits by default. However, it is easy to report additional
+observations to the framework. The following will report the IPs of all
+connections (including ICMP, UDP and one-sided traffic) to the intelligence
+framework:
+
+.. code-block:: zeek
+
+ event new_connection(c: connection)
+ {
+ Intel::seen([$host=c$id$orig_h, $conn=c, $where=Conn::IN_ORIG]);
+ Intel::seen([$host=c$id$resp_h, $conn=c, $where=Conn::IN_RESP]);
+ }
+
+Note that using the :zeek:see:`new_connection` event could have a significant
+impact on the overall performance as much more data might be processed by the
+intelligence framework.
+
+Intelligence Matches
+--------------------
+
+The Intelligence Framework provides an event that is generated whenever a match
+is discovered. This event is named :zeek:see:`Intel::match` and receives two
+arguments. First, a record of type :zeek:see:`Intel::Seen` that describes the
+observation as reported to the framework. It contains information about what
+was seen (e.g., the domain ``www.slideshare.net``), where it was seen (e.g. in
+an X509 certificate) and further context (e.g., a connection or a file record)
+if available. The second argument is a set of intelligence items that matched
+the observation. A set is used because multiple items may match a given
+observation. For example, assume you have ingested the IP ``1.2.3.4`` from
+source A and from source B as well as the subnet ``1.2.3.0/24`` from source B.
+If the IP ``1.2.3.4`` is seen in your traffic, the match event will receive all
+three intelligence items.
+
+In a cluster setup, the match event is raised on the manager. This is important
+to keep in mind when writing a script that handles the event. While the context
+information about the match is available through the event parameters, the
+handler itself is executed on the manager. Thus, one cannot access any state
+that is local to the worker node that reported the observation in the first
+place. Other interaction is also limited. For example, one cannot reliably
+trigger file extraction based on an intelligence hit: Once the manager
+processes the match event and comes to the conclusion that file extraction
+would be desired, the worker that triggered the hit is most likely done
+processing the corresponding data. Instead, one would need to start by
+extracting all files that are potentially relevant, keep the ones that refer to
+an intelligence hit and regularly discard the others.
+
+Intelligence matches are logged to the :file:`intel.log` file. For further
+description of each field in that file, see the documentation for the
+:zeek:see:`Intel::Info` record.
+
+The following are two matches from a sample :file:`intel.log`::
+
+ {
+ "ts": "2019-03-12T18:22:19.252191Z",
+ "uid": "Cpue7J1KNReqCodXHc",
+ "id.orig_h": "192.168.4.6",
+ "id.orig_p": 64738,
+ "id.resp_h": "13.107.18.13",
+ "id.resp_p": 443,
+ "seen.indicator": "www.slideshare.net",
+ "seen.indicator_type": "Intel::DOMAIN",
+ "seen.where": "X509::IN_CERT",
+ "seen.node": "so16-enp0s8-1",
+ "matched": [
+ "Intel::DOMAIN"
+ ],
+ "sources": [
+ "from http://hosts-file.net/fsa.txt via intel.criticalstack.com"
+ ],
+ "fuid": "FnRp0j1YMig5KhcMDg",
+ "file_mime_type": "application/x-x509-user-cert",
+ "file_desc": "13.107.18.13:443/tcp"
+ }
+ {
+ "ts": "2019-03-12T18:32:19.821962Z",
+ "uid": "CvusFJ2HdbTnCLxEUa",
+ "id.orig_h": "192.168.4.6",
+ "id.orig_p": 64826,
+ "id.resp_h": "13.107.42.14",
+ "id.resp_p": 443,
+ "seen.indicator": "www.slideshare.net",
+ "seen.indicator_type": "Intel::DOMAIN",
+ "seen.where": "X509::IN_CERT",
+ "seen.node": "so16-enp0s8-1",
+ "matched": [
+ "Intel::DOMAIN"
+ ],
+ "sources": [
+ "from http://hosts-file.net/fsa.txt via intel.criticalstack.com"
+ ],
+ "fuid": "FUrrLa45T7a8hjdRy",
+ "file_mime_type": "application/x-x509-user-cert",
+ "file_desc": "13.107.42.14:443/tcp"
+ }
+
+These examples show there were matches in a domain observed in an X509
+certificate. That domain was ``www.slideshare.net``. This is unusual as that
+domain is used for legitimate purposes. This example demonstrates that analysts
+must vet intelligence feeds for their local use and applicability.
diff --git a/doc/frameworks/logging-input-sqlite.rst b/doc/frameworks/logging-input-sqlite.rst
new file mode 100644
index 0000000000..a785e99b58
--- /dev/null
+++ b/doc/frameworks/logging-input-sqlite.rst
@@ -0,0 +1,8 @@
+:orphan:
+
+====================
+SQLite Input/Logging
+====================
+
+* :ref:`SQLite Input Reader <input-sqlite>`
+* :ref:`SQLite Log Writer <logging-sqlite>`
diff --git a/doc/frameworks/logging.rst b/doc/frameworks/logging.rst
new file mode 100644
index 0000000000..b764a730cc
--- /dev/null
+++ b/doc/frameworks/logging.rst
@@ -0,0 +1,1087 @@
+
+.. _framework-logging:
+
+=================
+Logging Framework
+=================
+
+Zeek comes with a flexible logging interface that allows fine-grained control of
+what gets logged and how it is logged. This section explains how you can use
+this framework to customize and extend your logs.
+
+Terminology
+===========
+
+Zeek’s logging interface is built around three main abstractions:
+
+ Streams
+ A log stream corresponds to a single log. It defines the set of fields that
+ a log consists of with their names and types. Examples are the conn stream
+ for recording connection summaries, and the http stream for recording HTTP
+ activity.
+
+ Filters
+ Each stream has a set of filters attached to it that determine what
+ information gets written out, and how. By default, each stream has one
+ default filter that just logs everything directly to disk. However,
+ additional filters can be added to record only a subset of the log records,
+ write to different outputs, or set a custom rotation interval. If all
+ filters are removed from a stream, then output is disabled for that stream.
+
+ Writers
+ Each filter has a writer. A writer defines the actual output format for the
+ information being logged. The default writer is the ASCII writer, which
+ produces tab-separated ASCII files. Other writers are available, like for
+ binary output or direct logging into a database.
+
+There are several different ways to customize Zeek’s logging: you can create a
+new log stream, you can extend an existing log with new fields, you can apply
+filters to an existing log stream, or you can customize the output format by
+setting log writer options. All of these approaches are described below.
+
+Streams
+=======
+
+In order to log data to a new log stream, all of the following needs to be done:
+
+* A :zeek:see:`record` type must be defined which consists of all the fields
+ that will be logged (by convention, the name of this record type is usually
+ “Info”).
+* A log stream ID (an :zeek:see:`enum` with type name :zeek:see:`Log::ID`) must
+ be defined that uniquely identifies the new log stream.
+* A log stream must be created using the :zeek:see:`Log::create_stream`
+ function.
+* When the data to be logged becomes available, the :zeek:see:`Log::write`
+ function must be called.
+
+In the following example, we create a new module, ``Foo``, which creates a new
+log stream.
+
+.. code-block:: zeek
+
+ module Foo;
+
+ export {
+ # Create an ID for our new stream. By convention, this is
+ # called "LOG".
+ redef enum Log::ID += { LOG };
+
+ # Define the record type that will contain the data to log.
+ type Info: record {
+ ts: time &log;
+ id: conn_id &log;
+ service: string &log &optional;
+ missed_bytes: count &log &default=0;
+ };
+ }
+
+ # Optionally, we can add a new field to the connection record so that
+ # the data we are logging (our "Info" record) will be easily
+ # accessible in a variety of event handlers.
+ redef record connection += {
+ # By convention, the name of this new field is the lowercase name
+ # of the module.
+ foo: Info &optional;
+ };
+
+ # This event is handled at a priority higher than zero so that if
+ # users modify this stream in another script, they can do so at the
+ # default priority of zero.
+ event zeek_init() &priority=5
+ {
+ # Create the stream. This adds a default filter automatically.
+ Log::create_stream(Foo::LOG, [$columns=Info, $path="foo"]);
+ }
+
+In the definition of the ``Info`` record above, notice that each field has the
+:zeek:see:`&log` attribute. Without this attribute, a field will not appear in
+the log output. Also notice one field has the :zeek:see:`&optional` attribute.
+This indicates that the field might not be assigned any value before the log
+record is written. Finally, a field with the :zeek:see:`&default` attribute
+has a default value assigned to it automatically.
+
+At this point, the only thing missing is a call to the :zeek:see:`Log::write`
+function to send data to the logging framework. The actual event handler where
+this should take place will depend on where your data becomes available. In
+this example, the :zeek:see:`connection_established` event provides our data,
+and we also store a copy of the data being logged into the
+:zeek:see:`connection` record:
+
+.. code-block:: zeek
+
+ event connection_established(c: connection)
+ {
+ local rec: Foo::Info = [$ts=network_time(), $id=c$id];
+
+ # Store a copy of the data in the connection record so other
+ # event handlers can access it.
+ c$foo = rec;
+
+ Log::write(Foo::LOG, rec);
+ }
+
+If you run Zeek with this script, a new log file :file:`foo.log` will be
+created. Although we only specified four fields in the ``Info`` record above,
+the log output will actually contain seven fields because one of the fields
+(the one named ``id``) is itself a record type. Since a :zeek:see:`conn_id`
+record has four fields, then each of these fields is a separate column in the
+log output. Note that the way that such fields are named in the log output
+differs slightly from the way we would refer to the same field in a Zeek script
+(each dollar sign is replaced with a period). For example, to access the first
+field of a :zeek:see:`conn_id` in a Zeek script we would use the notation
+``id$orig_h``, but that field is named ``id.orig_h`` in the log output.
+
+When you are developing scripts that add data to the :zeek:see:`connection`
+record, care must be given to when and how long data is stored. Normally data
+saved to the connection record will remain there for the duration of the
+connection and from a practical perspective it’s not uncommon to need to delete
+that data before the end of the connection.
+
+Add Fields to a Log
+-------------------
+
+You can add additional fields to a log by extending the record type that
+defines its content, and setting a value for the new fields before each log
+record is written.
+
+Let’s say we want to add a boolean field ``is_private`` to
+:zeek:see:`Conn::Info` that indicates whether the originator IP address is part
+of the :rfc:`1918` space:
+
+.. code-block:: zeek
+
+ # Add a field to the connection log record.
+ redef record Conn::Info += {
+ ## Indicate if the originator of the connection is part of the
+ ## "private" address space defined in RFC1918.
+ is_private: bool &default=F &log;
+ };
+
+As this example shows, when extending a log stream’s ``Info`` record, each new
+field must always be declared either with a :zeek:see:`&default` value or as
+:zeek:see:`&optional`. Furthermore, you need to add the :zeek:see:`&log`
+attribute or otherwise the field won’t appear in the log file.
+
+Now we need to set the field. Although the details vary depending on which log
+is being extended, in general it is important to choose a suitable event in
+which to set the additional fields because we need to make sure that the fields
+are set before the log record is written. Sometimes the right choice is the
+same event which writes the log record, but at a higher priority (in order to
+ensure that the event handler that sets the additional fields is executed
+before the event handler that writes the log record).
+
+In this example, since a connection’s summary is generated at the time its
+state is removed from memory, we can add another handler at that time that sets
+our field correctly:
+
+.. code-block:: zeek
+
+ event connection_state_remove(c: connection)
+ {
+ if ( c$id$orig_h in Site::private_address_space )
+ c$conn$is_private = T;
+ }
+
+Now :file:`conn.log` will show a new field ``is_private`` of type
+:zeek:see:`bool`. If you look at the Zeek script which defines the connection
+log stream :doc:`/scripts/base/protocols/conn/main.zeek`, you will see that
+:zeek:see:`Log::write` gets called in an event handler for the same event as
+used in this example to set the additional fields, but at a lower priority than
+the one used in this example (i.e., the log record gets written after we assign
+the ``is_private`` field).
+
+For extending logs this way, one needs a bit of knowledge about how the script
+that creates the log stream is organizing its state keeping. Most of the
+standard Zeek scripts attach their log state to the :zeek:see:`connection`
+record where it can then be accessed, just like ``c$conn`` above. For example,
+the HTTP analysis adds a field ``http`` of type :zeek:see:`HTTP::Info` to the
+:zeek:see:`connection` record.
+
+Define a Logging Event
+----------------------
+
+Sometimes it is helpful to do additional analysis of the information being
+logged. For these cases, a stream can specify an event that will be generated
+every time a log record is written to it. To do this, we need to modify the
+example module shown above to look something like this:
+
+.. code-block:: zeek
+
+ module Foo;
+
+ export {
+ redef enum Log::ID += { LOG };
+
+ type Info: record {
+ ts: time &log;
+ id: conn_id &log;
+ service: string &log &optional;
+ missed_bytes: count &log &default=0;
+ };
+
+        # Define a logging event. By convention, this is called
+        # "log_<stream>".
+ global log_foo: event(rec: Info);
+ }
+
+ event zeek_init() &priority=5
+ {
+ # Specify the "log_foo" event here in order for Zeek to raise it.
+ Log::create_stream(Foo::LOG, [$columns=Info, $ev=log_foo,
+ $path="foo"]);
+ }
+
+All of Zeek’s default log streams define such an event. For example, the
+connection log stream raises the event :zeek:see:`Conn::log_conn`. You could
+use that for example for flagging when a connection to a specific destination
+exceeds a certain duration:
+
+.. code-block:: zeek
+
+ redef enum Notice::Type += {
+ ## Indicates that a connection remained established longer
+ ## than 5 minutes.
+ Long_Conn_Found
+ };
+
+ event Conn::log_conn(rec: Conn::Info)
+ {
+ if ( rec?$duration && rec$duration > 5mins )
+ NOTICE([$note=Long_Conn_Found,
+ $msg=fmt("unusually long conn to %s", rec$id$resp_h),
+ $id=rec$id]);
+ }
+
+Often, these events can be an alternative to post-processing Zeek logs
+externally with Perl scripts. Much of what such an external script would do
+later offline, one may instead do directly inside of Zeek in real-time.
+
+Disable a Stream
+----------------
+
+One way to “turn off” a log is to completely disable the stream. For example,
+the following example will prevent the :file:`conn.log` from being written:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ Log::disable_stream(Conn::LOG);
+ }
+
+Note that this must run after the stream is created, so the priority of this
+event handler must be lower than the priority of the event handler where the
+stream was created.
+
+
+Delaying Log Writes
+-------------------
+
+.. versionadded:: 6.2
+
+The logging framework allows delaying log writes using the
+:zeek:see:`Log::delay` function.
+
+This functionality enables querying or waiting for additional data to attach to
+an in-flight log record for which a :zeek:see:`Log::write` has happened.
+Common examples are the execution of DNS reverse lookups for the addresses
+of a connection, or - more generally - asynchronous queries to external systems.
+Similarly, waiting a small duration for more data from an external process
+pertaining to specific connections or events is another. For example, endpoint
+agents may provide detailed process information for specific connections
+logged by Zeek.
+
+Conceptually, the delay of a log record is placed after the execution of the
+global :zeek:see:`Log::log_stream_policy` hook and before the execution of
+:ref:`policy hooks attached to filters <logging-filtering-log-records>`.
+At this point, calling :zeek:see:`Log::delay` is only valid for the currently
+*active write* during the execution of the global :zeek:see:`Log::log_stream_policy`
+hook. Calling :zeek:see:`Log::delay` in any other context or with the wrong
+arguments results in runtime errors.
+
+.. note::
+
+ While this may appear very restrictive, it does make it explicit that it is
+ the action of a :zeek:see:`Log::write` for a given stream and log record
+ that is being delayed as well as providing a defined point where a delay starts.
+
+    Earlier designs entertained an implicit and very lax interface, but that
+    approach was ultimately deemed too loose: it provided so much flexibility
+    that it would have been hard to later restrict again or keep stable. The
+    current interface might be made more lax in the future if it turns out to
+    be too rigid.
+
+
+By default, log records are not delayed. That is, during the execution of
+the :zeek:see:`Log::write` function, a serialized version of the given log
+record is handed off to a remote logger or a local logging thread.
+Modifications of the same log record after :zeek:see:`Log::write` has returned
+have no effect.
+
+In contrast, when a log write is delayed using the :zeek:see:`Log::delay`
+function, the record is enqueued into a per-stream record queue and the
+:zeek:see:`Log::write` returns. Processing of the delayed write resumes once
+it is released by using the :zeek:see:`Log::delay_finish` function or until
+a maximum, per-stream configurable, delay duration expires.
+
+When processing of a log write is resumed, first, all post delay callbacks
+given to :zeek:see:`Log::delay` are executed. Thereafter, as for non-delayed
+writes, filter policy hooks are executed and the log record is serialized.
+
+Policy hooks attached to filters and the serialization step observe any
+mutations done during the delay. Filter policy hooks may even use these
+modifications for deciding on the verdict of the given log record.
+
+.. note::
+
+ Policy hooks attached to filters are often used to skip logging of
+ uninteresting log records. When combined with log write delaying, users
+ should consider lifting such filter logic up into the
+ :zeek:see:`Log::log_stream_policy` hook to avoid unnecessarily delaying
+ records when it is known that these will be discarded later on.
+
+
+The :zeek:see:`Log::delay` and :zeek:see:`Log::delay_finish` functions increment
+and decrement an internal reference count for a given write. To continue a
+delayed write, :zeek:see:`Log::delay_finish` must be called as often as
+:zeek:see:`Log::delay`.
+
+
+Zeek delays a log record by a configurable interval defined for each log stream.
+It defaults to the global :zeek:see:`Log::default_max_delay_interval`, and can be
+adapted by calling :zeek:see:`Log::set_max_delay_interval` on the stream.
+It is possible to explicitly extend the delay duration by providing a post
+delay callback to :zeek:see:`Log::delay`. Calling :zeek:see:`Log::delay` from
+within such a post delay callback re-delays the record, essentially putting
+it at the end of the per-stream queue again.
+
+.. note::
+
+ While this puts additional burden on the script writer to realize per-record
+ specific longer delay intervals, it allows for a simpler internal implementation.
+ Additionally, the explicit re-delaying is also meant to make users aware of the
+ consequences when using such long delays either on purpose or by accident.
+
+ For multiple second or even longer delays, it is suggested to consider resumable,
+ robust and non-ephemeral external post processing steps based on Zeek logs instead.
+ In the face of worker crashes or uncontrolled restarts of a Zeek cluster, all
+ delayed log records are inevitably lost.
+
+
+The following example shows how to use the :ref:`when statement <when-statement>` to asynchronously
+lookup the DNS names of the originator and responder addresses to enrich an
+in-flight :zeek:see:`Conn::Info` record. By default, a stream's maximum delay
+interval is 200 milliseconds - the ``timeout 150msec`` part ensures a delayed
+write resumes after 150 milliseconds already by explicitly calling
+:zeek:see:`Log::delay_finish`.
+
+
+.. literalinclude:: logging/delay1.zeek
+ :caption: Enriching conn.log with originator and responder names.
+ :language: zeek
+ :linenos:
+ :tab-width: 4
+
+
+Filters
+=======
+
+A stream has one or more filters attached to it. A stream without any filters
+will not produce any log output. Filters govern two aspects of log production:
+they control which of the stream’s log entries get written out, and they define
+how to actually implement the log writes. They do the latter by specifying a
+log writer that implements the write operation, such as the ASCII writer (see
+below) for text file output. When a stream is created, it automatically gets a
+default filter attached to it. This default filter can be removed or replaced,
+or other filters can be added to the stream. This is accomplished by using
+either the :zeek:see:`Log::add_filter` or :zeek:see:`Log::remove_filter`
+function. This section shows how to use filters to do such tasks as rename a
+log file, split the output into multiple files, control which records are
+written, and set a custom rotation interval.
+
+Each filter has a unique name, scoped to the stream it belongs to. That is, all
+filters attached to a given stream have different names. Calling
+:zeek:see:`Log::add_filter` to add a filter with a name that already exists for
+the stream replaces the existing filter.
+
+Rename a Log File
+-----------------
+
+Normally, the log filename for a given log stream is determined when the stream
+is created, unless you explicitly specify a different one by adding a filter.
+
+The easiest way to change a log filename is to simply replace the default log
+filter with a new filter that specifies a value for the ``path`` field. In this
+example, :file:`conn.log` will be changed to :file:`myconn.log`:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ # Replace default filter for the Conn::LOG stream in order to
+ # change the log filename.
+
+ local f = Log::get_filter(Conn::LOG, "default");
+ f$path = "myconn";
+ Log::add_filter(Conn::LOG, f);
+ }
+
+Keep in mind that the ``path`` field of a log filter never contains the
+filename extension. The extension will be determined later by the log writer.
+
+Change the Logging Directory
+----------------------------
+
+By default, Zeek log files are created in the current working directory.
+To write logs into a different directory, set :zeek:see:`Log::default_logdir`:
+
+.. code-block:: zeek
+
+    redef Log::default_logdir = "/path/to/output_log_directory";
+
+The :zeek:see:`Log::default_logdir` option is honored by all file-based
+writers included with Zeek (ASCII and SQLite).
+
+Add an Additional Output File
+-----------------------------
+
+Normally, a log stream writes to only one log file. However, you can add
+filters so that the stream writes to multiple files. This is useful if you want
+to restrict the set of fields being logged to the new file.
+
+In this example, a new filter is added to the :zeek:see:`Conn::LOG` stream that
+writes two fields to a new log file:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ # Add a new filter to the Conn::LOG stream that logs only
+ # timestamp and originator address.
+
+ local filter: Log::Filter = [$name="orig-only", $path="origs",
+ $include=set("ts", "id.orig_h")];
+ Log::add_filter(Conn::LOG, filter);
+ }
+
+.. note::
+
+ When multiple filters added to a stream use the same path value, Zeek will
+ disambiguate the output file names by adding numeric suffixes to the name. If
+ we say ``$path="conn"`` in the above example, Zeek warns us about the fact that
+ it’ll write this filter’s log entries to a different file::
+
+ 1071580905.346457 warning: Write using filter 'orig-only' on path 'conn' changed to use new path 'conn-2' to avoid conflict with filter 'default'
+
+ The same also happens when omitting a path value, in which case the filter
+ inherits the value of the stream’s path member.
+
+Notice how the ``include`` filter attribute specifies a set that limits the
+fields to the ones given. The names correspond to those in the
+:zeek:see:`Conn::Info` record (however, because the ``id`` field is itself a
+record, we can specify an individual field of ``id`` by the dot notation shown
+in the example).
+
+Using the code above, in addition to the regular :file:`conn.log`, you will now
+also get a new log file :file:`origs.log` that looks like the regular
+:file:`conn.log`, but will have only the fields specified in the ``include``
+filter attribute.
+
+If you want to skip only some fields but keep the rest, there is a
+corresponding exclude filter attribute that you can use instead of include to
+list only the ones you are not interested in.
+
+If you want to make this the only log file for the stream, you can remove the
+default filter:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ # Remove the filter called "default".
+ Log::remove_filter(Conn::LOG, "default");
+ }
+
+Determine Log Path Dynamically
+------------------------------
+
+Instead of using the ``path`` filter attribute, a filter can determine output
+paths *dynamically* based on the record being logged. That allows, e.g., to
+record local and remote connections into separate files. To do this, you define
+a function that returns the desired path, and use the ``path_func`` filter
+attribute:
+
+.. code-block:: zeek
+
+ function myfunc(id: Log::ID, path: string, rec: Conn::Info) : string
+ {
+ # Return "conn-local" if originator is a local IP, otherwise
+ # return "conn-remote".
+ local r = Site::is_local_addr(rec$id$orig_h) ? "local" : "remote";
+ return fmt("%s-%s", path, r);
+ }
+
+ event zeek_init()
+ {
+ local filter: Log::Filter = [$name="conn-split",
+ $path_func=myfunc, $include=set("ts", "id.orig_h")];
+ Log::add_filter(Conn::LOG, filter);
+ }
+
+Running this will now produce two new files, :file:`conn-local.log` and
+:file:`conn-remote.log`, with the corresponding entries. For this example
+to work, :zeek:see:`Site::local_nets` must specify your local network.
+It defaults to IANA's standard private address space. One
+could extend this further for example to log information by subnets or even by
+IP address. Be careful, however, as it is easy to create many files very
+quickly.
+
+The ``myfunc`` function has one drawback: it can be used only with the
+:zeek:see:`Conn::LOG` stream, as the record type is hardcoded into its
+argument list. However, Zeek also allows a more generic variant:
+
+.. code-block:: zeek
+
+ function myfunc(id: Log::ID, path: string,
+ rec: record { id: conn_id; } ) : string
+ {
+ local r = Site::is_local_addr(rec$id$orig_h) ? "local" : "remote";
+ return fmt("%s-%s", path, r);
+ }
+
+This function can be used with all log streams that have records containing an
+``id: conn_id`` field.
+
+.. _logging-filtering-log-records:
+
+Filtering Log Records
+---------------------
+
+We just saw ways how to customize the logged columns. The logging framework also
+lets you control which records Zeek writes out. It relies on Zeek’s :zeek:see:`hook`
+mechanism to do this, as follows. The framework provides two levels of "policy"
+hooks, a global one and a set of filter-level ones. The hook handlers can
+implement additional processing of a log record, including vetoing the writing
+of the record. Any handler that uses a :zeek:see:`break` statement to leave the
+hook declares that a record shall not be written out. Anyone can attach handlers
+to these hooks, which look as follows:
+
+.. code-block:: zeek
+
+ type Log::StreamPolicyHook: hook(rec: any, id: ID);
+ type Log::PolicyHook: hook(rec: any, id: ID, filter: Filter);
+
+For both hook types, the ``rec`` argument contains the entry to be logged and is
+an instance of the record type associated with the stream’s columns, and ``id``
+identifies the log stream.
+
+The logging framework defines one global hook policy hook: :zeek:see:`Log::log_stream_policy`.
+For every log write, this hook gets invoked first. Any of its handlers may
+decide to veto the log entry. The framework then iterates over the log stream's
+filters. Each filter has a ``filter$policy`` hook of type :zeek:see:`Log::PolicyHook`.
+Its handlers receive the log record, the ID of the log stream, and the filter
+record itself. Each handler can veto the write. After the filter's hook has run,
+any veto (by :zeek:see:`Log::log_stream_policy` or the filter's hook) aborts the
+write via that filter. If no veto has occurred, the filter now steers the log
+record to its output.
+
+You can pass arbitrary state through these hook handlers. For example, you can
+extend streams or filters via a :zeek:see:`redef`, or pass key-value pairs
+via the ``filter$config`` table.
+
+Since you'll often want to use uniform handling for all writes on a given
+stream, log streams offer a default hook, provided when constructing the stream,
+that the stream's filters will use if they don't provide their own. To support
+hooks on your log streams, you should always define a default hook when creating
+new streams, as follows:
+
+.. code-block:: zeek
+
+ module Foo;
+
+ export {
+ ## The logging stream identifier.
+ redef enum Log::ID += { LOG };
+
+ ## A default logging policy hook for the stream.
+ global log_policy: Log::PolicyHook;
+
+ # Define the record type that will contain the data to log.
+ type Info: record {
+ ts: time &log;
+ id: conn_id &log;
+ service: string &log &optional;
+ missed_bytes: count &log &default=0;
+ };
+ }
+
+ event zeek_init() &priority=5
+ {
+ # Create the stream, adding the default policy hook:
+ Log::create_stream(Foo::LOG, [$columns=Info, $path="foo", $policy=log_policy]);
+ }
+
+With this hook in place, it’s now easy to add a filtering predicate for the ``Foo``
+log from anywhere:
+
+.. code-block:: zeek
+
+ hook Foo::log_policy(rec: Foo::Info, id: Log::ID, filter: Log::Filter)
+ {
+ # Let's only log complete information:
+ if ( rec$missed_bytes > 0 )
+ break;
+ }
+
+The Zeek distribution features default hooks for all of its streams. Here’s a
+more realistic example, using HTTP:
+
+.. code-block:: zeek
+
+ hook HTTP::log_policy(rec: HTTP::Info, id: Log::ID, filter: Log::Filter)
+ {
+ # Record only connections with successfully analyzed HTTP traffic
+ if ( ! rec?$service || rec$service != "http" )
+ break;
+ }
+
+To override a hook selectively in a new filter, set the hook when adding the
+filter to a stream:
+
+.. code-block:: zeek
+
+ hook my_policy(rec: Foo::Info, id: Log::ID, filter: Log::Filter)
+ {
+ # Let's only log incomplete flows:
+ if ( rec$missed_bytes == 0 )
+ break;
+ }
+
+ event zeek_init()
+ {
+ local filter: Log::Filter = [$name="incomplete-only",
+ $path="foo-incomplete",
+ $policy=my_policy];
+ Log::add_filter(Foo::LOG, filter);
+ }
+
+Note that this approach has subtle implications: the new filter does not use the
+``Foo::log_policy`` hook, and that hook does not get invoked for writes to this
+filter. Any vetoes or additional processing implemented in ``Foo::log_policy``
+handlers no longer happens for the new filter. Such hook replacement should
+rarely be necessary; you may find it preferable to narrow the stream's default
+handler to the filter in question:
+
+.. code-block:: zeek
+
+ hook Foo::log_policy(rec: Foo::Info, id: Log::ID, filter: Log::Filter)
+ {
+ if ( filter$name != "incomplete-only" )
+ return;
+
+ # Let's only log incomplete flows:
+ if ( rec$missed_bytes == 0 )
+ break;
+ }
+
+For tasks that need to run once per-write, not once per-write-and-filter,
+use the :zeek:see:`Log::log_stream_policy` hook instead:
+
+.. code-block:: zeek
+
+ hook Log::log_stream_policy(rec: Foo::Info, id: Log::ID)
+ {
+ # Called once per write
+ }
+
+ hook Foo::log_policy(rec: Foo::Info, id: Log::ID, filter: Log::Filter)
+ {
+ # Called once for each of Foo's filters.
+ }
+
+To change an existing filter first retrieve it, then update it, and
+re-establish it:
+
+.. code-block:: zeek
+
+ hook my_policy(rec: Foo::Info, id: Log::ID, filter: Log::Filter)
+ {
+ # Let's only log incomplete flows:
+ if ( rec$missed_bytes == 0 )
+ break;
+ }
+
+ event zeek_init()
+ {
+ local f = Log::get_filter(Foo::LOG, "default");
+ f$policy = my_policy;
+ Log::add_filter(Foo::LOG, f);
+ }
+
+.. note::
+
+ Policy hooks can also modify the log records, but with subtle implications.
+ The logging framework applies all of a stream’s log filters sequentially to
+ the same log record, so modifications made in a hook handler will persist
+not only into subsequent handlers in the same hook, but also into any
+filters processed subsequently. In contrast to hook priorities, filters
+ provide no control over their processing order.
+
+Log Rotation and Post-Processing
+--------------------------------
+
+The logging framework provides fine-grained control over when and how to rotate
+log files. Log rotation means that Zeek periodically renames an active log
+file, such as :file:`conn.log`, in a manner configurable by the user (e.g.,
+renaming to :file:`conn_21-01-03_14-05-00.log` to timestamp it), and starts
+over on a fresh :file:`conn.log` file. Post-processing means that Zeek can also
+apply optional additional processing to the rotated file, such as compression
+or file transfers. These mechanisms apply naturally to file-based log writers,
+but are available to other writers as well for more generalized forms of
+periodic additional processing of their outputs.
+
+Rotation Timing
+~~~~~~~~~~~~~~~
+
+The log rotation interval is globally controllable for all filters by
+redefining the :zeek:see:`Log::default_rotation_interval` constant, or
+specifically for certain :zeek:see:`Log::Filter` instances by setting their
+``interv`` field. The default value, ``0secs``, disables rotation.
+
+.. note::
+
+ When using ZeekControl, this option is set automatically via the ZeekControl
+ configuration.
+
+Here’s an example of changing just the :zeek:see:`Conn::LOG` stream’s default
+filter rotation:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ local f = Log::get_filter(Conn::LOG, "default");
+ f$interv = 1 min;
+ Log::add_filter(Conn::LOG, f);
+ }
+
+Controlling File Naming
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The redef’able :zeek:see:`Log::rotation_format_func` determines the naming of
+the rotated-to file. The logging framework invokes the function with sufficient
+context (a :zeek:see:`Log::RotationFmtInfo` record), from which it determines
+the output name in two parts: the output directory, and the output file’s base
+name, meaning its name without a suffix. It returns these two components via a
+:zeek:see:`Log::RotationPath` record. The output directory defaults to
+:zeek:see:`Log::default_rotation_dir` (a config option) and incorporates a
+timestamp in the base name, as specified by
+:zeek:see:`Log::default_rotation_date_format`.
+
+When :zeek:see:`Log::default_logdir` is in use and :zeek:see:`Log::rotation_format_func`
+does not set an output directory (e.g. when :zeek:see:`Log::default_rotation_dir` is not set),
+:zeek:see:`Log::default_logdir` is used as the default output directory.
+
+For examples of customized log rotation, take a look at the relevant test
+cases in the Zeek distribution's test suite.
+
+Post-Processing of Rotated Logs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Post-processing can proceed via defaults configured across all log filters, or
+with per-filter customizations. Zeek provides helpful default infrastructure to
+simplify running shell commands on rotated logs, but you’re free to define your
+own post-processing infrastructure from scratch.
+
+By default, the :zeek:see:`Log::default_rotation_postprocessor_cmd`, if
+defined, runs on every rotated log. The wrapper function making the actual
+command invocation is :zeek:see:`Log::run_rotation_postprocessor_cmd`. It
+passes six additional arguments to the configured shell command:
+
+* The rotated-to file name (e.g. ``conn_21-01-03_14-05-00.log``)
+* The original base name (e.g. ``conn``)
+* The timestamp at which the original log file got created (e.g. ``21-01-03_14.04.00``)
+* The timestamp at which the original log file got rotated (e.g. ``21-01-03_15.05.00``)
+* ``1`` if Zeek is terminating, ``0`` otherwise
+* The name of the writer (e.g. ``ascii`` for the ASCII writer)
+
+.. warning::
+
+ Zeek ignores failures (non-zero exit codes) of this shell command: the
+ default rotation postprocessor command returns ``T`` regardless. Be careful
+ if you implement your own postprocessor function: returning ``F`` from it
+ will cause the corresponding log writer instance to shut down, therefore do
+ so only when the writer really won’t be able to continue.
+
+Zeek ships with ready-to-use postprocessors for file transfer via :doc:`SCP
+</scripts/base/frameworks/logging/postprocessors/scp.zeek>` and
+:doc:`SFTP </scripts/base/frameworks/logging/postprocessors/sftp.zeek>`. The
+Zeek project also provides an external tool, `zeek-archiver
+<https://github.com/zeek/zeek-archiver>`_, that performs log compression
+outside of the Zeek process for robustness.
+
+Other Features
+--------------
+
+Log Extension Fields
+~~~~~~~~~~~~~~~~~~~~
+
+The logging framework provides rudimentary support for adding additional
+columns to an already defined log format, globally for all logs or for
+individual log filters only. Records returned by the
+:zeek:see:`Log::default_ext_func` function get added to every log, and the
+``ext_func`` member of :zeek:see:`Log::Filter` in filter records allows local
+overrides.
+
+You can configure a prefix string separately for either of these options — this
+string ensures that the resulting fields don’t collide with already existing
+log fields. The prefix defaults to an underscore, via
+:zeek:see:`Log::default_ext_prefix`. The ``ext_prefix`` field in filter
+records overrides as needed.
+
+The following example, taken straight from a Zeek testcase, adds three extra
+columns to all logs:
+
+.. code-block:: zeek
+
+ type Extension: record {
+ write_ts: time &log;
+ stream: string &log;
+ system_name: string &log;
+ };
+
+ function add_extension(path: string): Extension
+ {
+ return Extension($write_ts = network_time(),
+ $stream = path,
+ $system_name = peer_description);
+ }
+
+ redef Log::default_ext_func = add_extension;
+
+A resulting :file:`conn.log`::
+
+ #fields _write_ts _stream _system_name ts uid …
+ #types time string string time string …
+ 1071580905.346457 conn zeek 1071580904.891921 Cod6Wj3YeJFHgkaO8j …
+
+.. note::
+
+ Extension fields remain separate from the original log record. They remain
+ invisible to filters, policy hooks, and log events. *After* filter processing
+ determines that an entry is to be logged, the framework simply tucks the
+ extension's members onto the list of fields to write out.
+
+Field Name Mapping
+~~~~~~~~~~~~~~~~~~
+
+On occasion it can be handy to rewrite column names as they appear in a Zeek
+log. A typical use case for this would be to ensure that column naming complies
+with the requirements of your log ingestion system. To achieve this, you can
+provide name translation maps, and here too you can do this either globally or
+per-filter. The maps are simple string tables with the keys being Zeek’s field
+names and the values being the ones to actually write out. Field names not
+present in the maps remain unchanged. The global variant is the (normally
+empty) :zeek:see:`Log::default_field_name_map`, and the corresponding
+filter-local equivalent is the filter’s ``field_name_map`` member.
+
+For example, the following name map gets rid of the dots in the usual naming of
+connection IDs:
+
+.. code-block:: zeek
+
+ redef Log::default_field_name_map = {
+ ["id.orig_h"] = "id_orig_h",
+ ["id.orig_p"] = "id_orig_p",
+ ["id.resp_h"] = "id_resp_h",
+ ["id.resp_p"] = "id_resp_p"
+ };
+
+With it, all logs rendering a connection identifier tuple now use ...
+
+::
+
+ #fields ts uid id_orig_h id_orig_p id_resp_h id_resp_p ...
+
+… instead of the default names:
+
+::
+
+ #fields ts uid id.orig_h id.orig_p id.resp_h id.resp_p ...
+
+If you’d prefer this change only for a given log filter, make the change to the
+filter record directly. The following changes the naming only for
+:file:`conn.log`:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ local f = Log::get_filter(Conn::LOG, "default");
+ f$field_name_map = table(
+ ["id.orig_h"] = "id_orig_h",
+ ["id.orig_p"] = "id_orig_p",
+ ["id.resp_h"] = "id_resp_h",
+ ["id.resp_p"] = "id_resp_p");
+ Log::add_filter(Conn::LOG, f);
+ }
+
+Printing to Log Messages
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Zeek’s :zeek:see:`print` statement normally writes to ``stdout`` or a specific
+output file. By adjusting the :zeek:see:`Log::print_to_log` enum value you can
+redirect such statements to instead go directly into a Zeek log. Possible
+values include:
+
+* :zeek:see:`Log::REDIRECT_NONE`: the default, which doesn’t involve Zeek logs
+* :zeek:see:`Log::REDIRECT_STDOUT`: prints that would normally go to stdout go
+ to a log
+* :zeek:see:`Log::REDIRECT_ALL`: any prints end up in a log instead of stdout
+ or other files
+
+The :zeek:see:`Log::print_log_path` defines the name of the log file,
+:zeek:see:`Log::PrintLogInfo` its columns, and :zeek:see:`Log::log_print`
+events allow you to process logged messages via event handlers.
+
+Local vs Remote Logging
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In its log processing, Zeek considers whether log writes should happen locally
+to a Zeek node or remotely on another node, after forwarding log entries to it.
+Single-node Zeek setups default to local logging, whereas cluster setups enable
+local logging only on logger nodes, and log remotely on all but the logger
+nodes. You normally don’t need to go near these settings, but you can do so by
+``redef``’ing the :zeek:see:`Log::enable_local_logging` and
+:zeek:see:`Log::enable_remote_logging` booleans, respectively.
+
+Writers
+=======
+
+Each filter has a writer. If you do not specify a writer when adding a filter
+to a stream, then the ASCII writer is the default.
+
+There are two ways to specify a non-default writer. To change the default
+writer for all log filters, just redefine the :zeek:see:`Log::default_writer`
+option. Alternatively, you can specify the writer to use on a per-filter basis
+by setting a value for the filter’s ``writer`` field. Consult the documentation
+of the writer to use to see if there are other options that are needed.
+
+ASCII Writer
+------------
+
+By default, the ASCII writer outputs log files that begin with several lines of
+metadata, followed by the actual log output. The metadata describes the format
+of the log file, the ``path`` of the log (i.e., the log filename without file
+extension), and also specifies the time that the log was created and the time
+when Zeek finished writing to it. The ASCII writer has a number of options for
+customizing the format of its output, see
+:doc:`/scripts/base/frameworks/logging/writers/ascii.zeek`. If you change the
+output format options, then be careful to check whether your post-processing
+scripts can still recognize your log files.
+
+Some writer options are global (i.e., they affect all log filters using that
+log writer). For example, to change the output format of all ASCII logs to JSON
+format:
+
+.. code-block:: zeek
+
+ redef LogAscii::use_json = T;
+
+
+Some writer options are filter-specific (i.e., they affect only the filters
+that explicitly specify the option). For example, to change the output format
+of the :file:`conn.log` only:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ local f = Log::get_filter(Conn::LOG, "default");
+ # Use tab-separated-value mode
+ f$config = table(["tsv"] = "T");
+ Log::add_filter(Conn::LOG, f);
+ }
+
+.. _logging-sqlite-writer:
+
+SQLite Writer
+-------------
+
+SQLite is a simple, file-based, widely used SQL database system. Using SQLite
+allows Zeek to write and access data in a format that is easy to use in
+interchange with other applications. Due to the transactional nature of SQLite,
+databases can be used by several applications simultaneously. Zeek’s input
+framework supports a :ref:`SQLite reader <input-sqlite-reader>`.
+
+Logging support for SQLite is available in all Zeek installations. There is no
+need to load any additional scripts or for any compile-time configurations.
+Sending data from existing logging streams to SQLite is rather straightforward.
+Most likely you’ll want SQLite output only for select log filters, so you have
+to configure one to use the SQLite writer. The following example code adds
+SQLite as a filter for the connection log:
+
+.. code-block:: zeek
+
+ event zeek_init()
+ {
+ local filter: Log::Filter =
+ [
+ $name="sqlite",
+ $path="/var/db/conn",
+ $config=table(["tablename"] = "conn"),
+ $writer=Log::WRITER_SQLITE
+ ];
+
+ Log::add_filter(Conn::LOG, filter);
+ }
+
+Zeek will create the database file :file:`/var/db/conn.sqlite` if it does not
+already exist. It will also create a table with the name ``conn`` (if it does
+not exist) and start appending connection information to the table.
+
+Zeek does not currently support rotating SQLite databases as it does for ASCII
+logs. You have to take care to create them in adequate locations.
+
+If you examine the resulting SQLite database, the schema will contain the same
+fields that are present in the ASCII log files:
+
+.. code-block:: console
+
+ sqlite3 /var/db/conn.sqlite
+
+::
+
+ SQLite version 3.8.0.2 2013-09-03 17:11:13
+ Enter ".help" for instructions
+ Enter SQL statements terminated with a ";"
+ sqlite> .schema
+ CREATE TABLE conn (
+ 'ts' double precision,
+ 'uid' text,
+ 'id.orig_h' text,
+ 'id.orig_p' integer,
+ ...
+
+Note that with the above code the ASCII :file:`conn.log` will still be created,
+because it adds an additional log filter alongside the default, ASCII-logging
+one. To prevent this you can remove the default filter:
+
+.. code-block:: zeek
+
+ Log::remove_filter(Conn::LOG, "default");
+
+To create a custom SQLite log file, you have to create a new log stream that
+contains just the information you want to commit to the database. See the above
+documentation on how to create custom log streams.
+
+None Writer
+-----------
+
+The ``None`` writer, selected via :zeek:see:`Log::WRITER_NONE`, is largely a
+troubleshooting and development aid. It discards all log entries it receives,
+but behaves like a proper writer to the rest of the logging framework,
+including, for example, pretended log rotation. If you enable its debugging
+mode by setting :zeek:see:`LogNone::debug` to ``T``, Zeek reports operational
+details about the writer’s activity to ``stdout``.
diff --git a/doc/frameworks/logging/delay1.zeek b/doc/frameworks/logging/delay1.zeek
new file mode 100644
index 0000000000..ae75bd76ca
--- /dev/null
+++ b/doc/frameworks/logging/delay1.zeek
@@ -0,0 +1,37 @@
+@load base/protocols/conn
+
+redef record Conn::Info += {
+ orig_name: string &log &optional;
+ resp_name: string &log &optional;
+};
+
+hook Log::log_stream_policy(rec: Conn::Info, id: Log::ID)
+ {
+ if ( id != Conn::LOG )
+ return;
+
+ local token1 = Log::delay(id, rec);
+ local token2 = Log::delay(id, rec);
+
+ when [id, rec, token1] ( local orig_name = lookup_addr(rec$id$orig_h) )
+ {
+ rec$orig_name = orig_name;
+ Log::delay_finish(id, rec, token1);
+ }
+ timeout 150msec
+ {
+ Reporter::warning(fmt("lookup_addr timeout for %s", rec$id$orig_h));
+ Log::delay_finish(id, rec, token1);
+ }
+
+ when [id, rec, token2] ( local resp_name = lookup_addr(rec$id$resp_h) )
+ {
+ rec$resp_name = resp_name;
+ Log::delay_finish(id, rec, token2);
+ }
+ timeout 150msec
+ {
+ Reporter::warning(fmt("lookup_addr timeout for %s", rec$id$resp_h));
+ Log::delay_finish(id, rec, token2);
+ }
+ }
diff --git a/doc/frameworks/management.rst b/doc/frameworks/management.rst
new file mode 100644
index 0000000000..f8d7499339
--- /dev/null
+++ b/doc/frameworks/management.rst
@@ -0,0 +1,867 @@
+.. _framework-management:
+
+====================
+Management Framework
+====================
+
+.. rst-class:: opening
+
+ The management framework provides a Zeek-based, service-oriented architecture
+ and event-driven APIs to manage a Zeek cluster that monitors live traffic. It
+ provides a central, stateful *controller* that relays and orchestrates
+ cluster management tasks across connected *agents*. Each agent manages Zeek
+ processes in its local *instance*, the Zeek process tree controlled by the
+ local Zeek :ref:`Supervisor `. A management *client*
+ lets the user interact with the controller to initiate cluster management
+ tasks, such as deployment of cluster configurations, monitoring of
+ operational aspects, or to restart cluster nodes. The default client is
+ ``zeek-client``, included in the Zeek distribution.
+
+.. _framework-management-quickstart:
+
+Quickstart
+==========
+
+Run the following (as root) to launch an all-in-one management instance on your
+system:
+
+.. code-block:: console
+
+ # zeek -C -j policy/frameworks/management/controller policy/frameworks/management/agent
+
+The above will stay in the foreground. In a new shell, save the following
+content to a file ``cluster.cfg`` and adapt the workers' sniffing interfaces to
+your system:
+
+.. literalinclude:: management/mini-config.ini
+ :language: ini
+
+Run the following command (as any user) to deploy the configuration:
+
+.. literalinclude:: management/mini-deployment.console
+ :language: console
+
+You are now running a Zeek cluster on your system. Try ``zeek-client get-nodes``
+to see more details about the cluster's current status. (In the above, "testbox"
+is the system's hostname.)
+
+Architecture and Terminology
+============================
+
+Controller
+----------
+
+The controller forms the central hub of cluster management. It exists once in
+every installation and runs as a Zeek process solely dedicated to management
+tasks. It awaits instructions from a management client and communicates with one
+or more agents to manage their cluster nodes.
+
+All controller communication happens via :ref:`Broker `-based
+Zeek event exchange, usually in the form of request-response event pairs tagged
+with a request ID to provide context. The controller is stateful and persists
+cluster configurations to disk. In a multi-system setup, the controller runs
+inside a separate, dedicated Zeek instance. In a single-system setup, the
+controller can run as an additional process in the local instance.
+
+The controller's API resides in the :zeek:see:`Management::Controller::API` module.
+Additional code documentation is :doc:`here </scripts/policy/frameworks/management/controller/api.zeek>`.
+
+Instance
+--------
+
+A Zeek instance comprises the set of processes managed by a Zeek
+:ref:`Supervisor `. The management framework builds
+heavily on the Supervisor framework and cannot run without it. Typically, a
+single instance includes all Zeek processes on the local system (a physical
+machine, a container, etc), but running multiple instances on a system is
+possible.
+
+Agent
+-----
+
+Management agents implement instance-level cluster management tasks. Every
+instance participating in cluster management runs an agent. Agents peer with the
+controller to receive instructions (a node restart, say), carry them out, and
+respond with the outcome. The direction of connection establishment for the
+peering depends on configuration and can go either way (more on this below); by
+default, agents connect to the controller.
+
+The agent's API resides in the :zeek:see:`Management::Agent::API` module.
+Additional code documentation is :doc:`here </scripts/policy/frameworks/management/agent/api.zeek>`.
+
+Agents add script-layer code to both the Supervisor (details :doc:`here
+`) and Zeek cluster
+nodes (details :doc:`here `)
+to enable management tasks (e.g. to tap into node stdout/stderr output) and to
+receive confirmation of successful node startup.
+
+Cluster nodes
+-------------
+
+The Zeek processes involved in traffic analysis and log output make up the Zeek
+*cluster*, via the :ref:`cluster framework `. The management
+framework does not change the cluster framework, and all of its concepts (the
+manager, logger(s), workers, etc) apply as before. Cluster *nodes* refer to
+individual Zeek processes in the cluster, as managed by the Supervisor.
+
+Client
+------
+
+The management client provides the user's interface to cluster management. It
+allows configuration and deployment of the Zeek cluster, insight into the
+running cluster, the ability to restart nodes, etc. The client uses the
+controller's event API to communicate and is the only component in the framework
+not (necessarily) implemented in Zeek's script layer. The Zeek distribution
+ships with ``zeek-client``, a command-line client implemented in Python, to
+provide management functionality. Users are welcome to implement other clients.
+
+.. _framework-management-visual-example:
+
+A Visual Example
+================
+
+Consider the following setup, consisting of a single instance, controller, and a
+connected ``zeek-client``, all running on different machines:
+
+.. image:: /images/management.png
+ :align: center
+
+The cluster system runs a single management instance, with an agent listening on
+TCP port 2151, the default. Since the agent needs to communicate with the
+Supervisor for node management tasks and the two run in separate processes, the
+Supervisor listens for Broker peerings, on TCP port 9999 (again, the default),
+and the two communicate events over topic ``zeek/supervisor``. As shown, the
+agent has launched a 4-node Zeek cluster consisting of two workers, a logger,
+and a manager, communicating internally as usual.
+
+The controller system is more straightforward, consisting merely of a
+Supervisor-governed management controller. This controller has connected to and
+peered with the agent on the cluster system, to relay commands received by the
+client via the agent's API and receive responses over Broker topic
+``zeek/management/agent``. Since the controller doesn't need to interact with
+the Supervisor, the latter doesn't listen on any ports. Standalone controllers,
+as running here, still require a Supervisor, to simplify co-located deployment
+of agent and controller in a single instance.
+
+Finally, the admin system doesn't run Zeek, but has it installed to provide
+``zeek-client``, the CLI for issuing cluster management requests. This client
+connects to and peers with the controller, exchanging controller API events over
+topic ``zeek/management/controller``. For more details on ``zeek-client``, see
+:ref:`below `.
+
+In practice you can simplify the deployment by running ``zeek-client`` directly
+on the controller machine, or by running agent and controller jointly on a
+single system. We cover this in :ref:`more detail
+`.
+
+Goals and Relationship to ZeekControl
+=====================================
+
+The management framework first shipped in usable form in Zeek 5.0. It will
+replace the aging :ref:`ZeekControl ` over the course of
+the coming releases. The framework is not compatible with ZeekControl's approach
+to cluster management: use one or the other, not both.
+
+The framework currently targets single-instance deployments, i.e., setups in
+which traffic monitoring happens on a single system. While the management
+framework technically supports clusters spanning multiple monitoring systems,
+much of the infrastructure users know from ``zeekctl`` (such as the ability to
+deploy Zeek scripts and additional configuration) is not yet available in the
+management framework.
+
+ZeekControl remains included in the Zeek distribution, and remains the
+recommended solution for multi-system clusters and those needing rich management
+capabilities.
+
+.. _framework-management-running:
+
+Running Controller and Agent
+============================
+
+.. _joint-launch:
+
+Joint launch
+------------
+
+The easiest approach is to run a single Zeek instance in which the Supervisor
+launches both an agent and the controller. The framework comes pre-configured for
+this use-case. Its invocation looks as follows:
+
+.. code-block:: console
+
+ # zeek -j policy/frameworks/management/controller policy/frameworks/management/agent
+
+The ``-j`` flag enables the Supervisor and is required for successful launch of
+the framework. (Without it, the above command will simply return.)
+
+.. note::
+
+ If you're planning to monitor the machine's own traffic, add the ``-C`` flag
+ to avoid checksum errors, which commonly happen in local monitoring due to
+ offload of the checksum computation to the NIC.
+
+The following illustrates this setup:
+
+.. image:: /images/management-all-in-one.png
+ :align: center
+ :scale: 75%
+
+Separate controller and agent instances
+---------------------------------------
+
+You can also separate the agent and controller instances. For this, you'd say
+
+.. code-block:: console
+
+ # zeek -j policy/frameworks/management/agent
+
+for the agent, and
+
+.. code-block:: console
+
+ # zeek -j policy/frameworks/management/controller
+
+for the controller. You can run the latter as a regular user, assuming the user
+has write access to the installation's spool and log directories (more on this
+below). While technically not required to operate a stand-alone controller, the
+Supervisor is currently also required in this scenario, so don't omit the
+``-j``.
+
+This looks as follows:
+
+.. image:: /images/management-all-in-one-two-zeeks.png
+ :align: center
+
+
+Controller and agent instances on separate systems
+--------------------------------------------------
+
+You can also separate the two across different systems, though that approach
+will only really start to make sense when the framework fully supports running
+multiple traffic-sniffing instances. To do this, you either need to configure
+the agent to find the controller, or tell the controller where to find the
+agent. For the former, redefine the corresponding config setting, for example by
+saying
+
+.. code-block:: zeek
+
+ redef Management::Agent::controller = [$address="1.2.3.4", $bound_port=21500/tcp];
+
+in ``local.zeek`` and then launching
+
+.. code-block:: console
+
+ # zeek -j policy/frameworks/management/agent local
+
+The result looks as already covered :ref:`earlier `:
+
+.. image:: /images/management.png
+ :align: center
+
+To make the controller connect to remote agents, deploy configurations that
+include the location of such agents in the configuration. More on this below.
+
+Multiple instances
+------------------
+
+You can run multiple instances on a single system, but it requires some
+care. Doing so requires specifying a different listening port for each agent,
+and additionally providing a different listening port for each instance's
+Supervisor. Since agents communicate with their Supervisor to facilitate node
+management, the Supervisor needs to listen (though only locally). Furthermore,
+you need to ensure this agent runs with a unique name (see the next section for
+more on naming).
+
+Assuming you already have an instance running, a launch of an additional agent
+might look as follows:
+
+.. code-block:: console
+
+ # zeek -j policy/frameworks/management/agent \
+ Management::Agent::default_port=2152/tcp \
+ Management::Agent::name=agent-standby \
+ Broker::default_port=10001/tcp
+
+Finally, as already mentioned, you can spread multiple instances across multiple
+systems to explore distributed cluster management. This simplifies the
+individual launch invocations, but for practical distributed cluster use you may
+find the framework's current cluster management features lacking when compared
+to ZeekControl.
+
+Controller and agent naming
+---------------------------
+
+The management framework identifies all nodes in the system by name, and all
+nodes (agent(s), controller, and Zeek cluster nodes) must have unique names. By
+default, the framework chooses ``agent-<hostname>`` and
+``controller-<hostname>`` for agent and controller, respectively. To reconfigure
+naming, set the ``ZEEK_AGENT_NAME`` / ``ZEEK_CONTROLLER_NAME`` environment
+variables, or redefine the following:
+
+.. code-block:: zeek
+
+ redef Management::Controller::name = "controller1";
+ redef Management::Agent::name = "agent1";
+
+Firewalling and encryption
+--------------------------
+
+By default, the controller listens for clients and agents on ports ``2149/tcp`` and
+``2150/tcp``. The former port supports Broker's WebSocket data format, the latter its
+traditional one.
+Unless you run all components, including the client, on a single system, you'll
+want to open up these ports on the controller's system. The agent's default port
+is ``2151/tcp``. It always listens; this allows cluster nodes to connect to it
+to send status reports. If the agents connect to the controller, your firewall
+may block the agent's port since host-local connectivity from cluster nodes to
+the agent process suffices.
+
+To switch agent and/or controller to different ports, set environment variables
+``ZEEK_CONTROLLER_PORT`` / ``ZEEK_CONTROLLER_WEBSOCKET_PORT`` / ``ZEEK_AGENT_PORT``,
+or use the following:
+
+.. code-block:: zeek
+
+ redef Management::Controller::default_port_websocket = 21490/tcp;
+ redef Management::Controller::default_port = 21500/tcp;
+ redef Management::Agent::default_port = 21510/tcp;
+
+By default, agent and controller listen globally. To make them listen on a
+specific interface, set environment variables ``ZEEK_CONTROLLER_ADDR`` /
+``ZEEK_CONTROLLER_WEBSOCKET_ADDR`` / ``ZEEK_AGENT_ADDR``,
+or redefine the framework's fallback default address:
+
+.. code-block:: zeek
+
+ redef Management::default_address = "127.0.0.1";
+
+The framework inherits Broker's TLS capabilities and defaults. For details,
+please refer to the :doc:`Broker config settings
+`.
+
+.. note::
+
+ ``zeek-client`` currently doesn't support client-side certificates.
+
+Additional framework configuration
+----------------------------------
+
+The framework features a number of additional settings that we cover as needed
+in the remainder of this chapter. Refer to the following to browse them all:
+
+* :doc:`General settings `
+* :doc:`Controller `
+* :doc:`Agents `
+* :doc:`Cluster nodes `
+* :doc:`Supervisor `
+
+Node Operation and Outputs
+==========================
+
+The framework places every Supervisor-created node into its own working
+directory, located in ``$(zeek-config --prefix)/var/lib/nodes/``. You can
+reconfigure this by setting the ``ZEEK_MANAGEMENT_STATE_DIR`` or redefining
+:zeek:see:`Management::state_dir`. Doing either will change the toplevel
+directory (i.e., replacing the path up to and including ``var/lib`` in the
+above); the framework will still create the ``nodes/`` directory structure
+within it.
+
+Outputs in the resulting directory include:
+
+* Two separate ad-hoc logs (not structured by Zeek's logging framework)
+ capturing the node's stdout and stderr streams. Their naming is configurable,
+ defaulting simply to ``stdout`` and ``stderr``.
+
+* Zeek log files prior to log rotation.
+
+* Persisted Zeek state, such as Broker-backed tables.
+
+
+Log Management
+==============
+
+The framework configures log rotation and archival via Zeek's included
+`zeek-archiver tool <https://github.com/zeek/zeek-archiver>`_, as follows:
+
+* The :zeek:see:`Log::default_rotation_interval` is one hour, with both local
+ and remote logging enabled. You are free to adjust it as needed.
+
+* The log rotation directory defaults to ``$(zeek-config --prefix)/spool/log-queue``.
+ To adjust this, redefine :zeek:see:`Log::default_rotation_dir` as usual.
+ You can also relocate the spool by setting the ``ZEEK_MANAGEMENT_SPOOL_DIR``
+ environment variable or redefining :zeek:see:`Management::spool_dir`. The
+ framework will place ``log-queue`` into that new destination.
+
+* The log rotation callback rotates node-local logs into the log queue, with
+ naming suitable for ``zeek-archiver``. An example:
+
+ .. code-block:: console
+
+ conn__2022-06-20-10-00-00__2022-06-20-11-00-00__.log
+
+ For details, take a look at the implementation in
+ ``scripts/policy/frameworks/management/persistence.zeek``.
+
+* Once per log rotation interval, the agent launches log archival to archive
+ rotated logs into the installation's log directory (``$(zeek-config
+ --root)/logs``). By default this invokes ``zeek-archiver``, which establishes
+ a datestamp directory in the ``logs`` directory and places the compressed logs
+ into it:
+
+ .. code-block:: console
+
+ # cd $(zeek-config --root)/logs
+ # ls -l
+ total 4
+ drwx------. 2 root root 4096 Jun 20 21:17 2022-06-20
+ # cd 2022-06-20
+ # ls -l
+ total 712
+ -rw-r--r--. 1 root root 280 Jun 20 20:17 broker.19:00:00-20:00:00.log.gz
+ -rw-r--r--. 1 root root 24803 Jun 20 20:17 conn.19:00:00-20:00:00.log.gz
+ -rw-r--r--. 1 root root 26036 Jun 20 21:17 conn.20:00:00-21:00:00.log.gz
+ -rw-r--r--. 1 root root 350 Jun 20 20:17 dhcp.19:00:00-20:00:00.log.gz
+ -rw-r--r--. 1 root root 400 Jun 20 21:17 dhcp.20:00:00-21:00:00.log.gz
+ ...
+
+You can adapt the log archival configuration via the following settings:
+
+* Redefine :zeek:see:`Management::Agent::archive_logs` to ``F`` to disable
+ archival entirely.
+
+* Redefine :zeek:see:`Management::Agent::archive_interval` for an interval other
+ than the log rotation one.
+
+* Redefine :zeek:see:`Management::Agent::archive_dir` to change the
+ destination directory.
+
+* Redefine :zeek:see:`Management::Agent::archive_cmd` to invoke an executable
+ other than the included ``zeek-archiver``. The replacement should accept the
+ same argument structure: `` -1