Merge branch 'topic/robin/intel-framework-merge'

* topic/robin/intel-framework-merge: (22 commits)
  Fixing tests after intel-framework merge.
  Extracting URLs from message bodies over SMTP and sending them to Intel framework.
  Small comment updates in the Intel framework CIF support.
  Intelligence framework documentation first draft.
  Only the manager tries to read files with the input framework now.
  Initial support for Bro's Intel framework with the Collective Intelligence Framework.
  Initial API for Intel framework is complete.
  Fixed an issue with cluster data distribution.
  Updating some intel framework test baselines.
  Reworked cluster intelligence data distribution mechanism and fixed tests.
  Lots more intelligence checking in SMTP traffic.
  Added intelligence check for "Received" path checking and a bit of reshuffling.
  Added sources to the intel log.
  Fixing a problem with intel distribution on clusters.
  Updated intel framework test to include matching.
  Restructuring the scripts that feed data into the intel framework slightly.
  One test for cluster transparency of the intel framework.
  Fixed a cluster support bug.
  Intelligence framework checkpoint
  Major updates to fix the Intel framework API.
  ...

Closes #914.
Robin Sommer 2012-11-05 16:00:38 -08:00
commit a40b00d4ab
40 changed files with 1039 additions and 337 deletions


@@ -1 +1,11 @@
@load ./main
# The cluster framework must be loaded first.
@load base/frameworks/cluster
@if ( Cluster::is_enabled() )
@load ./cluster
@endif
# This needs cluster support so that input files are only read on the manager.
@load ./input


@@ -0,0 +1,61 @@
##! Cluster transparency support for the intelligence framework. This is mostly oriented
##! toward distributing intelligence information across clusters.
@load base/frameworks/cluster
@load ./input
module Intel;
redef record Item += {
## This field is used internally for cluster transparency to avoid
## re-dispatching intelligence items over and over from workers.
first_dispatch: bool &default=T;
};
# If this process is not a manager process, we don't want the full metadata store.
@if ( Cluster::local_node_type() != Cluster::MANAGER )
redef have_full_data = F;
@endif
global cluster_new_item: event(item: Item);
# Primary intelligence distribution comes from the manager.
redef Cluster::manager2worker_events += /^Intel::(cluster_new_item)$/;
# If a worker finds intelligence and adds it, it should share it back to the manager.
redef Cluster::worker2manager_events += /^Intel::(cluster_new_item|match_no_items)$/;
@if ( Cluster::local_node_type() == Cluster::MANAGER )
event Intel::match_no_items(s: Seen) &priority=5
{
event Intel::match(s, Intel::get_items(s));
}
event remote_connection_handshake_done(p: event_peer)
{
# When a worker connects, send it the complete minimal data store.
# It will be kept up to date after this by the cluster_new_item event.
if ( Cluster::nodes[p$descr]$node_type == Cluster::WORKER )
{
send_id(p, "Intel::min_data_store");
}
}
@endif
event Intel::cluster_new_item(item: Intel::Item) &priority=5
{
# Ignore locally generated events to avoid event storms.
if ( is_remote_event() )
Intel::insert(item);
}
event Intel::new_item(item: Intel::Item) &priority=5
{
# The cluster manager always rebroadcasts intelligence.
# Workers redistribute it if it was locally generated.
if ( Cluster::local_node_type() == Cluster::MANAGER ||
item$first_dispatch )
{
item$first_dispatch=F;
event Intel::cluster_new_item(item);
}
}
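On a cluster the final match can be generated on the manager rather than on the worker that observed the traffic, so scripts should react inside the Intel::match event and rely only on its arguments. A minimal illustrative handler (the print output is hypothetical; Seen and Item are the records defined in main.bro below):

event Intel::match(s: Intel::Seen, items: set[Intel::Item])
	{
	# This may be raised on the manager node; use only the event
	# arguments rather than worker-local state.
	for ( item in items )
		{
		if ( s?$host )
			print fmt("intel match on %s (source: %s)", s$host, item$meta$source);
		}
	}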


@@ -0,0 +1,33 @@
@load ./main
module Intel;
export {
## Intelligence files that will be read off disk. The files are
## reread every time they are updated, so updates must be made atomically
## by moving a new file into place (e.g., with "mv") rather than writing
## the file in place.
const read_files: set[string] = {} &redef;
}
event Intel::read_entry(desc: Input::EventDescription, tpe: Input::Event, item: Intel::Item)
{
Intel::insert(item);
}
event bro_init() &priority=5
{
if ( ! Cluster::is_enabled() ||
Cluster::local_node_type() == Cluster::MANAGER )
{
for ( a_file in read_files )
{
Input::add_event([$source=a_file,
$reader=Input::READER_ASCII,
$mode=Input::REREAD,
$name=cat("intel-", a_file),
$fields=Intel::Item,
$ev=Intel::read_entry]);
}
}
}
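A site would typically point this at its feeds with a redef along the following lines (the path is a placeholder; the ASCII reader presumably expects a tab-separated file whose #fields header names the Intel::Item fields):

# Hypothetical local site script.
redef Intel::read_files += {
	"/somewhere/intel-feed.dat",
};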


@@ -1,323 +1,345 @@
##! The intelligence framework provides a way to store and query IP addresses,
##! strings (with a subtype), and numeric (with a subtype) data. Metadata can
##! also be associated with the intelligence like tags which are arbitrary
##! strings, time values, and longer descriptive strings.
# Example string subtypes:
# url
# email
# domain
# software
# user_name
# file_name
# file_md5
# x509_md5
# Example tags:
# infrastructure
# malicious
# sensitive
# canary
# friend
##! and strings (with a str_type). Metadata can
##! also be associated with the intelligence, e.g., for making more informed
##! decisions about matching and handling of intelligence.
@load base/frameworks/notice
module Intel;
export {
## The intel logging stream identifier.
redef enum Log::ID += { LOG };
redef enum Notice::Type += {
## This notice should be used in all detector scripts to indicate
## an intelligence based detection.
Detection,
## String data needs to be further categorized since it could represent
## any number of types of data.
type StrType: enum {
## A complete URL without the prefix "http://".
URL,
## User-Agent string, typically seen in an HTTP header or mail message body.
USER_AGENT,
## Email address.
EMAIL,
## DNS domain name.
DOMAIN,
## A user name.
USER_NAME,
## File hash that is not specific to any particular hash type. It's up to
## the user to query for any relevant hash types.
FILE_HASH,
## Certificate SHA-1 hash.
CERT_HASH,
};
## Record type used for logging information from the intelligence framework.
## Primarily for problems or oddities with inserting and querying data.
## This is important since the content of the intelligence framework can
## change quite dramatically during runtime and problems may be introduced
## into the data.
type Info: record {
## The current network time.
ts: time &log;
## Represents the severity of the message.
## This value should be one of: "info", "warn", "error"
level: string &log;
## The message.
message: string &log;
};
## Record to represent metadata associated with a single piece of
## intelligence.
## Data about an :bro:type:`Intel::Item`
type MetaData: record {
## A description for the data.
## An arbitrary string value representing the data source. Typically,
## the convention for this field will be the source name and feed name
## separated by a hyphen. For example: "source1-c&c".
source: string;
## A freeform description for the data.
desc: string &optional;
## A URL where more information may be found about the intelligence.
## A URL for more information about the data.
url: string &optional;
## The time at which the data was first declared to be intelligence.
first_seen: time &optional;
## When this data was most recently inserted into the framework.
latest_seen: time &optional;
## Arbitrary text tags for the data.
tags: set[string];
};
## Record to represent a singular piece of intelligence.
## Represents a piece of intelligence.
type Item: record {
## If the data is an IP address, this holds the address.
ip: addr &optional;
## If the data is textual, this holds the text.
str: string &optional;
## If the data is numeric, this holds the number.
num: int &optional;
## The subtype of the data for when either the $str or $num fields are
## given. If one of those fields is given, this field must be present.
subtype: string &optional;
## The IP address if the intelligence is about an IP address.
host: addr &optional;
## The network if the intelligence is about a CIDR block.
net: subnet &optional;
## The string if the intelligence is about a string.
str: string &optional;
## The type of data that is in the string if the $str field is set.
str_type: StrType &optional;
## The next five fields are temporary until a better model for
## attaching metadata to an intelligence item is created.
desc: string &optional;
url: string &optional;
first_seen: time &optional;
latest_seen: time &optional;
tags: set[string];
## These single string tags are throwaway fields until pybroccoli supports sets.
tag1: string &optional;
tag2: string &optional;
tag3: string &optional;
## Metadata for the item. Typically represents more detailed
## descriptive data for a piece of intelligence.
meta: MetaData;
};
## Record model used for constructing queries against the intelligence
## framework.
type QueryItem: record {
## If an IP address is being queried for, this field should be given.
ip: addr &optional;
## If a string is being queried for, this field should be given.
str: string &optional;
## If numeric data is being queried for, this field should be given.
num: int &optional;
## If either a string or number is being queried for, this field should
## indicate the subtype of the data.
subtype: string &optional;
## A set of tags where if a single metadata record attached to an item
## has any one of the tags defined in this field, it will match.
or_tags: set[string] &optional;
## A set of tags where a single metadata record attached to an item
## must have all of the tags defined in this field.
and_tags: set[string] &optional;
## The predicate can be given when searching for a match. It will
## be tested against every :bro:type:`Intel::MetaData` item associated
## with the data being matched on. If it returns T a single time, the
## matcher will consider that the item has matched. This field can
## be used for constructing arbitrarily complex queries that may not
## be possible with the $or_tags or $and_tags fields.
pred: function(meta: Intel::MetaData): bool &optional;
## Enum to represent where data came from when it was discovered.
## The convention is to prefix the name with "IN_".
type Where: enum {
## A catchall value to represent data of unknown provenance.
IN_ANYWHERE,
};
## Function to insert data into the intelligence framework.
##
## item: The data item.
## The $host field and combination of $str and $str_type fields are mutually
## exclusive. These records *must* represent either an IP address being
## seen or a string being seen.
type Seen: record {
## The IP address if the data seen is an IP address.
host: addr &log &optional;
## The string if the data is about a string.
str: string &log &optional;
## The type of data that is in the string if the $str field is set.
str_type: StrType &log &optional;
## Where the data was discovered.
where: Where &log;
## If the data was discovered within a connection, the
## connection record should go here to give context to the data.
conn: connection &optional;
};
## Record used for the logging framework representing a positive
## hit within the intelligence framework.
type Info: record {
## Timestamp when the data was discovered.
ts: time &log;
## If a connection was associated with this intelligence hit,
## this is the uid for the connection.
uid: string &log &optional;
## If a connection was associated with this intelligence hit,
## this is the conn_id for the connection.
id: conn_id &log &optional;
## Where the data was seen.
seen: Seen &log;
## Sources which supplied data that resulted in this match.
sources: set[string] &log;
};
## Intelligence data manipulation functions.
global insert: function(item: Item);
## Function to declare discovery of a piece of data in order to check
## it against known intelligence for matches.
global seen: function(s: Seen);
## Event to represent a match between intelligence data and data that was seen.
## On clusters there is no assurance as to where this event will be generated,
## so do not assume that arbitrary global state beyond the given data
## will be available.
##
## Returns: T if the data was successfully inserted into the framework,
## otherwise it returns F.
global insert: function(item: Item): bool;
## A wrapper for the :bro:id:`Intel::insert` function. This is primarily
## used as the external API for inserting data into the intelligence
## framework via Broccoli.
global insert_event: event(item: Item);
## Function for matching data within the intelligence framework.
global matcher: function(item: QueryItem): bool;
## This is the primary mechanism by which a user will take actions based on
## data within the intelligence framework.
global match: event(s: Seen, items: set[Item]);
global log_intel: event(rec: Info);
}
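As an illustrative usage sketch of this API (the Where value and the choice of connection_established are hypothetical), a detector script would extend Intel::Where with its own location and report observations through Intel::seen:

# Hypothetical detector script.
redef enum Intel::Where += {
	IN_LOCAL_DETECTOR,
};

event connection_established(c: connection)
	{
	# Check the responder address against the loaded intelligence.
	Intel::seen([$host=c$id$resp_h, $where=IN_LOCAL_DETECTOR, $conn=c]);
	}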
type MetaDataStore: table[count] of MetaData;
# Internal handler for matches with no metadata available.
global match_no_items: event(s: Seen);
# Internal events for cluster data distribution
global new_item: event(item: Item);
global updated_item: event(item: Item);
# Whether to store metadata. This is used internally, depending on
# whether this is a cluster deployment or not.
const have_full_data = T &redef;
# The in-memory data structure for holding intelligence.
type DataStore: record {
ip_data: table[addr] of MetaDataStore;
# The first string is the actual value and the second string is the subtype.
string_data: table[string, string] of MetaDataStore;
int_data: table[int, string] of MetaDataStore;
net_data: table[subnet] of set[MetaData];
string_data: table[string, StrType] of set[MetaData];
};
global data_store: DataStore;
global data_store: DataStore &redef;
event bro_init()
# The in-memory data structure for holding the barest matchable intelligence.
# This is primarily for workers to do the initial quick matches and store
# a minimal amount of data for the full match to happen on the manager.
type MinDataStore: record {
net_data: set[subnet];
string_data: set[string, StrType];
};
global min_data_store: MinDataStore &redef;
event bro_init() &priority=5
{
Log::create_stream(Intel::LOG, [$columns=Info]);
Log::create_stream(LOG, [$columns=Info, $ev=log_intel]);
}
function insert(item: Item): bool
function find(s: Seen): bool
{
local err_msg = "";
if ( (item?$str || item?$num) && ! item?$subtype )
err_msg = "You must provide a subtype to insert_sync or this item doesn't make sense.";
if ( err_msg == "" )
if ( s?$host &&
((have_full_data && s$host in data_store$net_data) ||
(s$host in min_data_store$net_data)))
{
# Create and fill out the meta data item.
local meta: MetaData;
if ( item?$first_seen )
meta$first_seen = item$first_seen;
if ( item?$latest_seen )
meta$latest_seen = item$latest_seen;
if ( item?$tags )
meta$tags = item$tags;
if ( item?$desc )
meta$desc = item$desc;
if ( item?$url )
meta$url = item$url;
# This is hopefully only temporary until pybroccoli supports sets.
if ( item?$tag1 )
add item$tags[item$tag1];
if ( item?$tag2 )
add item$tags[item$tag2];
if ( item?$tag3 )
add item$tags[item$tag3];
if ( item?$ip )
{
if ( item$ip !in data_store$ip_data )
data_store$ip_data[item$ip] = table();
data_store$ip_data[item$ip][|data_store$ip_data[item$ip]|] = meta;
return T;
}
else if ( item?$str )
{
if ( [item$str, item$subtype] !in data_store$string_data )
data_store$string_data[item$str, item$subtype] = table();
data_store$string_data[item$str, item$subtype][|data_store$string_data[item$str, item$subtype]|] = meta;
return T;
}
else if ( item?$num )
{
if ( [item$num, item$subtype] !in data_store$int_data )
data_store$int_data[item$num, item$subtype] = table();
return T;
}
else if ( s?$str && s?$str_type &&
((have_full_data && [s$str, s$str_type] in data_store$string_data) ||
([s$str, s$str_type] in min_data_store$string_data)))
{
return T;
}
else
{
return F;
}
}
data_store$int_data[item$num, item$subtype][|data_store$int_data[item$num, item$subtype]|] = meta;
return T;
function get_items(s: Seen): set[Item]
{
local item: Item;
local return_data: set[Item] = set();
if ( ! have_full_data )
{
# A reporter warning should be generated here because this function
# should never be called from a host that doesn't have the full data.
# TODO: do a reporter warning.
return return_data;
}
if ( s?$host )
{
# See if the host is known and has meta values.
if ( s$host in data_store$net_data )
{
for ( m in data_store$net_data[s$host] )
{
# TODO: the lookup should be finding all and not just most specific
# and $host/$net should have the correct value.
item = [$host=s$host, $meta=m];
add return_data[item];
}
}
}
else if ( s?$str && s?$str_type )
{
# See if the string is known and has meta values.
if ( [s$str, s$str_type] in data_store$string_data )
{
for ( m in data_store$string_data[s$str, s$str_type] )
{
item = [$str=s$str, $str_type=s$str_type, $meta=m];
add return_data[item];
}
}
}
return return_data;
}
function Intel::seen(s: Seen)
{
if ( find(s) )
{
if ( have_full_data )
{
local items = get_items(s);
event Intel::match(s, items);
}
else
err_msg = "Failed to insert intelligence item for some unknown reason.";
}
if ( err_msg != "" )
Log::write(Intel::LOG, [$ts=network_time(), $level="warn", $message=fmt(err_msg)]);
return F;
}
event insert_event(item: Item)
{
insert(item);
}
function match_item_with_metadata(item: QueryItem, meta: MetaData): bool
{
if ( item?$and_tags )
{
local matched = T;
# Every tag given has to match in a single MetaData entry.
for ( tag in item$and_tags )
{
if ( tag !in meta$tags )
matched = F;
event Intel::match_no_items(s);
}
if ( matched )
}
}
function has_meta(check: MetaData, metas: set[MetaData]): bool
{
local check_hash = md5_hash(check);
for ( m in metas )
{
if ( check_hash == md5_hash(m) )
return T;
}
else if ( item?$or_tags )
{
# For OR tags, only a single tag has to match.
for ( tag in item$or_tags )
{
if ( tag in meta$tags )
return T;
}
}
else if ( item?$pred )
return item$pred(meta);
# This indicates some sort of failure in the query
# The records must not be equivalent if we made it this far.
return F;
}
function matcher(item: QueryItem): bool
event Intel::match(s: Seen, items: set[Item]) &priority=5
{
local err_msg = "";
if ( ! (item?$ip || item?$str || item?$num) )
err_msg = "You must supply one of the $ip, $str, or $num fields to search on";
else if ( (item?$or_tags || item?$and_tags) && item?$pred )
err_msg = "You can't match with both tags and a predicate.";
else if ( item?$or_tags && item?$and_tags )
err_msg = "You can't match with both OR'd together tags and AND'd together tags";
else if ( (item?$str || item?$num) && ! item?$subtype )
err_msg = "You must provide a subtype to matcher or this item doesn't make sense.";
else if ( item?$str && item?$num )
err_msg = "You must only provide $str or $num, not both.";
local meta: MetaData;
local empty_set: set[string] = set();
local info: Info = [$ts=network_time(), $seen=s, $sources=empty_set];
if ( err_msg == "" )
if ( s?$conn )
{
if ( item?$ip )
{
if ( item$ip in data_store$ip_data )
{
if ( ! item?$and_tags && ! item?$or_tags && ! item?$pred )
return T;
for ( i in data_store$ip_data[item$ip] )
{
meta = data_store$ip_data[item$ip][i];
if ( match_item_with_metadata(item, meta) )
return T;
}
}
}
else if ( item?$str )
{
if ( [item$str, item$subtype] in data_store$string_data )
{
if ( ! item?$and_tags && ! item?$or_tags && ! item?$pred )
return T;
for ( i in data_store$string_data[item$str, item$subtype] )
{
meta = data_store$string_data[item$str, item$subtype][i];
if ( match_item_with_metadata(item, meta) )
return T;
}
}
}
else if ( item?$num )
{
if ( [item$num, item$subtype] in data_store$int_data )
{
if ( ! item?$and_tags && ! item?$or_tags && ! item?$pred )
return T;
for ( i in data_store$int_data[item$num, item$subtype] )
{
meta = data_store$int_data[item$num, item$subtype][i];
if ( match_item_with_metadata(item, meta) )
return T;
}
}
}
else
err_msg = "Failed to query intelligence data for some unknown reason.";
info$uid = s$conn$uid;
info$id = s$conn$id;
}
if ( err_msg != "" )
Log::write(Intel::LOG, [$ts=network_time(), $level="error", $message=fmt(err_msg)]);
return F;
for ( item in items )
add info$sources[item$meta$source];
Log::write(Intel::LOG, info);
}
function insert(item: Item)
{
if ( item?$str && !item?$str_type )
{
event reporter_warning(network_time(), fmt("You must provide a str_type for strings or this item doesn't make sense. Item: %s", item), "");
return;
}
# Create and fill out the meta data item.
local meta = item$meta;
local metas: set[MetaData];
if ( item?$host )
{
local host = mask_addr(item$host, is_v4_addr(item$host) ? 32 : 128);
if ( have_full_data )
{
if ( host !in data_store$net_data )
data_store$net_data[host] = set();
metas = data_store$net_data[host];
}
add min_data_store$net_data[host];
}
else if ( item?$net )
{
if ( have_full_data )
{
if ( item$net !in data_store$net_data )
data_store$net_data[item$net] = set();
metas = data_store$net_data[item$net];
}
add min_data_store$net_data[item$net];
}
else if ( item?$str )
{
if ( have_full_data )
{
if ( [item$str, item$str_type] !in data_store$string_data )
data_store$string_data[item$str, item$str_type] = set();
metas = data_store$string_data[item$str, item$str_type];
}
add min_data_store$string_data[item$str, item$str_type];
}
local updated = F;
if ( have_full_data )
{
for ( m in metas )
{
if ( meta$source == m$source )
{
if ( has_meta(meta, metas) )
{
# It's the same item being inserted again.
return;
}
else
{
# Same source, different metadata means updated item.
updated = T;
}
}
}
add metas[item$meta];
}
if ( updated )
event Intel::updated_item(item);
else
event Intel::new_item(item);
}
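Inserting an item programmatically with the reworked records might look like this sketch (the address and source name are made up):

event bro_init()
	{
	# Assumes base/frameworks/intel has been loaded.
	local meta: Intel::MetaData = [$source="local-blacklist",
	                               $desc="illustrative entry"];
	local item: Intel::Item = [$host=192.0.2.1, $meta=meta];
	Intel::insert(item);
	}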


@@ -14,6 +14,7 @@
@load base/utils/patterns
@load base/utils/strings
@load base/utils/thresholds
@load base/utils/urls
# This has some deep interplay between types and BiFs so it's
# loaded in base/init-bare.bro


@@ -0,0 +1,25 @@
## Functions for URL handling.
## A regular expression for matching and extracting URLs.
const url_regex = /^([a-zA-Z\-]{3,5})(:\/\/[^\/?#"'\r\n><]*)([^?#"'\r\n><]*)([^[:blank:]\r\n"'><]*|\??[^"'\r\n><]*)/ &redef;
## Extracts URLs discovered in arbitrary text.
function find_all_urls(s: string): string_set
{
return find_all(s, url_regex);
}
## Extracts URLs discovered in arbitrary text without
## the URL scheme included.
function find_all_urls_without_scheme(s: string): string_set
{
local urls = find_all_urls(s);
local return_urls: set[string] = set();
for ( url in urls )
{
local no_scheme = sub(url, /^([a-zA-Z\-]{3,5})(:\/\/)/, "");
add return_urls[no_scheme];
}
return return_urls;
}
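A quick usage sketch for these helpers (the input string is arbitrary):

event bro_init()
	{
	# Assumes base/utils/urls has been loaded.
	local urls = find_all_urls_without_scheme("Report at http://www.example.com/intel/report.html today");
	for ( url in urls )
		print url;
	}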