A set of file analysis extensions.

- Enable manager to associate analyzers with a MIME type. With that,
  one can now say enable all analyzers for, e.g., "image/gif". This is
  exposed to script-land as

    Files::add_analyzers_for_mime_type(f: fa_file, mtype: string)

  For MIME types identified via libmagic, this happens automatically
  (via the file_new() handler in files/main.bro).

- Extend the analyzer API to better match that of protocol analyzers:

    - Adding unique analyzer IDs so that we can refer to instances
      from script-land.

    - Adding subtypes to Components so that a single analyzer
      implementation can support different types of analyzers
      internally.

    - Add an analyzer method SetTag() that allows to set the tag after
      construction.

    - Adding Init() and Done() methods for consistency with what other
      classes offer.

- Add debug logging to the file_analysis stream.

TODO: test cases missing for the new script-land functionality.
This commit is contained in:
Robin Sommer 2013-11-26 11:16:58 -08:00
parent f0fe270029
commit d34f23c8d4
12 changed files with 337 additions and 30 deletions

View file

@ -152,6 +152,15 @@ export {
tag: Files::Tag,
args: AnalyzerArgs &default=AnalyzerArgs()): bool;
## Adds all analyzers associated with a give MIME type to the analysis of
## a file. Note that analyzers added via MIME types cannot take further
## arguments.
##
## f: the file.
##
## mtype: the MIME type; it will be compared case-insensitive.
global add_analyzers_for_mime_type: function(f: fa_file, mtype: string);
## Removes an analyzer from the analysis of a given file.
##
## f: the file.
@ -288,6 +297,15 @@ function add_analyzer(f: fa_file, tag: Files::Tag, args: AnalyzerArgs): bool
return T;
}
function add_analyzers_for_mime_type(f: fa_file, mtype: string)
{
local dummy_args: AnalyzerArgs;
local analyzers = __add_analyzers_for_mime_type(f$id, mtype, dummy_args);
for ( tag in analyzers )
add f$info$analyzers[Files::analyzer_name(tag)];
}
function register_analyzer_add_callback(tag: Files::Tag, callback: function(f: fa_file, args: AnalyzerArgs))
{
analyzer_add_callbacks[tag] = callback;
@ -311,6 +329,9 @@ function analyzer_name(tag: Files::Tag): string
event file_new(f: fa_file) &priority=10
{
set_info(f);
if ( f?$mime_type )
add_analyzers_for_mime_type(f, f$mime_type);
}
event file_over_new_connection(f: fa_file, c: connection, is_orig: bool) &priority=10

View file

@ -60,6 +60,13 @@ type addr_vec: vector of addr;
## directly and then remove this alias.
type table_string_of_string: table[string] of string;
## A set of file analyzer tags.
##
## .. todo:: We need this type definition only for declaring builtin functions
## via ``bifcl``. We should extend ``bifcl`` to understand composite types
## directly and then remove this alias.
type files_tag_set: set[Files::Tag];
## A connection's transport-layer protocol. Note that Bro uses the term
## "connection" broadly, using flow semantics for ICMP and UDP.
type transport_proto: enum {

View file

@ -3,9 +3,17 @@
#include "Analyzer.h"
#include "Manager.h"
file_analysis::ID file_analysis::Analyzer::id_counter = 0;
file_analysis::Analyzer::~Analyzer()
{
DBG_LOG(DBG_FILE_ANALYSIS, "Destroy file analyzer %s",
file_mgr->GetComponentName(tag));
Unref(args);
}
void file_analysis::Analyzer::SetAnalyzerTag(const file_analysis::Tag& arg_tag)
{
assert(! tag || tag == arg_tag);
tag = arg_tag;
}

View file

@ -13,6 +13,8 @@ namespace file_analysis {
class File;
typedef uint32 ID;
/**
* Base class for analyzers that can be attached to file_analysis::File objects.
*/
@ -25,6 +27,18 @@ public:
*/
virtual ~Analyzer();
/**
* Initializes the analyzer before input processing starts.
*/
virtual void Init()
{ };
/**
* Finishes the analyzer's operation after all input has been parsed.
*/
virtual void Done()
{ };
/**
* Subclasses may override this metod to receive file data non-sequentially.
* @param data points to start of a chunk of file data.
@ -72,6 +86,13 @@ public:
*/
file_analysis::Tag Tag() const { return tag; }
/**
* Returns the analyzer instance's internal ID. These IDs are unique
* across all analyzers instantiated and can thus be used to
* indentify a specific instance.
*/
ID GetID() const { return id; }
/**
* @return the AnalyzerArgs associated with the analyzer.
*/
@ -82,10 +103,19 @@ public:
*/
File* GetFile() const { return file; }
/**
* Sets the tag associated with the analyzer's type. Note that this
* can be called only right after construction, if the constructor
* did not receive a name or tag. The method cannot be used to change
* an existing tag.
*/
void SetAnalyzerTag(const file_analysis::Tag& tag);
protected:
/**
* Constructor. Only derived classes are meant to be instantiated.
* @param arg_tag the tag definining the analyzer's type.
* @param arg_args an \c AnalyzerArgs (script-layer type) value specifiying
* tunable options, if any, related to a particular analyzer type.
* @param arg_file the file to which the the analyzer is being attached.
@ -94,13 +124,35 @@ protected:
: tag(arg_tag),
args(arg_args->Ref()->AsRecordVal()),
file(arg_file)
{}
{
id = ++id_counter;
}
/**
* Constructor. Only derived classes are meant to be instantiated.
* As this version of the constructor does not receive a name or tag,
* SetAnalyzerTag() must be called before the instance can be used.
*
* @param arg_args an \c AnalyzerArgs (script-layer type) value specifiying
* tunable options, if any, related to a particular analyzer type.
* @param arg_file the file to which the the analyzer is being attached.
*/
Analyzer(RecordVal* arg_args, File* arg_file)
: tag(),
args(arg_args->Ref()->AsRecordVal()),
file(arg_file)
{
id = ++id_counter;
}
private:
ID id; /**< Unique instance ID. */
file_analysis::Tag tag; /**< The particular type of the analyzer instance. */
RecordVal* args; /**< \c AnalyzerArgs val gives tunable analyzer params. */
File* file; /**< The file to which the analyzer is attached. */
static ID id_counter;
};
} // namespace file_analysis

View file

@ -9,7 +9,10 @@ using namespace file_analysis;
static void analyzer_del_func(void* v)
{
delete (file_analysis::Analyzer*) v;
file_analysis::Analyzer* a = (file_analysis::Analyzer*)v;
a->Done();
delete a;
}
AnalyzerSet::AnalyzerSet(File* arg_file) : file(arg_file)
@ -98,6 +101,7 @@ bool AnalyzerSet::AddMod::Perform(AnalyzerSet* set)
}
set->Insert(a, key);
return true;
}
@ -124,7 +128,9 @@ bool AnalyzerSet::Remove(file_analysis::Tag tag, HashKey* key)
file_mgr->GetComponentName(tag),
file->GetID().c_str());
a->Done();
delete a;
return true;
}
@ -176,6 +182,8 @@ void AnalyzerSet::Insert(file_analysis::Analyzer* a, HashKey* key)
file_mgr->GetComponentName(a->Tag()), file->GetID().c_str());
analyzer_map.Insert(key, a);
delete key;
a->Init();
}
void AnalyzerSet::DrainModifications()

View file

@ -8,13 +8,15 @@
using namespace file_analysis;
Component::Component(const char* arg_name, factory_callback arg_factory)
Component::Component(const char* arg_name, factory_callback arg_factory, Tag::subtype_t subtype)
: plugin::Component(plugin::component::FILE_ANALYZER),
plugin::TaggedComponent<file_analysis::Tag>()
plugin::TaggedComponent<file_analysis::Tag>(subtype)
{
name = copy_string(arg_name);
canon_name = canonify_name(arg_name);
factory = arg_factory;
file_mgr->RegisterComponent(this, "ANALYZER_");
}
Component::Component(const Component& other)
@ -24,6 +26,8 @@ Component::Component(const Component& other)
name = copy_string(other.name);
canon_name = copy_string(other.canon_name);
factory = other.factory;
// TODO: Do we need the RegisterComponent() call here?
}
Component::~Component()

View file

@ -40,8 +40,14 @@ public:
* from file_analysis::Analyzer. This is typically a static \c
* Instatiate() method inside the class that just allocates and
* returns a new instance.
*
* @param subtype A subtype associated with this component that
* further distinguishes it. The subtype will be integrated into the
* analyzer::Tag that the manager associates with this analyzer, and
* analyzer instances can accordingly access it via analyzer::Tag().
* If not used, leave at zero.
*/
Component(const char* name, factory_callback factory);
Component(const char* name, factory_callback factory, Tag::subtype_t subtype = 0);
/**
* Copy constructor.

View file

@ -1,6 +1,7 @@
// See the file "COPYING" in the main distribution directory for copyright.
#include <string>
#include <algorithm>
#include "File.h"
#include "FileTimer.h"
@ -82,7 +83,7 @@ File::File(const string& file_id, Connection* conn, analyzer::Tag tag,
{
StaticInit();
DBG_LOG(DBG_FILE_ANALYSIS, "Creating new File object %s", file_id.c_str());
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Creating new File object", file_id.c_str());
val = new RecordVal(fa_file_type);
val->Assign(id_idx, new StringVal(file_id.c_str()));
@ -100,7 +101,7 @@ File::File(const string& file_id, Connection* conn, analyzer::Tag tag,
File::~File()
{
DBG_LOG(DBG_FILE_ANALYSIS, "Destroying File object %s", id.c_str());
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Destroying File object", id.c_str());
Unref(val);
// Queue may not be empty in the case where only content gaps were seen.
@ -229,6 +230,7 @@ void File::IncrementByteCount(uint64 size, int field_idx)
void File::SetTotalBytes(uint64 size)
{
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Total bytes %" PRIu64, id.c_str(), size);
val->Assign(total_bytes_idx, new Val(size, TYPE_COUNT));
}
@ -251,11 +253,17 @@ void File::ScheduleInactivityTimer() const
bool File::AddAnalyzer(file_analysis::Tag tag, RecordVal* args)
{
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Queuing addition of %s analyzer",
id.c_str(), file_mgr->GetComponentName(tag));
return done ? false : analyzers.QueueAdd(tag, args);
}
bool File::RemoveAnalyzer(file_analysis::Tag tag, RecordVal* args)
{
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Queuing remove of %s analyzer",
id.c_str(), file_mgr->GetComponentName(tag));
return done ? false : analyzers.QueueRemove(tag, args);
}
@ -284,16 +292,18 @@ bool File::DetectMIME(const u_char* data, uint64 len)
if ( mime )
{
// strip off charset
const char* mime_end = strchr(mime, ';');
if ( mime_end )
// strip off charset
val->Assign(mime_type_idx, new StringVal(mime_end - mime, mime));
else
val->Assign(mime_type_idx, new StringVal(mime));
StringVal* mime_val = mime_end ?
new StringVal(mime_end - mime, mime) :
new StringVal(mime);
val->Assign(mime_type_idx, mime_val);
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Detected MIME type %s", id.c_str(), mime_val->CheckString());
}
return mime;
return true;
}
void File::ReplayBOF()
@ -314,7 +324,6 @@ void File::ReplayBOF()
val->Assign(bof_buffer_idx, new StringVal(bs));
DetectMIME(bs->Bytes(), bs->Len());
FileEvent(file_new);
for ( size_t i = 0; i < bof_buffer.chunks.size(); ++i )
@ -333,6 +342,11 @@ void File::DataIn(const u_char* data, uint64 len, uint64 offset)
first_chunk = false;
}
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " bytes in at offset" PRIu64 "; %s [%s]",
id.c_str(), len, offset,
IsComplete() ? "complete" : "incomplete",
fmt_bytes((const char*) data, min((uint64)40, len)), len > 40 ? "..." : "");
file_analysis::Analyzer* a = 0;
IterCookie* c = analyzers.InitForIteration();
@ -367,6 +381,11 @@ void File::DataIn(const u_char* data, uint64 len)
missed_bof = false;
}
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] %" PRIu64 " bytes in; %s [%s]",
id.c_str(), len,
IsComplete() ? "complete" : "incomplete",
fmt_bytes((const char*) data, min((uint64)40, len)), len > 40 ? "..." : "");
file_analysis::Analyzer* a = 0;
IterCookie* c = analyzers.InitForIteration();
@ -391,6 +410,8 @@ void File::DataIn(const u_char* data, uint64 len)
void File::EndOfFile()
{
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] End of file", id.c_str());
if ( done )
return;
@ -417,6 +438,9 @@ void File::EndOfFile()
void File::Gap(uint64 offset, uint64 len)
{
DBG_LOG(DBG_FILE_ANALYSIS, "[%s] Gap of size %" PRIu64 " at offset %" PRIu64,
id.c_str(), offset, len);
analyzers.DrainModifications();
// If we were buffering the beginning of the file, a gap means we've got

View file

@ -229,7 +229,7 @@ protected:
* field in #val.
* @param data pointer to a chunk of file data.
* @param len number of bytes in the data chunk.
* @return whether mime type was available.
* @return true if mime type available.
*/
bool DetectMIME(const u_char* data, uint64 len);

View file

@ -12,10 +12,12 @@
#include "UID.h"
#include "plugin/Manager.h"
#include "analyzer/Manager.h"
using namespace file_analysis;
TableVal* Manager::disabled = 0;
TableType* Manager::tag_set_type = 0;
string Manager::salt;
Manager::Manager()
@ -27,15 +29,13 @@ Manager::Manager()
Manager::~Manager()
{
Terminate();
for ( MIMEMap::iterator i = mime_types.begin(); i != mime_types.end(); i++ )
delete i->second;
}
void Manager::InitPreScript()
{
std::list<Component*> analyzers = plugin_mgr->Components<Component>();
for ( std::list<Component*>::const_iterator i = analyzers.begin();
i != analyzers.end(); ++i )
RegisterComponent(*i, "ANALYZER_");
}
void Manager::InitPostScript()
@ -72,6 +72,7 @@ void Manager::SetHandle(const string& handle)
if ( handle.empty() )
return;
DBG_LOG(DBG_FILE_ANALYSIS, "Set current handle to %s", handle.c_str());
current_file_id = HashHandle(handle);
}
@ -205,6 +206,28 @@ bool Manager::AddAnalyzer(const string& file_id, file_analysis::Tag tag,
return file->AddAnalyzer(tag, args);
}
TableVal* Manager::AddAnalyzersForMIMEType(const string& file_id, const string& mtype,
RecordVal* args)
{
if ( ! tag_set_type )
tag_set_type = internal_type("files_tag_set")->AsTableType();
TableVal* sval = new TableVal(tag_set_type);
TagSet* l = LookupMIMEType(mtype, false);
if ( ! l )
return sval;
for ( TagSet::const_iterator i = l->begin(); i != l->end(); i++ )
{
file_analysis::Tag tag = *i;
if ( AddAnalyzer(file_id, tag, args) )
sval->Assign(tag.AsEnumVal(), 0);
}
return sval;
}
bool Manager::RemoveAnalyzer(const string& file_id, file_analysis::Tag tag,
RecordVal* args) const
{
@ -327,6 +350,9 @@ void Manager::GetFileHandle(analyzer::Tag tag, Connection* c, bool is_orig)
if ( ! get_file_handle )
return;
DBG_LOG(DBG_FILE_ANALYSIS, "Raise get_file_handle() for protocol analyzer %s",
analyzer_mgr->GetComponentName(tag));
EnumVal* tagval = tag.AsEnumVal();
Ref(tagval);
@ -376,5 +402,72 @@ Analyzer* Manager::InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const
return 0;
}
return c->Factory()(args, f);
DBG_LOG(DBG_FILE_ANALYSIS, "Instantiate analyzer %s for file %s",
GetComponentName(tag), f->id.c_str());
Analyzer* a = c->Factory()(args, f);
if ( ! a )
reporter->InternalError("file analyzer instantiation failed");
a->SetAnalyzerTag(tag);
return a;
}
Manager::TagSet* Manager::LookupMIMEType(const string& mtype, bool add_if_not_found)
{
MIMEMap::const_iterator i = mime_types.find(to_upper(mtype));
if ( i != mime_types.end() )
return i->second;
if ( ! add_if_not_found )
return 0;
TagSet* l = new TagSet;
mime_types.insert(std::make_pair(to_upper(mtype), l));
return l;
}
bool Manager::RegisterAnalyzerForMIMEType(EnumVal* tag, StringVal* mtype)
{
Component* p = Lookup(tag);
if ( ! p )
return false;
return RegisterAnalyzerForMIMEType(p->Tag(), mtype->CheckString());
}
bool Manager::RegisterAnalyzerForMIMEType(Tag tag, const string& mtype)
{
TagSet* l = LookupMIMEType(mtype, true);
DBG_LOG(DBG_FILE_ANALYSIS, "Register analyzer %s for MIME type %s",
GetComponentName(tag), mtype.c_str());
l->insert(tag);
return true;
}
bool Manager::UnregisterAnalyzerForMIMEType(EnumVal* tag, StringVal* mtype)
{
Component* p = Lookup(tag);
if ( ! p )
return false;
return UnregisterAnalyzerForMIMEType(p->Tag(), mtype->CheckString());
}
bool Manager::UnregisterAnalyzerForMIMEType(Tag tag, const string& mtype)
{
TagSet* l = LookupMIMEType(mtype, true);
DBG_LOG(DBG_FILE_ANALYSIS, "Unregister analyzer %s for MIME type %s",
GetComponentName(tag), mtype.c_str());
l->erase(tag);
return true;
}

View file

@ -198,6 +198,18 @@ public:
bool AddAnalyzer(const string& file_id, file_analysis::Tag tag,
RecordVal* args) const;
/**
* Queue attachment of an all analyzers associated with a given MIME
* type to the file identifier.
*
* @param file_id the file identifier/hash.
* @param mtype the MIME type; comparisions will be performanced case-insensitive.
* @param args a \c AnalyzerArgs value which describes a file analyzer.
* @return A ref'ed \c set[Tag] with all added analyzers.
*/
TableVal* AddAnalyzersForMIMEType(const string& file_id, const string& mtype,
RecordVal* args);
/**
* Queue removal of an analyzer for a given file identifier.
* @param file_id the file identifier/hash.
@ -224,6 +236,62 @@ public:
*/
Analyzer* InstantiateAnalyzer(Tag tag, RecordVal* args, File* f) const;
/**
* Registers a MIME type for an analyzer. Once registered, files of
* that MIME type will automatically get a corresponding analyzer
* assigned.
*
* @param tag The analyzer's tag as an enum of script type \c
* Files::Tag.
*
* @param mtype The MIME type. It will be matched case-insenistive.
*
* @return True if successful.
*/
bool RegisterAnalyzerForMIMEType(EnumVal* tag, StringVal* mtype);
/**
* Registers a MIME type for an analyzer. Once registered, files of
* that MIME type will automatically get a corresponding analyzer
* assigned.
*
* @param tag The analyzer's tag as an enum of script type \c
* Files::Tag.
*
* @param mtype The MIME type. It will be matched case-insenistive.
*
* @return True if successful.
*/
bool RegisterAnalyzerForMIMEType(Tag tag, const string& mtype);
/**
* Unregisters a MIME type for an analyzer.
*
* @param tag The analyzer's tag as an enum of script type \c
* Files::Tag.
*
* @param mtype The MIME type. It will be matched case-insenistive.
*
* @return True if successful (incl. when the type wasn't actually
* registered for the analyzer).
*
*/
bool UnregisterAnalyzerForMIMEType(EnumVal* tag, StringVal* mtype);
/**
* Unregisters a MIME type for an analyzer.
*
* @param tag The analyzer's tag as an enum of script type \c
* Files::Tag.
*
* @param mtype The MIME type. It will be matched case-insenistive.
*
* @return True if successful (incl. when the type wasn't actually
* registered for the analyzer).
*
*/
bool UnregisterAnalyzerForMIMEType(Tag tag, const string& mtype);
protected:
friend class FileTimer;
@ -297,12 +365,18 @@ protected:
static bool IsDisabled(analyzer::Tag tag);
private:
typedef set<Tag> TagSet;
typedef map<string, TagSet*> MIMEMap;
TagSet* LookupMIMEType(const string& mtype, bool add_if_not_found);
IDMap id_map; /**< Map file ID to file_analysis::File records. */
IDSet ignored; /**< Ignored files. Will be finally removed on EOF. */
string current_file_id; /**< Hash of what get_file_handle event sets. */
MIMEMap mime_types;/**< Mapping of MIME types to analyzers. */
static TableVal* disabled; /**< Table of disabled analyzers. */
static TableType* tag_set_type; /**< Type for set[tag]. */
static string salt; /**< A salt added to file handles before hashing. */
};

View file

@ -26,6 +26,16 @@ function Files::__add_analyzer%(file_id: string, tag: Files::Tag, args: any%): b
return new Val(result, TYPE_BOOL);
%}
## :bro:see:`Files::add_analyzers_for_mime_type`.
function Files::__add_analyzers_for_mime_type%(file_id: string, mtype: string, args: any%): files_tag_set
%{
using BifType::Record::Files::AnalyzerArgs;
RecordVal* rv = args->AsRecordVal()->CoerceTo(AnalyzerArgs);
Val* analyzers = file_mgr->AddAnalyzersForMIMEType(file_id->CheckString(), mtype->CheckString(), rv);
Unref(rv);
return analyzers;
%}
## :bro:see:`Files::remove_analyzer`.
function Files::__remove_analyzer%(file_id: string, tag: Files::Tag, args: any%): bool
%{