Extend file analysis API to allow file ID caching, adapt HTTP to use it.

This allows an analyzer to either provide file IDs associated with some
file content or to cache a file ID that was already determined by
script-layer logic so that subsequent calls to the file analysis
interface can bypass costly detours through the script layer.  This can
yield a decent performance improvement for analyzers that are able to
take advantage of it and deal with streaming content (like HTTP).
This commit is contained in:
Jon Siwek 2014-01-29 15:34:24 -06:00
parent 55a8725ce2
commit 1842d324cb
4 changed files with 95 additions and 40 deletions

View file

@ -242,10 +242,11 @@ int HTTP_Entity::Undelivered(int64_t len)
if ( end_of_data && in_header ) if ( end_of_data && in_header )
return 0; return 0;
file_mgr->Gap(body_length, len, cached_file_id = file_mgr->Gap(body_length, len,
http_message->MyHTTP_Analyzer()->GetAnalyzerTag(), http_message->MyHTTP_Analyzer()->GetAnalyzerTag(),
http_message->MyHTTP_Analyzer()->Conn(), http_message->MyHTTP_Analyzer()->Conn(),
http_message->IsOrig()); http_message->IsOrig(),
cached_file_id);
if ( chunked_transfer_state != NON_CHUNKED_TRANSFER ) if ( chunked_transfer_state != NON_CHUNKED_TRANSFER )
{ {
@ -314,15 +315,18 @@ void HTTP_Entity::SubmitData(int len, const char* buf)
else else
{ {
if ( send_size && content_length > 0 ) if ( send_size && content_length > 0 )
file_mgr->SetSize(content_length, cached_file_id = file_mgr->SetSize(content_length,
http_message->MyHTTP_Analyzer()->GetAnalyzerTag(), http_message->MyHTTP_Analyzer()->GetAnalyzerTag(),
http_message->MyHTTP_Analyzer()->Conn(), http_message->MyHTTP_Analyzer()->Conn(),
http_message->IsOrig()); http_message->IsOrig(),
cached_file_id);
file_mgr->DataIn(reinterpret_cast<const u_char*>(buf), len, cached_file_id = file_mgr->DataIn(reinterpret_cast<const u_char*>(buf),
len,
http_message->MyHTTP_Analyzer()->GetAnalyzerTag(), http_message->MyHTTP_Analyzer()->GetAnalyzerTag(),
http_message->MyHTTP_Analyzer()->Conn(), http_message->MyHTTP_Analyzer()->Conn(),
http_message->IsOrig()); http_message->IsOrig(),
cached_file_id);
} }
send_size = false; send_size = false;

View file

@ -64,6 +64,7 @@ protected:
uint64_t offset; uint64_t offset;
int64_t instance_length; // total length indicated by content-range int64_t instance_length; // total length indicated by content-range
bool send_size; // whether to send size indication to FAF bool send_size; // whether to send size indication to FAF
std::string cached_file_id;
MIME_Entity* NewChildEntity() { return new HTTP_Entity(http_message, this, 1); } MIME_Entity* NewChildEntity() { return new HTTP_Entity(http_message, this, 1); }

View file

@ -75,36 +75,47 @@ void Manager::SetHandle(const string& handle)
current_file_id = HashHandle(handle); current_file_id = HashHandle(handle);
} }
void Manager::DataIn(const u_char* data, uint64 len, uint64 offset, string Manager::DataIn(const u_char* data, uint64 len, uint64 offset,
analyzer::Tag tag, Connection* conn, bool is_orig) analyzer::Tag tag, Connection* conn, bool is_orig,
const string& cached_id)
{ {
GetFileHandle(tag, conn, is_orig); string id = cached_id.empty() ? GetFileID(tag, conn, is_orig) : cached_id;
File* file = GetFile(current_file_id, conn, tag, is_orig); File* file = GetFile(id, conn, tag, is_orig);
if ( ! file ) if ( ! file )
return; return "";
file->DataIn(data, len, offset); file->DataIn(data, len, offset);
if ( file->IsComplete() ) if ( file->IsComplete() )
{
RemoveFile(file->GetID()); RemoveFile(file->GetID());
return "";
}
return id;
} }
void Manager::DataIn(const u_char* data, uint64 len, analyzer::Tag tag, string Manager::DataIn(const u_char* data, uint64 len, analyzer::Tag tag,
Connection* conn, bool is_orig) Connection* conn, bool is_orig, const string& cached_id)
{ {
GetFileHandle(tag, conn, is_orig); string id = cached_id.empty() ? GetFileID(tag, conn, is_orig) : cached_id;
// Sequential data input shouldn't be going over multiple conns, so don't // Sequential data input shouldn't be going over multiple conns, so don't
// do the check to update connection set. // do the check to update connection set.
File* file = GetFile(current_file_id, conn, tag, is_orig, false); File* file = GetFile(id, conn, tag, is_orig, false);
if ( ! file ) if ( ! file )
return; return "";
file->DataIn(data, len); file->DataIn(data, len);
if ( file->IsComplete() ) if ( file->IsComplete() )
{
RemoveFile(file->GetID()); RemoveFile(file->GetID());
return "";
}
return id;
} }
void Manager::DataIn(const u_char* data, uint64 len, const string& file_id, void Manager::DataIn(const u_char* data, uint64 len, const string& file_id,
@ -133,8 +144,7 @@ void Manager::EndOfFile(analyzer::Tag tag, Connection* conn)
void Manager::EndOfFile(analyzer::Tag tag, Connection* conn, bool is_orig) void Manager::EndOfFile(analyzer::Tag tag, Connection* conn, bool is_orig)
{ {
// Don't need to create a file if we're just going to remove it right away. // Don't need to create a file if we're just going to remove it right away.
GetFileHandle(tag, conn, is_orig); RemoveFile(GetFileID(tag, conn, is_orig));
RemoveFile(current_file_id);
} }
void Manager::EndOfFile(const string& file_id) void Manager::EndOfFile(const string& file_id)
@ -142,31 +152,37 @@ void Manager::EndOfFile(const string& file_id)
RemoveFile(file_id); RemoveFile(file_id);
} }
void Manager::Gap(uint64 offset, uint64 len, analyzer::Tag tag, string Manager::Gap(uint64 offset, uint64 len, analyzer::Tag tag,
Connection* conn, bool is_orig) Connection* conn, bool is_orig, const string& cached_id)
{ {
GetFileHandle(tag, conn, is_orig); string id = cached_id.empty() ? GetFileID(tag, conn, is_orig) : cached_id;
File* file = GetFile(current_file_id, conn, tag, is_orig); File* file = GetFile(id, conn, tag, is_orig);
if ( ! file ) if ( ! file )
return; return "";
file->Gap(offset, len); file->Gap(offset, len);
return id;
} }
void Manager::SetSize(uint64 size, analyzer::Tag tag, Connection* conn, string Manager::SetSize(uint64 size, analyzer::Tag tag, Connection* conn,
bool is_orig) bool is_orig, const string& cached_id)
{ {
GetFileHandle(tag, conn, is_orig); string id = cached_id.empty() ? GetFileID(tag, conn, is_orig) : cached_id;
File* file = GetFile(current_file_id, conn, tag, is_orig); File* file = GetFile(id, conn, tag, is_orig);
if ( ! file ) if ( ! file )
return; return "";
file->SetTotalBytes(size); file->SetTotalBytes(size);
if ( file->IsComplete() ) if ( file->IsComplete() )
{
RemoveFile(file->GetID()); RemoveFile(file->GetID());
return "";
}
return id;
} }
bool Manager::SetTimeoutInterval(const string& file_id, double interval) const bool Manager::SetTimeoutInterval(const string& file_id, double interval) const
@ -317,15 +333,15 @@ bool Manager::IsIgnored(const string& file_id)
return ignored.find(file_id) != ignored.end(); return ignored.find(file_id) != ignored.end();
} }
void Manager::GetFileHandle(analyzer::Tag tag, Connection* c, bool is_orig) string Manager::GetFileID(analyzer::Tag tag, Connection* c, bool is_orig)
{ {
current_file_id.clear(); current_file_id.clear();
if ( IsDisabled(tag) ) if ( IsDisabled(tag) )
return; return "";
if ( ! get_file_handle ) if ( ! get_file_handle )
return; return "";
EnumVal* tagval = tag.AsEnumVal(); EnumVal* tagval = tag.AsEnumVal();
Ref(tagval); Ref(tagval);
@ -337,6 +353,7 @@ void Manager::GetFileHandle(analyzer::Tag tag, Connection* c, bool is_orig)
mgr.QueueEvent(get_file_handle, vl); mgr.QueueEvent(get_file_handle, vl);
mgr.Drain(); // need file handle immediately so we don't have to buffer data mgr.Drain(); // need file handle immediately so we don't have to buffer data
return current_file_id;
} }
bool Manager::IsDisabled(analyzer::Tag tag) bool Manager::IsDisabled(analyzer::Tag tag)

View file

@ -82,9 +82,17 @@ public:
* @param conn network connection over which the file data is transferred. * @param conn network connection over which the file data is transferred.
* @param is_orig true if the file is being sent from connection originator * @param is_orig true if the file is being sent from connection originator
* or false if is being sent in the opposite direction. * or false if is being sent in the opposite direction.
* @param cached_file_id may be set to a previous return value in order to
* bypass costly file handle lookups.
* @return a unique file ID string which, in certain contexts, may be
* cached and passed back in to a subsequent function call in order
* to avoid costly file handle lookups (which have to go through
* the \c get_file_handle script-layer event). An empty string
* indicates the associated file is not going to be analyzed further.
*/ */
void DataIn(const u_char* data, uint64 len, uint64 offset, std::string DataIn(const u_char* data, uint64 len, uint64 offset,
analyzer::Tag tag, Connection* conn, bool is_orig); analyzer::Tag tag, Connection* conn, bool is_orig,
const std::string& cached_file_id = "");
/** /**
* Pass in sequential file data. * Pass in sequential file data.
@ -94,9 +102,17 @@ public:
* @param conn network connection over which the file data is transferred. * @param conn network connection over which the file data is transferred.
* @param is_orig true if the file is being sent from connection originator * @param is_orig true if the file is being sent from connection originator
* or false if is being sent in the opposite direction. * or false if is being sent in the opposite direction.
* @param cached_file_id may be set to a previous return value in order to
* bypass costly file handle lookups.
* @return a unique file ID string which, in certain contexts, may be
* cached and passed back in to a subsequent function call in order
* to avoid costly file handle lookups (which have to go through
* the \c get_file_handle script-layer event). An empty string
* indicates the associated file is not going to be analyzed further.
*/ */
void DataIn(const u_char* data, uint64 len, analyzer::Tag tag, std::string DataIn(const u_char* data, uint64 len, analyzer::Tag tag,
Connection* conn, bool is_orig); Connection* conn, bool is_orig,
const std::string& cached_file_id = "");
/** /**
* Pass in sequential file data from external source (e.g. input framework). * Pass in sequential file data from external source (e.g. input framework).
@ -140,9 +156,17 @@ public:
* @param conn network connection over which the file data is transferred. * @param conn network connection over which the file data is transferred.
* @param is_orig true if the file is being sent from connection originator * @param is_orig true if the file is being sent from connection originator
* or false if is being sent in the opposite direction. * or false if is being sent in the opposite direction.
* @param cached_file_id may be set to a previous return value in order to
* bypass costly file handle lookups.
* @return a unique file ID string which, in certain contexts, may be
* cached and passed back in to a subsequent function call in order
* to avoid costly file handle lookups (which have to go through
* the \c get_file_handle script-layer event). An empty string
* indicates the associated file is not going to be analyzed further.
*/ */
void Gap(uint64 offset, uint64 len, analyzer::Tag tag, Connection* conn, std::string Gap(uint64 offset, uint64 len, analyzer::Tag tag,
bool is_orig); Connection* conn, bool is_orig,
const std::string& cached_file_id = "");
/** /**
* Provide the expected number of bytes that comprise a file. * Provide the expected number of bytes that comprise a file.
@ -151,9 +175,16 @@ public:
* @param conn network connection over which the file data is transferred. * @param conn network connection over which the file data is transferred.
* @param is_orig true if the file is being sent from connection originator * @param is_orig true if the file is being sent from connection originator
* or false if is being sent in the opposite direction. * or false if is being sent in the opposite direction.
* @param cached_file_id may be set to a previous return value in order to
* bypass costly file handle lookups.
* @return a unique file ID string which, in certain contexts, may be
* cached and passed back in to a subsequent function call in order
* to avoid costly file handle lookups (which have to go through
* the \c get_file_handle script-layer event). An empty string
* indicates the associated file is not going to be analyzed further.
*/ */
void SetSize(uint64 size, analyzer::Tag tag, Connection* conn, std::string SetSize(uint64 size, analyzer::Tag tag, Connection* conn,
bool is_orig); bool is_orig, const std::string& cached_file_id = "");
/** /**
* Starts ignoring a file, which will finally be removed from internal * Starts ignoring a file, which will finally be removed from internal
@ -283,8 +314,10 @@ protected:
* @param conn network connection over which the file is transferred. * @param conn network connection over which the file is transferred.
* @param is_orig true if the file is being sent from connection originator * @param is_orig true if the file is being sent from connection originator
* or false if is being sent in the opposite direction. * or false if is being sent in the opposite direction.
* @return #current_file_id, which is a hash of a unique file handle string
* set by a \c get_file_handle event handler.
*/ */
void GetFileHandle(analyzer::Tag tag, Connection* c, bool is_orig); std::string GetFileID(analyzer::Tag tag, Connection* c, bool is_orig);
/** /**
* Check if analysis is available for files transferred over a given * Check if analysis is available for files transferred over a given