Add extract_limit_includes_missing option for file extraction

Setting this option to false means that missing bytes in files are not counted
towards the extraction limit, which allows extracting data up to the desired
limit even when partial files are written.

When missing bytes are encountered, files are now written as sparse
files.

Using this option requires the underlying storage and utilities to support
sparse files.

(cherry picked from commit afa6f3a0d3b8db1ec5b5e82d26225504c2891089)
This commit is contained in:
Johanna Amann 2023-09-06 11:00:05 +01:00 committed by Tim Wojtulewicz
parent b2c40a22cb
commit f4d0fdcd5c
14 changed files with 151 additions and 18 deletions

View file

@ -13,9 +13,10 @@ namespace zeek::file_analysis::detail
{
Extract::Extract(RecordValPtr args, file_analysis::File* file, const std::string& arg_filename,
uint64_t arg_limit)
uint64_t arg_limit, bool arg_limit_includes_missing)
: file_analysis::Analyzer(file_mgr->GetComponentTag("EXTRACT"), std::move(args), file),
filename(arg_filename), limit(arg_limit), depth(0)
filename(arg_filename), limit(arg_limit), written(0),
limit_includes_missing(arg_limit_includes_missing)
{
char buf[128];
file_stream = fopen(filename.data(), "wb");
@ -60,14 +61,25 @@ file_analysis::Analyzer* Extract::Instantiate(RecordValPtr args, file_analysis::
{
const auto& fname = get_extract_field_val(args, "extract_filename");
const auto& limit = get_extract_field_val(args, "extract_limit");
const auto& extract_limit_includes_missing = get_extract_field_val(
args, "extract_limit_includes_missing");
if ( ! fname || ! limit )
if ( ! fname || ! limit || ! extract_limit_includes_missing )
return nullptr;
return new Extract(std::move(args), file, fname->AsString()->CheckString(), limit->AsCount());
return new Extract(std::move(args), file, fname->AsString()->CheckString(), limit->AsCount(),
extract_limit_includes_missing->AsBool());
}
static bool check_limit_exceeded(uint64_t lim, uint64_t depth, uint64_t len, uint64_t* n)
/**
* Check if we are exceeding the write limit with this write.
* @param lim size limit
* @param written how many bytes we have written so far
* @param len length of the write
* @param n number of bytes to write to keep within limit
* @returns true if limit exceeded
*/
static bool check_limit_exceeded(uint64_t lim, uint64_t written, uint64_t len, uint64_t* n)
{
if ( lim == 0 )
{
@ -75,14 +87,14 @@ static bool check_limit_exceeded(uint64_t lim, uint64_t depth, uint64_t len, uin
return false;
}
if ( depth >= lim )
if ( written >= lim )
{
*n = 0;
return true;
}
else if ( depth + len > lim )
else if ( written + len > lim )
{
*n = lim - depth;
*n = lim - written;
return true;
}
else
@ -99,7 +111,7 @@ bool Extract::DeliverStream(const u_char* data, uint64_t len)
return false;
uint64_t towrite = 0;
bool limit_exceeded = check_limit_exceeded(limit, depth, len, &towrite);
bool limit_exceeded = check_limit_exceeded(limit, written, len, &towrite);
if ( limit_exceeded && file_extraction_limit )
{
@ -108,7 +120,7 @@ bool Extract::DeliverStream(const u_char* data, uint64_t len)
{f->ToVal(), GetArgs(), val_mgr->Count(limit), val_mgr->Count(len)});
// Limit may have been modified by a BIF, re-check it.
limit_exceeded = check_limit_exceeded(limit, depth, len, &towrite);
limit_exceeded = check_limit_exceeded(limit, written, len, &towrite);
}
char buf[128];
@ -124,7 +136,7 @@ bool Extract::DeliverStream(const u_char* data, uint64_t len)
return false;
}
depth += towrite;
written += towrite;
}
// Assume we may not try to write anything more for a while due to reaching
@ -145,7 +157,30 @@ bool Extract::Undelivered(uint64_t offset, uint64_t len)
if ( ! file_stream )
return false;
if ( depth == offset )
if ( limit_includes_missing )
{
uint64_t towrite = 0;
bool limit_exceeded = check_limit_exceeded(limit, written, len, &towrite);
// if the limit is exceeded, we have to raise the event. This gives scripts the opportunity
// to raise the limit.
if ( limit_exceeded && file_extraction_limit )
{
file_analysis::File* f = GetFile();
f->FileEvent(file_extraction_limit,
{f->ToVal(), GetArgs(), val_mgr->Count(limit), val_mgr->Count(len)});
// we have to check again if the limit is still exceeded
limit_exceeded = check_limit_exceeded(limit, written, len, &towrite);
}
// if the limit is exceeded, abort and don't do anything - no reason to seek.
if ( limit_exceeded )
return false;
// if we don't skip holes, count this hole against the write limit
written += len;
}
if ( fseek(file_stream, len + offset, SEEK_SET) != 0 )
{
char* tmp = new char[len]();
@ -161,7 +196,7 @@ bool Extract::Undelivered(uint64_t offset, uint64_t len)
}
delete[] tmp;
depth += len;
written += len;
}
return true;

View file

@ -65,15 +65,17 @@ protected:
* @param arg_filename a file system path which specifies the local file
* to which the contents of the file will be extracted/written.
* @param arg_limit the maximum allowed file size.
* @param arg_limit_includes_missing missing bytes count towards limit if true.
*/
Extract(RecordValPtr args, file_analysis::File* file, const std::string& arg_filename,
uint64_t arg_limit);
uint64_t arg_limit, bool arg_limit_includes_missing);
private:
std::string filename;
FILE* file_stream;
uint64_t limit;
uint64_t depth;
uint64_t limit; // the file extraction limit
uint64_t written; // how many bytes we have written so far
bool limit_includes_missing; // do count missing bytes against limit if true
};
} // namespace zeek::file_analysis::detail