Merge branch 'topic/awelzel/3957-raw-reader-spinning'

* topic/awelzel/3957-raw-reader-spinning:
  input/Raw: Rework GetLine()

(cherry picked from commit 2a23e9fc19)
This commit is contained in:
Christian Kreibich 2024-10-09 14:26:35 -07:00 committed by Tim Wojtulewicz
parent f5fefd17df
commit 300b7a11ac
12 changed files with 292 additions and 27 deletions

10
CHANGES
View file

@ -1,3 +1,13 @@
7.0.3-10 | 2024-11-14 11:30:00 -0700
* GH-3957: input/Raw: Rework GetLine() (Arne Welzel, Corelight)
(cherry picked from commit 2a23e9fc1962419e41133689c2a682455d24e35e)
* GH-215: POP3: Rework unbounded pending command fix (Arne Welzel, Corelight)
(cherry picked from commit 2a23e9fc1962419e41133689c2a682455d24e35e)
7.0.3-9 | 2024-11-14 10:21:55 -0700
* import of GH-4022 BTest additions (Vern Paxson, Corelight)

View file

@ -1 +1 @@
7.0.3-9
7.0.3-10

View file

@ -46,6 +46,7 @@ Raw::Raw(ReaderFrontend* frontend) : ReaderBackend(frontend), file(nullptr, fclo
sep_length = BifConst::InputRaw::record_separator->Len();
bufpos = 0;
bufsize = 0;
stdin_fileno = fileno(stdin);
stdout_fileno = fileno(stdout);
@ -420,59 +421,74 @@ bool Raw::DoInit(const ReaderInfo& info, int num_fields, const Field* const* fie
int64_t Raw::GetLine(FILE* arg_file) {
errno = 0;
int pos = 0; // strstr_n only works on ints - so no use to use something different here
int offset = 0;
if ( ! buf )
if ( ! buf ) {
buf = std::unique_ptr<char[]>(new char[block_size]);
int repeats = 1;
bufpos = 0;
bufsize = block_size;
}
for ( ;; ) {
size_t readbytes = fread(buf.get() + bufpos + offset, 1, block_size - bufpos, arg_file);
pos += bufpos + readbytes;
// printf("Pos: %d\n", pos);
bufpos = offset = 0; // read full block size in next read...
size_t readbytes = fread(buf.get() + bufpos, 1, bufsize - bufpos, arg_file);
if ( pos == 0 && errno != 0 )
bufpos = bufpos + readbytes;
// Nothing in the buffer and errno set, yield.
if ( bufpos == 0 && errno != 0 )
break;
// researching everything each time is a bit... cpu-intensive. But otherwise we have
// to deal with situations where the separator is multi-character and split over multiple
// reads...
int found = util::strstr_n(pos, (unsigned char*)buf.get(), separator.size(), (unsigned char*)separator.c_str());
//
// memmem() would be more appropriate, but not available on Windows.
int found = util::strstr_n(bufpos, reinterpret_cast<u_char*>(buf.get()), separator.size(),
reinterpret_cast<const u_char*>(separator.c_str()));
if ( found == -1 ) {
// we did not find it and have to search again in the next try. resize buffer....
// we did not find it and have to search again in the next try.
// but first check if we encountered the file end - because if we did this was it.
if ( feof(arg_file) != 0 ) {
if ( pos == 0 )
if ( bufpos == 0 )
return -1; // signal EOF - and that we had no more data.
else {
outbuf = std::move(buf); // buf is null after this
return pos;
return bufpos; // flush out remaining buffered data as line
}
}
repeats++;
// bah, we cannot use realloc because we would have to change the delete in the manager
// to a free.
std::unique_ptr<char[]> newbuf = std::unique_ptr<char[]>(new char[block_size * repeats]);
memcpy(newbuf.get(), buf.get(), block_size * (repeats - 1));
// No separator found and buffer full, realloc and retry reading more right away.
if ( bufpos == bufsize ) {
std::unique_ptr<char[]> newbuf = std::unique_ptr<char[]>(new char[bufsize + block_size]);
memcpy(newbuf.get(), buf.get(), bufsize);
buf = std::move(newbuf);
offset = block_size * (repeats - 1);
bufsize = bufsize + block_size;
}
else {
// Short or empty read, some data in the buffer, but no separator found
// and also not EOF: This is likely reading from a pipe where the separator
// wasn't yet produced. Yield to retry on the next heartbeat.
return -2;
}
}
else {
size_t sep_idx = static_cast<size_t>(found);
assert(sep_idx <= bufsize - sep_length);
size_t remaining = bufpos - sep_idx - sep_length;
outbuf = std::move(buf);
if ( found < pos ) {
if ( remaining > 0 ) {
// we have leftovers. copy them into the buffer for the next line
assert(remaining <= block_size);
buf = std::unique_ptr<char[]>(new char[block_size]);
memcpy(buf.get(), outbuf.get() + found + sep_length, pos - found - sep_length);
bufpos = pos - found - sep_length;
bufpos = remaining;
bufsize = block_size;
memcpy(buf.get(), outbuf.get() + sep_idx + sep_length, remaining);
}
return found;
return sep_idx;
}
}

View file

@ -58,7 +58,8 @@ private:
std::string separator;
unsigned int sep_length; // length of the separator
int bufpos;
size_t bufpos; // Where in buf to read more data.
size_t bufsize; // Currently allocated size of buf.
std::unique_ptr<char[]> buf;
std::unique_ptr<char[]> outbuf;

View file

@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
Input::EVENT_NEW, aaa
Input::EVENT_NEW, bbb
Input::EVENT_NEW, final

View file

@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
Input::EVENT_NEW, aaa
Input::EVENT_NEW, bbb
Input::EVENT_NEW, final

View file

@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
Input::EVENT_NEW, 24612, binary start\x00\x00\x00\x00, \x00\x00\x00\x00\x00binary done
Input::EVENT_NEW, 3, ccc, ccc
Input::EVENT_NEW, 5, final, final

View file

@ -0,0 +1,4 @@
### BTest baseline data generated by btest-diff. Do not edit. Use "btest -U/-u" to update. Requires BTest >= 0.63.
Input::EVENT_NEW, aaa-bbb-ccc
Input::EVENT_NEW, aaa-bbb-ccc
Input::EVENT_NEW, final

View file

@ -0,0 +1,52 @@
# @TEST-DOC: Launching a program that produces output slowly and strangely separated.
# @TEST-EXEC: chmod +x run.sh
# @TEST-EXEC: btest-bg-run zeek zeek -b %INPUT
# @TEST-EXEC: btest-bg-wait 10
# @TEST-EXEC: btest-diff zeek/.stdout
redef exit_only_after_terminate = T;
redef Threading::heartbeat_interval = 0.01sec;
@TEST-START-FILE run.sh
#!/usr/bin/env bash
echo -e -n "aaa\nb"
sleep 0.1
echo -e -n "bb\nfi"
sleep 0.1
echo "nal"
sleep infinity
@TEST-END-FILE
module A;
type Val: record {
s: string;
};
global lines = 0;
event one_line(description: Input::EventDescription, tpe: Input::Event, s: string)
{
print tpe, s;
++lines;
if ( lines == 3 )
{
Input::remove("input");
terminate();
}
}
event zeek_init()
{
Input::add_event([
$name="run",
$source="../run.sh |",
$reader=Input::READER_RAW,
$mode=Input::STREAM,
$fields=Val,
$ev=one_line, $want_record=F,
]);
}

View file

@ -0,0 +1,53 @@
# @TEST-DOC: Launching a program that doesn't end it's final line with a \n
# @TEST-EXEC: chmod +x run.sh
# @TEST-EXEC: btest-bg-run zeek zeek -b %INPUT
# @TEST-EXEC: btest-bg-wait 10
# @TEST-EXEC: btest-diff zeek/.stdout
redef exit_only_after_terminate = T;
redef Threading::heartbeat_interval = 0.01sec;
@TEST-START-FILE run.sh
#!/usr/bin/env bash
sleep 0.1
echo "aaa"
sleep 0.1
echo "bbb"
sleep 0.1
echo -n "final"
sleep 0.1
exit 0
@TEST-END-FILE
module A;
type Val: record {
s: string;
};
global lines = 0;
event one_line(description: Input::EventDescription, tpe: Input::Event, s: string)
{
print tpe, s;
++lines;
if ( lines == 3 )
{
Input::remove("input");
terminate();
}
}
event zeek_init()
{
Input::add_event([
$name="run",
$source="../run.sh |",
$reader=Input::READER_RAW,
$mode=Input::STREAM,
$fields=Val,
$ev=one_line, $want_record=F,
]);
}

View file

@ -0,0 +1,62 @@
# @TEST-DOC: Launching a program that produces output slowly and exercises buffering.
# @TEST-EXEC: chmod +x run.sh
# @TEST-EXEC: btest-bg-run zeek zeek -b %INPUT
# @TEST-EXEC: btest-bg-wait 10
# @TEST-EXEC: btest-diff zeek/.stdout
redef exit_only_after_terminate = T;
redef Threading::heartbeat_interval = 0.01sec;
@TEST-START-FILE run.sh
#!/usr/bin/env bash
sleep 0.1
echo -n "binary start"
sleep 0.1
dd if=/dev/zero bs=1 count=8192
sleep 0.1
echo -n "binary middle"
sleep 0.1
dd if=/dev/zero bs=1 count=8192
sleep 0.1
dd if=/dev/zero bs=1 count=8192
sleep 0.1
echo "binary done"
sleep 0.1
echo "ccc"
sleep 0.1
echo "final"
sleep infinity
@TEST-END-FILE
module A;
type Val: record {
s: string;
};
global lines = 0;
event one_line(description: Input::EventDescription, tpe: Input::Event, s: string)
{
print tpe,|s|, s[:16], s[-16:];
++lines;
if ( lines == 3 )
{
Input::remove("input");
terminate();
}
}
event zeek_init()
{
Input::add_event([
$name="run",
$source="../run.sh |",
$reader=Input::READER_RAW,
$mode=Input::STREAM,
$fields=Val,
$ev=one_line, $want_record=F,
]);
}

View file

@ -0,0 +1,55 @@
# @TEST-DOC: Launching a program that produces output slowly puts the raw reader into an endless loop.
# @TEST-EXEC: chmod +x run.sh
# @TEST-EXEC: btest-bg-run zeek zeek -b %INPUT
# @TEST-EXEC: btest-bg-wait 10
# @TEST-EXEC: btest-diff zeek/.stdout
redef exit_only_after_terminate = T;
redef Threading::heartbeat_interval = 0.01sec;
@TEST-START-FILE run.sh
#!/usr/bin/env bash
sleep 0.1
echo -n "aaa-"
sleep 0.1
echo -n "bbb-"
sleep 0.1
echo "ccc"
sleep 0.1
echo "aaa-bbb-ccc"
echo "final"
sleep infinity
@TEST-END-FILE
module A;
type Val: record {
s: string;
};
global lines = 0;
event one_line(description: Input::EventDescription, tpe: Input::Event, s: string)
{
print tpe, s;
++lines;
if ( lines == 3 )
{
Input::remove("input");
terminate();
}
}
event zeek_init()
{
Input::add_event([
$name="run",
$source="../run.sh |",
$reader=Input::READER_RAW,
$mode=Input::STREAM,
$fields=Val,
$ev=one_line, $want_record=F,
]);
}