mirror of
https://github.com/zeek/zeek.git
synced 2025-10-02 06:38:20 +00:00
525 lines
11 KiB
C++
525 lines
11 KiB
C++
// See the file "COPYING" in the main distribution directory for copyright.
|
|
|
|
#include "zeek-config.h"
|
|
#include "zeek/RE.h"
|
|
|
|
#include <stdlib.h>
|
|
#include <utility>
|
|
|
|
#include "zeek/DFA.h"
|
|
#include "zeek/CCL.h"
|
|
#include "zeek/EquivClass.h"
|
|
#include "zeek/Reporter.h"
|
|
#include "zeek/ZeekString.h"
|
|
|
|
zeek::detail::CCL* zeek::detail::curr_ccl = nullptr;
|
|
zeek::detail::CCL*& curr_ccl = zeek::detail::curr_ccl;
|
|
zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr;
|
|
zeek::detail::Specific_RE_Matcher*& rem = zeek::detail::rem;
|
|
zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr;
|
|
zeek::detail::NFA_Machine*& nfa = zeek::detail::nfa;
|
|
int zeek::detail::case_insensitive = 0;
|
|
int& case_insensitive = zeek::detail::case_insensitive;
|
|
|
|
// Scanner/parser entry points defined outside this file. Compile() and
// CompileSet() call them in the order: set input, parse, finish scan.
extern void RE_set_input(const char* str);
extern int RE_parse();
extern void RE_done_with_scan();
|
|
|
|
namespace zeek {
|
|
namespace detail {
|
|
|
|
// Creates an empty matcher: no pattern text, no compiled DFA.
//
// arg_mt: match type, stored in `mt` (AddPat() dispatches on it).
// arg_multiline: stored in `multiline` (consulted by AnyCCL()).
Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, int arg_multiline)
	: equiv_class(NUM_SYM)
	{
	// Nothing has been added or compiled yet.
	pattern_text = nullptr;
	any_ccl = nullptr;
	dfa = nullptr;
	ecs = nullptr;

	accepted = new AcceptingSet();

	multiline = arg_multiline;
	mt = arg_mt;
	}
|
|
|
|
// Releases everything the matcher owns: the accepting set, the pattern text,
// every CCL in ccl_list, and its reference on the DFA.
Specific_RE_Matcher::~Specific_RE_Matcher()
	{
	delete accepted;
	delete [] pattern_text;

	for ( int idx = 0; idx < ccl_list.length(); ++idx )
		delete ccl_list[idx];

	Unref(dfa);
	}
|
|
|
|
// Returns the character class used for '.', building and caching it in
// any_ccl on first use.
CCL* Specific_RE_Matcher::AnyCCL()
	{
	if ( any_ccl )
		return any_ccl;

	any_ccl = new CCL();

	// Unless multiline is set, '\n' is added before the class is negated.
	if ( ! multiline )
		any_ccl->Add('\n');

	any_ccl->Negate();
	EC()->CCL_Use(any_ccl);

	return any_ccl;
	}
|
|
|
|
void Specific_RE_Matcher::ConvertCCLs()
|
|
{
|
|
for ( int i = 0; i < ccl_list.length(); ++i )
|
|
equiv_class.ConvertCCL(ccl_list[i]);
|
|
}
|
|
|
|
void Specific_RE_Matcher::AddPat(const char* new_pat)
|
|
{
|
|
if ( mt == MATCH_EXACTLY )
|
|
AddExactPat(new_pat);
|
|
else
|
|
AddAnywherePat(new_pat);
|
|
}
|
|
|
|
void Specific_RE_Matcher::AddAnywherePat(const char* new_pat)
|
|
{
|
|
AddPat(new_pat, "^?(.|\\n)*(%s)", "(%s)|(^?(.|\\n)*(%s))");
|
|
}
|
|
|
|
void Specific_RE_Matcher::AddExactPat(const char* new_pat)
|
|
{
|
|
AddPat(new_pat, "^?(%s)$?", "(%s)|(^?(%s)$?)");
|
|
}
|
|
|
|
void Specific_RE_Matcher::AddPat(const char* new_pat,
|
|
const char* orig_fmt, const char* app_fmt)
|
|
{
|
|
int n = strlen(new_pat);
|
|
|
|
if ( pattern_text )
|
|
n += strlen(pattern_text) + strlen(app_fmt);
|
|
else
|
|
n += strlen(orig_fmt);
|
|
|
|
char* s = new char[n + 5 /* slop */];
|
|
|
|
if ( pattern_text )
|
|
sprintf(s, app_fmt, pattern_text, new_pat);
|
|
else
|
|
sprintf(s, orig_fmt, new_pat);
|
|
|
|
delete [] pattern_text;
|
|
pattern_text = s;
|
|
}
|
|
|
|
void Specific_RE_Matcher::MakeCaseInsensitive()
|
|
{
|
|
const char fmt[] = "(?i:%s)";
|
|
int n = strlen(pattern_text) + strlen(fmt);
|
|
|
|
char* s = new char[n + 5 /* slop */];
|
|
|
|
snprintf(s, n + 5, fmt, pattern_text);
|
|
|
|
delete [] pattern_text;
|
|
pattern_text = s;
|
|
}
|
|
|
|
bool Specific_RE_Matcher::Compile(bool lazy)
|
|
{
|
|
if ( ! pattern_text )
|
|
return false;
|
|
|
|
rem = this;
|
|
RE_set_input(pattern_text);
|
|
|
|
int parse_status = RE_parse();
|
|
RE_done_with_scan();
|
|
|
|
if ( parse_status )
|
|
{
|
|
reporter->Error("error compiling pattern /%s/", pattern_text);
|
|
Unref(nfa);
|
|
nfa = nullptr;
|
|
return false;
|
|
}
|
|
|
|
EC()->BuildECs();
|
|
ConvertCCLs();
|
|
|
|
dfa = new DFA_Machine(nfa, EC());
|
|
|
|
Unref(nfa);
|
|
nfa = nullptr;
|
|
|
|
ecs = EC()->EquivClasses();
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Specific_RE_Matcher::CompileSet(const string_list& set, const int_list& idx)
|
|
{
|
|
if ( (size_t)set.length() != idx.size() )
|
|
reporter->InternalError("compileset: lengths of sets differ");
|
|
|
|
rem = this;
|
|
|
|
NFA_Machine* set_nfa = nullptr;
|
|
|
|
loop_over_list(set, i)
|
|
{
|
|
RE_set_input(set[i]);
|
|
int parse_status = RE_parse();
|
|
RE_done_with_scan();
|
|
|
|
if ( parse_status )
|
|
{
|
|
reporter->Error("error compiling pattern /%s/", set[i]);
|
|
|
|
if ( set_nfa && set_nfa != nfa )
|
|
Unref(set_nfa);
|
|
else
|
|
Unref(nfa);
|
|
|
|
nfa = nullptr;
|
|
return false;
|
|
}
|
|
|
|
nfa->FinalState()->SetAccept(idx[i]);
|
|
set_nfa = set_nfa ? make_alternate(nfa, set_nfa) : nfa;
|
|
}
|
|
|
|
// Prefix the expression with a "^?".
|
|
nfa = new NFA_Machine(new NFA_State(SYM_BOL, rem->EC()));
|
|
nfa->MakeOptional();
|
|
if ( set_nfa )
|
|
nfa->AppendMachine( set_nfa );
|
|
|
|
EC()->BuildECs();
|
|
ConvertCCLs();
|
|
|
|
dfa = new DFA_Machine(nfa, EC());
|
|
ecs = EC()->EquivClasses();
|
|
|
|
return true;
|
|
}
|
|
|
|
std::string Specific_RE_Matcher::LookupDef(const std::string& def)
|
|
{
|
|
const auto& iter = defs.find(def);
|
|
if ( iter != defs.end() )
|
|
return iter->second;
|
|
|
|
return std::string();
|
|
}
|
|
|
|
bool Specific_RE_Matcher::MatchAll(const char* s)
|
|
{
|
|
return MatchAll((const u_char*)(s), strlen(s));
|
|
}
|
|
|
|
bool Specific_RE_Matcher::MatchAll(const String* s)
|
|
{
|
|
// s->Len() does not include '\0'.
|
|
return MatchAll(s->Bytes(), s->Len());
|
|
}
|
|
|
|
int Specific_RE_Matcher::Match(const char* s)
|
|
{
|
|
return Match((const u_char*)(s), strlen(s));
|
|
}
|
|
|
|
int Specific_RE_Matcher::Match(const String* s)
|
|
{
|
|
return Match(s->Bytes(), s->Len());
|
|
}
|
|
|
|
int Specific_RE_Matcher::LongestMatch(const char* s)
|
|
{
|
|
return LongestMatch((const u_char*)(s), strlen(s));
|
|
}
|
|
|
|
int Specific_RE_Matcher::LongestMatch(const String* s)
|
|
{
|
|
return LongestMatch(s->Bytes(), s->Len());
|
|
}
|
|
|
|
bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n)
|
|
{
|
|
if ( ! dfa )
|
|
// An empty pattern matches "all" iff what's being
|
|
// matched is empty.
|
|
return n == 0;
|
|
|
|
DFA_State* d = dfa->StartState();
|
|
d = d->Xtion(ecs[SYM_BOL], dfa);
|
|
|
|
while ( d )
|
|
{
|
|
if ( --n < 0 )
|
|
break;
|
|
|
|
int ec = ecs[*(bv++)];
|
|
d = d->Xtion(ec, dfa);
|
|
}
|
|
|
|
if ( d )
|
|
d = d->Xtion(ecs[SYM_EOL], dfa);
|
|
|
|
return d && d->Accept() != nullptr;
|
|
}
|
|
|
|
|
|
int Specific_RE_Matcher::Match(const u_char* bv, int n)
|
|
{
|
|
if ( ! dfa )
|
|
// An empty pattern matches anything.
|
|
return 1;
|
|
|
|
DFA_State* d = dfa->StartState();
|
|
|
|
d = d->Xtion(ecs[SYM_BOL], dfa);
|
|
if ( ! d ) return 0;
|
|
|
|
for ( int i = 0; i < n; ++i )
|
|
{
|
|
int ec = ecs[bv[i]];
|
|
d = d->Xtion(ec, dfa);
|
|
if ( ! d )
|
|
break;
|
|
|
|
if ( d->Accept() )
|
|
return i + 1;
|
|
}
|
|
|
|
if ( d )
|
|
{
|
|
d = d->Xtion(ecs[SYM_EOL], dfa);
|
|
if ( d && d->Accept() )
|
|
return n > 0 ? n : 1; // we can't return 0 here for match...
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
void Specific_RE_Matcher::Dump(FILE* f)
|
|
{
|
|
dfa->Dump(f);
|
|
}
|
|
|
|
inline void RE_Match_State::AddMatches(const AcceptingSet& as,
|
|
MatchPos position)
|
|
{
|
|
typedef std::pair<AcceptIdx, MatchPos> am_idx;
|
|
|
|
for ( AcceptingSet::const_iterator it = as.begin(); it != as.end(); ++it )
|
|
accepted_matches.insert(am_idx(*it, position));
|
|
}
|
|
|
|
bool RE_Match_State::Match(const u_char* bv, int n,
|
|
bool bol, bool eol, bool clear)
|
|
{
|
|
if ( current_pos == -1 )
|
|
{
|
|
// First call to Match().
|
|
if ( ! dfa )
|
|
return false;
|
|
|
|
// Initialize state and copy the accepting states of the start
|
|
// state into the acceptance set.
|
|
current_state = dfa->StartState();
|
|
|
|
const AcceptingSet* ac = current_state->Accept();
|
|
|
|
if ( ac )
|
|
AddMatches(*ac, 0);
|
|
}
|
|
|
|
else if ( clear )
|
|
current_state = dfa->StartState();
|
|
|
|
if ( ! current_state )
|
|
return false;
|
|
|
|
current_pos = 0;
|
|
|
|
size_t old_matches = accepted_matches.size();
|
|
|
|
int ec;
|
|
int m = bol ? n + 1 : n;
|
|
int e = eol ? -1 : 0;
|
|
|
|
while ( --m >= e )
|
|
{
|
|
if ( m == n )
|
|
ec = ecs[SYM_BOL];
|
|
else if ( m == -1 )
|
|
ec = ecs[SYM_EOL];
|
|
else
|
|
ec = ecs[*(bv++)];
|
|
|
|
DFA_State* next_state = current_state->Xtion(ec,dfa);
|
|
|
|
if ( ! next_state )
|
|
{
|
|
current_state = nullptr;
|
|
break;
|
|
}
|
|
|
|
const AcceptingSet* ac = next_state->Accept();
|
|
|
|
if ( ac )
|
|
AddMatches(*ac, current_pos);
|
|
|
|
++current_pos;
|
|
|
|
current_state = next_state;
|
|
}
|
|
|
|
return accepted_matches.size() != old_matches;
|
|
}
|
|
|
|
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n)
|
|
{
|
|
if ( ! dfa )
|
|
// An empty pattern matches anything.
|
|
return 0;
|
|
|
|
// Use -1 to indicate no match.
|
|
int last_accept = -1;
|
|
DFA_State* d = dfa->StartState();
|
|
|
|
d = d->Xtion(ecs[SYM_BOL], dfa);
|
|
if ( ! d )
|
|
return -1;
|
|
|
|
if ( d->Accept() )
|
|
last_accept = 0;
|
|
|
|
for ( int i = 0; i < n; ++i )
|
|
{
|
|
int ec = ecs[bv[i]];
|
|
d = d->Xtion(ec, dfa);
|
|
|
|
if ( ! d )
|
|
break;
|
|
|
|
if ( d->Accept() )
|
|
last_accept = i + 1;
|
|
}
|
|
|
|
if ( d )
|
|
{
|
|
d = d->Xtion(ecs[SYM_EOL], dfa);
|
|
if ( d && d->Accept() )
|
|
return n;
|
|
}
|
|
|
|
return last_accept;
|
|
}
|
|
|
|
unsigned int Specific_RE_Matcher::MemoryAllocation() const
|
|
{
|
|
unsigned int size = 0;
|
|
|
|
for ( int i = 0; i < ccl_list.length(); ++i )
|
|
size += ccl_list[i]->MemoryAllocation();
|
|
|
|
size += util::pad_size(sizeof(CCL*) * ccl_dict.size());
|
|
for ( const auto& entry : ccl_dict )
|
|
{
|
|
size += padded_sizeof(std::string) + util::pad_size(sizeof(std::string::value_type) * entry.first.size());
|
|
size += entry.second->MemoryAllocation();
|
|
}
|
|
|
|
for ( const auto& entry : defs )
|
|
{
|
|
size += padded_sizeof(std::string) + util::pad_size(sizeof(std::string::value_type) * entry.first.size());
|
|
size += padded_sizeof(std::string) + util::pad_size(sizeof(std::string::value_type) * entry.second.size());
|
|
}
|
|
|
|
return size + padded_sizeof(*this)
|
|
+ (pattern_text ? util::pad_size(strlen(pattern_text) + 1) : 0)
|
|
+ ccl_list.MemoryAllocation() - padded_sizeof(ccl_list)
|
|
+ equiv_class.Size() - padded_sizeof(EquivClass)
|
|
+ (dfa ? dfa->MemoryAllocation() : 0) // this is ref counted; consider the bytes here?
|
|
+ padded_sizeof(*any_ccl)
|
|
+ padded_sizeof(*accepted) // NOLINT(bugprone-sizeof-container)
|
|
+ accepted->size() * padded_sizeof(AcceptingSet::key_type);
|
|
}
|
|
|
|
// Builds and compiles a new matcher for "(<re1>)<merge_op>(<re2>)".
//
// re1, re2: matchers whose PatternText() supplies the two operands.
// merge_op: text placed between the two parenthesized operands.
//
// Returns a newly allocated matcher on which Compile() has been called; the
// result of Compile() is not checked.
static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2,
                                 const char* merge_op)
	{
	const char* text1 = re1->PatternText();
	const char* text2 = re2->PatternText();

	// Assemble the merged text in a std::string, so there is no buffer
	// size to compute by hand and nothing to free manually.
	std::string merge_text;
	merge_text.reserve(strlen(text1) + strlen(text2) + strlen(merge_op) + 4);

	merge_text += '(';
	merge_text += text1;
	merge_text += ')';
	merge_text += merge_op;
	merge_text += '(';
	merge_text += text2;
	merge_text += ')';

	RE_Matcher* merge = new RE_Matcher(merge_text.c_str());

	merge->Compile();

	return merge;
	}
|
|
|
|
// Returns a new compiled matcher for "(re1)(re2)".
RE_Matcher* RE_Matcher_conjunction(const RE_Matcher* re1, const RE_Matcher* re2)
	{
	// Nothing goes between the two operands.
	const char* const no_operator = "";

	return matcher_merge(re1, re2, no_operator);
	}
|
|
|
|
// Returns a new compiled matcher for "(re1)|(re2)".
RE_Matcher* RE_Matcher_disjunction(const RE_Matcher* re1, const RE_Matcher* re2)
	{
	const char* const alternation = "|";

	return matcher_merge(re1, re2, alternation);
	}
|
|
|
|
} // namespace detail
|
|
|
|
// Creates a matcher with no patterns: one empty "anywhere" matcher and one
// empty "exact" matcher.
RE_Matcher::RE_Matcher()
	{
	using detail::Specific_RE_Matcher;

	re_anywhere = new Specific_RE_Matcher(detail::MATCH_ANYWHERE);
	re_exact = new Specific_RE_Matcher(detail::MATCH_EXACTLY);
	}
|
|
|
|
// Creates both underlying matchers and adds `pat` to each of them.
RE_Matcher::RE_Matcher(const char* pat)
	{
	using detail::Specific_RE_Matcher;

	re_anywhere = new Specific_RE_Matcher(detail::MATCH_ANYWHERE);
	re_exact = new Specific_RE_Matcher(detail::MATCH_EXACTLY);

	AddPat(pat);
	}
|
|
|
|
// Creates both underlying matchers and hands each its pattern text through
// SetPat(): anywhere_pat to the "anywhere" matcher, exact_pat to the
// "exact" one.
RE_Matcher::RE_Matcher(const char* exact_pat, const char* anywhere_pat)
	{
	using detail::Specific_RE_Matcher;

	re_anywhere = new Specific_RE_Matcher(detail::MATCH_ANYWHERE);
	re_exact = new Specific_RE_Matcher(detail::MATCH_EXACTLY);

	re_anywhere->SetPat(anywhere_pat);
	re_exact->SetPat(exact_pat);
	}
|
|
|
|
// Destroys the two underlying matchers, "anywhere" first.
RE_Matcher::~RE_Matcher()
	{
	for ( auto* matcher : {re_anywhere, re_exact} )
		delete matcher;
	}
|
|
|
|
void RE_Matcher::AddPat(const char* new_pat)
|
|
{
|
|
re_anywhere->AddPat(new_pat);
|
|
re_exact->AddPat(new_pat);
|
|
}
|
|
|
|
void RE_Matcher::MakeCaseInsensitive()
|
|
{
|
|
re_anywhere->MakeCaseInsensitive();
|
|
re_exact->MakeCaseInsensitive();
|
|
}
|
|
|
|
bool RE_Matcher::Compile(bool lazy)
|
|
{
|
|
return re_anywhere->Compile(lazy) && re_exact->Compile(lazy);
|
|
}
|
|
|
|
} // namespace zeek
|