// See the file "COPYING" in the main distribution directory for copyright. #include "zeek-config.h" #include "zeek/RE.h" #include #include #include "zeek/DFA.h" #include "zeek/CCL.h" #include "zeek/EquivClass.h" #include "zeek/Reporter.h" #include "zeek/ZeekString.h" zeek::detail::CCL* zeek::detail::curr_ccl = nullptr; zeek::detail::CCL*& curr_ccl = zeek::detail::curr_ccl; zeek::detail::Specific_RE_Matcher* zeek::detail::rem = nullptr; zeek::detail::Specific_RE_Matcher*& rem = zeek::detail::rem; zeek::detail::NFA_Machine* zeek::detail::nfa = nullptr; zeek::detail::NFA_Machine*& nfa = zeek::detail::nfa; int zeek::detail::case_insensitive = 0; int& case_insensitive = zeek::detail::case_insensitive; extern int RE_parse(void); extern void RE_set_input(const char* str); extern void RE_done_with_scan(); namespace zeek { namespace detail { Specific_RE_Matcher::Specific_RE_Matcher(match_type arg_mt, int arg_multiline) : equiv_class(NUM_SYM) { mt = arg_mt; multiline = arg_multiline; any_ccl = nullptr; pattern_text = nullptr; dfa = nullptr; ecs = nullptr; accepted = new AcceptingSet(); } Specific_RE_Matcher::~Specific_RE_Matcher() { for ( int i = 0; i < ccl_list.length(); ++i ) delete ccl_list[i]; Unref(dfa); delete [] pattern_text; delete accepted; } CCL* Specific_RE_Matcher::AnyCCL() { if ( ! any_ccl ) { // Create the '.' character class. any_ccl = new CCL(); if ( ! multiline ) any_ccl->Add('\n'); any_ccl->Negate(); EC()->CCL_Use(any_ccl); } return any_ccl; } void Specific_RE_Matcher::ConvertCCLs() { for ( int i = 0; i < ccl_list.length(); ++i ) equiv_class.ConvertCCL(ccl_list[i]); } void Specific_RE_Matcher::AddPat(const char* new_pat) { if ( mt == MATCH_EXACTLY ) AddExactPat(new_pat); else AddAnywherePat(new_pat); } void Specific_RE_Matcher::AddAnywherePat(const char* new_pat) { AddPat(new_pat, "^?(.|\\n)*(%s)", "(%s)|(^?(.|\\n)*(%s))"); } void Specific_RE_Matcher::AddExactPat(const char* new_pat) { AddPat(new_pat, "^?(%s)$?", "(%s)|(^?(%s)$?)"); } void Specific_RE_Matcher::AddPat(const char* new_pat, const char* orig_fmt, const char* app_fmt) { int n = strlen(new_pat); if ( pattern_text ) n += strlen(pattern_text) + strlen(app_fmt); else n += strlen(orig_fmt); char* s = new char[n + 5 /* slop */]; if ( pattern_text ) sprintf(s, app_fmt, pattern_text, new_pat); else sprintf(s, orig_fmt, new_pat); delete [] pattern_text; pattern_text = s; } void Specific_RE_Matcher::MakeCaseInsensitive() { const char fmt[] = "(?i:%s)"; int n = strlen(pattern_text) + strlen(fmt); char* s = new char[n + 5 /* slop */]; snprintf(s, n + 5, fmt, pattern_text); delete [] pattern_text; pattern_text = s; } bool Specific_RE_Matcher::Compile(bool lazy) { if ( ! pattern_text ) return false; rem = this; RE_set_input(pattern_text); int parse_status = RE_parse(); RE_done_with_scan(); if ( parse_status ) { reporter->Error("error compiling pattern /%s/", pattern_text); Unref(nfa); nfa = nullptr; return false; } EC()->BuildECs(); ConvertCCLs(); dfa = new DFA_Machine(nfa, EC()); Unref(nfa); nfa = nullptr; ecs = EC()->EquivClasses(); return true; } bool Specific_RE_Matcher::CompileSet(const string_list& set, const int_list& idx) { if ( (size_t)set.length() != idx.size() ) reporter->InternalError("compileset: lengths of sets differ"); rem = this; NFA_Machine* set_nfa = nullptr; loop_over_list(set, i) { RE_set_input(set[i]); int parse_status = RE_parse(); RE_done_with_scan(); if ( parse_status ) { reporter->Error("error compiling pattern /%s/", set[i]); if ( set_nfa && set_nfa != nfa ) Unref(set_nfa); else Unref(nfa); nfa = nullptr; return false; } nfa->FinalState()->SetAccept(idx[i]); set_nfa = set_nfa ? make_alternate(nfa, set_nfa) : nfa; } // Prefix the expression with a "^?". nfa = new NFA_Machine(new NFA_State(SYM_BOL, rem->EC())); nfa->MakeOptional(); if ( set_nfa ) nfa->AppendMachine( set_nfa ); EC()->BuildECs(); ConvertCCLs(); dfa = new DFA_Machine(nfa, EC()); ecs = EC()->EquivClasses(); return true; } std::string Specific_RE_Matcher::LookupDef(const std::string& def) { const auto& iter = defs.find(def); if ( iter != defs.end() ) return iter->second; return std::string(); } bool Specific_RE_Matcher::MatchAll(const char* s) { return MatchAll((const u_char*)(s), strlen(s)); } bool Specific_RE_Matcher::MatchAll(const String* s) { // s->Len() does not include '\0'. return MatchAll(s->Bytes(), s->Len()); } int Specific_RE_Matcher::Match(const char* s) { return Match((const u_char*)(s), strlen(s)); } int Specific_RE_Matcher::Match(const String* s) { return Match(s->Bytes(), s->Len()); } int Specific_RE_Matcher::LongestMatch(const char* s) { return LongestMatch((const u_char*)(s), strlen(s)); } int Specific_RE_Matcher::LongestMatch(const String* s) { return LongestMatch(s->Bytes(), s->Len()); } bool Specific_RE_Matcher::MatchAll(const u_char* bv, int n) { if ( ! dfa ) // An empty pattern matches "all" iff what's being // matched is empty. return n == 0; DFA_State* d = dfa->StartState(); d = d->Xtion(ecs[SYM_BOL], dfa); while ( d ) { if ( --n < 0 ) break; int ec = ecs[*(bv++)]; d = d->Xtion(ec, dfa); } if ( d ) d = d->Xtion(ecs[SYM_EOL], dfa); return d && d->Accept() != nullptr; } int Specific_RE_Matcher::Match(const u_char* bv, int n) { if ( ! dfa ) // An empty pattern matches anything. return 1; DFA_State* d = dfa->StartState(); d = d->Xtion(ecs[SYM_BOL], dfa); if ( ! d ) return 0; for ( int i = 0; i < n; ++i ) { int ec = ecs[bv[i]]; d = d->Xtion(ec, dfa); if ( ! d ) break; if ( d->Accept() ) return i + 1; } if ( d ) { d = d->Xtion(ecs[SYM_EOL], dfa); if ( d && d->Accept() ) return n > 0 ? n : 1; // we can't return 0 here for match... } return 0; } void Specific_RE_Matcher::Dump(FILE* f) { dfa->Dump(f); } inline void RE_Match_State::AddMatches(const AcceptingSet& as, MatchPos position) { typedef std::pair am_idx; for ( AcceptingSet::const_iterator it = as.begin(); it != as.end(); ++it ) accepted_matches.insert(am_idx(*it, position)); } bool RE_Match_State::Match(const u_char* bv, int n, bool bol, bool eol, bool clear) { if ( current_pos == -1 ) { // First call to Match(). if ( ! dfa ) return false; // Initialize state and copy the accepting states of the start // state into the acceptance set. current_state = dfa->StartState(); const AcceptingSet* ac = current_state->Accept(); if ( ac ) AddMatches(*ac, 0); } else if ( clear ) current_state = dfa->StartState(); if ( ! current_state ) return false; current_pos = 0; size_t old_matches = accepted_matches.size(); int ec; int m = bol ? n + 1 : n; int e = eol ? -1 : 0; while ( --m >= e ) { if ( m == n ) ec = ecs[SYM_BOL]; else if ( m == -1 ) ec = ecs[SYM_EOL]; else ec = ecs[*(bv++)]; DFA_State* next_state = current_state->Xtion(ec,dfa); if ( ! next_state ) { current_state = nullptr; break; } const AcceptingSet* ac = next_state->Accept(); if ( ac ) AddMatches(*ac, current_pos); ++current_pos; current_state = next_state; } return accepted_matches.size() != old_matches; } int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n) { if ( ! dfa ) // An empty pattern matches anything. return 0; // Use -1 to indicate no match. int last_accept = -1; DFA_State* d = dfa->StartState(); d = d->Xtion(ecs[SYM_BOL], dfa); if ( ! d ) return -1; if ( d->Accept() ) last_accept = 0; for ( int i = 0; i < n; ++i ) { int ec = ecs[bv[i]]; d = d->Xtion(ec, dfa); if ( ! d ) break; if ( d->Accept() ) last_accept = i + 1; } if ( d ) { d = d->Xtion(ecs[SYM_EOL], dfa); if ( d && d->Accept() ) return n; } return last_accept; } unsigned int Specific_RE_Matcher::MemoryAllocation() const { unsigned int size = 0; for ( int i = 0; i < ccl_list.length(); ++i ) size += ccl_list[i]->MemoryAllocation(); size += util::pad_size(sizeof(CCL*) * ccl_dict.size()); for ( const auto& entry : ccl_dict ) { size += padded_sizeof(std::string) + util::pad_size(sizeof(std::string::value_type) * entry.first.size()); size += entry.second->MemoryAllocation(); } for ( const auto& entry : defs ) { size += padded_sizeof(std::string) + util::pad_size(sizeof(std::string::value_type) * entry.first.size()); size += padded_sizeof(std::string) + util::pad_size(sizeof(std::string::value_type) * entry.second.size()); } return size + padded_sizeof(*this) + (pattern_text ? util::pad_size(strlen(pattern_text) + 1) : 0) + ccl_list.MemoryAllocation() - padded_sizeof(ccl_list) + equiv_class.Size() - padded_sizeof(EquivClass) + (dfa ? dfa->MemoryAllocation() : 0) // this is ref counted; consider the bytes here? + padded_sizeof(*any_ccl) + padded_sizeof(*accepted) // NOLINT(bugprone-sizeof-container) + accepted->size() * padded_sizeof(AcceptingSet::key_type); } static RE_Matcher* matcher_merge(const RE_Matcher* re1, const RE_Matcher* re2, const char* merge_op) { const char* text1 = re1->PatternText(); const char* text2 = re2->PatternText(); int n = strlen(text1) + strlen(text2) + strlen(merge_op) + 32 /* slop */ ; char* merge_text = new char[n]; snprintf(merge_text, n, "(%s)%s(%s)", text1, merge_op, text2); RE_Matcher* merge = new RE_Matcher(merge_text); delete [] merge_text; merge->Compile(); return merge; } RE_Matcher* RE_Matcher_conjunction(const RE_Matcher* re1, const RE_Matcher* re2) { return matcher_merge(re1, re2, ""); } RE_Matcher* RE_Matcher_disjunction(const RE_Matcher* re1, const RE_Matcher* re2) { return matcher_merge(re1, re2, "|"); } } // namespace detail RE_Matcher::RE_Matcher() { re_anywhere = new detail::Specific_RE_Matcher(detail::MATCH_ANYWHERE); re_exact = new detail::Specific_RE_Matcher(detail::MATCH_EXACTLY); } RE_Matcher::RE_Matcher(const char* pat) { re_anywhere = new detail::Specific_RE_Matcher(detail::MATCH_ANYWHERE); re_exact = new detail::Specific_RE_Matcher(detail::MATCH_EXACTLY); AddPat(pat); } RE_Matcher::RE_Matcher(const char* exact_pat, const char* anywhere_pat) { re_anywhere = new detail::Specific_RE_Matcher(detail::MATCH_ANYWHERE); re_anywhere->SetPat(anywhere_pat); re_exact = new detail::Specific_RE_Matcher(detail::MATCH_EXACTLY); re_exact->SetPat(exact_pat); } RE_Matcher::~RE_Matcher() { delete re_anywhere; delete re_exact; } void RE_Matcher::AddPat(const char* new_pat) { re_anywhere->AddPat(new_pat); re_exact->AddPat(new_pat); } void RE_Matcher::MakeCaseInsensitive() { re_anywhere->MakeCaseInsensitive(); re_exact->MakeCaseInsensitive(); } bool RE_Matcher::Compile(bool lazy) { return re_anywhere->Compile(lazy) && re_exact->Compile(lazy); } } // namespace zeek