mirror of
https://github.com/zeek/zeek.git
synced 2025-10-09 18:18:19 +00:00
Refactor regex/signature AcceptingSet data structure and usages.
Several parts of that code would do membership checks and that's going to be more efficient with a set instead of a list data structure.
This commit is contained in:
parent
8126f06ffb
commit
171c6ce86b
5 changed files with 99 additions and 124 deletions
28
src/DFA.cc
28
src/DFA.cc
|
@ -211,9 +211,10 @@ void DFA_State::Dump(FILE* f, DFA_Machine* m)
|
|||
|
||||
if ( accept )
|
||||
{
|
||||
for ( int i = 0; i < accept->length(); ++i )
|
||||
fprintf(f, "%s accept #%d",
|
||||
i > 0 ? "," : "", int((*accept)[i]));
|
||||
AcceptingSet::const_iterator it;
|
||||
|
||||
for ( it = accept->begin(); it != accept->end(); ++it )
|
||||
fprintf(f, "%s accept #%d", it == accept->begin() ? "" : ",", *it);
|
||||
}
|
||||
|
||||
fprintf(f, "\n");
|
||||
|
@ -285,7 +286,7 @@ unsigned int DFA_State::Size()
|
|||
{
|
||||
return sizeof(*this)
|
||||
+ pad_size(sizeof(DFA_State*) * num_sym)
|
||||
+ (accept ? pad_size(sizeof(int) * accept->length()) : 0)
|
||||
+ (accept ? pad_size(sizeof(int) * accept->size()) : 0)
|
||||
+ (nfa_states ? pad_size(sizeof(NFA_State*) * nfa_states->length()) : 0)
|
||||
+ (meta_ec ? meta_ec->Size() : 0)
|
||||
+ (centry ? padded_sizeof(CacheEntry) : 0);
|
||||
|
@ -470,33 +471,20 @@ int DFA_Machine::StateSetToDFA_State(NFA_state_list* state_set,
|
|||
return 0;
|
||||
|
||||
AcceptingSet* accept = new AcceptingSet;
|
||||
|
||||
for ( int i = 0; i < state_set->length(); ++i )
|
||||
{
|
||||
int acc = (*state_set)[i]->Accept();
|
||||
|
||||
if ( acc != NO_ACCEPT )
|
||||
{
|
||||
int j;
|
||||
for ( j = 0; j < accept->length(); ++j )
|
||||
if ( (*accept)[j] == acc )
|
||||
break;
|
||||
|
||||
if ( j >= accept->length() )
|
||||
// It's not already present.
|
||||
accept->append(acc);
|
||||
}
|
||||
accept->insert(acc);
|
||||
}
|
||||
|
||||
if ( accept->length() == 0 )
|
||||
if ( accept->empty() )
|
||||
{
|
||||
delete accept;
|
||||
accept = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
accept->sort(int_list_cmp);
|
||||
accept->resize(0);
|
||||
}
|
||||
|
||||
DFA_State* ds = new DFA_State(state_count++, ec, state_set, accept);
|
||||
d = dfa_state_cache->Insert(ds, hash);
|
||||
|
|
40
src/RE.cc
40
src/RE.cc
|
@ -3,6 +3,7 @@
|
|||
#include "config.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <utility>
|
||||
|
||||
#include "RE.h"
|
||||
#include "DFA.h"
|
||||
|
@ -266,6 +267,15 @@ void Specific_RE_Matcher::Dump(FILE* f)
|
|||
dfa->Dump(f);
|
||||
}
|
||||
|
||||
inline void RE_Match_State::AddMatches(const AcceptingSet& as,
|
||||
MatchPos position)
|
||||
{
|
||||
typedef std::pair<AcceptIdx, MatchPos> am_idx;
|
||||
|
||||
for ( AcceptingSet::const_iterator it = as.begin(); it != as.end(); ++it )
|
||||
accepted_matches.insert(am_idx(*it, position));
|
||||
}
|
||||
|
||||
bool RE_Match_State::Match(const u_char* bv, int n,
|
||||
bool bol, bool eol, bool clear)
|
||||
{
|
||||
|
@ -283,14 +293,9 @@ bool RE_Match_State::Match(const u_char* bv, int n,
|
|||
current_state = dfa->StartState();
|
||||
|
||||
const AcceptingSet* ac = current_state->Accept();
|
||||
|
||||
if ( ac )
|
||||
{
|
||||
loop_over_list(*ac, i)
|
||||
{
|
||||
accepted.append((*ac)[i]);
|
||||
match_pos.append(0);
|
||||
}
|
||||
}
|
||||
AddMatches(*ac, 0);
|
||||
}
|
||||
|
||||
else if ( clear )
|
||||
|
@ -301,7 +306,7 @@ bool RE_Match_State::Match(const u_char* bv, int n,
|
|||
|
||||
current_pos = 0;
|
||||
|
||||
int old_matches = accepted.length();
|
||||
size_t old_matches = accepted_matches.size();
|
||||
|
||||
int ec;
|
||||
int m = bol ? n + 1 : n;
|
||||
|
@ -324,25 +329,17 @@ bool RE_Match_State::Match(const u_char* bv, int n,
|
|||
break;
|
||||
}
|
||||
|
||||
if ( next_state->Accept() )
|
||||
{
|
||||
const AcceptingSet* ac = next_state->Accept();
|
||||
loop_over_list(*ac, i)
|
||||
{
|
||||
if ( ! accepted.is_member((*ac)[i]) )
|
||||
{
|
||||
accepted.append((*ac)[i]);
|
||||
match_pos.append(current_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( ac )
|
||||
AddMatches(*ac, current_pos);
|
||||
|
||||
++current_pos;
|
||||
|
||||
current_state = next_state;
|
||||
}
|
||||
|
||||
return accepted.length() != old_matches;
|
||||
return accepted_matches.size() != old_matches;
|
||||
}
|
||||
|
||||
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n)
|
||||
|
@ -399,7 +396,8 @@ unsigned int Specific_RE_Matcher::MemoryAllocation() const
|
|||
+ equiv_class.Size() - padded_sizeof(EquivClass)
|
||||
+ (dfa ? dfa->MemoryAllocation() : 0) // this is ref counted; consider the bytes here?
|
||||
+ padded_sizeof(*any_ccl)
|
||||
+ accepted->MemoryAllocation();
|
||||
+ padded_sizeof(*accepted)
|
||||
+ accepted->size() * padded_sizeof(AcceptingSet::key_type);
|
||||
}
|
||||
|
||||
RE_Matcher::RE_Matcher()
|
||||
|
|
20
src/RE.h
20
src/RE.h
|
@ -9,6 +9,9 @@
|
|||
#include "CCL.h"
|
||||
#include "EquivClass.h"
|
||||
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
#include <ctype.h>
|
||||
typedef int (*cce_func)(int);
|
||||
|
||||
|
@ -33,7 +36,10 @@ extern int re_lex(void);
|
|||
extern int clower(int);
|
||||
extern void synerr(const char str[]);
|
||||
|
||||
typedef int_list AcceptingSet;
|
||||
typedef int AcceptIdx;
|
||||
typedef std::set<AcceptIdx> AcceptingSet;
|
||||
typedef uint64 MatchPos;
|
||||
typedef std::map<AcceptIdx, MatchPos> AcceptingMatchSet;
|
||||
typedef name_list string_list;
|
||||
|
||||
typedef enum { MATCH_ANYWHERE, MATCH_EXACTLY, } match_type;
|
||||
|
@ -135,8 +141,8 @@ public:
|
|||
current_state = 0;
|
||||
}
|
||||
|
||||
const AcceptingSet* Accepted() const { return &accepted; }
|
||||
const int_list* MatchPositions() const { return &match_pos; }
|
||||
const AcceptingMatchSet& AcceptedMatches() const
|
||||
{ return accepted_matches; }
|
||||
|
||||
// Returns the number of bytes fed into the matcher so far
|
||||
int Length() { return current_pos; }
|
||||
|
@ -149,16 +155,16 @@ public:
|
|||
{
|
||||
current_pos = -1;
|
||||
current_state = 0;
|
||||
accepted.clear();
|
||||
match_pos.clear();
|
||||
accepted_matches.clear();
|
||||
}
|
||||
|
||||
void AddMatches(const AcceptingSet& as, MatchPos position);
|
||||
|
||||
protected:
|
||||
DFA_Machine* dfa;
|
||||
int* ecs;
|
||||
|
||||
AcceptingSet accepted;
|
||||
int_list match_pos;
|
||||
AcceptingMatchSet accepted_matches;
|
||||
DFA_State* current_state;
|
||||
int current_pos;
|
||||
};
|
||||
|
|
|
@ -594,6 +594,29 @@ RuleFileMagicState* RuleMatcher::InitFileMagic() const
|
|||
return state;
|
||||
}
|
||||
|
||||
bool RuleMatcher::AllRulePatternsMatched(const Rule* r, MatchPos matchpos,
|
||||
const AcceptingMatchSet& ams)
|
||||
{
|
||||
DBG_LOG(DBG_RULES, "Checking rule: %s", r->id);
|
||||
|
||||
// Check whether all patterns of the rule have matched.
|
||||
loop_over_list(r->patterns, j)
|
||||
{
|
||||
if ( ams.find(r->patterns[j]->id) == ams.end() )
|
||||
return false;
|
||||
|
||||
// See if depth is satisfied.
|
||||
if ( matchpos > r->patterns[j]->offset + r->patterns[j]->depth )
|
||||
return false;
|
||||
|
||||
// FIXME: How to check for offset ??? ###
|
||||
}
|
||||
|
||||
DBG_LOG(DBG_RULES, "All patterns of rule satisfied");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
RuleMatcher::MIME_Matches* RuleMatcher::Match(RuleFileMagicState* state,
|
||||
const u_char* data, uint64 len,
|
||||
MIME_Matches* rval) const
|
||||
|
@ -636,56 +659,39 @@ RuleMatcher::MIME_Matches* RuleMatcher::Match(RuleFileMagicState* state,
|
|||
|
||||
DBG_LOG(DBG_RULES, "New pattern match found");
|
||||
|
||||
AcceptingSet accepted;
|
||||
int_list matchpos;
|
||||
AcceptingMatchSet accepted_matches;
|
||||
|
||||
loop_over_list(state->matchers, y)
|
||||
{
|
||||
RuleFileMagicState::Matcher* m = state->matchers[y];
|
||||
const AcceptingSet* ac = m->state->Accepted();
|
||||
|
||||
loop_over_list(*ac, k)
|
||||
{
|
||||
if ( ! accepted.is_member((*ac)[k]) )
|
||||
{
|
||||
accepted.append((*ac)[k]);
|
||||
matchpos.append((*m->state->MatchPositions())[k]);
|
||||
}
|
||||
}
|
||||
const AcceptingMatchSet& ams = m->state->AcceptedMatches();
|
||||
accepted_matches.insert(ams.begin(), ams.end());
|
||||
}
|
||||
|
||||
// Find rules for which patterns have matched.
|
||||
rule_list matched;
|
||||
set<Rule*> rule_matches;
|
||||
|
||||
loop_over_list(accepted, i)
|
||||
for ( AcceptingMatchSet::const_iterator it = accepted_matches.begin();
|
||||
it != accepted_matches.end(); ++it )
|
||||
{
|
||||
Rule* r = Rule::rule_table[accepted[i] - 1];
|
||||
AcceptIdx aidx = it->first;
|
||||
MatchPos mpos = it->second;
|
||||
|
||||
DBG_LOG(DBG_RULES, "Checking rule: %v", r->id);
|
||||
Rule* r = Rule::rule_table[aidx - 1];
|
||||
|
||||
loop_over_list(r->patterns, j)
|
||||
{
|
||||
if ( ! accepted.is_member(r->patterns[j]->id) )
|
||||
continue;
|
||||
|
||||
if ( (unsigned int) matchpos[i] >
|
||||
r->patterns[j]->offset + r->patterns[j]->depth )
|
||||
continue;
|
||||
|
||||
DBG_LOG(DBG_RULES, "All patterns of rule satisfied");
|
||||
if ( AllRulePatternsMatched(r, mpos, accepted_matches) )
|
||||
rule_matches.insert(r);
|
||||
}
|
||||
|
||||
if ( ! matched.is_member(r) )
|
||||
matched.append(r);
|
||||
}
|
||||
|
||||
loop_over_list(matched, j)
|
||||
for ( set<Rule*>::const_iterator it = rule_matches.begin();
|
||||
it != rule_matches.end(); ++it )
|
||||
{
|
||||
Rule* r = matched[j];
|
||||
Rule* r = *it;
|
||||
|
||||
loop_over_list(r->actions, rai)
|
||||
{
|
||||
const RuleActionMIME* ram = dynamic_cast<const RuleActionMIME*>(r->actions[rai]);
|
||||
const RuleActionMIME* ram =
|
||||
dynamic_cast<const RuleActionMIME*>(r->actions[rai]);
|
||||
|
||||
if ( ! ram )
|
||||
continue;
|
||||
|
@ -876,66 +882,40 @@ void RuleMatcher::Match(RuleEndpointState* state, Rule::PatternType type,
|
|||
|
||||
DBG_LOG(DBG_RULES, "New pattern match found");
|
||||
|
||||
// Build a joined AcceptingSet.
|
||||
AcceptingSet accepted;
|
||||
int_list matchpos;
|
||||
AcceptingMatchSet accepted_matches;
|
||||
|
||||
loop_over_list(state->matchers, y)
|
||||
loop_over_list(state->matchers, y )
|
||||
{
|
||||
RuleEndpointState::Matcher* m = state->matchers[y];
|
||||
const AcceptingSet* ac = m->state->Accepted();
|
||||
|
||||
loop_over_list(*ac, k)
|
||||
{
|
||||
if ( ! accepted.is_member((*ac)[k]) )
|
||||
{
|
||||
accepted.append((*ac)[k]);
|
||||
matchpos.append((*m->state->MatchPositions())[k]);
|
||||
}
|
||||
}
|
||||
const AcceptingMatchSet& ams = m->state->AcceptedMatches();
|
||||
accepted_matches.insert(ams.begin(), ams.end());
|
||||
}
|
||||
|
||||
// Determine the rules for which all patterns have matched.
|
||||
// This code should be fast enough as long as there are only very few
|
||||
// matched patterns per connection (which is a plausible assumption).
|
||||
|
||||
rule_list matched;
|
||||
// Find rules for which patterns have matched.
|
||||
set<Rule*> rule_matches;
|
||||
|
||||
loop_over_list(accepted, i)
|
||||
for ( AcceptingMatchSet::const_iterator it = accepted_matches.begin();
|
||||
it != accepted_matches.end(); ++it )
|
||||
{
|
||||
Rule* r = Rule::rule_table[accepted[i] - 1];
|
||||
AcceptIdx aidx = it->first;
|
||||
MatchPos mpos = it->second;
|
||||
|
||||
DBG_LOG(DBG_RULES, "Checking rule: %s", r->id);
|
||||
Rule* r = Rule::rule_table[aidx - 1];
|
||||
|
||||
// Check whether all patterns of the rule have matched.
|
||||
loop_over_list(r->patterns, j)
|
||||
{
|
||||
if ( ! accepted.is_member(r->patterns[j]->id) )
|
||||
goto next_pattern;
|
||||
|
||||
// See if depth is satisfied.
|
||||
if ( (unsigned int) matchpos[i] >
|
||||
r->patterns[j]->offset + r->patterns[j]->depth )
|
||||
goto next_pattern;
|
||||
|
||||
DBG_LOG(DBG_RULES, "All patterns of rule satisfied");
|
||||
|
||||
// FIXME: How to check for offset ??? ###
|
||||
}
|
||||
|
||||
// If not already in the list of matching rules, add it.
|
||||
if ( ! matched.is_member(r) )
|
||||
matched.append(r);
|
||||
|
||||
next_pattern:
|
||||
continue;
|
||||
if ( AllRulePatternsMatched(r, mpos, accepted_matches) )
|
||||
rule_matches.insert(r);
|
||||
}
|
||||
|
||||
// Check which of the matching rules really belong to any of our nodes.
|
||||
|
||||
loop_over_list(matched, j)
|
||||
for ( set<Rule*>::const_iterator it = rule_matches.begin();
|
||||
it != rule_matches.end(); ++it )
|
||||
{
|
||||
Rule* r = matched[j];
|
||||
Rule* r = *it;
|
||||
|
||||
DBG_LOG(DBG_RULES, "Accepted rule: %s", r->id);
|
||||
|
||||
|
|
|
@ -361,6 +361,9 @@ private:
|
|||
|
||||
void DumpStateStats(BroFile* f, RuleHdrTest* hdr_test);
|
||||
|
||||
static bool AllRulePatternsMatched(const Rule* r, MatchPos matchpos,
|
||||
const AcceptingMatchSet& ams);
|
||||
|
||||
int RE_level;
|
||||
bool parse_error;
|
||||
RuleHdrTest* root;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue