Refactor regex/signature AcceptingSet data structure and usages.

Several parts of that code perform membership checks, which are more
efficient with a set than with a list data structure.
This commit is contained in:
Jon Siwek 2014-04-21 16:55:51 -05:00
parent 8126f06ffb
commit 171c6ce86b
5 changed files with 99 additions and 124 deletions

View file

@ -211,9 +211,10 @@ void DFA_State::Dump(FILE* f, DFA_Machine* m)
if ( accept )
{
for ( int i = 0; i < accept->length(); ++i )
fprintf(f, "%s accept #%d",
i > 0 ? "," : "", int((*accept)[i]));
AcceptingSet::const_iterator it;
for ( it = accept->begin(); it != accept->end(); ++it )
fprintf(f, "%s accept #%d", it == accept->begin() ? "" : ",", *it);
}
fprintf(f, "\n");
@ -285,7 +286,7 @@ unsigned int DFA_State::Size()
{
return sizeof(*this)
+ pad_size(sizeof(DFA_State*) * num_sym)
+ (accept ? pad_size(sizeof(int) * accept->length()) : 0)
+ (accept ? pad_size(sizeof(int) * accept->size()) : 0)
+ (nfa_states ? pad_size(sizeof(NFA_State*) * nfa_states->length()) : 0)
+ (meta_ec ? meta_ec->Size() : 0)
+ (centry ? padded_sizeof(CacheEntry) : 0);
@ -470,33 +471,20 @@ int DFA_Machine::StateSetToDFA_State(NFA_state_list* state_set,
return 0;
AcceptingSet* accept = new AcceptingSet;
for ( int i = 0; i < state_set->length(); ++i )
{
int acc = (*state_set)[i]->Accept();
if ( acc != NO_ACCEPT )
{
int j;
for ( j = 0; j < accept->length(); ++j )
if ( (*accept)[j] == acc )
break;
if ( j >= accept->length() )
// It's not already present.
accept->append(acc);
}
accept->insert(acc);
}
if ( accept->length() == 0 )
if ( accept->empty() )
{
delete accept;
accept = 0;
}
else
{
accept->sort(int_list_cmp);
accept->resize(0);
}
DFA_State* ds = new DFA_State(state_count++, ec, state_set, accept);
d = dfa_state_cache->Insert(ds, hash);

View file

@ -3,6 +3,7 @@
#include "config.h"
#include <stdlib.h>
#include <utility>
#include "RE.h"
#include "DFA.h"
@ -266,6 +267,15 @@ void Specific_RE_Matcher::Dump(FILE* f)
dfa->Dump(f);
}
inline void RE_Match_State::AddMatches(const AcceptingSet& as,
MatchPos position)
{
typedef std::pair<AcceptIdx, MatchPos> am_idx;
for ( AcceptingSet::const_iterator it = as.begin(); it != as.end(); ++it )
accepted_matches.insert(am_idx(*it, position));
}
bool RE_Match_State::Match(const u_char* bv, int n,
bool bol, bool eol, bool clear)
{
@ -283,14 +293,9 @@ bool RE_Match_State::Match(const u_char* bv, int n,
current_state = dfa->StartState();
const AcceptingSet* ac = current_state->Accept();
if ( ac )
{
loop_over_list(*ac, i)
{
accepted.append((*ac)[i]);
match_pos.append(0);
}
}
AddMatches(*ac, 0);
}
else if ( clear )
@ -301,7 +306,7 @@ bool RE_Match_State::Match(const u_char* bv, int n,
current_pos = 0;
int old_matches = accepted.length();
size_t old_matches = accepted_matches.size();
int ec;
int m = bol ? n + 1 : n;
@ -324,25 +329,17 @@ bool RE_Match_State::Match(const u_char* bv, int n,
break;
}
if ( next_state->Accept() )
{
const AcceptingSet* ac = next_state->Accept();
loop_over_list(*ac, i)
{
if ( ! accepted.is_member((*ac)[i]) )
{
accepted.append((*ac)[i]);
match_pos.append(current_pos);
}
}
}
if ( ac )
AddMatches(*ac, current_pos);
++current_pos;
current_state = next_state;
}
return accepted.length() != old_matches;
return accepted_matches.size() != old_matches;
}
int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n)
@ -399,7 +396,8 @@ unsigned int Specific_RE_Matcher::MemoryAllocation() const
+ equiv_class.Size() - padded_sizeof(EquivClass)
+ (dfa ? dfa->MemoryAllocation() : 0) // this is ref counted; consider the bytes here?
+ padded_sizeof(*any_ccl)
+ accepted->MemoryAllocation();
+ padded_sizeof(*accepted)
+ accepted->size() * padded_sizeof(AcceptingSet::key_type);
}
RE_Matcher::RE_Matcher()

View file

@ -9,6 +9,9 @@
#include "CCL.h"
#include "EquivClass.h"
#include <set>
#include <map>
#include <ctype.h>
typedef int (*cce_func)(int);
@ -33,7 +36,10 @@ extern int re_lex(void);
extern int clower(int);
extern void synerr(const char str[]);
typedef int_list AcceptingSet;
typedef int AcceptIdx;
typedef std::set<AcceptIdx> AcceptingSet;
typedef uint64 MatchPos;
typedef std::map<AcceptIdx, MatchPos> AcceptingMatchSet;
typedef name_list string_list;
typedef enum { MATCH_ANYWHERE, MATCH_EXACTLY, } match_type;
@ -135,8 +141,8 @@ public:
current_state = 0;
}
const AcceptingSet* Accepted() const { return &accepted; }
const int_list* MatchPositions() const { return &match_pos; }
const AcceptingMatchSet& AcceptedMatches() const
{ return accepted_matches; }
// Returns the number of bytes fed into the matcher so far
int Length() { return current_pos; }
@ -149,16 +155,16 @@ public:
{
current_pos = -1;
current_state = 0;
accepted.clear();
match_pos.clear();
accepted_matches.clear();
}
void AddMatches(const AcceptingSet& as, MatchPos position);
protected:
DFA_Machine* dfa;
int* ecs;
AcceptingSet accepted;
int_list match_pos;
AcceptingMatchSet accepted_matches;
DFA_State* current_state;
int current_pos;
};

View file

@ -594,6 +594,29 @@ RuleFileMagicState* RuleMatcher::InitFileMagic() const
return state;
}
// Returns true only if every pattern of rule 'r' has an entry in 'ams' and
// 'matchpos' does not exceed that pattern's offset + depth; returns false
// as soon as one pattern fails either test.
bool RuleMatcher::AllRulePatternsMatched(const Rule* r, MatchPos matchpos,
                                         const AcceptingMatchSet& ams)
	{
	DBG_LOG(DBG_RULES, "Checking rule: %s", r->id);

	const AcceptingMatchSet::const_iterator not_found = ams.end();

	// Walk all patterns of the rule; each one must have matched.
	loop_over_list(r->patterns, j)
		{
		const bool was_accepted = ams.find(r->patterns[j]->id) != not_found;

		if ( ! was_accepted )
			return false;

		// The match position must lie within the pattern's depth.
		const bool within_depth =
			matchpos <= r->patterns[j]->offset + r->patterns[j]->depth;

		if ( ! within_depth )
			return false;

		// FIXME: How to check for offset ??? ###
		}

	DBG_LOG(DBG_RULES, "All patterns of rule satisfied");
	return true;
	}
RuleMatcher::MIME_Matches* RuleMatcher::Match(RuleFileMagicState* state,
const u_char* data, uint64 len,
MIME_Matches* rval) const
@ -636,56 +659,39 @@ RuleMatcher::MIME_Matches* RuleMatcher::Match(RuleFileMagicState* state,
DBG_LOG(DBG_RULES, "New pattern match found");
AcceptingSet accepted;
int_list matchpos;
AcceptingMatchSet accepted_matches;
loop_over_list(state->matchers, y)
{
RuleFileMagicState::Matcher* m = state->matchers[y];
const AcceptingSet* ac = m->state->Accepted();
loop_over_list(*ac, k)
{
if ( ! accepted.is_member((*ac)[k]) )
{
accepted.append((*ac)[k]);
matchpos.append((*m->state->MatchPositions())[k]);
}
}
const AcceptingMatchSet& ams = m->state->AcceptedMatches();
accepted_matches.insert(ams.begin(), ams.end());
}
// Find rules for which patterns have matched.
rule_list matched;
set<Rule*> rule_matches;
loop_over_list(accepted, i)
for ( AcceptingMatchSet::const_iterator it = accepted_matches.begin();
it != accepted_matches.end(); ++it )
{
Rule* r = Rule::rule_table[accepted[i] - 1];
AcceptIdx aidx = it->first;
MatchPos mpos = it->second;
DBG_LOG(DBG_RULES, "Checking rule: %v", r->id);
Rule* r = Rule::rule_table[aidx - 1];
loop_over_list(r->patterns, j)
{
if ( ! accepted.is_member(r->patterns[j]->id) )
continue;
if ( (unsigned int) matchpos[i] >
r->patterns[j]->offset + r->patterns[j]->depth )
continue;
DBG_LOG(DBG_RULES, "All patterns of rule satisfied");
if ( AllRulePatternsMatched(r, mpos, accepted_matches) )
rule_matches.insert(r);
}
if ( ! matched.is_member(r) )
matched.append(r);
}
loop_over_list(matched, j)
for ( set<Rule*>::const_iterator it = rule_matches.begin();
it != rule_matches.end(); ++it )
{
Rule* r = matched[j];
Rule* r = *it;
loop_over_list(r->actions, rai)
{
const RuleActionMIME* ram = dynamic_cast<const RuleActionMIME*>(r->actions[rai]);
const RuleActionMIME* ram =
dynamic_cast<const RuleActionMIME*>(r->actions[rai]);
if ( ! ram )
continue;
@ -876,66 +882,40 @@ void RuleMatcher::Match(RuleEndpointState* state, Rule::PatternType type,
DBG_LOG(DBG_RULES, "New pattern match found");
// Build a joined AcceptingSet.
AcceptingSet accepted;
int_list matchpos;
AcceptingMatchSet accepted_matches;
loop_over_list(state->matchers, y)
loop_over_list(state->matchers, y )
{
RuleEndpointState::Matcher* m = state->matchers[y];
const AcceptingSet* ac = m->state->Accepted();
loop_over_list(*ac, k)
{
if ( ! accepted.is_member((*ac)[k]) )
{
accepted.append((*ac)[k]);
matchpos.append((*m->state->MatchPositions())[k]);
}
}
const AcceptingMatchSet& ams = m->state->AcceptedMatches();
accepted_matches.insert(ams.begin(), ams.end());
}
// Determine the rules for which all patterns have matched.
// This code should be fast enough as long as there are only very few
// matched patterns per connection (which is a plausible assumption).
rule_list matched;
// Find rules for which patterns have matched.
set<Rule*> rule_matches;
loop_over_list(accepted, i)
for ( AcceptingMatchSet::const_iterator it = accepted_matches.begin();
it != accepted_matches.end(); ++it )
{
Rule* r = Rule::rule_table[accepted[i] - 1];
AcceptIdx aidx = it->first;
MatchPos mpos = it->second;
DBG_LOG(DBG_RULES, "Checking rule: %s", r->id);
Rule* r = Rule::rule_table[aidx - 1];
// Check whether all patterns of the rule have matched.
loop_over_list(r->patterns, j)
{
if ( ! accepted.is_member(r->patterns[j]->id) )
goto next_pattern;
// See if depth is satisfied.
if ( (unsigned int) matchpos[i] >
r->patterns[j]->offset + r->patterns[j]->depth )
goto next_pattern;
DBG_LOG(DBG_RULES, "All patterns of rule satisfied");
// FIXME: How to check for offset ??? ###
}
// If not already in the list of matching rules, add it.
if ( ! matched.is_member(r) )
matched.append(r);
next_pattern:
continue;
if ( AllRulePatternsMatched(r, mpos, accepted_matches) )
rule_matches.insert(r);
}
// Check which of the matching rules really belong to any of our nodes.
loop_over_list(matched, j)
for ( set<Rule*>::const_iterator it = rule_matches.begin();
it != rule_matches.end(); ++it )
{
Rule* r = matched[j];
Rule* r = *it;
DBG_LOG(DBG_RULES, "Accepted rule: %s", r->id);

View file

@ -361,6 +361,9 @@ private:
void DumpStateStats(BroFile* f, RuleHdrTest* hdr_test);
static bool AllRulePatternsMatched(const Rule* r, MatchPos matchpos,
const AcceptingMatchSet& ams);
int RE_level;
bool parse_error;
RuleHdrTest* root;