Merge branch 'topic/robin/cleanup-dfa-cache'

* topic/robin/cleanup-dfa-cache:
  Removing the EXPIRE_DFA_STATES code.
This commit is contained in:
Robin Sommer 2011-04-01 14:52:27 -07:00
commit ec1b2b4d2a
4 changed files with 56 additions and 330 deletions

View file

@ -21,11 +21,10 @@ DFA_State::DFA_State(int arg_state_num, const EquivClass* ec,
nfa_states = arg_nfa_states;
accept = arg_accept;
mark = 0;
lock = 0;
SymPartition(ec);
xtions = new DFA_State_Handle*[num_sym];
xtions = new DFA_State*[num_sym];
for ( int i = 0; i < num_sym; ++i )
xtions[i] = DFA_UNCOMPUTED_STATE_PTR;
@ -33,31 +32,14 @@ DFA_State::DFA_State(int arg_state_num, const EquivClass* ec,
DFA_State::~DFA_State()
{
for ( int i = 0; i < num_sym; ++i )
{
DFA_State_Handle* s = xtions[i];
if ( s && s != DFA_UNCOMPUTED_STATE_PTR )
StateUnref(s);
}
delete [] xtions;
delete nfa_states;
delete accept;
delete meta_ec;
}
void DFA_State::AddXtion(int sym, DFA_State_Handle* next_state)
void DFA_State::AddXtion(int sym, DFA_State* next_state)
{
// The order is important here: first StateRef() the new,
// then StateUnref() the old. Otherwise, we may get a problem
// if both are equal.
if ( next_state )
StateRef(next_state);
if ( xtions[sym] && xtions[sym] != DFA_UNCOMPUTED_STATE_PTR )
StateUnref(xtions[sym]);
xtions[sym] = next_state;
}
@ -94,14 +76,10 @@ void DFA_State::SymPartition(const EquivClass* ec)
meta_ec->BuildECs();
}
DFA_State_Handle* DFA_State::ComputeXtion(int sym, DFA_Machine* machine)
DFA_State* DFA_State::ComputeXtion(int sym, DFA_Machine* machine)
{
// Make sure we will not expire...
assert(IsLocked());
int equiv_sym = meta_ec->EquivRep(sym);
if ( xtions[equiv_sym] != DFA_UNCOMPUTED_STATE_PTR &&
StateIsValid(xtions[equiv_sym]) )
if ( xtions[equiv_sym] != DFA_UNCOMPUTED_STATE_PTR )
{
AddXtion(sym, xtions[equiv_sym]);
return xtions[sym];
@ -109,7 +87,7 @@ DFA_State_Handle* DFA_State::ComputeXtion(int sym, DFA_Machine* machine)
const EquivClass* ec = machine->EC();
DFA_State_Handle* next_d;
DFA_State* next_d;
NFA_state_list* ns = SymFollowSet(equiv_sym, ec);
if ( ns->length() > 0 )
@ -211,10 +189,10 @@ void DFA_State::ClearMarks()
for ( int i = 0; i < num_sym; ++i )
{
DFA_State_Handle* s = xtions[i];
DFA_State* s = xtions[i];
if ( s && s != DFA_UNCOMPUTED_STATE_PTR )
(*xtions[i])->ClearMarks();
xtions[i]->ClearMarks();
}
}
}
@ -243,7 +221,7 @@ void DFA_State::Dump(FILE* f, DFA_Machine* m)
int num_trans = 0;
for ( int sym = 0; sym < num_sym; ++sym )
{
DFA_State_Handle* s = xtions[sym];
DFA_State* s = xtions[sym];
if ( ! s )
continue;
@ -271,7 +249,7 @@ void DFA_State::Dump(FILE* f, DFA_Machine* m)
else
fprintf(f, "%stransition on %s to state %d",
++num_trans == 1 ? "\t" : "\n\t", xbuf,
(*s)->StateNum());
s->StateNum());
sym = i - 1;
}
@ -283,10 +261,10 @@ void DFA_State::Dump(FILE* f, DFA_Machine* m)
for ( int sym = 0; sym < num_sym; ++sym )
{
DFA_State_Handle* s = xtions[sym];
DFA_State* s = xtions[sym];
if ( s && s != DFA_UNCOMPUTED_STATE_PTR )
(*s)->Dump(f, m);
s->Dump(f, m);
}
}
@ -294,7 +272,7 @@ void DFA_State::Stats(unsigned int* computed, unsigned int* uncomputed)
{
for ( int sym = 0; sym < num_sym; ++sym )
{
DFA_State_Handle* s = xtions[sym];
DFA_State* s = xtions[sym];
if ( s == DFA_UNCOMPUTED_STATE_PTR )
(*uncomputed)++;
@ -313,11 +291,9 @@ unsigned int DFA_State::Size()
+ (centry ? padded_sizeof(CacheEntry) : 0);
}
DFA_State_Cache::DFA_State_Cache(int arg_maxsize)
{
maxsize = arg_maxsize;
head = tail = 0;
hits = misses = 0;
}
@ -328,13 +304,12 @@ DFA_State_Cache::~DFA_State_Cache()
while ( (e = (CacheEntry*) states.NextEntry(i)) )
{
assert(e->state);
StateInvalidate(e->state);
delete e->hash;
delete e;
}
}
DFA_State_Handle* DFA_State_Cache::Lookup(const NFA_state_list& nfas,
DFA_State* DFA_State_Cache::Lookup(const NFA_state_list& nfas,
HashKey** hash)
{
// We assume that state IDs don't exceed 10 digits, plus
@ -380,100 +355,24 @@ DFA_State_Handle* DFA_State_Cache::Lookup(const NFA_state_list& nfas,
delete *hash;
*hash = 0;
MoveToFront(e);
return e->state;
}
DFA_State_Handle* DFA_State_Cache::Insert(DFA_State* state, HashKey* hash)
DFA_State* DFA_State_Cache::Insert(DFA_State* state, HashKey* hash)
{
CacheEntry* e;
#ifdef EXPIRE_DFA_STATES
if ( states.Length() == maxsize )
{
// Remove oldest unlocked entry.
for ( e = tail; e; e = e->prev )
if ( ! (*e->state)->lock )
break;
if ( e )
Remove(e);
}
#endif
e = new CacheEntry;
#ifdef EXPIRE_DFA_STATES
// Insert as head.
e->state = new DFA_State_Handle(state);
e->state->state->centry = e;
#else
e->state = state;
e->state->centry = e;
#endif
e->hash = hash;
e->prev = 0;
e->next = head;
if ( head )
head->prev = e;
head = e;
if ( ! tail )
tail = e;
states.Insert(hash, e);
return e->state;
}
void DFA_State_Cache::Remove(CacheEntry* e)
{
if ( e == head )
{
head = e->next;
if ( head )
head->prev = 0;
}
else
e->prev->next = e->next;
if ( e == tail )
{
tail = e->prev;
if ( tail )
tail->next = 0;
}
else
e->next->prev = e->prev;
states.Remove(e->hash);
assert(e->state);
StateInvalidate(e->state);
delete e->hash;
delete e;
}
void DFA_State_Cache::MoveToFront(CacheEntry* e)
{
++hits;
if ( e->prev )
{
e->prev->next = e->next;
if ( e->next )
e->next->prev = e->prev;
else
tail = e->prev;
e->prev = 0;
e->next = head;
head->prev = e;
head = e;
}
}
void DFA_State_Cache::GetStats(Stats* s)
{
s->dfa_states = 0;
@ -490,9 +389,9 @@ void DFA_State_Cache::GetStats(Stats* s)
while ( (e = (CacheEntry*) states.NextEntry(i)) )
{
++s->dfa_states;
s->nfa_states += (*e->state)->NFAStateNum();
(*e->state)->Stats(&s->computed, &s->uncomputed);
s->mem += pad_size((*e->state)->Size()) + padded_sizeof(*e->state);
s->nfa_states += e->state->NFAStateNum();
e->state->Stats(&s->computed, &s->uncomputed);
s->mem += pad_size(e->state->Size()) + padded_sizeof(*e->state);
}
}
@ -514,9 +413,6 @@ DFA_Machine::DFA_Machine(NFA_Machine* n, EquivClass* arg_ec)
{
NFA_state_list* state_set = epsilon_closure(ns);
(void) StateSetToDFA_State(state_set, start_state, ec);
StateRef(start_state);
StateLock(start_state);
}
else
start_state = 0; // Jam
@ -524,12 +420,6 @@ DFA_Machine::DFA_Machine(NFA_Machine* n, EquivClass* arg_ec)
DFA_Machine::~DFA_Machine()
{
if ( start_state )
{
StateUnlock(start_state);
StateUnref(start_state);
}
delete dfa_state_cache;
Unref(nfa);
}
@ -541,8 +431,8 @@ void DFA_Machine::Describe(ODesc* d) const
void DFA_Machine::Dump(FILE* f)
{
(*start_state)->Dump(f, this);
(*start_state)->ClearMarks();
start_state->Dump(f, this);
start_state->ClearMarks();
}
void DFA_Machine::DumpStats(FILE* f)
@ -571,12 +461,11 @@ unsigned int DFA_Machine::MemoryAllocation() const
}
int DFA_Machine::StateSetToDFA_State(NFA_state_list* state_set,
DFA_State_Handle*& d, const EquivClass* ec)
DFA_State*& d, const EquivClass* ec)
{
HashKey* hash;
d = dfa_state_cache->Lookup(*state_set, &hash);
assert((! d) || StateIsValid(d));
if ( d )
return 0;

157
src/DFA.h
View file

@ -8,57 +8,12 @@
#include <assert.h>
// It's possible to use a fixed size cache of computed states for each DFA.
// If the number of DFA states reaches the given limit, old states are expired
// on a least-recently-used basis. This may impact the performance significantly
// if expired states have to be recalculated regularly, but it limits the
// amount of memory taken by a DFA.
//
// Enable by configuring with --with-expire-dfa-states.
class DFA_State;
// The cache marks expired states as invalid.
#define DFA_INVALID_STATE_PTR ((DFA_State*) -1)
// Transitions to the uncomputed state indicate that we haven't yet
// computed the state to go to.
#define DFA_UNCOMPUTED_STATE -2
#define DFA_UNCOMPUTED_STATE_PTR ((DFA_State_Handle*) DFA_UNCOMPUTED_STATE)
#ifdef EXPIRE_DFA_STATES
class DFA_State_Handle {
public:
// The reference counting keeps track of this *handle* (not the state).
void Ref() { assert(state); ++refcount; }
void Unref()
{
if ( --refcount == 0 )
delete this;
}
inline void Invalidate();
bool IsValid() const { return state != DFA_INVALID_STATE_PTR; }
DFA_State* State() const { return state; }
DFA_State* operator->() const { return state; }
protected:
friend class DFA_State_Cache;
DFA_State_Handle(DFA_State* arg_state)
{ state = arg_state; refcount = 1; }
inline ~DFA_State_Handle();
DFA_State* state;
int refcount;
};
#else
typedef DFA_State DFA_State_Handle;
#endif
#define DFA_UNCOMPUTED_STATE_PTR ((DFA_State*) DFA_UNCOMPUTED_STATE)
#include "NFA.h"
@ -76,9 +31,9 @@ public:
int StateNum() const { return state_num; }
int NFAStateNum() const { return nfa_states->length(); }
void AddXtion(int sym, DFA_State_Handle* next_state);
void AddXtion(int sym, DFA_State* next_state);
inline DFA_State_Handle* Xtion(int sym, DFA_Machine* machine);
inline DFA_State* Xtion(int sym, DFA_Machine* machine);
const AcceptingSet* Accept() const { return accept; }
void SymPartition(const EquivClass* ec);
@ -98,43 +53,29 @@ public:
void Stats(unsigned int* computed, unsigned int* uncomputed);
unsigned int Size();
// Locking a state will keep it from expiring from a cache.
void Lock() { ++lock; }
void Unlock() { --lock; }
#ifdef EXPIRE_DFA_STATES
bool IsLocked() { return lock != 0; }
#else
bool IsLocked() { return true; }
DFA_State* operator->(){ return this; }
#endif
protected:
friend class DFA_State_Cache;
DFA_State_Handle* ComputeXtion(int sym, DFA_Machine* machine);
DFA_State* ComputeXtion(int sym, DFA_Machine* machine);
void AppendIfNew(int sym, int_list* sym_list);
int state_num;
int num_sym;
DFA_State_Handle** xtions;
DFA_State** xtions;
AcceptingSet* accept;
NFA_state_list* nfa_states;
EquivClass* meta_ec; // which ec's make same transition
DFA_State* mark;
int lock;
CacheEntry* centry;
static unsigned int transition_counter; // see Xtion()
};
struct CacheEntry {
DFA_State_Handle* state;
DFA_State* state;
HashKey* hash;
CacheEntry* next;
CacheEntry* prev;
};
class DFA_State_Cache {
@ -143,13 +84,11 @@ public:
~DFA_State_Cache();
// If the caller stores the handle, it has to call Ref() on it.
DFA_State_Handle* Lookup(const NFA_state_list& nfa_states,
DFA_State* Lookup(const NFA_state_list& nfa_states,
HashKey** hash);
// Takes ownership of both; hash is the one returned by Lookup().
DFA_State_Handle* Insert(DFA_State* state, HashKey* hash);
void MoveToFront(DFA_State* state) { MoveToFront(state->centry); }
DFA_State* Insert(DFA_State* state, HashKey* hash);
int NumEntries() const { return states.Length(); }
@ -168,9 +107,6 @@ public:
void GetStats(Stats* s);
private:
void Remove(CacheEntry* e);
void MoveToFront(CacheEntry* e);
int maxsize;
int hits; // Statistics
@ -180,10 +116,6 @@ private:
// Hash indexed by NFA states (MD5s of them, actually).
PDict(CacheEntry) states;
// List in LRU order.
CacheEntry* head;
CacheEntry* tail;
};
declare(PList,DFA_State);
@ -196,7 +128,7 @@ public:
int* acc_array);
~DFA_Machine();
DFA_State_Handle* StartState() const { return start_state; }
DFA_State* StartState() const { return start_state; }
int NumStates() const { return dfa_state_cache->NumEntries(); }
@ -217,74 +149,18 @@ protected:
int state_count;
// The state list has to be sorted according to IDs.
int StateSetToDFA_State(NFA_state_list* state_set, DFA_State_Handle*& d,
int StateSetToDFA_State(NFA_state_list* state_set, DFA_State*& d,
const EquivClass* ec);
const EquivClass* EC() const { return ec; }
EquivClass* ec; // equivalence classes corresponding to NFAs
DFA_State_Handle* start_state;
DFA_State* start_state;
DFA_State_Cache* dfa_state_cache;
NFA_Machine* nfa;
};
#ifdef EXPIRE_DFA_STATES
inline DFA_State_Handle* DFA_State::Xtion(int sym, DFA_Machine* machine)
{
Lock();
// This is just a clumsy form of sampling... Instead of moving
// the state to the front of our LRU cache on each transition (which
// would be quite often) we just do it on every nth transition
// (counted across all DFA states). This is based on the observation
// that a very few of all states are used most of the time.
// (currently n=10000; should it be configurable?)
if ( transition_counter++ % 10000 == 0 )
machine->Cache()->MoveToFront(this);
DFA_State_Handle* h;
if ( xtions[sym] == DFA_UNCOMPUTED_STATE_PTR ||
(xtions[sym] && ! xtions[sym]->IsValid()) )
h = ComputeXtion(sym, machine);
else
h = xtions[sym];
Unlock();
return h;
}
inline DFA_State_Handle::~DFA_State_Handle()
{
if ( state != DFA_INVALID_STATE_PTR )
delete state;
}
inline void DFA_State_Handle::Invalidate()
{
assert(state!=DFA_INVALID_STATE_PTR);
delete state;
state = DFA_INVALID_STATE_PTR;
Unref();
}
// Not nice, but it helps avoid some overhead in the non-expiration case.
static inline void StateLock(DFA_State_Handle* s) { s->State()->Lock(); }
static inline void StateUnlock(DFA_State_Handle* s) { s->State()->Unlock(); }
static inline void StateRef(DFA_State_Handle* s) { s->Ref(); }
static inline void StateUnref(DFA_State_Handle* s) { s->Unref(); }
static inline void StateInvalidate(DFA_State_Handle* s) { s->Invalidate(); }
static inline bool StateIsValid(DFA_State_Handle* s)
{
return ! s || s->IsValid();
}
#else
inline DFA_State_Handle* DFA_State::Xtion(int sym, DFA_Machine* machine)
inline DFA_State* DFA_State::Xtion(int sym, DFA_Machine* machine)
{
if ( xtions[sym] == DFA_UNCOMPUTED_STATE_PTR )
return ComputeXtion(sym, machine);
@ -292,13 +168,4 @@ inline DFA_State_Handle* DFA_State::Xtion(int sym, DFA_Machine* machine)
return xtions[sym];
}
static inline void StateLock(DFA_State_Handle* s) { }
static inline void StateUnlock(DFA_State_Handle* s) { }
static inline void StateRef(DFA_State_Handle* s) { }
static inline void StateUnref(DFA_State_Handle* s) { }
static inline void StateInvalidate(DFA_State_Handle* s) { }
static inline bool StateIsValid(DFA_State_Handle* s) { return true; }
#endif
#endif

View file

@ -211,8 +211,8 @@ int Specific_RE_Matcher::MatchAll(const u_char* bv, int n)
// matched is empty.
return n == 0;
DFA_State_Handle* d = dfa->StartState();
d = (*d)->Xtion(ecs[SYM_BOL], dfa);
DFA_State* d = dfa->StartState();
d = d->Xtion(ecs[SYM_BOL], dfa);
while ( d )
{
@ -220,13 +220,13 @@ int Specific_RE_Matcher::MatchAll(const u_char* bv, int n)
break;
int ec = ecs[*(bv++)];
d = (*d)->Xtion(ec, dfa);
d = d->Xtion(ec, dfa);
}
if ( d )
d = (*d)->Xtion(ecs[SYM_EOL], dfa);
d = d->Xtion(ecs[SYM_EOL], dfa);
return d && (*d)->Accept() != 0;
return d && d->Accept() != 0;
}
@ -236,26 +236,26 @@ int Specific_RE_Matcher::Match(const u_char* bv, int n)
// An empty pattern matches anything.
return 1;
DFA_State_Handle* d = dfa->StartState();
DFA_State* d = dfa->StartState();
d = (*d)->Xtion(ecs[SYM_BOL], dfa);
d = d->Xtion(ecs[SYM_BOL], dfa);
if ( ! d ) return 0;
for ( int i = 0; i < n; ++i )
{
int ec = ecs[bv[i]];
d = (*d)->Xtion(ec, dfa);
d = d->Xtion(ec, dfa);
if ( ! d )
break;
if ( (*d)->Accept() )
if ( d->Accept() )
return i + 1;
}
if ( d )
{
d = (*d)->Xtion(ecs[SYM_EOL], dfa);
if ( d && (*d)->Accept() )
d = d->Xtion(ecs[SYM_EOL], dfa);
if ( d && d->Accept() )
return n > 0 ? n : 1; // we can't return 0 here for match...
}
@ -268,12 +268,6 @@ void Specific_RE_Matcher::Dump(FILE* f)
dfa->Dump(f);
}
RE_Match_State::~RE_Match_State()
{
if ( current_state )
StateUnref(current_state);
}
bool RE_Match_State::Match(const u_char* bv, int n,
bool bol, bool eol, bool clear)
{
@ -289,9 +283,8 @@ bool RE_Match_State::Match(const u_char* bv, int n,
// Initialize state and copy the accepting states of the start
// state into the acceptance set.
current_state = dfa->StartState();
StateRef(current_state);
const AcceptingSet* ac = (*current_state)->Accept();
const AcceptingSet* ac = current_state->Accept();
if ( ac )
{
loop_over_list(*ac, i)
@ -303,20 +296,11 @@ bool RE_Match_State::Match(const u_char* bv, int n,
}
else if ( clear )
{
if ( current_state )
StateUnref(current_state);
current_state = dfa->StartState();
StateRef(current_state);
}
if ( ! current_state )
return false;
else
(*current_state)->Unlock();
current_pos = 0;
int old_matches = accepted.length();
@ -334,7 +318,7 @@ bool RE_Match_State::Match(const u_char* bv, int n,
else
ec = ecs[*(bv++)];
DFA_State_Handle* next_state = (*current_state)->Xtion(ec,dfa);
DFA_State* next_state = current_state->Xtion(ec,dfa);
if ( ! next_state )
{
@ -342,9 +326,9 @@ bool RE_Match_State::Match(const u_char* bv, int n,
break;
}
if ( (*next_state)->Accept() )
if ( next_state->Accept() )
{
const AcceptingSet* ac = (*next_state)->Accept();
const AcceptingSet* ac = next_state->Accept();
loop_over_list(*ac, i)
{
if ( ! accepted.is_member((*ac)[i]) )
@ -357,15 +341,9 @@ bool RE_Match_State::Match(const u_char* bv, int n,
++current_pos;
StateRef(next_state);
StateUnref(current_state);
current_state = next_state;
}
// Make sure our state doesn't expire until we return.
if ( current_state )
(*current_state)->Lock();
return accepted.length() != old_matches;
}
@ -377,31 +355,31 @@ int Specific_RE_Matcher::LongestMatch(const u_char* bv, int n)
// Use -1 to indicate no match.
int last_accept = -1;
DFA_State_Handle* d = dfa->StartState();
DFA_State* d = dfa->StartState();
d = (*d)->Xtion(ecs[SYM_BOL], dfa);
d = d->Xtion(ecs[SYM_BOL], dfa);
if ( ! d )
return -1;
if ( (*d)->Accept() )
if ( d->Accept() )
last_accept = 0;
for ( int i = 0; i < n; ++i )
{
int ec = ecs[bv[i]];
d = (*d)->Xtion(ec, dfa);
d = d->Xtion(ec, dfa);
if ( ! d )
break;
if ( (*d)->Accept() )
if ( d->Accept() )
last_accept = i + 1;
}
if ( d )
{
d = (*d)->Xtion(ecs[SYM_EOL], dfa);
if ( d && (*d)->Accept() )
d = d->Xtion(ecs[SYM_EOL], dfa);
if ( d && d->Accept() )
return n;
}

View file

@ -19,6 +19,7 @@ class NFA_Machine;
class DFA_Machine;
class Specific_RE_Matcher;
class RE_Matcher;
class DFA_State;
declare(PDict,char);
declare(PDict,CCL);
@ -126,13 +127,6 @@ protected:
AcceptingSet* accepted;
};
#ifdef EXPIRE_DFA_STATES
class DFA_State_Handle;
#else
class DFA_State;
typedef DFA_State DFA_State_Handle;
#endif
class RE_Match_State {
public:
RE_Match_State(Specific_RE_Matcher* matcher)
@ -143,8 +137,6 @@ public:
current_state = 0;
}
~RE_Match_State();
const AcceptingSet* Accepted() const { return &accepted; }
const int_list* MatchPositions() const { return &match_pos; }
@ -169,7 +161,7 @@ protected:
AcceptingSet accepted;
int_list match_pos;
DFA_State_Handle* current_state;
DFA_State* current_state;
int current_pos;
};