zeek/src/DFA.cc

// See the file "COPYING" in the main distribution directory for copyright.

#include "zeek/DFA.h"

#include "zeek/Desc.h"
#include "zeek/EquivClass.h"
#include "zeek/Hash.h"

namespace zeek::detail {

DFA_State::DFA_State(int arg_state_num, const EquivClass* ec, NFA_state_list* arg_nfa_states,
                     AcceptingSet* arg_accept) {
    state_num = arg_state_num;
    num_sym = ec->NumClasses();
    nfa_states = arg_nfa_states;
    accept = arg_accept;
    mark = nullptr;

    SymPartition(ec);

    xtions = new DFA_State*[num_sym];

    for ( int i = 0; i < num_sym; ++i )
        xtions[i] = DFA_UNCOMPUTED_STATE_PTR;
}

DFA_State::~DFA_State() {
    delete[] xtions;
    delete nfa_states;
    delete accept;
    delete meta_ec;
}

void DFA_State::AddXtion(int sym, DFA_State* next_state) { xtions[sym] = next_state; }

void DFA_State::SymPartition(const EquivClass* ec) {
    // Partitioning is done by creating equivalence classes for those
    // characters which have out-transitions from the given state.  Thus
    // we are really creating equivalence classes of equivalence classes.
    meta_ec = new EquivClass(ec->NumClasses());

    assert(nfa_states);
    for ( int i = 0; i < nfa_states->length(); ++i ) {
        NFA_State* n = (*nfa_states)[i];
        int sym = n->TransSym();

        if ( sym == SYM_EPSILON )
            continue;

        if ( sym != SYM_CCL ) { // character transition
            if ( ec->IsRep(sym) ) {
                sym = ec->SymEquivClass(sym);
                meta_ec->UniqueChar(sym);
            }
            continue;
        }

        // Character class.
        meta_ec->CCL_Use(n->TransCCL());
    }

    meta_ec->BuildECs();
}

DFA_State* DFA_State::ComputeXtion(int sym, DFA_Machine* machine) {
    int equiv_sym = meta_ec->EquivRep(sym);
    if ( xtions[equiv_sym] != DFA_UNCOMPUTED_STATE_PTR ) {
        AddXtion(sym, xtions[equiv_sym]);
        return xtions[sym];
    }

    const EquivClass* ec = machine->EC();

    DFA_State* next_d;

    NFA_state_list* ns = SymFollowSet(equiv_sym, ec);
    if ( ns->length() > 0 ) {
        NFA_state_list* state_set = epsilon_closure(ns);
        if ( ! machine->StateSetToDFA_State(state_set, next_d, ec) )
            delete state_set;
    }
    else {
        delete ns;
        next_d = nullptr; // Jam
    }

    AddXtion(equiv_sym, next_d);
    if ( sym != equiv_sym )
        AddXtion(sym, next_d);

    return xtions[sym];
}

void DFA_State::AppendIfNew(int sym, int_list* sym_list) {
    for ( auto value : *sym_list )
        if ( value == sym )
            return;

    sym_list->push_back(sym);
}

NFA_state_list* DFA_State::SymFollowSet(int ec_sym, const EquivClass* ec) {
    NFA_state_list* ns = new NFA_state_list;

    assert(nfa_states);

    for ( int i = 0; i < nfa_states->length(); ++i ) {
        NFA_State* n = (*nfa_states)[i];

        if ( n->TransSym() == SYM_CCL ) { // it's a character class
            CCL* ccl = n->TransCCL();
            int_list* syms = ccl->Syms();

            if ( ccl->IsNegated() ) {
                size_t j;
                for ( j = 0; j < syms->size(); ++j ) {
                    // Loop through (sorted) negated
                    // character class, which has
                    // presumably already been converted
                    // over to equivalence classes.
                    if ( (*syms)[j] >= ec_sym )
                        break;
                }

                if ( j >= syms->size() || (*syms)[j] > ec_sym )
                    // Didn't find ec_sym in ccl.
                    n->AddXtionsTo(ns);

                continue;
            }

            for ( auto sym : *syms ) {
                if ( sym > ec_sym )
                    break;

                if ( sym == ec_sym ) {
                    n->AddXtionsTo(ns);
                    break;
                }
            }
        }

        else if ( n->TransSym() == SYM_EPSILON ) { // do nothing
        }

        else if ( ec->IsRep(n->TransSym()) ) {
            if ( ec_sym == ec->SymEquivClass(n->TransSym()) )
                n->AddXtionsTo(ns);
        }
    }

    ns->resize(0);
    return ns;
}

void DFA_State::ClearMarks() {
    if ( mark ) {
        SetMark(nullptr);

        for ( int i = 0; i < num_sym; ++i ) {
            DFA_State* s = xtions[i];

            if ( s && s != DFA_UNCOMPUTED_STATE_PTR )
                xtions[i]->ClearMarks();
        }
    }
}

void DFA_State::Describe(ODesc* d) const { d->Add("DFA state"); }

void DFA_State::Dump(FILE* f, DFA_Machine* m) {
    if ( mark )
        return;

    fprintf(f, "\nDFA state %d:", StateNum());

    if ( accept ) {
        AcceptingSet::const_iterator it;

        for ( it = accept->begin(); it != accept->end(); ++it )
            fprintf(f, "%s accept #%d", it == accept->begin() ? "" : ",", *it);
    }

    fprintf(f, "\n");

    int num_trans = 0;
    for ( int sym = 0; sym < num_sym; ++sym ) {
        DFA_State* s = xtions[sym];

        if ( ! s )
            continue;

        // Look ahead for compression.
        int i;
        for ( i = sym + 1; i < num_sym; ++i )
            if ( xtions[i] != s )
                break;

        constexpr int xbuf_size = 512;
        char* xbuf = new char[xbuf_size];

        int r = m->Rep(sym);
        if ( ! r )
            r = '.';

        if ( i == sym + 1 )
            snprintf(xbuf, xbuf_size, "'%c'", r);
        else
            snprintf(xbuf, xbuf_size, "'%c'-'%c'", r, m->Rep(i - 1));

        if ( s == DFA_UNCOMPUTED_STATE_PTR )
            fprintf(f, "%stransition on %s to <uncomputed>", ++num_trans == 1 ? "\t" : "\n\t", xbuf);
        else
            fprintf(f, "%stransition on %s to state %d", ++num_trans == 1 ? "\t" : "\n\t", xbuf, s->StateNum());

        delete[] xbuf;

        sym = i - 1;
    }

    if ( num_trans > 0 )
        fprintf(f, "\n");

    SetMark(this);

    for ( int sym = 0; sym < num_sym; ++sym ) {
        DFA_State* s = xtions[sym];

        if ( s && s != DFA_UNCOMPUTED_STATE_PTR )
            s->Dump(f, m);
    }
}

void DFA_State::Stats(unsigned int* computed, unsigned int* uncomputed) {
    for ( int sym = 0; sym < num_sym; ++sym ) {
        DFA_State* s = xtions[sym];

        if ( s == DFA_UNCOMPUTED_STATE_PTR )
            (*uncomputed)++;
        else
            (*computed)++;
    }
}

unsigned int DFA_State::Size() {
    return sizeof(*this) + util::pad_size(sizeof(DFA_State*) * num_sym) +
           (accept ? util::pad_size(sizeof(int) * accept->size()) : 0) +
           (nfa_states ? util::pad_size(sizeof(NFA_State*) * nfa_states->length()) : 0) +
           (meta_ec ? meta_ec->Size() : 0);
}

DFA_State_Cache::DFA_State_Cache() { hits = misses = 0; }

DFA_State_Cache::~DFA_State_Cache() {
    for ( auto& entry : states ) {
        assert(entry.second);
        Unref(entry.second);
    }

    states.clear();
}

DFA_State* DFA_State_Cache::Lookup(const NFA_state_list& nfas, DigestStr* digest) {
    // We assume that state ID's don't exceed 10 digits, plus
    // we allow one more character for the delimiter.
    auto id_tag_buf = std::make_unique<char[]>(nfas.length() * 11 + 1);
    auto id_tag = id_tag_buf.get();
    char* p = id_tag;

    for ( int i = 0; i < nfas.length(); ++i ) {
        NFA_State* n = nfas[i];
        if ( n->TransSym() != SYM_EPSILON || n->Accept() != NO_ACCEPT ) {
            int id = n->ID();
            do {
                *p++ = '0' + (char)(id % 10);
                id /= 10;
            } while ( id > 0 );
            *p++ = '&';
        }
    }

    *p++ = '\0';

    // We use the short MD5 instead of the full string for the
    // HashKey because the data is copied into the key.
    hash128_t hash;
    KeyedHash::Hash128(id_tag, p - id_tag, &hash);
    *digest = DigestStr(reinterpret_cast<const char*>(hash), 16);

    auto entry = states.find(*digest);
    if ( entry == states.end() ) {
        ++misses;
        return nullptr;
    }
    ++hits;

    digest->clear();

    return entry->second;
}

DFA_State* DFA_State_Cache::Insert(DFA_State* state, DigestStr digest) {
    states.emplace(std::move(digest), state);
    return state;
}

void DFA_State_Cache::GetStats(Stats* s) {
    s->dfa_states = 0;
    s->nfa_states = 0;
    s->computed = 0;
    s->uncomputed = 0;
    s->mem = 0;
    s->hits = hits;
    s->misses = misses;

    for ( const auto& state : states ) {
        DFA_State* e = state.second;
        ++s->dfa_states;
        s->nfa_states += e->NFAStateNum();
        e->Stats(&s->computed, &s->uncomputed);
        s->mem += util::pad_size(e->Size()) + padded_sizeof(*e);
    }
}

DFA_Machine::DFA_Machine(NFA_Machine* n, EquivClass* arg_ec) {
    state_count = 0;

    nfa = n;
    Ref(n);

    ec = arg_ec;

    dfa_state_cache = new DFA_State_Cache();

    NFA_state_list* ns = new NFA_state_list;
    ns->push_back(n->FirstState());

    if ( ns->length() > 0 ) {
        NFA_state_list* state_set = epsilon_closure(ns);
        StateSetToDFA_State(state_set, start_state, ec);
    }
    else {
        start_state = nullptr; // Jam
        delete ns;
    }
}

DFA_Machine::~DFA_Machine() {
    delete dfa_state_cache;
    Unref(nfa);
}

void DFA_Machine::Describe(ODesc* d) const { d->Add("DFA machine"); }

void DFA_Machine::Dump(FILE* f) {
    start_state->Dump(f, this);
    start_state->ClearMarks();
}

bool DFA_Machine::StateSetToDFA_State(NFA_state_list* state_set, DFA_State*& d, const EquivClass* ec) {
    DigestStr digest;
    d = dfa_state_cache->Lookup(*state_set, &digest);

    if ( d )
        return false;

    AcceptingSet* accept = new AcceptingSet;

    for ( int i = 0; i < state_set->length(); ++i ) {
        int acc = (*state_set)[i]->Accept();

        if ( acc != NO_ACCEPT )
            accept->insert(acc);
    }

    if ( accept->empty() ) {
        delete accept;
        accept = nullptr;
    }

    DFA_State* ds = new DFA_State(state_count++, ec, state_set, accept);
    d = dfa_state_cache->Insert(ds, std::move(digest));

    return true;
}

int DFA_Machine::Rep(int sym) {
    for ( int i = 0; i < NUM_SYM; ++i )
        if ( ec->SymEquivClass(i) == sym )
            return i;

    return -1;
}

} // namespace zeek::detail