zeek/src/Dict.cc
2013-09-27 10:13:52 -05:00

603 lines
12 KiB
C++

// See the file "COPYING" in the main distribution directory for copyright.
#include "config.h"
#ifdef HAVE_MEMORY_H
#include <memory.h>
#endif
#include "Dict.h"
#include "Reporter.h"
// Default density threshold, passed to SetDensityThresh() by the
// Dictionary constructor.  If the mean bucket length exceeds the
// following then Insert() will increase the size of the hash table.
#define DEFAULT_DENSITY_THRESH 3.0
// Threshold above which we do not try to ensure that the hash size
// is prime: NextPrime() returns any odd candidate larger than this
// unchanged, because testing it for primality is considered too
// expensive.
#define PRIME_THRESH 1000
class DictEntry {
public:
DictEntry(void* k, int l, hash_t h, void* val)
{ key = k; len = l; hash = h; value = val; }
~DictEntry()
{
delete [] (char*) key;
}
void* key;
int len;
hash_t hash;
void* value;
};
// Iteration state handed out to callers.  A cookie records the bucket and
// the offset within that bucket at which to start looking for the next
// value to return, together with the table currently being walked.
class IterCookie {
public:
	IterCookie(int b, int o)
		: bucket(b), offset(o), ttbl(0), num_buckets_p(0)
		{
		}

	// Position of the next candidate entry.
	int bucket, offset;

	// Table being iterated and a pointer to its bucket count; both stay
	// 0 until Dictionary::NextEntry() fills them in.
	PList(DictEntry)** ttbl;
	const int* num_buckets_p;

	// Entries inserted while iterating, into buckets the iteration had
	// already passed.
	PList(DictEntry) inserted;
};
// Builds an empty dictionary.
//
// ordering: if ORDERED, an insertion-order list is maintained so that
//     NthEntry() can be used; otherwise no such list exists.
// initial_size: requested number of buckets for the primary table (Init()
//     rounds it via NextPrime()).
Dictionary::Dictionary(dict_order ordering, int initial_size)
	{
	// Primary table.
	Init(initial_size);

	// No resize is in progress, so the secondary table and all of its
	// bookkeeping start out zeroed.
	tbl2 = 0;
	tbl_next_ind = 0;
	num_buckets2 = 0;
	num_entries2 = 0;
	max_num_entries2 = 0;
	thresh_entries2 = 0;
	den_thresh2 = 0;

	order = ( ordering == ORDERED ) ? new PList(DictEntry) : 0;
	delete_func = 0;

	SetDensityThresh(DEFAULT_DENSITY_THRESH);
	}
// Destroys the dictionary: DeInit() releases both hash tables together with
// their chains and entries, then the insertion-order list (if any) is freed.
Dictionary::~Dictionary()
{
DeInit();
// Only the list object itself is deleted here; for an unordered dictionary
// "order" is 0 and the delete is a no-op.
delete order;
}
void Dictionary::Clear()
{
DeInit();
Init(2);
tbl2 = 0;
}
void Dictionary::DeInit()
{
for ( int i = 0; i < num_buckets; ++i )
if ( tbl[i] )
{
PList(DictEntry)* chain = tbl[i];
loop_over_list(*chain, j)
{
DictEntry* e = (*chain)[j];
if ( delete_func )
delete_func(e->value);
delete e;
}
delete chain;
}
delete [] tbl;
if ( tbl2 == 0 )
return;
for ( int i = 0; i < num_buckets2; ++i )
if ( tbl2[i] )
{
PList(DictEntry)* chain = tbl2[i];
loop_over_list(*chain, j)
{
DictEntry* e = (*chain)[j];
if ( delete_func )
delete_func(e->value);
delete e;
}
delete chain;
}
delete [] tbl2;
tbl2 = 0;
}
void* Dictionary::Lookup(const void* key, int key_size, hash_t hash) const
{
hash_t h;
PList(DictEntry)* chain;
// Figure out which hash table to look in.
h = hash % num_buckets;
if ( ! tbl2 || h >= tbl_next_ind )
chain = tbl[h];
else
chain = tbl2[hash % num_buckets2];
if ( chain )
{
for ( int i = 0; i < chain->length(); ++i )
{
DictEntry* entry = (*chain)[i];
if ( entry->hash == hash && entry->len == key_size &&
! memcmp(key, entry->key, key_size) )
return entry->value;
}
}
return 0;
}
void* Dictionary::Insert(void* key, int key_size, hash_t hash, void* val,
int copy_key)
{
DictEntry* new_entry = new DictEntry(key, key_size, hash, val);
void* old_val = Insert(new_entry, copy_key);
if ( old_val )
{
// We didn't need the new DictEntry, the key was already
// present.
delete new_entry;
}
else if ( order )
order->append(new_entry);
// Resize logic.
if ( tbl2 )
MoveChains();
else if ( num_entries >= thresh_entries )
StartChangeSize(num_buckets * 2 + 1);
return old_val;
}
void* Dictionary::Remove(const void* key, int key_size, hash_t hash,
bool dont_delete)
{
hash_t h;
PList(DictEntry)* chain;
int* num_entries_ptr;
// Figure out which hash table to look in
h = hash % num_buckets;
if ( ! tbl2 || h >= tbl_next_ind )
{
chain = tbl[h];
num_entries_ptr = &num_entries;
}
else
{
chain = tbl2[hash % num_buckets2];
num_entries_ptr = &num_entries2;
}
if ( ! chain )
return 0;
for ( int i = 0; i < chain->length(); ++i )
{
DictEntry* entry = (*chain)[i];
if ( entry->hash == hash && entry->len == key_size &&
! memcmp(key, entry->key, key_size) )
{
void* entry_value = DoRemove(entry, h, chain, i);
if ( dont_delete )
entry->key = 0;
delete entry;
--*num_entries_ptr;
return entry_value;
}
}
return 0;
}
// Unlinks an entry from its chain (and from the insertion-order list, if
// any) and fixes up all registered iteration cookies.  The entry itself is
// not deleted here.
//
// entry: the entry being removed.
// h: bucket index used to compare against the cookies' positions.
// chain, chain_offset: the chain holding the entry and its index in it.
//
// Returns the entry's value.
void* Dictionary::DoRemove(DictEntry* entry, hash_t h,
				PList(DictEntry)* chain, int chain_offset)
	{
	void* const detached_value = entry->value;

	chain->remove_nth(chain_offset);

	if ( order )
		order->remove(entry);

	// Adjust existing cookies.
	loop_over_list(cookies, idx)
		{
		IterCookie* cookie = cookies[idx];
		const unsigned int cookie_bucket = (unsigned int) cookie->bucket;

		// If the cookie currently sits in the affected bucket and
		// points past the removed slot, shift it back by one.
		//
		// The only other important case occurs when we are deleting
		// the current entry which simultaneously happens to be the
		// last one in this bucket.  That would require moving on to
		// the next non-empty bucket, but NextEntry() already does
		// exactly the right thing in this case.
		if ( cookie_bucket == h && cookie->offset > chain_offset )
			--cookie->offset;

		// This item may have been inserted during this iteration.
		if ( cookie_bucket > h )
			cookie->inserted.remove(entry);
		}

	return detached_value;
	}
void* Dictionary::NthEntry(int n, const void*& key, int& key_len) const
{
if ( ! order || n < 0 || n >= Length() )
return 0;
DictEntry* entry = (*order)[n];
key = entry->key;
key_len = entry->len;
return entry->value;
}
// Creates a new iteration cookie positioned at bucket 0, offset 0.  The
// cookie is not added to the "cookies" list here.  It is consumed by
// NextEntry(), which deletes it once the iteration is exhausted, or it can
// be released early via StopIteration().
IterCookie* Dictionary::InitForIteration() const
{
return new IterCookie(0, 0);
}
// Ends an iteration before it has run to completion and releases its cookie.
//
// cookie: the iteration state to discard; must not be used afterwards.
void Dictionary::StopIteration(IterCookie* cookie) const
	{
	// Mirror what NextEntry() does when an iteration completes: make sure
	// the cookie is no longer in "cookies" before freeing it.  Otherwise
	// Insert() and DoRemove() would later dereference a dangling pointer,
	// and StartChangeSize()/MoveChains() would refuse to resize forever.
	// As in NextEntry(), the const has to be cast away to update the list.
	const_cast<PList(IterCookie)*>(&cookies)->remove(cookie);

	delete cookie;
	}
// Advances the iteration identified by "cookie" and returns the next value,
// or 0 once every entry has been visited.
//
// h: if return_hash is non-zero, set to a newly allocated HashKey built from
//    the returned entry's key, length and hash (presumably to be released by
//    the caller -- confirm); otherwise left untouched.
// cookie: the iteration state.  When the iteration is exhausted the cookie
//    is removed from "cookies", deleted, and the caller's pointer set to 0.
// return_hash: whether to produce a HashKey in "h".
void* Dictionary::NextEntry(HashKey*& h, IterCookie*& cookie, int return_hash) const
{
// If there are any inserted entries, return them first.
// That keeps the list small and helps avoiding searching
// a large list when deleting an entry.
DictEntry* entry;
if ( cookie->inserted.length() )
{
// Return the last one. Order doesn't matter,
// and removing from the tail is cheaper.
entry = cookie->inserted.remove_nth(cookie->inserted.length()-1);
if ( return_hash )
h = new HashKey(entry->key, entry->len, entry->hash);
return entry->value;
}
int b = cookie->bucket;
int o = cookie->offset;
PList(DictEntry)** ttbl;
const int* num_buckets_p;
// First call for this cookie: start walking the primary table.
if ( ! cookie->ttbl )
{
// XXX maybe we could update cookie->b from tbl_next_ind here?
cookie->ttbl = tbl;
cookie->num_buckets_p = &num_buckets;
}
ttbl = cookie->ttbl;
num_buckets_p = cookie->num_buckets_p;
// The current bucket still has an unvisited entry at the stored offset.
if ( ttbl[b] && ttbl[b]->length() > o )
{
entry = (*ttbl[b])[o];
++cookie->offset;
if ( return_hash )
h = new HashKey(entry->key, entry->len, entry->hash);
return entry->value;
}
++b; // Move on to next non-empty bucket.
while ( b < *num_buckets_p && (! ttbl[b] || ttbl[b]->length() == 0) )
++b;
// Ran off the end of the table currently being walked.
if ( b >= *num_buckets_p )
{
// If we're resizing, we need to search the 2nd table too.
if ( ttbl == tbl && tbl2 )
{
// Restart at the beginning of the second table and let the
// recursive call pick the first entry there.
cookie->ttbl = tbl2;
cookie->num_buckets_p = &num_buckets2;
cookie->bucket = 0;
cookie->offset = 0;
return Dictionary::NextEntry(h, cookie, return_hash);
}
// All done.
// FIXME: I don't like removing the const here. But is there
// a better way?
const_cast<PList(IterCookie)*>(&cookies)->remove(cookie);
delete cookie;
cookie = 0;
return 0;
}
// Found a non-empty bucket: return its first entry and remember that the
// next call should continue at offset 1 of that bucket.
entry = (*ttbl[b])[0];
if ( return_hash )
h = new HashKey(entry->key, entry->len, entry->hash);
cookie->bucket = b;
cookie->offset = 1;
return entry->value;
}
// Allocates the primary table with NextPrime(size) buckets, all of them
// empty, and zeroes the table's entry counters.
void Dictionary::Init(int size)
	{
	num_buckets = NextPrime(size);
	tbl = new PList(DictEntry)*[num_buckets];

	// Chains are created lazily on first insertion into a bucket.
	for ( int b = num_buckets; b-- > 0; )
		tbl[b] = 0;

	num_entries = 0;
	max_num_entries = 0;
	}
// Allocates the secondary table (the target of a resize) with
// NextPrime(size) buckets, all of them empty, and zeroes its entry counters.
void Dictionary::Init2(int size)
	{
	num_buckets2 = NextPrime(size);
	tbl2 = new PList(DictEntry)*[num_buckets2];

	// Chains are created lazily on first insertion into a bucket.
	for ( int b = num_buckets2; b-- > 0; )
		tbl2[b] = 0;

	num_entries2 = 0;
	max_num_entries2 = 0;
	}
// private
//
// Places an already constructed entry into the appropriate hash table.
//
// new_entry: the entry to store.
// copy_key: if non-zero, the entry's key is replaced by a freshly allocated
//     copy and the buffer it pointed to before is freed.
//
// Returns the previous value if an entry with the same key already exists;
// in that case only the existing entry's value is replaced and new_entry is
// NOT stored (the caller remains responsible for it).  Returns 0 after
// new_entry has been added to a chain.
void* Dictionary::Insert(DictEntry* new_entry, int copy_key)
	{
	PList(DictEntry)** ttbl;
	int* num_entries_ptr;
	int* max_num_entries_ptr;
	hash_t h = new_entry->hash % num_buckets;

	// We must be careful when we are in the middle of resizing.
	// If the new entry hashes to a bucket in the old table we
	// haven't moved yet, we need to put it in the old table. If
	// we didn't do it this way, we would sometimes have to
	// search both tables which is probably more expensive.
	if ( ! tbl2 || h >= tbl_next_ind )
		{
		ttbl = tbl;
		num_entries_ptr = &num_entries;
		max_num_entries_ptr = &max_num_entries;
		}
	else
		{
		ttbl = tbl2;
		h = new_entry->hash % num_buckets2;
		num_entries_ptr = &num_entries2;
		max_num_entries_ptr = &max_num_entries2;
		}

	PList(DictEntry)* chain = ttbl[h];
	int n = new_entry->len;

	if ( chain )
		{
		for ( int i = 0; i < chain->length(); ++i )
			{
			DictEntry* entry = (*chain)[i];

			if ( entry->hash == new_entry->hash &&
			     entry->len == n &&
			     ! memcmp(entry->key, new_entry->key, n) )
				{
				// Key already present: just swap in the new
				// value and hand back the old one.
				void* old_value = entry->value;
				entry->value = new_entry->value;
				return old_value;
				}
			}
		}
	else
		// Create new chain.
		chain = ttbl[h] = new PList(DictEntry);

	// If we got this far, then we couldn't use an existing copy
	// of the key, so make a new one if necessary.
	if ( copy_key )
		{
		void* old_key = new_entry->key;
		new_entry->key = (void*) new char[n];
		memcpy(new_entry->key, old_key, n);

		// Keys are character arrays and are released with delete[]
		// everywhere else (see ~DictEntry()); the scalar form of
		// delete would be a mismatch for an array allocation.
		delete [] (char*) old_key;
		}

	// We happen to know (:-() that appending is more efficient
	// on lists than prepending.
	chain->append(new_entry);

	// Count the new entry and keep track of the high-water mark.
	if ( *max_num_entries_ptr < ++*num_entries_ptr )
		*max_num_entries_ptr = *num_entries_ptr;

	// For ongoing iterations: If we already passed the bucket where this
	// entry was put, add it to the cookie's list of inserted entries.
	loop_over_list(cookies, i)
		{
		IterCookie* c = cookies[i];
		if ( h < (unsigned int) c->bucket )
			c->inserted.append(new_entry);
		}

	return 0;
	}
// Returns a table size derived from n: n is first made odd, and candidates
// up to PRIME_THRESH are then advanced in steps of 2 until IsPrime()
// accepts them.  Larger candidates are returned as they are.
int Dictionary::NextPrime(int n) const
	{
	// Only odd sizes are considered.
	if ( n % 2 == 0 )
		++n;

	// Beyond the threshold it is too expensive to test for primality,
	// so we just stick with the odd value.
	if ( n <= PRIME_THRESH )
		while ( ! IsPrime(n) )
			n += 2;

	return n;
	}
// Returns 1 if no integer from 3 up to the square root of n divides n, and
// 0 otherwise.  Divisibility by 2 is never tested, so the answer is only
// meaningful for odd n, which is all that NextPrime() passes in.
int Dictionary::IsPrime(int n) const
	{
	int divisor = 3;

	while ( divisor * divisor <= n )
		{
		if ( n % divisor == 0 )
			return 0;

		++divisor;
		}

	return 1;
	}
// Begins an incremental resize by creating the second table with (roughly)
// new_size buckets.  Nothing happens while any iteration cookie is
// registered.  Entries are migrated later, step by step, by MoveChains().
void Dictionary::StartChangeSize(int new_size)
	{
	// Only start resizing if there isn't any iteration in progress.
	const bool iterating = cookies.length() > 0;
	if ( iterating )
		return;

	// A resize must not already be under way.
	if ( tbl2 != 0 )
		reporter->InternalError("Dictionary::StartChangeSize() tbl2 not NULL");

	// Migration starts at the first bucket of the old table.
	tbl_next_ind = 0;
	Init2(new_size);

	// Preserve threshold density.
	SetDensityThresh2(DensityThresh());
	}
// Performs one incremental step of a resize in progress: migrates whole
// chains from the old table (tbl) into the new one (tbl2), starting at
// bucket tbl_next_ind, and finalizes the resize once every old bucket has
// been visited.
void Dictionary::MoveChains()
{
// Do not change the current distribution if there is an ongoing iteration.
if ( cookies.length() > 0 )
return;
// Attempt to move this many entries (must do at least 2)
int num = 8;
do
{
// The migration index is advanced before the entries are re-inserted:
// Insert() routes entries whose old bucket index is below tbl_next_ind
// into tbl2.
PList(DictEntry)* chain = tbl[tbl_next_ind++];
// Empty bucket, nothing to move; "continue" jumps to the loop condition.
if ( ! chain )
continue;
tbl[tbl_next_ind - 1] = 0;
for ( int j = 0; j < chain->length(); ++j )
{
// copy_key is 0, so the entry keeps its existing key buffer.
Insert((*chain)[j], 0);
// Insert() counted the entry for the new table; take it out of the old
// table's count.
--num_entries;
--num;
}
// Only the list object is freed; its entries now live in tbl2.
delete chain;
}
while ( num > 0 && int(tbl_next_ind) < num_buckets );
if ( int(tbl_next_ind) >= num_buckets )
FinishChangeSize();
}
// Completes a resize: discards the old table, promotes the second table and
// its bookkeeping to primary, and zeroes the second table's bookkeeping.
void Dictionary::FinishChangeSize()
	{
	// Cheap safety check: every entry must have left the old table.
	if ( num_entries != 0 )
		reporter->InternalError(
			"Dictionary::FinishChangeSize: num_entries is %d\n",
			num_entries);

	// Release whatever is left of the old table.
	for ( int b = 0; b < num_buckets; ++b )
		delete tbl[b];

	delete [] tbl;

	// Promote each piece of second-table state and reset its source.
	tbl = tbl2;
	tbl2 = 0;

	num_buckets = num_buckets2;
	num_buckets2 = 0;

	num_entries = num_entries2;
	num_entries2 = 0;

	max_num_entries = max_num_entries2;
	max_num_entries2 = 0;

	den_thresh = den_thresh2;
	den_thresh2 = 0;

	thresh_entries = thresh_entries2;
	thresh_entries2 = 0;
	}
unsigned int Dictionary::MemoryAllocation() const
{
int size = padded_sizeof(*this);
for ( int i = 0; i < num_buckets; ++i )
if ( tbl[i] )
{
PList(DictEntry)* chain = tbl[i];
loop_over_list(*chain, j)
size += padded_sizeof(DictEntry) + pad_size((*chain)[j]->len);
size += chain->MemoryAllocation();
}
size += pad_size(num_buckets * sizeof(PList(DictEntry)*));
if ( order )
size += order->MemoryAllocation();
if ( tbl2 )
{
for ( int i = 0; i < num_buckets2; ++i )
if ( tbl2[i] )
{
PList(DictEntry)* chain = tbl2[i];
loop_over_list(*chain, j)
size += padded_sizeof(DictEntry) + pad_size((*chain)[j]->len);
size += chain->MemoryAllocation();
}
size += pad_size(num_buckets2 * sizeof(PList(DictEntry)*));
}
return size;
}
// Value-deletion helper that releases "v" with free().  It is therefore
// only suitable for values obtained from malloc()/calloc()/realloc();
// free() accepts a null pointer.
void generic_delete_func(void* v)
{
free(v);
}