// zeek/src/Dict.cc

// See the file "COPYING" in the main distribution directory for copyright.

#include "config.h"

#ifdef HAVE_MEMORY_H
#include <memory.h>
#endif

#include "Dict.h"
#include "Reporter.h"

// Default density (mean number of entries per bucket) handed to
// SetDensityThresh() by the constructor. Insert() starts growing the
// hash table once num_entries reaches thresh_entries, which is
// presumably derived from this density by SetDensityThresh() (defined
// outside this file -- confirm there).
#define DEFAULT_DENSITY_THRESH 3.0

// Largest requested table size for which NextPrime() still searches for
// a prime bucket count. Above this threshold it only makes the size odd,
// because testing for primality is considered too expensive.
#define PRIME_THRESH 1000

class DictEntry {
public:
	DictEntry(void* k, int l, hash_t h, void* val)
		{ key = k; len = l; hash = h; value = val; }

	~DictEntry()
		{
		delete [] (char*) key;
		}

	void* key;
	int len;
	hash_t hash;
	void* value;
};

// An iteration cookie remembers where NextEntry() has to resume: the
// bucket index and the offset inside that bucket's chain of the next
// candidate entry.
class IterCookie {
public:
	IterCookie(int b, int o)
		: bucket(b), offset(o), ttbl(0), num_buckets_p(0)
		{
		}

	int bucket, offset;

	// Table currently being walked and a pointer to its bucket count.
	// Both stay 0 until the first NextEntry() call binds them.
	PList(DictEntry)** ttbl;
	const int* num_buckets_p;

	PList(DictEntry) inserted;	// inserted while iterating
};

// Builds an empty dictionary whose primary table has roughly
// initial_size buckets (see NextPrime()). With ordering == ORDERED an
// additional list records entries in insertion order for NthEntry().
Dictionary::Dictionary(dict_order ordering, int initial_size)
	{
	// The primary table must exist before the density threshold is set.
	Init(initial_size);
	tbl2 = 0;

	order = (ordering == ORDERED) ? new PList(DictEntry) : 0;

	SetDensityThresh(DEFAULT_DENSITY_THRESH);

	delete_func = 0;
	tbl_next_ind = 0;

	// No resize is in progress, so all secondary-table state is zeroed.
	num_buckets2 = 0;
	num_entries2 = 0;
	max_num_entries2 = 0;
	thresh_entries2 = 0;
	den_thresh2 = 0;
	}

// Releases both hash tables and every entry via DeInit() (which applies
// delete_func to each value when one is set), then frees the
// insertion-order list.
Dictionary::~Dictionary()
	{
	DeInit();
	delete order;	// order is 0 for unordered dictionaries; delete of 0 is a no-op
	}

void Dictionary::Clear()
	{
	DeInit();
	Init(2);
	tbl2 = 0;
	}

void Dictionary::DeInit()
	{
	for ( int i = 0; i < num_buckets; ++i )
		if ( tbl[i] )
			{
			PList(DictEntry)* chain = tbl[i];
			loop_over_list(*chain, j)
				{
				DictEntry* e = (*chain)[j];
				if ( delete_func )
					delete_func(e->value);
				delete e;
				}

			delete chain;
			}

	delete [] tbl;

	if ( tbl2 == 0 )
		return;

	for ( int i = 0; i < num_buckets2; ++i )
		if ( tbl2[i] )
			{
			PList(DictEntry)* chain = tbl2[i];
			loop_over_list(*chain, j)
				{
				DictEntry* e = (*chain)[j];
				if ( delete_func )
					delete_func(e->value);
				delete e;
				}

			delete chain;
			}

	delete [] tbl2;
	tbl2 = 0;
	}

void* Dictionary::Lookup(const void* key, int key_size, hash_t hash) const
	{
	hash_t h;
	PList(DictEntry)* chain;

	// Figure out which hash table to look in.
	h = hash % num_buckets;
	if ( ! tbl2 || h >= tbl_next_ind )
		chain = tbl[h];
	else
		chain = tbl2[hash % num_buckets2];

	if ( chain )
		{
		for ( int i = 0; i < chain->length(); ++i )
			{
			DictEntry* entry = (*chain)[i];

			if ( entry->hash == hash && entry->len == key_size &&
			     ! memcmp(key, entry->key, key_size) )
				return entry->value;
			}
		}

	return 0;
	}

void* Dictionary::Insert(void* key, int key_size, hash_t hash, void* val,
				int copy_key)
	{
	DictEntry* new_entry = new DictEntry(key, key_size, hash, val);
	void* old_val = Insert(new_entry, copy_key);

	if ( old_val )
		{
		// We didn't need the new DictEntry, the key was already
		// present.
		delete new_entry;
		}
	else if ( order )
		order->append(new_entry);

	// Resize logic.
	if ( tbl2 )
		MoveChains();
	else if ( num_entries >= thresh_entries )
		StartChangeSize(num_buckets * 2 + 1);

	return old_val;
	}

void* Dictionary::Remove(const void* key, int key_size, hash_t hash,
				bool dont_delete)
	{
	hash_t h;
	PList(DictEntry)* chain;
	int* num_entries_ptr;

	// Figure out which hash table to look in
	h = hash % num_buckets;
	if ( ! tbl2 || h >= tbl_next_ind )
		{
		chain = tbl[h];
		num_entries_ptr = &num_entries;
		}
	else
		{
		chain = tbl2[hash % num_buckets2];
		num_entries_ptr = &num_entries2;
		}

	if ( ! chain )
		return 0;

	for ( int i = 0; i < chain->length(); ++i )
		{
		DictEntry* entry = (*chain)[i];

		if ( entry->hash == hash && entry->len == key_size &&
		     ! memcmp(key, entry->key, key_size) )
			{
			void* entry_value = DoRemove(entry, h, chain, i);

			if ( dont_delete )
				entry->key = 0;

			delete entry;
			--*num_entries_ptr;
			return entry_value;
			}
		}

	return 0;
	}

// Unlinks entry (located at chain_offset within chain, bucket index h)
// from its chain and from the insertion-order list, then fixes up all
// registered iteration cookies. Returns the entry's value; the entry
// itself is not deleted here.
void* Dictionary::DoRemove(DictEntry* entry, hash_t h,
				PList(DictEntry)* chain, int chain_offset)
	{
	void* const removed_value = entry->value;

	chain->remove_nth(chain_offset);
	if ( order )
		order->remove(entry);

	for ( int k = 0; k < cookies.length(); ++k )
		{
		IterCookie* cookie = cookies[k];
		const unsigned int cookie_bucket = (unsigned int) cookie->bucket;

		// A cookie sitting in the affected bucket past the removed slot
		// must step back by one to stay on the same next entry. If the
		// removed entry was the cookie's current one and also the last
		// in its bucket, nothing needs adjusting: NextEntry() moves on
		// to the next non-empty bucket by itself.
		if ( cookie_bucket == h && cookie->offset > chain_offset )
			--cookie->offset;

		// The cookie is already beyond this bucket, so the entry may be
		// queued on it as "inserted during iteration"; drop it there.
		if ( cookie_bucket > h )
			cookie->inserted.remove(entry);
		}

	return removed_value;
	}

void* Dictionary::NthEntry(int n, const void*& key, int& key_len) const
	{
	if ( ! order || n < 0 || n >= Length() )
		return 0;

	DictEntry* entry = (*order)[n];
	key = entry->key;
	key_len = entry->len;
	return entry->value;
	}

// Allocates a new iteration cookie positioned at bucket 0, offset 0. The
// cookie is not added to the `cookies` list here. It is released either
// by NextEntry() when the iteration runs to completion or by
// StopIteration().
IterCookie* Dictionary::InitForIteration() const
	{
	return new IterCookie(0, 0);
	}

// Abandons an iteration before it has run to completion and frees its
// cookie. A null cookie is accepted and ignored.
void Dictionary::StopIteration(IterCookie* cookie) const
	{
	if ( ! cookie )
		return;

	// Mirror what NextEntry() does at the end of an iteration: the
	// cookie must leave the `cookies` list before it is freed. Otherwise
	// a registered cookie would stay behind as a dangling pointer that
	// Insert()/DoRemove() dereference, and its presence would also keep
	// StartChangeSize()/MoveChains() from ever resizing again. Removing
	// a cookie that was never registered is harmless.
	const_cast<PList(IterCookie)*>(&cookies)->remove(cookie);

	delete cookie;
	}

// Returns the value of the next entry of the iteration described by
// cookie, or 0 when the iteration is finished. On completion the cookie
// is removed from `cookies`, deleted, and the caller's pointer is set to
// 0. When return_hash is non-zero, h receives a newly allocated HashKey
// describing the returned entry's key; otherwise h is left untouched.
void* Dictionary::NextEntry(HashKey*& h, IterCookie*& cookie, int return_hash) const
	{
	DictEntry* found;

	// Entries inserted behind the iterator's position are queued on the
	// cookie. Hand those out first: it keeps the queue short, which in
	// turn keeps removals from having to search a long list.
	const int pending = cookie->inserted.length();
	if ( pending > 0 )
		{
		// Order is irrelevant and taking from the tail is cheapest.
		found = cookie->inserted.remove_nth(pending - 1);
		if ( return_hash )
			h = new HashKey(found->key, found->len, found->hash);

		return found->value;
		}

	// First call for this cookie: start on the primary table.
	// XXX maybe cookie->bucket could be seeded from tbl_next_ind here?
	if ( ! cookie->ttbl )
		{
		cookie->ttbl = tbl;
		cookie->num_buckets_p = &num_buckets;
		}

	PList(DictEntry)** const table = cookie->ttbl;
	const int* const bucket_count = cookie->num_buckets_p;
	int bucket = cookie->bucket;
	const int offset = cookie->offset;

	// Is there still something left in the current bucket?
	PList(DictEntry)* current = table[bucket];
	if ( current && current->length() > offset )
		{
		found = (*current)[offset];
		++cookie->offset;
		if ( return_hash )
			h = new HashKey(found->key, found->len, found->hash);

		return found->value;
		}

	// Current bucket exhausted: advance to the next non-empty one.
	for ( ++bucket; bucket < *bucket_count; ++bucket )
		if ( table[bucket] && table[bucket]->length() != 0 )
			break;

	if ( bucket >= *bucket_count )
		{
		// End of this table. During a resize the secondary table
		// holds the remaining entries, so continue there.
		if ( table == tbl && tbl2 )
			{
			cookie->ttbl = tbl2;
			cookie->num_buckets_p = &num_buckets2;
			cookie->bucket = 0;
			cookie->offset = 0;
			return Dictionary::NextEntry(h, cookie, return_hash);
			}

		// Iteration complete.
		// FIXME: casting away const is unfortunate; is there a
		// better way?
		const_cast<PList(IterCookie)*>(&cookies)->remove(cookie);
		delete cookie;
		cookie = 0;
		return 0;
		}

	// Return the head of the new bucket and resume right after it.
	found = (*table[bucket])[0];
	if ( return_hash )
		h = new HashKey(found->key, found->len, found->hash);

	cookie->bucket = bucket;
	cookie->offset = 1;

	return found->value;
	}

// Allocates an empty primary table with NextPrime(size) buckets and
// resets the primary entry counters.
void Dictionary::Init(int size)
	{
	num_buckets = NextPrime(size);

	// The trailing () value-initializes the array, i.e. every bucket
	// pointer starts out as 0 (no chain yet).
	tbl = new PList(DictEntry)*[num_buckets]();

	num_entries = 0;
	max_num_entries = 0;
	}

// Allocates an empty secondary table (the resize target) with
// NextPrime(size) buckets and resets the secondary entry counters.
void Dictionary::Init2(int size)
	{
	num_buckets2 = NextPrime(size);

	// The trailing () value-initializes the array, i.e. every bucket
	// pointer starts out as 0 (no chain yet).
	tbl2 = new PList(DictEntry)*[num_buckets2]();

	num_entries2 = 0;
	max_num_entries2 = 0;
	}

// private
//
// Links new_entry into whichever table currently owns its bucket.
//
// If an entry with the same key already exists, only that entry's value
// is replaced and the old value is returned; new_entry is NOT linked in
// and remains the caller's to dispose of. Otherwise new_entry is
// appended to its chain, the entry counters are updated, cookies that
// have already passed the bucket are told about it, and 0 is returned.
//
// With a non-zero copy_key the entry gets a private copy of the key
// bytes and the buffer originally passed in is freed.
void* Dictionary::Insert(DictEntry* new_entry, int copy_key)
	{
	PList(DictEntry)** ttbl;
	int* num_entries_ptr;
	int* max_num_entries_ptr;
	hash_t h = new_entry->hash % num_buckets;

	// We must be careful when we are in the middle of resizing.
	// If the new entry hashes to a bucket in the old table we
	// haven't moved yet, we need to put it in the old table. If
	// we didn't do it this way, we would sometimes have to
	// search both tables which is probably more expensive.

	if ( ! tbl2 || h >= tbl_next_ind )
		{
		ttbl = tbl;
		num_entries_ptr = &num_entries;
		max_num_entries_ptr = &max_num_entries;
		}
	else
		{
		// Bucket already migrated: from here on h is an index into
		// the secondary table.
		ttbl = tbl2;
		h = new_entry->hash % num_buckets2;
		num_entries_ptr = &num_entries2;
		max_num_entries_ptr = &max_num_entries2;
		}

	PList(DictEntry)* chain = ttbl[h];

	int n = new_entry->len;

	if ( chain )
		{
		for ( int i = 0; i < chain->length(); ++i )
			{
			DictEntry* entry = (*chain)[i];

			if ( entry->hash == new_entry->hash &&
			     entry->len == n &&
			     ! memcmp(entry->key, new_entry->key, n) )
				{
				// Key already present: swap in the new value
				// and hand back the old one.
				void* old_value = entry->value;
				entry->value = new_entry->value;
				return old_value;
				}
			}
		}
	else
		// Create new chain.
		chain = ttbl[h] = new PList(DictEntry);

	// If we got this far, then we couldn't use an existing copy
	// of the key, so make a new one if necessary.
	if ( copy_key )
		{
		void* old_key = new_entry->key;
		new_entry->key = (void*) new char[n];
		memcpy(new_entry->key, old_key, n);

		// Key buffers are char arrays: ~DictEntry() releases them
		// with delete[], including this very buffer on the
		// "key already present" path. Use the matching array form
		// here too rather than scalar delete.
		delete [] (char*) old_key;
		}

	// We happen to know (:-() that appending is more efficient
	// on lists than prepending.
	chain->append(new_entry);

	// Bump the entry count and track its high-water mark.
	if ( *max_num_entries_ptr < ++*num_entries_ptr )
		*max_num_entries_ptr = *num_entries_ptr;

	// For ongoing iterations: If we already passed the bucket where this
	// entry was put, add it to the cookie's list of inserted entries.
	loop_over_list(cookies, i)
		{
		IterCookie* c = cookies[i];
		if ( h < (unsigned int) c->bucket )
			c->inserted.append(new_entry);
		}

	return 0;
	}

// Chooses a bucket count of at least n. The result is always odd; for
// sizes up to PRIME_THRESH it is additionally the first odd number at or
// above n that IsPrime() accepts.
int Dictionary::NextPrime(int n) const
	{
	// Round even sizes up to the next odd number.
	if ( n % 2 == 0 )
		++n;

	// Beyond the threshold a primality search is too expensive, so an
	// odd size has to do. Below it, step through the odd numbers.
	if ( n <= PRIME_THRESH )
		while ( ! IsPrime(n) )
			n += 2;

	return n;
	}

// Returns 1 if n is a prime number and 0 otherwise.
//
// Unlike a bare odd-divisor scan this also gives the right answer for
// values below 2 (never prime, including negatives) and for even numbers
// (only 2 is prime). As a consequence NextPrime() no longer returns
// non-positive or unit sizes: requests of 1 or less now yield 3.
int Dictionary::IsPrime(int n) const
	{
	if ( n < 2 )
		return 0;

	if ( n % 2 == 0 )
		return n == 2;

	// Only odd divisors up to sqrt(n) need checking. Comparing j with
	// n / j instead of j * j with n avoids signed overflow for large n.
	for ( int j = 3; j <= n / j; j += 2 )
		if ( n % j == 0 )
			return 0;

	return 1;
	}

// Begins an incremental resize towards roughly new_size buckets by
// setting up the secondary table. Does nothing while any iteration
// cookie is registered. Finding a secondary table already in place is
// reported as an internal error.
void Dictionary::StartChangeSize(int new_size)
	{
	// Resizing would move entries around under a running iteration.
	const bool iterating = cookies.length() > 0;
	if ( iterating )
		return;

	if ( tbl2 )
		reporter->InternalError("Dictionary::StartChangeSize() tbl2 not NULL");

	Init2(new_size);

	// Migration starts with the first bucket of the old table.
	tbl_next_ind = 0;

	// The new table inherits the current density threshold.
	SetDensityThresh2(DensityThresh());
	}

void Dictionary::MoveChains()
	{
	// Do not change current distribution if there an ongoing iteration.
	if ( cookies.length() > 0 )
		return;

	// Attempt to move this many entries (must do at least 2)
	int num = 8;

	do
		{
		PList(DictEntry)* chain = tbl[tbl_next_ind++];

		if ( ! chain )
			continue;

		tbl[tbl_next_ind - 1] = 0;

		for ( int j = 0; j < chain->length(); ++j )
			{
			Insert((*chain)[j], 0);
			--num_entries;
			--num;
			}

		delete chain;
		}
	while ( num > 0 && int(tbl_next_ind) < num_buckets );

	if ( int(tbl_next_ind) >= num_buckets )
		FinishChangeSize();
	}

// Completes a resize: discards the (by now empty) primary table, makes
// the secondary table the primary one together with all of its counters
// and thresholds, and zeroes the secondary state.
void Dictionary::FinishChangeSize()
	{
	// Cheap sanity check: every entry should have been migrated.
	if ( num_entries != 0 )
		reporter->InternalError(
		    "Dictionary::FinishChangeSize: num_entries is %d\n",
		    num_entries);

	int b = 0;
	while ( b < num_buckets )
		delete tbl[b++];

	delete [] tbl;

	// Promote the secondary table and its bookkeeping.
	tbl = tbl2;
	num_buckets = num_buckets2;
	num_entries = num_entries2;
	max_num_entries = max_num_entries2;
	den_thresh = den_thresh2;
	thresh_entries = thresh_entries2;

	// No resize in progress any more.
	tbl2 = 0;
	num_buckets2 = 0;
	num_entries2 = 0;
	max_num_entries2 = 0;
	den_thresh2 = 0;
	thresh_entries2 = 0;
	}

// Estimates the memory used by the dictionary: the object itself, both
// bucket arrays, every chain, every entry with its key bytes, and the
// insertion-order list. Values stored in the dictionary are not counted.
unsigned int Dictionary::MemoryAllocation() const
	{
	int total = padded_sizeof(*this);

	// Primary table: entries and chains, then the bucket array.
	for ( int b = 0; b < num_buckets; ++b )
		{
		PList(DictEntry)* chain = tbl[b];
		if ( ! chain )
			continue;

		for ( int k = 0; k < chain->length(); ++k )
			total += padded_sizeof(DictEntry) + pad_size((*chain)[k]->len);

		total += chain->MemoryAllocation();
		}

	total += pad_size(num_buckets * sizeof(PList(DictEntry)*));

	if ( order )
		total += order->MemoryAllocation();

	// The secondary table only exists while a resize is in progress.
	if ( ! tbl2 )
		return total;

	for ( int b = 0; b < num_buckets2; ++b )
		{
		PList(DictEntry)* chain = tbl2[b];
		if ( ! chain )
			continue;

		for ( int k = 0; k < chain->length(); ++k )
			total += padded_sizeof(DictEntry) + pad_size((*chain)[k]->len);

		total += chain->MemoryAllocation();
		}

	total += pad_size(num_buckets2 * sizeof(PList(DictEntry)*));

	return total;
	}

// Releases v with free(), so it is only suitable for memory obtained
// from malloc() and friends. The signature matches how delete_func is
// invoked on dictionary values; presumably it is meant to be installed
// as such a value deleter.
void generic_delete_func(void* v)
	{
	::free(v);
	}