From d3eeeb0f4cd47e12d4de80dd2db1feafd5798d51 Mon Sep 17 00:00:00 2001 From: jasonlue Date: Tue, 10 Sep 2019 11:20:24 -0700 Subject: [PATCH] add Clustered Hashing based Open Addressing Dict. To replace the existing dict, #define USE_OPEN_DICT --- src/Dict.cc | 10 +- src/Dict.h | 7 + src/OpenDict.cc | 961 ++++++++++++++++++++++++++++++++++++++++++++++++ src/OpenDict.h | 362 ++++++++++++++++++ 4 files changed, 1339 insertions(+), 1 deletion(-) create mode 100644 src/OpenDict.cc create mode 100644 src/OpenDict.h diff --git a/src/Dict.cc b/src/Dict.cc index ce0ee44e78..1bde022560 100644 --- a/src/Dict.cc +++ b/src/Dict.cc @@ -1,6 +1,10 @@ // See the file "COPYING" in the main distribution directory for copyright. -#include "Dict.h" +#ifdef USE_OPEN_DICT + +#include "OpenDict.cc" + +#else//USE_OPEN_DICT #include "zeek-config.h" @@ -1132,3 +1136,7 @@ void Dictionary::StopIteration(IterCookie* cookie) const } } // namespace zeek + +TEST_SUITE_END(); + +#endif//USE_OPEN_DICT diff --git a/src/Dict.h b/src/Dict.h index 2666bb52e2..65b6d202b3 100644 --- a/src/Dict.h +++ b/src/Dict.h @@ -1,4 +1,9 @@ // See the file "COPYING" in the main distribution directory for copyright. +#ifdef USE_OPEN_DICT + +#include "OpenDict.h" + +#else//USE_OPEN_DICT #pragma once @@ -419,3 +424,5 @@ public: using Dictionary [[deprecated("Remove in v4.1. Use zeek::Dictionary instead.")]] = zeek::Dictionary; template using PDict [[deprecated("Remove in v4.1. Use zeek::PDict instead.")]] = zeek::PDict; + +#endif//USE_OPEN_DICT diff --git a/src/OpenDict.cc b/src/OpenDict.cc new file mode 100644 index 0000000000..8c5593a864 --- /dev/null +++ b/src/OpenDict.cc @@ -0,0 +1,961 @@ +// See the file "COPYING" in the main distribution directory for copyright. + +#include "zeek-config.h" + +#ifdef HAVE_MEMORY_H +#include +#endif +#include +#include +#include +#include +#include "OpenDict.h" +#include "Reporter.h" +#include "util.h" + +#ifdef DEBUG +#define ASSERT_VALID(o) 0->AssertValid() +#else +#define ASSERT_VALID(o) +#endif//DEBUG + +struct IterCookie { + IterCookie(Dictionary* d) : robust(false),d(d),next(-1), inserted(0), visited(0){} + bool robust; + Dictionary* d; + int next; //index for next valid entry. -1 is default not started yet. + + //the following only used in robust cookies. + std::vector* inserted; // inserted while iterating + std::vector* visited; //visited but moved across next due to insertion. + void MakeRobust() + { + robust = true; + ASSERT( next <0 ); //must make it robust before any iteration. + inserted = new std::vector(); + visited = new std::vector(); + ASSERT(d&&d->cookies); + d->cookies->push_back(this); + } + + void AssertValid() const + { + ASSERT(d && -1 <= next && next <= d->Capacity()); + ASSERT((!robust && !inserted && !visited) || (robust && inserted && visited)); + } + + ~IterCookie() + { + ASSERT_VALID(this); + if( robust) + { + d->cookies->erase(std::remove(d->cookies->begin(), d->cookies->end(), this)); + delete inserted; + delete visited; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +//bucket math +int Dictionary::Log2(int num) const + { + int i=0; + while(num>>=1) + i++; + return i; + } + +int Dictionary::Buckets(bool expected) const + { + int buckets = (1 << log2_buckets); + if( expected) + return buckets; + return table ? buckets : 0; + } + +int Dictionary::Capacity(bool expected) const + { + int capacity = (1 << log2_buckets) + (log2_buckets+0); + if( expected) + return capacity; + return table ? 
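	// A worked example of the size math above (editorial sketch, not part of
	// this patch), assuming log2_buckets = 10: Buckets() = 1 << 10 = 1024
	// addressable buckets, and Capacity() = 1024 + 10 = 1034, i.e. the table
	// keeps log2_buckets extra slots past the last bucket so a cluster that
	// starts near the end still has room to spill forward before a SizeUp().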
capacity : 0; + } + +int Dictionary::ThresholdEntries() const + { + //SizeUp the dictionary when dict is 75% full + //However, when the dict is small (<=2^4+4=20), SizeUp only when dict is 100% full (right after the current insertion) + //the dict will always SizeUp when the last entry is occupied after current insertion. + //the idea is the current insertion is always successful. + int capacity = Capacity(); + if( log2_buckets <= DICT_THRESHOLD_BITS) + return capacity; //20 or less elements, 1.0, only size up when necessary. + return capacity - (capacity>>DICT_LOAD_FACTOR_BITS); + } + +hash_t Dictionary::FibHash(hash_t h) const + { + //refer for details: https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + //GoldenRatio phi = (sqrt(5)+1)/2 = 1.6180339887... + //1/phi = phi - 1 + + h &= HASH_MASK; + h *= 11400714819323198485llu; //2^64/phi + return h; + } + +//return position in dict with 2^bit size. +int Dictionary::BucketByHash(hash_t h, int log2_table_size) const //map h to n-bit + { + ASSERT(log2_table_size>=0); + if( !log2_table_size ) + return 0; //<< >> breaks on 64. + int m = 64 - log2_table_size; + #ifdef DICT_NO_FIB_HASH + hash_t hash = h; + #else + hash_t hash = FibHash(h); + #endif//_FIB_HASH + hash <<= m; + hash >>= m; + ASSERT(hash>=0); + return hash; + } + +//given entry at index i, return it's perfect bucket position. +int Dictionary::BucketByPosition(int position) const + { + ASSERT( table && position>=0 && position < Capacity() && !table[position].Empty() ); + return position - table[position].distance; + } + +//////////////////////////////////////////////////////////////////////////////////////////////// +//Cluster Math +//////////////////////////////////////////////////////////////////////////////////////////////// +int Dictionary::HeadOfClusterByBucket(int bucket) const +{ + ASSERT(bucket>=0 && bucket < Buckets()); + int i = bucket; + for(; i < Capacity() && !table[i].Empty() && BucketByPosition(i) < bucket; i++) + if( BucketByPosition(i) == bucket) + return i; //found. + //not found. + return -1; +} + +int Dictionary::TailOfClusterByBucket(int bucket) const +{ + int end = EndOfClusterByBucket(bucket); + if( end - 1 >= 0 && !table[end-1].Empty() && BucketByPosition(end - 1) == bucket ) + return end - 1; + return -1; +} + +int Dictionary::EndOfClusterByBucket(int bucket) const +{ + ASSERT(bucket>=0 && bucket < Buckets()); + int i = bucket; + while( i < Capacity() && !table[i].Empty() && BucketByPosition(i) <= bucket) + i++; + return i; +} + +int Dictionary::HeadOfClusterByPosition( int position) const + {//the first entry in bucket chain. + ASSERT( 0 <= position && position < Capacity() && !table[position].Empty()); + //look backward for the first item with the same bucket as myself. + int bucket = BucketByPosition(position); + int i = position; + while( i >= bucket && BucketByPosition(i) == bucket ) + i--; + return i == bucket ? i : i + 1; + } + +int Dictionary::TailOfClusterByPosition(int position) const + { + ASSERT( 0 <= position && position < Capacity() && !table[position].Empty()); + int bucket = BucketByPosition(position); + int i = position; + while( i < Capacity() && !table[i].Empty() && BucketByPosition(i) == bucket ) + i++; //stop just over the tail. 
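	// Editorial sketch (not part of this patch) of the bucket mapping defined
	// by FibHash()/BucketByHash() above: the 32-bit-masked hash is multiplied
	// by 2^64/phi and the low log2_table_size bits of the product pick the
	// bucket. A standalone equivalent, with a hypothetical name:
	//
	//	uint64_t fib_bucket(uint64_t h, int log2_table_size)
	//		{
	//		if ( log2_table_size == 0 )
	//			return 0;
	//		uint64_t x = (h & 0xFFFFffffu) * 11400714819323198485llu;
	//		int m = 64 - log2_table_size;
	//		return (x << m) >> m; // keep only the low log2_table_size bits
	//		}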
+ return i - 1; + } + +int Dictionary::EndOfClusterByPosition(int position) const +{ + return TailOfClusterByPosition(position)+1; +} + +int Dictionary::OffsetInClusterByPosition(int position) const + { + ASSERT( 0 <= position && position < Capacity() && !table[position].Empty() ); + int head = HeadOfClusterByPosition(position); + return position - head; + } + +//find next valid entry after index i (i not included). i can be empty. +int Dictionary::Next(int position) const + {//can handle i=-1. + ASSERT(table && -1 <= position && position < Capacity()); + do + { + position++; + } while (position < Capacity() && table[position].Empty()); + return position; + } + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +//Debugging +/////////////////////////////////////////////////////////////////////////////////////////////////////////// +#define DUMPIF(f) if(f) Dump(1) +#ifdef DEBUG +void Dictionary::AssertValid() const + { + bool valid = true; + int n = num_entries; + for(int i=Capacity()-1; i>=0; i--) + if( table && !table[i].Empty()) + n--; + ASSERT( (valid = (n==0)) ); + DUMPIF(!valid); + + //entries must clustered together + for(int i=1; i= table[i-1].bucket))); + DUMPIF(!valid); + if(table[i].bucket == table[i-1].bucket ) + { + ASSERT((valid=(table[i].distance == table[i-1].distance+1))); + DUMPIF(!valid); + } + else + { + ASSERT((valid=(table[i].distance <= table[i-1].distance))); + DUMPIF(!valid); + } + } + } + } +#endif//DEBUG + +unsigned int Dictionary::MemoryAllocation() const + { + int size = padded_sizeof(*this); + if( table) + { + size += pad_size(Capacity() * sizeof(DictEntry)); + for ( int i = Capacity()-1; i>=0; i-- ) + if ( !table[i].Empty() && table[i].key_size > 8) + size += pad_size(table[i].key_size); + } + if(order) + size += padded_sizeof(std::vector) + pad_size(sizeof(DictEntry) * order->capacity()); + return size; + } + +void Dictionary::DumpKeys() const + { + if(!table) + return; + char key_file[100]; + //detect string or binary from first key. + int i=0; + while(table[i].Empty() && i max_distance) + max_distance = table[i].distance; + if( num_distances <= 0 || !distances) + continue; + if(table[i].distance >= num_distances-1) + distances[num_distances-1]++; + else + distances[table[i].distance]++; + } + } + +void Dictionary::Dump(int level) const + { + int key_size = 0; + uint64_t val_size = 0; + uint64_t connval_size = 0; + for(int i=0; i 0) + { + for(int i=0; i= 1) + { + printf("%-10s %1s %-10s %-4s %-4s %-10s %-18s %-2s\n", "Index", "*","Bucket", "Dist", "Off", "Hash", "FibHash", "KeySize"); + for(int i=0; i 0) + {//if initial is specified, init the table right away, otherwise wait until the first insert to init. + log2_buckets = Log2(initial_size); + Init(); + } + if ( ordering == ORDERED ) + order = new std::vector; + } + +Dictionary::~Dictionary() + { + Clear();//the place to free key memory. 
other place is to remove the entry + } + +void Dictionary::Clear() + { + if(table) + { + for ( int i = Capacity() - 1; i >= 0; i-- ) + { + if( table[i].Empty()) + continue; + if( delete_func ) + delete_func(table[i].value); + table[i].Clear(); + } + delete [] table; + table = NULL; + } + + if( order ) + { + delete order; + order = NULL; + } + if( cookies ) + { + delete cookies; + cookies = NULL; + } + log2_buckets = 0; + num_iterators = 0; + remaps = 0; + remap_end = -1; + num_entries = 0; + max_entries = 0; + } + +void Dictionary::Init() + { + ASSERT(!table); + table = (DictEntry*)malloc(sizeof(DictEntry)*Capacity(true)); + for(int i=Capacity()-1; i >= 0; i--) + table[i].SetEmpty(); + } + +// private +void generic_delete_func(void* v) + { + free(v); + } + +////////////////////////////////////////////////////////////////////////////////////////// +//Lookup + +// Look up now also possibly modifies the entry. Why? if the entry is found but not positioned according to the current dict (so it's before SizeUp), it will be moved to the right position so next looik up is fast. +void* Dictionary::Lookup(const HashKey* key) const + { + return Lookup(key->Key(), key->Size(), key->Hash()); + } + +void* Dictionary::Lookup(const void* key, int key_size, hash_t h) const + { + Dictionary* d = const_cast(this); + int position = d->LookupIndex(key, key_size, h); + return position >= 0 ? table[position].value : NULL; + } + +//for verification purposes +int Dictionary::LinearLookupIndex(const void* key, int key_size, hash_t hash) const + { + for(int i=0; i= 0) + { + ASSERT(position == linear_position);//same as linearLookup + return position; + } + + for(int i=1; i<=remaps; i++) + { + int prev_bucket = BucketByHash(hash,log2_buckets - i); + if( prev_bucket <= remap_end) + { //possibly here. insert_position & insert_distance returned on failed lookup is not valid in previous table_sizes. + position = LookupIndex(key, key_size, hash, prev_bucket, remap_end+1); + if(position >= 0) + { + ASSERT(position == linear_position);//same as linearLookup + //remap immediately if no iteration is on. + if( !num_iterators) + { + Remap(position, &position); + ASSERT(position == LookupIndex(key, key_size, hash)); + } + return position; + } + } + } + //not found +#ifdef DEBUG + if( linear_position >= 0) + {//different. stop and try to see whats happending. + ASSERT(false); + //rerun the function in debugger to track down the bug. + LookupIndex(key, key_size, hash); + } +#endif//DEBUG + return -1; + } + +/// +/// returns the position of the item if it exists. Otherwise return -1 but set the insert_position/insert_distance if required. +/// begin may not be the bucket for current table size. it's also used to search an item in prev table size. +int Dictionary::LookupIndex(const void* key, int key_size, hash_t hash, int bucket, int end, int* insert_position/*output*/, int* insert_distance/*output*/) + { + ASSERT(bucket>=0 && bucket < Buckets()); + int i = bucket; + for(; i < end && !table[i].Empty() && BucketByPosition(i) <= bucket; i++) + if( BucketByPosition(i) == bucket && table[i].Equal((char*)key, key_size, hash)) + return i; + //no such cluster, or not found in the cluster. 
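	// Editorial worked example (not part of this patch) of the probe above:
	// with three entries hashing to buckets 3, 3 and 4, the table lays them
	// out contiguously, e.g.
	//	position: 3 4 5
	//	bucket:   3 3 4
	//	distance: 0 1 1
	// A lookup for bucket 4 starts at position 4, skips entries whose
	// BucketByPosition() is still 3, matches at position 5, and stops as soon
	// as it sees a bucket greater than 4 or an empty slot; on a miss that
	// stopping index is what gets reported as insert_position below.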
+ if(insert_position) + *insert_position = i; + if(insert_distance) + *insert_distance = i - bucket; + return -1; + } + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Insert +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +///This is external interface. +///If key exists, return its value, +///If key doesn't exist, return NULL +void* Dictionary::Insert(void* key, int key_size, hash_t hash, void* val, int copy_key) +{ + ASSERT_VALID(this); + ASSERT(num_iterators == 0 || (cookies && cookies->size() == num_iterators)); //only robust iterators exist. + if( !table ) + Init();//create table on first Insert to save memory on empty dicts. + void* v = NULL; + //if found. i is the position + //if not found, i is the insert position, d is the distance of key on position i. + int insert_position = -1, insert_distance = -1; + int position = LookupIndex(key, key_size, hash, &insert_position, &insert_distance); + if( position >= 0 ) + { + v = table[position].value; + table[position].value = val; + if( !copy_key) + delete (char*)key; + + if( order ) + {//set new v to order too. + auto it = std::find(order->begin(), order->end(), table[position]); + ASSERT(it != order->end()); + it->value = val; + } + if( cookies && !cookies->empty() ) + //need to set new v for cookies too. + for( auto c: *cookies ) + { + ASSERT_VALID(c); + //ASSERT(false); + auto it = std::find(c->inserted->begin(), c->inserted->end(), table[position]); + if( it != c->inserted->end()) + it->value = val; + } + } + else + { + //not found here. + //allocate memory for key if necesary. key is updated to reflect internal key if necessary. + DictEntry entry(key, key_size, hash, val, insert_distance, copy_key); + InsertRelocateAndAdjust(entry, insert_position); + if(order) + order->push_back(entry); + + num_entries++; + cum_entries++; + if( max_entries < num_entries ) + max_entries = num_entries; + if( num_entries > ThresholdEntries() ) + SizeUp(); + #ifdef DEBUG//validate the newly inserted entry can be looked up. + position = LookupIndex(key, key_size, hash); + if(position < 0) + {//if not found. stop here and try again. the second time is for step by step debugging. + ASSERT(false); + LookupIndex(key, key_size, hash); + } + #endif//DEBUG + } + + //Remap after insert can adjust asap to shorten period of mixed table. + //however, if remap happens right after size up, then it consumes more cpu for this cycle, a possible hickup point. + if( Remapping()) + Remap(); + ASSERT_VALID(this); + return v; + } + +///e.distance is adjusted to be the one at insert_position. +void Dictionary::InsertRelocateAndAdjust(DictEntry& entry, int insert_position) + { +#ifdef DEBUG + entry.bucket = BucketByHash(entry.hash,log2_buckets); +#endif//DEBUG + int last_affected_position = insert_position; + InsertAndRelocate(entry, insert_position, &last_affected_position); + //if remapping in progress. adjust remap_end to step back a little to cover the new range if the changed range straddles over remap_end. + if( Remapping() && insert_position <= remap_end && remap_end < last_affected_position ) + {//[i,j] range changed. if map_end in between. then possibly old entry pushed down across map_end. + remap_end = last_affected_position; //adjust to j on the conservative side. 
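	// Editorial worked example (not part of this patch): if remapping has
	// progressed down to remap_end == 7 and an insert at position 5 shifts
	// entries as far down as position 9, not-yet-remapped entries may now sit
	// at positions 8 or 9, past the remap boundary; moving remap_end to 9
	// makes Remap() revisit them instead of leaving stale positions behind.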
+ } + + if(cookies && !cookies->empty()) + for(auto c: *cookies) + AdjustOnInsert(c, entry, insert_position, last_affected_position); +} + +/// insert entry into position, relocate other entries when necessary. +void Dictionary::InsertAndRelocate(DictEntry& entry, int insert_position, int* last_affected_position) + {///take out the head of cluster and append to the end of the cluster. + while(true) + { + if(insert_position >= Capacity()) + { + ASSERT(insert_position == Capacity()); + SizeUp(); //copied all the items to new table. as it's just copying without remapping, insert_position is now empty. + table[insert_position] = entry; + if(last_affected_position) + *last_affected_position = insert_position; + return; + } + if(table[insert_position].Empty()) + { //the condition to end the loop. + table[insert_position] = entry; + if(last_affected_position) + *last_affected_position = insert_position; + + return; + } + //the to-be-swapped-out item appends to the end of its original cluster. + auto t = table[insert_position]; + int next = EndOfClusterByPosition(insert_position); + t.distance += next - insert_position; + //swap + table[insert_position] = entry; + entry = t; + insert_position = next; //append to the end of the current cluster. + } + } + +/// Adjust Cookies on Insert. +void Dictionary::AdjustOnInsert(IterCookie* c, const DictEntry& entry, int insert_position, int last_affected_position) + { + ASSERT(c); + ASSERT_VALID(c); + if(insert_position < c->next) + c->inserted->push_back(entry); + if( insert_position < c->next && c->next <= last_affected_position) + { + int k = TailOfClusterByPosition(c->next); + ASSERT(k >= 0 && k < Capacity()); + c->visited->push_back(table[k]); + } + } + +void Dictionary::SizeUp() + { + int prev_capacity = Capacity(); + log2_buckets++; + int capacity = Capacity(); + table = (DictEntry*)realloc(table, capacity*sizeof(DictEntry)); + for(int i=prev_capacity; isize() == num_iterators)); //only robust iterators exist. + ASSERT(!dont_delete); //this is a poorly designed flag. if on, the internal has nowhere to return and memory is lost. + int position = LookupIndex(key, key_size, hash); + if(position < 0) + return NULL; + + DictEntry entry = RemoveRelocateAndAdjust(position); + num_entries--; + ASSERT(num_entries >= 0); + //e is about to be invalid. remove it from all references. + if(order) + order->erase(std::remove(order->begin(), order->end(), entry)); + void* v = entry.value; + entry.Clear(); + ASSERT_VALID(this); + return v; + } + +DictEntry Dictionary::RemoveRelocateAndAdjust(int position) + { + int last_affected_position = position; + DictEntry entry = RemoveAndRelocate(position, &last_affected_position); +#ifdef DEBUG + //validation: index to i-1 should be continuous without empty spaces. + for(int k=position; kempty()) + for(auto c: *cookies) + AdjustOnRemove(c, entry, position, last_affected_position); + + return entry; + } + +DictEntry Dictionary::RemoveAndRelocate(int position, int* last_affected_position) + {//fill the empty position with the tail of the cluster of position+1. + ASSERT(position >= 0 && position < Capacity() && !table[position].Empty()); + DictEntry entry = table[position]; + while(true) + { + if( position == Capacity() - 1 || table[position+1].Empty() || table[position+1].distance == 0) + {//no next cluster to fill, or next position is empty or next position is already in perfect bucket. 
+ table[position].SetEmpty(); + if(last_affected_position) + *last_affected_position = position; + return entry; + } + int next = TailOfClusterByPosition(position+1); + table[position] = table[next]; + table[position].distance -= next - position; //distance improved for the item. + position = next; + } + return entry; + } + +void Dictionary::AdjustOnRemove(IterCookie* c, const DictEntry& entry, int position, int last_affected_position) + { + ASSERT_VALID(c); + c->inserted->erase(std::remove(c->inserted->begin(), c->inserted->end(), entry), c->inserted->end()); + if( position < c->next && c->next <= last_affected_position) + { + int moved = HeadOfClusterByPosition(c->next-1); + if( moved < position ) + moved = position; + c->inserted->push_back(table[moved]); + } + + //if not already the end of the dictionary, adjust next to a valid one. + if( c->next < Capacity() && table[c->next].Empty()) + c->next = Next(c->next); + } + +/////////////////////////////////////////////////////////////////////////////////////////////////// +//Remap +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void Dictionary::Remap() + { + ///since remap should be very fast. take more at a time. + ///delay Remap when cookie is there. hard to handle cookie iteration while size changes. + ///remap from bottom up. + ///remap creates two parts of the dict: [0,remap_end] (remap_end, ...]. the former is mixed with old/new entries; the latter contains all new entries. + /// + if(num_iterators) + return; + + int left = DICT_REMAP_ENTRIES; + while(remap_end >= 0 && left > 0) + { + if(!table[remap_end].Empty() && Remap(remap_end)) + left--; + else//< successful Remap may increase remap_end in the case of SizeUp due to insert. if so, remap_end need to be worked on again. + remap_end--; + } + if(remap_end < 0) + remaps = 0; //done remapping. + } + +bool Dictionary::Remap(int position, int* new_position) + { + ASSERT_VALID(this); + ///Remap changes item positions by remove() and insert(). to avoid excessive operation. avoid it when safe iteration is in progress. + ASSERT(!cookies || cookies->empty()); + int current = BucketByPosition(position);//current bucket + int expected = BucketByHash(table[position].hash, log2_buckets); //expected bucket in new table. + //equal because 1: it's a new item, 2: it's an old item, but new bucket is the same as old. 50% of old items act this way due to fibhash. + if( current == expected ) + return false; + DictEntry entry = RemoveAndRelocate(position); // no iteration cookies to adjust, no need for last_affected_position. + #ifdef DEBUG + entry.bucket = expected; + #endif//DEBUG + + //find insert position. + int insert_position = EndOfClusterByBucket(expected); + if( new_position ) + *new_position = insert_position; + entry.distance = insert_position - expected; + InsertAndRelocate(entry, insert_position);// no iteration cookies to adjust, no need for last_affected_position. 
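	// Editorial note with a worked example (not part of this patch): because
	// BucketByHash() keeps the low log2_buckets bits of the Fibonacci hash,
	// doubling the table only adds one more bit. Going from log2_buckets 10 to
	// 11, an entry in bucket 0x2A3 either stays in 0x2A3 or moves to
	// 0x2A3 + 0x400, depending on bit 10 of its FibHash; on average half the
	// entries stay put, which is why this function often returns false above.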
+ ASSERT_VALID(this); + return true; + } + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Iteration +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void* Dictionary::NthEntry(int n, const void*& key, int& key_size) const + { + if( !order || n < 0 || n >= Length() ) + return 0; + DictEntry entry = (*order)[n]; + key = entry.GetKey(); + key_size = entry.key_size; + return entry.value; + } + +void Dictionary::MakeRobustCookie(IterCookie* cookie) + { //make sure c->next >= 0. + if( !cookies) + cookies = new std::vector; + cookie->MakeRobust(); + ASSERT_VALID(cookie); + } + +IterCookie* Dictionary::InitForIterationNonConst() //const + { + num_iterators++; + return new IterCookie(const_cast(this)); + } + +void Dictionary::StopIterationNonConst(IterCookie* cookie) //const + { + ASSERT(num_iterators > 0); + if(num_iterators > 0) + num_iterators--; + delete cookie; + } + +void* Dictionary::NextEntryNonConst(HashKey*& h, IterCookie*& c, int return_hash) //const + { + // If there are any inserted entries, return them first. + // That keeps the list small and helps avoiding searching + // a large list when deleting an entry. + ASSERT(c); + ASSERT_VALID(c); + if(!table) + { + if(num_iterators > 0) + num_iterators--; + delete c; + c = NULL; + return NULL; //end of iteration. + } + if ( c->inserted && !c->inserted->empty() ) + { + // Return the last one. Order doesn't matter, + // and removing from the tail is cheaper. + DictEntry e = c->inserted->back(); + if ( return_hash ) + h = new HashKey(e.GetKey(), e.key_size, e.hash); + void* v = e.value; + c->inserted->pop_back(); + return v; + } + + if( c->next < 0) + c->next = Next(-1); + int capacity = Capacity(); + // if resize happens during iteration. before sizeup, c->next points to Capacity(), + // but now Capacity() doubles up and c->next doesn't point to the end anymore. + // this is fine because c->next may be filled now. + // however, c->next can also be empty. + // before sizeup, we use c->next >= Capacity() to indicate the end of the iteration. + // now this guard is invalid, we may face c->next is valid but empty now.F + //fix it here. + if( c->next < capacity && table[c->next].Empty()) + { + ASSERT(false); //stop to check the condition here. why it's happening. + c->next = Next(c->next); + } + //filter out visited keys. + if( c->visited && !c->visited->empty()) + //filter out visited entries. + while(c->next < capacity ) + { + ASSERT(!table[c->next].Empty()); + auto it = std::find(c->visited->begin(), c->visited->end(), table[c->next]); + if( it == c->visited->end()) + break; + c->visited->erase(it); + c->next = Next(c->next); + } + + if( c->next >= capacity ) + {//end. + if(num_iterators > 0) + num_iterators--; + delete c; + c = NULL; + return NULL; //end of iteration. + } + + ASSERT(!table[c->next].Empty()); + void* v = table[c->next].value; + if ( return_hash ) + h = new HashKey(table[c->next].GetKey(), table[c->next].key_size, table[c->next].hash); + //prepare for next time. 
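	// Editorial worked example (not part of this patch) of the robust-cookie
	// handling in this function: with c->next == 6, an insert that lands at
	// position 2 is behind the cursor, so AdjustOnInsert() queues it on
	// c->inserted and the next call returns it from that list before touching
	// c->next; entries that an insert pushed across the cursor are recorded in
	// c->visited and filtered out above when the cursor reaches them.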
+ c->next = Next(c->next); + ASSERT_VALID(c); + return v; + } + + +IterCookie* Dictionary::InitForIteration() const +{ + Dictionary* dp = const_cast(this); + return dp->InitForIterationNonConst(); +} +void* Dictionary::NextEntry(HashKey*& h, IterCookie*& cookie, int return_hash) const +{ + Dictionary* dp = const_cast(this); + return dp->NextEntryNonConst(h,cookie, return_hash); +} +void Dictionary::StopIteration(IterCookie* cookie) const +{ + Dictionary* dp = const_cast(this); + dp->StopIterationNonConst(cookie); +} diff --git a/src/OpenDict.h b/src/OpenDict.h new file mode 100644 index 0000000000..86b25dcfc5 --- /dev/null +++ b/src/OpenDict.h @@ -0,0 +1,362 @@ +// See the file "COPYING" in the main distribution directory for copyright. +//Clustered Hashing, a variation of Robinhood Hashing/Open Addressing Hashing. +//Ref following post links to help understand the implementation +//https://jasonlue.github.io/algo/2019/08/20/clustered-hashing.html +//https://jasonlue.github.io/algo/2019/08/27/clustered-hashing-basic-operations.html +//https://jasonlue.github.io/algo/2019/09/03/clustered-hashing-incremental-resize.html +//https://jasonlue.github.io/algo/2019/09/10/clustered-hashing-modify-on-iteration.html + +#ifndef odict_h +#define odict_h + +#include +#include "Hash.h" +#include +class Dictionary; +class DictEntry; +class IterCookie; + +// Default number of hash buckets in dictionary. The dictionary will +// increase the size of the hash table as needed. +#define HASH_MASK 0xFFFFffff //only lower 32 bits. + +//micros here are used to build different targets with -Dxxx to compare for performance or for debug purposes. +//when incremental resizing, remapping, it remaps DICT_REMAP_ENTRIES at step. +#ifndef DICT_REMAP_ENTRIES +#define DICT_REMAP_ENTRIES 16 //2 for debug. 16 is best for release. move how many at a time when remapping. +#endif//DICT_REMAP_ENTRIES + +#ifndef DICT_LOAD_FACTOR_BITS //give option to define it at command line +#define DICT_LOAD_FACTOR_BITS 2//LOAD_FACTOR = 1 - 1/2^LOAD_FACTOR_BITS .75 is the optimal load factor. +#endif //LOAD_FACTOR_BITS + +//when log2_buckets > DICT_THRESHOLD_BITS, DICT_LOAD_FACTOR_BITS becomes effective. +//basicly it means if dict size < 2^DICT_THRESHOLD_BITS+n, size up only if necessary (the last entry can't hold new key). +#ifndef DICT_THRESHOLD_BITS +#define DICT_THRESHOLD_BITS 3 +#endif + +// of insertions. +typedef enum { ORDERED, UNORDERED } dict_order; + +// Type for function to be called when deleting elements. +typedef void (*dict_delete_func)(void*); + +// A dict_delete_func that just calls delete. +extern void generic_delete_func(void*); +// The value of an iteration cookie is the bucket and offset within the +// bucket at which to start looking for the next value to return. +#define TOO_FAR_TO_REACH 0xFFFF +#define MAX_KEY_SIZE 0XFFFF +struct DictEntry{//24 bytes. perfectly alligned. always own the key. but not the value. + #ifdef DEBUG + int bucket;//for easy debugging + #endif//DEBUG + uint16_t distance; //(key_here); + } + else + { + if( copy_key ) + { + key = new char[key_size]; + memcpy(key, arg_key, key_size); + arg_key = key; + } + else + { + key = (char*)arg_key; + } + } + } + bool Empty() {return distance == TOO_FAR_TO_REACH;} + void SetEmpty() + { + distance = TOO_FAR_TO_REACH; + #ifdef DEBUG + hash = 0; + key = NULL; + value = NULL; + key_size = 0; + bucket = 0; + #endif//DEBUG + } + + //if with no intent to release key memory, call SetEmpty instead. key pointer is shared when moving around. 
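	// Editorial note (not part of this patch): keys of at most 8 bytes are
	// stored inline in key_here, longer keys live behind the heap pointer key,
	// and GetKey() below returns whichever applies. For example, a 4-byte key
	// is copied into key_here and costs no allocation, while a 16-byte key is
	// heap-allocated; that is why Clear() only frees memory when key_size > 8.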
+ void Clear() + {//SetEmpty & release memory if allocated. + if(key_size > 8) + delete key; + SetEmpty(); + } + const char* GetKey() const {return key_size <= 8? key_here : key;} + bool Equal(const char* arg_key, int arg_key_size, hash_t arg_hash) const + {//only 40-bit hash comparison. + return (0 == ((hash ^ arg_hash) & HASH_MASK)) + && key_size == arg_key_size && 0 == memcmp(GetKey(), arg_key, key_size); + } + bool operator==(const DictEntry& r) const + { + return Equal(r.GetKey(), r.key_size, r.hash); + } + bool operator!=(const DictEntry& r) const + { + return !Equal(r.GetKey(), r.key_size, r.hash); + } +}; + +struct IterCookie; + +// Default number of hash buckets in dictionary. The dictionary will +// increase the size of the hash table as needed. +#ifndef DEFAULT_DICT_SIZE +#define DEFAULT_DICT_SIZE 0 +#endif//DEFAULT_DICT_SIZE +class Dictionary{ +public: + explicit Dictionary(dict_order ordering = UNORDERED, int initial_size = DEFAULT_DICT_SIZE); + ~Dictionary(); + + // Member functions for looking up a key, inserting/changing its + // contents, and deleting it. These come in two flavors: one + // which takes a HashKey, and the other which takes a raw key, + // its size, and its (unmodulated) hash. + //lookup may move the key to right place if in the old zone to speed up the next lookup. + void* Lookup(const HashKey* key) const; + void* Lookup(const void* key, int key_size, hash_t h) const; + + // Returns previous value, or 0 if none. + void* Insert(HashKey* key, void* val) + { return Insert(key->TakeKey(), key->Size(), key->Hash(), val, 0);} + // If copy_key is true, then the key is copied, otherwise it's assumed + // that it's a heap pointer that now belongs to the Dictionary to + // manage as needed. + void* Insert(void* key, int key_size, hash_t hash, void* val, int copy_key); + + // Removes the given element. Returns a pointer to the element in + // case it needs to be deleted. Returns 0 if no such element exists. + // If dontdelete is true, the key's bytes will not be deleted. + void* Remove(const HashKey* key) + { return Remove(key->Key(), key->Size(), key->Hash()); } + void* Remove(const void* key, int key_size, hash_t hash, bool dont_delete = false); + + // Number of entries. + int Length() const + { return num_entries; } + + // Largest it's ever been. + int MaxLength() const + { return max_entries; } + + // Total number of entries ever. + uint64_t NumCumulativeInserts() const + { return cum_entries; } + + // True if the dictionary is ordered, false otherwise. + int IsOrdered() const { return order != 0; } + + // If the dictionary is ordered then returns the n'th entry's value; + // the second method also returns the key. The first entry inserted + // corresponds to n=0. + // + // Returns nil if the dictionary is not ordered or if "n" is out + // of range. + void* NthEntry(int n) const + { + const void* key; + int key_len; + return NthEntry(n, key, key_len); + } + void* NthEntry(int n, const void*& key, int& key_len) const; + + // To iterate through the dictionary, first call InitForIteration() + // to get an "iteration cookie". The cookie can then be handed + // to NextEntry() to get the next entry in the iteration and update + // the cookie. If NextEntry() indicates no more entries, it will + // also delete the cookie, or the cookie can be manually deleted + // prior to this if no longer needed. + // + // Unexpected results will occur if the elements of + // the dictionary are changed between calls to NextEntry() without + // first calling InitForIteration(). 
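	// A usage sketch of this API (editorial illustration, not part of the
	// patch; assumes the values stored in the dictionary d are non-NULL):
	//
	//	IterCookie* c = d.InitForIteration();
	//	// d.MakeRobustCookie(c); // only if d may change during iteration
	//	HashKey* k;
	//	while ( void* v = d.NextEntry(k, c, 1) )
	//		{
	//		/* use k and v */
	//		delete k; // caller owns the returned HashKey
	//		}
	//	// NextEntry() deletes the cookie itself once it returns NULL.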
+ // + // If return_hash is true, a HashKey for the entry is returned in h, + // which should be delete'd when no longer needed. + IterCookie* InitForIteration() const; + void* NextEntry(HashKey*& h, IterCookie*& cookie, int return_hash) const; + void StopIteration(IterCookie* cookie) const; + + void SetDeleteFunc(dict_delete_func f) { delete_func = f; } + + // With a robust cookie, it is safe to change the dictionary while + // iterating. This means that (i) we will eventually visit all + // unmodified entries as well as all entries added during iteration, + // and (ii) we won't visit any still-unseen entries which are getting + // removed. (We don't get this for free, so only use it if + // necessary.) + void MakeRobustCookie(IterCookie* cookie); + // Remove all entries. + void Clear(); + + unsigned int MemoryAllocation() const; + float GetThreshold() const { return 1.0 - 1.0 / (1<= 0;} //remap in reverse order. + ///One round of remap. + void Remap(); + + ///Remap item in [position] + ///Returns true if actual relocation happend. false on noop. + bool Remap(int position, int* new_position = NULL); + void SizeUp(); + + //the dictionary size will be bounded at around 100K. 1M is absolute limit. only Connections use so many entries. + //alligned on 8-bytes with 4-leading bytes. 7*8=56 bytes a dictionary. + + //when sizeup but the current mapping is in progress. the current mapping will be ignored as it will be remapped to new dict size anyway. + //however, the missed count is recorded for lookup. if position not found for a key in the position of dict of current size, it still could be in the position of dict of previous N sizes. + unsigned char remaps; + unsigned char log2_buckets; + unsigned short num_iterators; //pending iterators on the dict. including robust and non-rubust. to avoid remap when any iterators active. + int remap_end;//the last index to be remapped + + int num_entries; + int max_entries; + + uint64_t cum_entries; + dict_delete_func delete_func; + DictEntry* table; + vector* cookies; + vector* order;//order means the order of insertion. means no deletion until exit. will be inefficient. +public: + //Debugging +#ifdef DEBUG + void AssertValid() const; +#endif//DEBUG + void Dump(int level=0) const; + void DistanceStats(int& max_distance, int* distances=0, int num_distances=0) const; + void DumpKeys() const; +}; + +template +class PDict : public Dictionary { +public: + explicit PDict(dict_order ordering = UNORDERED, int initial_size = 0) : + Dictionary(ordering, initial_size) {} + T* Lookup(const char* key) const + { + HashKey h(key); + return (T*) Dictionary::Lookup(&h); + } + T* Lookup(const HashKey* key) const + { return (T*) Dictionary::Lookup(key); } + T* Insert(const char* key, T* val) + { + HashKey h(key); + return (T*) Dictionary::Insert(&h, (void*) val); + } + T* Insert(HashKey* key, T* val) + { return (T*) Dictionary::Insert(key, (void*) val); } + T* NthEntry(int n) const + { return (T*) Dictionary::NthEntry(n); } + T* NthEntry(int n, const char*& key) const + { + int key_len; + return (T*) Dictionary::NthEntry(n, (const void*&) key, key_len); + } + T* NextEntry(IterCookie*& cookie) const + { + HashKey* h; + return (T*) Dictionary::NextEntry(h, cookie, 0); + } + T* NextEntry(HashKey*& h, IterCookie*& cookie) const + { return (T*) Dictionary::NextEntry(h, cookie, 1); } + T* RemoveEntry(const HashKey* key) + { return (T*) Remove(key->Key(), key->Size(), key->Hash()); } +}; + +#endif//odict_h