Line data Source code
1 : /*
2 : * simplehash.h
3 : *
4 : * Hash table implementation which will be specialized to user-defined
5 : * types, by including this file to generate the required code. It's
6 : * probably not worthwhile to do so for hash tables that aren't performance
7 : * or space sensitive.
8 : *
9 : * Usage notes:
10 : *
11 : * To generate a hash-table and associated functions for a use case several
12 : * macros have to be #define'ed before this file is included. Including
13 : * the file #undef's all those, so a new hash table can be generated
14 : * afterwards.
15 : * The relevant parameters are:
16 : * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
17 : * will result in hash table type 'foo_hash' and functions like
18 : * 'foo_insert'/'foo_lookup' and so forth.
19 : * - SH_ELEMENT_TYPE - type of the contained elements
20 : * - SH_KEY_TYPE - type of the hashtable's key
21 : * - SH_DECLARE - if defined function prototypes and type declarations are
22 : * generated
23 : * - SH_DEFINE - if defined function definitions are generated
24 : * - SH_SCOPE - in which scope (e.g. extern, static inline) do function
25 : * declarations reside
26 : * - SH_USE_NONDEFAULT_ALLOCATOR - if defined no element allocator functions
27 : * are defined, so you can supply your own
28 : * The following parameters are only relevant when SH_DEFINE is defined:
29 : * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key
30 : * - SH_EQUAL(table, a, b) - compare two table keys
31 : * - SH_HASH_KEY(table, key) - generate hash for the key
32 : * - SH_STORE_HASH - if defined the hash is stored in the elements
33 : * - SH_GET_HASH(tb, a) - return the field to store the hash in
34 : *
35 : * For examples of usage look at simplehash.c (file local definition) and
36 : * execnodes.h/execGrouping.c (exposed declaration, file local
37 : * implementation).
38 : *
39 : * Hash table design:
40 : *
41 : * The hash table design chosen is a variant of linear open-addressing. The
42 : * reason for doing so is that linear addressing is CPU cache & pipeline
43 : * friendly. The biggest disadvantage of simple linear addressing schemes
44 : * are highly variable lookup times due to clustering, and deletions
45 : * leaving a lot of tombstones around. To address these issues a variant
46 : * of "robin hood" hashing is employed. Robin hood hashing optimizes
47 : * chaining lengths by moving elements close to their optimal bucket
48 : * ("rich" elements), out of the way if a to-be-inserted element is further
49 : * away from its optimal position (i.e. it's "poor"). While that can make
50 : * insertions slower, the average lookup performance is a lot better, and
51 : * higher fill factors can be used in a still performant manner. To avoid
52 : * tombstones - which normally solve the issue that a deleted node's
53 : * presence is relevant to determine whether a lookup needs to continue
54 : * looking or is done - buckets following a deleted element are shifted
55 : * backwards, unless they're empty or already at their optimal position.
56 : */
57 :
58 : /* helpers */
59 : #define SH_MAKE_PREFIX(a) CppConcat(a,_)
60 : #define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX),name)
61 : #define SH_MAKE_NAME_(a,b) CppConcat(a,b)
62 :
63 : /* name macros for: */
64 :
65 : /* type declarations */
66 : #define SH_TYPE SH_MAKE_NAME(hash)
67 : #define SH_STATUS SH_MAKE_NAME(status)
68 : #define SH_STATUS_EMPTY SH_MAKE_NAME(EMPTY)
69 : #define SH_STATUS_IN_USE SH_MAKE_NAME(IN_USE)
70 : #define SH_ITERATOR SH_MAKE_NAME(iterator)
71 :
72 : /* function declarations */
73 : #define SH_CREATE SH_MAKE_NAME(create)
74 : #define SH_DESTROY SH_MAKE_NAME(destroy)
75 : #define SH_INSERT SH_MAKE_NAME(insert)
76 : #define SH_DELETE SH_MAKE_NAME(delete)
77 : #define SH_LOOKUP SH_MAKE_NAME(lookup)
78 : #define SH_GROW SH_MAKE_NAME(grow)
79 : #define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
80 : #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
81 : #define SH_ITERATE SH_MAKE_NAME(iterate)
82 : #define SH_ALLOCATE SH_MAKE_NAME(allocate)
83 : #define SH_FREE SH_MAKE_NAME(free)
84 : #define SH_STAT SH_MAKE_NAME(stat)
85 :
86 : /* internal helper functions (no externally visible prototypes) */
87 : #define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
88 : #define SH_NEXT SH_MAKE_NAME(next)
89 : #define SH_PREV SH_MAKE_NAME(prev)
90 : #define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance)
91 : #define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket)
92 : #define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash)
93 :
94 : /* generate forward declarations necessary to use the hash table */
95 : #ifdef SH_DECLARE
96 :
97 : /* type definitions */
98 : typedef struct SH_TYPE
99 : {
100 : /*
101 : * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash
102 : * tables. Note that the maximum number of elements is lower
103 : * (SH_MAX_FILLFACTOR)
104 : */
105 : uint64 size;
106 :
107 : /* how many elements have valid contents */
108 : uint32 members;
109 :
110 : /* mask for bucket and size calculations, based on size */
111 : uint32 sizemask;
112 :
113 : /* boundary after which to grow hashtable */
114 : uint32 grow_threshold;
115 :
116 : /* hash buckets */
117 : SH_ELEMENT_TYPE *data;
118 :
119 : /* memory context to use for allocations */
120 : MemoryContext ctx;
121 :
122 : /* user defined data, useful for callbacks */
123 : void *private_data;
124 : } SH_TYPE;
125 :
126 : typedef enum SH_STATUS
127 : {
128 : SH_STATUS_EMPTY = 0x00,
129 : SH_STATUS_IN_USE = 0x01
130 : } SH_STATUS;
131 :
132 : typedef struct SH_ITERATOR
133 : {
134 : uint32 cur; /* current element */
135 : uint32 end;
136 : bool done; /* iterator exhausted? */
137 : } SH_ITERATOR;
138 :
139 : /* externally visible function prototypes */
140 : SH_SCOPE SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements,
141 : void *private_data);
142 : SH_SCOPE void SH_DESTROY(SH_TYPE * tb);
143 : SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize);
144 : SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found);
145 : SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key);
146 : SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key);
147 : SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
148 : SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);
149 : SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
150 : SH_SCOPE void SH_STAT(SH_TYPE * tb);
151 :
152 : #endif /* SH_DECLARE */
153 :
154 :
155 : /* generate implementation of the hash table */
156 : #ifdef SH_DEFINE
157 :
158 : #include "utils/memutils.h"
159 :
160 : /* max data array size,we allow up to PG_UINT32_MAX buckets, including 0 */
161 : #define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
162 :
163 : /* normal fillfactor, unless already close to maximum */
164 : #ifndef SH_FILLFACTOR
165 : #define SH_FILLFACTOR (0.9)
166 : #endif
167 : /* increase fillfactor if we otherwise would error out */
168 : #define SH_MAX_FILLFACTOR (0.98)
169 : /* grow if actual and optimal location bigger than */
170 : #ifndef SH_GROW_MAX_DIB
171 : #define SH_GROW_MAX_DIB 25
172 : #endif
173 : /* grow if more than elements to move when inserting */
174 : #ifndef SH_GROW_MAX_MOVE
175 : #define SH_GROW_MAX_MOVE 150
176 : #endif
177 :
178 : #ifdef SH_STORE_HASH
179 : #define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
180 : #else
181 : #define SH_COMPARE_KEYS(tb, ahash, akey, b) (SH_EQUAL(tb, b->SH_KEY, akey))
182 : #endif
183 :
184 : /* FIXME: can we move these to a central location? */
185 :
186 : /* calculate ceil(log base 2) of num */
187 : static inline uint64
188 1136 : sh_log2(uint64 num)
189 : {
190 : int i;
191 : uint64 limit;
192 :
193 1136 : for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
194 : ;
195 1136 : return i;
196 : }
197 :
198 : /* calculate first power of 2 >= num */
199 : static inline uint64
200 1136 : sh_pow2(uint64 num)
201 : {
202 1136 : return ((uint64) 1) << sh_log2(num);
203 : }
204 :
205 : /*
206 : * Compute sizing parameters for hashtable. Called when creating and growing
207 : * the hashtable.
208 : */
209 : static inline void
210 1030 : SH_COMPUTE_PARAMETERS(SH_TYPE * tb, uint32 newsize)
211 : {
212 : uint64 size;
213 :
214 : /* supporting zero sized hashes would complicate matters */
215 1030 : size = Max(newsize, 2);
216 :
217 : /* round up size to the next power of 2, that's how bucketing works */
218 1030 : size = sh_pow2(size);
219 1030 : Assert(size <= SH_MAX_SIZE);
220 :
221 : /*
222 : * Verify that allocation of ->data is possible on this platform, without
223 : * overflowing Size.
224 : */
225 1030 : if ((((uint64) sizeof(SH_ELEMENT_TYPE)) * size) >= MaxAllocHugeSize)
226 0 : elog(ERROR, "hash table too large");
227 :
228 : /* now set size */
229 1030 : tb->size = size;
230 :
231 1030 : if (tb->size == SH_MAX_SIZE)
232 0 : tb->sizemask = 0;
233 : else
234 1030 : tb->sizemask = tb->size - 1;
235 :
236 : /*
237 : * Compute the next threshold at which we need to grow the hash table
238 : * again.
239 : */
240 1030 : if (tb->size == SH_MAX_SIZE)
241 0 : tb->grow_threshold = ((double) tb->size) * SH_MAX_FILLFACTOR;
242 : else
243 1030 : tb->grow_threshold = ((double) tb->size) * SH_FILLFACTOR;
244 1030 : }
245 :
246 : /* return the optimal bucket for the hash */
247 : static inline uint32
248 1190176 : SH_INITIAL_BUCKET(SH_TYPE * tb, uint32 hash)
249 : {
250 1190176 : return hash & tb->sizemask;
251 : }
252 :
253 : /* return next bucket after the current, handling wraparound */
254 : static inline uint32
255 604028 : SH_NEXT(SH_TYPE * tb, uint32 curelem, uint32 startelem)
256 : {
257 604028 : curelem = (curelem + 1) & tb->sizemask;
258 :
259 604028 : Assert(curelem != startelem);
260 :
261 604028 : return curelem;
262 : }
263 :
264 : /* return bucket before the current, handling wraparound */
265 : static inline uint32
266 36238 : SH_PREV(SH_TYPE * tb, uint32 curelem, uint32 startelem)
267 : {
268 36238 : curelem = (curelem - 1) & tb->sizemask;
269 :
270 36238 : Assert(curelem != startelem);
271 :
272 36238 : return curelem;
273 : }
274 :
275 : /* return distance between bucket and its optimal position */
276 : static inline uint32
277 366557 : SH_DISTANCE_FROM_OPTIMAL(SH_TYPE * tb, uint32 optimal, uint32 bucket)
278 : {
279 366557 : if (optimal <= bucket)
280 361990 : return bucket - optimal;
281 : else
282 4567 : return (tb->size + bucket) - optimal;
283 : }
284 :
285 : static inline uint32
286 377535 : SH_ENTRY_HASH(SH_TYPE * tb, SH_ELEMENT_TYPE * entry)
287 : {
288 : #ifdef SH_STORE_HASH
289 121530 : return SH_GET_HASH(tb, entry);
290 : #else
291 256005 : return SH_HASH_KEY(tb, entry->SH_KEY);
292 : #endif
293 : }
294 :
295 : /* default memory allocator function */
296 : static inline void *SH_ALLOCATE(SH_TYPE * type, Size size);
297 : static inline void SH_FREE(SH_TYPE * type, void *pointer);
298 :
299 : #ifndef SH_USE_NONDEFAULT_ALLOCATOR
300 :
301 : /* default memory allocator function */
302 : static inline void *
303 604 : SH_ALLOCATE(SH_TYPE * type, Size size)
304 : {
305 604 : return MemoryContextAllocExtended(type->ctx, size,
306 : MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
307 : }
308 :
309 : /* default memory free function */
310 : static inline void
311 84 : SH_FREE(SH_TYPE * type, void *pointer)
312 : {
313 84 : pfree(pointer);
314 84 : }
315 :
316 : #endif
317 :
318 : /*
319 : * Create a hash table with enough space for `nelements` distinct members.
320 : * Memory for the hash table is allocated from the passed-in context. If
321 : * desired, the array of elements can be allocated using a passed-in allocator;
322 : * this could be useful in order to place the array of elements in a shared
323 : * memory, or in a context that will outlive the rest of the hash table.
324 : * Memory other than for the array of elements will still be allocated from
325 : * the passed-in context.
326 : */
327 : SH_SCOPE SH_TYPE *
328 924 : SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data)
329 : {
330 : SH_TYPE *tb;
331 : uint64 size;
332 :
333 924 : tb = MemoryContextAllocZero(ctx, sizeof(SH_TYPE));
334 924 : tb->ctx = ctx;
335 924 : tb->private_data = private_data;
336 :
337 : /* increase nelements by fillfactor, want to store nelements elements */
338 924 : size = Min((double) SH_MAX_SIZE, ((double) nelements) / SH_FILLFACTOR);
339 :
340 924 : SH_COMPUTE_PARAMETERS(tb, size);
341 :
342 924 : tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
343 :
344 924 : return tb;
345 : }
346 :
347 : /* destroy a previously created hash table */
348 : SH_SCOPE void
349 404 : SH_DESTROY(SH_TYPE * tb)
350 : {
351 404 : SH_FREE(tb, tb->data);
352 404 : pfree(tb);
353 404 : }
354 :
355 : /*
356 : * Grow a hash table to at least `newsize` buckets.
357 : *
358 : * Usually this will automatically be called by insertions/deletions, when
359 : * necessary. But resizing to the exact input size can be advantageous
360 : * performance-wise, when known at some point.
361 : */
362 : SH_SCOPE void
363 106 : SH_GROW(SH_TYPE * tb, uint32 newsize)
364 : {
365 106 : uint64 oldsize = tb->size;
366 106 : SH_ELEMENT_TYPE *olddata = tb->data;
367 : SH_ELEMENT_TYPE *newdata;
368 : uint32 i;
369 106 : uint32 startelem = 0;
370 : uint32 copyelem;
371 :
372 106 : Assert(oldsize == sh_pow2(oldsize));
373 106 : Assert(oldsize != SH_MAX_SIZE);
374 106 : Assert(oldsize < newsize);
375 :
376 : /* compute parameters for new table */
377 106 : SH_COMPUTE_PARAMETERS(tb, newsize);
378 :
379 106 : tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
380 :
381 106 : newdata = tb->data;
382 :
383 : /*
384 : * Copy entries from the old data to newdata. We theoretically could use
385 : * SH_INSERT here, to avoid code duplication, but that's more general than
386 : * we need. We neither want tb->members increased, nor do we need to do
387 : * deal with deleted elements, nor do we need to compare keys. So a
388 : * special-cased implementation is lot faster. As resizing can be time
389 : * consuming and frequent, that's worthwhile to optimize.
390 : *
391 : * To be able to simply move entries over, we have to start not at the
392 : * first bucket (i.e olddata[0]), but find the first bucket that's either
393 : * empty, or is occupied by an entry at its optimal position. Such a
394 : * bucket has to exist in any table with a load factor under 1, as not all
395 : * buckets are occupied, i.e. there always has to be an empty bucket. By
396 : * starting at such a bucket we can move the entries to the larger table,
397 : * without having to deal with conflicts.
398 : */
399 :
400 : /* search for the first element in the hash that's not wrapped around */
401 931 : for (i = 0; i < oldsize; i++)
402 : {
403 931 : SH_ELEMENT_TYPE *oldentry = &olddata[i];
404 : uint32 hash;
405 : uint32 optimal;
406 :
407 931 : if (oldentry->status != SH_STATUS_IN_USE)
408 : {
409 79 : startelem = i;
410 79 : break;
411 : }
412 :
413 852 : hash = SH_ENTRY_HASH(tb, oldentry);
414 852 : optimal = SH_INITIAL_BUCKET(tb, hash);
415 :
416 852 : if (optimal == i)
417 : {
418 27 : startelem = i;
419 27 : break;
420 : }
421 : }
422 :
423 : /* and copy all elements in the old table */
424 106 : copyelem = startelem;
425 11346 : for (i = 0; i < oldsize; i++)
426 : {
427 11240 : SH_ELEMENT_TYPE *oldentry = &olddata[copyelem];
428 :
429 11240 : if (oldentry->status == SH_STATUS_IN_USE)
430 : {
431 : uint32 hash;
432 : uint32 startelem;
433 : uint32 curelem;
434 : SH_ELEMENT_TYPE *newentry;
435 :
436 10024 : hash = SH_ENTRY_HASH(tb, oldentry);
437 10024 : startelem = SH_INITIAL_BUCKET(tb, hash);
438 10024 : curelem = startelem;
439 :
440 : /* find empty element to put data into */
441 : while (true)
442 : {
443 13840 : newentry = &newdata[curelem];
444 :
445 13840 : if (newentry->status == SH_STATUS_EMPTY)
446 : {
447 10024 : break;
448 : }
449 :
450 3816 : curelem = SH_NEXT(tb, curelem, startelem);
451 3816 : }
452 :
453 : /* copy entry to new slot */
454 10024 : memcpy(newentry, oldentry, sizeof(SH_ELEMENT_TYPE));
455 : }
456 :
457 : /* can't use SH_NEXT here, would use new size */
458 11240 : copyelem++;
459 11240 : if (copyelem >= oldsize)
460 : {
461 106 : copyelem = 0;
462 : }
463 : }
464 :
465 106 : SH_FREE(tb, olddata);
466 106 : }
467 :
468 : /*
469 : * Insert the key key into the hash-table, set *found to true if the key
470 : * already exists, false otherwise. Returns the hash-table entry in either
471 : * case.
472 : */
473 : SH_SCOPE SH_ELEMENT_TYPE *
474 702368 : SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found)
475 : {
476 702368 : uint32 hash = SH_HASH_KEY(tb, key);
477 : uint32 startelem;
478 : uint32 curelem;
479 : SH_ELEMENT_TYPE *data;
480 : uint32 insertdist;
481 :
482 : restart:
483 702369 : insertdist = 0;
484 :
485 : /*
486 : * We do the grow check even if the key is actually present, to avoid
487 : * doing the check inside the loop. This also lets us avoid having to
488 : * re-find our position in the hashtable after resizing.
489 : *
490 : * Note that this also reached when resizing the table due to
491 : * SH_GROW_MAX_DIB / SH_GROW_MAX_MOVE.
492 : */
493 702369 : if (unlikely(tb->members >= tb->grow_threshold))
494 : {
495 106 : if (tb->size == SH_MAX_SIZE)
496 : {
497 0 : elog(ERROR, "hash table size exceeded");
498 : }
499 :
500 : /*
501 : * When optimizing, it can be very useful to print these out.
502 : */
503 : /* SH_STAT(tb); */
504 106 : SH_GROW(tb, tb->size * 2);
505 : /* SH_STAT(tb); */
506 : }
507 :
508 : /* perform insert, start bucket search at optimal location */
509 702369 : data = tb->data;
510 702369 : startelem = SH_INITIAL_BUCKET(tb, hash);
511 702369 : curelem = startelem;
512 : while (true)
513 : {
514 : uint32 curdist;
515 : uint32 curhash;
516 : uint32 curoptimal;
517 1062276 : SH_ELEMENT_TYPE *entry = &data[curelem];
518 :
519 : /* any empty bucket can directly be used */
520 1062276 : if (entry->status == SH_STATUS_EMPTY)
521 : {
522 36555 : tb->members++;
523 36555 : entry->SH_KEY = key;
524 : #ifdef SH_STORE_HASH
525 27582 : SH_GET_HASH(tb, entry) = hash;
526 : #endif
527 36555 : entry->status = SH_STATUS_IN_USE;
528 36555 : *found = false;
529 36555 : return entry;
530 : }
531 :
532 : /*
533 : * If the bucket is not empty, we either found a match (in which case
534 : * we're done), or we have to decide whether to skip over or move the
535 : * colliding entry. When the colliding element's distance to its
536 : * optimal position is smaller than the to-be-inserted entry's, we
537 : * shift the colliding entry (and its followers) forward by one.
538 : */
539 :
540 1025721 : if (SH_COMPARE_KEYS(tb, hash, key, entry))
541 : {
542 659164 : Assert(entry->status == SH_STATUS_IN_USE);
543 659164 : *found = true;
544 659164 : return entry;
545 : }
546 :
547 366557 : curhash = SH_ENTRY_HASH(tb, entry);
548 366557 : curoptimal = SH_INITIAL_BUCKET(tb, curhash);
549 366557 : curdist = SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, curelem);
550 :
551 366557 : if (insertdist > curdist)
552 : {
553 6650 : SH_ELEMENT_TYPE *lastentry = entry;
554 6650 : uint32 emptyelem = curelem;
555 : uint32 moveelem;
556 6650 : int32 emptydist = 0;
557 :
558 : /* find next empty bucket */
559 : while (true)
560 : {
561 : SH_ELEMENT_TYPE *emptyentry;
562 :
563 36389 : emptyelem = SH_NEXT(tb, emptyelem, startelem);
564 36389 : emptyentry = &data[emptyelem];
565 :
566 36389 : if (emptyentry->status == SH_STATUS_EMPTY)
567 : {
568 6649 : lastentry = emptyentry;
569 6649 : break;
570 : }
571 :
572 : /*
573 : * To avoid negative consequences from overly imbalanced
574 : * hashtables, grow the hashtable if collisions would require
575 : * us to move a lot of entries. The most likely cause of such
576 : * imbalance is filling a (currently) small table, from a
577 : * currently big one, in hash-table order.
578 : */
579 29740 : if (++emptydist > SH_GROW_MAX_MOVE)
580 : {
581 1 : tb->grow_threshold = 0;
582 1 : goto restart;
583 : }
584 29739 : }
585 :
586 : /* shift forward, starting at last occupied element */
587 :
588 : /*
589 : * TODO: This could be optimized to be one memcpy in may cases,
590 : * excepting wrapping around at the end of ->data. Hasn't shown up
591 : * in profiles so far though.
592 : */
593 6649 : moveelem = emptyelem;
594 49536 : while (moveelem != curelem)
595 : {
596 : SH_ELEMENT_TYPE *moveentry;
597 :
598 36238 : moveelem = SH_PREV(tb, moveelem, startelem);
599 36238 : moveentry = &data[moveelem];
600 :
601 36238 : memcpy(lastentry, moveentry, sizeof(SH_ELEMENT_TYPE));
602 36238 : lastentry = moveentry;
603 : }
604 :
605 : /* and fill the now empty spot */
606 6649 : tb->members++;
607 :
608 6649 : entry->SH_KEY = key;
609 : #ifdef SH_STORE_HASH
610 3839 : SH_GET_HASH(tb, entry) = hash;
611 : #endif
612 6649 : entry->status = SH_STATUS_IN_USE;
613 6649 : *found = false;
614 6649 : return entry;
615 : }
616 :
617 359907 : curelem = SH_NEXT(tb, curelem, startelem);
618 359907 : insertdist++;
619 :
620 : /*
621 : * To avoid negative consequences from overly imbalanced hashtables,
622 : * grow the hashtable if collisions lead to large runs. The most
623 : * likely cause of such imbalance is filling a (currently) small
624 : * table, from a currently big one, in hash-table order.
625 : */
626 359907 : if (insertdist > SH_GROW_MAX_DIB)
627 : {
628 0 : tb->grow_threshold = 0;
629 0 : goto restart;
630 : }
631 359907 : }
632 : }
633 :
634 : /*
635 : * Lookup up entry in hash table. Returns NULL if key not present.
636 : */
637 : SH_SCOPE SH_ELEMENT_TYPE *
638 89038 : SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key)
639 : {
640 89038 : uint32 hash = SH_HASH_KEY(tb, key);
641 89038 : const uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
642 89038 : uint32 curelem = startelem;
643 :
644 : while (true)
645 : {
646 287571 : SH_ELEMENT_TYPE *entry = &tb->data[curelem];
647 :
648 287571 : if (entry->status == SH_STATUS_EMPTY)
649 : {
650 65071 : return NULL;
651 : }
652 :
653 222500 : Assert(entry->status == SH_STATUS_IN_USE);
654 :
655 222500 : if (SH_COMPARE_KEYS(tb, hash, key, entry))
656 23967 : return entry;
657 :
658 : /*
659 : * TODO: we could stop search based on distance. If the current
660 : * buckets's distance-from-optimal is smaller than what we've skipped
661 : * already, the entry doesn't exist. Probably only do so if
662 : * SH_STORE_HASH is defined, to avoid re-computing hashes?
663 : */
664 :
665 198533 : curelem = SH_NEXT(tb, curelem, startelem);
666 198533 : }
667 : }
668 :
669 : /*
670 : * Delete entry from hash table. Returns whether to-be-deleted key was
671 : * present.
672 : */
673 : SH_SCOPE bool
674 21234 : SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key)
675 : {
676 21234 : uint32 hash = SH_HASH_KEY(tb, key);
677 21234 : uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
678 21234 : uint32 curelem = startelem;
679 :
680 : while (true)
681 : {
682 23526 : SH_ELEMENT_TYPE *entry = &tb->data[curelem];
683 :
684 23526 : if (entry->status == SH_STATUS_EMPTY)
685 18227 : return false;
686 :
687 10598 : if (entry->status == SH_STATUS_IN_USE &&
688 5299 : SH_COMPARE_KEYS(tb, hash, key, entry))
689 : {
690 3007 : SH_ELEMENT_TYPE *lastentry = entry;
691 :
692 3007 : tb->members--;
693 :
694 : /*
695 : * Backward shift following elements till either an empty element
696 : * or an element at its optimal position is encountered.
697 : *
698 : * While that sounds expensive, the average chain length is short,
699 : * and deletions would otherwise require tombstones.
700 : */
701 : while (true)
702 : {
703 : SH_ELEMENT_TYPE *curentry;
704 : uint32 curhash;
705 : uint32 curoptimal;
706 :
707 3091 : curelem = SH_NEXT(tb, curelem, startelem);
708 3091 : curentry = &tb->data[curelem];
709 :
710 3091 : if (curentry->status != SH_STATUS_IN_USE)
711 : {
712 2989 : lastentry->status = SH_STATUS_EMPTY;
713 2989 : break;
714 : }
715 :
716 102 : curhash = SH_ENTRY_HASH(tb, curentry);
717 102 : curoptimal = SH_INITIAL_BUCKET(tb, curhash);
718 :
719 : /* current is at optimal position, done */
720 102 : if (curoptimal == curelem)
721 : {
722 18 : lastentry->status = SH_STATUS_EMPTY;
723 18 : break;
724 : }
725 :
726 : /* shift */
727 84 : memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
728 :
729 84 : lastentry = curentry;
730 84 : }
731 :
732 3007 : return true;
733 : }
734 :
735 : /* TODO: return false; if distance too big */
736 :
737 2292 : curelem = SH_NEXT(tb, curelem, startelem);
738 2292 : }
739 : }
740 :
741 : /*
742 : * Initialize iterator.
743 : */
744 : SH_SCOPE void
745 885 : SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
746 : {
747 : int i;
748 885 : uint64 startelem = PG_UINT64_MAX;
749 :
750 : /*
751 : * Search for the first empty element. As deletions during iterations are
752 : * supported, we want to start/end at an element that cannot be affected
753 : * by elements being shifted.
754 : */
755 1487 : for (i = 0; i < tb->size; i++)
756 : {
757 1487 : SH_ELEMENT_TYPE *entry = &tb->data[i];
758 :
759 1487 : if (entry->status != SH_STATUS_IN_USE)
760 : {
761 885 : startelem = i;
762 885 : break;
763 : }
764 : }
765 :
766 885 : Assert(startelem < SH_MAX_SIZE);
767 :
768 : /*
769 : * Iterate backwards, that allows the current element to be deleted, even
770 : * if there are backward shifts
771 : */
772 885 : iter->cur = startelem;
773 885 : iter->end = iter->cur;
774 885 : iter->done = false;
775 885 : }
776 :
777 : /*
778 : * Initialize iterator to a specific bucket. That's really only useful for
779 : * cases where callers are partially iterating over the hashspace, and that
780 : * iteration deletes and inserts elements based on visited entries. Doing that
781 : * repeatedly could lead to an unbalanced keyspace when always starting at the
782 : * same position.
783 : */
784 : SH_SCOPE void
785 4 : SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at)
786 : {
787 : /*
788 : * Iterate backwards, that allows the current element to be deleted, even
789 : * if there are backward shifts.
790 : */
791 4 : iter->cur = at & tb->sizemask; /* ensure at is within a valid range */
792 4 : iter->end = iter->cur;
793 4 : iter->done = false;
794 4 : }
795 :
796 : /*
797 : * Iterate over all entries in the hash-table. Return the next occupied entry,
798 : * or NULL if done.
799 : *
800 : * During iteration the current entry in the hash table may be deleted,
801 : * without leading to elements being skipped or returned twice. Additionally
802 : * the rest of the table may be modified (i.e. there can be insertions or
803 : * deletions), but if so, there's neither a guarantee that all nodes are
804 : * visited at least once, nor a guarantee that a node is visited at most once.
805 : */
806 : SH_SCOPE SH_ELEMENT_TYPE *
807 32245 : SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
808 : {
809 289874 : while (!iter->done)
810 : {
811 : SH_ELEMENT_TYPE *elem;
812 :
813 256759 : elem = &tb->data[iter->cur];
814 :
815 : /* next element in backward direction */
816 256759 : iter->cur = (iter->cur - 1) & tb->sizemask;
817 :
818 256759 : if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
819 870 : iter->done = true;
820 256759 : if (elem->status == SH_STATUS_IN_USE)
821 : {
822 31375 : return elem;
823 : }
824 : }
825 :
826 870 : return NULL;
827 : }
828 :
829 : /*
830 : * Report some statistics about the state of the hashtable. For
831 : * debugging/profiling purposes only.
832 : */
833 : SH_SCOPE void
834 0 : SH_STAT(SH_TYPE * tb)
835 : {
836 0 : uint32 max_chain_length = 0;
837 0 : uint32 total_chain_length = 0;
838 : double avg_chain_length;
839 : double fillfactor;
840 : uint32 i;
841 :
842 0 : uint32 *collisions = palloc0(tb->size * sizeof(uint32));
843 0 : uint32 total_collisions = 0;
844 0 : uint32 max_collisions = 0;
845 : double avg_collisions;
846 :
847 0 : for (i = 0; i < tb->size; i++)
848 : {
849 : uint32 hash;
850 : uint32 optimal;
851 : uint32 dist;
852 : SH_ELEMENT_TYPE *elem;
853 :
854 0 : elem = &tb->data[i];
855 :
856 0 : if (elem->status != SH_STATUS_IN_USE)
857 0 : continue;
858 :
859 0 : hash = SH_ENTRY_HASH(tb, elem);
860 0 : optimal = SH_INITIAL_BUCKET(tb, hash);
861 0 : dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
862 :
863 0 : if (dist > max_chain_length)
864 0 : max_chain_length = dist;
865 0 : total_chain_length += dist;
866 :
867 0 : collisions[optimal]++;
868 : }
869 :
870 0 : for (i = 0; i < tb->size; i++)
871 : {
872 0 : uint32 curcoll = collisions[i];
873 :
874 0 : if (curcoll == 0)
875 0 : continue;
876 :
877 : /* single contained element is not a collision */
878 0 : curcoll--;
879 0 : total_collisions += curcoll;
880 0 : if (curcoll > max_collisions)
881 0 : max_collisions = curcoll;
882 : }
883 :
884 0 : if (tb->members > 0)
885 : {
886 0 : fillfactor = tb->members / ((double) tb->size);
887 0 : avg_chain_length = ((double) total_chain_length) / tb->members;
888 0 : avg_collisions = ((double) total_collisions) / tb->members;
889 : }
890 : else
891 : {
892 0 : fillfactor = 0;
893 0 : avg_chain_length = 0;
894 0 : avg_collisions = 0;
895 : }
896 :
897 0 : elog(LOG, "size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, total_collisions: %u, max_collisions: %i, avg_collisions: %f",
898 : tb->size, tb->members, fillfactor, total_chain_length, max_chain_length, avg_chain_length,
899 : total_collisions, max_collisions, avg_collisions);
900 0 : }
901 :
902 : #endif /* SH_DEFINE */
903 :
904 :
905 : /* undefine external parameters, so next hash table can be defined */
906 : #undef SH_PREFIX
907 : #undef SH_KEY_TYPE
908 : #undef SH_KEY
909 : #undef SH_ELEMENT_TYPE
910 : #undef SH_HASH_KEY
911 : #undef SH_SCOPE
912 : #undef SH_DECLARE
913 : #undef SH_DEFINE
914 : #undef SH_GET_HASH
915 : #undef SH_STORE_HASH
916 : #undef SH_USE_NONDEFAULT_ALLOCATOR
917 :
918 : /* undefine locally declared macros */
919 : #undef SH_MAKE_PREFIX
920 : #undef SH_MAKE_NAME
921 : #undef SH_MAKE_NAME_
922 : #undef SH_FILLFACTOR
923 : #undef SH_MAX_FILLFACTOR
924 : #undef SH_GROW_MAX_DIB
925 : #undef SH_GROW_MAX_MOVE
926 : #undef SH_MAX_SIZE
927 :
928 : /* types */
929 : #undef SH_TYPE
930 : #undef SH_STATUS
931 : #undef SH_STATUS_EMPTY
932 : #undef SH_STATUS_IN_USE
933 : #undef SH_ITERATOR
934 :
935 : /* external function names */
936 : #undef SH_CREATE
937 : #undef SH_DESTROY
938 : #undef SH_INSERT
939 : #undef SH_DELETE
940 : #undef SH_LOOKUP
941 : #undef SH_GROW
942 : #undef SH_START_ITERATE
943 : #undef SH_START_ITERATE_AT
944 : #undef SH_ITERATE
945 : #undef SH_ALLOCATE
946 : #undef SH_FREE
947 : #undef SH_STAT
948 :
949 : /* internal function names */
950 : #undef SH_COMPUTE_PARAMETERS
951 : #undef SH_COMPARE_KEYS
952 : #undef SH_INITIAL_BUCKET
953 : #undef SH_NEXT
954 : #undef SH_PREV
955 : #undef SH_DISTANCE_FROM_OPTIMAL
956 : #undef SH_ENTRY_HASH
|