Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "access/htup_details.h"
44 : #include "nodes/bitmapset.h"
45 : #include "nodes/tidbitmap.h"
46 : #include "storage/lwlock.h"
47 : #include "utils/dsa.h"
48 :
49 : /*
50 : * The maximum number of tuples per page is not large (typically 256 with
51 : * 8K pages, or 1024 with 32K pages). So there's not much point in making
52 : * the per-page bitmaps variable size. We just legislate that the size
53 : * is this:
54 : */
55 : #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
56 :
57 : /*
58 : * When we have to switch over to lossy storage, we use a data structure
59 : * with one bit per page, where all pages having the same number DIV
60 : * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
61 : * and has the bit set for a given page, there must not be a per-page entry
62 : * for that page in the page table.
63 : *
64 : * We actually store both exact pages and lossy chunks in the same hash
65 : * table, using identical data structures. (This is because the memory
66 : * management for hashtables doesn't easily/efficiently allow space to be
67 : * transferred easily from one hashtable to another.) Therefore it's best
68 : * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
69 : * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
70 : * avoid expensive integer remainder operations. So, define it like this:
71 : */
72 : #define PAGES_PER_CHUNK (BLCKSZ / 32)
73 :
74 : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
75 :
76 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
77 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
78 :
79 : /* number of active words for an exact page: */
80 : #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
81 : /* number of active words for a lossy chunk: */
82 : #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
83 :
84 : /*
85 : * The hashtable entries are represented by this data structure. For
86 : * an exact page, blockno is the page number and bit k of the bitmap
87 : * represents tuple offset k+1. For a lossy chunk, blockno is the first
88 : * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
89 : * bit k represents page blockno+k. Note that it is not possible to
90 : * have exact storage for the first page of a chunk if we are using
91 : * lossy storage for any page in the chunk's range, since the same
92 : * hashtable entry has to serve both purposes.
93 : *
94 : * recheck is used only on exact pages --- it indicates that although
95 : * only the stated tuples need be checked, the full index qual condition
96 : * must be checked for each (ie, these are candidate matches).
97 : */
98 : typedef struct PagetableEntry
99 : {
100 : BlockNumber blockno; /* page number (hashtable key) */
101 : char status; /* hash entry status */
102 : bool ischunk; /* T = lossy storage, F = exact */
103 : bool recheck; /* should the tuples be rechecked? */
104 : bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
105 : } PagetableEntry;
106 :
107 : /*
108 : * Holds array of pagetable entries.
109 : */
110 : typedef struct PTEntryArray
111 : {
112 : pg_atomic_uint32 refcount; /* no. of iterator attached */
113 : PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
114 : } PTEntryArray;
115 :
116 : /*
117 : * We want to avoid the overhead of creating the hashtable, which is
118 : * comparatively large, when not necessary. Particularly when we are using a
119 : * bitmap scan on the inside of a nestloop join: a bitmap may well live only
120 : * long enough to accumulate one entry in such cases. We therefore avoid
121 : * creating an actual hashtable until we need two pagetable entries. When
122 : * just one pagetable entry is needed, we store it in a fixed field of
123 : * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
124 : * shrinks down to zero or one page again. So, status can be TBM_HASH even
125 : * when nentries is zero or one.)
126 : */
127 : typedef enum
128 : {
129 : TBM_EMPTY, /* no hashtable, nentries == 0 */
130 : TBM_ONE_PAGE, /* entry1 contains the single entry */
131 : TBM_HASH /* pagetable is valid, entry1 is not */
132 : } TBMStatus;
133 :
134 : /*
135 : * Current iterating state of the TBM.
136 : */
137 : typedef enum
138 : {
139 : TBM_NOT_ITERATING, /* not yet converted to page and chunk array */
140 : TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */
141 : TBM_ITERATING_SHARED /* converted to shared page and chunk array */
142 : } TBMIteratingState;
143 :
144 : /*
145 : * Here is the representation for a whole TIDBitMap:
146 : */
147 : struct TIDBitmap
148 : {
149 : NodeTag type; /* to make it a valid Node */
150 : MemoryContext mcxt; /* memory context containing me */
151 : TBMStatus status; /* see codes above */
152 : struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
153 : int nentries; /* number of entries in pagetable */
154 : int maxentries; /* limit on same to meet maxbytes */
155 : int npages; /* number of exact entries in pagetable */
156 : int nchunks; /* number of lossy entries in pagetable */
157 : TBMIteratingState iterating; /* tbm_begin_iterate called? */
158 : uint32 lossify_start; /* offset to start lossifying hashtable at */
159 : PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
160 : /* these are valid when iterating is true: */
161 : PagetableEntry **spages; /* sorted exact-page list, or NULL */
162 : PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
163 : dsa_pointer dsapagetable; /* dsa_pointer to the element array */
164 : dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
165 : dsa_pointer ptpages; /* dsa_pointer to the page array */
166 : dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
167 : dsa_area *dsa; /* reference to per-query dsa area */
168 : };
169 :
170 : /*
171 : * When iterating over a bitmap in sorted order, a TBMIterator is used to
172 : * track our progress. There can be several iterators scanning the same
173 : * bitmap concurrently. Note that the bitmap becomes read-only as soon as
174 : * any iterator is created.
175 : */
176 : struct TBMIterator
177 : {
178 : TIDBitmap *tbm; /* TIDBitmap we're iterating over */
179 : int spageptr; /* next spages index */
180 : int schunkptr; /* next schunks index */
181 : int schunkbit; /* next bit to check in current schunk */
182 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
183 : };
184 :
185 : /*
186 : * Holds the shared members of the iterator so that multiple processes
187 : * can jointly iterate.
188 : */
189 : typedef struct TBMSharedIteratorState
190 : {
191 : int nentries; /* number of entries in pagetable */
192 : int maxentries; /* limit on same to meet maxbytes */
193 : int npages; /* number of exact entries in pagetable */
194 : int nchunks; /* number of lossy entries in pagetable */
195 : dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
196 : dsa_pointer spages; /* dsa pointer to page array */
197 : dsa_pointer schunks; /* dsa pointer to chunk array */
198 : LWLock lock; /* lock to protect below members */
199 : int spageptr; /* next spages index */
200 : int schunkptr; /* next schunks index */
201 : int schunkbit; /* next bit to check in current schunk */
202 : } TBMSharedIteratorState;
203 :
204 : /*
205 : * pagetable iteration array.
206 : */
207 : typedef struct PTIterationArray
208 : {
209 : pg_atomic_uint32 refcount; /* no. of iterator attached */
210 : int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
211 : } PTIterationArray;
212 :
213 : /*
214 : * same as TBMIterator, but it is used for joint iteration, therefore this
215 : * also holds a reference to the shared state.
216 : */
217 : struct TBMSharedIterator
218 : {
219 : TBMSharedIteratorState *state; /* shared state */
220 : PTEntryArray *ptbase; /* pagetable element array */
221 : PTIterationArray *ptpages; /* sorted exact page index list */
222 : PTIterationArray *ptchunks; /* sorted lossy page index list */
223 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
224 : };
225 :
226 : /* Local function prototypes */
227 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
228 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
229 : const TIDBitmap *b);
230 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
231 : BlockNumber pageno);
232 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
233 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
234 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
235 : static void tbm_lossify(TIDBitmap *tbm);
236 : static int tbm_comparator(const void *left, const void *right);
237 : static int tbm_shared_comparator(const void *left, const void *right,
238 : void *arg);
239 :
240 : /*
241 : * Simple inline murmur hash implementation for the exact width required, for
242 : * performance.
243 : */
244 : static inline uint32
245 716303 : hash_blockno(BlockNumber b)
246 : {
247 716303 : uint32 h = b;
248 :
249 716303 : h ^= h >> 16;
250 716303 : h *= 0x85ebca6b;
251 716303 : h ^= h >> 13;
252 716303 : h *= 0xc2b2ae35;
253 716303 : h ^= h >> 16;
254 716303 : return h;
255 : }
256 :
257 : /* define hashtable mapping block numbers to PagetableEntry's */
258 : #define SH_USE_NONDEFAULT_ALLOCATOR
259 : #define SH_PREFIX pagetable
260 : #define SH_ELEMENT_TYPE PagetableEntry
261 : #define SH_KEY_TYPE BlockNumber
262 : #define SH_KEY blockno
263 : #define SH_HASH_KEY(tb, key) hash_blockno(key)
264 : #define SH_EQUAL(tb, a, b) a == b
265 : #define SH_SCOPE static inline
266 : #define SH_DEFINE
267 : #define SH_DECLARE
268 : #include "lib/simplehash.h"
269 :
270 :
271 : /*
272 : * tbm_create - create an initially-empty bitmap
273 : *
274 : * The bitmap will live in the memory context that is CurrentMemoryContext
275 : * at the time of this call. It will be limited to (approximately) maxbytes
276 : * total memory consumption. If the DSA passed to this function is not NULL
277 : * then the memory for storing elements of the underlying page table will
278 : * be allocated from the DSA.
279 : */
280 : TIDBitmap *
281 1110 : tbm_create(long maxbytes, dsa_area *dsa)
282 : {
283 : TIDBitmap *tbm;
284 : long nbuckets;
285 :
286 : /* Create the TIDBitmap struct and zero all its fields */
287 1110 : tbm = makeNode(TIDBitmap);
288 :
289 1110 : tbm->mcxt = CurrentMemoryContext;
290 1110 : tbm->status = TBM_EMPTY;
291 :
292 : /*
293 : * Estimate number of hashtable entries we can have within maxbytes. This
294 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
295 : * for our purpose. Also count an extra Pointer per entry for the arrays
296 : * created during iteration readout.
297 : */
298 1110 : nbuckets = maxbytes /
299 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
300 1110 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
301 1110 : nbuckets = Max(nbuckets, 16); /* sanity limit */
302 1110 : tbm->maxentries = (int) nbuckets;
303 1110 : tbm->lossify_start = 0;
304 1110 : tbm->dsa = dsa;
305 1110 : tbm->dsapagetable = InvalidDsaPointer;
306 1110 : tbm->dsapagetableold = InvalidDsaPointer;
307 1110 : tbm->ptpages = InvalidDsaPointer;
308 1110 : tbm->ptchunks = InvalidDsaPointer;
309 :
310 1110 : return tbm;
311 : }
312 :
313 : /*
314 : * Actually create the hashtable. Since this is a moderately expensive
315 : * proposition, we don't do it until we have to.
316 : */
317 : static void
318 408 : tbm_create_pagetable(TIDBitmap *tbm)
319 : {
320 408 : Assert(tbm->status != TBM_HASH);
321 408 : Assert(tbm->pagetable == NULL);
322 :
323 408 : tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
324 :
325 : /* If entry1 is valid, push it into the hashtable */
326 408 : if (tbm->status == TBM_ONE_PAGE)
327 : {
328 : PagetableEntry *page;
329 : bool found;
330 : char oldstatus;
331 :
332 160 : page = pagetable_insert(tbm->pagetable,
333 : tbm->entry1.blockno,
334 : &found);
335 160 : Assert(!found);
336 160 : oldstatus = page->status;
337 160 : memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
338 160 : page->status = oldstatus;
339 : }
340 :
341 408 : tbm->status = TBM_HASH;
342 408 : }
343 :
344 : /*
345 : * tbm_free - free a TIDBitmap
346 : */
347 : void
348 1106 : tbm_free(TIDBitmap *tbm)
349 : {
350 1106 : if (tbm->pagetable)
351 408 : pagetable_destroy(tbm->pagetable);
352 1106 : if (tbm->spages)
353 147 : pfree(tbm->spages);
354 1106 : if (tbm->schunks)
355 250 : pfree(tbm->schunks);
356 1106 : pfree(tbm);
357 1106 : }
358 :
359 : /*
360 : * tbm_free_shared_area - free shared state
361 : *
362 : * Free shared iterator state, Also free shared pagetable and iterator arrays
363 : * memory if they are not referred by any of the shared iterator i.e recount
364 : * is becomes 0.
365 : */
366 : void
367 18 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
368 : {
369 18 : TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
370 : PTEntryArray *ptbase;
371 : PTIterationArray *ptpages;
372 : PTIterationArray *ptchunks;
373 :
374 18 : if (DsaPointerIsValid(istate->pagetable))
375 : {
376 18 : ptbase = dsa_get_address(dsa, istate->pagetable);
377 18 : if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
378 9 : dsa_free(dsa, istate->pagetable);
379 : }
380 18 : if (DsaPointerIsValid(istate->spages))
381 : {
382 18 : ptpages = dsa_get_address(dsa, istate->spages);
383 18 : if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
384 9 : dsa_free(dsa, istate->spages);
385 : }
386 18 : if (DsaPointerIsValid(istate->schunks))
387 : {
388 0 : ptchunks = dsa_get_address(dsa, istate->schunks);
389 0 : if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
390 0 : dsa_free(dsa, istate->schunks);
391 : }
392 :
393 18 : dsa_free(dsa, dp);
394 18 : }
395 :
396 : /*
397 : * tbm_add_tuples - add some tuple IDs to a TIDBitmap
398 : *
399 : * If recheck is true, then the recheck flag will be set in the
400 : * TBMIterateResult when any of these tuples are reported out.
401 : */
402 : void
403 393465 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
404 : bool recheck)
405 : {
406 393465 : BlockNumber currblk = InvalidBlockNumber;
407 393465 : PagetableEntry *page = NULL; /* only valid when currblk is valid */
408 : int i;
409 :
410 393465 : Assert(tbm->iterating == TBM_NOT_ITERATING);
411 831786 : for (i = 0; i < ntids; i++)
412 : {
413 438321 : BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
414 438321 : OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
415 : int wordnum,
416 : bitnum;
417 :
418 : /* safety check to ensure we don't overrun bit array bounds */
419 438321 : if (off < 1 || off > MAX_TUPLES_PER_PAGE)
420 0 : elog(ERROR, "tuple offset out of range: %u", off);
421 :
422 : /*
423 : * Look up target page unless we already did. This saves cycles when
424 : * the input includes consecutive tuples on the same page, which is
425 : * common enough to justify an extra test here.
426 : */
427 438321 : if (blk != currblk)
428 : {
429 412944 : if (tbm_page_is_lossy(tbm, blk))
430 553 : page = NULL; /* remember page is lossy */
431 : else
432 412391 : page = tbm_get_pageentry(tbm, blk);
433 412944 : currblk = blk;
434 : }
435 :
436 438321 : if (page == NULL)
437 553 : continue; /* whole page is already marked */
438 :
439 437768 : if (page->ischunk)
440 : {
441 : /* The page is a lossy chunk header, set bit for itself */
442 0 : wordnum = bitnum = 0;
443 : }
444 : else
445 : {
446 : /* Page is exact, so set bit for individual tuple */
447 437768 : wordnum = WORDNUM(off - 1);
448 437768 : bitnum = BITNUM(off - 1);
449 : }
450 437768 : page->words[wordnum] |= ((bitmapword) 1 << bitnum);
451 437768 : page->recheck |= recheck;
452 :
453 437768 : if (tbm->nentries > tbm->maxentries)
454 : {
455 4 : tbm_lossify(tbm);
456 : /* Page could have been converted to lossy, so force new lookup */
457 4 : currblk = InvalidBlockNumber;
458 : }
459 : }
460 393465 : }
461 :
462 : /*
463 : * tbm_add_page - add a whole page to a TIDBitmap
464 : *
465 : * This causes the whole page to be reported (with the recheck flag)
466 : * when the TIDBitmap is scanned.
467 : */
468 : void
469 18449 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
470 : {
471 : /* Enter the page in the bitmap, or mark it lossy if already present */
472 18449 : tbm_mark_page_lossy(tbm, pageno);
473 : /* If we went over the memory limit, lossify some more pages */
474 18449 : if (tbm->nentries > tbm->maxentries)
475 0 : tbm_lossify(tbm);
476 18449 : }
477 :
478 : /*
479 : * tbm_union - set union
480 : *
481 : * a is modified in-place, b is not changed
482 : */
483 : void
484 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
485 : {
486 0 : Assert(!a->iterating);
487 : /* Nothing to do if b is empty */
488 0 : if (b->nentries == 0)
489 0 : return;
490 : /* Scan through chunks and pages in b, merge into a */
491 0 : if (b->status == TBM_ONE_PAGE)
492 0 : tbm_union_page(a, &b->entry1);
493 : else
494 : {
495 : pagetable_iterator i;
496 : PagetableEntry *bpage;
497 :
498 0 : Assert(b->status == TBM_HASH);
499 0 : pagetable_start_iterate(b->pagetable, &i);
500 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
501 0 : tbm_union_page(a, bpage);
502 : }
503 : }
504 :
505 : /* Process one page of b during a union op */
506 : static void
507 0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
508 : {
509 : PagetableEntry *apage;
510 : int wordnum;
511 :
512 0 : if (bpage->ischunk)
513 : {
514 : /* Scan b's chunk, mark each indicated page lossy in a */
515 0 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
516 : {
517 0 : bitmapword w = bpage->words[wordnum];
518 :
519 0 : if (w != 0)
520 : {
521 : BlockNumber pg;
522 :
523 0 : pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
524 0 : while (w != 0)
525 : {
526 0 : if (w & 1)
527 0 : tbm_mark_page_lossy(a, pg);
528 0 : pg++;
529 0 : w >>= 1;
530 : }
531 : }
532 : }
533 : }
534 0 : else if (tbm_page_is_lossy(a, bpage->blockno))
535 : {
536 : /* page is already lossy in a, nothing to do */
537 0 : return;
538 : }
539 : else
540 : {
541 0 : apage = tbm_get_pageentry(a, bpage->blockno);
542 0 : if (apage->ischunk)
543 : {
544 : /* The page is a lossy chunk header, set bit for itself */
545 0 : apage->words[0] |= ((bitmapword) 1 << 0);
546 : }
547 : else
548 : {
549 : /* Both pages are exact, merge at the bit level */
550 0 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
551 0 : apage->words[wordnum] |= bpage->words[wordnum];
552 0 : apage->recheck |= bpage->recheck;
553 : }
554 : }
555 :
556 0 : if (a->nentries > a->maxentries)
557 0 : tbm_lossify(a);
558 : }
559 :
560 : /*
561 : * tbm_intersect - set intersection
562 : *
563 : * a is modified in-place, b is not changed
564 : */
565 : void
566 2 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
567 : {
568 2 : Assert(!a->iterating);
569 : /* Nothing to do if a is empty */
570 2 : if (a->nentries == 0)
571 2 : return;
572 : /* Scan through chunks and pages in a, try to match to b */
573 2 : if (a->status == TBM_ONE_PAGE)
574 : {
575 0 : if (tbm_intersect_page(a, &a->entry1, b))
576 : {
577 : /* Page is now empty, remove it from a */
578 0 : Assert(!a->entry1.ischunk);
579 0 : a->npages--;
580 0 : a->nentries--;
581 0 : Assert(a->nentries == 0);
582 0 : a->status = TBM_EMPTY;
583 : }
584 : }
585 : else
586 : {
587 : pagetable_iterator i;
588 : PagetableEntry *apage;
589 :
590 2 : Assert(a->status == TBM_HASH);
591 2 : pagetable_start_iterate(a->pagetable, &i);
592 2 : while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
593 : {
594 692 : if (tbm_intersect_page(a, apage, b))
595 : {
596 : /* Page or chunk is now empty, remove it from a */
597 663 : if (apage->ischunk)
598 0 : a->nchunks--;
599 : else
600 663 : a->npages--;
601 663 : a->nentries--;
602 663 : if (!pagetable_delete(a->pagetable, apage->blockno))
603 0 : elog(ERROR, "hash table corrupted");
604 : }
605 : }
606 : }
607 : }
608 :
609 : /*
610 : * Process one page of a during an intersection op
611 : *
612 : * Returns TRUE if apage is now empty and should be deleted from a
613 : */
614 : static bool
615 692 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
616 : {
617 : const PagetableEntry *bpage;
618 : int wordnum;
619 :
620 692 : if (apage->ischunk)
621 : {
622 : /* Scan each bit in chunk, try to clear */
623 5 : bool candelete = true;
624 :
625 45 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
626 : {
627 40 : bitmapword w = apage->words[wordnum];
628 :
629 40 : if (w != 0)
630 : {
631 39 : bitmapword neww = w;
632 : BlockNumber pg;
633 : int bitnum;
634 :
635 39 : pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
636 39 : bitnum = 0;
637 1283 : while (w != 0)
638 : {
639 1205 : if (w & 1)
640 : {
641 629 : if (!tbm_page_is_lossy(b, pg) &&
642 38 : tbm_find_pageentry(b, pg) == NULL)
643 : {
644 : /* Page is not in b at all, lose lossy bit */
645 0 : neww &= ~((bitmapword) 1 << bitnum);
646 : }
647 : }
648 1205 : pg++;
649 1205 : bitnum++;
650 1205 : w >>= 1;
651 : }
652 39 : apage->words[wordnum] = neww;
653 39 : if (neww != 0)
654 39 : candelete = false;
655 : }
656 : }
657 5 : return candelete;
658 : }
659 687 : else if (tbm_page_is_lossy(b, apage->blockno))
660 : {
661 : /*
662 : * Some of the tuples in 'a' might not satisfy the quals for 'b', but
663 : * because the page 'b' is lossy, we don't know which ones. Therefore
664 : * we mark 'a' as requiring rechecks, to indicate that at most those
665 : * tuples set in 'a' are matches.
666 : */
667 0 : apage->recheck = true;
668 0 : return false;
669 : }
670 : else
671 : {
672 687 : bool candelete = true;
673 :
674 687 : bpage = tbm_find_pageentry(b, apage->blockno);
675 687 : if (bpage != NULL)
676 : {
677 : /* Both pages are exact, merge at the bit level */
678 610 : Assert(!bpage->ischunk);
679 6710 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
680 : {
681 6100 : apage->words[wordnum] &= bpage->words[wordnum];
682 6100 : if (apage->words[wordnum] != 0)
683 24 : candelete = false;
684 : }
685 610 : apage->recheck |= bpage->recheck;
686 : }
687 : /* If there is no matching b page, we can just delete the a page */
688 687 : return candelete;
689 : }
690 : }
691 :
692 : /*
693 : * tbm_is_empty - is a TIDBitmap completely empty?
694 : */
695 : bool
696 11 : tbm_is_empty(const TIDBitmap *tbm)
697 : {
698 11 : return (tbm->nentries == 0);
699 : }
700 :
701 : /*
702 : * tbm_begin_iterate - prepare to iterate through a TIDBitmap
703 : *
704 : * The TBMIterator struct is created in the caller's memory context.
705 : * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
706 : * okay to just allow the memory context to be released, too. It is caller's
707 : * responsibility not to touch the TBMIterator anymore once the TIDBitmap
708 : * is freed.
709 : *
710 : * NB: after this is called, it is no longer allowed to modify the contents
711 : * of the bitmap. However, you can call this multiple times to scan the
712 : * contents repeatedly, including parallel scans.
713 : */
714 : TBMIterator *
715 2187 : tbm_begin_iterate(TIDBitmap *tbm)
716 : {
717 : TBMIterator *iterator;
718 :
719 2187 : Assert(tbm->iterating != TBM_ITERATING_SHARED);
720 :
721 : /*
722 : * Create the TBMIterator struct, with enough trailing space to serve the
723 : * needs of the TBMIterateResult sub-struct.
724 : */
725 2187 : iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
726 : MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
727 2187 : iterator->tbm = tbm;
728 :
729 : /*
730 : * Initialize iteration pointers.
731 : */
732 2187 : iterator->spageptr = 0;
733 2187 : iterator->schunkptr = 0;
734 2187 : iterator->schunkbit = 0;
735 :
736 : /*
737 : * If we have a hashtable, create and fill the sorted page lists, unless
738 : * we already did that for a previous iterator. Note that the lists are
739 : * attached to the bitmap not the iterator, so they can be used by more
740 : * than one iterator.
741 : */
742 2187 : if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
743 : {
744 : pagetable_iterator i;
745 : PagetableEntry *page;
746 : int npages;
747 : int nchunks;
748 :
749 395 : if (!tbm->spages && tbm->npages > 0)
750 147 : tbm->spages = (PagetableEntry **)
751 147 : MemoryContextAlloc(tbm->mcxt,
752 147 : tbm->npages * sizeof(PagetableEntry *));
753 395 : if (!tbm->schunks && tbm->nchunks > 0)
754 250 : tbm->schunks = (PagetableEntry **)
755 250 : MemoryContextAlloc(tbm->mcxt,
756 250 : tbm->nchunks * sizeof(PagetableEntry *));
757 :
758 395 : npages = nchunks = 0;
759 395 : pagetable_start_iterate(tbm->pagetable, &i);
760 4790 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
761 : {
762 4000 : if (page->ischunk)
763 258 : tbm->schunks[nchunks++] = page;
764 : else
765 3742 : tbm->spages[npages++] = page;
766 : }
767 395 : Assert(npages == tbm->npages);
768 395 : Assert(nchunks == tbm->nchunks);
769 395 : if (npages > 1)
770 147 : qsort(tbm->spages, npages, sizeof(PagetableEntry *),
771 : tbm_comparator);
772 395 : if (nchunks > 1)
773 2 : qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
774 : tbm_comparator);
775 : }
776 :
777 2187 : tbm->iterating = TBM_ITERATING_PRIVATE;
778 :
779 2187 : return iterator;
780 : }
781 :
782 : /*
783 : * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
784 : *
785 : * The necessary shared state will be allocated from the DSA passed to
786 : * tbm_create, so that multiple processes can attach to it and iterate jointly.
787 : *
788 : * This will convert the pagetable hash into page and chunk array of the index
789 : * into pagetable array.
790 : */
791 : dsa_pointer
792 22 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
793 : {
794 : dsa_pointer dp;
795 : TBMSharedIteratorState *istate;
796 22 : PTEntryArray *ptbase = NULL;
797 22 : PTIterationArray *ptpages = NULL;
798 22 : PTIterationArray *ptchunks = NULL;
799 :
800 22 : Assert(tbm->dsa != NULL);
801 22 : Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
802 :
803 : /*
804 : * Allocate TBMSharedIteratorState from DSA to hold the shared members and
805 : * lock, this will also be used by multiple worker for shared iterate.
806 : */
807 22 : dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
808 22 : istate = dsa_get_address(tbm->dsa, dp);
809 :
810 : /*
811 : * If we're not already iterating, create and fill the sorted page lists.
812 : * (If we are, the sorted page lists are already stored in the TIDBitmap,
813 : * and we can just reuse them.)
814 : */
815 22 : if (tbm->iterating == TBM_NOT_ITERATING)
816 : {
817 : pagetable_iterator i;
818 : PagetableEntry *page;
819 : int idx;
820 : int npages;
821 : int nchunks;
822 :
823 : /*
824 : * Allocate the page and chunk array memory from the DSA to share
825 : * across multiple processes.
826 : */
827 11 : if (tbm->npages)
828 : {
829 11 : tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
830 : tbm->npages * sizeof(int));
831 11 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
832 11 : pg_atomic_init_u32(&ptpages->refcount, 0);
833 : }
834 11 : if (tbm->nchunks)
835 : {
836 1 : tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
837 : tbm->nchunks * sizeof(int));
838 1 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
839 1 : pg_atomic_init_u32(&ptchunks->refcount, 0);
840 : }
841 :
842 : /*
843 : * If TBM status is TBM_HASH then iterate over the pagetable and
844 : * convert it to page and chunk arrays. But if it's in the
845 : * TBM_ONE_PAGE mode then directly allocate the space for one entry
846 : * from the DSA.
847 : */
848 11 : npages = nchunks = 0;
849 11 : if (tbm->status == TBM_HASH)
850 : {
851 11 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
852 :
853 11 : pagetable_start_iterate(tbm->pagetable, &i);
854 4121 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
855 : {
856 4099 : idx = page - ptbase->ptentry;
857 4099 : if (page->ischunk)
858 5 : ptchunks->index[nchunks++] = idx;
859 : else
860 4094 : ptpages->index[npages++] = idx;
861 : }
862 :
863 11 : Assert(npages == tbm->npages);
864 11 : Assert(nchunks == tbm->nchunks);
865 : }
866 0 : else if (tbm->status == TBM_ONE_PAGE)
867 : {
868 : /*
869 : * In one page mode allocate the space for one pagetable entry,
870 : * initialize it, and directly store its index (i.e. 0) in the
871 : * page array.
872 : */
873 0 : tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
874 : sizeof(PagetableEntry));
875 0 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876 0 : memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
877 0 : ptpages->index[0] = 0;
878 : }
879 :
880 11 : if (ptbase != NULL)
881 11 : pg_atomic_init_u32(&ptbase->refcount, 0);
882 11 : if (npages > 1)
883 11 : qsort_arg((void *) (ptpages->index), npages, sizeof(int),
884 11 : tbm_shared_comparator, (void *) ptbase->ptentry);
885 11 : if (nchunks > 1)
886 1 : qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
887 1 : tbm_shared_comparator, (void *) ptbase->ptentry);
888 : }
889 :
890 : /*
891 : * Store the TBM members in the shared state so that we can share them
892 : * across multiple processes.
893 : */
894 22 : istate->nentries = tbm->nentries;
895 22 : istate->maxentries = tbm->maxentries;
896 22 : istate->npages = tbm->npages;
897 22 : istate->nchunks = tbm->nchunks;
898 22 : istate->pagetable = tbm->dsapagetable;
899 22 : istate->spages = tbm->ptpages;
900 22 : istate->schunks = tbm->ptchunks;
901 :
902 22 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
903 22 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
904 22 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
905 :
906 : /*
907 : * For every shared iterator, referring to pagetable and iterator array,
908 : * increase the refcount by 1 so that while freeing the shared iterator we
909 : * don't free pagetable and iterator array until its refcount becomes 0.
910 : */
911 22 : if (ptbase != NULL)
912 22 : pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
913 22 : if (ptpages != NULL)
914 22 : pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
915 22 : if (ptchunks != NULL)
916 2 : pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
917 :
918 : /* Initialize the iterator lock */
919 22 : LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
920 :
921 : /* Initialize the shared iterator state */
922 22 : istate->schunkbit = 0;
923 22 : istate->schunkptr = 0;
924 22 : istate->spageptr = 0;
925 :
926 22 : tbm->iterating = TBM_ITERATING_SHARED;
927 :
928 22 : return dp;
929 : }
930 :
931 : /*
932 : * tbm_extract_page_tuple - extract the tuple offsets from a page
933 : *
934 : * The extracted offsets are stored into TBMIterateResult.
935 : */
936 : static inline int
937 16190 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
938 : {
939 : int wordnum;
940 16190 : int ntuples = 0;
941 :
942 178090 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
943 : {
944 161900 : bitmapword w = page->words[wordnum];
945 :
946 161900 : if (w != 0)
947 : {
948 31638 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
949 :
950 892645 : while (w != 0)
951 : {
952 829369 : if (w & 1)
953 678952 : output->offsets[ntuples++] = (OffsetNumber) off;
954 829369 : off++;
955 829369 : w >>= 1;
956 : }
957 : }
958 : }
959 :
960 16190 : return ntuples;
961 : }
962 :
963 : /*
964 : * tbm_advance_schunkbit - Advance the chunkbit
965 : */
966 : static inline void
967 43301 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
968 : {
969 43301 : int schunkbit = *schunkbitp;
970 :
971 177649 : while (schunkbit < PAGES_PER_CHUNK)
972 : {
973 133835 : int wordnum = WORDNUM(schunkbit);
974 133835 : int bitnum = BITNUM(schunkbit);
975 :
976 133835 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
977 42788 : break;
978 91047 : schunkbit++;
979 : }
980 :
981 43301 : *schunkbitp = schunkbit;
982 43301 : }
983 :
984 : /*
985 : * tbm_iterate - scan through next page of a TIDBitmap
986 : *
987 : * Returns a TBMIterateResult representing one page, or NULL if there are
988 : * no more pages to scan. Pages are guaranteed to be delivered in numerical
989 : * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
990 : * remember the exact tuples to look at on this page --- the caller must
991 : * examine all tuples on the page and check if they meet the intended
992 : * condition. If result->recheck is true, only the indicated tuples need
993 : * be examined, but the condition must be rechecked anyway. (For ease of
994 : * testing, recheck is always set true when ntuples < 0.)
995 : */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	/* This entry point is only for private (non-shared) iteration */
	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* Found a set bit in this chunk; remember position */
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		/* Block number represented by the current bit of this chunk */
		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
1069 :
1070 : /*
1071 : * tbm_shared_iterate - scan through next page of a TIDBitmap
1072 : *
1073 : * As above, but this will iterate using an iterator which is shared
1074 : * across multiple processes. We need to acquire the iterator LWLock,
1075 : * before accessing the shared members.
1076 : */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/*
	 * These pointers were converted to backend-local addresses in
	 * tbm_attach_shared_iterate, so they can be read without the lock.
	 */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* Found a set bit in this chunk; remember position */
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		/* Block number represented by the current bit of this chunk */
		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			/* Release the lock before returning on every path */
			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1163 :
1164 : /*
1165 : * tbm_end_iterate - finish an iteration over a TIDBitmap
1166 : *
1167 : * Currently this is just a pfree, but it might do more someday. (For
1168 : * instance, it could be useful to count open iterators and allow the
1169 : * bitmap to return to read/write status when there are no more iterators.)
1170 : */
void
tbm_end_iterate(TBMIterator *iterator)
{
	/* The iterator (including its inline TBMIterateResult) is one chunk */
	pfree(iterator);
}
1176 :
1177 : /*
1178 : * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1179 : *
1180 : * This doesn't free any of the shared state associated with the iterator,
1181 : * just our backend-private state.
1182 : */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	/* Only the backend-local struct is released; shared DSA state remains */
	pfree(iterator);
}
1188 :
1189 : /*
1190 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1191 : *
1192 : * Returns NULL if there is no non-lossy entry for the pageno.
1193 : */
1194 : static const PagetableEntry *
1195 725 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1196 : {
1197 : const PagetableEntry *page;
1198 :
1199 725 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1200 0 : return NULL;
1201 :
1202 725 : if (tbm->status == TBM_ONE_PAGE)
1203 : {
1204 0 : page = &tbm->entry1;
1205 0 : if (page->blockno != pageno)
1206 0 : return NULL;
1207 0 : Assert(!page->ischunk);
1208 0 : return page;
1209 : }
1210 :
1211 725 : page = pagetable_lookup(tbm->pagetable, pageno);
1212 725 : if (page == NULL)
1213 77 : return NULL;
1214 648 : if (page->ischunk)
1215 0 : return NULL; /* don't want a lossy chunk header */
1216 648 : return page;
1217 : }
1218 :
1219 : /*
1220 : * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1221 : *
1222 : * If new, the entry is marked as an exact (non-chunk) entry.
1223 : *
1224 : * This may cause the table to exceed the desired memory size. It is
1225 : * up to the caller to call tbm_lossify() at the next safe point if so.
1226 : */
1227 : static PagetableEntry *
1228 412391 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1229 : {
1230 : PagetableEntry *page;
1231 : bool found;
1232 :
1233 412391 : if (tbm->status == TBM_EMPTY)
1234 : {
1235 : /* Use the fixed slot */
1236 473 : page = &tbm->entry1;
1237 473 : found = false;
1238 473 : tbm->status = TBM_ONE_PAGE;
1239 : }
1240 : else
1241 : {
1242 411918 : if (tbm->status == TBM_ONE_PAGE)
1243 : {
1244 2476 : page = &tbm->entry1;
1245 2476 : if (page->blockno == pageno)
1246 2316 : return page;
1247 : /* Time to switch from one page to a hashtable */
1248 160 : tbm_create_pagetable(tbm);
1249 : }
1250 :
1251 : /* Look up or create an entry */
1252 409602 : page = pagetable_insert(tbm->pagetable, pageno, &found);
1253 : }
1254 :
1255 : /* Initialize it if not present before */
1256 410075 : if (!found)
1257 : {
1258 11855 : char oldstatus = page->status;
1259 :
1260 11855 : MemSet(page, 0, sizeof(PagetableEntry));
1261 11855 : page->status = oldstatus;
1262 11855 : page->blockno = pageno;
1263 : /* must count it too */
1264 11855 : tbm->nentries++;
1265 11855 : tbm->npages++;
1266 : }
1267 :
1268 410075 : return page;
1269 : }
1270 :
1271 : /*
1272 : * tbm_page_is_lossy - is the page marked as lossily stored?
1273 : */
1274 : static bool
1275 414222 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1276 : {
1277 : PagetableEntry *page;
1278 : BlockNumber chunk_pageno;
1279 : int bitno;
1280 :
1281 : /* we can skip the lookup if there are no lossy chunks */
1282 414222 : if (tbm->nchunks == 0)
1283 406437 : return false;
1284 7785 : Assert(tbm->status == TBM_HASH);
1285 :
1286 7785 : bitno = pageno % PAGES_PER_CHUNK;
1287 7785 : chunk_pageno = pageno - bitno;
1288 :
1289 7785 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1290 :
1291 7785 : if (page != NULL && page->ischunk)
1292 : {
1293 7785 : int wordnum = WORDNUM(bitno);
1294 7785 : int bitnum = BITNUM(bitno);
1295 :
1296 7785 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1297 1106 : return true;
1298 : }
1299 6679 : return false;
1300 : }
1301 :
1302 : /*
1303 : * tbm_mark_page_lossy - mark the page number as lossily stored
1304 : *
1305 : * This may cause the table to exceed the desired memory size. It is
1306 : * up to the caller to call tbm_lossify() at the next safe point if so.
1307 : */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Split the page number into a chunk-header blockno plus a bit offset */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* Zero the entry but preserve the hashtable's status byte */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1376 :
1377 : /*
1378 : * tbm_lossify - lose some information to get back under the memory limit
1379 : */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* Resume scanning where the previous lossify round left off */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room.  Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1445 :
1446 : /*
1447 : * qsort comparator to handle PagetableEntry pointers.
1448 : */
1449 : static int
1450 22691 : tbm_comparator(const void *left, const void *right)
1451 : {
1452 22691 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1453 22691 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1454 :
1455 22691 : if (l < r)
1456 10049 : return -1;
1457 12642 : else if (l > r)
1458 12642 : return 1;
1459 0 : return 0;
1460 : }
1461 :
1462 : /*
1463 : * As above, but this will get index into PagetableEntry array. Therefore,
1464 : * it needs to get actual PagetableEntry using the index before comparing the
1465 : * blockno.
1466 : */
1467 : static int
1468 35923 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1469 : {
1470 35923 : PagetableEntry *base = (PagetableEntry *) arg;
1471 35923 : PagetableEntry *lpage = &base[*(int *) left];
1472 35923 : PagetableEntry *rpage = &base[*(int *) right];
1473 :
1474 35923 : if (lpage->blockno < rpage->blockno)
1475 16153 : return -1;
1476 19770 : else if (lpage->blockno > rpage->blockno)
1477 19770 : return 1;
1478 0 : return 0;
1479 : }
1480 :
1481 : /*
1482 : * tbm_attach_shared_iterate
1483 : *
1484 : * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
1486 : *
 * We also convert the DSA pointers to local pointers and store them into
1488 : * our private iterator.
1489 : */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	/* Convert the shared DSA pointers to backend-local addresses */
	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	/* Page/chunk index arrays exist only when those counts are nonzero */
	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}
1516 :
1517 : /*
1518 : * pagetable_allocate
1519 : *
1520 : * Callback function for allocating the memory for hashtable elements.
1521 : * Allocate memory for hashtable elements, using DSA if available.
1522 : */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
	PTEntryArray *ptbase;

	/* Without a DSA area, use plain (zeroed, possibly huge) palloc memory */
	if (tbm->dsa == NULL)
		return MemoryContextAllocExtended(pagetable->ctx, size,
										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

	/*
	 * Save the dsapagetable reference in dsapagetableold before allocating
	 * new memory so that pagetable_free can free the old entry.
	 */
	tbm->dsapagetableold = tbm->dsapagetable;
	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
											  sizeof(PTEntryArray) + size,
											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

	/* Hand back the element array that follows the PTEntryArray header */
	return ptbase->ptentry;
}
1545 :
1546 : /*
1547 : * pagetable_free
1548 : *
1549 : * Callback function for freeing hash table elements.
1550 : */
1551 : static inline void
1552 430 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1553 : {
1554 430 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1555 :
1556 : /* pfree the input pointer if DSA is not available */
1557 430 : if (tbm->dsa == NULL)
1558 406 : pfree(pointer);
1559 24 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1560 : {
1561 13 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1562 13 : tbm->dsapagetableold = InvalidDsaPointer;
1563 : }
1564 430 : }
|