LCOV - code coverage report
Current view: top level - src/backend/nodes - tidbitmap.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 450 515 87.4 %
Date: 2017-09-29 15:12:54 Functions: 28 30 93.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tidbitmap.c
       4             :  *    PostgreSQL tuple-id (TID) bitmap package
       5             :  *
       6             :  * This module provides bitmap data structures that are spiritually
       7             :  * similar to Bitmapsets, but are specially adapted to store sets of
       8             :  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
       9             :  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
      10             :  * Also, since we wish to be able to store very large tuple sets in
      11             :  * memory with this data structure, we support "lossy" storage, in which
      12             :  * we no longer remember individual tuple offsets on a page but only the
      13             :  * fact that a particular page needs to be visited.
      14             :  *
      15             :  * The "lossy" storage uses one bit per disk page, so at the standard 8K
      16             :  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
      17             :  * of memory.  People pushing around tables of that size should have a
      18             :  * couple of Mb to spare, so we don't worry about providing a second level
      19             :  * of lossiness.  In theory we could fall back to page ranges at some
      20             :  * point, but for now that seems useless complexity.
      21             :  *
      22             :  * We also support the notion of candidate matches, or rechecking.  This
      23             :  * means we know that a search need visit only some tuples on a page,
      24             :  * but we are not certain that all of those tuples are real matches.
      25             :  * So the eventual heap scan must recheck the quals for these tuples only,
      26             :  * rather than rechecking the quals for all tuples on the page as in the
      27             :  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
      28             :  * into a bitmap, and it can also happen internally when we AND a lossy
      29             :  * and a non-lossy page.
      30             :  *
      31             :  *
      32             :  * Copyright (c) 2003-2017, PostgreSQL Global Development Group
      33             :  *
      34             :  * IDENTIFICATION
      35             :  *    src/backend/nodes/tidbitmap.c
      36             :  *
      37             :  *-------------------------------------------------------------------------
      38             :  */
      39             : #include "postgres.h"
      40             : 
      41             : #include <limits.h>
      42             : 
      43             : #include "access/htup_details.h"
      44             : #include "nodes/bitmapset.h"
      45             : #include "nodes/tidbitmap.h"
      46             : #include "storage/lwlock.h"
      47             : #include "utils/dsa.h"
      48             : 
      49             : /*
      50             :  * The maximum number of tuples per page is not large (typically 256 with
      51             :  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
      52             :  * the per-page bitmaps variable size.  We just legislate that the size
      53             :  * is this:
      54             :  */
      55             : #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
      56             : 
      57             : /*
      58             :  * When we have to switch over to lossy storage, we use a data structure
      59             :  * with one bit per page, where all pages having the same number DIV
      60             :  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
      61             :  * and has the bit set for a given page, there must not be a per-page entry
      62             :  * for that page in the page table.
      63             :  *
      64             :  * We actually store both exact pages and lossy chunks in the same hash
      65             :  * table, using identical data structures.  (This is because the memory
      66             :  * management for hashtables doesn't easily/efficiently allow space to be
      67             :  * transferred easily from one hashtable to another.)  Therefore it's best
      68             :  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
      69             :  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
      70             :  * avoid expensive integer remainder operations.  So, define it like this:
      71             :  */
      72             : #define PAGES_PER_CHUNK  (BLCKSZ / 32)
      73             : 
      74             : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
      75             : 
      76             : #define WORDNUM(x)  ((x) / BITS_PER_BITMAPWORD)
      77             : #define BITNUM(x)   ((x) % BITS_PER_BITMAPWORD)
      78             : 
      79             : /* number of active words for an exact page: */
      80             : #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
      81             : /* number of active words for a lossy chunk: */
      82             : #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
      83             : 
      84             : /*
      85             :  * The hashtable entries are represented by this data structure.  For
      86             :  * an exact page, blockno is the page number and bit k of the bitmap
      87             :  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
      88             :  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
      89             :  * bit k represents page blockno+k.  Note that it is not possible to
      90             :  * have exact storage for the first page of a chunk if we are using
      91             :  * lossy storage for any page in the chunk's range, since the same
      92             :  * hashtable entry has to serve both purposes.
      93             :  *
      94             :  * recheck is used only on exact pages --- it indicates that although
      95             :  * only the stated tuples need be checked, the full index qual condition
      96             :  * must be checked for each (ie, these are candidate matches).
      97             :  */
      98             : typedef struct PagetableEntry
      99             : {
     100             :     BlockNumber blockno;        /* page number (hashtable key) */
     101             :     char        status;         /* hash entry status */
     102             :     bool        ischunk;        /* T = lossy storage, F = exact */
     103             :     bool        recheck;        /* should the tuples be rechecked? */
     104             :     bitmapword  words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
     105             : } PagetableEntry;
     106             : 
     107             : /*
     108             :  * Holds array of pagetable entries.
     109             :  */
     110             : typedef struct PTEntryArray
     111             : {
     112             :     pg_atomic_uint32 refcount;  /* no. of iterator attached */
     113             :     PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
     114             : } PTEntryArray;
     115             : 
     116             : /*
     117             :  * We want to avoid the overhead of creating the hashtable, which is
     118             :  * comparatively large, when not necessary. Particularly when we are using a
     119             :  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
     120             :  * long enough to accumulate one entry in such cases.  We therefore avoid
     121             :  * creating an actual hashtable until we need two pagetable entries.  When
     122             :  * just one pagetable entry is needed, we store it in a fixed field of
     123             :  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
     124             :  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
     125             :  * when nentries is zero or one.)
     126             :  */
     127             : typedef enum
     128             : {
     129             :     TBM_EMPTY,                  /* no hashtable, nentries == 0 */
     130             :     TBM_ONE_PAGE,               /* entry1 contains the single entry */
     131             :     TBM_HASH                    /* pagetable is valid, entry1 is not */
     132             : } TBMStatus;
     133             : 
     134             : /*
     135             :  * Current iterating state of the TBM.
     136             :  */
     137             : typedef enum
     138             : {
     139             :     TBM_NOT_ITERATING,          /* not yet converted to page and chunk array */
     140             :     TBM_ITERATING_PRIVATE,      /* converted to local page and chunk array */
     141             :     TBM_ITERATING_SHARED        /* converted to shared page and chunk array */
     142             : } TBMIteratingState;
     143             : 
     144             : /*
     145             :  * Here is the representation for a whole TIDBitMap:
     146             :  */
     147             : struct TIDBitmap
     148             : {
     149             :     NodeTag     type;           /* to make it a valid Node */
     150             :     MemoryContext mcxt;         /* memory context containing me */
     151             :     TBMStatus   status;         /* see codes above */
     152             :     struct pagetable_hash *pagetable;   /* hash table of PagetableEntry's */
     153             :     int         nentries;       /* number of entries in pagetable */
     154             :     int         maxentries;     /* limit on same to meet maxbytes */
     155             :     int         npages;         /* number of exact entries in pagetable */
     156             :     int         nchunks;        /* number of lossy entries in pagetable */
     157             :     TBMIteratingState iterating;    /* tbm_begin_iterate called? */
     158             :     uint32      lossify_start;  /* offset to start lossifying hashtable at */
     159             :     PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
     160             :     /* these are valid when iterating is true: */
     161             :     PagetableEntry **spages;    /* sorted exact-page list, or NULL */
     162             :     PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
     163             :     dsa_pointer dsapagetable;   /* dsa_pointer to the element array */
     164             :     dsa_pointer dsapagetableold;    /* dsa_pointer to the old element array */
     165             :     dsa_pointer ptpages;        /* dsa_pointer to the page array */
     166             :     dsa_pointer ptchunks;       /* dsa_pointer to the chunk array */
     167             :     dsa_area   *dsa;            /* reference to per-query dsa area */
     168             : };
     169             : 
     170             : /*
     171             :  * When iterating over a bitmap in sorted order, a TBMIterator is used to
     172             :  * track our progress.  There can be several iterators scanning the same
     173             :  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
     174             :  * any iterator is created.
     175             :  */
     176             : struct TBMIterator
     177             : {
     178             :     TIDBitmap  *tbm;            /* TIDBitmap we're iterating over */
     179             :     int         spageptr;       /* next spages index */
     180             :     int         schunkptr;      /* next schunks index */
     181             :     int         schunkbit;      /* next bit to check in current schunk */
     182             :     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
     183             : };
     184             : 
     185             : /*
     186             :  * Holds the shared members of the iterator so that multiple processes
     187             :  * can jointly iterate.
     188             :  */
     189             : typedef struct TBMSharedIteratorState
     190             : {
     191             :     int         nentries;       /* number of entries in pagetable */
     192             :     int         maxentries;     /* limit on same to meet maxbytes */
     193             :     int         npages;         /* number of exact entries in pagetable */
     194             :     int         nchunks;        /* number of lossy entries in pagetable */
     195             :     dsa_pointer pagetable;      /* dsa pointers to head of pagetable data */
     196             :     dsa_pointer spages;         /* dsa pointer to page array */
     197             :     dsa_pointer schunks;        /* dsa pointer to chunk array */
     198             :     LWLock      lock;           /* lock to protect below members */
     199             :     int         spageptr;       /* next spages index */
     200             :     int         schunkptr;      /* next schunks index */
     201             :     int         schunkbit;      /* next bit to check in current schunk */
     202             : } TBMSharedIteratorState;
     203             : 
     204             : /*
     205             :  * pagetable iteration array.
     206             :  */
     207             : typedef struct PTIterationArray
     208             : {
     209             :     pg_atomic_uint32 refcount;  /* no. of iterator attached */
     210             :     int         index[FLEXIBLE_ARRAY_MEMBER];   /* index array */
     211             : } PTIterationArray;
     212             : 
     213             : /*
     214             :  * same as TBMIterator, but it is used for joint iteration, therefore this
     215             :  * also holds a reference to the shared state.
     216             :  */
     217             : struct TBMSharedIterator
     218             : {
     219             :     TBMSharedIteratorState *state;  /* shared state */
     220             :     PTEntryArray *ptbase;       /* pagetable element array */
     221             :     PTIterationArray *ptpages;  /* sorted exact page index list */
     222             :     PTIterationArray *ptchunks; /* sorted lossy page index list */
     223             :     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
     224             : };
     225             : 
     226             : /* Local function prototypes */
     227             : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
     228             : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
     229             :                    const TIDBitmap *b);
     230             : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
     231             :                    BlockNumber pageno);
     232             : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
     233             : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
     234             : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
     235             : static void tbm_lossify(TIDBitmap *tbm);
     236             : static int  tbm_comparator(const void *left, const void *right);
     237             : static int tbm_shared_comparator(const void *left, const void *right,
     238             :                       void *arg);
     239             : 
     240             : /*
     241             :  * Simple inline murmur hash implementation for the exact width required, for
     242             :  * performance.
     243             :  */
     244             : static inline uint32
     245      716303 : hash_blockno(BlockNumber b)
     246             : {
     247      716303 :     uint32      h = b;
     248             : 
     249      716303 :     h ^= h >> 16;
     250      716303 :     h *= 0x85ebca6b;
     251      716303 :     h ^= h >> 13;
     252      716303 :     h *= 0xc2b2ae35;
     253      716303 :     h ^= h >> 16;
     254      716303 :     return h;
     255             : }
     256             : 
     257             : /* define hashtable mapping block numbers to PagetableEntry's */
     258             : #define SH_USE_NONDEFAULT_ALLOCATOR
     259             : #define SH_PREFIX pagetable
     260             : #define SH_ELEMENT_TYPE PagetableEntry
     261             : #define SH_KEY_TYPE BlockNumber
     262             : #define SH_KEY blockno
     263             : #define SH_HASH_KEY(tb, key) hash_blockno(key)
     264             : #define SH_EQUAL(tb, a, b) a == b
     265             : #define SH_SCOPE static inline
     266             : #define SH_DEFINE
     267             : #define SH_DECLARE
     268             : #include "lib/simplehash.h"
     269             : 
     270             : 
     271             : /*
     272             :  * tbm_create - create an initially-empty bitmap
     273             :  *
     274             :  * The bitmap will live in the memory context that is CurrentMemoryContext
     275             :  * at the time of this call.  It will be limited to (approximately) maxbytes
     276             :  * total memory consumption.  If the DSA passed to this function is not NULL
     277             :  * then the memory for storing elements of the underlying page table will
     278             :  * be allocated from the DSA.
     279             :  */
     280             : TIDBitmap *
     281        1110 : tbm_create(long maxbytes, dsa_area *dsa)
     282             : {
     283             :     TIDBitmap  *tbm;
     284             :     long        nbuckets;
     285             : 
     286             :     /* Create the TIDBitmap struct and zero all its fields */
     287        1110 :     tbm = makeNode(TIDBitmap);
     288             : 
     289        1110 :     tbm->mcxt = CurrentMemoryContext;
     290        1110 :     tbm->status = TBM_EMPTY;
     291             : 
     292             :     /*
     293             :      * Estimate number of hashtable entries we can have within maxbytes. This
     294             :      * estimates the hash cost as sizeof(PagetableEntry), which is good enough
     295             :      * for our purpose.  Also count an extra Pointer per entry for the arrays
     296             :      * created during iteration readout.
     297             :      */
     298        1110 :     nbuckets = maxbytes /
     299             :         (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
     300        1110 :     nbuckets = Min(nbuckets, INT_MAX - 1);  /* safety limit */
     301        1110 :     nbuckets = Max(nbuckets, 16);   /* sanity limit */
     302        1110 :     tbm->maxentries = (int) nbuckets;
     303        1110 :     tbm->lossify_start = 0;
     304        1110 :     tbm->dsa = dsa;
     305        1110 :     tbm->dsapagetable = InvalidDsaPointer;
     306        1110 :     tbm->dsapagetableold = InvalidDsaPointer;
     307        1110 :     tbm->ptpages = InvalidDsaPointer;
     308        1110 :     tbm->ptchunks = InvalidDsaPointer;
     309             : 
     310        1110 :     return tbm;
     311             : }
     312             : 
     313             : /*
     314             :  * Actually create the hashtable.  Since this is a moderately expensive
     315             :  * proposition, we don't do it until we have to.
     316             :  */
     317             : static void
     318         408 : tbm_create_pagetable(TIDBitmap *tbm)
     319             : {
     320         408 :     Assert(tbm->status != TBM_HASH);
     321         408 :     Assert(tbm->pagetable == NULL);
     322             : 
     323         408 :     tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
     324             : 
     325             :     /* If entry1 is valid, push it into the hashtable */
     326         408 :     if (tbm->status == TBM_ONE_PAGE)
     327             :     {
     328             :         PagetableEntry *page;
     329             :         bool        found;
     330             :         char        oldstatus;
     331             : 
     332         160 :         page = pagetable_insert(tbm->pagetable,
     333             :                                 tbm->entry1.blockno,
     334             :                                 &found);
     335         160 :         Assert(!found);
     336         160 :         oldstatus = page->status;
     337         160 :         memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
     338         160 :         page->status = oldstatus;
     339             :     }
     340             : 
     341         408 :     tbm->status = TBM_HASH;
     342         408 : }
     343             : 
     344             : /*
     345             :  * tbm_free - free a TIDBitmap
     346             :  */
     347             : void
     348        1106 : tbm_free(TIDBitmap *tbm)
     349             : {
     350        1106 :     if (tbm->pagetable)
     351         408 :         pagetable_destroy(tbm->pagetable);
     352        1106 :     if (tbm->spages)
     353         147 :         pfree(tbm->spages);
     354        1106 :     if (tbm->schunks)
     355         250 :         pfree(tbm->schunks);
     356        1106 :     pfree(tbm);
     357        1106 : }
     358             : 
     359             : /*
     360             :  * tbm_free_shared_area - free shared state
     361             :  *
     362             :  * Free shared iterator state, Also free shared pagetable and iterator arrays
     363             :  * memory if they are not referred by any of the shared iterator i.e recount
     364             :  * is becomes 0.
     365             :  */
     366             : void
     367          18 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
     368             : {
     369          18 :     TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
     370             :     PTEntryArray *ptbase;
     371             :     PTIterationArray *ptpages;
     372             :     PTIterationArray *ptchunks;
     373             : 
     374          18 :     if (DsaPointerIsValid(istate->pagetable))
     375             :     {
     376          18 :         ptbase = dsa_get_address(dsa, istate->pagetable);
     377          18 :         if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
     378           9 :             dsa_free(dsa, istate->pagetable);
     379             :     }
     380          18 :     if (DsaPointerIsValid(istate->spages))
     381             :     {
     382          18 :         ptpages = dsa_get_address(dsa, istate->spages);
     383          18 :         if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
     384           9 :             dsa_free(dsa, istate->spages);
     385             :     }
     386          18 :     if (DsaPointerIsValid(istate->schunks))
     387             :     {
     388           0 :         ptchunks = dsa_get_address(dsa, istate->schunks);
     389           0 :         if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
     390           0 :             dsa_free(dsa, istate->schunks);
     391             :     }
     392             : 
     393          18 :     dsa_free(dsa, dp);
     394          18 : }
     395             : 
     396             : /*
     397             :  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
     398             :  *
     399             :  * If recheck is true, then the recheck flag will be set in the
     400             :  * TBMIterateResult when any of these tuples are reported out.
     401             :  */
     402             : void
     403      393465 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
     404             :                bool recheck)
     405             : {
     406      393465 :     BlockNumber currblk = InvalidBlockNumber;
     407      393465 :     PagetableEntry *page = NULL;    /* only valid when currblk is valid */
     408             :     int         i;
     409             : 
     410      393465 :     Assert(tbm->iterating == TBM_NOT_ITERATING);
     411      831786 :     for (i = 0; i < ntids; i++)
     412             :     {
     413      438321 :         BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
     414      438321 :         OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
     415             :         int         wordnum,
     416             :                     bitnum;
     417             : 
     418             :         /* safety check to ensure we don't overrun bit array bounds */
     419      438321 :         if (off < 1 || off > MAX_TUPLES_PER_PAGE)
     420           0 :             elog(ERROR, "tuple offset out of range: %u", off);
     421             : 
     422             :         /*
     423             :          * Look up target page unless we already did.  This saves cycles when
     424             :          * the input includes consecutive tuples on the same page, which is
     425             :          * common enough to justify an extra test here.
     426             :          */
     427      438321 :         if (blk != currblk)
     428             :         {
     429      412944 :             if (tbm_page_is_lossy(tbm, blk))
     430         553 :                 page = NULL;    /* remember page is lossy */
     431             :             else
     432      412391 :                 page = tbm_get_pageentry(tbm, blk);
     433      412944 :             currblk = blk;
     434             :         }
     435             : 
     436      438321 :         if (page == NULL)
     437         553 :             continue;           /* whole page is already marked */
     438             : 
     439      437768 :         if (page->ischunk)
     440             :         {
     441             :             /* The page is a lossy chunk header, set bit for itself */
     442           0 :             wordnum = bitnum = 0;
     443             :         }
     444             :         else
     445             :         {
     446             :             /* Page is exact, so set bit for individual tuple */
     447      437768 :             wordnum = WORDNUM(off - 1);
     448      437768 :             bitnum = BITNUM(off - 1);
     449             :         }
     450      437768 :         page->words[wordnum] |= ((bitmapword) 1 << bitnum);
     451      437768 :         page->recheck |= recheck;
     452             : 
     453      437768 :         if (tbm->nentries > tbm->maxentries)
     454             :         {
     455           4 :             tbm_lossify(tbm);
     456             :             /* Page could have been converted to lossy, so force new lookup */
     457           4 :             currblk = InvalidBlockNumber;
     458             :         }
     459             :     }
     460      393465 : }
     461             : 
     462             : /*
     463             :  * tbm_add_page - add a whole page to a TIDBitmap
     464             :  *
     465             :  * This causes the whole page to be reported (with the recheck flag)
     466             :  * when the TIDBitmap is scanned.
     467             :  */
     468             : void
     469       18449 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
     470             : {
     471             :     /* Enter the page in the bitmap, or mark it lossy if already present */
     472       18449 :     tbm_mark_page_lossy(tbm, pageno);
     473             :     /* If we went over the memory limit, lossify some more pages */
     474       18449 :     if (tbm->nentries > tbm->maxentries)
     475           0 :         tbm_lossify(tbm);
     476       18449 : }
     477             : 
     478             : /*
     479             :  * tbm_union - set union
     480             :  *
     481             :  * a is modified in-place, b is not changed
     482             :  */
     483             : void
     484           0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
     485             : {
     486           0 :     Assert(!a->iterating);
     487             :     /* Nothing to do if b is empty */
     488           0 :     if (b->nentries == 0)
     489           0 :         return;
     490             :     /* Scan through chunks and pages in b, merge into a */
     491           0 :     if (b->status == TBM_ONE_PAGE)
     492           0 :         tbm_union_page(a, &b->entry1);
     493             :     else
     494             :     {
     495             :         pagetable_iterator i;
     496             :         PagetableEntry *bpage;
     497             : 
     498           0 :         Assert(b->status == TBM_HASH);
     499           0 :         pagetable_start_iterate(b->pagetable, &i);
     500           0 :         while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
     501           0 :             tbm_union_page(a, bpage);
     502             :     }
     503             : }
     504             : 
     505             : /* Process one page of b during a union op */
     506             : static void
     507           0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
     508             : {
     509             :     PagetableEntry *apage;
     510             :     int         wordnum;
     511             : 
     512           0 :     if (bpage->ischunk)
     513             :     {
     514             :         /* Scan b's chunk, mark each indicated page lossy in a */
     515           0 :         for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
     516             :         {
     517           0 :             bitmapword  w = bpage->words[wordnum];
     518             : 
     519           0 :             if (w != 0)
     520             :             {
     521             :                 BlockNumber pg;
     522             : 
     523           0 :                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
     524           0 :                 while (w != 0)
     525             :                 {
     526           0 :                     if (w & 1)
     527           0 :                         tbm_mark_page_lossy(a, pg);
     528           0 :                     pg++;
     529           0 :                     w >>= 1;
     530             :                 }
     531             :             }
     532             :         }
     533             :     }
     534           0 :     else if (tbm_page_is_lossy(a, bpage->blockno))
     535             :     {
     536             :         /* page is already lossy in a, nothing to do */
     537           0 :         return;
     538             :     }
     539             :     else
     540             :     {
     541           0 :         apage = tbm_get_pageentry(a, bpage->blockno);
     542           0 :         if (apage->ischunk)
     543             :         {
     544             :             /* The page is a lossy chunk header, set bit for itself */
     545           0 :             apage->words[0] |= ((bitmapword) 1 << 0);
     546             :         }
     547             :         else
     548             :         {
     549             :             /* Both pages are exact, merge at the bit level */
     550           0 :             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     551           0 :                 apage->words[wordnum] |= bpage->words[wordnum];
     552           0 :             apage->recheck |= bpage->recheck;
     553             :         }
     554             :     }
     555             : 
     556           0 :     if (a->nentries > a->maxentries)
     557           0 :         tbm_lossify(a);
     558             : }
     559             : 
     560             : /*
     561             :  * tbm_intersect - set intersection
     562             :  *
     563             :  * a is modified in-place, b is not changed
     564             :  */
     565             : void
     566           2 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
     567             : {
     568           2 :     Assert(!a->iterating);
     569             :     /* Nothing to do if a is empty */
     570           2 :     if (a->nentries == 0)
     571           2 :         return;
     572             :     /* Scan through chunks and pages in a, try to match to b */
     573           2 :     if (a->status == TBM_ONE_PAGE)
     574             :     {
     575           0 :         if (tbm_intersect_page(a, &a->entry1, b))
     576             :         {
     577             :             /* Page is now empty, remove it from a */
     578           0 :             Assert(!a->entry1.ischunk);
     579           0 :             a->npages--;
     580           0 :             a->nentries--;
     581           0 :             Assert(a->nentries == 0);
     582           0 :             a->status = TBM_EMPTY;
     583             :         }
     584             :     }
     585             :     else
     586             :     {
     587             :         pagetable_iterator i;
     588             :         PagetableEntry *apage;
     589             : 
     590           2 :         Assert(a->status == TBM_HASH);
     591           2 :         pagetable_start_iterate(a->pagetable, &i);
     592           2 :         while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
     593             :         {
     594         692 :             if (tbm_intersect_page(a, apage, b))
     595             :             {
     596             :                 /* Page or chunk is now empty, remove it from a */
     597         663 :                 if (apage->ischunk)
     598           0 :                     a->nchunks--;
     599             :                 else
     600         663 :                     a->npages--;
     601         663 :                 a->nentries--;
     602         663 :                 if (!pagetable_delete(a->pagetable, apage->blockno))
     603           0 :                     elog(ERROR, "hash table corrupted");
     604             :             }
     605             :         }
     606             :     }
     607             : }
     608             : 
     609             : /*
     610             :  * Process one page of a during an intersection op
     611             :  *
     612             :  * Returns TRUE if apage is now empty and should be deleted from a
     613             :  */
     614             : static bool
     615         692 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
     616             : {
     617             :     const PagetableEntry *bpage;
     618             :     int         wordnum;
     619             : 
     620         692 :     if (apage->ischunk)
     621             :     {
     622             :         /* Scan each bit in chunk, try to clear */
     623           5 :         bool        candelete = true;
     624             : 
     625          45 :         for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
     626             :         {
     627          40 :             bitmapword  w = apage->words[wordnum];
     628             : 
     629          40 :             if (w != 0)
     630             :             {
     631          39 :                 bitmapword  neww = w;
     632             :                 BlockNumber pg;
     633             :                 int         bitnum;
     634             : 
     635          39 :                 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
     636          39 :                 bitnum = 0;
     637        1283 :                 while (w != 0)
     638             :                 {
     639        1205 :                     if (w & 1)
     640             :                     {
     641         629 :                         if (!tbm_page_is_lossy(b, pg) &&
     642          38 :                             tbm_find_pageentry(b, pg) == NULL)
     643             :                         {
     644             :                             /* Page is not in b at all, lose lossy bit */
     645           0 :                             neww &= ~((bitmapword) 1 << bitnum);
     646             :                         }
     647             :                     }
     648        1205 :                     pg++;
     649        1205 :                     bitnum++;
     650        1205 :                     w >>= 1;
     651             :                 }
     652          39 :                 apage->words[wordnum] = neww;
     653          39 :                 if (neww != 0)
     654          39 :                     candelete = false;
     655             :             }
     656             :         }
     657           5 :         return candelete;
     658             :     }
     659         687 :     else if (tbm_page_is_lossy(b, apage->blockno))
     660             :     {
     661             :         /*
     662             :          * Some of the tuples in 'a' might not satisfy the quals for 'b', but
     663             :          * because the page 'b' is lossy, we don't know which ones. Therefore
     664             :          * we mark 'a' as requiring rechecks, to indicate that at most those
     665             :          * tuples set in 'a' are matches.
     666             :          */
     667           0 :         apage->recheck = true;
     668           0 :         return false;
     669             :     }
     670             :     else
     671             :     {
     672         687 :         bool        candelete = true;
     673             : 
     674         687 :         bpage = tbm_find_pageentry(b, apage->blockno);
     675         687 :         if (bpage != NULL)
     676             :         {
     677             :             /* Both pages are exact, merge at the bit level */
     678         610 :             Assert(!bpage->ischunk);
     679        6710 :             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     680             :             {
     681        6100 :                 apage->words[wordnum] &= bpage->words[wordnum];
     682        6100 :                 if (apage->words[wordnum] != 0)
     683          24 :                     candelete = false;
     684             :             }
     685         610 :             apage->recheck |= bpage->recheck;
     686             :         }
     687             :         /* If there is no matching b page, we can just delete the a page */
     688         687 :         return candelete;
     689             :     }
     690             : }
     691             : 
     692             : /*
     693             :  * tbm_is_empty - is a TIDBitmap completely empty?
     694             :  */
     695             : bool
     696          11 : tbm_is_empty(const TIDBitmap *tbm)
     697             : {
     698          11 :     return (tbm->nentries == 0);
     699             : }
     700             : 
     701             : /*
     702             :  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
     703             :  *
     704             :  * The TBMIterator struct is created in the caller's memory context.
     705             :  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
     706             :  * okay to just allow the memory context to be released, too.  It is caller's
     707             :  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
     708             :  * is freed.
     709             :  *
     710             :  * NB: after this is called, it is no longer allowed to modify the contents
     711             :  * of the bitmap.  However, you can call this multiple times to scan the
     712             :  * contents repeatedly, including parallel scans.
     713             :  */
     714             : TBMIterator *
     715        2187 : tbm_begin_iterate(TIDBitmap *tbm)
     716             : {
     717             :     TBMIterator *iterator;
     718             : 
     719        2187 :     Assert(tbm->iterating != TBM_ITERATING_SHARED);
     720             : 
     721             :     /*
     722             :      * Create the TBMIterator struct, with enough trailing space to serve the
     723             :      * needs of the TBMIterateResult sub-struct.
     724             :      */
     725        2187 :     iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
     726             :                                       MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
     727        2187 :     iterator->tbm = tbm;
     728             : 
     729             :     /*
     730             :      * Initialize iteration pointers.
     731             :      */
     732        2187 :     iterator->spageptr = 0;
     733        2187 :     iterator->schunkptr = 0;
     734        2187 :     iterator->schunkbit = 0;
     735             : 
     736             :     /*
     737             :      * If we have a hashtable, create and fill the sorted page lists, unless
     738             :      * we already did that for a previous iterator.  Note that the lists are
     739             :      * attached to the bitmap not the iterator, so they can be used by more
     740             :      * than one iterator.
     741             :      */
     742        2187 :     if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
     743             :     {
     744             :         pagetable_iterator i;
     745             :         PagetableEntry *page;
     746             :         int         npages;
     747             :         int         nchunks;
     748             : 
     749         395 :         if (!tbm->spages && tbm->npages > 0)
     750         147 :             tbm->spages = (PagetableEntry **)
     751         147 :                 MemoryContextAlloc(tbm->mcxt,
     752         147 :                                    tbm->npages * sizeof(PagetableEntry *));
     753         395 :         if (!tbm->schunks && tbm->nchunks > 0)
     754         250 :             tbm->schunks = (PagetableEntry **)
     755         250 :                 MemoryContextAlloc(tbm->mcxt,
     756         250 :                                    tbm->nchunks * sizeof(PagetableEntry *));
     757             : 
     758         395 :         npages = nchunks = 0;
     759         395 :         pagetable_start_iterate(tbm->pagetable, &i);
     760        4790 :         while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
     761             :         {
     762        4000 :             if (page->ischunk)
     763         258 :                 tbm->schunks[nchunks++] = page;
     764             :             else
     765        3742 :                 tbm->spages[npages++] = page;
     766             :         }
     767         395 :         Assert(npages == tbm->npages);
     768         395 :         Assert(nchunks == tbm->nchunks);
     769         395 :         if (npages > 1)
     770         147 :             qsort(tbm->spages, npages, sizeof(PagetableEntry *),
     771             :                   tbm_comparator);
     772         395 :         if (nchunks > 1)
     773           2 :             qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
     774             :                   tbm_comparator);
     775             :     }
     776             : 
     777        2187 :     tbm->iterating = TBM_ITERATING_PRIVATE;
     778             : 
     779        2187 :     return iterator;
     780             : }
     781             : 
     782             : /*
     783             :  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
     784             :  *
     785             :  * The necessary shared state will be allocated from the DSA passed to
     786             :  * tbm_create, so that multiple processes can attach to it and iterate jointly.
     787             :  *
     788             :  * This will convert the pagetable hash into page and chunk array of the index
     789             :  * into pagetable array.
     790             :  */
     791             : dsa_pointer
     792          22 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
     793             : {
     794             :     dsa_pointer dp;
     795             :     TBMSharedIteratorState *istate;
     796          22 :     PTEntryArray *ptbase = NULL;
     797          22 :     PTIterationArray *ptpages = NULL;
     798          22 :     PTIterationArray *ptchunks = NULL;
     799             : 
     800          22 :     Assert(tbm->dsa != NULL);
     801          22 :     Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
     802             : 
     803             :     /*
     804             :      * Allocate TBMSharedIteratorState from DSA to hold the shared members and
     805             :      * lock, this will also be used by multiple worker for shared iterate.
     806             :      */
     807          22 :     dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
     808          22 :     istate = dsa_get_address(tbm->dsa, dp);
     809             : 
     810             :     /*
     811             :      * If we're not already iterating, create and fill the sorted page lists.
     812             :      * (If we are, the sorted page lists are already stored in the TIDBitmap,
     813             :      * and we can just reuse them.)
     814             :      */
     815          22 :     if (tbm->iterating == TBM_NOT_ITERATING)
     816             :     {
     817             :         pagetable_iterator i;
     818             :         PagetableEntry *page;
     819             :         int         idx;
     820             :         int         npages;
     821             :         int         nchunks;
     822             : 
     823             :         /*
     824             :          * Allocate the page and chunk array memory from the DSA to share
     825             :          * across multiple processes.
     826             :          */
     827          11 :         if (tbm->npages)
     828             :         {
     829          11 :             tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
     830             :                                         tbm->npages * sizeof(int));
     831          11 :             ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
     832          11 :             pg_atomic_init_u32(&ptpages->refcount, 0);
     833             :         }
     834          11 :         if (tbm->nchunks)
     835             :         {
     836           1 :             tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
     837             :                                          tbm->nchunks * sizeof(int));
     838           1 :             ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
     839           1 :             pg_atomic_init_u32(&ptchunks->refcount, 0);
     840             :         }
     841             : 
     842             :         /*
     843             :          * If TBM status is TBM_HASH then iterate over the pagetable and
     844             :          * convert it to page and chunk arrays.  But if it's in the
     845             :          * TBM_ONE_PAGE mode then directly allocate the space for one entry
     846             :          * from the DSA.
     847             :          */
     848          11 :         npages = nchunks = 0;
     849          11 :         if (tbm->status == TBM_HASH)
     850             :         {
     851          11 :             ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     852             : 
     853          11 :             pagetable_start_iterate(tbm->pagetable, &i);
     854        4121 :             while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
     855             :             {
     856        4099 :                 idx = page - ptbase->ptentry;
     857        4099 :                 if (page->ischunk)
     858           5 :                     ptchunks->index[nchunks++] = idx;
     859             :                 else
     860        4094 :                     ptpages->index[npages++] = idx;
     861             :             }
     862             : 
     863          11 :             Assert(npages == tbm->npages);
     864          11 :             Assert(nchunks == tbm->nchunks);
     865             :         }
     866           0 :         else if (tbm->status == TBM_ONE_PAGE)
     867             :         {
     868             :             /*
     869             :              * In one page mode allocate the space for one pagetable entry,
     870             :              * initialize it, and directly store its index (i.e. 0) in the
     871             :              * page array.
     872             :              */
     873           0 :             tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
     874             :                                              sizeof(PagetableEntry));
     875           0 :             ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     876           0 :             memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
     877           0 :             ptpages->index[0] = 0;
     878             :         }
     879             : 
     880          11 :         if (ptbase != NULL)
     881          11 :             pg_atomic_init_u32(&ptbase->refcount, 0);
     882          11 :         if (npages > 1)
     883          11 :             qsort_arg((void *) (ptpages->index), npages, sizeof(int),
     884          11 :                       tbm_shared_comparator, (void *) ptbase->ptentry);
     885          11 :         if (nchunks > 1)
     886           1 :             qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
     887           1 :                       tbm_shared_comparator, (void *) ptbase->ptentry);
     888             :     }
     889             : 
     890             :     /*
     891             :      * Store the TBM members in the shared state so that we can share them
     892             :      * across multiple processes.
     893             :      */
     894          22 :     istate->nentries = tbm->nentries;
     895          22 :     istate->maxentries = tbm->maxentries;
     896          22 :     istate->npages = tbm->npages;
     897          22 :     istate->nchunks = tbm->nchunks;
     898          22 :     istate->pagetable = tbm->dsapagetable;
     899          22 :     istate->spages = tbm->ptpages;
     900          22 :     istate->schunks = tbm->ptchunks;
     901             : 
     902          22 :     ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     903          22 :     ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
     904          22 :     ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
     905             : 
     906             :     /*
     907             :      * For every shared iterator, referring to pagetable and iterator array,
     908             :      * increase the refcount by 1 so that while freeing the shared iterator we
     909             :      * don't free pagetable and iterator array until its refcount becomes 0.
     910             :      */
     911          22 :     if (ptbase != NULL)
     912          22 :         pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
     913          22 :     if (ptpages != NULL)
     914          22 :         pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
     915          22 :     if (ptchunks != NULL)
     916           2 :         pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
     917             : 
     918             :     /* Initialize the iterator lock */
     919          22 :     LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
     920             : 
     921             :     /* Initialize the shared iterator state */
     922          22 :     istate->schunkbit = 0;
     923          22 :     istate->schunkptr = 0;
     924          22 :     istate->spageptr = 0;
     925             : 
     926          22 :     tbm->iterating = TBM_ITERATING_SHARED;
     927             : 
     928          22 :     return dp;
     929             : }
     930             : 
     931             : /*
     932             :  * tbm_extract_page_tuple - extract the tuple offsets from a page
     933             :  *
     934             :  * The extracted offsets are stored into TBMIterateResult.
     935             :  */
     936             : static inline int
     937       16190 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
     938             : {
     939             :     int         wordnum;
     940       16190 :     int         ntuples = 0;
     941             : 
     942      178090 :     for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     943             :     {
     944      161900 :         bitmapword  w = page->words[wordnum];
     945             : 
     946      161900 :         if (w != 0)
     947             :         {
     948       31638 :             int         off = wordnum * BITS_PER_BITMAPWORD + 1;
     949             : 
     950      892645 :             while (w != 0)
     951             :             {
     952      829369 :                 if (w & 1)
     953      678952 :                     output->offsets[ntuples++] = (OffsetNumber) off;
     954      829369 :                 off++;
     955      829369 :                 w >>= 1;
     956             :             }
     957             :         }
     958             :     }
     959             : 
     960       16190 :     return ntuples;
     961             : }
     962             : 
     963             : /*
     964             :  *  tbm_advance_schunkbit - Advance the chunkbit
     965             :  */
     966             : static inline void
     967       43301 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
     968             : {
     969       43301 :     int         schunkbit = *schunkbitp;
     970             : 
     971      177649 :     while (schunkbit < PAGES_PER_CHUNK)
     972             :     {
     973      133835 :         int         wordnum = WORDNUM(schunkbit);
     974      133835 :         int         bitnum = BITNUM(schunkbit);
     975             : 
     976      133835 :         if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
     977       42788 :             break;
     978       91047 :         schunkbit++;
     979             :     }
     980             : 
     981       43301 :     *schunkbitp = schunkbit;
     982       43301 : }
     983             : 
     984             : /*
     985             :  * tbm_iterate - scan through next page of a TIDBitmap
     986             :  *
     987             :  * Returns a TBMIterateResult representing one page, or NULL if there are
     988             :  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
     989             :  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
     990             :  * remember the exact tuples to look at on this page --- the caller must
     991             :  * examine all tuples on the page and check if they meet the intended
     992             :  * condition.  If result->recheck is true, only the indicated tuples need
     993             :  * be examined, but the condition must be rechecked anyway.  (For ease of
     994             :  * testing, recheck is always set true when ntuples < 0.)
     995             :  */
     996             : TBMIterateResult *
     997       48831 : tbm_iterate(TBMIterator *iterator)
     998             : {
     999       48831 :     TIDBitmap  *tbm = iterator->tbm;
    1000       48831 :     TBMIterateResult *output = &(iterator->output);
    1001             : 
    1002       48831 :     Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
    1003             : 
    1004             :     /*
    1005             :      * If lossy chunk pages remain, make sure we've advanced schunkptr/
    1006             :      * schunkbit to the next set bit.
    1007             :      */
    1008       98165 :     while (iterator->schunkptr < tbm->nchunks)
    1009             :     {
    1010       40951 :         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
    1011       40951 :         int         schunkbit = iterator->schunkbit;
    1012             : 
    1013       40951 :         tbm_advance_schunkbit(chunk, &schunkbit);
    1014       40951 :         if (schunkbit < PAGES_PER_CHUNK)
    1015             :         {
    1016       40448 :             iterator->schunkbit = schunkbit;
    1017       40448 :             break;
    1018             :         }
    1019             :         /* advance to next chunk */
    1020         503 :         iterator->schunkptr++;
    1021         503 :         iterator->schunkbit = 0;
    1022             :     }
    1023             : 
    1024             :     /*
    1025             :      * If both chunk and per-page data remain, must output the numerically
    1026             :      * earlier page.
    1027             :      */
    1028       48831 :     if (iterator->schunkptr < tbm->nchunks)
    1029             :     {
    1030       40448 :         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
    1031             :         BlockNumber chunk_blockno;
    1032             : 
    1033       40448 :         chunk_blockno = chunk->blockno + iterator->schunkbit;
    1034       43992 :         if (iterator->spageptr >= tbm->npages ||
    1035        3544 :             chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
    1036             :         {
    1037             :             /* Return a lossy page indicator from the chunk */
    1038       39262 :             output->blockno = chunk_blockno;
    1039       39262 :             output->ntuples = -1;
    1040       39262 :             output->recheck = true;
    1041       39262 :             iterator->schunkbit++;
    1042       39262 :             return output;
    1043             :         }
    1044             :     }
    1045             : 
    1046        9569 :     if (iterator->spageptr < tbm->npages)
    1047             :     {
    1048             :         PagetableEntry *page;
    1049             :         int         ntuples;
    1050             : 
    1051             :         /* In ONE_PAGE state, we don't allocate an spages[] array */
    1052        8002 :         if (tbm->status == TBM_ONE_PAGE)
    1053         626 :             page = &tbm->entry1;
    1054             :         else
    1055        7376 :             page = tbm->spages[iterator->spageptr];
    1056             : 
    1057             :         /* scan bitmap to extract individual offset numbers */
    1058        8002 :         ntuples = tbm_extract_page_tuple(page, output);
    1059        8002 :         output->blockno = page->blockno;
    1060        8002 :         output->ntuples = ntuples;
    1061        8002 :         output->recheck = page->recheck;
    1062        8002 :         iterator->spageptr++;
    1063        8002 :         return output;
    1064             :     }
    1065             : 
    1066             :     /* Nothing more in the bitmap */
    1067        1567 :     return NULL;
    1068             : }
    1069             : 
    1070             : /*
    1071             :  *  tbm_shared_iterate - scan through next page of a TIDBitmap
    1072             :  *
    1073             :  *  As above, but this will iterate using an iterator which is shared
    1074             :  *  across multiple processes.  We need to acquire the iterator LWLock,
    1075             :  *  before accessing the shared members.
    1076             :  */
    1077             : TBMIterateResult *
    1078        9437 : tbm_shared_iterate(TBMSharedIterator *iterator)
    1079             : {
    1080        9437 :     TBMIterateResult *output = &iterator->output;
    1081        9437 :     TBMSharedIteratorState *istate = iterator->state;
    1082        9437 :     PagetableEntry *ptbase = NULL;
    1083        9437 :     int        *idxpages = NULL;
    1084        9437 :     int        *idxchunks = NULL;
    1085             : 
    1086        9437 :     if (iterator->ptbase != NULL)
    1087        9437 :         ptbase = iterator->ptbase->ptentry;
    1088        9437 :     if (iterator->ptpages != NULL)
    1089        9437 :         idxpages = iterator->ptpages->index;
    1090        9437 :     if (iterator->ptchunks != NULL)
    1091        2477 :         idxchunks = iterator->ptchunks->index;
    1092             : 
    1093             :     /* Acquire the LWLock before accessing the shared members */
    1094        9437 :     LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
    1095             : 
    1096             :     /*
    1097             :      * If lossy chunk pages remain, make sure we've advanced schunkptr/
    1098             :      * schunkbit to the next set bit.
    1099             :      */
    1100       18884 :     while (istate->schunkptr < istate->nchunks)
    1101             :     {
    1102        2350 :         PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
    1103        2350 :         int         schunkbit = istate->schunkbit;
    1104             : 
    1105        2350 :         tbm_advance_schunkbit(chunk, &schunkbit);
    1106        2350 :         if (schunkbit < PAGES_PER_CHUNK)
    1107             :         {
    1108        2340 :             istate->schunkbit = schunkbit;
    1109        2340 :             break;
    1110             :         }
    1111             :         /* advance to next chunk */
    1112          10 :         istate->schunkptr++;
    1113          10 :         istate->schunkbit = 0;
    1114             :     }
    1115             : 
    1116             :     /*
    1117             :      * If both chunk and per-page data remain, must output the numerically
    1118             :      * earlier page.
    1119             :      */
    1120        9437 :     if (istate->schunkptr < istate->nchunks)
    1121             :     {
    1122        2340 :         PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
    1123             :         BlockNumber chunk_blockno;
    1124             : 
    1125        2340 :         chunk_blockno = chunk->blockno + istate->schunkbit;
    1126             : 
    1127        4680 :         if (istate->spageptr >= istate->npages ||
    1128        2340 :             chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
    1129             :         {
    1130             :             /* Return a lossy page indicator from the chunk */
    1131        1182 :             output->blockno = chunk_blockno;
    1132        1182 :             output->ntuples = -1;
    1133        1182 :             output->recheck = true;
    1134        1182 :             istate->schunkbit++;
    1135             : 
    1136        1182 :             LWLockRelease(&istate->lock);
    1137        1182 :             return output;
    1138             :         }
    1139             :     }
    1140             : 
    1141        8255 :     if (istate->spageptr < istate->npages)
    1142             :     {
    1143        8188 :         PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
    1144             :         int         ntuples;
    1145             : 
    1146             :         /* scan bitmap to extract individual offset numbers */
    1147        8188 :         ntuples = tbm_extract_page_tuple(page, output);
    1148        8188 :         output->blockno = page->blockno;
    1149        8188 :         output->ntuples = ntuples;
    1150        8188 :         output->recheck = page->recheck;
    1151        8188 :         istate->spageptr++;
    1152             : 
    1153        8188 :         LWLockRelease(&istate->lock);
    1154             : 
    1155        8188 :         return output;
    1156             :     }
    1157             : 
    1158          67 :     LWLockRelease(&istate->lock);
    1159             : 
    1160             :     /* Nothing more in the bitmap */
    1161          67 :     return NULL;
    1162             : }
    1163             : 
    1164             : /*
    1165             :  * tbm_end_iterate - finish an iteration over a TIDBitmap
    1166             :  *
    1167             :  * Currently this is just a pfree, but it might do more someday.  (For
    1168             :  * instance, it could be useful to count open iterators and allow the
    1169             :  * bitmap to return to read/write status when there are no more iterators.)
    1170             :  */
    1171             : void
    1172        2181 : tbm_end_iterate(TBMIterator *iterator)
    1173             : {
    1174        2181 :     pfree(iterator);
    1175        2181 : }
    1176             : 
    1177             : /*
    1178             :  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
    1179             :  *
    1180             :  * This doesn't free any of the shared state associated with the iterator,
    1181             :  * just our backend-private state.
    1182             :  */
    1183             : void
    1184         110 : tbm_end_shared_iterate(TBMSharedIterator *iterator)
    1185             : {
    1186         110 :     pfree(iterator);
    1187         110 : }
    1188             : 
    1189             : /*
    1190             :  * tbm_find_pageentry - find a PagetableEntry for the pageno
    1191             :  *
    1192             :  * Returns NULL if there is no non-lossy entry for the pageno.
    1193             :  */
    1194             : static const PagetableEntry *
    1195         725 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
    1196             : {
    1197             :     const PagetableEntry *page;
    1198             : 
    1199         725 :     if (tbm->nentries == 0)      /* in case pagetable doesn't exist */
    1200           0 :         return NULL;
    1201             : 
    1202         725 :     if (tbm->status == TBM_ONE_PAGE)
    1203             :     {
    1204           0 :         page = &tbm->entry1;
    1205           0 :         if (page->blockno != pageno)
    1206           0 :             return NULL;
    1207           0 :         Assert(!page->ischunk);
    1208           0 :         return page;
    1209             :     }
    1210             : 
    1211         725 :     page = pagetable_lookup(tbm->pagetable, pageno);
    1212         725 :     if (page == NULL)
    1213          77 :         return NULL;
    1214         648 :     if (page->ischunk)
    1215           0 :         return NULL;            /* don't want a lossy chunk header */
    1216         648 :     return page;
    1217             : }
    1218             : 
    1219             : /*
    1220             :  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
    1221             :  *
    1222             :  * If new, the entry is marked as an exact (non-chunk) entry.
    1223             :  *
    1224             :  * This may cause the table to exceed the desired memory size.  It is
    1225             :  * up to the caller to call tbm_lossify() at the next safe point if so.
    1226             :  */
    1227             : static PagetableEntry *
    1228      412391 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
    1229             : {
    1230             :     PagetableEntry *page;
    1231             :     bool        found;
    1232             : 
    1233      412391 :     if (tbm->status == TBM_EMPTY)
    1234             :     {
    1235             :         /* Use the fixed slot */
    1236         473 :         page = &tbm->entry1;
    1237         473 :         found = false;
    1238         473 :         tbm->status = TBM_ONE_PAGE;
    1239             :     }
    1240             :     else
    1241             :     {
    1242      411918 :         if (tbm->status == TBM_ONE_PAGE)
    1243             :         {
    1244        2476 :             page = &tbm->entry1;
    1245        2476 :             if (page->blockno == pageno)
    1246        2316 :                 return page;
    1247             :             /* Time to switch from one page to a hashtable */
    1248         160 :             tbm_create_pagetable(tbm);
    1249             :         }
    1250             : 
    1251             :         /* Look up or create an entry */
    1252      409602 :         page = pagetable_insert(tbm->pagetable, pageno, &found);
    1253             :     }
    1254             : 
    1255             :     /* Initialize it if not present before */
    1256      410075 :     if (!found)
    1257             :     {
    1258       11855 :         char        oldstatus = page->status;
    1259             : 
    1260       11855 :         MemSet(page, 0, sizeof(PagetableEntry));
    1261       11855 :         page->status = oldstatus;
    1262       11855 :         page->blockno = pageno;
    1263             :         /* must count it too */
    1264       11855 :         tbm->nentries++;
    1265       11855 :         tbm->npages++;
    1266             :     }
    1267             : 
    1268      410075 :     return page;
    1269             : }
    1270             : 
    1271             : /*
    1272             :  * tbm_page_is_lossy - is the page marked as lossily stored?
    1273             :  */
    1274             : static bool
    1275      414222 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
    1276             : {
    1277             :     PagetableEntry *page;
    1278             :     BlockNumber chunk_pageno;
    1279             :     int         bitno;
    1280             : 
    1281             :     /* we can skip the lookup if there are no lossy chunks */
    1282      414222 :     if (tbm->nchunks == 0)
    1283      406437 :         return false;
    1284        7785 :     Assert(tbm->status == TBM_HASH);
    1285             : 
    1286        7785 :     bitno = pageno % PAGES_PER_CHUNK;
    1287        7785 :     chunk_pageno = pageno - bitno;
    1288             : 
    1289        7785 :     page = pagetable_lookup(tbm->pagetable, chunk_pageno);
    1290             : 
    1291        7785 :     if (page != NULL && page->ischunk)
    1292             :     {
    1293        7785 :         int         wordnum = WORDNUM(bitno);
    1294        7785 :         int         bitnum = BITNUM(bitno);
    1295             : 
    1296        7785 :         if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
    1297        1106 :             return true;
    1298             :     }
    1299        6679 :     return false;
    1300             : }
    1301             : 
    1302             : /*
    1303             :  * tbm_mark_page_lossy - mark the page number as lossily stored
    1304             :  *
    1305             :  * This may cause the table to exceed the desired memory size.  It is
    1306             :  * up to the caller to call tbm_lossify() at the next safe point if so.
    1307             :  */
    1308             : static void
    1309       20793 : tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
    1310             : {
    1311             :     PagetableEntry *page;
    1312             :     bool        found;
    1313             :     BlockNumber chunk_pageno;
    1314             :     int         bitno;
    1315             :     int         wordnum;
    1316             :     int         bitnum;
    1317             : 
    1318             :     /* We force the bitmap into hashtable mode whenever it's lossy */
    1319       20793 :     if (tbm->status != TBM_HASH)
    1320         248 :         tbm_create_pagetable(tbm);
    1321             : 
    1322       20793 :     bitno = pageno % PAGES_PER_CHUNK;
    1323       20793 :     chunk_pageno = pageno - bitno;
    1324             : 
    1325             :     /*
    1326             :      * Remove any extant non-lossy entry for the page.  If the page is its own
    1327             :      * chunk header, however, we skip this and handle the case below.
    1328             :      */
    1329       20793 :     if (bitno != 0)
    1330             :     {
    1331       20571 :         if (pagetable_delete(tbm->pagetable, pageno))
    1332             :         {
    1333             :             /* It was present, so adjust counts */
    1334        2344 :             tbm->nentries--;
    1335        2344 :             tbm->npages--;       /* assume it must have been non-lossy */
    1336             :         }
    1337             :     }
    1338             : 
    1339             :     /* Look up or create entry for chunk-header page */
    1340       20793 :     page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
    1341             : 
    1342             :     /* Initialize it if not present before */
    1343       20793 :     if (!found)
    1344             :     {
    1345         248 :         char        oldstatus = page->status;
    1346             : 
    1347         248 :         MemSet(page, 0, sizeof(PagetableEntry));
    1348         248 :         page->status = oldstatus;
    1349         248 :         page->blockno = chunk_pageno;
    1350         248 :         page->ischunk = true;
    1351             :         /* must count it too */
    1352         248 :         tbm->nentries++;
    1353         248 :         tbm->nchunks++;
    1354             :     }
    1355       20545 :     else if (!page->ischunk)
    1356             :     {
    1357          20 :         char        oldstatus = page->status;
    1358             : 
    1359             :         /* chunk header page was formerly non-lossy, make it lossy */
    1360          20 :         MemSet(page, 0, sizeof(PagetableEntry));
    1361          20 :         page->status = oldstatus;
    1362          20 :         page->blockno = chunk_pageno;
    1363          20 :         page->ischunk = true;
    1364             :         /* we assume it had some tuple bit(s) set, so mark it lossy */
    1365          20 :         page->words[0] = ((bitmapword) 1 << 0);
    1366             :         /* adjust counts */
    1367          20 :         tbm->nchunks++;
    1368          20 :         tbm->npages--;
    1369             :     }
    1370             : 
    1371             :     /* Now set the original target page's bit */
    1372       20793 :     wordnum = WORDNUM(bitno);
    1373       20793 :     bitnum = BITNUM(bitno);
    1374       20793 :     page->words[wordnum] |= ((bitmapword) 1 << bitnum);
    1375       20793 : }
    1376             : 
    1377             : /*
    1378             :  * tbm_lossify - lose some information to get back under the memory limit
    1379             :  */
    1380             : static void
    1381           4 : tbm_lossify(TIDBitmap *tbm)
    1382             : {
    1383             :     pagetable_iterator i;
    1384             :     PagetableEntry *page;
    1385             : 
    1386             :     /*
    1387             :      * XXX Really stupid implementation: this just lossifies pages in
    1388             :      * essentially random order.  We should be paying some attention to the
    1389             :      * number of bits set in each page, instead.
    1390             :      *
    1391             :      * Since we are called as soon as nentries exceeds maxentries, we should
    1392             :      * push nentries down to significantly less than maxentries, or else we'll
    1393             :      * just end up doing this again very soon.  We shoot for maxentries/2.
    1394             :      */
    1395           4 :     Assert(tbm->iterating == TBM_NOT_ITERATING);
    1396           4 :     Assert(tbm->status == TBM_HASH);
    1397             : 
    1398           4 :     pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
    1399           4 :     while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
    1400             :     {
    1401        2344 :         if (page->ischunk)
    1402           0 :             continue;           /* already a chunk header */
    1403             : 
    1404             :         /*
    1405             :          * If the page would become a chunk header, we won't save anything by
    1406             :          * converting it to lossy, so skip it.
    1407             :          */
    1408        2344 :         if ((page->blockno % PAGES_PER_CHUNK) == 0)
    1409           0 :             continue;
    1410             : 
    1411             :         /* This does the dirty work ... */
    1412        2344 :         tbm_mark_page_lossy(tbm, page->blockno);
    1413             : 
    1414        2344 :         if (tbm->nentries <= tbm->maxentries / 2)
    1415             :         {
    1416             :             /*
    1417             :              * We have made enough room. Remember where to start lossifying
    1418             :              * next round, so we evenly iterate over the hashtable.
    1419             :              */
    1420           4 :             tbm->lossify_start = i.cur;
    1421           4 :             break;
    1422             :         }
    1423             : 
    1424             :         /*
    1425             :          * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
    1426             :          * hashtable and may have deleted the non-lossy chunk.  We can
    1427             :          * continue the same hash table scan, since failure to visit one
    1428             :          * element or visiting the newly inserted element, isn't fatal.
    1429             :          */
    1430             :     }
    1431             : 
    1432             :     /*
    1433             :      * With a big bitmap and small work_mem, it's possible that we cannot get
    1434             :      * under maxentries.  Again, if that happens, we'd end up uselessly
    1435             :      * calling tbm_lossify over and over.  To prevent this from becoming a
    1436             :      * performance sink, force maxentries up to at least double the current
    1437             :      * number of entries.  (In essence, we're admitting inability to fit
    1438             :      * within work_mem when we do this.)  Note that this test will not fire if
    1439             :      * we broke out of the loop early; and if we didn't, the current number of
    1440             :      * entries is simply not reducible any further.
    1441             :      */
    1442           4 :     if (tbm->nentries > tbm->maxentries / 2)
    1443           0 :         tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
    1444           4 : }
    1445             : 
    1446             : /*
    1447             :  * qsort comparator to handle PagetableEntry pointers.
    1448             :  */
    1449             : static int
    1450       22691 : tbm_comparator(const void *left, const void *right)
    1451             : {
    1452       22691 :     BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
    1453       22691 :     BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
    1454             : 
    1455       22691 :     if (l < r)
    1456       10049 :         return -1;
    1457       12642 :     else if (l > r)
    1458       12642 :         return 1;
    1459           0 :     return 0;
    1460             : }
    1461             : 
    1462             : /*
    1463             :  * As above, but this will get index into PagetableEntry array.  Therefore,
    1464             :  * it needs to get actual PagetableEntry using the index before comparing the
    1465             :  * blockno.
    1466             :  */
    1467             : static int
    1468       35923 : tbm_shared_comparator(const void *left, const void *right, void *arg)
    1469             : {
    1470       35923 :     PagetableEntry *base = (PagetableEntry *) arg;
    1471       35923 :     PagetableEntry *lpage = &base[*(int *) left];
    1472       35923 :     PagetableEntry *rpage = &base[*(int *) right];
    1473             : 
    1474       35923 :     if (lpage->blockno < rpage->blockno)
    1475       16153 :         return -1;
    1476       19770 :     else if (lpage->blockno > rpage->blockno)
    1477       19770 :         return 1;
    1478           0 :     return 0;
    1479             : }
    1480             : 
    1481             : /*
    1482             :  *  tbm_attach_shared_iterate
    1483             :  *
    1484             :  *  Allocate a backend-private iterator and attach the shared iterator state
    1485             :  *  to it so that multiple processed can iterate jointly.
    1486             :  *
    1487             :  *  We also converts the DSA pointers to local pointers and store them into
    1488             :  *  our private iterator.
    1489             :  */
    1490             : TBMSharedIterator *
    1491         110 : tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
    1492             : {
    1493             :     TBMSharedIterator *iterator;
    1494             :     TBMSharedIteratorState *istate;
    1495             : 
    1496             :     /*
    1497             :      * Create the TBMSharedIterator struct, with enough trailing space to
    1498             :      * serve the needs of the TBMIterateResult sub-struct.
    1499             :      */
    1500         110 :     iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
    1501             :                                              MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
    1502             : 
    1503         110 :     istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
    1504             : 
    1505         110 :     iterator->state = istate;
    1506             : 
    1507         110 :     iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
    1508             : 
    1509         110 :     if (istate->npages)
    1510         110 :         iterator->ptpages = dsa_get_address(dsa, istate->spages);
    1511         110 :     if (istate->nchunks)
    1512          10 :         iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
    1513             : 
    1514         110 :     return iterator;
    1515             : }
    1516             : 
    1517             : /*
    1518             :  * pagetable_allocate
    1519             :  *
    1520             :  * Callback function for allocating the memory for hashtable elements.
    1521             :  * Allocate memory for hashtable elements, using DSA if available.
    1522             :  */
    1523             : static inline void *
    1524         430 : pagetable_allocate(pagetable_hash *pagetable, Size size)
    1525             : {
    1526         430 :     TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    1527             :     PTEntryArray *ptbase;
    1528             : 
    1529         430 :     if (tbm->dsa == NULL)
    1530         406 :         return MemoryContextAllocExtended(pagetable->ctx, size,
    1531             :                                           MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
    1532             : 
    1533             :     /*
    1534             :      * Save the dsapagetable reference in dsapagetableold before allocating
    1535             :      * new memory so that pagetable_free can free the old entry.
    1536             :      */
    1537          24 :     tbm->dsapagetableold = tbm->dsapagetable;
    1538          24 :     tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
    1539             :                                               sizeof(PTEntryArray) + size,
    1540             :                                               DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
    1541          24 :     ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
    1542             : 
    1543          24 :     return ptbase->ptentry;
    1544             : }
    1545             : 
    1546             : /*
    1547             :  * pagetable_free
    1548             :  *
    1549             :  * Callback function for freeing hash table elements.
    1550             :  */
    1551             : static inline void
    1552         430 : pagetable_free(pagetable_hash *pagetable, void *pointer)
    1553             : {
    1554         430 :     TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    1555             : 
    1556             :     /* pfree the input pointer if DSA is not available */
    1557         430 :     if (tbm->dsa == NULL)
    1558         406 :         pfree(pointer);
    1559          24 :     else if (DsaPointerIsValid(tbm->dsapagetableold))
    1560             :     {
    1561          13 :         dsa_free(tbm->dsa, tbm->dsapagetableold);
    1562          13 :         tbm->dsapagetableold = InvalidDsaPointer;
    1563             :     }
    1564         430 : }

Generated by: LCOV version 1.11