LCOV - code coverage report
Current view: top level - src/backend/access/hash - hashpage.c (source / functions)
Test: PostgreSQL
Date: 2017-09-29 13:40:31

                Hit    Total   Coverage
    Lines:      392      480     81.7 %
    Functions:   18       19     94.7 %

Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * hashpage.c
       4             :  *    Hash table page management code for the Postgres hash access method
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/hash/hashpage.c
      12             :  *
      13             :  * NOTES
      14             :  *    Postgres hash pages look like ordinary relation pages.  The opaque
      15             :  *    data at high addresses includes information about the page including
      16             :  *    whether a page is an overflow page or a true bucket, the bucket
      17             :  *    number, and the block numbers of the preceding and following pages
      18             :  *    in the same bucket.
      19             :  *
      20             :  *    The first page in a hash relation, page zero, is special -- it stores
      21             :  *    information describing the hash table; it is referred to as the
      22             :  *    "meta page." Pages one and higher store the actual data.
      23             :  *
      24             :  *    There are also bitmap pages, which are not manipulated here;
      25             :  *    see hashovfl.c.
      26             :  *
      27             :  *-------------------------------------------------------------------------
      28             :  */
      29             : #include "postgres.h"
      30             : 
      31             : #include "access/hash.h"
      32             : #include "access/hash_xlog.h"
      33             : #include "miscadmin.h"
      34             : #include "storage/lmgr.h"
      35             : #include "storage/smgr.h"
      36             : 
      37             : 
      38             : static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
      39             :                     uint32 nblocks);
      40             : static void _hash_splitbucket(Relation rel, Buffer metabuf,
      41             :                   Bucket obucket, Bucket nbucket,
      42             :                   Buffer obuf,
      43             :                   Buffer nbuf,
      44             :                   HTAB *htab,
      45             :                   uint32 maxbucket,
      46             :                   uint32 highmask, uint32 lowmask);
      47             : static void log_split_page(Relation rel, Buffer buf);
      48             : 
      49             : 
      50             : /*
      51             :  * We use high-concurrency locking on hash indexes (see README for an overview
      52             :  * of the locking rules).  However, we can skip taking lmgr locks when the
      53             :  * index is local to the current backend (ie, either temp or new in the
      54             :  * current transaction).  No one else can see it, so there's no reason to
      55             :  * take locks.  We still take buffer-level locks, but not lmgr locks.
      56             :  */
      57             : #define USELOCKING(rel)     (!RELATION_IS_LOCAL(rel))
      58             : 
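A minimal sketch of how a caller might consult USELOCKING() to skip lmgr locks
on a backend-local index.  The helper below is hypothetical and not part of
hashpage.c, but LockPage()/UnlockPage() are the real lmgr primitives from
storage/lmgr.h, which is already included above.

    /* hypothetical helper, for illustration only */
    static void
    do_locked_work(Relation rel, BlockNumber blkno)
    {
        if (USELOCKING(rel))
            LockPage(rel, blkno, ExclusiveLock);

        /* ... work on the page; buffer-level locks are taken as usual ... */

        if (USELOCKING(rel))
            UnlockPage(rel, blkno, ExclusiveLock);
    }
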
      59             : 
      60             : /*
      61             :  *  _hash_getbuf() -- Get a buffer by block number for read or write.
      62             :  *
      63             :  *      'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
      64             :  *      'flags' is a bitwise OR of the allowed page types.
      65             :  *
      66             :  *      This must be used only to fetch pages that are expected to be valid
      67             :  *      already.  _hash_checkpage() is applied using the given flags.
      68             :  *
      69             :  *      When this routine returns, the appropriate lock is set on the
      70             :  *      requested buffer and its reference count has been incremented
      71             :  *      (ie, the buffer is "locked and pinned").
      72             :  *
      73             :  *      P_NEW is disallowed because this routine can only be used
      74             :  *      to access pages that are known to be before the filesystem EOF.
      75             :  *      Extending the index should be done with _hash_getnewbuf.
      76             :  */
      77             : Buffer
      78      195435 : _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
      79             : {
      80             :     Buffer      buf;
      81             : 
      82      195435 :     if (blkno == P_NEW)
      83           0 :         elog(ERROR, "hash AM does not use P_NEW");
      84             : 
      85      195435 :     buf = ReadBuffer(rel, blkno);
      86             : 
      87      195435 :     if (access != HASH_NOLOCK)
      88      114883 :         LockBuffer(buf, access);
      89             : 
      90             :     /* ref count and lock type are correct */
      91             : 
      92      195435 :     _hash_checkpage(rel, buf, flags);
      93             : 
      94      195435 :     return buf;
      95             : }
      96             : 
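A minimal sketch (not part of hashpage.c) of the usual fetch/examine/release
pattern built on _hash_getbuf().  The helper name is hypothetical, and 'blkno'
is assumed to be a valid bucket block number computed by the caller.

    /* hypothetical helper, for illustration only */
    static void
    examine_bucket_page(Relation rel, BlockNumber blkno)
    {
        Buffer      buf;
        Page        page;
        HashPageOpaque opaque;

        /* read-lock, pin, and verify that this really is a bucket page */
        buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        elog(DEBUG1, "bucket %u", opaque->hasho_bucket);

        /* drop both lock and pin */
        _hash_relbuf(rel, buf);
    }
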
      97             : /*
      98             :  * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
      99             :  *
     100             :  *      We read the page and try to acquire a cleanup lock.  If we get it,
     101             :  *      we return the buffer; otherwise, we return InvalidBuffer.
     102             :  */
     103             : Buffer
     104          72 : _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
     105             : {
     106             :     Buffer      buf;
     107             : 
     108          72 :     if (blkno == P_NEW)
     109           0 :         elog(ERROR, "hash AM does not use P_NEW");
     110             : 
     111          72 :     buf = ReadBuffer(rel, blkno);
     112             : 
     113          72 :     if (!ConditionalLockBufferForCleanup(buf))
     114             :     {
     115           0 :         ReleaseBuffer(buf);
     116           0 :         return InvalidBuffer;
     117             :     }
     118             : 
     119             :     /* ref count and lock type are correct */
     120             : 
     121          72 :     _hash_checkpage(rel, buf, flags);
     122             : 
     123          72 :     return buf;
     124             : }
     125             : 
     126             : /*
     127             :  *  _hash_getinitbuf() -- Get and initialize a buffer by block number.
     128             :  *
     129             :  *      This must be used only to fetch pages that are known to be before
     130             :  *      the index's filesystem EOF, but are to be filled from scratch.
     131             :  *      _hash_pageinit() is applied automatically.  Otherwise it has
     132             :  *      effects similar to _hash_getbuf() with access = HASH_WRITE.
     133             :  *
     134             :  *      When this routine returns, a write lock is set on the
     135             :  *      requested buffer and its reference count has been incremented
     136             :  *      (ie, the buffer is "locked and pinned").
     137             :  *
     138             :  *      P_NEW is disallowed because this routine can only be used
     139             :  *      to access pages that are known to be before the filesystem EOF.
     140             :  *      Extending the index should be done with _hash_getnewbuf.
     141             :  */
     142             : Buffer
     143           8 : _hash_getinitbuf(Relation rel, BlockNumber blkno)
     144             : {
     145             :     Buffer      buf;
     146             : 
     147           8 :     if (blkno == P_NEW)
     148           0 :         elog(ERROR, "hash AM does not use P_NEW");
     149             : 
     150           8 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
     151             :                              NULL);
     152             : 
     153             :     /* ref count and lock type are correct */
     154             : 
     155             :     /* initialize the page */
     156           8 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     157             : 
     158           8 :     return buf;
     159             : }
     160             : 
     161             : /*
     162             :  *  _hash_initbuf() -- Get and initialize a buffer by bucket number.
     163             :  */
     164             : void
     165         428 : _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
     166             :               bool initpage)
     167             : {
     168             :     HashPageOpaque pageopaque;
     169             :     Page        page;
     170             : 
     171         428 :     page = BufferGetPage(buf);
     172             : 
     173             :     /* initialize the page */
     174         428 :     if (initpage)
     175           0 :         _hash_pageinit(page, BufferGetPageSize(buf));
     176             : 
     177         428 :     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
     178             : 
     179             :     /*
      180             :      * Set hasho_prevblkno to the current hashm_maxbucket.  This value will be
     181             :      * used to validate cached HashMetaPageData. See
     182             :      * _hash_getbucketbuf_from_hashkey().
     183             :      */
     184         428 :     pageopaque->hasho_prevblkno = max_bucket;
     185         428 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     186         428 :     pageopaque->hasho_bucket = num_bucket;
     187         428 :     pageopaque->hasho_flag = flag;
     188         428 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     189         428 : }
     190             : 
     191             : /*
     192             :  *  _hash_getnewbuf() -- Get a new page at the end of the index.
     193             :  *
     194             :  *      This has the same API as _hash_getinitbuf, except that we are adding
     195             :  *      a page to the index, and hence expect the page to be past the
     196             :  *      logical EOF.  (However, we have to support the case where it isn't,
     197             :  *      since a prior try might have crashed after extending the filesystem
     198             :  *      EOF but before updating the metapage to reflect the added page.)
     199             :  *
     200             :  *      It is caller's responsibility to ensure that only one process can
     201             :  *      extend the index at a time.  In practice, this function is called
     202             :  *      only while holding write lock on the metapage, because adding a page
     203             :  *      is always associated with an update of metapage data.
     204             :  */
     205             : Buffer
     206         552 : _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
     207             : {
     208         552 :     BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
     209             :     Buffer      buf;
     210             : 
     211         552 :     if (blkno == P_NEW)
     212           0 :         elog(ERROR, "hash AM does not use P_NEW");
     213         552 :     if (blkno > nblocks)
     214           0 :         elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
     215             :              RelationGetRelationName(rel));
     216             : 
     217             :     /* smgr insists we use P_NEW to extend the relation */
     218         552 :     if (blkno == nblocks)
     219             :     {
     220         480 :         buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
     221         480 :         if (BufferGetBlockNumber(buf) != blkno)
     222           0 :             elog(ERROR, "unexpected hash relation size: %u, should be %u",
     223             :                  BufferGetBlockNumber(buf), blkno);
     224         480 :         LockBuffer(buf, HASH_WRITE);
     225             :     }
     226             :     else
     227             :     {
     228          72 :         buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
     229             :                                  NULL);
     230             :     }
     231             : 
     232             :     /* ref count and lock type are correct */
     233             : 
     234             :     /* initialize the page */
     235         552 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     236             : 
     237         552 :     return buf;
     238             : }
     239             : 
     240             : /*
     241             :  *  _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
     242             :  *
     243             :  *      This is identical to _hash_getbuf() but also allows a buffer access
     244             :  *      strategy to be specified.  We use this for VACUUM operations.
     245             :  */
     246             : Buffer
     247         132 : _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
     248             :                            int access, int flags,
     249             :                            BufferAccessStrategy bstrategy)
     250             : {
     251             :     Buffer      buf;
     252             : 
     253         132 :     if (blkno == P_NEW)
     254           0 :         elog(ERROR, "hash AM does not use P_NEW");
     255             : 
     256         132 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
     257             : 
     258         132 :     if (access != HASH_NOLOCK)
     259         132 :         LockBuffer(buf, access);
     260             : 
     261             :     /* ref count and lock type are correct */
     262             : 
     263         132 :     _hash_checkpage(rel, buf, flags);
     264             : 
     265         132 :     return buf;
     266             : }
     267             : 
     268             : /*
     269             :  *  _hash_relbuf() -- release a locked buffer.
     270             :  *
     271             :  * Lock and pin (refcount) are both dropped.
     272             :  */
     273             : void
     274      109431 : _hash_relbuf(Relation rel, Buffer buf)
     275             : {
     276      109431 :     UnlockReleaseBuffer(buf);
     277      109431 : }
     278             : 
     279             : /*
     280             :  *  _hash_dropbuf() -- release an unlocked buffer.
     281             :  *
     282             :  * This is used to unpin a buffer on which we hold no lock.
     283             :  */
     284             : void
     285       86768 : _hash_dropbuf(Relation rel, Buffer buf)
     286             : {
     287       86768 :     ReleaseBuffer(buf);
     288       86768 : }
     289             : 
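A short sketch of the lock-versus-pin distinction: a caller can drop the
buffer lock while keeping the pin, and later use _hash_dropbuf() to drop the
pin too.  The helper is hypothetical; 'buf' is assumed locked and pinned on
entry.

    /* hypothetical helper, for illustration only */
    static void
    peek_then_unpin(Relation rel, Buffer buf)
    {
        /* drop the lock but keep the pin (page stays resident) */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        /* ... work that must not lose the pin, e.g. re-locking later ... */

        /* entirely done with the page: release the pin */
        _hash_dropbuf(rel, buf);
    }
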
     290             : /*
     291             :  *  _hash_dropscanbuf() -- release buffers used in scan.
     292             :  *
     293             :  * This routine unpins the buffers used during scan on which we
     294             :  * hold no lock.
     295             :  */
     296             : void
     297          74 : _hash_dropscanbuf(Relation rel, HashScanOpaque so)
     298             : {
     299             :     /* release pin we hold on primary bucket page */
     300          99 :     if (BufferIsValid(so->hashso_bucket_buf) &&
     301          25 :         so->hashso_bucket_buf != so->hashso_curbuf)
     302          25 :         _hash_dropbuf(rel, so->hashso_bucket_buf);
     303          74 :     so->hashso_bucket_buf = InvalidBuffer;
     304             : 
      305             :     /* release pin we hold on primary bucket page of bucket being split */
     306          74 :     if (BufferIsValid(so->hashso_split_bucket_buf) &&
     307           0 :         so->hashso_split_bucket_buf != so->hashso_curbuf)
     308           0 :         _hash_dropbuf(rel, so->hashso_split_bucket_buf);
     309          74 :     so->hashso_split_bucket_buf = InvalidBuffer;
     310             : 
     311             :     /* release any pin we still hold */
     312          74 :     if (BufferIsValid(so->hashso_curbuf))
     313           0 :         _hash_dropbuf(rel, so->hashso_curbuf);
     314          74 :     so->hashso_curbuf = InvalidBuffer;
     315             : 
     316             :     /* reset split scan */
     317          74 :     so->hashso_buc_populated = false;
     318          74 :     so->hashso_buc_split = false;
     319          74 : }
     320             : 
     321             : 
     322             : /*
     323             :  *  _hash_init() -- Initialize the metadata page of a hash index,
     324             :  *              the initial buckets, and the initial bitmap page.
     325             :  *
     326             :  * The initial number of buckets is dependent on num_tuples, an estimate
     327             :  * of the number of tuples to be loaded into the index initially.  The
     328             :  * chosen number of buckets is returned.
     329             :  *
     330             :  * We are fairly cavalier about locking here, since we know that no one else
     331             :  * could be accessing this index.  In particular the rule about not holding
     332             :  * multiple buffer locks is ignored.
     333             :  */
     334             : uint32
     335          15 : _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
     336             : {
     337             :     Buffer      metabuf;
     338             :     Buffer      buf;
     339             :     Buffer      bitmapbuf;
     340             :     Page        pg;
     341             :     HashMetaPage metap;
     342             :     RegProcedure procid;
     343             :     int32       data_width;
     344             :     int32       item_width;
     345             :     int32       ffactor;
     346             :     uint32      num_buckets;
     347             :     uint32      i;
     348             :     bool        use_wal;
     349             : 
     350             :     /* safety check */
     351          15 :     if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
     352           0 :         elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
     353             :              RelationGetRelationName(rel));
     354             : 
     355             :     /*
     356             :      * WAL log creation of pages if the relation is persistent, or this is the
     357             :      * init fork.  Init forks for unlogged relations always need to be WAL
     358             :      * logged.
     359             :      */
     360          15 :     use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
     361             : 
     362             :     /*
     363             :      * Determine the target fill factor (in tuples per bucket) for this index.
     364             :      * The idea is to make the fill factor correspond to pages about as full
     365             :      * as the user-settable fillfactor parameter says.  We can compute it
     366             :      * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     367             :      */
     368          15 :     data_width = sizeof(uint32);
     369          15 :     item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
     370             :         sizeof(ItemIdData);     /* include the line pointer */
     371          15 :     ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
     372             :     /* keep to a sane range */
     373          15 :     if (ffactor < 10)
     374           0 :         ffactor = 10;
     375             : 
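    /*
     * A worked example of the computation above (the sizes here are
     * assumptions for a typical 64-bit build, not taken from this source):
     * with 8 kB pages, MAXALIGN = 8, sizeof(IndexTupleData) = 8,
     * sizeof(ItemIdData) = 4, and the default hash fillfactor of 75,
     *
     *   item_width = 8 + 8 + 4 = 20 bytes
     *   page usage = 8192 * 75 / 100 = 6144 bytes
     *   ffactor    = 6144 / 20 = 307 tuples per bucket
     */
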
     376          15 :     procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
     377             : 
     378             :     /*
     379             :      * We initialize the metapage, the first N bucket pages, and the first
     380             :      * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     381             :      * calls to occur.  This ensures that the smgr level has the right idea of
     382             :      * the physical index length.
     383             :      *
     384             :      * Critical section not required, because on error the creation of the
     385             :      * whole relation will be rolled back.
     386             :      */
     387          15 :     metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
     388          15 :     _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
     389          15 :     MarkBufferDirty(metabuf);
     390             : 
     391          15 :     pg = BufferGetPage(metabuf);
     392          15 :     metap = HashPageGetMeta(pg);
     393             : 
     394             :     /* XLOG stuff */
     395          15 :     if (use_wal)
     396             :     {
     397             :         xl_hash_init_meta_page xlrec;
     398             :         XLogRecPtr  recptr;
     399             : 
     400          13 :         xlrec.num_tuples = num_tuples;
     401          13 :         xlrec.procid = metap->hashm_procid;
     402          13 :         xlrec.ffactor = metap->hashm_ffactor;
     403             : 
     404          13 :         XLogBeginInsert();
     405          13 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
     406          13 :         XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT);
     407             : 
     408          13 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
     409             : 
     410          13 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     411             :     }
     412             : 
     413          15 :     num_buckets = metap->hashm_maxbucket + 1;
     414             : 
     415             :     /*
     416             :      * Release buffer lock on the metapage while we initialize buckets.
     417             :      * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     418             :      * won't accomplish anything.  It's a bad idea to hold buffer locks for
     419             :      * long intervals in any case, since that can block the bgwriter.
     420             :      */
     421          15 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     422             : 
     423             :     /*
     424             :      * Initialize and WAL Log the first N buckets
     425             :      */
     426         443 :     for (i = 0; i < num_buckets; i++)
     427             :     {
     428             :         BlockNumber blkno;
     429             : 
     430             :         /* Allow interrupts, in case N is huge */
     431         428 :         CHECK_FOR_INTERRUPTS();
     432             : 
     433         428 :         blkno = BUCKET_TO_BLKNO(metap, i);
     434         428 :         buf = _hash_getnewbuf(rel, blkno, forkNum);
     435         428 :         _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
     436         428 :         MarkBufferDirty(buf);
     437             : 
     438         428 :         if (use_wal)
     439         412 :             log_newpage(&rel->rd_node,
     440             :                         forkNum,
     441             :                         blkno,
     442         412 :                         BufferGetPage(buf),
     443             :                         true);
     444         428 :         _hash_relbuf(rel, buf);
     445             :     }
     446             : 
     447             :     /* Now reacquire buffer lock on metapage */
     448          15 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     449             : 
     450             :     /*
     451             :      * Initialize bitmap page
     452             :      */
     453          15 :     bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
     454          15 :     _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
     455          15 :     MarkBufferDirty(bitmapbuf);
     456             : 
     457             :     /* add the new bitmap page to the metapage's list of bitmaps */
     458             :     /* metapage already has a write lock */
     459          15 :     if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
     460           0 :         ereport(ERROR,
     461             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     462             :                  errmsg("out of overflow pages in hash index \"%s\"",
     463             :                         RelationGetRelationName(rel))));
     464             : 
     465          15 :     metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
     466             : 
     467          15 :     metap->hashm_nmaps++;
     468          15 :     MarkBufferDirty(metabuf);
     469             : 
     470             :     /* XLOG stuff */
     471          15 :     if (use_wal)
     472             :     {
     473             :         xl_hash_init_bitmap_page xlrec;
     474             :         XLogRecPtr  recptr;
     475             : 
     476          13 :         xlrec.bmsize = metap->hashm_bmsize;
     477             : 
     478          13 :         XLogBeginInsert();
     479          13 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
     480          13 :         XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
     481             : 
     482             :         /*
     483             :          * This is safe only because nobody else can be modifying the index at
     484             :          * this stage; it's only visible to the transaction that is creating
     485             :          * it.
     486             :          */
     487          13 :         XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
     488             : 
     489          13 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
     490             : 
     491          13 :         PageSetLSN(BufferGetPage(bitmapbuf), recptr);
     492          13 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     493             :     }
     494             : 
     495             :     /* all done */
     496          15 :     _hash_relbuf(rel, bitmapbuf);
     497          15 :     _hash_relbuf(rel, metabuf);
     498             : 
     499          15 :     return num_buckets;
     500             : }
     501             : 
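For concreteness, the initial on-disk layout this function produces for a
four-bucket example index (as described by the comments above):

    block 0:  meta page
    block 1:  bucket 0
    block 2:  bucket 1
    block 3:  bucket 2
    block 4:  bucket 3
    block 5:  first bitmap page
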
     502             : /*
     503             :  *  _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
     504             :  */
     505             : void
     506          15 : _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
     507             :                       uint16 ffactor, bool initpage)
     508             : {
     509             :     HashMetaPage metap;
     510             :     HashPageOpaque pageopaque;
     511             :     Page        page;
     512             :     double      dnumbuckets;
     513             :     uint32      num_buckets;
     514             :     uint32      spare_index;
     515             :     uint32      i;
     516             : 
     517             :     /*
     518             :      * Choose the number of initial bucket pages to match the fill factor
      519             :      * given the estimated number of tuples.  We round the result up to the
      520             :      * total number of buckets that must be allocated before the
      521             :      * corresponding hashm_spares element can be used, but always force at
      522             :      * least 2 bucket pages.  The upper limit is determined by
      523             :      * considerations explained in _hash_expandtable().
     524             :      */
     525          15 :     dnumbuckets = num_tuples / ffactor;
     526          15 :     if (dnumbuckets <= 2.0)
     527           6 :         num_buckets = 2;
     528           9 :     else if (dnumbuckets >= (double) 0x40000000)
     529           0 :         num_buckets = 0x40000000;
     530             :     else
     531           9 :         num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
     532             : 
     533          15 :     spare_index = _hash_spareindex(num_buckets);
     534          15 :     Assert(spare_index < HASH_MAX_SPLITPOINTS);
     535             : 
     536          15 :     page = BufferGetPage(buf);
     537          15 :     if (initpage)
     538           0 :         _hash_pageinit(page, BufferGetPageSize(buf));
     539             : 
     540          15 :     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
     541          15 :     pageopaque->hasho_prevblkno = InvalidBlockNumber;
     542          15 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     543          15 :     pageopaque->hasho_bucket = -1;
     544          15 :     pageopaque->hasho_flag = LH_META_PAGE;
     545          15 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     546             : 
     547          15 :     metap = HashPageGetMeta(page);
     548             : 
     549          15 :     metap->hashm_magic = HASH_MAGIC;
     550          15 :     metap->hashm_version = HASH_VERSION;
     551          15 :     metap->hashm_ntuples = 0;
     552          15 :     metap->hashm_nmaps = 0;
     553          15 :     metap->hashm_ffactor = ffactor;
     554          15 :     metap->hashm_bsize = HashGetMaxBitmapSize(page);
     555             :     /* find largest bitmap array size that will fit in page size */
     556          30 :     for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
     557             :     {
     558          30 :         if ((1 << i) <= metap->hashm_bsize)
     559          15 :             break;
     560             :     }
     561          15 :     Assert(i > 0);
     562          15 :     metap->hashm_bmsize = 1 << i;
     563          15 :     metap->hashm_bmshift = i + BYTE_TO_BIT;
     564          15 :     Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
     565             : 
     566             :     /*
     567             :      * Label the index with its primary hash support function's OID.  This is
     568             :      * pretty useless for normal operation (in fact, hashm_procid is not used
     569             :      * anywhere), but it might be handy for forensic purposes so we keep it.
     570             :      */
     571          15 :     metap->hashm_procid = procid;
     572             : 
     573             :     /*
     574             :      * We initialize the index with N buckets, 0 .. N-1, occupying physical
     575             :      * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     576             :      */
     577          15 :     metap->hashm_maxbucket = num_buckets - 1;
     578             : 
     579             :     /*
      580             :      * Set highmask to the smallest ((2 ^ x) - 1) value >= num_buckets,
      581             :      * i.e. just enough low-order one bits to cover every bucket number.
     582             :      */
     583          15 :     metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
     584          15 :     metap->hashm_lowmask = (metap->hashm_highmask >> 1);
     585             : 
     586          15 :     MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
     587          15 :     MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
     588             : 
     589             :     /* Set up mapping for one spare page after the initial splitpoints */
     590          15 :     metap->hashm_spares[spare_index] = 1;
     591          15 :     metap->hashm_ovflpoint = spare_index;
     592          15 :     metap->hashm_firstfree = 0;
     593             : 
     594             :     /*
      595             :      * Set pd_lower just past the end of the metadata.  This is needed so
      596             :      * that xloginsert.c can log a full-page image of the metapage.
     597             :      */
     598          15 :     ((PageHeader) page)->pd_lower =
     599          15 :         ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
     600          15 : }
     601             : 
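A standalone sketch of the highmask/lowmask arithmetic used in
_hash_init_metabuffer() above.  my_log2() mirrors _hash_log2() (the smallest i
such that (1 << i) >= num); the bucket count is an arbitrary example value.

    #include <stdio.h>
    #include <stdint.h>

    /* smallest i such that (1 << i) >= num, as in _hash_log2() */
    static uint32_t
    my_log2(uint32_t num)
    {
        uint32_t    i;
        uint32_t    limit;

        for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
            ;
        return i;
    }

    int
    main(void)
    {
        uint32_t    num_buckets = 6;    /* example value */
        uint32_t    highmask = (1 << my_log2(num_buckets + 1)) - 1;
        uint32_t    lowmask = highmask >> 1;

        /* prints highmask=7 lowmask=3: masks cover buckets 0..6 */
        printf("highmask=%u lowmask=%u\n", highmask, lowmask);
        return 0;
    }
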
     602             : /*
     603             :  *  _hash_pageinit() -- Initialize a new hash index page.
     604             :  */
     605             : void
     606         583 : _hash_pageinit(Page page, Size size)
     607             : {
     608         583 :     PageInit(page, size, sizeof(HashPageOpaqueData));
     609         583 : }
     610             : 
     611             : /*
     612             :  * Attempt to expand the hash table by creating one new bucket.
     613             :  *
      614             :  * This will silently do nothing if we can't get a cleanup lock on the
      615             :  * old or new bucket.
      616             :  *
      617             :  * Complete any pending split, and remove from the old bucket any tuples
      618             :  * left over from the previous split, before proceeding.
     619             :  *
     620             :  * The caller must hold a pin, but no lock, on the metapage buffer.
     621             :  * The buffer is returned in the same state.
     622             :  */
     623             : void
     624          72 : _hash_expandtable(Relation rel, Buffer metabuf)
     625             : {
     626             :     HashMetaPage metap;
     627             :     Bucket      old_bucket;
     628             :     Bucket      new_bucket;
     629             :     uint32      spare_ndx;
     630             :     BlockNumber start_oblkno;
     631             :     BlockNumber start_nblkno;
     632             :     Buffer      buf_nblkno;
     633             :     Buffer      buf_oblkno;
     634             :     Page        opage;
     635             :     Page        npage;
     636             :     HashPageOpaque oopaque;
     637             :     HashPageOpaque nopaque;
     638             :     uint32      maxbucket;
     639             :     uint32      highmask;
     640             :     uint32      lowmask;
     641          72 :     bool        metap_update_masks = false;
     642          72 :     bool        metap_update_splitpoint = false;
     643             : 
     644             : restart_expand:
     645             : 
     646             :     /*
     647             :      * Write-lock the meta page.  It used to be necessary to acquire a
     648             :      * heavyweight lock to begin a split, but that is no longer required.
     649             :      */
     650          72 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     651             : 
     652          72 :     _hash_checkpage(rel, metabuf, LH_META_PAGE);
     653          72 :     metap = HashPageGetMeta(BufferGetPage(metabuf));
     654             : 
     655             :     /*
     656             :      * Check to see if split is still needed; someone else might have already
     657             :      * done one while we waited for the lock.
     658             :      *
     659             :      * Make sure this stays in sync with _hash_doinsert()
     660             :      */
     661         216 :     if (metap->hashm_ntuples <=
     662         144 :         (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
     663           0 :         goto fail;
     664             : 
     665             :     /*
     666             :      * Can't split anymore if maxbucket has reached its maximum possible
     667             :      * value.
     668             :      *
     669             :      * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     670             :      * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     671             :      * to half that because of overflow looping in _hash_log2() and
     672             :      * insufficient space in hashm_spares[].  It's moot anyway because an
     673             :      * index with 2^32 buckets would certainly overflow BlockNumber and hence
     674             :      * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     675             :      * than a disk block then this would be an independent constraint.
     676             :      *
     677             :      * If you change this, see also the maximum initial number of buckets in
     678             :      * _hash_init().
     679             :      */
     680          72 :     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
     681           0 :         goto fail;
     682             : 
     683             :     /*
     684             :      * Determine which bucket is to be split, and attempt to take cleanup lock
     685             :      * on the old bucket.  If we can't get the lock, give up.
     686             :      *
     687             :      * The cleanup lock protects us not only against other backends, but
     688             :      * against our own backend as well.
     689             :      *
     690             :      * The cleanup lock is mainly to protect the split from concurrent
     691             :      * inserts. See src/backend/access/hash/README, Lock Definitions for
     692             :      * further details.  Due to this locking restriction, if there is any
      693             :      * pending scan, the split will give up, which is unfortunate but harmless.
     694             :      */
     695          72 :     new_bucket = metap->hashm_maxbucket + 1;
     696             : 
     697          72 :     old_bucket = (new_bucket & metap->hashm_lowmask);
     698             : 
     699          72 :     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
     700             : 
     701          72 :     buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
     702          72 :     if (!buf_oblkno)
     703           0 :         goto fail;
     704             : 
     705          72 :     opage = BufferGetPage(buf_oblkno);
     706          72 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
     707             : 
     708             :     /*
      709             :      * We want to finish any incomplete split of the old bucket before
      710             :      * starting a new one: there is no apparent benefit in deferring it, and
      711             :      * finishing a split that involves multiple buckets (considering the
      712             :      * case where the new split also fails) would complicate the code.  We
      713             :      * need not consider the new bucket here, since a re-split of the new
      714             :      * bucket cannot start while a split from the old bucket is still pending.
     715             :      */
     716          72 :     if (H_BUCKET_BEING_SPLIT(oopaque))
     717             :     {
     718             :         /*
      719             :          * Copy bucket mapping info now; refer to the comment in code below where
     720             :          * we copy this information before calling _hash_splitbucket to see
     721             :          * why this is okay.
     722             :          */
     723           0 :         maxbucket = metap->hashm_maxbucket;
     724           0 :         highmask = metap->hashm_highmask;
     725           0 :         lowmask = metap->hashm_lowmask;
     726             : 
     727             :         /*
     728             :          * Release the lock on metapage and old_bucket, before completing the
     729             :          * split.
     730             :          */
     731           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     732           0 :         LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
     733             : 
     734           0 :         _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
     735             :                            highmask, lowmask);
     736             : 
     737             :         /* release the pin on old buffer and retry for expand. */
     738           0 :         _hash_dropbuf(rel, buf_oblkno);
     739             : 
     740           0 :         goto restart_expand;
     741             :     }
     742             : 
     743             :     /*
      744             :      * Clean up the tuples left over from the previous split.  This
      745             :      * operation requires a cleanup lock, and we already have one on the
      746             :      * old bucket, so let's do it.  We also don't want to allow further
      747             :      * splits from the bucket until the garbage from the previous split is
      748             :      * cleaned.  This has two advantages: first, it helps avoid bloat due
      749             :      * to garbage; second, during cleanup of the bucket we can always be
      750             :      * sure that the garbage tuples belong to the most recently split
      751             :      * bucket.  By contrast, if we allowed cleanup of a bucket after the
      752             :      * meta page had been updated to indicate the new split but before the
      753             :      * actual split, the cleanup operation could not decide whether a tuple
      754             :      * had been moved to the newly created bucket, and might delete such tuples.
     755             :      */
     756          72 :     if (H_NEEDS_SPLIT_CLEANUP(oopaque))
     757             :     {
     758             :         /*
     759             :          * Copy bucket mapping info now; refer to the comment in code below
     760             :          * where we copy this information before calling _hash_splitbucket to
     761             :          * see why this is okay.
     762             :          */
     763           0 :         maxbucket = metap->hashm_maxbucket;
     764           0 :         highmask = metap->hashm_highmask;
     765           0 :         lowmask = metap->hashm_lowmask;
     766             : 
     767             :         /* Release the metapage lock. */
     768           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     769             : 
     770           0 :         hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
     771             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
     772             :                           NULL, NULL);
     773             : 
     774           0 :         _hash_dropbuf(rel, buf_oblkno);
     775             : 
     776           0 :         goto restart_expand;
     777             :     }
     778             : 
     779             :     /*
     780             :      * There shouldn't be any active scan on new bucket.
     781             :      *
     782             :      * Note: it is safe to compute the new bucket's blkno here, even though we
     783             :      * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     784             :      * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     785             :      * where we are going to put a new splitpoint's worth of buckets.
     786             :      */
     787          72 :     start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
     788             : 
     789             :     /*
     790             :      * If the split point is increasing we need to allocate a new batch of
     791             :      * bucket pages.
     792             :      */
     793          72 :     spare_ndx = _hash_spareindex(new_bucket + 1);
     794          72 :     if (spare_ndx > metap->hashm_ovflpoint)
     795             :     {
     796             :         uint32      buckets_to_add;
     797             : 
     798           4 :         Assert(spare_ndx == metap->hashm_ovflpoint + 1);
     799             : 
     800             :         /*
     801             :          * We treat allocation of buckets as a separate WAL-logged action.
      802             :          * Even if we fail after this operation, we won't leak bucket pages;
     803             :          * rather, the next split will consume this space. In any case, even
     804             :          * without failure we don't use all the space in one split operation.
     805             :          */
     806           4 :         buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
     807           4 :         if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
     808             :         {
     809             :             /* can't split due to BlockNumber overflow */
     810           0 :             _hash_relbuf(rel, buf_oblkno);
     811           0 :             goto fail;
     812             :         }
     813             :     }
     814             : 
     815             :     /*
     816             :      * Physically allocate the new bucket's primary page.  We want to do this
     817             :      * before changing the metapage's mapping info, in case we can't get the
      818             :      * disk space.  In principle we don't need a cleanup lock on the new
      819             :      * bucket, since no other backend can find it until the meta page is
      820             :      * updated; however, it is good to be consistent with old bucket locking.
     821             :      */
     822          72 :     buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
     823          72 :     if (!IsBufferCleanupOK(buf_nblkno))
     824             :     {
     825           0 :         _hash_relbuf(rel, buf_oblkno);
     826           0 :         _hash_relbuf(rel, buf_nblkno);
     827           0 :         goto fail;
     828             :     }
     829             : 
     830             :     /*
     831             :      * Since we are scribbling on the pages in the shared buffers, establish a
     832             :      * critical section.  Any failure in this next code leaves us with a big
     833             :      * problem: the metapage is effectively corrupt but could get written back
     834             :      * to disk.
     835             :      */
     836          72 :     START_CRIT_SECTION();
     837             : 
     838             :     /*
     839             :      * Okay to proceed with split.  Update the metapage bucket mapping info.
     840             :      */
     841          72 :     metap->hashm_maxbucket = new_bucket;
     842             : 
     843          72 :     if (new_bucket > metap->hashm_highmask)
     844             :     {
     845             :         /* Starting a new doubling */
     846           3 :         metap->hashm_lowmask = metap->hashm_highmask;
     847           3 :         metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
     848           3 :         metap_update_masks = true;
     849             :     }
     850             : 
     851             :     /*
     852             :      * If the split point is increasing we need to adjust the hashm_spares[]
     853             :      * array and hashm_ovflpoint so that future overflow pages will be created
     854             :      * beyond this new batch of bucket pages.
     855             :      */
     856          72 :     if (spare_ndx > metap->hashm_ovflpoint)
     857             :     {
     858           4 :         metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
     859           4 :         metap->hashm_ovflpoint = spare_ndx;
     860           4 :         metap_update_splitpoint = true;
     861             :     }
     862             : 
     863          72 :     MarkBufferDirty(metabuf);
     864             : 
     865             :     /*
     866             :      * Copy bucket mapping info now; this saves re-accessing the meta page
     867             :      * inside _hash_splitbucket's inner loop.  Note that once we drop the
     868             :      * split lock, other splits could begin, so these values might be out of
     869             :      * date before _hash_splitbucket finishes.  That's okay, since all it
     870             :      * needs is to tell which of these two buckets to map hashkeys into.
     871             :      */
     872          72 :     maxbucket = metap->hashm_maxbucket;
     873          72 :     highmask = metap->hashm_highmask;
     874          72 :     lowmask = metap->hashm_lowmask;
     875             : 
     876          72 :     opage = BufferGetPage(buf_oblkno);
     877          72 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
     878             : 
     879             :     /*
     880             :      * Mark the old bucket to indicate that split is in progress.  (At
     881             :      * operation end, we will clear the split-in-progress flag.)  Also, for a
     882             :      * primary bucket page, hasho_prevblkno stores the number of buckets that
     883             :      * existed as of the last split, so we must update that value here.
     884             :      */
     885          72 :     oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
     886          72 :     oopaque->hasho_prevblkno = maxbucket;
     887             : 
     888          72 :     MarkBufferDirty(buf_oblkno);
     889             : 
     890          72 :     npage = BufferGetPage(buf_nblkno);
     891             : 
     892             :     /*
     893             :      * initialize the new bucket's primary page and mark it to indicate that
     894             :      * split is in progress.
     895             :      */
     896          72 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
     897          72 :     nopaque->hasho_prevblkno = maxbucket;
     898          72 :     nopaque->hasho_nextblkno = InvalidBlockNumber;
     899          72 :     nopaque->hasho_bucket = new_bucket;
     900          72 :     nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
     901          72 :     nopaque->hasho_page_id = HASHO_PAGE_ID;
     902             : 
     903          72 :     MarkBufferDirty(buf_nblkno);
     904             : 
     905             :     /* XLOG stuff */
     906          72 :     if (RelationNeedsWAL(rel))
     907             :     {
     908             :         xl_hash_split_allocate_page xlrec;
     909             :         XLogRecPtr  recptr;
     910             : 
     911          72 :         xlrec.new_bucket = maxbucket;
     912          72 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
     913          72 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
     914          72 :         xlrec.flags = 0;
     915             : 
     916          72 :         XLogBeginInsert();
     917             : 
     918          72 :         XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
     919          72 :         XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
     920          72 :         XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
     921             : 
     922          72 :         if (metap_update_masks)
     923             :         {
     924           3 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
     925           3 :             XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
     926           3 :             XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
     927             :         }
     928             : 
     929          72 :         if (metap_update_splitpoint)
     930             :         {
     931           4 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
     932           4 :             XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
     933             :                                 sizeof(uint32));
     934           4 :             XLogRegisterBufData(2,
     935           4 :                                 (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
     936             :                                 sizeof(uint32));
     937             :         }
     938             : 
     939          72 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
     940             : 
     941          72 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
     942             : 
     943          72 :         PageSetLSN(BufferGetPage(buf_oblkno), recptr);
     944          72 :         PageSetLSN(BufferGetPage(buf_nblkno), recptr);
     945          72 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     946             :     }
     947             : 
     948          72 :     END_CRIT_SECTION();
     949             : 
     950             :     /* drop lock, but keep pin */
     951          72 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     952             : 
     953             :     /* Relocate records to the new bucket */
     954          72 :     _hash_splitbucket(rel, metabuf,
     955             :                       old_bucket, new_bucket,
     956             :                       buf_oblkno, buf_nblkno, NULL,
     957             :                       maxbucket, highmask, lowmask);
     958             : 
     959             :     /* all done, now release the pins on primary buckets. */
     960          72 :     _hash_dropbuf(rel, buf_oblkno);
     961          72 :     _hash_dropbuf(rel, buf_nblkno);
     962             : 
     963         144 :     return;
     964             : 
      965             :     /* Here if we decided not to split or failed to acquire the old bucket lock */
     966             : fail:
     967             : 
     968             :     /* We didn't write the metapage, so just drop lock */
     969           0 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     970             : }
     971             : 
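A worked example of the bucket-mapping arithmetic used in this function
(old_bucket = new_bucket & lowmask, computed near its top): suppose buckets
0..12 exist, so highmask = 15 and lowmask = 7, and bucket 13 is being created.
Then old_bucket = 13 & 7 = 5, and _hash_splitbucket() relocates exactly those
tuples in bucket 5 whose hash values map to the new bucket under the new
masks, i.e. those with (hash & 15) == 13.
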
     972             : 
     973             : /*
     974             :  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
     975             :  *
     976             :  * This does not need to initialize the new bucket pages; we'll do that as
     977             :  * each one is used by _hash_expandtable().  But we have to extend the logical
     978             :  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
     979             :  * sync with ours, so that we don't get complaints from smgr.
     980             :  *
     981             :  * We do this by writing a page of zeroes at the end of the splitpoint range.
     982             :  * We expect that the filesystem will ensure that the intervening pages read
     983             :  * as zeroes too.  On many filesystems this "hole" will not be allocated
     984             :  * immediately, which means that the index file may end up more fragmented
     985             :  * than if we forced it all to be allocated now; but since we don't scan
     986             :  * hash indexes sequentially anyway, that probably doesn't matter.
     987             :  *
     988             :  * XXX It's annoying that this code is executed with the metapage lock held.
     989             :  * We need to interlock against _hash_addovflpage() adding a new overflow page
     990             :  * concurrently, but it'd likely be better to use LockRelationForExtension
     991             :  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
     992             :  * so it may not be worth worrying about.
     993             :  *
     994             :  * Returns TRUE if successful, or FALSE if allocation failed due to
     995             :  * BlockNumber overflow.
     996             :  */
     997             : static bool
     998           4 : _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
     999             : {
    1000             :     BlockNumber lastblock;
    1001             :     char        zerobuf[BLCKSZ];
    1002             :     Page        page;
    1003             :     HashPageOpaque ovflopaque;
    1004             : 
    1005           4 :     lastblock = firstblock + nblocks - 1;
    1006             : 
    1007             :     /*
    1008             :      * Check for overflow in the block number calculation; if it occurs, we
    1009             :      * cannot extend the index any further.
    1010             :      */
    1011           4 :     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
    1012           0 :         return false;
    1013             : 
    1014           4 :     page = (Page) zerobuf;
    1015             : 
    1016             :     /*
    1017             :      * Initialize the page.  Just zeroing the page won't work; see
    1018             :      * _hash_freeovflpage for similar usage.  We take care to make the special
    1019             :      * space valid for the benefit of tools such as pageinspect.
    1020             :      */
    1021           4 :     _hash_pageinit(page, BLCKSZ);
    1022             : 
    1023           4 :     ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    1024             : 
    1025           4 :     ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    1026           4 :     ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    1027           4 :     ovflopaque->hasho_bucket = -1;
    1028           4 :     ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    1029           4 :     ovflopaque->hasho_page_id = HASHO_PAGE_ID;
    1030             : 
    1031           4 :     if (RelationNeedsWAL(rel))
    1032           4 :         log_newpage(&rel->rd_node,
    1033             :                     MAIN_FORKNUM,
    1034             :                     lastblock,
    1035             :                     zerobuf,
    1036             :                     true);
    1037             : 
    1038           4 :     RelationOpenSmgr(rel);
    1039           4 :     smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
    1040             : 
    1041           4 :     return true;
    1042             : }
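
The overflow test above relies on BlockNumber being an unsigned 32-bit type:
if firstblock + nblocks - 1 wraps past 2^32 - 1, the sum compares less than
firstblock.  A minimal standalone illustration of the same check; the values
here are made up for the example.

    #include <assert.h>
    #include <stdint.h>

    typedef uint32_t BlockNumber;
    #define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

    int
    main(void)
    {
        BlockNumber firstblock = 0xFFFFFFF0;
        uint32_t    nblocks = 0x20;     /* the range crosses 2^32 - 1 */
        BlockNumber lastblock = firstblock + nblocks - 1;

        /* wraparound leaves lastblock smaller than firstblock */
        assert(lastblock < firstblock || lastblock == InvalidBlockNumber);
        return 0;
    }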
    1043             : 
    1044             : 
    1045             : /*
    1046             :  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
    1047             :  *
    1048             :  * This routine partitions the tuples between the old and the new bucket; it
    1049             :  * is also used to finish incomplete split operations.  To finish a
    1050             :  * previously interrupted split, the caller must fill htab.  If htab is set,
    1051             :  * we skip moving any tuple that already exists in htab; a NULL htab
    1052             :  * indicates that all the tuples belonging to the new bucket still need to
    1053             :  * be moved.
    1054             :  *
    1055             :  * We are splitting a bucket that consists of a base bucket page and zero
    1056             :  * or more overflow (bucket chain) pages.  We must relocate tuples that
    1057             :  * belong in the new bucket.
    1058             :  *
    1059             :  * The caller must hold cleanup locks on both buckets to ensure that
    1060             :  * no one else is trying to access them (see README).
    1061             :  *
    1062             :  * The caller must hold a pin, but no lock, on the metapage buffer.
    1063             :  * The buffer is returned in the same state.  (The metapage is only
    1064             :  * touched if it becomes necessary to add or remove overflow pages.)
    1065             :  *
    1066             :  * The split needs to retain pins on the primary bucket pages of both the old
    1067             :  * and new buckets until the end of the operation, to prevent vacuum from
    1068             :  * starting while the split is in progress.
    1069             :  *
    1070             :  * In addition, the caller must have created the new bucket's base page,
    1071             :  * which is passed in buffer nbuf, pinned and write-locked.  The lock will be
    1072             :  * released here and pin must be released by the caller.  (The API is set up
    1073             :  * this way because we must do _hash_getnewbuf() before releasing the metapage
    1074             :  * write lock.  So instead of passing the new bucket's start block number, we
    1075             :  * pass an actual buffer.)
    1076             :  */
    1077             : static void
    1078          72 : _hash_splitbucket(Relation rel,
    1079             :                   Buffer metabuf,
    1080             :                   Bucket obucket,
    1081             :                   Bucket nbucket,
    1082             :                   Buffer obuf,
    1083             :                   Buffer nbuf,
    1084             :                   HTAB *htab,
    1085             :                   uint32 maxbucket,
    1086             :                   uint32 highmask,
    1087             :                   uint32 lowmask)
    1088             : {
    1089             :     Buffer      bucket_obuf;
    1090             :     Buffer      bucket_nbuf;
    1091             :     Page        opage;
    1092             :     Page        npage;
    1093             :     HashPageOpaque oopaque;
    1094             :     HashPageOpaque nopaque;
    1095             :     OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
    1096             :     IndexTuple  itups[MaxIndexTuplesPerPage];
    1097          72 :     Size        all_tups_size = 0;
    1098             :     int         i;
    1099          72 :     uint16      nitups = 0;
    1100             : 
    1101          72 :     bucket_obuf = obuf;
    1102          72 :     opage = BufferGetPage(obuf);
    1103          72 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1104             : 
    1105          72 :     bucket_nbuf = nbuf;
    1106          72 :     npage = BufferGetPage(nbuf);
    1107          72 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1108             : 
    1109             :     /*
    1110             :      * Partition the tuples in the old bucket between the old bucket and the
    1111             :      * new bucket, advancing along the old bucket's overflow bucket chain and
    1112             :      * adding overflow pages to the new bucket as needed.  The outer loop
    1113             :      * iterates once per page in the old bucket.
    1114             :      */
    1115             :     for (;;)
    1116             :     {
    1117             :         BlockNumber oblkno;
    1118             :         OffsetNumber ooffnum;
    1119             :         OffsetNumber omaxoffnum;
    1120             : 
    1121             :         /* Scan each tuple in old page */
    1122         123 :         omaxoffnum = PageGetMaxOffsetNumber(opage);
    1123       44114 :         for (ooffnum = FirstOffsetNumber;
    1124             :              ooffnum <= omaxoffnum;
    1125       43868 :              ooffnum = OffsetNumberNext(ooffnum))
    1126             :         {
    1127             :             IndexTuple  itup;
    1128             :             Size        itemsz;
    1129             :             Bucket      bucket;
    1130       43868 :             bool        found = false;
    1131             : 
    1132             :             /* skip dead tuples */
    1133       43868 :             if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
    1134           0 :                 continue;
    1135             : 
    1136             :             /*
    1137             :              * Before inserting a tuple, probe the hash table containing the
    1138             :              * TIDs of tuples belonging to the new bucket.  If we find a
    1139             :              * match, skip that tuple; otherwise fetch the item's hash key
    1140             :              * (conveniently stored in the item) and determine which bucket
    1141             :              * it now belongs in.
    1142             :              */
    1143       43868 :             itup = (IndexTuple) PageGetItem(opage,
    1144             :                                             PageGetItemId(opage, ooffnum));
    1145             : 
    1146       43868 :             if (htab)
    1147           0 :                 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
    1148             : 
    1149       43868 :             if (found)
    1150           0 :                 continue;
    1151             : 
    1152       43868 :             bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
    1153             :                                           maxbucket, highmask, lowmask);
    1154             : 
    1155       43868 :             if (bucket == nbucket)
    1156             :             {
    1157             :                 IndexTuple  new_itup;
    1158             : 
    1159             :                 /*
    1160             :                  * make a copy of the index tuple, as we have to scribble on it.
    1161             :                  */
    1162       16530 :                 new_itup = CopyIndexTuple(itup);
    1163             : 
    1164             :                 /*
    1165             :                  * mark the index tuple as moved by split; such tuples are
    1166             :                  * skipped by scans while a split is in progress for a bucket.
    1167             :                  */
    1168       16530 :                 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
    1169             : 
    1170             :                 /*
    1171             :                  * insert the tuple into the new bucket.  if it doesn't fit on
    1172             :                  * the current page in the new bucket, we must allocate a new
    1173             :                  * overflow page and place the tuple on that page instead.
    1174             :                  */
    1175       16530 :                 itemsz = IndexTupleDSize(*new_itup);
    1176       16530 :                 itemsz = MAXALIGN(itemsz);
    1177             : 
    1178       16530 :                 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
    1179             :                 {
    1180             :                     /*
    1181             :                      * Change the shared buffer state in a critical section;
    1182             :                      * otherwise any error could make it unrecoverable.
    1183             :                      */
    1184          11 :                     START_CRIT_SECTION();
    1185             : 
    1186          11 :                     _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1187          11 :                     MarkBufferDirty(nbuf);
    1188             :                     /* log the split operation before releasing the lock */
    1189          11 :                     log_split_page(rel, nbuf);
    1190             : 
    1191          11 :                     END_CRIT_SECTION();
    1192             : 
    1193             :                     /* drop lock, but keep pin */
    1194          11 :                     LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1195             : 
    1196             :                     /* be tidy */
    1197        5610 :                     for (i = 0; i < nitups; i++)
    1198        5599 :                         pfree(itups[i]);
    1199          11 :                     nitups = 0;
    1200          11 :                     all_tups_size = 0;
    1201             : 
    1202             :                     /* chain to a new overflow page */
    1203          11 :                     nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
    1204          11 :                     npage = BufferGetPage(nbuf);
    1205          11 :                     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1206             :                 }
    1207             : 
    1208       16530 :                 itups[nitups++] = new_itup;
    1209       16530 :                 all_tups_size += itemsz;
    1210             :             }
    1211             :             else
    1212             :             {
    1213             :                 /*
    1214             :                  * the tuple stays on this page, so nothing to do.
    1215             :                  */
    1216       27338 :                 Assert(bucket == obucket);
    1217             :             }
    1218             :         }
    1219             : 
    1220         123 :         oblkno = oopaque->hasho_nextblkno;
    1221             : 
    1222             :         /* retain the pin on the old primary bucket */
    1223         123 :         if (obuf == bucket_obuf)
    1224          72 :             LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1225             :         else
    1226          51 :             _hash_relbuf(rel, obuf);
    1227             : 
    1228             :         /* Exit loop if no more overflow pages in old bucket */
    1229         123 :         if (!BlockNumberIsValid(oblkno))
    1230             :         {
    1231             :             /*
    1232             :              * Change the shared buffer state in a critical section; otherwise
    1233             :              * any error could make it unrecoverable.
    1234             :              */
    1235          72 :             START_CRIT_SECTION();
    1236             : 
    1237          72 :             _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1238          72 :             MarkBufferDirty(nbuf);
    1239             :             /* log the split operation before releasing the lock */
    1240          72 :             log_split_page(rel, nbuf);
    1241             : 
    1242          72 :             END_CRIT_SECTION();
    1243             : 
    1244          72 :             if (nbuf == bucket_nbuf)
    1245          71 :                 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1246             :             else
    1247           1 :                 _hash_relbuf(rel, nbuf);
    1248             : 
    1249             :             /* be tidy */
    1250       11003 :             for (i = 0; i < nitups; i++)
    1251       10931 :                 pfree(itups[i]);
    1252          72 :             break;
    1253             :         }
    1254             : 
    1255             :         /* Else, advance to next old page */
    1256          51 :         obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
    1257          51 :         opage = BufferGetPage(obuf);
    1258          51 :         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1259          51 :     }
    1260             : 
    1261             :     /*
    1262             :      * We're at the end of the old bucket chain, so we're done partitioning
    1263             :      * the tuples.  Mark the old and new buckets to indicate that the split is
    1264             :      * finished.
    1265             :      *
    1266             :      * To avoid deadlocks due to the locking order of buckets, first lock the
    1267             :      * old bucket and then the new bucket.
    1268             :      */
    1269          72 :     LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
    1270          72 :     opage = BufferGetPage(bucket_obuf);
    1271          72 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1272             : 
    1273          72 :     LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
    1274          72 :     npage = BufferGetPage(bucket_nbuf);
    1275          72 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1276             : 
    1277          72 :     START_CRIT_SECTION();
    1278             : 
    1279          72 :     oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
    1280          72 :     nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
    1281             : 
    1282             :     /*
    1283             :      * After the split is finished, mark the old bucket to indicate that it
    1284             :      * contains deletable tuples.  We will clear the split-cleanup flag after
    1285             :      * deleting such tuples, either at the end of the split, at the next split
    1286             :      * from the old bucket, or at the time of vacuum.
    1287             :      */
    1288          72 :     oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    1289             : 
    1290             :     /*
    1291             :      * now write the buffers; we don't release the locks here, as the caller
    1292             :      * is responsible for releasing them.
    1293             :      */
    1294          72 :     MarkBufferDirty(bucket_obuf);
    1295          72 :     MarkBufferDirty(bucket_nbuf);
    1296             : 
    1297          72 :     if (RelationNeedsWAL(rel))
    1298             :     {
    1299             :         XLogRecPtr  recptr;
    1300             :         xl_hash_split_complete xlrec;
    1301             : 
    1302          72 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
    1303          72 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
    1304             : 
    1305          72 :         XLogBeginInsert();
    1306             : 
    1307          72 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
    1308             : 
    1309          72 :         XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
    1310          72 :         XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
    1311             : 
    1312          72 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
    1313             : 
    1314          72 :         PageSetLSN(BufferGetPage(bucket_obuf), recptr);
    1315          72 :         PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
    1316             :     }
    1317             : 
    1318          72 :     END_CRIT_SECTION();
    1319             : 
    1320             :     /*
    1321             :      * If possible, clean up the old bucket.  We might not be able to do this
    1322             :      * if someone else has a pin on it, but if not then we can go ahead.  This
    1323             :      * isn't absolutely necessary, but it reduces bloat; if we don't do it
    1324             :      * now, VACUUM will do it eventually, but maybe not until new overflow
    1325             :      * pages have been allocated.  Note that there's no need to clean up the
    1326             :      * new bucket.
    1327             :      */
    1328          72 :     if (IsBufferCleanupOK(bucket_obuf))
    1329             :     {
    1330          72 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1331          72 :         hashbucketcleanup(rel, obucket, bucket_obuf,
    1332             :                           BufferGetBlockNumber(bucket_obuf), NULL,
    1333             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
    1334             :                           NULL, NULL);
    1335             :     }
    1336             :     else
    1337             :     {
    1338           0 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1339           0 :         LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
    1340             :     }
    1341          72 : }
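
The partitioning above hinges on _hash_hashkey2bucket() (defined in
hashutil.c) mapping each hash key to a bucket using the maxbucket, highmask,
and lowmask values captured from the metapage.  In outline, the mapping masks
with highmask first and falls back to lowmask when the resulting bucket does
not exist yet at the current splitpoint; a sketch of that mapping (the
function name here is made up, not the actual definition):

static Bucket
hashkey2bucket_sketch(uint32 hashkey, uint32 maxbucket,
                      uint32 highmask, uint32 lowmask)
{
    Bucket      bucket;

    bucket = hashkey & highmask;

    /* a bucket beyond maxbucket hasn't been split off yet; use its parent */
    if (bucket > maxbucket)
        bucket = bucket & lowmask;

    return bucket;
}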
    1342             : 
    1343             : /*
    1344             :  *  _hash_finish_split() -- Finish the previously interrupted split operation
    1345             :  *
    1346             :  * To complete the split, we build a hash table of the TIDs already present
    1347             :  * in the new bucket; the split operation then uses it to skip tuples that
    1348             :  * were moved before the split was interrupted.
    1349             :  *
    1350             :  * The caller must hold a pin, but no lock, on the metapage and old bucket's
    1351             :  * primary page buffer.  The buffers are returned in the same state.  (The
    1352             :  * metapage is only touched if it becomes necessary to add or remove overflow
    1353             :  * pages.)
    1354             :  */
    1355             : void
    1356           0 : _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
    1357             :                    uint32 maxbucket, uint32 highmask, uint32 lowmask)
    1358             : {
    1359             :     HASHCTL     hash_ctl;
    1360             :     HTAB       *tidhtab;
    1361           0 :     Buffer      bucket_nbuf = InvalidBuffer;
    1362             :     Buffer      nbuf;
    1363             :     Page        npage;
    1364             :     BlockNumber nblkno;
    1365             :     BlockNumber bucket_nblkno;
    1366             :     HashPageOpaque npageopaque;
    1367             :     Bucket      nbucket;
    1368             :     bool        found;
    1369             : 
    1370             :     /* Initialize the hash table used to track TIDs */
    1371           0 :     memset(&hash_ctl, 0, sizeof(hash_ctl));
    1372           0 :     hash_ctl.keysize = sizeof(ItemPointerData);
    1373           0 :     hash_ctl.entrysize = sizeof(ItemPointerData);
    1374           0 :     hash_ctl.hcxt = CurrentMemoryContext;
    1375             : 
    1376           0 :     tidhtab =
    1377             :         hash_create("bucket ctids",
    1378             :                     256,        /* arbitrary initial size */
    1379             :                     &hash_ctl,
    1380             :                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1381             : 
    1382           0 :     bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
    1383             : 
    1384             :     /*
    1385             :      * Scan the new bucket and build a hash table of its TIDs
    1386             :      */
    1387             :     for (;;)
    1388             :     {
    1389             :         OffsetNumber noffnum;
    1390             :         OffsetNumber nmaxoffnum;
    1391             : 
    1392           0 :         nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
    1393             :                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    1394             : 
    1395             :         /* remember the primary bucket buffer to acquire a cleanup lock on it */
    1396           0 :         if (nblkno == bucket_nblkno)
    1397           0 :             bucket_nbuf = nbuf;
    1398             : 
    1399           0 :         npage = BufferGetPage(nbuf);
    1400           0 :         npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1401             : 
    1402             :         /* Scan each tuple in new page */
    1403           0 :         nmaxoffnum = PageGetMaxOffsetNumber(npage);
    1404           0 :         for (noffnum = FirstOffsetNumber;
    1405             :              noffnum <= nmaxoffnum;
    1406           0 :              noffnum = OffsetNumberNext(noffnum))
    1407             :         {
    1408             :             IndexTuple  itup;
    1409             : 
    1410             :             /* Fetch the item's TID and insert it in the hash table. */
    1411           0 :             itup = (IndexTuple) PageGetItem(npage,
    1412             :                                             PageGetItemId(npage, noffnum));
    1413             : 
    1414           0 :             (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
    1415             : 
    1416           0 :             Assert(!found);
    1417             :         }
    1418             : 
    1419           0 :         nblkno = npageopaque->hasho_nextblkno;
    1420             : 
    1421             :         /*
    1422             :          * release our write lock without modifying buffer and ensure to
    1423             :          * release our lock without modifying the buffer, making sure to
    1424             :          * retain the pin on the primary bucket.
    1425           0 :         if (nbuf == bucket_nbuf)
    1426           0 :             LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1427             :         else
    1428           0 :             _hash_relbuf(rel, nbuf);
    1429             : 
    1430             :         /* Exit loop if no more overflow pages in new bucket */
    1431           0 :         if (!BlockNumberIsValid(nblkno))
    1432           0 :             break;
    1433           0 :     }
    1434             : 
    1435             :     /*
    1436             :      * Conditionally get the cleanup lock on old and new buckets to perform
    1437             :      * the split operation.  If we can't get the cleanup locks, silently give
    1438             :      * up; the next insertion on the old bucket will try again to complete
    1439             :      * the split.
    1440             :      */
    1441           0 :     if (!ConditionalLockBufferForCleanup(obuf))
    1442             :     {
    1443           0 :         hash_destroy(tidhtab);
    1444           0 :         return;
    1445             :     }
    1446           0 :     if (!ConditionalLockBufferForCleanup(bucket_nbuf))
    1447             :     {
    1448           0 :         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1449           0 :         hash_destroy(tidhtab);
    1450           0 :         return;
    1451             :     }
    1452             : 
    1453           0 :     npage = BufferGetPage(bucket_nbuf);
    1454           0 :     npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1455           0 :     nbucket = npageopaque->hasho_bucket;
    1456             : 
    1457           0 :     _hash_splitbucket(rel, metabuf, obucket,
    1458             :                       nbucket, obuf, bucket_nbuf, tidhtab,
    1459             :                       maxbucket, highmask, lowmask);
    1460             : 
    1461           0 :     _hash_dropbuf(rel, bucket_nbuf);
    1462           0 :     hash_destroy(tidhtab);
    1463             : }
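
The TID table above uses the backend's generic dynahash facility: HASH_BLOBS
makes the key hash over raw bytes, and an entrysize equal to keysize stores
bare keys with no payload.  A minimal sketch of the same
create/insert/probe/destroy cycle; the function name and tid argument are
hypothetical, for illustration only.

static void
tid_table_sketch(ItemPointer tid)
{
    HASHCTL     hash_ctl;
    HTAB       *htab;
    bool        found;

    memset(&hash_ctl, 0, sizeof(hash_ctl));
    hash_ctl.keysize = sizeof(ItemPointerData);
    hash_ctl.entrysize = sizeof(ItemPointerData);
    hash_ctl.hcxt = CurrentMemoryContext;

    htab = hash_create("tid sketch",
                       256,     /* arbitrary initial size */
                       &hash_ctl,
                       HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

    (void) hash_search(htab, tid, HASH_ENTER, &found);  /* insert */
    (void) hash_search(htab, tid, HASH_FIND, &found);   /* probe */
    Assert(found);

    hash_destroy(htab);
}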
    1464             : 
    1465             : /*
    1466             :  *  log_split_page() -- Log the split operation
    1467             :  *
    1468             :  *  We log the split operation when the new page in the new bucket gets
    1469             :  *  full, so we log the entire page.
    1470             :  *
    1471             :  *  'buf' must be locked by the caller, which is also responsible for
    1472             :  *  unlocking it.
    1473             :  */
    1474             : static void
    1475          83 : log_split_page(Relation rel, Buffer buf)
    1476             : {
    1477          83 :     if (RelationNeedsWAL(rel))
    1478             :     {
    1479             :         XLogRecPtr  recptr;
    1480             : 
    1481          83 :         XLogBeginInsert();
    1482             : 
    1483          83 :         XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
    1484             : 
    1485          83 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
    1486             : 
    1487          83 :         PageSetLSN(BufferGetPage(buf), recptr);
    1488             :     }
    1489          83 : }
    1490             : 
    1491             : /*
    1492             :  *  _hash_getcachedmetap() -- Returns cached metapage data.
    1493             :  *
    1494             :  *  If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
    1495             :  *  the metapage.  If it is InvalidBuffer, we'll set it before returning if we
    1496             :  *  have to refresh the cache, and return with a pin but no lock on it; the
    1497             :  *  caller is responsible for releasing the pin.
    1498             :  *
    1499             :  *  We refresh the cache if it's not initialized yet or force_refresh is true.
    1500             :  */
    1501             : HashMetaPage
    1502       80642 : _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
    1503             : {
    1504             :     Page        page;
    1505             : 
    1506       80642 :     Assert(metabuf);
    1507       80642 :     if (force_refresh || rel->rd_amcache == NULL)
    1508             :     {
    1509          85 :         char       *cache = NULL;
    1510             : 
    1511             :         /*
    1512             :          * It's important that we don't set rd_amcache to an invalid value.
    1513             :          * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
    1514             :          * install a pointer to the newly-allocated storage in the actual
    1515             :          * relcache entry until both have succeeded.
    1516             :          */
    1517          85 :         if (rel->rd_amcache == NULL)
    1518          20 :             cache = MemoryContextAlloc(rel->rd_indexcxt,
    1519             :                                        sizeof(HashMetaPageData));
    1520             : 
    1521             :         /* Read the metapage. */
    1522          85 :         if (BufferIsValid(*metabuf))
    1523           0 :             LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
    1524             :         else
    1525          85 :             *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
    1526             :                                     LH_META_PAGE);
    1527          85 :         page = BufferGetPage(*metabuf);
    1528             : 
    1529             :         /* Populate the cache. */
    1530          85 :         if (rel->rd_amcache == NULL)
    1531          20 :             rel->rd_amcache = cache;
    1532          85 :         memcpy(rel->rd_amcache, HashPageGetMeta(page),
    1533             :                sizeof(HashMetaPageData));
    1534             : 
    1535             :         /* Release metapage lock, but keep the pin. */
    1536          85 :         LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
    1537             :     }
    1538             : 
    1539       80642 :     return (HashMetaPage) rel->rd_amcache;
    1540             : }
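
A typical caller-side pattern for _hash_getcachedmetap(), which
_hash_getbucketbuf_from_hashkey() below also follows: start with
InvalidBuffer, let the function pin the metapage only if the cache has to be
filled or refreshed, and drop the pin afterwards if one was taken.  A
condensed sketch; the function name is hypothetical.

static void
use_cached_metap_sketch(Relation rel)
{
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;

    metap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(metap != NULL);

    /* ... consult metap->hashm_maxbucket and friends ... */

    /* drop the metapage pin only if _hash_getcachedmetap took one */
    if (BufferIsValid(metabuf))
        _hash_dropbuf(rel, metabuf);
}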
    1541             : 
    1542             : /*
    1543             :  *  _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
    1544             :  *                                       hashkey.
    1545             :  *
    1546             :  *  Bucket pages do not move or get removed once they are allocated.  This
    1547             :  *  gives us an opportunity to use the previously saved metapage contents to
    1548             :  *  reach the target bucket buffer, instead of reading from the metapage
    1549             :  *  every time.  This saves one buffer access per lookup, which is a very
    1550             :  *  helpful saving in bufmgr traffic and contention.
    1551             :  *
    1552             :  *  The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
    1553             :  *  bucket buffer has to be locked for reading or writing.
    1554             :  *
    1555             :  *  The out parameter cachedmetap is set to the metapage contents used for
    1556             :  *  the hashkey-to-bucket-buffer mapping.  Some callers need this info to
    1557             :  *  reach the old bucket in case of a bucket split; see _hash_doinsert().
    1558             :  */
    1559             : Buffer
    1560       80577 : _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
    1561             :                                 HashMetaPage *cachedmetap)
    1562             : {
    1563             :     HashMetaPage metap;
    1564             :     Buffer      buf;
    1565       80577 :     Buffer      metabuf = InvalidBuffer;
    1566             :     Page        page;
    1567             :     Bucket      bucket;
    1568             :     BlockNumber blkno;
    1569             :     HashPageOpaque opaque;
    1570             : 
    1571             :     /* We read from the target bucket buffer, so locking is a must. */
    1572       80577 :     Assert(access == HASH_READ || access == HASH_WRITE);
    1573             : 
    1574       80577 :     metap = _hash_getcachedmetap(rel, &metabuf, false);
    1575       80577 :     Assert(metap != NULL);
    1576             : 
    1577             :     /*
    1578             :      * Loop until we get a lock on the correct target bucket.
    1579             :      */
    1580             :     for (;;)
    1581             :     {
    1582             :         /*
    1583             :          * Compute the target bucket number, and convert to block number.
    1584             :          */
    1585       80642 :         bucket = _hash_hashkey2bucket(hashkey,
    1586             :                                       metap->hashm_maxbucket,
    1587             :                                       metap->hashm_highmask,
    1588             :                                       metap->hashm_lowmask);
    1589             : 
    1590       80642 :         blkno = BUCKET_TO_BLKNO(metap, bucket);
    1591             : 
    1592             :         /* Fetch the primary bucket page for the bucket */
    1593       80642 :         buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
    1594       80642 :         page = BufferGetPage(buf);
    1595       80642 :         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    1596       80642 :         Assert(opaque->hasho_bucket == bucket);
    1597       80642 :         Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
    1598             : 
    1599             :         /*
    1600             :          * If this bucket hasn't been split, we're done.
    1601             :          */
    1602       80642 :         if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
    1603       80577 :             break;
    1604             : 
    1605             :         /* Drop lock on this buffer, update cached metapage, and retry. */
    1606          65 :         _hash_relbuf(rel, buf);
    1607          65 :         metap = _hash_getcachedmetap(rel, &metabuf, true);
    1608          65 :         Assert(metap != NULL);
    1609          65 :     }
    1610             : 
    1611       80577 :     if (BufferIsValid(metabuf))
    1612          85 :         _hash_dropbuf(rel, metabuf);
    1613             : 
    1614       80577 :     if (cachedmetap)
    1615       80552 :         *cachedmetap = metap;
    1616             : 
    1617       80577 :     return buf;
    1618             : }
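
Putting it together, a caller that needs the bucket page for a key can rely
entirely on the cached-metapage fast path shown above.  A condensed usage
sketch; the variable names are hypothetical.

    HashMetaPage cachedmetap;
    Buffer      bucketbuf;

    /* returns the bucket's primary page pinned and locked per 'access' */
    bucketbuf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ,
                                                &cachedmetap);

    /* ... scan the bucket chain starting at bucketbuf ... */

    _hash_relbuf(rel, bucketbuf);   /* release lock and pin */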

Generated by: LCOV version 1.11