LCOV - PostgreSQL - src/backend/access/nbtree/nbtpage.c

LCOV - code coverage report

Current view:	top level - src/backend/access/nbtree - nbtpage.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL	Lines:	514	592	86.8 %
Date:	2017-09-29 15:12:54	Functions:	18	18	100.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtpage.c
       4             :  *    BTree-specific page management code for the Postgres btree access
       5             :  *    method.
       6             :  *
       7             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  *
      11             :  * IDENTIFICATION
      12             :  *    src/backend/access/nbtree/nbtpage.c
      13             :  *
      14             :  *  NOTES
      15             :  *     Postgres btree pages look like ordinary relation pages.  The opaque
      16             :  *     data at high addresses includes pointers to left and right siblings
      17             :  *     and flag data describing page state.  The first page in a btree, page
      18             :  *     zero, is special -- it stores meta-information describing the tree.
      19             :  *     Pages one and higher store the actual tree data.
      20             :  *
      21             :  *-------------------------------------------------------------------------
      22             :  */
      23             : #include "postgres.h"
      24             : 
      25             : #include "access/nbtree.h"
      26             : #include "access/nbtxlog.h"
      27             : #include "access/transam.h"
      28             : #include "access/xlog.h"
      29             : #include "access/xloginsert.h"
      30             : #include "miscadmin.h"
      31             : #include "storage/indexfsm.h"
      32             : #include "storage/lmgr.h"
      33             : #include "storage/predicate.h"
      34             : #include "utils/snapmgr.h"
      35             : 
      36             : static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
      37             : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
      38             :                          bool *rightsib_empty);
      39             : static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
      40             :                        BTStack stack, Buffer *topparent, OffsetNumber *topoff,
      41             :                        BlockNumber *target, BlockNumber *rightsib);
      42             : static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
      43             :                    TransactionId latestRemovedXid);
      44             : 
      45             : /*
      46             :  *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
      47             :  */
      48             : void
      49        1437 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
      50             : {
      51             :     BTMetaPageData *metad;
      52             :     BTPageOpaque metaopaque;
      53             : 
      54        1437 :     _bt_pageinit(page, BLCKSZ);
      55             : 
      56        1437 :     metad = BTPageGetMeta(page);
      57        1437 :     metad->btm_magic = BTREE_MAGIC;
      58        1437 :     metad->btm_version = BTREE_VERSION;
      59        1437 :     metad->btm_root = rootbknum;
      60        1437 :     metad->btm_level = level;
      61        1437 :     metad->btm_fastroot = rootbknum;
      62        1437 :     metad->btm_fastlevel = level;
      63             : 
      64        1437 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
      65        1437 :     metaopaque->btpo_flags = BTP_META;
      66             : 
      67             :     /*
      68             :      * Set pd_lower just past the end of the metadata.  This is not essential
      69             :      * but it makes the page look compressible to xlog.c.
      70             :      */
      71        1437 :     ((PageHeader) page)->pd_lower =
      72        1437 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
      73        1437 : }
      74             : 
      75             : /*
      76             :  *  _bt_getroot() -- Get the root page of the btree.
      77             :  *
      78             :  *      Since the root page can move around the btree file, we have to read
      79             :  *      its location from the metadata page, and then read the root page
      80             :  *      itself.  If no root page exists yet, we have to create one.  The
      81             :  *      standard class of race conditions exists here; I think I covered
      82             :  *      them all in the Hopi Indian rain dance of lock requests below.
      83             :  *
      84             :  *      The access type parameter (BT_READ or BT_WRITE) controls whether
      85             :  *      a new root page will be created or not.  If access = BT_READ,
      86             :  *      and no root page exists, we just return InvalidBuffer.  For
      87             :  *      BT_WRITE, we try to create the root page if it doesn't exist.
      88             :  *      NOTE that the returned root page will have only a read lock set
      89             :  *      on it even if access = BT_WRITE!
      90             :  *
      91             :  *      The returned page is not necessarily the true root --- it could be
      92             :  *      a "fast root" (a page that is alone in its level due to deletions).
      93             :  *      Also, if the root page is split while we are "in flight" to it,
      94             :  *      what we will return is the old root, which is now just the leftmost
      95             :  *      page on a probably-not-very-wide level.  For most purposes this is
      96             :  *      as good as or better than the true root, so we do not bother to
      97             :  *      insist on finding the true root.  We do, however, guarantee to
      98             :  *      return a live (not deleted or half-dead) page.
      99             :  *
     100             :  *      On successful return, the root page is pinned and read-locked.
     101             :  *      The metadata page is not locked or pinned on exit.
     102             :  */
     103             : Buffer
     104      595801 : _bt_getroot(Relation rel, int access)
     105             : {
     106             :     Buffer      metabuf;
     107             :     Page        metapg;
     108             :     BTPageOpaque metaopaque;
     109             :     Buffer      rootbuf;
     110             :     Page        rootpage;
     111             :     BTPageOpaque rootopaque;
     112             :     BlockNumber rootblkno;
     113             :     uint32      rootlevel;
     114             :     BTMetaPageData *metad;
     115             : 
     116             :     /*
     117             :      * Try to use previously-cached metapage data to find the root.  This
     118             :      * normally saves one buffer access per index search, which is a very
     119             :      * helpful savings in bufmgr traffic and hence contention.
     120             :      */
     121      595801 :     if (rel->rd_amcache != NULL)
     122             :     {
     123      572365 :         metad = (BTMetaPageData *) rel->rd_amcache;
     124             :         /* We shouldn't have cached it if any of these fail */
     125      572365 :         Assert(metad->btm_magic == BTREE_MAGIC);
     126      572365 :         Assert(metad->btm_version == BTREE_VERSION);
     127      572365 :         Assert(metad->btm_root != P_NONE);
     128             : 
     129      572365 :         rootblkno = metad->btm_fastroot;
     130      572365 :         Assert(rootblkno != P_NONE);
     131      572365 :         rootlevel = metad->btm_fastlevel;
     132             : 
     133      572365 :         rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
     134      572365 :         rootpage = BufferGetPage(rootbuf);
     135      572365 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     136             : 
     137             :         /*
     138             :          * Since the cache might be stale, we check the page more carefully
     139             :          * here than normal.  We *must* check that it's not deleted. If it's
     140             :          * not alone on its level, then we reject too --- this may be overly
     141             :          * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
     142             :          * because that's not set in a "fast root".
     143             :          */
     144     1144730 :         if (!P_IGNORE(rootopaque) &&
     145     1144730 :             rootopaque->btpo.level == rootlevel &&
     146     1144730 :             P_LEFTMOST(rootopaque) &&
     147      572365 :             P_RIGHTMOST(rootopaque))
     148             :         {
     149             :             /* OK, accept cached page as the root */
     150      572327 :             return rootbuf;
     151             :         }
     152          38 :         _bt_relbuf(rel, rootbuf);
     153             :         /* Cache is stale, throw it away */
     154          38 :         if (rel->rd_amcache)
     155          38 :             pfree(rel->rd_amcache);
     156          38 :         rel->rd_amcache = NULL;
     157             :     }
     158             : 
     159       23474 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     160       23474 :     metapg = BufferGetPage(metabuf);
     161       23474 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
     162       23474 :     metad = BTPageGetMeta(metapg);
     163             : 
     164             :     /* sanity-check the metapage */
     165       46948 :     if (!(metaopaque->btpo_flags & BTP_META) ||
     166       23474 :         metad->btm_magic != BTREE_MAGIC)
     167           0 :         ereport(ERROR,
     168             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     169             :                  errmsg("index \"%s\" is not a btree",
     170             :                         RelationGetRelationName(rel))));
     171             : 
     172       23474 :     if (metad->btm_version != BTREE_VERSION)
     173           0 :         ereport(ERROR,
     174             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     175             :                  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
     176             :                         RelationGetRelationName(rel),
     177             :                         metad->btm_version, BTREE_VERSION)));
     178             : 
     179             :     /* if no root page initialized yet, do it */
     180       23474 :     if (metad->btm_root == P_NONE)
     181             :     {
     182             :         /* If access = BT_READ, caller doesn't want us to create root yet */
     183       12152 :         if (access == BT_READ)
     184             :         {
     185       11853 :             _bt_relbuf(rel, metabuf);
     186       11853 :             return InvalidBuffer;
     187             :         }
     188             : 
     189             :         /* trade in our read lock for a write lock */
     190         299 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     191         299 :         LockBuffer(metabuf, BT_WRITE);
     192             : 
     193             :         /*
     194             :          * Race condition:  if someone else initialized the metadata between
     195             :          * the time we released the read lock and acquired the write lock, we
     196             :          * must avoid doing it again.
     197             :          */
     198         299 :         if (metad->btm_root != P_NONE)
     199             :         {
     200             :             /*
     201             :              * Metadata initialized by someone else.  In order to guarantee no
     202             :              * deadlocks, we have to release the metadata page and start all
     203             :              * over again.  (Is that really true? But it's hardly worth trying
     204             :              * to optimize this case.)
     205             :              */
     206           0 :             _bt_relbuf(rel, metabuf);
     207           0 :             return _bt_getroot(rel, access);
     208             :         }
     209             : 
     210             :         /*
     211             :          * Get, initialize, write, and leave a lock of the appropriate type on
     212             :          * the new root page.  Since this is the first page in the tree, it's
     213             :          * a leaf as well as the root.
     214             :          */
     215         299 :         rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
     216         299 :         rootblkno = BufferGetBlockNumber(rootbuf);
     217         299 :         rootpage = BufferGetPage(rootbuf);
     218         299 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     219         299 :         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
     220         299 :         rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
     221         299 :         rootopaque->btpo.level = 0;
     222         299 :         rootopaque->btpo_cycleid = 0;
     223             : 
     224             :         /* NO ELOG(ERROR) till meta is updated */
     225         299 :         START_CRIT_SECTION();
     226             : 
     227         299 :         metad->btm_root = rootblkno;
     228         299 :         metad->btm_level = 0;
     229         299 :         metad->btm_fastroot = rootblkno;
     230         299 :         metad->btm_fastlevel = 0;
     231             : 
     232         299 :         MarkBufferDirty(rootbuf);
     233         299 :         MarkBufferDirty(metabuf);
     234             : 
     235             :         /* XLOG stuff */
     236         299 :         if (RelationNeedsWAL(rel))
     237             :         {
     238             :             xl_btree_newroot xlrec;
     239             :             XLogRecPtr  recptr;
     240             :             xl_btree_metadata md;
     241             : 
     242         250 :             XLogBeginInsert();
     243         250 :             XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
     244         250 :             XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT);
     245             : 
     246         250 :             md.root = rootblkno;
     247         250 :             md.level = 0;
     248         250 :             md.fastroot = rootblkno;
     249         250 :             md.fastlevel = 0;
     250             : 
     251         250 :             XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
     252             : 
     253         250 :             xlrec.rootblk = rootblkno;
     254         250 :             xlrec.level = 0;
     255             : 
     256         250 :             XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
     257             : 
     258         250 :             recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
     259             : 
     260         250 :             PageSetLSN(rootpage, recptr);
     261         250 :             PageSetLSN(metapg, recptr);
     262             :         }
     263             : 
     264         299 :         END_CRIT_SECTION();
     265             : 
     266             :         /*
     267             :          * swap root write lock for read lock.  There is no danger of anyone
     268             :          * else accessing the new root page while it's unlocked, since no one
     269             :          * else knows where it is yet.
     270             :          */
     271         299 :         LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
     272         299 :         LockBuffer(rootbuf, BT_READ);
     273             : 
     274             :         /* okay, metadata is correct, release lock on it */
     275         299 :         _bt_relbuf(rel, metabuf);
     276             :     }
     277             :     else
     278             :     {
     279       11322 :         rootblkno = metad->btm_fastroot;
     280       11322 :         Assert(rootblkno != P_NONE);
     281       11322 :         rootlevel = metad->btm_fastlevel;
     282             : 
     283             :         /*
     284             :          * Cache the metapage data for next time
     285             :          */
     286       11322 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     287             :                                              sizeof(BTMetaPageData));
     288       11322 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     289             : 
     290             :         /*
     291             :          * We are done with the metapage; arrange to release it via first
     292             :          * _bt_relandgetbuf call
     293             :          */
     294       11322 :         rootbuf = metabuf;
     295             : 
     296             :         for (;;)
     297             :         {
     298       11322 :             rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     299       11322 :             rootpage = BufferGetPage(rootbuf);
     300       11322 :             rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     301             : 
     302       11322 :             if (!P_IGNORE(rootopaque))
     303       11322 :                 break;
     304             : 
     305             :             /* it's dead, Jim.  step right one page */
     306           0 :             if (P_RIGHTMOST(rootopaque))
     307           0 :                 elog(ERROR, "no live root page found in index \"%s\"",
     308             :                      RelationGetRelationName(rel));
     309           0 :             rootblkno = rootopaque->btpo_next;
     310           0 :         }
     311             : 
     312             :         /* Note: can't check btpo.level on deleted pages */
     313       11322 :         if (rootopaque->btpo.level != rootlevel)
     314           0 :             elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     315             :                  rootblkno, RelationGetRelationName(rel),
     316             :                  rootopaque->btpo.level, rootlevel);
     317             :     }
     318             : 
     319             :     /*
     320             :      * By here, we have a pin and read lock on the root page, and no lock set
     321             :      * on the metadata page.  Return the root page's buffer.
     322             :      */
     323       11621 :     return rootbuf;
     324             : }
     325             : 
     326             : /*
     327             :  *  _bt_gettrueroot() -- Get the true root page of the btree.
     328             :  *
     329             :  *      This is the same as the BT_READ case of _bt_getroot(), except
     330             :  *      we follow the true-root link not the fast-root link.
     331             :  *
     332             :  * By the time we acquire lock on the root page, it might have been split and
     333             :  * not be the true root anymore.  This is okay for the present uses of this
     334             :  * routine; we only really need to be able to move up at least one tree level
     335             :  * from whatever non-root page we were at.  If we ever do need to lock the
     336             :  * one true root page, we could loop here, re-reading the metapage on each
     337             :  * failure.  (Note that it wouldn't do to hold the lock on the metapage while
     338             :  * moving to the root --- that'd deadlock against any concurrent root split.)
     339             :  */
     340             : Buffer
     341           1 : _bt_gettrueroot(Relation rel)
     342             : {
     343             :     Buffer      metabuf;
     344             :     Page        metapg;
     345             :     BTPageOpaque metaopaque;
     346             :     Buffer      rootbuf;
     347             :     Page        rootpage;
     348             :     BTPageOpaque rootopaque;
     349             :     BlockNumber rootblkno;
     350             :     uint32      rootlevel;
     351             :     BTMetaPageData *metad;
     352             : 
     353             :     /*
     354             :      * We don't try to use cached metapage data here, since (a) this path is
     355             :      * not performance-critical, and (b) if we are here it suggests our cache
     356             :      * is out-of-date anyway.  In light of point (b), it's probably safest to
     357             :      * actively flush any cached metapage info.
     358             :      */
     359           1 :     if (rel->rd_amcache)
     360           1 :         pfree(rel->rd_amcache);
     361           1 :     rel->rd_amcache = NULL;
     362             : 
     363           1 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     364           1 :     metapg = BufferGetPage(metabuf);
     365           1 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
     366           1 :     metad = BTPageGetMeta(metapg);
     367             : 
     368           2 :     if (!(metaopaque->btpo_flags & BTP_META) ||
     369           1 :         metad->btm_magic != BTREE_MAGIC)
     370           0 :         ereport(ERROR,
     371             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     372             :                  errmsg("index \"%s\" is not a btree",
     373             :                         RelationGetRelationName(rel))));
     374             : 
     375           1 :     if (metad->btm_version != BTREE_VERSION)
     376           0 :         ereport(ERROR,
     377             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     378             :                  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
     379             :                         RelationGetRelationName(rel),
     380             :                         metad->btm_version, BTREE_VERSION)));
     381             : 
     382             :     /* if no root page initialized yet, fail */
     383           1 :     if (metad->btm_root == P_NONE)
     384             :     {
     385           0 :         _bt_relbuf(rel, metabuf);
     386           0 :         return InvalidBuffer;
     387             :     }
     388             : 
     389           1 :     rootblkno = metad->btm_root;
     390           1 :     rootlevel = metad->btm_level;
     391             : 
     392             :     /*
     393             :      * We are done with the metapage; arrange to release it via first
     394             :      * _bt_relandgetbuf call
     395             :      */
     396           1 :     rootbuf = metabuf;
     397             : 
     398             :     for (;;)
     399             :     {
     400           1 :         rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     401           1 :         rootpage = BufferGetPage(rootbuf);
     402           1 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     403             : 
     404           1 :         if (!P_IGNORE(rootopaque))
     405           1 :             break;
     406             : 
     407             :         /* it's dead, Jim.  step right one page */
     408           0 :         if (P_RIGHTMOST(rootopaque))
     409           0 :             elog(ERROR, "no live root page found in index \"%s\"",
     410             :                  RelationGetRelationName(rel));
     411           0 :         rootblkno = rootopaque->btpo_next;
     412           0 :     }
     413             : 
     414             :     /* Note: can't check btpo.level on deleted pages */
     415           1 :     if (rootopaque->btpo.level != rootlevel)
     416           0 :         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     417             :              rootblkno, RelationGetRelationName(rel),
     418             :              rootopaque->btpo.level, rootlevel);
     419             : 
     420           1 :     return rootbuf;
     421             : }
     422             : 
     423             : /*
     424             :  *  _bt_getrootheight() -- Get the height of the btree search tree.
     425             :  *
     426             :  *      We return the level (counting from zero) of the current fast root.
     427             :  *      This represents the number of tree levels we'd have to descend through
     428             :  *      to start any btree index search.
     429             :  *
     430             :  *      This is used by the planner for cost-estimation purposes.  Since it's
     431             :  *      only an estimate, slightly-stale data is fine, hence we don't worry
     432             :  *      about updating previously cached data.
     433             :  */
     434             : int
     435       21188 : _bt_getrootheight(Relation rel)
     436             : {
     437             :     BTMetaPageData *metad;
     438             : 
     439             :     /*
     440             :      * We can get what we need from the cached metapage data.  If it's not
     441             :      * cached yet, load it.  Sanity checks here must match _bt_getroot().
     442             :      */
     443       21188 :     if (rel->rd_amcache == NULL)
     444             :     {
     445             :         Buffer      metabuf;
     446             :         Page        metapg;
     447             :         BTPageOpaque metaopaque;
     448             : 
     449        2052 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     450        2052 :         metapg = BufferGetPage(metabuf);
     451        2052 :         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
     452        2052 :         metad = BTPageGetMeta(metapg);
     453             : 
     454             :         /* sanity-check the metapage */
     455        4104 :         if (!(metaopaque->btpo_flags & BTP_META) ||
     456        2052 :             metad->btm_magic != BTREE_MAGIC)
     457           0 :             ereport(ERROR,
     458             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
     459             :                      errmsg("index \"%s\" is not a btree",
     460             :                             RelationGetRelationName(rel))));
     461             : 
     462        2052 :         if (metad->btm_version != BTREE_VERSION)
     463           0 :             ereport(ERROR,
     464             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
     465             :                      errmsg("version mismatch in index \"%s\": file version %d, code version %d",
     466             :                             RelationGetRelationName(rel),
     467             :                             metad->btm_version, BTREE_VERSION)));
     468             : 
     469             :         /*
     470             :          * If there's no root page yet, _bt_getroot() doesn't expect a cache
     471             :          * to be made, so just stop here and report the index height is zero.
     472             :          * (XXX perhaps _bt_getroot() should be changed to allow this case.)
     473             :          */
     474        2052 :         if (metad->btm_root == P_NONE)
     475             :         {
     476        1301 :             _bt_relbuf(rel, metabuf);
     477        1301 :             return 0;
     478             :         }
     479             : 
     480             :         /*
     481             :          * Cache the metapage data for next time
     482             :          */
     483         751 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     484             :                                              sizeof(BTMetaPageData));
     485         751 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     486             : 
     487         751 :         _bt_relbuf(rel, metabuf);
     488             :     }
     489             : 
     490       19887 :     metad = (BTMetaPageData *) rel->rd_amcache;
     491             :     /* We shouldn't have cached it if any of these fail */
     492       19887 :     Assert(metad->btm_magic == BTREE_MAGIC);
     493       19887 :     Assert(metad->btm_version == BTREE_VERSION);
     494       19887 :     Assert(metad->btm_fastroot != P_NONE);
     495             : 
     496       19887 :     return metad->btm_fastlevel;
     497             : }
     498             : 
     499             : /*
     500             :  *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
     501             :  */
     502             : void
     503     1098161 : _bt_checkpage(Relation rel, Buffer buf)
     504             : {
     505     1098161 :     Page        page = BufferGetPage(buf);
     506             : 
     507             :     /*
     508             :      * ReadBuffer verifies that every newly-read page passes
     509             :      * PageHeaderIsValid, which means it either contains a reasonably sane
     510             :      * page header or is all-zero.  We have to defend against the all-zero
     511             :      * case, however.
     512             :      */
     513     1098161 :     if (PageIsNew(page))
     514           0 :         ereport(ERROR,
     515             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     516             :                  errmsg("index \"%s\" contains unexpected zero page at block %u",
     517             :                         RelationGetRelationName(rel),
     518             :                         BufferGetBlockNumber(buf)),
     519             :                  errhint("Please REINDEX it.")));
     520             : 
     521             :     /*
     522             :      * Additionally check that the special area looks sane.
     523             :      */
     524     1098161 :     if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
     525           0 :         ereport(ERROR,
     526             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     527             :                  errmsg("index \"%s\" contains corrupted page at block %u",
     528             :                         RelationGetRelationName(rel),
     529             :                         BufferGetBlockNumber(buf)),
     530             :                  errhint("Please REINDEX it.")));
     531     1098161 : }
     532             : 
     533             : /*
     534             :  * Log the reuse of a page from the FSM.
     535             :  */
     536             : static void
     537          34 : _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
     538             : {
     539             :     xl_btree_reuse_page xlrec_reuse;
     540             : 
     541             :     /*
     542             :      * Note that we don't register the buffer with the record, because this
     543             :      * operation doesn't modify the page. This record only exists to provide a
     544             :      * conflict point for Hot Standby.
     545             :      */
     546             : 
     547             :     /* XLOG stuff */
     548          34 :     xlrec_reuse.node = rel->rd_node;
     549          34 :     xlrec_reuse.block = blkno;
     550          34 :     xlrec_reuse.latestRemovedXid = latestRemovedXid;
     551             : 
     552          34 :     XLogBeginInsert();
     553          34 :     XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
     554             : 
     555          34 :     XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
     556          34 : }
     557             : 
     558             : /*
     559             :  *  _bt_getbuf() -- Get a buffer by block number for read or write.
     560             :  *
     561             :  *      blkno == P_NEW means to get an unallocated index page.  The page
     562             :  *      will be initialized before returning it.
     563             :  *
     564             :  *      When this routine returns, the appropriate lock is set on the
     565             :  *      requested buffer and its reference count has been incremented
     566             :  *      (ie, the buffer is "locked and pinned").  Also, we apply
     567             :  *      _bt_checkpage to sanity-check the page (except in P_NEW case).
     568             :  */
     569             : Buffer
     570      606164 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
     571             : {
     572             :     Buffer      buf;
     573             : 
     574      606164 :     if (blkno != P_NEW)
     575             :     {
     576             :         /* Read an existing block of the relation */
     577      604967 :         buf = ReadBuffer(rel, blkno);
     578      604967 :         LockBuffer(buf, access);
     579      604967 :         _bt_checkpage(rel, buf);
     580             :     }
     581             :     else
     582             :     {
     583             :         bool        needLock;
     584             :         Page        page;
     585             : 
     586        1197 :         Assert(access == BT_WRITE);
     587             : 
     588             :         /*
     589             :          * First see if the FSM knows of any free pages.
     590             :          *
     591             :          * We can't trust the FSM's report unreservedly; we have to check that
     592             :          * the page is still free.  (For example, an already-free page could
     593             :          * have been re-used between the time the last VACUUM scanned it and
     594             :          * the time the VACUUM made its FSM updates.)
     595             :          *
     596             :          * In fact, it's worse than that: we can't even assume that it's safe
     597             :          * to take a lock on the reported page.  If somebody else has a lock
     598             :          * on it, or even worse our own caller does, we could deadlock.  (The
     599             :          * own-caller scenario is actually not improbable. Consider an index
     600             :          * on a serial or timestamp column.  Nearly all splits will be at the
     601             :          * rightmost page, so it's entirely likely that _bt_split will call us
     602             :          * while holding a lock on the page most recently acquired from FSM. A
     603             :          * VACUUM running concurrently with the previous split could well have
     604             :          * placed that page back in FSM.)
     605             :          *
     606             :          * To get around that, we ask for only a conditional lock on the
     607             :          * reported page.  If we fail, then someone else is using the page,
     608             :          * and we may reasonably assume it's not free.  (If we happen to be
     609             :          * wrong, the worst consequence is the page will be lost to use till
     610             :          * the next VACUUM, which is no big problem.)
     611             :          */
     612             :         for (;;)
     613             :         {
     614        1197 :             blkno = GetFreeIndexPage(rel);
     615        1197 :             if (blkno == InvalidBlockNumber)
     616        1163 :                 break;
     617          34 :             buf = ReadBuffer(rel, blkno);
     618          34 :             if (ConditionalLockBuffer(buf))
     619             :             {
     620          34 :                 page = BufferGetPage(buf);
     621          34 :                 if (_bt_page_recyclable(page))
     622             :                 {
     623             :                     /*
     624             :                      * If we are generating WAL for Hot Standby then create a
     625             :                      * WAL record that will allow us to conflict with queries
     626             :                      * running on standby.
     627             :                      */
     628          34 :                     if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
     629             :                     {
     630          34 :                         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     631             : 
     632          34 :                         _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
     633             :                     }
     634             : 
     635             :                     /* Okay to use page.  Re-initialize and return it */
     636          34 :                     _bt_pageinit(page, BufferGetPageSize(buf));
     637          34 :                     return buf;
     638             :                 }
     639           0 :                 elog(DEBUG2, "FSM returned nonrecyclable page");
     640           0 :                 _bt_relbuf(rel, buf);
     641             :             }
     642             :             else
     643             :             {
     644           0 :                 elog(DEBUG2, "FSM returned nonlockable page");
     645             :                 /* couldn't get lock, so just drop pin */
     646           0 :                 ReleaseBuffer(buf);
     647             :             }
     648           0 :         }
     649             : 
     650             :         /*
     651             :          * Extend the relation by one page.
     652             :          *
     653             :          * We have to use a lock to ensure no one else is extending the rel at
     654             :          * the same time, else we will both try to initialize the same new
     655             :          * page.  We can skip locking for new or temp relations, however,
     656             :          * since no one else could be accessing them.
     657             :          */
     658        1163 :         needLock = !RELATION_IS_LOCAL(rel);
     659             : 
     660        1163 :         if (needLock)
     661        1099 :             LockRelationForExtension(rel, ExclusiveLock);
     662             : 
     663        1163 :         buf = ReadBuffer(rel, P_NEW);
     664             : 
     665             :         /* Acquire buffer lock on new page */
     666        1163 :         LockBuffer(buf, BT_WRITE);
     667             : 
     668             :         /*
     669             :          * Release the file-extension lock; it's now OK for someone else to
     670             :          * extend the relation some more.  Note that we cannot release this
     671             :          * lock before we have buffer lock on the new page, or we risk a race
     672             :          * condition against btvacuumscan --- see comments therein.
     673             :          */
     674        1163 :         if (needLock)
     675        1099 :             UnlockRelationForExtension(rel, ExclusiveLock);
     676             : 
     677             :         /* Initialize the new page before returning it */
     678        1163 :         page = BufferGetPage(buf);
     679        1163 :         Assert(PageIsNew(page));
     680        1163 :         _bt_pageinit(page, BufferGetPageSize(buf));
     681             :     }
     682             : 
     683             :     /* ref count and lock type are correct */
     684      606130 :     return buf;
     685             : }
     686             : 
     687             : /*
     688             :  *  _bt_relandgetbuf() -- release a locked buffer and get another one.
     689             :  *
     690             :  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
     691             :  * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
     692             :  * then it reduces to just _bt_getbuf; allowing this case simplifies some
     693             :  * callers.
     694             :  *
     695             :  * The original motivation for using this was to avoid two entries to the
     696             :  * bufmgr when one would do.  However, now it's mainly just a notational
     697             :  * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
     698             :  * is when the target page is the same one already in the buffer.
     699             :  */
     700             : Buffer
     701      491101 : _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
     702             : {
     703             :     Buffer      buf;
     704             : 
     705      491101 :     Assert(blkno != P_NEW);
     706      491101 :     if (BufferIsValid(obuf))
     707      488724 :         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
     708      491101 :     buf = ReleaseAndReadBuffer(obuf, rel, blkno);
     709      491101 :     LockBuffer(buf, access);
     710      491101 :     _bt_checkpage(rel, buf);
     711      491101 :     return buf;
     712             : }
     713             : 
     714             : /*
     715             :  *  _bt_relbuf() -- release a locked buffer.
     716             :  *
     717             :  * Lock and pin (refcount) are both dropped.
     718             :  */
     719             : void
     720      227939 : _bt_relbuf(Relation rel, Buffer buf)
     721             : {
     722      227939 :     UnlockReleaseBuffer(buf);
     723      227939 : }
     724             : 
     725             : /*
     726             :  *  _bt_pageinit() -- Initialize a new page.
     727             :  *
     728             :  * On return, the page header is initialized; data space is empty;
     729             :  * special space is zeroed out.
     730             :  */
     731             : void
     732        5025 : _bt_pageinit(Page page, Size size)
     733             : {
     734        5025 :     PageInit(page, size, sizeof(BTPageOpaqueData));
     735        5025 : }
     736             : 
     737             : /*
     738             :  *  _bt_page_recyclable() -- Is an existing page recyclable?
     739             :  *
     740             :  * This exists to make sure _bt_getbuf and btvacuumscan have the same
     741             :  * policy about whether a page is safe to re-use.
     742             :  */
     743             : bool
     744        1986 : _bt_page_recyclable(Page page)
     745             : {
     746             :     BTPageOpaque opaque;
     747             : 
     748             :     /*
     749             :      * It's possible to find an all-zeroes page in an index --- for example, a
     750             :      * backend might successfully extend the relation one page and then crash
     751             :      * before it is able to make a WAL entry for adding the page. If we find a
     752             :      * zeroed page then reclaim it.
     753             :      */
     754        1986 :     if (PageIsNew(page))
     755           0 :         return true;
     756             : 
     757             :     /*
     758             :      * Otherwise, recycle if deleted and too old to have any processes
     759             :      * interested in it.
     760             :      */
     761        1986 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     762        2231 :     if (P_ISDELETED(opaque) &&
     763         245 :         TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
     764         206 :         return true;
     765        1780 :     return false;
     766             : }
     767             : 
     768             : /*
     769             :  * Delete item(s) from a btree page during VACUUM.
     770             :  *
     771             :  * This must only be used for deleting leaf items.  Deleting an item on a
     772             :  * non-leaf page has to be done as part of an atomic action that includes
     773             :  * deleting the page it points to.
     774             :  *
     775             :  * This routine assumes that the caller has pinned and locked the buffer.
     776             :  * Also, the given itemnos *must* appear in increasing order in the array.
     777             :  *
     778             :  * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby
     779             :  * we need to be able to pin all of the blocks in the btree in physical
     780             :  * order when replaying the effects of a VACUUM, just as we do for the
     781             :  * original VACUUM itself. lastBlockVacuumed allows us to tell whether an
     782             :  * intermediate range of blocks has had no changes at all by VACUUM,
     783             :  * and so must be scanned anyway during replay. We always write a WAL record
     784             :  * for the last block in the index, whether or not it contained any items
     785             :  * to be removed. This allows us to scan right up to end of index to
     786             :  * ensure correct locking.
     787             :  */
     788             : void
     789         625 : _bt_delitems_vacuum(Relation rel, Buffer buf,
     790             :                     OffsetNumber *itemnos, int nitems,
     791             :                     BlockNumber lastBlockVacuumed)
     792             : {
     793         625 :     Page        page = BufferGetPage(buf);
     794             :     BTPageOpaque opaque;
     795             : 
     796             :     /* No ereport(ERROR) until changes are logged */
     797         625 :     START_CRIT_SECTION();
     798             : 
     799             :     /* Fix the page */
     800         625 :     if (nitems > 0)
     801         484 :         PageIndexMultiDelete(page, itemnos, nitems);
     802             : 
     803             :     /*
     804             :      * We can clear the vacuum cycle ID since this page has certainly been
     805             :      * processed by the current vacuum scan.
     806             :      */
     807         625 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     808         625 :     opaque->btpo_cycleid = 0;
     809             : 
     810             :     /*
     811             :      * Mark the page as not containing any LP_DEAD items.  This is not
     812             :      * certainly true (there might be some that have recently been marked, but
     813             :      * weren't included in our target-item list), but it will almost always be
     814             :      * true and it doesn't seem worth an additional page scan to check it.
     815             :      * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     816             :      */
     817         625 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
     818             : 
     819         625 :     MarkBufferDirty(buf);
     820             : 
     821             :     /* XLOG stuff */
     822         625 :     if (RelationNeedsWAL(rel))
     823             :     {
     824             :         XLogRecPtr  recptr;
     825             :         xl_btree_vacuum xlrec_vacuum;
     826             : 
     827         625 :         xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
     828             : 
     829         625 :         XLogBeginInsert();
     830         625 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
     831         625 :         XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
     832             : 
     833             :         /*
     834             :          * The target-offsets array is not in the buffer, but pretend that it
     835             :          * is.  When XLogInsert stores the whole buffer, the offsets array
     836             :          * need not be stored too.
     837             :          */
     838         625 :         if (nitems > 0)
     839         484 :             XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
     840             : 
     841         625 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
     842             : 
     843         625 :         PageSetLSN(page, recptr);
     844             :     }
     845             : 
     846         625 :     END_CRIT_SECTION();
     847         625 : }
     848             : 
     849             : /*
     850             :  * Delete item(s) from a btree page during single-page cleanup.
     851             :  *
     852             :  * As above, must only be used on leaf pages.
     853             :  *
     854             :  * This routine assumes that the caller has pinned and locked the buffer.
     855             :  * Also, the given itemnos *must* appear in increasing order in the array.
     856             :  *
     857             :  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
     858             :  * the page, but the WAL logging considerations are quite different.  See
     859             :  * comments for _bt_delitems_vacuum.
     860             :  */
     861             : void
     862         257 : _bt_delitems_delete(Relation rel, Buffer buf,
     863             :                     OffsetNumber *itemnos, int nitems,
     864             :                     Relation heapRel)
     865             : {
     866         257 :     Page        page = BufferGetPage(buf);
     867             :     BTPageOpaque opaque;
     868             : 
     869             :     /* Shouldn't be called unless there's something to do */
     870         257 :     Assert(nitems > 0);
     871             : 
     872             :     /* No ereport(ERROR) until changes are logged */
     873         257 :     START_CRIT_SECTION();
     874             : 
     875             :     /* Fix the page */
     876         257 :     PageIndexMultiDelete(page, itemnos, nitems);
     877             : 
     878             :     /*
     879             :      * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     880             :      * because this is not called by VACUUM.
     881             :      */
     882             : 
     883             :     /*
     884             :      * Mark the page as not containing any LP_DEAD items.  This is not
     885             :      * certainly true (there might be some that have recently been marked, but
     886             :      * weren't included in our target-item list), but it will almost always be
     887             :      * true and it doesn't seem worth an additional page scan to check it.
     888             :      * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     889             :      */
     890         257 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     891         257 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
     892             : 
     893         257 :     MarkBufferDirty(buf);
     894             : 
     895             :     /* XLOG stuff */
     896         257 :     if (RelationNeedsWAL(rel))
     897             :     {
     898             :         XLogRecPtr  recptr;
     899             :         xl_btree_delete xlrec_delete;
     900             : 
     901         257 :         xlrec_delete.hnode = heapRel->rd_node;
     902         257 :         xlrec_delete.nitems = nitems;
     903             : 
     904         257 :         XLogBeginInsert();
     905         257 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
     906         257 :         XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
     907             : 
     908             :         /*
     909             :          * We need the target-offsets array whether or not we store the whole
     910             :          * buffer, to allow us to find the latestRemovedXid on a standby
     911             :          * server.
     912             :          */
     913         257 :         XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
     914             : 
     915         257 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
     916             : 
     917         257 :         PageSetLSN(page, recptr);
     918             :     }
     919             : 
     920         257 :     END_CRIT_SECTION();
     921         257 : }
     922             : 
     923             : /*
     924             :  * Returns true, if the given block has the half-dead flag set.
     925             :  */
     926             : static bool
     927         177 : _bt_is_page_halfdead(Relation rel, BlockNumber blk)
     928             : {
     929             :     Buffer      buf;
     930             :     Page        page;
     931             :     BTPageOpaque opaque;
     932             :     bool        result;
     933             : 
     934         177 :     buf = _bt_getbuf(rel, blk, BT_READ);
     935         177 :     page = BufferGetPage(buf);
     936         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     937             : 
     938         177 :     result = P_ISHALFDEAD(opaque);
     939         177 :     _bt_relbuf(rel, buf);
     940             : 
     941         177 :     return result;
     942             : }
     943             : 
     944             : /*
     945             :  * Subroutine to find the parent of the branch we're deleting.  This climbs
     946             :  * up the tree until it finds a page with more than one child, i.e. a page
     947             :  * that will not be totally emptied by the deletion.  The chain of pages below
     948             :  * it, with one downlink each, will form the branch that we need to delete.
     949             :  *
     950             :  * If we cannot remove the downlink from the parent, because it's the
     951             :  * rightmost entry, returns false.  On success, *topparent and *topoff are set
     952             :  * to the buffer holding the parent, and the offset of the downlink in it.
     953             :  * *topparent is write-locked, the caller is responsible for releasing it when
     954             :  * done.  *target is set to the topmost page in the branch to-be-deleted, i.e.
     955             :  * the page whose downlink *topparent / *topoff point to, and *rightsib to its
     956             :  * right sibling.
     957             :  *
     958             :  * "child" is the leaf page we wish to delete, and "stack" is a search stack
     959             :  * leading to it (approximately).  Note that we will update the stack
     960             :  * entry(s) to reflect current downlink positions --- this is harmless and
     961             :  * indeed saves later search effort in _bt_pagedel.  The caller should
     962             :  * initialize *target and *rightsib to the leaf page and its right sibling.
     963             :  *
     964             :  * Note: it's OK to release page locks on any internal pages between the leaf
     965             :  * and *topparent, because a safe deletion can't become unsafe due to
     966             :  * concurrent activity.  An internal page can only acquire an entry if the
     967             :  * child is split, but that cannot happen as long as we hold a lock on the
     968             :  * leaf.
     969             :  */
     970             : static bool
     971         177 : _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
     972             :                        Buffer *topparent, OffsetNumber *topoff,
     973             :                        BlockNumber *target, BlockNumber *rightsib)
     974             : {
     975             :     BlockNumber parent;
     976             :     OffsetNumber poffset,
     977             :                 maxoff;
     978             :     Buffer      pbuf;
     979             :     Page        page;
     980             :     BTPageOpaque opaque;
     981             :     BlockNumber leftsib;
     982             : 
     983             :     /*
     984             :      * Locate the downlink of "child" in the parent (updating the stack entry
     985             :      * if needed)
     986             :      */
     987         177 :     ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
     988         177 :     pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
     989         177 :     if (pbuf == InvalidBuffer)
     990           0 :         elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
     991             :              RelationGetRelationName(rel), child);
     992         177 :     parent = stack->bts_blkno;
     993         177 :     poffset = stack->bts_offset;
     994             : 
     995         177 :     page = BufferGetPage(pbuf);
     996         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     997         177 :     maxoff = PageGetMaxOffsetNumber(page);
     998             : 
     999             :     /*
    1000             :      * If the target is the rightmost child of its parent, then we can't
    1001             :      * delete, unless it's also the only child.
    1002             :      */
    1003         177 :     if (poffset >= maxoff)
    1004             :     {
    1005             :         /* It's rightmost child... */
    1006          42 :         if (poffset == P_FIRSTDATAKEY(opaque))
    1007             :         {
    1008             :             /*
    1009             :              * It's only child, so safe if parent would itself be removable.
    1010             :              * We have to check the parent itself, and then recurse to test
    1011             :              * the conditions at the parent's parent.
    1012             :              */
    1013          84 :             if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
    1014          42 :                 P_INCOMPLETE_SPLIT(opaque))
    1015             :             {
    1016           0 :                 _bt_relbuf(rel, pbuf);
    1017           0 :                 return false;
    1018             :             }
    1019             : 
    1020          42 :             *target = parent;
    1021          42 :             *rightsib = opaque->btpo_next;
    1022          42 :             leftsib = opaque->btpo_prev;
    1023             : 
    1024          42 :             _bt_relbuf(rel, pbuf);
    1025             : 
    1026             :             /*
    1027             :              * Like in _bt_pagedel, check that the left sibling is not marked
    1028             :              * with INCOMPLETE_SPLIT flag.  That would mean that there is no
    1029             :              * downlink to the page to be deleted, and the page deletion
    1030             :              * algorithm isn't prepared to handle that.
    1031             :              */
    1032          42 :             if (leftsib != P_NONE)
    1033             :             {
    1034             :                 Buffer      lbuf;
    1035             :                 Page        lpage;
    1036             :                 BTPageOpaque lopaque;
    1037             : 
    1038           0 :                 lbuf = _bt_getbuf(rel, leftsib, BT_READ);
    1039           0 :                 lpage = BufferGetPage(lbuf);
    1040           0 :                 lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
    1041             : 
    1042             :                 /*
    1043             :                  * If the left sibling was concurrently split, so that its
    1044             :                  * next-pointer doesn't point to the current page anymore, the
    1045             :                  * split that created the current page must be completed. (We
    1046             :                  * don't allow splitting an incompletely split page again
    1047             :                  * until the previous split has been completed)
    1048             :                  */
    1049           0 :                 if (lopaque->btpo_next == parent &&
    1050           0 :                     P_INCOMPLETE_SPLIT(lopaque))
    1051             :                 {
    1052           0 :                     _bt_relbuf(rel, lbuf);
    1053           0 :                     return false;
    1054             :                 }
    1055           0 :                 _bt_relbuf(rel, lbuf);
    1056             :             }
    1057             : 
    1058             :             /*
    1059             :              * Perform the same check on this internal level that
    1060             :              * _bt_mark_page_halfdead performed on the leaf level.
    1061             :              */
    1062          42 :             if (_bt_is_page_halfdead(rel, *rightsib))
    1063             :             {
    1064           0 :                 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
    1065             :                      parent, *rightsib);
    1066           0 :                 return false;
    1067             :             }
    1068             : 
    1069          42 :             return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
    1070             :                                           topparent, topoff, target, rightsib);
    1071             :         }
    1072             :         else
    1073             :         {
    1074             :             /* Unsafe to delete */
    1075           0 :             _bt_relbuf(rel, pbuf);
    1076           0 :             return false;
    1077             :         }
    1078             :     }
    1079             :     else
    1080             :     {
    1081             :         /* Not rightmost child, so safe to delete */
    1082         135 :         *topparent = pbuf;
    1083         135 :         *topoff = poffset;
    1084         135 :         return true;
    1085             :     }
    1086             : }
    1087             : 
    1088             : /*
    1089             :  * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
    1090             :  *
    1091             :  * This action unlinks the page from the b-tree structure, removing all
    1092             :  * pointers leading to it --- but not touching its own left and right links.
    1093             :  * The page cannot be physically reclaimed right away, since other processes
    1094             :  * may currently be trying to follow links leading to the page; they have to
    1095             :  * be allowed to use its right-link to recover.  See nbtree/README.
    1096             :  *
    1097             :  * On entry, the target buffer must be pinned and locked (either read or write
    1098             :  * lock is OK).  This lock and pin will be dropped before exiting.
    1099             :  *
    1100             :  * Returns the number of pages successfully deleted (zero if page cannot
    1101             :  * be deleted now; could be more than one if parent or sibling pages were
    1102             :  * deleted too).
    1103             :  *
    1104             :  * NOTE: this leaks memory.  Rather than trying to clean up everything
    1105             :  * carefully, it's better to run it in a temp context that can be reset
    1106             :  * frequently.
    1107             :  */
    1108             : int
    1109         144 : _bt_pagedel(Relation rel, Buffer buf)
    1110             : {
    1111         144 :     int         ndeleted = 0;
    1112             :     BlockNumber rightsib;
    1113             :     bool        rightsib_empty;
    1114             :     Page        page;
    1115             :     BTPageOpaque opaque;
    1116             : 
    1117             :     /*
    1118             :      * "stack" is a search stack leading (approximately) to the target page.
    1119             :      * It is initially NULL, but when iterating, we keep it to avoid
    1120             :      * duplicated search effort.
    1121             :      *
    1122             :      * Also, when "stack" is not NULL, we have already checked that the
    1123             :      * current page is not the right half of an incomplete split, i.e. the
    1124             :      * left sibling does not have its INCOMPLETE_SPLIT flag set.
    1125             :      */
    1126         144 :     BTStack     stack = NULL;
    1127             : 
    1128             :     for (;;)
    1129             :     {
    1130         279 :         page = BufferGetPage(buf);
    1131         279 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1132             : 
    1133             :         /*
    1134             :          * Internal pages are never deleted directly, only as part of deleting
    1135             :          * the whole branch all the way down to leaf level.
    1136             :          */
    1137         279 :         if (!P_ISLEAF(opaque))
    1138             :         {
    1139             :             /*
    1140             :              * Pre-9.4 page deletion only marked internal pages as half-dead,
    1141             :              * but now we only use that flag on leaf pages. The old algorithm
    1142             :              * was never supposed to leave half-dead pages in the tree, it was
    1143             :              * just a transient state, but it was nevertheless possible in
    1144             :              * error scenarios. We don't know how to deal with them here. They
    1145             :              * are harmless as far as searches are considered, but inserts
    1146             :              * into the deleted keyspace could add out-of-order downlinks in
    1147             :              * the upper levels. Log a notice, hopefully the admin will notice
    1148             :              * and reindex.
    1149             :              */
    1150           0 :             if (P_ISHALFDEAD(opaque))
    1151           0 :                 ereport(LOG,
    1152             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1153             :                          errmsg("index \"%s\" contains a half-dead internal page",
    1154             :                                 RelationGetRelationName(rel)),
    1155             :                          errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
    1156           0 :             _bt_relbuf(rel, buf);
    1157           0 :             return ndeleted;
    1158             :         }
    1159             : 
    1160             :         /*
    1161             :          * We can never delete rightmost pages nor root pages.  While at it,
    1162             :          * check that page is not already deleted and is empty.
    1163             :          *
    1164             :          * To keep the algorithm simple, we also never delete an incompletely
    1165             :          * split page (they should be rare enough that this doesn't make any
    1166             :          * meaningful difference to disk usage):
    1167             :          *
    1168             :          * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
    1169             :          * left half of an incomplete split, but ensuring that it's not the
    1170             :          * right half is more complicated.  For that, we have to check that
    1171             :          * the left sibling doesn't have its INCOMPLETE_SPLIT flag set.  On
    1172             :          * the first iteration, we temporarily release the lock on the current
    1173             :          * page, and check the left sibling and also construct a search stack
    1174             :          * to.  On subsequent iterations, we know we stepped right from a page
    1175             :          * that passed these tests, so it's OK.
    1176             :          */
    1177         549 :         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
    1178         540 :             P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    1179         270 :             P_INCOMPLETE_SPLIT(opaque))
    1180             :         {
    1181             :             /* Should never fail to delete a half-dead page */
    1182           9 :             Assert(!P_ISHALFDEAD(opaque));
    1183             : 
    1184           9 :             _bt_relbuf(rel, buf);
    1185           9 :             return ndeleted;
    1186             :         }
    1187             : 
    1188             :         /*
    1189             :          * First, remove downlink pointing to the page (or a parent of the
    1190             :          * page, if we are going to delete a taller branch), and mark the page
    1191             :          * as half-dead.
    1192             :          */
    1193         270 :         if (!P_ISHALFDEAD(opaque))
    1194             :         {
    1195             :             /*
    1196             :              * We need an approximate pointer to the page's parent page.  We
    1197             :              * use the standard search mechanism to search for the page's high
    1198             :              * key; this will give us a link to either the current parent or
    1199             :              * someplace to its left (if there are multiple equal high keys).
    1200             :              *
    1201             :              * Also check if this is the right-half of an incomplete split
    1202             :              * (see comment above).
    1203             :              */
    1204         270 :             if (!stack)
    1205             :             {
    1206             :                 ScanKey     itup_scankey;
    1207             :                 ItemId      itemid;
    1208             :                 IndexTuple  targetkey;
    1209             :                 Buffer      lbuf;
    1210             :                 BlockNumber leftsib;
    1211             : 
    1212         135 :                 itemid = PageGetItemId(page, P_HIKEY);
    1213         135 :                 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
    1214             : 
    1215         135 :                 leftsib = opaque->btpo_prev;
    1216             : 
    1217             :                 /*
    1218             :                  * To avoid deadlocks, we'd better drop the leaf page lock
    1219             :                  * before going further.
    1220             :                  */
    1221         135 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1222             : 
    1223             :                 /*
    1224             :                  * Fetch the left sibling, to check that it's not marked with
    1225             :                  * INCOMPLETE_SPLIT flag.  That would mean that the page
    1226             :                  * to-be-deleted doesn't have a downlink, and the page
    1227             :                  * deletion algorithm isn't prepared to handle that.
    1228             :                  */
    1229         135 :                 if (!P_LEFTMOST(opaque))
    1230             :                 {
    1231             :                     BTPageOpaque lopaque;
    1232             :                     Page        lpage;
    1233             : 
    1234          39 :                     lbuf = _bt_getbuf(rel, leftsib, BT_READ);
    1235          39 :                     lpage = BufferGetPage(lbuf);
    1236          39 :                     lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
    1237             : 
    1238             :                     /*
    1239             :                      * If the left sibling is split again by another backend,
    1240             :                      * after we released the lock, we know that the first
    1241             :                      * split must have finished, because we don't allow an
    1242             :                      * incompletely-split page to be split again.  So we don't
    1243             :                      * need to walk right here.
    1244             :                      */
    1245          78 :                     if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
    1246          39 :                         P_INCOMPLETE_SPLIT(lopaque))
    1247             :                     {
    1248           0 :                         ReleaseBuffer(buf);
    1249           0 :                         _bt_relbuf(rel, lbuf);
    1250           0 :                         return ndeleted;
    1251             :                     }
    1252          39 :                     _bt_relbuf(rel, lbuf);
    1253             :                 }
    1254             : 
    1255             :                 /* we need an insertion scan key for the search, so build one */
    1256         135 :                 itup_scankey = _bt_mkscankey(rel, targetkey);
    1257             :                 /* find the leftmost leaf page containing this key */
    1258         135 :                 stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey,
    1259             :                                    false, &lbuf, BT_READ, NULL);
    1260             :                 /* don't need a pin on the page */
    1261         135 :                 _bt_relbuf(rel, lbuf);
    1262             : 
    1263             :                 /*
    1264             :                  * Re-lock the leaf page, and start over, to re-check that the
    1265             :                  * page can still be deleted.
    1266             :                  */
    1267         135 :                 LockBuffer(buf, BT_WRITE);
    1268         135 :                 continue;
    1269             :             }
    1270             : 
    1271         135 :             if (!_bt_mark_page_halfdead(rel, buf, stack))
    1272             :             {
    1273           0 :                 _bt_relbuf(rel, buf);
    1274           0 :                 return ndeleted;
    1275             :             }
    1276             :         }
    1277             : 
    1278             :         /*
    1279             :          * Then unlink it from its siblings.  Each call to
    1280             :          * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
    1281             :          * making it shallower.  Iterate until the leaf page is gone.
    1282             :          */
    1283         135 :         rightsib_empty = false;
    1284         447 :         while (P_ISHALFDEAD(opaque))
    1285             :         {
    1286         177 :             if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
    1287             :             {
    1288             :                 /* _bt_unlink_halfdead_page already released buffer */
    1289           0 :                 return ndeleted;
    1290             :             }
    1291         177 :             ndeleted++;
    1292             :         }
    1293             : 
    1294         135 :         rightsib = opaque->btpo_next;
    1295             : 
    1296         135 :         _bt_relbuf(rel, buf);
    1297             : 
    1298             :         /*
    1299             :          * The page has now been deleted. If its right sibling is completely
    1300             :          * empty, it's possible that the reason we haven't deleted it earlier
    1301             :          * is that it was the rightmost child of the parent. Now that we
    1302             :          * removed the downlink for this page, the right sibling might now be
    1303             :          * the only child of the parent, and could be removed. It would be
    1304             :          * picked up by the next vacuum anyway, but might as well try to
    1305             :          * remove it now, so loop back to process the right sibling.
    1306             :          */
    1307         135 :         if (!rightsib_empty)
    1308         135 :             break;
    1309             : 
    1310           0 :         buf = _bt_getbuf(rel, rightsib, BT_WRITE);
    1311         135 :     }
    1312             : 
    1313         135 :     return ndeleted;
    1314             : }
    1315             : 
    1316             : /*
    1317             :  * First stage of page deletion.  Remove the downlink to the top of the
    1318             :  * branch being deleted, and mark the leaf page as half-dead.
    1319             :  */
    1320             : static bool
    1321         135 : _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
    1322             : {
    1323             :     BlockNumber leafblkno;
    1324             :     BlockNumber leafrightsib;
    1325             :     BlockNumber target;
    1326             :     BlockNumber rightsib;
    1327             :     ItemId      itemid;
    1328             :     Page        page;
    1329             :     BTPageOpaque opaque;
    1330             :     Buffer      topparent;
    1331             :     OffsetNumber topoff;
    1332             :     OffsetNumber nextoffset;
    1333             :     IndexTuple  itup;
    1334             :     IndexTupleData trunctuple;
    1335             : 
    1336         135 :     page = BufferGetPage(leafbuf);
    1337         135 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1338             : 
    1339         135 :     Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) &&
    1340             :            !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) &&
    1341             :            P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    1342             : 
    1343             :     /*
    1344             :      * Save info about the leaf page.
    1345             :      */
    1346         135 :     leafblkno = BufferGetBlockNumber(leafbuf);
    1347         135 :     leafrightsib = opaque->btpo_next;
    1348             : 
    1349             :     /*
    1350             :      * Before attempting to lock the parent page, check that the right sibling
    1351             :      * is not in half-dead state.  A half-dead right sibling would have no
    1352             :      * downlink in the parent, which would be highly confusing later when we
    1353             :      * delete the downlink that follows the current page's downlink. (I
    1354             :      * believe the deletion would work correctly, but it would fail the
    1355             :      * cross-check we make that the following downlink points to the right
    1356             :      * sibling of the delete page.)
    1357             :      */
    1358         135 :     if (_bt_is_page_halfdead(rel, leafrightsib))
    1359             :     {
    1360           0 :         elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
    1361             :              leafblkno, leafrightsib);
    1362           0 :         return false;
    1363             :     }
    1364             : 
    1365             :     /*
    1366             :      * We cannot delete a page that is the rightmost child of its immediate
    1367             :      * parent, unless it is the only child --- in which case the parent has to
    1368             :      * be deleted too, and the same condition applies recursively to it. We
    1369             :      * have to check this condition all the way up before trying to delete,
    1370             :      * and lock the final parent of the to-be-deleted branch.
    1371             :      */
    1372         135 :     rightsib = leafrightsib;
    1373         135 :     target = leafblkno;
    1374         135 :     if (!_bt_lock_branch_parent(rel, leafblkno, stack,
    1375             :                                 &topparent, &topoff, &target, &rightsib))
    1376           0 :         return false;
    1377             : 
    1378             :     /*
    1379             :      * Check that the parent-page index items we're about to delete/overwrite
    1380             :      * contain what we expect.  This can fail if the index has become corrupt
    1381             :      * for some reason.  We want to throw any error before entering the
    1382             :      * critical section --- otherwise it'd be a PANIC.
    1383             :      *
    1384             :      * The test on the target item is just an Assert because
    1385             :      * _bt_lock_branch_parent should have guaranteed it has the expected
    1386             :      * contents.  The test on the next-child downlink is known to sometimes
    1387             :      * fail in the field, though.
    1388             :      */
    1389         135 :     page = BufferGetPage(topparent);
    1390         135 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1391             : 
    1392             : #ifdef USE_ASSERT_CHECKING
    1393         135 :     itemid = PageGetItemId(page, topoff);
    1394         135 :     itup = (IndexTuple) PageGetItem(page, itemid);
    1395         135 :     Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
    1396             : #endif
    1397             : 
    1398         135 :     nextoffset = OffsetNumberNext(topoff);
    1399         135 :     itemid = PageGetItemId(page, nextoffset);
    1400         135 :     itup = (IndexTuple) PageGetItem(page, itemid);
    1401         135 :     if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
    1402           0 :         elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
    1403             :              rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
    1404             :              BufferGetBlockNumber(topparent), RelationGetRelationName(rel));
    1405             : 
    1406             :     /*
    1407             :      * Any insert which would have gone on the leaf block will now go to its
    1408             :      * right sibling.
    1409             :      */
    1410         135 :     PredicateLockPageCombine(rel, leafblkno, leafrightsib);
    1411             : 
    1412             :     /* No ereport(ERROR) until changes are logged */
    1413         135 :     START_CRIT_SECTION();
    1414             : 
    1415             :     /*
    1416             :      * Update parent.  The normal case is a tad tricky because we want to
    1417             :      * delete the target's downlink and the *following* key.  Easiest way is
    1418             :      * to copy the right sibling's downlink over the target downlink, and then
    1419             :      * delete the following item.
    1420             :      */
    1421         135 :     page = BufferGetPage(topparent);
    1422         135 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1423             : 
    1424         135 :     itemid = PageGetItemId(page, topoff);
    1425         135 :     itup = (IndexTuple) PageGetItem(page, itemid);
    1426         135 :     ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
    1427             : 
    1428         135 :     nextoffset = OffsetNumberNext(topoff);
    1429         135 :     PageIndexTupleDelete(page, nextoffset);
    1430             : 
    1431             :     /*
    1432             :      * Mark the leaf page as half-dead, and stamp it with a pointer to the
    1433             :      * highest internal page in the branch we're deleting.  We use the tid of
    1434             :      * the high key to store it.
    1435             :      */
    1436         135 :     page = BufferGetPage(leafbuf);
    1437         135 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1438         135 :     opaque->btpo_flags |= BTP_HALF_DEAD;
    1439             : 
    1440         135 :     PageIndexTupleDelete(page, P_HIKEY);
    1441         135 :     Assert(PageGetMaxOffsetNumber(page) == 0);
    1442         135 :     MemSet(&trunctuple, 0, sizeof(IndexTupleData));
    1443         135 :     trunctuple.t_info = sizeof(IndexTupleData);
    1444         135 :     if (target != leafblkno)
    1445          31 :         ItemPointerSet(&trunctuple.t_tid, target, P_HIKEY);
    1446             :     else
    1447         104 :         ItemPointerSetInvalid(&trunctuple.t_tid);
    1448         135 :     if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
    1449             :                     false, false) == InvalidOffsetNumber)
    1450           0 :         elog(ERROR, "could not add dummy high key to half-dead page");
    1451             : 
    1452             :     /* Must mark buffers dirty before XLogInsert */
    1453         135 :     MarkBufferDirty(topparent);
    1454         135 :     MarkBufferDirty(leafbuf);
    1455             : 
    1456             :     /* XLOG stuff */
    1457         135 :     if (RelationNeedsWAL(rel))
    1458             :     {
    1459             :         xl_btree_mark_page_halfdead xlrec;
    1460             :         XLogRecPtr  recptr;
    1461             : 
    1462         135 :         xlrec.poffset = topoff;
    1463         135 :         xlrec.leafblk = leafblkno;
    1464         135 :         if (target != leafblkno)
    1465          31 :             xlrec.topparent = target;
    1466             :         else
    1467         104 :             xlrec.topparent = InvalidBlockNumber;
    1468             : 
    1469         135 :         XLogBeginInsert();
    1470         135 :         XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
    1471         135 :         XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);
    1472             : 
    1473         135 :         page = BufferGetPage(leafbuf);
    1474         135 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1475         135 :         xlrec.leftblk = opaque->btpo_prev;
    1476         135 :         xlrec.rightblk = opaque->btpo_next;
    1477             : 
    1478         135 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
    1479             : 
    1480         135 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
    1481             : 
    1482         135 :         page = BufferGetPage(topparent);
    1483         135 :         PageSetLSN(page, recptr);
    1484         135 :         page = BufferGetPage(leafbuf);
    1485         135 :         PageSetLSN(page, recptr);
    1486             :     }
    1487             : 
    1488         135 :     END_CRIT_SECTION();
    1489             : 
    1490         135 :     _bt_relbuf(rel, topparent);
    1491         135 :     return true;
    1492             : }
    1493             : 
    1494             : /*
    1495             :  * Unlink a page in a branch of half-dead pages from its siblings.
    1496             :  *
    1497             :  * If the leaf page still has a downlink pointing to it, unlinks the highest
    1498             :  * parent in the to-be-deleted branch instead of the leaf page.  To get rid
    1499             :  * of the whole branch, including the leaf page itself, iterate until the
    1500             :  * leaf page is deleted.
    1501             :  *
    1502             :  * Returns 'false' if the page could not be unlinked (shouldn't happen).
    1503             :  * If the (new) right sibling of the page is empty, *rightsib_empty is set
    1504             :  * to true.
    1505             :  *
    1506             :  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
    1507             :  * On success exit, we'll be holding pin and write lock.  On failure exit,
    1508             :  * we'll release both pin and lock before returning (we define it that way
    1509             :  * to avoid having to reacquire a lock we already released).
    1510             :  */
    1511             : static bool
    1512         177 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
    1513             : {
    1514         177 :     BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
    1515             :     BlockNumber leafleftsib;
    1516             :     BlockNumber leafrightsib;
    1517             :     BlockNumber target;
    1518             :     BlockNumber leftsib;
    1519             :     BlockNumber rightsib;
    1520         177 :     Buffer      lbuf = InvalidBuffer;
    1521             :     Buffer      buf;
    1522             :     Buffer      rbuf;
    1523         177 :     Buffer      metabuf = InvalidBuffer;
    1524         177 :     Page        metapg = NULL;
    1525         177 :     BTMetaPageData *metad = NULL;
    1526             :     ItemId      itemid;
    1527             :     Page        page;
    1528             :     BTPageOpaque opaque;
    1529             :     bool        rightsib_is_rightmost;
    1530             :     int         targetlevel;
    1531             :     ItemPointer leafhikey;
    1532             :     BlockNumber nextchild;
    1533             : 
    1534         177 :     page = BufferGetPage(leafbuf);
    1535         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1536             : 
    1537         177 :     Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
    1538             : 
    1539             :     /*
    1540             :      * Remember some information about the leaf page.
    1541             :      */
    1542         177 :     itemid = PageGetItemId(page, P_HIKEY);
    1543         177 :     leafhikey = &((IndexTuple) PageGetItem(page, itemid))->t_tid;
    1544         177 :     leafleftsib = opaque->btpo_prev;
    1545         177 :     leafrightsib = opaque->btpo_next;
    1546             : 
    1547         177 :     LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
    1548             : 
    1549             :     /*
    1550             :      * If the leaf page still has a parent pointing to it (or a chain of
    1551             :      * parents), we don't unlink the leaf page yet, but the topmost remaining
    1552             :      * parent in the branch.  Set 'target' and 'buf' to reference the page
    1553             :      * actually being unlinked.
    1554             :      */
    1555         177 :     if (ItemPointerIsValid(leafhikey))
    1556             :     {
    1557          42 :         target = ItemPointerGetBlockNumber(leafhikey);
    1558          42 :         Assert(target != leafblkno);
    1559             : 
    1560             :         /* fetch the block number of the topmost parent's left sibling */
    1561          42 :         buf = _bt_getbuf(rel, target, BT_READ);
    1562          42 :         page = BufferGetPage(buf);
    1563          42 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1564          42 :         leftsib = opaque->btpo_prev;
    1565          42 :         targetlevel = opaque->btpo.level;
    1566             : 
    1567             :         /*
    1568             :          * To avoid deadlocks, we'd better drop the target page lock before
    1569             :          * going further.
    1570             :          */
    1571          42 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1572             :     }
    1573             :     else
    1574             :     {
    1575         135 :         target = leafblkno;
    1576             : 
    1577         135 :         buf = leafbuf;
    1578         135 :         leftsib = leafleftsib;
    1579         135 :         targetlevel = 0;
    1580             :     }
    1581             : 
    1582             :     /*
    1583             :      * We have to lock the pages we need to modify in the standard order:
    1584             :      * moving right, then up.  Else we will deadlock against other writers.
    1585             :      *
    1586             :      * So, first lock the leaf page, if it's not the target.  Then find and
    1587             :      * write-lock the current left sibling of the target page.  The sibling
    1588             :      * that was current a moment ago could have split, so we may have to move
    1589             :      * right.  This search could fail if either the sibling or the target page
    1590             :      * was deleted by someone else meanwhile; if so, give up.  (Right now,
    1591             :      * that should never happen, since page deletion is only done in VACUUM
    1592             :      * and there shouldn't be multiple VACUUMs concurrently on the same
    1593             :      * table.)
    1594             :      */
    1595         177 :     if (target != leafblkno)
    1596          42 :         LockBuffer(leafbuf, BT_WRITE);
    1597         177 :     if (leftsib != P_NONE)
    1598             :     {
    1599          39 :         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    1600          39 :         page = BufferGetPage(lbuf);
    1601          39 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1602          78 :         while (P_ISDELETED(opaque) || opaque->btpo_next != target)
    1603             :         {
    1604             :             /* step right one page */
    1605           0 :             leftsib = opaque->btpo_next;
    1606           0 :             _bt_relbuf(rel, lbuf);
    1607           0 :             if (leftsib == P_NONE)
    1608             :             {
    1609           0 :                 elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
    1610             :                      target,
    1611             :                      RelationGetRelationName(rel));
    1612           0 :                 if (target != leafblkno)
    1613             :                 {
    1614             :                     /* we have only a pin on target, but pin+lock on leafbuf */
    1615           0 :                     ReleaseBuffer(buf);
    1616           0 :                     _bt_relbuf(rel, leafbuf);
    1617             :                 }
    1618             :                 else
    1619             :                 {
    1620             :                     /* we have only a pin on leafbuf */
    1621           0 :                     ReleaseBuffer(leafbuf);
    1622             :                 }
    1623           0 :                 return false;
    1624             :             }
    1625           0 :             lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    1626           0 :             page = BufferGetPage(lbuf);
    1627           0 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1628             :         }
    1629             :     }
    1630             :     else
    1631         138 :         lbuf = InvalidBuffer;
    1632             : 
    1633             :     /*
    1634             :      * Next write-lock the target page itself.  It should be okay to take just
    1635             :      * a write lock not a superexclusive lock, since no scans would stop on an
    1636             :      * empty page.
    1637             :      */
    1638         177 :     LockBuffer(buf, BT_WRITE);
    1639         177 :     page = BufferGetPage(buf);
    1640         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1641             : 
    1642             :     /*
    1643             :      * Check page is still empty etc, else abandon deletion.  This is just for
    1644             :      * paranoia's sake; a half-dead page cannot resurrect because there can be
    1645             :      * only one vacuum process running at a time.
    1646             :      */
    1647         177 :     if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
    1648             :     {
    1649           0 :         elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
    1650             :              target, RelationGetRelationName(rel));
    1651             :     }
    1652         177 :     if (opaque->btpo_prev != leftsib)
    1653           0 :         elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
    1654             :              target, RelationGetRelationName(rel));
    1655             : 
    1656         177 :     if (target == leafblkno)
    1657             :     {
    1658         270 :         if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    1659         270 :             !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
    1660           0 :             elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
    1661             :                  target, RelationGetRelationName(rel));
    1662         135 :         nextchild = InvalidBlockNumber;
    1663             :     }
    1664             :     else
    1665             :     {
    1666          84 :         if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
    1667          42 :             P_ISLEAF(opaque))
    1668           0 :             elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
    1669             :                  target, RelationGetRelationName(rel));
    1670             : 
    1671             :         /* remember the next non-leaf child down in the branch. */
    1672          42 :         itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
    1673          42 :         nextchild = ItemPointerGetBlockNumber(&((IndexTuple) PageGetItem(page, itemid))->t_tid);
    1674          42 :         if (nextchild == leafblkno)
    1675          31 :             nextchild = InvalidBlockNumber;
    1676             :     }
    1677             : 
    1678             :     /*
    1679             :      * And next write-lock the (current) right sibling.
    1680             :      */
    1681         177 :     rightsib = opaque->btpo_next;
    1682         177 :     rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    1683         177 :     page = BufferGetPage(rbuf);
    1684         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1685         177 :     if (opaque->btpo_prev != target)
    1686           0 :         elog(ERROR, "right sibling's left-link doesn't match: "
    1687             :              "block %u links to %u instead of expected %u in index \"%s\"",
    1688             :              rightsib, opaque->btpo_prev, target,
    1689             :              RelationGetRelationName(rel));
    1690         177 :     rightsib_is_rightmost = P_RIGHTMOST(opaque);
    1691         177 :     *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    1692             : 
    1693             :     /*
    1694             :      * If we are deleting the next-to-last page on the target's level, then
    1695             :      * the rightsib is a candidate to become the new fast root. (In theory, it
    1696             :      * might be possible to push the fast root even further down, but the odds
    1697             :      * of doing so are slim, and the locking considerations daunting.)
    1698             :      *
    1699             :      * We don't support handling this in the case where the parent is becoming
    1700             :      * half-dead, even though it theoretically could occur.
    1701             :      *
    1702             :      * We can safely acquire a lock on the metapage here --- see comments for
    1703             :      * _bt_newroot().
    1704             :      */
    1705         177 :     if (leftsib == P_NONE && rightsib_is_rightmost)
    1706             :     {
    1707           4 :         page = BufferGetPage(rbuf);
    1708           4 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1709           4 :         if (P_RIGHTMOST(opaque))
    1710             :         {
    1711             :             /* rightsib will be the only one left on the level */
    1712           4 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    1713           4 :             metapg = BufferGetPage(metabuf);
    1714           4 :             metad = BTPageGetMeta(metapg);
    1715             : 
    1716             :             /*
    1717             :              * The expected case here is btm_fastlevel == targetlevel+1; if
    1718             :              * the fastlevel is <= targetlevel, something is wrong, and we
    1719             :              * choose to overwrite it to fix it.
    1720             :              */
    1721           4 :             if (metad->btm_fastlevel > targetlevel + 1)
    1722             :             {
    1723             :                 /* no update wanted */
    1724           0 :                 _bt_relbuf(rel, metabuf);
    1725           0 :                 metabuf = InvalidBuffer;
    1726             :             }
    1727             :         }
    1728             :     }
    1729             : 
    1730             :     /*
    1731             :      * Here we begin doing the deletion.
    1732             :      */
    1733             : 
    1734             :     /* No ereport(ERROR) until changes are logged */
    1735         177 :     START_CRIT_SECTION();
    1736             : 
    1737             :     /*
    1738             :      * Update siblings' side-links.  Note the target page's side-links will
    1739             :      * continue to point to the siblings.  Asserts here are just rechecking
    1740             :      * things we already verified above.
    1741             :      */
    1742         177 :     if (BufferIsValid(lbuf))
    1743             :     {
    1744          39 :         page = BufferGetPage(lbuf);
    1745          39 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1746          39 :         Assert(opaque->btpo_next == target);
    1747          39 :         opaque->btpo_next = rightsib;
    1748             :     }
    1749         177 :     page = BufferGetPage(rbuf);
    1750         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1751         177 :     Assert(opaque->btpo_prev == target);
    1752         177 :     opaque->btpo_prev = leftsib;
    1753             : 
    1754             :     /*
    1755             :      * If we deleted a parent of the targeted leaf page, instead of the leaf
    1756             :      * itself, update the leaf to point to the next remaining child in the
    1757             :      * branch.
    1758             :      */
    1759         177 :     if (target != leafblkno)
    1760             :     {
    1761          42 :         if (nextchild == InvalidBlockNumber)
    1762          31 :             ItemPointerSetInvalid(leafhikey);
    1763             :         else
    1764          11 :             ItemPointerSet(leafhikey, nextchild, P_HIKEY);
    1765             :     }
    1766             : 
    1767             :     /*
    1768             :      * Mark the page itself deleted.  It can be recycled when all current
    1769             :      * transactions are gone.  Storing GetTopTransactionId() would work, but
    1770             :      * we're in VACUUM and would not otherwise have an XID.  Having already
    1771             :      * updated links to the target, ReadNewTransactionId() suffices as an
    1772             :      * upper bound.  Any scan having retained a now-stale link is advertising
    1773             :      * in its PGXACT an xmin less than or equal to the value we read here.  It
    1774             :      * will continue to do so, holding back RecentGlobalXmin, for the duration
    1775             :      * of that scan.
    1776             :      */
    1777         177 :     page = BufferGetPage(buf);
    1778         177 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1779         177 :     opaque->btpo_flags &= ~BTP_HALF_DEAD;
    1780         177 :     opaque->btpo_flags |= BTP_DELETED;
    1781         177 :     opaque->btpo.xact = ReadNewTransactionId();
    1782             : 
    1783             :     /* And update the metapage, if needed */
    1784         177 :     if (BufferIsValid(metabuf))
    1785             :     {
    1786           4 :         metad->btm_fastroot = rightsib;
    1787           4 :         metad->btm_fastlevel = targetlevel;
    1788           4 :         MarkBufferDirty(metabuf);
    1789             :     }
    1790             : 
    1791             :     /* Must mark buffers dirty before XLogInsert */
    1792         177 :     MarkBufferDirty(rbuf);
    1793         177 :     MarkBufferDirty(buf);
    1794         177 :     if (BufferIsValid(lbuf))
    1795          39 :         MarkBufferDirty(lbuf);
    1796         177 :     if (target != leafblkno)
    1797          42 :         MarkBufferDirty(leafbuf);
    1798             : 
    1799             :     /* XLOG stuff */
    1800         177 :     if (RelationNeedsWAL(rel))
    1801             :     {
    1802             :         xl_btree_unlink_page xlrec;
    1803             :         xl_btree_metadata xlmeta;
    1804             :         uint8       xlinfo;
    1805             :         XLogRecPtr  recptr;
    1806             : 
    1807         177 :         XLogBeginInsert();
    1808             : 
    1809         177 :         XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
    1810         177 :         if (BufferIsValid(lbuf))
    1811          39 :             XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    1812         177 :         XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
    1813         177 :         if (target != leafblkno)
    1814          42 :             XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
    1815             : 
    1816             :         /* information on the unlinked block */
    1817         177 :         xlrec.leftsib = leftsib;
    1818         177 :         xlrec.rightsib = rightsib;
    1819         177 :         xlrec.btpo_xact = opaque->btpo.xact;
    1820             : 
    1821             :         /* information needed to recreate the leaf block (if not the target) */
    1822         177 :         xlrec.leafleftsib = leafleftsib;
    1823         177 :         xlrec.leafrightsib = leafrightsib;
    1824         177 :         xlrec.topparent = nextchild;
    1825             : 
    1826         177 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
    1827             : 
    1828         177 :         if (BufferIsValid(metabuf))
    1829             :         {
    1830           4 :             XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT);
    1831             : 
    1832           4 :             xlmeta.root = metad->btm_root;
    1833           4 :             xlmeta.level = metad->btm_level;
    1834           4 :             xlmeta.fastroot = metad->btm_fastroot;
    1835           4 :             xlmeta.fastlevel = metad->btm_fastlevel;
    1836             : 
    1837           4 :             XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
    1838           4 :             xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
    1839             :         }
    1840             :         else
    1841         173 :             xlinfo = XLOG_BTREE_UNLINK_PAGE;
    1842             : 
    1843         177 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    1844             : 
    1845         177 :         if (BufferIsValid(metabuf))
    1846             :         {
    1847           4 :             PageSetLSN(metapg, recptr);
    1848             :         }
    1849         177 :         page = BufferGetPage(rbuf);
    1850         177 :         PageSetLSN(page, recptr);
    1851         177 :         page = BufferGetPage(buf);
    1852         177 :         PageSetLSN(page, recptr);
    1853         177 :         if (BufferIsValid(lbuf))
    1854             :         {
    1855          39 :             page = BufferGetPage(lbuf);
    1856          39 :             PageSetLSN(page, recptr);
    1857             :         }
    1858         177 :         if (target != leafblkno)
    1859             :         {
    1860          42 :             page = BufferGetPage(leafbuf);
    1861          42 :             PageSetLSN(page, recptr);
    1862             :         }
    1863             :     }
    1864             : 
    1865         177 :     END_CRIT_SECTION();
    1866             : 
    1867             :     /* release metapage */
    1868         177 :     if (BufferIsValid(metabuf))
    1869           4 :         _bt_relbuf(rel, metabuf);
    1870             : 
    1871             :     /* release siblings */
    1872         177 :     if (BufferIsValid(lbuf))
    1873          39 :         _bt_relbuf(rel, lbuf);
    1874         177 :     _bt_relbuf(rel, rbuf);
    1875             : 
    1876             :     /*
    1877             :      * Release the target, if it was not the leaf block.  The leaf is always
    1878             :      * kept locked.
    1879             :      */
    1880         177 :     if (target != leafblkno)
    1881          42 :         _bt_relbuf(rel, buf);
    1882             : 
    1883         177 :     return true;
    1884             : }

Generated by: LCOV version 1.11