LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtree.c (source / functions)
Test:      PostgreSQL
Date:      2017-09-29 13:40:31
Lines:     378 of 426 hit (88.7 %)
Functions: 23 of 24 hit (95.8 %)

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtree.c
       4             :  *    Implementation of Lehman and Yao's btree management algorithm for
       5             :  *    Postgres.
       6             :  *
       7             :  * NOTES
       8             :  *    This file contains only the public interface routines.
       9             :  *
      10             :  *
      11             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
      12             :  * Portions Copyright (c) 1994, Regents of the University of California
      13             :  *
      14             :  * IDENTIFICATION
      15             :  *    src/backend/access/nbtree/nbtree.c
      16             :  *
      17             :  *-------------------------------------------------------------------------
      18             :  */
      19             : #include "postgres.h"
      20             : 
      21             : #include "access/nbtree.h"
      22             : #include "access/relscan.h"
      23             : #include "access/xlog.h"
      24             : #include "catalog/index.h"
      25             : #include "commands/vacuum.h"
      26             : #include "pgstat.h"
      27             : #include "storage/condition_variable.h"
      28             : #include "storage/indexfsm.h"
      29             : #include "storage/ipc.h"
      30             : #include "storage/lmgr.h"
      31             : #include "storage/smgr.h"
      32             : #include "tcop/tcopprot.h"        /* pgrminclude ignore */
      33             : #include "utils/builtins.h"
      34             : #include "utils/index_selfuncs.h"
      35             : #include "utils/memutils.h"
      36             : 
      37             : 
      38             : /* Working state for btbuild and its callback */
      39             : typedef struct
      40             : {
      41             :     bool        isUnique;
      42             :     bool        haveDead;
      43             :     Relation    heapRel;
      44             :     BTSpool    *spool;
      45             : 
      46             :     /*
      47             :      * spool2 is needed only when the index is a unique index. Dead tuples are
      48             :      * put into spool2 instead of spool in order to avoid the uniqueness check.
      49             :      */
      50             :     BTSpool    *spool2;
      51             :     double      indtuples;
      52             : } BTBuildState;
      53             : 
      54             : /* Working state needed by btvacuumpage */
      55             : typedef struct
      56             : {
      57             :     IndexVacuumInfo *info;
      58             :     IndexBulkDeleteResult *stats;
      59             :     IndexBulkDeleteCallback callback;
      60             :     void       *callback_state;
      61             :     BTCycleId   cycleid;
      62             :     BlockNumber lastBlockVacuumed;  /* highest blkno actually vacuumed */
      63             :     BlockNumber lastBlockLocked;    /* highest blkno we've cleanup-locked */
      64             :     BlockNumber totFreePages;   /* true total # of free pages */
      65             :     MemoryContext pagedelcontext;
      66             : } BTVacState;
      67             : 
      68             : /*
      69             :  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
      70             :  *
      71             :  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
      72             :  * a new page; others must wait.
      73             :  *
      74             :  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
      75             :  * to a new page; some process can start doing that.
      76             :  *
      77             :  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
      78             :  * We reach this state once for every distinct combination of array keys.
      79             :  */
      80             : typedef enum
      81             : {
      82             :     BTPARALLEL_NOT_INITIALIZED,
      83             :     BTPARALLEL_ADVANCING,
      84             :     BTPARALLEL_IDLE,
      85             :     BTPARALLEL_DONE
      86             : } BTPS_State;
      87             : 
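                     : /*
                     :  * Editorial summary of the transitions described above (derived from
                     :  * the comments here and the _bt_parallel_* functions below; not part
                     :  * of the original file):
                     :  *
                     :  *   BTPARALLEL_NOT_INITIALIZED -> ADVANCING   first _bt_parallel_seize()
                     :  *   BTPARALLEL_ADVANCING       -> IDLE        _bt_parallel_release()
                     :  *   BTPARALLEL_IDLE            -> ADVANCING   later _bt_parallel_seize()
                     :  *   (any state except DONE)    -> DONE        _bt_parallel_done()
                     :  *   BTPARALLEL_DONE -> NOT_INITIALIZED        _bt_parallel_advance_array_keys()
                     :  */
                     : 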
      88             : /*
      89             :  * BTParallelScanDescData contains btree specific shared information required
      90             :  * for parallel scan.
      91             :  */
      92             : typedef struct BTParallelScanDescData
      93             : {
      94             :     BlockNumber btps_scanPage;  /* latest or next page to be scanned */
      95             :     BTPS_State  btps_pageStatus;    /* indicates whether next page is
      96             :                                      * available for scan. See above for
      97             :                                      * possible states of parallel scan. */
      98             :     int         btps_arrayKeyCount; /* count indicating number of array scan
      99             :                                      * keys processed by parallel scan */
     100             :     slock_t     btps_mutex;     /* protects above variables */
     101             :     ConditionVariable btps_cv;  /* used to synchronize parallel scan */
     102             : }           BTParallelScanDescData;
     103             : 
     104             : typedef struct BTParallelScanDescData *BTParallelScanDesc;
     105             : 
     106             : 
     107             : static void btbuildCallback(Relation index,
     108             :                 HeapTuple htup,
     109             :                 Datum *values,
     110             :                 bool *isnull,
     111             :                 bool tupleIsAlive,
     112             :                 void *state);
     113             : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     114             :              IndexBulkDeleteCallback callback, void *callback_state,
     115             :              BTCycleId cycleid);
     116             : static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
     117             :              BlockNumber orig_blkno);
     118             : 
     119             : 
     120             : /*
     121             :  * Btree handler function: return IndexAmRoutine with access method parameters
     122             :  * and callbacks.
     123             :  */
     124             : Datum
     125       35320 : bthandler(PG_FUNCTION_ARGS)
     126             : {
     127       35320 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
     128             : 
     129       35320 :     amroutine->amstrategies = BTMaxStrategyNumber;
     130       35320 :     amroutine->amsupport = BTNProcs;
     131       35320 :     amroutine->amcanorder = true;
     132       35320 :     amroutine->amcanorderbyop = false;
     133       35320 :     amroutine->amcanbackward = true;
     134       35320 :     amroutine->amcanunique = true;
     135       35320 :     amroutine->amcanmulticol = true;
     136       35320 :     amroutine->amoptionalkey = true;
     137       35320 :     amroutine->amsearcharray = true;
     138       35320 :     amroutine->amsearchnulls = true;
     139       35320 :     amroutine->amstorage = false;
     140       35320 :     amroutine->amclusterable = true;
     141       35320 :     amroutine->ampredlocks = true;
     142       35320 :     amroutine->amcanparallel = true;
     143       35320 :     amroutine->amkeytype = InvalidOid;
     144             : 
     145       35320 :     amroutine->ambuild = btbuild;
     146       35320 :     amroutine->ambuildempty = btbuildempty;
     147       35320 :     amroutine->aminsert = btinsert;
     148       35320 :     amroutine->ambulkdelete = btbulkdelete;
     149       35320 :     amroutine->amvacuumcleanup = btvacuumcleanup;
     150       35320 :     amroutine->amcanreturn = btcanreturn;
     151       35320 :     amroutine->amcostestimate = btcostestimate;
     152       35320 :     amroutine->amoptions = btoptions;
     153       35320 :     amroutine->amproperty = btproperty;
     154       35320 :     amroutine->amvalidate = btvalidate;
     155       35320 :     amroutine->ambeginscan = btbeginscan;
     156       35320 :     amroutine->amrescan = btrescan;
     157       35320 :     amroutine->amgettuple = btgettuple;
     158       35320 :     amroutine->amgetbitmap = btgetbitmap;
     159       35320 :     amroutine->amendscan = btendscan;
     160       35320 :     amroutine->ammarkpos = btmarkpos;
     161       35320 :     amroutine->amrestrpos = btrestrpos;
     162       35320 :     amroutine->amestimateparallelscan = btestimateparallelscan;
     163       35320 :     amroutine->aminitparallelscan = btinitparallelscan;
     164       35320 :     amroutine->amparallelrescan = btparallelrescan;
     165             : 
     166       35320 :     PG_RETURN_POINTER(amroutine);
     167             : }
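                     : 
                     : /*
                     :  * Editorial sketch, not part of the measured file: core code reaches
                     :  * bthandler() through the function manager whenever it needs the access
                     :  * method's IndexAmRoutine (see GetIndexAmRoutine() in access/amapi.c).
                     :  * The helper below is a hedged approximation of that step, not the real
                     :  * implementation; sketch_get_am_routine is an invented name.
                     :  */
                     : #ifdef EDITOR_SKETCH
                     : static IndexAmRoutine *
                     : sketch_get_am_routine(Oid amhandler)
                     : {
                     :     /* Call the handler (e.g. bthandler) by its pg_proc OID. */
                     :     Datum       datum = OidFunctionCall0(amhandler);
                     :     IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);
                     : 
                     :     if (routine == NULL || !IsA(routine, IndexAmRoutine))
                     :         elog(ERROR, "handler did not return an IndexAmRoutine");
                     : 
                     :     return routine;
                     : }
                     : #endif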
     168             : 
     169             : /*
     170             :  *  btbuild() -- build a new btree index.
     171             :  */
     172             : IndexBuildResult *
     173        1429 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     174             : {
     175             :     IndexBuildResult *result;
     176             :     double      reltuples;
     177             :     BTBuildState buildstate;
     178             : 
     179        1429 :     buildstate.isUnique = indexInfo->ii_Unique;
     180        1429 :     buildstate.haveDead = false;
     181        1429 :     buildstate.heapRel = heap;
     182        1429 :     buildstate.spool = NULL;
     183        1429 :     buildstate.spool2 = NULL;
     184        1429 :     buildstate.indtuples = 0;
     185             : 
     186             : #ifdef BTREE_BUILD_STATS
     187             :     if (log_btree_build_stats)
     188             :         ResetUsage();
     189             : #endif                          /* BTREE_BUILD_STATS */
     190             : 
     191             :     /*
     192             :      * We expect to be called exactly once for any index relation. If that's
     193             :      * not the case, big trouble's what we have.
     194             :      */
     195        1429 :     if (RelationGetNumberOfBlocks(index) != 0)
     196           0 :         elog(ERROR, "index \"%s\" already contains data",
     197             :              RelationGetRelationName(index));
     198             : 
     199        1429 :     buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
     200             : 
     201             :     /*
     202             :      * If building a unique index, put dead tuples in a second spool to keep
     203             :      * them out of the uniqueness check.
     204             :      */
     205        1429 :     if (indexInfo->ii_Unique)
     206        1197 :         buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
     207             : 
     208             :     /* do the heap scan */
     209        1429 :     reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
     210             :                                    btbuildCallback, (void *) &buildstate);
     211             : 
     212             :     /* okay, all heap tuples are indexed */
     213        1428 :     if (buildstate.spool2 && !buildstate.haveDead)
     214             :     {
     215             :         /* spool2 turns out to be unnecessary */
     216        1196 :         _bt_spooldestroy(buildstate.spool2);
     217        1196 :         buildstate.spool2 = NULL;
     218             :     }
     219             : 
     220             :     /*
     221             :      * Finish the build by (1) completing the sort of the spool file, (2)
     222             :      * inserting the sorted tuples into btree pages and (3) building the upper
     223             :      * levels.
     224             :      */
     225        1428 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     226        1422 :     _bt_spooldestroy(buildstate.spool);
     227        1422 :     if (buildstate.spool2)
     228           1 :         _bt_spooldestroy(buildstate.spool2);
     229             : 
     230             : #ifdef BTREE_BUILD_STATS
     231             :     if (log_btree_build_stats)
     232             :     {
     233             :         ShowUsage("BTREE BUILD STATS");
     234             :         ResetUsage();
     235             :     }
     236             : #endif                          /* BTREE_BUILD_STATS */
     237             : 
     238             :     /*
     239             :      * Return statistics
     240             :      */
     241        1422 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     242             : 
     243        1422 :     result->heap_tuples = reltuples;
     244        1422 :     result->index_tuples = buildstate.indtuples;
     245             : 
     246        1422 :     return result;
     247             : }
     248             : 
     249             : /*
     250             :  * Per-tuple callback from IndexBuildHeapScan
     251             :  */
     252             : static void
     253      506550 : btbuildCallback(Relation index,
     254             :                 HeapTuple htup,
     255             :                 Datum *values,
     256             :                 bool *isnull,
     257             :                 bool tupleIsAlive,
     258             :                 void *state)
     259             : {
     260      506550 :     BTBuildState *buildstate = (BTBuildState *) state;
     261             : 
     262             :     /*
     263             :      * insert the index tuple into the appropriate spool file for subsequent
     264             :      * processing
     265             :      */
     266      506550 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     267      506544 :         _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
     268             :     else
     269             :     {
     270             :         /* dead tuples are put into spool2 */
     271           6 :         buildstate->haveDead = true;
     272           6 :         _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
     273             :     }
     274             : 
     275      506550 :     buildstate->indtuples += 1;
     276      506550 : }
     277             : 
     278             : /*
     279             :  *  btbuildempty() -- build an empty btree index in the initialization fork
     280             :  */
     281             : void
     282          11 : btbuildempty(Relation index)
     283             : {
     284             :     Page        metapage;
     285             : 
     286             :     /* Construct metapage. */
     287          11 :     metapage = (Page) palloc(BLCKSZ);
     288          11 :     _bt_initmetapage(metapage, P_NONE, 0);
     289             : 
     290             :     /*
     291             :      * Write the page and log it.  It might seem that an immediate sync would
     292             :      * be sufficient to guarantee that the file exists on disk, but recovery
     293             :      * itself might remove it while replaying, for example, an
     294             :      * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
     295             :      * this even when wal_level=minimal.
     296             :      */
     297          11 :     PageSetChecksumInplace(metapage, BTREE_METAPAGE);
     298          11 :     smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
     299             :               (char *) metapage, true);
     300          11 :     log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
     301             :                 BTREE_METAPAGE, metapage, false);
     302             : 
     303             :     /*
     304             :      * An immediate sync is required even if we xlog'd the page, because the
     305             :      * write did not go through shared_buffers and therefore a concurrent
     306             :      * checkpoint may have moved the redo pointer past our xlog record.
     307             :      */
     308          11 :     smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
     309          11 : }
     310             : 
     311             : /*
     312             :  *  btinsert() -- insert an index tuple into a btree.
     313             :  *
     314             :  *      Descend the tree recursively, find the appropriate location for our
     315             :  *      new tuple, and put it there.
     316             :  */
     317             : bool
     318      205514 : btinsert(Relation rel, Datum *values, bool *isnull,
     319             :          ItemPointer ht_ctid, Relation heapRel,
     320             :          IndexUniqueCheck checkUnique,
     321             :          IndexInfo *indexInfo)
     322             : {
     323             :     bool        result;
     324             :     IndexTuple  itup;
     325             : 
     326             :     /* generate an index tuple */
     327      205514 :     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
     328      205514 :     itup->t_tid = *ht_ctid;
     329             : 
     330      205514 :     result = _bt_doinsert(rel, itup, checkUnique, heapRel);
     331             : 
     332      205482 :     pfree(itup);
     333             : 
     334      205482 :     return result;
     335             : }
     336             : 
     337             : /*
     338             :  *  btgettuple() -- Get the next tuple in the scan.
     339             :  */
     340             : bool
     341     1177790 : btgettuple(IndexScanDesc scan, ScanDirection dir)
     342             : {
     343     1177790 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     344             :     bool        res;
     345             : 
     346             :     /* btree indexes are never lossy */
     347     1177790 :     scan->xs_recheck = false;
     348             : 
     349             :     /*
     350             :      * If we have any array keys, initialize them during first call for a
     351             :      * scan.  We can't do this in btrescan because we don't know the scan
     352             :      * direction at that time.
     353             :      */
     354     1177790 :     if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
     355             :     {
     356             :         /* punt if we have any unsatisfiable array keys */
     357           3 :         if (so->numArrayKeys < 0)
     358           0 :             return false;
     359             : 
     360           3 :         _bt_start_array_keys(scan, dir);
     361             :     }
     362             : 
     363             :     /* This loop handles advancing to the next array elements, if any */
     364             :     do
     365             :     {
     366             :         /*
     367             :          * If we've already initialized this scan, we can just advance it in
     368             :          * the appropriate direction.  If we haven't done so yet, we call
     369             :          * _bt_first() to get the first item in the scan.
     370             :          */
     371     1177795 :         if (!BTScanPosIsValid(so->currPos))
     372      387164 :             res = _bt_first(scan, dir);
     373             :         else
     374             :         {
     375             :             /*
     376             :              * Check to see if we should kill the previously-fetched tuple.
     377             :              */
     378      790631 :             if (scan->kill_prior_tuple)
     379             :             {
     380             :                 /*
     381             :                  * Yes, remember it for later. (We'll deal with all such
     382             :                  * tuples at once right before leaving the index page.)  The
     383             :                  * test for numKilled overrun is not just paranoia: if the
     384             :                  * caller reverses direction in the indexscan then the same
     385             :                  * item might get entered multiple times. It's not worth
     386             :                  * trying to optimize that, so we don't detect it, but instead
     387             :                  * just forget any excess entries.
     388             :                  */
     389        5116 :                 if (so->killedItems == NULL)
     390        2871 :                     so->killedItems = (int *)
     391        2871 :                         palloc(MaxIndexTuplesPerPage * sizeof(int));
     392        5116 :                 if (so->numKilled < MaxIndexTuplesPerPage)
     393        5116 :                     so->killedItems[so->numKilled++] = so->currPos.itemIndex;
     394             :             }
     395             : 
     396             :             /*
     397             :              * Now continue the scan.
     398             :              */
     399      790631 :             res = _bt_next(scan, dir);
     400             :         }
     401             : 
     402             :         /* If we have a tuple, return it ... */
     403     1177795 :         if (res)
     404      986779 :             break;
     405             :         /* ... otherwise see if we have more array keys to deal with */
     406      191016 :     } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
     407             : 
     408     1177790 :     return res;
     409             : }
     410             : 
     411             : /*
     412             :  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
     413             :  */
     414             : int64
     415         831 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     416             : {
     417         831 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     418         831 :     int64       ntids = 0;
     419             :     ItemPointer heapTid;
     420             : 
     421             :     /*
     422             :      * If we have any array keys, initialize them.
     423             :      */
     424         831 :     if (so->numArrayKeys)
     425             :     {
     426             :         /* punt if we have any unsatisfiable array keys */
     427          28 :         if (so->numArrayKeys < 0)
     428           0 :             return ntids;
     429             : 
     430          28 :         _bt_start_array_keys(scan, ForwardScanDirection);
     431             :     }
     432             : 
     433             :     /* This loop handles advancing to the next array elements, if any */
     434             :     do
     435             :     {
     436             :         /* Fetch the first page & tuple */
     437         902 :         if (_bt_first(scan, ForwardScanDirection))
     438             :         {
     439             :             /* Save tuple ID, and continue scanning */
     440         419 :             heapTid = &scan->xs_ctup.t_self;
     441         419 :             tbm_add_tuples(tbm, heapTid, 1, false);
     442         419 :             ntids++;
     443             : 
     444             :             for (;;)
     445             :             {
     446             :                 /*
     447             :                  * Advance to next tuple within page.  This is the same as the
     448             :                  * easy case in _bt_next().
     449             :                  */
     450      211640 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
     451             :                 {
     452             :                     /* let _bt_next do the heavy lifting */
     453         978 :                     if (!_bt_next(scan, ForwardScanDirection))
     454         419 :                         break;
     455             :                 }
     456             : 
     457             :                 /* Save tuple ID, and continue scanning */
     458      211221 :                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
     459      211221 :                 tbm_add_tuples(tbm, heapTid, 1, false);
     460      211221 :                 ntids++;
     461      211221 :             }
     462             :         }
     463             :         /* Now see if we have more array keys to deal with */
     464         902 :     } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
     465             : 
     466         831 :     return ntids;
     467             : }
     468             : 
     469             : /*
     470             :  *  btbeginscan() -- start a scan on a btree index
     471             :  */
     472             : IndexScanDesc
     473      362281 : btbeginscan(Relation rel, int nkeys, int norderbys)
     474             : {
     475             :     IndexScanDesc scan;
     476             :     BTScanOpaque so;
     477             : 
     478             :     /* no order by operators allowed */
     479      362281 :     Assert(norderbys == 0);
     480             : 
     481             :     /* get the scan */
     482      362281 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
     483             : 
     484             :     /* allocate private workspace */
     485      362281 :     so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
     486      362281 :     BTScanPosInvalidate(so->currPos);
     487      362281 :     BTScanPosInvalidate(so->markPos);
     488      362281 :     if (scan->numberOfKeys > 0)
     489      361923 :         so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     490             :     else
     491         358 :         so->keyData = NULL;
     492             : 
     493      362281 :     so->arrayKeyData = NULL; /* assume no array keys for now */
     494      362281 :     so->numArrayKeys = 0;
     495      362281 :     so->arrayKeys = NULL;
     496      362281 :     so->arrayContext = NULL;
     497             : 
     498      362281 :     so->killedItems = NULL;      /* until needed */
     499      362281 :     so->numKilled = 0;
     500             : 
     501             :     /*
     502             :      * We don't know yet whether the scan will be index-only, so we do not
     503             :      * allocate the tuple workspace arrays until btrescan.  However, we set up
     504             :      * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     505             :      */
     506      362281 :     so->currTuples = so->markTuples = NULL;
     507             : 
     508      362281 :     scan->xs_itupdesc = RelationGetDescr(rel);
     509             : 
     510      362281 :     scan->opaque = so;
     511             : 
     512      362281 :     return scan;
     513             : }
     514             : 
     515             : /*
     516             :  *  btrescan() -- rescan an index relation
     517             :  */
     518             : void
     519      388316 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     520             :          ScanKey orderbys, int norderbys)
     521             : {
     522      388316 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     523             : 
     524             :     /* we aren't holding any read locks, but gotta drop the pins */
     525      388316 :     if (BTScanPosIsValid(so->currPos))
     526             :     {
     527             :         /* Before leaving current page, deal with any killed items */
     528        3497 :         if (so->numKilled > 0)
     529           0 :             _bt_killitems(scan);
     530        3497 :         BTScanPosUnpinIfPinned(so->currPos);
     531        3497 :         BTScanPosInvalidate(so->currPos);
     532             :     }
     533             : 
     534      388316 :     so->markItemIndex = -1;
     535      388316 :     so->arrayKeyCount = 0;
     536      388316 :     BTScanPosUnpinIfPinned(so->markPos);
     537      388316 :     BTScanPosInvalidate(so->markPos);
     538             : 
     539             :     /*
     540             :      * Allocate tuple workspace arrays, if needed for an index-only scan and
     541             :      * not already done in a previous rescan call.  To save on palloc
     542             :      * overhead, both workspaces are allocated as one palloc block; only this
     543             :      * function and btendscan know that.
     544             :      *
     545             :      * NOTE: this data structure also makes it safe to return data from a
     546             :      * "name" column, even though btree name_ops uses an underlying storage
     547             :      * datatype of cstring.  The risk there is that "name" is supposed to be
     548             :      * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     549             :      * However, since we only return data out of tuples sitting in the
     550             :      * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     551             :      * data out of the markTuples array --- running off the end of memory for
     552             :      * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     553             :      * adding special-case treatment for name_ops elsewhere.
     554             :      */
     555      388316 :     if (scan->xs_want_itup && so->currTuples == NULL)
     556             :     {
     557         436 :         so->currTuples = (char *) palloc(BLCKSZ * 2);
     558         436 :         so->markTuples = so->currTuples + BLCKSZ;
     559             :     }
     560             : 
     561             :     /*
     562             :      * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
     563             :      * - vadim 05/05/97
     564             :      */
     565      388316 :     if (scankey && scan->numberOfKeys > 0)
     566      387958 :         memmove(scan->keyData,
     567             :                 scankey,
     568      387958 :                 scan->numberOfKeys * sizeof(ScanKeyData));
     569      388316 :     so->numberOfKeys = 0;        /* until _bt_preprocess_keys sets it */
     570             : 
     571             :     /* If any keys are SK_SEARCHARRAY type, set up array-key info */
     572      388316 :     _bt_preprocess_array_keys(scan);
     573      388316 : }
     574             : 
     575             : /*
     576             :  *  btendscan() -- close down a scan
     577             :  */
     578             : void
     579      362179 : btendscan(IndexScanDesc scan)
     580             : {
     581      362179 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     582             : 
     583             :     /* we aren't holding any read locks, but gotta drop the pins */
     584      362179 :     if (BTScanPosIsValid(so->currPos))
     585             :     {
     586             :         /* Before leaving current page, deal with any killed items */
     587      192582 :         if (so->numKilled > 0)
     588         758 :             _bt_killitems(scan);
     589      192582 :         BTScanPosUnpinIfPinned(so->currPos);
     590             :     }
     591             : 
     592      362179 :     so->markItemIndex = -1;
     593      362179 :     BTScanPosUnpinIfPinned(so->markPos);
     594             : 
     595             :     /* No need to invalidate positions, the RAM is about to be freed. */
     596             : 
     597             :     /* Release storage */
     598      362179 :     if (so->keyData != NULL)
     599      361826 :         pfree(so->keyData);
     600             :     /* so->arrayKeyData and so->arrayKeys are in arrayContext */
     601      362179 :     if (so->arrayContext != NULL)
     602          31 :         MemoryContextDelete(so->arrayContext);
     603      362179 :     if (so->killedItems != NULL)
     604        2867 :         pfree(so->killedItems);
     605      362179 :     if (so->currTuples != NULL)
     606         429 :         pfree(so->currTuples);
     607             :     /* so->markTuples should not be pfree'd, see btrescan */
     608      362179 :     pfree(so);
     609      362179 : }
     610             : 
     611             : /*
     612             :  *  btmarkpos() -- save current scan position
     613             :  */
     614             : void
     615       21001 : btmarkpos(IndexScanDesc scan)
     616             : {
     617       21001 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     618             : 
     619             :     /* There may be an old mark with a pin (but no lock). */
     620       21001 :     BTScanPosUnpinIfPinned(so->markPos);
     621             : 
     622             :     /*
     623             :      * Just record the current itemIndex.  If we later step to next page
     624             :      * before releasing the marked position, _bt_steppage makes a full copy of
     625             :      * the currPos struct in markPos.  If (as often happens) the mark is moved
     626             :      * before we leave the page, we don't have to do that work.
     627             :      */
     628       21001 :     if (BTScanPosIsValid(so->currPos))
     629       21001 :         so->markItemIndex = so->currPos.itemIndex;
     630             :     else
     631             :     {
     632           0 :         BTScanPosInvalidate(so->markPos);
     633           0 :         so->markItemIndex = -1;
     634             :     }
     635             : 
     636             :     /* Also record the current positions of any array keys */
     637       21001 :     if (so->numArrayKeys)
     638           0 :         _bt_mark_array_keys(scan);
     639       21001 : }
     640             : 
     641             : /*
     642             :  *  btrestrpos() -- restore scan to last saved position
     643             :  */
     644             : void
     645        9001 : btrestrpos(IndexScanDesc scan)
     646             : {
     647        9001 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     648             : 
     649             :     /* Restore the marked positions of any array keys */
     650        9001 :     if (so->numArrayKeys)
     651           0 :         _bt_restore_array_keys(scan);
     652             : 
     653        9001 :     if (so->markItemIndex >= 0)
     654             :     {
     655             :         /*
     656             :          * The scan has never moved to a new page since the last mark.  Just
     657             :          * restore the itemIndex.
     658             :          *
     659             :          * NB: In this case we can't count on anything in so->markPos to be
     660             :          * accurate.
     661             :          */
     662        8983 :         so->currPos.itemIndex = so->markItemIndex;
     663             :     }
     664             :     else
     665             :     {
     666             :         /*
     667             :          * The scan moved to a new page after last mark or restore, and we are
     668             :          * now restoring to the marked page.  We aren't holding any read
     669             :          * locks, but if we're still holding the pin for the current position,
     670             :          * we must drop it.
     671             :          */
     672          18 :         if (BTScanPosIsValid(so->currPos))
     673             :         {
     674             :             /* Before leaving current page, deal with any killed items */
     675          18 :             if (so->numKilled > 0)
     676           0 :                 _bt_killitems(scan);
     677          18 :             BTScanPosUnpinIfPinned(so->currPos);
     678             :         }
     679             : 
     680          18 :         if (BTScanPosIsValid(so->markPos))
     681             :         {
     682             :             /* bump pin on mark buffer for assignment to current buffer */
     683          18 :             if (BTScanPosIsPinned(so->markPos))
     684           0 :                 IncrBufferRefCount(so->markPos.buf);
     685          18 :             memcpy(&so->currPos, &so->markPos,
     686             :                    offsetof(BTScanPosData, items[1]) +
     687          18 :                    so->markPos.lastItem * sizeof(BTScanPosItem));
     688          18 :             if (so->currTuples)
     689           0 :                 memcpy(so->currTuples, so->markTuples,
     690           0 :                        so->markPos.nextTupleOffset);
     691             :         }
     692             :         else
     693           0 :             BTScanPosInvalidate(so->currPos);
     694             :     }
     695        9001 : }
     696             : 
     697             : /*
     698             :  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
     699             :  */
     700             : Size
     701           5 : btestimateparallelscan(void)
     702             : {
     703           5 :     return sizeof(BTParallelScanDescData);
     704             : }
     705             : 
     706             : /*
     707             :  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
     708             :  */
     709             : void
     710           5 : btinitparallelscan(void *target)
     711             : {
     712           5 :     BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
     713             : 
     714           5 :     SpinLockInit(&bt_target->btps_mutex);
     715           5 :     bt_target->btps_scanPage = InvalidBlockNumber;
     716           5 :     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     717           5 :     bt_target->btps_arrayKeyCount = 0;
     718           5 :     ConditionVariableInit(&bt_target->btps_cv);
     719           5 : }
     720             : 
     721             : /*
     722             :  *  btparallelrescan() -- reset parallel scan
     723             :  */
     724             : void
     725           4 : btparallelrescan(IndexScanDesc scan)
     726             : {
     727             :     BTParallelScanDesc btscan;
     728           4 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     729             : 
     730           4 :     Assert(parallel_scan);
     731             : 
     732           4 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     733             :                                                   parallel_scan->ps_offset);
     734             : 
     735             :     /*
     736             :      * In theory, we don't need to acquire the spinlock here, because there
     737             :      * shouldn't be any other workers running at this point, but we do so for
     738             :      * consistency.
     739             :      */
     740           4 :     SpinLockAcquire(&btscan->btps_mutex);
     741           4 :     btscan->btps_scanPage = InvalidBlockNumber;
     742           4 :     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     743           4 :     btscan->btps_arrayKeyCount = 0;
     744           4 :     SpinLockRelease(&btscan->btps_mutex);
     745           4 : }
     746             : 
     747             : /*
     748             :  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
     749             :  *      page.  Other scans must wait until we call bt_parallel_release() or
     750             :  *      page.  Other scans must wait until we call _bt_parallel_release() or
     751             :  *      _bt_parallel_done().
     752             :  * The return value is true if we successfully seized the scan and false
     753             :  * if we did not.  The latter case occurs if no pages remain for the current
     754             :  * set of scankeys.
     755             :  *
     756             :  * If the return value is true, *pageno returns the next or current page
     757             :  * of the scan (depending on the scan direction).  An invalid block number
     758             :  * means the scan hasn't yet started, and P_NONE means we've reached the end.
     759             :  * The first time a participating process reaches the last page, it will return
     760             :  * true and set *pageno to P_NONE; after that, further attempts to seize the
     761             :  * scan will return false.
     762             :  *
     763             :  * Callers should ignore the value of pageno if the return value is false.
     764             :  */
     765             : bool
     766         253 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
     767             : {
     768         253 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     769             :     BTPS_State  pageStatus;
     770         253 :     bool        exit_loop = false;
     771         253 :     bool        status = true;
     772         253 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     773             :     BTParallelScanDesc btscan;
     774             : 
     775         253 :     *pageno = P_NONE;
     776             : 
     777         253 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     778             :                                                   parallel_scan->ps_offset);
     779             : 
     780             :     while (1)
     781             :     {
     782         253 :         SpinLockAcquire(&btscan->btps_mutex);
     783         253 :         pageStatus = btscan->btps_pageStatus;
     784             : 
     785         253 :         if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
     786             :         {
     787             :             /* Parallel scan has already advanced to a new set of scankeys. */
     788           0 :             status = false;
     789             :         }
     790         253 :         else if (pageStatus == BTPARALLEL_DONE)
     791             :         {
     792             :             /*
     793             :              * We're done with this set of scankeys.  This may be the end, or
     794             :              * there could be more sets to try.
     795             :              */
     796          36 :             status = false;
     797             :         }
     798         217 :         else if (pageStatus != BTPARALLEL_ADVANCING)
     799             :         {
     800             :             /*
     801             :              * We have successfully seized control of the scan for the purpose
     802             :              * of advancing it to a new page!
     803             :              */
     804         217 :             btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
     805         217 :             *pageno = btscan->btps_scanPage;
     806         217 :             exit_loop = true;
     807             :         }
     808         253 :         SpinLockRelease(&btscan->btps_mutex);
     809         253 :         if (exit_loop || !status)
     810             :             break;
     811           0 :         ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
     812           0 :     }
     813         253 :     ConditionVariableCancelSleep();
     814             : 
     815         253 :     return status;
     816             : }
     817             : 
     818             : /*
     819             :  * _bt_parallel_release() -- Complete the process of advancing the scan to a
     820             :  *      new page.  We now have the new value btps_scanPage; some other backend
     821             :  *      can now begin advancing the scan.
     822             :  */
     823             : void
     824         208 : _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
     825             : {
     826         208 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     827             :     BTParallelScanDesc btscan;
     828             : 
     829         208 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     830             :                                                   parallel_scan->ps_offset);
     831             : 
     832         208 :     SpinLockAcquire(&btscan->btps_mutex);
     833         208 :     btscan->btps_scanPage = scan_page;
     834         208 :     btscan->btps_pageStatus = BTPARALLEL_IDLE;
     835         208 :     SpinLockRelease(&btscan->btps_mutex);
     836         208 :     ConditionVariableSignal(&btscan->btps_cv);
     837         208 : }
     838             : 
     839             : /*
     840             :  * _bt_parallel_done() -- Mark the parallel scan as complete.
     841             :  *
     842             :  * When there are no pages left to scan, this function should be called to
     843             :  * notify other workers.  Otherwise, they might wait forever for the scan to
     844             :  * advance to the next page.
     845             :  */
     846             : void
     847      191877 : _bt_parallel_done(IndexScanDesc scan)
     848             : {
     849      191877 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     850      191877 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     851             :     BTParallelScanDesc btscan;
     852      191877 :     bool        status_changed = false;
     853             : 
     854             :     /* Do nothing for non-parallel scans */
     855      191877 :     if (parallel_scan == NULL)
     856      383745 :         return;
     857             : 
     858           9 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     859             :                                                   parallel_scan->ps_offset);
     860             : 
     861             :     /*
     862             :      * Mark the parallel scan as done for this combination of scan keys,
     863             :      * unless some other process already did so.  See also
     864             :      * _bt_advance_array_keys.
     865             :      */
     866           9 :     SpinLockAcquire(&btscan->btps_mutex);
     867          18 :     if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
     868           9 :         btscan->btps_pageStatus != BTPARALLEL_DONE)
     869             :     {
     870           9 :         btscan->btps_pageStatus = BTPARALLEL_DONE;
     871           9 :         status_changed = true;
     872             :     }
     873           9 :     SpinLockRelease(&btscan->btps_mutex);
     874             : 
     875             :     /* wake up all the workers associated with this parallel scan */
     876           9 :     if (status_changed)
     877           9 :         ConditionVariableBroadcast(&btscan->btps_cv);
     878             : }
     879             : 
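                     : /*
                     :  * Editorial sketch, not part of the measured file: one way a
                     :  * participating backend could drive the scan using the three functions
                     :  * above.  The real callers live in nbtsearch.c; the name and control
                     :  * flow below are simplified assumptions, not that code.
                     :  */
                     : #ifdef EDITOR_SKETCH
                     : static void
                     : sketch_parallel_scan_step(IndexScanDesc scan)
                     : {
                     :     BlockNumber blkno;
                     : 
                     :     /* Try to become the backend that advances the shared scan. */
                     :     if (!_bt_parallel_seize(scan, &blkno))
                     :         return;                 /* nothing left for this set of keys */
                     : 
                     :     if (blkno == P_NONE)
                     :     {
                     :         /* The scan has reached the end; make sure everyone knows. */
                     :         _bt_parallel_done(scan);
                     :         return;
                     :     }
                     : 
                     :     /* ... read the page, work out which page to visit next ... */
                     : 
                     :     /* Publish the next page so another backend may advance the scan. */
                     :     _bt_parallel_release(scan, blkno);  /* really: the next page no. */
                     : }
                     : #endif
                     : 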
     880             : /*
     881             :  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
     882             :  *          keys.
     883             :  *
     884             :  * Updates the count of array keys processed for both local and parallel
     885             :  * scans.
     886             :  */
     887             : void
     888           0 : _bt_parallel_advance_array_keys(IndexScanDesc scan)
     889             : {
     890           0 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     891           0 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     892             :     BTParallelScanDesc btscan;
     893             : 
     894           0 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     895             :                                                   parallel_scan->ps_offset);
     896             : 
     897           0 :     so->arrayKeyCount++;
     898           0 :     SpinLockAcquire(&btscan->btps_mutex);
     899           0 :     if (btscan->btps_pageStatus == BTPARALLEL_DONE)
     900             :     {
     901           0 :         btscan->btps_scanPage = InvalidBlockNumber;
     902           0 :         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     903           0 :         btscan->btps_arrayKeyCount++;
     904             :     }
     905           0 :     SpinLockRelease(&btscan->btps_mutex);
     906           0 : }
     907             : 
     908             : /*
     909             :  * Bulk deletion of all index entries pointing to a set of heap tuples.
     910             :  * The set of target tuples is specified via a callback routine that tells
     911             :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
     912             :  *
     913             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     914             :  */
     915             : IndexBulkDeleteResult *
     916         109 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     917             :              IndexBulkDeleteCallback callback, void *callback_state)
     918             : {
     919         109 :     Relation    rel = info->index;
     920             :     BTCycleId   cycleid;
     921             : 
     922             :     /* allocate stats if first time through, else re-use existing struct */
     923         109 :     if (stats == NULL)
     924         109 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     925             : 
     926             :     /* Establish the vacuum cycle ID to use for this scan */
     927             :     /* The ENSURE stuff ensures we clean up shared memory on failure */
     928         109 :     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
     929             :     {
     930         109 :         cycleid = _bt_start_vacuum(rel);
     931             : 
     932         109 :         btvacuumscan(info, stats, callback, callback_state, cycleid);
     933             :     }
     934         109 :     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
     935         109 :     _bt_end_vacuum(rel);
     936             : 
     937         109 :     return stats;
     938             : }
     939             : 
     940             : /*
     941             :  * Post-VACUUM cleanup.
     942             :  *
     943             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     944             :  */
     945             : IndexBulkDeleteResult *
     946         600 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
     947             : {
     948             :     /* No-op in ANALYZE ONLY mode */
     949         600 :     if (info->analyze_only)
     950         204 :         return stats;
     951             : 
     952             :     /*
     953             :      * If btbulkdelete was called, we need not do anything, just return the
     954             :      * stats from the latest btbulkdelete call.  If it wasn't called, we must
     955             :      * still do a pass over the index, to recycle any newly-recyclable pages
     956             :      * and to obtain index statistics.
     957             :      *
     958             :      * Since we aren't going to actually delete any leaf items, there's no
     959             :      * need to go through all the vacuum-cycle-ID pushups.
     960             :      */
     961         396 :     if (stats == NULL)
     962             :     {
     963         292 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     964         292 :         btvacuumscan(info, stats, NULL, NULL, 0);
     965             :     }
     966             : 
     967             :     /* Finally, vacuum the FSM */
     968         396 :     IndexFreeSpaceMapVacuum(info->index);
     969             : 
     970             :     /*
     971             :      * It's quite possible for us to be fooled by concurrent page splits into
     972             :      * double-counting some index tuples, so disbelieve any total that exceeds
     973             :      * the underlying heap's count ... if we know that accurately.  Otherwise
     974             :      * this might just make matters worse.
     975             :      */
     976         396 :     if (!info->estimated_count)
     977             :     {
     978         387 :         if (stats->num_index_tuples > info->num_heap_tuples)
     979           0 :             stats->num_index_tuples = info->num_heap_tuples;
     980             :     }
     981             : 
     982         396 :     return stats;
     983             : }
     984             : 
     985             : /*
     986             :  * btvacuumscan --- scan the index for VACUUMing purposes
     987             :  *
     988             :  * This combines the functions of looking for leaf tuples that are deletable
     989             :  * according to the vacuum callback, looking for empty pages that can be
     990             :  * deleted, and looking for old deleted pages that can be recycled.  Both
     991             :  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
     992             :  * btbulkdelete call occurred).
     993             :  *
     994             :  * The caller is responsible for initially allocating/zeroing a stats struct
     995             :  * and for obtaining a vacuum cycle ID if necessary.
     996             :  */
     997             : static void
     998         401 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     999             :              IndexBulkDeleteCallback callback, void *callback_state,
    1000             :              BTCycleId cycleid)
    1001             : {
    1002         401 :     Relation    rel = info->index;
    1003             :     BTVacState  vstate;
    1004             :     BlockNumber num_pages;
    1005             :     BlockNumber blkno;
    1006             :     bool        needLock;
    1007             : 
    1008             :     /*
    1009             :      * Reset counts that will be incremented during the scan; needed in case
    1010             :      * of multiple scans during a single VACUUM command
    1011             :      */
    1012         401 :     stats->estimated_count = false;
    1013         401 :     stats->num_index_tuples = 0;
    1014         401 :     stats->pages_deleted = 0;
    1015             : 
    1016             :     /* Set up info to pass down to btvacuumpage */
    1017         401 :     vstate.info = info;
    1018         401 :     vstate.stats = stats;
    1019         401 :     vstate.callback = callback;
    1020         401 :     vstate.callback_state = callback_state;
    1021         401 :     vstate.cycleid = cycleid;
    1022         401 :     vstate.lastBlockVacuumed = BTREE_METAPAGE;  /* Initialise at first block */
    1023         401 :     vstate.lastBlockLocked = BTREE_METAPAGE;
    1024         401 :     vstate.totFreePages = 0;
    1025             : 
    1026             :     /* Create a temporary memory context to run _bt_pagedel in */
    1027         401 :     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
    1028             :                                                   "_bt_pagedel",
    1029             :                                                   ALLOCSET_DEFAULT_SIZES);
    1030             : 
    1031             :     /*
    1032             :      * The outer loop iterates over all index pages except the metapage, in
    1033             :      * physical order (we hope the kernel will cooperate in providing
    1034             :      * read-ahead for speed).  It is critical that we visit all leaf pages,
    1035             :      * including ones added after we start the scan, else we might fail to
    1036             :      * delete some deletable tuples.  Hence, we must repeatedly check the
    1037             :      * relation length.  We must acquire the relation-extension lock while
    1038             :      * doing so to avoid a race condition: if someone else is extending the
    1039             :      * relation, there is a window where bufmgr/smgr have created a new
    1040             :      * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
    1041             :      * we manage to scan such a page here, we'll improperly assume it can be
    1042             :      * recycled.  Taking the lock synchronizes things enough to prevent a
    1043             :      * problem: either num_pages won't include the new page, or _bt_getbuf
    1044             :      * already has write lock on the buffer and it will be fully initialized
    1045             :      * before we can examine it.  (See also vacuumlazy.c, which has the same
    1046             :      * issue.)  Also, we need not worry if a page is added immediately after
    1047             :      * we look; the page splitting code already has write-lock on the left
    1048             :      * page before it adds a right page, so we must already have processed any
    1049             :      * tuples due to be moved into such a page.
    1050             :      *
    1051             :      * We can skip locking for new or temp relations, however, since no one
    1052             :      * else could be accessing them.
    1053             :      */
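                      :     /*
                      :      * Schematically, the window the lock closes (illustrative
                      :      * interleaving; see _bt_getbuf for the extender's side):
                      :      *
                      :      *   extender: LockRelationForExtension(rel, ExclusiveLock)
                      :      *   extender: ReadBuffer(rel, P_NEW)     -- all-zero page now exists
                      :      *   extender: LockBuffer(buf, BT_WRITE)  -- then initializes the page
                      :      *   extender: UnlockRelationForExtension(rel, ExclusiveLock)
                      :      *
                      :      * Because we take the same lock before reading the length, we either
                      :      * run before this sequence (num_pages excludes the new page) or after
                      :      * the write lock is held (the page gets initialized before we can
                      :      * examine it).
                      :      */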
    1054         401 :     needLock = !RELATION_IS_LOCAL(rel);
    1055             : 
    1056         401 :     blkno = BTREE_METAPAGE + 1;
    1057             :     for (;;)
    1058             :     {
    1059             :         /* Get the current relation length */
    1060         634 :         if (needLock)
    1061         634 :             LockRelationForExtension(rel, ExclusiveLock);
    1062         634 :         num_pages = RelationGetNumberOfBlocks(rel);
    1063         634 :         if (needLock)
    1064         634 :             UnlockRelationForExtension(rel, ExclusiveLock);
    1065             : 
    1066             :         /* Quit if we've scanned the whole relation */
    1067         634 :         if (blkno >= num_pages)
    1068         401 :             break;
    1069             :         /* Iterate over pages, then loop back to recheck length */
    1070        1912 :         for (; blkno < num_pages; blkno++)
    1071             :         {
    1072        1679 :             btvacuumpage(&vstate, blkno, blkno);
    1073             :         }
    1074         233 :     }
    1075             : 
    1076             :     /*
    1077             :      * Check to see if we need to issue one final WAL record for this index,
    1078             :      * which may be needed for correctness on a hot standby node when non-MVCC
    1079             :      * index scans could take place.
    1080             :      *
    1081             :      * If the WAL is replayed in hot standby, the replay process needs to get
    1082             :      * cleanup locks on all index leaf pages, just as we've been doing here.
    1083             :      * However, we won't issue any WAL records about pages that have no items
     1084             :      * to be deleted.  For pages lying between ones we've vacuumed, the
     1085             :      * replay code will take locks under the direction of the
     1086             :      * lastBlockVacuumed fields in the XLOG_BTREE_VACUUM WAL records.
    1087             :      * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
    1088             :      * against the last leaf page in the index, if that one wasn't vacuumed.
    1089             :      */
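                      :     /*
                      :      * Worked example (block numbers hypothetical): if the last leaf we
                      :      * cleanup-locked was block 41 but the last XLOG_BTREE_VACUUM record
                      :      * was issued for block 27, the dummy record against block 41 directs
                      :      * replay, should it take these locks, to cleanup-lock blocks 28..41
                      :      * as well.
                      :      */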
    1090         802 :     if (XLogStandbyInfoActive() &&
    1091         401 :         vstate.lastBlockVacuumed < vstate.lastBlockLocked)
    1092             :     {
    1093             :         Buffer      buf;
    1094             : 
    1095             :         /*
    1096             :          * The page should be valid, but we can't use _bt_getbuf() because we
    1097             :          * want to use a nondefault buffer access strategy.  Since we aren't
    1098             :          * going to delete any items, getting cleanup lock again is probably
    1099             :          * overkill, but for consistency do that anyway.
    1100             :          */
    1101         144 :         buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
    1102             :                                  RBM_NORMAL, info->strategy);
    1103         144 :         LockBufferForCleanup(buf);
    1104         144 :         _bt_checkpage(rel, buf);
    1105         144 :         _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
    1106         144 :         _bt_relbuf(rel, buf);
    1107             :     }
    1108             : 
    1109         401 :     MemoryContextDelete(vstate.pagedelcontext);
    1110             : 
    1111             :     /* update statistics */
    1112         401 :     stats->num_pages = num_pages;
    1113         401 :     stats->pages_free = vstate.totFreePages;
    1114         401 : }
    1115             : 
    1116             : /*
    1117             :  * btvacuumpage --- VACUUM one page
    1118             :  *
    1119             :  * This processes a single page for btvacuumscan().  In some cases we
    1120             :  * must go back and re-examine previously-scanned pages; this routine
    1121             :  * recurses when necessary to handle that case.
    1122             :  *
    1123             :  * blkno is the page to process.  orig_blkno is the highest block number
    1124             :  * reached by the outer btvacuumscan loop (the same as blkno, unless we
    1125             :  * are recursing to re-examine a previous page).
    1126             :  */
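                      : /*
                      :  * Illustrative example (block numbers hypothetical): if the outer loop
                      :  * is at block 10 and we detect that a concurrent split moved tuples to
                      :  * block 7, the restart path at the bottom of this routine re-runs it
                      :  * with blkno = 7 while orig_blkno remains 10.
                      :  */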
    1127             : static void
    1128        1679 : btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
    1129             : {
    1130        1679 :     IndexVacuumInfo *info = vstate->info;
    1131        1679 :     IndexBulkDeleteResult *stats = vstate->stats;
    1132        1679 :     IndexBulkDeleteCallback callback = vstate->callback;
    1133        1679 :     void       *callback_state = vstate->callback_state;
    1134        1679 :     Relation    rel = info->index;
    1135             :     bool        delete_now;
    1136             :     BlockNumber recurse_to;
    1137             :     Buffer      buf;
    1138             :     Page        page;
    1139        1679 :     BTPageOpaque opaque = NULL;
    1140             : 
    1141             : restart:
    1142        1679 :     delete_now = false;
    1143        1679 :     recurse_to = P_NONE;
    1144             : 
    1145             :     /* call vacuum_delay_point while not holding any buffer lock */
    1146        1679 :     vacuum_delay_point();
    1147             : 
    1148             :     /*
    1149             :      * We can't use _bt_getbuf() here because it always applies
    1150             :      * _bt_checkpage(), which will barf on an all-zero page. We want to
    1151             :      * recycle all-zero pages, not fail.  Also, we want to use a nondefault
    1152             :      * buffer access strategy.
    1153             :      */
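                      :     /*
                      :      * (An all-zero page can be left behind if a backend extends the
                      :      * index but fails before initializing the new page; treating such
                      :      * pages as recyclable lets us reclaim that space.)
                      :      */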
    1154        1679 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
    1155             :                              info->strategy);
    1156        1679 :     LockBuffer(buf, BT_READ);
    1157        1679 :     page = BufferGetPage(buf);
    1158        1679 :     if (!PageIsNew(page))
    1159             :     {
    1160        1679 :         _bt_checkpage(rel, buf);
    1161        1679 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1162             :     }
    1163             : 
    1164             :     /*
    1165             :      * If we are recursing, the only case we want to do anything with is a
    1166             :      * live leaf page having the current vacuum cycle ID.  Any other state
    1167             :      * implies we already saw the page (eg, deleted it as being empty).
    1168             :      */
    1169        1679 :     if (blkno != orig_blkno)
    1170             :     {
    1171           0 :         if (_bt_page_recyclable(page) ||
    1172           0 :             P_IGNORE(opaque) ||
    1173           0 :             !P_ISLEAF(opaque) ||
    1174           0 :             opaque->btpo_cycleid != vstate->cycleid)
    1175             :         {
    1176           0 :             _bt_relbuf(rel, buf);
    1177           0 :             return;
    1178             :         }
    1179             :     }
    1180             : 
    1181             :     /* Page is valid, see what to do with it */
    1182        1679 :     if (_bt_page_recyclable(page))
    1183             :     {
    1184             :         /* Okay to recycle this page */
    1185           0 :         RecordFreeIndexPage(rel, blkno);
    1186           0 :         vstate->totFreePages++;
    1187           0 :         stats->pages_deleted++;
    1188             :     }
    1189        1679 :     else if (P_ISDELETED(opaque))
    1190             :     {
    1191             :         /* Already deleted, but can't recycle yet */
    1192           0 :         stats->pages_deleted++;
    1193             :     }
    1194        1679 :     else if (P_ISHALFDEAD(opaque))
    1195             :     {
    1196             :         /* Half-dead, try to delete */
    1197           0 :         delete_now = true;
    1198             :     }
    1199        1679 :     else if (P_ISLEAF(opaque))
    1200             :     {
    1201             :         OffsetNumber deletable[MaxOffsetNumber];
    1202             :         int         ndeletable;
    1203             :         OffsetNumber offnum,
    1204             :                     minoff,
    1205             :                     maxoff;
    1206             : 
    1207             :         /*
    1208             :          * Trade in the initial read lock for a super-exclusive write lock on
    1209             :          * this page.  We must get such a lock on every leaf page over the
    1210             :          * course of the vacuum scan, whether or not it actually contains any
    1211             :          * deletable tuples --- see nbtree/README.
    1212             :          */
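                      :         /*
                      :          * (A cleanup lock, a.k.a. super-exclusive lock, is an exclusive
                      :          * buffer lock granted only once no other backend holds a pin on
                      :          * the buffer, so no concurrent index scan can still hold a
                      :          * reference into the page while tuples are removed from it.)
                      :          */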
    1213        1517 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1214        1517 :         LockBufferForCleanup(buf);
    1215             : 
    1216             :         /*
    1217             :          * Remember highest leaf page number we've taken cleanup lock on; see
    1218             :          * notes in btvacuumscan
    1219             :          */
    1220        1517 :         if (blkno > vstate->lastBlockLocked)
    1221        1517 :             vstate->lastBlockLocked = blkno;
    1222             : 
    1223             :         /*
    1224             :          * Check whether we need to recurse back to earlier pages.  What we
    1225             :          * are concerned about is a page split that happened since we started
    1226             :          * the vacuum scan.  If the split moved some tuples to a lower page
    1227             :          * then we might have missed 'em.  If so, set up for tail recursion.
    1228             :          * (Must do this before possibly clearing btpo_cycleid below!)
    1229             :          */
    1230        2238 :         if (vstate->cycleid != 0 &&
    1231         721 :             opaque->btpo_cycleid == vstate->cycleid &&
    1232           0 :             !(opaque->btpo_flags & BTP_SPLIT_END) &&
    1233           0 :             !P_RIGHTMOST(opaque) &&
    1234           0 :             opaque->btpo_next < orig_blkno)
    1235           0 :             recurse_to = opaque->btpo_next;
    1236             : 
    1237             :         /*
     1238             :          * Scan over all items to see which ones need to be deleted
     1239             :          * according to the callback function.
    1240             :          */
    1241        1517 :         ndeletable = 0;
    1242        1517 :         minoff = P_FIRSTDATAKEY(opaque);
    1243        1517 :         maxoff = PageGetMaxOffsetNumber(page);
    1244        1517 :         if (callback)
    1245             :         {
    1246      122930 :             for (offnum = minoff;
    1247             :                  offnum <= maxoff;
    1248      121488 :                  offnum = OffsetNumberNext(offnum))
    1249             :             {
    1250             :                 IndexTuple  itup;
    1251             :                 ItemPointer htup;
    1252             : 
    1253      121488 :                 itup = (IndexTuple) PageGetItem(page,
    1254             :                                                 PageGetItemId(page, offnum));
    1255      121488 :                 htup = &(itup->t_tid);
    1256             : 
    1257             :                 /*
    1258             :                  * During Hot Standby we currently assume that
    1259             :                  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
    1260             :                  * only true as long as the callback function depends only
    1261             :                  * upon whether the index tuple refers to heap tuples removed
    1262             :                  * in the initial heap scan. When vacuum starts it derives a
    1263             :                  * value of OldestXmin. Backends taking later snapshots could
    1264             :                  * have a RecentGlobalXmin with a later xid than the vacuum's
    1265             :                  * OldestXmin, so it is possible that row versions deleted
    1266             :                  * after OldestXmin could be marked as killed by other
    1267             :                  * backends. The callback function *could* look at the index
    1268             :                  * tuple state in isolation and decide to delete the index
    1269             :                  * tuple, though currently it does not. If it ever did, we
    1270             :                  * would need to reconsider whether XLOG_BTREE_VACUUM records
    1271             :                  * should cause conflicts. If they did cause conflicts they
    1272             :                  * would be fairly harsh conflicts, since we haven't yet
    1273             :                  * worked out a way to pass a useful value for
    1274             :                  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
    1275             :                  * applies to *any* type of index that marks index tuples as
    1276             :                  * killed.
    1277             :                  */
    1278      121488 :                 if (callback(htup, callback_state))
    1279       39083 :                     deletable[ndeletable++] = offnum;
    1280             :             }
    1281             :         }
    1282             : 
    1283             :         /*
    1284             :          * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
    1285             :          * call per page, so as to minimize WAL traffic.
    1286             :          */
    1287        1517 :         if (ndeletable > 0)
    1288             :         {
    1289             :             /*
     1290             :              * Notice that the issued XLOG_BTREE_VACUUM WAL record gives
     1291             :              * the replay code all the information it needs to get a cleanup
     1292             :              * lock on all pages between the previous lastBlockVacuumed and
    1293             :              * this page. This ensures that WAL replay locks all leaf pages at
    1294             :              * some point, which is important should non-MVCC scans be
    1295             :              * requested. This is currently unused on standby, but we record
    1296             :              * it anyway, so that the WAL contains the required information.
    1297             :              *
    1298             :              * Since we can visit leaf pages out-of-order when recursing,
    1299             :              * replay might end up locking such pages an extra time, but it
    1300             :              * doesn't seem worth the amount of bookkeeping it'd take to avoid
    1301             :              * that.
    1302             :              */
    1303         419 :             _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
    1304             :                                 vstate->lastBlockVacuumed);
    1305             : 
    1306             :             /*
    1307             :              * Remember highest leaf page number we've issued a
    1308             :              * XLOG_BTREE_VACUUM WAL record for.
    1309             :              */
    1310         419 :             if (blkno > vstate->lastBlockVacuumed)
    1311         419 :                 vstate->lastBlockVacuumed = blkno;
    1312             : 
    1313         419 :             stats->tuples_removed += ndeletable;
    1314             :             /* must recompute maxoff */
    1315         419 :             maxoff = PageGetMaxOffsetNumber(page);
    1316             :         }
    1317             :         else
    1318             :         {
    1319             :             /*
    1320             :              * If the page has been split during this vacuum cycle, it seems
    1321             :              * worth expending a write to clear btpo_cycleid even if we don't
    1322             :              * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
    1323             :              * care of this.)  This ensures we won't process the page again.
    1324             :              *
    1325             :              * We treat this like a hint-bit update because there's no need to
    1326             :              * WAL-log it.
    1327             :              */
    1328        1400 :             if (vstate->cycleid != 0 &&
    1329         302 :                 opaque->btpo_cycleid == vstate->cycleid)
    1330             :             {
    1331           0 :                 opaque->btpo_cycleid = 0;
    1332           0 :                 MarkBufferDirtyHint(buf, true);
    1333             :             }
    1334             :         }
    1335             : 
    1336             :         /*
    1337             :          * If it's now empty, try to delete; else count the live tuples. We
    1338             :          * don't delete when recursing, though, to avoid putting entries into
    1339             :          * freePages out-of-order (doesn't seem worth any extra code to handle
    1340             :          * the case).
    1341             :          */
    1342        1517 :         if (minoff > maxoff)
    1343          52 :             delete_now = (blkno == orig_blkno);
    1344             :         else
    1345        1465 :             stats->num_index_tuples += maxoff - minoff + 1;
    1346             :     }
    1347             : 
    1348        1679 :     if (delete_now)
    1349             :     {
    1350             :         MemoryContext oldcontext;
    1351             :         int         ndel;
    1352             : 
    1353             :         /* Run pagedel in a temp context to avoid memory leakage */
    1354          52 :         MemoryContextReset(vstate->pagedelcontext);
    1355          52 :         oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
    1356             : 
    1357          52 :         ndel = _bt_pagedel(rel, buf);
    1358             : 
    1359             :         /* count only this page, else may double-count parent */
    1360          52 :         if (ndel)
    1361          42 :             stats->pages_deleted++;
    1362             : 
    1363          52 :         MemoryContextSwitchTo(oldcontext);
    1364             :         /* pagedel released buffer, so we shouldn't */
    1365             :     }
    1366             :     else
    1367        1627 :         _bt_relbuf(rel, buf);
    1368             : 
    1369             :     /*
    1370             :      * This is really tail recursion, but if the compiler is too stupid to
    1371             :      * optimize it as such, we'd eat an uncomfortably large amount of stack
    1372             :      * space per recursion level (due to the deletable[] array). A failure is
    1373             :      * improbable since the number of levels isn't likely to be large ... but
    1374             :      * just in case, let's hand-optimize into a loop.
    1375             :      */
    1376        1679 :     if (recurse_to != P_NONE)
    1377             :     {
    1378           0 :         blkno = recurse_to;
    1379           0 :         goto restart;
    1380             :     }
    1381             : }
    1382             : 
    1383             : /*
    1384             :  *  btcanreturn() -- Check whether btree indexes support index-only scans.
    1385             :  *
    1386             :  * btrees always do, so this is trivial.
    1387             :  */
    1388             : bool
    1389       31104 : btcanreturn(Relation index, int attno)
    1390             : {
    1391       31104 :     return true;
    1392             : }

Generated by: LCOV version 1.11