LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 450 548 82.1 %
Date: 2017-09-29 13:40:31 Functions: 15 16 93.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsearch.c
       4             :  *    Search code for postgres btrees.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtsearch.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/nbtree.h"
      19             : #include "access/relscan.h"
      20             : #include "miscadmin.h"
      21             : #include "pgstat.h"
      22             : #include "storage/predicate.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/rel.h"
      25             : #include "utils/tqual.h"
      26             : 
      27             : 
      28             : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
      29             :              OffsetNumber offnum);
      30             : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
      31             :              OffsetNumber offnum, IndexTuple itup);
      32             : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      33             : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
      34             : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
      35             :                       ScanDirection dir);
      36             : static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
      37             : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      38             : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
      39             : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
      40             : 
      41             : 
      42             : /*
      43             :  *  _bt_drop_lock_and_maybe_pin()
      44             :  *
      45             :  * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
      46             :  * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
      47             :  * This will prevent vacuum from stalling in a blocked state trying to read a
      48             :  * page when a cursor is sitting on it -- at least in many important cases.
      49             :  *
      50             :  * Set the buffer to invalid if the pin is released, since the buffer may be
      51             :  * re-used.  If we need to go back to this block (for example, to apply
      52             :  * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
      53             :  * will remain in shared memory for as long as it takes to scan the index
      54             :  * buffer page.
      55             :  */
      56             : static void
      57      277057 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
      58             : {
      59      277057 :     LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
      60             : 
      61      553044 :     if (IsMVCCSnapshot(scan->xs_snapshot) &&
      62      551841 :         RelationNeedsWAL(scan->indexRelation) &&
      63      275854 :         !scan->xs_want_itup)
      64             :     {
      65      272660 :         ReleaseBuffer(sp->buf);
      66      272660 :         sp->buf = InvalidBuffer;
      67             :     }
      68      277057 : }
      69             : 
      70             : 
      71             : /*
      72             :  *  _bt_search() -- Search the tree for a particular scankey,
      73             :  *      or more precisely for the first leaf page it could be on.
      74             :  *
      75             :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
      76             :  * but it can omit the rightmost column(s) of the index.
      77             :  *
      78             :  * When nextkey is false (the usual case), we are looking for the first
      79             :  * item >= scankey.  When nextkey is true, we are looking for the first
      80             :  * item strictly greater than scankey.
      81             :  *
      82             :  * Return value is a stack of parent-page pointers.  *bufP is set to the
      83             :  * address of the leaf-page buffer, which is read-locked and pinned.
      84             :  * No locks are held on the parent pages, however!
      85             :  *
      86             :  * If the snapshot parameter is not NULL, "old snapshot" checking will take
      87             :  * place during the descent through the tree.  This is not needed when
      88             :  * positioning for an insert or delete, so NULL is used for those cases.
      89             :  *
      90             :  * NOTE that the returned buffer is read-locked regardless of the access
      91             :  * parameter.  However, access = BT_WRITE will allow an empty root page
      92             :  * to be created and returned.  When access = BT_READ, an empty index
      93             :  * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
      94             :  * any incomplete splits encountered during the search will be finished.
      95             :  */
      96             : BTStack
      97      592745 : _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
      98             :            Buffer *bufP, int access, Snapshot snapshot)
      99             : {
     100      592745 :     BTStack     stack_in = NULL;
     101             : 
     102             :     /* Get the root page to start with */
     103      592745 :     *bufP = _bt_getroot(rel, access);
     104             : 
     105             :     /* If index is empty and access = BT_READ, no root page is created. */
     106      592745 :     if (!BufferIsValid(*bufP))
     107       11579 :         return (BTStack) NULL;
     108             : 
     109             :     /* Loop iterates once per level descended in the tree */
     110             :     for (;;)
     111             :     {
     112             :         Page        page;
     113             :         BTPageOpaque opaque;
     114             :         OffsetNumber offnum;
     115             :         ItemId      itemid;
     116             :         IndexTuple  itup;
     117             :         BlockNumber blkno;
     118             :         BlockNumber par_blkno;
     119             :         BTStack     new_stack;
     120             : 
     121             :         /*
     122             :          * Race -- the page we just grabbed may have split since we read its
     123             :          * pointer in the parent (or metapage).  If it has, we may need to
     124             :          * move right to its new sibling.  Do that.
     125             :          *
     126             :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     127             :          * along the way.  Strictly speaking, we'd only need to finish an
     128             :          * incomplete split on the leaf page we're about to insert to, not on
     129             :          * any of the upper levels (they are taken care of in _bt_getstackbuf,
     130             :          * if the leaf page is split and we insert to the parent page).  But
     131             :          * this is a good opportunity to finish splits of internal pages too.
     132             :          */
     133     1061722 :         *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
     134             :                               (access == BT_WRITE), stack_in,
     135             :                               BT_READ, snapshot);
     136             : 
     137             :         /* if this is a leaf page, we're done */
     138     1061722 :         page = BufferGetPage(*bufP);
     139     1061722 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     140     1061722 :         if (P_ISLEAF(opaque))
     141      581166 :             break;
     142             : 
     143             :         /*
     144             :          * Find the appropriate item on the internal page, and get the child
     145             :          * page that it points to.
     146             :          */
     147      480556 :         offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
     148      480556 :         itemid = PageGetItemId(page, offnum);
     149      480556 :         itup = (IndexTuple) PageGetItem(page, itemid);
     150      480556 :         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
     151      480556 :         par_blkno = BufferGetBlockNumber(*bufP);
     152             : 
     153             :         /*
     154             :          * We need to save the location of the index entry we chose in the
     155             :          * parent page on a stack. In case we split the tree, we'll use the
     156             :          * stack to work back up to the parent page.  We also save the actual
     157             :          * downlink (TID) to uniquely identify the index entry, in case it
     158             :          * moves right while we're working lower in the tree.  See the paper
     159             :          * by Lehman and Yao for how this is detected and handled. (We use the
     160             :          * child link to disambiguate duplicate keys in the index -- Lehman
     161             :          * and Yao disallow duplicate keys.)
     162             :          */
     163      480556 :         new_stack = (BTStack) palloc(sizeof(BTStackData));
     164      480556 :         new_stack->bts_blkno = par_blkno;
     165      480556 :         new_stack->bts_offset = offnum;
     166      480556 :         memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
     167      480556 :         new_stack->bts_parent = stack_in;
     168             : 
     169             :         /* drop the read lock on the parent page, acquire one on the child */
     170      480556 :         *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
     171             : 
     172             :         /* okay, all set to move down a level */
     173      480556 :         stack_in = new_stack;
     174      480556 :     }
     175             : 
     176      581166 :     return stack_in;
     177             : }
     178             : 
     179             : /*
     180             :  *  _bt_moveright() -- move right in the btree if necessary.
     181             :  *
     182             :  * When we follow a pointer to reach a page, it is possible that
     183             :  * the page has changed in the meanwhile.  If this happens, we're
     184             :  * guaranteed that the page has "split right" -- that is, that any
     185             :  * data that appeared on the page originally is either on the page
     186             :  * or strictly to the right of it.
     187             :  *
     188             :  * This routine decides whether or not we need to move right in the
     189             :  * tree by examining the high key entry on the page.  If that entry
     190             :  * is strictly less than the scankey, or <= the scankey in the nextkey=true
     191             :  * case, then we followed the wrong link and we need to move right.
     192             :  *
     193             :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
     194             :  * but it can omit the rightmost column(s) of the index.
     195             :  *
     196             :  * When nextkey is false (the usual case), we are looking for the first
     197             :  * item >= scankey.  When nextkey is true, we are looking for the first
     198             :  * item strictly greater than scankey.
     199             :  *
     200             :  * If forupdate is true, we will attempt to finish any incomplete splits
     201             :  * that we encounter.  This is required when locking a target page for an
     202             :  * insertion, because we don't allow inserting on a page before the split
     203             :  * is completed.  'stack' is only used if forupdate is true.
     204             :  *
     205             :  * On entry, we have the buffer pinned and a lock of the type specified by
     206             :  * 'access'.  If we move right, we release the buffer and lock and acquire
     207             :  * the same on the right sibling.  Return value is the buffer we stop at.
     208             :  *
     209             :  * If the snapshot parameter is not NULL, "old snapshot" checking will take
     210             :  * place during the descent through the tree.  This is not needed when
     211             :  * positioning for an insert or delete, so NULL is used for those cases.
     212             :  */
     213             : Buffer
     214     1267236 : _bt_moveright(Relation rel,
     215             :               Buffer buf,
     216             :               int keysz,
     217             :               ScanKey scankey,
     218             :               bool nextkey,
     219             :               bool forupdate,
     220             :               BTStack stack,
     221             :               int access,
     222             :               Snapshot snapshot)
     223             : {
     224             :     Page        page;
     225             :     BTPageOpaque opaque;
     226             :     int32       cmpval;
     227             : 
     228             :     /*
     229             :      * When nextkey = false (normal case): if the scan key that brought us to
     230             :      * this page is > the high key stored on the page, then the page has split
     231             :      * and we need to move right.  (If the scan key is equal to the high key,
     232             :      * we might or might not need to move right; have to scan the page first
     233             :      * anyway.)
     234             :      *
     235             :      * When nextkey = true: move right if the scan key is >= page's high key.
     236             :      *
     237             :      * The page could even have split more than once, so scan as far as
     238             :      * needed.
     239             :      *
     240             :      * We also have to move right if we followed a link that brought us to a
     241             :      * dead page.
     242             :      */
     243     1267236 :     cmpval = nextkey ? 0 : 1;
     244             : 
     245             :     for (;;)
     246             :     {
     247     1267257 :         page = BufferGetPage(buf);
     248     1267257 :         TestForOldSnapshot(snapshot, rel, page);
     249     1267257 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     250             : 
     251     1267257 :         if (P_RIGHTMOST(opaque))
     252      885039 :             break;
     253             : 
     254             :         /*
     255             :          * Finish any incomplete splits we encounter along the way.
     256             :          */
     257      382218 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     258             :         {
     259           0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     260             : 
     261             :             /* upgrade our lock if necessary */
     262           0 :             if (access == BT_READ)
     263             :             {
     264           0 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     265           0 :                 LockBuffer(buf, BT_WRITE);
     266             :             }
     267             : 
     268           0 :             if (P_INCOMPLETE_SPLIT(opaque))
     269           0 :                 _bt_finish_split(rel, buf, stack);
     270             :             else
     271           0 :                 _bt_relbuf(rel, buf);
     272             : 
     273             :             /* re-acquire the lock in the right mode, and re-check */
     274           0 :             buf = _bt_getbuf(rel, blkno, access);
     275           0 :             continue;
     276             :         }
     277             : 
     278      382218 :         if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
     279             :         {
     280             :             /* step right one page */
     281          21 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     282          21 :             continue;
     283             :         }
     284             :         else
     285             :             break;
     286          21 :     }
     287             : 
     288     1267236 :     if (P_IGNORE(opaque))
     289           0 :         elog(ERROR, "fell off the end of index \"%s\"",
     290             :              RelationGetRelationName(rel));
     291             : 
     292     1267236 :     return buf;
     293             : }
     294             : 
     295             : /*
     296             :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     297             :  *
     298             :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
     299             :  * but it can omit the rightmost column(s) of the index.
     300             :  *
     301             :  * When nextkey is false (the usual case), we are looking for the first
     302             :  * item >= scankey.  When nextkey is true, we are looking for the first
     303             :  * item strictly greater than scankey.
     304             :  *
     305             :  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
     306             :  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
     307             :  * particular, this means it is possible to return a value 1 greater than the
     308             :  * number of keys on the page, if the scankey is > all keys on the page.)
     309             :  *
     310             :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     311             :  * of the last key < given scankey, or last key <= given scankey if nextkey
     312             :  * is true.  (Since _bt_compare treats the first data key of such a page as
     313             :  * minus infinity, there will be at least one key < scankey, so the result
     314             :  * always points at one of the keys on the page.)  This key indicates the
     315             :  * right place to descend to be sure we find all leaf keys >= given scankey
     316             :  * (or leaf keys > given scankey when nextkey is true).
     317             :  *
     318             :  * This procedure is not responsible for walking right, it just examines
     319             :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     320             :  * on the buffer.
     321             :  */
     322             : OffsetNumber
     323     1060024 : _bt_binsrch(Relation rel,
     324             :             Buffer buf,
     325             :             int keysz,
     326             :             ScanKey scankey,
     327             :             bool nextkey)
     328             : {
     329             :     Page        page;
     330             :     BTPageOpaque opaque;
     331             :     OffsetNumber low,
     332             :                 high;
     333             :     int32       result,
     334             :                 cmpval;
     335             : 
     336     1060024 :     page = BufferGetPage(buf);
     337     1060024 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     338             : 
     339     1060024 :     low = P_FIRSTDATAKEY(opaque);
     340     1060024 :     high = PageGetMaxOffsetNumber(page);
     341             : 
     342             :     /*
     343             :      * If there are no keys on the page, return the first available slot. Note
     344             :      * this covers two cases: the page is really empty (no keys), or it
     345             :      * contains only a high key.  The latter case is possible after vacuuming.
     346             :      * This can never happen on an internal page, however, since they are
     347             :      * never empty (an internal page must have children).
     348             :      */
     349     1060024 :     if (high < low)
     350         314 :         return low;
     351             : 
     352             :     /*
     353             :      * Binary search to find the first key on the page >= scan key, or first
     354             :      * key > scankey when nextkey is true.
     355             :      *
     356             :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     357             :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     358             :      *
     359             :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     360             :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     361             :      *
     362             :      * We can fall out when high == low.
     363             :      */
     364     1059710 :     high++;                     /* establish the loop invariant for high */
     365             : 
     366     1059710 :     cmpval = nextkey ? 0 : 1;   /* select comparison value */
     367             : 
     368     8475674 :     while (high > low)
     369             :     {
     370     6356254 :         OffsetNumber mid = low + ((high - low) / 2);
     371             : 
     372             :         /* We have low <= mid < high, so mid points at a real slot */
     373             : 
     374     6356254 :         result = _bt_compare(rel, keysz, scankey, page, mid);
     375             : 
     376     6356254 :         if (result >= cmpval)
     377     3880539 :             low = mid + 1;
     378             :         else
     379     2475715 :             high = mid;
     380             :     }
     381             : 
     382             :     /*
     383             :      * At this point we have high == low, but be careful: they could point
     384             :      * past the last slot on the page.
     385             :      *
     386             :      * On a leaf page, we always return the first key >= scan key (resp. >
     387             :      * scan key), which could be the last slot + 1.
     388             :      */
     389     1059710 :     if (P_ISLEAF(opaque))
     390      579154 :         return low;
     391             : 
     392             :     /*
     393             :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     394             :      * There must be one if _bt_compare() is playing by the rules.
     395             :      */
     396      480556 :     Assert(low > P_FIRSTDATAKEY(opaque));
     397             : 
     398      480556 :     return OffsetNumberPrev(low);
     399             : }
     400             : 
     401             : /*----------
     402             :  *  _bt_compare() -- Compare scankey to a particular tuple on the page.
     403             :  *
     404             :  * The passed scankey must be an insertion-type scankey (see nbtree/README),
     405             :  * but it can omit the rightmost column(s) of the index.
     406             :  *
     407             :  *  keysz: number of key conditions to be checked (might be less than the
     408             :  *      number of index columns!)
     409             :  *  page/offnum: location of btree item to be compared to.
     410             :  *
     411             :  *      This routine returns:
     412             :  *          <0 if scankey < tuple at offnum;
     413             :  *           0 if scankey == tuple at offnum;
     414             :  *          >0 if scankey > tuple at offnum.
     415             :  *      NULLs in the keys are treated as sortable values.  Therefore
     416             :  *      "equality" does not necessarily mean that the item should be
     417             :  *      returned to the caller as a matching key!
     418             :  *
     419             :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     420             :  * "minus infinity": this routine will always claim it is less than the
     421             :  * scankey.  The actual key value stored (if any, which there probably isn't)
     422             :  * does not matter.  This convention allows us to implement the Lehman and
     423             :  * Yao convention that the first down-link pointer is before the first key.
     424             :  * See backend/access/nbtree/README for details.
     425             :  *----------
     426             :  */
     427             : int32
     428     6740713 : _bt_compare(Relation rel,
     429             :             int keysz,
     430             :             ScanKey scankey,
     431             :             Page page,
     432             :             OffsetNumber offnum)
     433             : {
     434     6740713 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     435     6740713 :     BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     436             :     IndexTuple  itup;
     437             :     int         i;
     438             : 
     439             :     /*
     440             :      * Force result ">" if target item is first data item on an internal page
     441             :      * --- see NOTE above.
     442             :      */
     443     6740713 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     444       74747 :         return 1;
     445             : 
     446     6665966 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     447             : 
     448             :     /*
     449             :      * The scan key is set up with the attribute number associated with each
     450             :      * term in the key.  It is important that, if the index is multi-key, the
     451             :      * scan contain the first k key attributes, and that they be in order.  If
     452             :      * you think about how multi-key ordering works, you'll understand why
     453             :      * this is.
     454             :      *
     455             :      * We don't test for violation of this condition here, however.  The
     456             :      * initial setup for the index scan had better have gotten it right (see
     457             :      * _bt_first).
     458             :      */
     459             : 
     460     9379522 :     for (i = 1; i <= keysz; i++)
     461             :     {
     462             :         Datum       datum;
     463             :         bool        isNull;
     464             :         int32       result;
     465             : 
     466     8870958 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     467             : 
     468             :         /* see comments about NULLs handling in btbuild */
     469     8870958 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     470             :         {
     471        2402 :             if (isNull)
     472          22 :                 result = 0;     /* NULL "=" NULL */
     473        2380 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     474          44 :                 result = -1;    /* NULL "<" NOT_NULL */
     475             :             else
     476        2336 :                 result = 1;     /* NULL ">" NOT_NULL */
     477             :         }
     478     8868556 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     479             :         {
     480          33 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     481           0 :                 result = 1;     /* NOT_NULL ">" NULL */
     482             :             else
     483          33 :                 result = -1;    /* NOT_NULL "<" NULL */
     484             :         }
     485             :         else
     486             :         {
     487             :             /*
     488             :              * The sk_func needs to be passed the index value as left arg and
     489             :              * the sk_argument as right arg (they might be of different
     490             :              * types).  Since it is convenient for callers to think of
     491             :              * _bt_compare as comparing the scankey to the index item, we have
     492             :              * to flip the sign of the comparison result.  (Unless it's a DESC
     493             :              * column, in which case we *don't* flip the sign.)
     494             :              */
     495     8868523 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     496             :                                                      scankey->sk_collation,
     497             :                                                      datum,
     498             :                                                      scankey->sk_argument));
     499             : 
     500     8868523 :             if (!(scankey->sk_flags & SK_BT_DESC))
     501     8868522 :                 result = -result;
     502             :         }
     503             : 
     504             :         /* if the keys are unequal, return the difference */
     505     8870958 :         if (result != 0)
     506     6157402 :             return result;
     507             : 
     508     2713556 :         scankey++;
     509             :     }
     510             : 
     511             :     /* if we get here, the keys are equal */
     512      508564 :     return 0;
     513             : }
     514             : 
     515             : /*
     516             :  *  _bt_first() -- Find the first item in a scan.
     517             :  *
     518             :  *      We need to be clever about the direction of scan, the search
     519             :  *      conditions, and the tree ordering.  We find the first item (or,
     520             :  *      if backwards scan, the last item) in the tree that satisfies the
     521             :  *      qualifications in the scan key.  On success exit, the page containing
     522             :  *      the current index tuple is pinned but not locked, and data about
     523             :  *      the matching tuple(s) on the page has been loaded into so->currPos.
     524             :  *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
     525             :  *      and if requested, scan->xs_itup points to a copy of the index tuple.
     526             :  *
     527             :  * If there are no matching items in the index, we return FALSE, with no
     528             :  * pins or locks held.
     529             :  *
     530             :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     531             :  * are both search-type scankeys (see nbtree/README for more about this).
     532             :  * Within this routine, we build a temporary insertion-type scankey to use
     533             :  * in locating the scan start position.
     534             :  */
     535             : bool
     536      388066 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     537             : {
     538      388066 :     Relation    rel = scan->indexRelation;
     539      388066 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     540             :     Buffer      buf;
     541             :     BTStack     stack;
     542             :     OffsetNumber offnum;
     543             :     StrategyNumber strat;
     544             :     bool        nextkey;
     545             :     bool        goback;
     546             :     ScanKey     startKeys[INDEX_MAX_KEYS];
     547             :     ScanKeyData scankeys[INDEX_MAX_KEYS];
     548             :     ScanKeyData notnullkeys[INDEX_MAX_KEYS];
     549      388066 :     int         keysCount = 0;
     550             :     int         i;
     551      388066 :     bool        status = true;
     552             :     StrategyNumber strat_total;
     553             :     BTScanPosItem *currItem;
     554             :     BlockNumber blkno;
     555             : 
     556      388066 :     Assert(!BTScanPosIsValid(so->currPos));
     557             : 
     558      388066 :     pgstat_count_index_scan(rel);
     559             : 
     560             :     /*
     561             :      * Examine the scan keys and eliminate any redundant keys; also mark the
     562             :      * keys that must be matched to continue the scan.
     563             :      */
     564      388066 :     _bt_preprocess_keys(scan);
     565             : 
     566             :     /*
     567             :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     568             :      * never be satisfied (eg, x == 1 AND x > 2).
     569             :      */
     570      388066 :     if (!so->qual_ok)
     571          11 :         return false;
     572             : 
     573             :     /*
     574             :      * For parallel scans, get the starting page from shared state. If the
     575             :      * scan has not started, proceed to find out first leaf page in the usual
     576             :      * way while keeping other participating processes waiting.  If the scan
     577             :      * has already begun, use the page number from the shared structure.
     578             :      */
     579      388055 :     if (scan->parallel_scan != NULL)
     580             :     {
     581          45 :         status = _bt_parallel_seize(scan, &blkno);
     582          45 :         if (!status)
     583          36 :             return false;
     584           9 :         else if (blkno == P_NONE)
     585             :         {
     586           0 :             _bt_parallel_done(scan);
     587           0 :             return false;
     588             :         }
     589           9 :         else if (blkno != InvalidBlockNumber)
     590             :         {
     591           0 :             if (!_bt_parallel_readpage(scan, blkno, dir))
     592           0 :                 return false;
     593           0 :             goto readcomplete;
     594             :         }
     595             :     }
     596             : 
     597             :     /*----------
     598             :      * Examine the scan keys to discover where we need to start the scan.
     599             :      *
     600             :      * We want to identify the keys that can be used as starting boundaries;
     601             :      * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     602             :      * a backwards scan.  We can use keys for multiple attributes so long as
     603             :      * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     604             :      * a > or < boundary or find an attribute with no boundary (which can be
     605             :      * thought of as the same as "> -infinity"), we can't use keys for any
     606             :      * attributes to its right, because it would break our simplistic notion
     607             :      * of what initial positioning strategy to use.
     608             :      *
     609             :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     610             :      * may not be able to eliminate redundant keys; in such cases we will
     611             :      * arbitrarily pick a usable one for each attribute.  This is correct
     612             :      * but possibly not optimal behavior.  (For example, with keys like
     613             :      * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     614             :      * x=5 would be more efficient.)  Since the situation only arises given
     615             :      * a poorly-worded query plus an incomplete opfamily, live with it.
     616             :      *
     617             :      * When both equality and inequality keys appear for a single attribute
     618             :      * (again, only possible when cross-type operators appear), we *must*
     619             :      * select one of the equality keys for the starting point, because
     620             :      * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     621             :      * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     622             :      * start at x=4, we will fail and stop before reaching x=10.  If multiple
     623             :      * equality quals survive preprocessing, however, it doesn't matter which
     624             :      * one we use --- by definition, they are either redundant or
     625             :      * contradictory.
     626             :      *
     627             :      * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     628             :      * If the index stores nulls at the end of the index we'll be starting
     629             :      * from, and we have no boundary key for the column (which means the key
     630             :      * we deduced NOT NULL from is an inequality key that constrains the other
     631             :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     632             :      * use as a boundary key.  If we didn't do this, we might find ourselves
     633             :      * traversing a lot of null entries at the start of the scan.
     634             :      *
     635             :      * In this loop, row-comparison keys are treated the same as keys on their
     636             :      * first (leftmost) columns.  We'll add on lower-order columns of the row
     637             :      * comparison below, if possible.
     638             :      *
     639             :      * The selected scan keys (at most one per index column) are remembered by
     640             :      * storing their addresses into the local startKeys[] array.
     641             :      *----------
     642             :      */
     643      388019 :     strat_total = BTEqualStrategyNumber;
     644      388019 :     if (so->numberOfKeys > 0)
     645             :     {
     646             :         AttrNumber  curattr;
     647             :         ScanKey     chosen;
     648             :         ScanKey     impliesNN;
     649             :         ScanKey     cur;
     650             : 
     651             :         /*
     652             :          * chosen is the so-far-chosen key for the current attribute, if any.
     653             :          * We don't cast the decision in stone until we reach keys for the
     654             :          * next attribute.
     655             :          */
     656      387665 :         curattr = 1;
     657      387665 :         chosen = NULL;
     658             :         /* Also remember any scankey that implies a NOT NULL constraint */
     659      387665 :         impliesNN = NULL;
     660             : 
     661             :         /*
     662             :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
     663             :          * pass to handle after-last-key processing.  Actual exit from the
     664             :          * loop is at one of the "break" statements below.
     665             :          */
     666     1052712 :         for (cur = so->keyData, i = 0;; cur++, i++)
     667             :         {
     668     1052712 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
     669             :             {
     670             :                 /*
     671             :                  * Done looking at keys for curattr.  If we didn't find a
     672             :                  * usable boundary key, see if we can deduce a NOT NULL key.
     673             :                  */
     674      665458 :                 if (chosen == NULL && impliesNN != NULL &&
     675         459 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
     676             :                      ScanDirectionIsForward(dir) :
     677             :                      ScanDirectionIsBackward(dir)))
     678             :                 {
     679             :                     /* Yes, so build the key in notnullkeys[keysCount] */
     680           1 :                     chosen = &notnullkeys[keysCount];
     681           2 :                     ScanKeyEntryInitialize(chosen,
     682             :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
     683           1 :                                             (impliesNN->sk_flags &
     684             :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
     685             :                                            curattr,
     686           1 :                                            ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
     687             :                                             BTGreaterStrategyNumber :
     688             :                                             BTLessStrategyNumber),
     689             :                                            InvalidOid,
     690             :                                            InvalidOid,
     691             :                                            InvalidOid,
     692             :                                            (Datum) 0);
     693             :                 }
     694             : 
     695             :                 /*
     696             :                  * If we still didn't find a usable boundary key, quit; else
     697             :                  * save the boundary key pointer in startKeys.
     698             :                  */
     699      664999 :                 if (chosen == NULL)
     700         484 :                     break;
     701      664515 :                 startKeys[keysCount++] = chosen;
     702             : 
     703             :                 /*
     704             :                  * Adjust strat_total, and quit if we have stored a > or <
     705             :                  * key.
     706             :                  */
     707      664515 :                 strat = chosen->sk_strategy;
     708      664515 :                 if (strat != BTEqualStrategyNumber)
     709             :                 {
     710       27034 :                     strat_total = strat;
     711       27034 :                     if (strat == BTGreaterStrategyNumber ||
     712             :                         strat == BTLessStrategyNumber)
     713             :                         break;
     714             :                 }
     715             : 
     716             :                 /*
     717             :                  * Done if that was the last attribute, or if next key is not
     718             :                  * in sequence (implying no boundary key is available for the
     719             :                  * next attribute).
     720             :                  */
     721      915362 :                 if (i >= so->numberOfKeys ||
     722      277361 :                     cur->sk_attno != curattr + 1)
     723             :                     break;
     724             : 
     725             :                 /*
     726             :                  * Reset for next attr.
     727             :                  */
     728      277334 :                 curattr = cur->sk_attno;
     729      277334 :                 chosen = NULL;
     730      277334 :                 impliesNN = NULL;
     731             :             }
     732             : 
     733             :             /*
     734             :              * Can we use this key as a starting boundary for this attr?
     735             :              *
     736             :              * If not, does it imply a NOT NULL constraint?  (Because
     737             :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
     738             :              * *any* inequality key works for that; we need not test.)
     739             :              */
     740      665047 :             switch (cur->sk_strategy)
     741             :             {
     742             :                 case BTLessStrategyNumber:
     743             :                 case BTLessEqualStrategyNumber:
     744         804 :                     if (chosen == NULL)
     745             :                     {
     746         732 :                         if (ScanDirectionIsBackward(dir))
     747         276 :                             chosen = cur;
     748             :                         else
     749         456 :                             impliesNN = cur;
     750             :                     }
     751         804 :                     break;
     752             :                 case BTEqualStrategyNumber:
     753             :                     /* override any non-equality choice */
     754      637481 :                     chosen = cur;
     755      637481 :                     break;
     756             :                 case BTGreaterEqualStrategyNumber:
     757             :                 case BTGreaterStrategyNumber:
     758       26762 :                     if (chosen == NULL)
     759             :                     {
     760       26762 :                         if (ScanDirectionIsForward(dir))
     761       26757 :                             chosen = cur;
     762             :                         else
     763           5 :                             impliesNN = cur;
     764             :                     }
     765       26762 :                     break;
     766             :             }
     767      665047 :         }
     768             :     }
     769             : 
     770             :     /*
     771             :      * If we found no usable boundary keys, we have to start from one end of
     772             :      * the tree.  Walk down that edge to the first or last key, and scan from
     773             :      * there.
     774             :      */
     775      388019 :     if (keysCount == 0)
     776             :     {
     777             :         bool        match;
     778             : 
     779         831 :         match = _bt_endpoint(scan, dir);
     780             : 
     781         831 :         if (!match)
     782             :         {
     783             :             /* No match, so mark (parallel) scan finished */
     784         231 :             _bt_parallel_done(scan);
     785             :         }
     786             : 
     787         831 :         return match;
     788             :     }
     789             : 
     790             :     /*
     791             :      * We want to start the scan somewhere within the index.  Set up an
     792             :      * insertion scankey we can use to search for the boundary point we
     793             :      * identified above.  The insertion scankey is built in the local
     794             :      * scankeys[] array, using the keys identified by startKeys[].
     795             :      */
     796      387188 :     Assert(keysCount <= INDEX_MAX_KEYS);
     797     1051701 :     for (i = 0; i < keysCount; i++)
     798             :     {
     799      664515 :         ScanKey     cur = startKeys[i];
     800             : 
     801      664515 :         Assert(cur->sk_attno == i + 1);
     802             : 
     803      664515 :         if (cur->sk_flags & SK_ROW_HEADER)
     804             :         {
     805             :             /*
     806             :              * Row comparison header: look to the first row member instead.
     807             :              *
     808             :              * The member scankeys are already in insertion format (ie, they
     809             :              * have sk_func = 3-way-comparison function), but we have to watch
     810             :              * out for nulls, which _bt_preprocess_keys didn't check. A null
     811             :              * in the first row member makes the condition unmatchable, just
     812             :              * like qual_ok = false.
     813             :              */
     814           2 :             ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
     815             : 
     816           2 :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
     817           2 :             if (subkey->sk_flags & SK_ISNULL)
     818             :             {
     819           0 :                 _bt_parallel_done(scan);
     820           0 :                 return false;
     821             :             }
     822           2 :             memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
     823             : 
     824             :             /*
     825             :              * If the row comparison is the last positioning key we accepted,
     826             :              * try to add additional keys from the lower-order row members.
     827             :              * (If we accepted independent conditions on additional index
     828             :              * columns, we use those instead --- doesn't seem worth trying to
     829             :              * determine which is more restrictive.)  Note that this is OK
     830             :              * even if the row comparison is of ">" or "<" type, because the
     831             :              * condition applied to all but the last row member is effectively
     832             :              * ">=" or "<=", and so the extra keys don't break the positioning
     833             :              * scheme.  But, by the same token, if we aren't able to use all
     834             :              * the row members, then the part of the row comparison that we
     835             :              * did use has to be treated as just a ">=" or "<=" condition, and
     836             :              * so we'd better adjust strat_total accordingly.
     837             :              */
     838           2 :             if (i == keysCount - 1)
     839             :             {
     840           2 :                 bool        used_all_subkeys = false;
     841             : 
     842           2 :                 Assert(!(subkey->sk_flags & SK_ROW_END));
     843             :                 for (;;)
     844             :                 {
     845           2 :                     subkey++;
     846           2 :                     Assert(subkey->sk_flags & SK_ROW_MEMBER);
     847           2 :                     if (subkey->sk_attno != keysCount + 1)
     848           0 :                         break;  /* out-of-sequence, can't use it */
     849           2 :                     if (subkey->sk_strategy != cur->sk_strategy)
     850           0 :                         break;  /* wrong direction, can't use it */
     851           2 :                     if (subkey->sk_flags & SK_ISNULL)
     852           0 :                         break;  /* can't use null keys */
     853           2 :                     Assert(keysCount < INDEX_MAX_KEYS);
     854           2 :                     memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
     855           2 :                     keysCount++;
     856           2 :                     if (subkey->sk_flags & SK_ROW_END)
     857             :                     {
     858           2 :                         used_all_subkeys = true;
     859           2 :                         break;
     860             :                     }
     861           0 :                 }
     862           2 :                 if (!used_all_subkeys)
     863             :                 {
     864           0 :                     switch (strat_total)
     865             :                     {
     866             :                         case BTLessStrategyNumber:
     867           0 :                             strat_total = BTLessEqualStrategyNumber;
     868           0 :                             break;
     869             :                         case BTGreaterStrategyNumber:
     870           0 :                             strat_total = BTGreaterEqualStrategyNumber;
     871           0 :                             break;
     872             :                     }
     873             :                 }
     874           2 :                 break;          /* done with outer loop */
     875             :             }
     876             :         }
     877             :         else
     878             :         {
     879             :             /*
     880             :              * Ordinary comparison key.  Transform the search-style scan key
     881             :              * to an insertion scan key by replacing the sk_func with the
     882             :              * appropriate btree comparison function.
     883             :              *
     884             :              * If scankey operator is not a cross-type comparison, we can use
     885             :              * the cached comparison function; otherwise gotta look it up in
     886             :              * the catalogs.  (That can't lead to infinite recursion, since no
     887             :              * indexscan initiated by syscache lookup will use cross-data-type
     888             :              * operators.)
     889             :              *
     890             :              * We support the convention that sk_subtype == InvalidOid means
     891             :              * the opclass input type; this is a hack to simplify life for
     892             :              * ScanKeyInit().
     893             :              */
     894     1289532 :             if (cur->sk_subtype == rel->rd_opcintype[i] ||
     895      625019 :                 cur->sk_subtype == InvalidOid)
     896      663767 :             {
     897             :                 FmgrInfo   *procinfo;
     898             : 
     899      663767 :                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
     900     1327534 :                 ScanKeyEntryInitializeWithInfo(scankeys + i,
     901             :                                                cur->sk_flags,
     902      663767 :                                                cur->sk_attno,
     903             :                                                InvalidStrategy,
     904             :                                                cur->sk_subtype,
     905             :                                                cur->sk_collation,
     906             :                                                procinfo,
     907             :                                                cur->sk_argument);
     908             :             }
     909             :             else
     910             :             {
     911             :                 RegProcedure cmp_proc;
     912             : 
     913        1492 :                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
     914         746 :                                              rel->rd_opcintype[i],
     915             :                                              cur->sk_subtype,
     916             :                                              BTORDER_PROC);
     917         746 :                 if (!RegProcedureIsValid(cmp_proc))
     918           0 :                     elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
     919             :                          BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
     920             :                          cur->sk_attno, RelationGetRelationName(rel));
     921        1492 :                 ScanKeyEntryInitialize(scankeys + i,
     922             :                                        cur->sk_flags,
     923         746 :                                        cur->sk_attno,
     924             :                                        InvalidStrategy,
     925             :                                        cur->sk_subtype,
     926             :                                        cur->sk_collation,
     927             :                                        cmp_proc,
     928             :                                        cur->sk_argument);
     929             :             }
     930             :         }
     931             :     }
     932             : 
     933             :     /*----------
     934             :      * Examine the selected initial-positioning strategy to determine exactly
     935             :      * where we need to start the scan, and set flag variables to control the
     936             :      * code below.
     937             :      *
     938             :      * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     939             :      * item >= scan key.  If nextkey = true, they will locate the first
     940             :      * item > scan key.
     941             :      *
     942             :      * If goback = true, we will then step back one item, while if
     943             :      * goback = false, we will start the scan on the located item.
     944             :      *----------
     945             :      */
     946      387188 :     switch (strat_total)
     947             :     {
     948             :         case BTLessStrategyNumber:
     949             : 
     950             :             /*
     951             :              * Find first item >= scankey, then back up one to arrive at last
     952             :              * item < scankey.  (Note: this positioning strategy is only used
     953             :              * for a backward scan, so that is always the correct starting
     954             :              * position.)
     955             :              */
     956         277 :             nextkey = false;
     957         277 :             goback = true;
     958         277 :             break;
     959             : 
     960             :         case BTLessEqualStrategyNumber:
     961             : 
     962             :             /*
     963             :              * Find first item > scankey, then back up one to arrive at last
     964             :              * item <= scankey.  (Note: this positioning strategy is only used
     965             :              * for a backward scan, so that is always the correct starting
     966             :              * position.)
     967             :              */
     968           0 :             nextkey = true;
     969           0 :             goback = true;
     970           0 :             break;
     971             : 
     972             :         case BTEqualStrategyNumber:
     973             : 
     974             :             /*
     975             :              * If a backward scan was specified, need to start with last equal
     976             :              * item not first one.
     977             :              */
     978      360154 :             if (ScanDirectionIsBackward(dir))
     979             :             {
     980             :                 /*
     981             :                  * This is the same as the <= strategy.  We will check at the
     982             :                  * end whether the found item is actually =.
     983             :                  */
     984          20 :                 nextkey = true;
     985          20 :                 goback = true;
     986             :             }
     987             :             else
     988             :             {
     989             :                 /*
     990             :                  * This is the same as the >= strategy.  We will check at the
     991             :                  * end whether the found item is actually =.
     992             :                  */
     993      360134 :                 nextkey = false;
     994      360134 :                 goback = false;
     995             :             }
     996      360154 :             break;
     997             : 
     998             :         case BTGreaterEqualStrategyNumber:
     999             : 
    1000             :             /*
    1001             :              * Find first item >= scankey.  (This is only used for forward
    1002             :              * scans.)
    1003             :              */
    1004         520 :             nextkey = false;
    1005         520 :             goback = false;
    1006         520 :             break;
    1007             : 
    1008             :         case BTGreaterStrategyNumber:
    1009             : 
    1010             :             /*
    1011             :              * Find first item > scankey.  (This is only used for forward
    1012             :              * scans.)
    1013             :              */
    1014       26237 :             nextkey = true;
    1015       26237 :             goback = false;
    1016       26237 :             break;
    1017             : 
    1018             :         default:
    1019             :             /* can't get here, but keep compiler quiet */
    1020           0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1021             :             return false;
    1022             :     }
    1023             : 
    1024             :     /*
    1025             :      * Use the manufactured insertion scan key to descend the tree and
    1026             :      * position ourselves on the target leaf page.
    1027             :      */
    1028      387188 :     stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
    1029             :                        scan->xs_snapshot);
    1030             : 
    1031             :     /* don't need to keep the stack around... */
    1032      387188 :     _bt_freestack(stack);
    1033             : 
    1034      387188 :     if (!BufferIsValid(buf))
    1035             :     {
    1036             :         /*
    1037             :          * We only get here if the index is completely empty. Lock relation
    1038             :          * because nothing finer to lock exists.
    1039             :          */
    1040       11579 :         PredicateLockRelation(rel, scan->xs_snapshot);
    1041             : 
    1042             :         /*
    1043             :          * mark parallel scan as done, so that all the workers can finish
    1044             :          * their scan
    1045             :          */
    1046       11579 :         _bt_parallel_done(scan);
    1047       11579 :         BTScanPosInvalidate(so->currPos);
    1048             : 
    1049       11579 :         return false;
    1050             :     }
    1051             :     else
    1052      375609 :         PredicateLockPage(rel, BufferGetBlockNumber(buf),
    1053             :                           scan->xs_snapshot);
    1054             : 
    1055      375609 :     _bt_initialize_more_data(so, dir);
    1056             : 
    1057             :     /* position to the precise item on the page */
    1058      375609 :     offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
    1059             : 
    1060             :     /*
    1061             :      * If nextkey = false, we are positioned at the first item >= scan key, or
    1062             :      * possibly at the end of a page on which all the existing items are less
    1063             :      * than the scan key and we know that everything on later pages is greater
    1064             :      * than or equal to scan key.
    1065             :      *
    1066             :      * If nextkey = true, we are positioned at the first item > scan key, or
    1067             :      * possibly at the end of a page on which all the existing items are less
    1068             :      * than or equal to the scan key and we know that everything on later
    1069             :      * pages is greater than scan key.
    1070             :      *
    1071             :      * The actually desired starting point is either this item or the prior
    1072             :      * one, or in the end-of-page case it's the first item on the next page or
    1073             :      * the last item on this page.  Adjust the starting offset if needed. (If
    1074             :      * this results in an offset before the first item or after the last one,
    1075             :      * _bt_readpage will report no items found, and then we'll step to the
    1076             :      * next page as needed.)
    1077             :      */
    1078      375609 :     if (goback)
    1079         297 :         offnum = OffsetNumberPrev(offnum);
    1080             : 
    1081             :     /* remember which buffer we have pinned, if any */
    1082      375609 :     Assert(!BTScanPosIsValid(so->currPos));
    1083      375609 :     so->currPos.buf = buf;
    1084             : 
    1085             :     /*
    1086             :      * Now load data from the first page of the scan.
    1087             :      */
    1088      375609 :     if (!_bt_readpage(scan, dir, offnum))
    1089             :     {
    1090             :         /*
    1091             :          * There's no actually-matching data on this page.  Try to advance to
    1092             :          * the next page.  Return false if there's no matching data at all.
    1093             :          */
    1094      101510 :         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    1095      101510 :         if (!_bt_steppage(scan, dir))
    1096      101293 :             return false;
    1097             :     }
    1098             :     else
    1099             :     {
    1100             :         /* Drop the lock, and maybe the pin, on the current page */
    1101      274099 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1102             :     }
    1103             : 
    1104             : readcomplete:
    1105             :     /* OK, itemIndex says what to return */
    1106      274316 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1107      274316 :     scan->xs_ctup.t_self = currItem->heapTid;
    1108      274316 :     if (scan->xs_want_itup)
    1109        2060 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1110             : 
    1111      274316 :     return true;
    1112             : }
    1113             : 
    1114             : /*
    1115             :  *  _bt_next() -- Get the next item in a scan.
    1116             :  *
    1117             :  *      On entry, so->currPos describes the current page, which may be pinned
    1118             :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1119             :  *      previously returned.
    1120             :  *
    1121             :  *      On successful exit, scan->xs_ctup.t_self is set to the TID of the
    1122             :  *      next heap tuple, and if requested, scan->xs_itup points to a copy of
    1123             :  *      the index tuple.  so->currPos is updated as needed.
    1124             :  *
    1125             :  *      On failure exit (no more tuples), we release pin and set
    1126             :  *      so->currPos.buf to InvalidBuffer.
    1127             :  */
    1128             : bool
    1129      791609 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1130             : {
    1131      791609 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1132             :     BTScanPosItem *currItem;
    1133             : 
    1134             :     /*
    1135             :      * Advance to next tuple on current page; or if there's no more, try to
    1136             :      * step to the next page with data.
    1137             :      */
    1138      791609 :     if (ScanDirectionIsForward(dir))
    1139             :     {
    1140      789816 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1141             :         {
    1142       80895 :             if (!_bt_steppage(scan, dir))
    1143       78758 :                 return false;
    1144             :         }
    1145             :     }
    1146             :     else
    1147             :     {
    1148        1793 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1149             :         {
    1150          14 :             if (!_bt_steppage(scan, dir))
    1151          10 :                 return false;
    1152             :         }
    1153             :     }
    1154             : 
    1155             :     /* OK, itemIndex says what to return */
    1156      712841 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1157      712841 :     scan->xs_ctup.t_self = currItem->heapTid;
    1158      712841 :     if (scan->xs_want_itup)
    1159      443818 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1160             : 
    1161      712841 :     return true;
    1162             : }
    1163             : 
    1164             : /*
    1165             :  *  _bt_readpage() -- Load data from current index page into so->currPos
    1166             :  *
    1167             :  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
    1168             :  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
    1169             :  * they are updated as appropriate.  All other fields of so->currPos are
    1170             :  * initialized from scratch here.
    1171             :  *
    1172             :  * We scan the current page starting at offnum and moving in the indicated
    1173             :  * direction.  All items matching the scan keys are loaded into currPos.items.
    1174             :  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
    1175             :  * that there can be no more matching tuples in the current scan direction.
    1176             :  *
    1177             :  * In the case of a parallel scan, caller must have called _bt_parallel_seize
    1178             :  * prior to calling this function; this function will invoke
    1179             :  * _bt_parallel_release before returning.
    1180             :  *
    1181             :  * Returns true if any matching items found on the page, false if none.
    1182             :  */
    1183             : static bool
    1184      379451 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
    1185             : {
    1186      379451 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1187             :     Page        page;
    1188             :     BTPageOpaque opaque;
    1189             :     OffsetNumber minoff;
    1190             :     OffsetNumber maxoff;
    1191             :     int         itemIndex;
    1192             :     IndexTuple  itup;
    1193             :     bool        continuescan;
    1194             : 
    1195             :     /*
    1196             :      * We must have the buffer pinned and locked, but the usual macro can't be
    1197             :      * used here; this function is what makes it good for currPos.
    1198             :      */
    1199      379451 :     Assert(BufferIsValid(so->currPos.buf));
    1200             : 
    1201      379451 :     page = BufferGetPage(so->currPos.buf);
    1202      379451 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1203             : 
    1204             :     /* allow next page be processed by parallel worker */
    1205      379451 :     if (scan->parallel_scan)
    1206             :     {
    1207         208 :         if (ScanDirectionIsForward(dir))
    1208         208 :             _bt_parallel_release(scan, opaque->btpo_next);
    1209             :         else
    1210           0 :             _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    1211             :     }
    1212             : 
    1213      379451 :     minoff = P_FIRSTDATAKEY(opaque);
    1214      379451 :     maxoff = PageGetMaxOffsetNumber(page);
    1215             : 
    1216             :     /*
    1217             :      * We note the buffer's block number so that we can release the pin later.
    1218             :      * This allows us to re-read the buffer if it is needed again for hinting.
    1219             :      */
    1220      379451 :     so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
    1221             : 
    1222             :     /*
    1223             :      * We save the LSN of the page as we read it, so that we know whether it
    1224             :      * safe to apply LP_DEAD hints to the page later.  This allows us to drop
    1225             :      * the pin for MVCC scans, which allows vacuum to avoid blocking.
    1226             :      */
    1227      379451 :     so->currPos.lsn = PageGetLSN(page);
    1228             : 
    1229             :     /*
    1230             :      * we must save the page's right-link while scanning it; this tells us
    1231             :      * where to step right to after we're done with these items.  There is no
    1232             :      * corresponding need for the left-link, since splits always go right.
    1233             :      */
    1234      379451 :     so->currPos.nextPage = opaque->btpo_next;
    1235             : 
    1236             :     /* initialize tuple workspace to empty */
    1237      379451 :     so->currPos.nextTupleOffset = 0;
    1238             : 
    1239             :     /*
    1240             :      * Now that the current page has been made consistent, the macro should be
    1241             :      * good.
    1242             :      */
    1243      379451 :     Assert(BTScanPosIsPinned(so->currPos));
    1244             : 
    1245      379451 :     if (ScanDirectionIsForward(dir))
    1246             :     {
    1247             :         /* load items[] in ascending order */
    1248      379142 :         itemIndex = 0;
    1249             : 
    1250      379142 :         offnum = Max(offnum, minoff);
    1251             : 
    1252     2582689 :         while (offnum <= maxoff)
    1253             :         {
    1254     2126517 :             itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
    1255     2126517 :             if (itup != NULL)
    1256             :             {
    1257             :                 /* tuple passes all scan key conditions, so remember it */
    1258     1732702 :                 _bt_saveitem(so, itemIndex, offnum, itup);
    1259     1732702 :                 itemIndex++;
    1260             :             }
    1261     2126517 :             if (!continuescan)
    1262             :             {
    1263             :                 /* there can't be any more matches, so stop */
    1264      302112 :                 so->currPos.moreRight = false;
    1265      302112 :                 break;
    1266             :             }
    1267             : 
    1268     1824405 :             offnum = OffsetNumberNext(offnum);
    1269             :         }
    1270             : 
    1271      379142 :         Assert(itemIndex <= MaxIndexTuplesPerPage);
    1272      379142 :         so->currPos.firstItem = 0;
    1273      379142 :         so->currPos.lastItem = itemIndex - 1;
    1274      379142 :         so->currPos.itemIndex = 0;
    1275             :     }
    1276             :     else
    1277             :     {
    1278             :         /* load items[] in descending order */
    1279         309 :         itemIndex = MaxIndexTuplesPerPage;
    1280             : 
    1281         309 :         offnum = Min(offnum, maxoff);
    1282             : 
    1283       52108 :         while (offnum >= minoff)
    1284             :         {
    1285       51503 :             itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
    1286       51503 :             if (itup != NULL)
    1287             :             {
    1288             :                 /* tuple passes all scan key conditions, so remember it */
    1289       49859 :                 itemIndex--;
    1290       49859 :                 _bt_saveitem(so, itemIndex, offnum, itup);
    1291             :             }
    1292       51503 :             if (!continuescan)
    1293             :             {
    1294             :                 /* there can't be any more matches, so stop */
    1295          13 :                 so->currPos.moreLeft = false;
    1296          13 :                 break;
    1297             :             }
    1298             : 
    1299       51490 :             offnum = OffsetNumberPrev(offnum);
    1300             :         }
    1301             : 
    1302         309 :         Assert(itemIndex >= 0);
    1303         309 :         so->currPos.firstItem = itemIndex;
    1304         309 :         so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
    1305         309 :         so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
    1306             :     }
    1307             : 
    1308      379451 :     return (so->currPos.firstItem <= so->currPos.lastItem);
    1309             : }
    1310             : 
    1311             : /* Save an index item into so->currPos.items[itemIndex] */
    1312             : static void
    1313     1782561 : _bt_saveitem(BTScanOpaque so, int itemIndex,
    1314             :              OffsetNumber offnum, IndexTuple itup)
    1315             : {
    1316     1782561 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1317             : 
    1318     1782561 :     currItem->heapTid = itup->t_tid;
    1319     1782561 :     currItem->indexOffset = offnum;
    1320     1782561 :     if (so->currTuples)
    1321             :     {
    1322      459366 :         Size        itupsz = IndexTupleSize(itup);
    1323             : 
    1324      459366 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1325      459366 :         memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
    1326      459366 :         so->currPos.nextTupleOffset += MAXALIGN(itupsz);
    1327             :     }
    1328     1782561 : }
    1329             : 
    1330             : /*
    1331             :  *  _bt_steppage() -- Step to next page containing valid data for scan
    1332             :  *
    1333             :  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
    1334             :  * if pinned, we'll drop the pin before moving to next page.  The buffer is
    1335             :  * not locked on entry.
    1336             :  *
    1337             :  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
    1338             :  * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
    1339             :  * to InvalidBuffer.  We return TRUE to indicate success.
    1340             :  */
    1341             : static bool
    1342      182431 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1343             : {
    1344      182431 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1345      182431 :     BlockNumber blkno = InvalidBlockNumber;
    1346      182431 :     bool        status = true;
    1347             : 
    1348      182431 :     Assert(BTScanPosIsValid(so->currPos));
    1349             : 
    1350             :     /* Before leaving current page, deal with any killed items */
    1351      182431 :     if (so->numKilled > 0)
    1352        2123 :         _bt_killitems(scan);
    1353             : 
    1354             :     /*
    1355             :      * Before we modify currPos, make a copy of the page data if there was a
    1356             :      * mark position that needs it.
    1357             :      */
    1358      182431 :     if (so->markItemIndex >= 0)
    1359             :     {
    1360             :         /* bump pin on current buffer for assignment to mark buffer */
    1361          46 :         if (BTScanPosIsPinned(so->currPos))
    1362          44 :             IncrBufferRefCount(so->currPos.buf);
    1363          46 :         memcpy(&so->markPos, &so->currPos,
    1364             :                offsetof(BTScanPosData, items[1]) +
    1365          46 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    1366          46 :         if (so->markTuples)
    1367          44 :             memcpy(so->markTuples, so->currTuples,
    1368          44 :                    so->currPos.nextTupleOffset);
    1369          46 :         so->markPos.itemIndex = so->markItemIndex;
    1370          46 :         so->markItemIndex = -1;
    1371             :     }
    1372             : 
    1373      182431 :     if (ScanDirectionIsForward(dir))
    1374             :     {
    1375             :         /* Walk right to the next page with data */
    1376      182416 :         if (scan->parallel_scan != NULL)
    1377             :         {
    1378             :             /*
    1379             :              * Seize the scan to get the next block number; if the scan has
    1380             :              * ended already, bail out.
    1381             :              */
    1382         208 :             status = _bt_parallel_seize(scan, &blkno);
    1383         208 :             if (!status)
    1384             :             {
    1385             :                 /* release the previous buffer, if pinned */
    1386           0 :                 BTScanPosUnpinIfPinned(so->currPos);
    1387           0 :                 BTScanPosInvalidate(so->currPos);
    1388           0 :                 return false;
    1389             :             }
    1390             :         }
    1391             :         else
    1392             :         {
    1393             :             /* Not parallel, so use the previously-saved nextPage link. */
    1394      182208 :             blkno = so->currPos.nextPage;
    1395             :         }
    1396             : 
    1397             :         /* Remember we left a page with data */
    1398      182416 :         so->currPos.moreLeft = true;
    1399             : 
    1400             :         /* release the previous buffer, if pinned */
    1401      182416 :         BTScanPosUnpinIfPinned(so->currPos);
    1402             :     }
    1403             :     else
    1404             :     {
    1405             :         /* Remember we left a page with data */
    1406          15 :         so->currPos.moreRight = true;
    1407             : 
    1408          15 :         if (scan->parallel_scan != NULL)
    1409             :         {
    1410             :             /*
    1411             :              * Seize the scan to get the current block number; if the scan has
    1412             :              * ended already, bail out.
    1413             :              */
    1414           0 :             status = _bt_parallel_seize(scan, &blkno);
    1415           0 :             BTScanPosUnpinIfPinned(so->currPos);
    1416           0 :             if (!status)
    1417             :             {
    1418           0 :                 BTScanPosInvalidate(so->currPos);
    1419           0 :                 return false;
    1420             :             }
    1421             :         }
    1422             :         else
    1423             :         {
    1424             :             /* Not parallel, so just use our own notion of the current page */
    1425          15 :             blkno = so->currPos.currPage;
    1426             :         }
    1427             :     }
    1428             : 
    1429      182431 :     if (!_bt_readnextpage(scan, blkno, dir))
    1430      180067 :         return false;
    1431             : 
    1432             :     /* Drop the lock, and maybe the pin, on the current page */
    1433        2364 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1434             : 
    1435        2364 :     return true;
    1436             : }
    1437             : 
    1438             : /*
    1439             :  *  _bt_readnextpage() -- Read next page containing valid data for scan
    1440             :  *
    1441             :  * On success exit, so->currPos is updated to contain data from the next
    1442             :  * interesting page.  Caller is responsible to release lock and pin on
    1443             :  * buffer on success.  We return TRUE to indicate success.
    1444             :  *
    1445             :  * If there are no more matching records in the given direction, we drop all
    1446             :  * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
    1447             :  */
    1448             : static bool
    1449      182431 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    1450             : {
    1451      182431 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1452             :     Relation    rel;
    1453             :     Page        page;
    1454             :     BTPageOpaque opaque;
    1455      182431 :     bool        status = true;
    1456             : 
    1457      182431 :     rel = scan->indexRelation;
    1458             : 
    1459      182431 :     if (ScanDirectionIsForward(dir))
    1460             :     {
    1461             :         for (;;)
    1462             :         {
    1463             :             /*
    1464             :              * if we're at end of scan, give up and mark parallel scan as
    1465             :              * done, so that all the workers can finish their scan
    1466             :              */
    1467      183288 :             if (blkno == P_NONE || !so->currPos.moreRight)
    1468             :             {
    1469      180056 :                 _bt_parallel_done(scan);
    1470      180056 :                 BTScanPosInvalidate(so->currPos);
    1471      180056 :                 return false;
    1472             :             }
    1473             :             /* check for interrupts while we're not holding any buffer lock */
    1474        3232 :             CHECK_FOR_INTERRUPTS();
    1475             :             /* step right one page */
    1476        3232 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    1477        3232 :             page = BufferGetPage(so->currPos.buf);
    1478        3232 :             TestForOldSnapshot(scan->xs_snapshot, rel, page);
    1479        3232 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1480             :             /* check for deleted page */
    1481        3232 :             if (!P_IGNORE(opaque))
    1482             :             {
    1483        3232 :                 PredicateLockPage(rel, blkno, scan->xs_snapshot);
    1484             :                 /* see if there are any matches on this page */
    1485             :                 /* note that this will clear moreRight if we can stop */
    1486        3232 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
    1487        2360 :                     break;
    1488             :             }
    1489             : 
    1490             :             /* nope, keep going */
    1491         872 :             if (scan->parallel_scan != NULL)
    1492             :             {
    1493           0 :                 status = _bt_parallel_seize(scan, &blkno);
    1494           0 :                 if (!status)
    1495             :                 {
    1496           0 :                     _bt_relbuf(rel, so->currPos.buf);
    1497           0 :                     BTScanPosInvalidate(so->currPos);
    1498           0 :                     return false;
    1499             :                 }
    1500             :             }
    1501             :             else
    1502         872 :                 blkno = opaque->btpo_next;
    1503         872 :             _bt_relbuf(rel, so->currPos.buf);
    1504         872 :         }
    1505             :     }
    1506             :     else
    1507             :     {
    1508             :         /*
    1509             :          * Should only happen in parallel cases, when some other backend
    1510             :          * advanced the scan.
    1511             :          */
    1512          15 :         if (so->currPos.currPage != blkno)
    1513             :         {
    1514           0 :             BTScanPosUnpinIfPinned(so->currPos);
    1515           0 :             so->currPos.currPage = blkno;
    1516             :         }
    1517             : 
    1518             :         /*
    1519             :          * Walk left to the next page with data.  This is much more complex
    1520             :          * than the walk-right case because of the possibility that the page
    1521             :          * to our left splits while we are in flight to it, plus the
    1522             :          * possibility that the page we were on gets deleted after we leave
    1523             :          * it.  See nbtree/README for details.
    1524             :          *
    1525             :          * It might be possible to rearrange this code to have less overhead
    1526             :          * in pinning and locking, but that would require capturing the left
    1527             :          * pointer when the page is initially read, and using it here, along
    1528             :          * with big changes to _bt_walk_left() and the code below.  It is not
    1529             :          * clear whether this would be a win, since if the page immediately to
    1530             :          * the left splits after we read this page and before we step left, we
    1531             :          * would need to visit more pages than with the current code.
    1532             :          *
    1533             :          * Note that if we change the code so that we drop the pin for a scan
    1534             :          * which uses a non-MVCC snapshot, we will need to modify the code for
    1535             :          * walking left, to allow for the possibility that a referenced page
    1536             :          * has been deleted.  As long as the buffer is pinned or the snapshot
    1537             :          * is MVCC the page cannot move past the half-dead state to fully
    1538             :          * deleted.
    1539             :          */
    1540          15 :         if (BTScanPosIsPinned(so->currPos))
    1541           8 :             LockBuffer(so->currPos.buf, BT_READ);
    1542             :         else
    1543           7 :             so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
    1544             : 
    1545             :         for (;;)
    1546             :         {
    1547             :             /* Done if we know there are no matching keys to the left */
    1548          15 :             if (!so->currPos.moreLeft)
    1549             :             {
    1550           7 :                 _bt_relbuf(rel, so->currPos.buf);
    1551           7 :                 _bt_parallel_done(scan);
    1552           7 :                 BTScanPosInvalidate(so->currPos);
    1553           7 :                 return false;
    1554             :             }
    1555             : 
    1556             :             /* Step to next physical page */
    1557           8 :             so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
    1558             :                                             scan->xs_snapshot);
    1559             : 
    1560             :             /* if we're physically at end of index, return failure */
    1561           8 :             if (so->currPos.buf == InvalidBuffer)
    1562             :             {
    1563           4 :                 _bt_parallel_done(scan);
    1564           4 :                 BTScanPosInvalidate(so->currPos);
    1565           4 :                 return false;
    1566             :             }
    1567             : 
    1568             :             /*
    1569             :              * Okay, we managed to move left to a non-deleted page. Done if
    1570             :              * it's not half-dead and contains matching tuples. Else loop back
    1571             :              * and do it all again.
    1572             :              */
    1573           4 :             page = BufferGetPage(so->currPos.buf);
    1574           4 :             TestForOldSnapshot(scan->xs_snapshot, rel, page);
    1575           4 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1576           4 :             if (!P_IGNORE(opaque))
    1577             :             {
    1578           4 :                 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
    1579             :                 /* see if there are any matches on this page */
    1580             :                 /* note that this will clear moreLeft if we can stop */
    1581           4 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
    1582           4 :                     break;
    1583             :             }
    1584             : 
    1585             :             /*
    1586             :              * For parallel scans, get the last page scanned as it is quite
    1587             :              * possible that by the time we try to seize the scan, some other
    1588             :              * worker has already advanced the scan to a different page.  We
    1589             :              * must continue based on the latest page scanned by any worker.
    1590             :              */
    1591           0 :             if (scan->parallel_scan != NULL)
    1592             :             {
    1593           0 :                 _bt_relbuf(rel, so->currPos.buf);
    1594           0 :                 status = _bt_parallel_seize(scan, &blkno);
    1595           0 :                 if (!status)
    1596             :                 {
    1597           0 :                     BTScanPosInvalidate(so->currPos);
    1598           0 :                     return false;
    1599             :                 }
    1600           0 :                 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    1601             :             }
    1602           0 :         }
    1603             :     }
    1604             : 
    1605        2364 :     return true;
    1606             : }
    1607             : 
    1608             : /*
    1609             :  *  _bt_parallel_readpage() -- Read current page containing valid data for scan
    1610             :  *
    1611             :  * On success, release lock and maybe pin on buffer.  We return TRUE to
    1612             :  * indicate success.
    1613             :  */
    1614             : static bool
    1615           0 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    1616             : {
    1617           0 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1618             : 
    1619           0 :     _bt_initialize_more_data(so, dir);
    1620             : 
    1621           0 :     if (!_bt_readnextpage(scan, blkno, dir))
    1622           0 :         return false;
    1623             : 
    1624             :     /* Drop the lock, and maybe the pin, on the current page */
    1625           0 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1626             : 
    1627           0 :     return true;
    1628             : }
    1629             : 
    1630             : /*
    1631             :  * _bt_walk_left() -- step left one page, if possible
    1632             :  *
    1633             :  * The given buffer must be pinned and read-locked.  This will be dropped
    1634             :  * before stepping left.  On return, we have pin and read lock on the
    1635             :  * returned page, instead.
    1636             :  *
    1637             :  * Returns InvalidBuffer if there is no page to the left (no lock is held
    1638             :  * in that case).
    1639             :  *
    1640             :  * When working on a non-leaf level, it is possible for the returned page
    1641             :  * to be half-dead; the caller should check that condition and step left
    1642             :  * again if it's important.
    1643             :  */
    1644             : static Buffer
    1645           8 : _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
    1646             : {
    1647             :     Page        page;
    1648             :     BTPageOpaque opaque;
    1649             : 
    1650           8 :     page = BufferGetPage(buf);
    1651           8 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1652             : 
    1653             :     for (;;)
    1654             :     {
    1655             :         BlockNumber obknum;
    1656             :         BlockNumber lblkno;
    1657             :         BlockNumber blkno;
    1658             :         int         tries;
    1659             : 
    1660             :         /* if we're at end of tree, release buf and return failure */
    1661           8 :         if (P_LEFTMOST(opaque))
    1662             :         {
    1663           4 :             _bt_relbuf(rel, buf);
    1664           4 :             break;
    1665             :         }
    1666             :         /* remember original page we are stepping left from */
    1667           4 :         obknum = BufferGetBlockNumber(buf);
    1668             :         /* step left */
    1669           4 :         blkno = lblkno = opaque->btpo_prev;
    1670           4 :         _bt_relbuf(rel, buf);
    1671             :         /* check for interrupts while we're not holding any buffer lock */
    1672           4 :         CHECK_FOR_INTERRUPTS();
    1673           4 :         buf = _bt_getbuf(rel, blkno, BT_READ);
    1674           4 :         page = BufferGetPage(buf);
    1675           4 :         TestForOldSnapshot(snapshot, rel, page);
    1676           4 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1677             : 
    1678             :         /*
    1679             :          * If this isn't the page we want, walk right till we find what we
    1680             :          * want --- but go no more than four hops (an arbitrary limit). If we
    1681             :          * don't find the correct page by then, the most likely bet is that
    1682             :          * the original page got deleted and isn't in the sibling chain at all
    1683             :          * anymore, not that its left sibling got split more than four times.
    1684             :          *
    1685             :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    1686             :          * because half-dead pages are still in the sibling chain.  Caller
    1687             :          * must reject half-dead pages if wanted.
    1688             :          */
    1689           4 :         tries = 0;
    1690             :         for (;;)
    1691             :         {
    1692           4 :             if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
    1693             :             {
    1694             :                 /* Found desired page, return it */
    1695           4 :                 return buf;
    1696             :             }
    1697           0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    1698             :                 break;
    1699           0 :             blkno = opaque->btpo_next;
    1700           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1701           0 :             page = BufferGetPage(buf);
    1702           0 :             TestForOldSnapshot(snapshot, rel, page);
    1703           0 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1704           0 :         }
    1705             : 
    1706             :         /* Return to the original page to see what's up */
    1707           0 :         buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
    1708           0 :         page = BufferGetPage(buf);
    1709           0 :         TestForOldSnapshot(snapshot, rel, page);
    1710           0 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1711           0 :         if (P_ISDELETED(opaque))
    1712             :         {
    1713             :             /*
    1714             :              * It was deleted.  Move right to first nondeleted page (there
    1715             :              * must be one); that is the page that has acquired the deleted
    1716             :              * one's keyspace, so stepping left from it will take us where we
    1717             :              * want to be.
    1718             :              */
    1719             :             for (;;)
    1720             :             {
    1721           0 :                 if (P_RIGHTMOST(opaque))
    1722           0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    1723             :                          RelationGetRelationName(rel));
    1724           0 :                 blkno = opaque->btpo_next;
    1725           0 :                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1726           0 :                 page = BufferGetPage(buf);
    1727           0 :                 TestForOldSnapshot(snapshot, rel, page);
    1728           0 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1729           0 :                 if (!P_ISDELETED(opaque))
    1730           0 :                     break;
    1731           0 :             }
    1732             : 
    1733             :             /*
    1734             :              * Now return to top of loop, resetting obknum to point to this
    1735             :              * nondeleted page, and try again.
    1736             :              */
    1737             :         }
    1738             :         else
    1739             :         {
    1740             :             /*
    1741             :              * It wasn't deleted; the explanation had better be that the page
    1742             :              * to the left got split or deleted. Without this check, we'd go
    1743             :              * into an infinite loop if there's anything wrong.
    1744             :              */
    1745           0 :             if (opaque->btpo_prev == lblkno)
    1746           0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    1747             :                      obknum, RelationGetRelationName(rel));
    1748             :             /* Okay to try again with new lblkno value */
    1749             :         }
    1750           0 :     }
    1751             : 
    1752           4 :     return InvalidBuffer;
    1753             : }
    1754             : 
    1755             : /*
    1756             :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    1757             :  *
    1758             :  * If the index is empty, we will return InvalidBuffer; any other failure
    1759             :  * condition causes ereport().  We will not return a dead page.
    1760             :  *
    1761             :  * The returned buffer is pinned and read-locked.
    1762             :  */
    1763             : Buffer
    1764         831 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
    1765             :                  Snapshot snapshot)
    1766             : {
    1767             :     Buffer      buf;
    1768             :     Page        page;
    1769             :     BTPageOpaque opaque;
    1770             :     OffsetNumber offnum;
    1771             :     BlockNumber blkno;
    1772             :     IndexTuple  itup;
    1773             : 
    1774             :     /*
    1775             :      * If we are looking for a leaf page, okay to descend from fast root;
    1776             :      * otherwise better descend from true root.  (There is no point in being
    1777             :      * smarter about intermediate levels.)
    1778             :      */
    1779         831 :     if (level == 0)
    1780         831 :         buf = _bt_getroot(rel, BT_READ);
    1781             :     else
    1782           0 :         buf = _bt_gettrueroot(rel);
    1783             : 
    1784         831 :     if (!BufferIsValid(buf))
    1785         225 :         return InvalidBuffer;
    1786             : 
    1787         606 :     page = BufferGetPage(buf);
    1788         606 :     TestForOldSnapshot(snapshot, rel, page);
    1789         606 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1790             : 
    1791             :     for (;;)
    1792             :     {
    1793             :         /*
    1794             :          * If we landed on a deleted page, step right to find a live page
    1795             :          * (there must be one).  Also, if we want the rightmost page, step
    1796             :          * right if needed to get to it (this could happen if the page split
    1797             :          * since we obtained a pointer to it).
    1798             :          */
    1799        2046 :         while (P_IGNORE(opaque) ||
    1800           9 :                (rightmost && !P_RIGHTMOST(opaque)))
    1801             :         {
    1802           0 :             blkno = opaque->btpo_next;
    1803           0 :             if (blkno == P_NONE)
    1804           0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    1805             :                      RelationGetRelationName(rel));
    1806           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1807           0 :             page = BufferGetPage(buf);
    1808           0 :             TestForOldSnapshot(snapshot, rel, page);
    1809           0 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1810             :         }
    1811             : 
    1812             :         /* Done? */
    1813        1023 :         if (opaque->btpo.level == level)
    1814         606 :             break;
    1815         417 :         if (opaque->btpo.level < level)
    1816           0 :             elog(ERROR, "btree level %u not found in index \"%s\"",
    1817             :                  level, RelationGetRelationName(rel));
    1818             : 
    1819             :         /* Descend to leftmost or rightmost child page */
    1820         417 :         if (rightmost)
    1821           1 :             offnum = PageGetMaxOffsetNumber(page);
    1822             :         else
    1823         416 :             offnum = P_FIRSTDATAKEY(opaque);
    1824             : 
    1825         417 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    1826         417 :         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
    1827             : 
    1828         417 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    1829         417 :         page = BufferGetPage(buf);
    1830         417 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1831         417 :     }
    1832             : 
    1833         606 :     return buf;
    1834             : }
    1835             : 
    1836             : /*
    1837             :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    1838             :  * from there to the first key satisfying all the quals.
    1839             :  *
    1840             :  * This is used by _bt_first() to set up a scan when we've determined
    1841             :  * that the scan must start at the beginning or end of the index (for
    1842             :  * a forward or backward scan respectively).  Exit conditions are the
    1843             :  * same as for _bt_first().
    1844             :  */
    1845             : static bool
    1846         831 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    1847             : {
    1848         831 :     Relation    rel = scan->indexRelation;
    1849         831 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1850             :     Buffer      buf;
    1851             :     Page        page;
    1852             :     BTPageOpaque opaque;
    1853             :     OffsetNumber start;
    1854             :     BTScanPosItem *currItem;
    1855             : 
    1856             :     /*
    1857             :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    1858             :      * version of _bt_search().  We don't maintain a stack since we know we
    1859             :      * won't need it.
    1860             :      */
    1861         831 :     buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
    1862             : 
    1863         831 :     if (!BufferIsValid(buf))
    1864             :     {
    1865             :         /*
    1866             :          * Empty index. Lock the whole relation, as nothing finer to lock
    1867             :          * exists.
    1868             :          */
    1869         225 :         PredicateLockRelation(rel, scan->xs_snapshot);
    1870         225 :         BTScanPosInvalidate(so->currPos);
    1871         225 :         return false;
    1872             :     }
    1873             : 
    1874         606 :     PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    1875         606 :     page = BufferGetPage(buf);
    1876         606 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1877         606 :     Assert(P_ISLEAF(opaque));
    1878             : 
    1879         606 :     if (ScanDirectionIsForward(dir))
    1880             :     {
    1881             :         /* There could be dead pages to the left, so not this: */
    1882             :         /* Assert(P_LEFTMOST(opaque)); */
    1883             : 
    1884         598 :         start = P_FIRSTDATAKEY(opaque);
    1885             :     }
    1886           8 :     else if (ScanDirectionIsBackward(dir))
    1887             :     {
    1888           8 :         Assert(P_RIGHTMOST(opaque));
    1889             : 
    1890           8 :         start = PageGetMaxOffsetNumber(page);
    1891             :     }
    1892             :     else
    1893             :     {
    1894           0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    1895             :         start = 0;              /* keep compiler quiet */
    1896             :     }
    1897             : 
    1898             :     /* remember which buffer we have pinned */
    1899         606 :     so->currPos.buf = buf;
    1900             : 
    1901         606 :     _bt_initialize_more_data(so, dir);
    1902             : 
    1903             :     /*
    1904             :      * Now load data from the first page of the scan.
    1905             :      */
    1906         606 :     if (!_bt_readpage(scan, dir, start))
    1907             :     {
    1908             :         /*
    1909             :          * There's no actually-matching data on this page.  Try to advance to
    1910             :          * the next page.  Return false if there's no matching data at all.
    1911             :          */
    1912          12 :         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    1913          12 :         if (!_bt_steppage(scan, dir))
    1914           6 :             return false;
    1915             :     }
    1916             :     else
    1917             :     {
    1918             :         /* Drop the lock, and maybe the pin, on the current page */
    1919         594 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1920             :     }
    1921             : 
    1922             :     /* OK, itemIndex says what to return */
    1923         600 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1924         600 :     scan->xs_ctup.t_self = currItem->heapTid;
    1925         600 :     if (scan->xs_want_itup)
    1926         121 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1927             : 
    1928         600 :     return true;
    1929             : }
    1930             : 
    1931             : /*
    1932             :  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
    1933             :  * for scan direction
    1934             :  */
    1935             : static inline void
    1936      376215 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
    1937             : {
    1938             :     /* initialize moreLeft/moreRight appropriately for scan direction */
    1939      376215 :     if (ScanDirectionIsForward(dir))
    1940             :     {
    1941      375910 :         so->currPos.moreLeft = false;
    1942      375910 :         so->currPos.moreRight = true;
    1943             :     }
    1944             :     else
    1945             :     {
    1946         305 :         so->currPos.moreLeft = true;
    1947         305 :         so->currPos.moreRight = false;
    1948             :     }
    1949      376215 :     so->numKilled = 0;           /* just paranoia */
    1950      376215 :     so->markItemIndex = -1;      /* ditto */
    1951      376215 : }

Generated by: LCOV version 1.11