Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "miscadmin.h"
21 : #include "pgstat.h"
22 : #include "storage/predicate.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/rel.h"
25 : #include "utils/tqual.h"
26 :
27 :
28 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
29 : OffsetNumber offnum);
30 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
31 : OffsetNumber offnum, IndexTuple itup);
32 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
33 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
34 : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
35 : ScanDirection dir);
36 : static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
37 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
38 : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
39 : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40 :
41 :
42 : /*
43 : * _bt_drop_lock_and_maybe_pin()
44 : *
45 : * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46 : * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47 : * This will prevent vacuum from stalling in a blocked state trying to read a
48 : * page when a cursor is sitting on it -- at least in many important cases.
49 : *
50 : * Set the buffer to invalid if the pin is released, since the buffer may be
51 : * re-used. If we need to go back to this block (for example, to apply
52 : * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53 : * will remain in shared memory for as long as it takes to scan the index
54 : * buffer page.
55 : */
56 : static void
57 278168 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
58 : {
59 278168 : LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
60 :
61 555283 : if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 554097 : RelationNeedsWAL(scan->indexRelation) &&
63 276982 : !scan->xs_want_itup)
64 : {
65 273761 : ReleaseBuffer(sp->buf);
66 273761 : sp->buf = InvalidBuffer;
67 : }
68 278168 : }
69 :
70 :
71 : /*
72 : * _bt_search() -- Search the tree for a particular scankey,
73 : * or more precisely for the first leaf page it could be on.
74 : *
75 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
76 : * but it can omit the rightmost column(s) of the index.
77 : *
78 : * When nextkey is false (the usual case), we are looking for the first
79 : * item >= scankey. When nextkey is true, we are looking for the first
80 : * item strictly greater than scankey.
81 : *
82 : * Return value is a stack of parent-page pointers. *bufP is set to the
83 : * address of the leaf-page buffer, which is read-locked and pinned.
84 : * No locks are held on the parent pages, however!
85 : *
86 : * If the snapshot parameter is not NULL, "old snapshot" checking will take
87 : * place during the descent through the tree. This is not needed when
88 : * positioning for an insert or delete, so NULL is used for those cases.
89 : *
90 : * NOTE that the returned buffer is read-locked regardless of the access
91 : * parameter. However, access = BT_WRITE will allow an empty root page
92 : * to be created and returned. When access = BT_READ, an empty index
93 : * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
94 : * any incomplete splits encountered during the search will be finished.
95 : */
96 : BTStack
97 594988 : _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
98 : Buffer *bufP, int access, Snapshot snapshot)
99 : {
100 594988 : BTStack stack_in = NULL;
101 :
102 : /* Get the root page to start with */
103 594988 : *bufP = _bt_getroot(rel, access);
104 :
105 : /* If index is empty and access = BT_READ, no root page is created. */
106 594988 : if (!BufferIsValid(*bufP))
107 11642 : return (BTStack) NULL;
108 :
109 : /* Loop iterates once per level descended in the tree */
110 : for (;;)
111 : {
112 : Page page;
113 : BTPageOpaque opaque;
114 : OffsetNumber offnum;
115 : ItemId itemid;
116 : IndexTuple itup;
117 : BlockNumber blkno;
118 : BlockNumber par_blkno;
119 : BTStack new_stack;
120 :
121 : /*
122 : * Race -- the page we just grabbed may have split since we read its
123 : * pointer in the parent (or metapage). If it has, we may need to
124 : * move right to its new sibling. Do that.
125 : *
126 : * In write-mode, allow _bt_moveright to finish any incomplete splits
127 : * along the way. Strictly speaking, we'd only need to finish an
128 : * incomplete split on the leaf page we're about to insert to, not on
129 : * any of the upper levels (they are taken care of in _bt_getstackbuf,
130 : * if the leaf page is split and we insert to the parent page). But
131 : * this is a good opportunity to finish splits of internal pages too.
132 : */
133 1060304 : *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
134 : (access == BT_WRITE), stack_in,
135 : BT_READ, snapshot);
136 :
137 : /* if this is a leaf page, we're done */
138 1060304 : page = BufferGetPage(*bufP);
139 1060304 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
140 1060304 : if (P_ISLEAF(opaque))
141 583346 : break;
142 :
143 : /*
144 : * Find the appropriate item on the internal page, and get the child
145 : * page that it points to.
146 : */
147 476958 : offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
148 476958 : itemid = PageGetItemId(page, offnum);
149 476958 : itup = (IndexTuple) PageGetItem(page, itemid);
150 476958 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
151 476958 : par_blkno = BufferGetBlockNumber(*bufP);
152 :
153 : /*
154 : * We need to save the location of the index entry we chose in the
155 : * parent page on a stack. In case we split the tree, we'll use the
156 : * stack to work back up to the parent page. We also save the actual
157 : * downlink (TID) to uniquely identify the index entry, in case it
158 : * moves right while we're working lower in the tree. See the paper
159 : * by Lehman and Yao for how this is detected and handled. (We use the
160 : * child link to disambiguate duplicate keys in the index -- Lehman
161 : * and Yao disallow duplicate keys.)
162 : */
163 476958 : new_stack = (BTStack) palloc(sizeof(BTStackData));
164 476958 : new_stack->bts_blkno = par_blkno;
165 476958 : new_stack->bts_offset = offnum;
166 476958 : memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
167 476958 : new_stack->bts_parent = stack_in;
168 :
169 : /* drop the read lock on the parent page, acquire one on the child */
170 476958 : *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
171 :
172 : /* okay, all set to move down a level */
173 476958 : stack_in = new_stack;
174 476958 : }
175 :
176 583346 : return stack_in;
177 : }
178 :
179 : /*
180 : * _bt_moveright() -- move right in the btree if necessary.
181 : *
182 : * When we follow a pointer to reach a page, it is possible that
183 : * the page has changed in the meanwhile. If this happens, we're
184 : * guaranteed that the page has "split right" -- that is, that any
185 : * data that appeared on the page originally is either on the page
186 : * or strictly to the right of it.
187 : *
188 : * This routine decides whether or not we need to move right in the
189 : * tree by examining the high key entry on the page. If that entry
190 : * is strictly less than the scankey, or <= the scankey in the nextkey=true
191 : * case, then we followed the wrong link and we need to move right.
192 : *
193 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
194 : * but it can omit the rightmost column(s) of the index.
195 : *
196 : * When nextkey is false (the usual case), we are looking for the first
197 : * item >= scankey. When nextkey is true, we are looking for the first
198 : * item strictly greater than scankey.
199 : *
200 : * If forupdate is true, we will attempt to finish any incomplete splits
201 : * that we encounter. This is required when locking a target page for an
202 : * insertion, because we don't allow inserting on a page before the split
203 : * is completed. 'stack' is only used if forupdate is true.
204 : *
205 : * On entry, we have the buffer pinned and a lock of the type specified by
206 : * 'access'. If we move right, we release the buffer and lock and acquire
207 : * the same on the right sibling. Return value is the buffer we stop at.
208 : *
209 : * If the snapshot parameter is not NULL, "old snapshot" checking will take
210 : * place during the descent through the tree. This is not needed when
211 : * positioning for an insert or delete, so NULL is used for those cases.
212 : */
213 : Buffer
214 1266313 : _bt_moveright(Relation rel,
215 : Buffer buf,
216 : int keysz,
217 : ScanKey scankey,
218 : bool nextkey,
219 : bool forupdate,
220 : BTStack stack,
221 : int access,
222 : Snapshot snapshot)
223 : {
224 : Page page;
225 : BTPageOpaque opaque;
226 : int32 cmpval;
227 :
228 : /*
229 : * When nextkey = false (normal case): if the scan key that brought us to
230 : * this page is > the high key stored on the page, then the page has split
231 : * and we need to move right. (If the scan key is equal to the high key,
232 : * we might or might not need to move right; have to scan the page first
233 : * anyway.)
234 : *
235 : * When nextkey = true: move right if the scan key is >= page's high key.
236 : *
237 : * The page could even have split more than once, so scan as far as
238 : * needed.
239 : *
240 : * We also have to move right if we followed a link that brought us to a
241 : * dead page.
242 : */
243 1266313 : cmpval = nextkey ? 0 : 1;
244 :
245 : for (;;)
246 : {
247 1266335 : page = BufferGetPage(buf);
248 1266335 : TestForOldSnapshot(snapshot, rel, page);
249 1266335 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
250 :
251 1266335 : if (P_RIGHTMOST(opaque))
252 886225 : break;
253 :
254 : /*
255 : * Finish any incomplete splits we encounter along the way.
256 : */
257 380110 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
258 : {
259 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
260 :
261 : /* upgrade our lock if necessary */
262 0 : if (access == BT_READ)
263 : {
264 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
265 0 : LockBuffer(buf, BT_WRITE);
266 : }
267 :
268 0 : if (P_INCOMPLETE_SPLIT(opaque))
269 0 : _bt_finish_split(rel, buf, stack);
270 : else
271 0 : _bt_relbuf(rel, buf);
272 :
273 : /* re-acquire the lock in the right mode, and re-check */
274 0 : buf = _bt_getbuf(rel, blkno, access);
275 0 : continue;
276 : }
277 :
278 380110 : if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
279 : {
280 : /* step right one page */
281 22 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
282 22 : continue;
283 : }
284 : else
285 : break;
286 22 : }
287 :
288 1266313 : if (P_IGNORE(opaque))
289 0 : elog(ERROR, "fell off the end of index \"%s\"",
290 : RelationGetRelationName(rel));
291 :
292 1266313 : return buf;
293 : }
294 :
295 : /*
296 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
297 : *
298 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
299 : * but it can omit the rightmost column(s) of the index.
300 : *
301 : * When nextkey is false (the usual case), we are looking for the first
302 : * item >= scankey. When nextkey is true, we are looking for the first
303 : * item strictly greater than scankey.
304 : *
305 : * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
306 : * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
307 : * particular, this means it is possible to return a value 1 greater than the
308 : * number of keys on the page, if the scankey is > all keys on the page.)
309 : *
310 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
311 : * of the last key < given scankey, or last key <= given scankey if nextkey
312 : * is true. (Since _bt_compare treats the first data key of such a page as
313 : * minus infinity, there will be at least one key < scankey, so the result
314 : * always points at one of the keys on the page.) This key indicates the
315 : * right place to descend to be sure we find all leaf keys >= given scankey
316 : * (or leaf keys > given scankey when nextkey is true).
317 : *
318 : * This procedure is not responsible for walking right, it just examines
319 : * the given page. _bt_binsrch() has no lock or refcount side effects
320 : * on the buffer.
321 : */
322 : OffsetNumber
323 1057962 : _bt_binsrch(Relation rel,
324 : Buffer buf,
325 : int keysz,
326 : ScanKey scankey,
327 : bool nextkey)
328 : {
329 : Page page;
330 : BTPageOpaque opaque;
331 : OffsetNumber low,
332 : high;
333 : int32 result,
334 : cmpval;
335 :
336 1057962 : page = BufferGetPage(buf);
337 1057962 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
338 :
339 1057962 : low = P_FIRSTDATAKEY(opaque);
340 1057962 : high = PageGetMaxOffsetNumber(page);
341 :
342 : /*
343 : * If there are no keys on the page, return the first available slot. Note
344 : * this covers two cases: the page is really empty (no keys), or it
345 : * contains only a high key. The latter case is possible after vacuuming.
346 : * This can never happen on an internal page, however, since they are
347 : * never empty (an internal page must have children).
348 : */
349 1057962 : if (high < low)
350 314 : return low;
351 :
352 : /*
353 : * Binary search to find the first key on the page >= scan key, or first
354 : * key > scankey when nextkey is true.
355 : *
356 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
357 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
358 : *
359 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
360 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
361 : *
362 : * We can fall out when high == low.
363 : */
364 1057648 : high++; /* establish the loop invariant for high */
365 :
366 1057648 : cmpval = nextkey ? 0 : 1; /* select comparison value */
367 :
368 8451159 : while (high > low)
369 : {
370 6335863 : OffsetNumber mid = low + ((high - low) / 2);
371 :
372 : /* We have low <= mid < high, so mid points at a real slot */
373 :
374 6335863 : result = _bt_compare(rel, keysz, scankey, page, mid);
375 :
376 6335863 : if (result >= cmpval)
377 3862847 : low = mid + 1;
378 : else
379 2473016 : high = mid;
380 : }
381 :
382 : /*
383 : * At this point we have high == low, but be careful: they could point
384 : * past the last slot on the page.
385 : *
386 : * On a leaf page, we always return the first key >= scan key (resp. >
387 : * scan key), which could be the last slot + 1.
388 : */
389 1057648 : if (P_ISLEAF(opaque))
390 580690 : return low;
391 :
392 : /*
393 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
394 : * There must be one if _bt_compare() is playing by the rules.
395 : */
396 476958 : Assert(low > P_FIRSTDATAKEY(opaque));
397 :
398 476958 : return OffsetNumberPrev(low);
399 : }
400 :
401 : /*----------
402 : * _bt_compare() -- Compare scankey to a particular tuple on the page.
403 : *
404 : * The passed scankey must be an insertion-type scankey (see nbtree/README),
405 : * but it can omit the rightmost column(s) of the index.
406 : *
407 : * keysz: number of key conditions to be checked (might be less than the
408 : * number of index columns!)
409 : * page/offnum: location of btree item to be compared to.
410 : *
411 : * This routine returns:
412 : * <0 if scankey < tuple at offnum;
413 : * 0 if scankey == tuple at offnum;
414 : * >0 if scankey > tuple at offnum.
415 : * NULLs in the keys are treated as sortable values. Therefore
416 : * "equality" does not necessarily mean that the item should be
417 : * returned to the caller as a matching key!
418 : *
419 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
420 : * "minus infinity": this routine will always claim it is less than the
421 : * scankey. The actual key value stored (if any, which there probably isn't)
422 : * does not matter. This convention allows us to implement the Lehman and
423 : * Yao convention that the first down-link pointer is before the first key.
424 : * See backend/access/nbtree/README for details.
425 : *----------
426 : */
427 : int32
428 6718736 : _bt_compare(Relation rel,
429 : int keysz,
430 : ScanKey scankey,
431 : Page page,
432 : OffsetNumber offnum)
433 : {
434 6718736 : TupleDesc itupdesc = RelationGetDescr(rel);
435 6718736 : BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
436 : IndexTuple itup;
437 : int i;
438 :
439 : /*
440 : * Force result ">" if target item is first data item on an internal page
441 : * --- see NOTE above.
442 : */
443 6718736 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
444 72761 : return 1;
445 :
446 6645975 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
447 :
448 : /*
449 : * The scan key is set up with the attribute number associated with each
450 : * term in the key. It is important that, if the index is multi-key, the
451 : * scan contain the first k key attributes, and that they be in order. If
452 : * you think about how multi-key ordering works, you'll understand why
453 : * this is.
454 : *
455 : * We don't test for violation of this condition here, however. The
456 : * initial setup for the index scan had better have gotten it right (see
457 : * _bt_first).
458 : */
459 :
460 9353708 : for (i = 1; i <= keysz; i++)
461 : {
462 : Datum datum;
463 : bool isNull;
464 : int32 result;
465 :
466 8840093 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
467 :
468 : /* see comments about NULLs handling in btbuild */
469 8840093 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
470 : {
471 2226 : if (isNull)
472 22 : result = 0; /* NULL "=" NULL */
473 2204 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
474 44 : result = -1; /* NULL "<" NOT_NULL */
475 : else
476 2160 : result = 1; /* NULL ">" NOT_NULL */
477 : }
478 8837867 : else if (isNull) /* key is NOT_NULL and item is NULL */
479 : {
480 33 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
481 0 : result = 1; /* NOT_NULL ">" NULL */
482 : else
483 33 : result = -1; /* NOT_NULL "<" NULL */
484 : }
485 : else
486 : {
487 : /*
488 : * The sk_func needs to be passed the index value as left arg and
489 : * the sk_argument as right arg (they might be of different
490 : * types). Since it is convenient for callers to think of
491 : * _bt_compare as comparing the scankey to the index item, we have
492 : * to flip the sign of the comparison result. (Unless it's a DESC
493 : * column, in which case we *don't* flip the sign.)
494 : */
495 8837834 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
496 : scankey->sk_collation,
497 : datum,
498 : scankey->sk_argument));
499 :
500 8837834 : if (!(scankey->sk_flags & SK_BT_DESC))
501 8837833 : result = -result;
502 : }
503 :
504 : /* if the keys are unequal, return the difference */
505 8840093 : if (result != 0)
506 6132360 : return result;
507 :
508 2707733 : scankey++;
509 : }
510 :
511 : /* if we get here, the keys are equal */
512 513615 : return 0;
513 : }
514 :
515 : /*
516 : * _bt_first() -- Find the first item in a scan.
517 : *
518 : * We need to be clever about the direction of scan, the search
519 : * conditions, and the tree ordering. We find the first item (or,
520 : * if backwards scan, the last item) in the tree that satisfies the
521 : * qualifications in the scan key. On success exit, the page containing
522 : * the current index tuple is pinned but not locked, and data about
523 : * the matching tuple(s) on the page has been loaded into so->currPos.
524 : * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
525 : * and if requested, scan->xs_itup points to a copy of the index tuple.
526 : *
527 : * If there are no matching items in the index, we return FALSE, with no
528 : * pins or locks held.
529 : *
530 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
531 : * are both search-type scankeys (see nbtree/README for more about this).
532 : * Within this routine, we build a temporary insertion-type scankey to use
533 : * in locating the scan start position.
534 : */
535 : bool
536 389698 : _bt_first(IndexScanDesc scan, ScanDirection dir)
537 : {
538 389698 : Relation rel = scan->indexRelation;
539 389698 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
540 : Buffer buf;
541 : BTStack stack;
542 : OffsetNumber offnum;
543 : StrategyNumber strat;
544 : bool nextkey;
545 : bool goback;
546 : ScanKey startKeys[INDEX_MAX_KEYS];
547 : ScanKeyData scankeys[INDEX_MAX_KEYS];
548 : ScanKeyData notnullkeys[INDEX_MAX_KEYS];
549 389698 : int keysCount = 0;
550 : int i;
551 389698 : bool status = true;
552 : StrategyNumber strat_total;
553 : BTScanPosItem *currItem;
554 : BlockNumber blkno;
555 :
556 389698 : Assert(!BTScanPosIsValid(so->currPos));
557 :
558 389698 : pgstat_count_index_scan(rel);
559 :
560 : /*
561 : * Examine the scan keys and eliminate any redundant keys; also mark the
562 : * keys that must be matched to continue the scan.
563 : */
564 389698 : _bt_preprocess_keys(scan);
565 :
566 : /*
567 : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
568 : * never be satisfied (eg, x == 1 AND x > 2).
569 : */
570 389698 : if (!so->qual_ok)
571 5 : return false;
572 :
573 : /*
574 : * For parallel scans, get the starting page from shared state. If the
575 : * scan has not started, proceed to find out first leaf page in the usual
576 : * way while keeping other participating processes waiting. If the scan
577 : * has already begun, use the page number from the shared structure.
578 : */
579 389693 : if (scan->parallel_scan != NULL)
580 : {
581 45 : status = _bt_parallel_seize(scan, &blkno);
582 45 : if (!status)
583 36 : return false;
584 9 : else if (blkno == P_NONE)
585 : {
586 0 : _bt_parallel_done(scan);
587 0 : return false;
588 : }
589 9 : else if (blkno != InvalidBlockNumber)
590 : {
591 0 : if (!_bt_parallel_readpage(scan, blkno, dir))
592 0 : return false;
593 0 : goto readcomplete;
594 : }
595 : }
596 :
597 : /*----------
598 : * Examine the scan keys to discover where we need to start the scan.
599 : *
600 : * We want to identify the keys that can be used as starting boundaries;
601 : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
602 : * a backwards scan. We can use keys for multiple attributes so long as
603 : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
604 : * a > or < boundary or find an attribute with no boundary (which can be
605 : * thought of as the same as "> -infinity"), we can't use keys for any
606 : * attributes to its right, because it would break our simplistic notion
607 : * of what initial positioning strategy to use.
608 : *
609 : * When the scan keys include cross-type operators, _bt_preprocess_keys
610 : * may not be able to eliminate redundant keys; in such cases we will
611 : * arbitrarily pick a usable one for each attribute. This is correct
612 : * but possibly not optimal behavior. (For example, with keys like
613 : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
614 : * x=5 would be more efficient.) Since the situation only arises given
615 : * a poorly-worded query plus an incomplete opfamily, live with it.
616 : *
617 : * When both equality and inequality keys appear for a single attribute
618 : * (again, only possible when cross-type operators appear), we *must*
619 : * select one of the equality keys for the starting point, because
620 : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
621 : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
622 : * start at x=4, we will fail and stop before reaching x=10. If multiple
623 : * equality quals survive preprocessing, however, it doesn't matter which
624 : * one we use --- by definition, they are either redundant or
625 : * contradictory.
626 : *
627 : * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
628 : * If the index stores nulls at the end of the index we'll be starting
629 : * from, and we have no boundary key for the column (which means the key
630 : * we deduced NOT NULL from is an inequality key that constrains the other
631 : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
632 : * use as a boundary key. If we didn't do this, we might find ourselves
633 : * traversing a lot of null entries at the start of the scan.
634 : *
635 : * In this loop, row-comparison keys are treated the same as keys on their
636 : * first (leftmost) columns. We'll add on lower-order columns of the row
637 : * comparison below, if possible.
638 : *
639 : * The selected scan keys (at most one per index column) are remembered by
640 : * storing their addresses into the local startKeys[] array.
641 : *----------
642 : */
643 389657 : strat_total = BTEqualStrategyNumber;
644 389657 : if (so->numberOfKeys > 0)
645 : {
646 : AttrNumber curattr;
647 : ScanKey chosen;
648 : ScanKey impliesNN;
649 : ScanKey cur;
650 :
651 : /*
652 : * chosen is the so-far-chosen key for the current attribute, if any.
653 : * We don't cast the decision in stone until we reach keys for the
654 : * next attribute.
655 : */
656 389317 : curattr = 1;
657 389317 : chosen = NULL;
658 : /* Also remember any scankey that implies a NOT NULL constraint */
659 389317 : impliesNN = NULL;
660 :
661 : /*
662 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
663 : * pass to handle after-last-key processing. Actual exit from the
664 : * loop is at one of the "break" statements below.
665 : */
666 1056968 : for (cur = so->keyData, i = 0;; cur++, i++)
667 : {
668 1056968 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
669 : {
670 : /*
671 : * Done looking at keys for curattr. If we didn't find a
672 : * usable boundary key, see if we can deduce a NOT NULL key.
673 : */
674 668059 : if (chosen == NULL && impliesNN != NULL &&
675 458 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
676 : ScanDirectionIsForward(dir) :
677 : ScanDirectionIsBackward(dir)))
678 : {
679 : /* Yes, so build the key in notnullkeys[keysCount] */
680 1 : chosen = ¬nullkeys[keysCount];
681 2 : ScanKeyEntryInitialize(chosen,
682 : (SK_SEARCHNOTNULL | SK_ISNULL |
683 1 : (impliesNN->sk_flags &
684 : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
685 : curattr,
686 1 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
687 : BTGreaterStrategyNumber :
688 : BTLessStrategyNumber),
689 : InvalidOid,
690 : InvalidOid,
691 : InvalidOid,
692 : (Datum) 0);
693 : }
694 :
695 : /*
696 : * If we still didn't find a usable boundary key, quit; else
697 : * save the boundary key pointer in startKeys.
698 : */
699 667601 : if (chosen == NULL)
700 480 : break;
701 667121 : startKeys[keysCount++] = chosen;
702 :
703 : /*
704 : * Adjust strat_total, and quit if we have stored a > or <
705 : * key.
706 : */
707 667121 : strat = chosen->sk_strategy;
708 667121 : if (strat != BTEqualStrategyNumber)
709 : {
710 26944 : strat_total = strat;
711 26944 : if (strat == BTGreaterStrategyNumber ||
712 : strat == BTLessStrategyNumber)
713 : break;
714 : }
715 :
716 : /*
717 : * Done if that was the last attribute, or if next key is not
718 : * in sequence (implying no boundary key is available for the
719 : * next attribute).
720 : */
721 919007 : if (i >= so->numberOfKeys ||
722 278311 : cur->sk_attno != curattr + 1)
723 : break;
724 :
725 : /*
726 : * Reset for next attr.
727 : */
728 278284 : curattr = cur->sk_attno;
729 278284 : chosen = NULL;
730 278284 : impliesNN = NULL;
731 : }
732 :
733 : /*
734 : * Can we use this key as a starting boundary for this attr?
735 : *
736 : * If not, does it imply a NOT NULL constraint? (Because
737 : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
738 : * *any* inequality key works for that; we need not test.)
739 : */
740 667651 : switch (cur->sk_strategy)
741 : {
742 : case BTLessStrategyNumber:
743 : case BTLessEqualStrategyNumber:
744 774 : if (chosen == NULL)
745 : {
746 703 : if (ScanDirectionIsBackward(dir))
747 248 : chosen = cur;
748 : else
749 455 : impliesNN = cur;
750 : }
751 774 : break;
752 : case BTEqualStrategyNumber:
753 : /* override any non-equality choice */
754 640177 : chosen = cur;
755 640177 : break;
756 : case BTGreaterEqualStrategyNumber:
757 : case BTGreaterStrategyNumber:
758 26700 : if (chosen == NULL)
759 : {
760 26700 : if (ScanDirectionIsForward(dir))
761 26695 : chosen = cur;
762 : else
763 5 : impliesNN = cur;
764 : }
765 26700 : break;
766 : }
767 667651 : }
768 : }
769 :
770 : /*
771 : * If we found no usable boundary keys, we have to start from one end of
772 : * the tree. Walk down that edge to the first or last key, and scan from
773 : * there.
774 : */
775 389657 : if (keysCount == 0)
776 : {
777 : bool match;
778 :
779 813 : match = _bt_endpoint(scan, dir);
780 :
781 813 : if (!match)
782 : {
783 : /* No match, so mark (parallel) scan finished */
784 217 : _bt_parallel_done(scan);
785 : }
786 :
787 813 : return match;
788 : }
789 :
790 : /*
791 : * We want to start the scan somewhere within the index. Set up an
792 : * insertion scankey we can use to search for the boundary point we
793 : * identified above. The insertion scankey is built in the local
794 : * scankeys[] array, using the keys identified by startKeys[].
795 : */
796 388844 : Assert(keysCount <= INDEX_MAX_KEYS);
797 1055963 : for (i = 0; i < keysCount; i++)
798 : {
799 667121 : ScanKey cur = startKeys[i];
800 :
801 667121 : Assert(cur->sk_attno == i + 1);
802 :
803 667121 : if (cur->sk_flags & SK_ROW_HEADER)
804 : {
805 : /*
806 : * Row comparison header: look to the first row member instead.
807 : *
808 : * The member scankeys are already in insertion format (ie, they
809 : * have sk_func = 3-way-comparison function), but we have to watch
810 : * out for nulls, which _bt_preprocess_keys didn't check. A null
811 : * in the first row member makes the condition unmatchable, just
812 : * like qual_ok = false.
813 : */
814 2 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
815 :
816 2 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
817 2 : if (subkey->sk_flags & SK_ISNULL)
818 : {
819 0 : _bt_parallel_done(scan);
820 0 : return false;
821 : }
822 2 : memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
823 :
824 : /*
825 : * If the row comparison is the last positioning key we accepted,
826 : * try to add additional keys from the lower-order row members.
827 : * (If we accepted independent conditions on additional index
828 : * columns, we use those instead --- doesn't seem worth trying to
829 : * determine which is more restrictive.) Note that this is OK
830 : * even if the row comparison is of ">" or "<" type, because the
831 : * condition applied to all but the last row member is effectively
832 : * ">=" or "<=", and so the extra keys don't break the positioning
833 : * scheme. But, by the same token, if we aren't able to use all
834 : * the row members, then the part of the row comparison that we
835 : * did use has to be treated as just a ">=" or "<=" condition, and
836 : * so we'd better adjust strat_total accordingly.
837 : */
838 2 : if (i == keysCount - 1)
839 : {
840 2 : bool used_all_subkeys = false;
841 :
842 2 : Assert(!(subkey->sk_flags & SK_ROW_END));
843 : for (;;)
844 : {
845 2 : subkey++;
846 2 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
847 2 : if (subkey->sk_attno != keysCount + 1)
848 0 : break; /* out-of-sequence, can't use it */
849 2 : if (subkey->sk_strategy != cur->sk_strategy)
850 0 : break; /* wrong direction, can't use it */
851 2 : if (subkey->sk_flags & SK_ISNULL)
852 0 : break; /* can't use null keys */
853 2 : Assert(keysCount < INDEX_MAX_KEYS);
854 2 : memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
855 2 : keysCount++;
856 2 : if (subkey->sk_flags & SK_ROW_END)
857 : {
858 2 : used_all_subkeys = true;
859 2 : break;
860 : }
861 0 : }
862 2 : if (!used_all_subkeys)
863 : {
864 0 : switch (strat_total)
865 : {
866 : case BTLessStrategyNumber:
867 0 : strat_total = BTLessEqualStrategyNumber;
868 0 : break;
869 : case BTGreaterStrategyNumber:
870 0 : strat_total = BTGreaterEqualStrategyNumber;
871 0 : break;
872 : }
873 : }
874 2 : break; /* done with outer loop */
875 : }
876 : }
877 : else
878 : {
879 : /*
880 : * Ordinary comparison key. Transform the search-style scan key
881 : * to an insertion scan key by replacing the sk_func with the
882 : * appropriate btree comparison function.
883 : *
884 : * If scankey operator is not a cross-type comparison, we can use
885 : * the cached comparison function; otherwise gotta look it up in
886 : * the catalogs. (That can't lead to infinite recursion, since no
887 : * indexscan initiated by syscache lookup will use cross-data-type
888 : * operators.)
889 : *
890 : * We support the convention that sk_subtype == InvalidOid means
891 : * the opclass input type; this is a hack to simplify life for
892 : * ScanKeyInit().
893 : */
894 1294774 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
895 627655 : cur->sk_subtype == InvalidOid)
896 666373 : {
897 : FmgrInfo *procinfo;
898 :
899 666373 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
900 1332746 : ScanKeyEntryInitializeWithInfo(scankeys + i,
901 : cur->sk_flags,
902 666373 : cur->sk_attno,
903 : InvalidStrategy,
904 : cur->sk_subtype,
905 : cur->sk_collation,
906 : procinfo,
907 : cur->sk_argument);
908 : }
909 : else
910 : {
911 : RegProcedure cmp_proc;
912 :
913 1492 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
914 746 : rel->rd_opcintype[i],
915 : cur->sk_subtype,
916 : BTORDER_PROC);
917 746 : if (!RegProcedureIsValid(cmp_proc))
918 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
919 : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
920 : cur->sk_attno, RelationGetRelationName(rel));
921 1492 : ScanKeyEntryInitialize(scankeys + i,
922 : cur->sk_flags,
923 746 : cur->sk_attno,
924 : InvalidStrategy,
925 : cur->sk_subtype,
926 : cur->sk_collation,
927 : cmp_proc,
928 : cur->sk_argument);
929 : }
930 : }
931 : }
932 :
933 : /*----------
934 : * Examine the selected initial-positioning strategy to determine exactly
935 : * where we need to start the scan, and set flag variables to control the
936 : * code below.
937 : *
938 : * If nextkey = false, _bt_search and _bt_binsrch will locate the first
939 : * item >= scan key. If nextkey = true, they will locate the first
940 : * item > scan key.
941 : *
942 : * If goback = true, we will then step back one item, while if
943 : * goback = false, we will start the scan on the located item.
944 : *----------
945 : */
946 388844 : switch (strat_total)
947 : {
948 : case BTLessStrategyNumber:
949 :
950 : /*
951 : * Find first item >= scankey, then back up one to arrive at last
952 : * item < scankey. (Note: this positioning strategy is only used
953 : * for a backward scan, so that is always the correct starting
954 : * position.)
955 : */
956 249 : nextkey = false;
957 249 : goback = true;
958 249 : break;
959 :
960 : case BTLessEqualStrategyNumber:
961 :
962 : /*
963 : * Find first item > scankey, then back up one to arrive at last
964 : * item <= scankey. (Note: this positioning strategy is only used
965 : * for a backward scan, so that is always the correct starting
966 : * position.)
967 : */
968 0 : nextkey = true;
969 0 : goback = true;
970 0 : break;
971 :
972 : case BTEqualStrategyNumber:
973 :
974 : /*
975 : * If a backward scan was specified, need to start with last equal
976 : * item not first one.
977 : */
978 361900 : if (ScanDirectionIsBackward(dir))
979 : {
980 : /*
981 : * This is the same as the <= strategy. We will check at the
982 : * end whether the found item is actually =.
983 : */
984 20 : nextkey = true;
985 20 : goback = true;
986 : }
987 : else
988 : {
989 : /*
990 : * This is the same as the >= strategy. We will check at the
991 : * end whether the found item is actually =.
992 : */
993 361880 : nextkey = false;
994 361880 : goback = false;
995 : }
996 361900 : break;
997 :
998 : case BTGreaterEqualStrategyNumber:
999 :
1000 : /*
1001 : * Find first item >= scankey. (This is only used for forward
1002 : * scans.)
1003 : */
1004 519 : nextkey = false;
1005 519 : goback = false;
1006 519 : break;
1007 :
1008 : case BTGreaterStrategyNumber:
1009 :
1010 : /*
1011 : * Find first item > scankey. (This is only used for forward
1012 : * scans.)
1013 : */
1014 26176 : nextkey = true;
1015 26176 : goback = false;
1016 26176 : break;
1017 :
1018 : default:
1019 : /* can't get here, but keep compiler quiet */
1020 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1021 : return false;
1022 : }
1023 :
1024 : /*
1025 : * Use the manufactured insertion scan key to descend the tree and
1026 : * position ourselves on the target leaf page.
1027 : */
1028 388844 : stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
1029 : scan->xs_snapshot);
1030 :
1031 : /* don't need to keep the stack around... */
1032 388844 : _bt_freestack(stack);
1033 :
1034 388844 : if (!BufferIsValid(buf))
1035 : {
1036 : /*
1037 : * We only get here if the index is completely empty. Lock relation
1038 : * because nothing finer to lock exists.
1039 : */
1040 11642 : PredicateLockRelation(rel, scan->xs_snapshot);
1041 :
1042 : /*
1043 : * mark parallel scan as done, so that all the workers can finish
1044 : * their scan
1045 : */
1046 11642 : _bt_parallel_done(scan);
1047 11642 : BTScanPosInvalidate(so->currPos);
1048 :
1049 11642 : return false;
1050 : }
1051 : else
1052 377202 : PredicateLockPage(rel, BufferGetBlockNumber(buf),
1053 : scan->xs_snapshot);
1054 :
1055 377202 : _bt_initialize_more_data(so, dir);
1056 :
1057 : /* position to the precise item on the page */
1058 377202 : offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1059 :
1060 : /*
1061 : * If nextkey = false, we are positioned at the first item >= scan key, or
1062 : * possibly at the end of a page on which all the existing items are less
1063 : * than the scan key and we know that everything on later pages is greater
1064 : * than or equal to scan key.
1065 : *
1066 : * If nextkey = true, we are positioned at the first item > scan key, or
1067 : * possibly at the end of a page on which all the existing items are less
1068 : * than or equal to the scan key and we know that everything on later
1069 : * pages is greater than scan key.
1070 : *
1071 : * The actually desired starting point is either this item or the prior
1072 : * one, or in the end-of-page case it's the first item on the next page or
1073 : * the last item on this page. Adjust the starting offset if needed. (If
1074 : * this results in an offset before the first item or after the last one,
1075 : * _bt_readpage will report no items found, and then we'll step to the
1076 : * next page as needed.)
1077 : */
1078 377202 : if (goback)
1079 269 : offnum = OffsetNumberPrev(offnum);
1080 :
1081 : /* remember which buffer we have pinned, if any */
1082 377202 : Assert(!BTScanPosIsValid(so->currPos));
1083 377202 : so->currPos.buf = buf;
1084 :
1085 : /*
1086 : * Now load data from the first page of the scan.
1087 : */
1088 377202 : if (!_bt_readpage(scan, dir, offnum))
1089 : {
1090 : /*
1091 : * There's no actually-matching data on this page. Try to advance to
1092 : * the next page. Return false if there's no matching data at all.
1093 : */
1094 101924 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1095 101924 : if (!_bt_steppage(scan, dir))
1096 101728 : return false;
1097 : }
1098 : else
1099 : {
1100 : /* Drop the lock, and maybe the pin, on the current page */
1101 275278 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1102 : }
1103 :
1104 : readcomplete:
1105 : /* OK, itemIndex says what to return */
1106 275474 : currItem = &so->currPos.items[so->currPos.itemIndex];
1107 275474 : scan->xs_ctup.t_self = currItem->heapTid;
1108 275474 : if (scan->xs_want_itup)
1109 2087 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1110 :
1111 275474 : return true;
1112 : }
1113 :
1114 : /*
1115 : * _bt_next() -- Get the next item in a scan.
1116 : *
1117 : * On entry, so->currPos describes the current page, which may be pinned
1118 : * but is not locked, and so->currPos.itemIndex identifies which item was
1119 : * previously returned.
1120 : *
1121 : * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1122 : * next heap tuple, and if requested, scan->xs_itup points to a copy of
1123 : * the index tuple. so->currPos is updated as needed.
1124 : *
1125 : * On failure exit (no more tuples), we release pin and set
1126 : * so->currPos.buf to InvalidBuffer.
1127 : */
1128 : bool
1129 789662 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1130 : {
1131 789662 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1132 : BTScanPosItem *currItem;
1133 :
1134 : /*
1135 : * Advance to next tuple on current page; or if there's no more, try to
1136 : * step to the next page with data.
1137 : */
1138 789662 : if (ScanDirectionIsForward(dir))
1139 : {
1140 788668 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1141 : {
1142 80857 : if (!_bt_steppage(scan, dir))
1143 78761 : return false;
1144 : }
1145 : }
1146 : else
1147 : {
1148 994 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1149 : {
1150 12 : if (!_bt_steppage(scan, dir))
1151 10 : return false;
1152 : }
1153 : }
1154 :
1155 : /* OK, itemIndex says what to return */
1156 710891 : currItem = &so->currPos.items[so->currPos.itemIndex];
1157 710891 : scan->xs_ctup.t_self = currItem->heapTid;
1158 710891 : if (scan->xs_want_itup)
1159 443903 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1160 :
1161 710891 : return true;
1162 : }
1163 :
1164 : /*
1165 : * _bt_readpage() -- Load data from current index page into so->currPos
1166 : *
1167 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1168 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1169 : * they are updated as appropriate. All other fields of so->currPos are
1170 : * initialized from scratch here.
1171 : *
1172 : * We scan the current page starting at offnum and moving in the indicated
1173 : * direction. All items matching the scan keys are loaded into currPos.items.
1174 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1175 : * that there can be no more matching tuples in the current scan direction.
1176 : *
1177 : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1178 : * prior to calling this function; this function will invoke
1179 : * _bt_parallel_release before returning.
1180 : *
1181 : * Returns true if any matching items found on the page, false if none.
1182 : */
1183 : static bool
1184 380345 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1185 : {
1186 380345 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1187 : Page page;
1188 : BTPageOpaque opaque;
1189 : OffsetNumber minoff;
1190 : OffsetNumber maxoff;
1191 : int itemIndex;
1192 : IndexTuple itup;
1193 : bool continuescan;
1194 :
1195 : /*
1196 : * We must have the buffer pinned and locked, but the usual macro can't be
1197 : * used here; this function is what makes it good for currPos.
1198 : */
1199 380345 : Assert(BufferIsValid(so->currPos.buf));
1200 :
1201 380345 : page = BufferGetPage(so->currPos.buf);
1202 380345 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1203 :
1204 : /* allow next page be processed by parallel worker */
1205 380345 : if (scan->parallel_scan)
1206 : {
1207 208 : if (ScanDirectionIsForward(dir))
1208 208 : _bt_parallel_release(scan, opaque->btpo_next);
1209 : else
1210 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1211 : }
1212 :
1213 380345 : minoff = P_FIRSTDATAKEY(opaque);
1214 380345 : maxoff = PageGetMaxOffsetNumber(page);
1215 :
1216 : /*
1217 : * We note the buffer's block number so that we can release the pin later.
1218 : * This allows us to re-read the buffer if it is needed again for hinting.
1219 : */
1220 380345 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1221 :
1222 : /*
1223 : * We save the LSN of the page as we read it, so that we know whether it
1224 : * safe to apply LP_DEAD hints to the page later. This allows us to drop
1225 : * the pin for MVCC scans, which allows vacuum to avoid blocking.
1226 : */
1227 380345 : so->currPos.lsn = PageGetLSN(page);
1228 :
1229 : /*
1230 : * we must save the page's right-link while scanning it; this tells us
1231 : * where to step right to after we're done with these items. There is no
1232 : * corresponding need for the left-link, since splits always go right.
1233 : */
1234 380345 : so->currPos.nextPage = opaque->btpo_next;
1235 :
1236 : /* initialize tuple workspace to empty */
1237 380345 : so->currPos.nextTupleOffset = 0;
1238 :
1239 : /*
1240 : * Now that the current page has been made consistent, the macro should be
1241 : * good.
1242 : */
1243 380345 : Assert(BTScanPosIsPinned(so->currPos));
1244 :
1245 380345 : if (ScanDirectionIsForward(dir))
1246 : {
1247 : /* load items[] in ascending order */
1248 380064 : itemIndex = 0;
1249 :
1250 380064 : offnum = Max(offnum, minoff);
1251 :
1252 2851937 : while (offnum <= maxoff)
1253 : {
1254 2394655 : itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1255 2394655 : if (itup != NULL)
1256 : {
1257 : /* tuple passes all scan key conditions, so remember it */
1258 2009047 : _bt_saveitem(so, itemIndex, offnum, itup);
1259 2009047 : itemIndex++;
1260 : }
1261 2394655 : if (!continuescan)
1262 : {
1263 : /* there can't be any more matches, so stop */
1264 302846 : so->currPos.moreRight = false;
1265 302846 : break;
1266 : }
1267 :
1268 2091809 : offnum = OffsetNumberNext(offnum);
1269 : }
1270 :
1271 380064 : Assert(itemIndex <= MaxIndexTuplesPerPage);
1272 380064 : so->currPos.firstItem = 0;
1273 380064 : so->currPos.lastItem = itemIndex - 1;
1274 380064 : so->currPos.itemIndex = 0;
1275 : }
1276 : else
1277 : {
1278 : /* load items[] in descending order */
1279 281 : itemIndex = MaxIndexTuplesPerPage;
1280 :
1281 281 : offnum = Min(offnum, maxoff);
1282 :
1283 51510 : while (offnum >= minoff)
1284 : {
1285 50961 : itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1286 50961 : if (itup != NULL)
1287 : {
1288 : /* tuple passes all scan key conditions, so remember it */
1289 48619 : itemIndex--;
1290 48619 : _bt_saveitem(so, itemIndex, offnum, itup);
1291 : }
1292 50961 : if (!continuescan)
1293 : {
1294 : /* there can't be any more matches, so stop */
1295 13 : so->currPos.moreLeft = false;
1296 13 : break;
1297 : }
1298 :
1299 50948 : offnum = OffsetNumberPrev(offnum);
1300 : }
1301 :
1302 281 : Assert(itemIndex >= 0);
1303 281 : so->currPos.firstItem = itemIndex;
1304 281 : so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1305 281 : so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1306 : }
1307 :
1308 380345 : return (so->currPos.firstItem <= so->currPos.lastItem);
1309 : }
1310 :
1311 : /* Save an index item into so->currPos.items[itemIndex] */
1312 : static void
1313 2057666 : _bt_saveitem(BTScanOpaque so, int itemIndex,
1314 : OffsetNumber offnum, IndexTuple itup)
1315 : {
1316 2057666 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1317 :
1318 2057666 : currItem->heapTid = itup->t_tid;
1319 2057666 : currItem->indexOffset = offnum;
1320 2057666 : if (so->currTuples)
1321 : {
1322 459395 : Size itupsz = IndexTupleSize(itup);
1323 :
1324 459395 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1325 459395 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1326 459395 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1327 : }
1328 2057666 : }
1329 :
1330 : /*
1331 : * _bt_steppage() -- Step to next page containing valid data for scan
1332 : *
1333 : * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1334 : * if pinned, we'll drop the pin before moving to next page. The buffer is
1335 : * not locked on entry.
1336 : *
1337 : * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1338 : * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1339 : * to InvalidBuffer. We return TRUE to indicate success.
1340 : */
1341 : static bool
1342 182802 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1343 : {
1344 182802 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1345 182802 : BlockNumber blkno = InvalidBlockNumber;
1346 182802 : bool status = true;
1347 :
1348 182802 : Assert(BTScanPosIsValid(so->currPos));
1349 :
1350 : /* Before leaving current page, deal with any killed items */
1351 182802 : if (so->numKilled > 0)
1352 1952 : _bt_killitems(scan);
1353 :
1354 : /*
1355 : * Before we modify currPos, make a copy of the page data if there was a
1356 : * mark position that needs it.
1357 : */
1358 182802 : if (so->markItemIndex >= 0)
1359 : {
1360 : /* bump pin on current buffer for assignment to mark buffer */
1361 46 : if (BTScanPosIsPinned(so->currPos))
1362 44 : IncrBufferRefCount(so->currPos.buf);
1363 46 : memcpy(&so->markPos, &so->currPos,
1364 : offsetof(BTScanPosData, items[1]) +
1365 46 : so->currPos.lastItem * sizeof(BTScanPosItem));
1366 46 : if (so->markTuples)
1367 44 : memcpy(so->markTuples, so->currTuples,
1368 44 : so->currPos.nextTupleOffset);
1369 46 : so->markPos.itemIndex = so->markItemIndex;
1370 46 : so->markItemIndex = -1;
1371 : }
1372 :
1373 182802 : if (ScanDirectionIsForward(dir))
1374 : {
1375 : /* Walk right to the next page with data */
1376 182788 : if (scan->parallel_scan != NULL)
1377 : {
1378 : /*
1379 : * Seize the scan to get the next block number; if the scan has
1380 : * ended already, bail out.
1381 : */
1382 208 : status = _bt_parallel_seize(scan, &blkno);
1383 208 : if (!status)
1384 : {
1385 : /* release the previous buffer, if pinned */
1386 0 : BTScanPosUnpinIfPinned(so->currPos);
1387 0 : BTScanPosInvalidate(so->currPos);
1388 0 : return false;
1389 : }
1390 : }
1391 : else
1392 : {
1393 : /* Not parallel, so use the previously-saved nextPage link. */
1394 182580 : blkno = so->currPos.nextPage;
1395 : }
1396 :
1397 : /* Remember we left a page with data */
1398 182788 : so->currPos.moreLeft = true;
1399 :
1400 : /* release the previous buffer, if pinned */
1401 182788 : BTScanPosUnpinIfPinned(so->currPos);
1402 : }
1403 : else
1404 : {
1405 : /* Remember we left a page with data */
1406 14 : so->currPos.moreRight = true;
1407 :
1408 14 : if (scan->parallel_scan != NULL)
1409 : {
1410 : /*
1411 : * Seize the scan to get the current block number; if the scan has
1412 : * ended already, bail out.
1413 : */
1414 0 : status = _bt_parallel_seize(scan, &blkno);
1415 0 : BTScanPosUnpinIfPinned(so->currPos);
1416 0 : if (!status)
1417 : {
1418 0 : BTScanPosInvalidate(so->currPos);
1419 0 : return false;
1420 : }
1421 : }
1422 : else
1423 : {
1424 : /* Not parallel, so just use our own notion of the current page */
1425 14 : blkno = so->currPos.currPage;
1426 : }
1427 : }
1428 :
1429 182802 : if (!_bt_readnextpage(scan, blkno, dir))
1430 180505 : return false;
1431 :
1432 : /* Drop the lock, and maybe the pin, on the current page */
1433 2297 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1434 :
1435 2297 : return true;
1436 : }
1437 :
1438 : /*
1439 : * _bt_readnextpage() -- Read next page containing valid data for scan
1440 : *
1441 : * On success exit, so->currPos is updated to contain data from the next
1442 : * interesting page. Caller is responsible to release lock and pin on
1443 : * buffer on success. We return TRUE to indicate success.
1444 : *
1445 : * If there are no more matching records in the given direction, we drop all
1446 : * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
1447 : */
1448 : static bool
1449 182802 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1450 : {
1451 182802 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1452 : Relation rel;
1453 : Page page;
1454 : BTPageOpaque opaque;
1455 182802 : bool status = true;
1456 :
1457 182802 : rel = scan->indexRelation;
1458 :
1459 182802 : if (ScanDirectionIsForward(dir))
1460 : {
1461 : for (;;)
1462 : {
1463 : /*
1464 : * if we're at end of scan, give up and mark parallel scan as
1465 : * done, so that all the workers can finish their scan
1466 : */
1467 183031 : if (blkno == P_NONE || !so->currPos.moreRight)
1468 : {
1469 180494 : _bt_parallel_done(scan);
1470 180494 : BTScanPosInvalidate(so->currPos);
1471 180494 : return false;
1472 : }
1473 : /* check for interrupts while we're not holding any buffer lock */
1474 2537 : CHECK_FOR_INTERRUPTS();
1475 : /* step right one page */
1476 2537 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1477 2537 : page = BufferGetPage(so->currPos.buf);
1478 2537 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
1479 2537 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1480 : /* check for deleted page */
1481 2537 : if (!P_IGNORE(opaque))
1482 : {
1483 2537 : PredicateLockPage(rel, blkno, scan->xs_snapshot);
1484 : /* see if there are any matches on this page */
1485 : /* note that this will clear moreRight if we can stop */
1486 2537 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1487 2294 : break;
1488 : }
1489 :
1490 : /* nope, keep going */
1491 243 : if (scan->parallel_scan != NULL)
1492 : {
1493 0 : status = _bt_parallel_seize(scan, &blkno);
1494 0 : if (!status)
1495 : {
1496 0 : _bt_relbuf(rel, so->currPos.buf);
1497 0 : BTScanPosInvalidate(so->currPos);
1498 0 : return false;
1499 : }
1500 : }
1501 : else
1502 243 : blkno = opaque->btpo_next;
1503 243 : _bt_relbuf(rel, so->currPos.buf);
1504 243 : }
1505 : }
1506 : else
1507 : {
1508 : /*
1509 : * Should only happen in parallel cases, when some other backend
1510 : * advanced the scan.
1511 : */
1512 14 : if (so->currPos.currPage != blkno)
1513 : {
1514 0 : BTScanPosUnpinIfPinned(so->currPos);
1515 0 : so->currPos.currPage = blkno;
1516 : }
1517 :
1518 : /*
1519 : * Walk left to the next page with data. This is much more complex
1520 : * than the walk-right case because of the possibility that the page
1521 : * to our left splits while we are in flight to it, plus the
1522 : * possibility that the page we were on gets deleted after we leave
1523 : * it. See nbtree/README for details.
1524 : *
1525 : * It might be possible to rearrange this code to have less overhead
1526 : * in pinning and locking, but that would require capturing the left
1527 : * pointer when the page is initially read, and using it here, along
1528 : * with big changes to _bt_walk_left() and the code below. It is not
1529 : * clear whether this would be a win, since if the page immediately to
1530 : * the left splits after we read this page and before we step left, we
1531 : * would need to visit more pages than with the current code.
1532 : *
1533 : * Note that if we change the code so that we drop the pin for a scan
1534 : * which uses a non-MVCC snapshot, we will need to modify the code for
1535 : * walking left, to allow for the possibility that a referenced page
1536 : * has been deleted. As long as the buffer is pinned or the snapshot
1537 : * is MVCC the page cannot move past the half-dead state to fully
1538 : * deleted.
1539 : */
1540 14 : if (BTScanPosIsPinned(so->currPos))
1541 7 : LockBuffer(so->currPos.buf, BT_READ);
1542 : else
1543 7 : so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1544 :
1545 : for (;;)
1546 : {
1547 : /* Done if we know there are no matching keys to the left */
1548 15 : if (!so->currPos.moreLeft)
1549 : {
1550 7 : _bt_relbuf(rel, so->currPos.buf);
1551 7 : _bt_parallel_done(scan);
1552 7 : BTScanPosInvalidate(so->currPos);
1553 7 : return false;
1554 : }
1555 :
1556 : /* Step to next physical page */
1557 8 : so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1558 : scan->xs_snapshot);
1559 :
1560 : /* if we're physically at end of index, return failure */
1561 8 : if (so->currPos.buf == InvalidBuffer)
1562 : {
1563 4 : _bt_parallel_done(scan);
1564 4 : BTScanPosInvalidate(so->currPos);
1565 4 : return false;
1566 : }
1567 :
1568 : /*
1569 : * Okay, we managed to move left to a non-deleted page. Done if
1570 : * it's not half-dead and contains matching tuples. Else loop back
1571 : * and do it all again.
1572 : */
1573 4 : page = BufferGetPage(so->currPos.buf);
1574 4 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
1575 4 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1576 4 : if (!P_IGNORE(opaque))
1577 : {
1578 4 : PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
1579 : /* see if there are any matches on this page */
1580 : /* note that this will clear moreLeft if we can stop */
1581 4 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1582 3 : break;
1583 : }
1584 :
1585 : /*
1586 : * For parallel scans, get the last page scanned as it is quite
1587 : * possible that by the time we try to seize the scan, some other
1588 : * worker has already advanced the scan to a different page. We
1589 : * must continue based on the latest page scanned by any worker.
1590 : */
1591 1 : if (scan->parallel_scan != NULL)
1592 : {
1593 0 : _bt_relbuf(rel, so->currPos.buf);
1594 0 : status = _bt_parallel_seize(scan, &blkno);
1595 0 : if (!status)
1596 : {
1597 0 : BTScanPosInvalidate(so->currPos);
1598 0 : return false;
1599 : }
1600 0 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1601 : }
1602 1 : }
1603 : }
1604 :
1605 2297 : return true;
1606 : }
1607 :
1608 : /*
1609 : * _bt_parallel_readpage() -- Read current page containing valid data for scan
1610 : *
1611 : * On success, release lock and maybe pin on buffer. We return TRUE to
1612 : * indicate success.
1613 : */
1614 : static bool
1615 0 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1616 : {
1617 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1618 :
1619 0 : _bt_initialize_more_data(so, dir);
1620 :
1621 0 : if (!_bt_readnextpage(scan, blkno, dir))
1622 0 : return false;
1623 :
1624 : /* Drop the lock, and maybe the pin, on the current page */
1625 0 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1626 :
1627 0 : return true;
1628 : }
1629 :
1630 : /*
1631 : * _bt_walk_left() -- step left one page, if possible
1632 : *
1633 : * The given buffer must be pinned and read-locked. This will be dropped
1634 : * before stepping left. On return, we have pin and read lock on the
1635 : * returned page, instead.
1636 : *
1637 : * Returns InvalidBuffer if there is no page to the left (no lock is held
1638 : * in that case).
1639 : *
1640 : * When working on a non-leaf level, it is possible for the returned page
1641 : * to be half-dead; the caller should check that condition and step left
1642 : * again if it's important.
1643 : */
1644 : static Buffer
1645 8 : _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
1646 : {
1647 : Page page;
1648 : BTPageOpaque opaque;
1649 :
1650 8 : page = BufferGetPage(buf);
1651 8 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1652 :
1653 : for (;;)
1654 : {
1655 : BlockNumber obknum;
1656 : BlockNumber lblkno;
1657 : BlockNumber blkno;
1658 : int tries;
1659 :
1660 : /* if we're at end of tree, release buf and return failure */
1661 8 : if (P_LEFTMOST(opaque))
1662 : {
1663 4 : _bt_relbuf(rel, buf);
1664 4 : break;
1665 : }
1666 : /* remember original page we are stepping left from */
1667 4 : obknum = BufferGetBlockNumber(buf);
1668 : /* step left */
1669 4 : blkno = lblkno = opaque->btpo_prev;
1670 4 : _bt_relbuf(rel, buf);
1671 : /* check for interrupts while we're not holding any buffer lock */
1672 4 : CHECK_FOR_INTERRUPTS();
1673 4 : buf = _bt_getbuf(rel, blkno, BT_READ);
1674 4 : page = BufferGetPage(buf);
1675 4 : TestForOldSnapshot(snapshot, rel, page);
1676 4 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1677 :
1678 : /*
1679 : * If this isn't the page we want, walk right till we find what we
1680 : * want --- but go no more than four hops (an arbitrary limit). If we
1681 : * don't find the correct page by then, the most likely bet is that
1682 : * the original page got deleted and isn't in the sibling chain at all
1683 : * anymore, not that its left sibling got split more than four times.
1684 : *
1685 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1686 : * because half-dead pages are still in the sibling chain. Caller
1687 : * must reject half-dead pages if wanted.
1688 : */
1689 4 : tries = 0;
1690 : for (;;)
1691 : {
1692 4 : if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1693 : {
1694 : /* Found desired page, return it */
1695 4 : return buf;
1696 : }
1697 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
1698 : break;
1699 0 : blkno = opaque->btpo_next;
1700 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1701 0 : page = BufferGetPage(buf);
1702 0 : TestForOldSnapshot(snapshot, rel, page);
1703 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1704 0 : }
1705 :
1706 : /* Return to the original page to see what's up */
1707 0 : buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1708 0 : page = BufferGetPage(buf);
1709 0 : TestForOldSnapshot(snapshot, rel, page);
1710 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1711 0 : if (P_ISDELETED(opaque))
1712 : {
1713 : /*
1714 : * It was deleted. Move right to first nondeleted page (there
1715 : * must be one); that is the page that has acquired the deleted
1716 : * one's keyspace, so stepping left from it will take us where we
1717 : * want to be.
1718 : */
1719 : for (;;)
1720 : {
1721 0 : if (P_RIGHTMOST(opaque))
1722 0 : elog(ERROR, "fell off the end of index \"%s\"",
1723 : RelationGetRelationName(rel));
1724 0 : blkno = opaque->btpo_next;
1725 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1726 0 : page = BufferGetPage(buf);
1727 0 : TestForOldSnapshot(snapshot, rel, page);
1728 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1729 0 : if (!P_ISDELETED(opaque))
1730 0 : break;
1731 0 : }
1732 :
1733 : /*
1734 : * Now return to top of loop, resetting obknum to point to this
1735 : * nondeleted page, and try again.
1736 : */
1737 : }
1738 : else
1739 : {
1740 : /*
1741 : * It wasn't deleted; the explanation had better be that the page
1742 : * to the left got split or deleted. Without this check, we'd go
1743 : * into an infinite loop if there's anything wrong.
1744 : */
1745 0 : if (opaque->btpo_prev == lblkno)
1746 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1747 : obknum, RelationGetRelationName(rel));
1748 : /* Okay to try again with new lblkno value */
1749 : }
1750 0 : }
1751 :
1752 4 : return InvalidBuffer;
1753 : }
1754 :
1755 : /*
1756 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
1757 : *
1758 : * If the index is empty, we will return InvalidBuffer; any other failure
1759 : * condition causes ereport(). We will not return a dead page.
1760 : *
1761 : * The returned buffer is pinned and read-locked.
1762 : */
1763 : Buffer
1764 814 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
1765 : Snapshot snapshot)
1766 : {
1767 : Buffer buf;
1768 : Page page;
1769 : BTPageOpaque opaque;
1770 : OffsetNumber offnum;
1771 : BlockNumber blkno;
1772 : IndexTuple itup;
1773 :
1774 : /*
1775 : * If we are looking for a leaf page, okay to descend from fast root;
1776 : * otherwise better descend from true root. (There is no point in being
1777 : * smarter about intermediate levels.)
1778 : */
1779 814 : if (level == 0)
1780 813 : buf = _bt_getroot(rel, BT_READ);
1781 : else
1782 1 : buf = _bt_gettrueroot(rel);
1783 :
1784 814 : if (!BufferIsValid(buf))
1785 211 : return InvalidBuffer;
1786 :
1787 603 : page = BufferGetPage(buf);
1788 603 : TestForOldSnapshot(snapshot, rel, page);
1789 603 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1790 :
1791 : for (;;)
1792 : {
1793 : /*
1794 : * If we landed on a deleted page, step right to find a live page
1795 : * (there must be one). Also, if we want the rightmost page, step
1796 : * right if needed to get to it (this could happen if the page split
1797 : * since we obtained a pointer to it).
1798 : */
1799 2048 : while (P_IGNORE(opaque) ||
1800 9 : (rightmost && !P_RIGHTMOST(opaque)))
1801 : {
1802 0 : blkno = opaque->btpo_next;
1803 0 : if (blkno == P_NONE)
1804 0 : elog(ERROR, "fell off the end of index \"%s\"",
1805 : RelationGetRelationName(rel));
1806 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1807 0 : page = BufferGetPage(buf);
1808 0 : TestForOldSnapshot(snapshot, rel, page);
1809 0 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1810 : }
1811 :
1812 : /* Done? */
1813 1024 : if (opaque->btpo.level == level)
1814 603 : break;
1815 421 : if (opaque->btpo.level < level)
1816 0 : elog(ERROR, "btree level %u not found in index \"%s\"",
1817 : level, RelationGetRelationName(rel));
1818 :
1819 : /* Descend to leftmost or rightmost child page */
1820 421 : if (rightmost)
1821 1 : offnum = PageGetMaxOffsetNumber(page);
1822 : else
1823 420 : offnum = P_FIRSTDATAKEY(opaque);
1824 :
1825 421 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1826 421 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1827 :
1828 421 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1829 421 : page = BufferGetPage(buf);
1830 421 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1831 421 : }
1832 :
1833 603 : return buf;
1834 : }
1835 :
1836 : /*
1837 : * _bt_endpoint() -- Find the first or last page in the index, and scan
1838 : * from there to the first key satisfying all the quals.
1839 : *
1840 : * This is used by _bt_first() to set up a scan when we've determined
1841 : * that the scan must start at the beginning or end of the index (for
1842 : * a forward or backward scan respectively). Exit conditions are the
1843 : * same as for _bt_first().
1844 : */
1845 : static bool
1846 813 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1847 : {
1848 813 : Relation rel = scan->indexRelation;
1849 813 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1850 : Buffer buf;
1851 : Page page;
1852 : BTPageOpaque opaque;
1853 : OffsetNumber start;
1854 : BTScanPosItem *currItem;
1855 :
1856 : /*
1857 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
1858 : * version of _bt_search(). We don't maintain a stack since we know we
1859 : * won't need it.
1860 : */
1861 813 : buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
1862 :
1863 813 : if (!BufferIsValid(buf))
1864 : {
1865 : /*
1866 : * Empty index. Lock the whole relation, as nothing finer to lock
1867 : * exists.
1868 : */
1869 211 : PredicateLockRelation(rel, scan->xs_snapshot);
1870 211 : BTScanPosInvalidate(so->currPos);
1871 211 : return false;
1872 : }
1873 :
1874 602 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1875 602 : page = BufferGetPage(buf);
1876 602 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1877 602 : Assert(P_ISLEAF(opaque));
1878 :
1879 602 : if (ScanDirectionIsForward(dir))
1880 : {
1881 : /* There could be dead pages to the left, so not this: */
1882 : /* Assert(P_LEFTMOST(opaque)); */
1883 :
1884 594 : start = P_FIRSTDATAKEY(opaque);
1885 : }
1886 8 : else if (ScanDirectionIsBackward(dir))
1887 : {
1888 8 : Assert(P_RIGHTMOST(opaque));
1889 :
1890 8 : start = PageGetMaxOffsetNumber(page);
1891 : }
1892 : else
1893 : {
1894 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
1895 : start = 0; /* keep compiler quiet */
1896 : }
1897 :
1898 : /* remember which buffer we have pinned */
1899 602 : so->currPos.buf = buf;
1900 :
1901 602 : _bt_initialize_more_data(so, dir);
1902 :
1903 : /*
1904 : * Now load data from the first page of the scan.
1905 : */
1906 602 : if (!_bt_readpage(scan, dir, start))
1907 : {
1908 : /*
1909 : * There's no actually-matching data on this page. Try to advance to
1910 : * the next page. Return false if there's no matching data at all.
1911 : */
1912 9 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1913 9 : if (!_bt_steppage(scan, dir))
1914 6 : return false;
1915 : }
1916 : else
1917 : {
1918 : /* Drop the lock, and maybe the pin, on the current page */
1919 593 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1920 : }
1921 :
1922 : /* OK, itemIndex says what to return */
1923 596 : currItem = &so->currPos.items[so->currPos.itemIndex];
1924 596 : scan->xs_ctup.t_self = currItem->heapTid;
1925 596 : if (scan->xs_want_itup)
1926 121 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1927 :
1928 596 : return true;
1929 : }
1930 :
1931 : /*
1932 : * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
1933 : * for scan direction
1934 : */
1935 : static inline void
1936 377804 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
1937 : {
1938 : /* initialize moreLeft/moreRight appropriately for scan direction */
1939 377804 : if (ScanDirectionIsForward(dir))
1940 : {
1941 377527 : so->currPos.moreLeft = false;
1942 377527 : so->currPos.moreRight = true;
1943 : }
1944 : else
1945 : {
1946 277 : so->currPos.moreLeft = true;
1947 277 : so->currPos.moreRight = false;
1948 : }
1949 377804 : so->numKilled = 0; /* just paranoia */
1950 377804 : so->markItemIndex = -1; /* ditto */
1951 377804 : }
|