Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/xlog.h"
24 : #include "catalog/index.h"
25 : #include "commands/vacuum.h"
26 : #include "pgstat.h"
27 : #include "storage/condition_variable.h"
28 : #include "storage/indexfsm.h"
29 : #include "storage/ipc.h"
30 : #include "storage/lmgr.h"
31 : #include "storage/smgr.h"
32 : #include "tcop/tcopprot.h" /* pgrminclude ignore */
33 : #include "utils/builtins.h"
34 : #include "utils/index_selfuncs.h"
35 : #include "utils/memutils.h"
36 :
37 :
38 : /* Working state for btbuild and its callback */
39 : typedef struct
40 : {
41 : bool isUnique;
42 : bool haveDead;
43 : Relation heapRel;
44 : BTSpool *spool;
45 :
46 : /*
47 : * spool2 is needed only when the index is a unique index. Dead tuples are
48 : * put into spool2 instead of spool in order to avoid the uniqueness check.
49 : */
50 : BTSpool *spool2;
51 : double indtuples;
52 : } BTBuildState;
53 :
54 : /* Working state needed by btvacuumpage */
55 : typedef struct
56 : {
57 : IndexVacuumInfo *info;
58 : IndexBulkDeleteResult *stats;
59 : IndexBulkDeleteCallback callback;
60 : void *callback_state;
61 : BTCycleId cycleid;
62 : BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
63 : BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
64 : BlockNumber totFreePages; /* true total # of free pages */
65 : MemoryContext pagedelcontext;
66 : } BTVacState;
67 :
68 : /*
69 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
70 : *
71 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
72 : * a new page; others must wait.
73 : *
74 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
75 : * to a new page; some process can start doing that.
76 : *
77 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
78 : * We reach this state once for every distinct combination of array keys.
79 : */
80 : typedef enum
81 : {
82 : BTPARALLEL_NOT_INITIALIZED,
83 : BTPARALLEL_ADVANCING,
84 : BTPARALLEL_IDLE,
85 : BTPARALLEL_DONE
86 : } BTPS_State;
87 :
88 : /*
89 : * BTParallelScanDescData contains btree specific shared information required
90 : * for parallel scan.
91 : */
92 : typedef struct BTParallelScanDescData
93 : {
94 : BlockNumber btps_scanPage; /* latest or next page to be scanned */
95 : BTPS_State btps_pageStatus; /* indicates whether next page is
96 : * available for scan. see above for
97 : * possible states of parallel scan. */
98 : int btps_arrayKeyCount; /* count indicating number of array scan
99 : * keys processed by parallel scan */
100 : slock_t btps_mutex; /* protects above variables */
101 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
102 : } BTParallelScanDescData;
103 :
104 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
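
All of the mutable fields above (btps_scanPage, btps_pageStatus, btps_arrayKeyCount) are read and written only while holding btps_mutex; btps_cv is used only for sleeping and waking. The routines later in this file all follow that convention. A minimal, hypothetical helper (not part of this file) illustrating the pattern:

static BlockNumber
bt_parallel_peek_page(BTParallelScanDesc btscan)
{
	BlockNumber page;

	/* take the mutex before reading any shared scan field */
	SpinLockAcquire(&btscan->btps_mutex);
	page = btscan->btps_scanPage;
	SpinLockRelease(&btscan->btps_mutex);

	return page;
}
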
105 :
106 :
107 : static void btbuildCallback(Relation index,
108 : HeapTuple htup,
109 : Datum *values,
110 : bool *isnull,
111 : bool tupleIsAlive,
112 : void *state);
113 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
114 : IndexBulkDeleteCallback callback, void *callback_state,
115 : BTCycleId cycleid);
116 : static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
117 : BlockNumber orig_blkno);
118 :
119 :
120 : /*
121 : * Btree handler function: return IndexAmRoutine with access method parameters
122 : * and callbacks.
123 : */
124 : Datum
125 35320 : bthandler(PG_FUNCTION_ARGS)
126 : {
127 35320 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
128 :
129 35320 : amroutine->amstrategies = BTMaxStrategyNumber;
130 35320 : amroutine->amsupport = BTNProcs;
131 35320 : amroutine->amcanorder = true;
132 35320 : amroutine->amcanorderbyop = false;
133 35320 : amroutine->amcanbackward = true;
134 35320 : amroutine->amcanunique = true;
135 35320 : amroutine->amcanmulticol = true;
136 35320 : amroutine->amoptionalkey = true;
137 35320 : amroutine->amsearcharray = true;
138 35320 : amroutine->amsearchnulls = true;
139 35320 : amroutine->amstorage = false;
140 35320 : amroutine->amclusterable = true;
141 35320 : amroutine->ampredlocks = true;
142 35320 : amroutine->amcanparallel = true;
143 35320 : amroutine->amkeytype = InvalidOid;
144 :
145 35320 : amroutine->ambuild = btbuild;
146 35320 : amroutine->ambuildempty = btbuildempty;
147 35320 : amroutine->aminsert = btinsert;
148 35320 : amroutine->ambulkdelete = btbulkdelete;
149 35320 : amroutine->amvacuumcleanup = btvacuumcleanup;
150 35320 : amroutine->amcanreturn = btcanreturn;
151 35320 : amroutine->amcostestimate = btcostestimate;
152 35320 : amroutine->amoptions = btoptions;
153 35320 : amroutine->amproperty = btproperty;
154 35320 : amroutine->amvalidate = btvalidate;
155 35320 : amroutine->ambeginscan = btbeginscan;
156 35320 : amroutine->amrescan = btrescan;
157 35320 : amroutine->amgettuple = btgettuple;
158 35320 : amroutine->amgetbitmap = btgetbitmap;
159 35320 : amroutine->amendscan = btendscan;
160 35320 : amroutine->ammarkpos = btmarkpos;
161 35320 : amroutine->amrestrpos = btrestrpos;
162 35320 : amroutine->amestimateparallelscan = btestimateparallelscan;
163 35320 : amroutine->aminitparallelscan = btinitparallelscan;
164 35320 : amroutine->amparallelrescan = btparallelrescan;
165 :
166 35320 : PG_RETURN_POINTER(amroutine);
167 : }
168 :
169 : /*
170 : * btbuild() -- build a new btree index.
171 : */
172 : IndexBuildResult *
173 1429 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
174 : {
175 : IndexBuildResult *result;
176 : double reltuples;
177 : BTBuildState buildstate;
178 :
179 1429 : buildstate.isUnique = indexInfo->ii_Unique;
180 1429 : buildstate.haveDead = false;
181 1429 : buildstate.heapRel = heap;
182 1429 : buildstate.spool = NULL;
183 1429 : buildstate.spool2 = NULL;
184 1429 : buildstate.indtuples = 0;
185 :
186 : #ifdef BTREE_BUILD_STATS
187 : if (log_btree_build_stats)
188 : ResetUsage();
189 : #endif /* BTREE_BUILD_STATS */
190 :
191 : /*
192 : * We expect to be called exactly once for any index relation. If that's
193 : * not the case, big trouble's what we have.
194 : */
195 1429 : if (RelationGetNumberOfBlocks(index) != 0)
196 0 : elog(ERROR, "index \"%s\" already contains data",
197 : RelationGetRelationName(index));
198 :
199 1429 : buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
200 :
201 : /*
202 : * If building a unique index, put dead tuples in a second spool to keep
203 : * them out of the uniqueness check.
204 : */
205 1429 : if (indexInfo->ii_Unique)
206 1197 : buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
207 :
208 : /* do the heap scan */
209 1429 : reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
210 : btbuildCallback, (void *) &buildstate);
211 :
212 : /* okay, all heap tuples are indexed */
213 1428 : if (buildstate.spool2 && !buildstate.haveDead)
214 : {
215 : /* spool2 turns out to be unnecessary */
216 1196 : _bt_spooldestroy(buildstate.spool2);
217 1196 : buildstate.spool2 = NULL;
218 : }
219 :
220 : /*
221 : * Finish the build by (1) completing the sort of the spool file, (2)
222 : * inserting the sorted tuples into btree pages and (3) building the upper
223 : * levels.
224 : */
225 1428 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
226 1422 : _bt_spooldestroy(buildstate.spool);
227 1422 : if (buildstate.spool2)
228 1 : _bt_spooldestroy(buildstate.spool2);
229 :
230 : #ifdef BTREE_BUILD_STATS
231 : if (log_btree_build_stats)
232 : {
233 : ShowUsage("BTREE BUILD STATS");
234 : ResetUsage();
235 : }
236 : #endif /* BTREE_BUILD_STATS */
237 :
238 : /*
239 : * Return statistics
240 : */
241 1422 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
242 :
243 1422 : result->heap_tuples = reltuples;
244 1422 : result->index_tuples = buildstate.indtuples;
245 :
246 1422 : return result;
247 : }
248 :
249 : /*
250 : * Per-tuple callback from IndexBuildHeapScan
251 : */
252 : static void
253 506550 : btbuildCallback(Relation index,
254 : HeapTuple htup,
255 : Datum *values,
256 : bool *isnull,
257 : bool tupleIsAlive,
258 : void *state)
259 : {
260 506550 : BTBuildState *buildstate = (BTBuildState *) state;
261 :
262 : /*
263 : * insert the index tuple into the appropriate spool file for subsequent
264 : * processing
265 : */
266 506550 : if (tupleIsAlive || buildstate->spool2 == NULL)
267 506544 : _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
268 : else
269 : {
270 : /* dead tuples are put into spool2 */
271 6 : buildstate->haveDead = true;
272 6 : _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
273 : }
274 :
275 506550 : buildstate->indtuples += 1;
276 506550 : }
277 :
278 : /*
279 : * btbuildempty() -- build an empty btree index in the initialization fork
280 : */
281 : void
282 11 : btbuildempty(Relation index)
283 : {
284 : Page metapage;
285 :
286 : /* Construct metapage. */
287 11 : metapage = (Page) palloc(BLCKSZ);
288 11 : _bt_initmetapage(metapage, P_NONE, 0);
289 :
290 : /*
291 : * Write the page and log it. It might seem that an immediate sync would
292 : * be sufficient to guarantee that the file exists on disk, but recovery
293 : * itself might remove it while replaying, for example, an
294 : * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
295 : * this even when wal_level=minimal.
296 : */
297 11 : PageSetChecksumInplace(metapage, BTREE_METAPAGE);
298 11 : smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
299 : (char *) metapage, true);
300 11 : log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
301 : BTREE_METAPAGE, metapage, false);
302 :
303 : /*
304 : * An immediate sync is required even if we xlog'd the page, because the
305 : * write did not go through shared_buffers and therefore a concurrent
306 : * checkpoint may have moved the redo pointer past our xlog record.
307 : */
308 11 : smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
309 11 : }
310 :
311 : /*
312 : * btinsert() -- insert an index tuple into a btree.
313 : *
314 : * Descend the tree recursively, find the appropriate location for our
315 : * new tuple, and put it there.
316 : */
317 : bool
318 205514 : btinsert(Relation rel, Datum *values, bool *isnull,
319 : ItemPointer ht_ctid, Relation heapRel,
320 : IndexUniqueCheck checkUnique,
321 : IndexInfo *indexInfo)
322 : {
323 : bool result;
324 : IndexTuple itup;
325 :
326 : /* generate an index tuple */
327 205514 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
328 205514 : itup->t_tid = *ht_ctid;
329 :
330 205514 : result = _bt_doinsert(rel, itup, checkUnique, heapRel);
331 :
332 205482 : pfree(itup);
333 :
334 205482 : return result;
335 : }
336 :
337 : /*
338 : * btgettuple() -- Get the next tuple in the scan.
339 : */
340 : bool
341 1177790 : btgettuple(IndexScanDesc scan, ScanDirection dir)
342 : {
343 1177790 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
344 : bool res;
345 :
346 : /* btree indexes are never lossy */
347 1177790 : scan->xs_recheck = false;
348 :
349 : /*
350 : * If we have any array keys, initialize them during first call for a
351 : * scan. We can't do this in btrescan because we don't know the scan
352 : * direction at that time.
353 : */
354 1177790 : if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
355 : {
356 : /* punt if we have any unsatisfiable array keys */
357 3 : if (so->numArrayKeys < 0)
358 0 : return false;
359 :
360 3 : _bt_start_array_keys(scan, dir);
361 : }
362 :
363 : /* This loop handles advancing to the next array elements, if any */
364 : do
365 : {
366 : /*
367 : * If we've already initialized this scan, we can just advance it in
368 : * the appropriate direction. If we haven't done so yet, we call
369 : * _bt_first() to get the first item in the scan.
370 : */
371 1177795 : if (!BTScanPosIsValid(so->currPos))
372 387164 : res = _bt_first(scan, dir);
373 : else
374 : {
375 : /*
376 : * Check to see if we should kill the previously-fetched tuple.
377 : */
378 790631 : if (scan->kill_prior_tuple)
379 : {
380 : /*
381 : * Yes, remember it for later. (We'll deal with all such
382 : * tuples at once right before leaving the index page.) The
383 : * test for numKilled overrun is not just paranoia: if the
384 : * caller reverses direction in the indexscan then the same
385 : * item might get entered multiple times. It's not worth
386 : * trying to optimize that, so we don't detect it, but instead
387 : * just forget any excess entries.
388 : */
389 5116 : if (so->killedItems == NULL)
390 2871 : so->killedItems = (int *)
391 2871 : palloc(MaxIndexTuplesPerPage * sizeof(int));
392 5116 : if (so->numKilled < MaxIndexTuplesPerPage)
393 5116 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
394 : }
395 :
396 : /*
397 : * Now continue the scan.
398 : */
399 790631 : res = _bt_next(scan, dir);
400 : }
401 :
402 : /* If we have a tuple, return it ... */
403 1177795 : if (res)
404 986779 : break;
405 : /* ... otherwise see if we have more array keys to deal with */
406 191016 : } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
407 :
408 1177790 : return res;
409 : }
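
btgettuple() and the other scan entry points are not called directly; the executor reaches them through the generic wrappers in indexam.c. A minimal sketch of that lifecycle, assuming hrel/irel are placeholders for an already-open heap relation and its btree index on an int4 column, with error handling omitted:

/* needs access/genam.h, access/stratnum.h, utils/fmgroids.h, utils/snapmgr.h */
ScanKeyData skey;
IndexScanDesc scan;
ItemPointer tid;

/* qual: first index column = 42 */
ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(42));

scan = index_beginscan(hrel, irel, GetTransactionSnapshot(), 1, 0);	/* -> btbeginscan */
index_rescan(scan, &skey, 1, NULL, 0);	/* -> btrescan */

while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
{
	/* each call above lands in btgettuple; fetch the heap tuple for *tid here */
}

index_endscan(scan);	/* -> btendscan */
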
410 :
411 : /*
412 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
413 : */
414 : int64
415 831 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
416 : {
417 831 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
418 831 : int64 ntids = 0;
419 : ItemPointer heapTid;
420 :
421 : /*
422 : * If we have any array keys, initialize them.
423 : */
424 831 : if (so->numArrayKeys)
425 : {
426 : /* punt if we have any unsatisfiable array keys */
427 28 : if (so->numArrayKeys < 0)
428 0 : return ntids;
429 :
430 28 : _bt_start_array_keys(scan, ForwardScanDirection);
431 : }
432 :
433 : /* This loop handles advancing to the next array elements, if any */
434 : do
435 : {
436 : /* Fetch the first page & tuple */
437 902 : if (_bt_first(scan, ForwardScanDirection))
438 : {
439 : /* Save tuple ID, and continue scanning */
440 419 : heapTid = &scan->xs_ctup.t_self;
441 419 : tbm_add_tuples(tbm, heapTid, 1, false);
442 419 : ntids++;
443 :
444 : for (;;)
445 : {
446 : /*
447 : * Advance to next tuple within page. This is the same as the
448 : * easy case in _bt_next().
449 : */
450 211640 : if (++so->currPos.itemIndex > so->currPos.lastItem)
451 : {
452 : /* let _bt_next do the heavy lifting */
453 978 : if (!_bt_next(scan, ForwardScanDirection))
454 419 : break;
455 : }
456 :
457 : /* Save tuple ID, and continue scanning */
458 211221 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
459 211221 : tbm_add_tuples(tbm, heapTid, 1, false);
460 211221 : ntids++;
461 211221 : }
462 : }
463 : /* Now see if we have more array keys to deal with */
464 902 : } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
465 :
466 831 : return ntids;
467 : }
468 :
469 : /*
470 : * btbeginscan() -- start a scan on a btree index
471 : */
472 : IndexScanDesc
473 362281 : btbeginscan(Relation rel, int nkeys, int norderbys)
474 : {
475 : IndexScanDesc scan;
476 : BTScanOpaque so;
477 :
478 : /* no order by operators allowed */
479 362281 : Assert(norderbys == 0);
480 :
481 : /* get the scan */
482 362281 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
483 :
484 : /* allocate private workspace */
485 362281 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
486 362281 : BTScanPosInvalidate(so->currPos);
487 362281 : BTScanPosInvalidate(so->markPos);
488 362281 : if (scan->numberOfKeys > 0)
489 361923 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
490 : else
491 358 : so->keyData = NULL;
492 :
493 362281 : so->arrayKeyData = NULL; /* assume no array keys for now */
494 362281 : so->numArrayKeys = 0;
495 362281 : so->arrayKeys = NULL;
496 362281 : so->arrayContext = NULL;
497 :
498 362281 : so->killedItems = NULL; /* until needed */
499 362281 : so->numKilled = 0;
500 :
501 : /*
502 : * We don't know yet whether the scan will be index-only, so we do not
503 : * allocate the tuple workspace arrays until btrescan. However, we set up
504 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
505 : */
506 362281 : so->currTuples = so->markTuples = NULL;
507 :
508 362281 : scan->xs_itupdesc = RelationGetDescr(rel);
509 :
510 362281 : scan->opaque = so;
511 :
512 362281 : return scan;
513 : }
514 :
515 : /*
516 : * btrescan() -- rescan an index relation
517 : */
518 : void
519 388316 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
520 : ScanKey orderbys, int norderbys)
521 : {
522 388316 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
523 :
524 : /* we aren't holding any read locks, but gotta drop the pins */
525 388316 : if (BTScanPosIsValid(so->currPos))
526 : {
527 : /* Before leaving current page, deal with any killed items */
528 3497 : if (so->numKilled > 0)
529 0 : _bt_killitems(scan);
530 3497 : BTScanPosUnpinIfPinned(so->currPos);
531 3497 : BTScanPosInvalidate(so->currPos);
532 : }
533 :
534 388316 : so->markItemIndex = -1;
535 388316 : so->arrayKeyCount = 0;
536 388316 : BTScanPosUnpinIfPinned(so->markPos);
537 388316 : BTScanPosInvalidate(so->markPos);
538 :
539 : /*
540 : * Allocate tuple workspace arrays, if needed for an index-only scan and
541 : * not already done in a previous rescan call. To save on palloc
542 : * overhead, both workspaces are allocated as one palloc block; only this
543 : * function and btendscan know that.
544 : *
545 : * NOTE: this data structure also makes it safe to return data from a
546 : * "name" column, even though btree name_ops uses an underlying storage
547 : * datatype of cstring. The risk there is that "name" is supposed to be
548 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
549 : * However, since we only return data out of tuples sitting in the
550 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
551 : * data out of the markTuples array --- running off the end of memory for
552 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
553 : * adding special-case treatment for name_ops elsewhere.
554 : */
555 388316 : if (scan->xs_want_itup && so->currTuples == NULL)
556 : {
557 436 : so->currTuples = (char *) palloc(BLCKSZ * 2);
558 436 : so->markTuples = so->currTuples + BLCKSZ;
559 : }
560 :
561 : /*
562 : * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
563 : * - vadim 05/05/97
564 : */
565 388316 : if (scankey && scan->numberOfKeys > 0)
566 387958 : memmove(scan->keyData,
567 : scankey,
568 387958 : scan->numberOfKeys * sizeof(ScanKeyData));
569 388316 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
570 :
571 : /* If any keys are SK_SEARCHARRAY type, set up array-key info */
572 388316 : _bt_preprocess_array_keys(scan);
573 388316 : }
574 :
575 : /*
576 : * btendscan() -- close down a scan
577 : */
578 : void
579 362179 : btendscan(IndexScanDesc scan)
580 : {
581 362179 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
582 :
583 : /* we aren't holding any read locks, but gotta drop the pins */
584 362179 : if (BTScanPosIsValid(so->currPos))
585 : {
586 : /* Before leaving current page, deal with any killed items */
587 192582 : if (so->numKilled > 0)
588 758 : _bt_killitems(scan);
589 192582 : BTScanPosUnpinIfPinned(so->currPos);
590 : }
591 :
592 362179 : so->markItemIndex = -1;
593 362179 : BTScanPosUnpinIfPinned(so->markPos);
594 :
595 : /* No need to invalidate positions, the RAM is about to be freed. */
596 :
597 : /* Release storage */
598 362179 : if (so->keyData != NULL)
599 361826 : pfree(so->keyData);
600 : /* so->arrayKeyData and so->arrayKeys are in arrayContext */
601 362179 : if (so->arrayContext != NULL)
602 31 : MemoryContextDelete(so->arrayContext);
603 362179 : if (so->killedItems != NULL)
604 2867 : pfree(so->killedItems);
605 362179 : if (so->currTuples != NULL)
606 429 : pfree(so->currTuples);
607 : /* so->markTuples should not be pfree'd, see btrescan */
608 362179 : pfree(so);
609 362179 : }
610 :
611 : /*
612 : * btmarkpos() -- save current scan position
613 : */
614 : void
615 21001 : btmarkpos(IndexScanDesc scan)
616 : {
617 21001 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
618 :
619 : /* There may be an old mark with a pin (but no lock). */
620 21001 : BTScanPosUnpinIfPinned(so->markPos);
621 :
622 : /*
623 : * Just record the current itemIndex. If we later step to next page
624 : * before releasing the marked position, _bt_steppage makes a full copy of
625 : * the currPos struct in markPos. If (as often happens) the mark is moved
626 : * before we leave the page, we don't have to do that work.
627 : */
628 21001 : if (BTScanPosIsValid(so->currPos))
629 21001 : so->markItemIndex = so->currPos.itemIndex;
630 : else
631 : {
632 0 : BTScanPosInvalidate(so->markPos);
633 0 : so->markItemIndex = -1;
634 : }
635 :
636 : /* Also record the current positions of any array keys */
637 21001 : if (so->numArrayKeys)
638 0 : _bt_mark_array_keys(scan);
639 21001 : }
640 :
641 : /*
642 : * btrestrpos() -- restore scan to last saved position
643 : */
644 : void
645 9001 : btrestrpos(IndexScanDesc scan)
646 : {
647 9001 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
648 :
649 : /* Restore the marked positions of any array keys */
650 9001 : if (so->numArrayKeys)
651 0 : _bt_restore_array_keys(scan);
652 :
653 9001 : if (so->markItemIndex >= 0)
654 : {
655 : /*
656 : * The scan has never moved to a new page since the last mark. Just
657 : * restore the itemIndex.
658 : *
659 : * NB: In this case we can't count on anything in so->markPos to be
660 : * accurate.
661 : */
662 8983 : so->currPos.itemIndex = so->markItemIndex;
663 : }
664 : else
665 : {
666 : /*
667 : * The scan moved to a new page after last mark or restore, and we are
668 : * now restoring to the marked page. We aren't holding any read
669 : * locks, but if we're still holding the pin for the current position,
670 : * we must drop it.
671 : */
672 18 : if (BTScanPosIsValid(so->currPos))
673 : {
674 : /* Before leaving current page, deal with any killed items */
675 18 : if (so->numKilled > 0)
676 0 : _bt_killitems(scan);
677 18 : BTScanPosUnpinIfPinned(so->currPos);
678 : }
679 :
680 18 : if (BTScanPosIsValid(so->markPos))
681 : {
682 : /* bump pin on mark buffer for assignment to current buffer */
683 18 : if (BTScanPosIsPinned(so->markPos))
684 0 : IncrBufferRefCount(so->markPos.buf);
685 18 : memcpy(&so->currPos, &so->markPos,
686 : offsetof(BTScanPosData, items[1]) +
687 18 : so->markPos.lastItem * sizeof(BTScanPosItem));
688 18 : if (so->currTuples)
689 0 : memcpy(so->currTuples, so->markTuples,
690 0 : so->markPos.nextTupleOffset);
691 : }
692 : else
693 0 : BTScanPosInvalidate(so->currPos);
694 : }
695 9001 : }
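
Mark/restore is likewise driven through the generic wrappers; merge join is the main consumer. A short sketch of the pattern on an already-positioned scan (simplified):

index_markpos(scan);		/* -> btmarkpos: remember the tuple just returned */

/* ... continue fetching with index_getnext_tid() ... */

index_restrpos(scan);		/* -> btrestrpos: back up to the marked position */
/* subsequent fetches repeat the tuples returned after the mark */
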
696 :
697 : /*
698 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
699 : */
700 : Size
701 5 : btestimateparallelscan(void)
702 : {
703 5 : return sizeof(BTParallelScanDescData);
704 : }
705 :
706 : /*
707 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
708 : */
709 : void
710 5 : btinitparallelscan(void *target)
711 : {
712 5 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
713 :
714 5 : SpinLockInit(&bt_target->btps_mutex);
715 5 : bt_target->btps_scanPage = InvalidBlockNumber;
716 5 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
717 5 : bt_target->btps_arrayKeyCount = 0;
718 5 : ConditionVariableInit(&bt_target->btps_cv);
719 5 : }
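
The shared area initialized here lives inside the ParallelIndexScanDesc that the executor sets up in dynamic shared memory: it first asks the AM how many bytes it needs, then hands the AM that space to initialize (the real drivers are index_parallelscan_estimate/index_parallelscan_initialize in indexam.c). A simplified sketch, with shm_alloc standing in for the actual DSM allocation:

Size	sz;
void   *shared;

sz = btestimateparallelscan();	/* bytes of btree-specific shared state */
shared = shm_alloc(sz);			/* hypothetical: really carved out of the parallel DSM segment */
btinitparallelscan(shared);		/* sets up the mutex, condition variable and starting state */
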
720 :
721 : /*
722 : * btparallelrescan() -- reset parallel scan
723 : */
724 : void
725 4 : btparallelrescan(IndexScanDesc scan)
726 : {
727 : BTParallelScanDesc btscan;
728 4 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
729 :
730 4 : Assert(parallel_scan);
731 :
732 4 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
733 : parallel_scan->ps_offset);
734 :
735 : /*
736 : * In theory, we don't need to acquire the spinlock here, because there
737 : * shouldn't be any other workers running at this point, but we do so for
738 : * consistency.
739 : */
740 4 : SpinLockAcquire(&btscan->btps_mutex);
741 4 : btscan->btps_scanPage = InvalidBlockNumber;
742 4 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
743 4 : btscan->btps_arrayKeyCount = 0;
744 4 : SpinLockRelease(&btscan->btps_mutex);
745 4 : }
746 :
747 : /*
748 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
749 : * page. Other scans must wait until we call _bt_parallel_release() or
750 : * _bt_parallel_done().
751 : *
752 : * The return value is true if we successfully seized the scan and false
753 : * if we did not. The latter case occurs if no pages remain for the current
754 : * set of scankeys.
755 : *
756 : * If the return value is true, *pageno returns the next or current page
757 : * of the scan (depending on the scan direction). An invalid block number
758 : * means the scan hasn't yet started, and P_NONE means we've reached the end.
759 : * The first time a participating process reaches the last page, it will return
760 : * true and set *pageno to P_NONE; after that, further attempts to seize the
761 : * scan will return false.
762 : *
763 : * Callers should ignore the value of pageno if the return value is false.
764 : */
765 : bool
766 253 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
767 : {
768 253 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
769 : BTPS_State pageStatus;
770 253 : bool exit_loop = false;
771 253 : bool status = true;
772 253 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
773 : BTParallelScanDesc btscan;
774 :
775 253 : *pageno = P_NONE;
776 :
777 253 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
778 : parallel_scan->ps_offset);
779 :
780 : while (1)
781 : {
782 253 : SpinLockAcquire(&btscan->btps_mutex);
783 253 : pageStatus = btscan->btps_pageStatus;
784 :
785 253 : if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
786 : {
787 : /* Parallel scan has already advanced to a new set of scankeys. */
788 0 : status = false;
789 : }
790 253 : else if (pageStatus == BTPARALLEL_DONE)
791 : {
792 : /*
793 : * We're done with this set of scankeys. This may be the end, or
794 : * there could be more sets to try.
795 : */
796 36 : status = false;
797 : }
798 217 : else if (pageStatus != BTPARALLEL_ADVANCING)
799 : {
800 : /*
801 : * We have successfully seized control of the scan for the purpose
802 : * of advancing it to a new page!
803 : */
804 217 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
805 217 : *pageno = btscan->btps_scanPage;
806 217 : exit_loop = true;
807 : }
808 253 : SpinLockRelease(&btscan->btps_mutex);
809 253 : if (exit_loop || !status)
810 : break;
811 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
812 0 : }
813 253 : ConditionVariableCancelSleep();
814 :
815 253 : return status;
816 : }
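
Taken together, _bt_parallel_seize(), _bt_parallel_release() and _bt_parallel_done() form a hand-off protocol: one backend at a time advances the shared scan position, publishes the next page, and wakes a waiter. A simplified sketch of the caller side (the real callers are _bt_first() and _bt_readnextpage() in nbtsearch.c; array-key advancement is glossed over):

BlockNumber blkno;
BlockNumber next_blkno;

for (;;)
{
	if (!_bt_parallel_seize(scan, &blkno))
		break;					/* scan already marked done for these keys */

	if (blkno == P_NONE)
	{
		/* we are the first to see the end: mark it and wake everyone */
		_bt_parallel_done(scan);
		break;
	}

	/*
	 * blkno == InvalidBlockNumber would mean the scan hasn't started yet;
	 * _bt_first() handles that by descending the tree.  Otherwise, read
	 * page blkno and note its right-link as next_blkno ...
	 */
	next_blkno = P_NONE;		/* placeholder: really the page's btpo_next */

	_bt_parallel_release(scan, next_blkno);	/* let another backend advance */

	/* ... then process blkno's tuples locally */
}
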
817 :
818 : /*
819 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
820 : * new page. We now have the new value btps_scanPage; some other backend
821 : * can now begin advancing the scan.
822 : */
823 : void
824 208 : _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
825 : {
826 208 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
827 : BTParallelScanDesc btscan;
828 :
829 208 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
830 : parallel_scan->ps_offset);
831 :
832 208 : SpinLockAcquire(&btscan->btps_mutex);
833 208 : btscan->btps_scanPage = scan_page;
834 208 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
835 208 : SpinLockRelease(&btscan->btps_mutex);
836 208 : ConditionVariableSignal(&btscan->btps_cv);
837 208 : }
838 :
839 : /*
840 : * _bt_parallel_done() -- Mark the parallel scan as complete.
841 : *
842 : * When there are no pages left to scan, this function should be called to
843 : * notify other workers. Otherwise, they might wait forever for the scan to
844 : * advance to the next page.
845 : */
846 : void
847 191877 : _bt_parallel_done(IndexScanDesc scan)
848 : {
849 191877 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
850 191877 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
851 : BTParallelScanDesc btscan;
852 191877 : bool status_changed = false;
853 :
854 : /* Do nothing, for non-parallel scans */
855 191877 : if (parallel_scan == NULL)
856 383745 : return;
857 :
858 9 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
859 : parallel_scan->ps_offset);
860 :
861 : /*
862 : * Mark the parallel scan as done for this combination of scan keys,
863 : * unless some other process already did so. See also
864 : * _bt_advance_array_keys.
865 : */
866 9 : SpinLockAcquire(&btscan->btps_mutex);
867 18 : if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
868 9 : btscan->btps_pageStatus != BTPARALLEL_DONE)
869 : {
870 9 : btscan->btps_pageStatus = BTPARALLEL_DONE;
871 9 : status_changed = true;
872 : }
873 9 : SpinLockRelease(&btscan->btps_mutex);
874 :
875 : /* wake up all the workers associated with this parallel scan */
876 9 : if (status_changed)
877 9 : ConditionVariableBroadcast(&btscan->btps_cv);
878 : }
879 :
880 : /*
881 : * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
882 : * keys.
883 : *
884 : * Updates the count of array keys processed for both local and parallel
885 : * scans.
886 : */
887 : void
888 0 : _bt_parallel_advance_array_keys(IndexScanDesc scan)
889 : {
890 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
891 0 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
892 : BTParallelScanDesc btscan;
893 :
894 0 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
895 : parallel_scan->ps_offset);
896 :
897 0 : so->arrayKeyCount++;
898 0 : SpinLockAcquire(&btscan->btps_mutex);
899 0 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
900 : {
901 0 : btscan->btps_scanPage = InvalidBlockNumber;
902 0 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
903 0 : btscan->btps_arrayKeyCount++;
904 : }
905 0 : SpinLockRelease(&btscan->btps_mutex);
906 0 : }
907 :
908 : /*
909 : * Bulk deletion of all index entries pointing to a set of heap tuples.
910 : * The set of target tuples is specified via a callback routine that tells
911 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
912 : *
913 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
914 : */
915 : IndexBulkDeleteResult *
916 109 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
917 : IndexBulkDeleteCallback callback, void *callback_state)
918 : {
919 109 : Relation rel = info->index;
920 : BTCycleId cycleid;
921 :
922 : /* allocate stats if first time through, else re-use existing struct */
923 109 : if (stats == NULL)
924 109 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
925 :
926 : /* Establish the vacuum cycle ID to use for this scan */
927 : /* The ENSURE stuff ensures we clean up shared memory on failure */
928 109 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
929 : {
930 109 : cycleid = _bt_start_vacuum(rel);
931 :
932 109 : btvacuumscan(info, stats, callback, callback_state, cycleid);
933 : }
934 109 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
935 109 : _bt_end_vacuum(rel);
936 :
937 109 : return stats;
938 : }
939 :
940 : /*
941 : * Post-VACUUM cleanup.
942 : *
943 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
944 : */
945 : IndexBulkDeleteResult *
946 600 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
947 : {
948 : /* No-op in ANALYZE ONLY mode */
949 600 : if (info->analyze_only)
950 204 : return stats;
951 :
952 : /*
953 : * If btbulkdelete was called, we need not do anything, just return the
954 : * stats from the latest btbulkdelete call. If it wasn't called, we must
955 : * still do a pass over the index, to recycle any newly-recyclable pages
956 : * and to obtain index statistics.
957 : *
958 : * Since we aren't going to actually delete any leaf items, there's no
959 : * need to go through all the vacuum-cycle-ID pushups.
960 : */
961 396 : if (stats == NULL)
962 : {
963 292 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
964 292 : btvacuumscan(info, stats, NULL, NULL, 0);
965 : }
966 :
967 : /* Finally, vacuum the FSM */
968 396 : IndexFreeSpaceMapVacuum(info->index);
969 :
970 : /*
971 : * It's quite possible for us to be fooled by concurrent page splits into
972 : * double-counting some index tuples, so disbelieve any total that exceeds
973 : * the underlying heap's count ... if we know that accurately. Otherwise
974 : * this might just make matters worse.
975 : */
976 396 : if (!info->estimated_count)
977 : {
978 387 : if (stats->num_index_tuples > info->num_heap_tuples)
979 0 : stats->num_index_tuples = info->num_heap_tuples;
980 : }
981 :
982 396 : return stats;
983 : }
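
btbulkdelete() and btvacuumcleanup() are invoked by VACUUM through the wrappers in indexam.c. A condensed sketch of that sequence, where info is a filled-in IndexVacuumInfo and reaped_tuple_callback/callback_state are placeholders for the caller's dead-TID test (in core VACUUM this is lazy_tid_reaped in vacuumlazy.c):

IndexBulkDeleteResult *stats = NULL;

/* zero or more bulk-delete passes; each one ends up in btbulkdelete */
stats = index_bulk_delete(info, stats, reaped_tuple_callback, callback_state);

/* one cleanup pass at the end; reaches btvacuumcleanup even if no bulk delete happened */
stats = index_vacuum_cleanup(info, stats);

if (stats)
{
	/* num_index_tuples, pages_deleted, pages_free etc. feed the VACUUM report */
	pfree(stats);
}
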
984 :
985 : /*
986 : * btvacuumscan --- scan the index for VACUUMing purposes
987 : *
988 : * This combines the functions of looking for leaf tuples that are deletable
989 : * according to the vacuum callback, looking for empty pages that can be
990 : * deleted, and looking for old deleted pages that can be recycled. Both
991 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
992 : * btbulkdelete call occurred).
993 : *
994 : * The caller is responsible for initially allocating/zeroing a stats struct
995 : * and for obtaining a vacuum cycle ID if necessary.
996 : */
997 : static void
998 401 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
999 : IndexBulkDeleteCallback callback, void *callback_state,
1000 : BTCycleId cycleid)
1001 : {
1002 401 : Relation rel = info->index;
1003 : BTVacState vstate;
1004 : BlockNumber num_pages;
1005 : BlockNumber blkno;
1006 : bool needLock;
1007 :
1008 : /*
1009 : * Reset counts that will be incremented during the scan; needed in case
1010 : * of multiple scans during a single VACUUM command
1011 : */
1012 401 : stats->estimated_count = false;
1013 401 : stats->num_index_tuples = 0;
1014 401 : stats->pages_deleted = 0;
1015 :
1016 : /* Set up info to pass down to btvacuumpage */
1017 401 : vstate.info = info;
1018 401 : vstate.stats = stats;
1019 401 : vstate.callback = callback;
1020 401 : vstate.callback_state = callback_state;
1021 401 : vstate.cycleid = cycleid;
1022 401 : vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
1023 401 : vstate.lastBlockLocked = BTREE_METAPAGE;
1024 401 : vstate.totFreePages = 0;
1025 :
1026 : /* Create a temporary memory context to run _bt_pagedel in */
1027 401 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1028 : "_bt_pagedel",
1029 : ALLOCSET_DEFAULT_SIZES);
1030 :
1031 : /*
1032 : * The outer loop iterates over all index pages except the metapage, in
1033 : * physical order (we hope the kernel will cooperate in providing
1034 : * read-ahead for speed). It is critical that we visit all leaf pages,
1035 : * including ones added after we start the scan, else we might fail to
1036 : * delete some deletable tuples. Hence, we must repeatedly check the
1037 : * relation length. We must acquire the relation-extension lock while
1038 : * doing so to avoid a race condition: if someone else is extending the
1039 : * relation, there is a window where bufmgr/smgr have created a new
1040 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1041 : * we manage to scan such a page here, we'll improperly assume it can be
1042 : * recycled. Taking the lock synchronizes things enough to prevent a
1043 : * problem: either num_pages won't include the new page, or _bt_getbuf
1044 : * already has write lock on the buffer and it will be fully initialized
1045 : * before we can examine it. (See also vacuumlazy.c, which has the same
1046 : * issue.) Also, we need not worry if a page is added immediately after
1047 : * we look; the page splitting code already has write-lock on the left
1048 : * page before it adds a right page, so we must already have processed any
1049 : * tuples due to be moved into such a page.
1050 : *
1051 : * We can skip locking for new or temp relations, however, since no one
1052 : * else could be accessing them.
1053 : */
1054 401 : needLock = !RELATION_IS_LOCAL(rel);
1055 :
1056 401 : blkno = BTREE_METAPAGE + 1;
1057 : for (;;)
1058 : {
1059 : /* Get the current relation length */
1060 634 : if (needLock)
1061 634 : LockRelationForExtension(rel, ExclusiveLock);
1062 634 : num_pages = RelationGetNumberOfBlocks(rel);
1063 634 : if (needLock)
1064 634 : UnlockRelationForExtension(rel, ExclusiveLock);
1065 :
1066 : /* Quit if we've scanned the whole relation */
1067 634 : if (blkno >= num_pages)
1068 401 : break;
1069 : /* Iterate over pages, then loop back to recheck length */
1070 1912 : for (; blkno < num_pages; blkno++)
1071 : {
1072 1679 : btvacuumpage(&vstate, blkno, blkno);
1073 : }
1074 233 : }
1075 :
1076 : /*
1077 : * Check to see if we need to issue one final WAL record for this index,
1078 : * which may be needed for correctness on a hot standby node when non-MVCC
1079 : * index scans could take place.
1080 : *
1081 : * If the WAL is replayed in hot standby, the replay process needs to get
1082 : * cleanup locks on all index leaf pages, just as we've been doing here.
1083 : * However, we won't issue any WAL records about pages that have no items
1084 : * to be deleted. For pages between pages we've vacuumed, the replay code
1085 : * will take locks under the direction of the lastBlockVacuumed fields in
1086 : * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
1087 : * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1088 : * against the last leaf page in the index, if that one wasn't vacuumed.
1089 : */
1090 802 : if (XLogStandbyInfoActive() &&
1091 401 : vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1092 : {
1093 : Buffer buf;
1094 :
1095 : /*
1096 : * The page should be valid, but we can't use _bt_getbuf() because we
1097 : * want to use a nondefault buffer access strategy. Since we aren't
1098 : * going to delete any items, getting cleanup lock again is probably
1099 : * overkill, but for consistency do that anyway.
1100 : */
1101 144 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1102 : RBM_NORMAL, info->strategy);
1103 144 : LockBufferForCleanup(buf);
1104 144 : _bt_checkpage(rel, buf);
1105 144 : _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1106 144 : _bt_relbuf(rel, buf);
1107 : }
1108 :
1109 401 : MemoryContextDelete(vstate.pagedelcontext);
1110 :
1111 : /* update statistics */
1112 401 : stats->num_pages = num_pages;
1113 401 : stats->pages_free = vstate.totFreePages;
1114 401 : }
1115 :
1116 : /*
1117 : * btvacuumpage --- VACUUM one page
1118 : *
1119 : * This processes a single page for btvacuumscan(). In some cases we
1120 : * must go back and re-examine previously-scanned pages; this routine
1121 : * recurses when necessary to handle that case.
1122 : *
1123 : * blkno is the page to process. orig_blkno is the highest block number
1124 : * reached by the outer btvacuumscan loop (the same as blkno, unless we
1125 : * are recursing to re-examine a previous page).
1126 : */
1127 : static void
1128 1679 : btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1129 : {
1130 1679 : IndexVacuumInfo *info = vstate->info;
1131 1679 : IndexBulkDeleteResult *stats = vstate->stats;
1132 1679 : IndexBulkDeleteCallback callback = vstate->callback;
1133 1679 : void *callback_state = vstate->callback_state;
1134 1679 : Relation rel = info->index;
1135 : bool delete_now;
1136 : BlockNumber recurse_to;
1137 : Buffer buf;
1138 : Page page;
1139 1679 : BTPageOpaque opaque = NULL;
1140 :
1141 : restart:
1142 1679 : delete_now = false;
1143 1679 : recurse_to = P_NONE;
1144 :
1145 : /* call vacuum_delay_point while not holding any buffer lock */
1146 1679 : vacuum_delay_point();
1147 :
1148 : /*
1149 : * We can't use _bt_getbuf() here because it always applies
1150 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1151 : * recycle all-zero pages, not fail. Also, we want to use a nondefault
1152 : * buffer access strategy.
1153 : */
1154 1679 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1155 : info->strategy);
1156 1679 : LockBuffer(buf, BT_READ);
1157 1679 : page = BufferGetPage(buf);
1158 1679 : if (!PageIsNew(page))
1159 : {
1160 1679 : _bt_checkpage(rel, buf);
1161 1679 : opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1162 : }
1163 :
1164 : /*
1165 : * If we are recursing, the only case we want to do anything with is a
1166 : * live leaf page having the current vacuum cycle ID. Any other state
1167 : * implies we already saw the page (eg, deleted it as being empty).
1168 : */
1169 1679 : if (blkno != orig_blkno)
1170 : {
1171 0 : if (_bt_page_recyclable(page) ||
1172 0 : P_IGNORE(opaque) ||
1173 0 : !P_ISLEAF(opaque) ||
1174 0 : opaque->btpo_cycleid != vstate->cycleid)
1175 : {
1176 0 : _bt_relbuf(rel, buf);
1177 0 : return;
1178 : }
1179 : }
1180 :
1181 : /* Page is valid, see what to do with it */
1182 1679 : if (_bt_page_recyclable(page))
1183 : {
1184 : /* Okay to recycle this page */
1185 0 : RecordFreeIndexPage(rel, blkno);
1186 0 : vstate->totFreePages++;
1187 0 : stats->pages_deleted++;
1188 : }
1189 1679 : else if (P_ISDELETED(opaque))
1190 : {
1191 : /* Already deleted, but can't recycle yet */
1192 0 : stats->pages_deleted++;
1193 : }
1194 1679 : else if (P_ISHALFDEAD(opaque))
1195 : {
1196 : /* Half-dead, try to delete */
1197 0 : delete_now = true;
1198 : }
1199 1679 : else if (P_ISLEAF(opaque))
1200 : {
1201 : OffsetNumber deletable[MaxOffsetNumber];
1202 : int ndeletable;
1203 : OffsetNumber offnum,
1204 : minoff,
1205 : maxoff;
1206 :
1207 : /*
1208 : * Trade in the initial read lock for a super-exclusive write lock on
1209 : * this page. We must get such a lock on every leaf page over the
1210 : * course of the vacuum scan, whether or not it actually contains any
1211 : * deletable tuples --- see nbtree/README.
1212 : */
1213 1517 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1214 1517 : LockBufferForCleanup(buf);
1215 :
1216 : /*
1217 : * Remember highest leaf page number we've taken cleanup lock on; see
1218 : * notes in btvacuumscan
1219 : */
1220 1517 : if (blkno > vstate->lastBlockLocked)
1221 1517 : vstate->lastBlockLocked = blkno;
1222 :
1223 : /*
1224 : * Check whether we need to recurse back to earlier pages. What we
1225 : * are concerned about is a page split that happened since we started
1226 : * the vacuum scan. If the split moved some tuples to a lower page
1227 : * then we might have missed 'em. If so, set up for tail recursion.
1228 : * (Must do this before possibly clearing btpo_cycleid below!)
1229 : */
1230 2238 : if (vstate->cycleid != 0 &&
1231 721 : opaque->btpo_cycleid == vstate->cycleid &&
1232 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1233 0 : !P_RIGHTMOST(opaque) &&
1234 0 : opaque->btpo_next < orig_blkno)
1235 0 : recurse_to = opaque->btpo_next;
1236 :
1237 : /*
1238 : * Scan over all items to see which ones need to be deleted according to the
1239 : * callback function.
1240 : */
1241 1517 : ndeletable = 0;
1242 1517 : minoff = P_FIRSTDATAKEY(opaque);
1243 1517 : maxoff = PageGetMaxOffsetNumber(page);
1244 1517 : if (callback)
1245 : {
1246 122930 : for (offnum = minoff;
1247 : offnum <= maxoff;
1248 121488 : offnum = OffsetNumberNext(offnum))
1249 : {
1250 : IndexTuple itup;
1251 : ItemPointer htup;
1252 :
1253 121488 : itup = (IndexTuple) PageGetItem(page,
1254 : PageGetItemId(page, offnum));
1255 121488 : htup = &(itup->t_tid);
1256 :
1257 : /*
1258 : * During Hot Standby we currently assume that
1259 : * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1260 : * only true as long as the callback function depends only
1261 : * upon whether the index tuple refers to heap tuples removed
1262 : * in the initial heap scan. When vacuum starts it derives a
1263 : * value of OldestXmin. Backends taking later snapshots could
1264 : * have a RecentGlobalXmin with a later xid than the vacuum's
1265 : * OldestXmin, so it is possible that row versions deleted
1266 : * after OldestXmin could be marked as killed by other
1267 : * backends. The callback function *could* look at the index
1268 : * tuple state in isolation and decide to delete the index
1269 : * tuple, though currently it does not. If it ever did, we
1270 : * would need to reconsider whether XLOG_BTREE_VACUUM records
1271 : * should cause conflicts. If they did cause conflicts they
1272 : * would be fairly harsh conflicts, since we haven't yet
1273 : * worked out a way to pass a useful value for
1274 : * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1275 : * applies to *any* type of index that marks index tuples as
1276 : * killed.
1277 : */
1278 121488 : if (callback(htup, callback_state))
1279 39083 : deletable[ndeletable++] = offnum;
1280 : }
1281 : }
1282 :
1283 : /*
1284 : * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
1285 : * call per page, so as to minimize WAL traffic.
1286 : */
1287 1517 : if (ndeletable > 0)
1288 : {
1289 : /*
1290 : * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1291 : * all information to the replay code to allow it to get a cleanup
1292 : * lock on all pages between the previous lastBlockVacuumed and
1293 : * this page. This ensures that WAL replay locks all leaf pages at
1294 : * some point, which is important should non-MVCC scans be
1295 : * requested. This is currently unused on standby, but we record
1296 : * it anyway, so that the WAL contains the required information.
1297 : *
1298 : * Since we can visit leaf pages out-of-order when recursing,
1299 : * replay might end up locking such pages an extra time, but it
1300 : * doesn't seem worth the amount of bookkeeping it'd take to avoid
1301 : * that.
1302 : */
1303 419 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1304 : vstate->lastBlockVacuumed);
1305 :
1306 : /*
1307 : * Remember highest leaf page number we've issued a
1308 : * XLOG_BTREE_VACUUM WAL record for.
1309 : */
1310 419 : if (blkno > vstate->lastBlockVacuumed)
1311 419 : vstate->lastBlockVacuumed = blkno;
1312 :
1313 419 : stats->tuples_removed += ndeletable;
1314 : /* must recompute maxoff */
1315 419 : maxoff = PageGetMaxOffsetNumber(page);
1316 : }
1317 : else
1318 : {
1319 : /*
1320 : * If the page has been split during this vacuum cycle, it seems
1321 : * worth expending a write to clear btpo_cycleid even if we don't
1322 : * have any deletions to do. (If we do, _bt_delitems_vacuum takes
1323 : * care of this.) This ensures we won't process the page again.
1324 : *
1325 : * We treat this like a hint-bit update because there's no need to
1326 : * WAL-log it.
1327 : */
1328 1400 : if (vstate->cycleid != 0 &&
1329 302 : opaque->btpo_cycleid == vstate->cycleid)
1330 : {
1331 0 : opaque->btpo_cycleid = 0;
1332 0 : MarkBufferDirtyHint(buf, true);
1333 : }
1334 : }
1335 :
1336 : /*
1337 : * If it's now empty, try to delete; else count the live tuples. We
1338 : * don't delete when recursing, though, to avoid putting entries into
1339 : * freePages out-of-order (doesn't seem worth any extra code to handle
1340 : * the case).
1341 : */
1342 1517 : if (minoff > maxoff)
1343 52 : delete_now = (blkno == orig_blkno);
1344 : else
1345 1465 : stats->num_index_tuples += maxoff - minoff + 1;
1346 : }
1347 :
1348 1679 : if (delete_now)
1349 : {
1350 : MemoryContext oldcontext;
1351 : int ndel;
1352 :
1353 : /* Run pagedel in a temp context to avoid memory leakage */
1354 52 : MemoryContextReset(vstate->pagedelcontext);
1355 52 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1356 :
1357 52 : ndel = _bt_pagedel(rel, buf);
1358 :
1359 : /* count only this page, else may double-count parent */
1360 52 : if (ndel)
1361 42 : stats->pages_deleted++;
1362 :
1363 52 : MemoryContextSwitchTo(oldcontext);
1364 : /* pagedel released buffer, so we shouldn't */
1365 : }
1366 : else
1367 1627 : _bt_relbuf(rel, buf);
1368 :
1369 : /*
1370 : * This is really tail recursion, but if the compiler is too stupid to
1371 : * optimize it as such, we'd eat an uncomfortably large amount of stack
1372 : * space per recursion level (due to the deletable[] array). A failure is
1373 : * improbable since the number of levels isn't likely to be large ... but
1374 : * just in case, let's hand-optimize into a loop.
1375 : */
1376 1679 : if (recurse_to != P_NONE)
1377 : {
1378 0 : blkno = recurse_to;
1379 0 : goto restart;
1380 : }
1381 : }
1382 :
1383 : /*
1384 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1385 : *
1386 : * btrees always do, so this is trivial.
1387 : */
1388 : bool
1389 31104 : btcanreturn(Relation index, int attno)
1390 : {
1391 31104 : return true;
1392 : }