Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * vacuumlazy.c
4 : * Concurrent ("lazy") vacuuming.
5 : *
6 : *
7 : * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8 : * TIDs. We want to ensure we can vacuum even the very largest relations with
9 : * finite memory space usage. To do that, we set upper bounds on the number of
10 : * tuples we will keep track of at once.
11 : *
12 : * We are willing to use at most maintenance_work_mem (or perhaps
13 : * autovacuum_work_mem) memory space to keep track of dead tuples. We
14 : * initially allocate an array of TIDs of that size, with an upper limit that
15 : * depends on table size (this limit ensures we don't allocate a huge area
16 : * uselessly for vacuuming small tables). If the array threatens to overflow,
17 : * we suspend the heap scan phase and perform a pass of index cleanup and page
18 : * compaction, then resume the heap scan with an empty TID array.
19 : *
20 : * If we're processing a table with no indexes, we can just vacuum each page
21 : * as we go; there's no need to save up multiple tuples to minimize the number
22 : * of index scans performed. So we don't use maintenance_work_mem memory for
23 : * the TID array, just enough to hold as many heap tuples as fit on one page.
24 : *
25 : *
26 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/commands/vacuumlazy.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include <math.h>
38 :
39 : #include "access/genam.h"
40 : #include "access/heapam.h"
41 : #include "access/heapam_xlog.h"
42 : #include "access/htup_details.h"
43 : #include "access/multixact.h"
44 : #include "access/transam.h"
45 : #include "access/visibilitymap.h"
46 : #include "access/xlog.h"
47 : #include "catalog/catalog.h"
48 : #include "catalog/storage.h"
49 : #include "commands/dbcommands.h"
50 : #include "commands/progress.h"
51 : #include "commands/vacuum.h"
52 : #include "miscadmin.h"
53 : #include "pgstat.h"
54 : #include "portability/instr_time.h"
55 : #include "postmaster/autovacuum.h"
56 : #include "storage/bufmgr.h"
57 : #include "storage/freespace.h"
58 : #include "storage/lmgr.h"
59 : #include "utils/lsyscache.h"
60 : #include "utils/memutils.h"
61 : #include "utils/pg_rusage.h"
62 : #include "utils/timestamp.h"
63 : #include "utils/tqual.h"
64 :
65 :
66 : /*
67 : * Space/time tradeoff parameters: do these need to be user-tunable?
68 : *
69 : * To consider truncating the relation, we want there to be at least
70 : * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
71 : * is less) potentially-freeable pages.
72 : */
73 : #define REL_TRUNCATE_MINIMUM 1000
74 : #define REL_TRUNCATE_FRACTION 16
75 :
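As a worked illustration of the rule above (the file's own test lives in should_attempt_truncation(), declared further down but whose body is outside this excerpt), a minimal sketch of the threshold check might read as follows; the function name and its two arguments are hypothetical:

    static bool
    truncation_looks_worthwhile(BlockNumber rel_pages, BlockNumber nonempty_pages)
    {
        /* pages at the tail of the relation that hold no tuples */
        BlockNumber possibly_freeable = rel_pages - nonempty_pages;

        /* accept whichever of the two thresholds is smaller */
        return possibly_freeable > 0 &&
            (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
             possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION);
    }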
76 : /*
77 : * Timing parameters for truncate locking heuristics.
78 : *
79 : * These were not exposed as user tunable GUC values because it didn't seem
80 : * that the potential for improvement was great enough to merit the cost of
81 : * supporting them.
82 : */
83 : #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
84 : #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
85 : #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
86 :
87 : /*
88 : * Guesstimation of number of dead tuples per page. This is used to
89 : * provide an upper limit to memory allocated when vacuuming small
90 : * tables.
91 : */
92 : #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage
93 :
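Together with the maintenance_work_mem budget described in the file header, this guess bounds the dead-tuple array. A minimal sketch of that sizing rule, approximating what lazy_space_alloc() (declared below; its body is outside this excerpt) has to do; the function name and the vac_work_mem_kb parameter are hypothetical stand-ins for the maintenance_work_mem/autovacuum_work_mem value in kilobytes:

    static long
    dead_tuple_slots(long vac_work_mem_kb, BlockNumber relblocks, bool hasindex)
    {
        long        maxtuples;

        if (!hasindex)
        {
            /* no indexes: one page's worth of TIDs is all we ever hold */
            return MaxHeapTuplesPerPage;
        }

        /* as many TIDs as the memory budget allows ... */
        maxtuples = (vac_work_mem_kb * 1024L) / sizeof(ItemPointerData);

        /* ... within palloc's allocation limit ... */
        maxtuples = Min(maxtuples, (long) (MaxAllocSize / sizeof(ItemPointerData)));

        /* ... but no more than the table could plausibly contain ... */
        if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
            maxtuples = relblocks * LAZY_ALLOC_TUPLES;

        /* ... and never less than one page's worth */
        return Max(maxtuples, MaxHeapTuplesPerPage);
    }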
94 : /*
95 : * Before we consider skipping a page that's marked as clean in
96 : * visibility map, we must've seen at least this many clean pages.
97 : */
98 : #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
99 :
100 : /*
101 : * Size of the prefetch window for lazy vacuum backwards truncation scan.
102 : * Needs to be a power of 2.
103 : */
104 : #define PREFETCH_SIZE ((BlockNumber) 32)
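The power-of-two requirement lets the backwards truncation scan (count_nondeletable_pages(), declared below but not shown here) align a block number to the start of its prefetch window with a simple bit mask. A minimal sketch of that alignment, with a hypothetical helper name:

    /*
     * First block of the PREFETCH_SIZE-aligned window containing blkno.
     * Because PREFETCH_SIZE is a power of 2, the mask clears exactly the
     * bits that index within the window.
     */
    static inline BlockNumber
    prefetch_window_start(BlockNumber blkno)
    {
        return blkno & ~(PREFETCH_SIZE - 1);
    }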
105 :
106 : typedef struct LVRelStats
107 : {
108 : /* hasindex = true means two-pass strategy; false means one-pass */
109 : bool hasindex;
110 : /* Overall statistics about rel */
111 : BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
112 : BlockNumber rel_pages; /* total number of pages */
113 : BlockNumber scanned_pages; /* number of pages we examined */
114 : BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */
115 : BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
116 : BlockNumber tupcount_pages; /* pages whose tuples we counted */
117 : double scanned_tuples; /* counts only tuples on tupcount_pages */
118 : double old_rel_tuples; /* previous value of pg_class.reltuples */
119 : double new_rel_tuples; /* new estimated total # of tuples */
120 : double new_dead_tuples; /* new estimated total # of dead tuples */
121 : BlockNumber pages_removed;
122 : double tuples_deleted;
123 : BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
124 : /* List of TIDs of tuples we intend to delete */
125 : /* NB: this list is ordered by TID address */
126 : int num_dead_tuples; /* current # of entries */
127 : int max_dead_tuples; /* # slots allocated in array */
128 : ItemPointer dead_tuples; /* array of ItemPointerData */
129 : int num_index_scans;
130 : TransactionId latestRemovedXid;
131 : bool lock_waiter_detected;
132 : } LVRelStats;
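Keeping this list in TID order is what allows the index-vacuum callback, lazy_tid_reaped() (declared below; its body falls outside this excerpt), to test membership with a binary search rather than a linear scan. A minimal sketch of that lookup, using bsearch() from <stdlib.h> and hypothetical helper names; the comparator follows the usual block-then-offset ordering:

    /* order TIDs by block number, then by offset number */
    static int
    tid_compare(const void *left, const void *right)
    {
        BlockNumber lblk = ItemPointerGetBlockNumber((ItemPointer) left);
        BlockNumber rblk = ItemPointerGetBlockNumber((ItemPointer) right);

        if (lblk != rblk)
            return (lblk < rblk) ? -1 : 1;
        if (ItemPointerGetOffsetNumber((ItemPointer) left) !=
            ItemPointerGetOffsetNumber((ItemPointer) right))
            return (ItemPointerGetOffsetNumber((ItemPointer) left) <
                    ItemPointerGetOffsetNumber((ItemPointer) right)) ? -1 : 1;
        return 0;
    }

    /* true if itemptr appears in the sorted dead_tuples array */
    static bool
    tid_is_listed(LVRelStats *stats, ItemPointer itemptr)
    {
        return bsearch(itemptr, stats->dead_tuples,
                       stats->num_dead_tuples, sizeof(ItemPointerData),
                       tid_compare) != NULL;
    }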
133 :
134 :
135 : /* A few variables that don't seem worth passing around as parameters */
136 : static int elevel = -1;
137 :
138 : static TransactionId OldestXmin;
139 : static TransactionId FreezeLimit;
140 : static MultiXactId MultiXactCutoff;
141 :
142 : static BufferAccessStrategy vac_strategy;
143 :
144 :
145 : /* non-export function prototypes */
146 : static void lazy_scan_heap(Relation onerel, int options,
147 : LVRelStats *vacrelstats, Relation *Irel, int nindexes,
148 : bool aggressive);
149 : static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
150 : static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
151 : static void lazy_vacuum_index(Relation indrel,
152 : IndexBulkDeleteResult **stats,
153 : LVRelStats *vacrelstats);
154 : static void lazy_cleanup_index(Relation indrel,
155 : IndexBulkDeleteResult *stats,
156 : LVRelStats *vacrelstats);
157 : static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
158 : int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
159 : static bool should_attempt_truncation(LVRelStats *vacrelstats);
160 : static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
161 : static BlockNumber count_nondeletable_pages(Relation onerel,
162 : LVRelStats *vacrelstats);
163 : static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
164 : static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
165 : ItemPointer itemptr);
166 : static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
167 : static int vac_cmp_itemptr(const void *left, const void *right);
168 : static bool heap_page_is_all_visible(Relation rel, Buffer buf,
169 : TransactionId *visibility_cutoff_xid, bool *all_frozen);
170 :
171 :
172 : /*
173 : * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
174 : *
175 : * This routine vacuums a single heap, cleans out its indexes, and
176 : * updates its relpages and reltuples statistics.
177 : *
178 : * At entry, we have already established a transaction and opened
179 : * and locked the relation.
180 : */
181 : void
182 384 : lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
183 : BufferAccessStrategy bstrategy)
184 : {
185 : LVRelStats *vacrelstats;
186 : Relation *Irel;
187 : int nindexes;
188 : PGRUsage ru0;
189 384 : TimestampTz starttime = 0;
190 : long secs;
191 : int usecs;
192 : double read_rate,
193 : write_rate;
194 : bool aggressive; /* should we scan all unfrozen pages? */
195 : bool scanned_all_unfrozen; /* actually scanned all such pages? */
196 : TransactionId xidFullScanLimit;
197 : MultiXactId mxactFullScanLimit;
198 : BlockNumber new_rel_pages;
199 : double new_rel_tuples;
200 : BlockNumber new_rel_allvisible;
201 : double new_live_tuples;
202 : TransactionId new_frozen_xid;
203 : MultiXactId new_min_multi;
204 :
205 384 : Assert(params != NULL);
206 :
207 : /* measure elapsed time iff autovacuum logging requires it */
208 384 : if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
209 : {
210 7 : pg_rusage_init(&ru0);
211 7 : starttime = GetCurrentTimestamp();
212 : }
213 :
214 384 : if (options & VACOPT_VERBOSE)
215 0 : elevel = INFO;
216 : else
217 384 : elevel = DEBUG2;
218 :
219 384 : pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
220 : RelationGetRelid(onerel));
221 :
222 384 : vac_strategy = bstrategy;
223 :
224 384 : vacuum_set_xid_limits(onerel,
225 : params->freeze_min_age,
226 : params->freeze_table_age,
227 : params->multixact_freeze_min_age,
228 : params->multixact_freeze_table_age,
229 : &OldestXmin, &FreezeLimit, &xidFullScanLimit,
230 : &MultiXactCutoff, &mxactFullScanLimit);
231 :
232 : /*
233 : * We request an aggressive scan if the table's frozen Xid is now older
234 : * than or equal to the requested Xid full-table scan limit; or if the
235 : * table's minimum MultiXactId is older than or equal to the requested
236 : * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
237 : */
238 384 : aggressive = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
239 : xidFullScanLimit);
240 384 : aggressive |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
241 : mxactFullScanLimit);
242 384 : if (options & VACOPT_DISABLE_PAGE_SKIPPING)
243 1 : aggressive = true;
244 :
245 384 : vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
246 :
247 384 : vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
248 384 : vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
249 384 : vacrelstats->num_index_scans = 0;
250 384 : vacrelstats->pages_removed = 0;
251 384 : vacrelstats->lock_waiter_detected = false;
252 :
253 : /* Open all indexes of the relation */
254 384 : vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
255 384 : vacrelstats->hasindex = (nindexes > 0);
256 :
257 : /* Do the vacuuming */
258 384 : lazy_scan_heap(onerel, options, vacrelstats, Irel, nindexes, aggressive);
259 :
260 : /* Done with indexes */
261 383 : vac_close_indexes(nindexes, Irel, NoLock);
262 :
263 : /*
264 : * Compute whether we actually scanned all the unfrozen pages. If we did,
265 : * we can adjust relfrozenxid and relminmxid.
266 : *
267 : * NB: We need to check this before truncating the relation, because that
268 : * will change ->rel_pages.
269 : */
270 766 : if ((vacrelstats->scanned_pages + vacrelstats->frozenskipped_pages)
271 383 : < vacrelstats->rel_pages)
272 : {
273 1 : Assert(!aggressive);
274 1 : scanned_all_unfrozen = false;
275 : }
276 : else
277 382 : scanned_all_unfrozen = true;
278 :
279 : /*
280 : * Optionally truncate the relation.
281 : */
282 383 : if (should_attempt_truncation(vacrelstats))
283 12 : lazy_truncate_heap(onerel, vacrelstats);
284 :
285 : /* Report that we are now doing final cleanup */
286 383 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
287 : PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
288 :
289 : /* Vacuum the Free Space Map */
290 383 : FreeSpaceMapVacuum(onerel);
291 :
292 : /*
293 : * Update statistics in pg_class.
294 : *
295 : * A corner case here is that if we scanned no pages at all because every
296 : * page is all-visible, we should not update relpages/reltuples, because
297 : * we have no new information to contribute. In particular this keeps us
298 : * from replacing relpages=reltuples=0 (which means "unknown tuple
299 : * density") with nonzero relpages and reltuples=0 (which means "zero
300 : * tuple density") unless there's some actual evidence for the latter.
301 : *
302 : * It's important that we use tupcount_pages and not scanned_pages for the
303 : * check described above; scanned_pages counts pages where we could not
304 : * get cleanup lock, and which were processed only for frozenxid purposes.
305 : *
306 : * We do update relallvisible even in the corner case, since if the table
307 : * is all-visible we'd definitely like to know that. But clamp the value
308 : * to be not more than what we're setting relpages to.
309 : *
310 : * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
311 : * since then we don't know for certain that all tuples have a newer xmin.
312 : */
313 383 : new_rel_pages = vacrelstats->rel_pages;
314 383 : new_rel_tuples = vacrelstats->new_rel_tuples;
315 383 : if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0)
316 : {
317 0 : new_rel_pages = vacrelstats->old_rel_pages;
318 0 : new_rel_tuples = vacrelstats->old_rel_tuples;
319 : }
320 :
321 383 : visibilitymap_count(onerel, &new_rel_allvisible, NULL);
322 383 : if (new_rel_allvisible > new_rel_pages)
323 0 : new_rel_allvisible = new_rel_pages;
324 :
325 383 : new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
326 383 : new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
327 :
328 383 : vac_update_relstats(onerel,
329 : new_rel_pages,
330 : new_rel_tuples,
331 : new_rel_allvisible,
332 383 : vacrelstats->hasindex,
333 : new_frozen_xid,
334 : new_min_multi,
335 : false);
336 :
337 : /* report results to the stats collector, too */
338 383 : new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples;
339 383 : if (new_live_tuples < 0)
340 0 : new_live_tuples = 0; /* just in case */
341 :
342 766 : pgstat_report_vacuum(RelationGetRelid(onerel),
343 383 : onerel->rd_rel->relisshared,
344 : new_live_tuples,
345 383 : vacrelstats->new_dead_tuples);
346 383 : pgstat_progress_end_command();
347 :
348 : /* and log the action if appropriate */
349 383 : if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
350 : {
351 6 : TimestampTz endtime = GetCurrentTimestamp();
352 :
353 6 : if (params->log_min_duration == 0 ||
354 0 : TimestampDifferenceExceeds(starttime, endtime,
355 : params->log_min_duration))
356 : {
357 : StringInfoData buf;
358 :
359 6 : TimestampDifference(starttime, endtime, &secs, &usecs);
360 :
361 6 : read_rate = 0;
362 6 : write_rate = 0;
363 6 : if ((secs > 0) || (usecs > 0))
364 : {
365 6 : read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
366 : (secs + usecs / 1000000.0);
367 6 : write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
368 : (secs + usecs / 1000000.0);
369 : }
370 :
371 : /*
372 : * This is pretty messy, but we split it up so that we can skip
373 : * emitting individual parts of the message when not applicable.
374 : */
375 6 : initStringInfo(&buf);
376 12 : appendStringInfo(&buf, _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"),
377 : get_database_name(MyDatabaseId),
378 6 : get_namespace_name(RelationGetNamespace(onerel)),
379 6 : RelationGetRelationName(onerel),
380 : vacrelstats->num_index_scans);
381 6 : appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
382 : vacrelstats->pages_removed,
383 : vacrelstats->rel_pages,
384 : vacrelstats->pinskipped_pages,
385 : vacrelstats->frozenskipped_pages);
386 6 : appendStringInfo(&buf,
387 : _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable, oldest xmin: %u\n"),
388 : vacrelstats->tuples_deleted,
389 : vacrelstats->new_rel_tuples,
390 : vacrelstats->new_dead_tuples,
391 : OldestXmin);
392 6 : appendStringInfo(&buf,
393 : _("buffer usage: %d hits, %d misses, %d dirtied\n"),
394 : VacuumPageHit,
395 : VacuumPageMiss,
396 : VacuumPageDirty);
397 6 : appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
398 : read_rate, write_rate);
399 6 : appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
400 :
401 6 : ereport(LOG,
402 : (errmsg_internal("%s", buf.data)));
403 6 : pfree(buf.data);
404 : }
405 : }
406 383 : }
407 :
408 : /*
409 : * For Hot Standby we need to know the highest transaction id that will
410 : * be removed by any change. VACUUM proceeds in a number of passes so
411 : * we need to consider how each pass operates. The first phase runs
412 : * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
413 : * progresses - these will have a latestRemovedXid on each record.
414 : * In some cases this removes all of the tuples to be removed, though
415 : * often we have dead tuples with index pointers so we must remember them
416 : * for removal in phase three. Index records for those rows are removed
417 : * in phase two and index blocks do not have MVCC information attached.
418 : * So before we can allow removal of any index tuples we need to issue
419 : * a WAL record containing the latestRemovedXid of rows that will be
420 : * removed in phase three. This allows recovery queries to block at the
421 : * correct place, i.e. before phase two, rather than during phase three
422 : * which would be after the rows have become inaccessible.
423 : */
424 : static void
425 58 : vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
426 : {
427 : /*
428 : * Skip this for relations for which no WAL is to be written, or if we're
429 : * not trying to support archive recovery.
430 : */
431 58 : if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
432 58 : return;
433 :
434 : /*
435 : * No need to write the record at all unless it contains a valid value
436 : */
437 58 : if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
438 48 : (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
439 : }
440 :
441 : /*
442 : * lazy_scan_heap() -- scan an open heap relation
443 : *
444 : * This routine prunes each page in the heap, which will among other
445 : * things truncate dead tuples to dead line pointers, defragment the
446 : * page, and set commit status bits (see heap_page_prune). It also builds
447 : * lists of dead tuples and pages with free space, calculates statistics
448 : * on the number of live tuples in the heap, and marks pages as
449 : * all-visible if appropriate. When done, or when we run low on space for
450 : * dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
451 : * to reclaim dead line pointers.
452 : *
453 : * If there are no indexes then we can reclaim line pointers on the fly;
454 : * dead line pointers need only be retained until all index pointers that
455 : * reference them have been killed.
456 : */
457 : static void
458 384 : lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
459 : Relation *Irel, int nindexes, bool aggressive)
460 : {
461 : BlockNumber nblocks,
462 : blkno;
463 : HeapTupleData tuple;
464 : char *relname;
465 : BlockNumber empty_pages,
466 : vacuumed_pages;
467 : double num_tuples,
468 : tups_vacuumed,
469 : nkeep,
470 : nunused;
471 : IndexBulkDeleteResult **indstats;
472 : int i;
473 : PGRUsage ru0;
474 384 : Buffer vmbuffer = InvalidBuffer;
475 : BlockNumber next_unskippable_block;
476 : bool skipping_blocks;
477 : xl_heap_freeze_tuple *frozen;
478 : StringInfoData buf;
479 384 : const int initprog_index[] = {
480 : PROGRESS_VACUUM_PHASE,
481 : PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
482 : PROGRESS_VACUUM_MAX_DEAD_TUPLES
483 : };
484 : int64 initprog_val[3];
485 :
486 384 : pg_rusage_init(&ru0);
487 :
488 384 : relname = RelationGetRelationName(onerel);
489 384 : ereport(elevel,
490 : (errmsg("vacuuming \"%s.%s\"",
491 : get_namespace_name(RelationGetNamespace(onerel)),
492 : relname)));
493 :
494 384 : empty_pages = vacuumed_pages = 0;
495 384 : num_tuples = tups_vacuumed = nkeep = nunused = 0;
496 :
497 384 : indstats = (IndexBulkDeleteResult **)
498 384 : palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
499 :
500 384 : nblocks = RelationGetNumberOfBlocks(onerel);
501 384 : vacrelstats->rel_pages = nblocks;
502 384 : vacrelstats->scanned_pages = 0;
503 384 : vacrelstats->tupcount_pages = 0;
504 384 : vacrelstats->nonempty_pages = 0;
505 384 : vacrelstats->latestRemovedXid = InvalidTransactionId;
506 :
507 384 : lazy_space_alloc(vacrelstats, nblocks);
508 384 : frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
509 :
510 : /* Report that we're scanning the heap, advertising total # of blocks */
511 384 : initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
512 384 : initprog_val[1] = nblocks;
513 384 : initprog_val[2] = vacrelstats->max_dead_tuples;
514 384 : pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
515 :
516 : /*
517 : * Except when aggressive is set, we want to skip pages that are
518 : * all-visible according to the visibility map, but only when we can skip
519 : * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
520 : * sequentially, the OS should be doing readahead for us, so there's no
521 : * gain in skipping a page now and then; that's likely to disable
522 : * readahead and so be counterproductive. Also, skipping even a single
523 : * page means that we can't update relfrozenxid, so we only want to do it
524 : * if we can skip a goodly number of pages.
525 : *
526 : * When aggressive is set, we can't skip pages just because they are
527 : * all-visible, but we can still skip pages that are all-frozen, since
528 : * such pages do not need freezing and do not affect the value that we can
529 : * safely set for relfrozenxid or relminmxid.
530 : *
531 : * Before entering the main loop, establish the invariant that
532 : * next_unskippable_block is the next block number >= blkno that we can't
533 : * skip based on the visibility map, either all-visible for a regular scan
534 : * or all-frozen for an aggressive scan. We set it to nblocks if there's
535 : * no such block. We also set up the skipping_blocks flag correctly at
536 : * this stage.
537 : *
538 : * Note: The value returned by visibilitymap_get_status could be slightly
539 : * out-of-date, since we make this test before reading the corresponding
540 : * heap page or locking the buffer. This is OK. If we mistakenly think
541 : * that the page is all-visible or all-frozen when in fact the flag's just
542 : * been cleared, we might fail to vacuum the page. It's easy to see that
543 : * skipping a page when aggressive is not set is not a very big deal; we
544 : * might leave some dead tuples lying around, but the next vacuum will
545 : * find them. But even when aggressive *is* set, it's still OK if we miss
546 : * a page whose all-frozen marking has just been cleared. Any new XIDs
547 : * just added to that page are necessarily newer than the GlobalXmin we
548 : * computed, so they'll have no effect on the value to which we can safely
549 : * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
550 : *
551 : * We will scan the table's last page, at least to the extent of
552 : * determining whether it has tuples or not, even if it should be skipped
553 : * according to the above rules; except when we've already determined that
554 : * it's not worth trying to truncate the table. This avoids having
555 : * lazy_truncate_heap() take access-exclusive lock on the table to attempt
556 : * a truncation that just fails immediately because there are tuples in
557 : * the last page. This is worth avoiding mainly because such a lock must
558 : * be replayed on any hot standby, where it can be disruptive.
559 : */
560 384 : next_unskippable_block = 0;
561 384 : if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
562 : {
563 1333 : while (next_unskippable_block < nblocks)
564 : {
565 : uint8 vmstatus;
566 :
567 783 : vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
568 : &vmbuffer);
569 783 : if (aggressive)
570 : {
571 50 : if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
572 50 : break;
573 : }
574 : else
575 : {
576 733 : if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
577 166 : break;
578 : }
579 567 : vacuum_delay_point();
580 567 : next_unskippable_block++;
581 : }
582 : }
583 :
584 384 : if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
585 3 : skipping_blocks = true;
586 : else
587 381 : skipping_blocks = false;
588 :
589 5396 : for (blkno = 0; blkno < nblocks; blkno++)
590 : {
591 : Buffer buf;
592 : Page page;
593 : OffsetNumber offnum,
594 : maxoff;
595 : bool tupgone,
596 : hastup;
597 : int prev_dead_count;
598 : int nfrozen;
599 : Size freespace;
600 5013 : bool all_visible_according_to_vm = false;
601 : bool all_visible;
602 5013 : bool all_frozen = true; /* provided all_visible is also true */
603 : bool has_dead_tuples;
604 5013 : TransactionId visibility_cutoff_xid = InvalidTransactionId;
605 :
606 : /* see note above about forcing scanning of last page */
607 : #define FORCE_CHECK_PAGE() \
608 : (blkno == nblocks - 1 && should_attempt_truncation(vacrelstats))
609 :
610 5013 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
611 :
612 5013 : if (blkno == next_unskippable_block)
613 : {
614 : /* Time to advance next_unskippable_block */
615 4256 : next_unskippable_block++;
616 4256 : if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
617 : {
618 8700 : while (next_unskippable_block < nblocks)
619 : {
620 : uint8 vmskipflags;
621 :
622 4230 : vmskipflags = visibilitymap_get_status(onerel,
623 : next_unskippable_block,
624 : &vmbuffer);
625 4230 : if (aggressive)
626 : {
627 345 : if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
628 345 : break;
629 : }
630 : else
631 : {
632 3885 : if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
633 3695 : break;
634 : }
635 190 : vacuum_delay_point();
636 190 : next_unskippable_block++;
637 : }
638 : }
639 :
640 : /*
641 : * We know we can't skip the current block. But set up
642 : * skipping_blocks to do the right thing at the following blocks.
643 : */
644 4256 : if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
645 1 : skipping_blocks = true;
646 : else
647 4255 : skipping_blocks = false;
648 :
649 : /*
650 : * Normally, the fact that we can't skip this block must mean that
651 : * it's not all-visible. But in an aggressive vacuum we know only
652 : * that it's not all-frozen, so it might still be all-visible.
653 : */
654 4256 : if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
655 0 : all_visible_according_to_vm = true;
656 : }
657 : else
658 : {
659 : /*
660 : * The current block is potentially skippable; if we've seen a
661 : * long enough run of skippable blocks to justify skipping it, and
662 : * we're not forced to check it, then go ahead and skip.
663 : * Otherwise, the page must be at least all-visible if not
664 : * all-frozen, so we can set all_visible_according_to_vm = true.
665 : */
666 757 : if (skipping_blocks && !FORCE_CHECK_PAGE())
667 : {
668 : /*
669 : * Tricky, tricky. If this is in aggressive vacuum, the page
670 : * must have been all-frozen at the time we checked whether it
671 : * was skippable, but it might not be any more. We must be
672 : * careful to count it as a skipped all-frozen page in that
673 : * case, or else we'll think we can't update relfrozenxid and
674 : * relminmxid. If it's not an aggressive vacuum, we don't
675 : * know whether it was all-frozen, so we have to recheck; but
676 : * in this case an approximate answer is OK.
677 : */
678 468 : if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
679 124 : vacrelstats->frozenskipped_pages++;
680 960 : continue;
681 : }
682 289 : all_visible_according_to_vm = true;
683 : }
684 :
685 4545 : vacuum_delay_point();
686 :
687 : /*
688 : * If we are close to overrunning the available space for dead-tuple
689 : * TIDs, pause and do a cycle of vacuuming before we tackle this page.
690 : */
691 4544 : if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
692 0 : vacrelstats->num_dead_tuples > 0)
693 : {
694 0 : const int hvp_index[] = {
695 : PROGRESS_VACUUM_PHASE,
696 : PROGRESS_VACUUM_NUM_INDEX_VACUUMS
697 : };
698 : int64 hvp_val[2];
699 :
700 : /*
701 : * Before beginning index vacuuming, we release any pin we may
702 : * hold on the visibility map page. This isn't necessary for
703 : * correctness, but we do it anyway to avoid holding the pin
704 : * across a lengthy, unrelated operation.
705 : */
706 0 : if (BufferIsValid(vmbuffer))
707 : {
708 0 : ReleaseBuffer(vmbuffer);
709 0 : vmbuffer = InvalidBuffer;
710 : }
711 :
712 : /* Log cleanup info before we touch indexes */
713 0 : vacuum_log_cleanup_info(onerel, vacrelstats);
714 :
715 : /* Report that we are now vacuuming indexes */
716 0 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
717 : PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
718 :
719 : /* Remove index entries */
720 0 : for (i = 0; i < nindexes; i++)
721 0 : lazy_vacuum_index(Irel[i],
722 0 : &indstats[i],
723 : vacrelstats);
724 :
725 : /*
726 : * Report that we are now vacuuming the heap. We also increase
727 : * the number of index scans here; note that by using
728 : * pgstat_progress_update_multi_param we can update both
729 : * parameters atomically.
730 : */
731 0 : hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
732 0 : hvp_val[1] = vacrelstats->num_index_scans + 1;
733 0 : pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
734 :
735 : /* Remove tuples from heap */
736 0 : lazy_vacuum_heap(onerel, vacrelstats);
737 :
738 : /*
739 : * Forget the now-vacuumed tuples, and press on, but be careful
740 : * not to reset latestRemovedXid since we want that value to be
741 : * valid.
742 : */
743 0 : vacrelstats->num_dead_tuples = 0;
744 0 : vacrelstats->num_index_scans++;
745 :
746 : /* Report that we are once again scanning the heap */
747 0 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
748 : PROGRESS_VACUUM_PHASE_SCAN_HEAP);
749 : }
750 :
751 : /*
752 : * Pin the visibility map page in case we need to mark the page
753 : * all-visible. In most cases this will be very cheap, because we'll
754 : * already have the correct page pinned anyway. However, it's
755 : * possible that (a) next_unskippable_block is covered by a different
756 : * VM page than the current block or (b) we released our pin and did a
757 : * cycle of index vacuuming.
758 : *
759 : */
760 4544 : visibilitymap_pin(onerel, blkno, &vmbuffer);
761 :
762 4544 : buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
763 : RBM_NORMAL, vac_strategy);
764 :
765 : /* We need buffer cleanup lock so that we can prune HOT chains. */
766 4544 : if (!ConditionalLockBufferForCleanup(buf))
767 : {
768 : /*
769 : * If we're not performing an aggressive scan to guard against XID
770 : * wraparound, and we don't want to forcibly check the page, then
771 : * it's OK to skip vacuuming pages we get a lock conflict on. They
772 : * will be dealt with in some future vacuum.
773 : */
774 0 : if (!aggressive && !FORCE_CHECK_PAGE())
775 : {
776 0 : ReleaseBuffer(buf);
777 0 : vacrelstats->pinskipped_pages++;
778 0 : continue;
779 : }
780 :
781 : /*
782 : * Read the page with share lock to see if any xids on it need to
783 : * be frozen. If not we just skip the page, after updating our
784 : * scan statistics. If there are some, we wait for cleanup lock.
785 : *
786 : * We could defer the lock request further by remembering the page
787 : * and coming back to it later, or we could even register
788 : * ourselves for multiple buffers and then service whichever one
789 : * is received first. For now, this seems good enough.
790 : *
791 : * If we get here with aggressive false, then we're just forcibly
792 : * checking the page, and so we don't want to insist on getting
793 : * the lock; we only need to know if the page contains tuples, so
794 : * that we can update nonempty_pages correctly. It's convenient
795 : * to use lazy_check_needs_freeze() for both situations, though.
796 : */
797 0 : LockBuffer(buf, BUFFER_LOCK_SHARE);
798 0 : if (!lazy_check_needs_freeze(buf, &hastup))
799 : {
800 0 : UnlockReleaseBuffer(buf);
801 0 : vacrelstats->scanned_pages++;
802 0 : vacrelstats->pinskipped_pages++;
803 0 : if (hastup)
804 0 : vacrelstats->nonempty_pages = blkno + 1;
805 0 : continue;
806 : }
807 0 : if (!aggressive)
808 : {
809 : /*
810 : * Here, we must not advance scanned_pages; that would amount
811 : * to claiming that the page contains no freezable tuples.
812 : */
813 0 : UnlockReleaseBuffer(buf);
814 0 : vacrelstats->pinskipped_pages++;
815 0 : if (hastup)
816 0 : vacrelstats->nonempty_pages = blkno + 1;
817 0 : continue;
818 : }
819 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
820 0 : LockBufferForCleanup(buf);
821 : /* drop through to normal processing */
822 : }
823 :
824 4544 : vacrelstats->scanned_pages++;
825 4544 : vacrelstats->tupcount_pages++;
826 :
827 4544 : page = BufferGetPage(buf);
828 :
829 4544 : if (PageIsNew(page))
830 : {
831 : /*
832 : * An all-zeroes page could be left over if a backend extends the
833 : * relation but crashes before initializing the page. Reclaim such
834 : * pages for use.
835 : *
836 : * We have to be careful here because we could be looking at a
837 : * page that someone has just added to the relation and not yet
838 : * been able to initialize (see RelationGetBufferForTuple). To
839 : * protect against that, release the buffer lock, grab the
840 : * relation extension lock momentarily, and re-lock the buffer. If
841 : * the page is still uninitialized by then, it must be left over
842 : * from a crashed backend, and we can initialize it.
843 : *
844 : * We don't really need the relation lock when this is a new or
845 : * temp relation, but it's probably not worth the code space to
846 : * check that, since this surely isn't a critical path.
847 : *
848 : * Note: the comparable code in vacuum.c need not worry because
849 : * it's got exclusive lock on the whole relation.
850 : */
851 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
852 0 : LockRelationForExtension(onerel, ExclusiveLock);
853 0 : UnlockRelationForExtension(onerel, ExclusiveLock);
854 0 : LockBufferForCleanup(buf);
855 0 : if (PageIsNew(page))
856 : {
857 0 : ereport(WARNING,
858 : (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
859 : relname, blkno)));
860 0 : PageInit(page, BufferGetPageSize(buf), 0);
861 0 : empty_pages++;
862 : }
863 0 : freespace = PageGetHeapFreeSpace(page);
864 0 : MarkBufferDirty(buf);
865 0 : UnlockReleaseBuffer(buf);
866 :
867 0 : RecordPageWithFreeSpace(onerel, blkno, freespace);
868 0 : continue;
869 : }
870 :
871 4544 : if (PageIsEmpty(page))
872 : {
873 24 : empty_pages++;
874 24 : freespace = PageGetHeapFreeSpace(page);
875 :
876 : /* empty pages are always all-visible and all-frozen */
877 24 : if (!PageIsAllVisible(page))
878 : {
879 24 : START_CRIT_SECTION();
880 :
881 : /* mark buffer dirty before writing a WAL record */
882 24 : MarkBufferDirty(buf);
883 :
884 : /*
885 : * It's possible that another backend has extended the heap,
886 : * initialized the page, and then failed to WAL-log the page
887 : * due to an ERROR. Since heap extension is not WAL-logged,
888 : * recovery might try to replay our record setting the page
889 : * all-visible and find that the page isn't initialized, which
890 : * will cause a PANIC. To prevent that, check whether the
891 : * page has been previously WAL-logged, and if not, do that
892 : * now.
893 : */
894 48 : if (RelationNeedsWAL(onerel) &&
895 24 : PageGetLSN(page) == InvalidXLogRecPtr)
896 24 : log_newpage_buffer(buf, true);
897 :
898 24 : PageSetAllVisible(page);
899 24 : visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
900 : vmbuffer, InvalidTransactionId,
901 : VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
902 24 : END_CRIT_SECTION();
903 : }
904 :
905 24 : UnlockReleaseBuffer(buf);
906 24 : RecordPageWithFreeSpace(onerel, blkno, freespace);
907 24 : continue;
908 : }
909 :
910 : /*
911 : * Prune all HOT-update chains in this page.
912 : *
913 : * We count tuples removed by the pruning step as removed by VACUUM.
914 : */
915 4520 : tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
916 : &vacrelstats->latestRemovedXid);
917 :
918 : /*
919 : * Now scan the page to collect vacuumable items and check for tuples
920 : * requiring freezing.
921 : */
922 4520 : all_visible = true;
923 4520 : has_dead_tuples = false;
924 4520 : nfrozen = 0;
925 4520 : hastup = false;
926 4520 : prev_dead_count = vacrelstats->num_dead_tuples;
927 4520 : maxoff = PageGetMaxOffsetNumber(page);
928 :
929 : /*
930 : * Note: If you change anything in the loop below, also look at
931 : * heap_page_is_all_visible to see if that needs to be changed.
932 : */
933 452285 : for (offnum = FirstOffsetNumber;
934 : offnum <= maxoff;
935 443245 : offnum = OffsetNumberNext(offnum))
936 : {
937 : ItemId itemid;
938 :
939 443245 : itemid = PageGetItemId(page, offnum);
940 :
941 : /* Unused items require no processing, but we count 'em */
942 443245 : if (!ItemIdIsUsed(itemid))
943 : {
944 1774 : nunused += 1;
945 1774 : continue;
946 : }
947 :
948 : /* Redirect items mustn't be touched */
949 441471 : if (ItemIdIsRedirected(itemid))
950 : {
951 293 : hastup = true; /* this page won't be truncatable */
952 293 : continue;
953 : }
954 :
955 441178 : ItemPointerSet(&(tuple.t_self), blkno, offnum);
956 :
957 : /*
958 : * DEAD item pointers are to be vacuumed normally; but we don't
959 : * count them in tups_vacuumed, else we'd be double-counting (at
960 : * least in the common case where heap_page_prune() just freed up
961 : * a non-HOT tuple).
962 : */
963 441178 : if (ItemIdIsDead(itemid))
964 : {
965 71320 : lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
966 71320 : all_visible = false;
967 71320 : continue;
968 : }
969 :
970 369858 : Assert(ItemIdIsNormal(itemid));
971 :
972 369858 : tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
973 369858 : tuple.t_len = ItemIdGetLength(itemid);
974 369858 : tuple.t_tableOid = RelationGetRelid(onerel);
975 :
976 369858 : tupgone = false;
977 :
978 369858 : switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
979 : {
980 : case HEAPTUPLE_DEAD:
981 :
982 : /*
983 : * Ordinarily, DEAD tuples would have been removed by
984 : * heap_page_prune(), but it's possible that the tuple
985 : * state changed since heap_page_prune() looked. In
986 : * particular an INSERT_IN_PROGRESS tuple could have
987 : * changed to DEAD if the inserter aborted. So this
988 : * cannot be considered an error condition.
989 : *
990 : * If the tuple is HOT-updated then it must only be
991 : * removed by a prune operation; so we keep it just as if
992 : * it were RECENTLY_DEAD. Also, if it's a heap-only
993 : * tuple, we choose to keep it, because it'll be a lot
994 : * cheaper to get rid of it in the next pruning pass than
995 : * to treat it like an indexed tuple.
996 : */
997 0 : if (HeapTupleIsHotUpdated(&tuple) ||
998 0 : HeapTupleIsHeapOnly(&tuple))
999 0 : nkeep += 1;
1000 : else
1001 0 : tupgone = true; /* we can delete the tuple */
1002 0 : all_visible = false;
1003 0 : break;
1004 : case HEAPTUPLE_LIVE:
1005 : /* Tuple is good --- but let's do some validity checks */
1006 371671 : if (onerel->rd_rel->relhasoids &&
1007 64878 : !OidIsValid(HeapTupleGetOid(&tuple)))
1008 0 : elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
1009 : relname, blkno, offnum);
1010 :
1011 : /*
1012 : * Is the tuple definitely visible to all transactions?
1013 : *
1014 : * NB: Like with per-tuple hint bits, we can't set the
1015 : * PD_ALL_VISIBLE flag if the inserter committed
1016 : * asynchronously. See SetHintBits for more info. Check
1017 : * that the tuple is hinted xmin-committed because of
1018 : * that.
1019 : */
1020 339232 : if (all_visible)
1021 : {
1022 : TransactionId xmin;
1023 :
1024 288581 : if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1025 : {
1026 0 : all_visible = false;
1027 0 : break;
1028 : }
1029 :
1030 : /*
1031 : * The inserter definitely committed. But is it old
1032 : * enough that everyone sees it as committed?
1033 : */
1034 288581 : xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1035 288581 : if (!TransactionIdPrecedes(xmin, OldestXmin))
1036 : {
1037 144 : all_visible = false;
1038 144 : break;
1039 : }
1040 :
1041 : /* Track newest xmin on page. */
1042 288437 : if (TransactionIdFollows(xmin, visibility_cutoff_xid))
1043 5127 : visibility_cutoff_xid = xmin;
1044 : }
1045 339088 : break;
1046 : case HEAPTUPLE_RECENTLY_DEAD:
1047 :
1048 : /*
1049 : * If tuple is recently deleted then we must not remove it
1050 : * from relation.
1051 : */
1052 30626 : nkeep += 1;
1053 30626 : all_visible = false;
1054 30626 : break;
1055 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1056 : /* This is an expected case during concurrent vacuum */
1057 0 : all_visible = false;
1058 0 : break;
1059 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1060 : /* This is an expected case during concurrent vacuum */
1061 0 : all_visible = false;
1062 0 : break;
1063 : default:
1064 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1065 : break;
1066 : }
1067 :
1068 369858 : if (tupgone)
1069 : {
1070 0 : lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
1071 0 : HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
1072 : &vacrelstats->latestRemovedXid);
1073 0 : tups_vacuumed += 1;
1074 0 : has_dead_tuples = true;
1075 : }
1076 : else
1077 : {
1078 : bool tuple_totally_frozen;
1079 :
1080 369858 : num_tuples += 1;
1081 369858 : hastup = true;
1082 :
1083 : /*
1084 : * Each non-removable tuple must be checked to see if it needs
1085 : * freezing. Note we already have exclusive buffer lock.
1086 : */
1087 739716 : if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
1088 369858 : MultiXactCutoff, &frozen[nfrozen],
1089 : &tuple_totally_frozen))
1090 15279 : frozen[nfrozen++].offset = offnum;
1091 :
1092 369858 : if (!tuple_totally_frozen)
1093 330553 : all_frozen = false;
1094 : }
1095 : } /* scan along page */
1096 :
1097 : /*
1098 : * If we froze any tuples, mark the buffer dirty, and write a WAL
1099 : * record recording the changes. We must log the changes to be
1100 : * crash-safe against future truncation of CLOG.
1101 : */
1102 4520 : if (nfrozen > 0)
1103 : {
1104 247 : START_CRIT_SECTION();
1105 :
1106 247 : MarkBufferDirty(buf);
1107 :
1108 : /* execute collected freezes */
1109 15526 : for (i = 0; i < nfrozen; i++)
1110 : {
1111 : ItemId itemid;
1112 : HeapTupleHeader htup;
1113 :
1114 15279 : itemid = PageGetItemId(page, frozen[i].offset);
1115 15279 : htup = (HeapTupleHeader) PageGetItem(page, itemid);
1116 :
1117 15279 : heap_execute_freeze_tuple(htup, &frozen[i]);
1118 : }
1119 :
1120 : /* Now WAL-log freezing if necessary */
1121 247 : if (RelationNeedsWAL(onerel))
1122 : {
1123 : XLogRecPtr recptr;
1124 :
1125 247 : recptr = log_heap_freeze(onerel, buf, FreezeLimit,
1126 : frozen, nfrozen);
1127 247 : PageSetLSN(page, recptr);
1128 : }
1129 :
1130 247 : END_CRIT_SECTION();
1131 : }
1132 :
1133 : /*
1134 : * If there are no indexes then we can vacuum the page right now
1135 : * instead of doing a second scan.
1136 : */
1137 4922 : if (nindexes == 0 &&
1138 402 : vacrelstats->num_dead_tuples > 0)
1139 : {
1140 : /* Remove tuples from heap */
1141 12 : lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
1142 12 : has_dead_tuples = false;
1143 :
1144 : /*
1145 : * Forget the now-vacuumed tuples, and press on, but be careful
1146 : * not to reset latestRemovedXid since we want that value to be
1147 : * valid.
1148 : */
1149 12 : vacrelstats->num_dead_tuples = 0;
1150 12 : vacuumed_pages++;
1151 : }
1152 :
1153 4520 : freespace = PageGetHeapFreeSpace(page);
1154 :
1155 : /* mark page all-visible, if appropriate */
1156 4520 : if (all_visible && !all_visible_according_to_vm)
1157 2889 : {
1158 2889 : uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1159 :
1160 2889 : if (all_frozen)
1161 370 : flags |= VISIBILITYMAP_ALL_FROZEN;
1162 :
1163 : /*
1164 : * It should never be the case that the visibility map page is set
1165 : * while the page-level bit is clear, but the reverse is allowed
1166 : * (if checksums are not enabled). Regardless, set both bits
1167 : * so that we get back in sync.
1168 : *
1169 : * NB: If the heap page is all-visible but the VM bit is not set,
1170 : * we don't need to dirty the heap page. However, if checksums
1171 : * are enabled, we do need to make sure that the heap page is
1172 : * dirtied before passing it to visibilitymap_set(), because it
1173 : * may be logged. Given that this situation should only happen in
1174 : * rare cases after a crash, it is not worth optimizing.
1175 : */
1176 2889 : PageSetAllVisible(page);
1177 2889 : MarkBufferDirty(buf);
1178 2889 : visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
1179 : vmbuffer, visibility_cutoff_xid, flags);
1180 : }
1181 :
1182 : /*
1183 : * As of PostgreSQL 9.2, the visibility map bit should never be set if
1184 : * the page-level bit is clear. However, it's possible that the bit
1185 : * got cleared after we checked it and before we took the buffer
1186 : * content lock, so we must recheck before jumping to the conclusion
1187 : * that something bad has happened.
1188 : */
1189 1631 : else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1190 0 : && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
1191 : {
1192 0 : elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1193 : relname, blkno);
1194 0 : visibilitymap_clear(onerel, blkno, vmbuffer,
1195 : VISIBILITYMAP_VALID_BITS);
1196 : }
1197 :
1198 : /*
1199 : * It's possible for the value returned by GetOldestXmin() to move
1200 : * backwards, so it's not wrong for us to see tuples that appear to
1201 : * not be visible to everyone yet, while PD_ALL_VISIBLE is already
1202 : * set. The real safe xmin value never moves backwards, but
1203 : * GetOldestXmin() is conservative and sometimes returns a value
1204 : * that's unnecessarily small, so if we see that contradiction it just
1205 : * means that the tuples that we think are not visible to everyone yet
1206 : * actually are, and the PD_ALL_VISIBLE flag is correct.
1207 : *
1208 : * There should never be dead tuples on a page with PD_ALL_VISIBLE
1209 : * set, however.
1210 : */
1211 1631 : else if (PageIsAllVisible(page) && has_dead_tuples)
1212 : {
1213 0 : elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1214 : relname, blkno);
1215 0 : PageClearAllVisible(page);
1216 0 : MarkBufferDirty(buf);
1217 0 : visibilitymap_clear(onerel, blkno, vmbuffer,
1218 : VISIBILITYMAP_VALID_BITS);
1219 : }
1220 :
1221 : /*
1222 : * If the all-visible page turns out to be all-frozen but not
1223 : * marked, we should so mark it. Note that all_frozen is only valid
1224 : * if all_visible is true, so we must check both.
1225 : */
1226 1867 : else if (all_visible_according_to_vm && all_visible && all_frozen &&
1227 236 : !VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
1228 : {
1229 : /*
1230 : * We can pass InvalidTransactionId as the cutoff XID here,
1231 : * because setting the all-frozen bit doesn't cause recovery
1232 : * conflicts.
1233 : */
1234 0 : visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
1235 : vmbuffer, InvalidTransactionId,
1236 : VISIBILITYMAP_ALL_FROZEN);
1237 : }
1238 :
1239 4520 : UnlockReleaseBuffer(buf);
1240 :
1241 : /* Remember the location of the last page with nonremovable tuples */
1242 4520 : if (hastup)
1243 4048 : vacrelstats->nonempty_pages = blkno + 1;
1244 :
1245 : /*
1246 : * If we remembered any tuples for deletion, then the page will be
1247 : * visited again by lazy_vacuum_heap, which will compute and record
1248 : * its post-compaction free space. If not, then we're done with this
1249 : * page, so remember its free space as-is. (This path will always be
1250 : * taken if there are no indexes.)
1251 : */
1252 4520 : if (vacrelstats->num_dead_tuples == prev_dead_count)
1253 3545 : RecordPageWithFreeSpace(onerel, blkno, freespace);
1254 : }
1255 :
1256 : /* report that everything is scanned and vacuumed */
1257 383 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1258 :
1259 383 : pfree(frozen);
1260 :
1261 : /* save stats for use later */
1262 383 : vacrelstats->scanned_tuples = num_tuples;
1263 383 : vacrelstats->tuples_deleted = tups_vacuumed;
1264 383 : vacrelstats->new_dead_tuples = nkeep;
1265 :
1266 : /* now we can compute the new value for pg_class.reltuples */
1267 383 : vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
1268 : nblocks,
1269 : vacrelstats->tupcount_pages,
1270 : num_tuples);
1271 :
1272 : /*
1273 : * Release any remaining pin on visibility map page.
1274 : */
1275 383 : if (BufferIsValid(vmbuffer))
1276 : {
1277 238 : ReleaseBuffer(vmbuffer);
1278 238 : vmbuffer = InvalidBuffer;
1279 : }
1280 :
1281 : /* If any tuples need to be deleted, perform final vacuum cycle */
1282 : /* XXX put a threshold on min number of tuples here? */
1283 383 : if (vacrelstats->num_dead_tuples > 0)
1284 : {
1285 58 : const int hvp_index[] = {
1286 : PROGRESS_VACUUM_PHASE,
1287 : PROGRESS_VACUUM_NUM_INDEX_VACUUMS
1288 : };
1289 : int64 hvp_val[2];
1290 :
1291 : /* Log cleanup info before we touch indexes */
1292 58 : vacuum_log_cleanup_info(onerel, vacrelstats);
1293 :
1294 : /* Report that we are now vacuuming indexes */
1295 58 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1296 : PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
1297 :
1298 : /* Remove index entries */
1299 167 : for (i = 0; i < nindexes; i++)
1300 218 : lazy_vacuum_index(Irel[i],
1301 109 : &indstats[i],
1302 : vacrelstats);
1303 :
1304 : /* Report that we are now vacuuming the heap */
1305 58 : hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
1306 58 : hvp_val[1] = vacrelstats->num_index_scans + 1;
1307 58 : pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
1308 :
1309 : /* Remove tuples from heap */
1310 58 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1311 : PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
1312 58 : lazy_vacuum_heap(onerel, vacrelstats);
1313 58 : vacrelstats->num_index_scans++;
1314 : }
1315 :
1316 : /* report all blocks vacuumed; and that we're cleaning up */
1317 383 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1318 383 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1319 : PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
1320 :
1321 : /* Do post-vacuum cleanup and statistics update for each index */
1322 807 : for (i = 0; i < nindexes; i++)
1323 424 : lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
1324 :
1325 : /* If no indexes, make log report that lazy_vacuum_heap would've made */
1326 383 : if (vacuumed_pages)
1327 8 : ereport(elevel,
1328 : (errmsg("\"%s\": removed %.0f row versions in %u pages",
1329 : RelationGetRelationName(onerel),
1330 : tups_vacuumed, vacuumed_pages)));
1331 :
1332 : /*
1333 : * This is pretty messy, but we split it up so that we can skip emitting
1334 : * individual parts of the message when not applicable.
1335 : */
1336 383 : initStringInfo(&buf);
1337 383 : appendStringInfo(&buf,
1338 : _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
1339 : nkeep, OldestXmin);
1340 383 : appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"),
1341 : nunused);
1342 383 : appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1343 : "Skipped %u pages due to buffer pins, ",
1344 : vacrelstats->pinskipped_pages),
1345 : vacrelstats->pinskipped_pages);
1346 383 : appendStringInfo(&buf, ngettext("%u frozen page.\n",
1347 : "%u frozen pages.\n",
1348 : vacrelstats->frozenskipped_pages),
1349 : vacrelstats->frozenskipped_pages);
1350 383 : appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
1351 : "%u pages are entirely empty.\n",
1352 : empty_pages),
1353 : empty_pages);
1354 383 : appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1355 :
1356 383 : ereport(elevel,
1357 : (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1358 : RelationGetRelationName(onerel),
1359 : tups_vacuumed, num_tuples,
1360 : vacrelstats->scanned_pages, nblocks),
1361 : errdetail_internal("%s", buf.data)));
1362 383 : pfree(buf.data);
1363 383 : }
1364 :
1365 :
1366 : /*
1367 : * lazy_vacuum_heap() -- second pass over the heap
1368 : *
1369 : * This routine marks dead tuples as unused and compacts out free
1370 : * space on their pages. Pages not having dead tuples recorded from
1371 : * lazy_scan_heap are not visited at all.
1372 : *
1373 : * Note: the reason for doing this as a second pass is we cannot remove
1374 : * the tuples until we've removed their index entries, and we want to
1375 : * process index entry removal in batches as large as possible.
1376 : */
1377 : static void
1378 58 : lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
1379 : {
1380 : int tupindex;
1381 : int npages;
1382 : PGRUsage ru0;
1383 58 : Buffer vmbuffer = InvalidBuffer;
1384 :
1385 58 : pg_rusage_init(&ru0);
1386 58 : npages = 0;
1387 :
1388 58 : tupindex = 0;
1389 1009 : while (tupindex < vacrelstats->num_dead_tuples)
1390 : {
1391 : BlockNumber tblk;
1392 : Buffer buf;
1393 : Page page;
1394 : Size freespace;
1395 :
1396 893 : vacuum_delay_point();
1397 :
1398 893 : tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1399 893 : buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1400 : vac_strategy);
1401 893 : if (!ConditionalLockBufferForCleanup(buf))
1402 : {
1403 0 : ReleaseBuffer(buf);
1404 0 : ++tupindex;
1405 0 : continue;
1406 : }
1407 893 : tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
1408 : &vmbuffer);
1409 :
1410 : /* Now that we've compacted the page, record its available space */
1411 893 : page = BufferGetPage(buf);
1412 893 : freespace = PageGetHeapFreeSpace(page);
1413 :
1414 893 : UnlockReleaseBuffer(buf);
1415 893 : RecordPageWithFreeSpace(onerel, tblk, freespace);
1416 893 : npages++;
1417 : }
1418 :
1419 58 : if (BufferIsValid(vmbuffer))
1420 : {
1421 58 : ReleaseBuffer(vmbuffer);
1422 58 : vmbuffer = InvalidBuffer;
1423 : }
1424 :
1425 58 : ereport(elevel,
1426 : (errmsg("\"%s\": removed %d row versions in %d pages",
1427 : RelationGetRelationName(onerel),
1428 : tupindex, npages),
1429 : errdetail_internal("%s", pg_rusage_show(&ru0))));
1430 58 : }
1431 :
1432 : /*
1433 : * lazy_vacuum_page() -- free dead tuples on a page
1434 : * and repair its fragmentation.
1435 : *
1436 : * Caller must hold pin and buffer cleanup lock on the buffer.
1437 : *
1438 : * tupindex is the index in vacrelstats->dead_tuples of the first dead
1439 : * tuple for this page. We assume the rest follow sequentially.
1440 : * The return value is the first tupindex after the tuples of this page.
1441 : */
1442 : static int
1443 905 : lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
1444 : int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
1445 : {
1446 905 : Page page = BufferGetPage(buffer);
1447 : OffsetNumber unused[MaxOffsetNumber];
1448 905 : int uncnt = 0;
1449 : TransactionId visibility_cutoff_xid;
1450 : bool all_frozen;
1451 :
1452 905 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1453 :
1454 905 : START_CRIT_SECTION();
1455 :
1456 68666 : for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
1457 : {
1458 : BlockNumber tblk;
1459 : OffsetNumber toff;
1460 : ItemId itemid;
1461 :
1462 68596 : tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1463 68596 : if (tblk != blkno)
1464 835 : break; /* past end of tuples for this block */
1465 67761 : toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
1466 67761 : itemid = PageGetItemId(page, toff);
1467 67761 : ItemIdSetUnused(itemid);
1468 67761 : unused[uncnt++] = toff;
1469 : }
1470 :
1471 905 : PageRepairFragmentation(page);
1472 :
1473 : /*
1474 : * Mark buffer dirty before we write WAL.
1475 : */
1476 905 : MarkBufferDirty(buffer);
1477 :
1478 : /* XLOG stuff */
1479 905 : if (RelationNeedsWAL(onerel))
1480 : {
1481 : XLogRecPtr recptr;
1482 :
1483 905 : recptr = log_heap_clean(onerel, buffer,
1484 : NULL, 0, NULL, 0,
1485 : unused, uncnt,
1486 : vacrelstats->latestRemovedXid);
1487 905 : PageSetLSN(page, recptr);
1488 : }
1489 :
1490 : /*
1491 : * End critical section, so we safely can do visibility tests (which
1492 : * possibly need to perform IO and allocate memory!). If we crash now the
1493 : * page (including the corresponding vm bit) might not be marked all
1494 : * visible, but that's fine. A later vacuum will fix that.
1495 : */
1496 905 : END_CRIT_SECTION();
1497 :
1498 : /*
1499 : * Now that we have removed the dead tuples from the page, once again
1500 : * check if the page has become all-visible. The page is already marked
1501 : * dirty, exclusively locked, and, if needed, a full page image has been
1502 : * emitted in the log_heap_clean() above.
1503 : */
1504 905 : if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
1505 : &all_frozen))
1506 846 : PageSetAllVisible(page);
1507 :
1508 : /*
1509 : * All the changes to the heap page have been done. If the all-visible
1510 : * flag is now set, also set the VM all-visible bit (and, if possible, the
1511 : * all-frozen bit) unless this has already been done previously.
1512 : * all-frozen bit) unless that was already done.
1513 905 : if (PageIsAllVisible(page))
1514 : {
1515 846 : uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
1516 846 : uint8 flags = 0;
1517 :
1518 : /* Add the all-visible bit, and the all-frozen bit if appropriate, to flags */
1519 846 : if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
1520 846 : flags |= VISIBILITYMAP_ALL_VISIBLE;
1521 846 : if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
1522 504 : flags |= VISIBILITYMAP_ALL_FROZEN;
1523 :
1524 846 : Assert(BufferIsValid(*vmbuffer));
1525 846 : if (flags != 0)
1526 846 : visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
1527 : *vmbuffer, visibility_cutoff_xid, flags);
1528 : }
1529 :
1530 905 : return tupindex;
1531 : }
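
/*
 * [Editor's note, illustrative; not part of the original source.]  The
 * tupindex contract above is easiest to see with a worked example: if
 * dead_tuples holds the sorted TIDs (block 3, off 2), (3, 7), (5, 1) and
 * the caller passes tupindex = 0 with blkno = 3, the loop marks offsets
 * 2 and 7 unused, stops when it reaches (5, 1), and returns 2; the caller
 * then passes tupindex = 2 once it reaches block 5, exactly as the loop
 * in lazy_vacuum_heap() above does.
 */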
1532 :
1533 : /*
1534 : * lazy_check_needs_freeze() -- scan page to see if any tuples
1535 : * need to be cleaned to avoid wraparound
1536 : *
1537 : * Returns true if the page needs to be vacuumed using a cleanup lock.
1538 : * Also sets *hastup to indicate whether the page contains any tuples at all.
1539 : */
1540 : static bool
1541 0 : lazy_check_needs_freeze(Buffer buf, bool *hastup)
1542 : {
1543 0 : Page page = BufferGetPage(buf);
1544 : OffsetNumber offnum,
1545 : maxoff;
1546 : HeapTupleHeader tupleheader;
1547 :
1548 0 : *hastup = false;
1549 :
1550 : /* If we hit an uninitialized page, we want to force vacuuming it. */
1551 0 : if (PageIsNew(page))
1552 0 : return true;
1553 :
1554 : /* Quick out for ordinary empty page. */
1555 0 : if (PageIsEmpty(page))
1556 0 : return false;
1557 :
1558 0 : maxoff = PageGetMaxOffsetNumber(page);
1559 0 : for (offnum = FirstOffsetNumber;
1560 : offnum <= maxoff;
1561 0 : offnum = OffsetNumberNext(offnum))
1562 : {
1563 : ItemId itemid;
1564 :
1565 0 : itemid = PageGetItemId(page, offnum);
1566 :
1567 : /* this should match hastup test in count_nondeletable_pages() */
1568 0 : if (ItemIdIsUsed(itemid))
1569 0 : *hastup = true;
1570 :
1571 : /* dead and redirect items never need freezing */
1572 0 : if (!ItemIdIsNormal(itemid))
1573 0 : continue;
1574 :
1575 0 : tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1576 :
1577 0 : if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
1578 : MultiXactCutoff, buf))
1579 0 : return true;
1580 : } /* scan along page */
1581 :
1582 0 : return false;
1583 : }
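
/*
 * [Editor's note, illustrative sketch; the real caller is the heap-scan
 * phase elsewhere in this file, and the control flow below is only an
 * assumption about how such a caller might look.]  A caller that fails to
 * get a cleanup lock can fall back to a share lock and use this check to
 * decide whether the page can safely be skipped:
 *
 *     bool    hastup;
 *
 *     if (!ConditionalLockBufferForCleanup(buf))
 *     {
 *         LockBuffer(buf, BUFFER_LOCK_SHARE);
 *         if (!lazy_check_needs_freeze(buf, &hastup))
 *         {
 *             UnlockReleaseBuffer(buf);   // nothing needs freezing; skip
 *             continue;
 *         }
 *         // else: trade the share lock for a full cleanup lock and vacuum
 *     }
 */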
1584 :
1585 :
1586 : /*
1587 : * lazy_vacuum_index() -- vacuum one index relation.
1588 : *
1589 : * Delete all the index entries pointing to tuples listed in
1590 : * vacrelstats->dead_tuples, and update running statistics.
1591 : */
1592 : static void
1593 109 : lazy_vacuum_index(Relation indrel,
1594 : IndexBulkDeleteResult **stats,
1595 : LVRelStats *vacrelstats)
1596 : {
1597 : IndexVacuumInfo ivinfo;
1598 : PGRUsage ru0;
1599 :
1600 109 : pg_rusage_init(&ru0);
1601 :
1602 109 : ivinfo.index = indrel;
1603 109 : ivinfo.analyze_only = false;
1604 109 : ivinfo.estimated_count = true;
1605 109 : ivinfo.message_level = elevel;
1606 109 : ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
1607 109 : ivinfo.strategy = vac_strategy;
1608 :
1609 : /* Do bulk deletion */
1610 109 : *stats = index_bulk_delete(&ivinfo, *stats,
1611 : lazy_tid_reaped, (void *) vacrelstats);
1612 :
1613 109 : ereport(elevel,
1614 : (errmsg("scanned index \"%s\" to remove %d row versions",
1615 : RelationGetRelationName(indrel),
1616 : vacrelstats->num_dead_tuples),
1617 : errdetail_internal("%s", pg_rusage_show(&ru0))));
1618 109 : }
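
/*
 * [Editor's note, illustrative.]  index_bulk_delete() walks the index and
 * calls the callback once per index entry, passing the entry's heap TID;
 * entries for which the callback returns true are removed.  The callback
 * type (declared in access/genam.h) is:
 *
 *     typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr,
 *                                              void *state);
 *
 * Here lazy_tid_reaped (below) plays that role, with vacrelstats passed
 * through as the opaque state pointer.
 */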
1619 :
1620 : /*
1621 : * lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
1622 : */
1623 : static void
1624 424 : lazy_cleanup_index(Relation indrel,
1625 : IndexBulkDeleteResult *stats,
1626 : LVRelStats *vacrelstats)
1627 : {
1628 : IndexVacuumInfo ivinfo;
1629 : PGRUsage ru0;
1630 :
1631 424 : pg_rusage_init(&ru0);
1632 :
1633 424 : ivinfo.index = indrel;
1634 424 : ivinfo.analyze_only = false;
1635 424 : ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages);
1636 424 : ivinfo.message_level = elevel;
1637 424 : ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
1638 424 : ivinfo.strategy = vac_strategy;
1639 :
1640 424 : stats = index_vacuum_cleanup(&ivinfo, stats);
1641 :
1642 424 : if (!stats)
1643 429 : return;
1644 :
1645 : /*
1646 : * Now update statistics in pg_class, but only if the index says the count
1647 : * is accurate.
1648 : */
1649 419 : if (!stats->estimated_count)
1650 419 : vac_update_relstats(indrel,
1651 : stats->num_pages,
1652 : stats->num_index_tuples,
1653 : 0,
1654 : false,
1655 : InvalidTransactionId,
1656 : InvalidMultiXactId,
1657 : false);
1658 :
1659 419 : ereport(elevel,
1660 : (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
1661 : RelationGetRelationName(indrel),
1662 : stats->num_index_tuples,
1663 : stats->num_pages),
1664 : errdetail("%.0f index row versions were removed.\n"
1665 : "%u index pages have been deleted, %u are currently reusable.\n"
1666 : "%s.",
1667 : stats->tuples_removed,
1668 : stats->pages_deleted, stats->pages_free,
1669 : pg_rusage_show(&ru0))));
1670 :
1671 419 : pfree(stats);
1672 : }
1673 :
1674 : /*
1675 : * should_attempt_truncation - should we attempt to truncate the heap?
1676 : *
1677 : * Don't even think about it unless we have a shot at releasing a goodly
1678 : * number of pages. Otherwise, the time taken isn't worth it.
1679 : *
1680 : * Also don't attempt it if we are doing early pruning/vacuuming, because a
1681 : * scan which cannot find a truncated heap page cannot determine that the
1682 : * snapshot is too old to read that page. We might be able to get away with
1683 : * truncating all except one of the pages, setting its LSN to (at least) the
1684 : * maximum of the truncated range if we also treated an index leaf tuple
1685 : * pointing to a missing heap page as something to trigger the "snapshot too
1686 : * old" error, but that seems fragile and would deserve its own patch
1687 : * if we ever pursue it.
1688 : *
1689 : * This is split out so that we can test whether truncation is going to be
1690 : * called for before we actually do it. If you change the logic here, be
1691 : * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
1692 : */
1693 : static bool
1694 384 : should_attempt_truncation(LVRelStats *vacrelstats)
1695 : {
1696 : BlockNumber possibly_freeable;
1697 :
1698 384 : possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
1699 384 : if (possibly_freeable > 0 &&
1700 16 : (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
1701 29 : possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) &&
1702 13 : old_snapshot_threshold < 0)
1703 13 : return true;
1704 : else
1705 371 : return false;
1706 : }
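
/*
 * [Editor's note, worked example.]  With REL_TRUNCATE_MINIMUM = 1000 and
 * REL_TRUNCATE_FRACTION = 16 (see the top of this file), the effective
 * threshold is Min(1000, rel_pages / 16): a 100,000-page table qualifies
 * once at least 1,000 trailing pages are potentially freeable, while a
 * 1,000-page table qualifies at 1000 / 16 = 62 pages.  Truncation is also
 * skipped entirely whenever old_snapshot_threshold is enabled (>= 0).
 */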
1707 :
1708 : /*
1709 : * lazy_truncate_heap - try to truncate off any empty pages at the end
1710 : */
1711 : static void
1712 12 : lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1713 : {
1714 12 : BlockNumber old_rel_pages = vacrelstats->rel_pages;
1715 : BlockNumber new_rel_pages;
1716 : PGRUsage ru0;
1717 : int lock_retry;
1718 :
1719 12 : pg_rusage_init(&ru0);
1720 :
1721 : /* Report that we are now truncating */
1722 12 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1723 : PROGRESS_VACUUM_PHASE_TRUNCATE);
1724 :
1725 : /*
1726 : * Loop until no more truncating can be done.
1727 : */
1728 : do
1729 : {
1730 : /*
1731 : * We need full exclusive lock on the relation in order to do
1732 : * truncation. If we can't get it, give up rather than waiting --- we
1733 : * don't want to block other backends, and we don't want to deadlock
1734 : * (which is quite possible considering we already hold a lower-grade
1735 : * lock).
1736 : */
1737 12 : vacrelstats->lock_waiter_detected = false;
1738 12 : lock_retry = 0;
1739 : while (true)
1740 : {
1741 12 : if (ConditionalLockRelation(onerel, AccessExclusiveLock))
1742 12 : break;
1743 :
1744 : /*
1745 : * Check for interrupts while trying to (re-)acquire the exclusive
1746 : * lock.
1747 : */
1748 0 : CHECK_FOR_INTERRUPTS();
1749 :
1750 0 : if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
1751 : VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
1752 : {
1753 : /*
1754 : * We failed to establish the lock in the specified number of
1755 : * retries, so give up on truncating.
1756 : */
1757 0 : vacrelstats->lock_waiter_detected = true;
1758 0 : ereport(elevel,
1759 : (errmsg("\"%s\": stopping truncate due to conflicting lock request",
1760 : RelationGetRelationName(onerel))));
1761 0 : return;
1762 : }
1763 :
1764 0 : pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
1765 0 : }
1766 :
1767 : /*
1768 : * Now that we have exclusive lock, look to see if the rel has grown
1769 : * whilst we were vacuuming with non-exclusive lock. If so, give up;
1770 : * the newly added pages presumably contain non-deletable tuples.
1771 : */
1772 12 : new_rel_pages = RelationGetNumberOfBlocks(onerel);
1773 12 : if (new_rel_pages != old_rel_pages)
1774 : {
1775 : /*
1776 : * Note: we intentionally don't update vacrelstats->rel_pages with
1777 : * the new rel size here. If we did, it would amount to assuming
1778 : * that the new pages are empty, which is unlikely. Leaving the
1779 : * numbers alone amounts to assuming that the new pages have the
1780 : * same tuple density as existing ones, which is less unlikely.
1781 : */
1782 0 : UnlockRelation(onerel, AccessExclusiveLock);
1783 0 : return;
1784 : }
1785 :
1786 : /*
1787 : * Scan backwards from the end to verify that the end pages actually
1788 : * contain no tuples. This is *necessary*, not optional, because
1789 : * other backends could have added tuples to these pages whilst we
1790 : * were vacuuming.
1791 : */
1792 12 : new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1793 :
1794 12 : if (new_rel_pages >= old_rel_pages)
1795 : {
1796 : /* can't do anything after all */
1797 0 : UnlockRelation(onerel, AccessExclusiveLock);
1798 0 : return;
1799 : }
1800 :
1801 : /*
1802 : * Okay to truncate.
1803 : */
1804 12 : RelationTruncate(onerel, new_rel_pages);
1805 :
1806 : /*
1807 : * We can release the exclusive lock as soon as we have truncated.
1808 : * Other backends can't safely access the relation until they have
1809 : * processed the smgr invalidation that smgrtruncate sent out ... but
1810 : * that should happen as part of standard invalidation processing once
1811 : * they acquire lock on the relation.
1812 : */
1813 12 : UnlockRelation(onerel, AccessExclusiveLock);
1814 :
1815 : /*
1816 : * Update statistics. Here, it *is* correct to adjust rel_pages
1817 : * without also touching reltuples, since the tuple count wasn't
1818 : * changed by the truncation.
1819 : */
1820 12 : vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
1821 12 : vacrelstats->rel_pages = new_rel_pages;
1822 :
1823 12 : ereport(elevel,
1824 : (errmsg("\"%s\": truncated %u to %u pages",
1825 : RelationGetRelationName(onerel),
1826 : old_rel_pages, new_rel_pages),
1827 : errdetail_internal("%s",
1828 : pg_rusage_show(&ru0))));
1829 12 : old_rel_pages = new_rel_pages;
1830 12 : } while (new_rel_pages > vacrelstats->nonempty_pages &&
1831 12 : vacrelstats->lock_waiter_detected);
1832 : }
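
/*
 * [Editor's note, worked example.]  The retry loop above gives up after
 * VACUUM_TRUNCATE_LOCK_TIMEOUT / VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL =
 * 5000 / 50 = 100 failed attempts, sleeping 50 ms between attempts, i.e.
 * after roughly five seconds of waiting for the AccessExclusiveLock.
 */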
1833 :
1834 : /*
1835 : * Rescan end pages to verify that they are (still) empty of tuples.
1836 : *
1837 : * Returns number of nondeletable pages (last nonempty page + 1).
1838 : */
1839 : static BlockNumber
1840 12 : count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1841 : {
1842 : BlockNumber blkno;
1843 : BlockNumber prefetchedUntil;
1844 : instr_time starttime;
1845 :
1846 : /* Record the start time, used below when checking for conflicting lock requests */
1847 12 : INSTR_TIME_SET_CURRENT(starttime);
1848 :
1849 : /*
1850 : * Start checking blocks at what we believe relation end to be and move
1851 : * Start checking blocks at what we believe the relation's end to be and move
1852 : * unsigned.) To make the scan faster, we prefetch a few blocks at a time
1853 : * in forward direction, so that OS-level readahead can kick in.
1854 : */
1855 12 : blkno = vacrelstats->rel_pages;
1856 : StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
1857 : "prefetch size must be power of 2");
1858 12 : prefetchedUntil = InvalidBlockNumber;
1859 92 : while (blkno > vacrelstats->nonempty_pages)
1860 : {
1861 : Buffer buf;
1862 : Page page;
1863 : OffsetNumber offnum,
1864 : maxoff;
1865 : bool hastup;
1866 :
1867 : /*
1868 : * Check if another process requests a lock on our relation. We are
1869 : * holding an AccessExclusiveLock here, so they will be waiting. We
1870 : * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
1871 : * only check if that interval has elapsed once every 32 blocks to
1872 : * keep the number of system calls and actual shared lock table
1873 : * lookups to a minimum.
1874 : */
1875 68 : if ((blkno % 32) == 0)
1876 : {
1877 : instr_time currenttime;
1878 : instr_time elapsed;
1879 :
1880 3 : INSTR_TIME_SET_CURRENT(currenttime);
1881 3 : elapsed = currenttime;
1882 3 : INSTR_TIME_SUBTRACT(elapsed, starttime);
1883 3 : if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
1884 : >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
1885 : {
1886 0 : if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
1887 : {
1888 0 : ereport(elevel,
1889 : (errmsg("\"%s\": suspending truncate due to conflicting lock request",
1890 : RelationGetRelationName(onerel))));
1891 :
1892 0 : vacrelstats->lock_waiter_detected = true;
1893 0 : return blkno;
1894 : }
1895 0 : starttime = currenttime;
1896 : }
1897 : }
1898 :
1899 : /*
1900 : * We don't insert a vacuum delay point here, because we have an
1901 : * exclusive lock on the table which we want to hold for as short a
1902 : * time as possible. We still need to check for interrupts however.
1903 : */
1904 68 : CHECK_FOR_INTERRUPTS();
1905 :
1906 68 : blkno--;
1907 :
1908 : /* If we haven't prefetched this lot yet, do so now. */
1909 68 : if (prefetchedUntil > blkno)
1910 : {
1911 : BlockNumber prefetchStart;
1912 : BlockNumber pblkno;
1913 :
1914 15 : prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
1915 196 : for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
1916 : {
1917 181 : PrefetchBuffer(onerel, MAIN_FORKNUM, pblkno);
1918 181 : CHECK_FOR_INTERRUPTS();
1919 : }
1920 15 : prefetchedUntil = prefetchStart;
1921 : }
1922 :
1923 68 : buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1924 : RBM_NORMAL, vac_strategy);
1925 :
1926 : /* In this phase we only need shared access to the buffer */
1927 68 : LockBuffer(buf, BUFFER_LOCK_SHARE);
1928 :
1929 68 : page = BufferGetPage(buf);
1930 :
1931 68 : if (PageIsNew(page) || PageIsEmpty(page))
1932 : {
1933 : /* PageIsNew probably shouldn't happen... */
1934 17 : UnlockReleaseBuffer(buf);
1935 17 : continue;
1936 : }
1937 :
1938 51 : hastup = false;
1939 51 : maxoff = PageGetMaxOffsetNumber(page);
1940 1847 : for (offnum = FirstOffsetNumber;
1941 : offnum <= maxoff;
1942 1745 : offnum = OffsetNumberNext(offnum))
1943 : {
1944 : ItemId itemid;
1945 :
1946 1745 : itemid = PageGetItemId(page, offnum);
1947 :
1948 : /*
1949 : * Note: any non-unused item should be taken as a reason to keep
1950 : * this page. We formerly thought that DEAD tuples could be
1951 : * thrown away, but that's not so, because we'd not have cleaned
1952 : * out their index entries.
1953 : */
1954 1745 : if (ItemIdIsUsed(itemid))
1955 : {
1956 0 : hastup = true;
1957 0 : break; /* can stop scanning */
1958 : }
1959 : } /* scan along page */
1960 :
1961 51 : UnlockReleaseBuffer(buf);
1962 :
1963 : /* Done scanning if we found a tuple here */
1964 51 : if (hastup)
1965 0 : return blkno + 1;
1966 : }
1967 :
1968 : /*
1969 : * If we fall out of the loop, all the previously-thought-to-be-empty
1970 : * pages still are; we need not bother to look at the last known-nonempty
1971 : * page.
1972 : */
1973 12 : return vacrelstats->nonempty_pages;
1974 : }
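
/*
 * [Editor's note, illustrative; treat the exact PREFETCH_SIZE value as an
 * assumption, since its definition appears earlier in this file.]  The
 * expression blkno & ~(PREFETCH_SIZE - 1) rounds blkno down to a multiple
 * of PREFETCH_SIZE, which is why it must be a power of 2.  For example,
 * with PREFETCH_SIZE = 32 and blkno = 70:
 *
 *     prefetchStart = 70 & ~31 = 64;
 *
 * so blocks 64..70 are prefetched in ascending order, letting OS-level
 * readahead help even though the outer scan walks the table backwards.
 */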
1975 :
1976 : /*
1977 : * lazy_space_alloc - space allocation decisions for lazy vacuum
1978 : *
1979 : * See the comments at the head of this file for rationale.
1980 : */
1981 : static void
1982 384 : lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1983 : {
1984 : long maxtuples;
1985 775 : int vac_work_mem = IsAutoVacuumWorkerProcess() &&
1986 7 : autovacuum_work_mem != -1 ?
1987 384 : autovacuum_work_mem : maintenance_work_mem;
1988 :
1989 384 : if (vacrelstats->hasindex)
1990 : {
1991 290 : maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
1992 290 : maxtuples = Min(maxtuples, INT_MAX);
1993 290 : maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1994 :
1995 : /* curious coding here to ensure the multiplication can't overflow */
1996 290 : if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1997 290 : maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1998 :
1999 : /* stay sane if small maintenance_work_mem */
2000 290 : maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
2001 : }
2002 : else
2003 : {
2004 94 : maxtuples = MaxHeapTuplesPerPage;
2005 : }
2006 :
2007 384 : vacrelstats->num_dead_tuples = 0;
2008 384 : vacrelstats->max_dead_tuples = (int) maxtuples;
2009 384 : vacrelstats->dead_tuples = (ItemPointer)
2010 384 : palloc(maxtuples * sizeof(ItemPointerData));
2011 384 : }
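
/*
 * [Editor's note, worked example; sizeof(ItemPointerData) is 6 bytes: a
 * 4-byte block number plus a 2-byte offset.]  With maintenance_work_mem
 * set to 64MB (65536 kB), the indexed-table branch starts from
 *
 *     maxtuples = 65536 * 1024 / 6 = 11,184,810 TIDs;
 *
 * that is then capped at relblocks * LAZY_ALLOC_TUPLES so a small table
 * does not allocate the whole area, and floored at MaxHeapTuplesPerPage
 * so at least one page's worth of dead tuples always fits.
 */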
2012 :
2013 : /*
2014 : * lazy_record_dead_tuple - remember one deletable tuple
2015 : */
2016 : static void
2017 71320 : lazy_record_dead_tuple(LVRelStats *vacrelstats,
2018 : ItemPointer itemptr)
2019 : {
2020 : /*
2021 : * The array shouldn't overflow under normal behavior, but perhaps it
2022 : * could if we are given a really small maintenance_work_mem. In that
2023 : * case, just forget the last few tuples (we'll get 'em next time).
2024 : */
2025 71320 : if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
2026 : {
2027 71320 : vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
2028 71320 : vacrelstats->num_dead_tuples++;
2029 71320 : pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2030 71320 : vacrelstats->num_dead_tuples);
2031 : }
2032 71320 : }
2033 :
2034 : /*
2035 : * lazy_tid_reaped() -- is a particular tid deletable?
2036 : *
2037 : * This has the right signature to be an IndexBulkDeleteCallback.
2038 : *
2039 : * Assumes dead_tuples array is in sorted order.
2040 : */
2041 : static bool
2042 233492 : lazy_tid_reaped(ItemPointer itemptr, void *state)
2043 : {
2044 233492 : LVRelStats *vacrelstats = (LVRelStats *) state;
2045 : ItemPointer res;
2046 :
2047 466984 : res = (ItemPointer) bsearch((void *) itemptr,
2048 233492 : (void *) vacrelstats->dead_tuples,
2049 233492 : vacrelstats->num_dead_tuples,
2050 : sizeof(ItemPointerData),
2051 : vac_cmp_itemptr);
2052 :
2053 233492 : return (res != NULL);
2054 : }
2055 :
2056 : /*
2057 : * Comparator routines for use with qsort() and bsearch().
2058 : */
2059 : static int
2060 2573246 : vac_cmp_itemptr(const void *left, const void *right)
2061 : {
2062 : BlockNumber lblk,
2063 : rblk;
2064 : OffsetNumber loff,
2065 : roff;
2066 :
2067 2573246 : lblk = ItemPointerGetBlockNumber((ItemPointer) left);
2068 2573246 : rblk = ItemPointerGetBlockNumber((ItemPointer) right);
2069 :
2070 2573246 : if (lblk < rblk)
2071 959590 : return -1;
2072 1613656 : if (lblk > rblk)
2073 647898 : return 1;
2074 :
2075 965758 : loff = ItemPointerGetOffsetNumber((ItemPointer) left);
2076 965758 : roff = ItemPointerGetOffsetNumber((ItemPointer) right);
2077 :
2078 965758 : if (loff < roff)
2079 449113 : return -1;
2080 516645 : if (loff > roff)
2081 389545 : return 1;
2082 :
2083 127100 : return 0;
2084 : }
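
/*
 * [Editor's note, illustrative sketch; the variables below are hypothetical.]
 * TIDs compare block number first and offset second, which is the order the
 * heap scan appends them in, so dead_tuples is already sorted by the time
 * lazy_tid_reaped() runs:
 *
 *     ItemPointerData tids[3];
 *     ItemPointerData probe;
 *
 *     ItemPointerSet(&tids[0], 3, 2);
 *     ItemPointerSet(&tids[1], 3, 7);
 *     ItemPointerSet(&tids[2], 5, 1);
 *
 *     ItemPointerSet(&probe, 4, 1);
 *     // bsearch() returns NULL: (4,1) is not a dead TID, so an index
 *     // entry pointing at it would be kept.
 *     bsearch(&probe, tids, 3, sizeof(ItemPointerData), vac_cmp_itemptr);
 */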
2085 :
2086 : /*
2087 : * Check if every tuple in the given page is visible to all current and future
2088 : * transactions. Also return the visibility_cutoff_xid, which is the highest
2089 : * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
2090 : * on this page is frozen.
2091 : */
2092 : static bool
2093 905 : heap_page_is_all_visible(Relation rel, Buffer buf,
2094 : TransactionId *visibility_cutoff_xid,
2095 : bool *all_frozen)
2096 : {
2097 905 : Page page = BufferGetPage(buf);
2098 905 : BlockNumber blockno = BufferGetBlockNumber(buf);
2099 : OffsetNumber offnum,
2100 : maxoff;
2101 905 : bool all_visible = true;
2102 :
2103 905 : *visibility_cutoff_xid = InvalidTransactionId;
2104 905 : *all_frozen = true;
2105 :
2106 : /*
2107 : * This is a stripped down version of the line pointer scan in
2108 : * lazy_scan_heap(). So if you change anything here, also check that code.
2109 : */
2110 905 : maxoff = PageGetMaxOffsetNumber(page);
2111 82838 : for (offnum = FirstOffsetNumber;
2112 81087 : offnum <= maxoff && all_visible;
2113 81028 : offnum = OffsetNumberNext(offnum))
2114 : {
2115 : ItemId itemid;
2116 : HeapTupleData tuple;
2117 :
2118 81028 : itemid = PageGetItemId(page, offnum);
2119 :
2120 : /* Unused or redirect line pointers are of no interest */
2121 81028 : if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
2122 63889 : continue;
2123 :
2124 17139 : ItemPointerSet(&(tuple.t_self), blockno, offnum);
2125 :
2126 : /*
2127 : * Dead line pointers can have index pointers pointing to them, so
2128 : * they can't be treated as visible.
2129 : */
2130 17139 : if (ItemIdIsDead(itemid))
2131 : {
2132 0 : all_visible = false;
2133 0 : *all_frozen = false;
2134 0 : break;
2135 : }
2136 :
2137 17139 : Assert(ItemIdIsNormal(itemid));
2138 :
2139 17139 : tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2140 17139 : tuple.t_len = ItemIdGetLength(itemid);
2141 17139 : tuple.t_tableOid = RelationGetRelid(rel);
2142 :
2143 17139 : switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
2144 : {
2145 : case HEAPTUPLE_LIVE:
2146 : {
2147 : TransactionId xmin;
2148 :
2149 : /* Check comments in lazy_scan_heap. */
2150 17080 : if (!HeapTupleHeaderXminCommitted(tuple.t_data))
2151 : {
2152 0 : all_visible = false;
2153 0 : *all_frozen = false;
2154 0 : break;
2155 : }
2156 :
2157 : /*
2158 : * The inserter definitely committed. But is it old enough
2159 : * that everyone sees it as committed?
2160 : */
2161 17080 : xmin = HeapTupleHeaderGetXmin(tuple.t_data);
2162 17080 : if (!TransactionIdPrecedes(xmin, OldestXmin))
2163 : {
2164 0 : all_visible = false;
2165 0 : *all_frozen = false;
2166 0 : break;
2167 : }
2168 :
2169 : /* Track newest xmin on page. */
2170 17080 : if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
2171 1607 : *visibility_cutoff_xid = xmin;
2172 :
2173 : /* Check whether this tuple is already frozen or not */
2174 20115 : if (all_visible && *all_frozen &&
2175 3035 : heap_tuple_needs_eventual_freeze(tuple.t_data))
2176 342 : *all_frozen = false;
2177 : }
2178 17080 : break;
2179 :
2180 : case HEAPTUPLE_DEAD:
2181 : case HEAPTUPLE_RECENTLY_DEAD:
2182 : case HEAPTUPLE_INSERT_IN_PROGRESS:
2183 : case HEAPTUPLE_DELETE_IN_PROGRESS:
2184 : {
2185 59 : all_visible = false;
2186 59 : *all_frozen = false;
2187 59 : break;
2188 : }
2189 : default:
2190 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
2191 : break;
2192 : }
2193 : } /* scan along page */
2194 :
2195 905 : return all_visible;
2196 : }
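
/*
 * [Editor's note, worked example.]  Suppose OldestXmin is 300 and the only
 * normal tuples on the page have committed xmin values 100 and 250: both
 * precede OldestXmin, so the function returns true with
 * *visibility_cutoff_xid = 250, and *all_frozen stays true only if neither
 * tuple still needs eventual freezing.  That cutoff xid is what
 * lazy_vacuum_page() passes to visibilitymap_set() when it sets the VM bits.
 */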
|