Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam.c
4 : * heap access method code
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam.c
12 : *
13 : *
14 : * INTERFACE ROUTINES
15 : * relation_open - open any relation by relation OID
16 : * relation_openrv - open any relation specified by a RangeVar
17 : * relation_close - close any relation
18 : * heap_open - open a heap relation by relation OID
19 : * heap_openrv - open a heap relation specified by a RangeVar
20 : * heap_close - (now just a macro for relation_close)
21 : * heap_beginscan - begin relation scan
22 : * heap_rescan - restart a relation scan
23 : * heap_endscan - end relation scan
24 : * heap_getnext - retrieve next tuple in scan
25 : * heap_fetch - retrieve tuple with given tid
26 : * heap_insert - insert tuple into a relation
27 : * heap_multi_insert - insert multiple tuples into a relation
28 : * heap_delete - delete a tuple from a relation
29 : * heap_update - replace a tuple in a relation with another tuple
30 : * heap_sync - sync heap, for when no WAL has been written
31 : *
32 : * NOTES
33 : * This file contains the heap_ routines which implement
34 : * the POSTGRES heap access method used for all POSTGRES
35 : * relations.
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include "access/bufmask.h"
42 : #include "access/heapam.h"
43 : #include "access/heapam_xlog.h"
44 : #include "access/hio.h"
45 : #include "access/multixact.h"
46 : #include "access/parallel.h"
47 : #include "access/relscan.h"
48 : #include "access/sysattr.h"
49 : #include "access/transam.h"
50 : #include "access/tuptoaster.h"
51 : #include "access/valid.h"
52 : #include "access/visibilitymap.h"
53 : #include "access/xact.h"
54 : #include "access/xlog.h"
55 : #include "access/xloginsert.h"
56 : #include "access/xlogutils.h"
57 : #include "catalog/catalog.h"
58 : #include "catalog/namespace.h"
59 : #include "miscadmin.h"
60 : #include "pgstat.h"
61 : #include "port/atomics.h"
62 : #include "storage/bufmgr.h"
63 : #include "storage/freespace.h"
64 : #include "storage/lmgr.h"
65 : #include "storage/predicate.h"
66 : #include "storage/procarray.h"
67 : #include "storage/smgr.h"
68 : #include "storage/spin.h"
69 : #include "storage/standby.h"
70 : #include "utils/datum.h"
71 : #include "utils/inval.h"
72 : #include "utils/lsyscache.h"
73 : #include "utils/relcache.h"
74 : #include "utils/snapmgr.h"
75 : #include "utils/syscache.h"
76 : #include "utils/tqual.h"
77 :
78 :
79 : /* GUC variable */
80 : bool synchronize_seqscans = true;
81 :
82 :
83 : static HeapScanDesc heap_beginscan_internal(Relation relation,
84 : Snapshot snapshot,
85 : int nkeys, ScanKey key,
86 : ParallelHeapScanDesc parallel_scan,
87 : bool allow_strat,
88 : bool allow_sync,
89 : bool allow_pagemode,
90 : bool is_bitmapscan,
91 : bool is_samplescan,
92 : bool temp_snap);
93 : static void heap_parallelscan_startblock_init(HeapScanDesc scan);
94 : static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan);
95 : static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
96 : TransactionId xid, CommandId cid, int options);
97 : static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
98 : Buffer newbuf, HeapTuple oldtup,
99 : HeapTuple newtup, HeapTuple old_key_tup,
100 : bool all_visible_cleared, bool new_all_visible_cleared);
101 : static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
102 : Bitmapset *interesting_cols,
103 : HeapTuple oldtup, HeapTuple newtup);
104 : static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
105 : LockTupleMode mode, LockWaitPolicy wait_policy,
106 : bool *have_tuple_lock);
107 : static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
108 : uint16 old_infomask2, TransactionId add_to_xmax,
109 : LockTupleMode mode, bool is_update,
110 : TransactionId *result_xmax, uint16 *result_infomask,
111 : uint16 *result_infomask2);
112 : static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
113 : ItemPointer ctid, TransactionId xid,
114 : LockTupleMode mode);
115 : static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
116 : uint16 *new_infomask2);
117 : static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
118 : uint16 t_infomask);
119 : static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
120 : LockTupleMode lockmode);
121 : static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
122 : Relation rel, ItemPointer ctid, XLTW_Oper oper,
123 : int *remaining);
124 : static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
125 : uint16 infomask, Relation rel, int *remaining);
126 : static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
127 : static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
128 : bool *copy);
129 :
130 :
131 : /*
132 : * Each tuple lock mode has a corresponding heavyweight lock, and one or two
133 : * corresponding MultiXactStatuses (one to merely lock tuples, another one to
134 : * update them). This table (and the macros below) helps us determine the
135 : * heavyweight lock mode and MultiXactStatus values to use for any particular
136 : * tuple lock strength.
137 : *
138 : * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
139 : * instead.
140 : */
141 : static const struct
142 : {
143 : LOCKMODE hwlock;
144 : int lockstatus;
145 : int updstatus;
146 : }
147 :
148 : tupleLockExtraInfo[MaxLockTupleMode + 1] =
149 : {
150 : { /* LockTupleKeyShare */
151 : AccessShareLock,
152 : MultiXactStatusForKeyShare,
153 : -1 /* KeyShare does not allow updating tuples */
154 : },
155 : { /* LockTupleShare */
156 : RowShareLock,
157 : MultiXactStatusForShare,
158 : -1 /* Share does not allow updating tuples */
159 : },
160 : { /* LockTupleNoKeyExclusive */
161 : ExclusiveLock,
162 : MultiXactStatusForNoKeyUpdate,
163 : MultiXactStatusNoKeyUpdate
164 : },
165 : { /* LockTupleExclusive */
166 : AccessExclusiveLock,
167 : MultiXactStatusForUpdate,
168 : MultiXactStatusUpdate
169 : }
170 : };
171 :
172 : /* Get the LOCKMODE for a given MultiXactStatus */
173 : #define LOCKMODE_from_mxstatus(status) \
174 : (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
175 :
176 : /*
177 : * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
178 : * This is more readable than having every caller translate it to lock.h's
179 : * LOCKMODE.
180 : */
181 : #define LockTupleTuplock(rel, tup, mode) \
182 : LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
183 : #define UnlockTupleTuplock(rel, tup, mode) \
184 : UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
185 : #define ConditionalLockTupleTuplock(rel, tup, mode) \
186 : ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
187 :
188 : /*
189 : * This table maps tuple lock strength values for each particular
190 : * MultiXactStatus value.
191 : */
192 : static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
193 : {
194 : LockTupleKeyShare, /* ForKeyShare */
195 : LockTupleShare, /* ForShare */
196 : LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
197 : LockTupleExclusive, /* ForUpdate */
198 : LockTupleNoKeyExclusive, /* NoKeyUpdate */
199 : LockTupleExclusive /* Update */
200 : };
201 :
202 : /* Get the LockTupleMode for a given MultiXactStatus */
203 : #define TUPLOCK_from_mxstatus(status) \
204 : (MultiXactStatusLock[(status)])
205 :
206 : /* ----------------------------------------------------------------
207 : * heap support routines
208 : * ----------------------------------------------------------------
209 : */
210 :
211 : /* ----------------
212 : * initscan - scan code common to heap_beginscan and heap_rescan
213 : * ----------------
214 : */
215 : static void
216 18682 : initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
217 : {
218 : bool allow_strat;
219 : bool allow_sync;
220 :
221 : /*
222 : * Determine the number of blocks we have to scan.
223 : *
224 : * It is sufficient to do this once at scan start, since any tuples added
225 : * while the scan is in progress will be invisible to my snapshot anyway.
226 : * (That is not true when using a non-MVCC snapshot. However, we couldn't
227 : * guarantee to return tuples added after scan start anyway, since they
228 : * might go into pages we already scanned. To guarantee consistent
229 : * results for a non-MVCC snapshot, the caller must hold some higher-level
230 : * lock that ensures the interesting tuple(s) won't change.)
231 : */
232 18682 : if (scan->rs_parallel != NULL)
233 54 : scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
234 : else
235 18628 : scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
236 :
237 : /*
238 : * If the table is large relative to NBuffers, use a bulk-read access
239 : * strategy and enable synchronized scanning (see syncscan.c). Although
240 : * the thresholds for these features could be different, we make them the
241 : * same so that there are only two behaviors to tune rather than four.
242 : * (However, some callers need to be able to disable one or both of these
243 : * behaviors, independently of the size of the table; also there is a GUC
244 : * variable that can disable synchronized scanning.)
245 : *
246 : * Note that heap_parallelscan_initialize has a very similar test; if you
247 : * change this, consider changing that one, too.
248 : */
249 36267 : if (!RelationUsesLocalBuffers(scan->rs_rd) &&
250 17585 : scan->rs_nblocks > NBuffers / 4)
251 : {
252 0 : allow_strat = scan->rs_allow_strat;
253 0 : allow_sync = scan->rs_allow_sync;
254 : }
255 : else
256 18682 : allow_strat = allow_sync = false;
257 :
258 18682 : if (allow_strat)
259 : {
260 : /* During a rescan, keep the previous strategy object. */
261 0 : if (scan->rs_strategy == NULL)
262 0 : scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
263 : }
264 : else
265 : {
266 18682 : if (scan->rs_strategy != NULL)
267 0 : FreeAccessStrategy(scan->rs_strategy);
268 18682 : scan->rs_strategy = NULL;
269 : }
270 :
271 18682 : if (scan->rs_parallel != NULL)
272 : {
273 : /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
274 54 : scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
275 : }
276 18628 : else if (keep_startblock)
277 : {
278 : /*
279 : * When rescanning, we want to keep the previous startblock setting,
280 : * so that rewinding a cursor doesn't generate surprising results.
281 : * Reset the active syncscan setting, though.
282 : */
283 1327 : scan->rs_syncscan = (allow_sync && synchronize_seqscans);
284 : }
285 17301 : else if (allow_sync && synchronize_seqscans)
286 : {
287 0 : scan->rs_syncscan = true;
288 0 : scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
289 : }
290 : else
291 : {
292 17301 : scan->rs_syncscan = false;
293 17301 : scan->rs_startblock = 0;
294 : }
295 :
296 18682 : scan->rs_numblocks = InvalidBlockNumber;
297 18682 : scan->rs_inited = false;
298 18682 : scan->rs_ctup.t_data = NULL;
299 18682 : ItemPointerSetInvalid(&scan->rs_ctup.t_self);
300 18682 : scan->rs_cbuf = InvalidBuffer;
301 18682 : scan->rs_cblock = InvalidBlockNumber;
302 :
303 : /* page-at-a-time fields are always invalid when not rs_inited */
304 :
305 : /*
306 : * copy the scan key, if appropriate
307 : */
308 18682 : if (key != NULL)
309 5060 : memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
310 :
311 : /*
312 : * Currently, we don't have a stats counter for bitmap heap scans (but the
313 : * underlying bitmap index scans will be counted) or sample scans (we only
314 : * update stats for tuple fetches there)
315 : */
316 18682 : if (!scan->rs_bitmapscan && !scan->rs_samplescan)
317 16891 : pgstat_count_heap_scan(scan->rs_rd);
318 18682 : }
319 :
320 : /*
321 : * heap_setscanlimits - restrict range of a heapscan
322 : *
323 : * startBlk is the page to start at
324 : * numBlks is number of pages to scan (InvalidBlockNumber means "all")
325 : */
326 : void
327 25 : heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
328 : {
329 25 : Assert(!scan->rs_inited); /* else too late to change */
330 25 : Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
331 :
332 : /* Check startBlk is valid (but allow case of zero blocks...) */
333 25 : Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
334 :
335 25 : scan->rs_startblock = startBlk;
336 25 : scan->rs_numblocks = numBlks;
337 25 : }
338 :
339 : /*
340 : * heapgetpage - subroutine for heapgettup()
341 : *
342 : * This routine reads and pins the specified page of the relation.
343 : * In page-at-a-time mode it performs additional work, namely determining
344 : * which tuples on the page are visible.
345 : */
346 : void
347 112656 : heapgetpage(HeapScanDesc scan, BlockNumber page)
348 : {
349 : Buffer buffer;
350 : Snapshot snapshot;
351 : Page dp;
352 : int lines;
353 : int ntup;
354 : OffsetNumber lineoff;
355 : ItemId lpp;
356 : bool all_visible;
357 :
358 112656 : Assert(page < scan->rs_nblocks);
359 :
360 : /* release previous scan buffer, if any */
361 112656 : if (BufferIsValid(scan->rs_cbuf))
362 : {
363 99868 : ReleaseBuffer(scan->rs_cbuf);
364 99868 : scan->rs_cbuf = InvalidBuffer;
365 : }
366 :
367 : /*
368 : * Be sure to check for interrupts at least once per page. Checks at
369 : * higher code levels won't be able to stop a seqscan that encounters many
370 : * pages' worth of consecutive dead tuples.
371 : */
372 112656 : CHECK_FOR_INTERRUPTS();
373 :
374 : /* read page using selected strategy */
375 112656 : scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
376 : RBM_NORMAL, scan->rs_strategy);
377 112656 : scan->rs_cblock = page;
378 :
379 112656 : if (!scan->rs_pageatatime)
380 125656 : return;
381 :
382 99656 : buffer = scan->rs_cbuf;
383 99656 : snapshot = scan->rs_snapshot;
384 :
385 : /*
386 : * Prune and repair fragmentation for the whole page, if possible.
387 : */
388 99656 : heap_page_prune_opt(scan->rs_rd, buffer);
389 :
390 : /*
391 : * We must hold share lock on the buffer content while examining tuple
392 : * visibility. Afterwards, however, the tuples we have found to be
393 : * visible are guaranteed good as long as we hold the buffer pin.
394 : */
395 99656 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
396 :
397 99656 : dp = BufferGetPage(buffer);
398 99656 : TestForOldSnapshot(snapshot, scan->rs_rd, dp);
399 99656 : lines = PageGetMaxOffsetNumber(dp);
400 99656 : ntup = 0;
401 :
402 : /*
403 : * If the all-visible flag indicates that all tuples on the page are
404 : * visible to everyone, we can skip the per-tuple visibility tests.
405 : *
406 : * Note: In hot standby, a tuple that's already visible to all
407 : * transactions in the master might still be invisible to a read-only
408 : * transaction in the standby. We partly handle this problem by tracking
409 : * the minimum xmin of visible tuples as the cut-off XID while marking a
410 : * page all-visible on master and WAL log that along with the visibility
411 : * map SET operation. In hot standby, we wait for (or abort) all
412 : * transactions that can potentially may not see one or more tuples on the
413 : * page. That's how index-only scans work fine in hot standby. A crucial
414 : * difference between index-only scans and heap scans is that the
415 : * index-only scan completely relies on the visibility map where as heap
416 : * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
417 : * the page-level flag can be trusted in the same way, because it might
418 : * get propagated somehow without being explicitly WAL-logged, e.g. via a
419 : * full page write. Until we can prove that beyond doubt, let's check each
420 : * tuple for visibility the hard way.
421 : */
422 99656 : all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
423 :
424 3706260 : for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
425 3606604 : lineoff <= lines;
426 3506948 : lineoff++, lpp++)
427 : {
428 3506948 : if (ItemIdIsNormal(lpp))
429 : {
430 : HeapTupleData loctup;
431 : bool valid;
432 :
433 3079288 : loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
434 3079288 : loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
435 3079288 : loctup.t_len = ItemIdGetLength(lpp);
436 3079288 : ItemPointerSet(&(loctup.t_self), page, lineoff);
437 :
438 3079288 : if (all_visible)
439 1644827 : valid = true;
440 : else
441 1434461 : valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
442 :
443 3079288 : CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
444 : buffer, snapshot);
445 :
446 3079288 : if (valid)
447 2939829 : scan->rs_vistuples[ntup++] = lineoff;
448 : }
449 : }
450 :
451 99656 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
452 :
453 99656 : Assert(ntup <= MaxHeapTuplesPerPage);
454 99656 : scan->rs_ntuples = ntup;
455 : }
456 :
457 : /* ----------------
458 : * heapgettup - fetch next heap tuple
459 : *
460 : * Initialize the scan if not already done; then advance to the next
461 : * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
462 : * or set scan->rs_ctup.t_data = NULL if no more tuples.
463 : *
464 : * dir == NoMovementScanDirection means "re-fetch the tuple indicated
465 : * by scan->rs_ctup".
466 : *
467 : * Note: the reason nkeys/key are passed separately, even though they are
468 : * kept in the scan descriptor, is that the caller may not want us to check
469 : * the scankeys.
470 : *
471 : * Note: when we fall off the end of the scan in either direction, we
472 : * reset rs_inited. This means that a further request with the same
473 : * scan direction will restart the scan, which is a bit odd, but a
474 : * request with the opposite scan direction will start a fresh scan
475 : * in the proper direction. The latter is required behavior for cursors,
476 : * while the former case is generally undefined behavior in Postgres
477 : * so we don't care too much.
478 : * ----------------
479 : */
480 : static void
481 766630 : heapgettup(HeapScanDesc scan,
482 : ScanDirection dir,
483 : int nkeys,
484 : ScanKey key)
485 : {
486 766630 : HeapTuple tuple = &(scan->rs_ctup);
487 766630 : Snapshot snapshot = scan->rs_snapshot;
488 766630 : bool backward = ScanDirectionIsBackward(dir);
489 : BlockNumber page;
490 : bool finished;
491 : Page dp;
492 : int lines;
493 : OffsetNumber lineoff;
494 : int linesleft;
495 : ItemId lpp;
496 :
497 : /*
498 : * calculate next starting lineoff, given scan direction
499 : */
500 766630 : if (ScanDirectionIsForward(dir))
501 : {
502 766630 : if (!scan->rs_inited)
503 : {
504 : /*
505 : * return null immediately if relation is empty
506 : */
507 1393 : if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
508 : {
509 1126 : Assert(!BufferIsValid(scan->rs_cbuf));
510 1126 : tuple->t_data = NULL;
511 1126 : return;
512 : }
513 267 : if (scan->rs_parallel != NULL)
514 : {
515 0 : heap_parallelscan_startblock_init(scan);
516 :
517 0 : page = heap_parallelscan_nextpage(scan);
518 :
519 : /* Other processes might have already finished the scan. */
520 0 : if (page == InvalidBlockNumber)
521 : {
522 0 : Assert(!BufferIsValid(scan->rs_cbuf));
523 0 : tuple->t_data = NULL;
524 0 : return;
525 : }
526 : }
527 : else
528 267 : page = scan->rs_startblock; /* first page */
529 267 : heapgetpage(scan, page);
530 267 : lineoff = FirstOffsetNumber; /* first offnum */
531 267 : scan->rs_inited = true;
532 : }
533 : else
534 : {
535 : /* continue from previously returned page/tuple */
536 765237 : page = scan->rs_cblock; /* current page */
537 765237 : lineoff = /* next offnum */
538 765237 : OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
539 : }
540 :
541 765504 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
542 :
543 765504 : dp = BufferGetPage(scan->rs_cbuf);
544 765504 : TestForOldSnapshot(snapshot, scan->rs_rd, dp);
545 765504 : lines = PageGetMaxOffsetNumber(dp);
546 : /* page and lineoff now reference the physically next tid */
547 :
548 765504 : linesleft = lines - lineoff + 1;
549 : }
550 0 : else if (backward)
551 : {
552 : /* backward parallel scan not supported */
553 0 : Assert(scan->rs_parallel == NULL);
554 :
555 0 : if (!scan->rs_inited)
556 : {
557 : /*
558 : * return null immediately if relation is empty
559 : */
560 0 : if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
561 : {
562 0 : Assert(!BufferIsValid(scan->rs_cbuf));
563 0 : tuple->t_data = NULL;
564 0 : return;
565 : }
566 :
567 : /*
568 : * Disable reporting to syncscan logic in a backwards scan; it's
569 : * not very likely anyone else is doing the same thing at the same
570 : * time, and much more likely that we'll just bollix things for
571 : * forward scanners.
572 : */
573 0 : scan->rs_syncscan = false;
574 : /* start from last page of the scan */
575 0 : if (scan->rs_startblock > 0)
576 0 : page = scan->rs_startblock - 1;
577 : else
578 0 : page = scan->rs_nblocks - 1;
579 0 : heapgetpage(scan, page);
580 : }
581 : else
582 : {
583 : /* continue from previously returned page/tuple */
584 0 : page = scan->rs_cblock; /* current page */
585 : }
586 :
587 0 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
588 :
589 0 : dp = BufferGetPage(scan->rs_cbuf);
590 0 : TestForOldSnapshot(snapshot, scan->rs_rd, dp);
591 0 : lines = PageGetMaxOffsetNumber(dp);
592 :
593 0 : if (!scan->rs_inited)
594 : {
595 0 : lineoff = lines; /* final offnum */
596 0 : scan->rs_inited = true;
597 : }
598 : else
599 : {
600 0 : lineoff = /* previous offnum */
601 0 : OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
602 : }
603 : /* page and lineoff now reference the physically previous tid */
604 :
605 0 : linesleft = lineoff;
606 : }
607 : else
608 : {
609 : /*
610 : * ``no movement'' scan direction: refetch prior tuple
611 : */
612 0 : if (!scan->rs_inited)
613 : {
614 0 : Assert(!BufferIsValid(scan->rs_cbuf));
615 0 : tuple->t_data = NULL;
616 0 : return;
617 : }
618 :
619 0 : page = ItemPointerGetBlockNumber(&(tuple->t_self));
620 0 : if (page != scan->rs_cblock)
621 0 : heapgetpage(scan, page);
622 :
623 : /* Since the tuple was previously fetched, needn't lock page here */
624 0 : dp = BufferGetPage(scan->rs_cbuf);
625 0 : TestForOldSnapshot(snapshot, scan->rs_rd, dp);
626 0 : lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
627 0 : lpp = PageGetItemId(dp, lineoff);
628 0 : Assert(ItemIdIsNormal(lpp));
629 :
630 0 : tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
631 0 : tuple->t_len = ItemIdGetLength(lpp);
632 :
633 0 : return;
634 : }
635 :
636 : /*
637 : * advance the scan until we find a qualifying tuple or run out of stuff
638 : * to scan
639 : */
640 765504 : lpp = PageGetItemId(dp, lineoff);
641 : for (;;)
642 : {
643 1564664 : while (linesleft > 0)
644 : {
645 774824 : if (ItemIdIsNormal(lpp))
646 : {
647 : bool valid;
648 :
649 765238 : tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
650 765238 : tuple->t_len = ItemIdGetLength(lpp);
651 765238 : ItemPointerSet(&(tuple->t_self), page, lineoff);
652 :
653 : /*
654 : * if current tuple qualifies, return it.
655 : */
656 765238 : valid = HeapTupleSatisfiesVisibility(tuple,
657 : snapshot,
658 : scan->rs_cbuf);
659 :
660 765238 : CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
661 : scan->rs_cbuf, snapshot);
662 :
663 765238 : if (valid && key != NULL)
664 0 : HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
665 : nkeys, key, valid);
666 :
667 765238 : if (valid)
668 : {
669 765238 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
670 765238 : return;
671 : }
672 : }
673 :
674 : /*
675 : * otherwise move to the next item on the page
676 : */
677 9586 : --linesleft;
678 9586 : if (backward)
679 : {
680 0 : --lpp; /* move back in this page's ItemId array */
681 0 : --lineoff;
682 : }
683 : else
684 : {
685 9586 : ++lpp; /* move forward in this page's ItemId array */
686 9586 : ++lineoff;
687 : }
688 : }
689 :
690 : /*
691 : * if we get here, it means we've exhausted the items on this page and
692 : * it's time to move to the next.
693 : */
694 12301 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
695 :
696 : /*
697 : * advance to next/prior page and detect end of scan
698 : */
699 12301 : if (backward)
700 : {
701 0 : finished = (page == scan->rs_startblock) ||
702 0 : (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
703 0 : if (page == 0)
704 0 : page = scan->rs_nblocks;
705 0 : page--;
706 : }
707 12301 : else if (scan->rs_parallel != NULL)
708 : {
709 0 : page = heap_parallelscan_nextpage(scan);
710 0 : finished = (page == InvalidBlockNumber);
711 : }
712 : else
713 : {
714 12301 : page++;
715 12301 : if (page >= scan->rs_nblocks)
716 260 : page = 0;
717 24360 : finished = (page == scan->rs_startblock) ||
718 12059 : (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
719 :
720 : /*
721 : * Report our new scan position for synchronization purposes. We
722 : * don't do that when moving backwards, however. That would just
723 : * mess up any other forward-moving scanners.
724 : *
725 : * Note: we do this before checking for end of scan so that the
726 : * final state of the position hint is back at the start of the
727 : * rel. That's not strictly necessary, but otherwise when you run
728 : * the same query multiple times the starting position would shift
729 : * a little bit backwards on every invocation, which is confusing.
730 : * We don't guarantee any specific ordering in general, though.
731 : */
732 12301 : if (scan->rs_syncscan)
733 0 : ss_report_location(scan->rs_rd, page);
734 : }
735 :
736 : /*
737 : * return NULL if we've exhausted all the pages
738 : */
739 12301 : if (finished)
740 : {
741 266 : if (BufferIsValid(scan->rs_cbuf))
742 266 : ReleaseBuffer(scan->rs_cbuf);
743 266 : scan->rs_cbuf = InvalidBuffer;
744 266 : scan->rs_cblock = InvalidBlockNumber;
745 266 : tuple->t_data = NULL;
746 266 : scan->rs_inited = false;
747 266 : return;
748 : }
749 :
750 12035 : heapgetpage(scan, page);
751 :
752 12035 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
753 :
754 12035 : dp = BufferGetPage(scan->rs_cbuf);
755 12035 : TestForOldSnapshot(snapshot, scan->rs_rd, dp);
756 12035 : lines = PageGetMaxOffsetNumber((Page) dp);
757 12035 : linesleft = lines;
758 12035 : if (backward)
759 : {
760 0 : lineoff = lines;
761 0 : lpp = PageGetItemId(dp, lines);
762 : }
763 : else
764 : {
765 12035 : lineoff = FirstOffsetNumber;
766 12035 : lpp = PageGetItemId(dp, FirstOffsetNumber);
767 : }
768 12035 : }
769 : }
770 :
771 : /* ----------------
772 : * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
773 : *
774 : * Same API as heapgettup, but used in page-at-a-time mode
775 : *
776 : * The internal logic is much the same as heapgettup's too, but there are some
777 : * differences: we do not take the buffer content lock (that only needs to
778 : * happen inside heapgetpage), and we iterate through just the tuples listed
779 : * in rs_vistuples[] rather than all tuples on the page. Notice that
780 : * lineindex is 0-based, where the corresponding loop variable lineoff in
781 : * heapgettup is 1-based.
782 : * ----------------
783 : */
784 : static void
785 2351516 : heapgettup_pagemode(HeapScanDesc scan,
786 : ScanDirection dir,
787 : int nkeys,
788 : ScanKey key)
789 : {
790 2351516 : HeapTuple tuple = &(scan->rs_ctup);
791 2351516 : bool backward = ScanDirectionIsBackward(dir);
792 : BlockNumber page;
793 : bool finished;
794 : Page dp;
795 : int lines;
796 : int lineindex;
797 : OffsetNumber lineoff;
798 : int linesleft;
799 : ItemId lpp;
800 :
801 : /*
802 : * calculate next starting lineindex, given scan direction
803 : */
804 2351516 : if (ScanDirectionIsForward(dir))
805 : {
806 2351409 : if (!scan->rs_inited)
807 : {
808 : /*
809 : * return null immediately if relation is empty
810 : */
811 15453 : if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
812 : {
813 2923 : Assert(!BufferIsValid(scan->rs_cbuf));
814 2923 : tuple->t_data = NULL;
815 2923 : return;
816 : }
817 12530 : if (scan->rs_parallel != NULL)
818 : {
819 53 : heap_parallelscan_startblock_init(scan);
820 :
821 53 : page = heap_parallelscan_nextpage(scan);
822 :
823 : /* Other processes might have already finished the scan. */
824 53 : if (page == InvalidBlockNumber)
825 : {
826 38 : Assert(!BufferIsValid(scan->rs_cbuf));
827 38 : tuple->t_data = NULL;
828 38 : return;
829 : }
830 : }
831 : else
832 12477 : page = scan->rs_startblock; /* first page */
833 12492 : heapgetpage(scan, page);
834 12492 : lineindex = 0;
835 12492 : scan->rs_inited = true;
836 : }
837 : else
838 : {
839 : /* continue from previously returned page/tuple */
840 2335956 : page = scan->rs_cblock; /* current page */
841 2335956 : lineindex = scan->rs_cindex + 1;
842 : }
843 :
844 2348448 : dp = BufferGetPage(scan->rs_cbuf);
845 2348448 : TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
846 2348448 : lines = scan->rs_ntuples;
847 : /* page and lineindex now reference the next visible tid */
848 :
849 2348448 : linesleft = lines - lineindex;
850 : }
851 107 : else if (backward)
852 : {
853 : /* backward parallel scan not supported */
854 107 : Assert(scan->rs_parallel == NULL);
855 :
856 107 : if (!scan->rs_inited)
857 : {
858 : /*
859 : * return null immediately if relation is empty
860 : */
861 6 : if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
862 : {
863 0 : Assert(!BufferIsValid(scan->rs_cbuf));
864 0 : tuple->t_data = NULL;
865 0 : return;
866 : }
867 :
868 : /*
869 : * Disable reporting to syncscan logic in a backwards scan; it's
870 : * not very likely anyone else is doing the same thing at the same
871 : * time, and much more likely that we'll just bollix things for
872 : * forward scanners.
873 : */
874 6 : scan->rs_syncscan = false;
875 : /* start from last page of the scan */
876 6 : if (scan->rs_startblock > 0)
877 0 : page = scan->rs_startblock - 1;
878 : else
879 6 : page = scan->rs_nblocks - 1;
880 6 : heapgetpage(scan, page);
881 : }
882 : else
883 : {
884 : /* continue from previously returned page/tuple */
885 101 : page = scan->rs_cblock; /* current page */
886 : }
887 :
888 107 : dp = BufferGetPage(scan->rs_cbuf);
889 107 : TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
890 107 : lines = scan->rs_ntuples;
891 :
892 107 : if (!scan->rs_inited)
893 : {
894 6 : lineindex = lines - 1;
895 6 : scan->rs_inited = true;
896 : }
897 : else
898 : {
899 101 : lineindex = scan->rs_cindex - 1;
900 : }
901 : /* page and lineindex now reference the previous visible tid */
902 :
903 107 : linesleft = lineindex + 1;
904 : }
905 : else
906 : {
907 : /*
908 : * ``no movement'' scan direction: refetch prior tuple
909 : */
910 0 : if (!scan->rs_inited)
911 : {
912 0 : Assert(!BufferIsValid(scan->rs_cbuf));
913 0 : tuple->t_data = NULL;
914 0 : return;
915 : }
916 :
917 0 : page = ItemPointerGetBlockNumber(&(tuple->t_self));
918 0 : if (page != scan->rs_cblock)
919 0 : heapgetpage(scan, page);
920 :
921 : /* Since the tuple was previously fetched, needn't lock page here */
922 0 : dp = BufferGetPage(scan->rs_cbuf);
923 0 : TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
924 0 : lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
925 0 : lpp = PageGetItemId(dp, lineoff);
926 0 : Assert(ItemIdIsNormal(lpp));
927 :
928 0 : tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
929 0 : tuple->t_len = ItemIdGetLength(lpp);
930 :
931 : /* check that rs_cindex is in sync */
932 0 : Assert(scan->rs_cindex < scan->rs_ntuples);
933 0 : Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
934 :
935 0 : return;
936 : }
937 :
938 : /*
939 : * advance the scan until we find a qualifying tuple or run out of stuff
940 : * to scan
941 : */
942 : for (;;)
943 : {
944 5383163 : while (linesleft > 0)
945 : {
946 2853671 : lineoff = scan->rs_vistuples[lineindex];
947 2853671 : lpp = PageGetItemId(dp, lineoff);
948 2853671 : Assert(ItemIdIsNormal(lpp));
949 :
950 2853671 : tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
951 2853671 : tuple->t_len = ItemIdGetLength(lpp);
952 2853671 : ItemPointerSet(&(tuple->t_self), page, lineoff);
953 :
954 : /*
955 : * if current tuple qualifies, return it.
956 : */
957 2853671 : if (key != NULL)
958 : {
959 : bool valid;
960 :
961 517773 : HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
962 : nkeys, key, valid);
963 517773 : if (valid)
964 : {
965 3212 : scan->rs_cindex = lineindex;
966 3212 : return;
967 : }
968 : }
969 : else
970 : {
971 2335898 : scan->rs_cindex = lineindex;
972 2335898 : return;
973 : }
974 :
975 : /*
976 : * otherwise move to the next item on the page
977 : */
978 514561 : --linesleft;
979 514561 : if (backward)
980 0 : --lineindex;
981 : else
982 514561 : ++lineindex;
983 : }
984 :
985 : /*
986 : * if we get here, it means we've exhausted the items on this page and
987 : * it's time to move to the next.
988 : */
989 95191 : if (backward)
990 : {
991 12 : finished = (page == scan->rs_startblock) ||
992 0 : (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
993 12 : if (page == 0)
994 12 : page = scan->rs_nblocks;
995 12 : page--;
996 : }
997 95179 : else if (scan->rs_parallel != NULL)
998 : {
999 3111 : page = heap_parallelscan_nextpage(scan);
1000 3111 : finished = (page == InvalidBlockNumber);
1001 : }
1002 : else
1003 : {
1004 92068 : page++;
1005 92068 : if (page >= scan->rs_nblocks)
1006 9418 : page = 0;
1007 174718 : finished = (page == scan->rs_startblock) ||
1008 82650 : (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1009 :
1010 : /*
1011 : * Report our new scan position for synchronization purposes. We
1012 : * don't do that when moving backwards, however. That would just
1013 : * mess up any other forward-moving scanners.
1014 : *
1015 : * Note: we do this before checking for end of scan so that the
1016 : * final state of the position hint is back at the start of the
1017 : * rel. That's not strictly necessary, but otherwise when you run
1018 : * the same query multiple times the starting position would shift
1019 : * a little bit backwards on every invocation, which is confusing.
1020 : * We don't guarantee any specific ordering in general, though.
1021 : */
1022 92068 : if (scan->rs_syncscan)
1023 0 : ss_report_location(scan->rs_rd, page);
1024 : }
1025 :
1026 : /*
1027 : * return NULL if we've exhausted all the pages
1028 : */
1029 95191 : if (finished)
1030 : {
1031 9445 : if (BufferIsValid(scan->rs_cbuf))
1032 9445 : ReleaseBuffer(scan->rs_cbuf);
1033 9445 : scan->rs_cbuf = InvalidBuffer;
1034 9445 : scan->rs_cblock = InvalidBlockNumber;
1035 9445 : tuple->t_data = NULL;
1036 9445 : scan->rs_inited = false;
1037 9445 : return;
1038 : }
1039 :
1040 85746 : heapgetpage(scan, page);
1041 :
1042 85746 : dp = BufferGetPage(scan->rs_cbuf);
1043 85746 : TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1044 85746 : lines = scan->rs_ntuples;
1045 85746 : linesleft = lines;
1046 85746 : if (backward)
1047 0 : lineindex = lines - 1;
1048 : else
1049 85746 : lineindex = 0;
1050 85746 : }
1051 : }
1052 :
1053 :
1054 : #if defined(DISABLE_COMPLEX_MACRO)
1055 : /*
1056 : * This is formatted so oddly so that the correspondence to the macro
1057 : * definition in access/htup_details.h is maintained.
1058 : */
1059 : Datum
1060 : fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1061 : bool *isnull)
1062 : {
1063 : return (
1064 : (attnum) > 0 ?
1065 : (
1066 : (*(isnull) = false),
1067 : HeapTupleNoNulls(tup) ?
1068 : (
1069 : TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1070 : (
1071 : fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1072 : (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1073 : TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1074 : )
1075 : :
1076 : nocachegetattr((tup), (attnum), (tupleDesc))
1077 : )
1078 : :
1079 : (
1080 : att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1081 : (
1082 : (*(isnull) = true),
1083 : (Datum) NULL
1084 : )
1085 : :
1086 : (
1087 : nocachegetattr((tup), (attnum), (tupleDesc))
1088 : )
1089 : )
1090 : )
1091 : :
1092 : (
1093 : (Datum) NULL
1094 : )
1095 : );
1096 : }
1097 : #endif /* defined(DISABLE_COMPLEX_MACRO) */
1098 :
1099 :
1100 : /* ----------------------------------------------------------------
1101 : * heap access method interface
1102 : * ----------------------------------------------------------------
1103 : */
1104 :
1105 : /* ----------------
1106 : * relation_open - open any relation by relation OID
1107 : *
1108 : * If lockmode is not "NoLock", the specified kind of lock is
1109 : * obtained on the relation. (Generally, NoLock should only be
1110 : * used if the caller knows it has some appropriate lock on the
1111 : * relation already.)
1112 : *
1113 : * An error is raised if the relation does not exist.
1114 : *
1115 : * NB: a "relation" is anything with a pg_class entry. The caller is
1116 : * expected to check whether the relkind is something it can handle.
1117 : * ----------------
1118 : */
1119 : Relation
1120 1006343 : relation_open(Oid relationId, LOCKMODE lockmode)
1121 : {
1122 : Relation r;
1123 :
1124 1006343 : Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1125 :
1126 : /* Get the lock before trying to open the relcache entry */
1127 1006343 : if (lockmode != NoLock)
1128 908528 : LockRelationOid(relationId, lockmode);
1129 :
1130 : /* The relcache does all the real work... */
1131 1006343 : r = RelationIdGetRelation(relationId);
1132 :
1133 1006343 : if (!RelationIsValid(r))
1134 0 : elog(ERROR, "could not open relation with OID %u", relationId);
1135 :
1136 : /* Make note that we've accessed a temporary relation */
1137 1006343 : if (RelationUsesLocalBuffers(r))
1138 14215 : MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1139 :
1140 1006343 : pgstat_initstats(r);
1141 :
1142 1006343 : return r;
1143 : }
1144 :
1145 : /* ----------------
1146 : * try_relation_open - open any relation by relation OID
1147 : *
1148 : * Same as relation_open, except return NULL instead of failing
1149 : * if the relation does not exist.
1150 : * ----------------
1151 : */
1152 : Relation
1153 966 : try_relation_open(Oid relationId, LOCKMODE lockmode)
1154 : {
1155 : Relation r;
1156 :
1157 966 : Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1158 :
1159 : /* Get the lock first */
1160 966 : if (lockmode != NoLock)
1161 907 : LockRelationOid(relationId, lockmode);
1162 :
1163 : /*
1164 : * Now that we have the lock, probe to see if the relation really exists
1165 : * or not.
1166 : */
1167 966 : if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1168 : {
1169 : /* Release useless lock */
1170 0 : if (lockmode != NoLock)
1171 0 : UnlockRelationOid(relationId, lockmode);
1172 :
1173 0 : return NULL;
1174 : }
1175 :
1176 : /* Should be safe to do a relcache load */
1177 966 : r = RelationIdGetRelation(relationId);
1178 :
1179 966 : if (!RelationIsValid(r))
1180 0 : elog(ERROR, "could not open relation with OID %u", relationId);
1181 :
1182 : /* Make note that we've accessed a temporary relation */
1183 966 : if (RelationUsesLocalBuffers(r))
1184 19 : MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1185 :
1186 966 : pgstat_initstats(r);
1187 :
1188 966 : return r;
1189 : }
1190 :
1191 : /* ----------------
1192 : * relation_openrv - open any relation specified by a RangeVar
1193 : *
1194 : * Same as relation_open, but the relation is specified by a RangeVar.
1195 : * ----------------
1196 : */
1197 : Relation
1198 1428 : relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1199 : {
1200 : Oid relOid;
1201 :
1202 : /*
1203 : * Check for shared-cache-inval messages before trying to open the
1204 : * relation. This is needed even if we already hold a lock on the
1205 : * relation, because GRANT/REVOKE are executed without taking any lock on
1206 : * the target relation, and we want to be sure we see current ACL
1207 : * information. We can skip this if asked for NoLock, on the assumption
1208 : * that such a call is not the first one in the current command, and so we
1209 : * should be reasonably up-to-date already. (XXX this all could stand to
1210 : * be redesigned, but for the moment we'll keep doing this like it's been
1211 : * done historically.)
1212 : */
1213 1428 : if (lockmode != NoLock)
1214 1370 : AcceptInvalidationMessages();
1215 :
1216 : /* Look up and lock the appropriate relation using namespace search */
1217 1428 : relOid = RangeVarGetRelid(relation, lockmode, false);
1218 :
1219 : /* Let relation_open do the rest */
1220 1412 : return relation_open(relOid, NoLock);
1221 : }
1222 :
1223 : /* ----------------
1224 : * relation_openrv_extended - open any relation specified by a RangeVar
1225 : *
1226 : * Same as relation_openrv, but with an additional missing_ok argument
1227 : * allowing a NULL return rather than an error if the relation is not
1228 : * found. (Note that some other causes, such as permissions problems,
1229 : * will still result in an ereport.)
1230 : * ----------------
1231 : */
1232 : Relation
1233 18765 : relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1234 : bool missing_ok)
1235 : {
1236 : Oid relOid;
1237 :
1238 : /*
1239 : * Check for shared-cache-inval messages before trying to open the
1240 : * relation. See comments in relation_openrv().
1241 : */
1242 18765 : if (lockmode != NoLock)
1243 18765 : AcceptInvalidationMessages();
1244 :
1245 : /* Look up and lock the appropriate relation using namespace search */
1246 18765 : relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1247 :
1248 : /* Return NULL on not-found */
1249 18700 : if (!OidIsValid(relOid))
1250 21 : return NULL;
1251 :
1252 : /* Let relation_open do the rest */
1253 18679 : return relation_open(relOid, NoLock);
1254 : }
1255 :
1256 : /* ----------------
1257 : * relation_close - close any relation
1258 : *
1259 : * If lockmode is not "NoLock", we then release the specified lock.
1260 : *
1261 : * Note that it is often sensible to hold a lock beyond relation_close;
1262 : * in that case, the lock is released automatically at xact end.
1263 : * ----------------
1264 : */
1265 : void
1266 518869 : relation_close(Relation relation, LOCKMODE lockmode)
1267 : {
1268 518869 : LockRelId relid = relation->rd_lockInfo.lockRelId;
1269 :
1270 518869 : Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1271 :
1272 : /* The relcache does the real work... */
1273 518869 : RelationClose(relation);
1274 :
1275 518869 : if (lockmode != NoLock)
1276 377490 : UnlockRelationId(&relid, lockmode);
1277 518869 : }
1278 :
1279 :
1280 : /* ----------------
1281 : * heap_open - open a heap relation by relation OID
1282 : *
1283 : * This is essentially relation_open plus check that the relation
1284 : * is not an index nor a composite type. (The caller should also
1285 : * check that it's not a view or foreign table before assuming it has
1286 : * storage.)
1287 : * ----------------
1288 : */
1289 : Relation
1290 477121 : heap_open(Oid relationId, LOCKMODE lockmode)
1291 : {
1292 : Relation r;
1293 :
1294 477121 : r = relation_open(relationId, lockmode);
1295 :
1296 477121 : if (r->rd_rel->relkind == RELKIND_INDEX)
1297 0 : ereport(ERROR,
1298 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1299 : errmsg("\"%s\" is an index",
1300 : RelationGetRelationName(r))));
1301 477121 : else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1302 0 : ereport(ERROR,
1303 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1304 : errmsg("\"%s\" is a composite type",
1305 : RelationGetRelationName(r))));
1306 :
1307 477121 : return r;
1308 : }
1309 :
1310 : /* ----------------
1311 : * heap_openrv - open a heap relation specified
1312 : * by a RangeVar node
1313 : *
1314 : * As above, but relation is specified by a RangeVar.
1315 : * ----------------
1316 : */
1317 : Relation
1318 1225 : heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1319 : {
1320 : Relation r;
1321 :
1322 1225 : r = relation_openrv(relation, lockmode);
1323 :
1324 1223 : if (r->rd_rel->relkind == RELKIND_INDEX)
1325 0 : ereport(ERROR,
1326 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1327 : errmsg("\"%s\" is an index",
1328 : RelationGetRelationName(r))));
1329 1223 : else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1330 0 : ereport(ERROR,
1331 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1332 : errmsg("\"%s\" is a composite type",
1333 : RelationGetRelationName(r))));
1334 :
1335 1223 : return r;
1336 : }
1337 :
1338 : /* ----------------
1339 : * heap_openrv_extended - open a heap relation specified
1340 : * by a RangeVar node
1341 : *
1342 : * As above, but optionally return NULL instead of failing for
1343 : * relation-not-found.
1344 : * ----------------
1345 : */
1346 : Relation
1347 18690 : heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1348 : bool missing_ok)
1349 : {
1350 : Relation r;
1351 :
1352 18690 : r = relation_openrv_extended(relation, lockmode, missing_ok);
1353 :
1354 18668 : if (r)
1355 : {
1356 18647 : if (r->rd_rel->relkind == RELKIND_INDEX)
1357 0 : ereport(ERROR,
1358 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1359 : errmsg("\"%s\" is an index",
1360 : RelationGetRelationName(r))));
1361 18647 : else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1362 0 : ereport(ERROR,
1363 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1364 : errmsg("\"%s\" is a composite type",
1365 : RelationGetRelationName(r))));
1366 : }
1367 :
1368 18668 : return r;
1369 : }
1370 :
1371 :
1372 : /* ----------------
1373 : * heap_beginscan - begin relation scan
1374 : *
1375 : * heap_beginscan is the "standard" case.
1376 : *
1377 : * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1378 : *
1379 : * heap_beginscan_strat offers an extended API that lets the caller control
1380 : * whether a nondefault buffer access strategy can be used, and whether
1381 : * syncscan can be chosen (possibly resulting in the scan not starting from
1382 : * block zero). Both of these default to TRUE with plain heap_beginscan.
1383 : *
1384 : * heap_beginscan_bm is an alternative entry point for setting up a
1385 : * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1386 : * really quite unlike a standard seqscan, there is just enough commonality
1387 : * to make it worth using the same data structure.
1388 : *
1389 : * heap_beginscan_sampling is an alternative entry point for setting up a
1390 : * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1391 : * using the same data structure although the behavior is rather different.
1392 : * In addition to the options offered by heap_beginscan_strat, this call
1393 : * also allows control of whether page-mode visibility checking is used.
1394 : * ----------------
1395 : */
1396 : HeapScanDesc
1397 8866 : heap_beginscan(Relation relation, Snapshot snapshot,
1398 : int nkeys, ScanKey key)
1399 : {
1400 8866 : return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1401 : true, true, true, false, false, false);
1402 : }
1403 :
1404 : HeapScanDesc
1405 2741 : heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1406 : {
1407 2741 : Oid relid = RelationGetRelid(relation);
1408 2741 : Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1409 :
1410 2741 : return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1411 : true, true, true, false, false, true);
1412 : }
1413 :
1414 : HeapScanDesc
1415 4041 : heap_beginscan_strat(Relation relation, Snapshot snapshot,
1416 : int nkeys, ScanKey key,
1417 : bool allow_strat, bool allow_sync)
1418 : {
1419 4041 : return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1420 : allow_strat, allow_sync, true,
1421 : false, false, false);
1422 : }
1423 :
1424 : HeapScanDesc
1425 1632 : heap_beginscan_bm(Relation relation, Snapshot snapshot,
1426 : int nkeys, ScanKey key)
1427 : {
1428 1632 : return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1429 : false, false, true, true, false, false);
1430 : }
1431 :
1432 : HeapScanDesc
1433 21 : heap_beginscan_sampling(Relation relation, Snapshot snapshot,
1434 : int nkeys, ScanKey key,
1435 : bool allow_strat, bool allow_sync, bool allow_pagemode)
1436 : {
1437 21 : return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1438 : allow_strat, allow_sync, allow_pagemode,
1439 : false, true, false);
1440 : }
1441 :
1442 : static HeapScanDesc
1443 17352 : heap_beginscan_internal(Relation relation, Snapshot snapshot,
1444 : int nkeys, ScanKey key,
1445 : ParallelHeapScanDesc parallel_scan,
1446 : bool allow_strat,
1447 : bool allow_sync,
1448 : bool allow_pagemode,
1449 : bool is_bitmapscan,
1450 : bool is_samplescan,
1451 : bool temp_snap)
1452 : {
1453 : HeapScanDesc scan;
1454 :
1455 : /*
1456 : * increment relation ref count while scanning relation
1457 : *
1458 : * This is just to make really sure the relcache entry won't go away while
1459 : * the scan has a pointer to it. Caller should be holding the rel open
1460 : * anyway, so this is redundant in all normal scenarios...
1461 : */
1462 17352 : RelationIncrementReferenceCount(relation);
1463 :
1464 : /*
1465 : * allocate and initialize scan descriptor
1466 : */
1467 17352 : scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1468 :
1469 17352 : scan->rs_rd = relation;
1470 17352 : scan->rs_snapshot = snapshot;
1471 17352 : scan->rs_nkeys = nkeys;
1472 17352 : scan->rs_bitmapscan = is_bitmapscan;
1473 17352 : scan->rs_samplescan = is_samplescan;
1474 17352 : scan->rs_strategy = NULL; /* set in initscan */
1475 17352 : scan->rs_allow_strat = allow_strat;
1476 17352 : scan->rs_allow_sync = allow_sync;
1477 17352 : scan->rs_temp_snap = temp_snap;
1478 17352 : scan->rs_parallel = parallel_scan;
1479 :
1480 : /*
1481 : * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1482 : */
1483 17352 : scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1484 :
1485 : /*
1486 : * For a seqscan in a serializable transaction, acquire a predicate lock
1487 : * on the entire relation. This is required not only to lock all the
1488 : * matching tuples, but also to conflict with new insertions into the
1489 : * table. In an indexscan, we take page locks on the index pages covering
1490 : * the range specified in the scan qual, but in a heap scan there is
1491 : * nothing more fine-grained to lock. A bitmap scan is a different story,
1492 : * there we have already scanned the index and locked the index pages
1493 : * covering the predicate. But in that case we still have to lock any
1494 : * matching heap tuples.
1495 : */
1496 17352 : if (!is_bitmapscan)
1497 15720 : PredicateLockRelation(relation, snapshot);
1498 :
1499 : /* we only need to set this up once */
1500 17352 : scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1501 :
1502 : /*
1503 : * we do this here instead of in initscan() because heap_rescan also calls
1504 : * initscan() and we don't want to allocate memory again
1505 : */
1506 17352 : if (nkeys > 0)
1507 5060 : scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1508 : else
1509 12292 : scan->rs_key = NULL;
1510 :
1511 17352 : initscan(scan, key, false);
1512 :
1513 17352 : return scan;
1514 : }
1515 :
1516 : /* ----------------
1517 : * heap_rescan - restart a relation scan
1518 : * ----------------
1519 : */
1520 : void
1521 1330 : heap_rescan(HeapScanDesc scan,
1522 : ScanKey key)
1523 : {
1524 : /*
1525 : * unpin scan buffers
1526 : */
1527 1330 : if (BufferIsValid(scan->rs_cbuf))
1528 378 : ReleaseBuffer(scan->rs_cbuf);
1529 :
1530 : /*
1531 : * reinitialize scan descriptor
1532 : */
1533 1330 : initscan(scan, key, true);
1534 1330 : }
1535 :
1536 : /* ----------------
1537 : * heap_rescan_set_params - restart a relation scan after changing params
1538 : *
1539 : * This call allows changing the buffer strategy, syncscan, and pagemode
1540 : * options before starting a fresh scan. Note that although the actual use
1541 : * of syncscan might change (effectively, enabling or disabling reporting),
1542 : * the previously selected startblock will be kept.
1543 : * ----------------
1544 : */
1545 : void
1546 5 : heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
1547 : bool allow_strat, bool allow_sync, bool allow_pagemode)
1548 : {
1549 : /* adjust parameters */
1550 5 : scan->rs_allow_strat = allow_strat;
1551 5 : scan->rs_allow_sync = allow_sync;
1552 5 : scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1553 : /* ... and rescan */
1554 5 : heap_rescan(scan, key);
1555 5 : }
1556 :
1557 : /* ----------------
1558 : * heap_endscan - end relation scan
1559 : *
1560 : * See how to integrate with index scans.
1561 : * Check handling if reldesc caching.
1562 : * ----------------
1563 : */
1564 : void
1565 17172 : heap_endscan(HeapScanDesc scan)
1566 : {
1567 : /* Note: no locking manipulations needed */
1568 :
1569 : /*
1570 : * unpin scan buffers
1571 : */
1572 17172 : if (BufferIsValid(scan->rs_cbuf))
1573 3263 : ReleaseBuffer(scan->rs_cbuf);
1574 :
1575 : /*
1576 : * decrement relation reference count and free scan descriptor storage
1577 : */
1578 17172 : RelationDecrementReferenceCount(scan->rs_rd);
1579 :
1580 17172 : if (scan->rs_key)
1581 5053 : pfree(scan->rs_key);
1582 :
1583 17172 : if (scan->rs_strategy != NULL)
1584 0 : FreeAccessStrategy(scan->rs_strategy);
1585 :
1586 17172 : if (scan->rs_temp_snap)
1587 2830 : UnregisterSnapshot(scan->rs_snapshot);
1588 :
1589 17172 : pfree(scan);
1590 17172 : }
1591 :
1592 : /* ----------------
1593 : * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1594 : *
1595 : * Sadly, this doesn't reduce to a constant, because the size required
1596 : * to serialize the snapshot can vary.
1597 : * ----------------
1598 : */
1599 : Size
1600 13 : heap_parallelscan_estimate(Snapshot snapshot)
1601 : {
1602 13 : return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1603 : EstimateSnapshotSpace(snapshot));
1604 : }
1605 :
1606 : /* ----------------
1607 : * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1608 : *
1609 : * Must allow as many bytes of shared memory as returned by
1610 : * heap_parallelscan_estimate. Call this just once in the leader
1611 : * process; then, individual workers attach via heap_beginscan_parallel.
1612 : * ----------------
1613 : */
1614 : void
1615 13 : heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
1616 : Snapshot snapshot)
1617 : {
1618 13 : target->phs_relid = RelationGetRelid(relation);
1619 13 : target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1620 : /* compare phs_syncscan initialization to similar logic in initscan */
1621 39 : target->phs_syncscan = synchronize_seqscans &&
1622 26 : !RelationUsesLocalBuffers(relation) &&
1623 13 : target->phs_nblocks > NBuffers / 4;
1624 13 : SpinLockInit(&target->phs_mutex);
1625 13 : target->phs_startblock = InvalidBlockNumber;
1626 13 : pg_atomic_init_u64(&target->phs_nallocated, 0);
1627 13 : SerializeSnapshot(snapshot, target->phs_snapshot_data);
1628 13 : }
1629 :
1630 : /* ----------------
1631 : * heap_parallelscan_reinitialize - reset a parallel scan
1632 : *
1633 : * Call this in the leader process. Caller is responsible for
1634 : * making sure that all workers have finished the scan beforehand.
1635 : * ----------------
1636 : */
1637 : void
1638 2 : heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
1639 : {
1640 2 : pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0);
1641 2 : }
1642 :
1643 : /* ----------------
1644 : * heap_beginscan_parallel - join a parallel scan
1645 : *
1646 : * Caller must hold a suitable lock on the correct relation.
1647 : * ----------------
1648 : */
1649 : HeapScanDesc
1650 51 : heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
1651 : {
1652 : Snapshot snapshot;
1653 :
1654 51 : Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1655 51 : snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1656 51 : RegisterSnapshot(snapshot);
1657 :
1658 51 : return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1659 : true, true, true, false, false, true);
1660 : }
1661 :
1662 : /* ----------------
1663 : * heap_parallelscan_startblock_init - find and set the scan's startblock
1664 : *
1665 : * Determine where the parallel seq scan should start. This function may
1666 : * be called many times, once by each parallel worker. We must be careful
1667 : * only to set the startblock once.
1668 : * ----------------
1669 : */
1670 : static void
1671 53 : heap_parallelscan_startblock_init(HeapScanDesc scan)
1672 : {
1673 53 : BlockNumber sync_startpage = InvalidBlockNumber;
1674 : ParallelHeapScanDesc parallel_scan;
1675 :
1676 53 : Assert(scan->rs_parallel);
1677 53 : parallel_scan = scan->rs_parallel;
1678 :
1679 : retry:
1680 : /* Grab the spinlock. */
1681 53 : SpinLockAcquire(¶llel_scan->phs_mutex);
1682 :
1683 : /*
1684 : * If the scan's startblock has not yet been initialized, we must do so
1685 : * now. If this is not a synchronized scan, we just start at block 0, but
1686 : * if it is a synchronized scan, we must get the starting position from
1687 : * the synchronized scan machinery. We can't hold the spinlock while
1688 : * doing that, though, so release the spinlock, get the information we
1689 : * need, and retry. If nobody else has initialized the scan in the
1690 : * meantime, we'll fill in the value we fetched on the second time
1691 : * through.
1692 : */
1693 53 : if (parallel_scan->phs_startblock == InvalidBlockNumber)
1694 : {
1695 13 : if (!parallel_scan->phs_syncscan)
1696 13 : parallel_scan->phs_startblock = 0;
1697 0 : else if (sync_startpage != InvalidBlockNumber)
1698 0 : parallel_scan->phs_startblock = sync_startpage;
1699 : else
1700 : {
1701 0 : SpinLockRelease(¶llel_scan->phs_mutex);
1702 0 : sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1703 0 : goto retry;
1704 : }
1705 : }
1706 53 : SpinLockRelease(¶llel_scan->phs_mutex);
1707 53 : }
1708 :
1709 : /* ----------------
1710 : * heap_parallelscan_nextpage - get the next page to scan
1711 : *
1712 : * Get the next page to scan. Even if there are no pages left to scan,
1713 : * another backend could have grabbed a page to scan and not yet finished
1714 : * looking at it, so it doesn't follow that the scan is done when the
1715 : * first backend gets an InvalidBlockNumber return.
1716 : * ----------------
1717 : */
1718 : static BlockNumber
1719 3164 : heap_parallelscan_nextpage(HeapScanDesc scan)
1720 : {
1721 : BlockNumber page;
1722 : ParallelHeapScanDesc parallel_scan;
1723 : uint64 nallocated;
1724 :
1725 3164 : Assert(scan->rs_parallel);
1726 3164 : parallel_scan = scan->rs_parallel;
1727 :
1728 : /*
1729 : * phs_nallocated tracks how many pages have been allocated to workers
1730 : * already. When phs_nallocated >= rs_nblocks, all blocks have been
1731 : * allocated.
1732 : *
1733 : * Because we use an atomic fetch-and-add to fetch the current value, the
1734 : * phs_nallocated counter will exceed rs_nblocks, because workers will
1735 : * still increment the value, when they try to allocate the next block but
1736 : * all blocks have been allocated already. The counter must be 64 bits
1737 : * wide because of that, to avoid wrapping around when rs_nblocks is close
1738 : * to 2^32.
1739 : *
1740 : * The actual page to return is calculated by adding the counter to the
1741 : * starting block number, modulo nblocks.
1742 : */
1743 3164 : nallocated = pg_atomic_fetch_add_u64(¶llel_scan->phs_nallocated, 1);
1744 3164 : if (nallocated >= scan->rs_nblocks)
1745 53 : page = InvalidBlockNumber; /* all blocks have been allocated */
1746 : else
1747 3111 : page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks;
1748 :
1749 : /*
1750 : * Report scan location. Normally, we report the current page number.
1751 : * When we reach the end of the scan, though, we report the starting page,
1752 : * not the ending page, just so the starting positions for later scans
1753 : * doesn't slew backwards. We only report the position at the end of the
1754 : * scan once, though: subsequent callers will report nothing.
1755 : */
1756 3164 : if (scan->rs_syncscan)
1757 : {
1758 0 : if (page != InvalidBlockNumber)
1759 0 : ss_report_location(scan->rs_rd, page);
1760 0 : else if (nallocated == scan->rs_nblocks)
1761 0 : ss_report_location(scan->rs_rd, parallel_scan->phs_startblock);
1762 : }
1763 :
1764 3164 : return page;
1765 : }
1766 :
1767 : /* ----------------
1768 : * heap_update_snapshot
1769 : *
1770 : * Update snapshot info in heap scan descriptor.
1771 : * ----------------
1772 : */
1773 : void
1774 44 : heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
1775 : {
1776 44 : Assert(IsMVCCSnapshot(snapshot));
1777 :
1778 44 : RegisterSnapshot(snapshot);
1779 44 : scan->rs_snapshot = snapshot;
1780 44 : scan->rs_temp_snap = true;
1781 44 : }
1782 :
1783 : /* ----------------
1784 : * heap_getnext - retrieve next tuple in scan
1785 : *
1786 : * Fix to work with index relations.
1787 : * We don't return the buffer anymore, but you can get it from the
1788 : * returned HeapTuple.
1789 : * ----------------
1790 : */
1791 :
1792 : #ifdef HEAPDEBUGALL
1793 : #define HEAPDEBUG_1 \
1794 : elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1795 : RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1796 : #define HEAPDEBUG_2 \
1797 : elog(DEBUG2, "heap_getnext returning EOS")
1798 : #define HEAPDEBUG_3 \
1799 : elog(DEBUG2, "heap_getnext returning tuple")
1800 : #else
1801 : #define HEAPDEBUG_1
1802 : #define HEAPDEBUG_2
1803 : #define HEAPDEBUG_3
1804 : #endif /* !defined(HEAPDEBUGALL) */
1805 :
1806 :
1807 : HeapTuple
1808 3118146 : heap_getnext(HeapScanDesc scan, ScanDirection direction)
1809 : {
1810 : /* Note: no locking manipulations needed */
1811 :
1812 : HEAPDEBUG_1; /* heap_getnext( info ) */
1813 :
1814 3118146 : if (scan->rs_pageatatime)
1815 2351516 : heapgettup_pagemode(scan, direction,
1816 : scan->rs_nkeys, scan->rs_key);
1817 : else
1818 766630 : heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1819 :
1820 3118146 : if (scan->rs_ctup.t_data == NULL)
1821 : {
1822 : HEAPDEBUG_2; /* heap_getnext returning EOS */
1823 13798 : return NULL;
1824 : }
1825 :
1826 : /*
1827 : * if we get here it means we have a new current scan tuple, so point to
1828 : * the proper return buffer and return the tuple.
1829 : */
1830 : HEAPDEBUG_3; /* heap_getnext returning tuple */
1831 :
1832 3104348 : pgstat_count_heap_getnext(scan->rs_rd);
1833 :
1834 3104348 : return &(scan->rs_ctup);
1835 : }
1836 :
1837 : /*
1838 : * heap_fetch - retrieve tuple with given tid
1839 : *
1840 : * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1841 : * the tuple, fill in the remaining fields of *tuple, and check the tuple
1842 : * against the specified snapshot.
1843 : *
1844 : * If successful (tuple found and passes snapshot time qual), then *userbuf
1845 : * is set to the buffer holding the tuple and TRUE is returned. The caller
1846 : * must unpin the buffer when done with the tuple.
1847 : *
1848 : * If the tuple is not found (ie, item number references a deleted slot),
1849 : * then tuple->t_data is set to NULL and FALSE is returned.
1850 : *
1851 : * If the tuple is found but fails the time qual check, then FALSE is returned
1852 : * but tuple->t_data is left pointing to the tuple.
1853 : *
1854 : * keep_buf determines what is done with the buffer in the FALSE-result cases.
1855 : * When the caller specifies keep_buf = true, we retain the pin on the buffer
1856 : * and return it in *userbuf (so the caller must eventually unpin it); when
1857 : * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1858 : *
1859 : * stats_relation is the relation to charge the heap_fetch operation against
1860 : * for statistical purposes. (This could be the heap rel itself, an
1861 : * associated index, or NULL to not count the fetch at all.)
1862 : *
1863 : * heap_fetch does not follow HOT chains: only the exact TID requested will
1864 : * be fetched.
1865 : *
1866 : * It is somewhat inconsistent that we ereport() on invalid block number but
1867 : * return false on invalid item number. There are a couple of reasons though.
1868 : * One is that the caller can relatively easily check the block number for
1869 : * validity, but cannot check the item number without reading the page
1870 : * himself. Another is that when we are following a t_ctid link, we can be
1871 : * reasonably confident that the page number is valid (since VACUUM shouldn't
1872 : * truncate off the destination page without having killed the referencing
1873 : * tuple first), but the item number might well not be good.
1874 : */
1875 : bool
1876 1401 : heap_fetch(Relation relation,
1877 : Snapshot snapshot,
1878 : HeapTuple tuple,
1879 : Buffer *userbuf,
1880 : bool keep_buf,
1881 : Relation stats_relation)
1882 : {
1883 1401 : ItemPointer tid = &(tuple->t_self);
1884 : ItemId lp;
1885 : Buffer buffer;
1886 : Page page;
1887 : OffsetNumber offnum;
1888 : bool valid;
1889 :
1890 : /*
1891 : * Fetch and pin the appropriate page of the relation.
1892 : */
1893 1401 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1894 :
1895 : /*
1896 : * Need share lock on buffer to examine tuple commit status.
1897 : */
1898 1401 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
1899 1401 : page = BufferGetPage(buffer);
1900 1401 : TestForOldSnapshot(snapshot, relation, page);
1901 :
1902 : /*
1903 : * We'd better check for out-of-range offnum in case of VACUUM since the
1904 : * TID was obtained.
1905 : */
1906 1401 : offnum = ItemPointerGetOffsetNumber(tid);
1907 1401 : if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1908 : {
1909 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1910 0 : if (keep_buf)
1911 0 : *userbuf = buffer;
1912 : else
1913 : {
1914 0 : ReleaseBuffer(buffer);
1915 0 : *userbuf = InvalidBuffer;
1916 : }
1917 0 : tuple->t_data = NULL;
1918 0 : return false;
1919 : }
1920 :
1921 : /*
1922 : * get the item line pointer corresponding to the requested tid
1923 : */
1924 1401 : lp = PageGetItemId(page, offnum);
1925 :
1926 : /*
1927 : * Must check for deleted tuple.
1928 : */
1929 1401 : if (!ItemIdIsNormal(lp))
1930 : {
1931 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1932 0 : if (keep_buf)
1933 0 : *userbuf = buffer;
1934 : else
1935 : {
1936 0 : ReleaseBuffer(buffer);
1937 0 : *userbuf = InvalidBuffer;
1938 : }
1939 0 : tuple->t_data = NULL;
1940 0 : return false;
1941 : }
1942 :
1943 : /*
1944 : * fill in *tuple fields
1945 : */
1946 1401 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1947 1401 : tuple->t_len = ItemIdGetLength(lp);
1948 1401 : tuple->t_tableOid = RelationGetRelid(relation);
1949 :
1950 : /*
1951 : * check time qualification of tuple, then release lock
1952 : */
1953 1401 : valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1954 :
1955 1401 : if (valid)
1956 1397 : PredicateLockTuple(relation, tuple, snapshot);
1957 :
1958 1401 : CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1959 :
1960 1401 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1961 :
1962 1401 : if (valid)
1963 : {
1964 : /*
1965 : * All checks passed, so return the tuple as valid. Caller is now
1966 : * responsible for releasing the buffer.
1967 : */
1968 1397 : *userbuf = buffer;
1969 :
1970 : /* Count the successful fetch against appropriate rel, if any */
1971 1397 : if (stats_relation != NULL)
1972 0 : pgstat_count_heap_fetch(stats_relation);
1973 :
1974 1397 : return true;
1975 : }
1976 :
1977 : /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1978 4 : if (keep_buf)
1979 0 : *userbuf = buffer;
1980 : else
1981 : {
1982 4 : ReleaseBuffer(buffer);
1983 4 : *userbuf = InvalidBuffer;
1984 : }
1985 :
1986 4 : return false;
1987 : }
1988 :
1989 : /*
1990 : * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1991 : *
1992 : * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1993 : * of a HOT chain), and buffer is the buffer holding this tuple. We search
1994 : * for the first chain member satisfying the given snapshot. If one is
1995 : * found, we update *tid to reference that tuple's offset number, and
1996 : * return TRUE. If no match, return FALSE without modifying *tid.
1997 : *
1998 : * heapTuple is a caller-supplied buffer. When a match is found, we return
1999 : * the tuple here, in addition to updating *tid. If no match is found, the
2000 : * contents of this buffer on return are undefined.
2001 : *
2002 : * If all_dead is not NULL, we check non-visible tuples to see if they are
2003 : * globally dead; *all_dead is set TRUE if all members of the HOT chain
2004 : * are vacuumable, FALSE if not.
2005 : *
2006 : * Unlike heap_fetch, the caller must already have pin and (at least) share
2007 : * lock on the buffer; it is still pinned/locked at exit. Also unlike
2008 : * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
2009 : */
2010 : bool
2011 1050708 : heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
2012 : Snapshot snapshot, HeapTuple heapTuple,
2013 : bool *all_dead, bool first_call)
2014 : {
2015 1050708 : Page dp = (Page) BufferGetPage(buffer);
2016 1050708 : TransactionId prev_xmax = InvalidTransactionId;
2017 : OffsetNumber offnum;
2018 : bool at_chain_start;
2019 : bool valid;
2020 : bool skip;
2021 :
2022 : /* If this is not the first call, previous call returned a (live!) tuple */
2023 1050708 : if (all_dead)
2024 713028 : *all_dead = first_call;
2025 :
2026 1050708 : Assert(TransactionIdIsValid(RecentGlobalXmin));
2027 :
2028 1050708 : Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
2029 1050708 : offnum = ItemPointerGetOffsetNumber(tid);
2030 1050708 : at_chain_start = first_call;
2031 1050708 : skip = !first_call;
2032 :
2033 1050708 : heapTuple->t_self = *tid;
2034 :
2035 : /* Scan through possible multiple members of HOT-chain */
2036 : for (;;)
2037 : {
2038 : ItemId lp;
2039 :
2040 : /* check for bogus TID */
2041 1096023 : if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2042 : break;
2043 :
2044 1096023 : lp = PageGetItemId(dp, offnum);
2045 :
2046 : /* check for unused, dead, or redirected items */
2047 1096023 : if (!ItemIdIsNormal(lp))
2048 : {
2049 : /* We should only see a redirect at start of chain */
2050 28388 : if (ItemIdIsRedirected(lp) && at_chain_start)
2051 : {
2052 : /* Follow the redirect */
2053 5919 : offnum = ItemIdGetRedirect(lp);
2054 5919 : at_chain_start = false;
2055 5919 : continue;
2056 : }
2057 : /* else must be end of chain */
2058 22469 : break;
2059 : }
2060 :
2061 1067635 : heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2062 1067635 : heapTuple->t_len = ItemIdGetLength(lp);
2063 1067635 : heapTuple->t_tableOid = RelationGetRelid(relation);
2064 1067635 : ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
2065 :
2066 : /*
2067 : * Shouldn't see a HEAP_ONLY tuple at chain start.
2068 : */
2069 1067635 : if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2070 0 : break;
2071 :
2072 : /*
2073 : * The xmin should match the previous xmax value, else chain is
2074 : * broken.
2075 : */
2076 1107031 : if (TransactionIdIsValid(prev_xmax) &&
2077 39396 : !TransactionIdEquals(prev_xmax,
2078 : HeapTupleHeaderGetXmin(heapTuple->t_data)))
2079 0 : break;
2080 :
2081 : /*
2082 : * When first_call is true (and thus, skip is initially false) we'll
2083 : * return the first tuple we find. But on later passes, heapTuple
2084 : * will initially be pointing to the tuple we returned last time.
2085 : * Returning it again would be incorrect (and would loop forever), so
2086 : * we skip it and return the next match we find.
2087 : */
2088 1067635 : if (!skip)
2089 : {
2090 : /*
2091 : * For the benefit of logical decoding, have t_self point at the
2092 : * element of the HOT chain we're currently investigating instead
2093 : * of the root tuple of the HOT chain. This is important because
2094 : * the *Satisfies routine for historical mvcc snapshots needs the
2095 : * correct tid to decide about the visibility in some cases.
2096 : */
2097 1065951 : ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
2098 :
2099 : /* If it's visible per the snapshot, we must return it */
2100 1065951 : valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2101 1065951 : CheckForSerializableConflictOut(valid, relation, heapTuple,
2102 : buffer, snapshot);
2103 : /* reset to original, non-redirected, tid */
2104 1065951 : heapTuple->t_self = *tid;
2105 :
2106 1065951 : if (valid)
2107 : {
2108 1000767 : ItemPointerSetOffsetNumber(tid, offnum);
2109 1000767 : PredicateLockTuple(relation, heapTuple, snapshot);
2110 1000767 : if (all_dead)
2111 683492 : *all_dead = false;
2112 1000767 : return true;
2113 : }
2114 : }
2115 66868 : skip = false;
2116 :
2117 : /*
2118 : * If we can't see it, maybe no one else can either. At caller
2119 : * request, check whether all chain members are dead to all
2120 : * transactions.
2121 : */
2122 116091 : if (all_dead && *all_dead &&
2123 49223 : !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2124 34625 : *all_dead = false;
2125 :
2126 : /*
2127 : * Check to see if HOT chain continues past this tuple; if so fetch
2128 : * the next offnum and loop around.
2129 : */
2130 66868 : if (HeapTupleIsHotUpdated(heapTuple))
2131 : {
2132 39396 : Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2133 : ItemPointerGetBlockNumber(tid));
2134 39396 : offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2135 39396 : at_chain_start = false;
2136 39396 : prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2137 : }
2138 : else
2139 : break; /* end of chain */
2140 45315 : }
2141 :
2142 49941 : return false;
2143 : }
2144 :
2145 : /*
2146 : * heap_hot_search - search HOT chain for tuple satisfying snapshot
2147 : *
2148 : * This has the same API as heap_hot_search_buffer, except that the caller
2149 : * does not provide the buffer containing the page, rather we access it
2150 : * locally.
2151 : */
2152 : bool
2153 6145 : heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2154 : bool *all_dead)
2155 : {
2156 : bool result;
2157 : Buffer buffer;
2158 : HeapTupleData heapTuple;
2159 :
2160 6145 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2161 6145 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
2162 6145 : result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2163 : &heapTuple, all_dead, true);
2164 6145 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2165 6145 : ReleaseBuffer(buffer);
2166 6145 : return result;
2167 : }
2168 :
2169 : /*
2170 : * heap_get_latest_tid - get the latest tid of a specified tuple
2171 : *
2172 : * Actually, this gets the latest version that is visible according to
2173 : * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2174 : * possibly uncommitted version.
2175 : *
2176 : * *tid is both an input and an output parameter: it is updated to
2177 : * show the latest version of the row. Note that it will not be changed
2178 : * if no version of the row passes the snapshot test.
2179 : */
2180 : void
2181 42 : heap_get_latest_tid(Relation relation,
2182 : Snapshot snapshot,
2183 : ItemPointer tid)
2184 : {
2185 : BlockNumber blk;
2186 : ItemPointerData ctid;
2187 : TransactionId priorXmax;
2188 :
2189 : /* this is to avoid Assert failures on bad input */
2190 42 : if (!ItemPointerIsValid(tid))
2191 42 : return;
2192 :
2193 : /*
2194 : * Since this can be called with user-supplied TID, don't trust the input
2195 : * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2196 : * don't check t_ctid links again this way. Note that it would not do to
2197 : * call it just once and save the result, either.)
2198 : */
2199 42 : blk = ItemPointerGetBlockNumber(tid);
2200 42 : if (blk >= RelationGetNumberOfBlocks(relation))
2201 0 : elog(ERROR, "block number %u is out of range for relation \"%s\"",
2202 : blk, RelationGetRelationName(relation));
2203 :
2204 : /*
2205 : * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2206 : * need to examine, and *tid is the TID we will return if ctid turns out
2207 : * to be bogus.
2208 : *
2209 : * Note that we will loop until we reach the end of the t_ctid chain.
2210 : * Depending on the snapshot passed, there might be at most one visible
2211 : * version of the row, but we don't try to optimize for that.
2212 : */
2213 42 : ctid = *tid;
2214 42 : priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2215 : for (;;)
2216 : {
2217 : Buffer buffer;
2218 : Page page;
2219 : OffsetNumber offnum;
2220 : ItemId lp;
2221 : HeapTupleData tp;
2222 : bool valid;
2223 :
2224 : /*
2225 : * Read, pin, and lock the page.
2226 : */
2227 57 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2228 57 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
2229 57 : page = BufferGetPage(buffer);
2230 57 : TestForOldSnapshot(snapshot, relation, page);
2231 :
2232 : /*
2233 : * Check for bogus item number. This is not treated as an error
2234 : * condition because it can happen while following a t_ctid link. We
2235 : * just assume that the prior tid is OK and return it unchanged.
2236 : */
2237 57 : offnum = ItemPointerGetOffsetNumber(&ctid);
2238 57 : if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2239 : {
2240 0 : UnlockReleaseBuffer(buffer);
2241 0 : break;
2242 : }
2243 57 : lp = PageGetItemId(page, offnum);
2244 57 : if (!ItemIdIsNormal(lp))
2245 : {
2246 0 : UnlockReleaseBuffer(buffer);
2247 0 : break;
2248 : }
2249 :
2250 : /* OK to access the tuple */
2251 57 : tp.t_self = ctid;
2252 57 : tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2253 57 : tp.t_len = ItemIdGetLength(lp);
2254 57 : tp.t_tableOid = RelationGetRelid(relation);
2255 :
2256 : /*
2257 : * After following a t_ctid link, we might arrive at an unrelated
2258 : * tuple. Check for XMIN match.
2259 : */
2260 72 : if (TransactionIdIsValid(priorXmax) &&
2261 15 : !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
2262 : {
2263 0 : UnlockReleaseBuffer(buffer);
2264 0 : break;
2265 : }
2266 :
2267 : /*
2268 : * Check time qualification of tuple; if visible, set it as the new
2269 : * result candidate.
2270 : */
2271 57 : valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2272 57 : CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2273 57 : if (valid)
2274 38 : *tid = ctid;
2275 :
2276 : /*
2277 : * If there's a valid t_ctid link, follow it, else we're done.
2278 : */
2279 84 : if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2280 46 : HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2281 19 : ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2282 : {
2283 42 : UnlockReleaseBuffer(buffer);
2284 42 : break;
2285 : }
2286 :
2287 15 : ctid = tp.t_data->t_ctid;
2288 15 : priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2289 15 : UnlockReleaseBuffer(buffer);
2290 15 : } /* end of loop */
2291 : }
2292 :
2293 :
2294 : /*
2295 : * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2296 : *
2297 : * This is called after we have waited for the XMAX transaction to terminate.
2298 : * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2299 : * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2300 : * hint bit if possible --- but beware that that may not yet be possible,
2301 : * if the transaction committed asynchronously.
2302 : *
2303 : * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2304 : * even if it commits.
2305 : *
2306 : * Hence callers should look only at XMAX_INVALID.
2307 : *
2308 : * Note this is not allowed for tuples whose xmax is a multixact.
2309 : */
2310 : static void
2311 0 : UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2312 : {
2313 0 : Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2314 0 : Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2315 :
2316 0 : if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2317 : {
2318 0 : if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2319 0 : TransactionIdDidCommit(xid))
2320 0 : HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2321 : xid);
2322 : else
2323 0 : HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2324 : InvalidTransactionId);
2325 : }
2326 0 : }
2327 :
2328 :
2329 : /*
2330 : * GetBulkInsertState - prepare status object for a bulk insert
2331 : */
2332 : BulkInsertState
2333 292 : GetBulkInsertState(void)
2334 : {
2335 : BulkInsertState bistate;
2336 :
2337 292 : bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2338 292 : bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2339 292 : bistate->current_buf = InvalidBuffer;
2340 292 : return bistate;
2341 : }
2342 :
2343 : /*
2344 : * FreeBulkInsertState - clean up after finishing a bulk insert
2345 : */
2346 : void
2347 277 : FreeBulkInsertState(BulkInsertState bistate)
2348 : {
2349 277 : if (bistate->current_buf != InvalidBuffer)
2350 229 : ReleaseBuffer(bistate->current_buf);
2351 277 : FreeAccessStrategy(bistate->strategy);
2352 277 : pfree(bistate);
2353 277 : }
2354 :
2355 : /*
2356 : * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2357 : */
2358 : void
2359 11 : ReleaseBulkInsertStatePin(BulkInsertState bistate)
2360 : {
2361 11 : if (bistate->current_buf != InvalidBuffer)
2362 7 : ReleaseBuffer(bistate->current_buf);
2363 11 : bistate->current_buf = InvalidBuffer;
2364 11 : }
2365 :
2366 :
2367 : /*
2368 : * heap_insert - insert tuple into a heap
2369 : *
2370 : * The new tuple is stamped with current transaction ID and the specified
2371 : * command ID.
2372 : *
2373 : * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2374 : * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2375 : * requires that we arrange that all new tuples go into new pages not
2376 : * containing any tuples from other transactions, and that the relation gets
2377 : * fsync'd before commit. (See also heap_sync() comments)
2378 : *
2379 : * The HEAP_INSERT_SKIP_FSM option is passed directly to
2380 : * RelationGetBufferForTuple, which see for more info.
2381 : *
2382 : * HEAP_INSERT_FROZEN should only be specified for inserts into
2383 : * relfilenodes created during the current subtransaction and when
2384 : * there are no prior snapshots or pre-existing portals open.
2385 : * This causes rows to be frozen, which is an MVCC violation and
2386 : * requires explicit options chosen by user.
2387 : *
2388 : * HEAP_INSERT_IS_SPECULATIVE is used on so-called "speculative insertions",
2389 : * which can be backed out afterwards without aborting the whole transaction.
2390 : * Other sessions can wait for the speculative insertion to be confirmed,
2391 : * turning it into a regular tuple, or aborted, as if it never existed.
2392 : * Speculatively inserted tuples behave as "value locks" of short duration,
2393 : * used to implement INSERT .. ON CONFLICT.
2394 : *
2395 : * Note that most of these options will be applied when inserting into the
2396 : * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2397 : * HEAP_INSERT_IS_SPECULATIVE is explicitly ignored, as the toast data does
2398 : * not partake in speculative insertion.
2399 : *
2400 : * The BulkInsertState object (if any; bistate can be NULL for default
2401 : * behavior) is also just passed through to RelationGetBufferForTuple.
2402 : *
2403 : * The return value is the OID assigned to the tuple (either here or by the
2404 : * caller), or InvalidOid if no OID. The header fields of *tup are updated
2405 : * to match the stored tuple; in particular tup->t_self receives the actual
2406 : * TID where the tuple was stored. But note that any toasting of fields
2407 : * within the tuple data is NOT reflected into *tup.
2408 : */
2409 : Oid
2410 620313 : heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2411 : int options, BulkInsertState bistate)
2412 : {
2413 620313 : TransactionId xid = GetCurrentTransactionId();
2414 : HeapTuple heaptup;
2415 : Buffer buffer;
2416 620313 : Buffer vmbuffer = InvalidBuffer;
2417 620313 : bool all_visible_cleared = false;
2418 :
2419 : /*
2420 : * Fill in tuple header fields, assign an OID, and toast the tuple if
2421 : * necessary.
2422 : *
2423 : * Note: below this point, heaptup is the data we actually intend to store
2424 : * into the relation; tup is the caller's original untoasted data.
2425 : */
2426 620313 : heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2427 :
2428 : /*
2429 : * Find buffer to insert this tuple into. If the page is all visible,
2430 : * this will also pin the requisite visibility map page.
2431 : */
2432 620313 : buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2433 : InvalidBuffer, options, bistate,
2434 : &vmbuffer, NULL);
2435 :
2436 : /*
2437 : * We're about to do the actual insert -- but check for conflict first, to
2438 : * avoid possibly having to roll back work we've just done.
2439 : *
2440 : * This is safe without a recheck as long as there is no possibility of
2441 : * another process scanning the page between this check and the insert
2442 : * being visible to the scan (i.e., an exclusive buffer content lock is
2443 : * continuously held from this point until the tuple insert is visible).
2444 : *
2445 : * For a heap insert, we only need to check for table-level SSI locks. Our
2446 : * new tuple can't possibly conflict with existing tuple locks, and heap
2447 : * page locks are only consolidated versions of tuple locks; they do not
2448 : * lock "gaps" as index page locks do. So we don't need to specify a
2449 : * buffer when making the call, which makes for a faster check.
2450 : */
2451 620313 : CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2452 :
2453 : /* NO EREPORT(ERROR) from here till changes are logged */
2454 620312 : START_CRIT_SECTION();
2455 :
2456 620312 : RelationPutHeapTuple(relation, buffer, heaptup,
2457 620312 : (options & HEAP_INSERT_SPECULATIVE) != 0);
2458 :
2459 620312 : if (PageIsAllVisible(BufferGetPage(buffer)))
2460 : {
2461 469 : all_visible_cleared = true;
2462 469 : PageClearAllVisible(BufferGetPage(buffer));
2463 938 : visibilitymap_clear(relation,
2464 469 : ItemPointerGetBlockNumber(&(heaptup->t_self)),
2465 : vmbuffer, VISIBILITYMAP_VALID_BITS);
2466 : }
2467 :
2468 : /*
2469 : * XXX Should we set PageSetPrunable on this page ?
2470 : *
2471 : * The inserting transaction may eventually abort thus making this tuple
2472 : * DEAD and hence available for pruning. Though we don't want to optimize
2473 : * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2474 : * aborted tuple will never be pruned until next vacuum is triggered.
2475 : *
2476 : * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2477 : */
2478 :
2479 620312 : MarkBufferDirty(buffer);
2480 :
2481 : /* XLOG stuff */
2482 620312 : if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2483 : {
2484 : xl_heap_insert xlrec;
2485 : xl_heap_header xlhdr;
2486 : XLogRecPtr recptr;
2487 595607 : Page page = BufferGetPage(buffer);
2488 595607 : uint8 info = XLOG_HEAP_INSERT;
2489 595607 : int bufflags = 0;
2490 :
2491 : /*
2492 : * If this is a catalog, we need to transmit combocids to properly
2493 : * decode, so log that as well.
2494 : */
2495 595607 : if (RelationIsAccessibleInLogicalDecoding(relation))
2496 0 : log_heap_new_cid(relation, heaptup);
2497 :
2498 : /*
2499 : * If this is the single and first tuple on page, we can reinit the
2500 : * page instead of restoring the whole thing. Set flag, and hide
2501 : * buffer references from XLogInsert.
2502 : */
2503 603323 : if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2504 15432 : PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2505 : {
2506 7441 : info |= XLOG_HEAP_INIT_PAGE;
2507 7441 : bufflags |= REGBUF_WILL_INIT;
2508 : }
2509 :
2510 595607 : xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2511 595607 : xlrec.flags = 0;
2512 595607 : if (all_visible_cleared)
2513 469 : xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2514 595607 : if (options & HEAP_INSERT_SPECULATIVE)
2515 53 : xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2516 595607 : Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2517 :
2518 : /*
2519 : * For logical decoding, we need the tuple even if we're doing a full
2520 : * page write, so make sure it's included even if we take a full-page
2521 : * image. (XXX We could alternatively store a pointer into the FPW).
2522 : */
2523 595607 : if (RelationIsLogicallyLogged(relation))
2524 : {
2525 0 : xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2526 0 : bufflags |= REGBUF_KEEP_DATA;
2527 : }
2528 :
2529 595607 : XLogBeginInsert();
2530 595607 : XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2531 :
2532 595607 : xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2533 595607 : xlhdr.t_infomask = heaptup->t_data->t_infomask;
2534 595607 : xlhdr.t_hoff = heaptup->t_data->t_hoff;
2535 :
2536 : /*
2537 : * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2538 : * write the whole page to the xlog, we don't need to store
2539 : * xl_heap_header in the xlog.
2540 : */
2541 595607 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2542 595607 : XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2543 : /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2544 1191214 : XLogRegisterBufData(0,
2545 595607 : (char *) heaptup->t_data + SizeofHeapTupleHeader,
2546 595607 : heaptup->t_len - SizeofHeapTupleHeader);
2547 :
2548 : /* filtering by origin on a row level is much more efficient */
2549 595607 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2550 :
2551 595607 : recptr = XLogInsert(RM_HEAP_ID, info);
2552 :
2553 595607 : PageSetLSN(page, recptr);
2554 : }
2555 :
2556 620312 : END_CRIT_SECTION();
2557 :
2558 620312 : UnlockReleaseBuffer(buffer);
2559 620312 : if (vmbuffer != InvalidBuffer)
2560 470 : ReleaseBuffer(vmbuffer);
2561 :
2562 : /*
2563 : * If tuple is cachable, mark it for invalidation from the caches in case
2564 : * we abort. Note it is OK to do this after releasing the buffer, because
2565 : * the heaptup data structure is all in local memory, not in the shared
2566 : * buffer.
2567 : */
2568 620312 : CacheInvalidateHeapTuple(relation, heaptup, NULL);
2569 :
2570 : /* Note: speculative insertions are counted too, even if aborted later */
2571 620312 : pgstat_count_heap_insert(relation, 1);
2572 :
2573 : /*
2574 : * If heaptup is a private copy, release it. Don't forget to copy t_self
2575 : * back to the caller's image, too.
2576 : */
2577 620312 : if (heaptup != tup)
2578 : {
2579 1819 : tup->t_self = heaptup->t_self;
2580 1819 : heap_freetuple(heaptup);
2581 : }
2582 :
2583 620312 : return HeapTupleGetOid(tup);
2584 : }
2585 :
2586 : /*
2587 : * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2588 : * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2589 : * Returns a toasted version of the tuple if it was toasted, or the original
2590 : * tuple if not. Note that in any case, the header fields are also set in
2591 : * the original tuple.
2592 : */
2593 : static HeapTuple
2594 728391 : heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2595 : CommandId cid, int options)
2596 : {
2597 : /*
2598 : * For now, parallel operations are required to be strictly read-only.
2599 : * Unlike heap_update() and heap_delete(), an insert should never create a
2600 : * combo CID, so it might be possible to relax this restriction, but not
2601 : * without more thought and testing.
2602 : */
2603 728391 : if (IsInParallelMode())
2604 0 : ereport(ERROR,
2605 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2606 : errmsg("cannot insert tuples during a parallel operation")));
2607 :
2608 728391 : if (relation->rd_rel->relhasoids)
2609 : {
2610 : #ifdef NOT_USED
2611 : /* this is redundant with an Assert in HeapTupleSetOid */
2612 : Assert(tup->t_data->t_infomask & HEAP_HASOID);
2613 : #endif
2614 :
2615 : /*
2616 : * If the object id of this tuple has already been assigned, trust the
2617 : * caller. There are a couple of ways this can happen. At initial db
2618 : * creation, the backend program sets oids for tuples. When we define
2619 : * an index, we set the oid. Finally, in the future, we may allow
2620 : * users to set their own object ids in order to support a persistent
2621 : * object store (objects need to contain pointers to one another).
2622 : */
2623 37545 : if (!OidIsValid(HeapTupleGetOid(tup)))
2624 26558 : HeapTupleSetOid(tup, GetNewOid(relation));
2625 : }
2626 : else
2627 : {
2628 : /* check there is not space for an OID */
2629 690846 : Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2630 : }
2631 :
2632 728391 : tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2633 728391 : tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2634 728391 : tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2635 728391 : HeapTupleHeaderSetXmin(tup->t_data, xid);
2636 728391 : if (options & HEAP_INSERT_FROZEN)
2637 58 : HeapTupleHeaderSetXminFrozen(tup->t_data);
2638 :
2639 728391 : HeapTupleHeaderSetCmin(tup->t_data, cid);
2640 728391 : HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2641 728391 : tup->t_tableOid = RelationGetRelid(relation);
2642 :
2643 : /*
2644 : * If the new tuple is too big for storage or contains already toasted
2645 : * out-of-line attributes from some other relation, invoke the toaster.
2646 : */
2647 729547 : if (relation->rd_rel->relkind != RELKIND_RELATION &&
2648 1156 : relation->rd_rel->relkind != RELKIND_MATVIEW)
2649 : {
2650 : /* toast table entries should never be recursively toasted */
2651 720 : Assert(!HeapTupleHasExternal(tup));
2652 720 : return tup;
2653 : }
2654 727671 : else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2655 1819 : return toast_insert_or_update(relation, tup, NULL, options);
2656 : else
2657 725852 : return tup;
2658 : }
2659 :
2660 : /*
2661 : * heap_multi_insert - insert multiple tuple into a heap
2662 : *
2663 : * This is like heap_insert(), but inserts multiple tuples in one operation.
2664 : * That's faster than calling heap_insert() in a loop, because when multiple
2665 : * tuples can be inserted on a single page, we can write just a single WAL
2666 : * record covering all of them, and only need to lock/unlock the page once.
2667 : *
2668 : * Note: this leaks memory into the current memory context. You can create a
2669 : * temporary context before calling this, if that's a problem.
2670 : */
2671 : void
2672 238 : heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2673 : CommandId cid, int options, BulkInsertState bistate)
2674 : {
2675 238 : TransactionId xid = GetCurrentTransactionId();
2676 : HeapTuple *heaptuples;
2677 : int i;
2678 : int ndone;
2679 238 : char *scratch = NULL;
2680 : Page page;
2681 : bool needwal;
2682 : Size saveFreeSpace;
2683 238 : bool need_tuple_data = RelationIsLogicallyLogged(relation);
2684 238 : bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2685 :
2686 238 : needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2687 238 : saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2688 : HEAP_DEFAULT_FILLFACTOR);
2689 :
2690 : /* Toast and set header data in all the tuples */
2691 238 : heaptuples = palloc(ntuples * sizeof(HeapTuple));
2692 108316 : for (i = 0; i < ntuples; i++)
2693 108078 : heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2694 : xid, cid, options);
2695 :
2696 : /*
2697 : * Allocate some memory to use for constructing the WAL record. Using
2698 : * palloc() within a critical section is not safe, so we allocate this
2699 : * beforehand.
2700 : */
2701 238 : if (needwal)
2702 219 : scratch = palloc(BLCKSZ);
2703 :
2704 : /*
2705 : * We're about to do the actual inserts -- but check for conflict first,
2706 : * to minimize the possibility of having to roll back work we've just
2707 : * done.
2708 : *
2709 : * A check here does not definitively prevent a serialization anomaly;
2710 : * that check MUST be done at least past the point of acquiring an
2711 : * exclusive buffer content lock on every buffer that will be affected,
2712 : * and MAY be done after all inserts are reflected in the buffers and
2713 : * those locks are released; otherwise there race condition. Since
2714 : * multiple buffers can be locked and unlocked in the loop below, and it
2715 : * would not be feasible to identify and lock all of those buffers before
2716 : * the loop, we must do a final check at the end.
2717 : *
2718 : * The check here could be omitted with no loss of correctness; it is
2719 : * present strictly as an optimization.
2720 : *
2721 : * For heap inserts, we only need to check for table-level SSI locks. Our
2722 : * new tuples can't possibly conflict with existing tuple locks, and heap
2723 : * page locks are only consolidated versions of tuple locks; they do not
2724 : * lock "gaps" as index page locks do. So we don't need to specify a
2725 : * buffer when making the call, which makes for a faster check.
2726 : */
2727 238 : CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2728 :
2729 238 : ndone = 0;
2730 1909 : while (ndone < ntuples)
2731 : {
2732 : Buffer buffer;
2733 1433 : Buffer vmbuffer = InvalidBuffer;
2734 1433 : bool all_visible_cleared = false;
2735 : int nthispage;
2736 :
2737 1433 : CHECK_FOR_INTERRUPTS();
2738 :
2739 : /*
2740 : * Find buffer where at least the next tuple will fit. If the page is
2741 : * all-visible, this will also pin the requisite visibility map page.
2742 : */
2743 1433 : buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2744 : InvalidBuffer, options, bistate,
2745 : &vmbuffer, NULL);
2746 1433 : page = BufferGetPage(buffer);
2747 :
2748 : /* NO EREPORT(ERROR) from here till changes are logged */
2749 1433 : START_CRIT_SECTION();
2750 :
2751 : /*
2752 : * RelationGetBufferForTuple has ensured that the first tuple fits.
2753 : * Put that on the page, and then as many other tuples as fit.
2754 : */
2755 1433 : RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2756 108078 : for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2757 : {
2758 107840 : HeapTuple heaptup = heaptuples[ndone + nthispage];
2759 :
2760 107840 : if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2761 1195 : break;
2762 :
2763 106645 : RelationPutHeapTuple(relation, buffer, heaptup, false);
2764 :
2765 : /*
2766 : * We don't use heap_multi_insert for catalog tuples yet, but
2767 : * better be prepared...
2768 : */
2769 106645 : if (needwal && need_cids)
2770 0 : log_heap_new_cid(relation, heaptup);
2771 : }
2772 :
2773 1433 : if (PageIsAllVisible(page))
2774 : {
2775 0 : all_visible_cleared = true;
2776 0 : PageClearAllVisible(page);
2777 0 : visibilitymap_clear(relation,
2778 : BufferGetBlockNumber(buffer),
2779 : vmbuffer, VISIBILITYMAP_VALID_BITS);
2780 : }
2781 :
2782 : /*
2783 : * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2784 : */
2785 :
2786 1433 : MarkBufferDirty(buffer);
2787 :
2788 : /* XLOG stuff */
2789 1433 : if (needwal)
2790 : {
2791 : XLogRecPtr recptr;
2792 : xl_heap_multi_insert *xlrec;
2793 1369 : uint8 info = XLOG_HEAP2_MULTI_INSERT;
2794 : char *tupledata;
2795 : int totaldatalen;
2796 1369 : char *scratchptr = scratch;
2797 : bool init;
2798 1369 : int bufflags = 0;
2799 :
2800 : /*
2801 : * If the page was previously empty, we can reinit the page
2802 : * instead of restoring the whole thing.
2803 : */
2804 2556 : init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2805 1187 : PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2806 :
2807 : /* allocate xl_heap_multi_insert struct from the scratch area */
2808 1369 : xlrec = (xl_heap_multi_insert *) scratchptr;
2809 1369 : scratchptr += SizeOfHeapMultiInsert;
2810 :
2811 : /*
2812 : * Allocate offsets array. Unless we're reinitializing the page,
2813 : * in that case the tuples are stored in order starting at
2814 : * FirstOffsetNumber and we don't need to store the offsets
2815 : * explicitly.
2816 : */
2817 1369 : if (!init)
2818 182 : scratchptr += nthispage * sizeof(OffsetNumber);
2819 :
2820 : /* the rest of the scratch space is used for tuple data */
2821 1369 : tupledata = scratchptr;
2822 :
2823 1369 : xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2824 1369 : xlrec->ntuples = nthispage;
2825 :
2826 : /*
2827 : * Write out an xl_multi_insert_tuple and the tuple data itself
2828 : * for each tuple.
2829 : */
2830 106470 : for (i = 0; i < nthispage; i++)
2831 : {
2832 105101 : HeapTuple heaptup = heaptuples[ndone + i];
2833 : xl_multi_insert_tuple *tuphdr;
2834 : int datalen;
2835 :
2836 105101 : if (!init)
2837 8154 : xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2838 : /* xl_multi_insert_tuple needs two-byte alignment. */
2839 105101 : tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2840 105101 : scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2841 :
2842 105101 : tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2843 105101 : tuphdr->t_infomask = heaptup->t_data->t_infomask;
2844 105101 : tuphdr->t_hoff = heaptup->t_data->t_hoff;
2845 :
2846 : /* write bitmap [+ padding] [+ oid] + data */
2847 105101 : datalen = heaptup->t_len - SizeofHeapTupleHeader;
2848 210202 : memcpy(scratchptr,
2849 105101 : (char *) heaptup->t_data + SizeofHeapTupleHeader,
2850 : datalen);
2851 105101 : tuphdr->datalen = datalen;
2852 105101 : scratchptr += datalen;
2853 : }
2854 1369 : totaldatalen = scratchptr - tupledata;
2855 1369 : Assert((scratchptr - scratch) < BLCKSZ);
2856 :
2857 1369 : if (need_tuple_data)
2858 0 : xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2859 :
2860 : /*
2861 : * Signal that this is the last xl_heap_multi_insert record
2862 : * emitted by this call to heap_multi_insert(). Needed for logical
2863 : * decoding so it knows when to cleanup temporary data.
2864 : */
2865 1369 : if (ndone + nthispage == ntuples)
2866 219 : xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2867 :
2868 1369 : if (init)
2869 : {
2870 1187 : info |= XLOG_HEAP_INIT_PAGE;
2871 1187 : bufflags |= REGBUF_WILL_INIT;
2872 : }
2873 :
2874 : /*
2875 : * If we're doing logical decoding, include the new tuple data
2876 : * even if we take a full-page image of the page.
2877 : */
2878 1369 : if (need_tuple_data)
2879 0 : bufflags |= REGBUF_KEEP_DATA;
2880 :
2881 1369 : XLogBeginInsert();
2882 1369 : XLogRegisterData((char *) xlrec, tupledata - scratch);
2883 1369 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2884 :
2885 1369 : XLogRegisterBufData(0, tupledata, totaldatalen);
2886 :
2887 : /* filtering by origin on a row level is much more efficient */
2888 1369 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2889 :
2890 1369 : recptr = XLogInsert(RM_HEAP2_ID, info);
2891 :
2892 1369 : PageSetLSN(page, recptr);
2893 : }
2894 :
2895 1433 : END_CRIT_SECTION();
2896 :
2897 1433 : UnlockReleaseBuffer(buffer);
2898 1433 : if (vmbuffer != InvalidBuffer)
2899 0 : ReleaseBuffer(vmbuffer);
2900 :
2901 1433 : ndone += nthispage;
2902 : }
2903 :
2904 : /*
2905 : * We're done with the actual inserts. Check for conflicts again, to
2906 : * ensure that all rw-conflicts in to these inserts are detected. Without
2907 : * this final check, a sequential scan of the heap may have locked the
2908 : * table after the "before" check, missing one opportunity to detect the
2909 : * conflict, and then scanned the table before the new tuples were there,
2910 : * missing the other chance to detect the conflict.
2911 : *
2912 : * For heap inserts, we only need to check for table-level SSI locks. Our
2913 : * new tuples can't possibly conflict with existing tuple locks, and heap
2914 : * page locks are only consolidated versions of tuple locks; they do not
2915 : * lock "gaps" as index page locks do. So we don't need to specify a
2916 : * buffer when making the call.
2917 : */
2918 238 : CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2919 :
2920 : /*
2921 : * If tuples are cachable, mark them for invalidation from the caches in
2922 : * case we abort. Note it is OK to do this after releasing the buffer,
2923 : * because the heaptuples data structure is all in local memory, not in
2924 : * the shared buffer.
2925 : */
2926 238 : if (IsCatalogRelation(relation))
2927 : {
2928 0 : for (i = 0; i < ntuples; i++)
2929 0 : CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2930 : }
2931 :
2932 : /*
2933 : * Copy t_self fields back to the caller's original tuples. This does
2934 : * nothing for untoasted tuples (tuples[i] == heaptuples[i)], but it's
2935 : * probably faster to always copy than check.
2936 : */
2937 108316 : for (i = 0; i < ntuples; i++)
2938 108078 : tuples[i]->t_self = heaptuples[i]->t_self;
2939 :
2940 238 : pgstat_count_heap_insert(relation, ntuples);
2941 238 : }
2942 :
2943 : /*
2944 : * simple_heap_insert - insert a tuple
2945 : *
2946 : * Currently, this routine differs from heap_insert only in supplying
2947 : * a default command ID and not allowing access to the speedup options.
2948 : *
2949 : * This should be used rather than using heap_insert directly in most places
2950 : * where we are modifying system catalogs.
2951 : */
2952 : Oid
2953 68894 : simple_heap_insert(Relation relation, HeapTuple tup)
2954 : {
2955 68894 : return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2956 : }
2957 :
2958 : /*
2959 : * Given infomask/infomask2, compute the bits that must be saved in the
2960 : * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2961 : * xl_heap_lock_updated WAL records.
2962 : *
2963 : * See fix_infomask_from_infobits.
2964 : */
2965 : static uint8
2966 121427 : compute_infobits(uint16 infomask, uint16 infomask2)
2967 : {
2968 485708 : return
2969 121427 : ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2970 121427 : ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2971 121427 : ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2972 : /* note we ignore HEAP_XMAX_SHR_LOCK here */
2973 121427 : ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2974 121427 : ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2975 : XLHL_KEYS_UPDATED : 0);
2976 : }
2977 :
2978 : /*
2979 : * Given two versions of the same t_infomask for a tuple, compare them and
2980 : * return whether the relevant status for a tuple Xmax has changed. This is
2981 : * used after a buffer lock has been released and reacquired: we want to ensure
2982 : * that the tuple state continues to be the same it was when we previously
2983 : * examined it.
2984 : *
2985 : * Note the Xmax field itself must be compared separately.
2986 : */
2987 : static inline bool
2988 2 : xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2989 : {
2990 2 : const uint16 interesting =
2991 : HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2992 :
2993 2 : if ((new_infomask & interesting) != (old_infomask & interesting))
2994 0 : return true;
2995 :
2996 2 : return false;
2997 : }
2998 :
2999 : /*
3000 : * heap_delete - delete a tuple
3001 : *
3002 : * NB: do not call this directly unless you are prepared to deal with
3003 : * concurrent-update conditions. Use simple_heap_delete instead.
3004 : *
3005 : * relation - table to be modified (caller must hold suitable lock)
3006 : * tid - TID of tuple to be deleted
3007 : * cid - delete command ID (used for visibility test, and stored into
3008 : * cmax if successful)
3009 : * crosscheck - if not InvalidSnapshot, also check tuple against this
3010 : * wait - true if should wait for any conflicting update to commit/abort
3011 : * hufd - output parameter, filled in failure cases (see below)
3012 : *
3013 : * Normal, successful return value is HeapTupleMayBeUpdated, which
3014 : * actually means we did delete it. Failure return codes are
3015 : * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3016 : * (the last only possible if wait == false).
3017 : *
3018 : * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3019 : * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3020 : * (the last only for HeapTupleSelfUpdated, since we
3021 : * cannot obtain cmax from a combocid generated by another transaction).
3022 : * See comments for struct HeapUpdateFailureData for additional info.
3023 : */
3024 : HTSU_Result
3025 109181 : heap_delete(Relation relation, ItemPointer tid,
3026 : CommandId cid, Snapshot crosscheck, bool wait,
3027 : HeapUpdateFailureData *hufd)
3028 : {
3029 : HTSU_Result result;
3030 109181 : TransactionId xid = GetCurrentTransactionId();
3031 : ItemId lp;
3032 : HeapTupleData tp;
3033 : Page page;
3034 : BlockNumber block;
3035 : Buffer buffer;
3036 109181 : Buffer vmbuffer = InvalidBuffer;
3037 : TransactionId new_xmax;
3038 : uint16 new_infomask,
3039 : new_infomask2;
3040 109181 : bool have_tuple_lock = false;
3041 : bool iscombo;
3042 109181 : bool all_visible_cleared = false;
3043 109181 : HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3044 109181 : bool old_key_copied = false;
3045 :
3046 109181 : Assert(ItemPointerIsValid(tid));
3047 :
3048 : /*
3049 : * Forbid this during a parallel operation, lest it allocate a combocid.
3050 : * Other workers might need that combocid for visibility checks, and we
3051 : * have no provision for broadcasting it to them.
3052 : */
3053 109181 : if (IsInParallelMode())
3054 0 : ereport(ERROR,
3055 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3056 : errmsg("cannot delete tuples during a parallel operation")));
3057 :
3058 109181 : block = ItemPointerGetBlockNumber(tid);
3059 109181 : buffer = ReadBuffer(relation, block);
3060 109181 : page = BufferGetPage(buffer);
3061 :
3062 : /*
3063 : * Before locking the buffer, pin the visibility map page if it appears to
3064 : * be necessary. Since we haven't got the lock yet, someone else might be
3065 : * in the middle of changing this, so we'll need to recheck after we have
3066 : * the lock.
3067 : */
3068 109181 : if (PageIsAllVisible(page))
3069 214 : visibilitymap_pin(relation, block, &vmbuffer);
3070 :
3071 109181 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3072 :
3073 : /*
3074 : * If we didn't pin the visibility map page and the page has become all
3075 : * visible while we were busy locking the buffer, we'll have to unlock and
3076 : * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3077 : * unfortunate, but hopefully shouldn't happen often.
3078 : */
3079 109181 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3080 : {
3081 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3082 0 : visibilitymap_pin(relation, block, &vmbuffer);
3083 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3084 : }
3085 :
3086 109181 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3087 109181 : Assert(ItemIdIsNormal(lp));
3088 :
3089 109181 : tp.t_tableOid = RelationGetRelid(relation);
3090 109181 : tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3091 109181 : tp.t_len = ItemIdGetLength(lp);
3092 109181 : tp.t_self = *tid;
3093 :
3094 : l1:
3095 109181 : result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3096 :
3097 109181 : if (result == HeapTupleInvisible)
3098 : {
3099 0 : UnlockReleaseBuffer(buffer);
3100 0 : ereport(ERROR,
3101 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3102 : errmsg("attempted to delete invisible tuple")));
3103 : }
3104 109181 : else if (result == HeapTupleBeingUpdated && wait)
3105 : {
3106 : TransactionId xwait;
3107 : uint16 infomask;
3108 :
3109 : /* must copy state data before unlocking buffer */
3110 15 : xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3111 15 : infomask = tp.t_data->t_infomask;
3112 :
3113 : /*
3114 : * Sleep until concurrent transaction ends -- except when there's a
3115 : * single locker and it's our own transaction. Note we don't care
3116 : * which lock mode the locker has, because we need the strongest one.
3117 : *
3118 : * Before sleeping, we need to acquire tuple lock to establish our
3119 : * priority for the tuple (see heap_lock_tuple). LockTuple will
3120 : * release us when we are next-in-line for the tuple.
3121 : *
3122 : * If we are forced to "start over" below, we keep the tuple lock;
3123 : * this arranges that we stay at the head of the line while rechecking
3124 : * tuple state.
3125 : */
3126 15 : if (infomask & HEAP_XMAX_IS_MULTI)
3127 : {
3128 : /* wait for multixact */
3129 0 : if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3130 : LockTupleExclusive))
3131 : {
3132 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3133 :
3134 : /* acquire tuple lock, if necessary */
3135 0 : heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3136 : LockWaitBlock, &have_tuple_lock);
3137 :
3138 : /* wait for multixact */
3139 0 : MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
3140 : relation, &(tp.t_self), XLTW_Delete,
3141 : NULL);
3142 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3143 :
3144 : /*
3145 : * If xwait had just locked the tuple then some other xact
3146 : * could update this tuple before we get to this point. Check
3147 : * for xmax change, and start over if so.
3148 : */
3149 0 : if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3150 0 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3151 : xwait))
3152 : goto l1;
3153 : }
3154 :
3155 : /*
3156 : * You might think the multixact is necessarily done here, but not
3157 : * so: it could have surviving members, namely our own xact or
3158 : * other subxacts of this backend. It is legal for us to delete
3159 : * the tuple in either case, however (the latter case is
3160 : * essentially a situation of upgrading our former shared lock to
3161 : * exclusive). We don't bother changing the on-disk hint bits
3162 : * since we are about to overwrite the xmax altogether.
3163 : */
3164 : }
3165 15 : else if (!TransactionIdIsCurrentTransactionId(xwait))
3166 : {
3167 : /*
3168 : * Wait for regular transaction to end; but first, acquire tuple
3169 : * lock.
3170 : */
3171 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3172 0 : heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3173 : LockWaitBlock, &have_tuple_lock);
3174 0 : XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3175 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3176 :
3177 : /*
3178 : * xwait is done, but if xwait had just locked the tuple then some
3179 : * other xact could update this tuple before we get to this point.
3180 : * Check for xmax change, and start over if so.
3181 : */
3182 0 : if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3183 0 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3184 : xwait))
3185 : goto l1;
3186 :
3187 : /* Otherwise check if it committed or aborted */
3188 0 : UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3189 : }
3190 :
3191 : /*
3192 : * We may overwrite if previous xmax aborted, or if it committed but
3193 : * only locked the tuple without updating it.
3194 : */
3195 30 : if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3196 15 : HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
3197 0 : HeapTupleHeaderIsOnlyLocked(tp.t_data))
3198 15 : result = HeapTupleMayBeUpdated;
3199 : else
3200 0 : result = HeapTupleUpdated;
3201 : }
3202 :
3203 109181 : if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3204 : {
3205 : /* Perform additional check for transaction-snapshot mode RI updates */
3206 0 : if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3207 0 : result = HeapTupleUpdated;
3208 : }
3209 :
3210 109181 : if (result != HeapTupleMayBeUpdated)
3211 : {
3212 1 : Assert(result == HeapTupleSelfUpdated ||
3213 : result == HeapTupleUpdated ||
3214 : result == HeapTupleBeingUpdated);
3215 1 : Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
3216 1 : hufd->ctid = tp.t_data->t_ctid;
3217 1 : hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
3218 1 : if (result == HeapTupleSelfUpdated)
3219 1 : hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3220 : else
3221 0 : hufd->cmax = InvalidCommandId;
3222 1 : UnlockReleaseBuffer(buffer);
3223 1 : if (have_tuple_lock)
3224 0 : UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3225 1 : if (vmbuffer != InvalidBuffer)
3226 0 : ReleaseBuffer(vmbuffer);
3227 1 : return result;
3228 : }
3229 :
3230 : /*
3231 : * We're about to do the actual delete -- check for conflict first, to
3232 : * avoid possibly having to roll back work we've just done.
3233 : *
3234 : * This is safe without a recheck as long as there is no possibility of
3235 : * another process scanning the page between this check and the delete
3236 : * being visible to the scan (i.e., an exclusive buffer content lock is
3237 : * continuously held from this point until the tuple delete is visible).
3238 : */
3239 109180 : CheckForSerializableConflictIn(relation, &tp, buffer);
3240 :
3241 : /* replace cid with a combo cid if necessary */
3242 109180 : HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3243 :
3244 : /*
3245 : * Compute replica identity tuple before entering the critical section so
3246 : * we don't PANIC upon a memory allocation failure.
3247 : */
3248 109180 : old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3249 :
3250 : /*
3251 : * If this is the first possibly-multixact-able operation in the current
3252 : * transaction, set my per-backend OldestMemberMXactId setting. We can be
3253 : * certain that the transaction will never become a member of any older
3254 : * MultiXactIds than that. (We have to do this even if we end up just
3255 : * using our own TransactionId below, since some other backend could
3256 : * incorporate our XID into a MultiXact immediately afterwards.)
3257 : */
3258 109180 : MultiXactIdSetOldestMember();
3259 :
3260 218360 : compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
3261 218360 : tp.t_data->t_infomask, tp.t_data->t_infomask2,
3262 : xid, LockTupleExclusive, true,
3263 : &new_xmax, &new_infomask, &new_infomask2);
3264 :
3265 109180 : START_CRIT_SECTION();
3266 :
3267 : /*
3268 : * If this transaction commits, the tuple will become DEAD sooner or
3269 : * later. Set flag that this page is a candidate for pruning once our xid
3270 : * falls below the OldestXmin horizon. If the transaction finally aborts,
3271 : * the subsequent page pruning will be a no-op and the hint will be
3272 : * cleared.
3273 : */
3274 109180 : PageSetPrunable(page, xid);
3275 :
3276 109180 : if (PageIsAllVisible(page))
3277 : {
3278 214 : all_visible_cleared = true;
3279 214 : PageClearAllVisible(page);
3280 214 : visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3281 : vmbuffer, VISIBILITYMAP_VALID_BITS);
3282 : }
3283 :
3284 : /* store transaction information of xact deleting the tuple */
3285 109180 : tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3286 109180 : tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3287 109180 : tp.t_data->t_infomask |= new_infomask;
3288 109180 : tp.t_data->t_infomask2 |= new_infomask2;
3289 109180 : HeapTupleHeaderClearHotUpdated(tp.t_data);
3290 109180 : HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3291 109180 : HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3292 : /* Make sure there is no forward chain link in t_ctid */
3293 109180 : tp.t_data->t_ctid = tp.t_self;
3294 :
3295 109180 : MarkBufferDirty(buffer);
3296 :
3297 : /*
3298 : * XLOG stuff
3299 : *
3300 : * NB: heap_abort_speculative() uses the same xlog record and replay
3301 : * routines.
3302 : */
3303 109180 : if (RelationNeedsWAL(relation))
3304 : {
3305 : xl_heap_delete xlrec;
3306 : XLogRecPtr recptr;
3307 :
3308 : /* For logical decode we need combocids to properly decode the catalog */
3309 109041 : if (RelationIsAccessibleInLogicalDecoding(relation))
3310 0 : log_heap_new_cid(relation, &tp);
3311 :
3312 109041 : xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0;
3313 109041 : xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3314 109041 : tp.t_data->t_infomask2);
3315 109041 : xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3316 109041 : xlrec.xmax = new_xmax;
3317 :
3318 109041 : if (old_key_tuple != NULL)
3319 : {
3320 0 : if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3321 0 : xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3322 : else
3323 0 : xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3324 : }
3325 :
3326 109041 : XLogBeginInsert();
3327 109041 : XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3328 :
3329 109041 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3330 :
3331 : /*
3332 : * Log replica identity of the deleted tuple if there is one
3333 : */
3334 109041 : if (old_key_tuple != NULL)
3335 : {
3336 : xl_heap_header xlhdr;
3337 :
3338 0 : xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3339 0 : xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3340 0 : xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3341 :
3342 0 : XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3343 0 : XLogRegisterData((char *) old_key_tuple->t_data
3344 : + SizeofHeapTupleHeader,
3345 0 : old_key_tuple->t_len
3346 0 : - SizeofHeapTupleHeader);
3347 : }
3348 :
3349 : /* filtering by origin on a row level is much more efficient */
3350 109041 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3351 :
3352 109041 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3353 :
3354 109041 : PageSetLSN(page, recptr);
3355 : }
3356 :
3357 109180 : END_CRIT_SECTION();
3358 :
3359 109180 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3360 :
3361 109180 : if (vmbuffer != InvalidBuffer)
3362 214 : ReleaseBuffer(vmbuffer);
3363 :
3364 : /*
3365 : * If the tuple has toasted out-of-line attributes, we need to delete
3366 : * those items too. We have to do this before releasing the buffer
3367 : * because we need to look at the contents of the tuple, but it's OK to
3368 : * release the content lock on the buffer first.
3369 : */
3370 109300 : if (relation->rd_rel->relkind != RELKIND_RELATION &&
3371 120 : relation->rd_rel->relkind != RELKIND_MATVIEW)
3372 : {
3373 : /* toast table entries should never be recursively toasted */
3374 117 : Assert(!HeapTupleHasExternal(&tp));
3375 : }
3376 109063 : else if (HeapTupleHasExternal(&tp))
3377 7 : toast_delete(relation, &tp, false);
3378 :
3379 : /*
3380 : * Mark tuple for invalidation from system caches at next command
3381 : * boundary. We have to do this before releasing the buffer because we
3382 : * need to look at the contents of the tuple.
3383 : */
3384 109180 : CacheInvalidateHeapTuple(relation, &tp, NULL);
3385 :
3386 : /* Now we can release the buffer */
3387 109180 : ReleaseBuffer(buffer);
3388 :
3389 : /*
3390 : * Release the lmgr tuple lock, if we had it.
3391 : */
3392 109180 : if (have_tuple_lock)
3393 0 : UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3394 :
3395 109180 : pgstat_count_heap_delete(relation);
3396 :
3397 109180 : if (old_key_tuple != NULL && old_key_copied)
3398 0 : heap_freetuple(old_key_tuple);
3399 :
3400 109180 : return HeapTupleMayBeUpdated;
3401 : }
3402 :
3403 : /*
3404 : * simple_heap_delete - delete a tuple
3405 : *
3406 : * This routine may be used to delete a tuple when concurrent updates of
3407 : * the target tuple are not expected (for example, because we have a lock
3408 : * on the relation associated with the tuple). Any failure is reported
3409 : * via ereport().
3410 : */
3411 : void
3412 43454 : simple_heap_delete(Relation relation, ItemPointer tid)
3413 : {
3414 : HTSU_Result result;
3415 : HeapUpdateFailureData hufd;
3416 :
3417 43454 : result = heap_delete(relation, tid,
3418 : GetCurrentCommandId(true), InvalidSnapshot,
3419 : true /* wait for commit */ ,
3420 : &hufd);
3421 43454 : switch (result)
3422 : {
3423 : case HeapTupleSelfUpdated:
3424 : /* Tuple was already updated in current command? */
3425 0 : elog(ERROR, "tuple already updated by self");
3426 : break;
3427 :
3428 : case HeapTupleMayBeUpdated:
3429 : /* done successfully */
3430 43454 : break;
3431 :
3432 : case HeapTupleUpdated:
3433 0 : elog(ERROR, "tuple concurrently updated");
3434 : break;
3435 :
3436 : default:
3437 0 : elog(ERROR, "unrecognized heap_delete status: %u", result);
3438 : break;
3439 : }
3440 43454 : }
3441 :
3442 : /*
3443 : * heap_update - replace a tuple
3444 : *
3445 : * NB: do not call this directly unless you are prepared to deal with
3446 : * concurrent-update conditions. Use simple_heap_update instead.
3447 : *
3448 : * relation - table to be modified (caller must hold suitable lock)
3449 : * otid - TID of old tuple to be replaced
3450 : * newtup - newly constructed tuple data to store
3451 : * cid - update command ID (used for visibility test, and stored into
3452 : * cmax/cmin if successful)
3453 : * crosscheck - if not InvalidSnapshot, also check old tuple against this
3454 : * wait - true if should wait for any conflicting update to commit/abort
3455 : * hufd - output parameter, filled in failure cases (see below)
3456 : * lockmode - output parameter, filled with lock mode acquired on tuple
3457 : *
3458 : * Normal, successful return value is HeapTupleMayBeUpdated, which
3459 : * actually means we *did* update it. Failure return codes are
3460 : * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3461 : * (the last only possible if wait == false).
3462 : *
3463 : * On success, the header fields of *newtup are updated to match the new
3464 : * stored tuple; in particular, newtup->t_self is set to the TID where the
3465 : * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3466 : * update was done. However, any TOAST changes in the new tuple's
3467 : * data are not reflected into *newtup.
3468 : *
3469 : * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3470 : * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3471 : * (the last only for HeapTupleSelfUpdated, since we
3472 : * cannot obtain cmax from a combocid generated by another transaction).
3473 : * See comments for struct HeapUpdateFailureData for additional info.
3474 : */
3475 : HTSU_Result
3476 9399 : heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3477 : CommandId cid, Snapshot crosscheck, bool wait,
3478 : HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3479 : {
3480 : HTSU_Result result;
3481 9399 : TransactionId xid = GetCurrentTransactionId();
3482 : Bitmapset *hot_attrs;
3483 : Bitmapset *key_attrs;
3484 : Bitmapset *id_attrs;
3485 : Bitmapset *interesting_attrs;
3486 : Bitmapset *modified_attrs;
3487 : ItemId lp;
3488 : HeapTupleData oldtup;
3489 : HeapTuple heaptup;
3490 9399 : HeapTuple old_key_tuple = NULL;
3491 9399 : bool old_key_copied = false;
3492 : Page page;
3493 : BlockNumber block;
3494 : MultiXactStatus mxact_status;
3495 : Buffer buffer,
3496 : newbuf,
3497 9399 : vmbuffer = InvalidBuffer,
3498 9399 : vmbuffer_new = InvalidBuffer;
3499 : bool need_toast;
3500 : Size newtupsize,
3501 : pagefree;
3502 9399 : bool have_tuple_lock = false;
3503 : bool iscombo;
3504 9399 : bool use_hot_update = false;
3505 9399 : bool hot_attrs_checked = false;
3506 : bool key_intact;
3507 9399 : bool all_visible_cleared = false;
3508 9399 : bool all_visible_cleared_new = false;
3509 : bool checked_lockers;
3510 : bool locker_remains;
3511 : TransactionId xmax_new_tuple,
3512 : xmax_old_tuple;
3513 : uint16 infomask_old_tuple,
3514 : infomask2_old_tuple,
3515 : infomask_new_tuple,
3516 : infomask2_new_tuple;
3517 :
3518 9399 : Assert(ItemPointerIsValid(otid));
3519 :
3520 : /*
3521 : * Forbid this during a parallel operation, lest it allocate a combocid.
3522 : * Other workers might need that combocid for visibility checks, and we
3523 : * have no provision for broadcasting it to them.
3524 : */
3525 9399 : if (IsInParallelMode())
3526 0 : ereport(ERROR,
3527 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3528 : errmsg("cannot update tuples during a parallel operation")));
3529 :
3530 : /*
3531 : * Fetch the list of attributes to be checked for various operations.
3532 : *
3533 : * For HOT considerations, this is wasted effort if we fail to update or
3534 : * have to put the new tuple on a different page. But we must compute the
3535 : * list before obtaining buffer lock --- in the worst case, if we are
3536 : * doing an update on one of the relevant system catalogs, we could
3537 : * deadlock if we try to fetch the list later. In any case, the relcache
3538 : * caches the data so this is usually pretty cheap.
3539 : *
3540 : * We also need columns used by the replica identity and columns that are
3541 : * considered the "key" of rows in the table.
3542 : *
3543 : * Note that we get copies of each bitmap, so we need not worry about
3544 : * relcache flush happening midway through.
3545 : */
3546 9399 : hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3547 9399 : key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3548 9399 : id_attrs = RelationGetIndexAttrBitmap(relation,
3549 : INDEX_ATTR_BITMAP_IDENTITY_KEY);
3550 :
3551 :
3552 9399 : block = ItemPointerGetBlockNumber(otid);
3553 9399 : buffer = ReadBuffer(relation, block);
3554 9399 : page = BufferGetPage(buffer);
3555 :
3556 9399 : interesting_attrs = NULL;
3557 :
3558 : /*
3559 : * If the page is already full, there is hardly any chance of doing a HOT
3560 : * update on this page. It might be wasteful effort to look for index
3561 : * column updates only to later reject HOT updates for lack of space in
3562 : * the same page. So we be conservative and only fetch hot_attrs if the
3563 : * page is not already full. Since we are already holding a pin on the
3564 : * buffer, there is no chance that the buffer can get cleaned up
3565 : * concurrently and even if that was possible, in the worst case we lose a
3566 : * chance to do a HOT update.
3567 : */
3568 9399 : if (!PageIsFull(page))
3569 : {
3570 6865 : interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3571 6865 : hot_attrs_checked = true;
3572 : }
3573 9399 : interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3574 9399 : interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3575 :
3576 : /*
3577 : * Before locking the buffer, pin the visibility map page if it appears to
3578 : * be necessary. Since we haven't got the lock yet, someone else might be
3579 : * in the middle of changing this, so we'll need to recheck after we have
3580 : * the lock.
3581 : */
3582 9399 : if (PageIsAllVisible(page))
3583 167 : visibilitymap_pin(relation, block, &vmbuffer);
3584 :
3585 9399 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3586 :
3587 9399 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3588 9399 : Assert(ItemIdIsNormal(lp));
3589 :
3590 : /*
3591 : * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3592 : * properly.
3593 : */
3594 9399 : oldtup.t_tableOid = RelationGetRelid(relation);
3595 9399 : oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3596 9399 : oldtup.t_len = ItemIdGetLength(lp);
3597 9399 : oldtup.t_self = *otid;
3598 :
3599 : /* the new tuple is ready, except for this: */
3600 9399 : newtup->t_tableOid = RelationGetRelid(relation);
3601 :
3602 : /* Fill in OID for newtup */
3603 9399 : if (relation->rd_rel->relhasoids)
3604 : {
3605 : #ifdef NOT_USED
3606 : /* this is redundant with an Assert in HeapTupleSetOid */
3607 : Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3608 : #endif
3609 4699 : HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3610 : }
3611 : else
3612 : {
3613 : /* check there is not space for an OID */
3614 4700 : Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3615 : }
3616 :
3617 : /* Determine columns modified by the update. */
3618 9399 : modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3619 : &oldtup, newtup);
3620 :
3621 : /*
3622 : * If we're not updating any "key" column, we can grab a weaker lock type.
3623 : * This allows for more concurrency when we are running simultaneously
3624 : * with foreign key checks.
3625 : *
3626 : * Note that if a column gets detoasted while executing the update, but
3627 : * the value ends up being the same, this test will fail and we will use
3628 : * the stronger lock. This is acceptable; the important case to optimize
3629 : * is updates that don't manipulate key columns, not those that
3630 : * serendipitiously arrive at the same key values.
3631 : */
3632 9399 : if (!bms_overlap(modified_attrs, key_attrs))
3633 : {
3634 8829 : *lockmode = LockTupleNoKeyExclusive;
3635 8829 : mxact_status = MultiXactStatusNoKeyUpdate;
3636 8829 : key_intact = true;
3637 :
3638 : /*
3639 : * If this is the first possibly-multixact-able operation in the
3640 : * current transaction, set my per-backend OldestMemberMXactId
3641 : * setting. We can be certain that the transaction will never become a
3642 : * member of any older MultiXactIds than that. (We have to do this
3643 : * even if we end up just using our own TransactionId below, since
3644 : * some other backend could incorporate our XID into a MultiXact
3645 : * immediately afterwards.)
3646 : */
3647 8829 : MultiXactIdSetOldestMember();
3648 : }
3649 : else
3650 : {
3651 570 : *lockmode = LockTupleExclusive;
3652 570 : mxact_status = MultiXactStatusUpdate;
3653 570 : key_intact = false;
3654 : }
3655 :
3656 : /*
3657 : * Note: beyond this point, use oldtup not otid to refer to old tuple.
3658 : * otid may very well point at newtup->t_self, which we will overwrite
3659 : * with the new tuple's location, so there's great risk of confusion if we
3660 : * use otid anymore.
3661 : */
3662 :
3663 : l2:
3664 9399 : checked_lockers = false;
3665 9399 : locker_remains = false;
3666 9399 : result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3667 :
3668 : /* see below about the "no wait" case */
3669 9399 : Assert(result != HeapTupleBeingUpdated || wait);
3670 :
3671 9399 : if (result == HeapTupleInvisible)
3672 : {
3673 0 : UnlockReleaseBuffer(buffer);
3674 0 : ereport(ERROR,
3675 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3676 : errmsg("attempted to update invisible tuple")));
3677 : }
3678 9399 : else if (result == HeapTupleBeingUpdated && wait)
3679 : {
3680 : TransactionId xwait;
3681 : uint16 infomask;
3682 249 : bool can_continue = false;
3683 :
3684 : /*
3685 : * XXX note that we don't consider the "no wait" case here. This
3686 : * isn't a problem currently because no caller uses that case, but it
3687 : * should be fixed if such a caller is introduced. It wasn't a
3688 : * problem previously because this code would always wait, but now
3689 : * that some tuple locks do not conflict with one of the lock modes we
3690 : * use, it is possible that this case is interesting to handle
3691 : * specially.
3692 : *
3693 : * This may cause failures with third-party code that calls
3694 : * heap_update directly.
3695 : */
3696 :
3697 : /* must copy state data before unlocking buffer */
3698 249 : xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3699 249 : infomask = oldtup.t_data->t_infomask;
3700 :
3701 : /*
3702 : * Now we have to do something about the existing locker. If it's a
3703 : * multi, sleep on it; we might be awakened before it is completely
3704 : * gone (or even not sleep at all in some cases); we need to preserve
3705 : * it as locker, unless it is gone completely.
3706 : *
3707 : * If it's not a multi, we need to check for sleeping conditions
3708 : * before actually going to sleep. If the update doesn't conflict
3709 : * with the locks, we just continue without sleeping (but making sure
3710 : * it is preserved).
3711 : *
3712 : * Before sleeping, we need to acquire tuple lock to establish our
3713 : * priority for the tuple (see heap_lock_tuple). LockTuple will
3714 : * release us when we are next-in-line for the tuple. Note we must
3715 : * not acquire the tuple lock until we're sure we're going to sleep;
3716 : * otherwise we're open for race conditions with other transactions
3717 : * holding the tuple lock which sleep on us.
3718 : *
3719 : * If we are forced to "start over" below, we keep the tuple lock;
3720 : * this arranges that we stay at the head of the line while rechecking
3721 : * tuple state.
3722 : */
3723 249 : if (infomask & HEAP_XMAX_IS_MULTI)
3724 : {
3725 : TransactionId update_xact;
3726 : int remain;
3727 :
3728 0 : if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3729 : *lockmode))
3730 : {
3731 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3732 :
3733 : /* acquire tuple lock, if necessary */
3734 0 : heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3735 : LockWaitBlock, &have_tuple_lock);
3736 :
3737 : /* wait for multixact */
3738 0 : MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3739 : relation, &oldtup.t_self, XLTW_Update,
3740 : &remain);
3741 0 : checked_lockers = true;
3742 0 : locker_remains = remain != 0;
3743 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3744 :
3745 : /*
3746 : * If xwait had just locked the tuple then some other xact
3747 : * could update this tuple before we get to this point. Check
3748 : * for xmax change, and start over if so.
3749 : */
3750 0 : if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3751 0 : infomask) ||
3752 0 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3753 : xwait))
3754 : goto l2;
3755 : }
3756 :
3757 : /*
3758 : * Note that the multixact may not be done by now. It could have
3759 : * surviving members; our own xact or other subxacts of this
3760 : * backend, and also any other concurrent transaction that locked
3761 : * the tuple with KeyShare if we only got TupleLockUpdate. If
3762 : * this is the case, we have to be careful to mark the updated
3763 : * tuple with the surviving members in Xmax.
3764 : *
3765 : * Note that there could have been another update in the
3766 : * MultiXact. In that case, we need to check whether it committed
3767 : * or aborted. If it aborted we are safe to update it again;
3768 : * otherwise there is an update conflict, and we have to return
3769 : * HeapTupleUpdated below.
3770 : *
3771 : * In the LockTupleExclusive case, we still need to preserve the
3772 : * surviving members: those would include the tuple locks we had
3773 : * before this one, which are important to keep in case this
3774 : * subxact aborts.
3775 : */
3776 0 : if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3777 0 : update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3778 : else
3779 0 : update_xact = InvalidTransactionId;
3780 :
3781 : /*
3782 : * There was no UPDATE in the MultiXact; or it aborted. No
3783 : * TransactionIdIsInProgress() call needed here, since we called
3784 : * MultiXactIdWait() above.
3785 : */
3786 0 : if (!TransactionIdIsValid(update_xact) ||
3787 0 : TransactionIdDidAbort(update_xact))
3788 0 : can_continue = true;
3789 : }
3790 249 : else if (TransactionIdIsCurrentTransactionId(xwait))
3791 : {
3792 : /*
3793 : * The only locker is ourselves; we can avoid grabbing the tuple
3794 : * lock here, but must preserve our locking information.
3795 : */
3796 249 : checked_lockers = true;
3797 249 : locker_remains = true;
3798 249 : can_continue = true;
3799 : }
3800 0 : else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3801 : {
3802 : /*
3803 : * If it's just a key-share locker, and we're not changing the key
3804 : * columns, we don't need to wait for it to end; but we need to
3805 : * preserve it as locker.
3806 : */
3807 0 : checked_lockers = true;
3808 0 : locker_remains = true;
3809 0 : can_continue = true;
3810 : }
3811 : else
3812 : {
3813 : /*
3814 : * Wait for regular transaction to end; but first, acquire tuple
3815 : * lock.
3816 : */
3817 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3818 0 : heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3819 : LockWaitBlock, &have_tuple_lock);
3820 0 : XactLockTableWait(xwait, relation, &oldtup.t_self,
3821 : XLTW_Update);
3822 0 : checked_lockers = true;
3823 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3824 :
3825 : /*
3826 : * xwait is done, but if xwait had just locked the tuple then some
3827 : * other xact could update this tuple before we get to this point.
3828 : * Check for xmax change, and start over if so.
3829 : */
3830 0 : if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3831 0 : !TransactionIdEquals(xwait,
3832 : HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3833 : goto l2;
3834 :
3835 : /* Otherwise check if it committed or aborted */
3836 0 : UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3837 0 : if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3838 0 : can_continue = true;
3839 : }
3840 :
3841 249 : result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3842 : }
3843 :
3844 9399 : if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3845 : {
3846 : /* Perform additional check for transaction-snapshot mode RI updates */
3847 0 : if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3848 0 : result = HeapTupleUpdated;
3849 : }
3850 :
3851 9399 : if (result != HeapTupleMayBeUpdated)
3852 : {
3853 11 : Assert(result == HeapTupleSelfUpdated ||
3854 : result == HeapTupleUpdated ||
3855 : result == HeapTupleBeingUpdated);
3856 11 : Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3857 11 : hufd->ctid = oldtup.t_data->t_ctid;
3858 11 : hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3859 11 : if (result == HeapTupleSelfUpdated)
3860 11 : hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3861 : else
3862 0 : hufd->cmax = InvalidCommandId;
3863 11 : UnlockReleaseBuffer(buffer);
3864 11 : if (have_tuple_lock)
3865 0 : UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3866 11 : if (vmbuffer != InvalidBuffer)
3867 0 : ReleaseBuffer(vmbuffer);
3868 11 : bms_free(hot_attrs);
3869 11 : bms_free(key_attrs);
3870 11 : bms_free(id_attrs);
3871 11 : bms_free(modified_attrs);
3872 11 : bms_free(interesting_attrs);
3873 11 : return result;
3874 : }
3875 :
3876 : /*
3877 : * If we didn't pin the visibility map page and the page has become all
3878 : * visible while we were busy locking the buffer, or during some
3879 : * subsequent window during which we had it unlocked, we'll have to unlock
3880 : * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3881 : * bit unfortunate, especially since we'll now have to recheck whether the
3882 : * tuple has been locked or updated under us, but hopefully it won't
3883 : * happen very often.
3884 : */
3885 9388 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3886 : {
3887 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3888 0 : visibilitymap_pin(relation, block, &vmbuffer);
3889 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3890 0 : goto l2;
3891 : }
3892 :
3893 : /* Fill in transaction status data */
3894 :
3895 : /*
3896 : * If the tuple we're updating is locked, we need to preserve the locking
3897 : * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3898 : */
3899 28164 : compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3900 9388 : oldtup.t_data->t_infomask,
3901 9388 : oldtup.t_data->t_infomask2,
3902 : xid, *lockmode, true,
3903 : &xmax_old_tuple, &infomask_old_tuple,
3904 : &infomask2_old_tuple);
3905 :
3906 : /*
3907 : * And also prepare an Xmax value for the new copy of the tuple. If there
3908 : * was no xmax previously, or there was one but all lockers are now gone,
3909 : * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3910 : * rare cases that might also be InvalidXid and yet not have the
3911 : * HEAP_XMAX_INVALID bit set; that's fine.)
3912 : */
3913 9637 : if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3914 249 : HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3915 249 : (checked_lockers && !locker_remains))
3916 9139 : xmax_new_tuple = InvalidTransactionId;
3917 : else
3918 249 : xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3919 :
3920 9388 : if (!TransactionIdIsValid(xmax_new_tuple))
3921 : {
3922 9139 : infomask_new_tuple = HEAP_XMAX_INVALID;
3923 9139 : infomask2_new_tuple = 0;
3924 : }
3925 : else
3926 : {
3927 : /*
3928 : * If we found a valid Xmax for the new tuple, then the infomask bits
3929 : * to use on the new tuple depend on what was there on the old one.
3930 : * Note that since we're doing an update, the only possibility is that
3931 : * the lockers had FOR KEY SHARE lock.
3932 : */
3933 249 : if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3934 : {
3935 0 : GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3936 : &infomask2_new_tuple);
3937 : }
3938 : else
3939 : {
3940 249 : infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3941 249 : infomask2_new_tuple = 0;
3942 : }
3943 : }
3944 :
3945 : /*
3946 : * Prepare the new tuple with the appropriate initial values of Xmin and
3947 : * Xmax, as well as initial infomask bits as computed above.
3948 : */
3949 9388 : newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3950 9388 : newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3951 9388 : HeapTupleHeaderSetXmin(newtup->t_data, xid);
3952 9388 : HeapTupleHeaderSetCmin(newtup->t_data, cid);
3953 9388 : newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3954 9388 : newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3955 9388 : HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3956 :
3957 : /*
3958 : * Replace cid with a combo cid if necessary. Note that we already put
3959 : * the plain cid into the new tuple.
3960 : */
3961 9388 : HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3962 :
3963 : /*
3964 : * If the toaster needs to be activated, OR if the new tuple will not fit
3965 : * on the same page as the old, then we need to release the content lock
3966 : * (but not the pin!) on the old tuple's buffer while we are off doing
3967 : * TOAST and/or table-file-extension work. We must mark the old tuple to
3968 : * show that it's locked, else other processes may try to update it
3969 : * themselves.
3970 : *
3971 : * We need to invoke the toaster if there are already any out-of-line
3972 : * toasted values present, or if the new tuple is over-threshold.
3973 : */
3974 9388 : if (relation->rd_rel->relkind != RELKIND_RELATION &&
3975 0 : relation->rd_rel->relkind != RELKIND_MATVIEW)
3976 : {
3977 : /* toast table entries should never be recursively toasted */
3978 0 : Assert(!HeapTupleHasExternal(&oldtup));
3979 0 : Assert(!HeapTupleHasExternal(newtup));
3980 0 : need_toast = false;
3981 : }
3982 : else
3983 28143 : need_toast = (HeapTupleHasExternal(&oldtup) ||
3984 18747 : HeapTupleHasExternal(newtup) ||
3985 9359 : newtup->t_len > TOAST_TUPLE_THRESHOLD);
3986 :
3987 9388 : pagefree = PageGetHeapFreeSpace(page);
3988 :
3989 9388 : newtupsize = MAXALIGN(newtup->t_len);
3990 :
3991 9388 : if (need_toast || newtupsize > pagefree)
3992 2895 : {
3993 : TransactionId xmax_lock_old_tuple;
3994 : uint16 infomask_lock_old_tuple,
3995 : infomask2_lock_old_tuple;
3996 2895 : bool cleared_all_frozen = false;
3997 :
3998 : /*
3999 : * To prevent concurrent sessions from updating the tuple, we have to
4000 : * temporarily mark it locked, while we release the page-level lock.
4001 : *
4002 : * To satisfy the rule that any xid potentially appearing in a buffer
4003 : * written out to disk, we unfortunately have to WAL log this
4004 : * temporary modification. We can reuse xl_heap_lock for this
4005 : * purpose. If we crash/error before following through with the
4006 : * actual update, xmax will be of an aborted transaction, allowing
4007 : * other sessions to proceed.
4008 : */
4009 :
4010 : /*
4011 : * Compute xmax / infomask appropriate for locking the tuple. This has
4012 : * to be done separately from the combo that's going to be used for
4013 : * updating, because the potentially created multixact would otherwise
4014 : * be wrong.
4015 : */
4016 8685 : compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4017 2895 : oldtup.t_data->t_infomask,
4018 2895 : oldtup.t_data->t_infomask2,
4019 : xid, *lockmode, false,
4020 : &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4021 : &infomask2_lock_old_tuple);
4022 :
4023 2895 : Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4024 :
4025 2895 : START_CRIT_SECTION();
4026 :
4027 : /* Clear obsolete visibility flags ... */
4028 2895 : oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4029 2895 : oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4030 2895 : HeapTupleClearHotUpdated(&oldtup);
4031 : /* ... and store info about transaction updating this tuple */
4032 2895 : Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4033 2895 : HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4034 2895 : oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4035 2895 : oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4036 2895 : HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4037 :
4038 : /* temporarily make it look not-updated, but locked */
4039 2895 : oldtup.t_data->t_ctid = oldtup.t_self;
4040 :
4041 : /*
4042 : * Clear all-frozen bit on visibility map if needed. We could
4043 : * immediately reset ALL_VISIBLE, but given that the WAL logging
4044 : * overhead would be unchanged, that doesn't seem necessarily
4045 : * worthwhile.
4046 : */
4047 2955 : if (PageIsAllVisible(BufferGetPage(buffer)) &&
4048 60 : visibilitymap_clear(relation, block, vmbuffer,
4049 : VISIBILITYMAP_ALL_FROZEN))
4050 16 : cleared_all_frozen = true;
4051 :
4052 2895 : MarkBufferDirty(buffer);
4053 :
4054 2895 : if (RelationNeedsWAL(relation))
4055 : {
4056 : xl_heap_lock xlrec;
4057 : XLogRecPtr recptr;
4058 :
4059 2890 : XLogBeginInsert();
4060 2890 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4061 :
4062 2890 : xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4063 2890 : xlrec.locking_xid = xmax_lock_old_tuple;
4064 2890 : xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4065 2890 : oldtup.t_data->t_infomask2);
4066 2890 : xlrec.flags =
4067 2890 : cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4068 2890 : XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4069 2890 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4070 2890 : PageSetLSN(page, recptr);
4071 : }
4072 :
4073 2895 : END_CRIT_SECTION();
4074 :
4075 2895 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4076 :
4077 : /*
4078 : * Let the toaster do its thing, if needed.
4079 : *
4080 : * Note: below this point, heaptup is the data we actually intend to
4081 : * store into the relation; newtup is the caller's original untoasted
4082 : * data.
4083 : */
4084 2895 : if (need_toast)
4085 : {
4086 : /* Note we always use WAL and FSM during updates */
4087 69 : heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4088 69 : newtupsize = MAXALIGN(heaptup->t_len);
4089 : }
4090 : else
4091 2826 : heaptup = newtup;
4092 :
4093 : /*
4094 : * Now, do we need a new page for the tuple, or not? This is a bit
4095 : * tricky since someone else could have added tuples to the page while
4096 : * we weren't looking. We have to recheck the available space after
4097 : * reacquiring the buffer lock. But don't bother to do that if the
4098 : * former amount of free space is still not enough; it's unlikely
4099 : * there's more free now than before.
4100 : *
4101 : * What's more, if we need to get a new page, we will need to acquire
4102 : * buffer locks on both old and new pages. To avoid deadlock against
4103 : * some other backend trying to get the same two locks in the other
4104 : * order, we must be consistent about the order we get the locks in.
4105 : * We use the rule "lock the lower-numbered page of the relation
4106 : * first". To implement this, we must do RelationGetBufferForTuple
4107 : * while not holding the lock on the old page, and we must rely on it
4108 : * to get the locks on both pages in the correct order.
4109 : */
4110 2895 : if (newtupsize > pagefree)
4111 : {
4112 : /* Assume there's no chance to put heaptup on same page. */
4113 2850 : newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4114 : buffer, 0, NULL,
4115 : &vmbuffer_new, &vmbuffer);
4116 : }
4117 : else
4118 : {
4119 : /* Re-acquire the lock on the old tuple's page. */
4120 45 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4121 : /* Re-check using the up-to-date free space */
4122 45 : pagefree = PageGetHeapFreeSpace(page);
4123 45 : if (newtupsize > pagefree)
4124 : {
4125 : /*
4126 : * Rats, it doesn't fit anymore. We must now unlock and
4127 : * relock to avoid deadlock. Fortunately, this path should
4128 : * seldom be taken.
4129 : */
4130 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4131 0 : newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4132 : buffer, 0, NULL,
4133 : &vmbuffer_new, &vmbuffer);
4134 : }
4135 : else
4136 : {
4137 : /* OK, it fits here, so we're done. */
4138 45 : newbuf = buffer;
4139 : }
4140 : }
4141 : }
4142 : else
4143 : {
4144 : /* No TOAST work needed, and it'll fit on same page */
4145 6493 : newbuf = buffer;
4146 6493 : heaptup = newtup;
4147 : }
4148 :
4149 : /*
4150 : * We're about to do the actual update -- check for conflict first, to
4151 : * avoid possibly having to roll back work we've just done.
4152 : *
4153 : * This is safe without a recheck as long as there is no possibility of
4154 : * another process scanning the pages between this check and the update
4155 : * being visible to the scan (i.e., exclusive buffer content lock(s) are
4156 : * continuously held from this point until the tuple update is visible).
4157 : *
4158 : * For the new tuple the only check needed is at the relation level, but
4159 : * since both tuples are in the same relation and the check for oldtup
4160 : * will include checking the relation level, there is no benefit to a
4161 : * separate check for the new tuple.
4162 : */
4163 9388 : CheckForSerializableConflictIn(relation, &oldtup, buffer);
4164 :
4165 : /*
4166 : * At this point newbuf and buffer are both pinned and locked, and newbuf
4167 : * has enough space for the new tuple. If they are the same buffer, only
4168 : * one pin is held.
4169 : */
4170 :
4171 9388 : if (newbuf == buffer)
4172 : {
4173 : /*
4174 : * Since the new tuple is going into the same page, we might be able
4175 : * to do a HOT update. Check if any of the index columns have been
4176 : * changed. If the page was already full, we may have skipped checking
4177 : * for index columns. If so, HOT update is possible.
4178 : */
4179 6538 : if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
4180 5215 : use_hot_update = true;
4181 : }
4182 : else
4183 : {
4184 : /* Set a hint that the old page could use prune/defrag */
4185 2850 : PageSetFull(page);
4186 : }
4187 :
4188 : /*
4189 : * Compute replica identity tuple before entering the critical section so
4190 : * we don't PANIC upon a memory allocation failure.
4191 : * ExtractReplicaIdentity() will return NULL if nothing needs to be
4192 : * logged.
4193 : */
4194 9388 : old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4195 9388 : bms_overlap(modified_attrs, id_attrs),
4196 : &old_key_copied);
4197 :
4198 : /* NO EREPORT(ERROR) from here till changes are logged */
4199 9388 : START_CRIT_SECTION();
4200 :
4201 : /*
4202 : * If this transaction commits, the old tuple will become DEAD sooner or
4203 : * later. Set flag that this page is a candidate for pruning once our xid
4204 : * falls below the OldestXmin horizon. If the transaction finally aborts,
4205 : * the subsequent page pruning will be a no-op and the hint will be
4206 : * cleared.
4207 : *
4208 : * XXX Should we set hint on newbuf as well? If the transaction aborts,
4209 : * there would be a prunable tuple in the newbuf; but for now we choose
4210 : * not to optimize for aborts. Note that heap_xlog_update must be kept in
4211 : * sync if this decision changes.
4212 : */
4213 9388 : PageSetPrunable(page, xid);
4214 :
4215 9388 : if (use_hot_update)
4216 : {
4217 : /* Mark the old tuple as HOT-updated */
4218 5215 : HeapTupleSetHotUpdated(&oldtup);
4219 : /* And mark the new tuple as heap-only */
4220 5215 : HeapTupleSetHeapOnly(heaptup);
4221 : /* Mark the caller's copy too, in case different from heaptup */
4222 5215 : HeapTupleSetHeapOnly(newtup);
4223 : }
4224 : else
4225 : {
4226 : /* Make sure tuples are correctly marked as not-HOT */
4227 4173 : HeapTupleClearHotUpdated(&oldtup);
4228 4173 : HeapTupleClearHeapOnly(heaptup);
4229 4173 : HeapTupleClearHeapOnly(newtup);
4230 : }
4231 :
4232 9388 : RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4233 :
4234 :
4235 : /* Clear obsolete visibility flags, possibly set by ourselves above... */
4236 9388 : oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4237 9388 : oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4238 : /* ... and store info about transaction updating this tuple */
4239 9388 : Assert(TransactionIdIsValid(xmax_old_tuple));
4240 9388 : HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4241 9388 : oldtup.t_data->t_infomask |= infomask_old_tuple;
4242 9388 : oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4243 9388 : HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4244 :
4245 : /* record address of new tuple in t_ctid of old one */
4246 9388 : oldtup.t_data->t_ctid = heaptup->t_self;
4247 :
4248 : /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4249 9388 : if (PageIsAllVisible(BufferGetPage(buffer)))
4250 : {
4251 167 : all_visible_cleared = true;
4252 167 : PageClearAllVisible(BufferGetPage(buffer));
4253 167 : visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4254 : vmbuffer, VISIBILITYMAP_VALID_BITS);
4255 : }
4256 9388 : if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4257 : {
4258 51 : all_visible_cleared_new = true;
4259 51 : PageClearAllVisible(BufferGetPage(newbuf));
4260 51 : visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4261 : vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4262 : }
4263 :
4264 9388 : if (newbuf != buffer)
4265 2850 : MarkBufferDirty(newbuf);
4266 9388 : MarkBufferDirty(buffer);
4267 :
4268 : /* XLOG stuff */
4269 9388 : if (RelationNeedsWAL(relation))
4270 : {
4271 : XLogRecPtr recptr;
4272 :
4273 : /*
4274 : * For logical decoding we need combocids to properly decode the
4275 : * catalog.
4276 : */
4277 9067 : if (RelationIsAccessibleInLogicalDecoding(relation))
4278 : {
4279 0 : log_heap_new_cid(relation, &oldtup);
4280 0 : log_heap_new_cid(relation, heaptup);
4281 : }
4282 :
4283 9067 : recptr = log_heap_update(relation, buffer,
4284 : newbuf, &oldtup, heaptup,
4285 : old_key_tuple,
4286 : all_visible_cleared,
4287 : all_visible_cleared_new);
4288 9067 : if (newbuf != buffer)
4289 : {
4290 2845 : PageSetLSN(BufferGetPage(newbuf), recptr);
4291 : }
4292 9067 : PageSetLSN(BufferGetPage(buffer), recptr);
4293 : }
4294 :
4295 9388 : END_CRIT_SECTION();
4296 :
4297 9388 : if (newbuf != buffer)
4298 2850 : LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4299 9388 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4300 :
4301 : /*
4302 : * Mark old tuple for invalidation from system caches at next command
4303 : * boundary, and mark the new tuple for invalidation in case we abort. We
4304 : * have to do this before releasing the buffer because oldtup is in the
4305 : * buffer. (heaptup is all in local memory, but it's necessary to process
4306 : * both tuple versions in one call to inval.c so we can avoid redundant
4307 : * sinval messages.)
4308 : */
4309 9388 : CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4310 :
4311 : /* Now we can release the buffer(s) */
4312 9388 : if (newbuf != buffer)
4313 2850 : ReleaseBuffer(newbuf);
4314 9388 : ReleaseBuffer(buffer);
4315 9388 : if (BufferIsValid(vmbuffer_new))
4316 51 : ReleaseBuffer(vmbuffer_new);
4317 9388 : if (BufferIsValid(vmbuffer))
4318 167 : ReleaseBuffer(vmbuffer);
4319 :
4320 : /*
4321 : * Release the lmgr tuple lock, if we had it.
4322 : */
4323 9388 : if (have_tuple_lock)
4324 0 : UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4325 :
4326 9388 : pgstat_count_heap_update(relation, use_hot_update);
4327 :
4328 : /*
4329 : * If heaptup is a private copy, release it. Don't forget to copy t_self
4330 : * back to the caller's image, too.
4331 : */
4332 9388 : if (heaptup != newtup)
4333 : {
4334 63 : newtup->t_self = heaptup->t_self;
4335 63 : heap_freetuple(heaptup);
4336 : }
4337 :
4338 9388 : if (old_key_tuple != NULL && old_key_copied)
4339 0 : heap_freetuple(old_key_tuple);
4340 :
4341 9388 : bms_free(hot_attrs);
4342 9388 : bms_free(key_attrs);
4343 9388 : bms_free(id_attrs);
4344 9388 : bms_free(modified_attrs);
4345 9388 : bms_free(interesting_attrs);
4346 :
4347 9388 : return HeapTupleMayBeUpdated;
4348 : }
4349 :
4350 : /*
4351 : * Check if the specified attribute's value is same in both given tuples.
4352 : * Subroutine for HeapDetermineModifiedColumns.
4353 : */
4354 : static bool
4355 32715 : heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4356 : HeapTuple tup1, HeapTuple tup2)
4357 : {
4358 : Datum value1,
4359 : value2;
4360 : bool isnull1,
4361 : isnull2;
4362 : Form_pg_attribute att;
4363 :
4364 : /*
4365 : * If it's a whole-tuple reference, say "not equal". It's not really
4366 : * worth supporting this case, since it could only succeed after a no-op
4367 : * update, which is hardly a case worth optimizing for.
4368 : */
4369 32715 : if (attrnum == 0)
4370 0 : return false;
4371 :
4372 : /*
4373 : * Likewise, automatically say "not equal" for any system attribute other
4374 : * than OID and tableOID; we cannot expect these to be consistent in a HOT
4375 : * chain, or even to be set correctly yet in the new tuple.
4376 : */
4377 32715 : if (attrnum < 0)
4378 : {
4379 4560 : if (attrnum != ObjectIdAttributeNumber &&
4380 : attrnum != TableOidAttributeNumber)
4381 0 : return false;
4382 : }
4383 :
4384 : /*
4385 : * Extract the corresponding values. XXX this is pretty inefficient if
4386 : * there are many indexed columns. Should HeapDetermineModifiedColumns do
4387 : * a single heap_deform_tuple call on each tuple, instead? But that
4388 : * doesn't work for system columns ...
4389 : */
4390 32715 : value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4391 32715 : value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4392 :
4393 : /*
4394 : * If one value is NULL and other is not, then they are certainly not
4395 : * equal
4396 : */
4397 32715 : if (isnull1 != isnull2)
4398 0 : return false;
4399 :
4400 : /*
4401 : * If both are NULL, they can be considered equal.
4402 : */
4403 32715 : if (isnull1)
4404 636 : return true;
4405 :
4406 : /*
4407 : * We do simple binary comparison of the two datums. This may be overly
4408 : * strict because there can be multiple binary representations for the
4409 : * same logical value. But we should be OK as long as there are no false
4410 : * positives. Using a type-specific equality operator is messy because
4411 : * there could be multiple notions of equality in different operator
4412 : * classes; furthermore, we cannot safely invoke user-defined functions
4413 : * while holding exclusive buffer lock.
4414 : */
4415 32079 : if (attrnum <= 0)
4416 : {
4417 : /* The only allowed system columns are OIDs, so do this */
4418 4560 : return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4419 : }
4420 : else
4421 : {
4422 27519 : Assert(attrnum <= tupdesc->natts);
4423 27519 : att = TupleDescAttr(tupdesc, attrnum - 1);
4424 27519 : return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4425 : }
4426 : }
4427 :
4428 : /*
4429 : * Check which columns are being updated.
4430 : *
4431 : * Given an updated tuple, determine (and return into the output bitmapset),
4432 : * from those listed as interesting, the set of columns that changed.
4433 : *
4434 : * The input bitmapset is destructively modified; that is OK since this is
4435 : * invoked at most once in heap_update.
4436 : */
4437 : static Bitmapset *
4438 9399 : HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4439 : HeapTuple oldtup, HeapTuple newtup)
4440 : {
4441 : int attnum;
4442 9399 : Bitmapset *modified = NULL;
4443 :
4444 51513 : while ((attnum = bms_first_member(interesting_cols)) >= 0)
4445 : {
4446 32715 : attnum += FirstLowInvalidHeapAttributeNumber;
4447 :
4448 32715 : if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4449 : attnum, oldtup, newtup))
4450 1466 : modified = bms_add_member(modified,
4451 : attnum - FirstLowInvalidHeapAttributeNumber);
4452 : }
4453 :
4454 9399 : return modified;
4455 : }
4456 :
4457 : /*
4458 : * simple_heap_update - replace a tuple
4459 : *
4460 : * This routine may be used to update a tuple when concurrent updates of
4461 : * the target tuple are not expected (for example, because we have a lock
4462 : * on the relation associated with the tuple). Any failure is reported
4463 : * via ereport().
4464 : */
4465 : void
4466 5880 : simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4467 : {
4468 : HTSU_Result result;
4469 : HeapUpdateFailureData hufd;
4470 : LockTupleMode lockmode;
4471 :
4472 5880 : result = heap_update(relation, otid, tup,
4473 : GetCurrentCommandId(true), InvalidSnapshot,
4474 : true /* wait for commit */ ,
4475 : &hufd, &lockmode);
4476 5880 : switch (result)
4477 : {
4478 : case HeapTupleSelfUpdated:
4479 : /* Tuple was already updated in current command? */
4480 0 : elog(ERROR, "tuple already updated by self");
4481 : break;
4482 :
4483 : case HeapTupleMayBeUpdated:
4484 : /* done successfully */
4485 5880 : break;
4486 :
4487 : case HeapTupleUpdated:
4488 0 : elog(ERROR, "tuple concurrently updated");
4489 : break;
4490 :
4491 : default:
4492 0 : elog(ERROR, "unrecognized heap_update status: %u", result);
4493 : break;
4494 : }
4495 5880 : }
4496 :
4497 :
4498 : /*
4499 : * Return the MultiXactStatus corresponding to the given tuple lock mode.
4500 : */
4501 : static MultiXactStatus
4502 3 : get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4503 : {
4504 : int retval;
4505 :
4506 3 : if (is_update)
4507 1 : retval = tupleLockExtraInfo[mode].updstatus;
4508 : else
4509 2 : retval = tupleLockExtraInfo[mode].lockstatus;
4510 :
4511 3 : if (retval == -1)
4512 0 : elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4513 : is_update ? "true" : "false");
4514 :
4515 3 : return (MultiXactStatus) retval;
4516 : }
4517 :
4518 : /*
4519 : * heap_lock_tuple - lock a tuple in shared or exclusive mode
4520 : *
4521 : * Note that this acquires a buffer pin, which the caller must release.
4522 : *
4523 : * Input parameters:
4524 : * relation: relation containing tuple (caller must hold suitable lock)
4525 : * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4526 : * cid: current command ID (used for visibility test, and stored into
4527 : * tuple's cmax if lock is successful)
4528 : * mode: indicates if shared or exclusive tuple lock is desired
4529 : * wait_policy: what to do if tuple lock is not available
4530 : * follow_updates: if true, follow the update chain to also lock descendant
4531 : * tuples.
4532 : *
4533 : * Output parameters:
4534 : * *tuple: all fields filled in
4535 : * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4536 : * *hufd: filled in failure cases (see below)
4537 : *
4538 : * Function result may be:
4539 : * HeapTupleMayBeUpdated: lock was successfully acquired
4540 : * HeapTupleInvisible: lock failed because tuple was never visible to us
4541 : * HeapTupleSelfUpdated: lock failed because tuple updated by self
4542 : * HeapTupleUpdated: lock failed because tuple updated by other xact
4543 : * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4544 : *
4545 : * In the failure cases other than HeapTupleInvisible, the routine fills
4546 : * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4547 : * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4548 : * since we cannot obtain cmax from a combocid generated by another
4549 : * transaction).
4550 : * See comments for struct HeapUpdateFailureData for additional info.
4551 : *
4552 : * See README.tuplock for a thorough explanation of this mechanism.
4553 : */
4554 : HTSU_Result
4555 542 : heap_lock_tuple(Relation relation, HeapTuple tuple,
4556 : CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4557 : bool follow_updates,
4558 : Buffer *buffer, HeapUpdateFailureData *hufd)
4559 : {
4560 : HTSU_Result result;
4561 542 : ItemPointer tid = &(tuple->t_self);
4562 : ItemId lp;
4563 : Page page;
4564 542 : Buffer vmbuffer = InvalidBuffer;
4565 : BlockNumber block;
4566 : TransactionId xid,
4567 : xmax;
4568 : uint16 old_infomask,
4569 : new_infomask,
4570 : new_infomask2;
4571 542 : bool first_time = true;
4572 542 : bool have_tuple_lock = false;
4573 542 : bool cleared_all_frozen = false;
4574 :
4575 542 : *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4576 542 : block = ItemPointerGetBlockNumber(tid);
4577 :
4578 : /*
4579 : * Before locking the buffer, pin the visibility map page if it appears to
4580 : * be necessary. Since we haven't got the lock yet, someone else might be
4581 : * in the middle of changing this, so we'll need to recheck after we have
4582 : * the lock.
4583 : */
4584 542 : if (PageIsAllVisible(BufferGetPage(*buffer)))
4585 1 : visibilitymap_pin(relation, block, &vmbuffer);
4586 :
4587 542 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4588 :
4589 542 : page = BufferGetPage(*buffer);
4590 542 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4591 542 : Assert(ItemIdIsNormal(lp));
4592 :
4593 542 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4594 542 : tuple->t_len = ItemIdGetLength(lp);
4595 542 : tuple->t_tableOid = RelationGetRelid(relation);
4596 :
4597 : l3:
4598 542 : result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4599 :
4600 542 : if (result == HeapTupleInvisible)
4601 : {
4602 : /*
4603 : * This is possible, but only when locking a tuple for ON CONFLICT
4604 : * UPDATE. We return this value here rather than throwing an error in
4605 : * order to give that case the opportunity to throw a more specific
4606 : * error.
4607 : */
4608 4 : result = HeapTupleInvisible;
4609 4 : goto out_locked;
4610 : }
4611 538 : else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4612 : {
4613 : TransactionId xwait;
4614 : uint16 infomask;
4615 : uint16 infomask2;
4616 : bool require_sleep;
4617 : ItemPointerData t_ctid;
4618 :
4619 : /* must copy state data before unlocking buffer */
4620 35 : xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4621 35 : infomask = tuple->t_data->t_infomask;
4622 35 : infomask2 = tuple->t_data->t_infomask2;
4623 35 : ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4624 :
4625 35 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4626 :
4627 : /*
4628 : * If any subtransaction of the current top transaction already holds
4629 : * a lock as strong as or stronger than what we're requesting, we
4630 : * effectively hold the desired lock already. We *must* succeed
4631 : * without trying to take the tuple lock, else we will deadlock
4632 : * against anyone wanting to acquire a stronger lock.
4633 : *
4634 : * Note we only do this the first time we loop on the HTSU result;
4635 : * there is no point in testing in subsequent passes, because
4636 : * evidently our own transaction cannot have acquired a new lock after
4637 : * the first time we checked.
4638 : */
4639 35 : if (first_time)
4640 : {
4641 35 : first_time = false;
4642 :
4643 35 : if (infomask & HEAP_XMAX_IS_MULTI)
4644 : {
4645 : int i;
4646 : int nmembers;
4647 : MultiXactMember *members;
4648 :
4649 : /*
4650 : * We don't need to allow old multixacts here; if that had
4651 : * been the case, HeapTupleSatisfiesUpdate would have returned
4652 : * MayBeUpdated and we wouldn't be here.
4653 : */
4654 1 : nmembers =
4655 1 : GetMultiXactIdMembers(xwait, &members, false,
4656 1 : HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4657 :
4658 3 : for (i = 0; i < nmembers; i++)
4659 : {
4660 : /* only consider members of our own transaction */
4661 2 : if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4662 1 : continue;
4663 :
4664 1 : if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4665 : {
4666 0 : pfree(members);
4667 0 : result = HeapTupleMayBeUpdated;
4668 0 : goto out_unlocked;
4669 : }
4670 : }
4671 :
4672 1 : if (members)
4673 1 : pfree(members);
4674 : }
4675 34 : else if (TransactionIdIsCurrentTransactionId(xwait))
4676 : {
4677 34 : switch (mode)
4678 : {
4679 : case LockTupleKeyShare:
4680 26 : Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4681 : HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4682 : HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4683 26 : result = HeapTupleMayBeUpdated;
4684 26 : goto out_unlocked;
4685 : case LockTupleShare:
4686 0 : if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4687 0 : HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4688 : {
4689 0 : result = HeapTupleMayBeUpdated;
4690 0 : goto out_unlocked;
4691 : }
4692 0 : break;
4693 : case LockTupleNoKeyExclusive:
4694 6 : if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4695 : {
4696 6 : result = HeapTupleMayBeUpdated;
4697 6 : goto out_unlocked;
4698 : }
4699 0 : break;
4700 : case LockTupleExclusive:
4701 3 : if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4702 1 : infomask2 & HEAP_KEYS_UPDATED)
4703 : {
4704 1 : result = HeapTupleMayBeUpdated;
4705 1 : goto out_unlocked;
4706 : }
4707 1 : break;
4708 : }
4709 : }
4710 : }
4711 :
4712 : /*
4713 : * Initially assume that we will have to wait for the locking
4714 : * transaction(s) to finish. We check various cases below in which
4715 : * this can be turned off.
4716 : */
4717 2 : require_sleep = true;
4718 2 : if (mode == LockTupleKeyShare)
4719 : {
4720 : /*
4721 : * If we're requesting KeyShare, and there's no update present, we
4722 : * don't need to wait. Even if there is an update, we can still
4723 : * continue if the key hasn't been modified.
4724 : *
4725 : * However, if there are updates, we need to walk the update chain
4726 : * to mark future versions of the row as locked, too. That way,
4727 : * if somebody deletes that future version, we're protected
4728 : * against the key going away. This locking of future versions
4729 : * could block momentarily, if a concurrent transaction is
4730 : * deleting a key; or it could return a value to the effect that
4731 : * the transaction deleting the key has already committed. So we
4732 : * do this before re-locking the buffer; otherwise this would be
4733 : * prone to deadlocks.
4734 : *
4735 : * Note that the TID we're locking was grabbed before we unlocked
4736 : * the buffer. For it to change while we're not looking, the
4737 : * other properties we're testing for below after re-locking the
4738 : * buffer would also change, in which case we would restart this
4739 : * loop above.
4740 : */
4741 0 : if (!(infomask2 & HEAP_KEYS_UPDATED))
4742 : {
4743 : bool updated;
4744 :
4745 0 : updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4746 :
4747 : /*
4748 : * If there are updates, follow the update chain; bail out if
4749 : * that cannot be done.
4750 : */
4751 0 : if (follow_updates && updated)
4752 : {
4753 : HTSU_Result res;
4754 :
4755 0 : res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4756 : GetCurrentTransactionId(),
4757 : mode);
4758 0 : if (res != HeapTupleMayBeUpdated)
4759 : {
4760 0 : result = res;
4761 : /* recovery code expects to have buffer lock held */
4762 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4763 0 : goto failed;
4764 : }
4765 : }
4766 :
4767 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4768 :
4769 : /*
4770 : * Make sure it's still an appropriate lock, else start over.
4771 : * Also, if it wasn't updated before we released the lock, but
4772 : * is updated now, we start over too; the reason is that we
4773 : * now need to follow the update chain to lock the new
4774 : * versions.
4775 : */
4776 0 : if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4777 0 : ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4778 : !updated))
4779 : goto l3;
4780 :
4781 : /* Things look okay, so we can skip sleeping */
4782 0 : require_sleep = false;
4783 :
4784 : /*
4785 : * Note we allow Xmax to change here; other updaters/lockers
4786 : * could have modified it before we grabbed the buffer lock.
4787 : * However, this is not a problem, because with the recheck we
4788 : * just did we ensure that they still don't conflict with the
4789 : * lock we want.
4790 : */
4791 : }
4792 : }
4793 2 : else if (mode == LockTupleShare)
4794 : {
4795 : /*
4796 : * If we're requesting Share, we can similarly avoid sleeping if
4797 : * there's no update and no exclusive lock present.
4798 : */
4799 0 : if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4800 0 : !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4801 : {
4802 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4803 :
4804 : /*
4805 : * Make sure it's still an appropriate lock, else start over.
4806 : * See above about allowing xmax to change.
4807 : */
4808 0 : if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4809 0 : HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4810 : goto l3;
4811 0 : require_sleep = false;
4812 : }
4813 : }
4814 2 : else if (mode == LockTupleNoKeyExclusive)
4815 : {
4816 : /*
4817 : * If we're requesting NoKeyExclusive, we might also be able to
4818 : * avoid sleeping; just ensure that there no conflicting lock
4819 : * already acquired.
4820 : */
4821 0 : if (infomask & HEAP_XMAX_IS_MULTI)
4822 : {
4823 0 : if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4824 : mode))
4825 : {
4826 : /*
4827 : * No conflict, but if the xmax changed under us in the
4828 : * meantime, start over.
4829 : */
4830 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4831 0 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4832 0 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4833 : xwait))
4834 : goto l3;
4835 :
4836 : /* otherwise, we're good */
4837 0 : require_sleep = false;
4838 : }
4839 : }
4840 0 : else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4841 : {
4842 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4843 :
4844 : /* if the xmax changed in the meantime, start over */
4845 0 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4846 0 : !TransactionIdEquals(
4847 : HeapTupleHeaderGetRawXmax(tuple->t_data),
4848 : xwait))
4849 : goto l3;
4850 : /* otherwise, we're good */
4851 0 : require_sleep = false;
4852 : }
4853 : }
4854 :
4855 : /*
4856 : * As a check independent from those above, we can also avoid sleeping
4857 : * if the current transaction is the sole locker of the tuple. Note
4858 : * that the strength of the lock already held is irrelevant; this is
4859 : * not about recording the lock in Xmax (which will be done regardless
4860 : * of this optimization, below). Also, note that the cases where we
4861 : * hold a lock stronger than we are requesting are already handled
4862 : * above by not doing anything.
4863 : *
4864 : * Note we only deal with the non-multixact case here; MultiXactIdWait
4865 : * is well equipped to deal with this situation on its own.
4866 : */
4867 3 : if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4868 1 : TransactionIdIsCurrentTransactionId(xwait))
4869 : {
4870 : /* ... but if the xmax changed in the meantime, start over */
4871 1 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4872 2 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4873 1 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4874 : xwait))
4875 : goto l3;
4876 1 : Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4877 1 : require_sleep = false;
4878 : }
4879 :
4880 : /*
4881 : * Time to sleep on the other transaction/multixact, if necessary.
4882 : *
4883 : * If the other transaction is an update that's already committed,
4884 : * then sleeping cannot possibly do any good: if we're required to
4885 : * sleep, get out to raise an error instead.
4886 : *
4887 : * By here, we either have already acquired the buffer exclusive lock,
4888 : * or we must wait for the locking transaction or multixact; so below
4889 : * we ensure that we grab buffer lock after the sleep.
4890 : */
4891 2 : if (require_sleep && result == HeapTupleUpdated)
4892 : {
4893 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4894 0 : goto failed;
4895 : }
4896 2 : else if (require_sleep)
4897 : {
4898 : /*
4899 : * Acquire tuple lock to establish our priority for the tuple, or
4900 : * die trying. LockTuple will release us when we are next-in-line
4901 : * for the tuple. We must do this even if we are share-locking.
4902 : *
4903 : * If we are forced to "start over" below, we keep the tuple lock;
4904 : * this arranges that we stay at the head of the line while
4905 : * rechecking tuple state.
4906 : */
4907 1 : if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4908 : &have_tuple_lock))
4909 : {
4910 : /*
4911 : * This can only happen if wait_policy is Skip and the lock
4912 : * couldn't be obtained.
4913 : */
4914 0 : result = HeapTupleWouldBlock;
4915 : /* recovery code expects to have buffer lock held */
4916 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4917 0 : goto failed;
4918 : }
4919 :
4920 1 : if (infomask & HEAP_XMAX_IS_MULTI)
4921 : {
4922 1 : MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4923 :
4924 : /* We only ever lock tuples, never update them */
4925 1 : if (status >= MultiXactStatusNoKeyUpdate)
4926 0 : elog(ERROR, "invalid lock mode in heap_lock_tuple");
4927 :
4928 : /* wait for multixact to end, or die trying */
4929 1 : switch (wait_policy)
4930 : {
4931 : case LockWaitBlock:
4932 1 : MultiXactIdWait((MultiXactId) xwait, status, infomask,
4933 : relation, &tuple->t_self, XLTW_Lock, NULL);
4934 1 : break;
4935 : case LockWaitSkip:
4936 0 : if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4937 : status, infomask, relation,
4938 : NULL))
4939 : {
4940 0 : result = HeapTupleWouldBlock;
4941 : /* recovery code expects to have buffer lock held */
4942 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4943 0 : goto failed;
4944 : }
4945 0 : break;
4946 : case LockWaitError:
4947 0 : if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4948 : status, infomask, relation,
4949 : NULL))
4950 0 : ereport(ERROR,
4951 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4952 : errmsg("could not obtain lock on row in relation \"%s\"",
4953 : RelationGetRelationName(relation))));
4954 :
4955 0 : break;
4956 : }
4957 :
4958 : /*
4959 : * Of course, the multixact might not be done here: if we're
4960 : * requesting a light lock mode, other transactions with light
4961 : * locks could still be alive, as well as locks owned by our
4962 : * own xact or other subxacts of this backend. We need to
4963 : * preserve the surviving MultiXact members. Note that it
4964 : * isn't absolutely necessary in the latter case, but doing so
4965 : * is simpler.
4966 : */
4967 : }
4968 : else
4969 : {
4970 : /* wait for regular transaction to end, or die trying */
4971 0 : switch (wait_policy)
4972 : {
4973 : case LockWaitBlock:
4974 0 : XactLockTableWait(xwait, relation, &tuple->t_self,
4975 : XLTW_Lock);
4976 0 : break;
4977 : case LockWaitSkip:
4978 0 : if (!ConditionalXactLockTableWait(xwait))
4979 : {
4980 0 : result = HeapTupleWouldBlock;
4981 : /* recovery code expects to have buffer lock held */
4982 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4983 0 : goto failed;
4984 : }
4985 0 : break;
4986 : case LockWaitError:
4987 0 : if (!ConditionalXactLockTableWait(xwait))
4988 0 : ereport(ERROR,
4989 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4990 : errmsg("could not obtain lock on row in relation \"%s\"",
4991 : RelationGetRelationName(relation))));
4992 0 : break;
4993 : }
4994 : }
4995 :
4996 : /* if there are updates, follow the update chain */
4997 1 : if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4998 : {
4999 : HTSU_Result res;
5000 :
5001 1 : res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
5002 : GetCurrentTransactionId(),
5003 : mode);
5004 1 : if (res != HeapTupleMayBeUpdated)
5005 : {
5006 0 : result = res;
5007 : /* recovery code expects to have buffer lock held */
5008 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5009 0 : goto failed;
5010 : }
5011 : }
5012 :
5013 1 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5014 :
5015 : /*
5016 : * xwait is done, but if xwait had just locked the tuple then some
5017 : * other xact could update this tuple before we get to this point.
5018 : * Check for xmax change, and start over if so.
5019 : */
5020 2 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5021 1 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5022 : xwait))
5023 : goto l3;
5024 :
5025 1 : if (!(infomask & HEAP_XMAX_IS_MULTI))
5026 : {
5027 : /*
5028 : * Otherwise check if it committed or aborted. Note we cannot
5029 : * be here if the tuple was only locked by somebody who didn't
5030 : * conflict with us; that would have been handled above. So
5031 : * that transaction must necessarily be gone by now. But
5032 : * don't check for this in the multixact case, because some
5033 : * locker transactions might still be running.
5034 : */
5035 0 : UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5036 : }
5037 : }
5038 :
5039 : /* By here, we're certain that we hold buffer exclusive lock again */
5040 :
5041 : /*
5042 : * We may lock if previous xmax aborted, or if it committed but only
5043 : * locked the tuple without updating it; or if we didn't have to wait
5044 : * at all for whatever reason.
5045 : */
5046 3 : if (!require_sleep ||
5047 2 : (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5048 3 : HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5049 1 : HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5050 2 : result = HeapTupleMayBeUpdated;
5051 : else
5052 0 : result = HeapTupleUpdated;
5053 : }
5054 :
5055 : failed:
5056 505 : if (result != HeapTupleMayBeUpdated)
5057 : {
5058 2 : Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5059 : result == HeapTupleWouldBlock);
5060 2 : Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5061 2 : hufd->ctid = tuple->t_data->t_ctid;
5062 2 : hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5063 2 : if (result == HeapTupleSelfUpdated)
5064 2 : hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5065 : else
5066 0 : hufd->cmax = InvalidCommandId;
5067 2 : goto out_locked;
5068 : }
5069 :
5070 : /*
5071 : * If we didn't pin the visibility map page and the page has become all
5072 : * visible while we were busy locking the buffer, or during some
5073 : * subsequent window during which we had it unlocked, we'll have to unlock
5074 : * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5075 : * unfortunate, especially since we'll now have to recheck whether the
5076 : * tuple has been locked or updated under us, but hopefully it won't
5077 : * happen very often.
5078 : */
5079 503 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5080 : {
5081 0 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5082 0 : visibilitymap_pin(relation, block, &vmbuffer);
5083 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5084 0 : goto l3;
5085 : }
5086 :
5087 503 : xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5088 503 : old_infomask = tuple->t_data->t_infomask;
5089 :
5090 : /*
5091 : * If this is the first possibly-multixact-able operation in the current
5092 : * transaction, set my per-backend OldestMemberMXactId setting. We can be
5093 : * certain that the transaction will never become a member of any older
5094 : * MultiXactIds than that. (We have to do this even if we end up just
5095 : * using our own TransactionId below, since some other backend could
5096 : * incorporate our XID into a MultiXact immediately afterwards.)
5097 : */
5098 503 : MultiXactIdSetOldestMember();
5099 :
5100 : /*
5101 : * Compute the new xmax and infomask to store into the tuple. Note we do
5102 : * not modify the tuple just yet, because that would leave it in the wrong
5103 : * state if multixact.c elogs.
5104 : */
5105 503 : compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5106 : GetCurrentTransactionId(), mode, false,
5107 : &xid, &new_infomask, &new_infomask2);
5108 :
5109 503 : START_CRIT_SECTION();
5110 :
5111 : /*
5112 : * Store transaction information of xact locking the tuple.
5113 : *
5114 : * Note: Cmax is meaningless in this context, so don't set it; this avoids
5115 : * possibly generating a useless combo CID. Moreover, if we're locking a
5116 : * previously updated tuple, it's important to preserve the Cmax.
5117 : *
5118 : * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5119 : * we would break the HOT chain.
5120 : */
5121 503 : tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5122 503 : tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5123 503 : tuple->t_data->t_infomask |= new_infomask;
5124 503 : tuple->t_data->t_infomask2 |= new_infomask2;
5125 503 : if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5126 503 : HeapTupleHeaderClearHotUpdated(tuple->t_data);
5127 503 : HeapTupleHeaderSetXmax(tuple->t_data, xid);
5128 :
5129 : /*
5130 : * Make sure there is no forward chain link in t_ctid. Note that in the
5131 : * cases where the tuple has been updated, we must not overwrite t_ctid,
5132 : * because it was set by the updater. Moreover, if the tuple has been
5133 : * updated, we need to follow the update chain to lock the new versions of
5134 : * the tuple as well.
5135 : */
5136 503 : if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5137 503 : tuple->t_data->t_ctid = *tid;
5138 :
5139 : /* Clear only the all-frozen bit on visibility map if needed */
5140 504 : if (PageIsAllVisible(page) &&
5141 1 : visibilitymap_clear(relation, block, vmbuffer,
5142 : VISIBILITYMAP_ALL_FROZEN))
5143 0 : cleared_all_frozen = true;
5144 :
5145 :
5146 503 : MarkBufferDirty(*buffer);
5147 :
5148 : /*
5149 : * XLOG stuff. You might think that we don't need an XLOG record because
5150 : * there is no state change worth restoring after a crash. You would be
5151 : * wrong however: we have just written either a TransactionId or a
5152 : * MultiXactId that may never have been seen on disk before, and we need
5153 : * to make sure that there are XLOG entries covering those ID numbers.
5154 : * Else the same IDs might be re-used after a crash, which would be
5155 : * disastrous if this page made it to disk before the crash. Essentially
5156 : * we have to enforce the WAL log-before-data rule even in this case.
5157 : * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5158 : * entries for everything anyway.)
5159 : */
5160 503 : if (RelationNeedsWAL(relation))
5161 : {
5162 : xl_heap_lock xlrec;
5163 : XLogRecPtr recptr;
5164 :
5165 429 : XLogBeginInsert();
5166 429 : XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5167 :
5168 429 : xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5169 429 : xlrec.locking_xid = xid;
5170 429 : xlrec.infobits_set = compute_infobits(new_infomask,
5171 429 : tuple->t_data->t_infomask2);
5172 429 : xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5173 429 : XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5174 :
5175 : /* we don't decode row locks atm, so no need to log the origin */
5176 :
5177 429 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5178 :
5179 429 : PageSetLSN(page, recptr);
5180 : }
5181 :
5182 503 : END_CRIT_SECTION();
5183 :
5184 503 : result = HeapTupleMayBeUpdated;
5185 :
5186 : out_locked:
5187 509 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5188 :
5189 : out_unlocked:
5190 542 : if (BufferIsValid(vmbuffer))
5191 1 : ReleaseBuffer(vmbuffer);
5192 :
5193 : /*
5194 : * Don't update the visibility map here. Locking a tuple doesn't change
5195 : * visibility info.
5196 : */
5197 :
5198 : /*
5199 : * Now that we have successfully marked the tuple as locked, we can
5200 : * release the lmgr tuple lock, if we had it.
5201 : */
5202 542 : if (have_tuple_lock)
5203 1 : UnlockTupleTuplock(relation, tid, mode);
5204 :
5205 542 : return result;
5206 : }
5207 :
5208 : /*
5209 : * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5210 : * its normal, Xmax-based tuple lock.
5211 : *
5212 : * have_tuple_lock is an input and output parameter: on input, it indicates
5213 : * whether the lock has previously been acquired (and this function does
5214 : * nothing in that case). If this function returns success, have_tuple_lock
5215 : * has been flipped to true.
5216 : *
5217 : * Returns false if it was unable to obtain the lock; this can only happen if
5218 : * wait_policy is Skip.
5219 : */
5220 : static bool
5221 1 : heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5222 : LockWaitPolicy wait_policy, bool *have_tuple_lock)
5223 : {
5224 1 : if (*have_tuple_lock)
5225 0 : return true;
5226 :
5227 1 : switch (wait_policy)
5228 : {
5229 : case LockWaitBlock:
5230 1 : LockTupleTuplock(relation, tid, mode);
5231 1 : break;
5232 :
5233 : case LockWaitSkip:
5234 0 : if (!ConditionalLockTupleTuplock(relation, tid, mode))
5235 0 : return false;
5236 0 : break;
5237 :
5238 : case LockWaitError:
5239 0 : if (!ConditionalLockTupleTuplock(relation, tid, mode))
5240 0 : ereport(ERROR,
5241 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5242 : errmsg("could not obtain lock on row in relation \"%s\"",
5243 : RelationGetRelationName(relation))));
5244 0 : break;
5245 : }
5246 1 : *have_tuple_lock = true;
5247 :
5248 1 : return true;
5249 : }
5250 :
5251 : /*
5252 : * Given an original set of Xmax and infomask, and a transaction (identified by
5253 : * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5254 : * corresponding infomasks to use on the tuple.
5255 : *
5256 : * Note that this might have side effects such as creating a new MultiXactId.
5257 : *
5258 : * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5259 : * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5260 : * but it was not running anymore. There is a race condition, which is that the
5261 : * MultiXactId may have finished since then, but that uncommon case is handled
5262 : * either here, or within MultiXactIdExpand.
5263 : *
5264 : * There is a similar race condition possible when the old xmax was a regular
5265 : * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5266 : * window, but it's still possible to end up creating an unnecessary
5267 : * MultiXactId. Fortunately this is harmless.
5268 : */
5269 : static void
5270 121966 : compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5271 : uint16 old_infomask2, TransactionId add_to_xmax,
5272 : LockTupleMode mode, bool is_update,
5273 : TransactionId *result_xmax, uint16 *result_infomask,
5274 : uint16 *result_infomask2)
5275 : {
5276 : TransactionId new_xmax;
5277 : uint16 new_infomask,
5278 : new_infomask2;
5279 :
5280 121966 : Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5281 :
5282 : l5:
5283 122266 : new_infomask = 0;
5284 122266 : new_infomask2 = 0;
5285 122266 : if (old_infomask & HEAP_XMAX_INVALID)
5286 : {
5287 : /*
5288 : * No previous locker; we just insert our own TransactionId.
5289 : *
5290 : * Note that it's critical that this case be the first one checked,
5291 : * because there are several blocks below that come back to this one
5292 : * to implement certain optimizations; old_infomask might contain
5293 : * other dirty bits in those cases, but we don't really care.
5294 : */
5295 121964 : if (is_update)
5296 : {
5297 118567 : new_xmax = add_to_xmax;
5298 118567 : if (mode == LockTupleExclusive)
5299 109771 : new_infomask2 |= HEAP_KEYS_UPDATED;
5300 : }
5301 : else
5302 : {
5303 3397 : new_infomask |= HEAP_XMAX_LOCK_ONLY;
5304 3397 : switch (mode)
5305 : {
5306 : case LockTupleKeyShare:
5307 190 : new_xmax = add_to_xmax;
5308 190 : new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5309 190 : break;
5310 : case LockTupleShare:
5311 11 : new_xmax = add_to_xmax;
5312 11 : new_infomask |= HEAP_XMAX_SHR_LOCK;
5313 11 : break;
5314 : case LockTupleNoKeyExclusive:
5315 3082 : new_xmax = add_to_xmax;
5316 3082 : new_infomask |= HEAP_XMAX_EXCL_LOCK;
5317 3082 : break;
5318 : case LockTupleExclusive:
5319 114 : new_xmax = add_to_xmax;
5320 114 : new_infomask |= HEAP_XMAX_EXCL_LOCK;
5321 114 : new_infomask2 |= HEAP_KEYS_UPDATED;
5322 114 : break;
5323 : default:
5324 0 : new_xmax = InvalidTransactionId; /* silence compiler */
5325 0 : elog(ERROR, "invalid lock mode");
5326 : }
5327 : }
5328 : }
5329 302 : else if (old_infomask & HEAP_XMAX_IS_MULTI)
5330 : {
5331 : MultiXactStatus new_status;
5332 :
5333 : /*
5334 : * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5335 : * cross-check.
5336 : */
5337 1 : Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5338 :
5339 : /*
5340 : * A multixact together with LOCK_ONLY set but neither lock bit set
5341 : * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5342 : * anymore. This check is critical for databases upgraded by
5343 : * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5344 : * that such multis are never passed.
5345 : */
5346 1 : if (HEAP_LOCKED_UPGRADED(old_infomask))
5347 : {
5348 0 : old_infomask &= ~HEAP_XMAX_IS_MULTI;
5349 0 : old_infomask |= HEAP_XMAX_INVALID;
5350 0 : goto l5;
5351 : }
5352 :
5353 : /*
5354 : * If the XMAX is already a MultiXactId, then we need to expand it to
5355 : * include add_to_xmax; but if all the members were lockers and are
5356 : * all gone, we can do away with the IS_MULTI bit and just set
5357 : * add_to_xmax as the only locker/updater. If all lockers are gone
5358 : * and we have an updater that aborted, we can also do without a
5359 : * multi.
5360 : *
5361 : * The cost of doing GetMultiXactIdMembers would be paid by
5362 : * MultiXactIdExpand if we weren't to do this, so this check is not
5363 : * incurring extra work anyhow.
5364 : */
5365 1 : if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5366 : {
5367 0 : if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5368 0 : !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5369 : old_infomask)))
5370 : {
5371 : /*
5372 : * Reset these bits and restart; otherwise fall through to
5373 : * create a new multi below.
5374 : */
5375 0 : old_infomask &= ~HEAP_XMAX_IS_MULTI;
5376 0 : old_infomask |= HEAP_XMAX_INVALID;
5377 0 : goto l5;
5378 : }
5379 : }
5380 :
5381 1 : new_status = get_mxact_status_for_lock(mode, is_update);
5382 :
5383 1 : new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5384 : new_status);
5385 1 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5386 : }
5387 301 : else if (old_infomask & HEAP_XMAX_COMMITTED)
5388 : {
5389 : /*
5390 : * It's a committed update, so we need to preserve him as updater of
5391 : * the tuple.
5392 : */
5393 : MultiXactStatus status;
5394 : MultiXactStatus new_status;
5395 :
5396 0 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5397 0 : status = MultiXactStatusUpdate;
5398 : else
5399 0 : status = MultiXactStatusNoKeyUpdate;
5400 :
5401 0 : new_status = get_mxact_status_for_lock(mode, is_update);
5402 :
5403 : /*
5404 : * since it's not running, it's obviously impossible for the old
5405 : * updater to be identical to the current one, so we need not check
5406 : * for that case as we do in the block above.
5407 : */
5408 0 : new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5409 0 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5410 : }
5411 301 : else if (TransactionIdIsInProgress(xmax))
5412 : {
5413 : /*
5414 : * If the XMAX is a valid, in-progress TransactionId, then we need to
5415 : * create a new MultiXactId that includes both the old locker or
5416 : * updater and our own TransactionId.
5417 : */
5418 : MultiXactStatus new_status;
5419 : MultiXactStatus old_status;
5420 : LockTupleMode old_mode;
5421 :
5422 301 : if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5423 : {
5424 602 : if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5425 6 : old_status = MultiXactStatusForKeyShare;
5426 295 : else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5427 1 : old_status = MultiXactStatusForShare;
5428 294 : else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5429 : {
5430 294 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5431 37 : old_status = MultiXactStatusForUpdate;
5432 : else
5433 257 : old_status = MultiXactStatusForNoKeyUpdate;
5434 : }
5435 : else
5436 : {
5437 : /*
5438 : * LOCK_ONLY can be present alone only when a page has been
5439 : * upgraded by pg_upgrade. But in that case,
5440 : * TransactionIdIsInProgress() should have returned false. We
5441 : * assume it's no longer locked in this case.
5442 : */
5443 0 : elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5444 0 : old_infomask |= HEAP_XMAX_INVALID;
5445 0 : old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5446 0 : goto l5;
5447 : }
5448 : }
5449 : else
5450 : {
5451 : /* it's an update, but which kind? */
5452 0 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5453 0 : old_status = MultiXactStatusUpdate;
5454 : else
5455 0 : old_status = MultiXactStatusNoKeyUpdate;
5456 : }
5457 :
5458 301 : old_mode = TUPLOCK_from_mxstatus(old_status);
5459 :
5460 : /*
5461 : * If the lock to be acquired is for the same TransactionId as the
5462 : * existing lock, there's an optimization possible: consider only the
5463 : * strongest of both locks as the only one present, and restart.
5464 : */
5465 301 : if (xmax == add_to_xmax)
5466 : {
5467 : /*
5468 : * Note that it's not possible for the original tuple to be
5469 : * updated: we wouldn't be here because the tuple would have been
5470 : * invisible and we wouldn't try to update it. As a subtlety,
5471 : * this code can also run when traversing an update chain to lock
5472 : * future versions of a tuple. But we wouldn't be here either,
5473 : * because the add_to_xmax would be different from the original
5474 : * updater.
5475 : */
5476 300 : Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5477 :
5478 : /* acquire the strongest of both */
5479 300 : if (mode < old_mode)
5480 21 : mode = old_mode;
5481 : /* mustn't touch is_update */
5482 :
5483 300 : old_infomask |= HEAP_XMAX_INVALID;
5484 300 : goto l5;
5485 : }
5486 :
5487 : /* otherwise, just fall back to creating a new multixact */
5488 1 : new_status = get_mxact_status_for_lock(mode, is_update);
5489 1 : new_xmax = MultiXactIdCreate(xmax, old_status,
5490 : add_to_xmax, new_status);
5491 1 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5492 : }
5493 0 : else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5494 0 : TransactionIdDidCommit(xmax))
5495 0 : {
5496 : /*
5497 : * It's a committed update, so we gotta preserve him as updater of the
5498 : * tuple.
5499 : */
5500 : MultiXactStatus status;
5501 : MultiXactStatus new_status;
5502 :
5503 0 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5504 0 : status = MultiXactStatusUpdate;
5505 : else
5506 0 : status = MultiXactStatusNoKeyUpdate;
5507 :
5508 0 : new_status = get_mxact_status_for_lock(mode, is_update);
5509 :
5510 : /*
5511 : * since it's not running, it's obviously impossible for the old
5512 : * updater to be identical to the current one, so we need not check
5513 : * for that case as we do in the block above.
5514 : */
5515 0 : new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5516 0 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5517 : }
5518 : else
5519 : {
5520 : /*
5521 : * Can get here iff the locking/updating transaction was running when
5522 : * the infomask was extracted from the tuple, but finished before
5523 : * TransactionIdIsInProgress got to run. Deal with it as if there was
5524 : * no locker at all in the first place.
5525 : */
5526 0 : old_infomask |= HEAP_XMAX_INVALID;
5527 0 : goto l5;
5528 : }
5529 :
5530 121966 : *result_infomask = new_infomask;
5531 121966 : *result_infomask2 = new_infomask2;
5532 121966 : *result_xmax = new_xmax;
5533 121966 : }
5534 :
5535 : /*
5536 : * Subroutine for heap_lock_updated_tuple_rec.
5537 : *
5538 : * Given a hypothetical multixact status held by the transaction identified
5539 : * with the given xid, does the current transaction need to wait, fail, or can
5540 : * it continue if it wanted to acquire a lock of the given mode? "needwait"
5541 : * is set to true if waiting is necessary; if it can continue, then
5542 : * HeapTupleMayBeUpdated is returned. If the lock is already held by the
5543 : * current transaction, return HeapTupleSelfUpdated. In case of a conflict
5544 : * with another transaction, a different HeapTupleSatisfiesUpdate return code
5545 : * is returned.
5546 : *
5547 : * The held status is said to be hypothetical because it might correspond to a
5548 : * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5549 : * way for simplicity of API.
5550 : */
5551 : static HTSU_Result
5552 0 : test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5553 : LockTupleMode mode, bool *needwait)
5554 : {
5555 : MultiXactStatus wantedstatus;
5556 :
5557 0 : *needwait = false;
5558 0 : wantedstatus = get_mxact_status_for_lock(mode, false);
5559 :
5560 : /*
5561 : * Note: we *must* check TransactionIdIsInProgress before
5562 : * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5563 : * explanation.
5564 : */
5565 0 : if (TransactionIdIsCurrentTransactionId(xid))
5566 : {
5567 : /*
5568 : * The tuple has already been locked by our own transaction. This is
5569 : * very rare but can happen if multiple transactions are trying to
5570 : * lock an ancient version of the same tuple.
5571 : */
5572 0 : return HeapTupleSelfUpdated;
5573 : }
5574 0 : else if (TransactionIdIsInProgress(xid))
5575 : {
5576 : /*
5577 : * If the locking transaction is running, what we do depends on
5578 : * whether the lock modes conflict: if they do, then we must wait for
5579 : * it to finish; otherwise we can fall through to lock this tuple
5580 : * version without waiting.
5581 : */
5582 0 : if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5583 0 : LOCKMODE_from_mxstatus(wantedstatus)))
5584 : {
5585 0 : *needwait = true;
5586 : }
5587 :
5588 : /*
5589 : * If we set needwait above, then this value doesn't matter;
5590 : * otherwise, this value signals to caller that it's okay to proceed.
5591 : */
5592 0 : return HeapTupleMayBeUpdated;
5593 : }
5594 0 : else if (TransactionIdDidAbort(xid))
5595 0 : return HeapTupleMayBeUpdated;
5596 0 : else if (TransactionIdDidCommit(xid))
5597 : {
5598 : /*
5599 : * The other transaction committed. If it was only a locker, then the
5600 : * lock is completely gone now and we can return success; but if it
5601 : * was an update, then what we do depends on whether the two lock
5602 : * modes conflict. If they conflict, then we must report error to
5603 : * caller. But if they don't, we can fall through to allow the current
5604 : * transaction to lock the tuple.
5605 : *
5606 : * Note: the reason we worry about ISUPDATE here is because as soon as
5607 : * a transaction ends, all its locks are gone and meaningless, and
5608 : * thus we can ignore them; whereas its updates persist. In the
5609 : * TransactionIdIsInProgress case, above, we don't need to check
5610 : * because we know the lock is still "alive" and thus a conflict needs
5611 : * always be checked.
5612 : */
5613 0 : if (!ISUPDATE_from_mxstatus(status))
5614 0 : return HeapTupleMayBeUpdated;
5615 :
5616 0 : if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5617 0 : LOCKMODE_from_mxstatus(wantedstatus)))
5618 : /* bummer */
5619 0 : return HeapTupleUpdated;
5620 :
5621 0 : return HeapTupleMayBeUpdated;
5622 : }
5623 :
5624 : /* Not in progress, not aborted, not committed -- must have crashed */
5625 0 : return HeapTupleMayBeUpdated;
5626 : }
5627 :
5628 :
5629 : /*
5630 : * Recursive part of heap_lock_updated_tuple
5631 : *
5632 : * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5633 : * xid with the given mode; if this tuple is updated, recurse to lock the new
5634 : * version as well.
5635 : */
5636 : static HTSU_Result
5637 1 : heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5638 : LockTupleMode mode)
5639 : {
5640 : HTSU_Result result;
5641 : ItemPointerData tupid;
5642 : HeapTupleData mytup;
5643 : Buffer buf;
5644 : uint16 new_infomask,
5645 : new_infomask2,
5646 : old_infomask,
5647 : old_infomask2;
5648 : TransactionId xmax,
5649 : new_xmax;
5650 1 : TransactionId priorXmax = InvalidTransactionId;
5651 1 : bool cleared_all_frozen = false;
5652 1 : Buffer vmbuffer = InvalidBuffer;
5653 : BlockNumber block;
5654 :
5655 1 : ItemPointerCopy(tid, &tupid);
5656 :
5657 : for (;;)
5658 : {
5659 1 : new_infomask = 0;
5660 1 : new_xmax = InvalidTransactionId;
5661 1 : block = ItemPointerGetBlockNumber(&tupid);
5662 1 : ItemPointerCopy(&tupid, &(mytup.t_self));
5663 :
5664 1 : if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5665 : {
5666 : /*
5667 : * if we fail to find the updated version of the tuple, it's
5668 : * because it was vacuumed/pruned away after its creator
5669 : * transaction aborted. So behave as if we got to the end of the
5670 : * chain, and there's no further tuple to lock: return success to
5671 : * caller.
5672 : */
5673 0 : return HeapTupleMayBeUpdated;
5674 : }
5675 :
5676 : l4:
5677 1 : CHECK_FOR_INTERRUPTS();
5678 :
5679 : /*
5680 : * Before locking the buffer, pin the visibility map page if it
5681 : * appears to be necessary. Since we haven't got the lock yet,
5682 : * someone else might be in the middle of changing this, so we'll need
5683 : * to recheck after we have the lock.
5684 : */
5685 1 : if (PageIsAllVisible(BufferGetPage(buf)))
5686 0 : visibilitymap_pin(rel, block, &vmbuffer);
5687 : else
5688 1 : vmbuffer = InvalidBuffer;
5689 :
5690 1 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5691 :
5692 : /*
5693 : * If we didn't pin the visibility map page and the page has become
5694 : * all visible while we were busy locking the buffer, we'll have to
5695 : * unlock and re-lock, to avoid holding the buffer lock across I/O.
5696 : * That's a bit unfortunate, but hopefully shouldn't happen often.
5697 : */
5698 1 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf)))
5699 : {
5700 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5701 0 : visibilitymap_pin(rel, block, &vmbuffer);
5702 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5703 : }
5704 :
5705 : /*
5706 : * Check the tuple XMIN against prior XMAX, if any. If we reached the
5707 : * end of the chain, we're done, so return success.
5708 : */
5709 1 : if (TransactionIdIsValid(priorXmax) &&
5710 0 : !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5711 : priorXmax))
5712 : {
5713 0 : result = HeapTupleMayBeUpdated;
5714 0 : goto out_locked;
5715 : }
5716 :
5717 : /*
5718 : * Also check Xmin: if this tuple was created by an aborted
5719 : * (sub)transaction, then we already locked the last live one in the
5720 : * chain, thus we're done, so return success.
5721 : */
5722 1 : if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5723 : {
5724 1 : UnlockReleaseBuffer(buf);
5725 1 : return HeapTupleMayBeUpdated;
5726 : }
5727 :
5728 0 : old_infomask = mytup.t_data->t_infomask;
5729 0 : old_infomask2 = mytup.t_data->t_infomask2;
5730 0 : xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5731 :
5732 : /*
5733 : * If this tuple version has been updated or locked by some concurrent
5734 : * transaction(s), what we do depends on whether our lock mode
5735 : * conflicts with what those other transactions hold, and also on the
5736 : * status of them.
5737 : */
5738 0 : if (!(old_infomask & HEAP_XMAX_INVALID))
5739 : {
5740 : TransactionId rawxmax;
5741 : bool needwait;
5742 :
5743 0 : rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5744 0 : if (old_infomask & HEAP_XMAX_IS_MULTI)
5745 : {
5746 : int nmembers;
5747 : int i;
5748 : MultiXactMember *members;
5749 :
5750 : /*
5751 : * We don't need a test for pg_upgrade'd tuples: this is only
5752 : * applied to tuples after the first in an update chain. Said
5753 : * first tuple in the chain may well be locked-in-9.2-and-
5754 : * pg_upgraded, but that one was already locked by our caller,
5755 : * not us; and any subsequent ones cannot be because our
5756 : * caller must necessarily have obtained a snapshot later than
5757 : * the pg_upgrade itself.
5758 : */
5759 0 : Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5760 :
5761 0 : nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5762 0 : HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5763 0 : for (i = 0; i < nmembers; i++)
5764 : {
5765 0 : result = test_lockmode_for_conflict(members[i].status,
5766 0 : members[i].xid,
5767 : mode, &needwait);
5768 :
5769 : /*
5770 : * If the tuple was already locked by ourselves in a
5771 : * previous iteration of this (say heap_lock_tuple was
5772 : * forced to restart the locking loop because of a change
5773 : * in xmax), then we hold the lock already on this tuple
5774 : * version and we don't need to do anything; and this is
5775 : * not an error condition either. We just need to skip
5776 : * this tuple and continue locking the next version in the
5777 : * update chain.
5778 : */
5779 0 : if (result == HeapTupleSelfUpdated)
5780 : {
5781 0 : pfree(members);
5782 0 : goto next;
5783 : }
5784 :
5785 0 : if (needwait)
5786 : {
5787 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5788 0 : XactLockTableWait(members[i].xid, rel,
5789 : &mytup.t_self,
5790 : XLTW_LockUpdated);
5791 0 : pfree(members);
5792 0 : goto l4;
5793 : }
5794 0 : if (result != HeapTupleMayBeUpdated)
5795 : {
5796 0 : pfree(members);
5797 0 : goto out_locked;
5798 : }
5799 : }
5800 0 : if (members)
5801 0 : pfree(members);
5802 : }
5803 : else
5804 : {
5805 : MultiXactStatus status;
5806 :
5807 : /*
5808 : * For a non-multi Xmax, we first need to compute the
5809 : * corresponding MultiXactStatus by using the infomask bits.
5810 : */
5811 0 : if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5812 : {
5813 0 : if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5814 0 : status = MultiXactStatusForKeyShare;
5815 0 : else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5816 0 : status = MultiXactStatusForShare;
5817 0 : else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5818 : {
5819 0 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5820 0 : status = MultiXactStatusForUpdate;
5821 : else
5822 0 : status = MultiXactStatusForNoKeyUpdate;
5823 : }
5824 : else
5825 : {
5826 : /*
5827 : * LOCK_ONLY present alone (a pg_upgraded tuple marked
5828 : * as share-locked in the old cluster) shouldn't be
5829 : * seen in the middle of an update chain.
5830 : */
5831 0 : elog(ERROR, "invalid lock status in tuple");
5832 : }
5833 : }
5834 : else
5835 : {
5836 : /* it's an update, but which kind? */
5837 0 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5838 0 : status = MultiXactStatusUpdate;
5839 : else
5840 0 : status = MultiXactStatusNoKeyUpdate;
5841 : }
5842 :
5843 0 : result = test_lockmode_for_conflict(status, rawxmax, mode,
5844 : &needwait);
5845 :
5846 : /*
5847 : * If the tuple was already locked by ourselves in a previous
5848 : * iteration of this (say heap_lock_tuple was forced to
5849 : * restart the locking loop because of a change in xmax), then
5850 : * we hold the lock already on this tuple version and we don't
5851 : * need to do anything; and this is not an error condition
5852 : * either. We just need to skip this tuple and continue
5853 : * locking the next version in the update chain.
5854 : */
5855 0 : if (result == HeapTupleSelfUpdated)
5856 0 : goto next;
5857 :
5858 0 : if (needwait)
5859 : {
5860 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5861 0 : XactLockTableWait(rawxmax, rel, &mytup.t_self,
5862 : XLTW_LockUpdated);
5863 0 : goto l4;
5864 : }
5865 0 : if (result != HeapTupleMayBeUpdated)
5866 : {
5867 0 : goto out_locked;
5868 : }
5869 : }
5870 : }
5871 :
5872 : /* compute the new Xmax and infomask values for the tuple ... */
5873 0 : compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5874 : xid, mode, false,
5875 : &new_xmax, &new_infomask, &new_infomask2);
5876 :
5877 0 : if (PageIsAllVisible(BufferGetPage(buf)) &&
5878 0 : visibilitymap_clear(rel, block, vmbuffer,
5879 : VISIBILITYMAP_ALL_FROZEN))
5880 0 : cleared_all_frozen = true;
5881 :
5882 0 : START_CRIT_SECTION();
5883 :
5884 : /* ... and set them */
5885 0 : HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5886 0 : mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5887 0 : mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5888 0 : mytup.t_data->t_infomask |= new_infomask;
5889 0 : mytup.t_data->t_infomask2 |= new_infomask2;
5890 :
5891 0 : MarkBufferDirty(buf);
5892 :
5893 : /* XLOG stuff */
5894 0 : if (RelationNeedsWAL(rel))
5895 : {
5896 : xl_heap_lock_updated xlrec;
5897 : XLogRecPtr recptr;
5898 0 : Page page = BufferGetPage(buf);
5899 :
5900 0 : XLogBeginInsert();
5901 0 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5902 :
5903 0 : xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5904 0 : xlrec.xmax = new_xmax;
5905 0 : xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5906 0 : xlrec.flags =
5907 0 : cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5908 :
5909 0 : XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5910 :
5911 0 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5912 :
5913 0 : PageSetLSN(page, recptr);
5914 : }
5915 :
5916 0 : END_CRIT_SECTION();
5917 :
5918 : next:
5919 : /* if we find the end of update chain, we're done. */
5920 0 : if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5921 0 : ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5922 0 : HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5923 : {
5924 0 : result = HeapTupleMayBeUpdated;
5925 0 : goto out_locked;
5926 : }
5927 :
5928 : /* tail recursion */
5929 0 : priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5930 0 : ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5931 0 : UnlockReleaseBuffer(buf);
5932 0 : if (vmbuffer != InvalidBuffer)
5933 0 : ReleaseBuffer(vmbuffer);
5934 0 : }
5935 :
5936 : result = HeapTupleMayBeUpdated;
5937 :
5938 : out_locked:
5939 0 : UnlockReleaseBuffer(buf);
5940 :
5941 0 : if (vmbuffer != InvalidBuffer)
5942 0 : ReleaseBuffer(vmbuffer);
5943 :
5944 0 : return result;
5945 :
5946 : }
5947 :
5948 : /*
5949 : * heap_lock_updated_tuple
5950 : * Follow update chain when locking an updated tuple, acquiring locks (row
5951 : * marks) on the updated versions.
5952 : *
5953 : * The initial tuple is assumed to be already locked.
5954 : *
5955 : * This function doesn't check visibility, it just unconditionally marks the
5956 : * tuple(s) as locked. If any tuple in the updated chain is being deleted
5957 : * concurrently (or updated with the key being modified), sleep until the
5958 : * transaction doing it is finished.
5959 : *
5960 : * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5961 : * when we have to wait for other transactions to release them, as opposed to
5962 : * what heap_lock_tuple does. The reason is that having more than one
5963 : * transaction walking the chain is probably uncommon enough that risk of
5964 : * starvation is not likely: one of the preconditions for being here is that
5965 : * the snapshot in use predates the update that created this tuple (because we
5966 : * started at an earlier version of the tuple), but at the same time such a
5967 : * transaction cannot be using repeatable read or serializable isolation
5968 : * levels, because that would lead to a serializability failure.
5969 : */
5970 : static HTSU_Result
5971 1 : heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5972 : TransactionId xid, LockTupleMode mode)
5973 : {
5974 1 : if (!ItemPointerEquals(&tuple->t_self, ctid))
5975 : {
5976 : /*
5977 : * If this is the first possibly-multixact-able operation in the
5978 : * current transaction, set my per-backend OldestMemberMXactId
5979 : * setting. We can be certain that the transaction will never become a
5980 : * member of any older MultiXactIds than that. (We have to do this
5981 : * even if we end up just using our own TransactionId below, since
5982 : * some other backend could incorporate our XID into a MultiXact
5983 : * immediately afterwards.)
5984 : */
5985 1 : MultiXactIdSetOldestMember();
5986 :
5987 1 : return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5988 : }
5989 :
5990 : /* nothing to lock */
5991 0 : return HeapTupleMayBeUpdated;
5992 : }
5993 :
5994 : /*
5995 : * heap_finish_speculative - mark speculative insertion as successful
5996 : *
5997 : * To successfully finish a speculative insertion we have to clear speculative
5998 : * token from tuple. To do so the t_ctid field, which will contain a
5999 : * speculative token value, is modified in place to point to the tuple itself,
6000 : * which is characteristic of a newly inserted ordinary tuple.
6001 : *
6002 : * NB: It is not ok to commit without either finishing or aborting a
6003 : * speculative insertion. We could treat speculative tuples of committed
6004 : * transactions implicitly as completed, but then we would have to be prepared
6005 : * to deal with speculative tokens on committed tuples. That wouldn't be
6006 : * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6007 : * but clearing the token at completion isn't very expensive either.
6008 : * An explicit confirmation WAL record also makes logical decoding simpler.
6009 : */
6010 : void
6011 54 : heap_finish_speculative(Relation relation, HeapTuple tuple)
6012 : {
6013 : Buffer buffer;
6014 : Page page;
6015 : OffsetNumber offnum;
6016 54 : ItemId lp = NULL;
6017 : HeapTupleHeader htup;
6018 :
6019 54 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6020 54 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6021 54 : page = (Page) BufferGetPage(buffer);
6022 :
6023 54 : offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6024 54 : if (PageGetMaxOffsetNumber(page) >= offnum)
6025 54 : lp = PageGetItemId(page, offnum);
6026 :
6027 54 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6028 0 : elog(ERROR, "invalid lp");
6029 :
6030 54 : htup = (HeapTupleHeader) PageGetItem(page, lp);
6031 :
6032 : /* SpecTokenOffsetNumber should be distinguishable from any real offset */
6033 : StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
6034 : "invalid speculative token constant");
6035 :
6036 : /* NO EREPORT(ERROR) from here till changes are logged */
6037 54 : START_CRIT_SECTION();
6038 :
6039 54 : Assert(HeapTupleHeaderIsSpeculative(tuple->t_data));
6040 :
6041 54 : MarkBufferDirty(buffer);
6042 :
6043 : /*
6044 : * Replace the speculative insertion token with a real t_ctid, pointing to
6045 : * itself like it does on regular tuples.
6046 : */
6047 54 : htup->t_ctid = tuple->t_self;
6048 :
6049 : /* XLOG stuff */
6050 54 : if (RelationNeedsWAL(relation))
6051 : {
6052 : xl_heap_confirm xlrec;
6053 : XLogRecPtr recptr;
6054 :
6055 52 : xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6056 :
6057 52 : XLogBeginInsert();
6058 :
6059 : /* We want the same filtering on this as on a plain insert */
6060 52 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
6061 :
6062 52 : XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
6063 52 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6064 :
6065 52 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
6066 :
6067 52 : PageSetLSN(page, recptr);
6068 : }
6069 :
6070 54 : END_CRIT_SECTION();
6071 :
6072 54 : UnlockReleaseBuffer(buffer);
6073 54 : }
6074 :
6075 : /*
6076 : * heap_abort_speculative - kill a speculatively inserted tuple
6077 : *
6078 : * Marks a tuple that was speculatively inserted in the same command as dead,
6079 : * by setting its xmin as invalid. That makes it immediately appear as dead
6080 : * to all transactions, including our own. In particular, it makes
6081 : * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6082 : * inserting a duplicate key value won't unnecessarily wait for our whole
6083 : * transaction to finish (it'll just wait for our speculative insertion to
6084 : * finish).
6085 : *
6086 : * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6087 : * that arise due to a mutual dependency that is not user visible. By
6088 : * definition, unprincipled deadlocks cannot be prevented by the user
6089 : * reordering lock acquisition in client code, because the implementation level
6090 : * lock acquisitions are not under the user's direct control. If speculative
6091 : * inserters did not take this precaution, then under high concurrency they
6092 : * could deadlock with each other, which would not be acceptable.
6093 : *
6094 : * This is somewhat redundant with heap_delete, but we prefer to have a
6095 : * dedicated routine with stripped down requirements. Note that this is also
6096 : * used to delete the TOAST tuples created during speculative insertion.
6097 : *
6098 : * This routine does not affect logical decoding as it only looks at
6099 : * confirmation records.
6100 : */
6101 : void
6102 0 : heap_abort_speculative(Relation relation, HeapTuple tuple)
6103 : {
6104 0 : TransactionId xid = GetCurrentTransactionId();
6105 0 : ItemPointer tid = &(tuple->t_self);
6106 : ItemId lp;
6107 : HeapTupleData tp;
6108 : Page page;
6109 : BlockNumber block;
6110 : Buffer buffer;
6111 :
6112 0 : Assert(ItemPointerIsValid(tid));
6113 :
6114 0 : block = ItemPointerGetBlockNumber(tid);
6115 0 : buffer = ReadBuffer(relation, block);
6116 0 : page = BufferGetPage(buffer);
6117 :
6118 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6119 :
6120 : /*
6121 : * Page can't be all visible, we just inserted into it, and are still
6122 : * running.
6123 : */
6124 0 : Assert(!PageIsAllVisible(page));
6125 :
6126 0 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
6127 0 : Assert(ItemIdIsNormal(lp));
6128 :
6129 0 : tp.t_tableOid = RelationGetRelid(relation);
6130 0 : tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6131 0 : tp.t_len = ItemIdGetLength(lp);
6132 0 : tp.t_self = *tid;
6133 :
6134 : /*
6135 : * Sanity check that the tuple really is a speculatively inserted tuple,
6136 : * inserted by us.
6137 : */
6138 0 : if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6139 0 : elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6140 0 : if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6141 0 : elog(ERROR, "attempted to kill a non-speculative tuple");
6142 0 : Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
6143 :
6144 : /*
6145 : * No need to check for serializable conflicts here. There is never a
6146 : * need for a combocid, either. No need to extract replica identity, or
6147 : * do anything special with infomask bits.
6148 : */
6149 :
6150 0 : START_CRIT_SECTION();
6151 :
6152 : /*
6153 : * The tuple will become DEAD immediately. Flag that this page
6154 : * immediately is a candidate for pruning by setting xmin to
6155 : * RecentGlobalXmin. That's not pretty, but it doesn't seem worth
6156 : * inventing a nicer API for this.
6157 : */
6158 0 : Assert(TransactionIdIsValid(RecentGlobalXmin));
6159 0 : PageSetPrunable(page, RecentGlobalXmin);
6160 :
6161 : /* store transaction information of xact deleting the tuple */
6162 0 : tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
6163 0 : tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6164 :
6165 : /*
6166 : * Set the tuple header xmin to InvalidTransactionId. This makes the
6167 : * tuple immediately invisible everyone. (In particular, to any
6168 : * transactions waiting on the speculative token, woken up later.)
6169 : */
6170 0 : HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
6171 :
6172 : /* Clear the speculative insertion token too */
6173 0 : tp.t_data->t_ctid = tp.t_self;
6174 :
6175 0 : MarkBufferDirty(buffer);
6176 :
6177 : /*
6178 : * XLOG stuff
6179 : *
6180 : * The WAL records generated here match heap_delete(). The same recovery
6181 : * routines are used.
6182 : */
6183 0 : if (RelationNeedsWAL(relation))
6184 : {
6185 : xl_heap_delete xlrec;
6186 : XLogRecPtr recptr;
6187 :
6188 0 : xlrec.flags = XLH_DELETE_IS_SUPER;
6189 0 : xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6190 0 : tp.t_data->t_infomask2);
6191 0 : xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
6192 0 : xlrec.xmax = xid;
6193 :
6194 0 : XLogBeginInsert();
6195 0 : XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
6196 0 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6197 :
6198 : /* No replica identity & replication origin logged */
6199 :
6200 0 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
6201 :
6202 0 : PageSetLSN(page, recptr);
6203 : }
6204 :
6205 0 : END_CRIT_SECTION();
6206 :
6207 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6208 :
6209 0 : if (HeapTupleHasExternal(&tp))
6210 : {
6211 0 : Assert(!IsToastRelation(relation));
6212 0 : toast_delete(relation, &tp, true);
6213 : }
6214 :
6215 : /*
6216 : * Never need to mark tuple for invalidation, since catalogs don't support
6217 : * speculative insertion
6218 : */
6219 :
6220 : /* Now we can release the buffer */
6221 0 : ReleaseBuffer(buffer);
6222 :
6223 : /* count deletion, as we counted the insertion too */
6224 0 : pgstat_count_heap_delete(relation);
6225 0 : }
6226 :
6227 : /*
6228 : * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
6229 : *
6230 : * Overwriting violates both MVCC and transactional safety, so the uses
6231 : * of this function in Postgres are extremely limited. Nonetheless we
6232 : * find some places to use it.
6233 : *
6234 : * The tuple cannot change size, and therefore it's reasonable to assume
6235 : * that its null bitmap (if any) doesn't change either. So we just
6236 : * overwrite the data portion of the tuple without touching the null
6237 : * bitmap or any of the header fields.
6238 : *
6239 : * tuple is an in-memory tuple structure containing the data to be written
6240 : * over the target tuple. Also, tuple->t_self identifies the target tuple.
6241 : */
6242 : void
6243 3267 : heap_inplace_update(Relation relation, HeapTuple tuple)
6244 : {
6245 : Buffer buffer;
6246 : Page page;
6247 : OffsetNumber offnum;
6248 3267 : ItemId lp = NULL;
6249 : HeapTupleHeader htup;
6250 : uint32 oldlen;
6251 : uint32 newlen;
6252 :
6253 : /*
6254 : * For now, parallel operations are required to be strictly read-only.
6255 : * Unlike a regular update, this should never create a combo CID, so it
6256 : * might be possible to relax this restriction, but not without more
6257 : * thought and testing. It's not clear that it would be useful, anyway.
6258 : */
6259 3267 : if (IsInParallelMode())
6260 0 : ereport(ERROR,
6261 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
6262 : errmsg("cannot update tuples during a parallel operation")));
6263 :
6264 3267 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6265 3267 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6266 3267 : page = (Page) BufferGetPage(buffer);
6267 :
6268 3267 : offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6269 3267 : if (PageGetMaxOffsetNumber(page) >= offnum)
6270 3267 : lp = PageGetItemId(page, offnum);
6271 :
6272 3267 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6273 0 : elog(ERROR, "invalid lp");
6274 :
6275 3267 : htup = (HeapTupleHeader) PageGetItem(page, lp);
6276 :
6277 3267 : oldlen = ItemIdGetLength(lp) - htup->t_hoff;
6278 3267 : newlen = tuple->t_len - tuple->t_data->t_hoff;
6279 3267 : if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6280 0 : elog(ERROR, "wrong tuple length");
6281 :
6282 : /* NO EREPORT(ERROR) from here till changes are logged */
6283 3267 : START_CRIT_SECTION();
6284 :
6285 6534 : memcpy((char *) htup + htup->t_hoff,
6286 6534 : (char *) tuple->t_data + tuple->t_data->t_hoff,
6287 : newlen);
6288 :
6289 3267 : MarkBufferDirty(buffer);
6290 :
6291 : /* XLOG stuff */
6292 3267 : if (RelationNeedsWAL(relation))
6293 : {
6294 : xl_heap_inplace xlrec;
6295 : XLogRecPtr recptr;
6296 :
6297 3267 : xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6298 :
6299 3267 : XLogBeginInsert();
6300 3267 : XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
6301 :
6302 3267 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6303 3267 : XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6304 :
6305 : /* inplace updates aren't decoded atm, don't log the origin */
6306 :
6307 3267 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6308 :
6309 3267 : PageSetLSN(page, recptr);
6310 : }
6311 :
6312 3267 : END_CRIT_SECTION();
6313 :
6314 3267 : UnlockReleaseBuffer(buffer);
6315 :
6316 : /*
6317 : * Send out shared cache inval if necessary. Note that because we only
6318 : * pass the new version of the tuple, this mustn't be used for any
6319 : * operations that could change catcache lookup keys. But we aren't
6320 : * bothering with index updates either, so that's true a fortiori.
6321 : */
6322 3267 : if (!IsBootstrapProcessingMode())
6323 3054 : CacheInvalidateHeapTuple(relation, tuple, NULL);
6324 3267 : }
6325 :
6326 : #define FRM_NOOP 0x0001
6327 : #define FRM_INVALIDATE_XMAX 0x0002
6328 : #define FRM_RETURN_IS_XID 0x0004
6329 : #define FRM_RETURN_IS_MULTI 0x0008
6330 : #define FRM_MARK_COMMITTED 0x0010
6331 :
6332 : /*
6333 : * FreezeMultiXactId
6334 : * Determine what to do during freezing when a tuple is marked by a
6335 : * MultiXactId.
6336 : *
6337 : * NB -- this might have the side-effect of creating a new MultiXactId!
6338 : *
6339 : * "flags" is an output value; it's used to tell caller what to do on return.
6340 : * Possible flags are:
6341 : * FRM_NOOP
6342 : * don't do anything -- keep existing Xmax
6343 : * FRM_INVALIDATE_XMAX
6344 : * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6345 : * FRM_RETURN_IS_XID
6346 : * The Xid return value is a single update Xid to set as xmax.
6347 : * FRM_MARK_COMMITTED
6348 : * Xmax can be marked as HEAP_XMAX_COMMITTED
6349 : * FRM_RETURN_IS_MULTI
6350 : * The return value is a new MultiXactId to set as new Xmax.
6351 : * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6352 : */
6353 : static TransactionId
6354 0 : FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6355 : TransactionId cutoff_xid, MultiXactId cutoff_multi,
6356 : uint16 *flags)
6357 : {
6358 0 : TransactionId xid = InvalidTransactionId;
6359 : int i;
6360 : MultiXactMember *members;
6361 : int nmembers;
6362 : bool need_replace;
6363 : int nnewmembers;
6364 : MultiXactMember *newmembers;
6365 : bool has_lockers;
6366 : TransactionId update_xid;
6367 : bool update_committed;
6368 :
6369 0 : *flags = 0;
6370 :
6371 : /* We should only be called in Multis */
6372 0 : Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6373 :
6374 0 : if (!MultiXactIdIsValid(multi) ||
6375 0 : HEAP_LOCKED_UPGRADED(t_infomask))
6376 : {
6377 : /* Ensure infomask bits are appropriately set/reset */
6378 0 : *flags |= FRM_INVALIDATE_XMAX;
6379 0 : return InvalidTransactionId;
6380 : }
6381 0 : else if (MultiXactIdPrecedes(multi, cutoff_multi))
6382 : {
6383 : /*
6384 : * This old multi cannot possibly have members still running. If it
6385 : * was a locker only, it can be removed without any further
6386 : * consideration; but if it contained an update, we might need to
6387 : * preserve it.
6388 : */
6389 0 : Assert(!MultiXactIdIsRunning(multi,
6390 : HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)));
6391 0 : if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6392 : {
6393 0 : *flags |= FRM_INVALIDATE_XMAX;
6394 0 : xid = InvalidTransactionId; /* not strictly necessary */
6395 : }
6396 : else
6397 : {
6398 : /* replace multi by update xid */
6399 0 : xid = MultiXactIdGetUpdateXid(multi, t_infomask);
6400 :
6401 : /* wasn't only a lock, xid needs to be valid */
6402 0 : Assert(TransactionIdIsValid(xid));
6403 :
6404 : /*
6405 : * If the xid is older than the cutoff, it has to have aborted,
6406 : * otherwise the tuple would have gotten pruned away.
6407 : */
6408 0 : if (TransactionIdPrecedes(xid, cutoff_xid))
6409 : {
6410 0 : Assert(!TransactionIdDidCommit(xid));
6411 0 : *flags |= FRM_INVALIDATE_XMAX;
6412 0 : xid = InvalidTransactionId; /* not strictly necessary */
6413 : }
6414 : else
6415 : {
6416 0 : *flags |= FRM_RETURN_IS_XID;
6417 : }
6418 : }
6419 :
6420 0 : return xid;
6421 : }
6422 :
6423 : /*
6424 : * This multixact might have or might not have members still running, but
6425 : * we know it's valid and is newer than the cutoff point for multis.
6426 : * However, some member(s) of it may be below the cutoff for Xids, so we
6427 : * need to walk the whole members array to figure out what to do, if
6428 : * anything.
6429 : */
6430 :
6431 0 : nmembers =
6432 0 : GetMultiXactIdMembers(multi, &members, false,
6433 0 : HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6434 0 : if (nmembers <= 0)
6435 : {
6436 : /* Nothing worth keeping */
6437 0 : *flags |= FRM_INVALIDATE_XMAX;
6438 0 : return InvalidTransactionId;
6439 : }
6440 :
6441 : /* is there anything older than the cutoff? */
6442 0 : need_replace = false;
6443 0 : for (i = 0; i < nmembers; i++)
6444 : {
6445 0 : if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6446 : {
6447 0 : need_replace = true;
6448 0 : break;
6449 : }
6450 : }
6451 :
6452 : /*
6453 : * In the simplest case, there is no member older than the cutoff; we can
6454 : * keep the existing MultiXactId as is.
6455 : */
6456 0 : if (!need_replace)
6457 : {
6458 0 : *flags |= FRM_NOOP;
6459 0 : pfree(members);
6460 0 : return InvalidTransactionId;
6461 : }
6462 :
6463 : /*
6464 : * If the multi needs to be updated, figure out which members do we need
6465 : * to keep.
6466 : */
6467 0 : nnewmembers = 0;
6468 0 : newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6469 0 : has_lockers = false;
6470 0 : update_xid = InvalidTransactionId;
6471 0 : update_committed = false;
6472 :
6473 0 : for (i = 0; i < nmembers; i++)
6474 : {
6475 : /*
6476 : * Determine whether to keep this member or ignore it.
6477 : */
6478 0 : if (ISUPDATE_from_mxstatus(members[i].status))
6479 : {
6480 0 : TransactionId xid = members[i].xid;
6481 :
6482 : /*
6483 : * It's an update; should we keep it? If the transaction is known
6484 : * aborted or crashed then it's okay to ignore it, otherwise not.
6485 : * Note that an updater older than cutoff_xid cannot possibly be
6486 : * committed, because HeapTupleSatisfiesVacuum would have returned
6487 : * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
6488 : *
6489 : * As with all tuple visibility routines, it's critical to test
6490 : * TransactionIdIsInProgress before TransactionIdDidCommit,
6491 : * because of race conditions explained in detail in tqual.c.
6492 : */
6493 0 : if (TransactionIdIsCurrentTransactionId(xid) ||
6494 0 : TransactionIdIsInProgress(xid))
6495 : {
6496 0 : Assert(!TransactionIdIsValid(update_xid));
6497 0 : update_xid = xid;
6498 : }
6499 0 : else if (TransactionIdDidCommit(xid))
6500 : {
6501 : /*
6502 : * The transaction committed, so we can tell caller to set
6503 : * HEAP_XMAX_COMMITTED. (We can only do this because we know
6504 : * the transaction is not running.)
6505 : */
6506 0 : Assert(!TransactionIdIsValid(update_xid));
6507 0 : update_committed = true;
6508 0 : update_xid = xid;
6509 : }
6510 :
6511 : /*
6512 : * Not in progress, not committed -- must be aborted or crashed;
6513 : * we can ignore it.
6514 : */
6515 :
6516 : /*
6517 : * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6518 : * update Xid cannot possibly be older than the xid cutoff.
6519 : */
6520 0 : Assert(!TransactionIdIsValid(update_xid) ||
6521 : !TransactionIdPrecedes(update_xid, cutoff_xid));
6522 :
6523 : /*
6524 : * If we determined that it's an Xid corresponding to an update
6525 : * that must be retained, additionally add it to the list of
6526 : * members of the new Multi, in case we end up using that. (We
6527 : * might still decide to use only an update Xid and not a multi,
6528 : * but it's easier to maintain the list as we walk the old members
6529 : * list.)
6530 : */
6531 0 : if (TransactionIdIsValid(update_xid))
6532 0 : newmembers[nnewmembers++] = members[i];
6533 : }
6534 : else
6535 : {
6536 : /* We only keep lockers if they are still running */
6537 0 : if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6538 0 : TransactionIdIsInProgress(members[i].xid))
6539 : {
6540 : /* running locker cannot possibly be older than the cutoff */
6541 0 : Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6542 0 : newmembers[nnewmembers++] = members[i];
6543 0 : has_lockers = true;
6544 : }
6545 : }
6546 : }
6547 :
6548 0 : pfree(members);
6549 :
6550 0 : if (nnewmembers == 0)
6551 : {
6552 : /* nothing worth keeping!? Tell caller to remove the whole thing */
6553 0 : *flags |= FRM_INVALIDATE_XMAX;
6554 0 : xid = InvalidTransactionId;
6555 : }
6556 0 : else if (TransactionIdIsValid(update_xid) && !has_lockers)
6557 : {
6558 : /*
6559 : * If there's a single member and it's an update, pass it back alone
6560 : * without creating a new Multi. (XXX we could do this when there's a
6561 : * single remaining locker, too, but that would complicate the API too
6562 : * much; moreover, the case with the single updater is more
6563 : * interesting, because those are longer-lived.)
6564 : */
6565 0 : Assert(nnewmembers == 1);
6566 0 : *flags |= FRM_RETURN_IS_XID;
6567 0 : if (update_committed)
6568 0 : *flags |= FRM_MARK_COMMITTED;
6569 0 : xid = update_xid;
6570 : }
6571 : else
6572 : {
6573 : /*
6574 : * Create a new multixact with the surviving members of the previous
6575 : * one, to set as new Xmax in the tuple.
6576 : */
6577 0 : xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6578 0 : *flags |= FRM_RETURN_IS_MULTI;
6579 : }
6580 :
6581 0 : pfree(newmembers);
6582 :
6583 0 : return xid;
6584 : }
6585 :
6586 : /*
6587 : * heap_prepare_freeze_tuple
6588 : *
6589 : * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6590 : * are older than the specified cutoff XID and cutoff MultiXactId. If so,
6591 : * setup enough state (in the *frz output argument) to later execute and
6592 : * WAL-log what we would need to do, and return TRUE. Return FALSE if nothing
6593 : * is to be changed. In addition, set *totally_frozen_p to true if the tuple
6594 : * will be totally frozen after these operations are performed and false if
6595 : * more freezing will eventually be required.
6596 : *
6597 : * Caller is responsible for setting the offset field, if appropriate.
6598 : *
6599 : * It is assumed that the caller has checked the tuple with
6600 : * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6601 : * (else we should be removing the tuple, not freezing it).
6602 : *
6603 : * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6604 : * XID older than it could neither be running nor seen as running by any
6605 : * open transaction. This ensures that the replacement will not change
6606 : * anyone's idea of the tuple state.
6607 : * Similarly, cutoff_multi must be less than or equal to the smallest
6608 : * MultiXactId used by any transaction currently open.
6609 : *
6610 : * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6611 : * that buffer.
6612 : *
6613 : * NB: It is not enough to set hint bits to indicate something is
6614 : * committed/invalid -- they might not be set on a standby, or after crash
6615 : * recovery. We really need to remove old xids.
6616 : */
6617 : bool
6618 404745 : heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
6619 : TransactionId cutoff_multi,
6620 : xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6621 : {
6622 404745 : bool changed = false;
6623 404745 : bool freeze_xmax = false;
6624 : TransactionId xid;
6625 404745 : bool totally_frozen = true;
6626 :
6627 404745 : frz->frzflags = 0;
6628 404745 : frz->t_infomask2 = tuple->t_infomask2;
6629 404745 : frz->t_infomask = tuple->t_infomask;
6630 404745 : frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6631 :
6632 : /* Process xmin */
6633 404745 : xid = HeapTupleHeaderGetXmin(tuple);
6634 404745 : if (TransactionIdIsNormal(xid))
6635 : {
6636 379721 : if (TransactionIdPrecedes(xid, cutoff_xid))
6637 : {
6638 27646 : frz->t_infomask |= HEAP_XMIN_FROZEN;
6639 27646 : changed = true;
6640 : }
6641 : else
6642 352075 : totally_frozen = false;
6643 : }
6644 :
6645 : /*
6646 : * Process xmax. To thoroughly examine the current Xmax value we need to
6647 : * resolve a MultiXactId to its member Xids, in case some of them are
6648 : * below the given cutoff for Xids. In that case, those values might need
6649 : * freezing, too. Also, if a multi needs freezing, we cannot simply take
6650 : * it out --- if there's a live updater Xid, it needs to be kept.
6651 : *
6652 : * Make sure to keep heap_tuple_needs_freeze in sync with this.
6653 : */
6654 404745 : xid = HeapTupleHeaderGetRawXmax(tuple);
6655 :
6656 404745 : if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6657 : {
6658 : TransactionId newxmax;
6659 : uint16 flags;
6660 :
6661 0 : newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6662 : cutoff_xid, cutoff_multi, &flags);
6663 :
6664 0 : if (flags & FRM_INVALIDATE_XMAX)
6665 0 : freeze_xmax = true;
6666 0 : else if (flags & FRM_RETURN_IS_XID)
6667 : {
6668 : /*
6669 : * NB -- some of these transformations are only valid because we
6670 : * know the return Xid is a tuple updater (i.e. not merely a
6671 : * locker.) Also note that the only reason we don't explicitly
6672 : * worry about HEAP_KEYS_UPDATED is because it lives in
6673 : * t_infomask2 rather than t_infomask.
6674 : */
6675 0 : frz->t_infomask &= ~HEAP_XMAX_BITS;
6676 0 : frz->xmax = newxmax;
6677 0 : if (flags & FRM_MARK_COMMITTED)
6678 0 : frz->t_infomask |= HEAP_XMAX_COMMITTED;
6679 0 : changed = true;
6680 0 : totally_frozen = false;
6681 : }
6682 0 : else if (flags & FRM_RETURN_IS_MULTI)
6683 : {
6684 : uint16 newbits;
6685 : uint16 newbits2;
6686 :
6687 : /*
6688 : * We can't use GetMultiXactIdHintBits directly on the new multi
6689 : * here; that routine initializes the masks to all zeroes, which
6690 : * would lose other bits we need. Doing it this way ensures all
6691 : * unrelated bits remain untouched.
6692 : */
6693 0 : frz->t_infomask &= ~HEAP_XMAX_BITS;
6694 0 : frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6695 0 : GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6696 0 : frz->t_infomask |= newbits;
6697 0 : frz->t_infomask2 |= newbits2;
6698 :
6699 0 : frz->xmax = newxmax;
6700 :
6701 0 : changed = true;
6702 0 : totally_frozen = false;
6703 : }
6704 : else
6705 : {
6706 0 : Assert(flags & FRM_NOOP);
6707 : }
6708 : }
6709 404745 : else if (TransactionIdIsNormal(xid))
6710 : {
6711 38869 : if (TransactionIdPrecedes(xid, cutoff_xid))
6712 4 : freeze_xmax = true;
6713 : else
6714 38865 : totally_frozen = false;
6715 : }
6716 :
6717 404745 : if (freeze_xmax)
6718 : {
6719 4 : frz->xmax = InvalidTransactionId;
6720 :
6721 : /*
6722 : * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6723 : * LOCKED. Normalize to INVALID just to be sure no one gets confused.
6724 : * Also get rid of the HEAP_KEYS_UPDATED bit.
6725 : */
6726 4 : frz->t_infomask &= ~HEAP_XMAX_BITS;
6727 4 : frz->t_infomask |= HEAP_XMAX_INVALID;
6728 4 : frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6729 4 : frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6730 4 : changed = true;
6731 : }
6732 :
6733 : /*
6734 : * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6735 : * we support having MOVED_OFF/MOVED_IN tuples in the database.
6736 : */
6737 404745 : if (tuple->t_infomask & HEAP_MOVED)
6738 : {
6739 0 : xid = HeapTupleHeaderGetXvac(tuple);
6740 :
6741 : /*
6742 : * For Xvac, we ignore the cutoff_xid and just always perform the
6743 : * freeze operation. The oldest release in which such a value can
6744 : * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6745 : * was removed in PostgreSQL 9.0. Note that if we were to respect
6746 : * cutoff_xid here, we'd need to make surely to clear totally_frozen
6747 : * when we skipped freezing on that basis.
6748 : */
6749 0 : if (TransactionIdIsNormal(xid))
6750 : {
6751 : /*
6752 : * If a MOVED_OFF tuple is not dead, the xvac transaction must
6753 : * have failed; whereas a non-dead MOVED_IN tuple must mean the
6754 : * xvac transaction succeeded.
6755 : */
6756 0 : if (tuple->t_infomask & HEAP_MOVED_OFF)
6757 0 : frz->frzflags |= XLH_INVALID_XVAC;
6758 : else
6759 0 : frz->frzflags |= XLH_FREEZE_XVAC;
6760 :
6761 : /*
6762 : * Might as well fix the hint bits too; usually XMIN_COMMITTED
6763 : * will already be set here, but there's a small chance not.
6764 : */
6765 0 : Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6766 0 : frz->t_infomask |= HEAP_XMIN_COMMITTED;
6767 0 : changed = true;
6768 : }
6769 : }
6770 :
6771 404745 : *totally_frozen_p = totally_frozen;
6772 404745 : return changed;
6773 : }
6774 :
6775 : /*
6776 : * heap_execute_freeze_tuple
6777 : * Execute the prepared freezing of a tuple.
6778 : *
6779 : * Caller is responsible for ensuring that no other backend can access the
6780 : * storage underlying this tuple, either by holding an exclusive lock on the
6781 : * buffer containing it (which is what lazy VACUUM does), or by having it be
6782 : * in private storage (which is what CLUSTER and friends do).
6783 : *
6784 : * Note: it might seem we could make the changes without exclusive lock, since
6785 : * TransactionId read/write is assumed atomic anyway. However there is a race
6786 : * condition: someone who just fetched an old XID that we overwrite here could
6787 : * conceivably not finish checking the XID against pg_xact before we finish
6788 : * the VACUUM and perhaps truncate off the part of pg_xact he needs. Getting
6789 : * exclusive lock ensures no other backend is in process of checking the
6790 : * tuple status. Also, getting exclusive lock makes it safe to adjust the
6791 : * infomask bits.
6792 : *
6793 : * NB: All code in here must be safe to execute during crash recovery!
6794 : */
6795 : void
6796 27646 : heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6797 : {
6798 27646 : HeapTupleHeaderSetXmax(tuple, frz->xmax);
6799 :
6800 27646 : if (frz->frzflags & XLH_FREEZE_XVAC)
6801 0 : HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6802 :
6803 27646 : if (frz->frzflags & XLH_INVALID_XVAC)
6804 0 : HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6805 :
6806 27646 : tuple->t_infomask = frz->t_infomask;
6807 27646 : tuple->t_infomask2 = frz->t_infomask2;
6808 27646 : }
6809 :
6810 : /*
6811 : * heap_freeze_tuple
6812 : * Freeze tuple in place, without WAL logging.
6813 : *
6814 : * Useful for callers like CLUSTER that perform their own WAL logging.
6815 : */
6816 : bool
6817 22761 : heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
6818 : TransactionId cutoff_multi)
6819 : {
6820 : xl_heap_freeze_tuple frz;
6821 : bool do_freeze;
6822 : bool tuple_totally_frozen;
6823 :
6824 22761 : do_freeze = heap_prepare_freeze_tuple(tuple, cutoff_xid, cutoff_multi,
6825 : &frz, &tuple_totally_frozen);
6826 :
6827 : /*
6828 : * Note that because this is not a WAL-logged operation, we don't need to
6829 : * fill in the offset in the freeze record.
6830 : */
6831 :
6832 22761 : if (do_freeze)
6833 12355 : heap_execute_freeze_tuple(tuple, &frz);
6834 22761 : return do_freeze;
6835 : }
6836 :
6837 : /*
6838 : * For a given MultiXactId, return the hint bits that should be set in the
6839 : * tuple's infomask.
6840 : *
6841 : * Normally this should be called for a multixact that was just created, and
6842 : * so is on our local cache, so the GetMembers call is fast.
6843 : */
6844 : static void
6845 2 : GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6846 : uint16 *new_infomask2)
6847 : {
6848 : int nmembers;
6849 : MultiXactMember *members;
6850 : int i;
6851 2 : uint16 bits = HEAP_XMAX_IS_MULTI;
6852 2 : uint16 bits2 = 0;
6853 2 : bool has_update = false;
6854 2 : LockTupleMode strongest = LockTupleKeyShare;
6855 :
6856 : /*
6857 : * We only use this in multis we just created, so they cannot be values
6858 : * pre-pg_upgrade.
6859 : */
6860 2 : nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6861 :
6862 6 : for (i = 0; i < nmembers; i++)
6863 : {
6864 : LockTupleMode mode;
6865 :
6866 : /*
6867 : * Remember the strongest lock mode held by any member of the
6868 : * multixact.
6869 : */
6870 4 : mode = TUPLOCK_from_mxstatus(members[i].status);
6871 4 : if (mode > strongest)
6872 2 : strongest = mode;
6873 :
6874 : /* See what other bits we need */
6875 4 : switch (members[i].status)
6876 : {
6877 : case MultiXactStatusForKeyShare:
6878 : case MultiXactStatusForShare:
6879 : case MultiXactStatusForNoKeyUpdate:
6880 2 : break;
6881 :
6882 : case MultiXactStatusForUpdate:
6883 1 : bits2 |= HEAP_KEYS_UPDATED;
6884 1 : break;
6885 :
6886 : case MultiXactStatusNoKeyUpdate:
6887 1 : has_update = true;
6888 1 : break;
6889 :
6890 : case MultiXactStatusUpdate:
6891 0 : bits2 |= HEAP_KEYS_UPDATED;
6892 0 : has_update = true;
6893 0 : break;
6894 : }
6895 : }
6896 :
6897 2 : if (strongest == LockTupleExclusive ||
6898 : strongest == LockTupleNoKeyExclusive)
6899 2 : bits |= HEAP_XMAX_EXCL_LOCK;
6900 0 : else if (strongest == LockTupleShare)
6901 0 : bits |= HEAP_XMAX_SHR_LOCK;
6902 0 : else if (strongest == LockTupleKeyShare)
6903 0 : bits |= HEAP_XMAX_KEYSHR_LOCK;
6904 :
6905 2 : if (!has_update)
6906 1 : bits |= HEAP_XMAX_LOCK_ONLY;
6907 :
6908 2 : if (nmembers > 0)
6909 2 : pfree(members);
6910 :
6911 2 : *new_infomask = bits;
6912 2 : *new_infomask2 = bits2;
6913 2 : }
6914 :
6915 : /*
6916 : * MultiXactIdGetUpdateXid
6917 : *
6918 : * Given a multixact Xmax and corresponding infomask, which does not have the
6919 : * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6920 : * transaction.
6921 : *
6922 : * Caller is expected to check the status of the updating transaction, if
6923 : * necessary.
6924 : */
6925 : static TransactionId
6926 3 : MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6927 : {
6928 3 : TransactionId update_xact = InvalidTransactionId;
6929 : MultiXactMember *members;
6930 : int nmembers;
6931 :
6932 3 : Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6933 3 : Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6934 :
6935 : /*
6936 : * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6937 : * pre-pg_upgrade.
6938 : */
6939 3 : nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6940 :
6941 3 : if (nmembers > 0)
6942 : {
6943 : int i;
6944 :
6945 9 : for (i = 0; i < nmembers; i++)
6946 : {
6947 : /* Ignore lockers */
6948 6 : if (!ISUPDATE_from_mxstatus(members[i].status))
6949 3 : continue;
6950 :
6951 : /* there can be at most one updater */
6952 3 : Assert(update_xact == InvalidTransactionId);
6953 3 : update_xact = members[i].xid;
6954 : #ifndef USE_ASSERT_CHECKING
6955 :
6956 : /*
6957 : * in an assert-enabled build, walk the whole array to ensure
6958 : * there's no other updater.
6959 : */
6960 : break;
6961 : #endif
6962 : }
6963 :
6964 3 : pfree(members);
6965 : }
6966 :
6967 3 : return update_xact;
6968 : }
6969 :
6970 : /*
6971 : * HeapTupleGetUpdateXid
6972 : * As above, but use a HeapTupleHeader
6973 : *
6974 : * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6975 : * checking the hint bits.
6976 : */
6977 : TransactionId
6978 3 : HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6979 : {
6980 3 : return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6981 3 : tuple->t_infomask);
6982 : }
6983 :
6984 : /*
6985 : * Does the given multixact conflict with the current transaction grabbing a
6986 : * tuple lock of the given strength?
6987 : *
6988 : * The passed infomask pairs up with the given multixact in the tuple header.
6989 : */
6990 : static bool
6991 0 : DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6992 : LockTupleMode lockmode)
6993 : {
6994 : int nmembers;
6995 : MultiXactMember *members;
6996 0 : bool result = false;
6997 0 : LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
6998 :
6999 0 : if (HEAP_LOCKED_UPGRADED(infomask))
7000 0 : return false;
7001 :
7002 0 : nmembers = GetMultiXactIdMembers(multi, &members, false,
7003 0 : HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7004 0 : if (nmembers >= 0)
7005 : {
7006 : int i;
7007 :
7008 0 : for (i = 0; i < nmembers; i++)
7009 : {
7010 : TransactionId memxid;
7011 : LOCKMODE memlockmode;
7012 :
7013 0 : memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7014 :
7015 : /* ignore members that don't conflict with the lock we want */
7016 0 : if (!DoLockModesConflict(memlockmode, wanted))
7017 0 : continue;
7018 :
7019 : /* ignore members from current xact */
7020 0 : memxid = members[i].xid;
7021 0 : if (TransactionIdIsCurrentTransactionId(memxid))
7022 0 : continue;
7023 :
7024 0 : if (ISUPDATE_from_mxstatus(members[i].status))
7025 : {
7026 : /* ignore aborted updaters */
7027 0 : if (TransactionIdDidAbort(memxid))
7028 0 : continue;
7029 : }
7030 : else
7031 : {
7032 : /* ignore lockers-only that are no longer in progress */
7033 0 : if (!TransactionIdIsInProgress(memxid))
7034 0 : continue;
7035 : }
7036 :
7037 : /*
7038 : * Whatever remains are either live lockers that conflict with our
7039 : * wanted lock, and updaters that are not aborted. Those conflict
7040 : * with what we want, so return true.
7041 : */
7042 0 : result = true;
7043 0 : break;
7044 : }
7045 0 : pfree(members);
7046 : }
7047 :
7048 0 : return result;
7049 : }
7050 :
7051 : /*
7052 : * Do_MultiXactIdWait
7053 : * Actual implementation for the two functions below.
7054 : *
7055 : * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7056 : * needed to ensure we only sleep on conflicting members, and the infomask is
7057 : * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7058 : * indicates whether to use conditional lock acquisition, to allow callers to
7059 : * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7060 : * context information for error messages. 'remaining', if not NULL, receives
7061 : * the number of members that are still running, including any (non-aborted)
7062 : * subtransactions of our own transaction.
7063 : *
7064 : * We do this by sleeping on each member using XactLockTableWait. Any
7065 : * members that belong to the current backend are *not* waited for, however;
7066 : * this would not merely be useless but would lead to Assert failure inside
7067 : * XactLockTableWait. By the time this returns, it is certain that all
7068 : * transactions *of other backends* that were members of the MultiXactId
7069 : * that conflict with the requested status are dead (and no new ones can have
7070 : * been added, since it is not legal to add members to an existing
7071 : * MultiXactId).
7072 : *
7073 : * But by the time we finish sleeping, someone else may have changed the Xmax
7074 : * of the containing tuple, so the caller needs to iterate on us somehow.
7075 : *
7076 : * Note that in case we return false, the number of remaining members is
7077 : * not to be trusted.
7078 : */
7079 : static bool
7080 1 : Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7081 : uint16 infomask, bool nowait,
7082 : Relation rel, ItemPointer ctid, XLTW_Oper oper,
7083 : int *remaining)
7084 : {
7085 1 : bool result = true;
7086 : MultiXactMember *members;
7087 : int nmembers;
7088 1 : int remain = 0;
7089 :
7090 : /* for pre-pg_upgrade tuples, no need to sleep at all */
7091 2 : nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7092 1 : GetMultiXactIdMembers(multi, &members, false,
7093 1 : HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7094 :
7095 1 : if (nmembers >= 0)
7096 : {
7097 : int i;
7098 :
7099 3 : for (i = 0; i < nmembers; i++)
7100 : {
7101 2 : TransactionId memxid = members[i].xid;
7102 2 : MultiXactStatus memstatus = members[i].status;
7103 :
7104 2 : if (TransactionIdIsCurrentTransactionId(memxid))
7105 : {
7106 1 : remain++;
7107 1 : continue;
7108 : }
7109 :
7110 1 : if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
7111 1 : LOCKMODE_from_mxstatus(status)))
7112 : {
7113 0 : if (remaining && TransactionIdIsInProgress(memxid))
7114 0 : remain++;
7115 0 : continue;
7116 : }
7117 :
7118 : /*
7119 : * This member conflicts with our multi, so we have to sleep (or
7120 : * return failure, if asked to avoid waiting.)
7121 : *
7122 : * Note that we don't set up an error context callback ourselves,
7123 : * but instead we pass the info down to XactLockTableWait. This
7124 : * might seem a bit wasteful because the context is set up and
7125 : * tore down for each member of the multixact, but in reality it
7126 : * should be barely noticeable, and it avoids duplicate code.
7127 : */
7128 1 : if (nowait)
7129 : {
7130 0 : result = ConditionalXactLockTableWait(memxid);
7131 0 : if (!result)
7132 0 : break;
7133 : }
7134 : else
7135 1 : XactLockTableWait(memxid, rel, ctid, oper);
7136 : }
7137 :
7138 1 : pfree(members);
7139 : }
7140 :
7141 1 : if (remaining)
7142 0 : *remaining = remain;
7143 :
7144 1 : return result;
7145 : }
7146 :
7147 : /*
7148 : * MultiXactIdWait
7149 : * Sleep on a MultiXactId.
7150 : *
7151 : * By the time we finish sleeping, someone else may have changed the Xmax
7152 : * of the containing tuple, so the caller needs to iterate on us somehow.
7153 : *
7154 : * We return (in *remaining, if not NULL) the number of members that are still
7155 : * running, including any (non-aborted) subtransactions of our own transaction.
7156 : */
7157 : static void
7158 1 : MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
7159 : Relation rel, ItemPointer ctid, XLTW_Oper oper,
7160 : int *remaining)
7161 : {
7162 1 : (void) Do_MultiXactIdWait(multi, status, infomask, false,
7163 : rel, ctid, oper, remaining);
7164 1 : }
7165 :
7166 : /*
7167 : * ConditionalMultiXactIdWait
7168 : * As above, but only lock if we can get the lock without blocking.
7169 : *
7170 : * By the time we finish sleeping, someone else may have changed the Xmax
7171 : * of the containing tuple, so the caller needs to iterate on us somehow.
7172 : *
7173 : * If the multixact is now all gone, return true. Returns false if some
7174 : * transactions might still be running.
7175 : *
7176 : * We return (in *remaining, if not NULL) the number of members that are still
7177 : * running, including any (non-aborted) subtransactions of our own transaction.
7178 : */
7179 : static bool
7180 0 : ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7181 : uint16 infomask, Relation rel, int *remaining)
7182 : {
7183 0 : return Do_MultiXactIdWait(multi, status, infomask, true,
7184 : rel, NULL, XLTW_None, remaining);
7185 : }
7186 :
7187 : /*
7188 : * heap_tuple_needs_eventual_freeze
7189 : *
7190 : * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7191 : * will eventually require freezing. Similar to heap_tuple_needs_freeze,
7192 : * but there's no cutoff, since we're trying to figure out whether freezing
7193 : * will ever be needed, not whether it's needed now.
7194 : */
7195 : bool
7196 3397 : heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7197 : {
7198 : TransactionId xid;
7199 :
7200 : /*
7201 : * If xmin is a normal transaction ID, this tuple is definitely not
7202 : * frozen.
7203 : */
7204 3397 : xid = HeapTupleHeaderGetXmin(tuple);
7205 3397 : if (TransactionIdIsNormal(xid))
7206 464 : return true;
7207 :
7208 : /*
7209 : * If xmax is a valid xact or multixact, this tuple is also not frozen.
7210 : */
7211 2933 : if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7212 : {
7213 : MultiXactId multi;
7214 :
7215 0 : multi = HeapTupleHeaderGetRawXmax(tuple);
7216 0 : if (MultiXactIdIsValid(multi))
7217 0 : return true;
7218 : }
7219 : else
7220 : {
7221 2933 : xid = HeapTupleHeaderGetRawXmax(tuple);
7222 2933 : if (TransactionIdIsNormal(xid))
7223 0 : return true;
7224 : }
7225 :
7226 2933 : if (tuple->t_infomask & HEAP_MOVED)
7227 : {
7228 0 : xid = HeapTupleHeaderGetXvac(tuple);
7229 0 : if (TransactionIdIsNormal(xid))
7230 0 : return true;
7231 : }
7232 :
7233 2933 : return false;
7234 : }
7235 :
7236 : /*
7237 : * heap_tuple_needs_freeze
7238 : *
7239 : * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7240 : * are older than the specified cutoff XID or MultiXactId. If so, return TRUE.
7241 : *
7242 : * It doesn't matter whether the tuple is alive or dead, we are checking
7243 : * to see if a tuple needs to be removed or frozen to avoid wraparound.
7244 : *
7245 : * NB: Cannot rely on hint bits here, they might not be set after a crash or
7246 : * on a standby.
7247 : */
7248 : bool
7249 0 : heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
7250 : MultiXactId cutoff_multi, Buffer buf)
7251 : {
7252 : TransactionId xid;
7253 :
7254 0 : xid = HeapTupleHeaderGetXmin(tuple);
7255 0 : if (TransactionIdIsNormal(xid) &&
7256 0 : TransactionIdPrecedes(xid, cutoff_xid))
7257 0 : return true;
7258 :
7259 : /*
7260 : * The considerations for multixacts are complicated; look at
7261 : * heap_prepare_freeze_tuple for justifications. This routine had better
7262 : * be in sync with that one!
7263 : */
7264 0 : if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7265 : {
7266 : MultiXactId multi;
7267 :
7268 0 : multi = HeapTupleHeaderGetRawXmax(tuple);
7269 0 : if (!MultiXactIdIsValid(multi))
7270 : {
7271 : /* no xmax set, ignore */
7272 : ;
7273 : }
7274 0 : else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7275 0 : return true;
7276 0 : else if (MultiXactIdPrecedes(multi, cutoff_multi))
7277 0 : return true;
7278 : else
7279 : {
7280 : MultiXactMember *members;
7281 : int nmembers;
7282 : int i;
7283 :
7284 : /* need to check whether any member of the mxact is too old */
7285 :
7286 0 : nmembers = GetMultiXactIdMembers(multi, &members, false,
7287 0 : HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
7288 :
7289 0 : for (i = 0; i < nmembers; i++)
7290 : {
7291 0 : if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
7292 : {
7293 0 : pfree(members);
7294 0 : return true;
7295 : }
7296 : }
7297 0 : if (nmembers > 0)
7298 0 : pfree(members);
7299 : }
7300 : }
7301 : else
7302 : {
7303 0 : xid = HeapTupleHeaderGetRawXmax(tuple);
7304 0 : if (TransactionIdIsNormal(xid) &&
7305 0 : TransactionIdPrecedes(xid, cutoff_xid))
7306 0 : return true;
7307 : }
7308 :
7309 0 : if (tuple->t_infomask & HEAP_MOVED)
7310 : {
7311 0 : xid = HeapTupleHeaderGetXvac(tuple);
7312 0 : if (TransactionIdIsNormal(xid) &&
7313 0 : TransactionIdPrecedes(xid, cutoff_xid))
7314 0 : return true;
7315 : }
7316 :
7317 0 : return false;
7318 : }
7319 :
7320 : /*
7321 : * If 'tuple' contains any visible XID greater than latestRemovedXid,
7322 : * ratchet forwards latestRemovedXid to the greatest one found.
7323 : * This is used as the basis for generating Hot Standby conflicts, so
7324 : * if a tuple was never visible then removing it should not conflict
7325 : * with queries.
7326 : */
7327 : void
7328 107391 : HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
7329 : TransactionId *latestRemovedXid)
7330 : {
7331 107391 : TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7332 107391 : TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
7333 107391 : TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7334 :
7335 107391 : if (tuple->t_infomask & HEAP_MOVED)
7336 : {
7337 0 : if (TransactionIdPrecedes(*latestRemovedXid, xvac))
7338 0 : *latestRemovedXid = xvac;
7339 : }
7340 :
7341 : /*
7342 : * Ignore tuples inserted by an aborted transaction or if the tuple was
7343 : * updated/deleted by the inserting transaction.
7344 : *
7345 : * Look for a committed hint bit, or if no xmin bit is set, check clog.
7346 : * This needs to work on both master and standby, where it is used to
7347 : * assess btree delete records.
7348 : */
7349 110805 : if (HeapTupleHeaderXminCommitted(tuple) ||
7350 3442 : (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
7351 : {
7352 203734 : if (xmax != xmin &&
7353 99729 : TransactionIdFollows(xmax, *latestRemovedXid))
7354 5474 : *latestRemovedXid = xmax;
7355 : }
7356 :
7357 : /* *latestRemovedXid may still be invalid at end */
7358 107391 : }
7359 :
7360 : /*
7361 : * Perform XLogInsert to register a heap cleanup info message. These
7362 : * messages are sent once per VACUUM and are required because
7363 : * of the phasing of removal operations during a lazy VACUUM.
7364 : * see comments for vacuum_log_cleanup_info().
7365 : */
7366 : XLogRecPtr
7367 56 : log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7368 : {
7369 : xl_heap_cleanup_info xlrec;
7370 : XLogRecPtr recptr;
7371 :
7372 56 : xlrec.node = rnode;
7373 56 : xlrec.latestRemovedXid = latestRemovedXid;
7374 :
7375 56 : XLogBeginInsert();
7376 56 : XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7377 :
7378 56 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7379 :
7380 56 : return recptr;
7381 : }
7382 :
7383 : /*
7384 : * Perform XLogInsert for a heap-clean operation. Caller must already
7385 : * have modified the buffer and marked it dirty.
7386 : *
7387 : * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7388 : * zero-based tuple indexes. Now they are one-based like other uses
7389 : * of OffsetNumber.
7390 : *
7391 : * We also include latestRemovedXid, which is the greatest XID present in
7392 : * the removed tuples. That allows recovery processing to cancel or wait
7393 : * for long standby queries that can still see these tuples.
7394 : */
7395 : XLogRecPtr
7396 4150 : log_heap_clean(Relation reln, Buffer buffer,
7397 : OffsetNumber *redirected, int nredirected,
7398 : OffsetNumber *nowdead, int ndead,
7399 : OffsetNumber *nowunused, int nunused,
7400 : TransactionId latestRemovedXid)
7401 : {
7402 : xl_heap_clean xlrec;
7403 : XLogRecPtr recptr;
7404 :
7405 : /* Caller should not call me on a non-WAL-logged relation */
7406 4150 : Assert(RelationNeedsWAL(reln));
7407 :
7408 4150 : xlrec.latestRemovedXid = latestRemovedXid;
7409 4150 : xlrec.nredirected = nredirected;
7410 4150 : xlrec.ndead = ndead;
7411 :
7412 4150 : XLogBeginInsert();
7413 4150 : XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7414 :
7415 4150 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7416 :
7417 : /*
7418 : * The OffsetNumber arrays are not actually in the buffer, but we pretend
7419 : * that they are. When XLogInsert stores the whole buffer, the offset
7420 : * arrays need not be stored too. Note that even if all three arrays are
7421 : * empty, we want to expose the buffer as a candidate for whole-page
7422 : * storage, since this record type implies a defragmentation operation
7423 : * even if no item pointers changed state.
7424 : */
7425 4150 : if (nredirected > 0)
7426 509 : XLogRegisterBufData(0, (char *) redirected,
7427 509 : nredirected * sizeof(OffsetNumber) * 2);
7428 :
7429 4150 : if (ndead > 0)
7430 2797 : XLogRegisterBufData(0, (char *) nowdead,
7431 2797 : ndead * sizeof(OffsetNumber));
7432 :
7433 4150 : if (nunused > 0)
7434 1758 : XLogRegisterBufData(0, (char *) nowunused,
7435 1758 : nunused * sizeof(OffsetNumber));
7436 :
7437 4150 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7438 :
7439 4150 : return recptr;
7440 : }
7441 :
7442 : /*
7443 : * Perform XLogInsert for a heap-freeze operation. Caller must have already
7444 : * modified the buffer and marked it dirty.
7445 : */
7446 : XLogRecPtr
7447 250 : log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7448 : xl_heap_freeze_tuple *tuples, int ntuples)
7449 : {
7450 : xl_heap_freeze_page xlrec;
7451 : XLogRecPtr recptr;
7452 :
7453 : /* Caller should not call me on a non-WAL-logged relation */
7454 250 : Assert(RelationNeedsWAL(reln));
7455 : /* nor when there are no tuples to freeze */
7456 250 : Assert(ntuples > 0);
7457 :
7458 250 : xlrec.cutoff_xid = cutoff_xid;
7459 250 : xlrec.ntuples = ntuples;
7460 :
7461 250 : XLogBeginInsert();
7462 250 : XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7463 :
7464 : /*
7465 : * The freeze plan array is not actually in the buffer, but pretend that
7466 : * it is. When XLogInsert stores the whole buffer, the freeze plan need
7467 : * not be stored too.
7468 : */
7469 250 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7470 250 : XLogRegisterBufData(0, (char *) tuples,
7471 250 : ntuples * sizeof(xl_heap_freeze_tuple));
7472 :
7473 250 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7474 :
7475 250 : return recptr;
7476 : }
7477 :
7478 : /*
7479 : * Perform XLogInsert for a heap-visible operation. 'block' is the block
7480 : * being marked all-visible, and vm_buffer is the buffer containing the
7481 : * corresponding visibility map block. Both should have already been modified
7482 : * and dirtied.
7483 : *
7484 : * If checksums are enabled, we also generate a full-page image of
7485 : * heap_buffer, if necessary.
7486 : */
7487 : XLogRecPtr
7488 4025 : log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7489 : TransactionId cutoff_xid, uint8 vmflags)
7490 : {
7491 : xl_heap_visible xlrec;
7492 : XLogRecPtr recptr;
7493 : uint8 flags;
7494 :
7495 4025 : Assert(BufferIsValid(heap_buffer));
7496 4025 : Assert(BufferIsValid(vm_buffer));
7497 :
7498 4025 : xlrec.cutoff_xid = cutoff_xid;
7499 4025 : xlrec.flags = vmflags;
7500 4025 : XLogBeginInsert();
7501 4025 : XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7502 :
7503 4025 : XLogRegisterBuffer(0, vm_buffer, 0);
7504 :
7505 4025 : flags = REGBUF_STANDARD;
7506 4025 : if (!XLogHintBitIsNeeded())
7507 4025 : flags |= REGBUF_NO_IMAGE;
7508 4025 : XLogRegisterBuffer(1, heap_buffer, flags);
7509 :
7510 4025 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7511 :
7512 4025 : return recptr;
7513 : }
7514 :
7515 : /*
7516 : * Perform XLogInsert for a heap-update operation. Caller must already
7517 : * have modified the buffer(s) and marked them dirty.
7518 : */
7519 : static XLogRecPtr
7520 9067 : log_heap_update(Relation reln, Buffer oldbuf,
7521 : Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7522 : HeapTuple old_key_tuple,
7523 : bool all_visible_cleared, bool new_all_visible_cleared)
7524 : {
7525 : xl_heap_update xlrec;
7526 : xl_heap_header xlhdr;
7527 : xl_heap_header xlhdr_idx;
7528 : uint8 info;
7529 : uint16 prefix_suffix[2];
7530 9067 : uint16 prefixlen = 0,
7531 9067 : suffixlen = 0;
7532 : XLogRecPtr recptr;
7533 9067 : Page page = BufferGetPage(newbuf);
7534 9067 : bool need_tuple_data = RelationIsLogicallyLogged(reln);
7535 : bool init;
7536 : int bufflags;
7537 :
7538 : /* Caller should not call me on a non-WAL-logged relation */
7539 9067 : Assert(RelationNeedsWAL(reln));
7540 :
7541 9067 : XLogBeginInsert();
7542 :
7543 9067 : if (HeapTupleIsHeapOnly(newtup))
7544 4917 : info = XLOG_HEAP_HOT_UPDATE;
7545 : else
7546 4150 : info = XLOG_HEAP_UPDATE;
7547 :
7548 : /*
7549 : * If the old and new tuple are on the same page, we only need to log the
7550 : * parts of the new tuple that were changed. That saves on the amount of
7551 : * WAL we need to write. Currently, we just count any unchanged bytes in
7552 : * the beginning and end of the tuple. That's quick to check, and
7553 : * perfectly covers the common case that only one field is updated.
7554 : *
7555 : * We could do this even if the old and new tuple are on different pages,
7556 : * but only if we don't make a full-page image of the old page, which is
7557 : * difficult to know in advance. Also, if the old tuple is corrupt for
7558 : * some reason, it would allow the corruption to propagate the new page,
7559 : * so it seems best to avoid. Under the general assumption that most
7560 : * updates tend to create the new tuple version on the same page, there
7561 : * isn't much to be gained by doing this across pages anyway.
7562 : *
7563 : * Skip this if we're taking a full-page image of the new page, as we
7564 : * don't include the new tuple in the WAL record in that case. Also
7565 : * disable if wal_level='logical', as logical decoding needs to be able to
7566 : * read the new tuple in whole from the WAL record alone.
7567 : */
7568 15289 : if (oldbuf == newbuf && !need_tuple_data &&
7569 6222 : !XLogCheckBufferNeedsBackup(newbuf))
7570 : {
7571 6217 : char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7572 6217 : char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7573 6217 : int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7574 6217 : int newlen = newtup->t_len - newtup->t_data->t_hoff;
7575 :
7576 : /* Check for common prefix between old and new tuple */
7577 569136 : for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7578 : {
7579 568086 : if (newp[prefixlen] != oldp[prefixlen])
7580 5167 : break;
7581 : }
7582 :
7583 : /*
7584 : * Storing the length of the prefix takes 2 bytes, so we need to save
7585 : * at least 3 bytes or there's no point.
7586 : */
7587 6217 : if (prefixlen < 3)
7588 275 : prefixlen = 0;
7589 :
7590 : /* Same for suffix */
7591 276985 : for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7592 : {
7593 275909 : if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7594 5141 : break;
7595 : }
7596 6217 : if (suffixlen < 3)
7597 1463 : suffixlen = 0;
7598 : }
7599 :
7600 : /* Prepare main WAL data chain */
7601 9067 : xlrec.flags = 0;
7602 9067 : if (all_visible_cleared)
7603 167 : xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7604 9067 : if (new_all_visible_cleared)
7605 51 : xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7606 9067 : if (prefixlen > 0)
7607 5942 : xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7608 9067 : if (suffixlen > 0)
7609 4754 : xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7610 9067 : if (need_tuple_data)
7611 : {
7612 0 : xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7613 0 : if (old_key_tuple)
7614 : {
7615 0 : if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7616 0 : xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7617 : else
7618 0 : xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7619 : }
7620 : }
7621 :
7622 : /* If new tuple is the single and first tuple on page... */
7623 9210 : if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7624 286 : PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7625 : {
7626 100 : info |= XLOG_HEAP_INIT_PAGE;
7627 100 : init = true;
7628 : }
7629 : else
7630 8967 : init = false;
7631 :
7632 : /* Prepare WAL data for the old page */
7633 9067 : xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7634 9067 : xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7635 9067 : xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7636 9067 : oldtup->t_data->t_infomask2);
7637 :
7638 : /* Prepare WAL data for the new page */
7639 9067 : xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7640 9067 : xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7641 :
7642 9067 : bufflags = REGBUF_STANDARD;
7643 9067 : if (init)
7644 100 : bufflags |= REGBUF_WILL_INIT;
7645 9067 : if (need_tuple_data)
7646 0 : bufflags |= REGBUF_KEEP_DATA;
7647 :
7648 9067 : XLogRegisterBuffer(0, newbuf, bufflags);
7649 9067 : if (oldbuf != newbuf)
7650 2845 : XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7651 :
7652 9067 : XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7653 :
7654 : /*
7655 : * Prepare WAL data for the new tuple.
7656 : */
7657 9067 : if (prefixlen > 0 || suffixlen > 0)
7658 : {
7659 6158 : if (prefixlen > 0 && suffixlen > 0)
7660 : {
7661 4538 : prefix_suffix[0] = prefixlen;
7662 4538 : prefix_suffix[1] = suffixlen;
7663 4538 : XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7664 : }
7665 1620 : else if (prefixlen > 0)
7666 : {
7667 1404 : XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7668 : }
7669 : else
7670 : {
7671 216 : XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7672 : }
7673 : }
7674 :
7675 9067 : xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7676 9067 : xlhdr.t_infomask = newtup->t_data->t_infomask;
7677 9067 : xlhdr.t_hoff = newtup->t_data->t_hoff;
7678 9067 : Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7679 :
7680 : /*
7681 : * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7682 : *
7683 : * The 'data' doesn't include the common prefix or suffix.
7684 : */
7685 9067 : XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7686 9067 : if (prefixlen == 0)
7687 : {
7688 6250 : XLogRegisterBufData(0,
7689 3125 : ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7690 3125 : newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7691 : }
7692 : else
7693 : {
7694 : /*
7695 : * Have to write the null bitmap and data after the common prefix as
7696 : * two separate rdata entries.
7697 : */
7698 : /* bitmap [+ padding] [+ oid] */
7699 5942 : if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7700 : {
7701 11884 : XLogRegisterBufData(0,
7702 5942 : ((char *) newtup->t_data) + SizeofHeapTupleHeader,
7703 5942 : newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7704 : }
7705 :
7706 : /* data after common prefix */
7707 11884 : XLogRegisterBufData(0,
7708 5942 : ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
7709 5942 : newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
7710 : }
7711 :
7712 : /* We need to log a tuple identity */
7713 9067 : if (need_tuple_data && old_key_tuple)
7714 : {
7715 : /* don't really need this, but its more comfy to decode */
7716 0 : xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
7717 0 : xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
7718 0 : xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
7719 :
7720 0 : XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
7721 :
7722 : /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
7723 0 : XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
7724 0 : old_key_tuple->t_len - SizeofHeapTupleHeader);
7725 : }
7726 :
7727 : /* filtering by origin on a row level is much more efficient */
7728 9067 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
7729 :
7730 9067 : recptr = XLogInsert(RM_HEAP_ID, info);
7731 :
7732 9067 : return recptr;
7733 : }
7734 :
7735 : /*
7736 : * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
7737 : *
7738 : * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
7739 : * tuples.
7740 : */
7741 : static XLogRecPtr
7742 0 : log_heap_new_cid(Relation relation, HeapTuple tup)
7743 : {
7744 : xl_heap_new_cid xlrec;
7745 :
7746 : XLogRecPtr recptr;
7747 0 : HeapTupleHeader hdr = tup->t_data;
7748 :
7749 0 : Assert(ItemPointerIsValid(&tup->t_self));
7750 0 : Assert(tup->t_tableOid != InvalidOid);
7751 :
7752 0 : xlrec.top_xid = GetTopTransactionId();
7753 0 : xlrec.target_node = relation->rd_node;
7754 0 : xlrec.target_tid = tup->t_self;
7755 :
7756 : /*
7757 : * If the tuple got inserted & deleted in the same TX we definitely have a
7758 : * combocid, set cmin and cmax.
7759 : */
7760 0 : if (hdr->t_infomask & HEAP_COMBOCID)
7761 : {
7762 0 : Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
7763 0 : Assert(!HeapTupleHeaderXminInvalid(hdr));
7764 0 : xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
7765 0 : xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
7766 0 : xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
7767 : }
7768 : /* No combocid, so only cmin or cmax can be set by this TX */
7769 : else
7770 : {
7771 : /*
7772 : * Tuple inserted.
7773 : *
7774 : * We need to check for LOCK ONLY because multixacts might be
7775 : * transferred to the new tuple in case of FOR KEY SHARE updates in
7776 : * which case there will be an xmax, although the tuple just got
7777 : * inserted.
7778 : */
7779 0 : if (hdr->t_infomask & HEAP_XMAX_INVALID ||
7780 0 : HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
7781 : {
7782 0 : xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
7783 0 : xlrec.cmax = InvalidCommandId;
7784 : }
7785 : /* Tuple from a different tx updated or deleted. */
7786 : else
7787 : {
7788 0 : xlrec.cmin = InvalidCommandId;
7789 0 : xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
7790 :
7791 : }
7792 0 : xlrec.combocid = InvalidCommandId;
7793 : }
7794 :
7795 : /*
7796 : * Note that we don't need to register the buffer here, because this
7797 : * operation does not modify the page. The insert/update/delete that
7798 : * called us certainly did, but that's WAL-logged separately.
7799 : */
7800 0 : XLogBeginInsert();
7801 0 : XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
7802 :
7803 : /* will be looked at irrespective of origin */
7804 :
7805 0 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
7806 :
7807 0 : return recptr;
7808 : }
7809 :
7810 : /*
7811 : * Build a heap tuple representing the configured REPLICA IDENTITY to represent
7812 : * the old tuple in a UPDATE or DELETE.
7813 : *
7814 : * Returns NULL if there's no need to log an identity or if there's no suitable
7815 : * key in the Relation relation.
7816 : */
7817 : static HeapTuple
7818 118568 : ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed, bool *copy)
7819 : {
7820 118568 : TupleDesc desc = RelationGetDescr(relation);
7821 : Oid replidindex;
7822 : Relation idx_rel;
7823 : TupleDesc idx_desc;
7824 118568 : char replident = relation->rd_rel->relreplident;
7825 118568 : HeapTuple key_tuple = NULL;
7826 : bool nulls[MaxHeapAttributeNumber];
7827 : Datum values[MaxHeapAttributeNumber];
7828 : int natt;
7829 :
7830 118568 : *copy = false;
7831 :
7832 118568 : if (!RelationIsLogicallyLogged(relation))
7833 118568 : return NULL;
7834 :
7835 0 : if (replident == REPLICA_IDENTITY_NOTHING)
7836 0 : return NULL;
7837 :
7838 0 : if (replident == REPLICA_IDENTITY_FULL)
7839 : {
7840 : /*
7841 : * When logging the entire old tuple, it very well could contain
7842 : * toasted columns. If so, force them to be inlined.
7843 : */
7844 0 : if (HeapTupleHasExternal(tp))
7845 : {
7846 0 : *copy = true;
7847 0 : tp = toast_flatten_tuple(tp, RelationGetDescr(relation));
7848 : }
7849 0 : return tp;
7850 : }
7851 :
7852 : /* if the key hasn't changed and we're only logging the key, we're done */
7853 0 : if (!key_changed)
7854 0 : return NULL;
7855 :
7856 : /* find the replica identity index */
7857 0 : replidindex = RelationGetReplicaIndex(relation);
7858 0 : if (!OidIsValid(replidindex))
7859 : {
7860 0 : elog(DEBUG4, "could not find configured replica identity for table \"%s\"",
7861 : RelationGetRelationName(relation));
7862 0 : return NULL;
7863 : }
7864 :
7865 0 : idx_rel = RelationIdGetRelation(replidindex);
7866 0 : idx_desc = RelationGetDescr(idx_rel);
7867 :
7868 : /* deform tuple, so we have fast access to columns */
7869 0 : heap_deform_tuple(tp, desc, values, nulls);
7870 :
7871 : /* set all columns to NULL, regardless of whether they actually are */
7872 0 : memset(nulls, 1, sizeof(nulls));
7873 :
7874 : /*
7875 : * Now set all columns contained in the index to NOT NULL, they cannot
7876 : * currently be NULL.
7877 : */
7878 0 : for (natt = 0; natt < idx_desc->natts; natt++)
7879 : {
7880 0 : int attno = idx_rel->rd_index->indkey.values[natt];
7881 :
7882 0 : if (attno < 0)
7883 : {
7884 : /*
7885 : * The OID column can appear in an index definition, but that's
7886 : * OK, because we always copy the OID if present (see below).
7887 : * Other system columns may not.
7888 : */
7889 0 : if (attno == ObjectIdAttributeNumber)
7890 0 : continue;
7891 0 : elog(ERROR, "system column in index");
7892 : }
7893 0 : nulls[attno - 1] = false;
7894 : }
7895 :
7896 0 : key_tuple = heap_form_tuple(desc, values, nulls);
7897 0 : *copy = true;
7898 0 : RelationClose(idx_rel);
7899 :
7900 : /*
7901 : * Always copy oids if the table has them, even if not included in the
7902 : * index. The space in the logged tuple is used anyway, so there's little
7903 : * point in not including the information.
7904 : */
7905 0 : if (relation->rd_rel->relhasoids)
7906 0 : HeapTupleSetOid(key_tuple, HeapTupleGetOid(tp));
7907 :
7908 : /*
7909 : * If the tuple, which by here only contains indexed columns, still has
7910 : * toasted columns, force them to be inlined. This is somewhat unlikely
7911 : * since there's limits on the size of indexed columns, so we don't
7912 : * duplicate toast_flatten_tuple()s functionality in the above loop over
7913 : * the indexed columns, even if it would be more efficient.
7914 : */
7915 0 : if (HeapTupleHasExternal(key_tuple))
7916 : {
7917 0 : HeapTuple oldtup = key_tuple;
7918 :
7919 0 : key_tuple = toast_flatten_tuple(oldtup, RelationGetDescr(relation));
7920 0 : heap_freetuple(oldtup);
7921 : }
7922 :
7923 0 : return key_tuple;
7924 : }
7925 :
7926 : /*
7927 : * Handles CLEANUP_INFO
7928 : */
7929 : static void
7930 0 : heap_xlog_cleanup_info(XLogReaderState *record)
7931 : {
7932 0 : xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
7933 :
7934 0 : if (InHotStandby)
7935 0 : ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
7936 :
7937 : /*
7938 : * Actual operation is a no-op. Record type exists to provide a means for
7939 : * conflict processing to occur before we begin index vacuum actions. see
7940 : * vacuumlazy.c and also comments in btvacuumpage()
7941 : */
7942 :
7943 : /* Backup blocks are not used in cleanup_info records */
7944 0 : Assert(!XLogRecHasAnyBlockRefs(record));
7945 0 : }
7946 :
7947 : /*
7948 : * Handles HEAP2_CLEAN record type
7949 : */
7950 : static void
7951 0 : heap_xlog_clean(XLogReaderState *record)
7952 : {
7953 0 : XLogRecPtr lsn = record->EndRecPtr;
7954 0 : xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
7955 : Buffer buffer;
7956 0 : Size freespace = 0;
7957 : RelFileNode rnode;
7958 : BlockNumber blkno;
7959 : XLogRedoAction action;
7960 :
7961 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
7962 :
7963 : /*
7964 : * We're about to remove tuples. In Hot Standby mode, ensure that there's
7965 : * no queries running for which the removed tuples are still visible.
7966 : *
7967 : * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
7968 : * conflict on the records that cause MVCC failures for user queries. If
7969 : * latestRemovedXid is invalid, skip conflict processing.
7970 : */
7971 0 : if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
7972 0 : ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
7973 :
7974 : /*
7975 : * If we have a full-page image, restore it (using a cleanup lock) and
7976 : * we're done.
7977 : */
7978 0 : action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
7979 : &buffer);
7980 0 : if (action == BLK_NEEDS_REDO)
7981 : {
7982 0 : Page page = (Page) BufferGetPage(buffer);
7983 : OffsetNumber *end;
7984 : OffsetNumber *redirected;
7985 : OffsetNumber *nowdead;
7986 : OffsetNumber *nowunused;
7987 : int nredirected;
7988 : int ndead;
7989 : int nunused;
7990 : Size datalen;
7991 :
7992 0 : redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
7993 :
7994 0 : nredirected = xlrec->nredirected;
7995 0 : ndead = xlrec->ndead;
7996 0 : end = (OffsetNumber *) ((char *) redirected + datalen);
7997 0 : nowdead = redirected + (nredirected * 2);
7998 0 : nowunused = nowdead + ndead;
7999 0 : nunused = (end - nowunused);
8000 0 : Assert(nunused >= 0);
8001 :
8002 : /* Update all item pointers per the record, and repair fragmentation */
8003 0 : heap_page_prune_execute(buffer,
8004 : redirected, nredirected,
8005 : nowdead, ndead,
8006 : nowunused, nunused);
8007 :
8008 0 : freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8009 :
8010 : /*
8011 : * Note: we don't worry about updating the page's prunability hints.
8012 : * At worst this will cause an extra prune cycle to occur soon.
8013 : */
8014 :
8015 0 : PageSetLSN(page, lsn);
8016 0 : MarkBufferDirty(buffer);
8017 : }
8018 0 : if (BufferIsValid(buffer))
8019 0 : UnlockReleaseBuffer(buffer);
8020 :
8021 : /*
8022 : * Update the FSM as well.
8023 : *
8024 : * XXX: Don't do this if the page was restored from full page image. We
8025 : * don't bother to update the FSM in that case, it doesn't need to be
8026 : * totally accurate anyway.
8027 : */
8028 0 : if (action == BLK_NEEDS_REDO)
8029 0 : XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8030 0 : }
8031 :
8032 : /*
8033 : * Replay XLOG_HEAP2_VISIBLE record.
8034 : *
8035 : * The critical integrity requirement here is that we must never end up with
8036 : * a situation where the visibility map bit is set, and the page-level
8037 : * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
8038 : * page modification would fail to clear the visibility map bit.
8039 : */
8040 : static void
8041 0 : heap_xlog_visible(XLogReaderState *record)
8042 : {
8043 0 : XLogRecPtr lsn = record->EndRecPtr;
8044 0 : xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
8045 0 : Buffer vmbuffer = InvalidBuffer;
8046 : Buffer buffer;
8047 : Page page;
8048 : RelFileNode rnode;
8049 : BlockNumber blkno;
8050 : XLogRedoAction action;
8051 :
8052 0 : XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
8053 :
8054 : /*
8055 : * If there are any Hot Standby transactions running that have an xmin
8056 : * horizon old enough that this page isn't all-visible for them, they
8057 : * might incorrectly decide that an index-only scan can skip a heap fetch.
8058 : *
8059 : * NB: It might be better to throw some kind of "soft" conflict here that
8060 : * forces any index-only scan that is in flight to perform heap fetches,
8061 : * rather than killing the transaction outright.
8062 : */
8063 0 : if (InHotStandby)
8064 0 : ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
8065 :
8066 : /*
8067 : * Read the heap page, if it still exists. If the heap file has dropped or
8068 : * truncated later in recovery, we don't need to update the page, but we'd
8069 : * better still update the visibility map.
8070 : */
8071 0 : action = XLogReadBufferForRedo(record, 1, &buffer);
8072 0 : if (action == BLK_NEEDS_REDO)
8073 : {
8074 : /*
8075 : * We don't bump the LSN of the heap page when setting the visibility
8076 : * map bit (unless checksums or wal_hint_bits is enabled, in which
8077 : * case we must), because that would generate an unworkable volume of
8078 : * full-page writes. This exposes us to torn page hazards, but since
8079 : * we're not inspecting the existing page contents in any way, we
8080 : * don't care.
8081 : *
8082 : * However, all operations that clear the visibility map bit *do* bump
8083 : * the LSN, and those operations will only be replayed if the XLOG LSN
8084 : * follows the page LSN. Thus, if the page LSN has advanced past our
8085 : * XLOG record's LSN, we mustn't mark the page all-visible, because
8086 : * the subsequent update won't be replayed to clear the flag.
8087 : */
8088 0 : page = BufferGetPage(buffer);
8089 :
8090 0 : PageSetAllVisible(page);
8091 :
8092 0 : MarkBufferDirty(buffer);
8093 : }
8094 : else if (action == BLK_RESTORED)
8095 : {
8096 : /*
8097 : * If heap block was backed up, we already restored it and there's
8098 : * nothing more to do. (This can only happen with checksums or
8099 : * wal_log_hints enabled.)
8100 : */
8101 : }
8102 0 : if (BufferIsValid(buffer))
8103 0 : UnlockReleaseBuffer(buffer);
8104 :
8105 : /*
8106 : * Even if we skipped the heap page update due to the LSN interlock, it's
8107 : * still safe to update the visibility map. Any WAL record that clears
8108 : * the visibility map bit does so before checking the page LSN, so any
8109 : * bits that need to be cleared will still be cleared.
8110 : */
8111 0 : if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
8112 : &vmbuffer) == BLK_NEEDS_REDO)
8113 : {
8114 0 : Page vmpage = BufferGetPage(vmbuffer);
8115 : Relation reln;
8116 :
8117 : /* initialize the page if it was read as zeros */
8118 0 : if (PageIsNew(vmpage))
8119 0 : PageInit(vmpage, BLCKSZ, 0);
8120 :
8121 : /*
8122 : * XLogReadBufferForRedoExtended locked the buffer. But
8123 : * visibilitymap_set will handle locking itself.
8124 : */
8125 0 : LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
8126 :
8127 0 : reln = CreateFakeRelcacheEntry(rnode);
8128 0 : visibilitymap_pin(reln, blkno, &vmbuffer);
8129 :
8130 : /*
8131 : * Don't set the bit if replay has already passed this point.
8132 : *
8133 : * It might be safe to do this unconditionally; if replay has passed
8134 : * this point, we'll replay at least as far this time as we did
8135 : * before, and if this bit needs to be cleared, the record responsible
8136 : * for doing so should be again replayed, and clear it. For right
8137 : * now, out of an abundance of conservatism, we use the same test here
8138 : * we did for the heap page. If this results in a dropped bit, no
8139 : * real harm is done; and the next VACUUM will fix it.
8140 : */
8141 0 : if (lsn > PageGetLSN(vmpage))
8142 0 : visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
8143 0 : xlrec->cutoff_xid, xlrec->flags);
8144 :
8145 0 : ReleaseBuffer(vmbuffer);
8146 0 : FreeFakeRelcacheEntry(reln);
8147 : }
8148 0 : else if (BufferIsValid(vmbuffer))
8149 0 : UnlockReleaseBuffer(vmbuffer);
8150 0 : }
8151 :
8152 : /*
8153 : * Replay XLOG_HEAP2_FREEZE_PAGE records
8154 : */
8155 : static void
8156 0 : heap_xlog_freeze_page(XLogReaderState *record)
8157 : {
8158 0 : XLogRecPtr lsn = record->EndRecPtr;
8159 0 : xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
8160 0 : TransactionId cutoff_xid = xlrec->cutoff_xid;
8161 : Buffer buffer;
8162 : int ntup;
8163 :
8164 : /*
8165 : * In Hot Standby mode, ensure that there's no queries running which still
8166 : * consider the frozen xids as running.
8167 : */
8168 0 : if (InHotStandby)
8169 : {
8170 : RelFileNode rnode;
8171 0 : TransactionId latestRemovedXid = cutoff_xid;
8172 :
8173 0 : TransactionIdRetreat(latestRemovedXid);
8174 :
8175 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
8176 0 : ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
8177 : }
8178 :
8179 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8180 : {
8181 0 : Page page = BufferGetPage(buffer);
8182 : xl_heap_freeze_tuple *tuples;
8183 :
8184 0 : tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
8185 :
8186 : /* now execute freeze plan for each frozen tuple */
8187 0 : for (ntup = 0; ntup < xlrec->ntuples; ntup++)
8188 : {
8189 : xl_heap_freeze_tuple *xlrec_tp;
8190 : ItemId lp;
8191 : HeapTupleHeader tuple;
8192 :
8193 0 : xlrec_tp = &tuples[ntup];
8194 0 : lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8195 0 : tuple = (HeapTupleHeader) PageGetItem(page, lp);
8196 :
8197 0 : heap_execute_freeze_tuple(tuple, xlrec_tp);
8198 : }
8199 :
8200 0 : PageSetLSN(page, lsn);
8201 0 : MarkBufferDirty(buffer);
8202 : }
8203 0 : if (BufferIsValid(buffer))
8204 0 : UnlockReleaseBuffer(buffer);
8205 0 : }
8206 :
8207 : /*
8208 : * Given an "infobits" field from an XLog record, set the correct bits in the
8209 : * given infomask and infomask2 for the tuple touched by the record.
8210 : *
8211 : * (This is the reverse of compute_infobits).
8212 : */
8213 : static void
8214 0 : fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8215 : {
8216 0 : *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8217 : HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8218 0 : *infomask2 &= ~HEAP_KEYS_UPDATED;
8219 :
8220 0 : if (infobits & XLHL_XMAX_IS_MULTI)
8221 0 : *infomask |= HEAP_XMAX_IS_MULTI;
8222 0 : if (infobits & XLHL_XMAX_LOCK_ONLY)
8223 0 : *infomask |= HEAP_XMAX_LOCK_ONLY;
8224 0 : if (infobits & XLHL_XMAX_EXCL_LOCK)
8225 0 : *infomask |= HEAP_XMAX_EXCL_LOCK;
8226 : /* note HEAP_XMAX_SHR_LOCK isn't considered here */
8227 0 : if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8228 0 : *infomask |= HEAP_XMAX_KEYSHR_LOCK;
8229 :
8230 0 : if (infobits & XLHL_KEYS_UPDATED)
8231 0 : *infomask2 |= HEAP_KEYS_UPDATED;
8232 0 : }
8233 :
8234 : static void
8235 0 : heap_xlog_delete(XLogReaderState *record)
8236 : {
8237 0 : XLogRecPtr lsn = record->EndRecPtr;
8238 0 : xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8239 : Buffer buffer;
8240 : Page page;
8241 0 : ItemId lp = NULL;
8242 : HeapTupleHeader htup;
8243 : BlockNumber blkno;
8244 : RelFileNode target_node;
8245 : ItemPointerData target_tid;
8246 :
8247 0 : XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8248 0 : ItemPointerSetBlockNumber(&target_tid, blkno);
8249 0 : ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8250 :
8251 : /*
8252 : * The visibility map may need to be fixed even if the heap page is
8253 : * already up-to-date.
8254 : */
8255 0 : if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8256 : {
8257 0 : Relation reln = CreateFakeRelcacheEntry(target_node);
8258 0 : Buffer vmbuffer = InvalidBuffer;
8259 :
8260 0 : visibilitymap_pin(reln, blkno, &vmbuffer);
8261 0 : visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8262 0 : ReleaseBuffer(vmbuffer);
8263 0 : FreeFakeRelcacheEntry(reln);
8264 : }
8265 :
8266 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8267 : {
8268 0 : page = BufferGetPage(buffer);
8269 :
8270 0 : if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8271 0 : lp = PageGetItemId(page, xlrec->offnum);
8272 :
8273 0 : if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8274 0 : elog(PANIC, "invalid lp");
8275 :
8276 0 : htup = (HeapTupleHeader) PageGetItem(page, lp);
8277 :
8278 0 : htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8279 0 : htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8280 0 : HeapTupleHeaderClearHotUpdated(htup);
8281 0 : fix_infomask_from_infobits(xlrec->infobits_set,
8282 : &htup->t_infomask, &htup->t_infomask2);
8283 0 : if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8284 0 : HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8285 : else
8286 0 : HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8287 0 : HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8288 :
8289 : /* Mark the page as a candidate for pruning */
8290 0 : PageSetPrunable(page, XLogRecGetXid(record));
8291 :
8292 0 : if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8293 0 : PageClearAllVisible(page);
8294 :
8295 : /* Make sure there is no forward chain link in t_ctid */
8296 0 : htup->t_ctid = target_tid;
8297 0 : PageSetLSN(page, lsn);
8298 0 : MarkBufferDirty(buffer);
8299 : }
8300 0 : if (BufferIsValid(buffer))
8301 0 : UnlockReleaseBuffer(buffer);
8302 0 : }
8303 :
8304 : static void
8305 0 : heap_xlog_insert(XLogReaderState *record)
8306 : {
8307 0 : XLogRecPtr lsn = record->EndRecPtr;
8308 0 : xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8309 : Buffer buffer;
8310 : Page page;
8311 : union
8312 : {
8313 : HeapTupleHeaderData hdr;
8314 : char data[MaxHeapTupleSize];
8315 : } tbuf;
8316 : HeapTupleHeader htup;
8317 : xl_heap_header xlhdr;
8318 : uint32 newlen;
8319 0 : Size freespace = 0;
8320 : RelFileNode target_node;
8321 : BlockNumber blkno;
8322 : ItemPointerData target_tid;
8323 : XLogRedoAction action;
8324 :
8325 0 : XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8326 0 : ItemPointerSetBlockNumber(&target_tid, blkno);
8327 0 : ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8328 :
8329 : /*
8330 : * The visibility map may need to be fixed even if the heap page is
8331 : * already up-to-date.
8332 : */
8333 0 : if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8334 : {
8335 0 : Relation reln = CreateFakeRelcacheEntry(target_node);
8336 0 : Buffer vmbuffer = InvalidBuffer;
8337 :
8338 0 : visibilitymap_pin(reln, blkno, &vmbuffer);
8339 0 : visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8340 0 : ReleaseBuffer(vmbuffer);
8341 0 : FreeFakeRelcacheEntry(reln);
8342 : }
8343 :
8344 : /*
8345 : * If we inserted the first and only tuple on the page, re-initialize the
8346 : * page from scratch.
8347 : */
8348 0 : if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8349 : {
8350 0 : buffer = XLogInitBufferForRedo(record, 0);
8351 0 : page = BufferGetPage(buffer);
8352 0 : PageInit(page, BufferGetPageSize(buffer), 0);
8353 0 : action = BLK_NEEDS_REDO;
8354 : }
8355 : else
8356 0 : action = XLogReadBufferForRedo(record, 0, &buffer);
8357 0 : if (action == BLK_NEEDS_REDO)
8358 : {
8359 : Size datalen;
8360 : char *data;
8361 :
8362 0 : page = BufferGetPage(buffer);
8363 :
8364 0 : if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8365 0 : elog(PANIC, "invalid max offset number");
8366 :
8367 0 : data = XLogRecGetBlockData(record, 0, &datalen);
8368 :
8369 0 : newlen = datalen - SizeOfHeapHeader;
8370 0 : Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8371 0 : memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8372 0 : data += SizeOfHeapHeader;
8373 :
8374 0 : htup = &tbuf.hdr;
8375 0 : MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8376 : /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8377 0 : memcpy((char *) htup + SizeofHeapTupleHeader,
8378 : data,
8379 : newlen);
8380 0 : newlen += SizeofHeapTupleHeader;
8381 0 : htup->t_infomask2 = xlhdr.t_infomask2;
8382 0 : htup->t_infomask = xlhdr.t_infomask;
8383 0 : htup->t_hoff = xlhdr.t_hoff;
8384 0 : HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8385 0 : HeapTupleHeaderSetCmin(htup, FirstCommandId);
8386 0 : htup->t_ctid = target_tid;
8387 :
8388 0 : if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8389 : true, true) == InvalidOffsetNumber)
8390 0 : elog(PANIC, "failed to add tuple");
8391 :
8392 0 : freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8393 :
8394 0 : PageSetLSN(page, lsn);
8395 :
8396 0 : if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8397 0 : PageClearAllVisible(page);
8398 :
8399 0 : MarkBufferDirty(buffer);
8400 : }
8401 0 : if (BufferIsValid(buffer))
8402 0 : UnlockReleaseBuffer(buffer);
8403 :
8404 : /*
8405 : * If the page is running low on free space, update the FSM as well.
8406 : * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8407 : * better than that without knowing the fill-factor for the table.
8408 : *
8409 : * XXX: Don't do this if the page was restored from full page image. We
8410 : * don't bother to update the FSM in that case, it doesn't need to be
8411 : * totally accurate anyway.
8412 : */
8413 0 : if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8414 0 : XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8415 0 : }
8416 :
8417 : /*
8418 : * Handles MULTI_INSERT record type.
8419 : */
8420 : static void
8421 0 : heap_xlog_multi_insert(XLogReaderState *record)
8422 : {
8423 0 : XLogRecPtr lsn = record->EndRecPtr;
8424 : xl_heap_multi_insert *xlrec;
8425 : RelFileNode rnode;
8426 : BlockNumber blkno;
8427 : Buffer buffer;
8428 : Page page;
8429 : union
8430 : {
8431 : HeapTupleHeaderData hdr;
8432 : char data[MaxHeapTupleSize];
8433 : } tbuf;
8434 : HeapTupleHeader htup;
8435 : uint32 newlen;
8436 0 : Size freespace = 0;
8437 : int i;
8438 0 : bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8439 : XLogRedoAction action;
8440 :
8441 : /*
8442 : * Insertion doesn't overwrite MVCC data, so no conflict processing is
8443 : * required.
8444 : */
8445 0 : xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8446 :
8447 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8448 :
8449 : /*
8450 : * The visibility map may need to be fixed even if the heap page is
8451 : * already up-to-date.
8452 : */
8453 0 : if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8454 : {
8455 0 : Relation reln = CreateFakeRelcacheEntry(rnode);
8456 0 : Buffer vmbuffer = InvalidBuffer;
8457 :
8458 0 : visibilitymap_pin(reln, blkno, &vmbuffer);
8459 0 : visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8460 0 : ReleaseBuffer(vmbuffer);
8461 0 : FreeFakeRelcacheEntry(reln);
8462 : }
8463 :
8464 0 : if (isinit)
8465 : {
8466 0 : buffer = XLogInitBufferForRedo(record, 0);
8467 0 : page = BufferGetPage(buffer);
8468 0 : PageInit(page, BufferGetPageSize(buffer), 0);
8469 0 : action = BLK_NEEDS_REDO;
8470 : }
8471 : else
8472 0 : action = XLogReadBufferForRedo(record, 0, &buffer);
8473 0 : if (action == BLK_NEEDS_REDO)
8474 : {
8475 : char *tupdata;
8476 : char *endptr;
8477 : Size len;
8478 :
8479 : /* Tuples are stored as block data */
8480 0 : tupdata = XLogRecGetBlockData(record, 0, &len);
8481 0 : endptr = tupdata + len;
8482 :
8483 0 : page = (Page) BufferGetPage(buffer);
8484 :
8485 0 : for (i = 0; i < xlrec->ntuples; i++)
8486 : {
8487 : OffsetNumber offnum;
8488 : xl_multi_insert_tuple *xlhdr;
8489 :
8490 : /*
8491 : * If we're reinitializing the page, the tuples are stored in
8492 : * order from FirstOffsetNumber. Otherwise there's an array of
8493 : * offsets in the WAL record, and the tuples come after that.
8494 : */
8495 0 : if (isinit)
8496 0 : offnum = FirstOffsetNumber + i;
8497 : else
8498 0 : offnum = xlrec->offsets[i];
8499 0 : if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8500 0 : elog(PANIC, "invalid max offset number");
8501 :
8502 0 : xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8503 0 : tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8504 :
8505 0 : newlen = xlhdr->datalen;
8506 0 : Assert(newlen <= MaxHeapTupleSize);
8507 0 : htup = &tbuf.hdr;
8508 0 : MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8509 : /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8510 0 : memcpy((char *) htup + SizeofHeapTupleHeader,
8511 : (char *) tupdata,
8512 : newlen);
8513 0 : tupdata += newlen;
8514 :
8515 0 : newlen += SizeofHeapTupleHeader;
8516 0 : htup->t_infomask2 = xlhdr->t_infomask2;
8517 0 : htup->t_infomask = xlhdr->t_infomask;
8518 0 : htup->t_hoff = xlhdr->t_hoff;
8519 0 : HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8520 0 : HeapTupleHeaderSetCmin(htup, FirstCommandId);
8521 0 : ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8522 0 : ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8523 :
8524 0 : offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8525 0 : if (offnum == InvalidOffsetNumber)
8526 0 : elog(PANIC, "failed to add tuple");
8527 : }
8528 0 : if (tupdata != endptr)
8529 0 : elog(PANIC, "total tuple length mismatch");
8530 :
8531 0 : freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8532 :
8533 0 : PageSetLSN(page, lsn);
8534 :
8535 0 : if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8536 0 : PageClearAllVisible(page);
8537 :
8538 0 : MarkBufferDirty(buffer);
8539 : }
8540 0 : if (BufferIsValid(buffer))
8541 0 : UnlockReleaseBuffer(buffer);
8542 :
8543 : /*
8544 : * If the page is running low on free space, update the FSM as well.
8545 : * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8546 : * better than that without knowing the fill-factor for the table.
8547 : *
8548 : * XXX: Don't do this if the page was restored from full page image. We
8549 : * don't bother to update the FSM in that case, it doesn't need to be
8550 : * totally accurate anyway.
8551 : */
8552 0 : if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8553 0 : XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8554 0 : }
8555 :
8556 : /*
8557 : * Handles UPDATE and HOT_UPDATE
8558 : */
8559 : static void
8560 0 : heap_xlog_update(XLogReaderState *record, bool hot_update)
8561 : {
8562 0 : XLogRecPtr lsn = record->EndRecPtr;
8563 0 : xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8564 : RelFileNode rnode;
8565 : BlockNumber oldblk;
8566 : BlockNumber newblk;
8567 : ItemPointerData newtid;
8568 : Buffer obuffer,
8569 : nbuffer;
8570 : Page page;
8571 : OffsetNumber offnum;
8572 0 : ItemId lp = NULL;
8573 : HeapTupleData oldtup;
8574 : HeapTupleHeader htup;
8575 0 : uint16 prefixlen = 0,
8576 0 : suffixlen = 0;
8577 : char *newp;
8578 : union
8579 : {
8580 : HeapTupleHeaderData hdr;
8581 : char data[MaxHeapTupleSize];
8582 : } tbuf;
8583 : xl_heap_header xlhdr;
8584 : uint32 newlen;
8585 0 : Size freespace = 0;
8586 : XLogRedoAction oldaction;
8587 : XLogRedoAction newaction;
8588 :
8589 : /* initialize to keep the compiler quiet */
8590 0 : oldtup.t_data = NULL;
8591 0 : oldtup.t_len = 0;
8592 :
8593 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8594 0 : if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8595 : {
8596 : /* HOT updates are never done across pages */
8597 0 : Assert(!hot_update);
8598 : }
8599 : else
8600 0 : oldblk = newblk;
8601 :
8602 0 : ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8603 :
8604 : /*
8605 : * The visibility map may need to be fixed even if the heap page is
8606 : * already up-to-date.
8607 : */
8608 0 : if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8609 : {
8610 0 : Relation reln = CreateFakeRelcacheEntry(rnode);
8611 0 : Buffer vmbuffer = InvalidBuffer;
8612 :
8613 0 : visibilitymap_pin(reln, oldblk, &vmbuffer);
8614 0 : visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8615 0 : ReleaseBuffer(vmbuffer);
8616 0 : FreeFakeRelcacheEntry(reln);
8617 : }
8618 :
8619 : /*
8620 : * In normal operation, it is important to lock the two pages in
8621 : * page-number order, to avoid possible deadlocks against other update
8622 : * operations going the other way. However, during WAL replay there can
8623 : * be no other update happening, so we don't need to worry about that. But
8624 : * we *do* need to worry that we don't expose an inconsistent state to Hot
8625 : * Standby queries --- so the original page can't be unlocked before we've
8626 : * added the new tuple to the new page.
8627 : */
8628 :
8629 : /* Deal with old tuple version */
8630 0 : oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8631 : &obuffer);
8632 0 : if (oldaction == BLK_NEEDS_REDO)
8633 : {
8634 0 : page = BufferGetPage(obuffer);
8635 0 : offnum = xlrec->old_offnum;
8636 0 : if (PageGetMaxOffsetNumber(page) >= offnum)
8637 0 : lp = PageGetItemId(page, offnum);
8638 :
8639 0 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8640 0 : elog(PANIC, "invalid lp");
8641 :
8642 0 : htup = (HeapTupleHeader) PageGetItem(page, lp);
8643 :
8644 0 : oldtup.t_data = htup;
8645 0 : oldtup.t_len = ItemIdGetLength(lp);
8646 :
8647 0 : htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8648 0 : htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8649 0 : if (hot_update)
8650 0 : HeapTupleHeaderSetHotUpdated(htup);
8651 : else
8652 0 : HeapTupleHeaderClearHotUpdated(htup);
8653 0 : fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8654 : &htup->t_infomask2);
8655 0 : HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8656 0 : HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8657 : /* Set forward chain link in t_ctid */
8658 0 : htup->t_ctid = newtid;
8659 :
8660 : /* Mark the page as a candidate for pruning */
8661 0 : PageSetPrunable(page, XLogRecGetXid(record));
8662 :
8663 0 : if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8664 0 : PageClearAllVisible(page);
8665 :
8666 0 : PageSetLSN(page, lsn);
8667 0 : MarkBufferDirty(obuffer);
8668 : }
8669 :
8670 : /*
8671 : * Read the page the new tuple goes into, if different from old.
8672 : */
8673 0 : if (oldblk == newblk)
8674 : {
8675 0 : nbuffer = obuffer;
8676 0 : newaction = oldaction;
8677 : }
8678 0 : else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8679 : {
8680 0 : nbuffer = XLogInitBufferForRedo(record, 0);
8681 0 : page = (Page) BufferGetPage(nbuffer);
8682 0 : PageInit(page, BufferGetPageSize(nbuffer), 0);
8683 0 : newaction = BLK_NEEDS_REDO;
8684 : }
8685 : else
8686 0 : newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
8687 :
8688 : /*
8689 : * The visibility map may need to be fixed even if the heap page is
8690 : * already up-to-date.
8691 : */
8692 0 : if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8693 : {
8694 0 : Relation reln = CreateFakeRelcacheEntry(rnode);
8695 0 : Buffer vmbuffer = InvalidBuffer;
8696 :
8697 0 : visibilitymap_pin(reln, newblk, &vmbuffer);
8698 0 : visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8699 0 : ReleaseBuffer(vmbuffer);
8700 0 : FreeFakeRelcacheEntry(reln);
8701 : }
8702 :
8703 : /* Deal with new tuple */
8704 0 : if (newaction == BLK_NEEDS_REDO)
8705 : {
8706 : char *recdata;
8707 : char *recdata_end;
8708 : Size datalen;
8709 : Size tuplen;
8710 :
8711 0 : recdata = XLogRecGetBlockData(record, 0, &datalen);
8712 0 : recdata_end = recdata + datalen;
8713 :
8714 0 : page = BufferGetPage(nbuffer);
8715 :
8716 0 : offnum = xlrec->new_offnum;
8717 0 : if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8718 0 : elog(PANIC, "invalid max offset number");
8719 :
8720 0 : if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
8721 : {
8722 0 : Assert(newblk == oldblk);
8723 0 : memcpy(&prefixlen, recdata, sizeof(uint16));
8724 0 : recdata += sizeof(uint16);
8725 : }
8726 0 : if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
8727 : {
8728 0 : Assert(newblk == oldblk);
8729 0 : memcpy(&suffixlen, recdata, sizeof(uint16));
8730 0 : recdata += sizeof(uint16);
8731 : }
8732 :
8733 0 : memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
8734 0 : recdata += SizeOfHeapHeader;
8735 :
8736 0 : tuplen = recdata_end - recdata;
8737 0 : Assert(tuplen <= MaxHeapTupleSize);
8738 :
8739 0 : htup = &tbuf.hdr;
8740 0 : MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8741 :
8742 : /*
8743 : * Reconstruct the new tuple using the prefix and/or suffix from the
8744 : * old tuple, and the data stored in the WAL record.
8745 : */
8746 0 : newp = (char *) htup + SizeofHeapTupleHeader;
8747 0 : if (prefixlen > 0)
8748 : {
8749 : int len;
8750 :
8751 : /* copy bitmap [+ padding] [+ oid] from WAL record */
8752 0 : len = xlhdr.t_hoff - SizeofHeapTupleHeader;
8753 0 : memcpy(newp, recdata, len);
8754 0 : recdata += len;
8755 0 : newp += len;
8756 :
8757 : /* copy prefix from old tuple */
8758 0 : memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
8759 0 : newp += prefixlen;
8760 :
8761 : /* copy new tuple data from WAL record */
8762 0 : len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
8763 0 : memcpy(newp, recdata, len);
8764 0 : recdata += len;
8765 0 : newp += len;
8766 : }
8767 : else
8768 : {
8769 : /*
8770 : * copy bitmap [+ padding] [+ oid] + data from record, all in one
8771 : * go
8772 : */
8773 0 : memcpy(newp, recdata, tuplen);
8774 0 : recdata += tuplen;
8775 0 : newp += tuplen;
8776 : }
8777 0 : Assert(recdata == recdata_end);
8778 :
8779 : /* copy suffix from old tuple */
8780 0 : if (suffixlen > 0)
8781 0 : memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
8782 :
8783 0 : newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
8784 0 : htup->t_infomask2 = xlhdr.t_infomask2;
8785 0 : htup->t_infomask = xlhdr.t_infomask;
8786 0 : htup->t_hoff = xlhdr.t_hoff;
8787 :
8788 0 : HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8789 0 : HeapTupleHeaderSetCmin(htup, FirstCommandId);
8790 0 : HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
8791 : /* Make sure there is no forward chain link in t_ctid */
8792 0 : htup->t_ctid = newtid;
8793 :
8794 0 : offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8795 0 : if (offnum == InvalidOffsetNumber)
8796 0 : elog(PANIC, "failed to add tuple");
8797 :
8798 0 : if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8799 0 : PageClearAllVisible(page);
8800 :
8801 0 : freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8802 :
8803 0 : PageSetLSN(page, lsn);
8804 0 : MarkBufferDirty(nbuffer);
8805 : }
8806 :
8807 0 : if (BufferIsValid(nbuffer) && nbuffer != obuffer)
8808 0 : UnlockReleaseBuffer(nbuffer);
8809 0 : if (BufferIsValid(obuffer))
8810 0 : UnlockReleaseBuffer(obuffer);
8811 :
8812 : /*
8813 : * If the new page is running low on free space, update the FSM as well.
8814 : * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8815 : * better than that without knowing the fill-factor for the table.
8816 : *
8817 : * However, don't update the FSM on HOT updates, because after crash
8818 : * recovery, either the old or the new tuple will certainly be dead and
8819 : * prunable. After pruning, the page will have roughly as much free space
8820 : * as it did before the update, assuming the new tuple is about the same
8821 : * size as the old one.
8822 : *
8823 : * XXX: Don't do this if the page was restored from full page image. We
8824 : * don't bother to update the FSM in that case, it doesn't need to be
8825 : * totally accurate anyway.
8826 : */
8827 0 : if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
8828 0 : XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
8829 0 : }
8830 :
8831 : static void
8832 0 : heap_xlog_confirm(XLogReaderState *record)
8833 : {
8834 0 : XLogRecPtr lsn = record->EndRecPtr;
8835 0 : xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
8836 : Buffer buffer;
8837 : Page page;
8838 : OffsetNumber offnum;
8839 0 : ItemId lp = NULL;
8840 : HeapTupleHeader htup;
8841 :
8842 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8843 : {
8844 0 : page = BufferGetPage(buffer);
8845 :
8846 0 : offnum = xlrec->offnum;
8847 0 : if (PageGetMaxOffsetNumber(page) >= offnum)
8848 0 : lp = PageGetItemId(page, offnum);
8849 :
8850 0 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8851 0 : elog(PANIC, "invalid lp");
8852 :
8853 0 : htup = (HeapTupleHeader) PageGetItem(page, lp);
8854 :
8855 : /*
8856 : * Confirm tuple as actually inserted
8857 : */
8858 0 : ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
8859 :
8860 0 : PageSetLSN(page, lsn);
8861 0 : MarkBufferDirty(buffer);
8862 : }
8863 0 : if (BufferIsValid(buffer))
8864 0 : UnlockReleaseBuffer(buffer);
8865 0 : }
8866 :
8867 : static void
8868 0 : heap_xlog_lock(XLogReaderState *record)
8869 : {
8870 0 : XLogRecPtr lsn = record->EndRecPtr;
8871 0 : xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
8872 : Buffer buffer;
8873 : Page page;
8874 : OffsetNumber offnum;
8875 0 : ItemId lp = NULL;
8876 : HeapTupleHeader htup;
8877 :
8878 : /*
8879 : * The visibility map may need to be fixed even if the heap page is
8880 : * already up-to-date.
8881 : */
8882 0 : if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
8883 : {
8884 : RelFileNode rnode;
8885 0 : Buffer vmbuffer = InvalidBuffer;
8886 : BlockNumber block;
8887 : Relation reln;
8888 :
8889 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
8890 0 : reln = CreateFakeRelcacheEntry(rnode);
8891 :
8892 0 : visibilitymap_pin(reln, block, &vmbuffer);
8893 0 : visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
8894 :
8895 0 : ReleaseBuffer(vmbuffer);
8896 0 : FreeFakeRelcacheEntry(reln);
8897 : }
8898 :
8899 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8900 : {
8901 0 : page = (Page) BufferGetPage(buffer);
8902 :
8903 0 : offnum = xlrec->offnum;
8904 0 : if (PageGetMaxOffsetNumber(page) >= offnum)
8905 0 : lp = PageGetItemId(page, offnum);
8906 :
8907 0 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8908 0 : elog(PANIC, "invalid lp");
8909 :
8910 0 : htup = (HeapTupleHeader) PageGetItem(page, lp);
8911 :
8912 0 : htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8913 0 : htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8914 0 : fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
8915 : &htup->t_infomask2);
8916 :
8917 : /*
8918 : * Clear relevant update flags, but only if the modified infomask says
8919 : * there's no update.
8920 : */
8921 0 : if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
8922 : {
8923 0 : HeapTupleHeaderClearHotUpdated(htup);
8924 : /* Make sure there is no forward chain link in t_ctid */
8925 0 : ItemPointerSet(&htup->t_ctid,
8926 : BufferGetBlockNumber(buffer),
8927 : offnum);
8928 : }
8929 0 : HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
8930 0 : HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8931 0 : PageSetLSN(page, lsn);
8932 0 : MarkBufferDirty(buffer);
8933 : }
8934 0 : if (BufferIsValid(buffer))
8935 0 : UnlockReleaseBuffer(buffer);
8936 0 : }
8937 :
8938 : static void
8939 0 : heap_xlog_lock_updated(XLogReaderState *record)
8940 : {
8941 0 : XLogRecPtr lsn = record->EndRecPtr;
8942 : xl_heap_lock_updated *xlrec;
8943 : Buffer buffer;
8944 : Page page;
8945 : OffsetNumber offnum;
8946 0 : ItemId lp = NULL;
8947 : HeapTupleHeader htup;
8948 :
8949 0 : xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
8950 :
8951 : /*
8952 : * The visibility map may need to be fixed even if the heap page is
8953 : * already up-to-date.
8954 : */
8955 0 : if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
8956 : {
8957 : RelFileNode rnode;
8958 0 : Buffer vmbuffer = InvalidBuffer;
8959 : BlockNumber block;
8960 : Relation reln;
8961 :
8962 0 : XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
8963 0 : reln = CreateFakeRelcacheEntry(rnode);
8964 :
8965 0 : visibilitymap_pin(reln, block, &vmbuffer);
8966 0 : visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
8967 :
8968 0 : ReleaseBuffer(vmbuffer);
8969 0 : FreeFakeRelcacheEntry(reln);
8970 : }
8971 :
8972 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8973 : {
8974 0 : page = BufferGetPage(buffer);
8975 :
8976 0 : offnum = xlrec->offnum;
8977 0 : if (PageGetMaxOffsetNumber(page) >= offnum)
8978 0 : lp = PageGetItemId(page, offnum);
8979 :
8980 0 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8981 0 : elog(PANIC, "invalid lp");
8982 :
8983 0 : htup = (HeapTupleHeader) PageGetItem(page, lp);
8984 :
8985 0 : htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8986 0 : htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8987 0 : fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
8988 : &htup->t_infomask2);
8989 0 : HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8990 :
8991 0 : PageSetLSN(page, lsn);
8992 0 : MarkBufferDirty(buffer);
8993 : }
8994 0 : if (BufferIsValid(buffer))
8995 0 : UnlockReleaseBuffer(buffer);
8996 0 : }
8997 :
8998 : static void
8999 0 : heap_xlog_inplace(XLogReaderState *record)
9000 : {
9001 0 : XLogRecPtr lsn = record->EndRecPtr;
9002 0 : xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
9003 : Buffer buffer;
9004 : Page page;
9005 : OffsetNumber offnum;
9006 0 : ItemId lp = NULL;
9007 : HeapTupleHeader htup;
9008 : uint32 oldlen;
9009 : Size newlen;
9010 :
9011 0 : if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9012 : {
9013 0 : char *newtup = XLogRecGetBlockData(record, 0, &newlen);
9014 :
9015 0 : page = BufferGetPage(buffer);
9016 :
9017 0 : offnum = xlrec->offnum;
9018 0 : if (PageGetMaxOffsetNumber(page) >= offnum)
9019 0 : lp = PageGetItemId(page, offnum);
9020 :
9021 0 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9022 0 : elog(PANIC, "invalid lp");
9023 :
9024 0 : htup = (HeapTupleHeader) PageGetItem(page, lp);
9025 :
9026 0 : oldlen = ItemIdGetLength(lp) - htup->t_hoff;
9027 0 : if (oldlen != newlen)
9028 0 : elog(PANIC, "wrong tuple length");
9029 :
9030 0 : memcpy((char *) htup + htup->t_hoff, newtup, newlen);
9031 :
9032 0 : PageSetLSN(page, lsn);
9033 0 : MarkBufferDirty(buffer);
9034 : }
9035 0 : if (BufferIsValid(buffer))
9036 0 : UnlockReleaseBuffer(buffer);
9037 0 : }
9038 :
9039 : void
9040 0 : heap_redo(XLogReaderState *record)
9041 : {
9042 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9043 :
9044 : /*
9045 : * These operations don't overwrite MVCC data so no conflict processing is
9046 : * required. The ones in heap2 rmgr do.
9047 : */
9048 :
9049 0 : switch (info & XLOG_HEAP_OPMASK)
9050 : {
9051 : case XLOG_HEAP_INSERT:
9052 0 : heap_xlog_insert(record);
9053 0 : break;
9054 : case XLOG_HEAP_DELETE:
9055 0 : heap_xlog_delete(record);
9056 0 : break;
9057 : case XLOG_HEAP_UPDATE:
9058 0 : heap_xlog_update(record, false);
9059 0 : break;
9060 : case XLOG_HEAP_HOT_UPDATE:
9061 0 : heap_xlog_update(record, true);
9062 0 : break;
9063 : case XLOG_HEAP_CONFIRM:
9064 0 : heap_xlog_confirm(record);
9065 0 : break;
9066 : case XLOG_HEAP_LOCK:
9067 0 : heap_xlog_lock(record);
9068 0 : break;
9069 : case XLOG_HEAP_INPLACE:
9070 0 : heap_xlog_inplace(record);
9071 0 : break;
9072 : default:
9073 0 : elog(PANIC, "heap_redo: unknown op code %u", info);
9074 : }
9075 0 : }
9076 :
9077 : void
9078 0 : heap2_redo(XLogReaderState *record)
9079 : {
9080 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9081 :
9082 0 : switch (info & XLOG_HEAP_OPMASK)
9083 : {
9084 : case XLOG_HEAP2_CLEAN:
9085 0 : heap_xlog_clean(record);
9086 0 : break;
9087 : case XLOG_HEAP2_FREEZE_PAGE:
9088 0 : heap_xlog_freeze_page(record);
9089 0 : break;
9090 : case XLOG_HEAP2_CLEANUP_INFO:
9091 0 : heap_xlog_cleanup_info(record);
9092 0 : break;
9093 : case XLOG_HEAP2_VISIBLE:
9094 0 : heap_xlog_visible(record);
9095 0 : break;
9096 : case XLOG_HEAP2_MULTI_INSERT:
9097 0 : heap_xlog_multi_insert(record);
9098 0 : break;
9099 : case XLOG_HEAP2_LOCK_UPDATED:
9100 0 : heap_xlog_lock_updated(record);
9101 0 : break;
9102 : case XLOG_HEAP2_NEW_CID:
9103 :
9104 : /*
9105 : * Nothing to do on a real replay, only used during logical
9106 : * decoding.
9107 : */
9108 0 : break;
9109 : case XLOG_HEAP2_REWRITE:
9110 0 : heap_xlog_logical_rewrite(record);
9111 0 : break;
9112 : default:
9113 0 : elog(PANIC, "heap2_redo: unknown op code %u", info);
9114 : }
9115 0 : }
9116 :
9117 : /*
9118 : * heap_sync - sync a heap, for use when no WAL has been written
9119 : *
9120 : * This forces the heap contents (including TOAST heap if any) down to disk.
9121 : * If we skipped using WAL, and WAL is otherwise needed, we must force the
9122 : * relation down to disk before it's safe to commit the transaction. This
9123 : * requires writing out any dirty buffers and then doing a forced fsync.
9124 : *
9125 : * Indexes are not touched. (Currently, index operations associated with
9126 : * the commands that use this are WAL-logged and so do not need fsync.
9127 : * That behavior might change someday, but in any case it's likely that
9128 : * any fsync decisions required would be per-index and hence not appropriate
9129 : * to be done here.)
9130 : */
9131 : void
9132 21 : heap_sync(Relation rel)
9133 : {
9134 : /* non-WAL-logged tables never need fsync */
9135 21 : if (!RelationNeedsWAL(rel))
9136 21 : return;
9137 :
9138 : /* main heap */
9139 21 : FlushRelationBuffers(rel);
9140 : /* FlushRelationBuffers will have opened rd_smgr */
9141 21 : smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
9142 :
9143 : /* FSM is not critical, don't bother syncing it */
9144 :
9145 : /* toast heap, if any */
9146 21 : if (OidIsValid(rel->rd_rel->reltoastrelid))
9147 : {
9148 : Relation toastrel;
9149 :
9150 3 : toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
9151 3 : FlushRelationBuffers(toastrel);
9152 3 : smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
9153 3 : heap_close(toastrel, AccessShareLock);
9154 : }
9155 : }
9156 :
9157 : /*
9158 : * Mask a heap page before performing consistency checks on it.
9159 : */
9160 : void
9161 0 : heap_mask(char *pagedata, BlockNumber blkno)
9162 : {
9163 0 : Page page = (Page) pagedata;
9164 : OffsetNumber off;
9165 :
9166 0 : mask_page_lsn(page);
9167 :
9168 0 : mask_page_hint_bits(page);
9169 0 : mask_unused_space(page);
9170 :
9171 0 : for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
9172 : {
9173 0 : ItemId iid = PageGetItemId(page, off);
9174 : char *page_item;
9175 :
9176 0 : page_item = (char *) (page + ItemIdGetOffset(iid));
9177 :
9178 0 : if (ItemIdIsNormal(iid))
9179 : {
9180 0 : HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
9181 :
9182 : /*
9183 : * If xmin of a tuple is not yet frozen, we should ignore
9184 : * differences in hint bits, since they can be set without
9185 : * emitting WAL.
9186 : */
9187 0 : if (!HeapTupleHeaderXminFrozen(page_htup))
9188 0 : page_htup->t_infomask &= ~HEAP_XACT_MASK;
9189 : else
9190 : {
9191 : /* Still we need to mask xmax hint bits. */
9192 0 : page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
9193 0 : page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
9194 : }
9195 :
9196 : /*
9197 : * During replay, we set Command Id to FirstCommandId. Hence, mask
9198 : * it. See heap_xlog_insert() for details.
9199 : */
9200 0 : page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
9201 :
9202 : /*
9203 : * For a speculative tuple, heap_insert() does not set ctid in the
9204 : * caller-passed heap tuple itself, leaving the ctid field to
9205 : * contain a speculative token value - a per-backend monotonically
9206 : * increasing identifier. Besides, it does not WAL-log ctid under
9207 : * any circumstances.
9208 : *
9209 : * During redo, heap_xlog_insert() sets t_ctid to current block
9210 : * number and self offset number. It doesn't care about any
9211 : * speculative insertions in master. Hence, we set t_ctid to
9212 : * current block number and self offset number to ignore any
9213 : * inconsistency.
9214 : */
9215 0 : if (HeapTupleHeaderIsSpeculative(page_htup))
9216 0 : ItemPointerSet(&page_htup->t_ctid, blkno, off);
9217 : }
9218 :
9219 : /*
9220 : * Ignore any padding bytes after the tuple, when the length of the
9221 : * item is not MAXALIGNed.
9222 : */
9223 0 : if (ItemIdHasStorage(iid))
9224 : {
9225 0 : int len = ItemIdGetLength(iid);
9226 0 : int padlen = MAXALIGN(len) - len;
9227 :
9228 0 : if (padlen > 0)
9229 0 : memset(page_item + len, MASK_MARKER, padlen);
9230 : }
9231 : }
9232 0 : }
|