Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * standby.c
4 : * Misc functions used in Hot Standby mode.
5 : *
6 : * All functions for handling RM_STANDBY_ID, which relate to
7 : * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8 : * Plus conflict recovery processing.
9 : *
10 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
11 : * Portions Copyright (c) 1994, Regents of the University of California
12 : *
13 : * IDENTIFICATION
14 : * src/backend/storage/ipc/standby.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 : #include "access/transam.h"
20 : #include "access/twophase.h"
21 : #include "access/xact.h"
22 : #include "access/xlog.h"
23 : #include "access/xloginsert.h"
24 : #include "miscadmin.h"
25 : #include "pgstat.h"
26 : #include "storage/bufmgr.h"
27 : #include "storage/lmgr.h"
28 : #include "storage/proc.h"
29 : #include "storage/procarray.h"
30 : #include "storage/sinvaladt.h"
31 : #include "storage/standby.h"
32 : #include "utils/ps_status.h"
33 : #include "utils/timeout.h"
34 : #include "utils/timestamp.h"
35 :
36 : /* User-settable GUC parameters */
37 : int vacuum_defer_cleanup_age;
38 : int max_standby_archive_delay = 30 * 1000;
39 : int max_standby_streaming_delay = 30 * 1000;
40 :
41 : static List *RecoveryLockList;
42 :
43 : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
44 : ProcSignalReason reason);
45 : static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
46 : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
47 : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
48 :
49 :
50 : /*
51 : * InitRecoveryTransactionEnvironment
52 : * Initialize tracking of in-progress transactions in master
53 : *
54 : * We need to issue shared invalidations and hold locks. Holding locks
55 : * means others may want to wait on us, so we need to make a lock table
56 : * vxact entry like a real transaction. We could create and delete
57 : * lock table entries for each transaction but it's simpler just to create
58 : * one permanent entry and leave it there all the time. Locks are then
59 : * acquired and released as needed. Yes, this means you can see the
60 : * Startup process in pg_locks once we have run this.
61 : */
62 : void
63 0 : InitRecoveryTransactionEnvironment(void)
64 : {
65 : VirtualTransactionId vxid;
66 :
67 : /*
68 : * Initialize shared invalidation management for Startup process, being
69 : * careful to register ourselves as a sendOnly process so we don't need to
70 : * read messages, nor will we get signalled when the queue starts filling
71 : * up.
72 : */
73 0 : SharedInvalBackendInit(true);
74 :
75 : /*
76 : * Lock a virtual transaction id for Startup process.
77 : *
78 : * We need to do GetNextLocalTransactionId() because
79 : * SharedInvalBackendInit() leaves localTransactionid invalid and the lock
80 : * manager doesn't like that at all.
81 : *
82 : * Note that we don't need to run XactLockTableInsert() because nobody
83 : * needs to wait on xids. That sounds a little strange, but table locks
84 : * are held by vxids and row level locks are held by xids. All queries
85 : * hold AccessShareLocks so never block while we write or lock new rows.
86 : */
87 0 : vxid.backendId = MyBackendId;
88 0 : vxid.localTransactionId = GetNextLocalTransactionId();
89 0 : VirtualXactLockTableInsert(vxid);
90 :
91 0 : standbyState = STANDBY_INITIALIZED;
92 0 : }
93 :
94 : /*
95 : * ShutdownRecoveryTransactionEnvironment
96 : * Shut down transaction tracking
97 : *
98 : * Prepare to switch from hot standby mode to normal operation. Shut down
99 : * recovery-time transaction tracking.
100 : */
void
ShutdownRecoveryTransactionEnvironment(void)
{
	/* Every transaction we were tracking as in-progress is now finished. */
	ExpireAllKnownAssignedTransactionIds();

	/* Drop the locks that were held on behalf of those transactions. */
	StandbyReleaseAllLocks();

	/* Finally, clean up the Startup process's own virtual transaction. */
	VirtualXactLockTableCleanup();
}
113 :
114 :
115 : /*
116 : * -----------------------------------------------------
117 : * Standby wait timers and backend cancel logic
118 : * -----------------------------------------------------
119 : */
120 :
121 : /*
122 : * Determine the cutoff time at which we want to start canceling conflicting
123 : * transactions. Returns zero (a time safely in the past) if we are willing
124 : * to wait forever.
125 : */
126 : static TimestampTz
127 0 : GetStandbyLimitTime(void)
128 : {
129 : TimestampTz rtime;
130 : bool fromStream;
131 :
132 : /*
133 : * The cutoff time is the last WAL data receipt time plus the appropriate
134 : * delay variable. Delay of -1 means wait forever.
135 : */
136 0 : GetXLogReceiptTime(&rtime, &fromStream);
137 0 : if (fromStream)
138 : {
139 0 : if (max_standby_streaming_delay < 0)
140 0 : return 0; /* wait forever */
141 0 : return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
142 : }
143 : else
144 : {
145 0 : if (max_standby_archive_delay < 0)
146 0 : return 0; /* wait forever */
147 0 : return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
148 : }
149 : }
150 :
/* Sleep length, in microseconds, used for the first wait on each xact */
#define STANDBY_INITIAL_WAIT_US  1000

/*
 * Current sleep length in microseconds.  WaitExceedsMaxStandbyDelay() doubles
 * it after each sleep; ResolveRecoveryConflictWithVirtualXIDs() resets it for
 * every virtual transaction it waits on.
 */
static int	standbyWait_us = STANDBY_INITIAL_WAIT_US;
153 :
154 : /*
155 : * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
156 : * We wait here for a while then return. If we decide we can't wait any
157 : * more then we return true, if we can wait some more return false.
158 : */
159 : static bool
160 0 : WaitExceedsMaxStandbyDelay(void)
161 : {
162 : TimestampTz ltime;
163 :
164 0 : CHECK_FOR_INTERRUPTS();
165 :
166 : /* Are we past the limit time? */
167 0 : ltime = GetStandbyLimitTime();
168 0 : if (ltime && GetCurrentTimestamp() >= ltime)
169 0 : return true;
170 :
171 : /*
172 : * Sleep a bit (this is essential to avoid busy-waiting).
173 : */
174 0 : pg_usleep(standbyWait_us);
175 :
176 : /*
177 : * Progressively increase the sleep times, but not to more than 1s, since
178 : * pg_usleep isn't interruptable on some platforms.
179 : */
180 0 : standbyWait_us *= 2;
181 0 : if (standbyWait_us > 1000000)
182 0 : standbyWait_us = 1000000;
183 :
184 0 : return false;
185 : }
186 :
187 : /*
188 : * This is the main executioner for any query backend that conflicts with
189 : * recovery processing. Judgement has already been passed on it within
190 : * a specific rmgr. Here we just issue the orders to the procs. The procs
191 : * then throw the required error as instructed.
192 : */
193 : static void
194 0 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
195 : ProcSignalReason reason)
196 : {
197 : TimestampTz waitStart;
198 : char *new_status;
199 :
200 : /* Fast exit, to avoid a kernel call if there's no work to be done. */
201 0 : if (!VirtualTransactionIdIsValid(*waitlist))
202 0 : return;
203 :
204 0 : waitStart = GetCurrentTimestamp();
205 0 : new_status = NULL; /* we haven't changed the ps display */
206 :
207 0 : while (VirtualTransactionIdIsValid(*waitlist))
208 : {
209 : /* reset standbyWait_us for each xact we wait for */
210 0 : standbyWait_us = STANDBY_INITIAL_WAIT_US;
211 :
212 : /* wait until the virtual xid is gone */
213 0 : while (!VirtualXactLock(*waitlist, false))
214 : {
215 : /*
216 : * Report via ps if we have been waiting for more than 500 msec
217 : * (should that be configurable?)
218 : */
219 0 : if (update_process_title && new_status == NULL &&
220 0 : TimestampDifferenceExceeds(waitStart, GetCurrentTimestamp(),
221 : 500))
222 : {
223 : const char *old_status;
224 : int len;
225 :
226 0 : old_status = get_ps_display(&len);
227 0 : new_status = (char *) palloc(len + 8 + 1);
228 0 : memcpy(new_status, old_status, len);
229 0 : strcpy(new_status + len, " waiting");
230 0 : set_ps_display(new_status, false);
231 0 : new_status[len] = '\0'; /* truncate off " waiting" */
232 : }
233 :
234 : /* Is it time to kill it? */
235 0 : if (WaitExceedsMaxStandbyDelay())
236 : {
237 : pid_t pid;
238 :
239 : /*
240 : * Now find out who to throw out of the balloon.
241 : */
242 0 : Assert(VirtualTransactionIdIsValid(*waitlist));
243 0 : pid = CancelVirtualTransaction(*waitlist, reason);
244 :
245 : /*
246 : * Wait a little bit for it to die so that we avoid flooding
247 : * an unresponsive backend when system is heavily loaded.
248 : */
249 0 : if (pid != 0)
250 0 : pg_usleep(5000L);
251 : }
252 : }
253 :
254 : /* The virtual transaction is gone now, wait for the next one */
255 0 : waitlist++;
256 : }
257 :
258 : /* Reset ps display if we changed it */
259 0 : if (new_status)
260 : {
261 0 : set_ps_display(new_status, false);
262 0 : pfree(new_status);
263 : }
264 : }
265 :
266 : void
267 0 : ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
268 : {
269 : VirtualTransactionId *backends;
270 :
271 : /*
272 : * If we get passed InvalidTransactionId then we are a little surprised,
273 : * but it is theoretically possible in normal running. It also happens
274 : * when replaying already applied WAL records after a standby crash or
275 : * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as
276 : * frozen a page which was already all-visible. If latestRemovedXid is
277 : * invalid then there is no conflict. That rule applies across all record
278 : * types that suffer from this conflict.
279 : */
280 0 : if (!TransactionIdIsValid(latestRemovedXid))
281 0 : return;
282 :
283 0 : backends = GetConflictingVirtualXIDs(latestRemovedXid,
284 : node.dbNode);
285 :
286 0 : ResolveRecoveryConflictWithVirtualXIDs(backends,
287 : PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
288 : }
289 :
290 : void
291 0 : ResolveRecoveryConflictWithTablespace(Oid tsid)
292 : {
293 : VirtualTransactionId *temp_file_users;
294 :
295 : /*
296 : * Standby users may be currently using this tablespace for their
297 : * temporary files. We only care about current users because
298 : * temp_tablespace parameter will just ignore tablespaces that no longer
299 : * exist.
300 : *
301 : * Ask everybody to cancel their queries immediately so we can ensure no
302 : * temp files remain and we can remove the tablespace. Nuke the entire
303 : * site from orbit, it's the only way to be sure.
304 : *
305 : * XXX: We could work out the pids of active backends using this
306 : * tablespace by examining the temp filenames in the directory. We would
307 : * then convert the pids into VirtualXIDs before attempting to cancel
308 : * them.
309 : *
310 : * We don't wait for commit because drop tablespace is non-transactional.
311 : */
312 0 : temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
313 : InvalidOid);
314 0 : ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
315 : PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
316 0 : }
317 :
318 : void
319 0 : ResolveRecoveryConflictWithDatabase(Oid dbid)
320 : {
321 : /*
322 : * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
323 : * only waits for transactions and completely idle sessions would block
324 : * us. This is rare enough that we do this as simply as possible: no wait,
325 : * just force them off immediately.
326 : *
327 : * No locking is required here because we already acquired
328 : * AccessExclusiveLock. Anybody trying to connect while we do this will
329 : * block during InitPostgres() and then disconnect when they see the
330 : * database has been removed.
331 : */
332 0 : while (CountDBBackends(dbid) > 0)
333 : {
334 0 : CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
335 :
336 : /*
337 : * Wait awhile for them to die so that we avoid flooding an
338 : * unresponsive backend when system is heavily loaded.
339 : */
340 0 : pg_usleep(10000);
341 : }
342 0 : }
343 :
344 : /*
345 : * ResolveRecoveryConflictWithLock is called from ProcSleep()
346 : * to resolve conflicts with other backends holding relation locks.
347 : *
348 : * The WaitLatch sleep normally done in ProcSleep()
349 : * (when not InHotStandby) is performed here, for code clarity.
350 : *
351 : * We either resolve conflicts immediately or set a timeout to wake us at
352 : * the limit of our patience.
353 : *
354 : * Resolve conflicts by canceling all backends holding a conflicting
355 : * lock. As we are already queued to be granted the lock, no new lock
356 : * requests conflicting with ours will be granted in the meantime.
357 : *
358 : * Deadlocks involving the Startup process and an ordinary backend process
359 : * will be detected by the deadlock detector within the ordinary backend.
360 : */
361 : void
362 0 : ResolveRecoveryConflictWithLock(LOCKTAG locktag)
363 : {
364 : TimestampTz ltime;
365 :
366 0 : Assert(InHotStandby);
367 :
368 0 : ltime = GetStandbyLimitTime();
369 :
370 0 : if (GetCurrentTimestamp() >= ltime)
371 : {
372 : /*
373 : * We're already behind, so clear a path as quickly as possible.
374 : */
375 : VirtualTransactionId *backends;
376 :
377 0 : backends = GetLockConflicts(&locktag, AccessExclusiveLock);
378 0 : ResolveRecoveryConflictWithVirtualXIDs(backends,
379 : PROCSIG_RECOVERY_CONFLICT_LOCK);
380 : }
381 : else
382 : {
383 : /*
384 : * Wait (or wait again) until ltime
385 : */
386 : EnableTimeoutParams timeouts[1];
387 :
388 0 : timeouts[0].id = STANDBY_LOCK_TIMEOUT;
389 0 : timeouts[0].type = TMPARAM_AT;
390 0 : timeouts[0].fin_time = ltime;
391 0 : enable_timeouts(timeouts, 1);
392 : }
393 :
394 : /* Wait to be signaled by the release of the Relation Lock */
395 0 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
396 :
397 : /*
398 : * Clear any timeout requests established above. We assume here that the
399 : * Startup process doesn't have any other outstanding timeouts than those
400 : * used by this function. If that stops being true, we could cancel the
401 : * timeouts individually, but that'd be slower.
402 : */
403 0 : disable_all_timeouts(false);
404 0 : }
405 :
406 : /*
407 : * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
408 : * to resolve conflicts with other backends holding buffer pins.
409 : *
410 : * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
411 : * (when not InHotStandby) is performed here, for code clarity.
412 : *
413 : * We either resolve conflicts immediately or set a timeout to wake us at
414 : * the limit of our patience.
415 : *
416 : * Resolve conflicts by sending a PROCSIG signal to all backends to check if
417 : * they hold one of the buffer pins that is blocking Startup process. If so,
418 : * those backends will take an appropriate error action, ERROR or FATAL.
419 : *
420 : * We also must check for deadlocks. Deadlocks occur because if queries
421 : * wait on a lock, that must be behind an AccessExclusiveLock, which can only
422 : * be cleared if the Startup process replays a transaction completion record.
423 : * If Startup process is also waiting then that is a deadlock. The deadlock
424 : * can occur if the query is waiting and then the Startup sleeps, or if
425 : * Startup is sleeping and the query waits on a lock. We protect against
426 : * only the former sequence here, the latter sequence is checked prior to
427 : * the query sleeping, in CheckRecoveryConflictDeadlock().
428 : *
429 : * Deadlocks are extremely rare, and relatively expensive to check for,
430 : * so we don't do a deadlock check right away ... only if we have had to wait
431 : * at least deadlock_timeout.
432 : */
433 : void
434 0 : ResolveRecoveryConflictWithBufferPin(void)
435 : {
436 : TimestampTz ltime;
437 :
438 0 : Assert(InHotStandby);
439 :
440 0 : ltime = GetStandbyLimitTime();
441 :
442 0 : if (ltime == 0)
443 : {
444 : /*
445 : * We're willing to wait forever for conflicts, so set timeout for
446 : * deadlock check only
447 : */
448 0 : enable_timeout_after(STANDBY_DEADLOCK_TIMEOUT, DeadlockTimeout);
449 : }
450 0 : else if (GetCurrentTimestamp() >= ltime)
451 : {
452 : /*
453 : * We're already behind, so clear a path as quickly as possible.
454 : */
455 0 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
456 : }
457 : else
458 : {
459 : /*
460 : * Wake up at ltime, and check for deadlocks as well if we will be
461 : * waiting longer than deadlock_timeout
462 : */
463 : EnableTimeoutParams timeouts[2];
464 :
465 0 : timeouts[0].id = STANDBY_TIMEOUT;
466 0 : timeouts[0].type = TMPARAM_AT;
467 0 : timeouts[0].fin_time = ltime;
468 0 : timeouts[1].id = STANDBY_DEADLOCK_TIMEOUT;
469 0 : timeouts[1].type = TMPARAM_AFTER;
470 0 : timeouts[1].delay_ms = DeadlockTimeout;
471 0 : enable_timeouts(timeouts, 2);
472 : }
473 :
474 : /* Wait to be signaled by UnpinBuffer() */
475 0 : ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
476 :
477 : /*
478 : * Clear any timeout requests established above. We assume here that the
479 : * Startup process doesn't have any other timeouts than what this function
480 : * uses. If that stops being true, we could cancel the timeouts
481 : * individually, but that'd be slower.
482 : */
483 0 : disable_all_timeouts(false);
484 0 : }
485 :
486 : static void
487 0 : SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
488 : {
489 0 : Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
490 : reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
491 :
492 : /*
493 : * We send signal to all backends to ask them if they are holding the
494 : * buffer pin which is delaying the Startup process. We must not set the
495 : * conflict flag yet, since most backends will be innocent. Let the
496 : * SIGUSR1 handling in each backend decide their own fate.
497 : */
498 0 : CancelDBBackends(InvalidOid, reason, false);
499 0 : }
500 :
501 : /*
502 : * In Hot Standby perform early deadlock detection. We abort the lock
503 : * wait if we are about to sleep while holding the buffer pin that Startup
504 : * process is waiting for.
505 : *
506 : * Note: this code is pessimistic, because there is no way for it to
507 : * determine whether an actual deadlock condition is present: the lock we
508 : * need to wait for might be unrelated to any held by the Startup process.
509 : * Sooner or later, this mechanism should get ripped out in favor of somehow
510 : * accounting for buffer locks in DeadLockCheck(). However, errors here
511 : * seem to be very low-probability in practice, so for now it's not worth
512 : * the trouble.
513 : */
514 : void
515 0 : CheckRecoveryConflictDeadlock(void)
516 : {
517 0 : Assert(!InRecovery); /* do not call in Startup process */
518 :
519 0 : if (!HoldingBufferPinThatDelaysRecovery())
520 0 : return;
521 :
522 : /*
523 : * Error message should match ProcessInterrupts() but we avoid calling
524 : * that because we aren't handling an interrupt at this point. Note that
525 : * we only cancel the current transaction here, so if we are in a
526 : * subtransaction and the pin is held by a parent, then the Startup
527 : * process will continue to wait even though we have avoided deadlock.
528 : */
529 0 : ereport(ERROR,
530 : (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
531 : errmsg("canceling statement due to conflict with recovery"),
532 : errdetail("User transaction caused buffer deadlock with recovery.")));
533 : }
534 :
535 :
536 : /* --------------------------------
537 : * timeout handler routines
538 : * --------------------------------
539 : */
540 :
541 : /*
542 : * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT
543 : * occurs before STANDBY_TIMEOUT. Send out a request for hot-standby
544 : * backends to check themselves for deadlocks.
545 : */
546 : void
547 0 : StandbyDeadLockHandler(void)
548 : {
549 0 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
550 0 : }
551 :
552 : /*
553 : * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
554 : * Send out a request to release conflicting buffer pins unconditionally,
555 : * so we can press ahead with applying changes in recovery.
556 : */
557 : void
558 0 : StandbyTimeoutHandler(void)
559 : {
560 : /* forget any pending STANDBY_DEADLOCK_TIMEOUT request */
561 0 : disable_timeout(STANDBY_DEADLOCK_TIMEOUT, false);
562 :
563 0 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
564 0 : }
565 :
566 : /*
567 : * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
568 : * This doesn't need to do anything, simply waking up is enough.
569 : */
void
StandbyLockTimeoutHandler(void)
{
	/* Intentionally empty: being woken by the timeout is all that is needed. */
}
574 :
575 : /*
576 : * -----------------------------------------------------
577 : * Locking in Recovery Mode
578 : * -----------------------------------------------------
579 : *
580 : * All locks are held by the Startup process using a single virtual
581 : * transaction. This implementation is both simpler and in some senses,
582 : * more correct. The locks held mean "some original transaction held
583 : * this lock, so query access is not allowed at this time". So the Startup
584 : * process is the proxy by which the original locks are implemented.
585 : *
586 : * We only keep track of AccessExclusiveLocks, which are only ever held by
587 : * one transaction on one relation.
588 : *
589 : * We keep a single dynamically expandable list of locks in local memory,
590 : * RecoveryLockList, so we can keep track of the various entries made by
591 : * the Startup process's virtual xid in the shared lock table.
592 : *
593 : * List elements use type xl_standby_lock, since the WAL record type exactly
594 : * matches the information that we need to keep track of.
595 : *
596 : * We use session locks rather than normal locks so we don't need
597 : * ResourceOwners.
598 : */
599 :
600 :
601 : void
602 0 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
603 : {
604 : xl_standby_lock *newlock;
605 : LOCKTAG locktag;
606 :
607 : /* Already processed? */
608 0 : if (!TransactionIdIsValid(xid) ||
609 0 : TransactionIdDidCommit(xid) ||
610 0 : TransactionIdDidAbort(xid))
611 0 : return;
612 :
613 0 : elog(trace_recovery(DEBUG4),
614 : "adding recovery lock: db %u rel %u", dbOid, relOid);
615 :
616 : /* dbOid is InvalidOid when we are locking a shared relation. */
617 0 : Assert(OidIsValid(relOid));
618 :
619 0 : newlock = palloc(sizeof(xl_standby_lock));
620 0 : newlock->xid = xid;
621 0 : newlock->dbOid = dbOid;
622 0 : newlock->relOid = relOid;
623 0 : RecoveryLockList = lappend(RecoveryLockList, newlock);
624 :
625 0 : SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
626 :
627 0 : LockAcquireExtended(&locktag, AccessExclusiveLock, true, false, false);
628 : }
629 :
630 : static void
631 0 : StandbyReleaseLocks(TransactionId xid)
632 : {
633 : ListCell *cell,
634 : *prev,
635 : *next;
636 :
637 : /*
638 : * Release all matching locks and remove them from list
639 : */
640 0 : prev = NULL;
641 0 : for (cell = list_head(RecoveryLockList); cell; cell = next)
642 : {
643 0 : xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
644 :
645 0 : next = lnext(cell);
646 :
647 0 : if (!TransactionIdIsValid(xid) || lock->xid == xid)
648 0 : {
649 : LOCKTAG locktag;
650 :
651 0 : elog(trace_recovery(DEBUG4),
652 : "releasing recovery lock: xid %u db %u rel %u",
653 : lock->xid, lock->dbOid, lock->relOid);
654 0 : SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
655 0 : if (!LockRelease(&locktag, AccessExclusiveLock, true))
656 0 : elog(LOG,
657 : "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
658 : lock->xid, lock->dbOid, lock->relOid);
659 :
660 0 : RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
661 0 : pfree(lock);
662 : }
663 : else
664 0 : prev = cell;
665 : }
666 0 : }
667 :
668 : /*
669 : * Release locks for a transaction tree, starting at xid down, from
670 : * RecoveryLockList.
671 : *
672 : * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
673 : * to remove any AccessExclusiveLocks requested by a transaction.
674 : */
675 : void
676 0 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
677 : {
678 : int i;
679 :
680 0 : StandbyReleaseLocks(xid);
681 :
682 0 : for (i = 0; i < nsubxids; i++)
683 0 : StandbyReleaseLocks(subxids[i]);
684 0 : }
685 :
686 : /*
687 : * Called at end of recovery and when we see a shutdown checkpoint.
688 : */
689 : void
690 0 : StandbyReleaseAllLocks(void)
691 : {
692 : ListCell *cell,
693 : *prev,
694 : *next;
695 : LOCKTAG locktag;
696 :
697 0 : elog(trace_recovery(DEBUG2), "release all standby locks");
698 :
699 0 : prev = NULL;
700 0 : for (cell = list_head(RecoveryLockList); cell; cell = next)
701 : {
702 0 : xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
703 :
704 0 : next = lnext(cell);
705 :
706 0 : elog(trace_recovery(DEBUG4),
707 : "releasing recovery lock: xid %u db %u rel %u",
708 : lock->xid, lock->dbOid, lock->relOid);
709 0 : SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
710 0 : if (!LockRelease(&locktag, AccessExclusiveLock, true))
711 0 : elog(LOG,
712 : "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
713 : lock->xid, lock->dbOid, lock->relOid);
714 0 : RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
715 0 : pfree(lock);
716 : }
717 0 : }
718 :
719 : /*
720 : * StandbyReleaseOldLocks
721 : * Release standby locks held by top-level XIDs that aren't running,
722 : * as long as they're not prepared transactions.
723 : */
724 : void
725 0 : StandbyReleaseOldLocks(int nxids, TransactionId *xids)
726 : {
727 : ListCell *cell,
728 : *prev,
729 : *next;
730 : LOCKTAG locktag;
731 :
732 0 : prev = NULL;
733 0 : for (cell = list_head(RecoveryLockList); cell; cell = next)
734 : {
735 0 : xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell);
736 0 : bool remove = false;
737 :
738 0 : next = lnext(cell);
739 :
740 0 : Assert(TransactionIdIsValid(lock->xid));
741 :
742 0 : if (StandbyTransactionIdIsPrepared(lock->xid))
743 0 : remove = false;
744 : else
745 : {
746 : int i;
747 0 : bool found = false;
748 :
749 0 : for (i = 0; i < nxids; i++)
750 : {
751 0 : if (lock->xid == xids[i])
752 : {
753 0 : found = true;
754 0 : break;
755 : }
756 : }
757 :
758 : /*
759 : * If its not a running transaction, remove it.
760 : */
761 0 : if (!found)
762 0 : remove = true;
763 : }
764 :
765 0 : if (remove)
766 : {
767 0 : elog(trace_recovery(DEBUG4),
768 : "releasing recovery lock: xid %u db %u rel %u",
769 : lock->xid, lock->dbOid, lock->relOid);
770 0 : SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
771 0 : if (!LockRelease(&locktag, AccessExclusiveLock, true))
772 0 : elog(LOG,
773 : "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
774 : lock->xid, lock->dbOid, lock->relOid);
775 0 : RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev);
776 0 : pfree(lock);
777 : }
778 : else
779 0 : prev = cell;
780 : }
781 0 : }
782 :
783 : /*
784 : * --------------------------------------------------------------------
785 : * Recovery handling for Rmgr RM_STANDBY_ID
786 : *
787 : * These record types will only be created if XLogStandbyInfoActive()
788 : * --------------------------------------------------------------------
789 : */
790 :
791 : void
792 0 : standby_redo(XLogReaderState *record)
793 : {
794 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
795 :
796 : /* Backup blocks are not used in standby records */
797 0 : Assert(!XLogRecHasAnyBlockRefs(record));
798 :
799 : /* Do nothing if we're not in hot standby mode */
800 0 : if (standbyState == STANDBY_DISABLED)
801 0 : return;
802 :
803 0 : if (info == XLOG_STANDBY_LOCK)
804 : {
805 0 : xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
806 : int i;
807 :
808 0 : for (i = 0; i < xlrec->nlocks; i++)
809 0 : StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
810 : xlrec->locks[i].dbOid,
811 : xlrec->locks[i].relOid);
812 : }
813 0 : else if (info == XLOG_RUNNING_XACTS)
814 : {
815 0 : xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
816 : RunningTransactionsData running;
817 :
818 0 : running.xcnt = xlrec->xcnt;
819 0 : running.subxcnt = xlrec->subxcnt;
820 0 : running.subxid_overflow = xlrec->subxid_overflow;
821 0 : running.nextXid = xlrec->nextXid;
822 0 : running.latestCompletedXid = xlrec->latestCompletedXid;
823 0 : running.oldestRunningXid = xlrec->oldestRunningXid;
824 0 : running.xids = xlrec->xids;
825 :
826 0 : ProcArrayApplyRecoveryInfo(&running);
827 : }
828 0 : else if (info == XLOG_INVALIDATIONS)
829 : {
830 0 : xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
831 :
832 0 : ProcessCommittedInvalidationMessages(xlrec->msgs,
833 : xlrec->nmsgs,
834 0 : xlrec->relcacheInitFileInval,
835 : xlrec->dbId,
836 : xlrec->tsId);
837 : }
838 : else
839 0 : elog(PANIC, "standby_redo: unknown op code %u", info);
840 : }
841 :
842 : /*
843 : * Log details of the current snapshot to WAL. This allows the snapshot state
844 : * to be reconstructed on the standby and for logical decoding.
845 : *
846 : * This is used for Hot Standby as follows:
847 : *
848 : * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
849 : * start from a shutdown checkpoint because we know nothing was running
850 : * at that time and our recovery snapshot is known empty. In the more
851 : * typical case of an online checkpoint we need to jump through a few
852 : * hoops to get a correct recovery snapshot and this requires a two or
853 : * sometimes a three stage process.
854 : *
855 : * The initial snapshot must contain all running xids and all current
856 : * AccessExclusiveLocks at a point in time on the standby. Assembling
857 : * that information while the server is running requires many and
858 : * various LWLocks, so we choose to derive that information piece by
859 : * piece and then re-assemble that info on the standby. When that
860 : * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
861 : *
862 : * Since locking on the primary when we derive the information is not
863 : * strict, we note that there is a time window between the derivation and
864 : * writing to WAL of the derived information. That allows race conditions
865 : * that we must resolve, since xids and locks may enter or leave the
866 : * snapshot during that window. This creates the issue that an xid or
867 : * lock may start *after* the snapshot has been derived yet *before* the
868 : * snapshot is logged in the running xacts WAL record. We resolve this by
869 : * starting to accumulate changes at a point just prior to when we derive
870 : * the snapshot on the primary, then ignore duplicates when we later apply
871 : * the snapshot from the running xacts record. This is implemented during
872 : * CreateCheckpoint() where we use the logical checkpoint location as
873 : * our starting point and then write the running xacts record immediately
874 : * before writing the main checkpoint WAL record. Since we always start
875 : * up from a checkpoint and are immediately at our starting point, we
876 : * unconditionally move to STANDBY_INITIALIZED. After this point we
877 : * must do 4 things:
878 : * * move shared nextXid forwards as we see new xids
879 : * * extend the clog and subtrans with each new xid
880 : * * keep track of uncommitted known assigned xids
881 : * * keep track of uncommitted AccessExclusiveLocks
882 : *
883 : * When we see a commit/abort we must remove known assigned xids and locks
884 : * from the completing transaction. Attempted removals that cannot locate
885 : * an entry are expected and must not cause an error when we are in state
886 : * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
887 : * KnownAssignedXidsRemove().
888 : *
889 : * Later, when we apply the running xact data we must be careful to ignore
890 : * transactions already committed, since those commits raced ahead when
891 : * making WAL entries.
892 : *
893 : * The loose timing also means that locks may be recorded that have a
894 : * zero xid, since xids are removed from procs before locks are removed.
895 : * So we must prune the lock list down to ensure we hold locks only for
896 : * currently running xids, performed by StandbyReleaseOldLocks().
897 : * Zero xids should no longer be possible, but we may be replaying WAL
898 : * from a time when they were possible.
899 : *
900 : * For logical decoding only the running xacts information is needed;
901 : * there's no need to look at the locking information, but it's logged anyway,
902 : * as there's no independent knob to just enable logical decoding. For
903 : * details of how this is used, check snapbuild.c's introductory comment.
904 : *
905 : *
906 : * Returns the RecPtr of the last inserted record.
907 : */
908 : XLogRecPtr
909 14 : LogStandbySnapshot(void)
910 : {
911 : XLogRecPtr recptr;
912 : RunningTransactions running;
913 : xl_standby_lock *locks;
914 : int nlocks;
915 :
916 14 : Assert(XLogStandbyInfoActive());
917 :
918 : /*
919 : * Get details of any AccessExclusiveLocks being held at the moment.
920 : */
921 14 : locks = GetRunningTransactionLocks(&nlocks);
922 14 : if (nlocks > 0)
923 1 : LogAccessExclusiveLocks(nlocks, locks);
924 14 : pfree(locks);
925 :
926 : /*
927 : * Log details of all in-progress transactions. This should be the last
928 : * record we write, because standby will open up when it sees this.
929 : */
930 14 : running = GetRunningTransactionData();
931 :
932 : /*
933 : * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
934 : * For Hot Standby this can be done before inserting the WAL record
935 : * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
936 : * the clog. For logical decoding, though, the lock can't be released
937 : * early because the clog might be "in the future" from the POV of the
938 : * historic snapshot. This would allow for situations where we're waiting
939 : * for the end of a transaction listed in the xl_running_xacts record
940 : * which, according to the WAL, has committed before the xl_running_xacts
941 : * record. Fortunately this routine isn't executed frequently, and it's
942 : * only a shared lock.
943 : */
944 14 : if (wal_level < WAL_LEVEL_LOGICAL)
945 14 : LWLockRelease(ProcArrayLock);
946 :
947 14 : recptr = LogCurrentRunningXacts(running);
948 :
949 : /* Release lock if we kept it longer ... */
950 14 : if (wal_level >= WAL_LEVEL_LOGICAL)
951 0 : LWLockRelease(ProcArrayLock);
952 :
953 : /* GetRunningTransactionData() acquired XidGenLock, we must release it */
954 14 : LWLockRelease(XidGenLock);
955 :
956 14 : return recptr;
957 : }
958 :
959 : /*
960 : * Record an enhanced snapshot of running transactions into WAL.
961 : *
962 : * The definitions of RunningTransactionsData and xl_xact_running_xacts are
963 : * similar. We keep them separate because xl_xact_running_xacts is a
964 : * contiguous chunk of memory and never exists fully until it is assembled in
965 : * WAL. The inserted records are marked as not being important for durability,
966 : * to avoid triggering superfluous checkpoint / archiving activity.
967 : */
968 : static XLogRecPtr
969 14 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
970 : {
971 : xl_running_xacts xlrec;
972 : XLogRecPtr recptr;
973 :
974 14 : xlrec.xcnt = CurrRunningXacts->xcnt;
975 14 : xlrec.subxcnt = CurrRunningXacts->subxcnt;
976 14 : xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
977 14 : xlrec.nextXid = CurrRunningXacts->nextXid;
978 14 : xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
979 14 : xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
980 :
981 : /* Header */
982 14 : XLogBeginInsert();
983 14 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
984 14 : XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
985 :
986 : /* array of TransactionIds */
987 14 : if (xlrec.xcnt > 0)
988 10 : XLogRegisterData((char *) CurrRunningXacts->xids,
989 10 : (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
990 :
991 14 : recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
992 :
993 14 : if (CurrRunningXacts->subxid_overflow)
994 0 : elog(trace_recovery(DEBUG2),
995 : "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
996 : CurrRunningXacts->xcnt,
997 : (uint32) (recptr >> 32), (uint32) recptr,
998 : CurrRunningXacts->oldestRunningXid,
999 : CurrRunningXacts->latestCompletedXid,
1000 : CurrRunningXacts->nextXid);
1001 : else
1002 14 : elog(trace_recovery(DEBUG2),
1003 : "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1004 : CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
1005 : (uint32) (recptr >> 32), (uint32) recptr,
1006 : CurrRunningXacts->oldestRunningXid,
1007 : CurrRunningXacts->latestCompletedXid,
1008 : CurrRunningXacts->nextXid);
1009 :
1010 : /*
1011 : * Ensure running_xacts information is synced to disk not too far in the
1012 : * future. We don't want to stall anything though (i.e. use XLogFlush()),
1013 : * so we let the wal writer do it during normal operation.
1014 : * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
1015 : * and nudge the WALWriter into action if sleeping. Check
1016 : * XLogBackgroundFlush() for details why a record might not be flushed
1017 : * without it.
1018 : */
1019 14 : XLogSetAsyncXactLSN(recptr);
1020 :
1021 14 : return recptr;
1022 : }
1023 :
1024 : /*
1025 : * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
1026 : * logged, as described in backend/storage/lmgr/README.
1027 : */
1028 : static void
1029 8808 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
1030 : {
1031 : xl_standby_locks xlrec;
1032 :
1033 8808 : xlrec.nlocks = nlocks;
1034 :
1035 8808 : XLogBeginInsert();
1036 8808 : XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
1037 8808 : XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
1038 8808 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
1039 :
1040 8808 : (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
1041 8808 : }
1042 :
1043 : /*
1044 : * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1045 : */
1046 : void
1047 8807 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
1048 : {
1049 : xl_standby_lock xlrec;
1050 :
1051 8807 : xlrec.xid = GetCurrentTransactionId();
1052 :
1053 : /*
1054 : * Decode the locktag back to the original values, to avoid sending lots
1055 : * of empty bytes with every message. See lock.h to check how a locktag
1056 : * is defined for LOCKTAG_RELATION
1057 : */
1058 8807 : xlrec.dbOid = dbOid;
1059 8807 : xlrec.relOid = relOid;
1060 :
1061 8807 : LogAccessExclusiveLocks(1, &xlrec);
1062 8807 : MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
1063 8807 : }
1064 :
/*
 * LogAccessExclusiveLockPrepare
 *		Get ready to log an AccessExclusiveLock; meant to be called during
 *		LockAcquire().
 *
 * All this does is force assignment of a TransactionId to the current
 * transaction.
 */
void
LogAccessExclusiveLockPrepare(void)
{
	/*
	 * The xid is needed for two reasons, both concerning lock release on
	 * the standby:
	 *
	 * 1. Without an xid, RecordTransactionCommit() and
	 * RecordTransactionAbort() would optimise away the transaction
	 * completion record, and recovery depends on that record to release
	 * locks.  This is a hack, but the corner case is not worth extra code
	 * in the main commit path.
	 *
	 * 2. The xid has to exist before the lock is recorded in shared memory.
	 * Otherwise a concurrent GetRunningTransactionLocks() could observe a
	 * lock tied to InvalidTransactionId, which is later asserted to be
	 * impossible.
	 */
	(void) GetCurrentTransactionId();
}
1085 :
1086 : /*
1087 : * Emit WAL for invalidations. This currently is only used for commits without
1088 : * an xid but which contain invalidations.
1089 : */
1090 : void
1091 261 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
1092 : bool relcacheInitFileInval)
1093 : {
1094 : xl_invalidations xlrec;
1095 :
1096 : /* prepare record */
1097 261 : memset(&xlrec, 0, sizeof(xlrec));
1098 261 : xlrec.dbId = MyDatabaseId;
1099 261 : xlrec.tsId = MyDatabaseTableSpace;
1100 261 : xlrec.relcacheInitFileInval = relcacheInitFileInval;
1101 261 : xlrec.nmsgs = nmsgs;
1102 :
1103 : /* perform insertion */
1104 261 : XLogBeginInsert();
1105 261 : XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
1106 261 : XLogRegisterData((char *) msgs,
1107 261 : nmsgs * sizeof(SharedInvalidationMessage));
1108 261 : XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
1109 261 : }
|