LCOV - code coverage report
Current view: top level - src/backend/access/transam - xlog.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 1339 3388 39.5 %
Date: 2017-09-29 13:40:31 Functions: 76 136 55.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * xlog.c
       4             :  *      PostgreSQL write-ahead log manager
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * src/backend/access/transam/xlog.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include <ctype.h>
      18             : #include <math.h>
      19             : #include <time.h>
      20             : #include <fcntl.h>
      21             : #include <sys/stat.h>
      22             : #include <sys/time.h>
      23             : #include <unistd.h>
      24             : 
      25             : #include "access/clog.h"
      26             : #include "access/commit_ts.h"
      27             : #include "access/multixact.h"
      28             : #include "access/rewriteheap.h"
      29             : #include "access/subtrans.h"
      30             : #include "access/timeline.h"
      31             : #include "access/transam.h"
      32             : #include "access/tuptoaster.h"
      33             : #include "access/twophase.h"
      34             : #include "access/xact.h"
      35             : #include "access/xlog_internal.h"
      36             : #include "access/xloginsert.h"
      37             : #include "access/xlogreader.h"
      38             : #include "access/xlogutils.h"
      39             : #include "catalog/catversion.h"
      40             : #include "catalog/pg_control.h"
      41             : #include "catalog/pg_database.h"
      42             : #include "commands/tablespace.h"
      43             : #include "miscadmin.h"
      44             : #include "pgstat.h"
      45             : #include "port/atomics.h"
      46             : #include "postmaster/bgwriter.h"
      47             : #include "postmaster/walwriter.h"
      48             : #include "postmaster/startup.h"
      49             : #include "replication/basebackup.h"
      50             : #include "replication/logical.h"
      51             : #include "replication/slot.h"
      52             : #include "replication/origin.h"
      53             : #include "replication/snapbuild.h"
      54             : #include "replication/walreceiver.h"
      55             : #include "replication/walsender.h"
      56             : #include "storage/bufmgr.h"
      57             : #include "storage/fd.h"
      58             : #include "storage/ipc.h"
      59             : #include "storage/large_object.h"
      60             : #include "storage/latch.h"
      61             : #include "storage/pmsignal.h"
      62             : #include "storage/predicate.h"
      63             : #include "storage/proc.h"
      64             : #include "storage/procarray.h"
      65             : #include "storage/reinit.h"
      66             : #include "storage/smgr.h"
      67             : #include "storage/spin.h"
      68             : #include "utils/backend_random.h"
      69             : #include "utils/builtins.h"
      70             : #include "utils/guc.h"
      71             : #include "utils/memutils.h"
      72             : #include "utils/pg_lsn.h"
      73             : #include "utils/ps_status.h"
      74             : #include "utils/relmapper.h"
      75             : #include "utils/snapmgr.h"
      76             : #include "utils/timestamp.h"
      77             : #include "pg_trace.h"
      78             : 
      79             : extern uint32 bootstrap_data_checksum_version;
      80             : 
      81             : /* File path names (all relative to $PGDATA) */
      82             : #define RECOVERY_COMMAND_FILE   "recovery.conf"
      83             : #define RECOVERY_COMMAND_DONE   "recovery.done"
      84             : #define PROMOTE_SIGNAL_FILE     "promote"
      85             : #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
      86             : 
      87             : 
      88             : /* User-settable parameters */
      89             : int         max_wal_size_mb = 1024; /* 1 GB */
      90             : int         min_wal_size_mb = 80;   /* 80 MB */
      91             : int         wal_keep_segments = 0;
      92             : int         XLOGbuffers = -1;
      93             : int         XLogArchiveTimeout = 0;
      94             : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
      95             : char       *XLogArchiveCommand = NULL;
      96             : bool        EnableHotStandby = false;
      97             : bool        fullPageWrites = true;
      98             : bool        wal_log_hints = false;
      99             : bool        wal_compression = false;
     100             : char       *wal_consistency_checking_string = NULL;
     101             : bool       *wal_consistency_checking = NULL;
     102             : bool        log_checkpoints = false;
     103             : int         sync_method = DEFAULT_SYNC_METHOD;
     104             : int         wal_level = WAL_LEVEL_MINIMAL;
     105             : int         CommitDelay = 0;    /* precommit delay in microseconds */
     106             : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
     107             : int         wal_retrieve_retry_interval = 5000;
     108             : 
     109             : #ifdef WAL_DEBUG
     110             : bool        XLOG_DEBUG = false;
     111             : #endif
     112             : 
     113             : /*
     114             :  * Number of WAL insertion locks to use. A higher value allows more insertions
     115             :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
     116             :  * which needs to iterate all the locks.
     117             :  */
     118             : #define NUM_XLOGINSERT_LOCKS  8
     119             : 
     120             : /*
     121             :  * Max distance from last checkpoint, before triggering a new xlog-based
     122             :  * checkpoint.
     123             :  */
     124             : int         CheckPointSegments;
     125             : 
     126             : /* Estimated distance between checkpoints, in bytes */
     127             : static double CheckPointDistanceEstimate = 0;
     128             : static double PrevCheckPointDistance = 0;
     129             : 
     130             : /*
     131             :  * GUC support
     132             :  */
     133             : const struct config_enum_entry sync_method_options[] = {    /* presumably the choices offered for sync_method -- confirm at the GUC definition */
     134             :     {"fsync", SYNC_METHOD_FSYNC, false},    /* the only entry not guarded by an #ifdef */
     135             : #ifdef HAVE_FSYNC_WRITETHROUGH
     136             :     {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
     137             : #endif  /* HAVE_FSYNC_WRITETHROUGH */
     138             : #ifdef HAVE_FDATASYNC
     139             :     {"fdatasync", SYNC_METHOD_FDATASYNC, false},
     140             : #endif  /* HAVE_FDATASYNC */
     141             : #ifdef OPEN_SYNC_FLAG
     142             :     {"open_sync", SYNC_METHOD_OPEN, false},
     143             : #endif  /* OPEN_SYNC_FLAG */
     144             : #ifdef OPEN_DATASYNC_FLAG
     145             :     {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
     146             : #endif  /* OPEN_DATASYNC_FLAG */
     147             :     {NULL, 0, false}    /* NULL-named last entry; presumably the end-of-list marker */
     148             : };
     149             : 
     150             : 
     151             : /*
     152             :  * Although only "on", "off", and "always" are documented,
     153             :  * we accept all the likely variants of "on" and "off".
     154             :  */
     155             : const struct config_enum_entry archive_mode_options[] = {
     156             :     {"always", ARCHIVE_MODE_ALWAYS, false}, /* documented spelling */
     157             :     {"on", ARCHIVE_MODE_ON, false}, /* documented spelling */
     158             :     {"off", ARCHIVE_MODE_OFF, false},   /* documented spelling */
     159             :     {"true", ARCHIVE_MODE_ON, true},    /* undocumented variant of "on"; third field is true for all of these (presumably a "hidden" flag -- confirm in config_enum_entry) */
     160             :     {"false", ARCHIVE_MODE_OFF, true},  /* undocumented variant of "off" */
     161             :     {"yes", ARCHIVE_MODE_ON, true}, /* undocumented variant of "on" */
     162             :     {"no", ARCHIVE_MODE_OFF, true}, /* undocumented variant of "off" */
     163             :     {"1", ARCHIVE_MODE_ON, true},   /* undocumented variant of "on" */
     164             :     {"0", ARCHIVE_MODE_OFF, true},  /* undocumented variant of "off" */
     165             :     {NULL, 0, false}    /* NULL-named last entry; presumably the end-of-list marker */
     166             : };
     167             : 
     168             : /*
     169             :  * Statistics for current checkpoint are collected in this global struct.
     170             :  * Because only the checkpointer or a stand-alone backend can perform
     171             :  * checkpoints, this will be unused in normal backends.
     172             :  */
     173             : CheckpointStatsData CheckpointStats;
     174             : 
     175             : /*
     176             :  * ThisTimeLineID will be same in all backends --- it identifies current
     177             :  * WAL timeline for the database system.
     178             :  */
     179             : TimeLineID  ThisTimeLineID = 0;
     180             : 
     181             : /*
     182             :  * Are we doing recovery from XLOG?
     183             :  *
     184             :  * This is only ever true in the startup process; it should be read as meaning
     185             :  * "this process is replaying WAL records", rather than "the system is in
     186             :  * recovery mode".  It should be examined primarily by functions that need
     187             :  * to act differently when called from a WAL redo function (e.g., to skip WAL
     188             :  * logging).  To check whether the system is in recovery regardless of which
     189             :  * process you're running in, use RecoveryInProgress() but only after shared
     190             :  * memory startup and lock initialization.
     191             :  */
     192             : bool        InRecovery = false;
     193             : 
     194             : /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
     195             : HotStandbyState standbyState = STANDBY_DISABLED;
     196             : 
     197             : static XLogRecPtr LastRec;
     198             : 
     199             : /* Local copy of WalRcv->receivedUpto */
     200             : static XLogRecPtr receivedUpto = 0;
     201             : static TimeLineID receiveTLI = 0;
     202             : 
     203             : /*
     204             :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
     205             :  * the replayed WAL records indicate. It's initialized with full_page_writes
     206             :  * that the recovery starting checkpoint record indicates, and then updated
     207             :  * each time XLOG_FPW_CHANGE record is replayed.
     208             :  */
     209             : static bool lastFullPageWrites;
     210             : 
     211             : /*
     212             :  * Local copy of SharedRecoveryInProgress variable. True actually means "not
     213             :  * known, need to check the shared state".
     214             :  */
     215             : static bool LocalRecoveryInProgress = true;
     216             : 
     217             : /*
     218             :  * Local copy of SharedHotStandbyActive variable. False actually means "not
     219             :  * known, need to check the shared state".
     220             :  */
     221             : static bool LocalHotStandbyActive = false;
     222             : 
     223             : /*
     224             :  * Local state for XLogInsertAllowed():
     225             :  *      1: unconditionally allowed to insert XLOG
     226             :  *      0: unconditionally not allowed to insert XLOG
     227             :  *      -1: must check RecoveryInProgress(); disallow until it is false
     228             :  * Most processes start with -1 and transition to 1 after seeing that recovery
     229             :  * is not in progress.  But we can also force the value for special cases.
     230             :  * The coding in XLogInsertAllowed() depends on the first two of these states
     231             :  * being numerically the same as bool true and false.
     232             :  */
     233             : static int  LocalXLogInsertAllowed = -1;
     234             : 
     235             : /*
     236             :  * When ArchiveRecoveryRequested is set, archive recovery was requested,
     237             :  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
     238             :  * currently recovering using offline XLOG archives. These variables are only
     239             :  * valid in the startup process.
     240             :  *
     241             :  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
     242             :  * currently performing crash recovery using only XLOG files in pg_wal, but
     243             :  * will switch to using offline XLOG archives as soon as we reach the end of
     244             :  * WAL in pg_wal.
     245             :  */
     246             : bool        ArchiveRecoveryRequested = false;
     247             : bool        InArchiveRecovery = false;
     248             : 
     249             : /* Was the last xlog file restored from archive, or local? */
     250             : static bool restoredFromArchive = false;
     251             : 
     252             : /* Buffers dedicated to consistency checks of size BLCKSZ */
     253             : static char *replay_image_masked = NULL;
     254             : static char *master_image_masked = NULL;
     255             : 
     256             : /* options taken from recovery.conf for archive recovery */
     257             : char       *recoveryRestoreCommand = NULL;
     258             : static char *recoveryEndCommand = NULL;
     259             : static char *archiveCleanupCommand = NULL;
     260             : static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
     261             : static bool recoveryTargetInclusive = true;
     262             : static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
     263             : static TransactionId recoveryTargetXid;
     264             : static TimestampTz recoveryTargetTime;
     265             : static char *recoveryTargetName;
     266             : static XLogRecPtr recoveryTargetLSN;
     267             : static int  recovery_min_apply_delay = 0;
     268             : static TimestampTz recoveryDelayUntilTime;
     269             : 
     270             : /* options taken from recovery.conf for XLOG streaming */
     271             : static bool StandbyModeRequested = false;
     272             : static char *PrimaryConnInfo = NULL;
     273             : static char *PrimarySlotName = NULL;
     274             : static char *TriggerFile = NULL;
     275             : 
     276             : /* are we currently in standby mode? */
     277             : bool        StandbyMode = false;
     278             : 
     279             : /* whether request for fast promotion has been made yet */
     280             : static bool fast_promote = false;
     281             : 
     282             : /*
     283             :  * if recoveryStopsBefore/After returns true, it saves information of the stop
     284             :  * point here
     285             :  */
     286             : static TransactionId recoveryStopXid;
     287             : static TimestampTz recoveryStopTime;
     288             : static XLogRecPtr recoveryStopLSN;
     289             : static char recoveryStopName[MAXFNAMELEN];
     290             : static bool recoveryStopAfter;
     291             : 
     292             : /*
     293             :  * During normal operation, the only timeline we care about is ThisTimeLineID.
     294             :  * During recovery, however, things are more complicated.  To simplify life
     295             :  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
     296             :  * scan through the WAL history (that is, it is the line that was active when
     297             :  * the currently-scanned WAL record was generated).  We also need these
     298             :  * timeline values:
     299             :  *
     300             :  * recoveryTargetTLI: the desired timeline that we want to end in.
     301             :  *
     302             :  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
     303             :  *
     304             :  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
     305             :  * its known parents, newest first (so recoveryTargetTLI is always the
     306             :  * first list member).  Only these TLIs are expected to be seen in the WAL
     307             :  * segments we read, and indeed only these TLIs will be considered as
     308             :  * candidate WAL files to open at all.
     309             :  *
     310             :  * curFileTLI: the TLI appearing in the name of the current input WAL file.
     311             :  * (This is not necessarily the same as ThisTimeLineID, because we could
     312             :  * be scanning data that was copied from an ancestor timeline when the current
     313             :  * file was created.)  During a sequential scan we do not allow this value
     314             :  * to decrease.
     315             :  */
     316             : static TimeLineID recoveryTargetTLI;
     317             : static bool recoveryTargetIsLatest = false;
     318             : static List *expectedTLEs;
     319             : static TimeLineID curFileTLI;
     320             : 
     321             : /*
     322             :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     323             :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     324             :  * end+1 of the last record, and is reset when we end a top-level transaction,
     325             :  * or start a new one; so it can be used to tell if the current transaction has
     326             :  * created any XLOG records.
     327             :  *
     328             :  * While in parallel mode, this may not be fully up to date.  When committing,
     329             :  * a transaction can assume this covers all xlog records written either by the
     330             :  * user backend or by any parallel worker which was present at any point during
     331             :  * the transaction.  But when aborting, or when still in parallel mode, other
     332             :  * parallel backends may have written WAL records at later LSNs than the value
     333             :  * stored here.  The parallel leader advances its own copy, when necessary,
     334             :  * in WaitForParallelWorkersToFinish.
     335             :  */
     336             : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
     337             : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
     338             : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
     339             : 
     340             : /*
     341             :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     342             :  * (which is almost but not quite the same as a pointer to the most recent
     343             :  * CHECKPOINT record).  We update this from the shared-memory copy,
     344             :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     345             :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
     346             :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
     347             :  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
     348             :  * InitXLOGAccess.
     349             :  */
     350             : static XLogRecPtr RedoRecPtr;
     351             : 
     352             : /*
     353             :  * doPageWrites is this backend's local copy of (forcePageWrites ||
     354             :  * fullPageWrites).  It is used together with RedoRecPtr to decide whether
     355             :  * a full-page image of a page needs to be taken.
     356             :  */
     357             : static bool doPageWrites;
     358             : 
     359             : /* Has the recovery code requested a walreceiver wakeup? */
     360             : static bool doRequestWalReceiverReply;
     361             : 
     362             : /*
     363             :  * RedoStartLSN points to the checkpoint's REDO location which is specified
     364             :  * in a backup label file, backup history file or control file. In standby
     365             :  * mode, XLOG streaming usually starts from the position where an invalid
     366             :  * record was found. But if we fail to read even the initial checkpoint
     367             :  * record, we use the REDO location instead of the checkpoint location as
     368             :  * the start position of XLOG streaming. Otherwise we would have to jump
     369             :  * backwards to the REDO location after reading the checkpoint record,
     370             :  * because the REDO record can precede the checkpoint record.
     371             :  */
     372             : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
     373             : 
     374             : /*----------
     375             :  * Shared-memory data structures for XLOG control
     376             :  *
     377             :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     378             :  * the log up to (all records before that point must be written or fsynced).
     379             :  * LogwrtResult indicates the byte positions we have already written/fsynced.
     380             :  * These structs are identical but are declared separately to indicate their
     381             :  * slightly different functions.
     382             :  *
     383             :  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
     384             :  * WALWriteLock.  To update it, you need to hold both locks.  The point of
     385             :  * this arrangement is that the value can be examined by code that already
     386             :  * holds WALWriteLock without needing to grab info_lck as well.  In addition
     387             :  * to the shared variable, each backend has a private copy of LogwrtResult,
     388             :  * which is updated when convenient.
     389             :  *
     390             :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     391             :  * (protected by info_lck), but we don't need to cache any copies of it.
     392             :  *
     393             :  * info_lck is only held long enough to read/update the protected variables,
     394             :  * so it's a plain spinlock.  The other locks are held longer (potentially
     395             :  * over I/O operations), so we use LWLocks for them.  These locks are:
     396             :  *
     397             :  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
     398             :  * It is only held while initializing and changing the mapping.  If the
     399             :  * contents of the buffer being replaced haven't been written yet, the mapping
     400             :  * lock is released while the write is done, and reacquired afterwards.
     401             :  *
     402             :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     403             :  * XLogFlush).
     404             :  *
     405             :  * ControlFileLock: must be held to read/update control file or create
     406             :  * new log file.
     407             :  *
     408             :  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
     409             :  * only one checkpointer at a time; currently, with all checkpoints done by
     410             :  * the checkpointer, this is just pro forma).
     411             :  *
     412             :  *----------
     413             :  */
     414             : 
     415             : typedef struct XLogwrtRqst
     416             : {
     417             :     XLogRecPtr  Write;          /* requested: last byte + 1 that needs to be written out */
     418             :     XLogRecPtr  Flush;          /* requested: last byte + 1 that needs to be flushed (fsynced) */
     419             : } XLogwrtRqst;
     420             : 
     421             : typedef struct XLogwrtResult
     422             : {
     423             :     XLogRecPtr  Write;          /* done: last byte + 1 already written out */
     424             :     XLogRecPtr  Flush;          /* done: last byte + 1 already flushed (fsynced) */
     425             : } XLogwrtResult;
     426             : 
     427             : /*
     428             :  * Inserting to WAL is protected by a small fixed number of WAL insertion
     429             :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
     430             :  * matter which one. To lock out other concurrent insertions, you must hold all
     431             :  * of them. Each WAL insertion lock consists of a lightweight lock, plus an
     432             :  * indicator of how far the insertion has progressed (insertingAt).
     433             :  *
     434             :  * The insertingAt values are read when a process wants to flush WAL from
     435             :  * the in-memory buffers to disk, to check that all the insertions to the
     436             :  * region the process is about to write out have finished. You could simply
     437             :  * wait for all currently in-progress insertions to finish, but the
     438             :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
     439             :  * so that you only wait for the insertions that are modifying the buffers
     440             :  * you're about to write out.
     441             :  *
     442             :  * This isn't just an optimization. If all the WAL buffers are dirty, an
     443             :  * inserter that's holding a WAL insert lock might need to evict an old WAL
     444             :  * buffer, which requires flushing the WAL. If it's possible for an inserter
     445             :  * to block on another inserter unnecessarily, deadlock can arise when two
     446             :  * inserters holding a WAL insert lock wait for each other to finish their
     447             :  * insertion.
     448             :  *
     449             :  * Small WAL records that don't cross a page boundary never update the value,
     450             :  * the WAL record is just copied to the page and the lock is released. But
     451             :  * to avoid the deadlock-scenario explained above, the indicator is always
     452             :  * updated before sleeping while holding an insertion lock.
     453             :  *
     454             :  * lastImportantAt contains the LSN of the last important WAL record inserted
     455             :  * using a given lock. This value is used to detect if there has been
     456             :  * important WAL activity since the last time some action, like a checkpoint,
     457             :  * was performed, so the action can be skipped if there was none. The LSN is
     458             :  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
     459             :  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
     460             :  * records.  Tracking the WAL activity directly in WALInsertLock has the
     461             :  * advantage of not needing any additional locks to update the value.
     462             :  */
     463             : typedef struct
     464             : {
     465             :     LWLock      lock;           /* the lightweight lock itself */
     466             :     XLogRecPtr  insertingAt;    /* how far the insertion has progressed */
     467             :     XLogRecPtr  lastImportantAt;    /* LSN of last important record inserted using this lock; never cleared, only overwritten */
     468             : } WALInsertLock;
     469             : 
     470             : /*
     471             :  * All the WAL insertion locks are allocated as an array in shared memory. We
     472             :  * force the array stride to be a power of 2, which saves a few cycles in
     473             :  * indexing, but more importantly also ensures that individual slots don't
     474             :  * cross cache line boundaries. (Of course, we have to also ensure that the
     475             :  * array start address is suitably aligned.)
     476             :  */
     477             : typedef union WALInsertLockPadded
     478             : {
     479             :     WALInsertLock l;            /* the actual insertion lock */
     480             :     char        pad[PG_CACHE_LINE_SIZE];    /* makes the union at least PG_CACHE_LINE_SIZE bytes */
     481             : } WALInsertLockPadded;
     482             : 
     483             : /*
     484             :  * State of an exclusive backup, necessary to control concurrent activities
     485             :  * across sessions when working on exclusive backups.
     486             :  *
     487             :  * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
     488             :  * running, to be more precise pg_start_backup() is not being executed for
     489             :  * an exclusive backup and there is no exclusive backup in progress.
     490             :  * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
     491             :  * exclusive backup.
     492             :  * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
     493             :  * running and an exclusive backup is in progress. pg_stop_backup() is
     494             :  * needed to finish it.
     495             :  * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
     496             :  * exclusive backup.
     497             :  */
     498             : typedef enum ExclusiveBackupState
     499             : {
     500             :     EXCLUSIVE_BACKUP_NONE = 0,  /* no exclusive backup running or being started */
     501             :     EXCLUSIVE_BACKUP_STARTING,  /* pg_start_backup() is starting an exclusive backup */
     502             :     EXCLUSIVE_BACKUP_IN_PROGRESS,   /* pg_start_backup() finished; pg_stop_backup() is needed to end it */
     503             :     EXCLUSIVE_BACKUP_STOPPING   /* pg_stop_backup() is stopping an exclusive backup */
     504             : } ExclusiveBackupState;
     505             : 
     506             : /*
     507             :  * Session status of running backup, used for sanity checks in SQL-callable
     508             :  * functions to start and stop backups.
     509             :  */
     510             : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
     511             : 
     512             : /*
     513             :  * Shared state data for WAL insertion.
     514             :  */
     515             : typedef struct XLogCtlInsert
     516             : {
     517             :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
     518             : 
     519             :     /*
     520             :      * CurrBytePos is the end of reserved WAL. The next record will be
     521             :      * inserted at that position. PrevBytePos is the start position of the
     522             :      * previously inserted (or rather, reserved) record - it is copied to the
     523             :      * prev-link of the next record. These are stored as "usable byte
     524             :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     525             :      */
     526             :     uint64      CurrBytePos;
     527             :     uint64      PrevBytePos;
     528             : 
     529             :     /*
     530             :      * Make sure the above heavily-contended spinlock and byte positions are
     531             :      * on their own cache line. In particular, the RedoRecPtr and full page
     532             :      * write variables below should be on a different cache line. They are
     533             :      * read on every WAL insertion, but updated rarely, and we don't want
     534             :      * those reads to steal the cache line containing Curr/PrevBytePos.
     535             :      */
     536             :     char        pad[PG_CACHE_LINE_SIZE];
     537             : 
     538             :     /*
     539             :      * fullPageWrites is the master copy used by all backends to determine
     540             :      * whether to write full-page to WAL, instead of using process-local one.
     541             :      * This is required because, when full_page_writes is changed by SIGHUP,
     542             :      * we must WAL-log it before it actually affects WAL-logging by backends.
     543             :      * Checkpointer sets at startup or after SIGHUP.
     544             :      *
     545             :      * To read these fields, you must hold an insertion lock. To modify them,
     546             :      * you must hold ALL the locks.
     547             :      */
     548             :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
     549             :     bool        forcePageWrites;    /* forcing full-page writes for PITR? */
     550             :     bool        fullPageWrites;
     551             : 
     552             :     /*
     553             :      * exclusiveBackupState indicates the state of an exclusive backup (see
     554             :      * comments of ExclusiveBackupState for more details). nonExclusiveBackups
     555             :      * is a counter indicating the number of streaming base backups currently
     556             :      * in progress. forcePageWrites is set to true when either of these is
     557             :      * non-zero. lastBackupStart is the latest checkpoint redo location used
     558             :      * as a starting point for an online backup.
     559             :      */
     560             :     ExclusiveBackupState exclusiveBackupState;
     561             :     int         nonExclusiveBackups;
     562             :     XLogRecPtr  lastBackupStart;
     563             : 
     564             :     /*
     565             :      * WAL insertion locks.
     566             :      */
     567             :     WALInsertLockPadded *WALInsertLocks;
     568             : } XLogCtlInsert;
     569             : 
     570             : /*
     571             :  * Total shared-memory state for XLOG.
     572             :  */
     573             : typedef struct XLogCtlData
     574             : {
     575             :     XLogCtlInsert Insert;
     576             : 
     577             :     /* Protected by info_lck: */
     578             :     XLogwrtRqst LogwrtRqst;
     579             :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
     580             :     uint32      ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
     581             :     TransactionId ckptXid;
     582             :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
     583             :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
     584             : 
     585             :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
     586             : 
     587             :     /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
     588             :     XLogRecPtr  unloggedLSN;
     589             :     slock_t     ulsn_lck;
     590             : 
     591             :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
     592             :     pg_time_t   lastSegSwitchTime;
     593             :     XLogRecPtr  lastSegSwitchLSN;
     594             : 
     595             :     /*
     596             :      * Protected by info_lck and WALWriteLock (you must hold either lock to
     597             :      * read it, but both to update)
     598             :      */
     599             :     XLogwrtResult LogwrtResult;
     600             : 
     601             :     /*
     602             :      * Latest initialized page in the cache (last byte position + 1).
     603             :      *
     604             :      * To change the identity of a buffer (and InitializedUpTo), you need to
     605             :      * hold WALBufMappingLock.  To change the identity of a buffer that's
     606             :      * still dirty, the old page needs to be written out first, and for that
     607             :      * you need WALWriteLock, and you need to ensure that there are no
     608             :      * in-progress insertions to the page by calling
     609             :      * WaitXLogInsertionsToFinish().
     610             :      */
     611             :     XLogRecPtr  InitializedUpTo;
     612             : 
     613             :     /*
     614             :      * These values do not change after startup, although the pointed-to pages
     615             :      * and xlblocks values certainly do.  xlblock values are protected by
     616             :      * WALBufMappingLock.
     617             :      */
     618             :     char       *pages;          /* buffers for unwritten XLOG pages */
     619             :     XLogRecPtr *xlblocks;       /* 1st byte ptr-s + XLOG_BLCKSZ */
     620             :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
     621             : 
     622             :     /*
     623             :      * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
     624             :      * If we created a new timeline when the system was started up,
     625             :      * PrevTimeLineID is the old timeline's ID that we forked off from.
     626             :      * Otherwise it's equal to ThisTimeLineID.
     627             :      */
     628             :     TimeLineID  ThisTimeLineID;
     629             :     TimeLineID  PrevTimeLineID;
     630             : 
     631             :     /*
     632             :      * archiveCleanupCommand is read from recovery.conf but needs to be in
     633             :      * shared memory so that the checkpointer process can access it.
     634             :      */
     635             :     char        archiveCleanupCommand[MAXPGPATH];
     636             : 
     637             :     /*
     638             :      * SharedRecoveryInProgress indicates if we're still in crash or archive
     639             :      * recovery.  Protected by info_lck.
     640             :      */
     641             :     bool        SharedRecoveryInProgress;
     642             : 
     643             :     /*
     644             :      * SharedHotStandbyActive indicates if we allow hot standby queries to be
     645             :      * run.  Protected by info_lck.
     646             :      */
     647             :     bool        SharedHotStandbyActive;
     648             : 
     649             :     /*
     650             :      * WalWriterSleeping indicates whether the WAL writer is currently in
     651             :      * low-power mode (and hence should be nudged if an async commit occurs).
     652             :      * Protected by info_lck.
     653             :      */
     654             :     bool        WalWriterSleeping;
     655             : 
     656             :     /*
     657             :      * recoveryWakeupLatch is used to wake up the startup process to continue
     658             :      * WAL replay, if it is waiting for WAL to arrive or failover trigger file
     659             :      * to appear.
     660             :      */
     661             :     Latch       recoveryWakeupLatch;
     662             : 
     663             :     /*
     664             :      * During recovery, we keep a copy of the latest checkpoint record here.
     665             :      * lastCheckPointRecPtr points to start of checkpoint record and
     666             :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
     667             :      * checkpointer when it wants to create a restartpoint.
     668             :      *
     669             :      * Protected by info_lck.
     670             :      */
     671             :     XLogRecPtr  lastCheckPointRecPtr;
     672             :     XLogRecPtr  lastCheckPointEndPtr;
     673             :     CheckPoint  lastCheckPoint;
     674             : 
     675             :     /*
     676             :      * lastReplayedEndRecPtr points to end+1 of the last record successfully
     677             :      * replayed. When we're currently replaying a record, ie. in a redo
     678             :      * function, replayEndRecPtr points to the end+1 of the record being
     679             :      * replayed, otherwise it's equal to lastReplayedEndRecPtr.
     680             :      */
     681             :     XLogRecPtr  lastReplayedEndRecPtr;
     682             :     TimeLineID  lastReplayedTLI;
     683             :     XLogRecPtr  replayEndRecPtr;
     684             :     TimeLineID  replayEndTLI;
     685             :     /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
     686             :     TimestampTz recoveryLastXTime;
     687             : 
     688             :     /*
     689             :      * timestamp of when we started replaying the current chunk of WAL data,
     690             :      * only relevant for replication or archive recovery
     691             :      */
     692             :     TimestampTz currentChunkStartTime;
     693             :     /* Are we requested to pause recovery? */
     694             :     bool        recoveryPause;
     695             : 
     696             :     /*
     697             :      * lastFpwDisableRecPtr points to the start of the last replayed
     698             :      * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
     699             :      */
     700             :     XLogRecPtr  lastFpwDisableRecPtr;
     701             : 
     702             :     slock_t     info_lck;       /* locks shared variables shown above */
     703             : } XLogCtlData;
     704             : 
     705             : static XLogCtlData *XLogCtl = NULL;
     706             : 
     707             : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
     708             : static WALInsertLockPadded *WALInsertLocks = NULL;
     709             : 
     710             : /*
     711             :  * We maintain an image of pg_control in shared memory.
     712             :  */
     713             : static ControlFileData *ControlFile = NULL;
     714             : 
     715             : /*
     716             :  * Calculate the amount of space left on the page after 'endptr'. Beware
     717             :  * multiple evaluation!
     718             :  */
     719             : #define INSERT_FREESPACE(endptr)    \
     720             :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
     721             : 
     722             : /* Macro to advance to next buffer index. */
     723             : #define NextBufIdx(idx)     \
     724             :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
     725             : 
     726             : /*
     727             :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
     728             :  * would hold if it was in cache, the page containing 'recptr'.
     729             :  */
     730             : #define XLogRecPtrToBufIdx(recptr)  \
     731             :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
     732             : 
     733             : /*
     734             :  * These are the number of bytes in a WAL page and segment usable for WAL data.
     735             :  */
     736             : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
     737             : #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
     738             : 
     739             : /* Convert min_wal_size_mb and max_wal_size_mb to equivalent segment count */
     740             : #define ConvertToXSegs(x)   \
     741             :     (x / (XLOG_SEG_SIZE / (1024 * 1024)))
     742             : 
     743             : /*
     744             :  * Private, possibly out-of-date copy of shared LogwrtResult.
     745             :  * See discussion above.
     746             :  */
     747             : static XLogwrtResult LogwrtResult = {0, 0};
     748             : 
     749             : /*
     750             :  * Codes indicating where we got a WAL file from during recovery, or where
     751             :  * to attempt to get one.
     752             :  */
     753             : typedef enum
     754             : {
     755             :     XLOG_FROM_ANY = 0,          /* request to read WAL from any source */
     756             :     XLOG_FROM_ARCHIVE,          /* restored using restore_command */
     757             :     XLOG_FROM_PG_WAL,           /* existing file in pg_wal */
     758             :     XLOG_FROM_STREAM            /* streamed from master */
     759             : } XLogSource;
     760             : 
     761             : /* human-readable names for XLogSources, for debugging output */
     762             : static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
     763             : 
     764             : /*
     765             :  * openLogFile is -1 or a kernel FD for an open log file segment.
     766             :  * When it's open, openLogOff is the current seek offset in the file.
     767             :  * openLogSegNo identifies the segment.  These variables are only
     768             :  * used to write the XLOG, and so will normally refer to the active segment.
     769             :  */
     770             : static int  openLogFile = -1;
     771             : static XLogSegNo openLogSegNo = 0;
     772             : static uint32 openLogOff = 0;
     773             : 
     774             : /*
     775             :  * These variables are used similarly to the ones above, but for reading
     776             :  * the XLOG.  Note, however, that readOff generally represents the offset
     777             :  * of the page just read, not the seek position of the FD itself, which
     778             :  * will be just past that page. readLen indicates how much of the current
     779             :  * page has been read into readBuf, and readSource indicates where we got
     780             :  * the currently open file from.
     781             :  */
     782             : static int  readFile = -1;
     783             : static XLogSegNo readSegNo = 0;
     784             : static uint32 readOff = 0;
     785             : static uint32 readLen = 0;
     786             : static XLogSource readSource = 0;   /* XLOG_FROM_* code */
     787             : 
     788             : /*
     789             :  * Keeps track of which source we're currently reading from. This is
     790             :  * different from readSource in that this is always set, even when we don't
     791             :  * currently have a WAL file open. If lastSourceFailed is set, our last
     792             :  * attempt to read from currentSource failed, and we should try another source
     793             :  * next.
     794             :  */
     795             : static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
     796             : static bool lastSourceFailed = false;
     797             : 
     798             : typedef struct XLogPageReadPrivate
     799             : {
     800             :     int         emode;
     801             :     bool        fetching_ckpt;  /* are we fetching a checkpoint record? */
     802             :     bool        randAccess;
     803             : } XLogPageReadPrivate;
     804             : 
     805             : /*
     806             :  * These variables track when we last obtained some WAL data to process,
     807             :  * and where we got it from.  (XLogReceiptSource is initially the same as
     808             :  * readSource, but readSource gets reset to zero when we don't have data
     809             :  * to process right now.  It is also different from currentSource, which
     810             :  * also changes when we try to read from a source and fail, while
     811             :  * XLogReceiptSource tracks where we last successfully read some WAL.)
     812             :  */
     813             : static TimestampTz XLogReceiptTime = 0;
     814             : static XLogSource XLogReceiptSource = 0;    /* XLOG_FROM_* code */
     815             : 
     816             : /* State information for XLOG reading */
     817             : static XLogRecPtr ReadRecPtr;   /* start of last record read */
     818             : static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
     819             : 
     820             : static XLogRecPtr minRecoveryPoint; /* local copy of
     821             :                                      * ControlFile->minRecoveryPoint */
     822             : static TimeLineID minRecoveryPointTLI;
     823             : static bool updateMinRecoveryPoint = true;
     824             : 
     825             : /*
     826             :  * Have we reached a consistent database state? In crash recovery, we have
     827             :  * to replay all the WAL, so reachedConsistency is never set. During archive
     828             :  * recovery, the database is consistent once minRecoveryPoint is reached.
     829             :  */
     830             : bool        reachedConsistency = false;
     831             : 
     832             : static bool InRedo = false;
     833             : 
     834             : /* Have we launched bgwriter during recovery? */
     835             : static bool bgwriterLaunched = false;
     836             : 
     837             : /* For WALInsertLockAcquire/Release functions */
     838             : static int  MyLockNo = 0;
     839             : static bool holdingAllLocks = false;
     840             : 
     841             : #ifdef WAL_DEBUG
     842             : static MemoryContext walDebugCxt = NULL;
     843             : #endif
     844             : 
     845             : static void readRecoveryCommandFile(void);
     846             : static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
     847             : static bool recoveryStopsBefore(XLogReaderState *record);
     848             : static bool recoveryStopsAfter(XLogReaderState *record);
     849             : static void recoveryPausesHere(void);
     850             : static bool recoveryApplyDelay(XLogReaderState *record);
     851             : static void SetLatestXTime(TimestampTz xtime);
     852             : static void SetCurrentChunkStartTime(TimestampTz xtime);
     853             : static void CheckRequiredParameterValues(void);
     854             : static void XLogReportParameters(void);
     855             : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
     856             :                     TimeLineID prevTLI);
     857             : static void LocalSetXLogInsertAllowed(void);
     858             : static void CreateEndOfRecoveryRecord(void);
     859             : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     860             : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
     861             : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
     862             : 
     863             : static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
     864             : static bool XLogCheckpointNeeded(XLogSegNo new_segno);
     865             : static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
     866             : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
     867             :                        bool find_free, XLogSegNo max_segno,
     868             :                        bool use_lock);
     869             : static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
     870             :              int source, bool notfoundOk);
     871             : static int  XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
     872             : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
     873             :              int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
     874             :              TimeLineID *readTLI);
     875             : static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
     876             :                             bool fetching_ckpt, XLogRecPtr tliRecPtr);
     877             : static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
     878             : static void XLogFileClose(void);
     879             : static void PreallocXlogFiles(XLogRecPtr endptr);
     880             : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
     881             : static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
     882             : static void UpdateLastRemovedPtr(char *filename);
     883             : static void ValidateXLOGDirectoryStructure(void);
     884             : static void CleanupBackupHistory(void);
     885             : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
     886             : static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
     887             :            int emode, bool fetching_ckpt);
     888             : static void CheckRecoveryConsistency(void);
     889             : static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
     890             :                      XLogRecPtr RecPtr, int whichChkpti, bool report);
     891             : static bool rescanLatestTimeLine(void);
     892             : static void WriteControlFile(void);
     893             : static void ReadControlFile(void);
     894             : static char *str_time(pg_time_t tnow);
     895             : static bool CheckForStandbyTrigger(void);
     896             : 
     897             : #ifdef WAL_DEBUG
     898             : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
     899             : #endif
     900             : static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
     901             : static void pg_start_backup_callback(int code, Datum arg);
     902             : static void pg_stop_backup_callback(int code, Datum arg);
     903             : static bool read_backup_label(XLogRecPtr *checkPointLoc,
     904             :                   bool *backupEndRequired, bool *backupFromStandby);
     905             : static bool read_tablespace_map(List **tablespaces);
     906             : 
     907             : static void rm_redo_error_callback(void *arg);
     908             : static int  get_sync_bit(int method);
     909             : 
     910             : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
     911             :                     XLogRecData *rdata,
     912             :                     XLogRecPtr StartPos, XLogRecPtr EndPos);
     913             : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
     914             :                           XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
     915             : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     916             :                   XLogRecPtr *PrevPtr);
     917             : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
     918             : static char *GetXLogBuffer(XLogRecPtr ptr);
     919             : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
     920             : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
     921             : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
     922             : static void checkXLogConsistency(XLogReaderState *record);
     923             : 
     924             : static void WALInsertLockAcquire(void);
     925             : static void WALInsertLockAcquireExclusive(void);
     926             : static void WALInsertLockRelease(void);
     927             : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
     928             : 
     929             : /*
     930             :  * Insert an XLOG record represented by an already-constructed chain of data
     931             :  * chunks.  This is a low-level routine; to construct the WAL record header
     932             :  * and data, use the higher-level routines in xloginsert.c.
     933             :  *
     934             :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
     935             :  * WAL record applies to, that were not included in the record as full page
     936             :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
     937             :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
     938             :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
     939             :  * record is always inserted.
     940             :  *
     941             :  * 'flags' gives more in-depth control on the record being inserted. See
     942             :  * XLogSetRecordFlags() for details.
     943             :  *
     944             :  * The first XLogRecData in the chain must be for the record header, and its
     945             :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
     946             :  * xl_crc fields in the header, the rest of the header must already be filled
     947             :  * by the caller.
     948             :  *
     949             :  * Returns XLOG pointer to end of record (beginning of next record).
     950             :  * This can be used as LSN for data pages affected by the logged action.
     951             :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     952             :  * before the data page can be written out.  This implements the basic
     953             :  * WAL rule "write the log before the data".)
     954             :  */
     955             : XLogRecPtr
     956     1306714 : XLogInsertRecord(XLogRecData *rdata,
     957             :                  XLogRecPtr fpw_lsn,
     958             :                  uint8 flags)
     959             : {
     960     1306714 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     961             :     pg_crc32c   rdata_crc;
     962             :     bool        inserted;
     963     1306714 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
     964     1306714 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
     965     1306714 :     bool        isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
     966             :                                info == XLOG_SWITCH);
     967             :     XLogRecPtr  StartPos;
     968             :     XLogRecPtr  EndPos;
     969             : 
     970             :     /* we assume that all of the record header is in the first chunk */
     971     1306714 :     Assert(rdata->len >= SizeOfXLogRecord);
     972             : 
     973             :     /* cross-check on whether we should be here or not */
     974     1306714 :     if (!XLogInsertAllowed())
     975           0 :         elog(ERROR, "cannot make new WAL entries during recovery");
     976             : 
     977             :     /*----------
     978             :      *
     979             :      * We have now done all the preparatory work we can without holding a
     980             :      * lock or modifying shared state. From here on, inserting the new WAL
     981             :      * record to the shared WAL buffer cache is a two-step process:
     982             :      *
     983             :      * 1. Reserve the right amount of space from the WAL. The current head of
     984             :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
     985             :      *    insertpos_lck.
     986             :      *
     987             :      * 2. Copy the record to the reserved WAL space. This involves finding the
     988             :      *    correct WAL buffer containing the reserved space, and copying the
     989             :      *    record in place. This can be done concurrently in multiple processes.
     990             :      *
     991             :      * To keep track of which insertions are still in-progress, each concurrent
     992             :      * inserter acquires an insertion lock. In addition to just indicating that
     993             :      * an insertion is in progress, the lock tells others how far the inserter
     994             :      * has progressed. There is a small fixed number of insertion locks,
     995             :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
     996             :      * boundary, it updates the value stored in the lock to how far it has
     997             :      * inserted, to allow the previous buffer to be flushed.
     998             :      *
     999             :      * Holding onto an insertion lock also protects RedoRecPtr and
    1000             :      * fullPageWrites from changing until the insertion is finished.
    1001             :      *
    1002             :      * Step 2 can usually be done completely in parallel. If the required WAL
    1003             :      * page is not initialized yet, you have to grab WALBufMappingLock to
    1004             :      * initialize it, but the WAL writer tries to do that ahead of insertions
    1005             :      * to avoid that from happening in the critical path.
    1006             :      *
    1007             :      *----------
    1008             :      */
    1009     1306714 :     START_CRIT_SECTION();
    1010     1306714 :     if (isLogSwitch)
    1011           0 :         WALInsertLockAcquireExclusive();
    1012             :     else
    1013     1306714 :         WALInsertLockAcquire();
    1014             : 
    1015             :     /*
    1016             :      * Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
    1017             :      * If so, may have to go back and have the caller recompute everything.
    1018             :      * This can only happen just after a checkpoint, so it's better to be slow
    1019             :      * in this case and fast otherwise.
    1020             :      *
    1021             :      * If we aren't doing full-page writes then RedoRecPtr doesn't actually
    1022             :      * affect the contents of the XLOG record, so we'll update our local copy
    1023             :      * but not force a recomputation.  (If doPageWrites was just turned off,
    1024             :      * we could recompute the record without full pages, but we choose not to
    1025             :      * bother.)
    1026             :      */
    1027     1306714 :     if (RedoRecPtr != Insert->RedoRecPtr)
    1028             :     {
    1029           5 :         Assert(RedoRecPtr < Insert->RedoRecPtr);
    1030           5 :         RedoRecPtr = Insert->RedoRecPtr;
    1031             :     }
    1032     1306714 :     doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
    1033             : 
    1034     1306714 :     if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
    1035             :     {
    1036             :         /*
    1037             :          * Oops, some buffer now needs to be backed up that the caller didn't
    1038             :          * back up.  Start over.
    1039             :          */
    1040           0 :         WALInsertLockRelease();
    1041           0 :         END_CRIT_SECTION();
    1042           0 :         return InvalidXLogRecPtr;
    1043             :     }
    1044             : 
    1045             :     /*
    1046             :      * Reserve space for the record in the WAL. This also sets the xl_prev
    1047             :      * pointer.
    1048             :      */
    1049     1306714 :     if (isLogSwitch)
    1050           0 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
    1051             :     else
    1052             :     {
    1053     1306714 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
    1054             :                                   &rechdr->xl_prev);
    1055     1306714 :         inserted = true;
    1056             :     }
    1057             : 
    1058     1306714 :     if (inserted)
    1059             :     {
    1060             :         /*
    1061             :          * Now that xl_prev has been filled in, calculate CRC of the record
    1062             :          * header.
    1063             :          */
    1064     1306714 :         rdata_crc = rechdr->xl_crc;
    1065     1306714 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
    1066     1306714 :         FIN_CRC32C(rdata_crc);
    1067     1306714 :         rechdr->xl_crc = rdata_crc;
    1068             : 
    1069             :         /*
    1070             :          * All the record data, including the header, is now ready to be
    1071             :          * inserted. Copy the record in the space reserved.
    1072             :          */
    1073     1306714 :         CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
    1074             :                             StartPos, EndPos);
    1075             : 
    1076             :         /*
    1077             :          * Unless record is flagged as not important, update LSN of last
    1078             :          * important record in the current slot. When holding all locks, just
    1079             :          * update the first one.
    1080             :          */
    1081     1306714 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
    1082             :         {
    1083     1298031 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
    1084             : 
    1085     1298031 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
    1086             :         }
    1087             :     }
    1088             :     else
    1089             :     {
    1090             :         /*
    1091             :          * This was an xlog-switch record, but the current insert location was
    1092             :          * already exactly at the beginning of a segment, so there was no need
    1093             :          * to do anything.
    1094             :          */
    1095             :     }
    1096             : 
    1097             :     /*
    1098             :      * Done! Let others know that we're finished.
    1099             :      */
    1100     1306714 :     WALInsertLockRelease();
    1101             : 
    1102     1306714 :     MarkCurrentTransactionIdLoggedIfAny();
    1103             : 
    1104     1306714 :     END_CRIT_SECTION();
    1105             : 
    1106             :     /*
    1107             :      * Update shared LogwrtRqst.Write, if we crossed page boundary.
    1108             :      */
    1109     1306714 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1110             :     {
    1111       20958 :         SpinLockAcquire(&XLogCtl->info_lck);
    1112             :         /* advance global request to include new block(s) */
    1113       20958 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
    1114       20950 :             XLogCtl->LogwrtRqst.Write = EndPos;
    1115             :         /* update local result copy while I have the chance */
    1116       20958 :         LogwrtResult = XLogCtl->LogwrtResult;
    1117       20958 :         SpinLockRelease(&XLogCtl->info_lck);
    1118             :     }
    1119             : 
    1120             :     /*
    1121             :      * If this was an XLOG_SWITCH record, flush the record and the empty
    1122             :      * padding space that fills the rest of the segment, and perform
    1123             :      * end-of-segment actions (eg, notifying archiver).
    1124             :      */
    1125     1306714 :     if (isLogSwitch)
    1126             :     {
    1127             :         TRACE_POSTGRESQL_WAL_SWITCH();
    1128           0 :         XLogFlush(EndPos);
    1129             : 
    1130             :         /*
    1131             :          * Even though we reserved the rest of the segment for us, which is
    1132             :          * reflected in EndPos, we return a pointer to just the end of the
    1133             :          * xlog-switch record.
    1134             :          */
    1135           0 :         if (inserted)
    1136             :         {
    1137           0 :             EndPos = StartPos + SizeOfXLogRecord;
    1138           0 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
    1139             :             {
    1140           0 :                 if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
    1141           0 :                     EndPos += SizeOfXLogLongPHD;
    1142             :                 else
    1143           0 :                     EndPos += SizeOfXLogShortPHD;
    1144             :             }
    1145             :         }
    1146             :     }
    1147             : 
    1148             : #ifdef WAL_DEBUG
    1149             :     if (XLOG_DEBUG)
    1150             :     {
    1151             :         static XLogReaderState *debug_reader = NULL;
    1152             :         StringInfoData buf;
    1153             :         StringInfoData recordBuf;
    1154             :         char       *errormsg = NULL;
    1155             :         MemoryContext oldCxt;
    1156             : 
    1157             :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
    1158             : 
    1159             :         initStringInfo(&buf);
    1160             :         appendStringInfo(&buf, "INSERT @ %X/%X: ",
    1161             :                          (uint32) (EndPos >> 32), (uint32) EndPos);
    1162             : 
    1163             :         /*
    1164             :          * We have to piece together the WAL record data from the XLogRecData
    1165             :          * entries, so that we can pass it to the rm_desc function as one
    1166             :          * contiguous chunk.
    1167             :          */
    1168             :         initStringInfo(&recordBuf);
    1169             :         for (; rdata != NULL; rdata = rdata->next)
    1170             :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
    1171             : 
    1172             :         if (!debug_reader)
    1173             :             debug_reader = XLogReaderAllocate(NULL, NULL);
    1174             : 
    1175             :         if (!debug_reader)
    1176             :         {
    1177             :             appendStringInfoString(&buf, "error decoding record: out of memory");
    1178             :         }
    1179             :         else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
    1180             :                                    &errormsg))
    1181             :         {
    1182             :             appendStringInfo(&buf, "error decoding record: %s",
    1183             :                              errormsg ? errormsg : "no error message");
    1184             :         }
    1185             :         else
    1186             :         {
    1187             :             appendStringInfoString(&buf, " - ");
    1188             :             xlog_outdesc(&buf, debug_reader);
    1189             :         }
    1190             :         elog(LOG, "%s", buf.data);
    1191             : 
    1192             :         pfree(buf.data);
    1193             :         pfree(recordBuf.data);
    1194             :         MemoryContextSwitchTo(oldCxt);
    1195             :     }
    1196             : #endif
    1197             : 
    1198             :     /*
    1199             :      * Update our global variables
    1200             :      */
    1201     1306714 :     ProcLastRecPtr = StartPos;
    1202     1306714 :     XactLastRecEnd = EndPos;
    1203             : 
    1204     1306714 :     return EndPos;
    1205             : }
    1206             : 
    1207             : /*
    1208             :  * Reserves the right amount of space for a record of given size from the WAL.
    1209             :  * *StartPos is set to the beginning of the reserved section, *EndPos to
    1210             :  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
    1211             :  * used to set the xl_prev of this record.
    1212             :  *
    1213             :  * This is the performance critical part of XLogInsert that must be serialized
    1214             :  * across backends. The rest can happen mostly in parallel. Try to keep this
    1215             :  * section as short as possible; insertpos_lck can be heavily contended on a
    1216             :  * busy system.
    1217             :  *
    1218             :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
    1219             :  * where we actually copy the record to the reserved space.
    1220             :  */
    1221             : static void
    1222     1306714 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
    1223             :                           XLogRecPtr *PrevPtr)
    1224             : {
    1225     1306714 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1226             :     uint64      startbytepos;
    1227             :     uint64      endbytepos;
    1228             :     uint64      prevbytepos;
    1229             : 
    1230     1306714 :     size = MAXALIGN(size);
    1231             : 
    1232             :     /* All (non xlog-switch) records should contain data. */
    1233     1306714 :     Assert(size > SizeOfXLogRecord);
    1234             : 
    1235             :     /*
    1236             :      * The duration the spinlock needs to be held is minimized by minimizing
    1237             :      * the calculations that have to be done while holding the lock. The
    1238             :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
    1239             :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
    1240             :      * page headers. The mapping between "usable" byte positions and physical
    1241             :      * positions (XLogRecPtrs) can be done outside the locked region, and
    1242             :      * because the usable byte position doesn't include any headers, reserving
    1243             :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
    1244             :      */
    1245     1306714 :     SpinLockAcquire(&Insert->insertpos_lck);
    1246             : 
    1247     1306714 :     startbytepos = Insert->CurrBytePos;
    1248     1306714 :     endbytepos = startbytepos + size;
    1249     1306714 :     prevbytepos = Insert->PrevBytePos;
    1250     1306714 :     Insert->CurrBytePos = endbytepos;
    1251     1306714 :     Insert->PrevBytePos = startbytepos;
    1252             : 
    1253     1306714 :     SpinLockRelease(&Insert->insertpos_lck);
    1254             : 
    1255     1306714 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1256     1306714 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1257     1306714 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1258             : 
    1259             :     /*
    1260             :      * Check that the conversions between "usable byte positions" and
    1261             :      * XLogRecPtrs work consistently in both directions.
    1262             :      */
    1263     1306714 :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1264     1306714 :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1265     1306714 :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1266     1306714 : }
    1267             : 
    1268             : /*
    1269             :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
    1270             :  *
    1271             :  * A log-switch record is handled slightly differently. The rest of the
    1272             :  * segment will be reserved for this insertion, as indicated by the returned
    1273             :  * *EndPos value. However, if we are already at the beginning of the current
    1274             :  * segment, *StartPos and *EndPos are set to the current location without
    1275             :  * reserving any space, and the function returns false.
    1276             : */
    1277             : static bool
    1278           0 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
    1279             : {
    1280           0 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1281             :     uint64      startbytepos;
    1282             :     uint64      endbytepos;
    1283             :     uint64      prevbytepos;
    1284           0 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
    1285             :     XLogRecPtr  ptr;
    1286             :     uint32      segleft;
    1287             : 
    1288             :     /*
    1289             :      * These calculations are a bit heavy-weight to be done while holding a
    1290             :      * spinlock, but since we're holding all the WAL insertion locks, there
    1291             :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
    1292             :      * compete for it, but that's not called very frequently.
    1293             :      */
    1294           0 :     SpinLockAcquire(&Insert->insertpos_lck);
    1295             : 
    1296           0 :     startbytepos = Insert->CurrBytePos;
    1297             : 
    1298           0 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
    1299           0 :     if (ptr % XLOG_SEG_SIZE == 0)
    1300             :     {
    1301           0 :         SpinLockRelease(&Insert->insertpos_lck);
    1302           0 :         *EndPos = *StartPos = ptr;
    1303           0 :         return false;
    1304             :     }
    1305             : 
    1306           0 :     endbytepos = startbytepos + size;
    1307           0 :     prevbytepos = Insert->PrevBytePos;
    1308             : 
    1309           0 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1310           0 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1311             : 
    1312           0 :     segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
    1313           0 :     if (segleft != XLOG_SEG_SIZE)
    1314             :     {
    1315             :         /* consume the rest of the segment */
    1316           0 :         *EndPos += segleft;
    1317           0 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
    1318             :     }
    1319           0 :     Insert->CurrBytePos = endbytepos;
    1320           0 :     Insert->PrevBytePos = startbytepos;
    1321             : 
    1322           0 :     SpinLockRelease(&Insert->insertpos_lck);
    1323             : 
    1324           0 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1325             : 
    1326           0 :     Assert((*EndPos) % XLOG_SEG_SIZE == 0);
    1327           0 :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1328           0 :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1329           0 :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1330             : 
    1331           0 :     return true;
    1332             : }
    1333             : 
    1334             : /*
    1335             :  * Checks whether the current buffer page and backup page stored in the
    1336             :  * WAL record are consistent or not. Before comparing the two pages,
    1337             :  * masking can be applied to the pages to ignore certain areas like hint bits,
    1338             :  * unused space between pd_lower and pd_upper among other things. This
    1339             :  * function should be called once WAL replay has been completed for a
    1340             :  * given record.
    1341             :  */
    1342             : static void
    1343           0 : checkXLogConsistency(XLogReaderState *record)
    1344             : {
    1345           0 :     RmgrId      rmid = XLogRecGetRmid(record);
    1346             :     RelFileNode rnode;
    1347             :     ForkNumber  forknum;
    1348             :     BlockNumber blkno;
    1349             :     int         block_id;
    1350             : 
    1351             :     /* Records with no backup blocks have no need for consistency checks. */
    1352           0 :     if (!XLogRecHasAnyBlockRefs(record))
    1353           0 :         return;
    1354             : 
    1355           0 :     Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
    1356             : 
    1357           0 :     for (block_id = 0; block_id <= record->max_block_id; block_id++)
    1358             :     {
    1359             :         Buffer      buf;
    1360             :         Page        page;
    1361             : 
    1362           0 :         if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
    1363             :         {
    1364             :             /*
    1365             :              * WAL record doesn't contain a block reference with the given id.
    1366             :              * Do nothing.
    1367             :              */
    1368           0 :             continue;
    1369             :         }
    1370             : 
    1371           0 :         Assert(XLogRecHasBlockImage(record, block_id));
    1372             : 
    1373           0 :         if (XLogRecBlockImageApply(record, block_id))
    1374             :         {
    1375             :             /*
    1376             :              * WAL record has already applied the page, so bypass the
    1377             :              * consistency check as that would result in comparing the full
    1378             :              * page stored in the record with itself.
    1379             :              */
    1380           0 :             continue;
    1381             :         }
    1382             : 
    1383             :         /*
    1384             :          * Read the contents from the current buffer and store it in a
    1385             :          * temporary page.
    1386             :          */
    1387           0 :         buf = XLogReadBufferExtended(rnode, forknum, blkno,
    1388             :                                      RBM_NORMAL_NO_LOG);
    1389           0 :         if (!BufferIsValid(buf))
    1390           0 :             continue;
    1391             : 
    1392           0 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    1393           0 :         page = BufferGetPage(buf);
    1394             : 
    1395             :         /*
    1396             :          * Take a copy of the local page where WAL has been applied to have a
    1397             :          * comparison base before masking it...
    1398             :          */
    1399           0 :         memcpy(replay_image_masked, page, BLCKSZ);
    1400             : 
    1401             :         /* No need for this page anymore now that a copy is in. */
    1402           0 :         UnlockReleaseBuffer(buf);
    1403             : 
    1404             :         /*
    1405             :          * If the block LSN is already ahead of this WAL record, we can't
    1406             :          * expect contents to match.  This can happen if recovery is
    1407             :          * restarted.
    1408             :          */
    1409           0 :         if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
    1410           0 :             continue;
    1411             : 
    1412             :         /*
    1413             :          * Read the contents from the backup copy, stored in WAL record and
    1414             :          * store it in a temporary page. There is no need to allocate a new
    1415             :          * page here, a local buffer is fine to hold its contents and a mask
    1416             :          * can be directly applied on it.
    1417             :          */
    1418           0 :         if (!RestoreBlockImage(record, block_id, master_image_masked))
    1419           0 :             elog(ERROR, "failed to restore block image");
    1420             : 
    1421             :         /*
    1422             :          * If a masking function is defined, mask both the master and replay
    1423             :          * images
    1424             :          */
    1425           0 :         if (RmgrTable[rmid].rm_mask != NULL)
    1426             :         {
    1427           0 :             RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
    1428           0 :             RmgrTable[rmid].rm_mask(master_image_masked, blkno);
    1429             :         }
    1430             : 
    1431             :         /* Time to compare the master and replay images. */
    1432           0 :         if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
    1433             :         {
    1434           0 :             elog(FATAL,
    1435             :                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
    1436             :                  rnode.spcNode, rnode.dbNode, rnode.relNode,
    1437             :                  forknum, blkno);
    1438             :         }
    1439             :     }
    1440             : }
    1441             : 
    1442             : /*
    1443             :  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
    1444             :  * area in the WAL.
    1445             :  */
    1446             : static void
    1447     1306714 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
    1448             :                     XLogRecPtr StartPos, XLogRecPtr EndPos)
    1449             : {
    1450             :     char       *currpos;
    1451             :     int         freespace;
    1452             :     int         written;
    1453             :     XLogRecPtr  CurrPos;
    1454             :     XLogPageHeader pagehdr;
    1455             : 
    1456             :     /*
    1457             :      * Get a pointer to the right place in the right WAL buffer to start
    1458             :      * inserting to.
    1459             :      */
    1460     1306714 :     CurrPos = StartPos;
    1461     1306714 :     currpos = GetXLogBuffer(CurrPos);
    1462     1306714 :     freespace = INSERT_FREESPACE(CurrPos);
    1463             : 
    1464             :     /*
    1465             :      * there should be enough space for at least the first field (xl_tot_len)
    1466             :      * on this page.
    1467             :      */
    1468     1306714 :     Assert(freespace >= sizeof(uint32));
    1469             : 
    1470             :     /* Copy record data */
    1471     1306714 :     written = 0;
    1472     7210509 :     while (rdata != NULL)
    1473             :     {
    1474     4597081 :         char       *rdata_data = rdata->data;
    1475     4597081 :         int         rdata_len = rdata->len;
    1476             : 
    1477     9214513 :         while (rdata_len > freespace)
    1478             :         {
    1479             :             /*
    1480             :              * Write what fits on this page, and continue on the next page.
    1481             :              */
    1482       20351 :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
    1483       20351 :             memcpy(currpos, rdata_data, freespace);
    1484       20351 :             rdata_data += freespace;
    1485       20351 :             rdata_len -= freespace;
    1486       20351 :             written += freespace;
    1487       20351 :             CurrPos += freespace;
    1488             : 
    1489             :             /*
    1490             :              * Get pointer to beginning of next page, and set the xlp_rem_len
    1491             :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
    1492             :              *
    1493             :              * It's safe to set the contrecord flag and xlp_rem_len without a
    1494             :              * lock on the page. All the other flags were already set when the
    1495             :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
    1496             :              * only backend that needs to set the contrecord flag.
    1497             :              */
    1498       20351 :             currpos = GetXLogBuffer(CurrPos);
    1499       20351 :             pagehdr = (XLogPageHeader) currpos;
    1500       20351 :             pagehdr->xlp_rem_len = write_len - written;
    1501       20351 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
    1502             : 
    1503             :             /* skip over the page header */
    1504       20351 :             if (CurrPos % XLogSegSize == 0)
    1505             :             {
    1506          10 :                 CurrPos += SizeOfXLogLongPHD;
    1507          10 :                 currpos += SizeOfXLogLongPHD;
    1508             :             }
    1509             :             else
    1510             :             {
    1511       20341 :                 CurrPos += SizeOfXLogShortPHD;
    1512       20341 :                 currpos += SizeOfXLogShortPHD;
    1513             :             }
    1514       20351 :             freespace = INSERT_FREESPACE(CurrPos);
    1515             :         }
    1516             : 
    1517     4597081 :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    1518     4597081 :         memcpy(currpos, rdata_data, rdata_len);
    1519     4597081 :         currpos += rdata_len;
    1520     4597081 :         CurrPos += rdata_len;
    1521     4597081 :         freespace -= rdata_len;
    1522     4597081 :         written += rdata_len;
    1523             : 
    1524     4597081 :         rdata = rdata->next;
    1525             :     }
    1526     1306714 :     Assert(written == write_len);
    1527             : 
    1528             :     /*
    1529             :      * If this was an xlog-switch, it's not enough to write the switch record,
    1530             :      * we also have to consume all the remaining space in the WAL segment. We
    1531             :      * have already reserved it for us, but we still need to make sure it's
    1532             :      * allocated and zeroed in the WAL buffers so that when the caller (or
    1533             :      * someone else) does XLogWrite(), it can really write out all the zeros.
    1534             :      */
    1535     1306714 :     if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
    1536             :     {
    1537             :         /* An xlog-switch record doesn't contain any data besides the header */
    1538           0 :         Assert(write_len == SizeOfXLogRecord);
    1539             : 
    1540             :         /*
    1541             :          * We do this one page at a time, to make sure we don't deadlock
    1542             :          * against ourselves if wal_buffers < XLOG_SEG_SIZE.
    1543             :          */
    1544           0 :         Assert(EndPos % XLogSegSize == 0);
    1545             : 
    1546             :         /* Use up all the remaining space on the first page */
    1547           0 :         CurrPos += freespace;
    1548             : 
    1549           0 :         while (CurrPos < EndPos)
    1550             :         {
    1551             :             /* initialize the next page (if not initialized already) */
    1552           0 :             WALInsertLockUpdateInsertingAt(CurrPos);
    1553           0 :             AdvanceXLInsertBuffer(CurrPos, false);
    1554           0 :             CurrPos += XLOG_BLCKSZ;
    1555             :         }
    1556             :     }
    1557             :     else
    1558             :     {
    1559             :         /* Align the end position, so that the next record starts aligned */
    1560     1306714 :         CurrPos = MAXALIGN64(CurrPos);
    1561             :     }
    1562             : 
    1563     1306714 :     if (CurrPos != EndPos)
    1564           0 :         elog(PANIC, "space reserved for WAL record does not match what was written");
    1565     1306714 : }
    1566             : 
    1567             : /*
    1568             :  * Acquire a WAL insertion lock, for inserting to WAL.
    1569             :  */
    1570             : static void
    1571     1306714 : WALInsertLockAcquire(void)
    1572             : {
    1573             :     bool        immed;
    1574             : 
    1575             :     /*
    1576             :      * It doesn't matter which of the WAL insertion locks we acquire, so try
    1577             :      * the one we used last time.  If the system isn't particularly busy, it's
    1578             :      * a good bet that it's still available, and it's good to have some
    1579             :      * affinity to a particular lock so that you don't unnecessarily bounce
    1580             :      * cache lines between processes when there's no contention.
    1581             :      *
    1582             :      * If this is the first time through in this backend, pick a lock
    1583             :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
    1584             :      * lot of very short connections.
    1585             :      */
    1586             :     static int  lockToTry = -1;
    1587             : 
    1588     1306714 :     if (lockToTry == -1)
    1589         212 :         lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
    1590     1306714 :     MyLockNo = lockToTry;
    1591             : 
    1592             :     /*
    1593             :      * The insertingAt value is initially set to 0, as we don't know our
    1594             :      * insert location yet.
    1595             :      */
    1596     1306714 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
    1597     1306714 :     if (!immed)
    1598             :     {
    1599             :         /*
    1600             :          * If we couldn't get the lock immediately, try another lock next
    1601             :          * time.  On a system with more insertion locks than concurrent
    1602             :          * inserters, this causes all the inserters to eventually migrate to a
    1603             :          * lock that no-one else is using.  On a system with more inserters
    1604             :          * than locks, it still helps to distribute the inserters evenly
    1605             :          * across the locks.
    1606             :          */
    1607          45 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
    1608             :     }
    1609     1306714 : }
    1610             : 
    1611             : /*
    1612             :  * Acquire all WAL insertion locks, to prevent other backends from inserting
    1613             :  * to WAL.
    1614             :  */
    1615             : static void
    1616          11 : WALInsertLockAcquireExclusive(void)
    1617             : {
    1618             :     int         i;
    1619             : 
    1620             :     /*
    1621             :      * When holding all the locks, all but the last lock's insertingAt
    1622             :      * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
    1623             :      * XLogRecPtr value, to make sure that no-one blocks waiting on those.
    1624             :      */
    1625          88 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
    1626             :     {
    1627          77 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1628          77 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
    1629          77 :                         &WALInsertLocks[i].l.insertingAt,
    1630             :                         PG_UINT64_MAX);
    1631             :     }
    1632             :     /* Variable value reset to 0 at release */
    1633          11 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1634             : 
    1635          11 :     holdingAllLocks = true;
    1636          11 : }
    1637             : 
    1638             : /*
    1639             :  * Release our insertion lock (or locks, if we're holding them all).
    1640             :  *
    1641             :  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
    1642             :  * next time the lock is acquired.
    1643             :  */
    1644             : static void
    1645     1306725 : WALInsertLockRelease(void)
    1646             : {
    1647     1306725 :     if (holdingAllLocks)
    1648             :     {
    1649             :         int         i;
    1650             : 
    1651          99 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1652          88 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
    1653          88 :                                   &WALInsertLocks[i].l.insertingAt,
    1654             :                                   0);
    1655             : 
    1656          11 :         holdingAllLocks = false;
    1657             :     }
    1658             :     else
    1659             :     {
    1660     1306714 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
    1661     1306714 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
    1662             :                               0);
    1663             :     }
    1664     1306725 : }
    1665             : 
    1666             : /*
    1667             :  * Update our insertingAt value, to let others know that we've finished
    1668             :  * inserting up to that point.
    1669             :  */
    1670             : static void
    1671        1026 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
    1672             : {
    1673        1026 :     if (holdingAllLocks)
    1674             :     {
    1675             :         /*
    1676             :          * We use the last lock to mark our actual position, see comments in
    1677             :          * WALInsertLockAcquireExclusive.
    1678             :          */
    1679           0 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
    1680           0 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
    1681             :                         insertingAt);
    1682             :     }
    1683             :     else
    1684        1026 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
    1685        1026 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
    1686             :                         insertingAt);
    1687        1026 : }
    1688             : 
    1689             : /*
    1690             :  * Wait for any WAL insertions < upto to finish.
    1691             :  *
    1692             :  * Returns the location of the oldest insertion that is still in-progress.
    1693             :  * Any WAL prior to that point has been fully copied into WAL buffers, and
    1694             :  * can be flushed out to disk. Because this waits for any insertions older
    1695             :  * than 'upto' to finish, the return value is normally >= 'upto' (see below).
    1696             :  *
    1697             :  * Note: When you are about to write out WAL, you must call this function
    1698             :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
    1699             :  * need to wait for an insertion to finish (or at least advance to next
    1700             :  * uninitialized page), and the inserter might need to evict an old WAL buffer
    1701             :  * to make room for a new one, which in turn requires WALWriteLock.
    1702             :  */
    1703             : static XLogRecPtr
    1704       10333 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
    1705             : {
    1706             :     uint64      bytepos;
    1707             :     XLogRecPtr  reservedUpto;
    1708             :     XLogRecPtr  finishedUpto;
    1709       10333 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1710             :     int         i;
    1711             : 
    1712       10333 :     if (MyProc == NULL)
    1713           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    1714             : 
    1715             :     /* Read the current insert position */
    1716       10333 :     SpinLockAcquire(&Insert->insertpos_lck);
    1717       10333 :     bytepos = Insert->CurrBytePos;
    1718       10333 :     SpinLockRelease(&Insert->insertpos_lck);
    1719       10333 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
    1720             : 
    1721             :     /*
    1722             :      * No-one should request to flush a piece of WAL that hasn't even been
    1723             :      * reserved yet. However, it can happen if there is a block with a bogus
    1724             :      * LSN on disk, for example. XLogFlush checks for that situation and
    1725             :      * complains, but only after the flush. Here we just assume that to mean
    1726             :      * that all WAL that has been reserved needs to be finished. In this
    1727             :      * corner-case, the return value can be smaller than the 'upto' argument.
    1728             :      */
    1729       10333 :     if (upto > reservedUpto)
    1730             :     {
    1731           0 :         elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
    1732             :              (uint32) (upto >> 32), (uint32) upto,
    1733             :              (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
    1734           0 :         upto = reservedUpto;
    1735             :     }
    1736             : 
    1737             :     /*
    1738             :      * Loop through all the locks, sleeping on any in-progress insert older
    1739             :      * than 'upto'.
    1740             :      *
    1741             :      * finishedUpto is our return value, indicating the point up to which all
    1742             :      * the WAL insertions have been finished. Initialize it to the head of
    1743             :      * reserved WAL, and as we iterate through the insertion locks, back it
    1744             :      * out for any insertion that's still in progress.
    1745             :      */
    1746       10333 :     finishedUpto = reservedUpto;
    1747       92997 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1748             :     {
    1749       82664 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
    1750             : 
    1751             :         do
    1752             :         {
    1753             :             /*
    1754             :              * See if this insertion is in progress. LWLockWaitForVar will wait
    1755             :              * for the lock to be released, or for the 'value' to be set by a
    1756             :              * LWLockUpdateVar call.  When a lock is initially acquired, its
    1757             :              * value is 0 (InvalidXLogRecPtr), which means that we don't know
    1758             :              * where it's inserting yet.  We will have to wait for it.  If
    1759             :              * it's a small insertion, the record will most likely fit on the
    1760             :              * same page and the inserter will release the lock without ever
    1761             :              * calling LWLockUpdateVar.  But if it has to sleep, it will
    1762             :              * advertise the insertion point with LWLockUpdateVar before
    1763             :              * sleeping.
    1764             :              */
    1765      165336 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
    1766       82668 :                                  &WALInsertLocks[i].l.insertingAt,
    1767             :                                  insertingat, &insertingat))
    1768             :             {
    1769             :                 /* the lock was free, so no insertion in progress */
    1770       82489 :                 insertingat = InvalidXLogRecPtr;
    1771       82489 :                 break;
    1772             :             }
    1773             : 
    1774             :             /*
    1775             :              * This insertion is still in progress. Have to wait, unless the
    1776             :              * inserter has proceeded past 'upto'.
    1777             :              */
    1778         179 :         } while (insertingat < upto);
    1779             : 
    1780       82664 :         if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
    1781         175 :             finishedUpto = insertingat;
    1782             :     }
    1783       10333 :     return finishedUpto;
    1784             : }
    1785             : 
    1786             : /*
    1787             :  * Get a pointer to the right location in the WAL buffer containing the
    1788             :  * given XLogRecPtr.
    1789             :  *
    1790             :  * If the page is not initialized yet, it is initialized. That might require
    1791             :  * evicting an old dirty buffer from the buffer cache, which means I/O.
    1792             :  *
    1793             :  * The caller must ensure that the page containing the requested location
    1794             :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
    1795             :  * hold onto a WAL insertion lock with the insertingAt position set to
    1796             :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
    1797             :  * to evict an old page from the buffer. (This means that once you call
    1798             :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
    1799             :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
    1800             :  * later, because older buffers might be recycled already)
    1801             :  */
    1802             : static char *
    1803     1327065 : GetXLogBuffer(XLogRecPtr ptr)
    1804             : {
    1805             :     int         idx;
    1806             :     XLogRecPtr  endptr;
    1807             :     static uint64 cachedPage = 0;
    1808             :     static char *cachedPos = NULL;
    1809             :     XLogRecPtr  expectedEndPtr;
    1810             : 
    1811             :     /*
    1812             :      * Fast path for the common case that we need to access again the same
    1813             :      * page as last time.
    1814             :      */
    1815     1327065 :     if (ptr / XLOG_BLCKSZ == cachedPage)
    1816             :     {
    1817     1295596 :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1818     1295596 :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1819     1295596 :         return cachedPos + ptr % XLOG_BLCKSZ;
    1820             :     }
    1821             : 
    1822             :     /*
    1823             :      * The XLog buffer cache is organized so that a page is always loaded to a
    1824             :      * particular buffer.  That way we can easily calculate the buffer a given
    1825             :      * page must be loaded into, from the XLogRecPtr alone.
    1826             :      */
    1827       31469 :     idx = XLogRecPtrToBufIdx(ptr);
    1828             : 
    1829             :     /*
    1830             :      * See what page is loaded in the buffer at the moment. It could be the
    1831             :      * page we're looking for, or something older. It can't be anything newer
    1832             :      * - that would imply the page we're looking for has already been written
    1833             :      * out to disk and evicted, and the caller is responsible for making sure
    1834             :      * that doesn't happen.
    1835             :      *
    1836             :      * However, we don't hold a lock while we read the value. If someone has
    1837             :      * just initialized the page, it's possible that we get a "torn read" of
    1838             :      * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
    1839             :      * that case we will see a bogus value. That's ok, we'll grab the mapping
    1840             :      * lock (in AdvanceXLInsertBuffer) and retry if we see anything other than
    1841             :      * the page we're looking for. But it means that when we do this unlocked
    1842             :      * read, we might see a value that appears to be ahead of the page we're
    1843             :      * looking for. Don't PANIC on that, until we've verified the value while
    1844             :      * holding the lock.
    1845             :      */
    1846       31469 :     expectedEndPtr = ptr;
    1847       31469 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
    1848             : 
    1849       31469 :     endptr = XLogCtl->xlblocks[idx];
    1850       31469 :     if (expectedEndPtr != endptr)
    1851             :     {
    1852             :         XLogRecPtr  initializedUpto;
    1853             : 
    1854             :         /*
    1855             :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
    1856             :          * know how far we're finished with inserting the record.
    1857             :          *
    1858             :          * NB: If 'ptr' points to just after the page header, advertise a
    1859             :          * position at the beginning of the page rather than 'ptr' itself. If
    1860             :          * there are no other insertions running, someone might try to flush
    1861             :          * up to our advertised location. If we advertised a position after
    1862             :          * the page header, someone might try to flush the page header, even
    1863             :          * though the page might actually not be initialized yet. As the first
    1864             :          * inserter on the page, we are effectively responsible for making
    1865             :          * sure that it's initialized, before we let insertingAt move past
    1866             :          * the page header.
    1867             :          */
    1868        1062 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
    1869          36 :             ptr % XLOG_SEG_SIZE > XLOG_BLCKSZ)
    1870          36 :             initializedUpto = ptr - SizeOfXLogShortPHD;
    1871         990 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
    1872           0 :                  ptr % XLOG_SEG_SIZE < XLOG_BLCKSZ)
    1873           0 :             initializedUpto = ptr - SizeOfXLogLongPHD;
    1874             :         else
    1875         990 :             initializedUpto = ptr;
    1876             : 
    1877        1026 :         WALInsertLockUpdateInsertingAt(initializedUpto);
    1878             : 
    1879        1026 :         AdvanceXLInsertBuffer(ptr, false);
    1880        1026 :         endptr = XLogCtl->xlblocks[idx];
    1881             : 
    1882        1026 :         if (expectedEndPtr != endptr)
    1883           0 :             elog(PANIC, "could not find WAL buffer for %X/%X",
    1884             :                  (uint32) (ptr >> 32), (uint32) ptr);
    1885             :     }
    1886             :     else
    1887             :     {
    1888             :         /*
    1889             :          * Make sure the initialization of the page is visible to us, and
    1890             :          * won't arrive later to overwrite the WAL data we write on the page.
    1891             :          */
    1892       30443 :         pg_memory_barrier();
    1893             :     }
    1894             : 
    1895             :     /*
    1896             :      * Found the buffer holding this page. Return a pointer to the right
    1897             :      * offset within the page.
    1898             :      */
    1899       31469 :     cachedPage = ptr / XLOG_BLCKSZ;
    1900       31469 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1901             : 
    1902       31469 :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1903       31469 :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1904             : 
    1905       31469 :     return cachedPos + ptr % XLOG_BLCKSZ;
    1906             : }
    1907             : 
    1908             : /*
    1909             :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
    1910             :  * is the position starting from the beginning of WAL, excluding all WAL
    1911             :  * page headers.
    1912             :  */
    1913             : static XLogRecPtr
    1914     2613439 : XLogBytePosToRecPtr(uint64 bytepos)
    1915             : {
    1916             :     uint64      fullsegs;
    1917             :     uint64      fullpages;
    1918             :     uint64      bytesleft;
    1919             :     uint32      seg_offset;
    1920             :     XLogRecPtr  result;
    1921             : 
    1922     2613439 :     fullsegs = bytepos / UsableBytesInSegment;
    1923     2613439 :     bytesleft = bytepos % UsableBytesInSegment;
    1924             : 
    1925     2613439 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1926             :     {
    1927             :         /* fits on first page of segment */
    1928        1493 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
    1929             :     }
    1930             :     else
    1931             :     {
    1932             :         /* account for the first page on segment with long header */
    1933     2611946 :         seg_offset = XLOG_BLCKSZ;
    1934     2611946 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1935             : 
    1936     2611946 :         fullpages = bytesleft / UsableBytesInPage;
    1937     2611946 :         bytesleft = bytesleft % UsableBytesInPage;
    1938             : 
    1939     2611946 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1940             :     }
    1941             : 
    1942     2613439 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
    1943             : 
    1944     2613439 :     return result;
    1945             : }
    1946             : 
    1947             : /*
    1948             :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
    1949             :  * returns a pointer to the beginning of the page (ie. before page header),
    1950             :  * not to where the first xlog record on that page would go to. This is used
    1951             :  * when converting a pointer to the end of a record.
    1952             :  */
    1953             : static XLogRecPtr
    1954     1317047 : XLogBytePosToEndRecPtr(uint64 bytepos)
    1955             : {
    1956             :     uint64      fullsegs;
    1957             :     uint64      fullpages;
    1958             :     uint64      bytesleft;
    1959             :     uint32      seg_offset;
    1960             :     XLogRecPtr  result;
    1961             : 
    1962     1317047 :     fullsegs = bytepos / UsableBytesInSegment;
    1963     1317047 :     bytesleft = bytepos % UsableBytesInSegment;
    1964             : 
    1965     1317047 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1966             :     {
    1967             :         /* fits on first page of segment */
    1968         762 :         if (bytesleft == 0)
    1969           0 :             seg_offset = 0;
    1970             :         else
    1971         762 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
    1972             :     }
    1973             :     else
    1974             :     {
    1975             :         /* account for the first page on segment with long header */
    1976     1316285 :         seg_offset = XLOG_BLCKSZ;
    1977     1316285 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1978             : 
    1979     1316285 :         fullpages = bytesleft / UsableBytesInPage;
    1980     1316285 :         bytesleft = bytesleft % UsableBytesInPage;
    1981             : 
    1982     1316285 :         if (bytesleft == 0)
    1983         636 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
    1984             :         else
    1985     1315649 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1986             :     }
    1987             : 
    1988     1317047 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
    1989             : 
    1990     1317047 :     return result;
    1991             : }
    1992             : 
    1993             : /*
    1994             :  * Convert an XLogRecPtr to a "usable byte position".
    1995             :  */
    1996             : static uint64
    1997     3920148 : XLogRecPtrToBytePos(XLogRecPtr ptr)
    1998             : {
    1999             :     uint64      fullsegs;
    2000             :     uint32      fullpages;
    2001             :     uint32      offset;
    2002             :     uint64      result;
    2003             : 
    2004     3920148 :     XLByteToSeg(ptr, fullsegs);
    2005             : 
    2006     3920148 :     fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
    2007     3920148 :     offset = ptr % XLOG_BLCKSZ;
    2008             : 
    2009     3920148 :     if (fullpages == 0)
    2010             :     {
    2011        2240 :         result = fullsegs * UsableBytesInSegment;
    2012        2240 :         if (offset > 0)
    2013             :         {
    2014        2240 :             Assert(offset >= SizeOfXLogLongPHD);
    2015        2240 :             result += offset - SizeOfXLogLongPHD;
    2016             :         }
    2017             :     }
    2018             :     else
    2019             :     {
    2020     7835816 :         result = fullsegs * UsableBytesInSegment +
    2021     3917908 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
    2022     3917908 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
    2023     3917908 :         if (offset > 0)
    2024             :         {
    2025     3917278 :             Assert(offset >= SizeOfXLogShortPHD);
    2026     3917278 :             result += offset - SizeOfXLogShortPHD;
    2027             :         }
    2028             :     }
    2029             : 
    2030     3920148 :     return result;
    2031             : }
    2032             : 
    2033             : /*
    2034             :  * Initialize XLOG buffers, writing out old buffers if they still contain
    2035             :  * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
    2036             :  * true, initialize as many pages as we can without having to write out
    2037             :  * unwritten data. Any new pages are initialized to zeros, with page headers
    2038             :  * initialized properly.
    2039             :  */
    2040             : static void
    2041        1608 : AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
    2042             : {
    2043        1608 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    2044             :     int         nextidx;
    2045             :     XLogRecPtr  OldPageRqstPtr;
    2046             :     XLogwrtRqst WriteRqst;
    2047        1608 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
    2048             :     XLogRecPtr  NewPageBeginPtr;
    2049             :     XLogPageHeader NewPage;
    2050        1608 :     int         npages = 0;
    2051             : 
    2052        1608 :     LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2053             : 
    2054             :     /*
    2055             :      * Now that we have the lock, check if someone initialized the page
    2056             :      * already.
    2057             :      */
    2058       24871 :     while (upto >= XLogCtl->InitializedUpTo || opportunistic)
    2059             :     {
    2060       22237 :         nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
    2061             : 
    2062             :         /*
    2063             :          * Get ending-offset of the buffer page we need to replace (this may
    2064             :          * be zero if the buffer hasn't been used yet).  Fall through if it's
    2065             :          * already written out.
    2066             :          */
    2067       22237 :         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
    2068       22237 :         if (LogwrtResult.Write < OldPageRqstPtr)
    2069             :         {
    2070             :             /*
    2071             :              * Nope, got work to do. If we just want to pre-initialize as much
    2072             :              * as we can without flushing, give up now.
    2073             :              */
    2074         749 :             if (opportunistic)
    2075         582 :                 break;
    2076             : 
    2077             :             /* Before waiting, get info_lck and update LogwrtResult */
    2078         167 :             SpinLockAcquire(&XLogCtl->info_lck);
    2079         167 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
    2080           0 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
    2081         167 :             LogwrtResult = XLogCtl->LogwrtResult;
    2082         167 :             SpinLockRelease(&XLogCtl->info_lck);
    2083             : 
    2084             :             /*
    2085             :              * Now that we have an up-to-date LogwrtResult value, see if we
    2086             :              * still need to write it or if someone else already did.
    2087             :              */
    2088         167 :             if (LogwrtResult.Write < OldPageRqstPtr)
    2089             :             {
    2090             :                 /*
    2091             :                  * Must acquire write lock. Release WALBufMappingLock first,
    2092             :                  * to make sure that all insertions that we need to wait for
    2093             :                  * can finish (up to this same position). Otherwise we risk
    2094             :                  * deadlock.
    2095             :                  */
    2096         167 :                 LWLockRelease(WALBufMappingLock);
    2097             : 
    2098         167 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
    2099             : 
    2100         167 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    2101             : 
    2102         167 :                 LogwrtResult = XLogCtl->LogwrtResult;
    2103         167 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
    2104             :                 {
    2105             :                     /* OK, someone wrote it already */
    2106           3 :                     LWLockRelease(WALWriteLock);
    2107             :                 }
    2108             :                 else
    2109             :                 {
    2110             :                     /* Have to write it ourselves */
    2111             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
    2112         164 :                     WriteRqst.Write = OldPageRqstPtr;
    2113         164 :                     WriteRqst.Flush = 0;
    2114         164 :                     XLogWrite(WriteRqst, false);
    2115         164 :                     LWLockRelease(WALWriteLock);
    2116             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
    2117             :                 }
    2118             :                 /* Re-acquire WALBufMappingLock and retry */
    2119         167 :                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    2120         167 :                 continue;
    2121             :             }
    2122             :         }
    2123             : 
    2124             :         /*
    2125             :          * Now the next buffer slot is free and we can set it up to be the
    2126             :          * next output page.
    2127             :          */
    2128       21488 :         NewPageBeginPtr = XLogCtl->InitializedUpTo;
    2129       21488 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    2130             : 
    2131       21488 :         Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
    2132             : 
    2133       21488 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    2134             : 
    2135             :         /*
    2136             :          * Be sure to re-zero the buffer so that bytes beyond what we've
    2137             :          * written will look like zeroes and not valid XLOG records...
    2138             :          */
    2139       21488 :         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
    2140             : 
    2141             :         /*
    2142             :          * Fill the new page's header
    2143             :          */
    2144       21488 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
    2145             : 
    2146             :         /* NewPage->xlp_info = 0; */ /* done by memset */
    2147       21488 :         NewPage->xlp_tli = ThisTimeLineID;
    2148       21488 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
    2149             : 
    2150             :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
    2151             : 
    2152             :         /*
    2153             :          * If online backup is not in progress, mark the header to indicate
    2154             :          * that WAL records beginning in this page have removable backup
    2155             :          * blocks.  This allows the WAL archiver to know whether it is safe to
    2156             :          * compress archived WAL data by transforming full-block records into
    2157             :          * the non-full-block format.  It is sufficient to record this at the
    2158             :          * page level because we force a page switch (in fact a segment
    2159             :          * switch) when starting a backup, so the flag will be off before any
    2160             :          * records can be written during the backup.  At the end of a backup,
    2161             :          * the last page will be marked as all unsafe when perhaps only part
    2162             :          * is unsafe, but at worst the archiver would miss the opportunity to
    2163             :          * compress a few records.
    2164             :          */
    2165       21488 :         if (!Insert->forcePageWrites)
    2166       21488 :             NewPage->xlp_info |= XLP_BKP_REMOVABLE;
    2167             : 
    2168             :         /*
    2169             :          * If first page of an XLOG segment file, make it a long header.
    2170             :          */
    2171       21488 :         if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
    2172             :         {
    2173          10 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    2174             : 
    2175          10 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
    2176          10 :             NewLongPage->xlp_seg_size = XLogSegSize;
    2177          10 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    2178          10 :             NewPage->xlp_info |= XLP_LONG_HEADER;
    2179             :         }
    2180             : 
    2181             :         /*
    2182             :          * Make sure the initialization of the page becomes visible to others
    2183             :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
    2184             :          * holding a lock.
    2185             :          */
    2186       21488 :         pg_write_barrier();
    2187             : 
    2188       21488 :         *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
    2189             : 
    2190       21488 :         XLogCtl->InitializedUpTo = NewPageEndPtr;
    2191             : 
    2192       21488 :         npages++;
    2193             :     }
    2194        1608 :     LWLockRelease(WALBufMappingLock);
    2195             : 
    2196             : #ifdef WAL_DEBUG
    2197             :     if (XLOG_DEBUG && npages > 0)
    2198             :     {
    2199             :         elog(DEBUG1, "initialized %d pages, up to %X/%X",
    2200             :              npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
    2201             :     }
    2202             : #endif
    2203        1608 : }
    2204             : 
    2205             : /*
    2206             :  * Calculate CheckPointSegments based on max_wal_size_mb and
    2207             :  * checkpoint_completion_target.
                     :  *
                     :  * Takes no arguments and returns nothing: the result is stored in the
                     :  * CheckPointSegments variable, truncated to an integer and then raised
                     :  * to 1 if it would otherwise be smaller.
    2208             :  */
    2209             : static void
    2210           5 : CalculateCheckpointSegments(void)
    2211             : {
    2212             :     double      target;
    2213             : 
    2214             :     /*-------
    2215             :      * Calculate the distance at which to trigger a checkpoint, to avoid
    2216             :      * exceeding max_wal_size_mb. This is based on two assumptions:
    2217             :      *
    2218             :      * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
    2219             :      * b) during checkpoint, we consume checkpoint_completion_target *
    2220             :      *    number of segments consumed between checkpoints.
    2221             :      *-------
    2222             :      */
    2223           5 :     target = (double) ConvertToXSegs(max_wal_size_mb) / (2.0 + CheckPointCompletionTarget);
    2224             : 
    2225             :     /* round down (the cast to int truncates the fractional part) */
    2226           5 :     CheckPointSegments = (int) target;
    2227             : 
    2228           5 :     if (CheckPointSegments < 1)     /* enforce a floor of 1 */
    2229           0 :         CheckPointSegments = 1;
    2230           5 : }
    2231             : /*
                     :  * Store newval as the new max_wal_size_mb and recompute
                     :  * CheckPointSegments from it.  The 'extra' argument is not used.
                     :  *
                     :  * NOTE(review): the name and signature suggest this is a GUC assign
                     :  * hook -- confirm against the GUC table.
                     :  */
    2232             : void
    2233           5 : assign_max_wal_size(int newval, void *extra)
    2234             : {
    2235           5 :     max_wal_size_mb = newval;
    2236           5 :     CalculateCheckpointSegments();
    2237           5 : }
    2238             : /*
                     :  * Store newval as the new CheckPointCompletionTarget and recompute
                     :  * CheckPointSegments from it.  The 'extra' argument is not used.
                     :  *
                     :  * NOTE(review): the name and signature suggest this is a GUC assign
                     :  * hook -- confirm against the GUC table.
                     :  */
    2239             : void
    2240           0 : assign_checkpoint_completion_target(double newval, void *extra)
    2241             : {
    2242           0 :     CheckPointCompletionTarget = newval;
    2243           0 :     CalculateCheckpointSegments();
    2244           0 : }
    2245             : 
    2246             : /*
    2247             :  * At a checkpoint, how many WAL segments to recycle as preallocated future
    2248             :  * XLOG segments? Returns the highest segment that should be preallocated.
                     :  *
                     :  * PriorRedoPtr is the WAL position the estimate is measured from (the
                     :  * long comment in the body explains why the prior redo pointer is used).
                     :  * The result is the segment number at the estimated end of the next
                     :  * checkpoint, padded by 10%, and then kept within the limits derived
                     :  * from min_wal_size_mb and max_wal_size_mb.
    2249             :  */
    2250             : static XLogSegNo
    2251           0 : XLOGfileslop(XLogRecPtr PriorRedoPtr)
    2252             : {
    2253             :     XLogSegNo   minSegNo;
    2254             :     XLogSegNo   maxSegNo;
    2255             :     double      distance;
    2256             :     XLogSegNo   recycleSegNo;
    2257             : 
    2258             :     /*
    2259             :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
    2260             :      * correspond to. Always recycle enough segments to meet the minimum, and
    2261             :      * remove enough segments to stay below the maximum.
    2262             :      */
    2263           0 :     minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + ConvertToXSegs(min_wal_size_mb) - 1;
    2264           0 :     maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + ConvertToXSegs(max_wal_size_mb) - 1;
    2265             : 
    2266             :     /*
    2267             :      * Between those limits, recycle enough segments to get us through to the
    2268             :      * estimated end of next checkpoint.
    2269             :      *
    2270             :      * To estimate where the next checkpoint will finish, assume that the
    2271             :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
    2272             :      * every checkpoint.
    2273             :      *
    2274             :      * The reason this calculation is done from the prior checkpoint, not the
    2275             :      * one that just finished, is that this behaves better if some checkpoint
    2276             :      * cycles are abnormally short, like if you perform a manual checkpoint
    2277             :      * right after a timed one. The manual checkpoint will make almost a full
    2278             :      * cycle's worth of WAL segments available for recycling, because the
    2279             :      * segments from the prior's prior, fully-sized checkpoint cycle are no
    2280             :      * longer needed. However, the next checkpoint will make only few segments
    2281             :      * available for recycling, the ones generated between the timed
    2282             :      * checkpoint and the manual one right after that. If at the manual
    2283             :      * checkpoint we only retained enough segments to get us to the next timed
    2284             :      * one, and removed the rest, then at the next checkpoint we would not
    2285             :      * have enough segments around for recycling, to get us to the checkpoint
    2286             :      * after that. Basing the calculations on the distance from the prior redo
    2287             :      * pointer largely fixes that problem.
    2288             :      */
    2289           0 :     distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
    2290             :     /* add 10% for good measure. */
    2291           0 :     distance *= 1.10;
    2292             : 
    2293           0 :     recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);   /* round up to a whole segment */
    2294             : 
    2295           0 :     if (recycleSegNo < minSegNo)
    2296           0 :         recycleSegNo = minSegNo;
    2297           0 :     if (recycleSegNo > maxSegNo)    /* checked last, so the maximum wins if the two limits conflict */
    2298           0 :         recycleSegNo = maxSegNo;
    2299             : 
    2300           0 :     return recycleSegNo;
    2301             : }
    2302             : 
    2303             : /*
    2304             :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    2305             :  *
    2306             :  * new_segno indicates a log file that has just been filled up (or read
    2307             :  * during recovery). We measure the distance from RedoRecPtr to new_segno
    2308             :  * and see if that exceeds CheckPointSegments.
    2309             :  *
    2310             :  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
                     :  *
                     :  * Returns true when new_segno is at least (CheckPointSegments - 1)
                     :  * segments past the segment obtained from RedoRecPtr, false otherwise.
    2311             :  */
    2312             : static bool
    2313          10 : XLogCheckpointNeeded(XLogSegNo new_segno)
    2314             : {
    2315             :     XLogSegNo   old_segno;
    2316             : 
    2317          10 :     XLByteToSeg(RedoRecPtr, old_segno);     /* old_segno = segment for the local RedoRecPtr */
    2318             : 
    2319          10 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
    2320           0 :         return true;
    2321          10 :     return false;
    2322             : }
    2323             : 
    2324             : /*
    2325             :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    2326             :  *
    2327             :  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
    2328             :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    2329             :  * This option allows us to avoid uselessly issuing multiple writes when a
    2330             :  * single one would do.
    2331             :  *
    2332             :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
    2333             :  * must be called before grabbing the lock, to make sure the data is ready to
    2334             :  * write.
                     :  *
                     :  * WriteRqst.Write is the position to write up to and WriteRqst.Flush the
                     :  * position to fsync up to; the fsync step is skipped when
                     :  * LogwrtResult.Flush has already reached WriteRqst.Flush or there is
                     :  * nothing written beyond what was flushed.  Before returning, the local
                     :  * LogwrtResult is copied into XLogCtl->LogwrtResult and the shared
                     :  * request pointers are advanced so they do not lag behind the result.
                     :  *
                     :  * I/O failures (lseek, write) are reported at PANIC level.
    2335             :  */
    2336             : static void
    2337       10084 : XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
    2338             : {
    2339             :     bool        ispartialpage;
    2340             :     bool        last_iteration;
    2341             :     bool        finishing_seg;
    2342             :     bool        use_existent;
    2343             :     int         curridx;
    2344             :     int         npages;
    2345             :     int         startidx;
    2346             :     uint32      startoffset;
    2347             : 
    2348             :     /* We should always be inside a critical section here */
    2349       10084 :     Assert(CritSectionCount > 0);
    2350             : 
    2351             :     /*
    2352             :      * Update local LogwrtResult (caller probably did this already, but...)
    2353             :      */
    2354       10084 :     LogwrtResult = XLogCtl->LogwrtResult;
    2355             : 
    2356             :     /*
    2357             :      * Since successive pages in the xlog cache are consecutively allocated,
    2358             :      * we can usually gather multiple pages together and issue just one
    2359             :      * write() call.  npages is the number of pages we have determined can be
    2360             :      * written together; startidx is the cache block index of the first one,
    2361             :      * and startoffset is the file offset at which it should go. The latter
    2362             :      * two variables are only valid when npages > 0, but we must initialize
    2363             :      * all of them to keep the compiler quiet.
    2364             :      */
    2365       10084 :     npages = 0;
    2366       10084 :     startidx = 0;
    2367       10084 :     startoffset = 0;
    2368             : 
    2369             :     /*
    2370             :      * Within the loop, curridx is the cache block index of the page to
    2371             :      * consider writing.  Begin at the buffer containing the next unwritten
    2372             :      * page, or last partially written page.
    2373             :      */
    2374       10084 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
    2375             : 
    2376       40671 :     while (LogwrtResult.Write < WriteRqst.Write)
    2377             :     {
    2378             :         /*
    2379             :          * Make sure we're not ahead of the insert process.  This could happen
    2380             :          * if we're passed a bogus WriteRqst.Write that is past the end of the
    2381             :          * last page that's been initialized by AdvanceXLInsertBuffer.
    2382             :          */
    2383       30414 :         XLogRecPtr  EndPtr = XLogCtl->xlblocks[curridx];
    2384             : 
    2385       30414 :         if (LogwrtResult.Write >= EndPtr)
    2386           0 :             elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
    2387             :                  (uint32) (LogwrtResult.Write >> 32),
    2388             :                  (uint32) LogwrtResult.Write,
    2389             :                  (uint32) (EndPtr >> 32), (uint32) EndPtr);
    2390             : 
    2391             :         /* Advance LogwrtResult.Write to end of current buffer page */
    2392       30414 :         LogwrtResult.Write = EndPtr;
    2393       30414 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
    2394             : 
    2395       30414 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
    2396             :         {
    2397             :             /*
    2398             :              * Switch to new logfile segment.  We cannot have any pending
    2399             :              * pages here (since we dump what we have at segment end).
    2400             :              */
    2401         256 :             Assert(npages == 0);
    2402         256 :             if (openLogFile >= 0)
    2403          51 :                 XLogFileClose();
    2404         256 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
    2405             : 
    2406             :             /* create/use new log file */
    2407         256 :             use_existent = true;
    2408         256 :             openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
    2409         256 :             openLogOff = 0;
    2410             :         }
    2411             : 
    2412             :         /* Make sure we have the current logfile open */
    2413       30414 :         if (openLogFile < 0)
    2414             :         {
    2415           0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
    2416           0 :             openLogFile = XLogFileOpen(openLogSegNo);
    2417           0 :             openLogOff = 0;
    2418             :         }
    2419             : 
    2420             :         /* Add current page to the set of pending pages-to-dump */
    2421       30414 :         if (npages == 0)
    2422             :         {
    2423             :             /* first of group: remember where the batch starts in cache and in file */
    2424       10112 :             startidx = curridx;
    2425       10112 :             startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
    2426             :         }
    2427       30414 :         npages++;
    2428             : 
    2429             :         /*
    2430             :          * Dump the set if this will be the last loop iteration, or if we are
    2431             :          * at the last page of the cache area (since the next page won't be
    2432             :          * contiguous in memory), or if we are at the end of the logfile
    2433             :          * segment.
    2434             :          */
    2435       30414 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
    2436             : 
    2437       51395 :         finishing_seg = !ispartialpage &&
    2438       20981 :             (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
    2439             : 
    2440       50754 :         if (last_iteration ||
    2441       40642 :             curridx == XLogCtl->XLogCacheBlck ||
    2442             :             finishing_seg)
    2443             :         {
    2444             :             char       *from;
    2445             :             Size        nbytes;
    2446             :             Size        nleft;
    2447             :             int         written;
    2448             : 
    2449             :             /* Need to seek in the file? */
    2450       10112 :             if (openLogOff != startoffset)
    2451             :             {
    2452        8875 :                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
    2453           0 :                     ereport(PANIC,
    2454             :                             (errcode_for_file_access(),
    2455             :                              errmsg("could not seek in log file %s to offset %u: %m",
    2456             :                                     XLogFileNameP(ThisTimeLineID, openLogSegNo),
    2457             :                                     startoffset)));
    2458        8875 :                 openLogOff = startoffset;
    2459             :             }
    2460             : 
    2461             :             /*
                     :              * OK to write the page(s).  write() may transfer fewer bytes
                     :              * than requested, so keep going until nleft reaches zero; a
                     :              * call that fails with EINTR is simply retried.
                     :              *
                     :              * The PANIC message reports the offset and length of the whole
                     :              * batch (openLogOff, nbytes), not of the failing partial write.
                     :              *
                     :              * NOTE(review): errno is cleared before each write(); if
                     :              * write() returns <= 0 without setting errno, the %m in the
                     :              * message has no real error to report -- confirm whether that
                     :              * case should be treated as out-of-disk-space (ENOSPC).
                     :              */
    2462       10112 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    2463       10112 :             nbytes = npages * (Size) XLOG_BLCKSZ;
    2464       10112 :             nleft = nbytes;
    2465             :             do
    2466             :             {
    2467       10112 :                 errno = 0;
    2468       10112 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
    2469       10112 :                 written = write(openLogFile, from, nleft);
    2470       10112 :                 pgstat_report_wait_end();
    2471       10112 :                 if (written <= 0)
    2472             :                 {
    2473           0 :                     if (errno == EINTR)
    2474           0 :                         continue;
    2475           0 :                     ereport(PANIC,
    2476             :                             (errcode_for_file_access(),
    2477             :                              errmsg("could not write to log file %s "
    2478             :                                     "at offset %u, length %zu: %m",
    2479             :                                     XLogFileNameP(ThisTimeLineID, openLogSegNo),
    2480             :                                     openLogOff, nbytes)));
    2481             :                 }
    2482       10112 :                 nleft -= written;
    2483       10112 :                 from += written;
    2484       10112 :             } while (nleft > 0);
    2485             : 
    2486             :             /* Update state for write: file position advances, batch is now empty */
    2487       10112 :             openLogOff += nbytes;
    2488       10112 :             npages = 0;
    2489             : 
    2490             :             /*
    2491             :              * If we just wrote the whole last page of a logfile segment,
    2492             :              * fsync the segment immediately.  This avoids having to go back
    2493             :              * and re-open prior segments when an fsync request comes along
    2494             :              * later. Doing it here ensures that one and only one backend will
    2495             :              * perform this fsync.
    2496             :              *
    2497             :              * This is also the right place to notify the Archiver that the
    2498             :              * segment is ready to copy to archival storage, and to update the
    2499             :              * timer for archive_timeout, and to signal for a checkpoint if
    2500             :              * too many logfile segments have been used since the last
    2501             :              * checkpoint.
    2502             :              */
    2503       10112 :             if (finishing_seg)
    2504             :             {
    2505          10 :                 issue_xlog_fsync(openLogFile, openLogSegNo);
    2506             : 
    2507             :                 /* signal that we need to wakeup walsenders later */
    2508          10 :                 WalSndWakeupRequest();
    2509             : 
    2510          10 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
    2511             : 
    2512          10 :                 if (XLogArchivingActive())
    2513           0 :                     XLogArchiveNotifySeg(openLogSegNo);
    2514             : 
    2515          10 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    2516          10 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
    2517             : 
    2518             :                 /*
    2519             :                  * Request a checkpoint if we've consumed too much xlog since
    2520             :                  * the last one.  For speed, we first check using the local
    2521             :                  * copy of RedoRecPtr, which might be out of date; if it looks
    2522             :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
    2523             :                  * recheck.
    2524             :                  */
    2525          10 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
    2526             :                 {
    2527           0 :                     (void) GetRedoRecPtr();
    2528           0 :                     if (XLogCheckpointNeeded(openLogSegNo))
    2529           0 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    2530             :                 }
    2531             :             }
    2532             :         }
    2533             : 
    2534       30414 :         if (ispartialpage)
    2535             :         {
    2536             :             /* Only asked to write a partial page */
    2537        9433 :             LogwrtResult.Write = WriteRqst.Write;
    2538        9433 :             break;
    2539             :         }
    2540       20981 :         curridx = NextBufIdx(curridx);
    2541             : 
    2542             :         /* If flexible, break out of loop as soon as we wrote something */
    2543       20981 :         if (flexible && npages == 0)
    2544         478 :             break;
    2545             :     }
    2546             : 
    2547       10084 :     Assert(npages == 0);
    2548             : 
    2549             :     /*
    2550             :      * If asked to flush, do so
    2551             :      */
    2552       19652 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
    2553        9568 :         LogwrtResult.Flush < LogwrtResult.Write)
    2554             : 
    2555             :     {
    2556             :         /*
    2557             :          * Could get here without iterating above loop, in which case we might
    2558             :          * have no open file or the wrong one.  However, we do not need to
    2559             :          * fsync more than one file.
    2560             :          */
    2561       19134 :         if (sync_method != SYNC_METHOD_OPEN &&
    2562        9567 :             sync_method != SYNC_METHOD_OPEN_DSYNC)
    2563             :         {
    2564       19133 :             if (openLogFile >= 0 &&
    2565        9566 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
    2566           0 :                 XLogFileClose();
    2567        9567 :             if (openLogFile < 0)
    2568             :             {
    2569           1 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
    2570           1 :                 openLogFile = XLogFileOpen(openLogSegNo);
    2571           1 :                 openLogOff = 0;
    2572             :             }
    2573             : 
    2574        9567 :             issue_xlog_fsync(openLogFile, openLogSegNo);
    2575             :         }
    2576             : 
    2577             :         /* signal that we need to wakeup walsenders later */
    2578        9567 :         WalSndWakeupRequest();
    2579             : 
    2580        9567 :         LogwrtResult.Flush = LogwrtResult.Write;
    2581             :     }
    2582             : 
    2583             :     /*
    2584             :      * Update shared-memory status
    2585             :      *
    2586             :      * We make sure that the shared 'request' values do not fall behind the
    2587             :      * 'result' values.  This is not absolutely essential, but it saves some
    2588             :      * code in a couple of places.
    2589             :      */
    2590             :     {
    2591       10084 :         SpinLockAcquire(&XLogCtl->info_lck);
    2592       10084 :         XLogCtl->LogwrtResult = LogwrtResult;
    2593       10084 :         if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
    2594        8951 :             XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    2595       10084 :         if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
    2596        9569 :             XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    2597       10084 :         SpinLockRelease(&XLogCtl->info_lck);
    2598             :     }
    2599       10084 : }
    2600             : 
    2601             : /*
    2602             :  * Record the LSN for an asynchronous transaction commit/abort
    2603             :  * and nudge the WALWriter if there is work for it to do.
    2604             :  * (This should not be called for synchronous commits.)
                     :  *
                     :  * asyncXactLSN is stored in XLogCtl->asyncXactLSN only if it is later
                     :  * than the value already recorded there.  As a side effect the local
                     :  * LogwrtResult is refreshed from shared memory.
    2605             :  */
    2606             : void
    2607        1580 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
    2608             : {
    2609        1580 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
    2610             :     bool        sleeping;
    2611             : 
    2612        1580 :     SpinLockAcquire(&XLogCtl->info_lck);
    2613        1580 :     LogwrtResult = XLogCtl->LogwrtResult;
    2614        1580 :     sleeping = XLogCtl->WalWriterSleeping;
    2615        1580 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)      /* the shared value only moves forward */
    2616        1574 :         XLogCtl->asyncXactLSN = asyncXactLSN;
    2617        1580 :     SpinLockRelease(&XLogCtl->info_lck);
    2618             : 
    2619             :     /*
    2620             :      * If the WALWriter is sleeping, we should kick it to make it come out of
    2621             :      * low-power mode.  Otherwise, determine whether there's a full page of
    2622             :      * WAL available to write.
    2623             :      */
    2624        1580 :     if (!sleeping)
    2625             :     {
    2626             :         /* back off to last completed page boundary */
    2627        1580 :         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
    2628             : 
    2629             :         /* if we have already flushed that far, we're done */
    2630        1580 :         if (WriteRqstPtr <= LogwrtResult.Flush)
    2631        2567 :             return;
    2632             :     }
    2633             : 
    2634             :     /*
    2635             :      * Nudge the WALWriter: it has a full page of WAL to write, or we want it
    2636             :      * to come out of low-power mode so that this async commit will reach disk
    2637             :      * within the expected amount of time.
                     :      *
                     :      * The latch pointer is checked first; presumably it is unset when no
                     :      * WALWriter process is running -- TODO confirm.
    2638             :      */
    2639         593 :     if (ProcGlobal->walwriterLatch)
    2640         503 :         SetLatch(ProcGlobal->walwriterLatch);
    2641             : }
    2642             : 
    2643             : /*
    2644             :  * Record the LSN up to which we can remove WAL because it's not required by
    2645             :  * any replication slot.
                     :  *
                     :  * The value is stored in XLogCtl->replicationSlotMinLSN while holding
                     :  * the info_lck spinlock.  It overwrites the previous value
                     :  * unconditionally; no comparison with the old value is made here.
    2646             :  */
    2647             : void
    2648           3 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
    2649             : {
    2650           3 :     SpinLockAcquire(&XLogCtl->info_lck);
    2651           3 :     XLogCtl->replicationSlotMinLSN = lsn;
    2652           3 :     SpinLockRelease(&XLogCtl->info_lck);
    2653           3 : }
    2654             : 
    2655             : 
    2656             : /*
    2657             :  * Return the oldest LSN we must retain to satisfy the needs of some
    2658             :  * replication slot.
                     :  *
                     :  * This is the current XLogCtl->replicationSlotMinLSN, read while
                     :  * holding the info_lck spinlock and returned by value.
    2659             :  */
    2660             : static XLogRecPtr
    2661          11 : XLogGetReplicationSlotMinimumLSN(void)
    2662             : {
    2663             :     XLogRecPtr  retval;
    2664             : 
    2665          11 :     SpinLockAcquire(&XLogCtl->info_lck);
    2666          11 :     retval = XLogCtl->replicationSlotMinLSN;
    2667          11 :     SpinLockRelease(&XLogCtl->info_lck);
    2668             : 
    2669          11 :     return retval;
    2670             : }
    2671             : 
    2672             : /*
    2673             :  * Advance minRecoveryPoint in control file.
    2674             :  *
    2675             :  * If we crash during recovery, we must reach this point again before the
    2676             :  * database is consistent.
    2677             :  *
    2678             :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
    2679             :  * is only updated if it's not already greater than or equal to 'lsn'.
    2680             :  */
    2681             : static void
    2682           0 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
    2683             : {
    2684             :     /* Quick check using our local copy of the variable */
    2685           0 :     if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
    2686           0 :         return;
    2687             : 
    2688           0 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2689             : 
    2690             :     /* update local copy */
    2691           0 :     minRecoveryPoint = ControlFile->minRecoveryPoint;
    2692           0 :     minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2693             : 
    2694             :     /*
    2695             :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
    2696             :      * i.e., we're doing crash recovery.  We never modify the control file's
    2697             :      * value in that case, so we can short-circuit future checks here too.
    2698             :      */
    2699           0 :     if (minRecoveryPoint == 0)
    2700           0 :         updateMinRecoveryPoint = false;
    2701           0 :     else if (force || minRecoveryPoint < lsn)
    2702             :     {
    2703             :         XLogRecPtr  newMinRecoveryPoint;
    2704             :         TimeLineID  newMinRecoveryPointTLI;
    2705             : 
    2706             :         /*
    2707             :          * To avoid having to update the control file too often, we update it
    2708             :          * all the way to the last record being replayed, even though 'lsn'
    2709             :          * would suffice for correctness.  This also allows the 'force' case
    2710             :          * to not need a valid 'lsn' value.
    2711             :          *
    2712             :          * Another important reason for doing it this way is that the passed
    2713             :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
    2714             :          * the caller got it from a corrupted heap page.  Accepting such a
    2715             :          * value as the min recovery point would prevent us from coming up at
    2716             :          * all.  Instead, we just log a warning and continue with recovery.
    2717             :          * (See also the comments about corrupt LSNs in XLogFlush.)
    2718             :          */
    2719           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    2720           0 :         newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
    2721           0 :         newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
    2722           0 :         SpinLockRelease(&XLogCtl->info_lck);
    2723             : 
    2724           0 :         if (!force && newMinRecoveryPoint < lsn)
    2725           0 :             elog(WARNING,
    2726             :                  "xlog min recovery request %X/%X is past current point %X/%X",
    2727             :                  (uint32) (lsn >> 32), (uint32) lsn,
    2728             :                  (uint32) (newMinRecoveryPoint >> 32),
    2729             :                  (uint32) newMinRecoveryPoint);
    2730             : 
    2731             :         /* update control file */
    2732           0 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
    2733             :         {
    2734           0 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
    2735           0 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
    2736           0 :             UpdateControlFile();
    2737           0 :             minRecoveryPoint = newMinRecoveryPoint;
    2738           0 :             minRecoveryPointTLI = newMinRecoveryPointTLI;
    2739             : 
    2740           0 :             ereport(DEBUG2,
    2741             :                     (errmsg("updated min recovery point to %X/%X on timeline %u",
    2742             :                             (uint32) (minRecoveryPoint >> 32),
    2743             :                             (uint32) minRecoveryPoint,
    2744             :                             newMinRecoveryPointTLI)));
    2745             :         }
    2746             :     }
    2747           0 :     LWLockRelease(ControlFileLock);
    2748             : }
    2749             : 
    2750             : /*
    2751             :  * Ensure that all XLOG data through the given position is flushed to disk.
    2752             :  *
    2753             :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    2754             :  * already held, and we try to avoid acquiring it if possible.
    2755             :  */
    2756             : void
    2757       17690 : XLogFlush(XLogRecPtr record)
    2758             : {
    2759             :     XLogRecPtr  WriteRqstPtr;
    2760             :     XLogwrtRqst WriteRqst;
    2761             : 
    2762             :     /*
    2763             :      * During REDO, we are reading not writing WAL.  Therefore, instead of
    2764             :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
    2765             :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
    2766             :      * to act this way too, and because when it tries to write the
    2767             :      * end-of-recovery checkpoint, it should indeed flush.
    2768             :      */
    2769       17690 :     if (!XLogInsertAllowed())
    2770             :     {
    2771           0 :         UpdateMinRecoveryPoint(record, false);
    2772           0 :         return;
    2773             :     }
    2774             : 
    2775             :     /* Quick exit if already known flushed */
    2776       17690 :     if (record <= LogwrtResult.Flush)
    2777        8163 :         return;
    2778             : 
    2779             : #ifdef WAL_DEBUG
    2780             :     if (XLOG_DEBUG)
    2781             :         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
    2782             :              (uint32) (record >> 32), (uint32) record,
    2783             :              (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
    2784             :              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
    2785             : #endif
    2786             : 
    2787        9527 :     START_CRIT_SECTION();
    2788             : 
    2789             :     /*
    2790             :      * Since fsync is usually a horribly expensive operation, we try to
    2791             :      * piggyback as much data as we can on each fsync: if we see any more data
    2792             :      * entered into the xlog buffer, we'll write and fsync that too, so that
    2793             :      * the final value of LogwrtResult.Flush is as large as possible. This
    2794             :      * gives us some chance of avoiding another fsync immediately after.
    2795             :      */
    2796             : 
    2797             :     /* initialize to given target; may increase below */
    2798        9527 :     WriteRqstPtr = record;
    2799             : 
    2800             :     /*
    2801             :      * Now wait until we get the write lock, or someone else does the flush
    2802             :      * for us.
    2803             :      */
    2804             :     for (;;)
    2805             :     {
    2806             :         XLogRecPtr  insertpos;
    2807             : 
    2808             :         /* read LogwrtResult and update local state */
    2809        9640 :         SpinLockAcquire(&XLogCtl->info_lck);
    2810        9640 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
    2811          83 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
    2812        9640 :         LogwrtResult = XLogCtl->LogwrtResult;
    2813        9640 :         SpinLockRelease(&XLogCtl->info_lck);
    2814             : 
    2815             :         /* done already? */
    2816        9640 :         if (record <= LogwrtResult.Flush)
    2817          56 :             break;
    2818             : 
    2819             :         /*
    2820             :          * Before actually performing the write, wait for all in-flight
    2821             :          * insertions to the pages we're about to write to finish.
    2822             :          */
    2823        9584 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
    2824             : 
    2825             :         /*
    2826             :          * Try to get the write lock. If we can't get it immediately, wait
    2827             :          * until it's released, and recheck if we still need to do the flush
    2828             :          * or if the backend that held the lock did it for us already. This
    2829             :          * helps to maintain a good rate of group committing when the system
    2830             :          * is bottlenecked by the speed of fsyncing.
    2831             :          */
    2832        9584 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
    2833             :         {
    2834             :             /*
    2835             :              * The lock is now free, but we didn't acquire it yet. Before we
    2836             :              * do, loop back to check if someone else flushed the record for
    2837             :              * us already.
    2838             :              */
    2839         113 :             continue;
    2840             :         }
    2841             : 
    2842             :         /* Got the lock; recheck whether request is satisfied */
    2843        9471 :         LogwrtResult = XLogCtl->LogwrtResult;
    2844        9471 :         if (record <= LogwrtResult.Flush)
    2845             :         {
    2846          51 :             LWLockRelease(WALWriteLock);
    2847          51 :             break;
    2848             :         }
    2849             : 
    2850             :         /*
    2851             :          * Sleep before flush! By adding a delay here, we may give further
    2852             :          * backends the opportunity to join the backlog of group commit
    2853             :          * followers; this can significantly improve transaction throughput,
    2854             :          * at the risk of increasing transaction latency.
    2855             :          *
    2856             :          * We do not sleep if enableFsync is not turned on, nor if there are
    2857             :          * fewer than CommitSiblings other backends with active transactions.
    2858             :          */
    2859        9420 :         if (CommitDelay > 0 && enableFsync &&
    2860           0 :             MinimumActiveBackends(CommitSiblings))
    2861             :         {
    2862           0 :             pg_usleep(CommitDelay);
    2863             : 
    2864             :             /*
    2865             :              * Re-check how far we can now flush the WAL. It's generally not
    2866             :              * safe to call WaitXLogInsertionsToFinish while holding
    2867             :              * WALWriteLock, because an in-progress insertion might need to
    2868             :              * also grab WALWriteLock to make progress. But we know that all
    2869             :              * the insertions up to insertpos have already finished, because
    2870             :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
    2871             :              * We're only calling it again to allow insertpos to be moved
    2872             :              * further forward, not to actually wait for anyone.
    2873             :              */
    2874           0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
    2875             :         }
    2876             : 
    2877             :         /* try to write/flush later additions to XLOG as well */
    2878        9420 :         WriteRqst.Write = insertpos;
    2879        9420 :         WriteRqst.Flush = insertpos;
    2880             : 
    2881        9420 :         XLogWrite(WriteRqst, false);
    2882             : 
    2883        9420 :         LWLockRelease(WALWriteLock);
    2884             :         /* done */
    2885        9420 :         break;
    2886         113 :     }
    2887             : 
    2888        9527 :     END_CRIT_SECTION();
    2889             : 
    2890             :     /* wake up walsenders now that we've released heavily contended locks */
    2891        9527 :     WalSndWakeupProcessRequests();
    2892             : 
    2893             :     /*
    2894             :      * If we still haven't flushed to the request point then we have a
    2895             :      * problem; most likely, the requested flush point is past end of XLOG.
    2896             :      * This has been seen to occur when a disk page has a corrupted LSN.
    2897             :      *
    2898             :      * Formerly we treated this as a PANIC condition, but that hurts the
    2899             :      * system's robustness rather than helping it: we do not want to take down
    2900             :      * the whole system due to corruption on one data page.  In particular, if
    2901             :      * the bad page is encountered again during recovery then we would be
    2902             :      * unable to restart the database at all!  (This scenario actually
    2903             :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
    2904             :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
    2905             :      * the only time we can reach here during recovery is while flushing the
    2906             :      * end-of-recovery checkpoint record, and we don't expect that to have a
    2907             :      * bad LSN.
    2908             :      *
    2909             :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
    2910             :      * since xact.c calls this routine inside a critical section.  However,
    2911             :      * calls from bufmgr.c are not within critical sections and so we will not
    2912             :      * force a restart for a bad LSN on a data page.
    2913             :      */
    2914        9527 :     if (LogwrtResult.Flush < record)
    2915           0 :         elog(ERROR,
    2916             :              "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
    2917             :              (uint32) (record >> 32), (uint32) record,
    2918             :              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
    2919             : }
    2920             : 
    2921             : /*
    2922             :  * Write & flush xlog, but without specifying exactly where to.
    2923             :  *
    2924             :  * We normally write only completed blocks; but if there is nothing to do on
    2925             :  * that basis, we check for unwritten async commits in the current incomplete
    2926             :  * block, and write through the latest one of those.  Thus, if async commits
    2927             :  * are not being used, we will write complete blocks only.
    2928             :  *
    2929             :  * If, based on the above, there's anything to write we do so immediately. But
    2930             :  * to avoid calling fsync, fdatasync et al. at a rate that'd impact
    2931             :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
    2932             :  * more than wal_writer_flush_after unflushed blocks.
    2933             :  *
    2934             :  * We can guarantee that async commits reach disk after at most three
    2935             :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
    2936             :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
    2937             :  * this makes a difference only with very high load or long wal_writer_delay,
    2938             :  * but imposes one extra cycle for the worst case for async commits.)
    2939             :  *
    2940             :  * This routine is invoked periodically by the background walwriter process.
    2941             :  *
    2942             :  * Returns TRUE if there was any work to do, even if we skipped flushing due
    2943             :  * to wal_writer_delay/wal_writer_flush_after.
    2944             :  */
    2945             : bool
    2946         904 : XLogBackgroundFlush(void)
    2947             : {
    2948             :     XLogwrtRqst WriteRqst;
    2949         904 :     bool        flexible = true;
    2950             :     static TimestampTz lastflush;
    2951             :     TimestampTz now;
    2952             :     int         flushbytes;
    2953             : 
    2954             :     /* XLOG doesn't need flushing during recovery */
    2955         904 :     if (RecoveryInProgress())
    2956           0 :         return false;
    2957             : 
    2958             :     /* read LogwrtResult and update local state */
    2959         904 :     SpinLockAcquire(&XLogCtl->info_lck);
    2960         904 :     LogwrtResult = XLogCtl->LogwrtResult;
    2961         904 :     WriteRqst = XLogCtl->LogwrtRqst;
    2962         904 :     SpinLockRelease(&XLogCtl->info_lck);
    2963             : 
    2964             :     /* back off to last completed page boundary */
    2965         904 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
    2966             : 
    2967             :     /* if we have already flushed that far, consider async commit records */
    2968         904 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    2969             :     {
    2970         344 :         SpinLockAcquire(&XLogCtl->info_lck);
    2971         344 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
    2972         344 :         SpinLockRelease(&XLogCtl->info_lck);
    2973         344 :         flexible = false;       /* ensure it all gets written */
    2974             :     }
    2975             : 
    2976             :     /*
    2977             :      * If already known flushed, we're done. Just need to check if we are
    2978             :      * holding an open file handle to a logfile that's no longer in use,
    2979             :      * preventing the file from being deleted.
    2980             :      */
    2981         904 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    2982             :     {
    2983         322 :         if (openLogFile >= 0)
    2984             :         {
    2985         289 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
    2986             :             {
    2987           2 :                 XLogFileClose();
    2988             :             }
    2989             :         }
    2990         322 :         return false;
    2991             :     }
    2992             : 
    2993             :     /*
    2994             :      * Determine how far to flush WAL, based on the wal_writer_delay and
    2995             :      * wal_writer_flush_after GUCs.
    2996             :      */
    2997         582 :     now = GetCurrentTimestamp();
    2998        1164 :     flushbytes =
    2999        1164 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    3000             : 
    3001         582 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
    3002             :     {
    3003             :         /* first call, or block based limits disabled */
    3004           1 :         WriteRqst.Flush = WriteRqst.Write;
    3005           1 :         lastflush = now;
    3006             :     }
    3007         581 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
    3008             :     {
    3009             :         /*
    3010             :          * Flush the writes at least every WalWriterDelay ms. This is important
    3011             :          * to bound the amount of time it takes for an asynchronous commit to
    3012             :          * hit disk.
    3013             :          */
    3014         154 :         WriteRqst.Flush = WriteRqst.Write;
    3015         154 :         lastflush = now;
    3016             :     }
    3017         427 :     else if (flushbytes >= WalWriterFlushAfter)
    3018             :     {
    3019             :         /* exceeded wal_writer_flush_after blocks, flush */
    3020           0 :         WriteRqst.Flush = WriteRqst.Write;
    3021           0 :         lastflush = now;
    3022             :     }
    3023             :     else
    3024             :     {
    3025             :         /* no flushing, this time round */
    3026         427 :         WriteRqst.Flush = 0;
    3027             :     }
    3028             : 
    3029             : #ifdef WAL_DEBUG
    3030             :     if (XLOG_DEBUG)
    3031             :         elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
    3032             :              (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
    3033             :              (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
    3034             :              (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
    3035             :              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
    3036             : #endif
    3037             : 
    3038         582 :     START_CRIT_SECTION();
    3039             : 
    3040             :     /* now wait for any in-progress insertions to finish and get write lock */
    3041         582 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
    3042         582 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    3043         582 :     LogwrtResult = XLogCtl->LogwrtResult;
    3044         664 :     if (WriteRqst.Write > LogwrtResult.Write ||
    3045          82 :         WriteRqst.Flush > LogwrtResult.Flush)
    3046             :     {
    3047         500 :         XLogWrite(WriteRqst, flexible);
    3048             :     }
    3049         582 :     LWLockRelease(WALWriteLock);
    3050             : 
    3051         582 :     END_CRIT_SECTION();
    3052             : 
    3053             :     /* wake up walsenders now that we've released heavily contended locks */
    3054         582 :     WalSndWakeupProcessRequests();
    3055             : 
    3056             :     /*
    3057             :      * Great, done. To take some work off the critical path, try to initialize
    3058             :      * as many of the no-longer-needed WAL buffers for future use as we can.
    3059             :      */
    3060         582 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
    3061             : 
    3062             :     /*
    3063             :      * If we determined that we need to write data, but somebody else
    3064             :      * wrote/flushed already, it should be considered as being active, to
    3065             :      * avoid hibernating too early.
    3066             :      */
    3067         582 :     return true;
    3068             : }
    3069             : 
    3070             : /*
    3071             :  * Test whether XLOG data has been flushed up to (at least) the given position.
    3072             :  *
    3073             :  * Returns true if a flush is still needed.  (It may be that someone else
    3074             :  * is already in process of flushing that far, however.)
    3075             :  */
    3076             : bool
    3077      704592 : XLogNeedsFlush(XLogRecPtr record)
    3078             : {
    3079             :     /*
    3080             :      * During recovery, we don't flush WAL but update minRecoveryPoint
    3081             :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
    3082             :      * would need to be updated.
    3083             :      */
    3084      704592 :     if (RecoveryInProgress())
    3085             :     {
    3086             :         /* Quick exit if already known updated */
    3087           0 :         if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
    3088           0 :             return false;
    3089             : 
    3090             :         /*
    3091             :          * Update local copy of minRecoveryPoint. But if the lock is busy,
    3092             :          * just return a conservative guess.
    3093             :          */
    3094           0 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
    3095           0 :             return true;
    3096           0 :         minRecoveryPoint = ControlFile->minRecoveryPoint;
    3097           0 :         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    3098           0 :         LWLockRelease(ControlFileLock);
    3099             : 
    3100             :         /*
    3101             :          * An invalid minRecoveryPoint means that we need to recover all the
    3102             :          * WAL, i.e., we're doing crash recovery.  We never modify the control
    3103             :          * file's value in that case, so we can short-circuit future checks
    3104             :          * here too.
    3105             :          */
    3106           0 :         if (minRecoveryPoint == 0)
    3107           0 :             updateMinRecoveryPoint = false;
    3108             : 
    3109             :         /* check again */
    3110           0 :         if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
    3111           0 :             return false;
    3112             :         else
    3113           0 :             return true;
    3114             :     }
    3115             : 
    3116             :     /* Quick exit if already known flushed */
    3117      704592 :     if (record <= LogwrtResult.Flush)
    3118      702664 :         return false;
    3119             : 
    3120             :     /* read LogwrtResult and update local state */
    3121        1928 :     SpinLockAcquire(&XLogCtl->info_lck);
    3122        1928 :     LogwrtResult = XLogCtl->LogwrtResult;
    3123        1928 :     SpinLockRelease(&XLogCtl->info_lck);
    3124             : 
    3125             :     /* check again */
    3126        1928 :     if (record <= LogwrtResult.Flush)
    3127         204 :         return false;
    3128             : 
    3129        1724 :     return true;
    3130             : }
    3131             : 
    3132             : /*
    3133             :  * Create a new XLOG file segment, or open a pre-existing one.
    3134             :  *
    3135             :  * logsegno: identify segment to be created/opened.
    3136             :  *
    3137             :  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
    3138             :  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
    3139             :  * file was used.
    3140             :  *
    3141             :  * use_lock: if TRUE, acquire ControlFileLock while moving file into
    3142             :  * place.  This should be TRUE except during bootstrap log creation.  The
    3143             :  * caller must *not* hold the lock at call.
    3144             :  *
    3145             :  * Returns FD of opened file.
    3146             :  *
    3147             :  * Note: errors here are ERROR not PANIC because we might or might not be
    3148             :  * inside a critical section (eg, during checkpoint there is no reason to
    3149             :  * take down the system on failure).  They will promote to PANIC if we are
    3150             :  * in a critical section.
    3151             :  */
    3152             : int
    3153         257 : XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
    3154             : {
    3155             :     char        path[MAXPGPATH];
    3156             :     char        tmppath[MAXPGPATH];
    3157             :     char        zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
    3158             :     char       *zbuffer;
    3159             :     XLogSegNo   installed_segno;
    3160             :     XLogSegNo   max_segno;
    3161             :     int         fd;
    3162             :     int         nbytes;
    3163             : 
    3164         257 :     XLogFilePath(path, ThisTimeLineID, logsegno);
    3165             : 
    3166             :     /*
    3167             :      * Try to use existent file (checkpoint maker may have created it already)
    3168             :      */
    3169         257 :     if (*use_existent)
    3170             :     {
    3171         256 :         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
    3172             :                            S_IRUSR | S_IWUSR);
    3173         256 :         if (fd < 0)
    3174             :         {
    3175          10 :             if (errno != ENOENT)
    3176           0 :                 ereport(ERROR,
    3177             :                         (errcode_for_file_access(),
    3178             :                          errmsg("could not open file \"%s\": %m", path)));
    3179             :         }
    3180             :         else
    3181         246 :             return fd;
    3182             :     }
    3183             : 
    3184             :     /*
    3185             :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    3186             :      * another process is doing the same thing.  If so, we will end up
    3187             :      * pre-creating an extra log segment.  That seems OK, and better than
    3188             :      * holding the lock throughout this lengthy process.
    3189             :      */
    3190          11 :     elog(DEBUG2, "creating and filling new WAL file");
    3191             : 
    3192          11 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3193             : 
    3194          11 :     unlink(tmppath);
    3195             : 
    3196             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3197          11 :     fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
    3198             :                        S_IRUSR | S_IWUSR);
    3199          11 :     if (fd < 0)
    3200           0 :         ereport(ERROR,
    3201             :                 (errcode_for_file_access(),
    3202             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3203             : 
    3204             :     /*
    3205             :      * Zero-fill the file.  We have to do this the hard way to ensure that all
    3206             :      * the file space has really been allocated --- on platforms that allow
    3207             :      * "holes" in files, just seeking to the end doesn't allocate intermediate
    3208             :      * space.  This way, we know that we have all the space and (after the
    3209             :      * fsync below) that all the indirect blocks are down on disk.  Therefore,
    3210             :      * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
    3211             :      * log file.
    3212             :      *
    3213             :      * Note: ensure the buffer is reasonably well-aligned; this may save a few
    3214             :      * cycles transferring data to the kernel.
    3215             :      */
    3216          11 :     zbuffer = (char *) MAXALIGN(zbuffer_raw);
    3217          11 :     memset(zbuffer, 0, XLOG_BLCKSZ);
    3218       22539 :     for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
    3219             :     {
    3220       22528 :         errno = 0;
    3221       22528 :         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    3222       22528 :         if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
    3223             :         {
    3224           0 :             int         save_errno = errno;
    3225             : 
    3226             :             /*
    3227             :              * If we fail to make the file, delete it to release disk space
    3228             :              */
    3229           0 :             unlink(tmppath);
    3230             : 
    3231           0 :             close(fd);
    3232             : 
    3233             :             /* if write didn't set errno, assume problem is no disk space */
    3234           0 :             errno = save_errno ? save_errno : ENOSPC;
    3235             : 
    3236           0 :             ereport(ERROR,
    3237             :                     (errcode_for_file_access(),
    3238             :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3239             :         }
    3240       22528 :         pgstat_report_wait_end();
    3241             :     }
    3242             : 
    3243          11 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    3244          11 :     if (pg_fsync(fd) != 0)
    3245             :     {
    3246           0 :         close(fd);
    3247           0 :         ereport(ERROR,
    3248             :                 (errcode_for_file_access(),
    3249             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3250             :     }
    3251          11 :     pgstat_report_wait_end();
    3252             : 
    3253          11 :     if (close(fd))
    3254           0 :         ereport(ERROR,
    3255             :                 (errcode_for_file_access(),
    3256             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3257             : 
    3258             :     /*
    3259             :      * Now move the segment into place with its final name.
    3260             :      *
    3261             :      * If caller didn't want to use a pre-existing file, get rid of any
    3262             :      * pre-existing file.  Otherwise, cope with possibility that someone else
    3263             :      * has created the file while we were filling ours: if so, use ours to
    3264             :      * pre-create a future log segment.
    3265             :      */
    3266          11 :     installed_segno = logsegno;
    3267             : 
    3268             :     /*
    3269             :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
    3270             :      * that was a constant, but that was always a bit dubious: normally, at a
    3271             :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
    3272             :      * here, it was the offset from the insert location. We can't do the
    3273             :      * normal XLOGfileslop calculation here because we don't have access to
    3274             :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
    3275             :      * CheckPointSegments.
    3276             :      */
    3277          11 :     max_segno = logsegno + CheckPointSegments;
    3278          22 :     if (!InstallXLogFileSegment(&installed_segno, tmppath,
    3279          11 :                                 *use_existent, max_segno,
    3280             :                                 use_lock))
    3281             :     {
    3282             :         /*
    3283             :          * No need for any more future segments, or InstallXLogFileSegment()
    3284             :          * failed to rename the file into place. If the rename failed, opening
    3285             :          * the file below will fail.
    3286             :          */
    3287           0 :         unlink(tmppath);
    3288             :     }
    3289             : 
    3290             :     /* Set flag to tell caller there was no existent file */
    3291          11 :     *use_existent = false;
    3292             : 
    3293             :     /* Now open original target segment (might not be file I just made) */
    3294          11 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
    3295             :                        S_IRUSR | S_IWUSR);
    3296          11 :     if (fd < 0)
    3297           0 :         ereport(ERROR,
    3298             :                 (errcode_for_file_access(),
    3299             :                  errmsg("could not open file \"%s\": %m", path)));
    3300             : 
    3301          11 :     elog(DEBUG2, "done creating and filling new WAL file");
    3302             : 
    3303          11 :     return fd;
    3304             : }
    3305             : 
    3306             : /*
    3307             :  * Create a new XLOG file segment by copying a pre-existing one.
    3308             :  *
    3309             :  * destsegno: identify segment to be created.
    3310             :  *
    3311             :  * srcTLI, srcsegno: identify segment to be copied (could be from
    3312             :  *      a different timeline)
    3313             :  *
    3314             :  * upto: how many bytes, counted from the start of the source file, to
    3315             :  *      copy (the rest of the new segment is filled with zeros)
    3316             :  *
    3317             :  * Currently this is only used during recovery, and so there are no locking
    3318             :  * considerations.  But we should be just as tense as XLogFileInit to avoid
    3319             :  * emplacing a bogus file.
    3320             :  */
    3321             : static void
    3322           0 : XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
    3323             :              int upto)
    3324             : {
    3325             :     char        path[MAXPGPATH];
    3326             :     char        tmppath[MAXPGPATH];
    3327             :     char        buffer[XLOG_BLCKSZ];
    3328             :     int         srcfd;
    3329             :     int         fd;
    3330             :     int         nbytes;
    3331             : 
    3332             :     /*
    3333             :      * Open the source file
    3334             :      */
    3335           0 :     XLogFilePath(path, srcTLI, srcsegno);
    3336           0 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
    3337           0 :     if (srcfd < 0)
    3338           0 :         ereport(ERROR,
    3339             :                 (errcode_for_file_access(),
    3340             :                  errmsg("could not open file \"%s\": %m", path)));
    3341             : 
    3342             :     /*
    3343             :      * Copy into a temp file name.
    3344             :      */
    3345           0 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3346             : 
    3347           0 :     unlink(tmppath);
    3348             : 
    3349             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3350           0 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
    3351             :                            S_IRUSR | S_IWUSR);
    3352           0 :     if (fd < 0)
    3353           0 :         ereport(ERROR,
    3354             :                 (errcode_for_file_access(),
    3355             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3356             : 
    3357             :     /*
    3358             :      * Do the data copying, one XLOG_BLCKSZ-sized buffer at a time.
    3359             :      */
    3360           0 :     for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
    3361             :     {
    3362             :         int         nread;
    3363             : 
    3364           0 :         nread = upto - nbytes;
    3365             : 
    3366             :         /*
    3367             :          * Zero the buffer unless a whole block is about to be read into it.
    3368             :          * Compare as int: past upto nread is negative, not a huge size_t.
    3369             :          */
    3370           0 :         if (nread < (int) sizeof(buffer))
    3371           0 :             memset(buffer, 0, sizeof(buffer));
    3372             : 
    3373           0 :         if (nread > 0)
    3374             :         {
    3375           0 :             if (nread > sizeof(buffer))
    3376           0 :                 nread = sizeof(buffer);
    3377           0 :             errno = 0;
    3378           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
    3379           0 :             if (read(srcfd, buffer, nread) != nread)
    3380             :             {
    3381           0 :                 if (errno != 0)
    3382           0 :                     ereport(ERROR,
    3383             :                             (errcode_for_file_access(),
    3384             :                              errmsg("could not read file \"%s\": %m",
    3385             :                                     path)));
    3386             :                 else
    3387           0 :                     ereport(ERROR,
    3388             :                             (errmsg("not enough data in file \"%s\"",
    3389             :                                     path)));
    3390             :             }
    3391           0 :             pgstat_report_wait_end();
    3392             :         }
    3393           0 :         errno = 0;
    3394           0 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
    3395           0 :         if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
    3396             :         {
    3397           0 :             int         save_errno = errno;
    3398             : 
    3399             :             /*
    3400             :              * If we fail to make the file, delete it to release disk space
    3401             :              */
    3402           0 :             unlink(tmppath);
    3403             :             /* if write didn't set errno, assume problem is no disk space */
    3404           0 :             errno = save_errno ? save_errno : ENOSPC;
    3405             : 
    3406           0 :             ereport(ERROR,
    3407             :                     (errcode_for_file_access(),
    3408             :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3409             :         }
    3410           0 :         pgstat_report_wait_end();
    3411             :     }
    3412             : 
    3413           0 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
    3414           0 :     if (pg_fsync(fd) != 0)
    3415           0 :         ereport(ERROR,
    3416             :                 (errcode_for_file_access(),
    3417             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3418           0 :     pgstat_report_wait_end();
    3419             : 
    3420           0 :     if (CloseTransientFile(fd))
    3421           0 :         ereport(ERROR,
    3422             :                 (errcode_for_file_access(),
    3423             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3424             : 
    3425           0 :     CloseTransientFile(srcfd);
    3426             : 
    3427             :     /*
    3428             :      * Now move the segment into place with its final name.
    3429             :      */
    3430           0 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
    3431           0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
    3432           0 : }
    3433             : 
    3434             : /*
    3435             :  * Install a new XLOG segment file as a current or future log segment.
    3436             :  *
    3437             :  * This is used both to install a newly-created segment (which has a temp
    3438             :  * filename while it's being created) and to recycle an old segment.
    3439             :  *
    3440             :  * *segno: identify segment to install as (or first possible target), named
    3441             :  * using ThisTimeLineID.  When find_free is TRUE, this is modified on return
    3442             :  * to indicate the actual installation location or last segment searched.
    3443             :  *
    3444             :  * tmppath: initial name of file to install.  It will be renamed into place.
    3445             :  *
    3446             :  * find_free: if TRUE, install the new segment at the first empty segno
    3447             :  * number at or after the passed number.  If FALSE, install the new segment
    3448             :  * exactly where specified, deleting any existing segment file there.
    3449             :  *
    3450             :  * max_segno: maximum segment number to install the new file as.  Fail if no
    3451             :  * free slot is found between *segno and max_segno. (Ignored when find_free
    3452             :  * is FALSE.)
    3453             :  *
    3454             :  * use_lock: if TRUE, acquire ControlFileLock while moving file into
    3455             :  * place.  This should be TRUE except during bootstrap log creation.  The
    3456             :  * caller must *not* hold the lock at call.
    3457             :  *
    3458             :  * Returns TRUE if the file was installed successfully.  FALSE indicates that
    3459             :  * max_segno limit was exceeded, or an error occurred while renaming the
    3460             :  * file into place.
    3461             :  */
    3462             : static bool
    3463          11 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
    3464             :                        bool find_free, XLogSegNo max_segno,
    3465             :                        bool use_lock)
    3466             : {
    3467             :     char        path[MAXPGPATH];
    3468             :     struct stat stat_buf;
    3469             : 
    3470          11 :     XLogFilePath(path, ThisTimeLineID, *segno);
    3471             : 
    3472             :     /*
    3473             :      * We want to be sure that only one process does this at a time.
    3474             :      */
    3475          11 :     if (use_lock)
    3476          10 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    3477             : 
    3478          11 :     if (!find_free)
    3479             :     {
    3480             :         /* Force installation: get rid of any pre-existing segment file */
    3481           1 :         durable_unlink(path, DEBUG1);
    3482             :     }
    3483             :     else
    3484             :     {
    3485             :         /* Find a free slot: advance *segno until stat() finds no file */
    3486          20 :         while (stat(path, &stat_buf) == 0)
    3487             :         {
    3488           0 :             if ((*segno) >= max_segno)
    3489             :             {
    3490             :                 /* Failed to find a free slot within specified range */
    3491           0 :                 if (use_lock)
    3492           0 :                     LWLockRelease(ControlFileLock);
    3493           0 :                 return false;
    3494             :             }
    3495           0 :             (*segno)++;
    3496           0 :             XLogFilePath(path, ThisTimeLineID, *segno);
    3497             :         }
    3498             :     }
    3499             : 
    3500             :     /*
    3501             :      * Perform the rename using link if available, paranoidly trying to avoid
    3502             :      * overwriting an existing file (there shouldn't be one).
    3503             :      */
    3504          11 :     if (durable_link_or_rename(tmppath, path, LOG) != 0)
    3505             :     {
    3506           0 :         if (use_lock)
    3507           0 :             LWLockRelease(ControlFileLock);
    3508             :         /* durable_link_or_rename already emitted log message */
    3509           0 :         return false;
    3510             :     }
    3511             : 
    3512          11 :     if (use_lock)
    3513          10 :         LWLockRelease(ControlFileLock);
    3514             : 
    3515          11 :     return true;
    3516             : }
    3517             : 
    3518             : /*
    3519             :  * Open a pre-existing logfile segment for writing; PANIC if that fails.
    3520             :  */
    3521             : int
    3522           1 : XLogFileOpen(XLogSegNo segno)
    3523             : {
    3524             :     char        path[MAXPGPATH];
    3525             :     int         fd;
    3526             : 
    3527           1 :     XLogFilePath(path, ThisTimeLineID, segno);
    3528             : 
    3529           1 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
    3530             :                        S_IRUSR | S_IWUSR);
    3531           1 :     if (fd < 0)
    3532           0 :         ereport(PANIC,
    3533             :                 (errcode_for_file_access(),
    3534             :                  errmsg("could not open write-ahead log file \"%s\": %m", path)));
    3535             : 
    3536           1 :     return fd;
    3537             : }
    3538             : 
    3539             : /*
    3540             :  * Open a logfile segment for reading (during recovery).  Returns the fd, or
    3541             :  * -1 if archive restore fails or the file is missing and notfoundOk is set;
    3542             :  * any other open failure is a PANIC (emode is not consulted here).
    3543             :  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
    3544             :  * Otherwise, it's assumed to be already available in pg_wal.
    3545             :  */
    3546             : static int
    3547           3 : XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
    3548             :              int source, bool notfoundOk)
    3549             : {
    3550             :     char        xlogfname[MAXFNAMELEN];
    3551             :     char        activitymsg[MAXFNAMELEN + 16];
    3552             :     char        path[MAXPGPATH];
    3553             :     int         fd;
    3554             : 
    3555           3 :     XLogFileName(xlogfname, tli, segno);
    3556             : 
    3557           3 :     switch (source)
    3558             :     {
    3559             :         case XLOG_FROM_ARCHIVE:
    3560             :             /* Report recovery progress in PS display */
    3561           0 :             snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
    3562             :                      xlogfname);
    3563           0 :             set_ps_display(activitymsg, false);
    3564             : 
    3565           0 :             restoredFromArchive = RestoreArchivedFile(path, xlogfname,
    3566             :                                                       "RECOVERYXLOG",
    3567             :                                                       XLogSegSize,
    3568             :                                                       InRedo);
    3569           0 :             if (!restoredFromArchive)
    3570           0 :                 return -1;
    3571           0 :             break;
    3572             : 
    3573             :         case XLOG_FROM_PG_WAL:
    3574             :         case XLOG_FROM_STREAM:
    3575           3 :             XLogFilePath(path, tli, segno);
    3576           3 :             restoredFromArchive = false;
    3577           3 :             break;
    3578             : 
    3579             :         default:
    3580           0 :             elog(ERROR, "invalid XLogFileRead source %d", source);
    3581             :     }
    3582             : 
    3583             :     /*
    3584             :      * If the segment was fetched from archival storage, replace the existing
    3585             :      * xlog segment (if any) with the archival version.
    3586             :      */
    3587           3 :     if (source == XLOG_FROM_ARCHIVE)
    3588             :     {
    3589           0 :         KeepFileRestoredFromArchive(path, xlogfname);
    3590             : 
    3591             :         /*
    3592             :          * Set path to point at the new file in pg_wal.
    3593             :          */
    3594           0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
    3595             :     }
    3596             : 
    3597           3 :     fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
    3598           3 :     if (fd >= 0)
    3599             :     {
    3600             :         /* Success! */
    3601           3 :         curFileTLI = tli;
    3602             : 
    3603             :         /* Report recovery progress in PS display */
    3604           3 :         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
    3605             :                  xlogfname);
    3606           3 :         set_ps_display(activitymsg, false);
    3607             : 
    3608             :         /* Track source of data in assorted state variables */
    3609           3 :         readSource = source;
    3610           3 :         XLogReceiptSource = source;
    3611             :         /* In FROM_STREAM case, caller tracks receipt time, not me */
    3612           3 :         if (source != XLOG_FROM_STREAM)
    3613           3 :             XLogReceiptTime = GetCurrentTimestamp();
    3614             : 
    3615           3 :         return fd;
    3616             :     }
    3617           0 :     if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
    3618           0 :         ereport(PANIC,
    3619             :                 (errcode_for_file_access(),
    3620             :                  errmsg("could not open file \"%s\": %m", path)));
    3621           0 :     return -1;
    3622             : }
    3622             : 
    3623             : /*
    3624             :  * Open a logfile segment for reading (during recovery).  Returns the fd; if
    3625             :  * none is found, reports that at level emode and (if that returns) gives -1.
    3626             :  * This version searches for the segment with any TLI listed in expectedTLEs.
    3627             :  */
    3628             : static int
    3629           3 : XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
    3630             : {
    3631             :     char        path[MAXPGPATH];
    3632             :     ListCell   *cell;
    3633             :     int         fd;
    3634             :     List       *tles;
    3635             : 
    3636             :     /*
    3637             :      * Loop looking for a suitable timeline ID: we might need to read any of
    3638             :      * the timelines listed in expectedTLEs.
    3639             :      *
    3640             :      * We expect curFileTLI on entry to be the TLI of the preceding file in
    3641             :      * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
    3642             :      * to go backwards; this prevents us from picking up the wrong file when a
    3643             :      * parent timeline extends to higher segment numbers than the child we
    3644             :      * want to read.
    3645             :      *
    3646             :      * If we haven't read the timeline history file yet, read it now, so that
    3647             :      * we know which TLIs to scan.  We don't save the list in expectedTLEs,
    3648             :      * however, unless we actually find a valid segment.  That way if there is
    3649             :      * neither a timeline history file nor a WAL segment in the archive, and
    3650             :      * streaming replication is set up, we'll read the timeline history file
    3651             :      * streamed from the master when we start streaming, instead of recovering
    3652             :      * with a dummy history generated here.
    3653             :      */
    3654           3 :     if (expectedTLEs)
    3655           0 :         tles = expectedTLEs;
    3656             :     else
    3657           3 :         tles = readTimeLineHistory(recoveryTargetTLI);
    3658             : 
    3659           3 :     foreach(cell, tles)
    3660             :     {
    3661           3 :         TimeLineID  tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
    3662             : 
    3663           3 :         if (tli < curFileTLI)
    3664           0 :             break;              /* don't bother looking at too-old TLIs */
    3665             : 
    3666           3 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
    3667             :         {
    3668           0 :             fd = XLogFileRead(segno, emode, tli,
    3669             :                               XLOG_FROM_ARCHIVE, true);
    3670           0 :             if (fd != -1)
    3671             :             {
    3672           0 :                 elog(DEBUG1, "got WAL segment from archive");
    3673           0 :                 if (!expectedTLEs)
    3674           0 :                     expectedTLEs = tles;
    3675           0 :                 return fd;
    3676             :             }
    3677             :         }
    3678             : 
    3679           3 :         if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
    3680             :         {
    3681           3 :             fd = XLogFileRead(segno, emode, tli,
    3682             :                               XLOG_FROM_PG_WAL, true);
    3683           3 :             if (fd != -1)
    3684             :             {
    3685           3 :                 if (!expectedTLEs)
    3686           3 :                     expectedTLEs = tles;
    3687           3 :                 return fd;
    3688             :             }
    3689             :         }
    3690             :     }
    3691             : 
    3692             :     /* Couldn't find it.  For simplicity, complain about front timeline */
    3693           0 :     XLogFilePath(path, recoveryTargetTLI, segno);
    3694           0 :     errno = ENOENT;
    3695           0 :     ereport(emode,
    3696             :             (errcode_for_file_access(),
    3697             :              errmsg("could not open file \"%s\": %m", path)));
    3698           0 :     return -1;
    3699             : }
    3700             : 
    3701             : /*
    3702             :  * Close the logfile segment currently open for writing (openLogFile).
    3703             :  */
    3704             : static void
    3705          53 : XLogFileClose(void)
    3706             : {
    3707          53 :     Assert(openLogFile >= 0);
    3708             : 
    3709             :     /*
    3710             :      * WAL segment files will not be re-read in normal operation, so we advise
    3711             :      * the OS to release any cached pages.  But do not do so if WAL archiving
    3712             :      * or streaming is active, because archiver and walsender process could
    3713             :      * use the cache to read the WAL segment.
    3714             :      */
    3715             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    3716          53 :     if (!XLogIsNeeded())
    3717           0 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    3718             : #endif
    3719             : 
    3720          53 :     if (close(openLogFile))
    3721           0 :         ereport(PANIC,
    3722             :                 (errcode_for_file_access(),
    3723             :                  errmsg("could not close log file %s: %m",
    3724             :                         XLogFileNameP(ThisTimeLineID, openLogSegNo))));
    3725          53 :     openLogFile = -1;
    3726          53 : }
    3727             : 
    3728             : /*
    3729             :  * Preallocate log files beyond the specified log endpoint.  A segment that
    3730             :  * had to be newly created is counted in CheckpointStats.ckpt_segs_added.
    3731             :  * XXX this is currently extremely conservative, since it forces only one
    3732             :  * future log segment to exist, and even that only if we are 75% done with
    3733             :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    3734             :  * High-volume systems will be OK once they've built up a sufficient set of
    3735             :  * recycled log segments, but the startup transient is likely to include
    3736             :  * a lot of segment creations by foreground processes, which is not so good.
    3737             :  */
    3738             : static void
    3739          11 : PreallocXlogFiles(XLogRecPtr endptr)
    3740             : {
    3741             :     XLogSegNo   _logSegNo;
    3742             :     int         lf;
    3743             :     bool        use_existent;
    3744             : 
    3745          11 :     XLByteToPrevSeg(endptr, _logSegNo);
    3746          11 :     if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
    3747             :     {
    3748           0 :         _logSegNo++;
    3749           0 :         use_existent = true;
    3750           0 :         lf = XLogFileInit(_logSegNo, &use_existent, true);
    3751           0 :         close(lf);
    3752           0 :         if (!use_existent)
    3753           0 :             CheckpointStats.ckpt_segs_added++;
    3754             :     }
    3755          11 : }
    3756             : 
    3757             : /*
    3758             :  * Throws an error if the given log segment has already been removed or
    3759             :  * recycled (segno <= XLogCtl->lastRemovedSegNo). The caller should only pass
    3760             :  * a segment that it knows to have existed while the server has been running,
    3761             :  * as this function always succeeds if no WAL segments have been removed
    3762             :  * since startup.  'tli' is only used in the error message.
    3763             :  */
    3764             : void
    3765           0 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
    3766             : {
    3767             :     XLogSegNo   lastRemovedSegNo;
    3768             : 
    3769           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    3770           0 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3771           0 :     SpinLockRelease(&XLogCtl->info_lck);
    3772             : 
    3773           0 :     if (segno <= lastRemovedSegNo)
    3774             :     {
    3775             :         char        filename[MAXFNAMELEN];
    3776             : 
    3777           0 :         XLogFileName(filename, tli, segno);
    3778           0 :         ereport(ERROR,
    3779             :                 (errcode_for_file_access(),
    3780             :                  errmsg("requested WAL segment %s has already been removed",
    3781             :                         filename)));
    3782             :     }
    3783           0 : }
    3784             : 
    3785             : /*
    3786             :  * Return the last WAL segment removed, or 0 if no segment has been removed
    3787             :  * since startup.  The value is XLogCtl->lastRemovedSegNo, read under info_lck.
    3788             :  *
    3789             :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
    3790             :  * with that.
    3791             :  */
    3792             : XLogSegNo
    3793           0 : XLogGetLastRemovedSegno(void)
    3794             : {
    3795             :     XLogSegNo   lastRemovedSegNo;
    3796             : 
    3797           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    3798           0 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3799           0 :     SpinLockRelease(&XLogCtl->info_lck);
    3800             : 
    3801           0 :     return lastRemovedSegNo;
    3802             : }
    3803             : 
    3804             : /*
    3805             :  * Advance the last removed segno pointer in shared memory to the segment
    3806             :  * named by filename, unless it is already past it; the file's TLI is ignored.
    3807             :  */
    3808             : static void
    3809           0 : UpdateLastRemovedPtr(char *filename)
    3810             : {
    3811             :     uint32      tli;
    3812             :     XLogSegNo   segno;
    3813             : 
    3814           0 :     XLogFromFileName(filename, &tli, &segno);
    3815             : 
    3816           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    3817           0 :     if (segno > XLogCtl->lastRemovedSegNo)
    3818           0 :         XLogCtl->lastRemovedSegNo = segno;
    3819           0 :     SpinLockRelease(&XLogCtl->info_lck);
    3820           0 : }
    3821             : 
    3822             : /*
    3823             :  * Recycle or remove all log files older than or equal to the passed segno.
    3824             :  *
    3825             :  * endptr is current (or recent) end of xlog, and PriorRedoPtr is the
    3826             :  * redo pointer of the previous checkpoint. These are used to determine
    3827             :  * whether we want to recycle rather than delete no-longer-wanted log files.
    3828             :  */
    3829             : static void
    3830          11 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
    3831             : {
    3832             :     DIR        *xldir;
    3833             :     struct dirent *xlde;
    3834             :     char        lastoff[MAXFNAMELEN];
    3835             : 
    3836          11 :     xldir = AllocateDir(XLOGDIR);
    3837          11 :     if (xldir == NULL)
    3838           0 :         ereport(ERROR,
    3839             :                 (errcode_for_file_access(),
    3840             :                  errmsg("could not open write-ahead log directory \"%s\": %m",
    3841             :                         XLOGDIR)));
    3842             : 
    3843             :     /*
    3844             :      * Construct a filename of the last segment to be removed. The timeline
    3845             :      * ID doesn't matter, we ignore that in the comparison. (During recovery,
    3846             :      * ThisTimeLineID isn't set, so we can't use that.)
    3847             :      */
    3848          11 :     XLogFileName(lastoff, 0, segno);
    3849             : 
    3850          11 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
    3851             :          lastoff);
    3852             : 
    3853          76 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3854             :     {
    3855             :         /* Ignore files that are not XLOG segments */
    3856          87 :         if (!IsXLogFileName(xlde->d_name) &&
    3857          33 :             !IsPartialXLogFileName(xlde->d_name))
    3858          33 :             continue;
    3859             : 
    3860             :         /*
    3861             :          * We ignore the timeline part of the XLOG segment identifiers in
    3862             :          * deciding whether a segment is still needed.  This ensures that we
    3863             :          * won't prematurely remove a segment from a parent timeline. We could
    3864             :          * probably be a little more proactive about removing segments of
    3865             :          * non-parent timelines, but that would be a whole lot more
    3866             :          * complicated.
    3867             :          *
    3868             :          * We use the alphanumeric sorting property of the filenames to decide
    3869             :          * which ones are earlier than or equal to the lastoff segment.
    3870             :          */
    3871          21 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    3872             :         {
    3873           0 :             if (XLogArchiveCheckDone(xlde->d_name))
    3874             :             {
    3875             :                 /* Update the last removed location in shared memory first */
    3876           0 :                 UpdateLastRemovedPtr(xlde->d_name);
    3877             : 
    3878           0 :                 RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
    3879             :             }
    3880             :         }
    3881             :     }
    3882             : 
    3883          11 :     FreeDir(xldir);
    3884          11 : }
    3885             : 
    3886             : /*
    3887             :  * Remove WAL files that are not part of the given timeline's history.
    3888             :  *
    3889             :  * This is called during recovery, whenever we switch to follow a new
    3890             :  * timeline, and at the end of recovery when we create a new timeline. We
    3891             :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
    3892             :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
    3893             :  * that we haven't used yet, and contain garbage. If we just leave them in
    3894             :  * pg_wal, they will eventually be archived, and we can't let that happen.
    3895             :  * Files that belong to our timeline history are valid, because we have
    3896             :  * successfully replayed them, but from others we can't be sure.
    3897             :  *
    3898             :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
    3899             :  * and 'newTLI' is the new timeline we switch to.
    3900             :  */
    3901             : static void
    3902           0 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
    3903             : {
    3904             :     DIR        *xldir;
    3905             :     struct dirent *xlde;
    3906             :     char        switchseg[MAXFNAMELEN];
    3907             :     XLogSegNo   endLogSegNo;
    3908             : 
    3909           0 :     XLByteToPrevSeg(switchpoint, endLogSegNo);
    3910             : 
    3911           0 :     xldir = AllocateDir(XLOGDIR);
    3912           0 :     if (xldir == NULL)
    3913           0 :         ereport(ERROR,
    3914             :                 (errcode_for_file_access(),
    3915             :                  errmsg("could not open write-ahead log directory \"%s\": %m",
    3916             :                         XLOGDIR)));
    3917             : 
    3918             :     /*
    3919             :      * Construct a filename of the last segment to be kept.
    3920             :      */
    3921           0 :     XLogFileName(switchseg, newTLI, endLogSegNo);
    3922             : 
    3923           0 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
    3924             :          switchseg);
    3925             : 
    3926           0 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3927             :     {
    3928             :         /* Ignore files that are not XLOG segments */
    3929           0 :         if (!IsXLogFileName(xlde->d_name))
    3930           0 :             continue;
    3931             : 
    3932             :         /*
    3933             :          * Remove files that are on a timeline older than the new one we're
    3934             :          * switching to, but with a segment number >= the first segment on the
    3935             :          * new timeline.
    3936             :          */
    3937           0 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
    3938           0 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
    3939             :         {
    3940             :             /*
    3941             :              * If the file has already been marked as .ready, however, don't
    3942             :              * remove it yet. It should be OK to remove it - files that are
    3943             :              * not part of our timeline history are not required for recovery
    3944             :              * - but it seems safer to let them be archived and removed later.
    3945             :              */
    3946           0 :             if (!XLogArchiveIsReady(xlde->d_name))
    3947           0 :                 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
    3948             :         }
    3949             :     }
    3950             : 
    3951           0 :     FreeDir(xldir);
    3952           0 : }
    3953             : 
    3954             : /*
    3955             :  * Recycle or remove a log file that's no longer needed.
    3956             :  *
    3957             :  * endptr is current (or recent) end of xlog, and PriorRedoPtr is the
    3958             :  * redo pointer of the previous checkpoint. These are used to determine
    3959             :  * whether we want to recycle rather than delete no-longer-wanted log files.
    3960             :  * If PriorRedoPtr is not known, pass InvalidXLogRecPtr, and the function
    3961             :  * will recycle, somewhat arbitrarily, up to 10 future segments.
    3962             :  */
    3963             : static void
    3964           0 : RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
    3965             : {
    3966             :     char        path[MAXPGPATH];
    3967             : #ifdef WIN32
    3968             :     char        newpath[MAXPGPATH];
    3969             : #endif
    3970             :     struct stat statbuf;
    3971             :     XLogSegNo   endlogSegNo;
    3972             :     XLogSegNo   recycleSegNo;
    3973             : 
    3974             :     /*
    3975             :      * Initialize info about where to try to recycle to.
    3976             :      */
    3977           0 :     XLByteToSeg(endptr, endlogSegNo);
    3978           0 :     if (PriorRedoPtr == InvalidXLogRecPtr)
    3979           0 :         recycleSegNo = endlogSegNo + 10;
    3980             :     else
    3981           0 :         recycleSegNo = XLOGfileslop(PriorRedoPtr);
    3982             : 
    3983           0 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
    3984             : 
    3985             :     /*
    3986             :      * Before deleting the file, see if it can be recycled as a future log
    3987             :      * segment. Only recycle normal files, pg_standby for example can create
    3988             :      * symbolic links pointing to a separate archive directory.
    3989             :      */
    3990           0 :     if (endlogSegNo <= recycleSegNo &&
    3991           0 :         lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
    3992           0 :         InstallXLogFileSegment(&endlogSegNo, path,
    3993             :                                true, recycleSegNo, true))
    3994             :     {
    3995           0 :         ereport(DEBUG2,
    3996             :                 (errmsg("recycled write-ahead log file \"%s\"",
    3997             :                         segname)));
    3998           0 :         CheckpointStats.ckpt_segs_recycled++;
    3999             :         /* Needn't recheck that slot on future iterations */
    4000           0 :         endlogSegNo++;
    4001             :     }
    4002             :     else
    4003             :     {
    4004             :         /* No need for any more future segments... */
    4005             :         int         rc;
    4006             : 
    4007           0 :         ereport(DEBUG2,
    4008             :                 (errmsg("removing write-ahead log file \"%s\"",
    4009             :                         segname)));
    4010             : 
    4011             : #ifdef WIN32
    4012             : 
    4013             :         /*
    4014             :          * On Windows, if another process (e.g. another backend) holds the file
    4015             :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
    4016             :          * will still show up in directory listing until the last handle is
    4017             :          * closed. To avoid confusing the lingering deleted file for a live
    4018             :          * WAL file that needs to be archived, rename it before deleting it.
    4019             :          *
    4020             :          * If another process holds the file open without FILE_SHARE_DELETE
    4021             :          * flag, rename will fail. We'll try again at the next checkpoint.
    4022             :          */
    4023             :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
    4024             :         if (rename(path, newpath) != 0)
    4025             :         {
    4026             :             ereport(LOG,
    4027             :                     (errcode_for_file_access(),
    4028             :                      errmsg("could not rename old write-ahead log file \"%s\": %m",
    4029             :                             path)));
    4030             :             return;
    4031             :         }
    4032             :         rc = durable_unlink(newpath, LOG);
    4033             : #else
    4034           0 :         rc = durable_unlink(path, LOG);
    4035             : #endif
    4036           0 :         if (rc != 0)
    4037             :         {
    4038             :             /* Message already logged by durable_unlink() */
    4039           0 :             return;
    4040             :         }
    4041           0 :         CheckpointStats.ckpt_segs_removed++;
    4042             :     }
    4043             : 
    4044           0 :     XLogArchiveCleanup(segname);
    4045             : }
    4046             : 
    4047             : /*
    4048             :  * Verify whether pg_wal and pg_wal/archive_status exist.
    4049             :  * If the latter does not exist, recreate it.
    4050             :  *
    4051             :  * It is not the goal of this function to verify the contents of these
    4052             :  * directories, but to help in cases where someone has performed a cluster
    4053             :  * copy for PITR purposes but omitted pg_wal from the copy.
    4054             :  *
    4055             :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
    4056             :  * policy decision was made not to.  It is fairly common for pg_wal to be
    4057             :  * a symlink, and if that was the DBA's intent then automatically making a
    4058             :  * plain directory would result in degraded performance with no notice.
    4059             :  */
    4060             : static void
    4061           3 : ValidateXLOGDirectoryStructure(void)
    4062             : {
    4063             :     char        path[MAXPGPATH];
    4064             :     struct stat stat_buf;
    4065             : 
    4066             :     /* Check for pg_wal; if it doesn't exist, error out */
    4067           6 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
    4068           3 :         !S_ISDIR(stat_buf.st_mode))
    4069           0 :         ereport(FATAL,
    4070             :                 (errmsg("required WAL directory \"%s\" does not exist",
    4071             :                         XLOGDIR)));
    4072             : 
    4073             :     /* Check for archive_status */
    4074           3 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
    4075           3 :     if (stat(path, &stat_buf) == 0)
    4076             :     {
    4077             :         /* Check for weird cases where it exists but isn't a directory */
    4078           3 :         if (!S_ISDIR(stat_buf.st_mode))
    4079           0 :             ereport(FATAL,
    4080             :                     (errmsg("required WAL directory \"%s\" does not exist",
    4081             :                             path)));
    4082             :     }
    4083             :     else
    4084             :     {
    4085           0 :         ereport(LOG,
    4086             :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    4087           0 :         if (mkdir(path, S_IRWXU) < 0)
    4088           0 :             ereport(FATAL,
    4089             :                     (errmsg("could not create missing directory \"%s\": %m",
    4090             :                             path)));
    4091             :     }
    4092           3 : }
    4093             : 
    4094             : /*
    4095             :  * Remove previous backup history files.  This also retries creation of
    4096             :  * .ready files for any backup history files for which XLogArchiveNotify
    4097             :  * failed earlier.
    4098             :  */
    4099             : static void
    4100           0 : CleanupBackupHistory(void)
    4101             : {
    4102             :     DIR        *xldir;
    4103             :     struct dirent *xlde;
    4104             :     char        path[MAXPGPATH + sizeof(XLOGDIR)];
    4105             : 
    4106           0 :     xldir = AllocateDir(XLOGDIR);
    4107           0 :     if (xldir == NULL)
    4108           0 :         ereport(ERROR,
    4109             :                 (errcode_for_file_access(),
    4110             :                  errmsg("could not open write-ahead log directory \"%s\": %m",
    4111             :                         XLOGDIR)));
    4112             : 
    4113           0 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    4114             :     {
    4115           0 :         if (IsBackupHistoryFileName(xlde->d_name))
    4116             :         {
    4117           0 :             if (XLogArchiveCheckDone(xlde->d_name))
    4118             :             {
    4119           0 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
    4120             :                      xlde->d_name);
    4121           0 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
    4122           0 :                 unlink(path);
    4123           0 :                 XLogArchiveCleanup(xlde->d_name);
    4124             :             }
    4125             :         }
    4126             :     }
    4127             : 
    4128           0 :     FreeDir(xldir);
    4129           0 : }
    4130             : 
    4131             : /*
    4132             :  * Attempt to read an XLOG record.
    4133             :  *
    4134             :  * If RecPtr is valid, try to read a record at that position.  Otherwise
    4135             :  * try to read a record just after the last one previously read.
    4136             :  *
    4137             :  * If no valid record is available, returns NULL, or fails if emode is PANIC.
    4138             :  * (emode must be either PANIC or LOG.) In standby mode, retries until a valid
    4139             :  * record is available.
    4140             :  *
    4141             :  * The record is copied into readRecordBuf, so that on successful return,
    4142             :  * the returned record pointer always points there.
    4143             :  */
    4144             : static XLogRecord *
    4145           6 : ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
    4146             :            bool fetching_ckpt)
    4147             : {
    4148             :     XLogRecord *record;
    4149           6 :     XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
    4150             : 
    4151             :     /* Pass through parameters to XLogPageRead */
    4152           6 :     private->fetching_ckpt = fetching_ckpt;
    4153           6 :     private->emode = emode;
    4154           6 :     private->randAccess = (RecPtr != InvalidXLogRecPtr);
    4155             : 
    4156             :     /* This is the first attempt to read this page. */
    4157           6 :     lastSourceFailed = false;
    4158             : 
    4159             :     for (;;)
    4160             :     {
    4161             :         char       *errormsg;
    4162             : 
    4163           6 :         record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
    4164           6 :         ReadRecPtr = xlogreader->ReadRecPtr;
    4165           6 :         EndRecPtr = xlogreader->EndRecPtr;
    4166           6 :         if (record == NULL)
    4167             :         {
    4168           0 :             if (readFile >= 0)
    4169             :             {
    4170           0 :                 close(readFile);
    4171           0 :                 readFile = -1;
    4172             :             }
    4173             : 
    4174             :             /*
    4175             :              * We only end up here without a message when XLogPageRead()
    4176             :              * failed - in that case we already logged something. In
    4177             :              * StandbyMode that only happens if we have been triggered, so we
    4178             :              * shouldn't loop anymore in that case.
    4179             :              */
    4180           0 :             if (errormsg)
    4181           0 :                 ereport(emode_for_corrupt_record(emode,
    4182             :                                                  RecPtr ? RecPtr : EndRecPtr),
    4183             :                         (errmsg_internal("%s", errormsg) /* already translated */ ));
    4184             :         }
    4185             : 
    4186             :         /*
    4187             :          * Check page TLI is one of the expected values.
    4188             :          */
    4189           6 :         else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
    4190             :         {
    4191             :             char        fname[MAXFNAMELEN];
    4192             :             XLogSegNo   segno;
    4193             :             int32       offset;
    4194             : 
    4195           0 :             XLByteToSeg(xlogreader->latestPagePtr, segno);
    4196           0 :             offset = xlogreader->latestPagePtr % XLogSegSize;
    4197           0 :             XLogFileName(fname, xlogreader->readPageTLI, segno);
    4198           0 :             ereport(emode_for_corrupt_record(emode,
    4199             :                                              RecPtr ? RecPtr : EndRecPtr),
    4200             :                     (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
    4201             :                             xlogreader->latestPageTLI,
    4202             :                             fname,
    4203             :                             offset)));
    4204           0 :             record = NULL;
    4205             :         }
    4206             : 
    4207           6 :         if (record)
    4208             :         {
    4209             :             /* Great, got a record */
    4210          12 :             return record;
    4211             :         }
    4212             :         else
    4213             :         {
    4214             :             /* No valid record available from this source */
    4215           0 :             lastSourceFailed = true;
    4216             : 
    4217             :             /*
    4218             :              * If archive recovery was requested, but we were still doing
    4219             :              * crash recovery, switch to archive recovery and retry using the
    4220             :              * offline archive. We have now replayed all the valid WAL in
    4221             :              * pg_wal, so we are presumably now consistent.
    4222             :              *
    4223             :              * We require that there's at least some valid WAL present in
    4224             :              * pg_wal, however (!fetching_ckpt).  We could recover using the
    4225             :              * WAL from the archive, even if pg_wal is completely empty, but
    4226             :              * we'd have no idea how far we'd have to replay to reach
    4227             :              * consistency.  So err on the safe side and give up.
    4228             :              */
    4229           0 :             if (!InArchiveRecovery && ArchiveRecoveryRequested &&
    4230             :                 !fetching_ckpt)
    4231             :             {
    4232           0 :                 ereport(DEBUG1,
    4233             :                         (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
    4234           0 :                 InArchiveRecovery = true;
    4235           0 :                 if (StandbyModeRequested)
    4236           0 :                     StandbyMode = true;
    4237             : 
    4238             :                 /* initialize minRecoveryPoint to this record */
    4239           0 :                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    4240           0 :                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    4241           0 :                 if (ControlFile->minRecoveryPoint < EndRecPtr)
    4242             :                 {
    4243           0 :                     ControlFile->minRecoveryPoint = EndRecPtr;
    4244           0 :                     ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    4245             :                 }
    4246             :                 /* update local copy */
    4247           0 :                 minRecoveryPoint = ControlFile->minRecoveryPoint;
    4248           0 :                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    4249             : 
    4250           0 :                 UpdateControlFile();
    4251           0 :                 LWLockRelease(ControlFileLock);
    4252             : 
    4253           0 :                 CheckRecoveryConsistency();
    4254             : 
    4255             :                 /*
    4256             :                  * Before we retry, reset lastSourceFailed and currentSource
    4257             :                  * so that we will check the archive next.
    4258             :                  */
    4259           0 :                 lastSourceFailed = false;
    4260           0 :                 currentSource = 0;
    4261             : 
    4262           0 :                 continue;
    4263             :             }
    4264             : 
    4265             :             /* In standby mode, loop back to retry. Otherwise, give up. */
    4266           0 :             if (StandbyMode && !CheckForStandbyTrigger())
    4267           0 :                 continue;
    4268             :             else
    4269           0 :                 return NULL;
    4270             :         }
    4271           0 :     }
    4272             : }
    4273             : 
    4274             : /*
    4275             :  * Scan for new timelines that might have appeared in the archive since we
    4276             :  * started recovery.
    4277             :  *
    4278             :  * If there are any, the function changes recovery target TLI to the latest
    4279             :  * one and returns 'true'.
    4280             :  */
    4281             : static bool
    4282           0 : rescanLatestTimeLine(void)
    4283             : {
    4284             :     List       *newExpectedTLEs;
    4285             :     bool        found;
    4286             :     ListCell   *cell;
    4287             :     TimeLineID  newtarget;
    4288           0 :     TimeLineID  oldtarget = recoveryTargetTLI;
    4289           0 :     TimeLineHistoryEntry *currentTle = NULL;
    4290             : 
    4291           0 :     newtarget = findNewestTimeLine(recoveryTargetTLI);
    4292           0 :     if (newtarget == recoveryTargetTLI)
    4293             :     {
    4294             :         /* No new timelines found */
    4295           0 :         return false;
    4296             :     }
    4297             : 
    4298             :     /*
    4299             :      * Determine the list of expected TLIs for the new TLI
    4300             :      */
    4301             : 
    4302           0 :     newExpectedTLEs = readTimeLineHistory(newtarget);
    4303             : 
    4304             :     /*
    4305             :      * If the current timeline is not part of the history of the new timeline,
    4306             :      * we cannot proceed to it.
    4307             :      */
    4308           0 :     found = false;
    4309           0 :     foreach(cell, newExpectedTLEs)
    4310             :     {
    4311           0 :         currentTle = (TimeLineHistoryEntry *) lfirst(cell);
    4312             : 
    4313           0 :         if (currentTle->tli == recoveryTargetTLI)
    4314             :         {
    4315           0 :             found = true;
    4316           0 :             break;
    4317             :         }
    4318             :     }
    4319           0 :     if (!found)
    4320             :     {
    4321           0 :         ereport(LOG,
    4322             :                 (errmsg("new timeline %u is not a child of database system timeline %u",
    4323             :                         newtarget,
    4324             :                         ThisTimeLineID)));
    4325           0 :         return false;
    4326             :     }
    4327             : 
    4328             :     /*
    4329             :      * The current timeline was found in the history file, but check that the
    4330             :      * next timeline was forked off from it *after* the current recovery
    4331             :      * location.
    4332             :      */
    4333           0 :     if (currentTle->end < EndRecPtr)
    4334             :     {
    4335           0 :         ereport(LOG,
    4336             :                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
    4337             :                         newtarget,
    4338             :                         ThisTimeLineID,
    4339             :                         (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
    4340           0 :         return false;
    4341             :     }
    4342             : 
    4343             :     /* The new timeline history seems valid. Switch target */
    4344           0 :     recoveryTargetTLI = newtarget;
    4345           0 :     list_free_deep(expectedTLEs);
    4346           0 :     expectedTLEs = newExpectedTLEs;
    4347             : 
    4348             :     /*
    4349             :      * As in StartupXLOG(), try to ensure we have all the history files
    4350             :      * between the old target and new target in pg_wal.
    4351             :      */
    4352           0 :     restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
    4353             : 
    4354           0 :     ereport(LOG,
    4355             :             (errmsg("new target timeline is %u",
    4356             :                     recoveryTargetTLI)));
    4357             : 
    4358           0 :     return true;
    4359             : }
    4360             : 
    4361             : /*
    4362             :  * I/O routines for pg_control
    4363             :  *
    4364             :  * *ControlFile is a buffer in shared memory that holds an image of the
    4365             :  * contents of pg_control.  WriteControlFile() initializes pg_control
    4366             :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    4367             :  * the pg_control file (during postmaster or standalone-backend startup),
    4368             :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    4369             :  *
    4370             :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    4371             :  * that are related to checking backend/database compatibility, and
    4372             :  * ReadControlFile() verifies they are correct.  We could split out the
    4373             :  * I/O and compatibility-check functions, but there seems no need currently.
    4374             :  */
    4375             : static void                     /* create pg_control from *ControlFile; every failure below is a PANIC */
    4376           1 : WriteControlFile(void)
    4377             : {
    4378             :     int         fd;             /* descriptor of the newly created pg_control */
    4379             :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
    4380             : 
    4381             :     /*
    4382             :      * Ensure that the size of the pg_control data structure is sane.  See the
    4383             :      * comments for these symbols in pg_control.h.
    4384             :      */
    4385             :     StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
    4386             :                      "pg_control is too large for atomic disk writes");
    4387             :     StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
    4388             :                      "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
    4389             : 
    4390             :     /*
    4391             :      * Initialize version and compatibility-check fields (ReadControlFile() tests this same set of fields)
    4392             :      */
    4393           1 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
    4394           1 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    4395             : 
    4396           1 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    4397           1 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
    4398             : 
    4399           1 :     ControlFile->blcksz = BLCKSZ;
    4400           1 :     ControlFile->relseg_size = RELSEG_SIZE;
    4401           1 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    4402           1 :     ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
    4403             : 
    4404           1 :     ControlFile->nameDataLen = NAMEDATALEN;
    4405           1 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    4406             : 
    4407           1 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    4408           1 :     ControlFile->loblksize = LOBLKSIZE;
    4409             : 
    4410           1 :     ControlFile->float4ByVal = FLOAT4PASSBYVAL;
    4411           1 :     ControlFile->float8ByVal = FLOAT8PASSBYVAL;
    4412             : 
    4413             :     /* Contents are protected with a CRC computed over all bytes that precede the crc field */
    4414           1 :     INIT_CRC32C(ControlFile->crc);
    4415           1 :     COMP_CRC32C(ControlFile->crc,
    4416             :                 (char *) ControlFile,
    4417             :                 offsetof(ControlFileData, crc));
    4418           1 :     FIN_CRC32C(ControlFile->crc);
    4419             : 
    4420             :     /*
    4421             :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
    4422             :      * the excess over sizeof(ControlFileData).  This reduces the odds of
    4423             :      * premature-EOF errors when reading pg_control.  We'll still fail when we
    4424             :      * check the contents of the file, but hopefully with a more specific
    4425             :      * error than "couldn't read pg_control".
    4426             :      */
    4427           1 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
    4428           1 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
    4429             : 
    4430           1 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4431             :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY,  /* O_EXCL: fail if the file already exists */
    4432             :                        S_IRUSR | S_IWUSR);     /* owner read/write only */
    4433           1 :     if (fd < 0)
    4434           0 :         ereport(PANIC,
    4435             :                 (errcode_for_file_access(),
    4436             :                  errmsg("could not create control file \"%s\": %m",
    4437             :                         XLOG_CONTROL_FILE)));
    4438             : 
    4439           1 :     errno = 0;                  /* lets a failed write that sets no errno be recognized below */
    4440           1 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
    4441           1 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
    4442             :     {
    4443             :         /* if write didn't set errno, assume problem is no disk space */
    4444           0 :         if (errno == 0)
    4445           0 :             errno = ENOSPC;
    4446           0 :         ereport(PANIC,
    4447             :                 (errcode_for_file_access(),
    4448             :                  errmsg("could not write to control file: %m")));
    4449             :     }
    4450           1 :     pgstat_report_wait_end();
    4451             : 
    4452           1 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
    4453           1 :     if (pg_fsync(fd) != 0)
    4454           0 :         ereport(PANIC,
    4455             :                 (errcode_for_file_access(),
    4456             :                  errmsg("could not fsync control file: %m")));
    4457           1 :     pgstat_report_wait_end();
    4458             : 
    4459           1 :     if (close(fd))
    4460           0 :         ereport(PANIC,
    4461             :                 (errcode_for_file_access(),
    4462             :                  errmsg("could not close control file: %m")));
    4463           1 : }
    4464             : 
    4465             : static void                     /* load pg_control into *ControlFile and validate it: I/O failures PANIC, mismatches are FATAL */
    4466           5 : ReadControlFile(void)
    4467             : {
    4468             :     pg_crc32c   crc;            /* CRC recomputed from the bytes just read */
    4469             :     int         fd;
    4470             : 
    4471             :     /*
    4472             :      * Read data... directly into *ControlFile; only sizeof(ControlFileData) bytes are requested.
    4473             :      */
    4474           5 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4475             :                        O_RDWR | PG_BINARY,     /* NOTE(review): opened read-write, yet fd is only passed to read() and close() here */
    4476             :                        S_IRUSR | S_IWUSR);
    4477           5 :     if (fd < 0)
    4478           0 :         ereport(PANIC,
    4479             :                 (errcode_for_file_access(),
    4480             :                  errmsg("could not open control file \"%s\": %m",
    4481             :                         XLOG_CONTROL_FILE)));
    4482             :     /* NOTE(review): a short read() (count >= 0) presumably leaves errno unset, so the %m below may report an unrelated error - confirm */
    4483           5 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
    4484           5 :     if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
    4485           0 :         ereport(PANIC,
    4486             :                 (errcode_for_file_access(),
    4487             :                  errmsg("could not read from control file: %m")));
    4488           5 :     pgstat_report_wait_end();
    4489             : 
    4490           5 :     close(fd);                  /* NOTE(review): close() result is not checked here */
    4491             : 
    4492             :     /*
    4493             :      * Check for expected pg_control format version.  If this is wrong, the
    4494             :      * CRC check will likely fail because we'll be checking the wrong number
    4495             :      * of bytes.  Complaining about wrong version will probably be more
    4496             :      * enlightening than complaining about wrong CRC.
    4497             :      */
    4498             :     /* First case: low 16 bits all zero but higher bits set, which is reported with a byte-ordering hint. */
    4499           5 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
    4500           0 :         ereport(FATAL,
    4501             :                 (errmsg("database files are incompatible with server"),
    4502             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    4503             :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    4504             :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
    4505             :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    4506             :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    4507             : 
    4508           5 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    4509           0 :         ereport(FATAL,
    4510             :                 (errmsg("database files are incompatible with server"),
    4511             :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    4512             :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
    4513             :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
    4514             :                  errhint("It looks like you need to initdb.")));
    4515             : 
    4516             :     /* Now check the CRC, recomputed over the bytes that precede the crc field. */
    4517           5 :     INIT_CRC32C(crc);
    4518           5 :     COMP_CRC32C(crc,
    4519             :                 (char *) ControlFile,
    4520             :                 offsetof(ControlFileData, crc));
    4521           5 :     FIN_CRC32C(crc);
    4522             : 
    4523           5 :     if (!EQ_CRC32C(crc, ControlFile->crc))
    4524           0 :         ereport(FATAL,
    4525             :                 (errmsg("incorrect checksum in control file")));
    4526             : 
    4527             :     /*
    4528             :      * Do compatibility checking immediately.  If the database isn't
    4529             :      * compatible with the backend executable, we want to abort before we can
    4530             :      * possibly do any damage.
    4531             :      */
    4532           5 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    4533           0 :         ereport(FATAL,
    4534             :                 (errmsg("database files are incompatible with server"),
    4535             :                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
    4536             :                            " but the server was compiled with CATALOG_VERSION_NO %d.",
    4537             :                            ControlFile->catalog_version_no, CATALOG_VERSION_NO),
    4538             :                  errhint("It looks like you need to initdb.")));
    4539           5 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    4540           0 :         ereport(FATAL,
    4541             :                 (errmsg("database files are incompatible with server"),
    4542             :                  errdetail("The database cluster was initialized with MAXALIGN %d,"
    4543             :                            " but the server was compiled with MAXALIGN %d.",
    4544             :                            ControlFile->maxAlign, MAXIMUM_ALIGNOF),
    4545             :                  errhint("It looks like you need to initdb.")));
    4546           5 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    4547           0 :         ereport(FATAL,
    4548             :                 (errmsg("database files are incompatible with server"),
    4549             :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    4550             :                  errhint("It looks like you need to initdb.")));
    4551           5 :     if (ControlFile->blcksz != BLCKSZ)
    4552           0 :         ereport(FATAL,
    4553             :                 (errmsg("database files are incompatible with server"),
    4554             :                  errdetail("The database cluster was initialized with BLCKSZ %d,"
    4555             :                            " but the server was compiled with BLCKSZ %d.",
    4556             :                            ControlFile->blcksz, BLCKSZ),
    4557             :                  errhint("It looks like you need to recompile or initdb.")));
    4558           5 :     if (ControlFile->relseg_size != RELSEG_SIZE)
    4559           0 :         ereport(FATAL,
    4560             :                 (errmsg("database files are incompatible with server"),
    4561             :                  errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
    4562             :                            " but the server was compiled with RELSEG_SIZE %d.",
    4563             :                            ControlFile->relseg_size, RELSEG_SIZE),
    4564             :                  errhint("It looks like you need to recompile or initdb.")));
    4565           5 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    4566           0 :         ereport(FATAL,
    4567             :                 (errmsg("database files are incompatible with server"),
    4568             :                  errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
    4569             :                            " but the server was compiled with XLOG_BLCKSZ %d.",
    4570             :                            ControlFile->xlog_blcksz, XLOG_BLCKSZ),
    4571             :                  errhint("It looks like you need to recompile or initdb.")));
    4572           5 :     if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
    4573           0 :         ereport(FATAL,
    4574             :                 (errmsg("database files are incompatible with server"),
    4575             :                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
    4576             :                            " but the server was compiled with XLOG_SEG_SIZE %d.",
    4577             :                            ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
    4578             :                  errhint("It looks like you need to recompile or initdb.")));
    4579           5 :     if (ControlFile->nameDataLen != NAMEDATALEN)
    4580           0 :         ereport(FATAL,
    4581             :                 (errmsg("database files are incompatible with server"),
    4582             :                  errdetail("The database cluster was initialized with NAMEDATALEN %d,"
    4583             :                            " but the server was compiled with NAMEDATALEN %d.",
    4584             :                            ControlFile->nameDataLen, NAMEDATALEN),
    4585             :                  errhint("It looks like you need to recompile or initdb.")));
    4586           5 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    4587           0 :         ereport(FATAL,
    4588             :                 (errmsg("database files are incompatible with server"),
    4589             :                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
    4590             :                            " but the server was compiled with INDEX_MAX_KEYS %d.",
    4591             :                            ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
    4592             :                  errhint("It looks like you need to recompile or initdb.")));
    4593           5 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    4594           0 :         ereport(FATAL,
    4595             :                 (errmsg("database files are incompatible with server"),
    4596             :                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
    4597             :                            " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
    4598             :                            ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
    4599             :                  errhint("It looks like you need to recompile or initdb.")));
    4600           5 :     if (ControlFile->loblksize != LOBLKSIZE)
    4601           0 :         ereport(FATAL,
    4602             :                 (errmsg("database files are incompatible with server"),
    4603             :                  errdetail("The database cluster was initialized with LOBLKSIZE %d,"
    4604             :                            " but the server was compiled with LOBLKSIZE %d.",
    4605             :                            ControlFile->loblksize, (int) LOBLKSIZE),
    4606             :                  errhint("It looks like you need to recompile or initdb.")));
    4607             :     /* For the two pass-by-value flags, the preprocessor selects which direction of mismatch is tested. */
    4608             : #ifdef USE_FLOAT4_BYVAL
    4609           5 :     if (ControlFile->float4ByVal != true)
    4610           0 :         ereport(FATAL,
    4611             :                 (errmsg("database files are incompatible with server"),
    4612             :                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
    4613             :                            " but the server was compiled with USE_FLOAT4_BYVAL."),
    4614             :                  errhint("It looks like you need to recompile or initdb.")));
    4615             : #else
    4616             :     if (ControlFile->float4ByVal != false)
    4617             :         ereport(FATAL,
    4618             :                 (errmsg("database files are incompatible with server"),
    4619             :                  errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
    4620             :                            " but the server was compiled without USE_FLOAT4_BYVAL."),
    4621             :                  errhint("It looks like you need to recompile or initdb.")));
    4622             : #endif
    4623             : 
    4624             : #ifdef USE_FLOAT8_BYVAL
    4625             :     if (ControlFile->float8ByVal != true)
    4626             :         ereport(FATAL,
    4627             :                 (errmsg("database files are incompatible with server"),
    4628             :                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
    4629             :                            " but the server was compiled with USE_FLOAT8_BYVAL."),
    4630             :                  errhint("It looks like you need to recompile or initdb.")));
    4631             : #else
    4632           5 :     if (ControlFile->float8ByVal != false)
    4633           0 :         ereport(FATAL,
    4634             :                 (errmsg("database files are incompatible with server"),
    4635             :                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
    4636             :                            " but the server was compiled without USE_FLOAT8_BYVAL."),
    4637             :                  errhint("It looks like you need to recompile or initdb.")));
    4638             : #endif
    4639             : 
    4640             :     /* Make the initdb settings visible as GUC variables, too (here: data_checksums, as "yes" or "no") */
    4641           5 :     SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
    4642             :                     PGC_INTERNAL, PGC_S_OVERRIDE);
    4643           5 : }
    4644             : 
    4645             : void                            /* recompute the CRC of *ControlFile and rewrite pg_control; every failure below is a PANIC */
    4646          18 : UpdateControlFile(void)
    4647             : {
    4648             :     int         fd;
    4649             : 
    4650          18 :     INIT_CRC32C(ControlFile->crc);      /* CRC covers the bytes that precede the crc field */
    4651          18 :     COMP_CRC32C(ControlFile->crc,
    4652             :                 (char *) ControlFile,
    4653             :                 offsetof(ControlFileData, crc));
    4654          18 :     FIN_CRC32C(ControlFile->crc);
    4655             : 
    4656          18 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    4657             :                        O_RDWR | PG_BINARY,     /* no O_CREAT and no O_TRUNC are requested */
    4658             :                        S_IRUSR | S_IWUSR);
    4659          18 :     if (fd < 0)
    4660           0 :         ereport(PANIC,
    4661             :                 (errcode_for_file_access(),
    4662             :                  errmsg("could not open control file \"%s\": %m",
    4663             :                         XLOG_CONTROL_FILE)));
    4664             :     /* Only sizeof(ControlFileData) bytes are written; with no O_TRUNC, later bytes in the file are left as they were. */
    4665          18 :     errno = 0;                  /* lets a failed write that sets no errno be recognized below */
    4666          18 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
    4667          18 :     if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
    4668             :     {
    4669             :         /* if write didn't set errno, assume problem is no disk space */
    4670           0 :         if (errno == 0)
    4671           0 :             errno = ENOSPC;
    4672           0 :         ereport(PANIC,
    4673             :                 (errcode_for_file_access(),
    4674             :                  errmsg("could not write to control file: %m")));
    4675             :     }
    4676          18 :     pgstat_report_wait_end();
    4677             : 
    4678          18 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
    4679          18 :     if (pg_fsync(fd) != 0)
    4680           0 :         ereport(PANIC,
    4681             :                 (errcode_for_file_access(),
    4682             :                  errmsg("could not fsync control file: %m")));
    4683          18 :     pgstat_report_wait_end();
    4684             : 
    4685          18 :     if (close(fd))
    4686           0 :         ereport(PANIC,
    4687             :                 (errcode_for_file_access(),
    4688             :                  errmsg("could not close control file: %m")));
    4689          18 : }
    4690             : 
    4691             : /*
    4692             :  * Returns the unique system identifier from control file (read from the in-memory *ControlFile, not from disk).
    4693             :  */
    4694             : uint64
    4695           0 : GetSystemIdentifier(void)
    4696             : {
    4697           0 :     Assert(ControlFile != NULL);        /* the ControlFile pointer must have been set up already */
    4698           0 :     return ControlFile->system_identifier;
    4699             : }
    4700             : 
    4701             : /*
    4702             :  * Returns the random nonce from control file: ControlFile->mock_authentication_nonce itself, no copy is made.
    4703             :  */
    4704             : char *
    4705           0 : GetMockAuthenticationNonce(void)
    4706             : {
    4707           0 :     Assert(ControlFile != NULL);        /* the ControlFile pointer must have been set up already */
    4708           0 :     return ControlFile->mock_authentication_nonce;
    4709             : }
    4710             : 
    4711             : /*
    4712             :  * Are checksums enabled for data pages?  True when ControlFile->data_checksum_version is greater than zero.
    4713             :  */
    4714             : bool
    4715       27816 : DataChecksumsEnabled(void)
    4716             : {
    4717       27816 :     Assert(ControlFile != NULL);        /* the ControlFile pointer must have been set up already */
    4718       27816 :     return (ControlFile->data_checksum_version > 0);
    4719             : }
    4720             : 
    4721             : /*
    4722             :  * Returns a fake LSN for unlogged relations.
    4723             :  *
    4724             :  * Each call generates an LSN that is greater than any previous value
    4725             :  * returned. The current counter value is saved and restored across clean
    4726             :  * shutdowns, but like unlogged relations, does not survive a crash. This can
    4727             :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
    4728             :  * LSN-like increasing sequence of numbers without writing any WAL.
    4729             :  */
    4730             : XLogRecPtr
    4731           0 : GetFakeLSNForUnloggedRel(void)
    4732             : {
    4733             :     XLogRecPtr  nextUnloggedLSN;        /* the value handed to this caller */
    4734             : 
    4735             :     /* take the current unloggedLSN and advance the counter; both steps happen while holding the ulsn_lck spinlock */
    4736           0 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
    4737           0 :     nextUnloggedLSN = XLogCtl->unloggedLSN++;
    4738           0 :     SpinLockRelease(&XLogCtl->ulsn_lck);
    4739             : 
    4740           0 :     return nextUnloggedLSN;     /* the counter value from before the increment */
    4741             : }
    4742             : 
    4743             : /*
    4744             :  * Auto-tune the number of XLOG buffers.
    4745             :  *
    4746             :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
    4747             :  * a maximum of one XLOG segment (there is little reason to think that more
    4748             :  * is helpful, at least so long as we force an fsync when switching log files)
    4749             :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
    4750             :  * 9.1, when auto-tuning was added).
    4751             :  *
    4752             :  * This should not be called until NBuffers has received its final value.
    4753             :  */
    4754             : static int
    4755           5 : XLOGChooseNumBuffers(void)
    4756             : {
    4757             :     int         xbuffers;       /* candidate buffer count, returned after clamping */
    4758             : 
    4759           5 :     xbuffers = NBuffers / 32;   /* 1/32 of NBuffers, i.e. the "about 3%" above */
    4760           5 :     if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ) /* upper bound: the number of XLOG blocks in one segment */
    4761           0 :         xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
    4762           5 :     if (xbuffers < 8)           /* lower bound: 8 blocks */
    4763           0 :         xbuffers = 8;
    4764           5 :     return xbuffers;
    4765             : }
    4766             : 
    4767             : /*
    4768             :  * GUC check_hook for wal_buffers: may rewrite *newval but always returns true; extra and source are not used.
    4769             :  */
    4770             : bool
    4771          10 : check_wal_buffers(int *newval, void **extra, GucSource source)
    4772             : {
    4773             :     /*
    4774             :      * -1 indicates a request for auto-tune.
    4775             :      */
    4776          10 :     if (*newval == -1)
    4777             :     {
    4778             :         /*
    4779             :          * If we haven't yet changed the boot_val default of -1, just let it
    4780             :          * be.  We'll fix it when XLOGShmemSize is called.
    4781             :          */
    4782           5 :         if (XLOGbuffers == -1)
    4783           5 :             return true;        /* *newval stays -1 in this case */
    4784             : 
    4785             :         /* Otherwise, substitute the auto-tune value */
    4786           0 :         *newval = XLOGChooseNumBuffers();
    4787             :     }
    4788             : 
    4789             :     /*
    4790             :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
    4791             :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
    4792             :      * the case, we just silently treat such values as a request for the
    4793             :      * minimum.  (We could throw an error instead, but that doesn't seem very
    4794             :      * helpful.)
    4795             :      */
    4796           5 :     if (*newval < 4)            /* also reached with a freshly substituted auto-tune value */
    4797           0 :         *newval = 4;
    4798             : 
    4799           5 :     return true;
    4800             : }
    4801             : 
    4802             : /*
    4803             :  * Initialization of shared memory for XLOG.  XLOGShmemSize() returns the byte count to reserve, first resolving wal_buffers = -1.
    4804             :  */
    4805             : Size
    4806          10 : XLOGShmemSize(void)
    4807             : {
    4808             :     Size        size;           /* running total, built with the add_size()/mul_size() helpers */
    4809             : 
    4810             :     /*
    4811             :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
    4812             :      * This isn't an amazingly clean place to do this, but we must wait till
    4813             :      * NBuffers has received its final value, and must do it before using the
    4814             :      * value of XLOGbuffers to do anything important.
    4815             :      */
    4816          10 :     if (XLOGbuffers == -1)
    4817             :     {
    4818             :         char        buf[32];    /* decimal text of the auto-tuned value */
    4819             : 
    4820           5 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
    4821           5 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
    4822             :     }
    4823          10 :     Assert(XLOGbuffers > 0);    /* presumably the SetConfigOption() call above has replaced the -1 by now */
    4824             : 
    4825             :     /* XLogCtl */
    4826          10 :     size = sizeof(XLogCtlData);
    4827             : 
    4828             :     /* WAL insertion locks, plus alignment (one extra padded slot beyond NUM_XLOGINSERT_LOCKS) */
    4829          10 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
    4830             :     /* xlblocks array: one XLogRecPtr per buffer */
    4831          10 :     size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
    4832             :     /* extra alignment padding for XLOG I/O buffers (one full XLOG_BLCKSZ) */
    4833          10 :     size = add_size(size, XLOG_BLCKSZ);
    4834             :     /* and the buffers themselves */
    4835          10 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    4836             : 
    4837             :     /*
    4838             :      * Note: we don't count ControlFileData, it comes out of the "slop factor"
    4839             :      * added by CreateSharedMemoryAndSemaphores.  This lets us use this
    4840             :      * routine again below to compute the actual allocation size.
    4841             :      */
    4842             : 
    4843          10 :     return size;
    4844             : }
    4845             : 
    4846             : void
    4847           5 : XLOGShmemInit(void)
    4848             : {
    4849             :     bool        foundCFile,
    4850             :                 foundXLog;
    4851             :     char       *allocptr;
    4852             :     int         i;
    4853             : 
    4854             : #ifdef WAL_DEBUG
    4855             : 
    4856             :     /*
    4857             :      * Create a memory context for WAL debugging that's exempt from the normal
    4858             :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
    4859             :      * an allocation fails, but wal_debug is not for production use anyway.
    4860             :      */
    4861             :     if (walDebugCxt == NULL)
    4862             :     {
    4863             :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
    4864             :                                             "WAL Debug",
    4865             :                                             ALLOCSET_DEFAULT_SIZES);
    4866             :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
    4867             :     }
    4868             : #endif
    4869             : 
    4870           5 :     ControlFile = (ControlFileData *)
    4871           5 :         ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
    4872           5 :     XLogCtl = (XLogCtlData *)
    4873           5 :         ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
    4874             : 
    4875           5 :     if (foundCFile || foundXLog)
    4876             :     {
    4877             :         /* both should be present or neither */
    4878           0 :         Assert(foundCFile && foundXLog);
    4879             : 
    4880             :         /* Initialize local copy of WALInsertLocks and register the tranche */
    4881           0 :         WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
    4882           0 :         LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
    4883             :                               "wal_insert");
    4884           5 :         return;
    4885             :     }
    4886           5 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
    4887             : 
    4888             :     /*
    4889             :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    4890             :      * multiple of the alignment for same, so no extra alignment padding is
    4891             :      * needed here.
    4892             :      */
    4893           5 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    4894           5 :     XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
    4895           5 :     memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
    4896           5 :     allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
    4897             : 
    4898             : 
    4899             :     /* WAL insertion locks. Ensure they're aligned to the full padded size */
    4900           5 :     allocptr += sizeof(WALInsertLockPadded) -
    4901           5 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
    4902           5 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
    4903             :         (WALInsertLockPadded *) allocptr;
    4904           5 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
    4905             : 
    4906           5 :     LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
    4907          45 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    4908             :     {
    4909          40 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
    4910          40 :         WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
    4911          40 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
    4912             :     }
    4913             : 
    4914             :     /*
    4915             :      * Align the start of the page buffers to a full xlog block size boundary.
    4916             :      * This simplifies some calculations in XLOG insertion. It is also
    4917             :      * required for O_DIRECT.
    4918             :      */
    4919           5 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
    4920           5 :     XLogCtl->pages = allocptr;
    4921           5 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    4922             : 
    4923             :     /*
    4924             :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    4925             :      * in additional info.)
    4926             :      */
    4927           5 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    4928           5 :     XLogCtl->SharedRecoveryInProgress = true;
    4929           5 :     XLogCtl->SharedHotStandbyActive = false;
    4930           5 :     XLogCtl->WalWriterSleeping = false;
    4931             : 
    4932           5 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
    4933           5 :     SpinLockInit(&XLogCtl->info_lck);
    4934           5 :     SpinLockInit(&XLogCtl->ulsn_lck);
    4935           5 :     InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
    4936             : 
    4937             :     /*
    4938             :      * If we are not in bootstrap mode, pg_control should already exist. Read
    4939             :      * and validate it immediately (see comments in ReadControlFile() for the
    4940             :      * reasons why).
    4941             :      */
    4942           5 :     if (!IsBootstrapProcessingMode())
    4943           2 :         ReadControlFile();
    4944             : }
    4945             : 
    4946             : /*
    4947             :  * This func must be called ONCE on system install.  It creates pg_control
    4948             :  * and the initial XLOG segment.
    4949             :  */
    4950             : void
    4951           1 : BootStrapXLOG(void)
    4952             : {
    4953             :     CheckPoint  checkPoint;
    4954             :     char       *buffer;
    4955             :     XLogPageHeader page;
    4956             :     XLogLongPageHeader longpage;
    4957             :     XLogRecord *record;
    4958             :     char       *recptr;
    4959             :     bool        use_existent;
    4960             :     uint64      sysidentifier;
    4961             :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
    4962             :     struct timeval tv;
    4963             :     pg_crc32c   crc;
    4964             : 
    4965             :     /*
    4966             :      * Select a hopefully-unique system identifier code for this installation.
    4967             :      * We use the result of gettimeofday(), including the fractional seconds
    4968             :      * field, as being about as unique as we can easily get.  (Think not to
    4969             :      * use random(), since it hasn't been seeded and there's no portable way
    4970             :      * to seed it other than the system clock value...)  The upper half of the
    4971             :      * uint64 value is just the tv_sec part, while the lower half contains the
    4972             :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
    4973             :      * PID for a little extra uniqueness.  A person knowing this encoding can
    4974             :      * determine the initialization time of the installation, which could
    4975             :      * perhaps be useful sometimes.
    4976             :      */
    4977           1 :     gettimeofday(&tv, NULL);
    4978           1 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
    4979           1 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
    4980           1 :     sysidentifier |= getpid() & 0xFFF;
    4981             : 
    4982             :     /*
    4983             :      * Generate a random nonce. This is used for authentication requests that
    4984             :      * will fail because the user does not exist. The nonce is used to create
    4985             :      * a genuine-looking password challenge for the non-existent user, in lieu
    4986             :      * of an actual stored password.
    4987             :      */
    4988           1 :     if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
    4989           0 :         ereport(PANIC,
    4990             :                 (errcode(ERRCODE_INTERNAL_ERROR),
    4991             :                  errmsg("could not generate secret authorization token")));
    4992             : 
    4993             :     /* First timeline ID is always 1 */
    4994           1 :     ThisTimeLineID = 1;
    4995             : 
    4996             :     /* page buffer must be aligned suitably for O_DIRECT */
    4997           1 :     buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
    4998           1 :     page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
    4999           1 :     memset(page, 0, XLOG_BLCKSZ);
    5000             : 
    5001             :     /*
    5002             :      * Set up information for the initial checkpoint record
    5003             :      *
    5004             :      * The initial checkpoint record is written to the beginning of the WAL
    5005             :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
    5006             :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
    5007             :      */
    5008           1 :     checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
    5009           1 :     checkPoint.ThisTimeLineID = ThisTimeLineID;
    5010           1 :     checkPoint.PrevTimeLineID = ThisTimeLineID;
    5011           1 :     checkPoint.fullPageWrites = fullPageWrites;
    5012           1 :     checkPoint.nextXidEpoch = 0;
    5013           1 :     checkPoint.nextXid = FirstNormalTransactionId;
    5014           1 :     checkPoint.nextOid = FirstBootstrapObjectId;
    5015           1 :     checkPoint.nextMulti = FirstMultiXactId;
    5016           1 :     checkPoint.nextMultiOffset = 0;
    5017           1 :     checkPoint.oldestXid = FirstNormalTransactionId;
    5018           1 :     checkPoint.oldestXidDB = TemplateDbOid;
    5019           1 :     checkPoint.oldestMulti = FirstMultiXactId;
    5020           1 :     checkPoint.oldestMultiDB = TemplateDbOid;
    5021           1 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
    5022           1 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
    5023           1 :     checkPoint.time = (pg_time_t) time(NULL);
    5024           1 :     checkPoint.oldestActiveXid = InvalidTransactionId;
    5025             : 
    5026           1 :     ShmemVariableCache->nextXid = checkPoint.nextXid;
    5027           1 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
    5028           1 :     ShmemVariableCache->oidCount = 0;
    5029           1 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5030           1 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5031           1 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5032           1 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    5033           1 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
    5034             : 
    5035             :     /* Set up the XLOG page header */
    5036           1 :     page->xlp_magic = XLOG_PAGE_MAGIC;
    5037           1 :     page->xlp_info = XLP_LONG_HEADER;
    5038           1 :     page->xlp_tli = ThisTimeLineID;
    5039           1 :     page->xlp_pageaddr = XLogSegSize;
    5040           1 :     longpage = (XLogLongPageHeader) page;
    5041           1 :     longpage->xlp_sysid = sysidentifier;
    5042           1 :     longpage->xlp_seg_size = XLogSegSize;
    5043           1 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    5044             : 
    5045             :     /* Insert the initial checkpoint record */
    5046           1 :     recptr = ((char *) page + SizeOfXLogLongPHD);
    5047           1 :     record = (XLogRecord *) recptr;
    5048           1 :     record->xl_prev = 0;
    5049           1 :     record->xl_xid = InvalidTransactionId;
    5050           1 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
    5051           1 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    5052           1 :     record->xl_rmid = RM_XLOG_ID;
    5053           1 :     recptr += SizeOfXLogRecord;
    5054             :     /* fill the XLogRecordDataHeaderShort struct */
    5055           1 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
    5056           1 :     *(recptr++) = sizeof(checkPoint);
    5057           1 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
    5058           1 :     recptr += sizeof(checkPoint);
    5059           1 :     Assert(recptr - (char *) record == record->xl_tot_len);
    5060             : 
    5061           1 :     INIT_CRC32C(crc);
    5062           1 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
    5063           1 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    5064           1 :     FIN_CRC32C(crc);
    5065           1 :     record->xl_crc = crc;
    5066             : 
    5067             :     /* Create first XLOG segment file */
    5068           1 :     use_existent = false;
    5069           1 :     openLogFile = XLogFileInit(1, &use_existent, false);
    5070             : 
    5071             :     /* Write the first page with the initial record */
    5072           1 :     errno = 0;
    5073           1 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
    5074           1 :     if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    5075             :     {
    5076             :         /* if write didn't set errno, assume problem is no disk space */
    5077           0 :         if (errno == 0)
    5078           0 :             errno = ENOSPC;
    5079           0 :         ereport(PANIC,
    5080             :                 (errcode_for_file_access(),
    5081             :                  errmsg("could not write bootstrap write-ahead log file: %m")));
    5082             :     }
    5083           1 :     pgstat_report_wait_end();
    5084             : 
    5085           1 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
    5086           1 :     if (pg_fsync(openLogFile) != 0)
    5087           0 :         ereport(PANIC,
    5088             :                 (errcode_for_file_access(),
    5089             :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
    5090           1 :     pgstat_report_wait_end();
    5091             : 
    5092           1 :     if (close(openLogFile))
    5093           0 :         ereport(PANIC,
    5094             :                 (errcode_for_file_access(),
    5095             :                  errmsg("could not close bootstrap write-ahead log file: %m")));
    5096             : 
    5097           1 :     openLogFile = -1;
    5098             : 
    5099             :     /* Now create pg_control */
    5100             : 
    5101           1 :     memset(ControlFile, 0, sizeof(ControlFileData));
    5102             :     /* Initialize pg_control status fields */
    5103           1 :     ControlFile->system_identifier = sysidentifier;
    5104           1 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
    5105           1 :     ControlFile->state = DB_SHUTDOWNED;
    5106           1 :     ControlFile->time = checkPoint.time;
    5107           1 :     ControlFile->checkPoint = checkPoint.redo;
    5108           1 :     ControlFile->checkPointCopy = checkPoint;
    5109           1 :     ControlFile->unloggedLSN = 1;
    5110             : 
    5111             :     /* Set important parameter values for use when replaying WAL */
    5112           1 :     ControlFile->MaxConnections = MaxConnections;
    5113           1 :     ControlFile->max_worker_processes = max_worker_processes;
    5114           1 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
    5115           1 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
    5116           1 :     ControlFile->wal_level = wal_level;
    5117           1 :     ControlFile->wal_log_hints = wal_log_hints;
    5118           1 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
    5119           1 :     ControlFile->data_checksum_version = bootstrap_data_checksum_version;
    5120             : 
    5121             :     /* some additional ControlFile fields are set in WriteControlFile() */
    5122             : 
    5123           1 :     WriteControlFile();
    5124             : 
    5125             :     /* Bootstrap the commit log, too */
    5126           1 :     BootStrapCLOG();
    5127           1 :     BootStrapCommitTs();
    5128           1 :     BootStrapSUBTRANS();
    5129           1 :     BootStrapMultiXact();
    5130             : 
    5131           1 :     pfree(buffer);
    5132           1 : }
    5133             : 
    5134             : static char *
    5135           1 : str_time(pg_time_t tnow)
    5136             : {
    5137             :     static char buf[128];
    5138             : 
    5139           1 :     pg_strftime(buf, sizeof(buf),
    5140             :                 "%Y-%m-%d %H:%M:%S %Z",
    5141           1 :                 pg_localtime(&tnow, log_timezone));
    5142             : 
    5143           1 :     return buf;
    5144             : }
    5145             : 
    5146             : /*
    5147             :  * See if there is a recovery command file (recovery.conf), and if so
    5148             :  * read in parameters for archive recovery and XLOG streaming.
    5149             :  *
    5150             :  * The file is parsed using the main configuration parser.
    5151             :  */
    5152             : static void
    5153           3 : readRecoveryCommandFile(void)
    5154             : {
    5155             :     FILE       *fd;
    5156           3 :     TimeLineID  rtli = 0;
    5157           3 :     bool        rtliGiven = false;
    5158             :     ConfigVariable *item,
    5159           3 :                *head = NULL,
    5160           3 :                *tail = NULL;
    5161           3 :     bool        recoveryTargetActionSet = false;
    5162             : 
    5163             : 
    5164           3 :     fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
    5165           3 :     if (fd == NULL)
    5166             :     {
    5167           3 :         if (errno == ENOENT)
    5168           6 :             return;             /* not there, so no archive recovery */
    5169           0 :         ereport(FATAL,
    5170             :                 (errcode_for_file_access(),
    5171             :                  errmsg("could not open recovery command file \"%s\": %m",
    5172             :                         RECOVERY_COMMAND_FILE)));
    5173             :     }
    5174             : 
    5175             :     /*
    5176             :      * Since we're asking ParseConfigFp() to report errors as FATAL, there's
    5177             :      * no need to check the return value.
    5178             :      */
    5179           0 :     (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
    5180             : 
    5181           0 :     FreeFile(fd);
    5182             : 
    5183           0 :     for (item = head; item; item = item->next)
    5184             :     {
    5185           0 :         if (strcmp(item->name, "restore_command") == 0)
    5186             :         {
    5187           0 :             recoveryRestoreCommand = pstrdup(item->value);
    5188           0 :             ereport(DEBUG2,
    5189             :                     (errmsg_internal("restore_command = '%s'",
    5190             :                                      recoveryRestoreCommand)));
    5191             :         }
    5192           0 :         else if (strcmp(item->name, "recovery_end_command") == 0)
    5193             :         {
    5194           0 :             recoveryEndCommand = pstrdup(item->value);
    5195           0 :             ereport(DEBUG2,
    5196             :                     (errmsg_internal("recovery_end_command = '%s'",
    5197             :                                      recoveryEndCommand)));
    5198             :         }
    5199           0 :         else if (strcmp(item->name, "archive_cleanup_command") == 0)
    5200             :         {
    5201           0 :             archiveCleanupCommand = pstrdup(item->value);
    5202           0 :             ereport(DEBUG2,
    5203             :                     (errmsg_internal("archive_cleanup_command = '%s'",
    5204             :                                      archiveCleanupCommand)));
    5205             :         }
    5206           0 :         else if (strcmp(item->name, "recovery_target_action") == 0)
    5207             :         {
    5208           0 :             if (strcmp(item->value, "pause") == 0)
    5209           0 :                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
    5210           0 :             else if (strcmp(item->value, "promote") == 0)
    5211           0 :                 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
    5212           0 :             else if (strcmp(item->value, "shutdown") == 0)
    5213           0 :                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
    5214             :             else
    5215           0 :                 ereport(ERROR,
    5216             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5217             :                          errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
    5218             :                                 "recovery_target_action",
    5219             :                                 item->value),
    5220             :                          errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
    5221             : 
    5222           0 :             ereport(DEBUG2,
    5223             :                     (errmsg_internal("recovery_target_action = '%s'",
    5224             :                                      item->value)));
    5225             : 
    5226           0 :             recoveryTargetActionSet = true;
    5227             :         }
    5228           0 :         else if (strcmp(item->name, "recovery_target_timeline") == 0)
    5229             :         {
    5230           0 :             rtliGiven = true;
    5231           0 :             if (strcmp(item->value, "latest") == 0)
    5232           0 :                 rtli = 0;
    5233             :             else
    5234             :             {
    5235           0 :                 errno = 0;
    5236           0 :                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
    5237           0 :                 if (errno == EINVAL || errno == ERANGE)
    5238           0 :                     ereport(FATAL,
    5239             :                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5240             :                              errmsg("recovery_target_timeline is not a valid number: \"%s\"",
    5241             :                                     item->value)));
    5242             :             }
    5243           0 :             if (rtli)
    5244           0 :                 ereport(DEBUG2,
    5245             :                         (errmsg_internal("recovery_target_timeline = %u", rtli)));
    5246             :             else
    5247           0 :                 ereport(DEBUG2,
    5248             :                         (errmsg_internal("recovery_target_timeline = latest")));
    5249             :         }
    5250           0 :         else if (strcmp(item->name, "recovery_target_xid") == 0)
    5251             :         {
    5252           0 :             errno = 0;
    5253           0 :             recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
    5254           0 :             if (errno == EINVAL || errno == ERANGE)
    5255           0 :                 ereport(FATAL,
    5256             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5257             :                          errmsg("recovery_target_xid is not a valid number: \"%s\"",
    5258             :                                 item->value)));
    5259           0 :             ereport(DEBUG2,
    5260             :                     (errmsg_internal("recovery_target_xid = %u",
    5261             :                                      recoveryTargetXid)));
    5262           0 :             recoveryTarget = RECOVERY_TARGET_XID;
    5263             :         }
    5264           0 :         else if (strcmp(item->name, "recovery_target_time") == 0)
    5265             :         {
    5266           0 :             recoveryTarget = RECOVERY_TARGET_TIME;
    5267             : 
    5268             :             /*
    5269             :              * Convert the time string given by the user to TimestampTz form.
    5270             :              */
    5271           0 :             recoveryTargetTime =
    5272           0 :                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
    5273             :                                                         CStringGetDatum(item->value),
    5274             :                                                         ObjectIdGetDatum(InvalidOid),
    5275             :                                                         Int32GetDatum(-1)));
    5276           0 :             ereport(DEBUG2,
    5277             :                     (errmsg_internal("recovery_target_time = '%s'",
    5278             :                                      timestamptz_to_str(recoveryTargetTime))));
    5279             :         }
    5280           0 :         else if (strcmp(item->name, "recovery_target_name") == 0)
    5281             :         {
    5282           0 :             recoveryTarget = RECOVERY_TARGET_NAME;
    5283             : 
    5284           0 :             recoveryTargetName = pstrdup(item->value);
    5285           0 :             if (strlen(recoveryTargetName) >= MAXFNAMELEN)
    5286           0 :                 ereport(FATAL,
    5287             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5288             :                          errmsg("recovery_target_name is too long (maximum %d characters)",
    5289             :                                 MAXFNAMELEN - 1)));
    5290             : 
    5291           0 :             ereport(DEBUG2,
    5292             :                     (errmsg_internal("recovery_target_name = '%s'",
    5293             :                                      recoveryTargetName)));
    5294             :         }
    5295           0 :         else if (strcmp(item->name, "recovery_target_lsn") == 0)
    5296             :         {
    5297           0 :             recoveryTarget = RECOVERY_TARGET_LSN;
    5298             : 
    5299             :             /*
    5300             :              * Convert the LSN string given by the user to XLogRecPtr form.
    5301             :              */
    5302           0 :             recoveryTargetLSN =
    5303           0 :                 DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
    5304             :                                                 CStringGetDatum(item->value),
    5305             :                                                 ObjectIdGetDatum(InvalidOid),
    5306             :                                                 Int32GetDatum(-1)));
    5307           0 :             ereport(DEBUG2,
    5308             :                     (errmsg_internal("recovery_target_lsn = '%X/%X'",
    5309             :                                      (uint32) (recoveryTargetLSN >> 32),
    5310             :                                      (uint32) recoveryTargetLSN)));
    5311             :         }
    5312           0 :         else if (strcmp(item->name, "recovery_target") == 0)
    5313             :         {
    5314           0 :             if (strcmp(item->value, "immediate") == 0)
    5315           0 :                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
    5316             :             else
    5317           0 :                 ereport(ERROR,
    5318             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5319             :                          errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
    5320             :                                 "recovery_target",
    5321             :                                 item->value),
    5322             :                          errhint("The only allowed value is \"immediate\".")));
    5323           0 :             ereport(DEBUG2,
    5324             :                     (errmsg_internal("recovery_target = '%s'",
    5325             :                                      item->value)));
    5326             :         }
    5327           0 :         else if (strcmp(item->name, "recovery_target_inclusive") == 0)
    5328             :         {
    5329             :             /*
    5330             :              * does nothing if a recovery_target is not also set
    5331             :              */
    5332           0 :             if (!parse_bool(item->value, &recoveryTargetInclusive))
    5333           0 :                 ereport(ERROR,
    5334             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5335             :                          errmsg("parameter \"%s\" requires a Boolean value",
    5336             :                                 "recovery_target_inclusive")));
    5337           0 :             ereport(DEBUG2,
    5338             :                     (errmsg_internal("recovery_target_inclusive = %s",
    5339             :                                      item->value)));
    5340             :         }
    5341           0 :         else if (strcmp(item->name, "standby_mode") == 0)
    5342             :         {
    5343           0 :             if (!parse_bool(item->value, &StandbyModeRequested))
    5344           0 :                 ereport(ERROR,
    5345             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5346             :                          errmsg("parameter \"%s\" requires a Boolean value",
    5347             :                                 "standby_mode")));
    5348           0 :             ereport(DEBUG2,
    5349             :                     (errmsg_internal("standby_mode = '%s'", item->value)));
    5350             :         }
    5351           0 :         else if (strcmp(item->name, "primary_conninfo") == 0)
    5352             :         {
    5353           0 :             PrimaryConnInfo = pstrdup(item->value);
    5354           0 :             ereport(DEBUG2,
    5355             :                     (errmsg_internal("primary_conninfo = '%s'",
    5356             :                                      PrimaryConnInfo)));
    5357             :         }
    5358           0 :         else if (strcmp(item->name, "primary_slot_name") == 0)
    5359             :         {
    5360           0 :             ReplicationSlotValidateName(item->value, ERROR);
    5361           0 :             PrimarySlotName = pstrdup(item->value);
    5362           0 :             ereport(DEBUG2,
    5363             :                     (errmsg_internal("primary_slot_name = '%s'",
    5364             :                                      PrimarySlotName)));
    5365             :         }
    5366           0 :         else if (strcmp(item->name, "trigger_file") == 0)
    5367             :         {
    5368           0 :             TriggerFile = pstrdup(item->value);
    5369           0 :             ereport(DEBUG2,
    5370             :                     (errmsg_internal("trigger_file = '%s'",
    5371             :                                      TriggerFile)));
    5372             :         }
    5373           0 :         else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
    5374             :         {
    5375             :             const char *hintmsg;
    5376             : 
    5377           0 :             if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
    5378             :                            &hintmsg))
    5379           0 :                 ereport(ERROR,
    5380             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5381             :                          errmsg("parameter \"%s\" requires a temporal value",
    5382             :                                 "recovery_min_apply_delay"),
    5383             :                          hintmsg ? errhint("%s", _(hintmsg)) : 0));
    5384           0 :             ereport(DEBUG2,
    5385             :                     (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
    5386             :         }
    5387             :         else
    5388           0 :             ereport(FATAL,
    5389             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5390             :                      errmsg("unrecognized recovery parameter \"%s\"",
    5391             :                             item->name)));
    5392             :     }
    5393             : 
    5394             :     /*
    5395             :      * Check for compulsory parameters
    5396             :      */
    5397           0 :     if (StandbyModeRequested)
    5398             :     {
    5399           0 :         if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
    5400           0 :             ereport(WARNING,
    5401             :                     (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
    5402             :                             RECOVERY_COMMAND_FILE),
    5403             :                      errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
    5404             :     }
    5405             :     else
    5406             :     {
    5407           0 :         if (recoveryRestoreCommand == NULL)
    5408           0 :             ereport(FATAL,
    5409             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5410             :                      errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
    5411             :                             RECOVERY_COMMAND_FILE)));
    5412             :     }
    5413             : 
    5414             :     /*
    5415             :      * Override any inconsistent requests. Note that this is a change of
    5416             :      * behaviour in 9.5; prior to this we simply ignored a request to pause if
    5417             :      * hot_standby = off, which was surprising behaviour.
    5418             :      */
    5419           0 :     if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
    5420           0 :         recoveryTargetActionSet &&
    5421           0 :         !EnableHotStandby)
    5422           0 :         recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
    5423             : 
    5424             :     /*
    5425             :      * We don't support standby_mode in standalone backends; that requires
    5426             :      * other processes such as the WAL receiver to be alive.
    5427             :      */
    5428           0 :     if (StandbyModeRequested && !IsUnderPostmaster)
    5429           0 :         ereport(FATAL,
    5430             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    5431             :                  errmsg("standby mode is not supported by single-user servers")));
    5432             : 
    5433             :     /* Enable fetching from archive recovery area */
    5434           0 :     ArchiveRecoveryRequested = true;
    5435             : 
    5436             :     /*
    5437             :      * If user specified recovery_target_timeline, validate it or compute the
    5438             :      * "latest" value.  We can't do this until after we've gotten the restore
    5439             :      * command and set InArchiveRecovery, because we need to fetch timeline
    5440             :      * history files from the archive.
    5441             :      */
    5442           0 :     if (rtliGiven)
    5443             :     {
    5444           0 :         if (rtli)
    5445             :         {
    5446             :             /* Timeline 1 does not have a history file, all else should */
    5447           0 :             if (rtli != 1 && !existsTimeLineHistory(rtli))
    5448           0 :                 ereport(FATAL,
    5449             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5450             :                          errmsg("recovery target timeline %u does not exist",
    5451             :                                 rtli)));
    5452           0 :             recoveryTargetTLI = rtli;
    5453           0 :             recoveryTargetIsLatest = false;
    5454             :         }
    5455             :         else
    5456             :         {
    5457             :             /* We start the "latest" search from pg_control's timeline */
    5458           0 :             recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
    5459           0 :             recoveryTargetIsLatest = true;
    5460             :         }
    5461             :     }
    5462             : 
    5463           0 :     FreeConfigVariables(head);
    5464             : }
    5465             : 
    5466             : /*
    5467             :  * Exit archive-recovery state
    5468             :  */
    5469             : static void
    5470           0 : exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
    5471             : {
    5472             :     char        recoveryPath[MAXPGPATH];
    5473             :     char        xlogfname[MAXFNAMELEN];
    5474             :     XLogSegNo   endLogSegNo;
    5475             :     XLogSegNo   startLogSegNo;
    5476             : 
    5477             :     /* we always switch to a new timeline after archive recovery */
    5478           0 :     Assert(endTLI != ThisTimeLineID);
    5479             : 
    5480             :     /*
    5481             :      * We are no longer in archive recovery state.
    5482             :      */
    5483           0 :     InArchiveRecovery = false;
    5484             : 
    5485             :     /*
    5486             :      * Update min recovery point one last time.
    5487             :      */
    5488           0 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    5489             : 
    5490             :     /*
    5491             :      * If the ending log segment is still open, close it (to avoid problems on
    5492             :      * Windows with trying to rename or delete an open file).
    5493             :      */
    5494           0 :     if (readFile >= 0)
    5495             :     {
    5496           0 :         close(readFile);
    5497           0 :         readFile = -1;
    5498             :     }
    5499             : 
    5500             :     /*
    5501             :      * Calculate the last segment on the old timeline, and the first segment
    5502             :      * on the new timeline. If the switch happens in the middle of a segment,
    5503             :      * they are the same, but if the switch happens exactly at a segment
    5504             :      * boundary, startLogSegNo will be endLogSegNo + 1.
    5505             :      */
    5506           0 :     XLByteToPrevSeg(endOfLog, endLogSegNo);
    5507           0 :     XLByteToSeg(endOfLog, startLogSegNo);
    5508             : 
    5509             :     /*
    5510             :      * Initialize the starting WAL segment for the new timeline. If the switch
    5511             :      * happens in the middle of a segment, copy data from the last WAL segment
    5512             :      * of the old timeline up to the switch point, to the starting WAL segment
    5513             :      * on the new timeline.
    5514             :      */
    5515           0 :     if (endLogSegNo == startLogSegNo)
    5516             :     {
    5517             :         /*
    5518             :          * Make a copy of the file on the new timeline.
    5519             :          *
    5520             :          * Writing WAL isn't allowed yet, so there are no locking
    5521             :          * considerations. But we should be just as tense as XLogFileInit to
    5522             :          * avoid emplacing a bogus file.
    5523             :          */
    5524           0 :         XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
    5525             :                      endOfLog % XLOG_SEG_SIZE);
    5526             :     }
    5527             :     else
    5528             :     {
    5529             :         /*
    5530             :          * The switch happened at a segment boundary, so just create the next
    5531             :          * segment on the new timeline.
    5532             :          */
    5533           0 :         bool        use_existent = true;
    5534             :         int         fd;
    5535             : 
    5536           0 :         fd = XLogFileInit(startLogSegNo, &use_existent, true);
    5537             : 
    5538           0 :         if (close(fd))
    5539           0 :             ereport(ERROR,
    5540             :                     (errcode_for_file_access(),
    5541             :                      errmsg("could not close log file %s: %m",
    5542             :                             XLogFileNameP(ThisTimeLineID, startLogSegNo))));
    5543             :     }
    5544             : 
    5545             :     /*
    5546             :      * Let's just make real sure there are not .ready or .done flags posted
    5547             :      * for the new segment.
    5548             :      */
    5549           0 :     XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
    5550           0 :     XLogArchiveCleanup(xlogfname);
    5551             : 
    5552             :     /*
    5553             :      * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
    5554             :      * of it.
    5555             :      */
    5556           0 :     snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
    5557           0 :     unlink(recoveryPath);       /* ignore any error */
    5558             : 
    5559             :     /* Get rid of any remaining recovered timeline-history file, too */
    5560           0 :     snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
    5561           0 :     unlink(recoveryPath);       /* ignore any error */
    5562             : 
    5563             :     /*
    5564             :      * Rename the config file out of the way, so that we don't accidentally
    5565             :      * re-enter archive recovery mode in a subsequent crash.
    5566             :      */
    5567           0 :     unlink(RECOVERY_COMMAND_DONE);
    5568           0 :     durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
    5569             : 
    5570           0 :     ereport(LOG,
    5571             :             (errmsg("archive recovery complete")));
    5572           0 : }
    5573             : 
    5574             : /*
    5575             :  * Extract timestamp from WAL record.
    5576             :  *
    5577             :  * If the record contains a timestamp, returns true, and saves the timestamp
    5578             :  * in *recordXtime. If the record type has no timestamp, returns false.
    5579             :  * Currently, only transaction commit/abort records and restore points contain
    5580             :  * timestamps.
    5581             :  */
    5582             : static bool
    5583           0 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
    5584             : {
    5585           0 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    5586           0 :     uint8       xact_info = info & XLOG_XACT_OPMASK;
    5587           0 :     uint8       rmid = XLogRecGetRmid(record);
    5588             : 
    5589           0 :     if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    5590             :     {
    5591           0 :         *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
    5592           0 :         return true;
    5593             :     }
    5594           0 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
    5595             :                                xact_info == XLOG_XACT_COMMIT_PREPARED))
    5596             :     {
    5597           0 :         *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
    5598           0 :         return true;
    5599             :     }
    5600           0 :     if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
    5601             :                                xact_info == XLOG_XACT_ABORT_PREPARED))
    5602             :     {
    5603           0 :         *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
    5604           0 :         return true;
    5605             :     }
    5606           0 :     return false;
    5607             : }
    5608             : 
    5609             : /*
    5610             :  * For point-in-time recovery, this function decides whether we want to
    5611             :  * stop applying the XLOG before the current record.
    5612             :  *
    5613             :  * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
    5614             :  * information is saved in recoveryStopXid et al for use in annotating the
    5615             :  * new timeline's history file.
    5616             :  */
    5617             : static bool
    5618           0 : recoveryStopsBefore(XLogReaderState *record)
    5619             : {
    5620           0 :     bool        stopsHere = false;
    5621             :     uint8       xact_info;
    5622             :     bool        isCommit;
    5623           0 :     TimestampTz recordXtime = 0;
    5624             :     TransactionId recordXid;
    5625             : 
    5626             :     /* Check if we should stop as soon as reaching consistency */
    5627           0 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    5628             :     {
    5629           0 :         ereport(LOG,
    5630             :                 (errmsg("recovery stopping after reaching consistency")));
    5631             : 
    5632           0 :         recoveryStopAfter = false;
    5633           0 :         recoveryStopXid = InvalidTransactionId;
    5634           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    5635           0 :         recoveryStopTime = 0;
    5636           0 :         recoveryStopName[0] = '\0';
    5637           0 :         return true;
    5638             :     }
    5639             : 
    5640             :     /* Check if target LSN has been reached */
    5641           0 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    5642           0 :         !recoveryTargetInclusive &&
    5643           0 :         record->ReadRecPtr >= recoveryTargetLSN)
    5644             :     {
    5645           0 :         recoveryStopAfter = false;
    5646           0 :         recoveryStopXid = InvalidTransactionId;
    5647           0 :         recoveryStopLSN = record->ReadRecPtr;
    5648           0 :         recoveryStopTime = 0;
    5649           0 :         recoveryStopName[0] = '\0';
    5650           0 :         ereport(LOG,
    5651             :                 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
    5652             :                         (uint32) (recoveryStopLSN >> 32),
    5653             :                         (uint32) recoveryStopLSN)));
    5654           0 :         return true;
    5655             :     }
    5656             : 
    5657             :     /* Otherwise we only consider stopping before COMMIT or ABORT records. */
    5658           0 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    5659           0 :         return false;
    5660             : 
    5661           0 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    5662             : 
    5663           0 :     if (xact_info == XLOG_XACT_COMMIT)
    5664             :     {
    5665           0 :         isCommit = true;
    5666           0 :         recordXid = XLogRecGetXid(record);
    5667             :     }
    5668           0 :     else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    5669             :     {
    5670           0 :         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    5671             :         xl_xact_parsed_commit parsed;
    5672             : 
    5673           0 :         isCommit = true;
    5674           0 :         ParseCommitRecord(XLogRecGetInfo(record),
    5675             :                           xlrec,
    5676             :                           &parsed);
    5677           0 :         recordXid = parsed.twophase_xid;
    5678             :     }
    5679           0 :     else if (xact_info == XLOG_XACT_ABORT)
    5680             :     {
    5681           0 :         isCommit = false;
    5682           0 :         recordXid = XLogRecGetXid(record);
    5683             :     }
    5684           0 :     else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    5685             :     {
    5686           0 :         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    5687             :         xl_xact_parsed_abort parsed;
    5688             : 
    5689           0 :         isCommit = true;
    5690           0 :         ParseAbortRecord(XLogRecGetInfo(record),
    5691             :                          xlrec,
    5692             :                          &parsed);
    5693           0 :         recordXid = parsed.twophase_xid;
    5694             :     }
    5695             :     else
    5696           0 :         return false;
    5697             : 
    5698           0 :     if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
    5699             :     {
    5700             :         /*
    5701             :          * There can be only one transaction end record with this exact
    5702             :          * transactionid
    5703             :          *
    5704             :          * when testing for an xid, we MUST test for equality only, since
    5705             :          * transactions are numbered in the order they start, not the order
    5706             :          * they complete. A higher numbered xid will complete before you about
    5707             :          * 50% of the time...
    5708             :          */
    5709           0 :         stopsHere = (recordXid == recoveryTargetXid);
    5710             :     }
    5711             : 
    5712           0 :     if (recoveryTarget == RECOVERY_TARGET_TIME &&
    5713           0 :         getRecordTimestamp(record, &recordXtime))
    5714             :     {
    5715             :         /*
    5716             :          * There can be many transactions that share the same commit time, so
    5717             :          * we stop after the last one, if we are inclusive, or stop at the
    5718             :          * first one if we are exclusive
    5719             :          */
    5720           0 :         if (recoveryTargetInclusive)
    5721           0 :             stopsHere = (recordXtime > recoveryTargetTime);
    5722             :         else
    5723           0 :             stopsHere = (recordXtime >= recoveryTargetTime);
    5724             :     }
    5725             : 
    5726           0 :     if (stopsHere)
    5727             :     {
    5728           0 :         recoveryStopAfter = false;
    5729           0 :         recoveryStopXid = recordXid;
    5730           0 :         recoveryStopTime = recordXtime;
    5731           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    5732           0 :         recoveryStopName[0] = '\0';
    5733             : 
    5734           0 :         if (isCommit)
    5735             :         {
    5736           0 :             ereport(LOG,
    5737             :                     (errmsg("recovery stopping before commit of transaction %u, time %s",
    5738             :                             recoveryStopXid,
    5739             :                             timestamptz_to_str(recoveryStopTime))));
    5740             :         }
    5741             :         else
    5742             :         {
    5743           0 :             ereport(LOG,
    5744             :                     (errmsg("recovery stopping before abort of transaction %u, time %s",
    5745             :                             recoveryStopXid,
    5746             :                             timestamptz_to_str(recoveryStopTime))));
    5747             :         }
    5748             :     }
    5749             : 
    5750           0 :     return stopsHere;
    5751             : }
    5752             : 
    5753             : /*
    5754             :  * Same as recoveryStopsBefore, but called after applying the record.
    5755             :  *
    5756             :  * We also track the timestamp of the latest applied COMMIT/ABORT
    5757             :  * record in XLogCtl->recoveryLastXTime.
    5758             :  */
    5759             : static bool
    5760           0 : recoveryStopsAfter(XLogReaderState *record)
    5761             : {
    5762             :     uint8       info;
    5763             :     uint8       xact_info;
    5764             :     uint8       rmid;
    5765             :     TimestampTz recordXtime;
    5766             : 
    5767           0 :     info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    5768           0 :     rmid = XLogRecGetRmid(record);
    5769             : 
    5770             :     /*
    5771             :      * There can be many restore points that share the same name; we stop at
    5772             :      * the first one.
    5773             :      */
    5774           0 :     if (recoveryTarget == RECOVERY_TARGET_NAME &&
    5775           0 :         rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
    5776             :     {
    5777             :         xl_restore_point *recordRestorePointData;
    5778             : 
    5779           0 :         recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
    5780             : 
    5781           0 :         if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
    5782             :         {
    5783           0 :             recoveryStopAfter = true;
    5784           0 :             recoveryStopXid = InvalidTransactionId;
    5785           0 :             recoveryStopLSN = InvalidXLogRecPtr;
    5786           0 :             (void) getRecordTimestamp(record, &recoveryStopTime);
    5787           0 :             strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
    5788             : 
    5789           0 :             ereport(LOG,
    5790             :                     (errmsg("recovery stopping at restore point \"%s\", time %s",
    5791             :                             recoveryStopName,
    5792             :                             timestamptz_to_str(recoveryStopTime))));
    5793           0 :             return true;
    5794             :         }
    5795             :     }
    5796             : 
    5797             :     /* Check if the target LSN has been reached */
    5798           0 :     if (recoveryTarget == RECOVERY_TARGET_LSN &&
    5799           0 :         recoveryTargetInclusive &&
    5800           0 :         record->ReadRecPtr >= recoveryTargetLSN)
    5801             :     {
    5802           0 :         recoveryStopAfter = true;
    5803           0 :         recoveryStopXid = InvalidTransactionId;
    5804           0 :         recoveryStopLSN = record->ReadRecPtr;
    5805           0 :         recoveryStopTime = 0;
    5806           0 :         recoveryStopName[0] = '\0';
    5807           0 :         ereport(LOG,
    5808             :                 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
    5809             :                         (uint32) (recoveryStopLSN >> 32),
    5810             :                         (uint32) recoveryStopLSN)));
    5811           0 :         return true;
    5812             :     }
    5813             : 
    5814           0 :     if (rmid != RM_XACT_ID)
    5815           0 :         return false;
    5816             : 
    5817           0 :     xact_info = info & XLOG_XACT_OPMASK;
    5818             : 
    5819           0 :     if (xact_info == XLOG_XACT_COMMIT ||
    5820           0 :         xact_info == XLOG_XACT_COMMIT_PREPARED ||
    5821           0 :         xact_info == XLOG_XACT_ABORT ||
    5822             :         xact_info == XLOG_XACT_ABORT_PREPARED)
    5823             :     {
    5824             :         TransactionId recordXid;
    5825             : 
    5826             :         /* Update the last applied transaction timestamp */
    5827           0 :         if (getRecordTimestamp(record, &recordXtime))
    5828           0 :             SetLatestXTime(recordXtime);
    5829             : 
    5830             :         /* Extract the XID of the committed/aborted transaction */
    5831           0 :         if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    5832             :         {
    5833           0 :             xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    5834             :             xl_xact_parsed_commit parsed;
    5835             : 
    5836           0 :             ParseCommitRecord(XLogRecGetInfo(record),
    5837             :                               xlrec,
    5838             :                               &parsed);
    5839           0 :             recordXid = parsed.twophase_xid;
    5840             :         }
    5841           0 :         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    5842             :         {
    5843           0 :             xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
    5844             :             xl_xact_parsed_abort parsed;
    5845             : 
    5846           0 :             ParseAbortRecord(XLogRecGetInfo(record),
    5847             :                              xlrec,
    5848             :                              &parsed);
    5849           0 :             recordXid = parsed.twophase_xid;
    5850             :         }
    5851             :         else
    5852           0 :             recordXid = XLogRecGetXid(record);
    5853             : 
    5854             :         /*
    5855             :          * There can be only one transaction end record with this exact
    5856             :          * transactionid
    5857             :          *
    5858             :          * when testing for an xid, we MUST test for equality only, since
    5859             :          * transactions are numbered in the order they start, not the order
    5860             :          * they complete. A higher numbered xid will complete before you about
    5861             :          * 50% of the time...
    5862             :          */
    5863           0 :         if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
    5864           0 :             recordXid == recoveryTargetXid)
    5865             :         {
    5866           0 :             recoveryStopAfter = true;
    5867           0 :             recoveryStopXid = recordXid;
    5868           0 :             recoveryStopTime = recordXtime;
    5869           0 :             recoveryStopLSN = InvalidXLogRecPtr;
    5870           0 :             recoveryStopName[0] = '\0';
    5871             : 
    5872           0 :             if (xact_info == XLOG_XACT_COMMIT ||
    5873             :                 xact_info == XLOG_XACT_COMMIT_PREPARED)
    5874             :             {
    5875           0 :                 ereport(LOG,
    5876             :                         (errmsg("recovery stopping after commit of transaction %u, time %s",
    5877             :                                 recoveryStopXid,
    5878             :                                 timestamptz_to_str(recoveryStopTime))));
    5879             :             }
    5880           0 :             else if (xact_info == XLOG_XACT_ABORT ||
    5881             :                      xact_info == XLOG_XACT_ABORT_PREPARED)
    5882             :             {
    5883           0 :                 ereport(LOG,
    5884             :                         (errmsg("recovery stopping after abort of transaction %u, time %s",
    5885             :                                 recoveryStopXid,
    5886             :                                 timestamptz_to_str(recoveryStopTime))));
    5887             :             }
    5888           0 :             return true;
    5889             :         }
    5890             :     }
    5891             : 
    5892             :     /* Check if we should stop as soon as reaching consistency */
    5893           0 :     if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    5894             :     {
    5895           0 :         ereport(LOG,
    5896             :                 (errmsg("recovery stopping after reaching consistency")));
    5897             : 
    5898           0 :         recoveryStopAfter = true;
    5899           0 :         recoveryStopXid = InvalidTransactionId;
    5900           0 :         recoveryStopTime = 0;
    5901           0 :         recoveryStopLSN = InvalidXLogRecPtr;
    5902           0 :         recoveryStopName[0] = '\0';
    5903           0 :         return true;
    5904             :     }
    5905             : 
    5906           0 :     return false;
    5907             : }
    5908             : 
    5909             : /*
    5910             :  * Wait until shared recoveryPause flag is cleared.
    5911             :  *
    5912             :  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
    5913             :  * Probably not worth the trouble though.  This state shouldn't be one that
    5914             :  * anyone cares about server power consumption in.
    5915             :  */
    5916             : static void
    5917           0 : recoveryPausesHere(void)
    5918             : {
    5919             :     /* Don't pause unless users can connect! */
    5920           0 :     if (!LocalHotStandbyActive)
    5921           0 :         return;
    5922             : 
    5923           0 :     ereport(LOG,
    5924             :             (errmsg("recovery has paused"),
    5925             :              errhint("Execute pg_wal_replay_resume() to continue.")));
    5926             : 
    5927           0 :     while (RecoveryIsPaused())
    5928             :     {
    5929           0 :         pg_usleep(1000000L);    /* 1000 ms */
    5930           0 :         HandleStartupProcInterrupts();
    5931             :     }
    5932             : }
    5933             : 
    5934             : bool
    5935           0 : RecoveryIsPaused(void)
    5936             : {
    5937             :     bool        recoveryPause;
    5938             : 
    5939           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    5940           0 :     recoveryPause = XLogCtl->recoveryPause;
    5941           0 :     SpinLockRelease(&XLogCtl->info_lck);
    5942             : 
    5943           0 :     return recoveryPause;
    5944             : }
    5945             : 
    5946             : void
    5947           0 : SetRecoveryPause(bool recoveryPause)
    5948             : {
    5949           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    5950           0 :     XLogCtl->recoveryPause = recoveryPause;
    5951           0 :     SpinLockRelease(&XLogCtl->info_lck);
    5952           0 : }
    5953             : 
    5954             : /*
    5955             :  * When recovery_min_apply_delay is set, we wait long enough to make sure
    5956             :  * commit records (COMMIT, COMMIT PREPARED) are applied at least that interval behind the master.
    5957             :  *
    5958             :  * Returns true if we entered the wait loop below, false on every early exit.
    5959             :  *
    5960             :  * Note that the delay is calculated between the WAL record log time and
    5961             :  * the current time on standby. We would prefer to keep track of when this
    5962             :  * standby received each WAL record, which would allow a more consistent
    5963             :  * approach and one not affected by time synchronisation issues, but that
    5964             :  * is significantly more effort and complexity for little actual gain in
    5965             :  * usability.
    5966             :  */
    5967             : static bool
    5968           0 : recoveryApplyDelay(XLogReaderState *record)
    5969             : {
    5970             :     uint8       xact_info;
    5971             :     TimestampTz xtime;
    5972             :     long        secs;
    5973             :     int         microsecs;
    5974             : 
    5975             :     /* nothing to do if no delay configured */
    5976           0 :     if (recovery_min_apply_delay <= 0)
    5977           0 :         return false;
    5978             : 
    5979             :     /* no delay is applied on a database not yet consistent */
    5980           0 :     if (!reachedConsistency)
    5981           0 :         return false;
    5982             : 
    5983             :     /*
    5984             :      * Is it a COMMIT record?
    5985             :      *
    5986             :      * We deliberately choose not to delay aborts since they have no effect on
    5987             :      * MVCC. We already allow replay of records that don't have a timestamp,
    5988             :      * so there is already opportunity for issues caused by early conflicts on
    5989             :      * standbys.
    5990             :      */
    5991           0 :     if (XLogRecGetRmid(record) != RM_XACT_ID)
    5992           0 :         return false;
    5993             : 
    5994           0 :     xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
    5995             : 
    5996           0 :     if (xact_info != XLOG_XACT_COMMIT &&
    5997             :         xact_info != XLOG_XACT_COMMIT_PREPARED)
    5998           0 :         return false;
    5999             : 
    6000           0 :     if (!getRecordTimestamp(record, &xtime))    /* no timestamp obtained for this record */
    6001           0 :         return false;
    6002             : 
    6003           0 :     recoveryDelayUntilTime =
    6004           0 :         TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);    /* record time + configured delay */
    6005             : 
    6006             :     /*
    6007             :      * Exit without arming the latch if it's already past time to apply this
    6008             :      * record
    6009             :      */
    6010           0 :     TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
    6011             :                         &secs, &microsecs);
    6012           0 :     if (secs <= 0 && microsecs <= 0)    /* nothing left to wait for */
    6013           0 :         return false;
    6014             : 
    6015             :     while (true)
    6016             :     {
    6017           0 :         ResetLatch(&XLogCtl->recoveryWakeupLatch);
    6018             : 
    6019             :         /* might change the trigger file's location */
    6020           0 :         HandleStartupProcInterrupts();
    6021             : 
    6022           0 :         if (CheckForStandbyTrigger())    /* stop delaying once the standby trigger is seen */
    6023           0 :             break;
    6024             : 
    6025             :         /*
    6026             :          * Wait for difference between GetCurrentTimestamp() and
    6027             :          * recoveryDelayUntilTime
    6028             :          */
    6029           0 :         TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
    6030             :                             &secs, &microsecs);
    6031             : 
    6032             :         /* NB: We're ignoring waits below min_apply_delay's resolution. */
    6033           0 :         if (secs <= 0 && microsecs / 1000 <= 0)    /* less than one millisecond remains */
    6034           0 :             break;    /* can happen on the first pass, so true may be returned without sleeping */
    6035             : 
    6036           0 :         elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
    6037             :              secs, microsecs / 1000);
    6038             : 
    6039           0 :         WaitLatch(&XLogCtl->recoveryWakeupLatch,
    6040             :                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
    6041           0 :                   secs * 1000L + microsecs / 1000,    /* remaining time converted to milliseconds */
    6042             :                   WAIT_EVENT_RECOVERY_APPLY_DELAY);
    6043           0 :     }
    6044           0 :     return true;    /* reached from either break above */
    6045             : }
    6046             : 
    6047             : /*
    6048             :  * Save timestamp of latest processed commit/abort record.
    6049             :  *
    6050             :  * We keep this in XLogCtl, not a simple static variable, so that it can be
    6051             :  * seen by processes other than the startup process.  Note in particular
    6052             :  * that CreateRestartPoint is executed in the checkpointer.
    6053             :  */
    6054             : static void
    6055           0 : SetLatestXTime(TimestampTz xtime)
    6056             : {
    6057           0 :     SpinLockAcquire(&XLogCtl->info_lck);    /* the store is done under info_lck */
    6058           0 :     XLogCtl->recoveryLastXTime = xtime;     /* read back by GetLatestXTime() */
    6059           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6060           0 : }
    6061             : 
    6062             : /*
    6063             :  * Fetch timestamp of latest processed commit/abort record, as stored by SetLatestXTime().
    6064             :  */
    6065             : TimestampTz
    6066           0 : GetLatestXTime(void)
    6067             : {
    6068             :     TimestampTz xtime;
    6069             : 
    6070           0 :     SpinLockAcquire(&XLogCtl->info_lck);    /* read under the same lock the setter uses */
    6071           0 :     xtime = XLogCtl->recoveryLastXTime;
    6072           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6073             : 
    6074           0 :     return xtime;       /* local copy taken while the lock was held */
    6075             : }
    6076             : 
    6077             : /*
    6078             :  * Save timestamp of the next chunk of WAL records to apply.
    6079             :  *
    6080             :  * We keep this in XLogCtl, not a simple static variable, so that it can be
    6081             :  * seen by all backends.
    6082             :  */
    6083             : static void
    6084           0 : SetCurrentChunkStartTime(TimestampTz xtime)
    6085             : {
    6086           0 :     SpinLockAcquire(&XLogCtl->info_lck);    /* the store is done under info_lck */
    6087           0 :     XLogCtl->currentChunkStartTime = xtime; /* read back by GetCurrentChunkReplayStartTime() */
    6088           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6089           0 : }
    6090             : 
    6091             : /*
    6092             :  * Fetch the chunk start time saved by SetCurrentChunkStartTime().
    6093             :  * Startup process maintains an accurate local copy in XLogReceiptTime
    6094             :  */
    6095             : TimestampTz
    6096           0 : GetCurrentChunkReplayStartTime(void)
    6097             : {
    6098             :     TimestampTz xtime;
    6099             : 
    6100           0 :     SpinLockAcquire(&XLogCtl->info_lck);    /* read under the same lock the setter uses */
    6101           0 :     xtime = XLogCtl->currentChunkStartTime;
    6102           0 :     SpinLockRelease(&XLogCtl->info_lck);
    6103             : 
    6104           0 :     return xtime;       /* local copy taken while the lock was held */
    6105             : }
    6106             : 
    6107             : /*
    6108             :  * Returns time of receipt of current chunk of XLOG data in *rtime, and in
    6109             :  * *fromStream whether it was received from streaming replication or from archives.
    6110             :  */
    6111             : void
    6112           0 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
    6113             : {
    6114             :     /*
    6115             :      * This must be executed in the startup process, since we don't export the
    6116             :      * relevant state to shared memory.
    6117             :      */
    6118           0 :     Assert(InRecovery);
    6119             : 
    6120           0 :     *rtime = XLogReceiptTime;       /* both outputs are written unconditionally */
    6121           0 :     *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);  /* false for every other source value */
    6122           0 : }
    6123             : 
    6124             : /*
    6125             :  * Report an ERROR if currValue < minValue; both may then be evaluated twice.
    6126             :  * The text field supplied is a parameter name and does not require translation.
    6127             :  */
    6128             : #define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
    6129             : do { \
    6130             :     if ((currValue) < (minValue)) \
    6131             :         ereport(ERROR, \
    6132             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
    6133             :                  errmsg("hot standby is not possible because " \
    6134             :                         "%s = %d is a lower setting than on the master server " \
    6135             :                         "(its value was %d)", \
    6136             :                         param_name, \
    6137             :                         currValue, \
    6138             :                         minValue))); \
    6139             : } while(0)
    6140             : 
    6141             : /*
    6142             :  * Check to see if required parameters are set high enough on this server
    6143             :  * for various aspects of recovery operation.
    6144             :  *
    6145             :  * Note that all the parameters which this function tests need to be
    6146             :  * listed in Administrator's Overview section in high-availability.sgml.
    6147             :  * If you change them, don't forget to update the list.
    6148             :  */
    6149             : static void
    6150           0 : CheckRequiredParameterValues(void)
    6151             : {
    6152             :     /*
    6153             :      * For archive recovery, the WAL must be generated with at least 'replica'
    6154             :      * wal_level.
    6155             :      */
    6156           0 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    6157             :     {
    6158           0 :         ereport(WARNING,    /* reported at WARNING level, unlike the ERROR reports below */
    6159             :                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
    6160             :                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
    6161             :     }
    6162             : 
    6163             :     /*
    6164             :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
    6165             :      * must have at least as many backend slots as the primary.
    6166             :      */
    6167           0 :     if (ArchiveRecoveryRequested && EnableHotStandby)
    6168             :     {
    6169           0 :         if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
    6170           0 :             ereport(ERROR,
    6171             :                     (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
    6172             :                      errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
    6173             : 
    6174             :         /* We ignore autovacuum_max_workers when we make this test. */
    6175           0 :         RecoveryRequiresIntParameter("max_connections",
    6176             :                                      MaxConnections,    /* this server's current setting */
    6177             :                                      ControlFile->MaxConnections);    /* minimum, taken from the control file */
    6178           0 :         RecoveryRequiresIntParameter("max_worker_processes",
    6179             :                                      max_worker_processes,
    6180             :                                      ControlFile->max_worker_processes);
    6181           0 :         RecoveryRequiresIntParameter("max_prepared_transactions",
    6182             :                                      max_prepared_xacts,
    6183             :                                      ControlFile->max_prepared_xacts);
    6184           0 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
    6185             :                                      max_locks_per_xact,
    6186             :                                      ControlFile->max_locks_per_xact);
    6187             :     }
    6188           0 : }
    6189             : 
    6190             : /*
    6191             :  * This must be called ONCE during postmaster or standalone-backend startup
    6192             :  */
    6193             : void
    6194           3 : StartupXLOG(void)
    6195             : {
    6196             :     XLogCtlInsert *Insert;
    6197             :     CheckPoint  checkPoint;
    6198             :     bool        wasShutdown;
    6199           3 :     bool        reachedStopPoint = false;
    6200           3 :     bool        haveBackupLabel = false;
    6201           3 :     bool        haveTblspcMap = false;
    6202             :     XLogRecPtr  RecPtr,
    6203             :                 checkPointLoc,
    6204             :                 EndOfLog;
    6205             :     TimeLineID  EndOfLogTLI;
    6206             :     TimeLineID  PrevTimeLineID;
    6207             :     XLogRecord *record;
    6208             :     TransactionId oldestActiveXID;
    6209           3 :     bool        backupEndRequired = false;
    6210           3 :     bool        backupFromStandby = false;
    6211             :     DBState     dbstate_at_startup;
    6212             :     XLogReaderState *xlogreader;
    6213             :     XLogPageReadPrivate private;
    6214           3 :     bool        fast_promoted = false;
    6215             :     struct stat st;
    6216             : 
    6217             :     /*
    6218             :      * Read control file and check XLOG status looks valid.
    6219             :      *
    6220             :      * Note: in most control paths, *ControlFile is already valid and we need
    6221             :      * not do ReadControlFile() here, but might as well do it to be sure.
    6222             :      */
    6223           3 :     ReadControlFile();
    6224             : 
    6225           6 :     if (ControlFile->state < DB_SHUTDOWNED ||
    6226           6 :         ControlFile->state > DB_IN_PRODUCTION ||
    6227           3 :         !XRecOffIsValid(ControlFile->checkPoint))
    6228           0 :         ereport(FATAL,
    6229             :                 (errmsg("control file contains invalid data")));
    6230             : 
    6231           3 :     if (ControlFile->state == DB_SHUTDOWNED)
    6232             :     {
    6233             :         /* This is the expected case, so don't be chatty in standalone mode */
    6234           3 :         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    6235             :                 (errmsg("database system was shut down at %s",
    6236             :                         str_time(ControlFile->time))));
    6237             :     }
    6238           0 :     else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
    6239           0 :         ereport(LOG,
    6240             :                 (errmsg("database system was shut down in recovery at %s",
    6241             :                         str_time(ControlFile->time))));
    6242           0 :     else if (ControlFile->state == DB_SHUTDOWNING)
    6243           0 :         ereport(LOG,
    6244             :                 (errmsg("database system shutdown was interrupted; last known up at %s",
    6245             :                         str_time(ControlFile->time))));
    6246           0 :     else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
    6247           0 :         ereport(LOG,
    6248             :                 (errmsg("database system was interrupted while in recovery at %s",
    6249             :                         str_time(ControlFile->time)),
    6250             :                  errhint("This probably means that some data is corrupted and"
    6251             :                          " you will have to use the last backup for recovery.")));
    6252           0 :     else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    6253           0 :         ereport(LOG,
    6254             :                 (errmsg("database system was interrupted while in recovery at log time %s",
    6255             :                         str_time(ControlFile->checkPointCopy.time)),
    6256             :                  errhint("If this has occurred more than once some data might be corrupted"
    6257             :                          " and you might need to choose an earlier recovery target.")));
    6258           0 :     else if (ControlFile->state == DB_IN_PRODUCTION)
    6259           0 :         ereport(LOG,
    6260             :                 (errmsg("database system was interrupted; last known up at %s",
    6261             :                         str_time(ControlFile->time))));
    6262             : 
    6263             :     /* This is just to allow attaching to startup process with a debugger */
    6264             : #ifdef XLOG_REPLAY_DELAY
    6265             :     if (ControlFile->state != DB_SHUTDOWNED)
    6266             :         pg_usleep(60000000L);
    6267             : #endif
    6268             : 
    6269             :     /*
    6270             :      * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
    6271             :      * someone has performed a copy for PITR, these directories may have been
    6272             :      * excluded and need to be re-created.
    6273             :      */
    6274           3 :     ValidateXLOGDirectoryStructure();
    6275             : 
    6276             :     /*
    6277             :      * If we previously crashed, there might be data which we had written,
    6278             :      * intending to fsync it, but which we had not actually fsync'd yet.
    6279             :      * Therefore, a power failure in the near future might cause earlier
    6280             :      * unflushed writes to be lost, even though more recent data written to
    6281             :      * disk from here on would be persisted.  To avoid that, fsync the entire
    6282             :      * data directory.
    6283             :      */
    6284           3 :     if (ControlFile->state != DB_SHUTDOWNED &&
    6285           0 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
    6286           0 :         SyncDataDirectory();
    6287             : 
    6288             :     /*
    6289             :      * Initialize on the assumption we want to recover to the latest timeline
    6290             :      * that's active according to pg_control.
    6291             :      */
    6292           6 :     if (ControlFile->minRecoveryPointTLI >
    6293           3 :         ControlFile->checkPointCopy.ThisTimeLineID)
    6294           0 :         recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
    6295             :     else
    6296           3 :         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
    6297             : 
    6298             :     /*
    6299             :      * Check for recovery control file, and if so set up state for offline
    6300             :      * recovery
    6301             :      */
    6302           3 :     readRecoveryCommandFile();
    6303             : 
    6304             :     /*
    6305             :      * Save archive_cleanup_command in shared memory so that other processes
    6306             :      * can see it.
    6307             :      */
    6308           3 :     strlcpy(XLogCtl->archiveCleanupCommand,
    6309           3 :             archiveCleanupCommand ? archiveCleanupCommand : "",
    6310             :             sizeof(XLogCtl->archiveCleanupCommand));
    6311             : 
    6312           3 :     if (ArchiveRecoveryRequested)
    6313             :     {
    6314           0 :         if (StandbyModeRequested)
    6315           0 :             ereport(LOG,
    6316             :                     (errmsg("entering standby mode")));
    6317           0 :         else if (recoveryTarget == RECOVERY_TARGET_XID)
    6318           0 :             ereport(LOG,
    6319             :                     (errmsg("starting point-in-time recovery to XID %u",
    6320             :                             recoveryTargetXid)));
    6321           0 :         else if (recoveryTarget == RECOVERY_TARGET_TIME)
    6322           0 :             ereport(LOG,
    6323             :                     (errmsg("starting point-in-time recovery to %s",
    6324             :                             timestamptz_to_str(recoveryTargetTime))));
    6325           0 :         else if (recoveryTarget == RECOVERY_TARGET_NAME)
    6326           0 :             ereport(LOG,
    6327             :                     (errmsg("starting point-in-time recovery to \"%s\"",
    6328             :                             recoveryTargetName)));
    6329           0 :         else if (recoveryTarget == RECOVERY_TARGET_LSN)
    6330           0 :             ereport(LOG,
    6331             :                     (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
    6332             :                             (uint32) (recoveryTargetLSN >> 32),
    6333             :                             (uint32) recoveryTargetLSN)));
    6334           0 :         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
    6335           0 :             ereport(LOG,
    6336             :                     (errmsg("starting point-in-time recovery to earliest consistent point")));
    6337             :         else
    6338           0 :             ereport(LOG,
    6339             :                     (errmsg("starting archive recovery")));
    6340             :     }
    6341             : 
    6342             :     /*
    6343             :      * Take ownership of the wakeup latch if we're going to sleep during
    6344             :      * recovery.
    6345             :      */
    6346           3 :     if (StandbyModeRequested)
    6347           0 :         OwnLatch(&XLogCtl->recoveryWakeupLatch);
    6348             : 
    6349             :     /* Set up XLOG reader facility */
    6350           3 :     MemSet(&private, 0, sizeof(XLogPageReadPrivate));
    6351           3 :     xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
    6352           3 :     if (!xlogreader)
    6353           0 :         ereport(ERROR,
    6354             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    6355             :                  errmsg("out of memory"),
    6356             :                  errdetail("Failed while allocating a WAL reading processor.")));
    6357           3 :     xlogreader->system_identifier = ControlFile->system_identifier;
    6358             : 
    6359             :     /*
    6360             :      * Allocate pages dedicated to WAL consistency checks, those had better be
    6361             :      * aligned.
    6362             :      */
    6363           3 :     replay_image_masked = (char *) palloc(BLCKSZ);
    6364           3 :     master_image_masked = (char *) palloc(BLCKSZ);
    6365             : 
    6366           3 :     if (read_backup_label(&checkPointLoc, &backupEndRequired,
    6367             :                           &backupFromStandby))
    6368             :     {
    6369           0 :         List       *tablespaces = NIL;
    6370             : 
    6371             :         /*
    6372             :          * Archive recovery was requested, and thanks to the backup label
    6373             :          * file, we know how far we need to replay to reach consistency. Enter
    6374             :          * archive recovery directly.
    6375             :          */
    6376           0 :         InArchiveRecovery = true;
    6377           0 :         if (StandbyModeRequested)
    6378           0 :             StandbyMode = true;
    6379             : 
    6380             :         /*
    6381             :          * When a backup_label file is present, we want to roll forward from
    6382             :          * the checkpoint it identifies, rather than using pg_control.
    6383             :          */
    6384           0 :         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
    6385           0 :         if (record != NULL)
    6386             :         {
    6387           0 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    6388           0 :             wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
    6389           0 :             ereport(DEBUG1,
    6390             :                     (errmsg("checkpoint record is at %X/%X",
    6391             :                             (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
    6392           0 :             InRecovery = true;  /* force recovery even if SHUTDOWNED */
    6393             : 
    6394             :             /*
    6395             :              * Make sure that REDO location exists. This may not be the case
    6396             :              * if there was a crash during an online backup, which left a
    6397             :              * backup_label around that references a WAL segment that's
    6398             :              * already been archived.
    6399             :              */
    6400           0 :             if (checkPoint.redo < checkPointLoc)
    6401             :             {
    6402           0 :                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
    6403           0 :                     ereport(FATAL,
    6404             :                             (errmsg("could not find redo location referenced by checkpoint record"),
    6405             :                              errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
    6406             :             }
    6407             :         }
    6408             :         else
    6409             :         {
    6410           0 :             ereport(FATAL,
    6411             :                     (errmsg("could not locate required checkpoint record"),
    6412             :                      errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
    6413             :             wasShutdown = false;    /* keep compiler quiet */
    6414             :         }
    6415             : 
    6416             :         /* read the tablespace_map file if present and create symlinks. */
    6417           0 :         if (read_tablespace_map(&tablespaces))
    6418             :         {
    6419             :             ListCell   *lc;
    6420             : 
    6421           0 :             foreach(lc, tablespaces)
    6422             :             {
    6423           0 :                 tablespaceinfo *ti = lfirst(lc);
    6424             :                 char       *linkloc;
    6425             : 
    6426           0 :                 linkloc = psprintf("pg_tblspc/%s", ti->oid);
    6427             : 
    6428             :                 /*
    6429             :                  * Remove the existing symlink if any and Create the symlink
    6430             :                  * under PGDATA.
    6431             :                  */
    6432           0 :                 remove_tablespace_symlink(linkloc);
    6433             : 
    6434           0 :                 if (symlink(ti->path, linkloc) < 0)
    6435           0 :                     ereport(ERROR,
    6436             :                             (errcode_for_file_access(),
    6437             :                              errmsg("could not create symbolic link \"%s\": %m",
    6438             :                                     linkloc)));
    6439             : 
    6440           0 :                 pfree(ti->oid);
    6441           0 :                 pfree(ti->path);
    6442           0 :                 pfree(ti);
    6443             :             }
    6444             : 
    6445             :             /* set flag to delete it later */
    6446           0 :             haveTblspcMap = true;
    6447             :         }
    6448             : 
    6449             :         /* set flag to delete it later */
    6450           0 :         haveBackupLabel = true;
    6451             :     }
    6452             :     else
    6453             :     {
    6454             :         /*
    6455             :          * If tablespace_map file is present without backup_label file, there
    6456             :          * is no use of such file.  There is no harm in retaining it, but it
    6457             :          * is better to get rid of the map file so that we don't have any
    6458             :          * redundant file in data directory and it will avoid any sort of
    6459             :          * confusion.  It seems prudent though to just rename the file out of
    6460             :          * the way rather than delete it completely, also we ignore any error
    6461             :          * that occurs in rename operation as even if map file is present
    6462             :          * without backup_label file, it is harmless.
    6463             :          */
    6464           3 :         if (stat(TABLESPACE_MAP, &st) == 0)
    6465             :         {
    6466           0 :             unlink(TABLESPACE_MAP_OLD);
    6467           0 :             if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
    6468           0 :                 ereport(LOG,
    6469             :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
    6470             :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
    6471             :                          errdetail("File \"%s\" was renamed to \"%s\".",
    6472             :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
    6473             :             else
    6474           0 :                 ereport(LOG,
    6475             :                         (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
    6476             :                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
    6477             :                          errdetail("Could not rename file \"%s\" to \"%s\": %m.",
    6478             :                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
    6479             :         }
    6480             : 
    6481             :         /*
    6482             :          * It's possible that archive recovery was requested, but we don't
    6483             :          * know how far we need to replay the WAL before we reach consistency.
    6484             :          * This can happen for example if a base backup is taken from a
    6485             :          * running server using an atomic filesystem snapshot, without calling
    6486             :          * pg_start/stop_backup. Or if you just kill a running master server
    6487             :          * and put it into archive recovery by creating a recovery.conf file.
    6488             :          *
    6489             :          * Our strategy in that case is to perform crash recovery first,
    6490             :          * replaying all the WAL present in pg_wal, and only enter archive
    6491             :          * recovery after that.
    6492             :          *
    6493             :          * But usually we already know how far we need to replay the WAL (up
    6494             :          * to minRecoveryPoint, up to backupEndPoint, or until we see an
    6495             :          * end-of-backup record), and we can enter archive recovery directly.
    6496             :          */
    6497           3 :         if (ArchiveRecoveryRequested &&
    6498           0 :             (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
    6499           0 :              ControlFile->backupEndRequired ||
    6500           0 :              ControlFile->backupEndPoint != InvalidXLogRecPtr ||
    6501           0 :              ControlFile->state == DB_SHUTDOWNED))
    6502             :         {
    6503           0 :             InArchiveRecovery = true;
    6504           0 :             if (StandbyModeRequested)
    6505           0 :                 StandbyMode = true;
    6506             :         }
    6507             : 
    6508             :         /*
    6509             :          * Get the last valid checkpoint record.  If the latest one according
    6510             :          * to pg_control is broken, try the next-to-last one.
    6511             :          */
    6512           3 :         checkPointLoc = ControlFile->checkPoint;
    6513           3 :         RedoStartLSN = ControlFile->checkPointCopy.redo;
    6514           3 :         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
    6515           3 :         if (record != NULL)
    6516             :         {
    6517           3 :             ereport(DEBUG1,
    6518             :                     (errmsg("checkpoint record is at %X/%X",
    6519             :                             (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
    6520             :         }
    6521           0 :         else if (StandbyMode)
    6522             :         {
    6523             :             /*
    6524             :              * The last valid checkpoint record required for a streaming
    6525             :              * recovery exists in neither the standby nor the primary.
    6526             :              */
    6527           0 :             ereport(PANIC,
    6528             :                     (errmsg("could not locate a valid checkpoint record")));
    6529             :         }
    6530             :         else
    6531             :         {
    6532           0 :             checkPointLoc = ControlFile->prevCheckPoint;
    6533           0 :             record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
    6534           0 :             if (record != NULL)
    6535             :             {
    6536           0 :                 ereport(LOG,
    6537             :                         (errmsg("using previous checkpoint record at %X/%X",
    6538             :                                 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
    6539           0 :                 InRecovery = true;  /* force recovery even if SHUTDOWNED */
    6540             :             }
    6541             :             else
    6542           0 :                 ereport(PANIC,
    6543             :                         (errmsg("could not locate a valid checkpoint record")));
    6544             :         }
    6545           3 :         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    6546           3 :         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
    6547             :     }
    6548             : 
    6549             :     /*
    6550             :      * Clear out any old relcache cache files.  This is *necessary* if we do
    6551             :      * any WAL replay, since that would probably result in the cache files
    6552             :      * being out of sync with database reality.  In theory we could leave them
    6553             :      * in place if the database had been cleanly shut down, but it seems
    6554             :      * safest to just remove them always and let them be rebuilt during the
    6555             :      * first backend startup.  These files need to be removed from all
    6556             :      * directories including pg_tblspc; however, the symlinks are created only
    6557             :      * after reading the tablespace_map file in case of archive recovery from
    6558             :      * a backup, so we need to clear old relcache files here, after creating
    6559             :      * the symlinks.
    6560             :      */
    6561           3 :     RelationCacheInitFileRemove();
    6562             : 
    6563             :     /*
    6564             :      * If the location of the checkpoint record is not on the expected
    6565             :      * timeline in the history of the requested timeline, we cannot proceed:
    6566             :      * the backup is not part of the history of the requested timeline.
    6567             :      */
    6568           3 :     Assert(expectedTLEs);       /* was initialized by reading checkpoint
    6569             :                                  * record */
    6570           6 :     if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
    6571           3 :         checkPoint.ThisTimeLineID)
    6572             :     {
    6573             :         XLogRecPtr  switchpoint;
    6574             : 
    6575             :         /*
    6576             :          * tliSwitchPoint will throw an error if the checkpoint's timeline is
    6577             :          * not in expectedTLEs at all.
    6578             :          */
    6579           0 :         switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
    6580           0 :         ereport(FATAL,
    6581             :                 (errmsg("requested timeline %u is not a child of this server's history",
    6582             :                         recoveryTargetTLI),
    6583             :                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
    6584             :                            (uint32) (ControlFile->checkPoint >> 32),
    6585             :                            (uint32) ControlFile->checkPoint,
    6586             :                            ControlFile->checkPointCopy.ThisTimeLineID,
    6587             :                            (uint32) (switchpoint >> 32),
    6588             :                            (uint32) switchpoint)));
    6589             :     }
    6590             : 
    6591             :     /*
    6592             :      * The min recovery point should be part of the requested timeline's
    6593             :      * history, too.
    6594             :      */
    6595           3 :     if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
    6596           0 :         tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
    6597           0 :         ControlFile->minRecoveryPointTLI)
    6598           0 :         ereport(FATAL,
    6599             :                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
    6600             :                         recoveryTargetTLI,
    6601             :                         (uint32) (ControlFile->minRecoveryPoint >> 32),
    6602             :                         (uint32) ControlFile->minRecoveryPoint,
    6603             :                         ControlFile->minRecoveryPointTLI)));
    6604             : 
    6605           3 :     LastRec = RecPtr = checkPointLoc;
    6606             : 
    6607           3 :     ereport(DEBUG1,
    6608             :             (errmsg_internal("redo record is at %X/%X; shutdown %s",
    6609             :                              (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
    6610             :                              wasShutdown ? "TRUE" : "FALSE")));
    6611           3 :     ereport(DEBUG1,
    6612             :             (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
    6613             :                              checkPoint.nextXidEpoch, checkPoint.nextXid,
    6614             :                              checkPoint.nextOid)));
    6615           3 :     ereport(DEBUG1,
    6616             :             (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
    6617             :                              checkPoint.nextMulti, checkPoint.nextMultiOffset)));
    6618           3 :     ereport(DEBUG1,
    6619             :             (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
    6620             :                              checkPoint.oldestXid, checkPoint.oldestXidDB)));
    6621           3 :     ereport(DEBUG1,
    6622             :             (errmsg_internal("oldest MultiXactId: %u, in database %u",
    6623             :                              checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
    6624           3 :     ereport(DEBUG1,
    6625             :             (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
    6626             :                              checkPoint.oldestCommitTsXid,
    6627             :                              checkPoint.newestCommitTsXid)));
    6628           3 :     if (!TransactionIdIsNormal(checkPoint.nextXid))
    6629           0 :         ereport(PANIC,
    6630             :                 (errmsg("invalid next transaction ID")));
    6631             : 
    6632             :     /* initialize shared memory variables from the checkpoint record */
    6633           3 :     ShmemVariableCache->nextXid = checkPoint.nextXid;
    6634           3 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
    6635           3 :     ShmemVariableCache->oidCount = 0;
    6636           3 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    6637           3 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    6638           3 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    6639           3 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    6640           3 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
    6641             :                      checkPoint.newestCommitTsXid);
    6642           3 :     XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    6643           3 :     XLogCtl->ckptXid = checkPoint.nextXid;
    6644             : 
    6645             :     /*
    6646             :      * Initialize replication slots, before there's a chance to remove
    6647             :      * required resources.
    6648             :      */
    6649           3 :     StartupReplicationSlots();
    6650             : 
    6651             :     /*
    6652             :      * Startup logical state, needs to be set up now so we have proper data
    6653             :      * during crash recovery.
    6654             :      */
    6655           3 :     StartupReorderBuffer();
    6656             : 
    6657             :     /*
    6658             :      * Startup MultiXact. We need to do this early to be able to replay
    6659             :      * truncations.
    6660             :      */
    6661           3 :     StartupMultiXact();
    6662             : 
    6663             :     /*
    6664             :      * Ditto commit timestamps.  In a standby, we do it if setting is enabled
    6665             :      * in ControlFile; in a master we base the decision on the GUC itself.
    6666             :      */
    6667           3 :     if (ArchiveRecoveryRequested ?
    6668           0 :         ControlFile->track_commit_timestamp : track_commit_timestamp)
    6669           0 :         StartupCommitTs();
    6670             : 
    6671             :     /*
    6672             :      * Recover knowledge about replay progress of known replication partners.
    6673             :      */
    6674           3 :     StartupReplicationOrigin();
    6675             : 
    6676             :     /*
    6677             :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
    6678             :      * control file. On recovery, all unlogged relations are blown away, so
    6679             :      * the unlogged LSN counter can be reset too.
    6680             :      */
    6681           3 :     if (ControlFile->state == DB_SHUTDOWNED)
    6682           3 :         XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
    6683             :     else
    6684           0 :         XLogCtl->unloggedLSN = 1;
    6685             : 
    6686             :     /*
    6687             :      * We must replay WAL entries using the same TimeLineID they were created
    6688             :      * under, so temporarily adopt the TLI indicated by the checkpoint (see
    6689             :      * also xlog_redo()).
    6690             :      */
    6691           3 :     ThisTimeLineID = checkPoint.ThisTimeLineID;
    6692             : 
    6693             :     /*
    6694             :      * Copy any missing timeline history files between 'now' and the recovery
    6695             :      * target timeline from archive to pg_wal. While we don't need those files
    6696             :      * ourselves - the history file of the recovery target timeline covers all
    6697             :      * the previous timelines in the history too - a cascading standby server
    6698             :      * might be interested in them. Or, if you archive the WAL from this
    6699             :      * server to a different archive than the master, it'd be good for all the
    6700             :      * history files to get archived there after failover, so that you can use
    6701             :      * one of the old timelines as a PITR target. Timeline history files are
    6702             :      * small, so it's better to copy them unnecessarily than not copy them and
    6703             :      * regret it later.
    6704             :      */
    6705           3 :     restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
    6706             : 
    6707             :     /*
    6708             :      * Before running in recovery, scan pg_twophase and fill in its status to
    6709             :      * be able to work on entries generated by redo.  Doing a scan before
    6710             :      * taking any recovery action has the merit of discarding any 2PC files
    6711             :      * that are newer than the first record to replay, avoiding any conflicts
    6712             :      * at replay.  This also avoids any subsequent scans when doing recovery
    6713             :      * of the on-disk two-phase data.
    6714             :      */
    6715           3 :     restoreTwoPhaseData();
    6716             : 
    6717           3 :     lastFullPageWrites = checkPoint.fullPageWrites;
    6718             : 
    6719           3 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    6720           3 :     doPageWrites = lastFullPageWrites;
    6721             : 
    6722           3 :     if (RecPtr < checkPoint.redo)
    6723           0 :         ereport(PANIC,
    6724             :                 (errmsg("invalid redo in checkpoint record")));
    6725             : 
    6726             :     /*
    6727             :      * Check whether we need to force recovery from WAL.  If it appears to
    6728             :      * have been a clean shutdown and we did not have a recovery.conf file,
    6729             :      * then assume no recovery needed.
    6730             :      */
    6731           3 :     if (checkPoint.redo < RecPtr)
    6732             :     {
    6733           0 :         if (wasShutdown)
    6734           0 :             ereport(PANIC,
    6735             :                     (errmsg("invalid redo record in shutdown checkpoint")));
    6736           0 :         InRecovery = true;
    6737             :     }
    6738           3 :     else if (ControlFile->state != DB_SHUTDOWNED)
    6739           0 :         InRecovery = true;
    6740           3 :     else if (ArchiveRecoveryRequested)
    6741             :     {
    6742             :         /* force recovery due to presence of recovery.conf */
    6743           0 :         InRecovery = true;
    6744             :     }
    6745             : 
    6746             :     /* REDO */
    6747           3 :     if (InRecovery)
    6748             :     {
    6749             :         int         rmid;
    6750             : 
    6751             :         /*
    6752             :          * Update pg_control to show that we are recovering and to show the
    6753             :          * selected checkpoint as the place we are starting from. We also mark
    6754             :          * pg_control with any minimum recovery stop point obtained from a
    6755             :          * backup history file.
    6756             :          */
    6757           0 :         dbstate_at_startup = ControlFile->state;
    6758           0 :         if (InArchiveRecovery)
    6759           0 :             ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    6760             :         else
    6761             :         {
    6762           0 :             ereport(LOG,
    6763             :                     (errmsg("database system was not properly shut down; "
    6764             :                             "automatic recovery in progress")));
    6765           0 :             if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
    6766           0 :                 ereport(LOG,
    6767             :                         (errmsg("crash recovery starts in timeline %u "
    6768             :                                 "and has target timeline %u",
    6769             :                                 ControlFile->checkPointCopy.ThisTimeLineID,
    6770             :                                 recoveryTargetTLI)));
    6771           0 :             ControlFile->state = DB_IN_CRASH_RECOVERY;
    6772             :         }
    6773           0 :         ControlFile->prevCheckPoint = ControlFile->checkPoint;
    6774           0 :         ControlFile->checkPoint = checkPointLoc;
    6775           0 :         ControlFile->checkPointCopy = checkPoint;
    6776           0 :         if (InArchiveRecovery)
    6777             :         {
    6778             :             /* initialize minRecoveryPoint if not set yet */
    6779           0 :             if (ControlFile->minRecoveryPoint < checkPoint.redo)
    6780             :             {
    6781           0 :                 ControlFile->minRecoveryPoint = checkPoint.redo;
    6782           0 :                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
    6783             :             }
    6784             :         }
    6785             : 
    6786             :         /*
    6787             :          * Set backupStartPoint if we're starting recovery from a base backup.
    6788             :          *
    6789             :          * Also set backupEndPoint and use minRecoveryPoint as the backup end
    6790             :          * location if we're starting recovery from a base backup which was
    6791             :          * taken from a standby. In this case, the database system status in
    6792             :          * pg_control must indicate that the database was already in recovery.
    6793             :          * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
    6794             :          * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
    6795             :          * before reaching this point; e.g. because restore_command or
    6796             :          * primary_conninfo were faulty.
    6797             :          *
    6798             :          * Any other state indicates that the backup somehow became corrupted
    6799             :          * and we can't sensibly continue with recovery.
    6800             :          */
    6801           0 :         if (haveBackupLabel)
    6802             :         {
    6803           0 :             ControlFile->backupStartPoint = checkPoint.redo;
    6804           0 :             ControlFile->backupEndRequired = backupEndRequired;
    6805             : 
    6806           0 :             if (backupFromStandby)
    6807             :             {
    6808           0 :                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
    6809             :                     dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
    6810           0 :                     ereport(FATAL,
    6811             :                             (errmsg("backup_label contains data inconsistent with control file"),
    6812             :                              errhint("This means that the backup is corrupted and you will "
    6813             :                                      "have to use another backup for recovery.")));
    6814           0 :                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
    6815             :             }
    6816             :         }
    6817           0 :         ControlFile->time = (pg_time_t) time(NULL);
    6818             :         /* No need to hold ControlFileLock yet, we aren't up far enough */
    6819           0 :         UpdateControlFile();
    6820             : 
    6821             :         /* initialize our local copy of minRecoveryPoint */
    6822           0 :         minRecoveryPoint = ControlFile->minRecoveryPoint;
    6823           0 :         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    6824             : 
    6825             :         /*
    6826             :          * Reset pgstat data, because it may be invalid after recovery.
    6827             :          */
    6828           0 :         pgstat_reset_all();
    6829             : 
    6830             :         /*
    6831             :          * If there was a backup label file, it's done its job and the info
    6832             :          * has now been propagated into pg_control.  We must get rid of the
    6833             :          * label file so that if we crash during recovery, we'll pick up at
    6834             :          * the latest recovery restartpoint instead of going all the way back
    6835             :          * to the backup start point.  It seems prudent though to just rename
    6836             :          * the file out of the way rather than delete it completely.
    6837             :          */
    6838           0 :         if (haveBackupLabel)
    6839             :         {
    6840           0 :             unlink(BACKUP_LABEL_OLD);
    6841           0 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
    6842             :         }
    6843             : 
    6844             :         /*
    6845             :          * If there was a tablespace_map file, it's done its job and the
    6846             :          * symlinks have been created.  We must get rid of the map file so
    6847             :          * that if we crash during recovery, we don't create symlinks again.
    6848             :          * It seems prudent though to just rename the file out of the way
    6849             :          * rather than delete it completely.
    6850             :          */
    6851           0 :         if (haveTblspcMap)
    6852             :         {
    6853           0 :             unlink(TABLESPACE_MAP_OLD);
    6854           0 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
    6855             :         }
    6856             : 
    6857             :         /* Check that the GUCs used to generate the WAL allow recovery */
    6858           0 :         CheckRequiredParameterValues();
    6859             : 
    6860             :         /*
    6861             :          * We're in recovery, so unlogged relations may be trashed and must be
    6862             :          * reset.  This should be done BEFORE allowing Hot Standby
    6863             :          * connections, so that read-only backends don't try to read whatever
    6864             :          * garbage is left over from before.
    6865             :          */
    6866           0 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
    6867             : 
    6868             :         /*
    6869             :          * Likewise, delete any saved transaction snapshot files that got left
    6870             :          * behind by crashed backends.
    6871             :          */
    6872           0 :         DeleteAllExportedSnapshotFiles();
    6873             : 
    6874             :         /*
    6875             :          * Initialize for Hot Standby, if enabled. We won't let backends in
    6876             :          * yet, not until we've reached the min recovery point specified in
    6877             :          * control file and we've established a recovery snapshot from a
    6878             :          * running-xacts WAL record.
    6879             :          */
    6880           0 :         if (ArchiveRecoveryRequested && EnableHotStandby)
    6881             :         {
    6882             :             TransactionId *xids;
    6883             :             int         nxids;
    6884             : 
    6885           0 :             ereport(DEBUG1,
    6886             :                     (errmsg("initializing for hot standby")));
    6887             : 
    6888           0 :             InitRecoveryTransactionEnvironment();
    6889             : 
    6890           0 :             if (wasShutdown)
    6891           0 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    6892             :             else
    6893           0 :                 oldestActiveXID = checkPoint.oldestActiveXid;
    6894           0 :             Assert(TransactionIdIsValid(oldestActiveXID));
    6895             : 
    6896             :             /* Tell procarray about the range of xids it has to deal with */
    6897           0 :             ProcArrayInitRecovery(ShmemVariableCache->nextXid);
    6898             : 
    6899             :             /*
    6900             :              * Startup commit log and subtrans only.  MultiXact and commit
    6901             :              * timestamp have already been started up and other SLRUs are not
    6902             :              * maintained during recovery and need not be started yet.
    6903             :              */
    6904           0 :             StartupCLOG();
    6905           0 :             StartupSUBTRANS(oldestActiveXID);
    6906             : 
    6907             :             /*
    6908             :              * If we're beginning at a shutdown checkpoint, we know that
    6909             :              * nothing was running on the master at this point. So fake-up an
    6910             :              * empty running-xacts record and use that here and now. Recover
    6911             :              * additional standby state for prepared transactions.
    6912             :              */
    6913           0 :             if (wasShutdown)
    6914             :             {
    6915             :                 RunningTransactionsData running;
    6916             :                 TransactionId latestCompletedXid;
    6917             : 
    6918             :                 /*
    6919             :                  * Construct a RunningTransactions snapshot representing a
    6920             :                  * shut down server, with only prepared transactions still
    6921             :                  * alive. We're never overflowed at this point because all
    6922             :                  * subxids are listed with their parent prepared transactions.
    6923             :                  */
    6924           0 :                 running.xcnt = nxids;
    6925           0 :                 running.subxcnt = 0;
    6926           0 :                 running.subxid_overflow = false;
    6927           0 :                 running.nextXid = checkPoint.nextXid;
    6928           0 :                 running.oldestRunningXid = oldestActiveXID;
    6929           0 :                 latestCompletedXid = checkPoint.nextXid;
    6930           0 :                 TransactionIdRetreat(latestCompletedXid);
    6931           0 :                 Assert(TransactionIdIsNormal(latestCompletedXid));
    6932           0 :                 running.latestCompletedXid = latestCompletedXid;
    6933           0 :                 running.xids = xids;
    6934             : 
    6935           0 :                 ProcArrayApplyRecoveryInfo(&running);
    6936             : 
    6937           0 :                 StandbyRecoverPreparedTransactions();
    6938             :             }
    6939             :         }
    6940             : 
    6941             :         /* Initialize resource managers */
    6942           0 :         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    6943             :         {
    6944           0 :             if (RmgrTable[rmid].rm_startup != NULL)
    6945           0 :                 RmgrTable[rmid].rm_startup();
    6946             :         }
    6947             : 
    6948             :         /*
    6949             :          * Initialize shared variables for tracking progress of WAL replay, as
    6950             :          * if we had just replayed the record before the REDO location (or the
    6951             :          * checkpoint record itself, if it's a shutdown checkpoint).
    6952             :          */
    6953           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    6954           0 :         if (checkPoint.redo < RecPtr)
    6955           0 :             XLogCtl->replayEndRecPtr = checkPoint.redo;
    6956             :         else
    6957           0 :             XLogCtl->replayEndRecPtr = EndRecPtr;
    6958           0 :         XLogCtl->replayEndTLI = ThisTimeLineID;
    6959           0 :         XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
    6960           0 :         XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
    6961           0 :         XLogCtl->recoveryLastXTime = 0;
    6962           0 :         XLogCtl->currentChunkStartTime = 0;
    6963           0 :         XLogCtl->recoveryPause = false;
    6964           0 :         SpinLockRelease(&XLogCtl->info_lck);
    6965             : 
    6966             :         /* Also ensure XLogReceiptTime has a sane value */
    6967           0 :         XLogReceiptTime = GetCurrentTimestamp();
    6968             : 
    6969             :         /*
    6970             :          * Let postmaster know we've started redo now, so that it can launch
    6971             :          * checkpointer to perform restartpoints.  We don't bother during
    6972             :          * crash recovery as restartpoints can only be performed during
    6973             :          * archive recovery.  And we'd like to keep crash recovery simple, to
    6974             :          * avoid introducing bugs that could affect you when recovering after
    6975             :          * crash.
    6976             :          *
    6977             :          * After this point, we can no longer assume that we're the only
    6978             :          * process in addition to postmaster!  Also, fsync requests are
    6979             :          * subsequently to be handled by the checkpointer, not locally.
    6980             :          */
    6981           0 :         if (ArchiveRecoveryRequested && IsUnderPostmaster)
    6982             :         {
    6983           0 :             PublishStartupProcessInformation();
    6984           0 :             SetForwardFsyncRequests();
    6985           0 :             SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
    6986           0 :             bgwriterLaunched = true;
    6987             :         }
    6988             : 
    6989             :         /*
    6990             :          * Allow read-only connections immediately if we're consistent
    6991             :          * already.
    6992             :          */
    6993           0 :         CheckRecoveryConsistency();
    6994             : 
    6995             :         /*
    6996             :          * Find the first record that logically follows the checkpoint --- it
    6997             :          * might physically precede it, though.
    6998             :          */
    6999           0 :         if (checkPoint.redo < RecPtr)
    7000             :         {
    7001             :             /* back up to find the record */
    7002           0 :             record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
    7003             :         }
    7004             :         else
    7005             :         {
    7006             :             /* just have to read next record after CheckPoint */
    7007           0 :             record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
    7008             :         }
    7009             : 
    7010           0 :         if (record != NULL)
    7011             :         {
    7012             :             ErrorContextCallback errcallback;
    7013             :             TimestampTz xtime;
    7014             : 
    7015           0 :             InRedo = true;
    7016             : 
    7017           0 :             ereport(LOG,
    7018             :                     (errmsg("redo starts at %X/%X",
    7019             :                             (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
    7020             : 
    7021             :             /*
    7022             :              * main redo apply loop
    7023             :              */
    7024             :             do
    7025             :             {
    7026           0 :                 bool        switchedTLI = false;
    7027             : 
    7028             : #ifdef WAL_DEBUG
    7029             :                 if (XLOG_DEBUG ||
    7030             :                     (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
    7031             :                     (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
    7032             :                 {
    7033             :                     StringInfoData buf;
    7034             : 
    7035             :                     initStringInfo(&buf);
    7036             :                     appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
    7037             :                                      (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
    7038             :                                      (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
    7039             :                     xlog_outrec(&buf, xlogreader);
    7040             :                     appendStringInfoString(&buf, " - ");
    7041             :                     xlog_outdesc(&buf, xlogreader);
    7042             :                     elog(LOG, "%s", buf.data);
    7043             :                     pfree(buf.data);
    7044             :                 }
    7045             : #endif
    7046             : 
    7047             :                 /* Handle interrupt signals of startup process */
    7048           0 :                 HandleStartupProcInterrupts();
    7049             : 
    7050             :                 /*
    7051             :                  * Pause WAL replay, if requested by a hot-standby session via
    7052             :                  * SetRecoveryPause().
    7053             :                  *
    7054             :                  * Note that we intentionally don't take the info_lck spinlock
    7055             :                  * here.  We might therefore read a slightly stale value of
    7056             :                  * the recoveryPause flag, but it can't be very stale (no
    7057             :                  * worse than the last spinlock we did acquire).  Since a
    7058             :                  * pause request is a pretty asynchronous thing anyway,
    7059             :                  * possibly responding to it one WAL record later than we
    7060             :                  * otherwise would is a minor issue, so it doesn't seem worth
    7061             :                  * adding another spinlock cycle to prevent that.
    7062             :                  */
    7063           0 :                 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
    7064           0 :                     recoveryPausesHere();
    7065             : 
    7066             :                 /*
    7067             :                  * Have we reached our recovery target?
    7068             :                  */
    7069           0 :                 if (recoveryStopsBefore(xlogreader))
    7070             :                 {
    7071           0 :                     reachedStopPoint = true;    /* see below */
    7072           0 :                     break;
    7073             :                 }
    7074             : 
    7075             :                 /*
    7076             :                  * If we've been asked to lag the master, wait on latch until
    7077             :                  * enough time has passed.
    7078             :                  */
    7079           0 :                 if (recoveryApplyDelay(xlogreader))
    7080             :                 {
    7081             :                     /*
    7082             :                      * We test for paused recovery again here. If user sets
    7083             :                      * delayed apply, it may be because they expect to pause
    7084             :                      * recovery in case of problems, so we must test again
    7085             :                      * here otherwise pausing during the delay-wait wouldn't
    7086             :                      * work.
    7087             :                      */
    7088           0 :                     if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
    7089           0 :                         recoveryPausesHere();
    7090             :                 }
    7091             : 
    7092             :                 /* Setup error traceback support for ereport() */
    7093           0 :                 errcallback.callback = rm_redo_error_callback;
    7094           0 :                 errcallback.arg = (void *) xlogreader;
    7095           0 :                 errcallback.previous = error_context_stack;
    7096           0 :                 error_context_stack = &errcallback;
    7097             : 
    7098             :                 /*
    7099             :                  * ShmemVariableCache->nextXid must be beyond record's xid.
    7100             :                  *
    7101             :                  * We don't expect anyone else to modify nextXid, hence we
    7102             :                  * don't need to hold a lock while examining it.  We still
    7103             :                  * acquire the lock to modify it, though.
    7104             :                  */
    7105           0 :                 if (TransactionIdFollowsOrEquals(record->xl_xid,
    7106           0 :                                                  ShmemVariableCache->nextXid))
    7107             :                 {
    7108           0 :                     LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    7109           0 :                     ShmemVariableCache->nextXid = record->xl_xid;
    7110           0 :                     TransactionIdAdvance(ShmemVariableCache->nextXid);
    7111           0 :                     LWLockRelease(XidGenLock);
    7112             :                 }
    7113             : 
    7114             :                 /*
    7115             :                  * Before replaying this record, check if this record causes
    7116             :                  * the current timeline to change. The record is already
    7117             :                  * considered to be part of the new timeline, so we update
    7118             :                  * ThisTimeLineID before replaying it. That's important so
    7119             :                  * that replayEndTLI, which is recorded as the minimum
    7120             :                  * recovery point's TLI if recovery stops after this record,
    7121             :                  * is set correctly.
    7122             :                  */
    7123           0 :                 if (record->xl_rmid == RM_XLOG_ID)
    7124             :                 {
    7125           0 :                     TimeLineID  newTLI = ThisTimeLineID;
    7126           0 :                     TimeLineID  prevTLI = ThisTimeLineID;
    7127           0 :                     uint8       info = record->xl_info & ~XLR_INFO_MASK;
    7128             : 
    7129           0 :                     if (info == XLOG_CHECKPOINT_SHUTDOWN)
    7130             :                     {
    7131             :                         CheckPoint  checkPoint;
    7132             : 
    7133           0 :                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
    7134           0 :                         newTLI = checkPoint.ThisTimeLineID;
    7135           0 :                         prevTLI = checkPoint.PrevTimeLineID;
    7136             :                     }
    7137           0 :                     else if (info == XLOG_END_OF_RECOVERY)
    7138             :                     {
    7139             :                         xl_end_of_recovery xlrec;
    7140             : 
    7141           0 :                         memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
    7142           0 :                         newTLI = xlrec.ThisTimeLineID;
    7143           0 :                         prevTLI = xlrec.PrevTimeLineID;
    7144             :                     }
    7145             : 
    7146           0 :                     if (newTLI != ThisTimeLineID)
    7147             :                     {
    7148             :                         /* Check that it's OK to switch to this TLI */
    7149           0 :                         checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
    7150             : 
    7151             :                         /* Following WAL records should be run with new TLI */
    7152           0 :                         ThisTimeLineID = newTLI;
    7153           0 :                         switchedTLI = true;
    7154             :                     }
    7155             :                 }
    7156             : 
    7157             :                 /*
    7158             :                  * Update shared replayEndRecPtr before replaying this record,
    7159             :                  * so that XLogFlush will update minRecoveryPoint correctly.
    7160             :                  */
    7161           0 :                 SpinLockAcquire(&XLogCtl->info_lck);
    7162           0 :                 XLogCtl->replayEndRecPtr = EndRecPtr;
    7163           0 :                 XLogCtl->replayEndTLI = ThisTimeLineID;
    7164           0 :                 SpinLockRelease(&XLogCtl->info_lck);
    7165             : 
    7166             :                 /*
    7167             :                  * If we are attempting to enter Hot Standby mode, process
    7168             :                  * XIDs we see
    7169             :                  */
    7170           0 :                 if (standbyState >= STANDBY_INITIALIZED &&
    7171           0 :                     TransactionIdIsValid(record->xl_xid))
    7172           0 :                     RecordKnownAssignedTransactionIds(record->xl_xid);
    7173             : 
    7174             :                 /* Now apply the WAL record itself */
    7175           0 :                 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
    7176             : 
    7177             :                 /*
    7178             :                  * After redo, check whether the backup pages associated with
    7179             :                  * the WAL record are consistent with the existing pages. This
    7180             :                  * check is done only if consistency check is enabled for this
    7181             :                  * record.
    7182             :                  */
    7183           0 :                 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
    7184           0 :                     checkXLogConsistency(xlogreader);
    7185             : 
    7186             :                 /* Pop the error context stack */
    7187           0 :                 error_context_stack = errcallback.previous;
    7188             : 
    7189             :                 /*
    7190             :                  * Update lastReplayedEndRecPtr after this record has been
    7191             :                  * successfully replayed.
    7192             :                  */
    7193           0 :                 SpinLockAcquire(&XLogCtl->info_lck);
    7194           0 :                 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
    7195           0 :                 XLogCtl->lastReplayedTLI = ThisTimeLineID;
    7196           0 :                 SpinLockRelease(&XLogCtl->info_lck);
    7197             : 
    7198             :                 /*
    7199             :                  * If rm_redo called XLogRequestWalReceiverReply, then we wake
    7200             :                  * up the receiver so that it notices the updated
    7201             :                  * lastReplayedEndRecPtr and sends a reply to the master.
    7202             :                  */
    7203           0 :                 if (doRequestWalReceiverReply)
    7204             :                 {
    7205           0 :                     doRequestWalReceiverReply = false;
    7206           0 :                     WalRcvForceReply();
    7207             :                 }
    7208             : 
    7209             :                 /* Remember this record as the last-applied one */
    7210           0 :                 LastRec = ReadRecPtr;
    7211             : 
    7212             :                 /* Allow read-only connections if we're consistent now */
    7213           0 :                 CheckRecoveryConsistency();
    7214             : 
    7215             :                 /* Is this a timeline switch? */
    7216           0 :                 if (switchedTLI)
    7217             :                 {
    7218             :                     /*
    7219             :                      * Before we continue on the new timeline, clean up any
    7220             :                      * (possibly bogus) future WAL segments on the old
    7221             :                      * timeline.
    7222             :                      */
    7223           0 :                     RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
    7224             : 
    7225             :                     /*
    7226             :                      * Wake up any walsenders to notice that we are on a new
    7227             :                      * timeline.
    7228             :                      */
    7229           0 :                     if (switchedTLI && AllowCascadeReplication())
    7230           0 :                         WalSndWakeup();
    7231             :                 }
    7232             : 
    7233             :                 /* Exit loop if we reached inclusive recovery target */
    7234           0 :                 if (recoveryStopsAfter(xlogreader))
    7235             :                 {
    7236           0 :                     reachedStopPoint = true;
    7237           0 :                     break;
    7238             :                 }
    7239             : 
    7240             :                 /* Else, try to fetch the next WAL record */
    7241           0 :                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
    7242           0 :             } while (record != NULL);
    7243             : 
    7244             :             /*
    7245             :              * end of main redo apply loop
    7246             :              */
    7247             : 
    7248           0 :             if (reachedStopPoint)
    7249             :             {
    7250           0 :                 if (!reachedConsistency)
    7251           0 :                     ereport(FATAL,
    7252             :                             (errmsg("requested recovery stop point is before consistent recovery point")));
    7253             : 
    7254             :                 /*
    7255             :                  * This is the last point where we can restart recovery with a
    7256             :                  * new recovery target, if we shut down and begin again. After
    7257             :                  * this, Resource Managers may choose to do permanent
    7258             :                  * corrective actions at end of recovery.
    7259             :                  */
    7260           0 :                 switch (recoveryTargetAction)
    7261             :                 {
    7262             :                     case RECOVERY_TARGET_ACTION_SHUTDOWN:
    7263             : 
    7264             :                         /*
    7265             :                          * exit with special return code to request shutdown
    7266             :                          * of postmaster.  Log messages issued from
    7267             :                          * postmaster.
    7268             :                          */
    7269           0 :                         proc_exit(3);
    7270             : 
    7271             :                     case RECOVERY_TARGET_ACTION_PAUSE:
    7272           0 :                         SetRecoveryPause(true);
    7273           0 :                         recoveryPausesHere();
    7274             : 
    7275             :                         /* drop into promote */
    7276             : 
    7277             :                     case RECOVERY_TARGET_ACTION_PROMOTE:
    7278           0 :                         break;
    7279             :                 }
    7280             :             }
    7281             : 
    7282             :             /* Allow resource managers to do any required cleanup. */
    7283           0 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    7284             :             {
    7285           0 :                 if (RmgrTable[rmid].rm_cleanup != NULL)
    7286           0 :                     RmgrTable[rmid].rm_cleanup();
    7287             :             }
    7288             : 
    7289           0 :             ereport(LOG,
    7290             :                     (errmsg("redo done at %X/%X",
    7291             :                             (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
    7292           0 :             xtime = GetLatestXTime();
    7293           0 :             if (xtime)
    7294           0 :                 ereport(LOG,
    7295             :                         (errmsg("last completed transaction was at log time %s",
    7296             :                                 timestamptz_to_str(xtime))));
    7297             : 
    7298           0 :             InRedo = false;
    7299             :         }
    7300             :         else
    7301             :         {
    7302             :             /* there are no WAL records following the checkpoint */
    7303           0 :             ereport(LOG,
    7304             :                     (errmsg("redo is not required")));
    7305             :         }
    7306             :     }
    7307             : 
    7308             :     /*
    7309             :      * Kill WAL receiver, if it's still running, before we continue to write
    7310             :      * the startup checkpoint record. It would overwrite the checkpoint and
    7311             :      * subsequent records if it's still alive when we start writing WAL.
    7312             :      */
    7313           3 :     ShutdownWalRcv();
    7314             : 
    7315             :     /*
    7316             :      * Reset unlogged relations to the contents of their INIT fork. This is
    7317             :      * done AFTER recovery is complete so as to include any unlogged relations
    7318             :      * created during recovery, but BEFORE recovery is marked as having
    7319             :      * completed successfully. Otherwise we'd not retry if any of the post
    7320             :      * end-of-recovery steps fail.
    7321             :      */
    7322           3 :     if (InRecovery)
    7323           0 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
    7324             : 
    7325             :     /*
    7326             :      * We don't need the latch anymore. It's not strictly necessary to disown
    7327             :      * it, but let's do it for the sake of tidiness.
    7328             :      */
    7329           3 :     if (StandbyModeRequested)
    7330           0 :         DisownLatch(&XLogCtl->recoveryWakeupLatch);
    7331             : 
    7332             :     /*
    7333             :      * We are now done reading the xlog from stream. Turn off streaming
    7334             :      * recovery to force fetching the files (which would be required at end of
    7335             :      * recovery, e.g., timeline history file) from archive or pg_wal.
    7336             :      */
    7337           3 :     StandbyMode = false;
    7338             : 
    7339             :     /*
    7340             :      * Re-fetch the last valid or last applied record, so we can identify the
    7341             :      * exact endpoint of what we consider the valid portion of WAL.
    7342             :      */
    7343           3 :     record = ReadRecord(xlogreader, LastRec, PANIC, false);
    7344           3 :     EndOfLog = EndRecPtr;
    7345             : 
    7346             :     /*
    7347             :      * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
    7348             :      * the end-of-log. It could be different from the timeline that EndOfLog
    7349             :      * nominally belongs to, if there was a timeline switch in that segment,
    7350             :      * and we were reading the old WAL from a segment belonging to a higher
    7351             :      * timeline.
    7352             :      */
    7353           3 :     EndOfLogTLI = xlogreader->readPageTLI;
    7354             : 
    7355             :     /*
    7356             :      * Complain if we did not roll forward far enough to render the backup
    7357             :      * dump consistent.  Note: it is indeed okay to look at the local variable
    7358             :      * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
    7359             :      * be further ahead --- ControlFile->minRecoveryPoint cannot have been
    7360             :      * advanced beyond the WAL we processed.
    7361             :      */
    7362           3 :     if (InRecovery &&
    7363           0 :         (EndOfLog < minRecoveryPoint ||
    7364           0 :          !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
    7365             :     {
    7366             :         /*
    7367             :          * Ran off end of WAL before reaching end-of-backup WAL record, or
    7368             :          * minRecoveryPoint. That's usually a bad sign, indicating that you
    7369             :          * tried to recover from an online backup but never called
    7370             :          * pg_stop_backup(), or you didn't archive all the WAL up to that
    7371             :          * point. However, this also happens in crash recovery, if the system
    7372             :          * crashes while an online backup is in progress. We must not treat
    7373             :          * that as an error, or the database will refuse to start up.
    7374             :          */
    7375           0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
    7376             :         {
    7377           0 :             if (ControlFile->backupEndRequired)
    7378           0 :                 ereport(FATAL,
    7379             :                         (errmsg("WAL ends before end of online backup"),
    7380             :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
    7381           0 :             else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
    7382           0 :                 ereport(FATAL,
    7383             :                         (errmsg("WAL ends before end of online backup"),
    7384             :                          errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
    7385             :             else
    7386           0 :                 ereport(FATAL,
    7387             :                         (errmsg("WAL ends before consistent recovery point")));
    7388             :         }
    7389             :     }
    7390             : 
    7391             :     /*
    7392             :      * Consider whether we need to assign a new timeline ID.
    7393             :      *
    7394             :      * If we are doing an archive recovery, we always assign a new ID.  This
    7395             :      * handles a couple of issues.  If we stopped short of the end of WAL
    7396             :      * during recovery, then we are clearly generating a new timeline and must
    7397             :      * assign it a unique new ID.  Even if we ran to the end, modifying the
    7398             :      * current last segment is problematic because it may result in trying to
    7399             :      * overwrite an already-archived copy of that segment, and we encourage
    7400             :      * DBAs to make their archive_commands reject that.  We can dodge the
    7401             :      * problem by making the new active segment have a new timeline ID.
    7402             :      *
    7403             :      * In a normal crash recovery, we can just extend the timeline we were in.
    7404             :      */
    7405           3 :     PrevTimeLineID = ThisTimeLineID;
    7406           3 :     if (ArchiveRecoveryRequested)
    7407             :     {
    7408             :         char        reason[200];
    7409             : 
    7410           0 :         Assert(InArchiveRecovery);
    7411             : 
    7412           0 :         ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
    7413           0 :         ereport(LOG,
    7414             :                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
    7415             : 
    7416             :         /*
    7417             :          * Create a comment for the history file to explain why and where
    7418             :          * timeline changed.
    7419             :          */
    7420           0 :         if (recoveryTarget == RECOVERY_TARGET_XID)
    7421           0 :             snprintf(reason, sizeof(reason),
    7422             :                      "%s transaction %u",
    7423           0 :                      recoveryStopAfter ? "after" : "before",
    7424             :                      recoveryStopXid);
    7425           0 :         else if (recoveryTarget == RECOVERY_TARGET_TIME)
    7426           0 :             snprintf(reason, sizeof(reason),
    7427             :                      "%s %s\n",
    7428           0 :                      recoveryStopAfter ? "after" : "before",
    7429             :                      timestamptz_to_str(recoveryStopTime));
    7430           0 :         else if (recoveryTarget == RECOVERY_TARGET_LSN)
    7431           0 :             snprintf(reason, sizeof(reason),
    7432             :                      "%s LSN %X/%X\n",
    7433           0 :                      recoveryStopAfter ? "after" : "before",
    7434           0 :                      (uint32) (recoveryStopLSN >> 32),
    7435             :                      (uint32) recoveryStopLSN);
    7436           0 :         else if (recoveryTarget == RECOVERY_TARGET_NAME)
    7437           0 :             snprintf(reason, sizeof(reason),
    7438             :                      "at restore point \"%s\"",
    7439             :                      recoveryStopName);
    7440           0 :         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
    7441           0 :             snprintf(reason, sizeof(reason), "reached consistency");
    7442             :         else
    7443           0 :             snprintf(reason, sizeof(reason), "no recovery target specified");
    7444             : 
    7445           0 :         writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
    7446             :                              EndRecPtr, reason);
    7447             :     }
    7448             : 
    7449             :     /* Save the selected TimeLineID in shared memory, too */
    7450           3 :     XLogCtl->ThisTimeLineID = ThisTimeLineID;
    7451           3 :     XLogCtl->PrevTimeLineID = PrevTimeLineID;
    7452             : 
    7453             :     /*
    7454             :      * We are now done reading the old WAL.  Turn off archive fetching if it
    7455             :      * was active, and make a writable copy of the last WAL segment. (Note
    7456             :      * that we also have a copy of the last block of the old WAL in readBuf;
    7457             :      * we will use that below.)
    7458             :      */
    7459           3 :     if (ArchiveRecoveryRequested)
    7460           0 :         exitArchiveRecovery(EndOfLogTLI, EndOfLog);
    7461             : 
    7462             :     /*
    7463             :      * Prepare to write WAL starting at EndOfLog location, and init xlog
    7464             :      * buffer cache using the block containing the last record from the
    7465             :      * previous incarnation.
    7466             :      */
    7467           3 :     Insert = &XLogCtl->Insert;
    7468           3 :     Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
    7469           3 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
    7470             : 
    7471             :     /*
    7472             :      * Tricky point here: readBuf contains the *last* block that the LastRec
    7473             :      * record spans, not the one it starts in.  The last block is indeed the
    7474             :      * one we want to use.
    7475             :      */
    7476           3 :     if (EndOfLog % XLOG_BLCKSZ != 0)
    7477             :     {
    7478             :         char       *page;
    7479             :         int         len;
    7480             :         int         firstIdx;
    7481             :         XLogRecPtr  pageBeginPtr;
    7482             : 
    7483           3 :         pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
    7484           3 :         Assert(readOff == pageBeginPtr % XLogSegSize);
    7485             : 
    7486           3 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
    7487             : 
    7488             :         /* Copy the valid part of the last block, and zero the rest */
    7489           3 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
    7490           3 :         len = EndOfLog % XLOG_BLCKSZ;
    7491           3 :         memcpy(page, xlogreader->readBuf, len);
    7492           3 :         memset(page + len, 0, XLOG_BLCKSZ - len);
    7493             : 
    7494           3 :         XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
    7495           3 :         XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
    7496             :     }
    7497             :     else
    7498             :     {
    7499             :         /*
    7500             :          * There is no partial block to copy. Just set InitializedUpTo, and
    7501             :          * let the first attempt to insert a log record initialize the next
    7502             :          * buffer.
    7503             :          */
    7504           0 :         XLogCtl->InitializedUpTo = EndOfLog;
    7505             :     }
    7506             : 
    7507           3 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    7508             : 
    7509           3 :     XLogCtl->LogwrtResult = LogwrtResult;
    7510             : 
    7511           3 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
    7512           3 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
    7513             : 
    7514             :     /* Pre-scan prepared transactions to find out the range of XIDs present */
    7515           3 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
    7516             : 
    7517             :     /*
    7518             :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
    7519             :      * record before resource manager writes cleanup WAL records or checkpoint
    7520             :      * record is written.
    7521             :      */
    7522           3 :     Insert->fullPageWrites = lastFullPageWrites;
    7523           3 :     LocalSetXLogInsertAllowed();
    7524           3 :     UpdateFullPageWrites();
    7525           3 :     LocalXLogInsertAllowed = -1;
    7526             : 
    7527           3 :     if (InRecovery)
    7528             :     {
    7529             :         /*
    7530             :          * Perform a checkpoint to update all our recovery activity to disk.
    7531             :          *
    7532             :          * Note that we write a shutdown checkpoint rather than an on-line
    7533             :          * one. This is not particularly critical, but since we may be
    7534             :          * assigning a new TLI, using a shutdown checkpoint allows us to have
    7535             :          * the rule that TLI only changes in shutdown checkpoints, which
    7536             :          * allows some extra error checking in xlog_redo.
    7537             :          *
    7538             :          * In fast promotion, only create a lightweight end-of-recovery record
    7539             :          * instead of a full checkpoint. A checkpoint is requested later,
    7540             :          * after we're fully out of recovery mode and already accepting
    7541             :          * queries.
    7542             :          */
    7543           0 :         if (bgwriterLaunched)
    7544             :         {
    7545           0 :             if (fast_promote)
    7546             :             {
    7547           0 :                 checkPointLoc = ControlFile->prevCheckPoint;
    7548             : 
    7549             :                 /*
    7550             :                  * Confirm the last checkpoint is available for us to recover
    7551             :                  * from if we fail. Note that we don't check for the secondary
    7552             :                  * checkpoint since that isn't available in most base backups.
    7553             :                  */
    7554           0 :                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
    7555           0 :                 if (record != NULL)
    7556             :                 {
    7557           0 :                     fast_promoted = true;
    7558             : 
    7559             :                     /*
    7560             :                      * Insert a special WAL record to mark the end of
    7561             :                      * recovery, since we aren't doing a checkpoint. That
    7562             :                      * means that the checkpointer process may likely be in
    7563             :                      * the middle of a time-smoothed restartpoint and could
    7564             :                      * continue to be for minutes after this. That sounds
    7565             :                      * strange, but the effect is roughly the same and it
    7566             :                      * would be stranger to try to come out of the
    7567             :                      * restartpoint and then checkpoint. We request a
    7568             :                      * checkpoint later anyway, just for safety.
    7569             :                      */
    7570           0 :                     CreateEndOfRecoveryRecord();
    7571             :                 }
    7572             :             }
    7573             : 
    7574           0 :             if (!fast_promoted)
    7575           0 :                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
    7576             :                                   CHECKPOINT_IMMEDIATE |
    7577             :                                   CHECKPOINT_WAIT);
    7578             :         }
    7579             :         else
    7580           0 :             CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
    7581             : 
    7582             :         /*
    7583             :          * And finally, execute the recovery_end_command, if any.
    7584             :          */
    7585           0 :         if (recoveryEndCommand)
    7586           0 :             ExecuteRecoveryCommand(recoveryEndCommand,
    7587             :                                    "recovery_end_command",
    7588             :                                    true);
    7589             :     }
    7590             : 
    7591           3 :     if (ArchiveRecoveryRequested)
    7592             :     {
    7593             :         /*
    7594             :          * We switched to a new timeline. Clean up segments on the old
    7595             :          * timeline.
    7596             :          *
    7597             :          * If there are any higher-numbered segments on the old timeline,
    7598             :          * remove them. They might contain valid WAL, but they might also be
    7599             :          * pre-allocated files containing garbage. In any case, they are not
    7600             :          * part of the new timeline's history so we don't need them.
    7601             :          */
    7602           0 :         RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
    7603             : 
    7604             :         /*
    7605             :          * If the switch happened in the middle of a segment, what to do with
    7606             :          * the last, partial segment on the old timeline? If we don't archive
    7607             :          * it, and the server that created the WAL never archives it either
    7608             :          * (e.g. because it was hit by a meteor), it will never make it to the
    7609             :          * archive. That's OK from our point of view, because the new segment
    7610             :          * that we created with the new TLI contains all the WAL from the old
    7611             :          * timeline up to the switch point. But if you later try to do PITR to
    7612             :          * the "missing" WAL on the old timeline, recovery won't find it in
    7613             :          * the archive. It's physically present in the new file with new TLI,
    7614             :          * but recovery won't look there when it's recovering to the older
    7615             :          * timeline. On the other hand, if we archive the partial segment, and
    7616             :          * the original server on that timeline is still running and archives
    7617             :          * the completed version of the same segment later, it will fail. (We
    7618             :          * used to do that in 9.4 and below, and it caused such problems).
    7619             :          *
    7620             :          * As a compromise, we rename the last segment with the .partial
    7621             :          * suffix, and archive it. Archive recovery will never try to read
    7622             :          * .partial segments, so they will normally go unused. But in the odd
    7623             :          * PITR case, the administrator can copy them manually to the pg_wal
    7624             :          * directory (removing the suffix). They can be useful in debugging,
    7625             :          * too.
    7626             :          *
    7627             :          * If a .done or .ready file already exists for the old timeline,
    7628             :          * however, we had already determined that the segment is complete, so
    7629             :          * we can let it be archived normally. (In particular, if it was
    7630             :          * restored from the archive to begin with, it's expected to have a
    7631             :          * .done file).
    7632             :          */
    7633           0 :         if (EndOfLog % XLOG_SEG_SIZE != 0 && XLogArchivingActive())
    7634             :         {
    7635             :             char        origfname[MAXFNAMELEN];
    7636             :             XLogSegNo   endLogSegNo;
    7637             : 
    7638           0 :             XLByteToPrevSeg(EndOfLog, endLogSegNo);
    7639           0 :             XLogFileName(origfname, EndOfLogTLI, endLogSegNo);
    7640             : 
    7641           0 :             if (!XLogArchiveIsReadyOrDone(origfname))
    7642             :             {
    7643             :                 char        origpath[MAXPGPATH];
    7644             :                 char        partialfname[MAXFNAMELEN];
    7645             :                 char        partialpath[MAXPGPATH];
    7646             : 
    7647           0 :                 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo);
    7648           0 :                 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
    7649           0 :                 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
    7650             : 
    7651             :                 /*
    7652             :                  * Make sure there's no .done or .ready file for the .partial
    7653             :                  * file.
    7654             :                  */
    7655           0 :                 XLogArchiveCleanup(partialfname);
    7656             : 
    7657           0 :                 durable_rename(origpath, partialpath, ERROR);
    7658           0 :                 XLogArchiveNotify(partialfname);
    7659             :             }
    7660             :         }
    7661             :     }
    7662             : 
    7663             :     /*
    7664             :      * Preallocate additional log files, if wanted.
    7665             :      */
    7666           3 :     PreallocXlogFiles(EndOfLog);
    7667             : 
    7668             :     /*
    7669             :      * Okay, we're officially UP.
    7670             :      */
    7671           3 :     InRecovery = false;
    7672             : 
    7673             :     /* start the archive_timeout timer and LSN running */
    7674           3 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    7675           3 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
    7676             : 
    7677             :     /* also initialize latestCompletedXid, to nextXid - 1 */
    7678           3 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    7679           3 :     ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
    7680           9 :     TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
    7681           3 :     LWLockRelease(ProcArrayLock);
    7682             : 
    7683             :     /*
    7684             :      * Start up the commit log and subtrans, if not already done for hot
    7685             :      * standby.  (commit timestamps are started below, if necessary.)
    7686             :      */
    7687           3 :     if (standbyState == STANDBY_DISABLED)
    7688             :     {
    7689           3 :         StartupCLOG();
    7690           3 :         StartupSUBTRANS(oldestActiveXID);
    7691             :     }
    7692             : 
    7693             :     /*
    7694             :      * Perform end of recovery actions for any SLRUs that need it.
    7695             :      */
    7696           3 :     TrimCLOG();
    7697           3 :     TrimMultiXact();
    7698             : 
    7699             :     /* Reload shared-memory state for prepared transactions */
    7700           3 :     RecoverPreparedTransactions();
    7701             : 
    7702             :     /*
    7703             :      * Shutdown the recovery environment. This must occur after
    7704             :      * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
    7705             :      */
    7706           3 :     if (standbyState != STANDBY_DISABLED)
    7707           0 :         ShutdownRecoveryTransactionEnvironment();
    7708             : 
    7709             :     /* Shut down xlogreader */
    7710           3 :     if (readFile >= 0)
    7711             :     {
    7712           3 :         close(readFile);
    7713           3 :         readFile = -1;
    7714             :     }
    7715           3 :     XLogReaderFree(xlogreader);
    7716             : 
    7717             :     /*
    7718             :      * If any of the critical GUCs have changed, log them before we allow
    7719             :      * backends to write WAL.
    7720             :      */
    7721           3 :     LocalSetXLogInsertAllowed();
    7722           3 :     XLogReportParameters();
    7723             : 
    7724             :     /*
    7725             :      * Local WAL inserts enabled, so it's time to finish initialization of
    7726             :      * commit timestamp.
    7727             :      */
    7728           3 :     CompleteCommitTsInitialization();
    7729             : 
    7730             :     /*
    7731             :      * All done with end-of-recovery actions.
    7732             :      *
    7733             :      * Now allow backends to write WAL and update the control file status in
    7734             :      * consequence.  The boolean flag allowing backends to write WAL is
    7735             :      * updated while holding ControlFileLock to prevent other backends from
    7736             :      * looking at an inconsistent state of the control file in shared memory.  There
    7737             :      * is still a small window during which backends can write WAL and the
    7738             :      * control file is still referring to a system not in DB_IN_PRODUCTION
    7739             :      * state while looking at the on-disk control file.
    7740             :      *
    7741             :      * Also, although the boolean flag to allow WAL is probably atomic in
    7742             :      * itself, we use the info_lck here to ensure that there are no race
    7743             :      * conditions concerning visibility of other recent updates to shared
    7744             :      * memory.
    7745             :      */
    7746           3 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7747           3 :     ControlFile->state = DB_IN_PRODUCTION;
    7748           3 :     ControlFile->time = (pg_time_t) time(NULL);
    7749             : 
    7750           3 :     SpinLockAcquire(&XLogCtl->info_lck);
    7751           3 :     XLogCtl->SharedRecoveryInProgress = false;
    7752           3 :     SpinLockRelease(&XLogCtl->info_lck);
    7753             : 
    7754           3 :     UpdateControlFile();
    7755           3 :     LWLockRelease(ControlFileLock);
    7756             : 
    7757             :     /*
    7758             :      * If there were cascading standby servers connected to us, nudge any wal
    7759             :      * sender processes to notice that we've been promoted.
    7760             :      */
    7761           3 :     WalSndWakeup();
    7762             : 
    7763             :     /*
    7764             :      * If this was a fast promotion, request an (online) checkpoint now. This
    7765             :      * isn't required for consistency, but the last restartpoint might be far
    7766             :      * back, and in case of a crash, recovering from it might take longer
    7767             :      * than is appropriate now that we're not in standby mode anymore.
    7768             :      */
    7769           3 :     if (fast_promoted)
    7770           0 :         RequestCheckpoint(CHECKPOINT_FORCE);
    7771           3 : }
    7772             : 
    7773             : /*
    7774             :  * Checks if recovery has reached a consistent state. When consistency is
    7775             :  * reached and we have a valid starting standby snapshot, tell postmaster
    7776             :  * that it can start accepting read-only connections.
                     :  *
                     :  * The work is done in three independent steps:
                     :  *   1. If pg_control records a backupEndPoint that replay has reached,
                     :  *      clear backupStartPoint/backupEndPoint/backupEndRequired, raise
                     :  *      ControlFile->minRecoveryPoint to the replayed position if it was
                     :  *      lower, and write the control file out.
                     :  *   2. If consistency has not been flagged yet, flag it once replay has
                     :  *      reached minRecoveryPoint and no backup end is still pending.
                     :  *   3. If the standby snapshot is ready and consistency is reached, mark
                     :  *      hot standby active in shared memory and signal the postmaster.
                     :  *
                     :  * Takes no arguments and returns nothing.  Returns immediately, without
                     :  * doing any of the above, when minRecoveryPoint is invalid.
    7777             :  */
    7778             : static void
    7779           0 : CheckRecoveryConsistency(void)
    7780             : {
    7781             :     XLogRecPtr  lastReplayedEndRecPtr;
    7782             : 
    7783             :     /*
    7784             :      * During crash recovery, we don't reach a consistent state until we've
    7785             :      * replayed all the WAL.
                     :      *
                     :      * An invalid minRecoveryPoint is how that case is recognized here.
    7786             :      */
    7787           0 :     if (XLogRecPtrIsInvalid(minRecoveryPoint))
    7788           0 :         return;
    7789             : 
    7790             :     /*
    7791             :      * Assume that we are called in the startup process, and hence don't need
    7792             :      * a lock to read lastReplayedEndRecPtr.
    7793             :      */
    7794           0 :     lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
    7795             : 
    7796             :     /*
    7797             :      * Have we reached the point where our base backup was completed?
    7798             :      */
    7799           0 :     if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
    7800           0 :         ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
    7801             :     {
    7802             :         /*
    7803             :          * We have reached the end of base backup, as indicated by pg_control.
    7804             :          * The data on disk is now consistent. Reset backupStartPoint and
    7805             :          * backupEndPoint, and update minRecoveryPoint to make sure we don't
    7806             :          * allow starting up at an earlier point even if recovery is stopped
    7807             :          * and restarted soon after this.
                     :          *
                     :          * All control file fields are changed, and the file is written,
                     :          * while holding ControlFileLock exclusively.
    7808             :          */
    7809           0 :         elog(DEBUG1, "end of backup reached");
    7810             : 
    7811           0 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7812             : 
    7813           0 :         if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
    7814           0 :             ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
    7815             : 
    7816           0 :         ControlFile->backupStartPoint = InvalidXLogRecPtr;
    7817           0 :         ControlFile->backupEndPoint = InvalidXLogRecPtr;
    7818           0 :         ControlFile->backupEndRequired = false;
    7819           0 :         UpdateControlFile();
    7820             : 
    7821           0 :         LWLockRelease(ControlFileLock);
    7822             :     }
    7823             : 
    7824             :     /*
    7825             :      * Have we passed our safe starting point? Note that minRecoveryPoint is
    7826             :      * known to be incorrectly set if ControlFile->backupEndRequired, until
    7827             :      * the XLOG_BACKUP_RECORD arrives to advise us of the correct
    7828             :      * minRecoveryPoint. All we know prior to that is that we're not
    7829             :      * consistent yet.
                     :      *
                     :      * The !reachedConsistency test keeps the invalid-page check and the log
                     :      * message below from being repeated once the flag has been set.
    7830             :      */
    7831           0 :     if (!reachedConsistency && !ControlFile->backupEndRequired &&
    7832           0 :         minRecoveryPoint <= lastReplayedEndRecPtr &&
    7833           0 :         XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
    7834             :     {
    7835             :         /*
    7836             :          * Check to see if the XLOG sequence contained any unresolved
    7837             :          * references to uninitialized pages.
    7838             :          */
    7839           0 :         XLogCheckInvalidPages();
    7840             : 
    7841           0 :         reachedConsistency = true;
    7842           0 :         ereport(LOG,
    7843             :                 (errmsg("consistent recovery state reached at %X/%X",
    7844             :                         (uint32) (lastReplayedEndRecPtr >> 32),
    7845             :                         (uint32) lastReplayedEndRecPtr)));
    7846             :     }
    7847             : 
    7848             :     /*
    7849             :      * Have we got a valid starting snapshot that will allow queries to be
    7850             :      * run? If so, we can tell postmaster that the database is consistent now,
    7851             :      * enabling connections.
                     :      *
                     :      * LocalHotStandbyActive is set below, so this function does not signal
                     :      * the postmaster again while that flag stays set.  The shared flag is
                     :      * updated under info_lck before the signal is sent.
    7852             :      */
    7853           0 :     if (standbyState == STANDBY_SNAPSHOT_READY &&
    7854           0 :         !LocalHotStandbyActive &&
    7855           0 :         reachedConsistency &&
    7856             :         IsUnderPostmaster)
    7857             :     {
    7858           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    7859           0 :         XLogCtl->SharedHotStandbyActive = true;
    7860           0 :         SpinLockRelease(&XLogCtl->info_lck);
    7861             : 
    7862           0 :         LocalHotStandbyActive = true;
    7863             : 
    7864           0 :         SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
    7865             :     }
    7866             : }
    7867             : 
    7868             : /*
    7869             :  * Is the system still in recovery?
    7870             :  *
    7871             :  * Unlike testing InRecovery, this works in any process that's connected to
    7872             :  * shared memory.
    7873             :  *
    7874             :  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
    7875             :  * variables the first time we see that recovery is finished.
                     :  *
                     :  * Returns true while the shared SharedRecoveryInProgress flag is still set.
                     :  * Once it has been seen false, the answer is cached in
                     :  * LocalRecoveryInProgress and every later call returns false without
                     :  * reading shared memory again.
    7876             :  */
    7877             : bool
    7878     2390619 : RecoveryInProgress(void)
    7879             : {
    7880             :     /*
    7881             :      * We check shared state each time only until we leave recovery mode. We
    7882             :      * can't re-enter recovery, so there's no need to keep checking after the
    7883             :      * shared variable has once been seen false.
    7884             :      */
    7885     2390619 :     if (!LocalRecoveryInProgress)
    7886     2390278 :         return false;
    7887             :     else
    7888             :     {
    7889             :         /*
    7890             :          * Use a volatile pointer to make sure we make a fresh read of the
    7891             :          * shared variable.  No lock is taken for this read.
    7892             :          */
    7893         341 :         volatile XLogCtlData *xlogctl = XLogCtl;
    7894             : 
    7895         341 :         LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
    7896             : 
    7897             :         /*
    7898             :          * Initialize TimeLineID and RedoRecPtr when we discover that recovery
    7899             :          * is finished. InitPostgres() relies upon this behaviour to ensure
    7900             :          * that InitXLOGAccess() is called at backend startup.  (If you change
    7901             :          * this, see also LocalSetXLogInsertAllowed.)
    7902             :          */
    7903         341 :         if (!LocalRecoveryInProgress)
    7904             :         {
    7905             :             /*
    7906             :              * If we just exited recovery, make sure we read TimeLineID and
    7907             :              * RedoRecPtr after SharedRecoveryInProgress (for machines with
    7908             :              * weak memory ordering).
    7909             :              */
    7910         341 :             pg_memory_barrier();
    7911         341 :             InitXLOGAccess();
    7912             :         }
    7913             : 
    7914             :         /*
    7915             :          * Note: We don't need a memory barrier when we're still in recovery.
    7916             :          * We might exit recovery immediately after return, so the caller
    7917             :          * can't rely on 'true' meaning that we're still in recovery anyway.
    7918             :          */
    7919             : 
    7920         341 :         return LocalRecoveryInProgress;
    7921             :     }
    7922             : }
    7923             : 
    7924             : /*
    7925             :  * Is HotStandby active yet? This is only important in special backends
    7926             :  * since normal backends won't ever be able to connect until this returns
    7927             :  * true. Postmaster knows this by way of signal, not via shared memory.
    7928             :  *
    7929             :  * Unlike testing standbyState, this works in any process that's connected to
    7930             :  * shared memory.  (And note that standbyState alone doesn't tell the truth
    7931             :  * anyway.)
                     :  *
                     :  * Returns true once XLogCtl->SharedHotStandbyActive has been observed set.
                     :  * The observed value is cached in LocalHotStandbyActive, so after the first
                     :  * true result later calls return without taking the spinlock.
    7932             :  */
    7933             : bool
    7934           0 : HotStandbyActive(void)
    7935             : {
    7936             :     /*
    7937             :      * We check shared state each time only until Hot Standby is active. We
    7938             :      * can't de-activate Hot Standby, so there's no need to keep checking
    7939             :      * after the shared variable has once been seen true.
    7940             :      */
    7941           0 :     if (LocalHotStandbyActive)
    7942           0 :         return true;
    7943             :     else
    7944             :     {
    7945             :         /* spinlock is essential on machines with weak memory ordering! */
    7946           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    7947           0 :         LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
    7948           0 :         SpinLockRelease(&XLogCtl->info_lck);
    7949             : 
    7950           0 :         return LocalHotStandbyActive;
    7951             :     }
    7952             : }
    7953             : 
    7954             : /*
    7955             :  * Like HotStandbyActive(), but to be used only in WAL replay code,
    7956             :  * where we don't need to ask any other process what the state is.
                     :  *
                     :  * Returns the process-local LocalHotStandbyActive flag as-is; shared memory
                     :  * is not consulted and no lock is taken.  An assertion checks that the
                     :  * caller is the startup process, or that IsPostmasterEnvironment is false.
    7957             :  */
    7958             : bool
    7959           0 : HotStandbyActiveInReplay(void)
    7960             : {
    7961           0 :     Assert(AmStartupProcess() || !IsPostmasterEnvironment);
    7962           0 :     return LocalHotStandbyActive;
    7963             : }
    7964             : 
    7965             : /*
    7966             :  * Is this process allowed to insert new WAL records?
    7967             :  *
    7968             :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
    7969             :  * But we also have provisions for forcing the result "true" or "false"
    7970             :  * within specific processes regardless of the global state.
                     :  *
                     :  * The forcing is done through LocalXLogInsertAllowed: a non-negative value
                     :  * is returned as-is (converted to bool), while a negative value means that
                     :  * RecoveryInProgress() must be asked.  If recovery is still in progress the
                     :  * variable is left untouched, so the next call checks again.
    7971             :  */
    7972             : bool
    7973     2638911 : XLogInsertAllowed(void)
    7974             : {
    7975             :     /*
    7976             :      * If value is "unconditionally true" or "unconditionally false", just
    7977             :      * return it.  This provides the normal fast path once recovery is known
    7978             :      * done.
    7979             :      */
    7980     2638911 :     if (LocalXLogInsertAllowed >= 0)
    7981     2638702 :         return (bool) LocalXLogInsertAllowed;
    7982             : 
    7983             :     /*
    7984             :      * Else, must check to see if we're still in recovery.
    7985             :      */
    7986         209 :     if (RecoveryInProgress())
    7987           0 :         return false;
    7988             : 
    7989             :     /*
    7990             :      * On exit from recovery, reset to "unconditionally true", since there is
    7991             :      * no need to keep checking.
    7992             :      */
    7993         209 :     LocalXLogInsertAllowed = 1;
    7994         209 :     return true;
    7995             : }
    7996             : 
    7997             : /*
    7998             :  * Make XLogInsertAllowed() return true in the current process only.
    7999             :  *
    8000             :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
    8001             :  * and even call LocalSetXLogInsertAllowed() again after that.
                     :  *
                     :  * On entry we assert that LocalXLogInsertAllowed is still -1, i.e. that no
                     :  * override is currently in effect.  The flag is then set to 1 and
                     :  * InitXLOGAccess() is called.
    8002             :  */
    8003             : static void
    8004           6 : LocalSetXLogInsertAllowed(void)
    8005             : {
    8006           6 :     Assert(LocalXLogInsertAllowed == -1);
    8007           6 :     LocalXLogInsertAllowed = 1;
    8008             : 
    8009             :     /* Initialize as RecoveryInProgress() would do when switching state */
    8010           6 :     InitXLOGAccess();
    8011           6 : }
    8012             : 
    8013             : /*
    8014             :  * Subroutine to try to fetch and validate a prior checkpoint record.
    8015             :  *
    8016             :  * whichChkpt identifies the checkpoint (merely for reporting purposes).
    8017             :  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
                     :  *
                     :  * The record at RecPtr is fetched with ReadRecord() and then checked for
                     :  * the XLOG resource manager ID, a checkpoint xl_info value (shutdown or
                     :  * online) and the exact total length of a checkpoint record.  Returns the
                     :  * record if every check passes, NULL otherwise.
                     :  *
                     :  * If "report" is false, an invalid RecPtr or a failed read returns NULL
                     :  * without logging.  NOTE(review): the three later checks (resource
                     :  * manager ID, xl_info, length) log their message without consulting
                     :  * "report" -- confirm whether that is intended.
    8018             :  */
    8019             : static XLogRecord *
    8020           3 : ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
    8021             :                      int whichChkpt, bool report)
    8022             : {
    8023             :     XLogRecord *record;
    8024             :     uint8       info;
    8025             : 
    8026           3 :     if (!XRecOffIsValid(RecPtr))
    8027             :     {
    8028           0 :         if (!report)
    8029           0 :             return NULL;
    8030             : 
    8031           0 :         switch (whichChkpt)
    8032             :         {
    8033             :             case 1:
    8034           0 :                 ereport(LOG,
    8035             :                         (errmsg("invalid primary checkpoint link in control file")));
    8036           0 :                 break;
    8037             :             case 2:
    8038           0 :                 ereport(LOG,
    8039             :                         (errmsg("invalid secondary checkpoint link in control file")));
    8040           0 :                 break;
    8041             :             default:
    8042           0 :                 ereport(LOG,
    8043             :                         (errmsg("invalid checkpoint link in backup_label file")));
    8044           0 :                 break;
    8045             :         }
    8046           0 :         return NULL;
    8047             :     }
    8048             : 
    8049           3 :     record = ReadRecord(xlogreader, RecPtr, LOG, true);
    8050             : 
    8051           3 :     if (record == NULL)
    8052             :     {
    8053           0 :         if (!report)
    8054           0 :             return NULL;
    8055             : 
    8056           0 :         switch (whichChkpt)
    8057             :         {
    8058             :             case 1:
    8059           0 :                 ereport(LOG,
    8060             :                         (errmsg("invalid primary checkpoint record")));
    8061           0 :                 break;
    8062             :             case 2:
    8063           0 :                 ereport(LOG,
    8064             :                         (errmsg("invalid secondary checkpoint record")));
    8065           0 :                 break;
    8066             :             default:
    8067           0 :                 ereport(LOG,
    8068             :                         (errmsg("invalid checkpoint record")));
    8069           0 :                 break;
    8070             :         }
    8071           0 :         return NULL;
    8072             :     }
    8073           3 :     if (record->xl_rmid != RM_XLOG_ID)
    8074             :     {
    8075           0 :         switch (whichChkpt)
    8076             :         {
    8077             :             case 1:
    8078           0 :                 ereport(LOG,
    8079             :                         (errmsg("invalid resource manager ID in primary checkpoint record")));
    8080           0 :                 break;
    8081             :             case 2:
    8082           0 :                 ereport(LOG,
    8083             :                         (errmsg("invalid resource manager ID in secondary checkpoint record")));
    8084           0 :                 break;
    8085             :             default:
    8086           0 :                 ereport(LOG,
    8087             :                         (errmsg("invalid resource manager ID in checkpoint record")));
    8088           0 :                 break;
    8089             :         }
    8090           0 :         return NULL;
    8091             :     }
    8092           3 :     info = record->xl_info & ~XLR_INFO_MASK;
    8093           3 :     if (info != XLOG_CHECKPOINT_SHUTDOWN &&
    8094             :         info != XLOG_CHECKPOINT_ONLINE)
    8095             :     {
    8096           0 :         switch (whichChkpt)
    8097             :         {
    8098             :             case 1:
    8099           0 :                 ereport(LOG,
    8100             :                         (errmsg("invalid xl_info in primary checkpoint record")));
    8101           0 :                 break;
    8102             :             case 2:
    8103           0 :                 ereport(LOG,
    8104             :                         (errmsg("invalid xl_info in secondary checkpoint record")));
    8105           0 :                 break;
    8106             :             default:
    8107           0 :                 ereport(LOG,
    8108             :                         (errmsg("invalid xl_info in checkpoint record")));
    8109           0 :                 break;
    8110             :         }
    8111           0 :         return NULL;
    8112             :     }
    8113           3 :     if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
    8114             :     {
    8115           0 :         switch (whichChkpt)
    8116             :         {
    8117             :             case 1:
    8118           0 :                 ereport(LOG,
    8119             :                         (errmsg("invalid length of primary checkpoint record")));
    8120           0 :                 break;
    8121             :             case 2:
    8122           0 :                 ereport(LOG,
    8123             :                         (errmsg("invalid length of secondary checkpoint record")));
    8124           0 :                 break;
    8125             :             default:
    8126           0 :                 ereport(LOG,
    8127             :                         (errmsg("invalid length of checkpoint record")));
    8128           0 :                 break;
    8129             :         }
    8130           0 :         return NULL;
    8131             :     }
    8132           3 :     return record;
    8133             : }
    8134             : 
    8135             : /*
    8136             :  * This must be called in a backend process before creating WAL records
    8137             :  * (except in a standalone backend, which does StartupXLOG instead).  We need
    8138             :  * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
    8139             :  *
    8140             :  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
    8141             :  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
    8142             :  * unnecessary however, since the postmaster itself never touches XLOG anyway.
                     :  *
                     :  * Besides those two copies, the body also recomputes the local
                     :  * doPageWrites flag from the shared Insert state and calls
                     :  * InitXLogInsert().
    8143             :  */
    8144             : void
    8145         348 : InitXLOGAccess(void)
    8146             : {
    8147         348 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8148             : 
    8149             :     /* ThisTimeLineID doesn't change so we need no lock to copy it */
    8150         348 :     ThisTimeLineID = XLogCtl->ThisTimeLineID;
    8151         348 :     Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
    8152             : 
    8153             :     /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
    8154         348 :     (void) GetRedoRecPtr();
    8155             :     /* Also update our copy of doPageWrites. */
    8156         348 :     doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
    8157             : 
    8158             :     /* Also initialize the working areas for constructing WAL records */
    8159         348 :     InitXLogInsert();
    8160         348 : }
    8161             : 
    8162             : /*
    8163             :  * Return the current Redo pointer from shared memory.
    8164             :  *
    8165             :  * As a side-effect, the local RedoRecPtr copy is updated.
                     :  *
                     :  * The local copy only ever moves forward: it is overwritten only when the
                     :  * shared value is greater, and the value returned is the local copy after
                     :  * that comparison.
    8166             :  */
    8167             : XLogRecPtr
    8168         758 : GetRedoRecPtr(void)
    8169             : {
    8170             :     XLogRecPtr  ptr;
    8171             : 
    8172             :     /*
    8173             :      * The possibly not up-to-date copy in XLogCtl is enough. Even if we
    8174             :      * grabbed a WAL insertion lock to read the master copy, someone might
    8175             :      * update it just after we've released the lock.
    8176             :      */
    8177         758 :     SpinLockAcquire(&XLogCtl->info_lck);
    8178         758 :     ptr = XLogCtl->RedoRecPtr;
    8179         758 :     SpinLockRelease(&XLogCtl->info_lck);
    8180             : 
    8181         758 :     if (RedoRecPtr < ptr)
    8182         339 :         RedoRecPtr = ptr;
    8183             : 
    8184         758 :     return RedoRecPtr;
    8185             : }
    8186             : 
    8187             : /*
    8188             :  * Return information needed to decide whether a modified block needs a
    8189             :  * full-page image to be included in the WAL record.
    8190             :  *
    8191             :  * The returned values are cached copies from backend-private memory, and
    8192             :  * possibly out-of-date.  XLogInsertRecord will re-check them against
    8193             :  * up-to-date values, while holding the WAL insert lock.
                     :  *
                     :  * Both output pointers are written unconditionally, so neither may be
                     :  * NULL.  No lock is taken here.
    8194             :  */
    8195             : void
    8196     1312995 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
    8197             : {
    8198     1312995 :     *RedoRecPtr_p = RedoRecPtr;
    8199     1312995 :     *doPageWrites_p = doPageWrites;
    8200     1312995 : }
    8201             : 
    8202             : /*
    8203             :  * GetInsertRecPtr -- Returns the current insert position.
    8204             :  *
    8205             :  * NOTE: The value *actually* returned is the position of the last full
    8206             :  * xlog page. It lags behind the real insert position by at most 1 page.
    8207             :  * For that, we don't need to scan through WAL insertion locks, and an
    8208             :  * approximation is enough for the current usage of this function.
                     :  *
                     :  * The value is XLogCtl->LogwrtRqst.Write, read while holding info_lck.
    8209             :  */
    8210             : XLogRecPtr
    8211           4 : GetInsertRecPtr(void)
    8212             : {
    8213             :     XLogRecPtr  recptr;
    8214             : 
    8215           4 :     SpinLockAcquire(&XLogCtl->info_lck);
    8216           4 :     recptr = XLogCtl->LogwrtRqst.Write;
    8217           4 :     SpinLockRelease(&XLogCtl->info_lck);
    8218             : 
    8219           4 :     return recptr;
    8220             : }
    8221             : 
    8222             : /*
    8223             :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
    8224             :  * position known to be fsync'd to disk.
                     :  *
                     :  * As a side effect, the whole LogwrtResult variable (not declared in this
                     :  * function; presumably this process's cached copy -- confirm at its
                     :  * definition) is refreshed from XLogCtl->LogwrtResult under info_lck.
    8225             :  */
    8226             : XLogRecPtr
    8227          13 : GetFlushRecPtr(void)
    8228             : {
    8229          13 :     SpinLockAcquire(&XLogCtl->info_lck);
    8230          13 :     LogwrtResult = XLogCtl->LogwrtResult;
    8231          13 :     SpinLockRelease(&XLogCtl->info_lck);
    8232             : 
    8233          13 :     return LogwrtResult.Flush;
    8234             : }
    8235             : 
    8236             : /*
    8237             :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
    8238             :  * inserted. All records not explicitly marked as unimportant are considered
    8239             :  * important.
    8240             :  *
    8241             :  * The LSN is determined by computing the maximum of
    8242             :  * WALInsertLocks[i].lastImportantAt.
                     :  *
                     :  * The result is InvalidXLogRecPtr if no slot holds a larger value.  Each
                     :  * lock is acquired and released in turn, so the slots are not read as one
                     :  * atomic snapshot.
    8243             :  */
    8244             : XLogRecPtr
    8245          17 : GetLastImportantRecPtr(void)
    8246             : {
    8247          17 :     XLogRecPtr  res = InvalidXLogRecPtr;
    8248             :     int         i;
    8249             : 
    8250         153 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    8251             :     {
    8252             :         XLogRecPtr  last_important;
    8253             : 
    8254             :         /*
    8255             :          * Need to take a lock to prevent torn reads of the LSN, which are
    8256             :          * possible on some of the supported platforms. WAL insert locks only
    8257             :          * support exclusive mode, so we have to use that.
    8258             :          */
    8259         136 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    8260         136 :         last_important = WALInsertLocks[i].l.lastImportantAt;
    8261         136 :         LWLockRelease(&WALInsertLocks[i].l.lock);
    8262             : 
    8263         136 :         if (res < last_important)
    8264          33 :             res = last_important;
    8265             :     }
    8266             : 
    8267          17 :     return res;
    8268             : }
    8269             : 
    8270             : /*
    8271             :  * Get the time and LSN of the last xlog segment switch
                     :  *
                     :  * Returns XLogCtl->lastSegSwitchTime and stores XLogCtl->lastSegSwitchLSN
                     :  * into *lastSwitchLSN; both are read under the same shared hold of
                     :  * WALWriteLock.  lastSwitchLSN is written unconditionally, so it must not
                     :  * be NULL.
    8272             :  */
    8273             : pg_time_t
    8274           0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
    8275             : {
    8276             :     pg_time_t   result;
    8277             : 
    8278             :     /* Need WALWriteLock, but shared lock is sufficient */
    8279           0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
    8280           0 :     result = XLogCtl->lastSegSwitchTime;
    8281           0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
    8282           0 :     LWLockRelease(WALWriteLock);
    8283             : 
    8284           0 :     return result;
    8285             : }
    8286             : 
    8287             : /*
    8288             :  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
    8289             :  *
    8290             :  * This is exported for use by code that would like to have 64-bit XIDs.
    8291             :  * We don't really support such things, but all XIDs within the system
    8292             :  * can be presumed "close to" the result, and thus the epoch associated
    8293             :  * with them can be determined.
                     :  *
                     :  * *xid receives the value of ReadNewTransactionId().  *epoch receives the
                     :  * checkpoint's epoch (XLogCtl->ckptXidEpoch), plus one if nextXid is
                     :  * numerically less than the checkpoint's XID.  Both outputs are written
                     :  * unconditionally, so neither pointer may be NULL.
    8294             :  */
    8295             : void
    8296         298 : GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
    8297             : {
    8298             :     uint32      ckptXidEpoch;
    8299             :     TransactionId ckptXid;
    8300             :     TransactionId nextXid;
    8301             : 
    8302             :     /* Must read checkpoint info first, else have race condition */
    8303         298 :     SpinLockAcquire(&XLogCtl->info_lck);
    8304         298 :     ckptXidEpoch = XLogCtl->ckptXidEpoch;
    8305         298 :     ckptXid = XLogCtl->ckptXid;
    8306         298 :     SpinLockRelease(&XLogCtl->info_lck);
    8307             : 
    8308             :     /* Now fetch current nextXid */
    8309         298 :     nextXid = ReadNewTransactionId();
    8310             : 
    8311             :     /*
    8312             :      * nextXid is certainly logically later than ckptXid.  So if it's
    8313             :      * numerically less, it must have wrapped into the next epoch.
    8314             :      */
    8315         298 :     if (nextXid < ckptXid)
    8316           0 :         ckptXidEpoch++;
    8317             : 
    8318         298 :     *xid = nextXid;
    8319         298 :     *epoch = ckptXidEpoch;
    8320         298 : }
    8321             : 
    8322             : /*
    8323             :  * This must be called ONCE during postmaster or standalone-backend shutdown
                     :  *
                     :  * Order of operations, as coded below: announce the shutdown, tell the
                     :  * WAL senders to enter stopping state and wait until they have, then
                     :  * write a shutdown restartpoint (if recovery is still in progress) or a
                     :  * shutdown checkpoint (otherwise), and finally call the Shutdown routines
                     :  * for CLOG, commit timestamps, SUBTRANS and MultiXact.
                     :  *
                     :  * Neither "code" nor "arg" is used in the body.  NOTE(review): the
                     :  * signature presumably matches an exit-callback type -- confirm at the
                     :  * place where this function is registered.
    8324             :  */
    8325             : void
    8326           3 : ShutdownXLOG(int code, Datum arg)
    8327             : {
    8328             :     /* Don't be chatty in standalone mode */
    8329           3 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    8330             :             (errmsg("shutting down")));
    8331             : 
    8332             :     /*
    8333             :      * Signal walsenders to move to stopping state.
    8334             :      */
    8335           3 :     WalSndInitStopping();
    8336             : 
    8337             :     /*
    8338             :      * Wait for WAL senders to be in stopping state.  This prevents commands
    8339             :      * from writing new WAL.
    8340             :      */
    8341           3 :     WalSndWaitStopping();
    8342             : 
    8343           3 :     if (RecoveryInProgress())
    8344           0 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    8345             :     else
    8346             :     {
    8347             :         /*
    8348             :          * If archiving is enabled, rotate the last XLOG file so that all the
    8349             :          * remaining records are archived (postmaster wakes up the archiver
    8350             :          * process one more time at the end of shutdown). The checkpoint
    8351             :          * record will go to the next XLOG file and won't be archived (yet).
    8352             :          */
    8353           3 :         if (XLogArchivingActive() && XLogArchiveCommandSet())
    8354           0 :             RequestXLogSwitch(false);
    8355             : 
    8356           3 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    8357             :     }
    8358           3 :     ShutdownCLOG();
    8359           3 :     ShutdownCommitTs();
    8360           3 :     ShutdownSUBTRANS();
    8361           3 :     ShutdownMultiXact();
    8362           3 : }
    8363             : 
    8364             : /*
    8365             :  * Log start of a checkpoint.
                     :  *
                     :  * Emits a single LOG line: "restartpoint" or "checkpoint" (chosen by the
                     :  * restartpoint argument), followed by one word for each CHECKPOINT_* bit
                     :  * tested below that is set in flags.
    8366             :  */
    8367             : static void
    8368           5 : LogCheckpointStart(int flags, bool restartpoint)
    8369             : {
    8370           5 :     elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
    8371             :          restartpoint ? "restartpoint" : "checkpoint",
    8372             :          (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    8373             :          (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    8374             :          (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    8375             :          (flags & CHECKPOINT_FORCE) ? " force" : "",
    8376             :          (flags & CHECKPOINT_WAIT) ? " wait" : "",
    8377             :          (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
    8378             :          (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    8379             :          (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
    8380           5 : }
    8381             : 
    8382             : /*
    8383             :  * Log end of a checkpoint.
                     :  *
                     :  * Stamps CheckpointStats.ckpt_end_t, then adds the write-phase and
                     :  * sync-phase durations (in milliseconds) to BgWriterStats.  That
                     :  * accounting happens on every call; the LOG message itself is built and
                     :  * emitted only when log_checkpoints is set.  "restartpoint" only selects
                     :  * the word that starts the message.
    8384             :  */
    8385             : static void
    8386          11 : LogCheckpointEnd(bool restartpoint)
    8387             : {
    8388             :     long        write_secs,
    8389             :                 sync_secs,
    8390             :                 total_secs,
    8391             :                 longest_secs,
    8392             :                 average_secs;
    8393             :     int         write_usecs,
    8394             :                 sync_usecs,
    8395             :                 total_usecs,
    8396             :                 longest_usecs,
    8397             :                 average_usecs;
    8398             :     uint64      average_sync_time;
    8399             : 
    8400          11 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    8401             : 
    8402          11 :     TimestampDifference(CheckpointStats.ckpt_write_t,
    8403             :                         CheckpointStats.ckpt_sync_t,
    8404             :                         &write_secs, &write_usecs);
    8405             : 
    8406          11 :     TimestampDifference(CheckpointStats.ckpt_sync_t,
    8407             :                         CheckpointStats.ckpt_sync_end_t,
    8408             :                         &sync_secs, &sync_usecs);
    8409             : 
    8410             :     /* Accumulate checkpoint timing summary data, in milliseconds. */
    8411          22 :     BgWriterStats.m_checkpoint_write_time +=
    8412          11 :         write_secs * 1000 + write_usecs / 1000;
    8413          22 :     BgWriterStats.m_checkpoint_sync_time +=
    8414          11 :         sync_secs * 1000 + sync_usecs / 1000;
    8415             : 
    8416             :     /*
    8417             :      * All of the published timing statistics are accounted for.  Only
    8418             :      * continue if a log message is to be written.
    8419             :      */
    8420          11 :     if (!log_checkpoints)
    8421          17 :         return;
    8422             : 
    8423           5 :     TimestampDifference(CheckpointStats.ckpt_start_t,
    8424             :                         CheckpointStats.ckpt_end_t,
    8425             :                         &total_secs, &total_usecs);
    8426             : 
    8427             :     /*
    8428             :      * Timing values returned from CheckpointStats are in microseconds.
    8429             :      * Convert to the second plus microsecond form that TimestampDifference
    8430             :      * returns for homogeneous printing.
    8431             :      */
    8432           5 :     longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
    8433          10 :     longest_usecs = CheckpointStats.ckpt_longest_sync -
    8434           5 :         (uint64) longest_secs * 1000000;
    8435             : 
    8436           5 :     average_sync_time = 0;
    8437           5 :     if (CheckpointStats.ckpt_sync_rels > 0)
    8438           0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
    8439           0 :             CheckpointStats.ckpt_sync_rels;
    8440           5 :     average_secs = (long) (average_sync_time / 1000000);
    8441           5 :     average_usecs = average_sync_time - (uint64) average_secs * 1000000;
    8442             : 
    8443           5 :     elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
    8444             :          "%d WAL file(s) added, %d removed, %d recycled; "
    8445             :          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
    8446             :          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
    8447             :          "distance=%d kB, estimate=%d kB",
    8448             :          restartpoint ? "restartpoint" : "checkpoint",
    8449             :          CheckpointStats.ckpt_bufs_written,
    8450             :          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    8451             :          CheckpointStats.ckpt_segs_added,
    8452             :          CheckpointStats.ckpt_segs_removed,
    8453             :          CheckpointStats.ckpt_segs_recycled,
    8454             :          write_secs, write_usecs / 1000,
    8455             :          sync_secs, sync_usecs / 1000,
    8456             :          total_secs, total_usecs / 1000,
    8457             :          CheckpointStats.ckpt_sync_rels,
    8458             :          longest_secs, longest_usecs / 1000,
    8459             :          average_secs, average_usecs / 1000,
    8460             :          (int) (PrevCheckPointDistance / 1024.0),
    8461             :          (int) (CheckPointDistanceEstimate / 1024.0));
    8462             : }
    8463             : 
    8464             : /*
    8465             :  * Update the estimate of distance between checkpoints.
    8466             :  *
    8467             :  * The estimate is used to calculate the number of WAL segments to keep
    8468             :  * preallocated, see XLOGFileSlop().
                     :  *
                     :  * nbytes is recorded as PrevCheckPointDistance.  The estimate jumps to
                     :  * nbytes when nbytes is larger; otherwise it becomes
                     :  * 0.90 * (old estimate) + 0.10 * nbytes.
    8469             :  */
    8470             : static void
    8471          11 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
    8472             : {
    8473             :     /*
    8474             :      * To estimate the number of segments consumed between checkpoints, keep a
    8475             :      * moving average of the amount of WAL generated in previous checkpoint
    8476             :      * cycles. However, if the load is bursty, with quiet periods and busy
    8477             :      * periods, we want to cater for the peak load. So instead of a plain
    8478             :      * moving average, let the average decline slowly if the previous cycle
    8479             :      * used less WAL than estimated, but bump it up immediately if it used
    8480             :      * more.
    8481             :      *
    8482             :      * When checkpoints are triggered by max_wal_size, this should converge to
    8483             :      * CheckpointSegments * XLOG_SEG_SIZE.
    8484             :      *
    8485             :      * Note: This doesn't pay any attention to what caused the checkpoint.
    8486             :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
    8487             :      * starting a base backup, are counted the same as those created
    8488             :      * automatically. The slow-decline will largely mask them out, if they are
    8489             :      * not frequent. If they are frequent, it seems reasonable to count them
    8490             :      * in as any others; if you issue a manual checkpoint every 5 minutes and
    8491             :      * never let a timed checkpoint happen, it makes sense to base the
    8492             :      * preallocation on that 5 minute interval rather than whatever
    8493             :      * checkpoint_timeout is set to.
    8494             :      */
    8495          11 :     PrevCheckPointDistance = nbytes;
    8496          11 :     if (CheckPointDistanceEstimate < nbytes)
    8497           5 :         CheckPointDistanceEstimate = nbytes;
    8498             :     else
    8499          12 :         CheckPointDistanceEstimate =
    8500           6 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
    8501          11 : }
    8502             : 
    8503             : /*
    8504             :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    8505             :  *
    8506             :  * flags is a bitwise OR of the following:
    8507             :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    8508             :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
    8509             :  *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
    8510             :  *      ignoring checkpoint_completion_target parameter.
    8511             :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    8512             :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
    8513             :  *      CHECKPOINT_END_OF_RECOVERY).
    8514             :  *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
    8515             :  *
    8516             :  * Note: flags contains other bits, of interest here only for logging purposes.
    8517             :  * In particular note that this routine is synchronous and does not pay
    8518             :  * attention to CHECKPOINT_WAIT.
    8519             :  *
    8520             :  * If !shutdown then we are writing an online checkpoint. This is a very special
    8521             :  * kind of operation and WAL record because the checkpoint action occurs over
    8522             :  * a period of time yet logically occurs at just a single LSN. The logical
    8523             :  * position of the WAL record (redo ptr) is the same or earlier than the
    8524             :  * physical position. When we replay WAL we locate the checkpoint via its
    8525             :  * physical position then read the redo ptr and actually start replay at the
    8526             :  * earlier logical position. Note that we don't write *anything* to WAL at
    8527             :  * the logical position, so that location could be any other kind of WAL record.
    8528             :  * All of this mechanism allows us to continue working while we checkpoint.
    8529             :  * As a result, timing of actions is critical here; be careful to note that
    8530             :  * this function will likely take minutes to execute on a busy system.
    8531             :  */
    8532             : void
    8533          11 : CreateCheckPoint(int flags)
    8534             : {
    8535             :     bool        shutdown;
    8536             :     CheckPoint  checkPoint;
    8537             :     XLogRecPtr  recptr;
    8538          11 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8539             :     uint32      freespace;
    8540             :     XLogRecPtr  PriorRedoPtr;
    8541             :     XLogRecPtr  curInsert;
    8542             :     XLogRecPtr  last_important_lsn;
    8543             :     VirtualTransactionId *vxids;
    8544             :     int         nvxids;
    8545             : 
    8546             :     /*
    8547             :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
    8548             :      * issued at a different time.
    8549             :      */
    8550          11 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
    8551           3 :         shutdown = true;
    8552             :     else
    8553           8 :         shutdown = false;
    8554             : 
    8555             :     /* sanity check */
    8556          11 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
    8557           0 :         elog(ERROR, "can't create a checkpoint during recovery");
    8558             : 
    8559             :     /*
    8560             :      * Initialize InitXLogInsert working areas before entering the critical
    8561             :      * section.  Normally, this is done by the first call to
    8562             :      * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
    8563             :      * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
    8564             :      * done below in a critical section, and InitXLogInsert cannot be called
    8565             :      * in a critical section.
    8566             :      */
    8567          11 :     InitXLogInsert();
    8568             : 
    8569             :     /*
    8570             :      * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
    8571             :      * (This is just pro forma, since in the present system structure there is
    8572             :      * only one process that is allowed to issue checkpoints at any given
    8573             :      * time.)
    8574             :      */
    8575          11 :     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
    8576             : 
    8577             :     /*
    8578             :      * Prepare to accumulate statistics.
    8579             :      *
    8580             :      * Note: because it is possible for log_checkpoints to change while a
    8581             :      * checkpoint proceeds, we always accumulate stats, even if
    8582             :      * log_checkpoints is currently off.
    8583             :      */
    8584          11 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    8585          11 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    8586             : 
    8587             :     /*
    8588             :      * Use a critical section to force system panic if we have trouble.
    8589             :      */
    8590          11 :     START_CRIT_SECTION();
    8591             : 
    8592          11 :     if (shutdown)
    8593             :     {
    8594           3 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8595           3 :         ControlFile->state = DB_SHUTDOWNING;
    8596           3 :         ControlFile->time = (pg_time_t) time(NULL);
    8597           3 :         UpdateControlFile();
    8598           3 :         LWLockRelease(ControlFileLock);
    8599             :     }
    8600             : 
    8601             :     /*
    8602             :      * Let smgr prepare for checkpoint; this has to happen before we determine
    8603             :      * the REDO pointer.  Note that smgr must not do anything that'd have to
    8604             :      * be undone if we decide no checkpoint is needed.
    8605             :      */
    8606          11 :     smgrpreckpt();
    8607             : 
    8608             :     /* Begin filling in the checkpoint WAL record */
    8609          11 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
    8610          11 :     checkPoint.time = (pg_time_t) time(NULL);
    8611             : 
    8612             :     /*
    8613             :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
    8614             :      * pointer. This allows us to begin accumulating changes to assemble our
    8615             :      * starting snapshot of locks and transactions.
    8616             :      */
    8617          11 :     if (!shutdown && XLogStandbyInfoActive())
    8618           8 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
    8619             :     else
    8620           3 :         checkPoint.oldestActiveXid = InvalidTransactionId;
    8621             : 
    8622             :     /*
    8623             :      * Get location of last important record before acquiring insert locks (as
    8624             :      * GetLastImportantRecPtr() also locks WAL locks).
    8625             :      */
    8626          11 :     last_important_lsn = GetLastImportantRecPtr();
    8627             : 
    8628             :     /*
    8629             :      * We must block concurrent insertions while examining insert state to
    8630             :      * determine the checkpoint REDO pointer.
    8631             :      */
    8632          11 :     WALInsertLockAcquireExclusive();
    8633          11 :     curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
    8634             : 
    8635             :     /*
    8636             :      * If this isn't a shutdown or forced checkpoint, and if there has been no
    8637             :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
    8638             :      * avoid inserting duplicate checkpoints when the system is idle.
    8639             :      */
    8640          11 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    8641             :                   CHECKPOINT_FORCE)) == 0)
    8642             :     {
    8643           0 :         if (last_important_lsn == ControlFile->checkPoint)
    8644             :         {
    8645           0 :             WALInsertLockRelease();
    8646           0 :             LWLockRelease(CheckpointLock);
    8647           0 :             END_CRIT_SECTION();
    8648           0 :             ereport(DEBUG1,
    8649             :                     (errmsg("checkpoint skipped due to an idle system")));
    8650          11 :             return;
    8651             :         }
    8652             :     }
    8653             : 
    8654             :     /*
    8655             :      * An end-of-recovery checkpoint is created before anyone is allowed to
    8656             :      * write WAL. To allow us to write the checkpoint record, temporarily
    8657             :      * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
    8658             :      * initialized, which we need here and in AdvanceXLInsertBuffer.)
    8659             :      */
    8660          11 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    8661           0 :         LocalSetXLogInsertAllowed();
    8662             : 
    8663          11 :     checkPoint.ThisTimeLineID = ThisTimeLineID;
    8664          11 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    8665           0 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    8666             :     else
    8667          11 :         checkPoint.PrevTimeLineID = ThisTimeLineID;
    8668             : 
    8669          11 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
    8670             : 
    8671             :     /*
    8672             :      * Compute new REDO record ptr = location of next XLOG record.
    8673             :      *
    8674             :      * NB: this is NOT necessarily where the checkpoint record itself will be,
    8675             :      * since other backends may insert more XLOG records while we're off doing
    8676             :      * the buffer flush work.  Those XLOG records are logically after the
    8677             :      * checkpoint, even though physically before it.  Got that?
    8678             :      */
    8679          11 :     freespace = INSERT_FREESPACE(curInsert);
    8680          11 :     if (freespace == 0)
    8681             :     {
    8682           0 :         if (curInsert % XLogSegSize == 0)
    8683           0 :             curInsert += SizeOfXLogLongPHD;
    8684             :         else
    8685           0 :             curInsert += SizeOfXLogShortPHD;
    8686             :     }
    8687          11 :     checkPoint.redo = curInsert;
    8688             : 
    8689             :     /*
    8690             :      * Here we update the shared RedoRecPtr for future XLogInsert calls; this
    8691             :      * must be done while holding all the insertion locks.
    8692             :      *
    8693             :      * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
    8694             :      * pointing past where it really needs to point.  This is okay; the only
    8695             :      * consequence is that XLogInsert might back up whole buffers that it
    8696             :      * didn't really need to.  We can't postpone advancing RedoRecPtr because
    8697             :      * XLogInserts that happen while we are dumping buffers must assume that
    8698             :      * their buffer changes are not included in the checkpoint.
    8699             :      */
    8700          11 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    8701             : 
    8702             :     /*
    8703             :      * Now we can release the WAL insertion locks, allowing other xacts to
    8704             :      * proceed while we are flushing disk buffers.
    8705             :      */
    8706          11 :     WALInsertLockRelease();
    8707             : 
    8708             :     /* Update the info_lck-protected copy of RedoRecPtr as well */
    8709          11 :     SpinLockAcquire(&XLogCtl->info_lck);
    8710          11 :     XLogCtl->RedoRecPtr = checkPoint.redo;
    8711          11 :     SpinLockRelease(&XLogCtl->info_lck);
    8712             : 
    8713             :     /*
    8714             :      * If enabled, log checkpoint start.  We postpone this until now so as not
    8715             :      * to log anything if we decided to skip the checkpoint.
    8716             :      */
    8717          11 :     if (log_checkpoints)
    8718           5 :         LogCheckpointStart(flags, false);
    8719             : 
    8720             :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
    8721             : 
    8722             :     /*
    8723             :      * Get the other info we need for the checkpoint record.
    8724             :      *
    8725             :      * We don't need to save oldestClogXid in the checkpoint, it only matters
    8726             :      * for the short period in which clog is being truncated, and if we crash
    8727             :      * during that we'll redo the clog truncation and fix up oldestClogXid
    8728             :      * there.
    8729             :      */
    8730          11 :     LWLockAcquire(XidGenLock, LW_SHARED);
    8731          11 :     checkPoint.nextXid = ShmemVariableCache->nextXid;
    8732          11 :     checkPoint.oldestXid = ShmemVariableCache->oldestXid;
    8733          11 :     checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
    8734          11 :     LWLockRelease(XidGenLock);
    8735             : 
    8736          11 :     LWLockAcquire(CommitTsLock, LW_SHARED);
    8737          11 :     checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
    8738          11 :     checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
    8739          11 :     LWLockRelease(CommitTsLock);
    8740             : 
    8741             :     /* Increase XID epoch if we've wrapped around since last checkpoint */
    8742          11 :     checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
    8743          11 :     if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
    8744           0 :         checkPoint.nextXidEpoch++;
    8745             : 
    8746          11 :     LWLockAcquire(OidGenLock, LW_SHARED);
    8747          11 :     checkPoint.nextOid = ShmemVariableCache->nextOid;
    8748          11 :     if (!shutdown)
    8749           8 :         checkPoint.nextOid += ShmemVariableCache->oidCount;
    8750          11 :     LWLockRelease(OidGenLock);
    8751             : 
    8752          11 :     MultiXactGetCheckptMulti(shutdown,
    8753             :                              &checkPoint.nextMulti,
    8754             :                              &checkPoint.nextMultiOffset,
    8755             :                              &checkPoint.oldestMulti,
    8756             :                              &checkPoint.oldestMultiDB);
    8757             : 
    8758             :     /*
    8759             :      * Having constructed the checkpoint record, ensure all shmem disk buffers
    8760             :      * and commit-log buffers are flushed to disk.
    8761             :      *
    8762             :      * This I/O could fail for various reasons.  If so, we will fail to
    8763             :      * complete the checkpoint, but there is no reason to force a system
    8764             :      * panic. Accordingly, exit critical section while doing it.
    8765             :      */
    8766          11 :     END_CRIT_SECTION();
    8767             : 
    8768             :     /*
    8769             :      * In some cases there are groups of actions that must all occur on one
    8770             :      * side or the other of a checkpoint record. Before flushing the
    8771             :      * checkpoint record we must explicitly wait for any backend currently
    8772             :      * performing those groups of actions.
    8773             :      *
    8774             :      * One example is end of transaction, so we must wait for any transactions
    8775             :      * that are currently in commit critical sections.  If an xact inserted
    8776             :      * its commit record into XLOG just before the REDO point, then a crash
    8777             :      * restart from the REDO point would not replay that record, which means
    8778             :      * that our flushing had better include the xact's update of pg_xact.  So
    8779             :      * we wait till he's out of his commit critical section before proceeding.
    8780             :      * See notes in RecordTransactionCommit().
    8781             :      *
    8782             :      * Because we've already released the insertion locks, this test is a bit
    8783             :      * fuzzy: it is possible that we will wait for xacts we didn't really need
    8784             :      * to wait for.  But the delay should be short and it seems better to make
    8785             :      * checkpoint take a bit longer than to hold off insertions longer than
    8786             :      * necessary. (In fact, the whole reason we have this issue is that xact.c
    8787             :      * does commit record XLOG insertion and clog update as two separate steps
    8788             :      * protected by different locks, but again that seems best on grounds of
    8789             :      * minimizing lock contention.)
    8790             :      *
    8791             :      * A transaction that has not yet set delayChkpt when we look cannot be at
    8792             :      * risk, since he's not inserted his commit record yet; and one that's
    8793             :      * already cleared it is not at risk either, since he's done fixing clog
    8794             :      * and we will correctly flush the update below.  So we cannot miss any
    8795             :      * xacts we need to wait for.
    8796             :      */
    8797          11 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
    8798          11 :     if (nvxids > 0)
    8799             :     {
    8800             :         do
    8801             :         {
    8802           0 :             pg_usleep(10000L);  /* wait for 10 msec */
    8803           0 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
    8804             :     }
    8805          11 :     pfree(vxids);
    8806             : 
    8807          11 :     CheckPointGuts(checkPoint.redo, flags);
    8808             : 
    8809             :     /*
    8810             :      * Take a snapshot of running transactions and write this to WAL. This
    8811             :      * allows us to reconstruct the state of running transactions during
    8812             :      * archive recovery, if required. Skip, if this info disabled.
    8813             :      *
    8814             :      * If we are shutting down, or Startup process is completing crash
    8815             :      * recovery we don't need to write running xact data.
    8816             :      */
    8817          11 :     if (!shutdown && XLogStandbyInfoActive())
    8818           8 :         LogStandbySnapshot();
    8819             : 
    8820          11 :     START_CRIT_SECTION();
    8821             : 
    8822             :     /*
    8823             :      * Now insert the checkpoint record into XLOG.
    8824             :      */
    8825          11 :     XLogBeginInsert();
    8826          11 :     XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
    8827          11 :     recptr = XLogInsert(RM_XLOG_ID,
    8828             :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    8829             :                         XLOG_CHECKPOINT_ONLINE);
    8830             : 
    8831          11 :     XLogFlush(recptr);
    8832             : 
    8833             :     /*
    8834             :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
    8835             :      * overwritten at next startup.  No-one should even try, this just allows
    8836             :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
    8837             :      * to just temporarily disable writing until the system has exited
    8838             :      * recovery.
    8839             :      */
    8840          11 :     if (shutdown)
    8841             :     {
    8842           3 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
    8843           0 :             LocalXLogInsertAllowed = -1;    /* return to "check" state */
    8844             :         else
    8845           3 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
    8846             :     }
    8847             : 
    8848             :     /*
    8849             :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    8850             :      * = end of actual checkpoint record.
    8851             :      */
    8852          11 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
    8853           0 :         ereport(PANIC,
    8854             :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
    8855             : 
    8856             :     /*
    8857             :      * Remember the prior checkpoint's redo pointer, used later to determine
    8858             :      * the point where the log can be truncated.
    8859             :      */
    8860          11 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    8861             : 
    8862             :     /*
    8863             :      * Update the control file.
    8864             :      */
    8865          11 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8866          11 :     if (shutdown)
    8867           3 :         ControlFile->state = DB_SHUTDOWNED;
    8868          11 :     ControlFile->prevCheckPoint = ControlFile->checkPoint;
    8869          11 :     ControlFile->checkPoint = ProcLastRecPtr;
    8870          11 :     ControlFile->checkPointCopy = checkPoint;
    8871          11 :     ControlFile->time = (pg_time_t) time(NULL);
    8872             :     /* crash recovery should always recover to the end of WAL */
    8873          11 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    8874          11 :     ControlFile->minRecoveryPointTLI = 0;
    8875             : 
    8876             :     /*
    8877             :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
    8878             :      * unused on non-shutdown checkpoints, but seems useful to store it always
    8879             :      * for debugging purposes.
    8880             :      */
    8881          11 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
    8882          11 :     ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
    8883          11 :     SpinLockRelease(&XLogCtl->ulsn_lck);
    8884             : 
    8885          11 :     UpdateControlFile();
    8886          11 :     LWLockRelease(ControlFileLock);
    8887             : 
    8888             :     /* Update shared-memory copy of checkpoint XID/epoch */
    8889          11 :     SpinLockAcquire(&XLogCtl->info_lck);
    8890          11 :     XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    8891          11 :     XLogCtl->ckptXid = checkPoint.nextXid;
    8892          11 :     SpinLockRelease(&XLogCtl->info_lck);
    8893             : 
    8894             :     /*
    8895             :      * We are now done with critical updates; no need for system panic if we
    8896             :      * have trouble while fooling with old log segments.
    8897             :      */
    8898          11 :     END_CRIT_SECTION();
    8899             : 
    8900             :     /*
    8901             :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    8902             :      */
    8903          11 :     smgrpostckpt();
    8904             : 
    8905             :     /*
    8906             :      * Delete old log files (those no longer needed even for previous
    8907             :      * checkpoint or the standbys in XLOG streaming).
    8908             :      */
    8909          11 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    8910             :     {
    8911             :         XLogSegNo   _logSegNo;
    8912             : 
    8913             :         /* Update the average distance between checkpoints. */
    8914          11 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    8915             : 
    8916          11 :         XLByteToSeg(PriorRedoPtr, _logSegNo);
    8917          11 :         KeepLogSeg(recptr, &_logSegNo);
    8918          11 :         _logSegNo--;
    8919          11 :         RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
    8920             :     }
    8921             : 
    8922             :     /*
    8923             :      * Make more log segments if needed.  (Do this after recycling old log
    8924             :      * segments, since that may supply some of the needed files.)
    8925             :      */
    8926          11 :     if (!shutdown)
    8927           8 :         PreallocXlogFiles(recptr);
    8928             : 
    8929             :     /*
    8930             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    8931             :      * the oldest XMIN of any running transaction.  No future transaction will
    8932             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    8933             :      * in subtrans.c).  During recovery, though, we mustn't do this because
    8934             :      * StartupSUBTRANS hasn't been called yet.
    8935             :      */
    8936          11 :     if (!RecoveryInProgress())
    8937          11 :         TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
    8938             : 
    8939             :     /* Real work is done, but log and update stats before releasing lock. */
    8940          11 :     LogCheckpointEnd(false);
    8941             : 
    8942             :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
    8943             :                                      NBuffers,
    8944             :                                      CheckpointStats.ckpt_segs_added,
    8945             :                                      CheckpointStats.ckpt_segs_removed,
    8946             :                                      CheckpointStats.ckpt_segs_recycled);
    8947             : 
    8948          11 :     LWLockRelease(CheckpointLock);
    8949             : }
    8950             : 
    8951             : /*
    8952             :  * Mark the end of recovery in WAL, but without running a full checkpoint:
    8953             :  * insert and flush an XLOG_END_OF_RECOVERY record, then save the returned
    8954             :  * record pointer and ThisTimeLineID as the control file's minRecoveryPoint.
    8955             :  * A restartpoint is likely to be in progress as we do this and we are
    8956             :  * unwilling to wait for it, so avoid taking the CheckpointLock anywhere here.
    8957             :  * CreateRestartPoint() allows for the case where recovery may end before
    8958             :  * the restartpoint completes so there is no concern of concurrent behaviour.
    8959             :  */
    8960             : static void
    8961           0 : CreateEndOfRecoveryRecord(void)
    8962             : {
    8963             :     xl_end_of_recovery xlrec;
    8964             :     XLogRecPtr  recptr;
    8965             : 
    8966             :     /* sanity check: raise ERROR unless recovery is still in progress */
    8967           0 :     if (!RecoveryInProgress())
    8968           0 :         elog(ERROR, "can only be used to end recovery");
    8969             : 
    8970           0 :     xlrec.end_time = GetCurrentTimestamp();
    8971             :     /* Fill in both timeline IDs while holding the WAL insertion locks. */
    8972           0 :     WALInsertLockAcquireExclusive();
    8973           0 :     xlrec.ThisTimeLineID = ThisTimeLineID;
    8974           0 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    8975           0 :     WALInsertLockRelease();
    8976             :     /* Temporarily allow this process to insert WAL; undone at the bottom. */
    8977           0 :     LocalSetXLogInsertAllowed();
    8978             : 
    8979           0 :     START_CRIT_SECTION();
    8980             : 
    8981           0 :     XLogBeginInsert();
    8982           0 :     XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
    8983           0 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
    8984             : 
    8985           0 :     XLogFlush(recptr);
    8986             : 
    8987             :     /*
    8988             :      * Update the control file so that crash recovery can follow the timeline
    8989             :      * changes to this point.
    8990             :      */
    8991           0 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8992           0 :     ControlFile->time = (pg_time_t) time(NULL);
    8993           0 :     ControlFile->minRecoveryPoint = recptr;
    8994           0 :     ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    8995           0 :     UpdateControlFile();
    8996           0 :     LWLockRelease(ControlFileLock);
    8997             : 
    8998           0 :     END_CRIT_SECTION();
    8999             : 
    9000           0 :     LocalXLogInsertAllowed = -1;    /* return to "check" state */
    9001           0 : }
    9002             : 
    9003             : /*
    9004             :  * Flush all data in shared memory to disk, and fsync.  This is the common
    9005             :  * code shared between regular checkpoints and recovery restartpoints.
    9006             :  * checkPointRedo is passed only to CheckPointTwoPhase() and flags only to
    9007             :  * CheckPointBuffers(); every other CheckPoint*() call here takes no argument.
    9008             :  */
    9009             : static void
    9010          11 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    9011             : {
    9012          11 :     CheckPointCLOG();
    9013          11 :     CheckPointCommitTs();
    9014          11 :     CheckPointSUBTRANS();
    9015          11 :     CheckPointMultiXact();
    9016          11 :     CheckPointPredicate();
    9017          11 :     CheckPointRelationMap();
    9018          11 :     CheckPointReplicationSlots();
    9019          11 :     CheckPointSnapBuild();
    9020          11 :     CheckPointLogicalRewriteHeap();
    9021          11 :     CheckPointBuffers(flags);   /* performs all required fsyncs */
    9022          11 :     CheckPointReplicationOrigin();
    9023             :     /* We deliberately delay 2PC checkpointing as long as possible */
    9024          11 :     CheckPointTwoPhase(checkPointRedo);
    9025          11 : }
    9026             : 
    9027             : /*
    9028             :  * Save a checkpoint for recovery restart if appropriate
    9029             :  *
    9030             :  * This function is called each time a checkpoint record is read from XLOG.
    9031             :  * It must determine whether the checkpoint represents a safe restartpoint or
    9032             :  * not.  If so, the checkpoint record is stashed in shared memory so that
    9033             :  * CreateRestartPoint can consult it.  (Note that the latter function is
    9034             :  * executed by the checkpointer, while this one will be executed by the
    9035             :  * startup process.)
    9036             :  */
    9037             : static void
    9038           0 : RecoveryRestartPoint(const CheckPoint *checkPoint)
    9039             : {
    9040             :     /*
    9041             :      * Refrain from creating a restartpoint if we have seen any references
    9042             :      * to non-existent pages.  Restarting recovery from the restartpoint
    9043             :      * would not see the references, so we would lose the cross-check that
    9044             :      * the pages belonged to a relation that was dropped later.  Only a
    9045             :      * debug message is emitted; shared memory is left untouched.
    9046             :      */
    9047           0 :     if (XLogHaveInvalidPages())
    9048             :     {
    9049           0 :         elog(trace_recovery(DEBUG2),
    9050             :              "could not record restart point at %X/%X because there "
    9051             :              "are unresolved references to invalid pages",
    9052             :              (uint32) (checkPoint->redo >> 32),
    9053             :              (uint32) checkPoint->redo);
    9054           0 :         return;
    9055             :     }
    9056             : 
    9057             :     /*
    9058             :      * Copy the checkpoint record, plus ReadRecPtr and EndRecPtr, to shared
    9059             :      * memory under info_lck so the checkpointer can use them later.
    9060             :      */
    9061           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    9062           0 :     XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
    9063           0 :     XLogCtl->lastCheckPointEndPtr = EndRecPtr;
    9064           0 :     XLogCtl->lastCheckPoint = *checkPoint;
    9065           0 :     SpinLockRelease(&XLogCtl->info_lck);
    9066             : }
    9067             : 
    9068             : /*
    9069             :  * Establish a restartpoint if possible.
    9070             :  *
    9071             :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    9072             :  * to establish a point from which recovery can roll forward without
    9073             :  * replaying the entire recovery log.
    9074             :  *
    9075             :  * Returns true if a new restartpoint was established. We can only establish
    9076             :  * a restartpoint if we have replayed a safe checkpoint record since last
    9077             :  * restartpoint.
    9078             :  */
    9079             : bool
    9080           0 : CreateRestartPoint(int flags)
    9081             : {
    9082             :     XLogRecPtr  lastCheckPointRecPtr;
    9083             :     XLogRecPtr  lastCheckPointEndPtr;
    9084             :     CheckPoint  lastCheckPoint;
    9085             :     XLogRecPtr  PriorRedoPtr;
    9086             :     TimestampTz xtime;
    9087             : 
    9088             :     /*
    9089             :      * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
    9090             :      * happens at a time.
    9091             :      */
    9092           0 :     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
    9093             : 
    9094             :     /* Get a local copy of the last safe checkpoint record. */
    9095           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    9096           0 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
    9097           0 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
    9098           0 :     lastCheckPoint = XLogCtl->lastCheckPoint;
    9099           0 :     SpinLockRelease(&XLogCtl->info_lck);
    9100             : 
    9101             :     /*
    9102             :      * Check that we're still in recovery mode. It's ok if we exit recovery
    9103             :      * mode after this check, the restart point is valid anyway.
    9104             :      */
    9105           0 :     if (!RecoveryInProgress())
    9106             :     {
    9107           0 :         ereport(DEBUG2,
    9108             :                 (errmsg("skipping restartpoint, recovery has already ended")));
    9109           0 :         LWLockRelease(CheckpointLock);
    9110           0 :         return false;
    9111             :     }
    9112             : 
    9113             :     /*
    9114             :      * If the last checkpoint record we've replayed is already our last
    9115             :      * restartpoint, we can't perform a new restart point. We still update
    9116             :      * minRecoveryPoint in that case, so that if this is a shutdown restart
    9117             :      * point, we won't start up earlier than before. That's not strictly
    9118             :      * necessary, but when hot standby is enabled, it would be rather weird if
    9119             :      * the database opened up for read-only connections at a point-in-time
    9120             :      * before the last shutdown. Such time travel is still possible in case of
    9121             :      * immediate shutdown, though.
    9122             :      *
    9123             :      * We don't explicitly advance minRecoveryPoint when we do create a
    9124             :      * restartpoint. It's assumed that flushing the buffers will do that as a
    9125             :      * side-effect.
    9126             :      */
    9127           0 :     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
    9128           0 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    9129             :     {
    9130           0 :         ereport(DEBUG2,
    9131             :                 (errmsg("skipping restartpoint, already performed at %X/%X",
    9132             :                         (uint32) (lastCheckPoint.redo >> 32),
    9133             :                         (uint32) lastCheckPoint.redo)));
    9134             : 
    9135           0 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    9136           0 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    9137             :         {
    9138           0 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9139           0 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    9140           0 :             ControlFile->time = (pg_time_t) time(NULL);
    9141           0 :             UpdateControlFile();
    9142           0 :             LWLockRelease(ControlFileLock);
    9143             :         }
    9144           0 :         LWLockRelease(CheckpointLock);
    9145           0 :         return false;
    9146             :     }
    9147             : 
    9148             :     /*
    9149             :      * Update the shared RedoRecPtr so that the startup process can calculate
    9150             :      * the number of segments replayed since last restartpoint, and request a
    9151             :      * restartpoint if it exceeds CheckPointSegments.
    9152             :      *
    9153             :      * Like in CreateCheckPoint(), hold off insertions to update it, although
    9154             :      * during recovery this is just pro forma, because no WAL insertions are
    9155             :      * happening.
    9156             :      */
    9157           0 :     WALInsertLockAcquireExclusive();
    9158           0 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
    9159           0 :     WALInsertLockRelease();
    9160             : 
    9161             :     /* Also update the info_lck-protected copy */
    9162           0 :     SpinLockAcquire(&XLogCtl->info_lck);
    9163           0 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
    9164           0 :     SpinLockRelease(&XLogCtl->info_lck);
    9165             : 
    9166             :     /*
    9167             :      * Prepare to accumulate statistics.
    9168             :      *
    9169             :      * Note: because it is possible for log_checkpoints to change while a
    9170             :      * checkpoint proceeds, we always accumulate stats, even if
    9171             :      * log_checkpoints is currently off.
    9172             :      */
    9173           0 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    9174           0 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    9175             : 
    9176           0 :     if (log_checkpoints)
    9177           0 :         LogCheckpointStart(flags, true);
    9178             : 
    9179           0 :     CheckPointGuts(lastCheckPoint.redo, flags);
    9180             : 
    9181             :     /*
    9182             :      * Remember the prior checkpoint's redo pointer, used later to determine
    9183             :      * the point at which we can truncate the log.
    9184             :      */
    9185           0 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    9186             : 
    9187             :     /*
    9188             :      * Update pg_control, using current time.  Check that it still shows
    9189             :      * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
    9190             :      * this is a quick hack to make sure nothing really bad happens if somehow
    9191             :      * we get here after the end-of-recovery checkpoint.
    9192             :      */
    9193           0 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9194           0 :     if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
    9195           0 :         ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    9196             :     {
    9197           0 :         ControlFile->prevCheckPoint = ControlFile->checkPoint;
    9198           0 :         ControlFile->checkPoint = lastCheckPointRecPtr;
    9199           0 :         ControlFile->checkPointCopy = lastCheckPoint;
    9200           0 :         ControlFile->time = (pg_time_t) time(NULL);
    9201             : 
    9202             :         /*
    9203             :          * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
    9204             :          * this will have happened already while writing out dirty buffers,
    9205             :          * but not necessarily - e.g. because no buffers were dirtied.  We do
    9206             :          * this because a non-exclusive base backup uses minRecoveryPoint to
    9207             :          * determine which WAL files must be included in the backup, and the
    9208             :          * file (or files) containing the checkpoint record must be included,
    9209             :          * at a minimum. Note that for an ordinary restart of recovery there's
    9210             :          * no value in having the minimum recovery point any earlier than this
    9211             :          * anyway, because redo will begin just after the checkpoint record.
    9212             :          */
    9213           0 :         if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
    9214             :         {
    9215           0 :             ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
    9216           0 :             ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
    9217             : 
    9218             :             /* update local copy */
    9219           0 :             minRecoveryPoint = ControlFile->minRecoveryPoint;
    9220           0 :             minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    9221             :         }
    9222           0 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    9223           0 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    9224           0 :         UpdateControlFile();
    9225             :     }
    9226           0 :     LWLockRelease(ControlFileLock);
    9227             : 
    9228             :     /*
    9229             :      * Delete old log files (those no longer needed even for previous
    9230             :      * checkpoint/restartpoint) to prevent the disk holding the xlog from
    9231             :      * growing full.
    9232             :      */
    9233           0 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    9234             :     {
    9235             :         XLogRecPtr  receivePtr;
    9236             :         XLogRecPtr  replayPtr;
    9237             :         TimeLineID  replayTLI;
    9238             :         XLogRecPtr  endptr;
    9239             :         XLogSegNo   _logSegNo;
    9240             : 
    9241             :         /* Update the average distance between checkpoints/restartpoints. */
    9242           0 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    9243             : 
    9244           0 :         XLByteToSeg(PriorRedoPtr, _logSegNo);
    9245             : 
    9246             :         /*
    9247             :          * Get the current end of xlog replayed or received, whichever is
    9248             :          * later.
    9249             :          */
    9250           0 :         receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
    9251           0 :         replayPtr = GetXLogReplayRecPtr(&replayTLI);
    9252           0 :         endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
    9253             : 
    9254           0 :         KeepLogSeg(endptr, &_logSegNo);
    9255           0 :         _logSegNo--;
    9256             : 
    9257             :         /*
    9258             :          * Try to recycle segments on a useful timeline. If we've been
    9259             :          * promoted since the beginning of this restartpoint, use the new
    9260             :          * timeline chosen at end of recovery (RecoveryInProgress() sets
    9261             :          * ThisTimeLineID in that case). If we're still in recovery, use the
    9262             :          * timeline we're currently replaying.
    9263             :          *
    9264             :          * There is no guarantee that the WAL segments will be useful on the
    9265             :          * current timeline; if recovery proceeds to a new timeline right
    9266             :          * after this, the pre-allocated WAL segments on this timeline will
    9267             :          * not be used, and will go wasted until recycled on the next
    9268             :          * restartpoint. We'll live with that.
    9269             :          */
    9270           0 :         if (RecoveryInProgress())
    9271           0 :             ThisTimeLineID = replayTLI;
    9272             : 
    9273           0 :         RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
    9274             : 
    9275             :         /*
    9276             :          * Make more log segments if needed.  (Do this after recycling old log
    9277             :          * segments, since that may supply some of the needed files.)
    9278             :          */
    9279           0 :         PreallocXlogFiles(endptr);
    9280             : 
    9281             :         /*
    9282             :          * ThisTimeLineID is normally not set when we're still in recovery.
    9283             :          * However, recycling/preallocating segments above needed
    9284             :          * ThisTimeLineID to determine which timeline to install the segments
    9285             :          * on. Reset it now, to restore the normal state of affairs for
    9286             :          * debugging purposes.
    9287             :          */
    9288           0 :         if (RecoveryInProgress())
    9289           0 :             ThisTimeLineID = 0;
    9290             :     }
    9291             : 
    9292             :     /*
    9293             :      * Truncate pg_subtrans if possible.  We can throw away all data before
    9294             :      * the oldest XMIN of any running transaction.  No future transaction will
    9295             :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    9296             :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
    9297             :      * this because StartupSUBTRANS hasn't been called yet.
    9298             :      */
    9299           0 :     if (EnableHotStandby)
    9300           0 :         TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
    9301             : 
    9302             :     /* Real work is done, but log and update before releasing lock. */
    9303           0 :     LogCheckpointEnd(true);
    9304             : 
    9305           0 :     xtime = GetLatestXTime();
    9306           0 :     ereport((log_checkpoints ? LOG : DEBUG2),
    9307             :             (errmsg("recovery restart point at %X/%X",
    9308             :                     (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
    9309             :              xtime ? errdetail("last completed transaction was at log time %s",
    9310             :                                timestamptz_to_str(xtime)) : 0));
    9311             : 
    9312           0 :     LWLockRelease(CheckpointLock);
    9313             : 
    9314             :     /*
    9315             :      * Finally, execute archive_cleanup_command, if any.
    9316             :      */
    9317           0 :     if (XLogCtl->archiveCleanupCommand[0])
    9318           0 :         ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
    9319             :                                "archive_cleanup_command",
    9320             :                                false);
    9321             : 
    9322           0 :     return true;
    9323             : }
    9324             : 
    9325             : /*
    9326             :  * Retreat *logSegNo to the last segment that we need to retain because of
    9327             :  * either wal_keep_segments or replication slots.
    9328             :  *
    9329             :  * The limit is the segment containing recptr minus wal_keep_segments (clamped
    9330             :  * to segment 1), lowered further to the segment of the LSN returned by
    9331             :  * XLogGetReplicationSlotMinimumLSN().  *logSegNo is only ever moved backwards.
    9332             :  */
    9333             : static void
    9334          11 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
    9335             : {
    9336             :     XLogSegNo   segno;
    9337             :     XLogRecPtr  keep;
    9338             : 
    9339          11 :     XLByteToSeg(recptr, segno);
    9340          11 :     keep = XLogGetReplicationSlotMinimumLSN();
    9341             : 
    9342             :     /* compute limit for wal_keep_segments first */
    9343          11 :     if (wal_keep_segments > 0)
    9344             :     {
    9345             :         /* avoid underflow, don't go below 1 */
    9346           0 :         if (segno <= wal_keep_segments)
    9347           0 :             segno = 1;
    9348             :         else
    9349           0 :             segno = segno - wal_keep_segments;
    9350             :     }
    9351             : 
    9352             :     /* then check whether slots limit removal further (needs max_replication_slots > 0 and a valid keep) */
    9353          11 :     if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
    9354             :     {
    9355             :         XLogSegNo   slotSegNo;
    9356             : 
    9357           0 :         XLByteToSeg(keep, slotSegNo);
    9358             : 
    9359           0 :         if (slotSegNo <= 0)
    9360           0 :             segno = 1;
    9361           0 :         else if (slotSegNo < segno)
    9362           0 :             segno = slotSegNo;
    9363             :     }
    9364             : 
    9365             :     /* don't delete WAL segments newer than the calculated segment */
    9366          11 :     if (segno < *logSegNo)
    9367           0 :         *logSegNo = segno;
    9368          11 : }
    9369             : 
    9370             : /*
    9371             :  * Write a NEXTOID log record whose payload is nextOid (inserted, not flushed)
    9372             :  */
    9373             : void
    9374           6 : XLogPutNextOid(Oid nextOid)
    9375             : {
    9376           6 :     XLogBeginInsert();
    9377           6 :     XLogRegisterData((char *) (&nextOid), sizeof(Oid));
    9378           6 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
    9379             : 
    9380             :     /*
    9381             :      * We need not flush the NEXTOID record immediately, because any of the
    9382             :      * just-allocated OIDs could only reach disk as part of a tuple insert or
    9383             :      * update that would have its own XLOG record that must follow the NEXTOID
    9384             :      * record.  Therefore, the standard buffer LSN interlock applied to those
    9385             :      * records will ensure no such OID reaches disk before the NEXTOID record
    9386             :      * does.
    9387             :      *
    9388             :      * Note, however, that the above statement only covers state "within" the
    9389             :      * database.  When we use a generated OID as a file or directory name, we
    9390             :      * are in a sense violating the basic WAL rule, because that filesystem
    9391             :      * change may reach disk before the NEXTOID WAL record does.  The impact
    9392             :      * of this is that if a database crash occurs immediately afterward, we
    9393             :      * might after restart re-generate the same OID and find that it conflicts
    9394             :      * with the leftover file or directory.  But since for safety's sake we
    9395             :      * always loop until finding a nonconflicting filename, this poses no real
    9396             :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    9397             :      */
    9398           6 : }
    9399             : 
    9400             : /*
    9401             :  * Write an XLOG SWITCH record (XLOG_MARK_UNIMPORTANT if mark_unimportant).
    9402             :  *
    9403             :  * Here we just blindly issue an XLogInsert request for the record.
    9404             :  * All the magic happens inside XLogInsert.
    9405             :  *
    9406             :  * The return value is either the end+1 address of the switch record,
    9407             :  * or the end+1 address of the prior segment if we did not need to
    9408             :  * write a switch record because we are already at segment start.
    9409             :  */
    9410             : XLogRecPtr
    9411           0 : RequestXLogSwitch(bool mark_unimportant)
    9412             : {
    9413             :     XLogRecPtr  RecPtr;
    9414             : 
    9415             :     /* XLOG SWITCH has no data */
    9416           0 :     XLogBeginInsert();
    9417             : 
    9418           0 :     if (mark_unimportant)
    9419           0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    9420           0 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
    9421             : 
    9422           0 :     return RecPtr;
    9423             : }
    9424             : 
    9425             : /*
    9426             :  * Write a RESTORE POINT record for rpName (copied with a MAXFNAMELEN limit), report it at LOG level, and return XLogInsert's result
    9427             :  */
    9428             : XLogRecPtr
    9429           0 : XLogRestorePoint(const char *rpName)
    9430             : {
    9431             :     XLogRecPtr  RecPtr;
    9432             :     xl_restore_point xlrec;
    9433             : 
    9434           0 :     xlrec.rp_time = GetCurrentTimestamp();
    9435           0 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    9436             : 
    9437           0 :     XLogBeginInsert();
    9438           0 :     XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
    9439             : 
    9440           0 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
    9441             : 
    9442           0 :     ereport(LOG,
    9443             :             (errmsg("restore point \"%s\" created at %X/%X",
    9444             :                     rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
    9445             : 
    9446           0 :     return RecPtr;
    9447             : }
    9448             : 
    9449             : /*
    9450             :  * Check if any of the GUC parameters that are critical for hot standby have
    9451             :  * changed; if so, WAL-log the new values when needed and update pg_control.
    9452             :  */
    9453             : static void
    9454           3 : XLogReportParameters(void)
    9455             : {
    9456           6 :     if (wal_level != ControlFile->wal_level ||
    9457           6 :         wal_log_hints != ControlFile->wal_log_hints ||
    9458           6 :         MaxConnections != ControlFile->MaxConnections ||
    9459           6 :         max_worker_processes != ControlFile->max_worker_processes ||
    9460           5 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
    9461           4 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
    9462           2 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
    9463             :     {
    9464             :         /*
    9465             :          * The change in number of backend slots doesn't need to be WAL-logged
    9466             :          * if archiving is not enabled, as you can't start archive recovery
    9467             :          * with wal_level=minimal anyway. We don't really care about the
    9468             :          * values in pg_control either if wal_level=minimal, but seems better
    9469             :          * to keep them up-to-date to avoid confusion.
    9470             :          */
    9471           1 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    9472             :         {
    9473             :             xl_parameter_change xlrec;
    9474             :             XLogRecPtr  recptr;
    9475             : 
    9476           1 :             xlrec.MaxConnections = MaxConnections;
    9477           1 :             xlrec.max_worker_processes = max_worker_processes;
    9478           1 :             xlrec.max_prepared_xacts = max_prepared_xacts;
    9479           1 :             xlrec.max_locks_per_xact = max_locks_per_xact;
    9480           1 :             xlrec.wal_level = wal_level;
    9481           1 :             xlrec.wal_log_hints = wal_log_hints;
    9482           1 :             xlrec.track_commit_timestamp = track_commit_timestamp;
    9483             : 
    9484           1 :             XLogBeginInsert();
    9485           1 :             XLogRegisterData((char *) &xlrec, sizeof(xlrec));
    9486             : 
    9487           1 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
    9488           1 :             XLogFlush(recptr);  /* flushed before pg_control is updated below */
    9489             :         }
    9490             : 
    9491           1 :         ControlFile->MaxConnections = MaxConnections;
    9492           1 :         ControlFile->max_worker_processes = max_worker_processes;
    9493           1 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
    9494           1 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
    9495           1 :         ControlFile->wal_level = wal_level;
    9496           1 :         ControlFile->wal_log_hints = wal_log_hints;
    9497           1 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
    9498           1 :         UpdateControlFile();
    9499             :     }
    9500           3 : }
    9501             : 
    9502             : /*
    9503             :  * Update full_page_writes in shared memory, and write an
    9504             :  * XLOG_FPW_CHANGE record if necessary.
    9505             :  *
    9506             :  * Note: this function assumes there is no other process running
    9507             :  * concurrently that could update the shared full_page_writes value.
    9508             :  */
    9509             : void
    9510           4 : UpdateFullPageWrites(void)
    9511             : {
    9512           4 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    9513             : 
    9514             :     /*
    9515             :      * Do nothing if full_page_writes has not been changed.
    9516             :      *
    9517             :      * It's safe to check the shared full_page_writes without the lock,
    9518             :      * because we assume that there is no concurrently running process which
    9519             :      * can update it.
    9520             :      */
    9521           4 :     if (fullPageWrites == Insert->fullPageWrites)
    9522           8 :         return;
    9523             : 
    9524           0 :     START_CRIT_SECTION();
    9525             : 
    9526             :     /*
    9527             :      * It's always safe to take full page images, even when not strictly
    9528             :      * required, but not the other way round. So if we're setting
    9529             :      * full_page_writes to true, first set it true and then write the WAL
    9530             :      * record. If we're setting it to false, first write the WAL record and
    9531             :      * then set the global flag.
    9532             :      */
    9533           0 :     if (fullPageWrites)
    9534             :     {
    9535           0 :         WALInsertLockAcquireExclusive();
    9536           0 :         Insert->fullPageWrites = true;
    9537           0 :         WALInsertLockRelease();
    9538             :     }
    9539             : 
    9540             :     /*
    9541             :      * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
    9542             :      * full_page_writes during archive recovery, if required.  The record is
    9543             :      * only written when XLogStandbyInfoActive() and we are not in recovery.
    9544           0 :     if (XLogStandbyInfoActive() && !RecoveryInProgress())
    9545             :     {
    9546           0 :         XLogBeginInsert();
    9547           0 :         XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
    9548             : 
    9549           0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
    9550             :     }
    9551             : 
    9552           0 :     if (!fullPageWrites)
    9553             :     {
    9554           0 :         WALInsertLockAcquireExclusive();
    9555           0 :         Insert->fullPageWrites = false;
    9556           0 :         WALInsertLockRelease();
    9557             :     }
    9558           0 :     END_CRIT_SECTION();
    9559             : }
    9560             : 
    9561             : /*
    9562             :  * Check that it's OK to switch to new timeline during recovery; PANIC if not.
    9563             :  *
    9564             :  * 'lsn' is the address of the shutdown checkpoint record we're about to
    9565             :  * replay. (Currently, timeline can only change at a shutdown checkpoint).
    9566             :  */
    9567             : static void
    9568           0 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
    9569             : {
    9570             :     /* Check that the record agrees on what the current (old) timeline is */
    9571           0 :     if (prevTLI != ThisTimeLineID)
    9572           0 :         ereport(PANIC,
    9573             :                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
    9574             :                         prevTLI, ThisTimeLineID)));
    9575             : 
    9576             :     /*
    9577             :      * The new timeline better be in the list of timelines we expect to see,
    9578             :      * according to the timeline history. It should also not decrease.
    9579             :      */
    9580           0 :     if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
    9581           0 :         ereport(PANIC,
    9582             :                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
    9583             :                         newTLI, ThisTimeLineID)));
    9584             : 
    9585             :     /*
    9586             :      * If we have not yet reached min recovery point, and we're about to
    9587             :      * switch to a timeline greater than the timeline of the min recovery
    9588             :      * point: trouble. After switching to the new timeline, we could not
    9589             :      * possibly visit the min recovery point on the correct timeline anymore.
    9590             :      * This can happen if there is a newer timeline in the archive that
    9591             :      * branched before the timeline the min recovery point is on, and you
    9592             :      * attempt to do PITR to the new timeline.
    9593             :      */
    9594           0 :     if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
    9595           0 :         lsn < minRecoveryPoint &&
    9596           0 :         newTLI > minRecoveryPointTLI)
    9597           0 :         ereport(PANIC,
    9598             :                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
    9599             :                         newTLI,
    9600             :                         (uint32) (minRecoveryPoint >> 32),
    9601             :                         (uint32) minRecoveryPoint,
    9602             :                         minRecoveryPointTLI)));
    9603             : 
    9604             :     /* Looks good */
    9605           0 : }
    9606             : 
    9607             : /*
    9608             :  * XLOG resource manager's routines
    9609             :  *
    9610             :  * Definitions of info values are in include/catalog/pg_control.h, though
    9611             :  * not all record types are related to control file updates.
    9612             :  */
    9613             : void
    9614           0 : xlog_redo(XLogReaderState *record)
    9615             : {
    9616           0 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    9617           0 :     XLogRecPtr  lsn = record->EndRecPtr;
    9618             : 
    9619             :     /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
    9620           0 :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
    9621             :            !XLogRecHasAnyBlockRefs(record));
    9622             : 
    9623           0 :     if (info == XLOG_NEXTOID)
    9624             :     {
    9625             :         Oid         nextOid;
    9626             : 
    9627             :         /*
    9628             :          * We used to try to take the maximum of ShmemVariableCache->nextOid
    9629             :          * and the recorded nextOid, but that fails if the OID counter wraps
    9630             :          * around.  Since no OID allocation should be happening during replay
    9631             :          * anyway, better to just believe the record exactly.  We still take
    9632             :          * OidGenLock while setting the variable, just in case.
    9633             :          */
    9634           0 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    9635           0 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    9636           0 :         ShmemVariableCache->nextOid = nextOid;
    9637           0 :         ShmemVariableCache->oidCount = 0;
    9638           0 :         LWLockRelease(OidGenLock);
    9639             :     }
    9640           0 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    9641             :     {
    9642             :         CheckPoint  checkPoint;
    9643             : 
    9644           0 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    9645             :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
    9646           0 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    9647           0 :         ShmemVariableCache->nextXid = checkPoint.nextXid;
    9648           0 :         LWLockRelease(XidGenLock);
    9649           0 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    9650           0 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
    9651           0 :         ShmemVariableCache->oidCount = 0;
    9652           0 :         LWLockRelease(OidGenLock);
    9653           0 :         MultiXactSetNextMXact(checkPoint.nextMulti,
    9654             :                               checkPoint.nextMultiOffset);
    9655             : 
    9656           0 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    9657             :                                checkPoint.oldestMultiDB);
    9658             : 
    9659             :         /*
    9660             :          * No need to set oldestClogXid here as well; it'll be set when we
    9661             :          * redo an xl_clog_truncate if it changed since initialization.
    9662             :          */
    9663           0 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    9664             : 
    9665             :         /*
    9666             :          * If we see a shutdown checkpoint while waiting for an end-of-backup
    9667             :          * record, the backup was canceled and the end-of-backup record will
    9668             :          * never arrive.
    9669             :          */
    9670           0 :         if (ArchiveRecoveryRequested &&
    9671           0 :             !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
    9672           0 :             XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
    9673           0 :             ereport(PANIC,
    9674             :                     (errmsg("online backup was canceled, recovery cannot continue")));
    9675             : 
    9676             :         /*
    9677             :          * If we see a shutdown checkpoint, we know that nothing was running
    9678             :          * on the master at this point. So fake-up an empty running-xacts
    9679             :          * record and use that here and now. Recover additional standby state
    9680             :          * for prepared transactions.
    9681             :          */
    9682           0 :         if (standbyState >= STANDBY_INITIALIZED)
    9683             :         {
    9684             :             TransactionId *xids;
    9685             :             int         nxids;
    9686             :             TransactionId oldestActiveXID;
    9687             :             TransactionId latestCompletedXid;
    9688             :             RunningTransactionsData running;
    9689             : 
    9690           0 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    9691             : 
    9692             :             /*
    9693             :              * Construct a RunningTransactions snapshot representing a shut
    9694             :              * down server, with only prepared transactions still alive. We're
    9695             :              * never overflowed at this point because all subxids are listed
    9696             :              * with their parent prepared transactions.
    9697             :              */
    9698           0 :             running.xcnt = nxids;
    9699           0 :             running.subxcnt = 0;
    9700           0 :             running.subxid_overflow = false;
    9701           0 :             running.nextXid = checkPoint.nextXid;
    9702           0 :             running.oldestRunningXid = oldestActiveXID;
    9703           0 :             latestCompletedXid = checkPoint.nextXid;
    9704           0 :             TransactionIdRetreat(latestCompletedXid);
    9705           0 :             Assert(TransactionIdIsNormal(latestCompletedXid));
    9706           0 :             running.latestCompletedXid = latestCompletedXid;
    9707           0 :             running.xids = xids;
    9708             : 
    9709           0 :             ProcArrayApplyRecoveryInfo(&running);
    9710             : 
    9711           0 :             StandbyRecoverPreparedTransactions();
    9712             :         }
    9713             : 
    9714             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    9715           0 :         ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
    9716           0 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    9717             : 
    9718             :         /* Update shared-memory copy of checkpoint XID/epoch */
    9719           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    9720           0 :         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    9721           0 :         XLogCtl->ckptXid = checkPoint.nextXid;
    9722           0 :         SpinLockRelease(&XLogCtl->info_lck);
    9723             : 
    9724             :         /*
    9725             :          * We should've already switched to the new TLI before replaying this
    9726             :          * record.
    9727             :          */
    9728           0 :         if (checkPoint.ThisTimeLineID != ThisTimeLineID)
    9729           0 :             ereport(PANIC,
    9730             :                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
    9731             :                             checkPoint.ThisTimeLineID, ThisTimeLineID)));
    9732             : 
    9733           0 :         RecoveryRestartPoint(&checkPoint);
    9734             :     }
    9735           0 :     else if (info == XLOG_CHECKPOINT_ONLINE)
    9736             :     {
    9737             :         CheckPoint  checkPoint;
    9738             : 
    9739           0 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    9740             :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    9741           0 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    9742           0 :         if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
    9743             :                                   checkPoint.nextXid))
    9744           0 :             ShmemVariableCache->nextXid = checkPoint.nextXid;
    9745           0 :         LWLockRelease(XidGenLock);
    9746             :         /* ... but still treat OID counter as exact */
    9747           0 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    9748           0 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
    9749           0 :         ShmemVariableCache->oidCount = 0;
    9750           0 :         LWLockRelease(OidGenLock);
    9751           0 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
    9752             :                                   checkPoint.nextMultiOffset);
    9753             : 
    9754             :         /*
    9755             :          * NB: This may perform multixact truncation when replaying WAL
    9756             :          * generated by an older primary.
    9757             :          */
    9758           0 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    9759             :                                checkPoint.oldestMultiDB);
    9760           0 :         if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
    9761             :                                   checkPoint.oldestXid))
    9762           0 :             SetTransactionIdLimit(checkPoint.oldestXid,
    9763             :                                   checkPoint.oldestXidDB);
    9764             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    9765           0 :         ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
    9766           0 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    9767             : 
    9768             :         /* Update shared-memory copy of checkpoint XID/epoch */
    9769           0 :         SpinLockAcquire(&XLogCtl->info_lck);
    9770           0 :         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    9771           0 :         XLogCtl->ckptXid = checkPoint.nextXid;
    9772           0 :         SpinLockRelease(&XLogCtl->info_lck);
    9773             : 
    9774             :         /* TLI should not change in an on-line checkpoint */
    9775           0 :         if (checkPoint.ThisTimeLineID != ThisTimeLineID)
    9776           0 :             ereport(PANIC,
    9777             :                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
    9778             :                             checkPoint.ThisTimeLineID, ThisTimeLineID)));
    9779             : 
    9780           0 :         RecoveryRestartPoint(&checkPoint);
    9781             :     }
    9782           0 :     else if (info == XLOG_END_OF_RECOVERY)
    9783             :     {
    9784             :         xl_end_of_recovery xlrec;
    9785             : 
    9786           0 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
    9787             : 
    9788             :         /*
    9789             :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
    9790             :          * but this case is rarer and harder to test, so the benefit doesn't
    9791             :          * outweigh the potential extra cost of maintenance.
    9792             :          */
    9793             : 
    9794             :         /*
    9795             :          * We should've already switched to the new TLI before replaying this
    9796             :          * record.
    9797             :          */
    9798           0 :         if (xlrec.ThisTimeLineID != ThisTimeLineID)
    9799           0 :             ereport(PANIC,
    9800             :                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
    9801             :                             xlrec.ThisTimeLineID, ThisTimeLineID)));
    9802             :     }
    9803           0 :     else if (info == XLOG_NOOP)
    9804             :     {
    9805             :         /* nothing to do here */
    9806             :     }
    9807           0 :     else if (info == XLOG_SWITCH)
    9808             :     {
    9809             :         /* nothing to do here */
    9810             :     }
    9811           0 :     else if (info == XLOG_RESTORE_POINT)
    9812             :     {
    9813             :         /* nothing to do here */
    9814             :     }
    9815           0 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    9816           0 :     {
    9817             :         Buffer      buffer;
    9818             : 
    9819             :         /*
    9820             :          * Full-page image (FPI) records contain nothing else but a backup
    9821             :          * block. The block reference must include a full-page image -
    9822             :          * otherwise there would be no point in this record.
    9823             :          *
    9824             :          * No recovery conflicts are generated by these generic records - if a
    9825             :          * resource manager needs to generate conflicts, it has to define a
    9826             :          * separate WAL record type and redo routine.
    9827             :          *
    9828             :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
    9829             :          * WAL-logged because of a hint bit update. They are only generated
    9830             :          * when checksums are enabled. There is no difference in handling
    9831             :          * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
    9832             :          * code just to distinguish them for statistics purposes.
    9833             :          */
    9834           0 :         if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
    9835           0 :             elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
    9836           0 :         UnlockReleaseBuffer(buffer);
    9837             :     }
    9838           0 :     else if (info == XLOG_BACKUP_END)
    9839             :     {
    9840             :         XLogRecPtr  startpoint;
    9841             : 
    9842           0 :         memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
    9843             : 
    9844           0 :         if (ControlFile->backupStartPoint == startpoint)
    9845             :         {
    9846             :             /*
    9847             :              * We have reached the end of base backup, the point where
    9848             :              * pg_stop_backup() was done. The data on disk is now consistent.
    9849             :              * Reset backupStartPoint, and update minRecoveryPoint to make
    9850             :              * sure we don't allow starting up at an earlier point even if
    9851             :              * recovery is stopped and restarted soon after this.
    9852             :              */
    9853           0 :             elog(DEBUG1, "end of backup reached");
    9854             : 
    9855           0 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9856             : 
    9857           0 :             if (ControlFile->minRecoveryPoint < lsn)
    9858             :             {
    9859           0 :                 ControlFile->minRecoveryPoint = lsn;
    9860           0 :                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    9861             :             }
    9862           0 :             ControlFile->backupStartPoint = InvalidXLogRecPtr;
    9863           0 :             ControlFile->backupEndRequired = false;
    9864           0 :             UpdateControlFile();
    9865             : 
    9866           0 :             LWLockRelease(ControlFileLock);
    9867             :         }
    9868             :     }
    9869           0 :     else if (info == XLOG_PARAMETER_CHANGE)
    9870             :     {
    9871             :         xl_parameter_change xlrec;
    9872             : 
    9873             :         /* Update our copy of the parameters in pg_control */
    9874           0 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
    9875             : 
    9876           0 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    9877           0 :         ControlFile->MaxConnections = xlrec.MaxConnections;
    9878           0 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
    9879           0 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
    9880           0 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
    9881           0 :         ControlFile->wal_level = xlrec.wal_level;
    9882           0 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
    9883             : 
    9884             :         /*
    9885             :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
    9886             :          * recover back up to this point before allowing hot standby again.
    9887             :          * This is important if the max_* settings are decreased, to ensure
    9888             :          * you don't run queries against the WAL preceding the change.
    9889             :          */
    9890           0 :         minRecoveryPoint = ControlFile->minRecoveryPoint;
    9891           0 :         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    9892           0 :         if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
    9893             :         {
    9894           0 :             ControlFile->minRecoveryPoint = lsn;
    9895           0 :             ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    9896             :         }
    9897             : 
    9898           0 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
    9899           0 :                                 ControlFile->track_commit_timestamp);
    9900           0 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
    9901             : 
    9902           0 :         UpdateControlFile();
    9903           0 :         LWLockRelease(ControlFileLock);
    9904             : 
    9905             :         /* Check to see if any changes to max_connections give problems */
    9906           0 :         CheckRequiredParameterValues();
    9907             :     }
    9908           0 :     else if (info == XLOG_FPW_CHANGE)
    9909             :     {
    9910             :         bool        fpw;
    9911             : 
    9912           0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
    9913             : 
    9914             :         /*
    9915             :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
    9916             :          * do_pg_start_backup() and do_pg_stop_backup() can check whether
    9917             :          * full_page_writes has been disabled during online backup.
    9918             :          */
    9919           0 :         if (!fpw)
    9920             :         {
    9921           0 :             SpinLockAcquire(&XLogCtl->info_lck);
    9922           0 :             if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
    9923           0 :                 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
    9924           0 :             SpinLockRelease(&XLogCtl->info_lck);
    9925             :         }
    9926             : 
    9927             :         /* Keep track of full_page_writes */
    9928           0 :         lastFullPageWrites = fpw;
    9929             :     }
    9930           0 : }
    9931             : 
    9932             : #ifdef WAL_DEBUG
    9933             : /* Append prev pointer, xid, data length and block references of a WAL record to buf. */
    9934             : static void
    9935             : xlog_outrec(StringInfo buf, XLogReaderState *record)
    9936             : {
    9937             :     int         block_id;
    9938             : 
    9939             :     appendStringInfo(buf, "prev %X/%X; xid %u",
    9940             :                      (uint32) (XLogRecGetPrev(record) >> 32),
    9941             :                      (uint32) XLogRecGetPrev(record),
    9942             :                      XLogRecGetXid(record));
    9943             : 
    9944             :     appendStringInfo(buf, "; len %u",
    9945             :                      XLogRecGetDataLen(record));
    9946             : 
    9947             :     /* describe every block reference slot up to max_block_id that is in use */
    9948             :     for (block_id = 0; block_id <= record->max_block_id; block_id++)
    9949             :     {
    9950             :         RelFileNode rnode;
    9951             :         ForkNumber  forknum;
    9952             :         BlockNumber blk;
    9953             : 
    9954             :         if (!XLogRecHasBlockRef(record, block_id))
    9955             :             continue;           /* no block reference with this id */
    9956             : 
    9957             :         XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
    9958             :         if (forknum != MAIN_FORKNUM)    /* fork number is shown only for non-main forks */
    9959             :             appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
    9960             :                              block_id,
    9961             :                              rnode.spcNode, rnode.dbNode, rnode.relNode,
    9962             :                              forknum,
    9963             :                              blk);
    9964             :         else
    9965             :             appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
    9966             :                              block_id,
    9967             :                              rnode.spcNode, rnode.dbNode, rnode.relNode,
    9968             :                              blk);
    9969             :         if (XLogRecHasBlockImage(record, block_id))
    9970             :             appendStringInfoString(buf, " FPW");    /* block reference includes an image */
    9971             :     }
    9972             : }
    9973             : #endif                          /* WAL_DEBUG */
    9974             : 
    9975             : /*
    9976             :  * Append to buf a description of an XLogRecord: the resource manager name, a
    9977             :  * slash, the record identity, a colon and a space, then whatever rm_desc adds.
    9978             :  */
    9979             : static void
    9980           0 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
    9981             : {
    9982           0 :     RmgrId      rmid = XLogRecGetRmid(record);
    9983           0 :     uint8       info = XLogRecGetInfo(record);
    9984             :     const char *id;
    9985             : 
    9986           0 :     appendStringInfoString(buf, RmgrTable[rmid].rm_name);
    9987           0 :     appendStringInfoChar(buf, '/');
    9988             :     /* a NULL identity is printed as UNKNOWN plus the info bits outside XLR_INFO_MASK */
    9989           0 :     id = RmgrTable[rmid].rm_identify(info);
    9990           0 :     if (id == NULL)
    9991           0 :         appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
    9992             :     else
    9993           0 :         appendStringInfo(buf, "%s: ", id);
    9994             :     /* let the resource manager's rm_desc callback append its own text */
    9995           0 :     RmgrTable[rmid].rm_desc(buf, record);
    9996           0 : }
    9997             : 
    9998             : 
    9999             : /*
   10000             :  * Return the flag bits (possibly none) to use when opening a WAL file for the
   10001             :  * given wal_sync_method value 'method'; always 0 when fsync is disabled.
   10002             :  */
   10003             : static int
   10004         268 : get_sync_bit(int method)
   10005             : {
   10006         268 :     int         o_direct_flag = 0;
   10007             : 
   10008             :     /* If fsync is disabled, never open in sync mode */
   10009         268 :     if (!enableFsync)
   10010         268 :         return 0;
   10011             : 
   10012             :     /*
   10013             :      * Optimize writes by bypassing kernel cache with O_DIRECT when using
   10014             :      * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
   10015             :      * disabled, otherwise the archive command or walsender process will read
   10016             :      * the WAL soon after writing it, which is guaranteed to cause a physical
   10017             :      * read if we bypassed the kernel cache. We also skip the
   10018             :      * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
   10019             :      * reason.
   10020             :      *
   10021             :      * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
   10022             :      * written by walreceiver is normally read by the startup process soon
   10023             :      * after it's written. Also, walreceiver performs unaligned writes, which
   10024             :      * don't work with O_DIRECT, so it is required for correctness too.
   10025             :      */
   10026           0 :     if (!XLogIsNeeded() && !AmWalReceiverProcess())
   10027           0 :         o_direct_flag = PG_O_DIRECT;
   10028             : 
   10029           0 :     switch (method)
   10030             :     {
   10031             :             /*
   10032             :              * enum values for all sync options are defined even if they are
   10033             :              * not supported on the current platform.  But if not, they are
   10034             :              * not included in the enum option array, and therefore will never
   10035             :              * be seen here.
   10036             :              */
   10037             :         case SYNC_METHOD_FSYNC:
   10038             :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
   10039             :         case SYNC_METHOD_FDATASYNC:
   10040           0 :             return 0;           /* these methods need no open flag; o_direct_flag is not applied */
   10041             : #ifdef OPEN_SYNC_FLAG
   10042             :         case SYNC_METHOD_OPEN:
   10043           0 :             return OPEN_SYNC_FLAG | o_direct_flag;
   10044             : #endif
   10045             : #ifdef OPEN_DATASYNC_FLAG
   10046             :         case SYNC_METHOD_OPEN_DSYNC:
   10047           0 :             return OPEN_DATASYNC_FLAG | o_direct_flag;
   10048             : #endif
   10049             :         default:
   10050             :             /* can't happen (unless we are out of sync with option array) */
   10051           0 :             elog(ERROR, "unrecognized wal_sync_method: %d", method);
   10052             :             return 0;           /* silence warning */
   10053             :     }
   10054             : }
   10055             : 
   10056             : /*
   10057             :  * GUC support: react to a change of the WAL sync method ('extra' is not used here)
   10058             :  */
   10059             : void
   10060           5 : assign_xlog_sync_method(int new_sync_method, void *extra)
   10061             : {
   10062           5 :     if (sync_method != new_sync_method)
   10063             :     {
   10064             :         /*
   10065             :          * To ensure that no blocks escape unsynced, force an fsync on the
   10066             :          * currently open log segment (if any).  Also, if the open flag is
   10067             :          * changing, close the log file so it will be reopened (with new flag
   10068             :          * bit) at next use.
   10069             :          */
   10070           0 :         if (openLogFile >= 0)
   10071             :         {
   10072           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
   10073           0 :             if (pg_fsync(openLogFile) != 0)
   10074           0 :                 ereport(PANIC,
   10075             :                         (errcode_for_file_access(),
   10076             :                          errmsg("could not fsync log segment %s: %m",
   10077             :                                 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
   10078           0 :             pgstat_report_wait_end();
   10079           0 :             if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) /* open flags differ */
   10080           0 :                 XLogFileClose();
   10081             :         }
   10082             :     }
   10083           5 : }
   10084             : 
   10085             : 
   10086             : /*
   10087             :  * Issue appropriate kind of fsync (if any) for an XLOG output file.
   10088             :  * The call made depends on sync_method; any failure is reported at PANIC level.
   10089             :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
   10090             :  * 'segno' is the segment number, used only to build the file name for error reports.
   10091             :  */
   10092             : void
   10093        9577 : issue_xlog_fsync(int fd, XLogSegNo segno)
   10094             : {
   10095        9577 :     switch (sync_method)
   10096             :     {
   10097             :         case SYNC_METHOD_FSYNC:
   10098           0 :             if (pg_fsync_no_writethrough(fd) != 0)
   10099           0 :                 ereport(PANIC,
   10100             :                         (errcode_for_file_access(),
   10101             :                          errmsg("could not fsync log file %s: %m",
   10102             :                                 XLogFileNameP(ThisTimeLineID, segno))));
   10103           0 :             break;
   10104             : #ifdef HAVE_FSYNC_WRITETHROUGH
   10105             :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
   10106             :             if (pg_fsync_writethrough(fd) != 0)
   10107             :                 ereport(PANIC,
   10108             :                         (errcode_for_file_access(),
   10109             :                          errmsg("could not fsync write-through log file %s: %m",
   10110             :                                 XLogFileNameP(ThisTimeLineID, segno))));
   10111             :             break;
   10112             : #endif
   10113             : #ifdef HAVE_FDATASYNC
   10114             :         case SYNC_METHOD_FDATASYNC:
   10115        9577 :             if (pg_fdatasync(fd) != 0)
   10116           0 :                 ereport(PANIC,
   10117             :                         (errcode_for_file_access(),
   10118             :                          errmsg("could not fdatasync log file %s: %m",
   10119             :                                 XLogFileNameP(ThisTimeLineID, segno))));
   10120        9577 :             break;
   10121             : #endif
   10122             :         case SYNC_METHOD_OPEN:
   10123             :         case SYNC_METHOD_OPEN_DSYNC:
   10124             :             /* write synced it already */
   10125           0 :             break;
   10126             :         default:
   10127           0 :             elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
   10128             :             break;
   10129             :     }
   10130        9577 : }
   10131             : 
   10132             : /*
   10133             :  * Return the file name of log segment 'segno' on timeline 'tli', as a palloc'd string.
   10134             :  */
   10135             : char *
   10136           0 : XLogFileNameP(TimeLineID tli, XLogSegNo segno)
   10137             : {
   10138           0 :     char       *result = palloc(MAXFNAMELEN);
   10139             :     /* the MAXFNAMELEN-byte buffer is filled in by XLogFileName */
   10140           0 :     XLogFileName(result, tli, segno);
   10141           0 :     return result;
   10142             : }
   10143             : 
   10144             : /*
   10145             :  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
   10146             :  * function. It creates the necessary starting checkpoint and constructs the
   10147             :  * backup label file.
   10148             :  *
   10149             :  * There are two kinds of backups: exclusive and non-exclusive. An exclusive
   10150             :  * backup is started with pg_start_backup(), and there can be only one active
   10151             :  * at a time. The backup and tablespace map files of an exclusive backup are
   10152             :  * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
   10153             :  * removed by pg_stop_backup().
   10154             :  *
   10155             :  * A non-exclusive backup is used for the streaming base backups (see
   10156             :  * src/backend/replication/basebackup.c). The difference to exclusive backups
   10157             :  * is that the backup label and tablespace map files are not written to disk.
   10158             :  * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
   10159             :  * and the caller is responsible for including them in the backup archive as
   10160             :  * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
   10161             :  * active at the same time, and they don't conflict with an exclusive backup
   10162             :  * either.
   10163             :  *
   10164             :  * tblspcmapfile is required mainly for tar format on Windows, as native Windows
   10165             :  * utilities are not able to create symlinks while extracting files from tar.
   10166             :  * However, for consistency, the same is used for all platforms.
   10167             :  *
   10168             :  * needtblspcmapfile is true in the cases where the backup needs to generate a
   10169             :  * tablespace_map file (exclusive backups, and non-exclusive backups only when
   10170             :  * tar format is used for taking the backup); it is used to embed an escape
   10171             :  * character before each newline character in a tablespace path.
   10172             :  *
   10173             :  * Returns the minimum WAL location that must be present to restore from this
   10174             :  * backup, and the corresponding timeline ID in *starttli_p.
   10175             :  *
   10176             :  * Every successfully started non-exclusive backup must be stopped by calling
   10177             :  * do_pg_stop_backup() or do_pg_abort_backup().
   10178             :  *
   10179             :  * It is the responsibility of the caller of this function to verify the
   10180             :  * permissions of the calling user!
   10181             :  */
   10182             : XLogRecPtr
   10183           0 : do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
   10184             :                    StringInfo labelfile, DIR *tblspcdir, List **tablespaces,
   10185             :                    StringInfo tblspcmapfile, bool infotbssize,
   10186             :                    bool needtblspcmapfile)
   10187             : {
   10188           0 :     bool        exclusive = (labelfile == NULL);
   10189           0 :     bool        backup_started_in_recovery = false;
   10190             :     XLogRecPtr  checkpointloc;
   10191             :     XLogRecPtr  startpoint;
   10192             :     TimeLineID  starttli;
   10193             :     pg_time_t   stamp_time;
   10194             :     char        strfbuf[128];
   10195             :     char        xlogfilename[MAXFNAMELEN];
   10196             :     XLogSegNo   _logSegNo;
   10197             :     struct stat stat_buf;
   10198             :     FILE       *fp;
   10199             :     /* A NULL labelfile selects exclusive mode (see above); recovery state is sampled once here. */
   10200           0 :     backup_started_in_recovery = RecoveryInProgress();
   10201             : 
   10202             :     /*
   10203             :      * Currently only non-exclusive backup can be taken during recovery.
   10204             :      */
   10205           0 :     if (backup_started_in_recovery && exclusive)
   10206           0 :         ereport(ERROR,
   10207             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10208             :                  errmsg("recovery is in progress"),
   10209             :                  errhint("WAL control functions cannot be executed during recovery.")));
   10210             : 
   10211             :     /*
   10212             :      * During recovery, we don't need to check WAL level. Because, if WAL
   10213             :      * level is not sufficient, it's impossible to get here during recovery.
   10214             :      */
   10215           0 :     if (!backup_started_in_recovery && !XLogIsNeeded())
   10216           0 :         ereport(ERROR,
   10217             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10218             :                  errmsg("WAL level not sufficient for making an online backup"),
   10219             :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
   10220             :     /* Reject labels longer than MAXPGPATH bytes before any shared state is changed. */
   10221           0 :     if (strlen(backupidstr) > MAXPGPATH)
   10222           0 :         ereport(ERROR,
   10223             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
   10224             :                  errmsg("backup label too long (max %d bytes)",
   10225             :                         MAXPGPATH)));
   10226             : 
   10227             :     /*
   10228             :      * Mark backup active in shared memory.  We must do full-page WAL writes
   10229             :      * during an on-line backup even if not doing so at other times, because
   10230             :      * it's quite possible for the backup dump to obtain a "torn" (partially
   10231             :      * written) copy of a database page if it reads the page concurrently with
   10232             :      * our write to the same page.  This can be fixed as long as the first
   10233             :      * write to the page in the WAL sequence is a full-page write. Hence, we
   10234             :      * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
   10235             :      * are no dirty pages in shared memory that might get dumped while the
   10236             :      * backup is in progress without having a corresponding WAL record.  (Once
   10237             :      * the backup is complete, we need not force full-page writes anymore,
   10238             :      * since we expect that any pages not modified during the backup interval
   10239             :      * must have been correctly captured by the backup.)
   10240             :      *
   10241             :      * Note that forcePageWrites has no effect during an online backup from
   10242             :      * the standby.
   10243             :      *
   10244             :      * We must hold all the insertion locks to change the value of
   10245             :      * forcePageWrites, to ensure adequate interlocking against
   10246             :      * XLogInsertRecord().
   10247             :      */
   10248           0 :     WALInsertLockAcquireExclusive();
   10249           0 :     if (exclusive)
   10250             :     {
   10251             :         /*
   10252             :          * At first, mark that we're now starting an exclusive backup, to
   10253             :          * ensure that there are no other sessions currently running
   10254             :          * pg_start_backup() or pg_stop_backup().
   10255             :          */
   10256           0 :         if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
   10257             :         {   /* release the insertion locks before reporting the error */
   10258           0 :             WALInsertLockRelease();
   10259           0 :             ereport(ERROR,
   10260             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10261             :                      errmsg("a backup is already in progress"),
   10262             :                      errhint("Run pg_stop_backup() and try again.")));
   10263             :         }
   10264           0 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
   10265             :     }
   10266             :     else
   10267           0 :         XLogCtl->Insert.nonExclusiveBackups++;
   10268           0 :     XLogCtl->Insert.forcePageWrites = true;
   10269           0 :     WALInsertLockRelease();
   10270             : 
   10271             :     /* Ensure pg_start_backup_callback undoes the state change above if we fail below */
   10272           0 :     PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
   10273             :     {
   10274           0 :         bool        gotUniqueStartpoint = false;
   10275             :         struct dirent *de;
   10276             :         tablespaceinfo *ti;
   10277             :         int         datadirpathlen;
   10278             : 
   10279             :         /*
   10280             :          * Force an XLOG file switch before the checkpoint, to ensure that the
   10281             :          * WAL segment the checkpoint is written to doesn't contain pages with
   10282             :          * old timeline IDs.  That would otherwise happen if you called
   10283             :          * pg_start_backup() right after restoring from a PITR archive: the
   10284             :          * first WAL segment containing the startup checkpoint has pages in
   10285             :          * the beginning with the old timeline ID.  That can cause trouble at
   10286             :          * recovery: we won't have a history file covering the old timeline if
   10287             :          * pg_wal directory was not included in the base backup and the WAL
   10288             :          * archive was cleared too before starting the backup.
   10289             :          *
   10290             :          * This also ensures that we have emitted a WAL page header that has
   10291             :          * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
   10292             :          * Therefore, if a WAL archiver (such as pglesslog) is trying to
   10293             :          * compress out removable backup blocks, it won't remove any that
   10294             :          * occur after this point.
   10295             :          *
   10296             :          * During recovery, we skip forcing XLOG file switch, which means that
   10297             :          * the backup taken during recovery is not available for the special
   10298             :          * recovery case described above.
   10299             :          */
   10300           0 :         if (!backup_started_in_recovery)
   10301           0 :             RequestXLogSwitch(false);
   10302             :         /* Repeat checkpoint + fetch until the startpoint is accepted as unique (or we are in recovery). */
   10303             :         do
   10304             :         {
   10305             :             bool        checkpointfpw;
   10306             : 
   10307             :             /*
   10308             :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
   10309             :              * page problems, this guarantees that two successive backup runs
   10310             :              * will have different checkpoint positions and hence different
   10311             :              * history file names, even if nothing happened in between.
   10312             :              *
   10313             :              * During recovery, establish a restartpoint if possible. We use
   10314             :              * the last restartpoint as the backup starting checkpoint. This
   10315             :              * means that two successive backup runs can have same checkpoint
   10316             :              * positions.
   10317             :              *
   10318             :              * Since the fact that we are executing do_pg_start_backup()
   10319             :              * during recovery means that checkpointer is running, we can use
   10320             :              * RequestCheckpoint() to establish a restartpoint.
   10321             :              *
   10322             :              * We use CHECKPOINT_IMMEDIATE only if requested by user (via
   10323             :              * passing fast = true).  Otherwise this can take awhile.
   10324             :              */
   10325           0 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
   10326             :                               (fast ? CHECKPOINT_IMMEDIATE : 0));
   10327             : 
   10328             :             /*
   10329             :              * Now we need to fetch the checkpoint record location, and also
   10330             :              * its REDO pointer.  The oldest point in WAL that would be needed
   10331             :              * to restore starting from the checkpoint is precisely the REDO
   10332             :              * pointer.
   10333             :              */
   10334           0 :             LWLockAcquire(ControlFileLock, LW_SHARED);
   10335           0 :             checkpointloc = ControlFile->checkPoint;
   10336           0 :             startpoint = ControlFile->checkPointCopy.redo;
   10337           0 :             starttli = ControlFile->checkPointCopy.ThisTimeLineID;
   10338           0 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
   10339           0 :             LWLockRelease(ControlFileLock);
   10340             : 
   10341           0 :             if (backup_started_in_recovery)
   10342             :             {
   10343             :                 XLogRecPtr  recptr;
   10344             : 
   10345             :                 /*
   10346             :                  * Check to see if all WAL replayed during online backup
   10347             :                  * (i.e., since last restartpoint used as backup starting
   10348             :                  * checkpoint) contain full-page writes.
   10349             :                  */
   10350           0 :                 SpinLockAcquire(&XLogCtl->info_lck);
   10351           0 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
   10352           0 :                 SpinLockRelease(&XLogCtl->info_lck);
   10353             : 
   10354           0 :                 if (!checkpointfpw || startpoint <= recptr)
   10355           0 :                     ereport(ERROR,
   10356             :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10357             :                              errmsg("WAL generated with full_page_writes=off was replayed "
   10358             :                                     "since last restartpoint"),
   10359             :                              errhint("This means that the backup being taken on the standby "
   10360             :                                      "is corrupt and should not be used. "
   10361             :                                      "Enable full_page_writes and run CHECKPOINT on the master, "
   10362             :                                      "and then try an online backup again.")));
   10363             : 
   10364             :                 /*
   10365             :                  * During recovery, since we don't use the end-of-backup WAL
   10366             :                  * record and don't write the backup history file, the
   10367             :                  * starting WAL location doesn't need to be unique. This means
   10368             :                  * that two base backups started at the same time might use
   10369             :                  * the same checkpoint as starting locations.
   10370             :                  */
   10371           0 :                 gotUniqueStartpoint = true;
   10372             :             }
   10373             : 
   10374             :             /*
   10375             :              * If two base backups are started at the same time (in WAL sender
   10376             :              * processes), we need to make sure that they use different
   10377             :              * checkpoints as starting locations, because we use the starting
   10378             :              * WAL location as a unique identifier for the base backup in the
   10379             :              * end-of-backup WAL record and when we write the backup history
   10380             :              * file. Perhaps it would be better to generate a separate unique ID
   10381             :              * for each backup instead of forcing another checkpoint, but
   10382             :              * taking a checkpoint right after another is not that expensive
   10383             :              * either because only few buffers have been dirtied yet.
   10384             :              */
   10385           0 :             WALInsertLockAcquireExclusive();
   10386           0 :             if (XLogCtl->Insert.lastBackupStart < startpoint)
   10387             :             {
   10388           0 :                 XLogCtl->Insert.lastBackupStart = startpoint;
   10389           0 :                 gotUniqueStartpoint = true;
   10390             :             }
   10391           0 :             WALInsertLockRelease();
   10392           0 :         } while (!gotUniqueStartpoint);
   10393             :         /* xlogfilename is reported in the label file's START WAL LOCATION line below. */
   10394           0 :         XLByteToSeg(startpoint, _logSegNo);
   10395           0 :         XLogFileName(xlogfilename, starttli, _logSegNo);
   10396             : 
   10397             :         /*
   10398             :          * Construct tablespace_map file (a local buffer replaces the caller's argument when exclusive)
   10399             :          */
   10400           0 :         if (exclusive)
   10401           0 :             tblspcmapfile = makeStringInfo();
   10402             :         /* DataDir length is used below to detect tablespace links that point inside the data directory. */
   10403           0 :         datadirpathlen = strlen(DataDir);
   10404             : 
   10405             :         /* Collect information about all tablespaces */
   10406           0 :         while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
   10407             :         {
   10408             :             char        fullpath[MAXPGPATH + 10];
   10409             :             char        linkpath[MAXPGPATH];
   10410           0 :             char       *relpath = NULL;
   10411             :             int         rllen;
   10412             :             StringInfoData buflinkpath;
   10413           0 :             char       *s = linkpath;
   10414             : 
   10415             :             /* Skip special stuff */
   10416           0 :             if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
   10417           0 :                 continue;
   10418             : 
   10419           0 :             snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
   10420             : 
   10421             : #if defined(HAVE_READLINK) || defined(WIN32)
   10422           0 :             rllen = readlink(fullpath, linkpath, sizeof(linkpath));
   10423           0 :             if (rllen < 0)
   10424             :             {
   10425           0 :                 ereport(WARNING,
   10426             :                         (errmsg("could not read symbolic link \"%s\": %m",
   10427             :                                 fullpath)));
   10428           0 :                 continue;
   10429             :             }
   10430           0 :             else if (rllen >= sizeof(linkpath))
   10431             :             {
   10432           0 :                 ereport(WARNING,
   10433             :                         (errmsg("symbolic link \"%s\" target is too long",
   10434             :                                 fullpath)));
   10435           0 :                 continue;
   10436             :             }
   10437           0 :             linkpath[rllen] = '\0';
   10438             : 
   10439             :             /*
   10440             :              * Add the escape character '\\' before newline in a string to
   10441             :              * ensure that we can distinguish between the newline in the
   10442             :              * tablespace path and end of line while reading tablespace_map
   10443             :              * file during archive recovery.
   10444             :              */
   10445           0 :             initStringInfo(&buflinkpath);
   10446             : 
   10447           0 :             while (*s)
   10448             :             {
   10449           0 :                 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
   10450           0 :                     appendStringInfoChar(&buflinkpath, '\\');
   10451           0 :                 appendStringInfoChar(&buflinkpath, *s++);
   10452             :             }
   10453             :             /* Note: relpath is derived from the unescaped linkpath, while ti->path uses the escaped copy. */
   10454             : 
   10455             :             /*
   10456             :              * Relpath holds the relative path of the tablespace directory
   10457             :              * when it's located within PGDATA, or NULL if it's located
   10458             :              * elsewhere.
   10459             :              */
   10460           0 :             if (rllen > datadirpathlen &&
   10461           0 :                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
   10462           0 :                 IS_DIR_SEP(linkpath[datadirpathlen]))
   10463           0 :                 relpath = linkpath + datadirpathlen + 1;
   10464             :             /* ti->size is -1 when infotbssize is false, i.e. no size was computed. */
   10465           0 :             ti = palloc(sizeof(tablespaceinfo));
   10466           0 :             ti->oid = pstrdup(de->d_name);
   10467           0 :             ti->path = pstrdup(buflinkpath.data);
   10468           0 :             ti->rpath = relpath ? pstrdup(relpath) : NULL;
   10469           0 :             ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
   10470             :             /* 'tablespaces' may be NULL; the entry is then not added to any list. */
   10471           0 :             if (tablespaces)
   10472           0 :                 *tablespaces = lappend(*tablespaces, ti);
   10473             :             /* The map line is appended for every tablespace read, regardless of 'tablespaces'. */
   10474           0 :             appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
   10475             : 
   10476           0 :             pfree(buflinkpath.data);
   10477             : #else
   10478             : 
   10479             :             /*
   10480             :              * If the platform does not have symbolic links, it should not be
   10481             :              * possible to have tablespaces - clearly somebody else created
   10482             :              * them. Warn about it and ignore.
   10483             :              */
   10484             :             ereport(WARNING,
   10485             :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
   10486             :                      errmsg("tablespaces are not supported on this platform")));
   10487             : #endif
   10488             :         }
   10489             : 
   10490             :         /*
   10491             :          * Construct backup label file (a local buffer is used when exclusive, since labelfile is NULL)
   10492             :          */
   10493           0 :         if (exclusive)
   10494           0 :             labelfile = makeStringInfo();
   10495             : 
   10496             :         /* Use the log timezone here, not the session timezone */
   10497           0 :         stamp_time = (pg_time_t) time(NULL);
   10498           0 :         pg_strftime(strfbuf, sizeof(strfbuf),
   10499             :                     "%Y-%m-%d %H:%M:%S %Z",
   10500           0 :                     pg_localtime(&stamp_time, log_timezone));
   10501           0 :         appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
   10502           0 :                          (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
   10503           0 :         appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
   10504           0 :                          (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
   10505           0 :         appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
   10506             :                          exclusive ? "pg_start_backup" : "streamed");
   10507           0 :         appendStringInfo(labelfile, "BACKUP FROM: %s\n",
   10508             :                          backup_started_in_recovery ? "standby" : "master");
   10509           0 :         appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
   10510           0 :         appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
   10511             : 
   10512             :         /*
   10513             :          * Okay, write the file, or return its contents to caller.
   10514             :          */
   10515           0 :         if (exclusive)
   10516             :         {
   10517             :             /*
   10518             :              * Check for existing backup label --- implies a backup is already
   10519             :              * running.  (XXX given that we checked exclusiveBackupState
   10520             :              * above, maybe it would be OK to just unlink any such label
   10521             :              * file?)
   10522             :              */
   10523           0 :             if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
   10524             :             {
   10525           0 :                 if (errno != ENOENT)
   10526           0 :                     ereport(ERROR,
   10527             :                             (errcode_for_file_access(),
   10528             :                              errmsg("could not stat file \"%s\": %m",
   10529             :                                     BACKUP_LABEL_FILE)));
   10530             :             }
   10531             :             else
   10532           0 :                 ereport(ERROR,
   10533             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10534             :                          errmsg("a backup is already in progress"),
   10535             :                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
   10536             :                                  BACKUP_LABEL_FILE)));
   10537             :             /* No label file exists: create it, then write, flush and fsync it before closing. */
   10538           0 :             fp = AllocateFile(BACKUP_LABEL_FILE, "w");
   10539             : 
   10540           0 :             if (!fp)
   10541           0 :                 ereport(ERROR,
   10542             :                         (errcode_for_file_access(),
   10543             :                          errmsg("could not create file \"%s\": %m",
   10544             :                                 BACKUP_LABEL_FILE)));
   10545           0 :             if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
   10546           0 :                 fflush(fp) != 0 ||
   10547           0 :                 pg_fsync(fileno(fp)) != 0 ||
   10548           0 :                 ferror(fp) ||
   10549           0 :                 FreeFile(fp))
   10550           0 :                 ereport(ERROR,
   10551             :                         (errcode_for_file_access(),
   10552             :                          errmsg("could not write file \"%s\": %m",
   10553             :                                 BACKUP_LABEL_FILE)));
   10554             :             /* Allocated locally for exclusive backups, so free separately */
   10555           0 :             pfree(labelfile->data);
   10556           0 :             pfree(labelfile);
   10557             :             /* NOTE(review): the label file is already on disk here; an ERROR in the tablespace_map checks below appears to leave it behind (no unlink in this function or its cleanup callback) - confirm. */
   10558             :             /* Write backup tablespace_map file. */
   10559           0 :             if (tblspcmapfile->len > 0)
   10560             :             {
   10561           0 :                 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
   10562             :                 {
   10563           0 :                     if (errno != ENOENT)
   10564           0 :                         ereport(ERROR,
   10565             :                                 (errcode_for_file_access(),
   10566             :                                  errmsg("could not stat file \"%s\": %m",
   10567             :                                         TABLESPACE_MAP)));
   10568             :                 }
   10569             :                 else
   10570           0 :                     ereport(ERROR,
   10571             :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10572             :                              errmsg("a backup is already in progress"),
   10573             :                              errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
   10574             :                                      TABLESPACE_MAP)));
   10575             : 
   10576           0 :                 fp = AllocateFile(TABLESPACE_MAP, "w");
   10577             : 
   10578           0 :                 if (!fp)
   10579           0 :                     ereport(ERROR,
   10580             :                             (errcode_for_file_access(),
   10581             :                              errmsg("could not create file \"%s\": %m",
   10582             :                                     TABLESPACE_MAP)));
   10583           0 :                 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
   10584           0 :                     fflush(fp) != 0 ||
   10585           0 :                     pg_fsync(fileno(fp)) != 0 ||
   10586           0 :                     ferror(fp) ||
   10587           0 :                     FreeFile(fp))
   10588           0 :                     ereport(ERROR,
   10589             :                             (errcode_for_file_access(),
   10590             :                              errmsg("could not write file \"%s\": %m",
   10591             :                                     TABLESPACE_MAP)));
   10592             :             }
   10593             : 
   10594             :             /* Allocated locally for exclusive backups, so free separately */
   10595           0 :             pfree(tblspcmapfile->data);
   10596           0 :             pfree(tblspcmapfile);
   10597             :         }
   10598             :     }
   10599           0 :     PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
   10600             : 
   10601             :     /*
   10602             :      * Mark that start phase has correctly finished for an exclusive backup.
   10603             :      * Session-level locks are updated as well to reflect that state.
   10604             :      */
   10605           0 :     if (exclusive)
   10606             :     {
   10607           0 :         WALInsertLockAcquireExclusive();
   10608           0 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
   10609           0 :         WALInsertLockRelease();
   10610           0 :         sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
   10611             :     }
   10612             :     else
   10613           0 :         sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
   10614             : 
   10615             :     /*
   10616             :      * We're done.  As a convenience, return the starting WAL location.
   10617             :      */
   10618           0 :     if (starttli_p)
   10619           0 :         *starttli_p = starttli;
   10620           0 :     return startpoint;
   10621             : }
   10622             : 
   10623             : /* Error cleanup callback for pg_start_backup: reverts the shared backup state changed at the start of do_pg_start_backup(); arg is the 'exclusive' flag as a bool Datum, code is not used. */
   10624             : static void
   10625           0 : pg_start_backup_callback(int code, Datum arg)
   10626             : {
   10627           0 :     bool        exclusive = DatumGetBool(arg);
   10628             : 
   10629             :     /* Update backup counters and forcePageWrites on failure */
   10630           0 :     WALInsertLockAcquireExclusive();
   10631           0 :     if (exclusive)
   10632             :     {
   10633           0 :         Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
   10634           0 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
   10635             :     }
   10636             :     else
   10637             :     {
   10638           0 :         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
   10639           0 :         XLogCtl->Insert.nonExclusiveBackups--;
   10640             :     }
   10641             :     /* forcePageWrites is cleared only when no exclusive and no non-exclusive backup remains. */
   10642           0 :     if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
   10643           0 :         XLogCtl->Insert.nonExclusiveBackups == 0)
   10644             :     {
   10645           0 :         XLogCtl->Insert.forcePageWrites = false;
   10646             :     }
   10647           0 :     WALInsertLockRelease();
   10648           0 : }
   10649             : 
   10650             : /*
   10651             :  * Error cleanup callback for pg_stop_backup: moves an exclusive backup from STOPPING back to IN_PROGRESS; arg is the 'exclusive' flag as a bool Datum, code is not used.
   10652             :  */
   10653             : static void
   10654           0 : pg_stop_backup_callback(int code, Datum arg)
   10655             : {
   10656           0 :     bool        exclusive = DatumGetBool(arg);
   10657             :     /* Non-exclusive backups have nothing to revert here; the lock is taken and released either way. */
   10658             :     /* Update backup status on failure */
   10659           0 :     WALInsertLockAcquireExclusive();
   10660           0 :     if (exclusive)
   10661             :     {
   10662           0 :         Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
   10663           0 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
   10664             :     }
   10665           0 :     WALInsertLockRelease();
   10666           0 : }
   10667             : 
   10668             : /*
   10669             :  * Utility routine to fetch the session-level backup status; returns the current value of sessionBackupState.
   10670             :  */
   10671             : SessionBackupState
   10672           0 : get_backup_status(void)
   10673             : {
   10674           0 :     return sessionBackupState;
   10675             : }
   10676             : 
   10677             : /*
   10678             :  * do_pg_stop_backup: workhorse of the user-visible pg_stop_backup() function.
   10679             :  *
   10680             :  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
   10681             :  * the non-exclusive backup specified by 'labelfile'. If waitforarchive is
   10682             :  * true and WAL archiving is enabled, wait for the needed files to be archived.
   10683             :  *
   10684             :  * Returns the last WAL location that must be present to restore from this
   10685             :  * backup, and the corresponding timeline ID in *stoptli_p (if not NULL).
   10686             :  *
   10687             :  * It is the responsibility of the caller of this function to verify the
   10688             :  * permissions of the calling user!
   10689             :  */
   10690             : XLogRecPtr
   10691           0 : do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
   10692             : {
   10693           0 :     bool        exclusive = (labelfile == NULL);
   10694           0 :     bool        backup_started_in_recovery = false;
   10695             :     XLogRecPtr  startpoint;
   10696             :     XLogRecPtr  stoppoint;
   10697             :     TimeLineID  stoptli;
   10698             :     pg_time_t   stamp_time;
   10699             :     char        strfbuf[128];
   10700             :     char        histfilepath[MAXPGPATH];
   10701             :     char        startxlogfilename[MAXFNAMELEN];
   10702             :     char        stopxlogfilename[MAXFNAMELEN];
   10703             :     char        lastxlogfilename[MAXFNAMELEN];
   10704             :     char        histfilename[MAXFNAMELEN];
   10705             :     char        backupfrom[20];
   10706             :     XLogSegNo   _logSegNo;
   10707             :     FILE       *lfp;
   10708             :     FILE       *fp;
   10709             :     char        ch;
   10710             :     int         seconds_before_warning;
   10711           0 :     int         waits = 0;
   10712           0 :     bool        reported_waiting = false;
   10713             :     char       *remaining;
   10714             :     char       *ptr;
   10715             :     uint32      hi,
   10716             :                 lo;
   10717             : 
   10718           0 :     backup_started_in_recovery = RecoveryInProgress();
   10719             : 
   10720             :     /*
   10721             :      * Currently only non-exclusive backup can be taken during recovery.
   10722             :      */
   10723           0 :     if (backup_started_in_recovery && exclusive)
   10724           0 :         ereport(ERROR,
   10725             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10726             :                  errmsg("recovery is in progress"),
   10727             :                  errhint("WAL control functions cannot be executed during recovery.")));
   10728             : 
   10729             :     /*
   10730             :      * During recovery, we don't need to check WAL level. Because, if WAL
   10731             :      * level is not sufficient, it's impossible to get here during recovery.
   10732             :      */
   10733           0 :     if (!backup_started_in_recovery && !XLogIsNeeded())
   10734           0 :         ereport(ERROR,
   10735             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10736             :                  errmsg("WAL level not sufficient for making an online backup"),
   10737             :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
   10738             : 
   10739           0 :     if (exclusive)
   10740             :     {
   10741             :         /*
   10742             :          * At first, mark that we're now stopping an exclusive backup, to
   10743             :          * ensure that there are no other sessions currently running
   10744             :          * pg_start_backup() or pg_stop_backup().
   10745             :          */
   10746           0 :         WALInsertLockAcquireExclusive();
   10747           0 :         if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
   10748             :         {
   10749           0 :             WALInsertLockRelease();
   10750           0 :             ereport(ERROR,
   10751             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10752             :                      errmsg("exclusive backup not in progress")));
   10753             :         }
   10754           0 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
   10755           0 :         WALInsertLockRelease();
   10756             : 
   10757             :         /*
   10758             :          * Remove backup_label. In case of failure, the state for an exclusive
   10759             :          * backup is switched back to in-progress.
   10760             :          */
   10761           0 :         PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
   10762             :         {
   10763             :             /*
   10764             :              * Read the existing label file into memory.
   10765             :              */
   10766             :             struct stat statbuf;
   10767             :             int         r;
   10768             : 
   10769           0 :             if (stat(BACKUP_LABEL_FILE, &statbuf))
   10770             :             {
   10771             :                 /* should not happen per the upper checks */
   10772           0 :                 if (errno != ENOENT)
   10773           0 :                     ereport(ERROR,
   10774             :                             (errcode_for_file_access(),
   10775             :                              errmsg("could not stat file \"%s\": %m",
   10776             :                                     BACKUP_LABEL_FILE)));
   10777           0 :                 ereport(ERROR,
   10778             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10779             :                          errmsg("a backup is not in progress")));
   10780             :             }
   10781             : 
   10782           0 :             lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
   10783           0 :             if (!lfp)
   10784             :             {
   10785           0 :                 ereport(ERROR,
   10786             :                         (errcode_for_file_access(),
   10787             :                          errmsg("could not read file \"%s\": %m",
   10788             :                                 BACKUP_LABEL_FILE)));
   10789             :             }
   10790           0 :             labelfile = palloc(statbuf.st_size + 1);
   10791           0 :             r = fread(labelfile, statbuf.st_size, 1, lfp);
   10792           0 :             labelfile[statbuf.st_size] = '\0';
   10793             : 
   10794             :             /*
   10795             :              * Close and remove the backup label file
   10796             :              */
   10797           0 :             if (r != 1 || ferror(lfp) || FreeFile(lfp))
   10798           0 :                 ereport(ERROR,
   10799             :                         (errcode_for_file_access(),
   10800             :                          errmsg("could not read file \"%s\": %m",
   10801             :                                 BACKUP_LABEL_FILE)));
   10802           0 :             durable_unlink(BACKUP_LABEL_FILE, ERROR);
   10803             : 
   10804             :             /*
   10805             :              * Remove tablespace_map file if present, it is created only if
   10806             :              * there are tablespaces.
   10807             :              */
   10808           0 :             durable_unlink(TABLESPACE_MAP, DEBUG1);
   10809             :         }
   10810           0 :         PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
   10811             :     }
   10812             : 
   10813             :     /*
   10814             :      * OK to update backup counters and forcePageWrites
   10815             :      */
   10816           0 :     WALInsertLockAcquireExclusive();
   10817           0 :     if (exclusive)
   10818             :     {
   10819           0 :         XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
   10820             :     }
   10821             :     else
   10822             :     {
   10823             :         /*
   10824             :          * The user-visible pg_start/stop_backup() functions that operate on
   10825             :          * exclusive backups can be called at any time, but for non-exclusive
   10826             :          * backups, it is expected that each do_pg_start_backup() call is
   10827             :          * matched by exactly one do_pg_stop_backup() call.
   10828             :          */
   10829           0 :         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
   10830           0 :         XLogCtl->Insert.nonExclusiveBackups--;
   10831             :     }
   10832             : 
   10833           0 :     if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
   10834           0 :         XLogCtl->Insert.nonExclusiveBackups == 0)
   10835             :     {
   10836           0 :         XLogCtl->Insert.forcePageWrites = false;
   10837             :     }
   10838           0 :     WALInsertLockRelease();
   10839             : 
   10840             :     /* Reset the session-level backup state */
   10841           0 :     sessionBackupState = SESSION_BACKUP_NONE;
   10842             : 
   10843             :     /*
   10844             :      * Read and parse the START WAL LOCATION line (this code is pretty crude,
   10845             :      * but we are not expecting any variability in the file format).
   10846             :      */
   10847           0 :     if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
   10848             :                &hi, &lo, startxlogfilename,
   10849           0 :                &ch) != 4 || ch != '\n')
   10850           0 :         ereport(ERROR,
   10851             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10852             :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
   10853           0 :     startpoint = ((uint64) hi) << 32 | lo;
   10854           0 :     remaining = strchr(labelfile, '\n') + 1;    /* %n is not portable enough */
   10855             : 
   10856             :     /*
   10857             :      * Parse the BACKUP FROM line. If we are taking an online backup from the
   10858             :      * standby, we confirm that the standby has not been promoted during the
   10859             :      * backup.
   10860             :      */
   10861           0 :     ptr = strstr(remaining, "BACKUP FROM:");
   10862           0 :     if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
   10863           0 :         ereport(ERROR,
   10864             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10865             :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
   10866           0 :     if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
   10867           0 :         ereport(ERROR,
   10868             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10869             :                  errmsg("the standby was promoted during online backup"),
   10870             :                  errhint("This means that the backup being taken is corrupt "
   10871             :                          "and should not be used. "
   10872             :                          "Try taking another online backup.")));
   10873             : 
   10874             :     /*
   10875             :      * During recovery, we don't write an end-of-backup record. We assume that
   10876             :      * pg_control was backed up last and its minimum recovery point can be
   10877             :      * available as the backup end location. Since we don't have an
   10878             :      * end-of-backup record, we use the pg_control value to check whether
   10879             :      * we've reached the end of backup when starting recovery from this
   10880             :      * backup. We have no way of checking if pg_control wasn't backed up last
   10881             :      * however.
   10882             :      *
   10883             :      * We don't force a switch to new WAL file but it is still possible to
   10884             :      * wait for all the required files to be archived if waitforarchive is
   10885             :      * true. This is okay if we use the backup to start a standby and fetch
   10886             :      * the missing WAL using streaming replication. But in the case of an
   10887             :      * archive recovery, a user should set waitforarchive to true and wait for
   10888             :      * them to be archived to ensure that all the required files are
   10889             :      * available.
   10890             :      *
   10891             :      * We return the current minimum recovery point as the backup end
   10892             :      * location. Note that it can be greater than the exact backup end
   10893             :      * location if the minimum recovery point is updated after the backup of
   10894             :      * pg_control. This is harmless for current uses.
   10895             :      *
   10896             :      * XXX currently a backup history file is for informational and debug
   10897             :      * purposes only. It's not essential for an online backup. Furthermore,
   10898             :      * even if it's created, it will not be archived during recovery because
   10899             :      * an archiver is not invoked. So it doesn't seem worthwhile to write a
   10900             :      * backup history file during recovery.
   10901             :      */
   10902           0 :     if (backup_started_in_recovery)
   10903             :     {
   10904             :         XLogRecPtr  recptr;
   10905             : 
   10906             :         /*
   10907             :          * Check to see if all WAL replayed during online backup contain
   10908             :          * full-page writes.
   10909             :          */
   10910           0 :         SpinLockAcquire(&XLogCtl->info_lck);
   10911           0 :         recptr = XLogCtl->lastFpwDisableRecPtr;
   10912           0 :         SpinLockRelease(&XLogCtl->info_lck);
   10913             : 
   10914           0 :         if (startpoint <= recptr)
   10915           0 :             ereport(ERROR,
   10916             :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   10917             :                      errmsg("WAL generated with full_page_writes=off was replayed "
   10918             :                             "during online backup"),
   10919             :                      errhint("This means that the backup being taken on the standby "
   10920             :                              "is corrupt and should not be used. "
   10921             :                              "Enable full_page_writes and run CHECKPOINT on the master, "
   10922             :                              "and then try an online backup again.")));
   10923             : 
   10924             : 
   10925           0 :         LWLockAcquire(ControlFileLock, LW_SHARED);
   10926           0 :         stoppoint = ControlFile->minRecoveryPoint;
   10927           0 :         stoptli = ControlFile->minRecoveryPointTLI;
   10928           0 :         LWLockRelease(ControlFileLock);
   10929             :     }
   10930             :     else
   10931             :     {
   10932             :         /*
   10933             :          * Write the backup-end xlog record
   10934             :          */
   10935           0 :         XLogBeginInsert();
   10936           0 :         XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
   10937           0 :         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
   10938           0 :         stoptli = ThisTimeLineID;
   10939             : 
   10940             :         /*
   10941             :          * Force a switch to a new xlog segment file, so that the backup is
   10942             :          * valid as soon as archiver moves out the current segment file.
   10943             :          */
   10944           0 :         RequestXLogSwitch(false);
   10945             : 
   10946           0 :         XLByteToPrevSeg(stoppoint, _logSegNo);
   10947           0 :         XLogFileName(stopxlogfilename, stoptli, _logSegNo);
   10948             : 
   10949             :         /* Use the log timezone here, not the session timezone */
   10950           0 :         stamp_time = (pg_time_t) time(NULL);
   10951           0 :         pg_strftime(strfbuf, sizeof(strfbuf),
   10952             :                     "%Y-%m-%d %H:%M:%S %Z",
   10953           0 :                     pg_localtime(&stamp_time, log_timezone));
   10954             : 
   10955             :         /*
   10956             :          * Write the backup history file
   10957             :          */
   10958           0 :         XLByteToSeg(startpoint, _logSegNo);
   10959           0 :         BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
   10960             :                               (uint32) (startpoint % XLogSegSize));
   10961           0 :         fp = AllocateFile(histfilepath, "w");
   10962           0 :         if (!fp)
   10963           0 :             ereport(ERROR,
   10964             :                     (errcode_for_file_access(),
   10965             :                      errmsg("could not create file \"%s\": %m",
   10966             :                             histfilepath)));
   10967           0 :         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
   10968           0 :                 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
   10969           0 :         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
   10970           0 :                 (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
   10971             :         /* transfer remaining lines from label to history file */
   10972           0 :         fprintf(fp, "%s", remaining);
   10973           0 :         fprintf(fp, "STOP TIME: %s\n", strfbuf);
   10974           0 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
   10975           0 :             ereport(ERROR,
   10976             :                     (errcode_for_file_access(),
   10977             :                      errmsg("could not write file \"%s\": %m",
   10978             :                             histfilepath)));
   10979             : 
   10980             :         /*
   10981             :          * Clean out any no-longer-needed history files.  As a side effect,
   10982             :          * this will post a .ready file for the newly created history file,
   10983             :          * notifying the archiver that history file may be archived
   10984             :          * immediately.
   10985             :          */
   10986           0 :         CleanupBackupHistory();
   10987             :     }
   10988             : 
   10989             :     /*
   10990             :      * If archiving is enabled, wait for all the required WAL files to be
   10991             :      * archived before returning. If archiving isn't enabled, the required WAL
   10992             :      * needs to be transported via streaming replication (hopefully with
   10993             :      * wal_keep_segments set high enough), or some more exotic mechanism like
   10994             :      * polling and copying files from pg_wal with a script. We have no knowledge
   10995             :      * of those mechanisms, so it's up to the user to ensure that they get all
   10996             :      * the required WAL.
   10997             :      *
   10998             :      * We wait until both the last WAL file filled during backup and the
   10999             :      * history file have been archived, and assume that the alphabetic sorting
   11000             :      * property of the WAL files ensures any earlier WAL files are safely
   11001             :      * archived as well.
   11002             :      *
   11003             :      * We wait forever, since archive_command is supposed to work and we
   11004             :      * assume the admin wanted the backup to work completely. If you don't
   11005             :      * wish to wait, then either waitforarchive should be passed in as false,
   11006             :      * or you can set statement_timeout.  Also, some notices are issued to
   11007             :      * clue in anyone who might be doing this interactively.
   11008             :      */
   11009             : 
   11010           0 :     if (waitforarchive &&
   11011           0 :         ((!backup_started_in_recovery && XLogArchivingActive()) ||
   11012           0 :          (backup_started_in_recovery && XLogArchivingAlways())))
   11013             :     {
   11014           0 :         XLByteToPrevSeg(stoppoint, _logSegNo);
   11015           0 :         XLogFileName(lastxlogfilename, stoptli, _logSegNo);
   11016             : 
   11017           0 :         XLByteToSeg(startpoint, _logSegNo);
   11018           0 :         BackupHistoryFileName(histfilename, stoptli, _logSegNo,
   11019             :                               (uint32) (startpoint % XLogSegSize));
   11020             : 
   11021           0 :         seconds_before_warning = 60;
   11022           0 :         waits = 0;
   11023             : 
   11024           0 :         while (XLogArchiveIsBusy(lastxlogfilename) ||
   11025           0 :                XLogArchiveIsBusy(histfilename))
   11026             :         {
   11027           0 :             CHECK_FOR_INTERRUPTS();
   11028             : 
   11029           0 :             if (!reported_waiting && waits > 5)
   11030             :             {
   11031           0 :                 ereport(NOTICE,
   11032             :                         (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
   11033           0 :                 reported_waiting = true;
   11034             :             }
   11035             : 
   11036           0 :             pg_usleep(1000000L);
   11037             : 
   11038           0 :             if (++waits >= seconds_before_warning)
   11039             :             {
   11040           0 :                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
   11041           0 :                 ereport(WARNING,
   11042             :                         (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
   11043             :                                 waits),
   11044             :                          errhint("Check that your archive_command is executing properly.  "
   11045             :                                  "pg_stop_backup can be canceled safely, "
   11046             :                                  "but the database backup will not be usable without all the WAL segments.")));
   11047             :             }
   11048             :         }
   11049             : 
   11050           0 :         ereport(NOTICE,
   11051             :                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
   11052             :     }
   11053           0 :     else if (waitforarchive)
   11054           0 :         ereport(NOTICE,
   11055             :                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
   11056             : 
   11057             :     /*
   11058             :      * We're done.  As a convenience, return the ending WAL location.
   11059             :      */
   11060           0 :     if (stoptli_p)
   11061           0 :         *stoptli_p = stoptli;
   11062           0 :     return stoppoint;
   11063             : }
   11064             : 
   11065             : 
   11066             : /*
   11067             :  * do_pg_abort_backup: abort a running backup
   11068             :  *
   11069             :  * This does just the most basic steps of do_pg_stop_backup(), by taking the
   11070             :  * system out of backup mode, thus making it much safer to call from
   11071             :  * an error handler.
   11072             :  *
   11073             :  * NB: This is only for aborting a non-exclusive backup that doesn't write
   11074             :  * backup_label. A backup started with pg_start_backup() needs to be finished
   11075             :  * with pg_stop_backup().
   11076             :  */
   11077             : void
   11078           0 : do_pg_abort_backup(void)
   11079             : {
   11080           0 :     WALInsertLockAcquireExclusive();
   11081           0 :     Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
   11082           0 :     XLogCtl->Insert.nonExclusiveBackups--;
   11083             : 
   11084           0 :     if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
   11085           0 :         XLogCtl->Insert.nonExclusiveBackups == 0)
   11086             :     {
   11087           0 :         XLogCtl->Insert.forcePageWrites = false;
   11088             :     }
   11089           0 :     WALInsertLockRelease();
   11090           0 : }
   11091             : 
   11092             : /*
   11093             :  * Get latest redo apply position; its timeline goes to *replayTLI if not NULL.
   11094             :  *
   11095             :  * Exported to allow WALReceiver to read the pointer directly.
   11096             :  */
   11097             : XLogRecPtr
   11098           0 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
   11099             : {
   11100             :     XLogRecPtr  recptr;
   11101             :     TimeLineID  tli;
   11102             : 
   11103           0 :     SpinLockAcquire(&XLogCtl->info_lck);
   11104           0 :     recptr = XLogCtl->lastReplayedEndRecPtr;
   11105           0 :     tli = XLogCtl->lastReplayedTLI;
   11106           0 :     SpinLockRelease(&XLogCtl->info_lck);
   11107             : 
   11108           0 :     if (replayTLI)
   11109           0 :         *replayTLI = tli;
   11110           0 :     return recptr;
   11111             : }
   11112             : 
   11113             : /*
   11114             :  * Get latest WAL insert pointer, converted from the shared insert byte position
   11115             :  */
   11116             : XLogRecPtr
   11117           0 : GetXLogInsertRecPtr(void)
   11118             : {
   11119           0 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
   11120             :     uint64      current_bytepos;
   11121             : 
   11122           0 :     SpinLockAcquire(&Insert->insertpos_lck);
   11123           0 :     current_bytepos = Insert->CurrBytePos;
   11124           0 :     SpinLockRelease(&Insert->insertpos_lck);
   11125             : 
   11126           0 :     return XLogBytePosToRecPtr(current_bytepos);
   11127             : }
   11128             : 
   11129             : /*
   11130             :  * Get latest WAL write pointer, refreshing LogwrtResult from XLogCtl first
   11131             :  */
   11132             : XLogRecPtr
   11133           0 : GetXLogWriteRecPtr(void)
   11134             : {
   11135           0 :     SpinLockAcquire(&XLogCtl->info_lck);
   11136           0 :     LogwrtResult = XLogCtl->LogwrtResult;
   11137           0 :     SpinLockRelease(&XLogCtl->info_lck);
   11138             : 
   11139           0 :     return LogwrtResult.Write;
   11140             : }
   11141             : 
   11142             : /*
   11143             :  * Returns, in *oldrecptr and *oldtli, the redo pointer and timeline of the last
   11144             :  * checkpoint or restartpoint: the oldest WAL needed if recovery must restart.
   11145             :  */
   11146             : void
   11147           0 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
   11148             : {
   11149           0 :     LWLockAcquire(ControlFileLock, LW_SHARED);
   11150           0 :     *oldrecptr = ControlFile->checkPointCopy.redo;
   11151           0 :     *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
   11152           0 :     LWLockRelease(ControlFileLock);
   11153           0 : }
   11154             : 
   11155             : /*
   11156             :  * read_backup_label: check to see if a backup_label file is present
   11157             :  *
   11158             :  * If we see a backup_label during recovery, we assume that we are recovering
   11159             :  * from a backup dump file, and we therefore roll forward from the checkpoint
   11160             :  * identified by the label file, NOT what pg_control says.  This avoids the
   11161             :  * problem that pg_control might have been archived one or more checkpoints
   11162             :  * later than the start of the dump, and so if we rely on it as the start
   11163             :  * point, we will fail to restore a consistent database state.
   11164             :  *
   11165             :  * Returns true if a backup_label was found (and fills the checkpoint
   11166             :  * location and its REDO location into *checkPointLoc and RedoStartLSN,
   11167             :  * respectively); returns false if not. If this backup_label came from a
   11168             :  * streamed backup, *backupEndRequired is set to true. If this backup_label
   11169             :  * was created during recovery, *backupFromStandby is set to true.
   11170             :  */
   11171             : static bool
   11172           3 : read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
   11173             :                   bool *backupFromStandby)
   11174             : {
   11175             :     char        startxlogfilename[MAXFNAMELEN];
   11176             :     TimeLineID  tli;
   11177             :     FILE       *lfp;
   11178             :     char        ch;
   11179             :     char        backuptype[20];
   11180             :     char        backupfrom[20];
   11181             :     uint32      hi,
   11182             :                 lo;
   11183             : 
   11184           3 :     *backupEndRequired = false;
   11185           3 :     *backupFromStandby = false;
   11186             : 
   11187             :     /*
   11188             :      * See if label file is present
   11189             :      */
   11190           3 :     lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
   11191           3 :     if (!lfp)
   11192             :     {
   11193           3 :         if (errno != ENOENT)
   11194           0 :             ereport(FATAL,
   11195             :                     (errcode_for_file_access(),
   11196             :                      errmsg("could not read file \"%s\": %m",
   11197             :                             BACKUP_LABEL_FILE)));
   11198           3 :         return false;           /* it's not there, all is fine */
   11199             :     }
   11200             : 
   11201             :     /*
   11202             :      * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
   11203             :      * is pretty crude, but we are not expecting any variability in the file
   11204             :      * format). The parsed timeline ID and file name are not used further.
   11205             :      */
   11206           0 :     if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
   11207           0 :                &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
   11208           0 :         ereport(FATAL,
   11209             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   11210             :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
   11211           0 :     RedoStartLSN = ((uint64) hi) << 32 | lo;
   11212           0 :     if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
   11213           0 :                &hi, &lo, &ch) != 3 || ch != '\n')
   11214           0 :         ereport(FATAL,
   11215             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   11216             :                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
   11217           0 :     *checkPointLoc = ((uint64) hi) << 32 | lo;
   11218             : 
   11219             :     /*
   11220             :      * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
   11221             :      * from an older backup anyway, but since the information on it is not
   11222             :      * strictly required, don't error out if it's missing for some reason.
   11223             :      */
   11224           0 :     if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
   11225             :     {
   11226           0 :         if (strcmp(backuptype, "streamed") == 0)
   11227           0 :             *backupEndRequired = true;
   11228             :     }
   11229             : 
   11230           0 :     if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
   11231             :     {
   11232           0 :         if (strcmp(backupfrom, "standby") == 0)
   11233           0 :             *backupFromStandby = true;
   11234             :     }
   11235             : 
   11236           0 :     if (ferror(lfp) || FreeFile(lfp))
   11237           0 :         ereport(FATAL,
   11238             :                 (errcode_for_file_access(),
   11239             :                  errmsg("could not read file \"%s\": %m",
   11240             :                         BACKUP_LABEL_FILE)));
   11241             : 
   11242           0 :     return true;
   11243             : }
   11244             : 
   11245             : /*
   11246             :  * read_tablespace_map: check to see if a tablespace_map file is present
   11247             :  *
   11248             :  * If we see a tablespace_map file during recovery, we assume that we are
   11249             :  * recovering from a backup dump file, and we therefore need to create symlinks
   11250             :  * as per the information present in tablespace_map file.
   11251             :  *
   11252             :  * Returns TRUE if a tablespace_map file was found (and fills the link
   11253             :  * information for all the tablespace links present in file); returns FALSE
   11254             :  * if not.
   11255             :  */
   11256             : static bool
   11257           0 : read_tablespace_map(List **tablespaces)
   11258             : {
   11259             :     tablespaceinfo *ti;
   11260             :     FILE       *lfp;
   11261             :     char        tbsoid[MAXPGPATH];
   11262             :     char       *tbslinkpath;
   11263             :     char        str[MAXPGPATH];
   11264             :     int         ch,
   11265           0 :                 prev_ch = -1,
   11266           0 :                 i = 0,
   11267             :                 n;
   11268             : 
   11269             :     /*
   11270             :      * See if tablespace_map file is present
   11271             :      */
   11272           0 :     lfp = AllocateFile(TABLESPACE_MAP, "r");
   11273           0 :     if (!lfp)
   11274             :     {
   11275           0 :         if (errno != ENOENT)
   11276           0 :             ereport(FATAL,
   11277             :                     (errcode_for_file_access(),
   11278             :                      errmsg("could not read file \"%s\": %m",
   11279             :                             TABLESPACE_MAP)));
   11280           0 :         return false;           /* it's not there, all is fine */
   11281             :     }
   11282             : 
   11283             :     /*
   11284             :      * Read and parse the link name and path lines from tablespace_map file
   11285             :      * (this code is pretty crude, but we are not expecting any variability in
   11286             :      * the file format).  While taking backup we embed escape character '\\'
   11287             :      * before newline in tablespace path, so that during reading of
   11288             :      * tablespace_map file, we could distinguish newline in tablespace path
   11289             :      * and end of line.  Now while reading tablespace_map file, remove the
   11290             :      * escape character that has been added in tablespace path during backup.
   11291             :      */
   11292           0 :     while ((ch = fgetc(lfp)) != EOF)
   11293             :     {
   11294           0 :         if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
   11295             :         {
   11296           0 :             str[i] = '\0';
   11297           0 :             if (sscanf(str, "%s %n", tbsoid, &n) != 1)
   11298           0 :                 ereport(FATAL,
   11299             :                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
   11300             :                          errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
   11301           0 :             tbslinkpath = str + n;
   11302           0 :             i = 0;
   11303             : 
   11304           0 :             ti = palloc(sizeof(tablespaceinfo));
   11305           0 :             ti->oid = pstrdup(tbsoid);
   11306           0 :             ti->path = pstrdup(tbslinkpath);
   11307             : 
   11308           0 :             *tablespaces = lappend(*tablespaces, ti);
   11309           0 :             continue;
   11310             :         }
   11311           0 :         else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
   11312           0 :             str[i - 1] = ch;
   11313             :         else
   11314           0 :             str[i++] = ch;
   11315           0 :         prev_ch = ch;
   11316             :     }
   11317             : 
   11318           0 :     if (ferror(lfp) || FreeFile(lfp))
   11319           0 :         ereport(FATAL,
   11320             :                 (errcode_for_file_access(),
   11321             :                  errmsg("could not read file \"%s\": %m",
   11322             :                         TABLESPACE_MAP)));
   11323             : 
   11324           0 :     return true;
   11325             : }
   11326             : 
   11327             : /*
   11328             :  * Error context callback for errors occurring during rm_redo().
   11329             :  */
   11330             : static void
   11331           0 : rm_redo_error_callback(void *arg)
   11332             : {
   11333           0 :     XLogReaderState *record = (XLogReaderState *) arg;
   11334             :     StringInfoData buf;
   11335             : 
   11336           0 :     initStringInfo(&buf);
   11337           0 :     xlog_outdesc(&buf, record);
   11338             : 
   11339             :     /* translator: %s is a WAL record description */
   11340           0 :     errcontext("WAL redo at %X/%X for %s",
   11341           0 :                (uint32) (record->ReadRecPtr >> 32),
   11342           0 :                (uint32) record->ReadRecPtr,
   11343             :                buf.data);
   11344             : 
   11345           0 :     pfree(buf.data);
   11346           0 : }
   11347             : 
   11348             : /*
   11349             :  * BackupInProgress: check if online backup mode is active
   11350             :  *
   11351             :  * This is done by checking for existence of the "backup_label" file.
   11352             :  */
   11353             : bool
   11354           0 : BackupInProgress(void)
   11355             : {
   11356             :     struct stat stat_buf;
   11357             : 
   11358           0 :     return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
   11359             : }
   11360             : 
   11361             : /*
   11362             :  * CancelBackup: rename the "backup_label" and "tablespace_map"
   11363             :  *               files to cancel backup mode
   11364             :  *
   11365             :  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
   11366             :  * Similarly, if the "tablespace_map" file exists, it will be renamed to
   11367             :  * "tablespace_map.old".
   11368             :  *
   11369             :  * Note that this will render an online backup in progress
   11370             :  * useless. To correctly finish an online backup, pg_stop_backup must be
   11371             :  * called.
   11372             :  */
   11373             : void
   11374           1 : CancelBackup(void)
   11375             : {
   11376             :     struct stat stat_buf;
   11377             : 
   11378             :     /* if the backup_label file is not there, return */
   11379           1 :     if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
   11380           2 :         return;
   11381             : 
   11382             :     /* remove leftover file from previously canceled backup if it exists */
   11383           0 :     unlink(BACKUP_LABEL_OLD);
   11384             : 
   11385           0 :     if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
   11386             :     {
   11387           0 :         ereport(WARNING,
   11388             :                 (errcode_for_file_access(),
   11389             :                  errmsg("online backup mode was not canceled"),
   11390             :                  errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
   11391             :                            BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
   11392           0 :         return;
   11393             :     }
   11394             : 
   11395             :     /* if the tablespace_map file is not there, return */
   11396           0 :     if (stat(TABLESPACE_MAP, &stat_buf) < 0)
   11397             :     {
   11398           0 :         ereport(LOG,
   11399             :                 (errmsg("online backup mode canceled"),
   11400             :                  errdetail("File \"%s\" was renamed to \"%s\".",
   11401             :                            BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
   11402           0 :         return;
   11403             :     }
   11404             : 
   11405             :     /* remove leftover file from previously canceled backup if it exists */
   11406           0 :     unlink(TABLESPACE_MAP_OLD);
   11407             : 
   11408           0 :     if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
   11409             :     {
   11410           0 :         ereport(LOG,
   11411             :                 (errmsg("online backup mode canceled"),
   11412             :                  errdetail("Files \"%s\" and \"%s\" were renamed to "
   11413             :                            "\"%s\" and \"%s\", respectively.",
   11414             :                            BACKUP_LABEL_FILE, TABLESPACE_MAP,
   11415             :                            BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
   11416             :     }
   11417             :     else
   11418             :     {
   11419           0 :         ereport(WARNING,
   11420             :                 (errcode_for_file_access(),
   11421             :                  errmsg("online backup mode canceled"),
   11422             :                  errdetail("File \"%s\" was renamed to \"%s\", but "
   11423             :                            "file \"%s\" could not be renamed to \"%s\": %m.",
   11424             :                            BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
   11425             :                            TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
   11426             :     }
   11427             : }
   11428             : 
   11429             : /*
   11430             :  * Read the XLOG page containing RecPtr into readBuf (if not read already).
   11431             :  * Returns number of bytes read, if the page is read successfully, or -1
   11432             :  * in case of errors.  When errors occur, they are ereport'ed, but only
   11433             :  * if they have not been previously reported.
   11434             :  *
   11435             :  * This is responsible for restoring files from archive as needed, as well
   11436             :  * as for waiting for the requested WAL record to arrive in standby mode.
   11437             :  *
   11438             :  * 'emode' specifies the log level used for reporting "file not found" or
   11439             :  * "end of WAL" situations in archive recovery, or in standby mode when a
   11440             :  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
   11441             :  * false in those situations, on higher log levels the ereport() won't
   11442             :  * return.
   11443             :  *
   11444             :  * In standby mode, if after a successful return of XLogPageRead() the
   11445             :  * caller finds the record it's interested in to be broken, it should
   11446             :  * ereport the error with the level determined by
   11447             :  * emode_for_corrupt_record(), and then set lastSourceFailed
   11448             :  * and call XLogPageRead() again with the same arguments. This lets
   11449             :  * XLogPageRead() to try fetching the record from another source, or to
   11450             :  * sleep and retry.
   11451             :  */
   11452             : static int
   11453           5 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
   11454             :              XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
   11455             : {
   11456           5 :     XLogPageReadPrivate *private =
   11457             :     (XLogPageReadPrivate *) xlogreader->private_data;
   11458           5 :     int         emode = private->emode;
   11459             :     uint32      targetPageOff;
   11460             :     XLogSegNo   targetSegNo PG_USED_FOR_ASSERTS_ONLY;
   11461             : 
   11462           5 :     XLByteToSeg(targetPagePtr, targetSegNo);
   11463           5 :     targetPageOff = targetPagePtr % XLogSegSize;
   11464             : 
   11465             :     /*
   11466             :      * See if we need to switch to a new segment because the requested record
   11467             :      * is not in the currently open one.
   11468             :      */
   11469           5 :     if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
   11470             :     {
   11471             :         /*
   11472             :          * Request a restartpoint if we've replayed too much xlog since the
   11473             :          * last one.
   11474             :          */
   11475           0 :         if (bgwriterLaunched)
   11476             :         {
   11477           0 :             if (XLogCheckpointNeeded(readSegNo))
   11478             :             {
   11479           0 :                 (void) GetRedoRecPtr();
   11480           0 :                 if (XLogCheckpointNeeded(readSegNo))
   11481           0 :                     RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
   11482             :             }
   11483             :         }
   11484             : 
   11485           0 :         close(readFile);
   11486           0 :         readFile = -1;
   11487           0 :         readSource = 0;
   11488             :     }
   11489             : 
   11490           5 :     XLByteToSeg(targetPagePtr, readSegNo);
   11491             : 
   11492             : retry:
   11493             :     /* See if we need to retrieve more data */
   11494           7 :     if (readFile < 0 ||
   11495           2 :         (readSource == XLOG_FROM_STREAM &&
   11496           0 :          receivedUpto < targetPagePtr + reqLen))
   11497             :     {
   11498           6 :         if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
   11499           3 :                                          private->randAccess,
   11500           3 :                                          private->fetching_ckpt,
   11501             :                                          targetRecPtr))
   11502             :         {
   11503           0 :             if (readFile >= 0)
   11504           0 :                 close(readFile);
   11505           0 :             readFile = -1;
   11506           0 :             readLen = 0;
   11507           0 :             readSource = 0;
   11508             : 
   11509           0 :             return -1;
   11510             :         }
   11511             :     }
   11512             : 
   11513             :     /*
   11514             :      * At this point, we have the right segment open and if we're streaming we
   11515             :      * know the requested record is in it.
   11516             :      */
   11517           5 :     Assert(readFile != -1);
   11518             : 
   11519             :     /*
   11520             :      * If the current segment is being streamed from master, calculate how
   11521             :      * much of the current page we have received already. We know the
   11522             :      * requested record has been received, but this is for the benefit of
   11523             :      * future calls, to allow quick exit at the top of this function.
   11524             :      */
   11525           5 :     if (readSource == XLOG_FROM_STREAM)
   11526             :     {
   11527           0 :         if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
   11528           0 :             readLen = XLOG_BLCKSZ;
   11529             :         else
   11530           0 :             readLen = receivedUpto % XLogSegSize - targetPageOff;
   11531             :     }
   11532             :     else
   11533           5 :         readLen = XLOG_BLCKSZ;
   11534             : 
   11535             :     /* Read the requested page */
   11536           5 :     readOff = targetPageOff;
   11537           5 :     if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
   11538             :     {
   11539             :         char        fname[MAXFNAMELEN];
   11540             : 
   11541           0 :         XLogFileName(fname, curFileTLI, readSegNo);
   11542           0 :         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
   11543             :                 (errcode_for_file_access(),
   11544             :                  errmsg("could not seek in log segment %s to offset %u: %m",
   11545             :                         fname, readOff)));
   11546           0 :         goto next_record_is_invalid;
   11547             :     }
   11548             : 
   11549           5 :     pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
   11550           5 :     if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
   11551             :     {
   11552             :         char        fname[MAXFNAMELEN];
   11553             : 
   11554           0 :         pgstat_report_wait_end();
   11555           0 :         XLogFileName(fname, curFileTLI, readSegNo);
   11556           0 :         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
   11557             :                 (errcode_for_file_access(),
   11558             :                  errmsg("could not read from log segment %s, offset %u: %m",
   11559             :                         fname, readOff)));
   11560           0 :         goto next_record_is_invalid;
   11561             :     }
   11562           5 :     pgstat_report_wait_end();
   11563             : 
   11564           5 :     Assert(targetSegNo == readSegNo);
   11565           5 :     Assert(targetPageOff == readOff);
   11566           5 :     Assert(reqLen <= readLen);
   11567             : 
   11568           5 :     *readTLI = curFileTLI;
   11569           5 :     return readLen;
   11570             : 
   11571             : next_record_is_invalid:
   11572           0 :     lastSourceFailed = true;
   11573             : 
   11574           0 :     if (readFile >= 0)
   11575           0 :         close(readFile);
   11576           0 :     readFile = -1;
   11577           0 :     readLen = 0;
   11578           0 :     readSource = 0;
   11579             : 
   11580             :     /* In standby-mode, keep trying */
   11581           0 :     if (StandbyMode)
   11582           0 :         goto retry;
   11583             :     else
   11584           0 :         return -1;
   11585             : }
   11586             : 
   11587             : /*
   11588             :  * Open the WAL segment containing WAL location 'RecPtr'.
   11589             :  *
   11590             :  * The segment can be fetched via restore_command, or via walreceiver having
   11591             :  * streamed the record, or it can already be present in pg_wal. Checking
   11592             :  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
   11593             :  * too, in case someone copies a new segment directly to pg_wal. That is not
   11594             :  * documented or recommended, though.
   11595             :  *
   11596             :  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
   11597             :  * prepare to read WAL starting from RedoStartLSN after this.
   11598             :  *
   11599             :  * 'RecPtr' might not point to the beginning of the record we're interested
   11600             :  * in, it might also point to the page or segment header. In that case,
   11601             :  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
   11602             :  * used to decide which timeline to stream the requested WAL from.
   11603             :  *
   11604             :  * If the record is not immediately available, the function returns false
   11605             :  * if we're not in standby mode. In standby mode, waits for it to become
   11606             :  * available.
   11607             :  *
   11608             :  * When the requested record becomes available, the function opens the file
   11609             :  * containing it (if not open already), and returns true. When end of standby
   11610             :  * mode is triggered by the user, and there is no more WAL available, returns
   11611             :  * false.
   11612             :  */
   11613             : static bool
   11614           3 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
   11615             :                             bool fetching_ckpt, XLogRecPtr tliRecPtr)
   11616             : {
   11617             :     static TimestampTz last_fail_time = 0;
   11618             :     TimestampTz now;
   11619           3 :     bool        streaming_reply_sent = false;
   11620             : 
   11621             :     /*-------
   11622             :      * Standby mode is implemented by a state machine:
   11623             :      *
   11624             :      * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
   11625             :      *    pg_wal (XLOG_FROM_PG_WAL)
   11626             :      * 2. Check trigger file
   11627             :      * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
   11628             :      * 4. Rescan timelines
   11629             :      * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
   11630             :      *
   11631             :      * Failure to read from the current source advances the state machine to
   11632             :      * the next state.
   11633             :      *
   11634             :      * 'currentSource' indicates the current state. There are no currentSource
   11635             :      * values for "check trigger", "rescan timelines", and "sleep" states,
   11636             :      * those actions are taken when reading from the previous source fails, as
   11637             :      * part of advancing to the next state.
   11638             :      *-------
   11639             :      */
   11640           3 :     if (!InArchiveRecovery)
   11641           3 :         currentSource = XLOG_FROM_PG_WAL;
   11642           0 :     else if (currentSource == 0)
   11643           0 :         currentSource = XLOG_FROM_ARCHIVE;
   11644             : 
   11645             :     for (;;)
   11646             :     {
   11647           3 :         int         oldSource = currentSource;
   11648             : 
   11649             :         /*
   11650             :          * First check if we failed to read from the current source, and
   11651             :          * advance the state machine if so. The failure to read might've
   11652             :          * happened outside this function, e.g when a CRC check fails on a
   11653             :          * record, or within this loop.
   11654             :          */
   11655           3 :         if (lastSourceFailed)
   11656             :         {
   11657           0 :             switch (currentSource)
   11658             :             {
   11659             :                 case XLOG_FROM_ARCHIVE:
   11660             :                 case XLOG_FROM_PG_WAL:
   11661             : 
   11662             :                     /*
   11663             :                      * Check to see if the trigger file exists. Note that we
   11664             :                      * do this only after failure, so when you create the
   11665             :                      * trigger file, we still finish replaying as much as we
   11666             :                      * can from archive and pg_wal before failover.
   11667             :                      */
   11668           0 :                     if (StandbyMode && CheckForStandbyTrigger())
   11669             :                     {
   11670           0 :                         ShutdownWalRcv();
   11671           0 :                         return false;
   11672             :                     }
   11673             : 
   11674             :                     /*
   11675             :                      * Not in standby mode, and we've now tried the archive
   11676             :                      * and pg_wal.
   11677             :                      */
   11678           0 :                     if (!StandbyMode)
   11679           0 :                         return false;
   11680             : 
   11681             :                     /*
   11682             :                      * If primary_conninfo is set, launch walreceiver to try
   11683             :                      * to stream the missing WAL.
   11684             :                      *
   11685             :                      * If fetching_ckpt is TRUE, RecPtr points to the initial
   11686             :                      * checkpoint location. In that case, we use RedoStartLSN
   11687             :                      * as the streaming start position instead of RecPtr, so
   11688             :                      * that when we later jump backwards to start redo at
   11689             :                      * RedoStartLSN, we will have the logs streamed already.
   11690             :                      */
   11691           0 :                     if (PrimaryConnInfo)
   11692             :                     {
   11693             :                         XLogRecPtr  ptr;
   11694             :                         TimeLineID  tli;
   11695             : 
   11696           0 :                         if (fetching_ckpt)
   11697             :                         {
   11698           0 :                             ptr = RedoStartLSN;
   11699           0 :                             tli = ControlFile->checkPointCopy.ThisTimeLineID;
   11700             :                         }
   11701             :                         else
   11702             :                         {
   11703           0 :                             ptr = tliRecPtr;
   11704           0 :                             tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
   11705             : 
   11706           0 :                             if (curFileTLI > 0 && tli < curFileTLI)
   11707           0 :                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
   11708             :                                      (uint32) (ptr >> 32), (uint32) ptr,
   11709             :                                      tli, curFileTLI);
   11710             :                         }
   11711           0 :                         curFileTLI = tli;
   11712           0 :                         RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
   11713             :                                              PrimarySlotName);
   11714           0 :                         receivedUpto = 0;
   11715             :                     }
   11716             : 
   11717             :                     /*
   11718             :                      * Move to XLOG_FROM_STREAM state in either case. We'll
   11719             :                      * get immediate failure if we didn't launch walreceiver,
   11720             :                      * and move on to the next state.
   11721             :                      */
   11722           0 :                     currentSource = XLOG_FROM_STREAM;
   11723           0 :                     break;
   11724             : 
   11725             :                 case XLOG_FROM_STREAM:
   11726             : 
   11727             :                     /*
   11728             :                      * Failure while streaming. Most likely, we got here
   11729             :                      * because streaming replication was terminated, or
   11730             :                      * promotion was triggered. But we also get here if we
   11731             :                      * find an invalid record in the WAL streamed from master,
   11732             :                      * in which case something is seriously wrong. There's
   11733             :                      * little chance that the problem will just go away, but
   11734             :                      * PANIC is not good for availability either, especially
   11735             :                      * in hot standby mode. So, we treat that the same as
   11736             :                      * disconnection, and retry from archive/pg_wal again. The
   11737             :                      * WAL in the archive should be identical to what was
   11738             :                      * streamed, so it's unlikely that it helps, but one can
   11739             :                      * hope...
   11740             :                      */
   11741             : 
   11742             :                     /*
   11743             :                      * Before we leave XLOG_FROM_STREAM state, make sure that
   11744             :                      * walreceiver is not active, so that it won't overwrite
   11745             :                      * WAL that we restore from archive.
   11746             :                      */
   11747           0 :                     if (WalRcvStreaming())
   11748           0 :                         ShutdownWalRcv();
   11749             : 
   11750             :                     /*
   11751             :                      * Before we sleep, re-scan for possible new timelines if
   11752             :                      * we were requested to recover to the latest timeline.
   11753             :                      */
   11754           0 :                     if (recoveryTargetIsLatest)
   11755             :                     {
   11756           0 :                         if (rescanLatestTimeLine())
   11757             :                         {
   11758           0 :                             currentSource = XLOG_FROM_ARCHIVE;
   11759           0 :                             break;
   11760             :                         }
   11761             :                     }
   11762             : 
   11763             :                     /*
   11764             :                      * XLOG_FROM_STREAM is the last state in our state
   11765             :                      * machine, so we've exhausted all the options for
   11766             :                      * obtaining the requested WAL. We're going to loop back
   11767             :                      * and retry from the archive, but if it hasn't been long
   11768             :                      * since last attempt, sleep wal_retrieve_retry_interval
   11769             :                      * milliseconds to avoid busy-waiting.
   11770             :                      */
   11771           0 :                     now = GetCurrentTimestamp();
   11772           0 :                     if (!TimestampDifferenceExceeds(last_fail_time, now,
   11773             :                                                     wal_retrieve_retry_interval))
   11774             :                     {
   11775             :                         long        secs,
   11776             :                                     wait_time;
   11777             :                         int         usecs;
   11778             : 
   11779           0 :                         TimestampDifference(last_fail_time, now, &secs, &usecs);
   11780           0 :                         wait_time = wal_retrieve_retry_interval -
   11781           0 :                             (secs * 1000 + usecs / 1000);
   11782             : 
   11783           0 :                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
   11784             :                                   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
   11785             :                                   wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
   11786           0 :                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
   11787           0 :                         now = GetCurrentTimestamp();
   11788             :                     }
   11789           0 :                     last_fail_time = now;
   11790           0 :                     currentSource = XLOG_FROM_ARCHIVE;
   11791           0 :                     break;
   11792             : 
   11793             :                 default:
   11794           0 :                     elog(ERROR, "unexpected WAL source %d", currentSource);
   11795             :             }
   11796             :         }
   11797           3 :         else if (currentSource == XLOG_FROM_PG_WAL)
   11798             :         {
   11799             :             /*
   11800             :              * We just successfully read a file in pg_wal. We prefer files in
   11801             :              * the archive over ones in pg_wal, so try the next file again
   11802             :              * from the archive first.
   11803             :              */
   11804           3 :             if (InArchiveRecovery)
   11805           0 :                 currentSource = XLOG_FROM_ARCHIVE;
   11806             :         }
   11807             : 
   11808           3 :         if (currentSource != oldSource)
   11809           0 :             elog(DEBUG2, "switched WAL source from %s to %s after %s",
   11810             :                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
   11811             :                  lastSourceFailed ? "failure" : "success");
   11812             : 
   11813             :         /*
   11814             :          * We've now handled possible failure. Try to read from the chosen
   11815             :          * source.
   11816             :          */
   11817           3 :         lastSourceFailed = false;
   11818             : 
   11819           3 :         switch (currentSource)
   11820             :         {
   11821             :             case XLOG_FROM_ARCHIVE:
   11822             :             case XLOG_FROM_PG_WAL:
   11823             :                 /* Close any old file we might have open. */
   11824           3 :                 if (readFile >= 0)
   11825             :                 {
   11826           0 :                     close(readFile);
   11827           0 :                     readFile = -1;
   11828             :                 }
   11829             :                 /* Reset curFileTLI if random fetch. */
   11830           3 :                 if (randAccess)
   11831           3 :                     curFileTLI = 0;
   11832             : 
   11833             :                 /*
   11834             :                  * Try to restore the file from archive, or read an existing
   11835             :                  * file from pg_wal.
   11836             :                  */
   11837           3 :                 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
   11838           3 :                                               currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
   11839             :                                               currentSource);
   11840           3 :                 if (readFile >= 0)
   11841           3 :                     return true;    /* success! */
   11842             : 
   11843             :                 /*
   11844             :                  * Nope, not found in archive or pg_wal.
   11845             :                  */
   11846           0 :                 lastSourceFailed = true;
   11847           0 :                 break;
   11848             : 
   11849             :             case XLOG_FROM_STREAM:
   11850             :                 {
   11851             :                     bool        havedata;
   11852             : 
   11853             :                     /*
   11854             :                      * Check if WAL receiver is still active.
   11855             :                      */
   11856           0 :                     if (!WalRcvStreaming())
   11857             :                     {
   11858           0 :                         lastSourceFailed = true;
   11859           0 :                         break;
   11860             :                     }
   11861             : 
   11862             :                     /*
   11863             :                      * Walreceiver is active, so see if new data has arrived.
   11864             :                      *
   11865             :                      * We only advance XLogReceiptTime when we obtain fresh
   11866             :                      * WAL from walreceiver and observe that we had already
   11867             :                      * processed everything before the most recent "chunk"
   11868             :                      * that it flushed to disk.  In steady state where we are
   11869             :                      * keeping up with the incoming data, XLogReceiptTime will
   11870             :                      * be updated on each cycle. When we are behind,
   11871             :                      * XLogReceiptTime will not advance, so the grace time
   11872             :                      * allotted to conflicting queries will decrease.
   11873             :                      */
   11874           0 :                     if (RecPtr < receivedUpto)
   11875           0 :                         havedata = true;
   11876             :                     else
   11877             :                     {
   11878             :                         XLogRecPtr  latestChunkStart;
   11879             : 
   11880           0 :                         receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
   11881           0 :                         if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
   11882             :                         {
   11883           0 :                             havedata = true;
   11884           0 :                             if (latestChunkStart <= RecPtr)
   11885             :                             {
   11886           0 :                                 XLogReceiptTime = GetCurrentTimestamp();
   11887           0 :                                 SetCurrentChunkStartTime(XLogReceiptTime);
   11888             :                             }
   11889             :                         }
   11890             :                         else
   11891           0 :                             havedata = false;
   11892             :                     }
   11893           0 :                     if (havedata)
   11894             :                     {
   11895             :                         /*
   11896             :                          * Great, streamed far enough.  Open the file if it's
   11897             :                          * not open already.  Also read the timeline history
   11898             :                          * file if we haven't initialized timeline history
   11899             :                          * yet; it should be streamed over and present in
   11900             :                          * pg_wal by now.  Use XLOG_FROM_STREAM so that source
   11901             :                          * info is set correctly and XLogReceiptTime isn't
   11902             :                          * changed.
   11903             :                          */
   11904           0 :                         if (readFile < 0)
   11905             :                         {
   11906           0 :                             if (!expectedTLEs)
   11907           0 :                                 expectedTLEs = readTimeLineHistory(receiveTLI);
   11908           0 :                             readFile = XLogFileRead(readSegNo, PANIC,
   11909             :                                                     receiveTLI,
   11910             :                                                     XLOG_FROM_STREAM, false);
   11911           0 :                             Assert(readFile >= 0);
   11912             :                         }
   11913             :                         else
   11914             :                         {
   11915             :                             /* just make sure source info is correct... */
   11916           0 :                             readSource = XLOG_FROM_STREAM;
   11917           0 :                             XLogReceiptSource = XLOG_FROM_STREAM;
   11918           0 :                             return true;
   11919             :                         }
   11920           0 :                         break;
   11921             :                     }
   11922             : 
   11923             :                     /*
   11924             :                      * Data not here yet. Check for trigger, then wait for
   11925             :                      * walreceiver to wake us up when new WAL arrives.
   11926             :                      */
   11927           0 :                     if (CheckForStandbyTrigger())
   11928             :                     {
   11929             :                         /*
   11930             :                          * Note that we don't "return false" immediately here.
   11931             :                          * After being triggered, we still want to replay all
   11932             :                          * the WAL that was already streamed. It's in pg_wal
   11933             :                          * now, so we just treat this as a failure, and the
   11934             :                          * state machine will move on to replay the streamed
   11935             :                          * WAL from pg_wal, and then recheck the trigger and
   11936             :                          * exit replay.
   11937             :                          */
   11938           0 :                         lastSourceFailed = true;
   11939           0 :                         break;
   11940             :                     }
   11941             : 
   11942             :                     /*
   11943             :                      * Since we have replayed everything we have received so
   11944             :                      * far and are about to start waiting for more WAL, let's
   11945             :                      * tell the upstream server our replay location now so
   11946             :                      * that pg_stat_replication doesn't show stale
   11947             :                      * information.
   11948             :                      */
   11949           0 :                     if (!streaming_reply_sent)
   11950             :                     {
   11951           0 :                         WalRcvForceReply();
   11952           0 :                         streaming_reply_sent = true;
   11953             :                     }
   11954             : 
   11955             :                     /*
   11956             :                      * Wait for more WAL to arrive. Time out after 5 seconds
   11957             :                      * to react to a trigger file promptly.
   11958             :                      */
   11959           0 :                     WaitLatch(&XLogCtl->recoveryWakeupLatch,
   11960             :                               WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
   11961             :                               5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
   11962           0 :                     ResetLatch(&XLogCtl->recoveryWakeupLatch);
   11963           0 :                     break;
   11964             :                 }
   11965             : 
   11966             :             default:
   11967           0 :                 elog(ERROR, "unexpected WAL source %d", currentSource);
   11968             :         }
   11969             : 
   11970             :         /*
   11971             :          * This possibly-long loop needs to handle interrupts of startup
   11972             :          * process.
   11973             :          */
   11974           0 :         HandleStartupProcInterrupts();
   11975           0 :     }
   11976             : 
   11977             :     return false;               /* not reached */
   11978             : }
   11979             : 
   11980             : /*
   11981             :  * Determine what log level should be used to report a corrupt WAL record
   11982             :  * in the current WAL page, previously read by XLogPageRead().
   11983             :  *
   11984             :  * 'emode' is the error mode that would be used to report a file-not-found
   11985             :  * or legitimate end-of-WAL situation.  Generally, we use it as-is, but if
   11986             :  * we're retrying the exact same record that we've tried previously, only
   11987             :  * complain the first time to keep the noise down.  However, we only do so when
   11988             :  * reading from pg_wal, because we don't expect any invalid records in archive
   11989             :  * or in records streamed from master. Files in the archive should be complete,
   11990             :  * and we should never hit the end of WAL because we stop and wait for more WAL
   11991             :  * to arrive before replaying it.
   11992             :  *
   11993             :  * NOTE: This function remembers the RecPtr value it was last called with,
   11994             :  * to suppress repeated messages about the same record. Only call this when
   11995             :  * you are about to ereport(), or you might cause a later message to be
   11996             :  * erroneously suppressed.
   11997             :  */
   11998             : static int
   11999           0 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
   12000             : {
   12001             :     static XLogRecPtr lastComplaint = 0;	/* RecPtr of the last pg_wal/LOG complaint; persists across calls */
   12002             : 
   12003           0 :     if (readSource == XLOG_FROM_PG_WAL && emode == LOG)	/* only LOG-level reports for pg_wal reads are de-duplicated */
   12004             :     {
   12005           0 :         if (RecPtr == lastComplaint)
   12006           0 :             emode = DEBUG1;	/* same record as the previous complaint: demote to DEBUG1 */
   12007             :         else
   12008           0 :             lastComplaint = RecPtr;	/* new record: remember it and keep the caller's emode */
   12009             :     }
   12010           0 :     return emode;	/* unchanged for every other source or level */
   12011             : }
   12012             : 
   12013             : /*
   12014             :  * Check whether a promote request has arrived or the user-specified trigger
   12015             :  * file exists.  Returns true if either holds, and on every call thereafter.
   12016             :  */
   12017             : static bool
   12018           0 : CheckForStandbyTrigger(void)
   12019             : {
   12020             :     struct stat stat_buf;	/* only stat()'s return value is examined */
   12021             :     static bool triggered = false;	/* latched once any trigger has been seen */
   12022             : 
   12023           0 :     if (triggered)
   12024           0 :         return true;
   12025             : 
   12026           0 :     if (IsPromoteTriggered())
   12027             :     {
   12028             :         /*
   12029             :          * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
   12030             :          * signal handler. It now leaves the file in place and lets the
   12031             :          * Startup process do the unlink. This allows Startup to know whether
   12032             :          * it should create a full checkpoint before starting up (fallback
   12033             :          * mode). Fast promotion takes precedence.
   12034             :          */
   12035           0 :         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
   12036             :         {
   12037           0 :             unlink(PROMOTE_SIGNAL_FILE);	/* unlink() results are not checked here */
   12038           0 :             unlink(FALLBACK_PROMOTE_SIGNAL_FILE);	/* also drop any fallback file, since fast promotion wins */
   12039           0 :             fast_promote = true;
   12040             :         }
   12041           0 :         else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)	/* only the fallback file is present */
   12042             :         {
   12043           0 :             unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
   12044           0 :             fast_promote = false;
   12045             :         }	/* if neither file exists, fast_promote is left at its current value */
   12046             : 
   12047           0 :         ereport(LOG, (errmsg("received promote request")));
   12048             : 
   12049           0 :         ResetPromoteTriggered();
   12050           0 :         triggered = true;
   12051           0 :         return true;
   12052             :     }
   12053             : 
   12054           0 :     if (TriggerFile == NULL)	/* no trigger file name is set, so there is nothing to stat */
   12055           0 :         return false;
   12056             : 
   12057           0 :     if (stat(TriggerFile, &stat_buf) == 0)
   12058             :     {
   12059           0 :         ereport(LOG,
   12060             :                 (errmsg("trigger file found: %s", TriggerFile)));
   12061           0 :         unlink(TriggerFile);	/* result not checked */
   12062           0 :         triggered = true;
   12063           0 :         fast_promote = true;	/* the trigger-file path always selects fast promotion */
   12064           0 :         return true;
   12065             :     }
   12066           0 :     else if (errno != ENOENT)	/* a missing file just means "not triggered"; other stat() failures raise ERROR */
   12067           0 :         ereport(ERROR,
   12068             :                 (errcode_for_file_access(),
   12069             :                  errmsg("could not stat trigger file \"%s\": %m",
   12070             :                         TriggerFile)));
   12071             : 
   12072           0 :     return false;
   12073             : }
   12074             : 
   12075             : /*
   12076             :  * Remove the files signaling a standby promotion request.
   12077             :  */
   12078             : void
   12079           1 : RemovePromoteSignalFiles(void)
   12080             : {
   12081           1 :     unlink(PROMOTE_SIGNAL_FILE);	/* unlink() results are ignored, so a missing file is not reported */
   12082           1 :     unlink(FALLBACK_PROMOTE_SIGNAL_FILE);	/* both the regular and the fallback signal file are removed */
   12083           1 : }
   12084             : 
   12085             : /*
   12086             :  * Check to see if a promote request has arrived. Should be
   12087             :  * called by postmaster after receiving SIGUSR1.
   12088             :  */
   12089             : bool
   12090          43 : CheckPromoteSignal(void)
   12091             : {
   12092             :     struct stat stat_buf;	/* only stat()'s return value is used */
   12093             : 
   12094          86 :     if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||	/* either signal file counts as a request */
   12095          43 :         stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
   12096           0 :         return true;	/* the files are only probed here, not removed */
   12097             : 
   12098          43 :     return false;	/* any stat() failure, whatever the errno, is treated as "no request" */
   12099             : }
   12100             : 
   12101             : /*
   12102             :  * Wake up startup process to replay newly arrived WAL, or to notice that
   12103             :  * failover has been requested.
   12104             :  */
   12105             : void
   12106           0 : WakeupRecovery(void)
   12107             : {
   12108           0 :     SetLatch(&XLogCtl->recoveryWakeupLatch);	/* the latch that WaitLatch() calls elsewhere in this file sleep on */
   12109           0 : }
   12110             : 
   12111             : /*
   12112             :  * Update the WalWriterSleeping flag.
   12113             :  */
   12114             : void
   12115           1 : SetWalWriterSleeping(bool sleeping)
   12116             : {
   12117           1 :     SpinLockAcquire(&XLogCtl->info_lck);	/* the flag is stored while holding info_lck */
   12118           1 :     XLogCtl->WalWriterSleeping = sleeping;
   12119           1 :     SpinLockRelease(&XLogCtl->info_lck);
   12120           1 : }
   12121             : 
   12122             : /*
   12123             :  * Schedule a walreceiver wakeup in the main recovery loop.
   12124             :  */
   12125             : void
   12126           0 : XLogRequestWalReceiverReply(void)
   12127             : {
   12128           0 :     doRequestWalReceiverReply = true;	/* only sets the flag; the code that reads it is not visible in this part of the file */
   12129           0 : }

Generated by: LCOV version 1.11