Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * checkpointer.c
4 : *
5 : * The checkpointer is new as of Postgres 9.2. It handles all checkpoints.
6 : * Checkpoints are automatically dispatched after a certain amount of time has
7 : * elapsed since the last one, and the checkpointer can be signaled to
8 : * perform requested checkpoints as well. (The GUC parameter that mandates a
9 : * checkpoint every so many WAL segments is implemented by having backends
10 : * signal when they fill WAL segments; the checkpointer itself doesn't
11 : * watch for the condition.)
12 : *
13 : * The checkpointer is started by the postmaster as soon as the startup
14 : * subprocess finishes, or as soon as recovery begins if we are doing archive
15 : * recovery. It remains alive until the postmaster commands it to terminate.
16 : * Normal termination is by SIGUSR2, which instructs the checkpointer to
17 : * execute a shutdown checkpoint and then exit(0). (All backends must be
18 : * stopped before SIGUSR2 is issued!) Emergency termination is by SIGQUIT;
19 : * like any backend, the checkpointer will simply abort and exit on SIGQUIT.
20 : *
21 : * If the checkpointer exits unexpectedly, the postmaster treats that the same
22 : * as a backend crash: shared memory may be corrupted, so remaining backends
23 : * should be killed by SIGQUIT and then a recovery cycle started. (Even if
24 : * shared memory isn't corrupted, we have lost information about which
25 : * files need to be fsync'd for the next checkpoint, and so a system
26 : * restart needs to be forced.)
27 : *
28 : *
29 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
30 : *
31 : *
32 : * IDENTIFICATION
33 : * src/backend/postmaster/checkpointer.c
34 : *
35 : *-------------------------------------------------------------------------
36 : */
37 : #include "postgres.h"
38 :
39 : #include <signal.h>
40 : #include <sys/time.h>
41 : #include <time.h>
42 : #include <unistd.h>
43 :
44 : #include "access/xlog.h"
45 : #include "access/xlog_internal.h"
46 : #include "libpq/pqsignal.h"
47 : #include "miscadmin.h"
48 : #include "pgstat.h"
49 : #include "postmaster/bgwriter.h"
50 : #include "replication/syncrep.h"
51 : #include "storage/bufmgr.h"
52 : #include "storage/condition_variable.h"
53 : #include "storage/fd.h"
54 : #include "storage/ipc.h"
55 : #include "storage/lwlock.h"
56 : #include "storage/proc.h"
57 : #include "storage/shmem.h"
58 : #include "storage/smgr.h"
59 : #include "storage/spin.h"
60 : #include "utils/guc.h"
61 : #include "utils/memutils.h"
62 : #include "utils/resowner.h"
63 :
64 :
65 : /*----------
66 : * Shared memory area for communication between checkpointer and backends
67 : *
68 : * The ckpt counters allow backends to watch for completion of a checkpoint
69 : * request they send. Here's how it works:
70 : * * At start of a checkpoint, checkpointer reads (and clears) the request
71 : * flags and increments ckpt_started, while holding ckpt_lck.
72 : * * On completion of a checkpoint, checkpointer sets ckpt_done to
73 : * equal ckpt_started.
74 : * * On failure of a checkpoint, checkpointer increments ckpt_failed
75 : * and sets ckpt_done to equal ckpt_started.
76 : *
77 : * The algorithm for backends is:
78 : * 1. Record current values of ckpt_failed and ckpt_started, and
79 : * set request flags, while holding ckpt_lck.
80 : * 2. Send signal to request checkpoint.
81 : * 3. Sleep until ckpt_started changes. Now you know a checkpoint has
82 : * begun since you started this algorithm (although *not* that it was
83 : * specifically initiated by your signal), and that it is using your flags.
84 : * 4. Record new value of ckpt_started.
85 : * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo
86 : * arithmetic here in case counters wrap around.) Now you know a
87 : * checkpoint has started and completed, but not whether it was
88 : * successful.
89 : * 6. If ckpt_failed is different from the originally saved value,
90 : * assume request failed; otherwise it was definitely successful.
91 : *
92 : * ckpt_flags holds the OR of the checkpoint request flags sent by all
93 : * requesting backends since the last checkpoint start. The flags are
94 : * chosen so that OR'ing is the correct way to combine multiple requests.
95 : *
96 : * num_backend_writes is used to count the number of buffer writes performed
97 : * by user backend processes. This counter should be wide enough that it
98 : * can't overflow during a single processing cycle. num_backend_fsync
99 : * counts the subset of those writes that also had to do their own fsync,
100 : * because the checkpointer failed to absorb their request.
101 : *
102 : * The requests array holds fsync requests sent by backends and not yet
103 : * absorbed by the checkpointer.
104 : *
105 : * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and
106 : * the requests fields are protected by CheckpointerCommLock.
107 : *----------
108 : */
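/*
 * Editorial sketch (not part of the original file): the "modulo
 * arithmetic" completion test from step 5 above, written out as a
 * hypothetical helper.  Comparing the signed difference against zero
 * keeps the test correct even after the counters wrap around, as long
 * as the two counters stay within INT_MAX of each other.
 * RequestCheckpoint() below performs exactly this comparison inline.
 */
static inline bool
ckpt_request_completed(int saved_started, int current_done)
{
	/* true once ckpt_done has advanced at least as far as our snapshot */
	return (current_done - saved_started) >= 0;
}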
109 : typedef struct
110 : {
111 : RelFileNode rnode;
112 : ForkNumber forknum;
113 : BlockNumber segno; /* see md.c for special values */
114 : /* might add a real request-type field later; not needed yet */
115 : } CheckpointerRequest;
116 :
117 : typedef struct
118 : {
119 : pid_t checkpointer_pid; /* PID (0 if not started) */
120 :
121 : slock_t ckpt_lck; /* protects all the ckpt_* fields */
122 :
123 : int ckpt_started; /* advances when checkpoint starts */
124 : int ckpt_done; /* advances when checkpoint done */
125 : int ckpt_failed; /* advances when checkpoint fails */
126 :
127 : int ckpt_flags; /* checkpoint flags, as defined in xlog.h */
128 :
129 : uint32 num_backend_writes; /* counts user backend buffer writes */
130 : uint32 num_backend_fsync; /* counts user backend fsync calls */
131 :
132 : int num_requests; /* current # of requests */
133 : int max_requests; /* allocated array size */
134 : CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
135 : } CheckpointerShmemStruct;
136 :
137 : static CheckpointerShmemStruct *CheckpointerShmem;
138 :
139 : /* interval for calling AbsorbFsyncRequests in CheckpointWriteDelay */
140 : #define WRITES_PER_ABSORB 1000
141 :
142 : /*
143 : * GUC parameters
144 : */
145 : int CheckPointTimeout = 300;
146 : int CheckPointWarning = 30;
147 : double CheckPointCompletionTarget = 0.5;
148 :
149 : /*
150 : * Flags set by interrupt handlers for later service in the main loop.
151 : */
152 : static volatile sig_atomic_t got_SIGHUP = false;
153 : static volatile sig_atomic_t checkpoint_requested = false;
154 : static volatile sig_atomic_t shutdown_requested = false;
155 :
156 : /*
157 : * Private state
158 : */
159 : static bool ckpt_active = false;
160 :
161 : /* these values are valid when ckpt_active is true: */
162 : static pg_time_t ckpt_start_time;
163 : static XLogRecPtr ckpt_start_recptr;
164 : static double ckpt_cached_elapsed;
165 :
166 : static pg_time_t last_checkpoint_time;
167 : static pg_time_t last_xlog_switch_time;
168 :
169 : /* Prototypes for private functions */
170 :
171 : static void CheckArchiveTimeout(void);
172 : static bool IsCheckpointOnSchedule(double progress);
173 : static bool ImmediateCheckpointRequested(void);
174 : static bool CompactCheckpointerRequestQueue(void);
175 : static void UpdateSharedMemoryConfig(void);
176 :
177 : /* Signal handlers */
178 :
179 : static void chkpt_quickdie(SIGNAL_ARGS);
180 : static void ChkptSigHupHandler(SIGNAL_ARGS);
181 : static void ReqCheckpointHandler(SIGNAL_ARGS);
182 : static void chkpt_sigusr1_handler(SIGNAL_ARGS);
183 : static void ReqShutdownHandler(SIGNAL_ARGS);
184 :
185 :
186 : /*
187 : * Main entry point for checkpointer process
188 : *
189 : * This is invoked from AuxiliaryProcessMain, which has already created the
190 : * basic execution environment, but not enabled signals yet.
191 : */
192 : void
193 1 : CheckpointerMain(void)
194 : {
195 : sigjmp_buf local_sigjmp_buf;
196 : MemoryContext checkpointer_context;
197 :
198 1 : CheckpointerShmem->checkpointer_pid = MyProcPid;
199 :
200 : /*
201 : * Properly accept or ignore signals the postmaster might send us
202 : *
203 : * Note: we deliberately ignore SIGTERM, because during a standard Unix
204 : * system shutdown cycle, init will SIGTERM all processes at once. We
205 : * want to wait for the backends to exit, whereupon the postmaster will
206 : * tell us it's okay to shut down (via SIGUSR2).
207 : */
208 1 : pqsignal(SIGHUP, ChkptSigHupHandler); /* set flag to read config file */
209 1 : pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
210 1 : pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
211 1 : pqsignal(SIGQUIT, chkpt_quickdie); /* hard crash time */
212 1 : pqsignal(SIGALRM, SIG_IGN);
213 1 : pqsignal(SIGPIPE, SIG_IGN);
214 1 : pqsignal(SIGUSR1, chkpt_sigusr1_handler);
215 1 : pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */
216 :
217 : /*
218 : * Reset some signals that are accepted by postmaster but not here
219 : */
220 1 : pqsignal(SIGCHLD, SIG_DFL);
221 1 : pqsignal(SIGTTIN, SIG_DFL);
222 1 : pqsignal(SIGTTOU, SIG_DFL);
223 1 : pqsignal(SIGCONT, SIG_DFL);
224 1 : pqsignal(SIGWINCH, SIG_DFL);
225 :
226 : /* We allow SIGQUIT (quickdie) at all times */
227 1 : sigdelset(&BlockSig, SIGQUIT);
228 :
229 : /*
230 : * Initialize so that first time-driven event happens at the correct time.
231 : */
232 1 : last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
233 :
234 : /*
235 : * Create a resource owner to keep track of our resources (currently only
236 : * buffer pins).
237 : */
238 1 : CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer");
239 :
240 : /*
241 : * Create a memory context that we will do all our work in. We do this so
242 : * that we can reset the context during error recovery and thereby avoid
243 : * possible memory leaks. Formerly this code just ran in
244 : * TopMemoryContext, but resetting that would be a really bad idea.
245 : */
246 1 : checkpointer_context = AllocSetContextCreate(TopMemoryContext,
247 : "Checkpointer",
248 : ALLOCSET_DEFAULT_SIZES);
249 1 : MemoryContextSwitchTo(checkpointer_context);
250 :
251 : /*
252 : * If an exception is encountered, processing resumes here.
253 : *
254 : * See notes in postgres.c about the design of this coding.
255 : */
256 1 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
257 : {
258 : /* Since not using PG_TRY, must reset error stack by hand */
259 0 : error_context_stack = NULL;
260 :
261 : /* Prevent interrupts while cleaning up */
262 0 : HOLD_INTERRUPTS();
263 :
264 : /* Report the error to the server log */
265 0 : EmitErrorReport();
266 :
267 : /*
268 : * These operations are really just a minimal subset of
269 : * AbortTransaction(). We don't have very many resources to worry
270 : * about in checkpointer, but we do have LWLocks, buffers, and temp
271 : * files.
272 : */
273 0 : LWLockReleaseAll();
274 0 : ConditionVariableCancelSleep();
275 0 : pgstat_report_wait_end();
276 0 : AbortBufferIO();
277 0 : UnlockBuffers();
278 : /* buffer pins are released here: */
279 0 : ResourceOwnerRelease(CurrentResourceOwner,
280 : RESOURCE_RELEASE_BEFORE_LOCKS,
281 : false, true);
282 : /* we needn't bother with the other ResourceOwnerRelease phases */
283 0 : AtEOXact_Buffers(false);
284 0 : AtEOXact_SMgr();
285 0 : AtEOXact_Files();
286 0 : AtEOXact_HashTables(false);
287 :
288 : /* Warn any waiting backends that the checkpoint failed. */
289 0 : if (ckpt_active)
290 : {
291 0 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
292 0 : CheckpointerShmem->ckpt_failed++;
293 0 : CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
294 0 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
295 :
296 0 : ckpt_active = false;
297 : }
298 :
299 : /*
300 : * Now return to normal top-level context and clear ErrorContext for
301 : * next time.
302 : */
303 0 : MemoryContextSwitchTo(checkpointer_context);
304 0 : FlushErrorState();
305 :
306 : /* Flush any leaked data in the top-level context */
307 0 : MemoryContextResetAndDeleteChildren(checkpointer_context);
308 :
309 : /* Now we can allow interrupts again */
310 0 : RESUME_INTERRUPTS();
311 :
312 : /*
313 : * Sleep at least 1 second after any error. A write error is likely
314 : * to be repeated, and we don't want to be filling the error logs as
315 : * fast as we can.
316 : */
317 0 : pg_usleep(1000000L);
318 :
319 : /*
320 : * Close all open files after any error. This is helpful on Windows,
321 : * where holding deleted files open causes various strange errors.
322 : * It's not clear we need it elsewhere, but shouldn't hurt.
323 : */
324 0 : smgrcloseall();
325 : }
326 :
327 : /* We can now handle ereport(ERROR) */
328 1 : PG_exception_stack = &local_sigjmp_buf;
329 :
330 : /*
331 : * Unblock signals (they were blocked when the postmaster forked us)
332 : */
333 1 : PG_SETMASK(&UnBlockSig);
334 :
335 : /*
336 : * Ensure all shared memory values are set correctly for the config. Doing
337 : * this here ensures no race conditions from other concurrent updaters.
338 : */
339 1 : UpdateSharedMemoryConfig();
340 :
341 : /*
342 : * Advertise our latch that backends can use to wake us up while we're
343 : * sleeping.
344 : */
345 1 : ProcGlobal->checkpointerLatch = &MyProc->procLatch;
346 :
347 : /*
348 : * Loop forever
349 : */
350 : for (;;)
351 : {
352 9 : bool do_checkpoint = false;
353 9 : int flags = 0;
354 : pg_time_t now;
355 : int elapsed_secs;
356 : int cur_timeout;
357 : int rc;
358 :
359 : /* Clear any already-pending wakeups */
360 9 : ResetLatch(MyLatch);
361 :
362 : /*
363 : * Process any requests or signals received recently.
364 : */
365 9 : AbsorbFsyncRequests();
366 :
367 9 : if (got_SIGHUP)
368 : {
369 0 : got_SIGHUP = false;
370 0 : ProcessConfigFile(PGC_SIGHUP);
371 :
372 : /*
373 : * Checkpointer is the last process to shut down, so we ask it to
374 : * hold the keys for a range of other required tasks, most of which
375 : * have nothing to do with checkpointing at all.
376 : *
377 : * For various reasons, some config values can change dynamically
378 : * so the primary copy of them is held in shared memory to make
379 : * sure all backends see the same value. We make Checkpointer
380 : * responsible for updating the shared memory copy if the
381 : * parameter setting changes because of SIGHUP.
382 : */
383 0 : UpdateSharedMemoryConfig();
384 : }
385 9 : if (checkpoint_requested)
386 : {
387 4 : checkpoint_requested = false;
388 4 : do_checkpoint = true;
389 4 : BgWriterStats.m_requested_checkpoints++;
390 : }
391 9 : if (shutdown_requested)
392 : {
393 : /*
394 : * From here on, elog(ERROR) should end with exit(1), not send
395 : * control back to the sigsetjmp block above
396 : */
397 1 : ExitOnAnyError = true;
398 : /* Close down the database */
399 1 : ShutdownXLOG(0, 0);
400 : /* Normal exit from the checkpointer is here */
401 1 : proc_exit(0); /* done */
402 : }
403 :
404 : /*
405 : * Force a checkpoint if too much time has elapsed since the last one.
406 : * Note that we count a timed checkpoint in stats only when this
407 : * occurs without an external request, but we set the CAUSE_TIME flag
408 : * bit even if there is also an external request.
409 : */
410 8 : now = (pg_time_t) time(NULL);
411 8 : elapsed_secs = now - last_checkpoint_time;
412 8 : if (elapsed_secs >= CheckPointTimeout)
413 : {
414 0 : if (!do_checkpoint)
415 0 : BgWriterStats.m_timed_checkpoints++;
416 0 : do_checkpoint = true;
417 0 : flags |= CHECKPOINT_CAUSE_TIME;
418 : }
419 :
420 : /*
421 : * Do a checkpoint if requested.
422 : */
423 8 : if (do_checkpoint)
424 : {
425 4 : bool ckpt_performed = false;
426 : bool do_restartpoint;
427 :
428 : /*
429 : * Check if we should perform a checkpoint or a restartpoint. As a
430 : * side-effect, RecoveryInProgress() initializes TimeLineID if
431 : * it's not set yet.
432 : */
433 4 : do_restartpoint = RecoveryInProgress();
434 :
435 : /*
436 : * Atomically fetch the request flags to figure out what kind of
437 : * checkpoint we should perform, and increase the started-counter
438 : * to acknowledge that we've started a new checkpoint.
439 : */
440 4 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
441 4 : flags |= CheckpointerShmem->ckpt_flags;
442 4 : CheckpointerShmem->ckpt_flags = 0;
443 4 : CheckpointerShmem->ckpt_started++;
444 4 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
445 :
446 : /*
447 : * The end-of-recovery checkpoint is a real checkpoint that's
448 : * performed while we're still in recovery.
449 : */
450 4 : if (flags & CHECKPOINT_END_OF_RECOVERY)
451 0 : do_restartpoint = false;
452 :
453 : /*
454 : * We will warn if (a) it is too soon since the last checkpoint (whatever
455 : * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
456 : * since the last checkpoint start. Note in particular that this
457 : * implementation will not generate warnings caused by
458 : * CheckPointTimeout < CheckPointWarning.
459 : */
460 8 : if (!do_restartpoint &&
461 4 : (flags & CHECKPOINT_CAUSE_XLOG) &&
462 0 : elapsed_secs < CheckPointWarning)
463 0 : ereport(LOG,
464 : (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
465 : "checkpoints are occurring too frequently (%d seconds apart)",
466 : elapsed_secs,
467 : elapsed_secs),
468 : errhint("Consider increasing the configuration parameter \"max_wal_size\".")));
469 :
470 : /*
471 : * Initialize checkpointer-private variables used during
472 : * checkpoint.
473 : */
474 4 : ckpt_active = true;
475 4 : if (do_restartpoint)
476 0 : ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
477 : else
478 4 : ckpt_start_recptr = GetInsertRecPtr();
479 4 : ckpt_start_time = now;
480 4 : ckpt_cached_elapsed = 0;
481 :
482 : /*
483 : * Do the checkpoint.
484 : */
485 4 : if (!do_restartpoint)
486 : {
487 4 : CreateCheckPoint(flags);
488 4 : ckpt_performed = true;
489 : }
490 : else
491 0 : ckpt_performed = CreateRestartPoint(flags);
492 :
493 : /*
494 : * After any checkpoint, close all smgr files. This is so we
495 : * won't hang onto smgr references to deleted files indefinitely.
496 : */
497 4 : smgrcloseall();
498 :
499 : /*
500 : * Indicate checkpoint completion to any waiting backends.
501 : */
502 4 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
503 4 : CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
504 4 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
505 :
506 4 : if (ckpt_performed)
507 : {
508 : /*
509 : * Note we record the checkpoint start time, not the end time, as
510 : * last_checkpoint_time. This is so that time-driven
511 : * checkpoints happen at a predictable spacing.
512 : */
513 4 : last_checkpoint_time = now;
514 : }
515 : else
516 : {
517 : /*
518 : * We were not able to perform the restartpoint (checkpoints
519 : * throw an ERROR in case of error), most likely because we
520 : * have not received any new checkpoint WAL records since the
521 : * last restartpoint. Try again in 15 seconds.
522 : */
523 0 : last_checkpoint_time = now - CheckPointTimeout + 15;
524 : }
525 :
526 4 : ckpt_active = false;
527 : }
528 :
529 : /* Check for archive_timeout and switch xlog files if necessary. */
530 8 : CheckArchiveTimeout();
531 :
532 : /*
533 : * Send off activity statistics to the stats collector. (The reason
534 : * why we re-use bgwriter-related code for this is that the bgwriter
535 : * and checkpointer used to be just one process. It's probably not
536 : * worth the trouble to split the stats support into two independent
537 : * stats message types.)
538 : */
539 8 : pgstat_send_bgwriter();
540 :
541 : /*
542 : * Sleep until we are signaled or it's time for another checkpoint or
543 : * xlog file switch.
544 : */
545 8 : now = (pg_time_t) time(NULL);
546 8 : elapsed_secs = now - last_checkpoint_time;
547 8 : if (elapsed_secs >= CheckPointTimeout)
548 0 : continue; /* no sleep for us ... */
549 8 : cur_timeout = CheckPointTimeout - elapsed_secs;
550 8 : if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
551 : {
552 0 : elapsed_secs = now - last_xlog_switch_time;
553 0 : if (elapsed_secs >= XLogArchiveTimeout)
554 0 : continue; /* no sleep for us ... */
555 0 : cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
556 : }
557 :
558 8 : rc = WaitLatch(MyLatch,
559 : WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
560 : cur_timeout * 1000L /* convert to ms */ ,
561 : WAIT_EVENT_CHECKPOINTER_MAIN);
562 :
563 : /*
564 : * Emergency bailout if postmaster has died. This is to avoid the
565 : * necessity for manual cleanup of all postmaster children.
566 : */
567 8 : if (rc & WL_POSTMASTER_DEATH)
568 0 : exit(1);
569 8 : }
570 : }
571 :
572 : /*
573 : * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
574 : *
575 : * This will switch to a new WAL file and force an archive file write if
576 : * meaningful activity is recorded in the current WAL file. This includes most
577 : * writes, including just a single checkpoint record, but excludes WAL records
578 : * that were inserted with the XLOG_MARK_UNIMPORTANT flag set (like
579 : * snapshots of running transactions). Such records, depending on
580 : * configuration, occur at regular intervals and don't contain important
581 : * information. This avoids generating archives with a few unimportant
582 : * records.
583 : */
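/*
 * Editorial example (settings illustrative): with archive_timeout = 60 in
 * postgresql.conf, a server that has logged any "important" WAL since the
 * last switch forces a segment switch at most once per minute; a server
 * whose only recent WAL is XLOG_MARK_UNIMPORTANT records (such as standby
 * snapshots) forces none, and so produces no needless archive files.
 */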
584 : static void
585 8 : CheckArchiveTimeout(void)
586 : {
587 : pg_time_t now;
588 : pg_time_t last_time;
589 : XLogRecPtr last_switch_lsn;
590 :
591 8 : if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
592 16 : return;
593 :
594 0 : now = (pg_time_t) time(NULL);
595 :
596 : /* First we do a quick check using possibly-stale local state. */
597 0 : if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
598 0 : return;
599 :
600 : /*
601 : * Update local state ... note that last_xlog_switch_time is the last time
602 : * a switch was performed *or requested*.
603 : */
604 0 : last_time = GetLastSegSwitchData(&last_switch_lsn);
605 :
606 0 : last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
607 :
608 : /* Now we can do the real checks */
609 0 : if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
610 : {
611 : /*
612 : * Switch segment only when "important" WAL has been logged since the
613 : * last segment switch (last_switch_lsn points to the end of the
614 : * segment the switch occurred in).
615 : */
616 0 : if (GetLastImportantRecPtr() > last_switch_lsn)
617 : {
618 : XLogRecPtr switchpoint;
619 :
620 : /* mark switch as unimportant, avoids triggering checkpoints */
621 0 : switchpoint = RequestXLogSwitch(true);
622 :
623 : /*
624 : * If the returned pointer points exactly to a segment boundary,
625 : * assume nothing happened.
626 : */
627 0 : if ((switchpoint % XLogSegSize) != 0)
628 0 : elog(DEBUG1, "write-ahead log switch forced (archive_timeout=%d)",
629 : XLogArchiveTimeout);
630 : }
631 :
632 : /*
633 : * Update state in any case, so we don't retry constantly when the
634 : * system is idle.
635 : */
636 0 : last_xlog_switch_time = now;
637 : }
638 : }
639 :
640 : /*
641 : * Returns true if an immediate checkpoint request is pending. (Note that
642 : * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
643 : * there is one pending behind it.)
644 : */
645 : static bool
646 0 : ImmediateCheckpointRequested(void)
647 : {
648 0 : if (checkpoint_requested)
649 : {
650 0 : volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
651 :
652 : /*
653 : * We don't need to acquire the ckpt_lck in this case because we're
654 : * only looking at a single flag bit.
655 : */
656 0 : if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
657 0 : return true;
658 : }
659 0 : return false;
660 : }
661 :
662 : /*
663 : * CheckpointWriteDelay -- control rate of checkpoint
664 : *
665 : * This function is called after each page write performed by BufferSync().
666 : * It is responsible for throttling BufferSync()'s write rate to hit
667 : * checkpoint_completion_target.
668 : *
669 : * The checkpoint request flags should be passed in; currently the only one
670 : * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
671 : *
672 : * 'progress' is an estimate of how much of the work has been done, as a
673 : * fraction between 0.0 meaning none, and 1.0 meaning all done.
674 : */
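/*
 * Editorial call-site sketch (variable names hypothetical; cf. BufferSync()
 * in bufmgr.c): the caller invokes this after every write with its running
 * completion fraction, along the lines of
 *
 *		CheckpointWriteDelay(flags, (double) num_written / num_to_write);
 *
 * so that the naps below stretch the write phase out toward
 * checkpoint_completion_target of the checkpoint interval.
 */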
675 : void
676 8514 : CheckpointWriteDelay(int flags, double progress)
677 : {
678 : static int absorb_counter = WRITES_PER_ABSORB;
679 :
680 : /* Do nothing if checkpoint is being executed by non-checkpointer process */
681 8514 : if (!AmCheckpointerProcess())
682 9478 : return;
683 :
684 : /*
685 : * Perform the usual duties and take a nap, unless we're behind schedule,
686 : * in which case we just try to catch up as quickly as possible.
687 : */
688 7550 : if (!(flags & CHECKPOINT_IMMEDIATE) &&
689 0 : !shutdown_requested &&
690 0 : !ImmediateCheckpointRequested() &&
691 0 : IsCheckpointOnSchedule(progress))
692 : {
693 0 : if (got_SIGHUP)
694 : {
695 0 : got_SIGHUP = false;
696 0 : ProcessConfigFile(PGC_SIGHUP);
697 : /* update shmem copies of config variables */
698 0 : UpdateSharedMemoryConfig();
699 : }
700 :
701 0 : AbsorbFsyncRequests();
702 0 : absorb_counter = WRITES_PER_ABSORB;
703 :
704 0 : CheckArchiveTimeout();
705 :
706 : /*
707 : * Report interim activity statistics to the stats collector.
708 : */
709 0 : pgstat_send_bgwriter();
710 :
711 : /*
712 : * This sleep used to be connected to bgwriter_delay, typically 200ms.
713 : * That resulted in more frequent wakeups when there was not much work to
714 : * do. Checkpointer and bgwriter are no longer related, so take the Big
715 : * Sleep.
716 : */
717 0 : pg_usleep(100000L);
718 : }
719 7550 : else if (--absorb_counter <= 0)
720 : {
721 : /*
722 : * Absorb pending fsync requests after each WRITES_PER_ABSORB write
723 : * operations even when we don't sleep, to prevent overflow of the
724 : * fsync request queue.
725 : */
726 7 : AbsorbFsyncRequests();
727 7 : absorb_counter = WRITES_PER_ABSORB;
728 : }
729 : }
730 :
731 : /*
732 : * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
733 : * (or restartpoint) in time?
734 : *
735 : * Compares the current progress against the time/segments elapsed since last
736 : * checkpoint, and returns true if the progress we've made this far is greater
737 : * than the elapsed time/segments.
738 : */
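/*
 * Editorial worked example (numbers illustrative): with
 * checkpoint_completion_target = 0.5 and 60% of the buffers written,
 * progress scales to 0.6 * 0.5 = 0.3.  If 4 out of CheckPointSegments = 10
 * segments' worth of WAL have been consumed since the checkpoint started,
 * elapsed_xlogs = 0.4 > 0.3, so we are behind schedule and return false,
 * letting the caller skip its sleep until the write fraction catches up.
 */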
739 : static bool
740 0 : IsCheckpointOnSchedule(double progress)
741 : {
742 : XLogRecPtr recptr;
743 : struct timeval now;
744 : double elapsed_xlogs,
745 : elapsed_time;
746 :
747 0 : Assert(ckpt_active);
748 :
749 : /* Scale progress according to checkpoint_completion_target. */
750 0 : progress *= CheckPointCompletionTarget;
751 :
752 : /*
753 : * Check against the cached value first. Only do the more expensive
754 : * calculations once we reach the target previously calculated. Since
755 : * neither time nor the WAL insert pointer moves backwards, a freshly
756 : * calculated value can only be greater than or equal to the cached value.
757 : */
758 0 : if (progress < ckpt_cached_elapsed)
759 0 : return false;
760 :
761 : /*
762 : * Check progress against WAL segments written and CheckPointSegments.
763 : *
764 : * We compare the current WAL insert location against the location
765 : * computed before calling CreateCheckPoint. The code in XLogInsert that
766 : * actually triggers a checkpoint when CheckPointSegments is exceeded
767 : * compares against RedoRecptr, so this is not completely accurate.
768 : * However, it's good enough for our purposes; we're only calculating an
769 : * estimate anyway.
770 : *
771 : * During recovery, we compare last replayed WAL record's location with
772 : * the location computed before calling CreateRestartPoint. That maintains
773 : * the same pacing as we have during checkpoints in normal operation, but
774 : * we might exceed max_wal_size by a fair amount. That's because there can
775 : * be a large gap between a checkpoint's redo-pointer and the checkpoint
776 : * record itself, and we only start the restartpoint after we've seen the
777 : * checkpoint record. (The gap is typically up to CheckPointSegments *
778 : * checkpoint_completion_target where checkpoint_completion_target is the
779 : * value that was in effect when the WAL was generated).
780 : */
781 0 : if (RecoveryInProgress())
782 0 : recptr = GetXLogReplayRecPtr(NULL);
783 : else
784 0 : recptr = GetInsertRecPtr();
785 0 : elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) / XLogSegSize) / CheckPointSegments;
786 :
787 0 : if (progress < elapsed_xlogs)
788 : {
789 0 : ckpt_cached_elapsed = elapsed_xlogs;
790 0 : return false;
791 : }
792 :
793 : /*
794 : * Check progress against time elapsed and checkpoint_timeout.
795 : */
796 0 : gettimeofday(&now, NULL);
797 0 : elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
798 0 : now.tv_usec / 1000000.0) / CheckPointTimeout;
799 :
800 0 : if (progress < elapsed_time)
801 : {
802 0 : ckpt_cached_elapsed = elapsed_time;
803 0 : return false;
804 : }
805 :
806 : /* It looks like we're on schedule. */
807 0 : return true;
808 : }
809 :
810 :
811 : /* --------------------------------
812 : * signal handler routines
813 : * --------------------------------
814 : */
815 :
816 : /*
817 : * chkpt_quickdie() is called when the postmaster signals us with SIGQUIT.
818 : *
819 : * Some backend has bought the farm,
820 : * so we need to stop what we're doing and exit.
821 : */
822 : static void
823 0 : chkpt_quickdie(SIGNAL_ARGS)
824 : {
825 0 : PG_SETMASK(&BlockSig);
826 :
827 : /*
828 : * We DO NOT want to run proc_exit() callbacks -- we're here because
829 : * shared memory may be corrupted, so we don't want to try to clean up our
830 : * transaction. Just nail the windows shut and get out of town. Now that
831 : * there's an atexit callback to prevent third-party code from breaking
832 : * things by calling exit() directly, we have to reset the callbacks
833 : * explicitly to make this work as intended.
834 : */
835 0 : on_exit_reset();
836 :
837 : /*
838 : * Note we do exit(2) not exit(0). This is to force the postmaster into a
839 : * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
840 : * backend. This is necessary precisely because we don't clean up our
841 : * shared memory state. (The "dead man switch" mechanism in pmsignal.c
842 : * should ensure the postmaster sees this as a crash, too, but no harm in
843 : * being doubly sure.)
844 : */
845 0 : exit(2);
846 : }
847 :
848 : /* SIGHUP: set flag to re-read config file at next convenient time */
849 : static void
850 0 : ChkptSigHupHandler(SIGNAL_ARGS)
851 : {
852 0 : int save_errno = errno;
853 :
854 0 : got_SIGHUP = true;
855 0 : SetLatch(MyLatch);
856 :
857 0 : errno = save_errno;
858 0 : }
859 :
860 : /* SIGINT: set flag to run a normal checkpoint right away */
861 : static void
862 4 : ReqCheckpointHandler(SIGNAL_ARGS)
863 : {
864 4 : int save_errno = errno;
865 :
866 4 : checkpoint_requested = true;
867 4 : SetLatch(MyLatch);
868 :
869 4 : errno = save_errno;
870 4 : }
871 :
872 : /* SIGUSR1: used for latch wakeups */
873 : static void
874 3 : chkpt_sigusr1_handler(SIGNAL_ARGS)
875 : {
876 3 : int save_errno = errno;
877 :
878 3 : latch_sigusr1_handler();
879 :
880 3 : errno = save_errno;
881 3 : }
882 :
883 : /* SIGUSR2: set flag to run a shutdown checkpoint and exit */
884 : static void
885 1 : ReqShutdownHandler(SIGNAL_ARGS)
886 : {
887 1 : int save_errno = errno;
888 :
889 1 : shutdown_requested = true;
890 1 : SetLatch(MyLatch);
891 :
892 1 : errno = save_errno;
893 1 : }
894 :
895 :
896 : /* --------------------------------
897 : * communication with backends
898 : * --------------------------------
899 : */
900 :
901 : /*
902 : * CheckpointerShmemSize
903 : * Compute space needed for checkpointer-related shared memory
904 : */
905 : Size
906 10 : CheckpointerShmemSize(void)
907 : {
908 : Size size;
909 :
910 : /*
911 : * Currently, the size of the requests[] array is arbitrarily set equal to
912 : * NBuffers. This may prove too large or small ...
913 : */
914 10 : size = offsetof(CheckpointerShmemStruct, requests);
915 10 : size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest)));
916 :
917 10 : return size;
918 : }
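/*
 * Editorial arithmetic note: on typical platforms CheckpointerRequest is
 * 20 bytes (three 4-byte OIDs in RelFileNode, plus a 4-byte fork number
 * and segment number), so with the default shared_buffers = 128MB
 * (NBuffers = 16384) the queue costs about 16384 * 20 = 320kB of shared
 * memory, plus the fixed-size header fields.
 */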
919 :
920 : /*
921 : * CheckpointerShmemInit
922 : * Allocate and initialize checkpointer-related shared memory
923 : */
924 : void
925 5 : CheckpointerShmemInit(void)
926 : {
927 5 : Size size = CheckpointerShmemSize();
928 : bool found;
929 :
930 5 : CheckpointerShmem = (CheckpointerShmemStruct *)
931 5 : ShmemInitStruct("Checkpointer Data",
932 : size,
933 : &found);
934 :
935 5 : if (!found)
936 : {
937 : /*
938 : * First time through, so initialize. Note that we zero the whole
939 : * requests array; this is so that CompactCheckpointerRequestQueue can
940 : * assume that any pad bytes in the request structs are zeroes.
941 : */
942 5 : MemSet(CheckpointerShmem, 0, size);
943 5 : SpinLockInit(&CheckpointerShmem->ckpt_lck);
944 5 : CheckpointerShmem->max_requests = NBuffers;
945 : }
946 5 : }
947 :
948 : /*
949 : * RequestCheckpoint
950 : * Called in backend processes to request a checkpoint
951 : *
952 : * flags is a bitwise OR of the following:
953 : * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
954 : * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
955 : * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
956 : * ignoring checkpoint_completion_target parameter.
957 : * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
958 : * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
959 : * CHECKPOINT_END_OF_RECOVERY).
960 : * CHECKPOINT_WAIT: wait for completion before returning (otherwise,
961 : * just signal checkpointer to do it, and return).
962 : * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
963 : * (This affects logging, and in particular enables CheckPointWarning.)
964 : */
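/*
 * Editorial usage example: the CHECKPOINT SQL command elsewhere in the
 * tree issues roughly
 *
 *		RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT |
 *						  CHECKPOINT_FORCE);
 *
 * i.e. start at once, checkpoint even if idle, and block until done.
 */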
965 : void
966 8 : RequestCheckpoint(int flags)
967 : {
968 : int ntries;
969 : int old_failed,
970 : old_started;
971 :
972 : /*
973 : * If in a standalone backend, just do it ourselves.
974 : */
975 8 : if (!IsPostmasterEnvironment)
976 : {
977 : /*
978 : * There's no point in doing slow checkpoints in a standalone backend,
979 : * because there are no other backends the checkpoint could disrupt.
980 : */
981 4 : CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
982 :
983 : /*
984 : * After any checkpoint, close all smgr files. This is so we won't
985 : * hang onto smgr references to deleted files indefinitely.
986 : */
987 4 : smgrcloseall();
988 :
989 4 : return;
990 : }
991 :
992 : /*
993 : * Atomically set the request flags, and take a snapshot of the counters.
994 : * When we see ckpt_started > old_started, we know the flags we set here
995 : * have been seen by checkpointer.
996 : *
997 : * Note that we OR the flags with any existing flags, to avoid overriding
998 : * a "stronger" request by another backend. The flag senses must be
999 : * chosen to make this work!
1000 : */
1001 4 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1002 :
1003 4 : old_failed = CheckpointerShmem->ckpt_failed;
1004 4 : old_started = CheckpointerShmem->ckpt_started;
1005 4 : CheckpointerShmem->ckpt_flags |= flags;
1006 :
1007 4 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1008 :
1009 : /*
1010 : * Send signal to request checkpoint. It's possible that the checkpointer
1011 : * hasn't started yet, or is in the process of restarting, so we will retry a
1012 : * few times if needed. Also, if not told to wait for the checkpoint to
1013 : * occur, we consider failure to send the signal to be nonfatal and merely
1014 : * LOG it.
1015 : */
1016 4 : for (ntries = 0;; ntries++)
1017 : {
1018 4 : if (CheckpointerShmem->checkpointer_pid == 0)
1019 : {
1020 0 : if (ntries >= 20) /* max wait 2.0 sec */
1021 : {
1022 0 : elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
1023 : "could not request checkpoint because checkpointer not running");
1024 0 : break;
1025 : }
1026 : }
1027 4 : else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0)
1028 : {
1029 0 : if (ntries >= 20) /* max wait 2.0 sec */
1030 : {
1031 0 : elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
1032 : "could not signal for checkpoint: %m");
1033 0 : break;
1034 : }
1035 : }
1036 : else
1037 4 : break; /* signal sent successfully */
1038 :
1039 0 : CHECK_FOR_INTERRUPTS();
1040 0 : pg_usleep(100000L); /* wait 0.1 sec, then retry */
1041 0 : }
1042 :
1043 : /*
1044 : * If requested, wait for completion. We detect completion according to
1045 : * the algorithm given above.
1046 : */
1047 4 : if (flags & CHECKPOINT_WAIT)
1048 : {
1049 : int new_started,
1050 : new_failed;
1051 :
1052 : /* Wait for a new checkpoint to start. */
1053 : for (;;)
1054 : {
1055 7 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1056 7 : new_started = CheckpointerShmem->ckpt_started;
1057 7 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1058 :
1059 7 : if (new_started != old_started)
1060 4 : break;
1061 :
1062 3 : CHECK_FOR_INTERRUPTS();
1063 3 : pg_usleep(100000L);
1064 3 : }
1065 :
1066 : /*
1067 : * We are waiting for ckpt_done >= new_started, in a modulo sense.
1068 : */
1069 : for (;;)
1070 : {
1071 : int new_done;
1072 :
1073 6 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1074 6 : new_done = CheckpointerShmem->ckpt_done;
1075 6 : new_failed = CheckpointerShmem->ckpt_failed;
1076 6 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1077 :
1078 6 : if (new_done - new_started >= 0)
1079 4 : break;
1080 :
1081 2 : CHECK_FOR_INTERRUPTS();
1082 2 : pg_usleep(100000L);
1083 2 : }
1084 :
1085 4 : if (new_failed != old_failed)
1086 0 : ereport(ERROR,
1087 : (errmsg("checkpoint request failed"),
1088 : errhint("Consult recent messages in the server log for details.")));
1089 : }
1090 : }
1091 :
1092 : /*
1093 : * ForwardFsyncRequest
1094 : * Forward a file-fsync request from a backend to the checkpointer
1095 : *
1096 : * Whenever a backend is compelled to write directly to a relation
1097 : * (which should be seldom, if the background writer is getting its job done),
1098 : * the backend calls this routine to pass over knowledge that the relation
1099 : * is dirty and must be fsync'd before the next checkpoint. We also use this
1100 : * opportunity to count such writes for statistical purposes.
1101 : *
1102 : * This functionality is only supported for regular (not backend-local)
1103 : * relations, so the rnode argument is intentionally RelFileNode not
1104 : * RelFileNodeBackend.
1105 : *
1106 : * segno specifies which segment (not block!) of the relation needs to be
1107 : * fsync'd. (Since the valid range is much less than BlockNumber, we can
1108 : * use high values for special flags; that's all internal to md.c, which
1109 : * see for details.)
1110 : *
1111 : * To avoid holding the lock for longer than necessary, we normally write
1112 : * to the requests[] queue without checking for duplicates. The checkpointer
1113 : * will have to eliminate dups internally anyway. However, if we discover
1114 : * that the queue is full, we make a pass over the entire queue to compact
1115 : * it. This is somewhat expensive, but the alternative is for the backend
1116 : * to perform its own fsync, which is far more expensive in practice. It
1117 : * is theoretically possible a backend fsync might still be necessary, if
1118 : * the queue is full and contains no duplicate entries. In that case, we
1119 : * let the backend know by returning false.
1120 : */
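/*
 * Editorial caller sketch (cf. register_dirty_segment() in md.c): when
 * this returns false, the backend falls back to syncing the segment
 * itself, along the lines of
 *
 *		if (!ForwardFsyncRequest(rnode, forknum, segno))
 *			(void) FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC);
 */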
1121 : bool
1122 25902 : ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
1123 : {
1124 : CheckpointerRequest *request;
1125 : bool too_full;
1126 :
1127 25902 : if (!IsUnderPostmaster)
1128 0 : return false; /* probably shouldn't even get here */
1129 :
1130 25902 : if (AmCheckpointerProcess())
1131 0 : elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
1132 :
1133 25902 : LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
1134 :
1135 : /* Count all backend writes regardless of whether they fit in the queue */
1136 25902 : if (!AmBackgroundWriterProcess())
1137 25902 : CheckpointerShmem->num_backend_writes++;
1138 :
1139 : /*
1140 : * If the checkpointer isn't running or the request queue is full, the
1141 : * backend will have to perform its own fsync request. But before forcing
1142 : * that to happen, we can try to compact the request queue.
1143 : */
1144 51804 : if (CheckpointerShmem->checkpointer_pid == 0 ||
1145 25902 : (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
1146 0 : !CompactCheckpointerRequestQueue()))
1147 : {
1148 : /*
1149 : * Count the subset of writes where backends have to do their own
1150 : * fsync
1151 : */
1152 0 : if (!AmBackgroundWriterProcess())
1153 0 : CheckpointerShmem->num_backend_fsync++;
1154 0 : LWLockRelease(CheckpointerCommLock);
1155 0 : return false;
1156 : }
1157 :
1158 : /* OK, insert request */
1159 25902 : request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
1160 25902 : request->rnode = rnode;
1161 25902 : request->forknum = forknum;
1162 25902 : request->segno = segno;
1163 :
1164 : /* If queue is more than half full, nudge the checkpointer to empty it */
1165 51804 : too_full = (CheckpointerShmem->num_requests >=
1166 25902 : CheckpointerShmem->max_requests / 2);
1167 :
1168 25902 : LWLockRelease(CheckpointerCommLock);
1169 :
1170 : /* ... but not till after we release the lock */
1171 25902 : if (too_full && ProcGlobal->checkpointerLatch)
1172 5 : SetLatch(ProcGlobal->checkpointerLatch);
1173 :
1174 25902 : return true;
1175 : }
1176 :
1177 : /*
1178 : * CompactCheckpointerRequestQueue
1179 : * Remove duplicates from the request queue to avoid backend fsyncs.
1180 : * Returns "true" if any entries were removed.
1181 : *
1182 : * Although a full fsync request queue is not common, it can lead to severe
1183 : * performance problems when it does happen. So far, this situation has
1184 : * only been observed to occur when the system is under heavy write load,
1185 : * and especially during the "sync" phase of a checkpoint. Without this
1186 : * logic, each backend begins doing an fsync for every block written, which
1187 : * gets very expensive and can slow down the whole system.
1188 : *
1189 : * Trying to do this every time the queue is full could lose if there
1190 : * aren't any removable entries. But that should be vanishingly rare in
1191 : * practice: there's one queue entry per shared buffer.
1192 : */
1193 : static bool
1194 0 : CompactCheckpointerRequestQueue(void)
1195 : {
1196 : struct CheckpointerSlotMapping
1197 : {
1198 : CheckpointerRequest request;
1199 : int slot;
1200 : };
1201 :
1202 : int n,
1203 : preserve_count;
1204 0 : int num_skipped = 0;
1205 : HASHCTL ctl;
1206 : HTAB *htab;
1207 : bool *skip_slot;
1208 :
1209 : /* must hold CheckpointerCommLock in exclusive mode */
1210 0 : Assert(LWLockHeldByMe(CheckpointerCommLock));
1211 :
1212 : /* Initialize skip_slot array */
1213 0 : skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
1214 :
1215 : /* Initialize temporary hash table */
1216 0 : MemSet(&ctl, 0, sizeof(ctl));
1217 0 : ctl.keysize = sizeof(CheckpointerRequest);
1218 0 : ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
1219 0 : ctl.hcxt = CurrentMemoryContext;
1220 :
1221 0 : htab = hash_create("CompactCheckpointerRequestQueue",
1222 0 : CheckpointerShmem->num_requests,
1223 : &ctl,
1224 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1225 :
1226 : /*
1227 : * The basic idea here is that a request can be skipped if it's followed
1228 : * by a later, identical request. It might seem more sensible to work
1229 : * backwards from the end of the queue and check whether a request is
1230 : * *preceded* by an earlier, identical request, in the hopes of doing less
1231 : * copying. But that might change the semantics, if there's an
1232 : * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
1233 : * we do it this way. It would be possible to be even smarter if we made
1234 : * the code below understand the specific semantics of such requests (it
1235 : * could blow away preceding entries that would end up being canceled
1236 : * anyhow), but it's not clear that the extra complexity would buy us
1237 : * anything.
1238 : */
1239 0 : for (n = 0; n < CheckpointerShmem->num_requests; n++)
1240 : {
1241 : CheckpointerRequest *request;
1242 : struct CheckpointerSlotMapping *slotmap;
1243 : bool found;
1244 :
1245 : /*
1246 : * We use the request struct directly as a hashtable key. This
1247 : * assumes that any padding bytes in the structs are consistently the
1248 : * same, which should be okay because we zeroed them in
1249 : * CheckpointerShmemInit. Note also that RelFileNode had better
1250 : * contain no pad bytes.
1251 : */
1252 0 : request = &CheckpointerShmem->requests[n];
1253 0 : slotmap = hash_search(htab, request, HASH_ENTER, &found);
1254 0 : if (found)
1255 : {
1256 : /* Duplicate, so mark the previous occurrence as skippable */
1257 0 : skip_slot[slotmap->slot] = true;
1258 0 : num_skipped++;
1259 : }
1260 : /* Remember slot containing latest occurrence of this request value */
1261 0 : slotmap->slot = n;
1262 : }
1263 :
1264 : /* Done with the hash table. */
1265 0 : hash_destroy(htab);
1266 :
1267 : /* If no duplicates, we're out of luck. */
1268 0 : if (!num_skipped)
1269 : {
1270 0 : pfree(skip_slot);
1271 0 : return false;
1272 : }
1273 :
1274 : /* We found some duplicates; remove them. */
1275 0 : preserve_count = 0;
1276 0 : for (n = 0; n < CheckpointerShmem->num_requests; n++)
1277 : {
1278 0 : if (skip_slot[n])
1279 0 : continue;
1280 0 : CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n];
1281 : }
1282 0 : ereport(DEBUG1,
1283 : (errmsg("compacted fsync request queue from %d entries to %d entries",
1284 : CheckpointerShmem->num_requests, preserve_count)));
1285 0 : CheckpointerShmem->num_requests = preserve_count;
1286 :
1287 : /* Cleanup. */
1288 0 : pfree(skip_slot);
1289 0 : return true;
1290 : }
1291 :
1292 : /*
1293 : * AbsorbFsyncRequests
1294 : * Retrieve queued fsync requests and pass them to local smgr.
1295 : *
1296 : * This is exported because it must be called during CreateCheckPoint;
1297 : * we have to be sure we have accepted all pending requests just before
1298 : * we start fsync'ing. Since CreateCheckPoint sometimes runs in
1299 : * non-checkpointer processes, do nothing if not checkpointer.
1300 : */
1301 : void
1302 283 : AbsorbFsyncRequests(void)
1303 : {
1304 283 : CheckpointerRequest *requests = NULL;
1305 : CheckpointerRequest *request;
1306 : int n;
1307 :
1308 283 : if (!AmCheckpointerProcess())
1309 289 : return;
1310 :
1311 277 : LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
1312 :
1313 : /* Transfer stats counts into pending pgstats message */
1314 277 : BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes;
1315 277 : BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync;
1316 :
1317 277 : CheckpointerShmem->num_backend_writes = 0;
1318 277 : CheckpointerShmem->num_backend_fsync = 0;
1319 :
1320 : /*
1321 : * We try to avoid holding the lock for a long time by copying the request
1322 : * array, and processing the requests after releasing the lock.
1323 : *
1324 : * Once we have cleared the requests from shared memory, we have to PANIC
1325 : * if we then fail to absorb them (eg, because our hashtable runs out of
1326 : * memory). This is because the system cannot run safely if we are unable
1327 : * to fsync what we have been told to fsync. Fortunately, the hashtable
1328 : * is so small that the problem is quite unlikely to arise in practice.
1329 : */
1330 277 : n = CheckpointerShmem->num_requests;
1331 277 : if (n > 0)
1332 : {
1333 6 : requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
1334 6 : memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
1335 : }
1336 :
1337 277 : START_CRIT_SECTION();
1338 :
1339 277 : CheckpointerShmem->num_requests = 0;
1340 :
1341 277 : LWLockRelease(CheckpointerCommLock);
1342 :
1343 26179 : for (request = requests; n > 0; request++, n--)
1344 25902 : RememberFsyncRequest(request->rnode, request->forknum, request->segno);
1345 :
1346 277 : END_CRIT_SECTION();
1347 :
1348 277 : if (requests)
1349 6 : pfree(requests);
1350 : }
1351 :
1352 : /*
1353 : * Update any shared memory configurations based on config parameters
1354 : */
1355 : static void
1356 1 : UpdateSharedMemoryConfig(void)
1357 : {
1358 : /* update global shmem state for sync rep */
1359 1 : SyncRepUpdateSyncStandbysDefined();
1360 :
1361 : /*
1362 : * If full_page_writes has been changed by SIGHUP, we update it in shared
1363 : * memory and write an XLOG_FPW_CHANGE record.
1364 : */
1365 1 : UpdateFullPageWrites();
1366 :
1367 1 : elog(DEBUG2, "checkpointer updated shared memory configuration values");
1368 1 : }
1369 :
1370 : /*
1371 : * FirstCallSinceLastCheckpoint allows a process to take an action once
1372 : * per checkpoint cycle by asynchronously checking for checkpoint completion.
1373 : */
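/*
 * Editorial usage sketch: a long-lived process can piggy-back
 * once-per-checkpoint housekeeping on this test, e.g.
 *
 *		if (FirstCallSinceLastCheckpoint())
 *			smgrcloseall();
 *
 * which is how the background writer drops references to deleted files
 * once per checkpoint cycle (cf. BackgroundWriterMain() in bgwriter.c).
 */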
1374 : bool
1375 388 : FirstCallSinceLastCheckpoint(void)
1376 : {
1377 : static int ckpt_done = 0;
1378 : int new_done;
1379 388 : bool FirstCall = false;
1380 :
1381 388 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1382 388 : new_done = CheckpointerShmem->ckpt_done;
1383 388 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1384 :
1385 388 : if (new_done != ckpt_done)
1386 4 : FirstCall = true;
1387 :
1388 388 : ckpt_done = new_done;
1389 :
1390 388 : return FirstCall;
1391 : }