Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * ReleaseBuffer() -- unpin a buffer
23 : *
24 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25 : * The disk write is delayed until buffer replacement or checkpoint.
26 : *
27 : * See also these files:
28 : * freelist.c -- chooses victim for buffer replacement
29 : * buf_table.c -- manages the buffer lookup table
30 : */
31 : #include "postgres.h"
32 :
33 : #include <sys/file.h>
34 : #include <unistd.h>
35 :
36 : #include "access/xlog.h"
37 : #include "catalog/catalog.h"
38 : #include "catalog/storage.h"
39 : #include "executor/instrument.h"
40 : #include "lib/binaryheap.h"
41 : #include "miscadmin.h"
42 : #include "pg_trace.h"
43 : #include "pgstat.h"
44 : #include "postmaster/bgwriter.h"
45 : #include "storage/buf_internals.h"
46 : #include "storage/bufmgr.h"
47 : #include "storage/ipc.h"
48 : #include "storage/proc.h"
49 : #include "storage/smgr.h"
50 : #include "storage/standby.h"
51 : #include "utils/rel.h"
52 : #include "utils/resowner_private.h"
53 : #include "utils/timestamp.h"
54 :
55 :
56 : /* Note: these two macros only work on shared buffers, not local ones! */
57 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
58 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
59 :
60 : /* Note: this macro only works on local buffers, not shared ones! */
61 : #define LocalBufHdrGetBlock(bufHdr) \
62 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
63 :
64 : /* Bits in SyncOneBuffer's return value */
65 : #define BUF_WRITTEN 0x01
66 : #define BUF_REUSABLE 0x02
67 :
68 : #define DROP_RELS_BSEARCH_THRESHOLD 20
69 :
70 : typedef struct PrivateRefCountEntry
71 : {
72 : Buffer buffer;
73 : int32 refcount;
74 : } PrivateRefCountEntry;
75 :
76 : /* 64 bytes, about the size of a cache line on common systems */
77 : #define REFCOUNT_ARRAY_ENTRIES 8
78 :
79 : /*
80 : * Status of buffers to checkpoint for a particular tablespace, used
81 : * internally in BufferSync.
82 : */
83 : typedef struct CkptTsStatus
84 : {
85 : /* oid of the tablespace */
86 : Oid tsId;
87 :
88 : /*
89 : * Checkpoint progress for this tablespace. To make progress comparable
90 : * between tablespaces the progress is, for each tablespace, measured as a
91 : * number between 0 and the total number of to-be-checkpointed pages. Each
92 : * page checkpointed in this tablespace increments this space's progress
93 : * by progress_slice.
94 : */
95 : float8 progress;
96 : float8 progress_slice;
97 :
98 : /* number of to-be-checkpointed pages in this tablespace */
99 : int num_to_scan;
100 : /* already processed pages in this tablespace */
101 : int num_scanned;
102 :
103 : /* current offset in CkptBufferIds for this tablespace */
104 : int index;
105 : } CkptTsStatus;
106 :
107 : /* GUC variables */
108 : bool zero_damaged_pages = false;
109 : int bgwriter_lru_maxpages = 100;
110 : double bgwriter_lru_multiplier = 2.0;
111 : bool track_io_timing = false;
112 : int effective_io_concurrency = 0;
113 :
114 : /*
115 : * GUC variables about triggering kernel writeback for buffers written; OS
116 : * dependent defaults are set via the GUC mechanism.
117 : */
118 : int checkpoint_flush_after = 0;
119 : int bgwriter_flush_after = 0;
120 : int backend_flush_after = 0;
121 :
122 : /*
123 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
124 : * ReadBuffer calls by. This is maintained by the assign hook for
125 : * effective_io_concurrency. Zero means "never prefetch". This value is
126 : * only used for buffers not belonging to tablespaces that have their
127 : * effective_io_concurrency parameter set.
128 : */
129 : int target_prefetch_pages = 0;
130 :
131 : /* local state for StartBufferIO and related functions */
132 : static BufferDesc *InProgressBuf = NULL;
133 : static bool IsForInput;
134 :
135 : /* local state for LockBufferForCleanup */
136 : static BufferDesc *PinCountWaitBuf = NULL;
137 :
138 : /*
139 : * Backend-Private refcount management:
140 : *
141 : * Each buffer also has a private refcount that keeps track of the number of
142 : * times the buffer is pinned in the current process. This is so that the
143 : * shared refcount needs to be modified only once if a buffer is pinned more
144 : * than once by an individual backend. It's also used to check that no buffers
145 : * are still pinned at the end of transactions and when exiting.
146 : *
147 : *
148 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
149 : * track of local buffers, we use a small sequentially searched array
150 : * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
151 : * keep track of backend local pins.
152 : *
153 : * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
154 : * all refcounts are tracked in the array; after that, new array entries
155 : * displace old ones into the hash table. That way a frequently used entry
156 : * can't get "stuck" in the hashtable while infrequently used ones clog the array.
157 : *
158 : * Note that in most scenarios the number of pinned buffers will not exceed
159 : * REFCOUNT_ARRAY_ENTRIES.
160 : *
161 : *
162 : * To enter a buffer into the refcount tracking mechanism, first reserve a free
163 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
164 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
165 : * memory allocations in NewPrivateRefCountEntry(), which can be important
166 : * because in some scenarios it's called with a spinlock held...
167 : */
168 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
169 : static HTAB *PrivateRefCountHash = NULL;
170 : static int32 PrivateRefCountOverflowed = 0;
171 : static uint32 PrivateRefCountClock = 0;
172 : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
173 :
174 : static void ReservePrivateRefCountEntry(void);
175 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
176 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
177 : static inline int32 GetPrivateRefCount(Buffer buffer);
178 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
179 :
180 : /*
181 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
182 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
183 : * a new entry - but it's perfectly fine to not use a reserved entry.
184 : */
185 : static void
186 3077346 : ReservePrivateRefCountEntry(void)
187 : {
188 : /* Already reserved (or freed), nothing to do */
189 3077346 : if (ReservedRefCountEntry != NULL)
190 2698762 : return;
191 :
192 : /*
193 : * First search for a free entry in the array; that'll be sufficient in the
194 : * majority of cases.
195 : */
196 : {
197 : int i;
198 :
199 763559 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
200 : {
201 : PrivateRefCountEntry *res;
202 :
203 763490 : res = &PrivateRefCountArray[i];
204 :
205 763490 : if (res->buffer == InvalidBuffer)
206 : {
207 378515 : ReservedRefCountEntry = res;
208 378515 : return;
209 : }
210 : }
211 : }
212 :
213 : /*
214 : * No luck. All array entries are full. Move one array entry into the hash
215 : * table.
216 : */
217 : {
218 : /*
219 : * Move entry from the current clock position in the array into the
220 : * hashtable. Use that slot.
221 : */
222 : PrivateRefCountEntry *hashent;
223 : bool found;
224 :
225 : /* select victim slot */
226 69 : ReservedRefCountEntry =
227 69 : &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
228 :
229 : /* Better be used, otherwise we shouldn't get here. */
230 69 : Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
231 :
232 : /* enter victim array entry into hashtable */
233 69 : hashent = hash_search(PrivateRefCountHash,
234 69 : (void *) &(ReservedRefCountEntry->buffer),
235 : HASH_ENTER,
236 : &found);
237 69 : Assert(!found);
238 69 : hashent->refcount = ReservedRefCountEntry->refcount;
239 :
240 : /* clear the now free array slot */
241 69 : ReservedRefCountEntry->buffer = InvalidBuffer;
242 69 : ReservedRefCountEntry->refcount = 0;
243 :
244 69 : PrivateRefCountOverflowed++;
245 : }
246 : }
247 :
248 : /*
249 : * Fill a previously reserved refcount entry.
250 : */
251 : static PrivateRefCountEntry *
252 3064752 : NewPrivateRefCountEntry(Buffer buffer)
253 : {
254 : PrivateRefCountEntry *res;
255 :
256 : /* only allowed to be called when a reservation has been made */
257 3064752 : Assert(ReservedRefCountEntry != NULL);
258 :
259 : /* use up the reserved entry */
260 3064752 : res = ReservedRefCountEntry;
261 3064752 : ReservedRefCountEntry = NULL;
262 :
263 : /* and fill it */
264 3064752 : res->buffer = buffer;
265 3064752 : res->refcount = 0;
266 :
267 3064752 : return res;
268 : }
269 :
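/*
 * Illustrative sketch (hypothetical function, not part of bufmgr.c proper):
 * the reserve-then-fill protocol described above. The placeholder steps are
 * assumptions; PinBuffer() and PinBuffer_Locked() below show the real usage.
 */
static void
private_refcount_usage_sketch(Buffer b)
{
	PrivateRefCountEntry *ref;

	/* Reserve first: this may search or displace entries, so do it before
	 * any spinlock is taken. */
	ReservePrivateRefCountEntry();

	/* ... acquire the buffer header spinlock, bump the shared refcount,
	 * and release the spinlock here ... */

	/* Filling only consumes the reserved slot, so it is safe even while a
	 * spinlock is held. */
	ref = NewPrivateRefCountEntry(b);
	ref->refcount++;
}
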
270 : /*
271 : * Return the PrivateRefCount entry for the passed buffer.
272 : *
273 : * Returns NULL if the buffer doesn't have a refcount entry. Otherwise, if
274 : * do_move is true and the entry resides in the hashtable, the entry is
275 : * moved to the array to optimize it for frequent access.
276 : */
277 : static PrivateRefCountEntry *
278 22907824 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
279 : {
280 : PrivateRefCountEntry *res;
281 : int i;
282 :
283 22907824 : Assert(BufferIsValid(buffer));
284 22907824 : Assert(!BufferIsLocal(buffer));
285 :
286 : /*
287 : * First search for references in the array; that'll be sufficient in the
288 : * majority of cases.
289 : */
290 64206779 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
291 : {
292 61141818 : res = &PrivateRefCountArray[i];
293 :
294 61141818 : if (res->buffer == buffer)
295 19842863 : return res;
296 : }
297 :
298 : /*
299 : * By here we know that the buffer, if already pinned, isn't residing in
300 : * the array.
301 : *
302 : * Only look up the buffer in the hashtable if we've previously overflowed
303 : * into it.
304 : */
305 3064961 : if (PrivateRefCountOverflowed == 0)
306 3064455 : return NULL;
307 :
308 506 : res = hash_search(PrivateRefCountHash,
309 : (void *) &buffer,
310 : HASH_FIND,
311 : NULL);
312 :
313 506 : if (res == NULL)
314 297 : return NULL;
315 209 : else if (!do_move)
316 : {
317 : /* caller doesn't want us to move the hash entry into the array */
318 206 : return res;
319 : }
320 : else
321 : {
322 : /* move buffer from hashtable into the free array slot */
323 : bool found;
324 : PrivateRefCountEntry *free;
325 :
326 : /* Ensure there's a free array slot */
327 3 : ReservePrivateRefCountEntry();
328 :
329 : /* Use up the reserved slot */
330 3 : Assert(ReservedRefCountEntry != NULL);
331 3 : free = ReservedRefCountEntry;
332 3 : ReservedRefCountEntry = NULL;
333 3 : Assert(free->buffer == InvalidBuffer);
334 :
335 : /* and fill it */
336 3 : free->buffer = buffer;
337 3 : free->refcount = res->refcount;
338 :
339 : /* delete from hashtable */
340 3 : hash_search(PrivateRefCountHash,
341 : (void *) &buffer,
342 : HASH_REMOVE,
343 : &found);
344 3 : Assert(found);
345 3 : Assert(PrivateRefCountOverflowed > 0);
346 3 : PrivateRefCountOverflowed--;
347 :
348 3 : return free;
349 : }
350 : }
351 :
352 : /*
353 : * Returns how many times the passed buffer is pinned by this backend.
354 : *
355 : * Only works for shared memory buffers!
356 : */
357 : static inline int32
358 15833454 : GetPrivateRefCount(Buffer buffer)
359 : {
360 : PrivateRefCountEntry *ref;
361 :
362 15833454 : Assert(BufferIsValid(buffer));
363 15833454 : Assert(!BufferIsLocal(buffer));
364 :
365 : /*
366 : * Not moving the entry - that's ok for the current users, but we might
367 : * want to change this one day.
368 : */
369 15833454 : ref = GetPrivateRefCountEntry(buffer, false);
370 :
371 15833454 : if (ref == NULL)
372 0 : return 0;
373 15833454 : return ref->refcount;
374 : }
375 :
376 : /*
377 : * Release resources used to track the reference count of a buffer which we no
378 : * longer have pinned and don't want to pin again immediately.
379 : */
380 : static void
381 3064752 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
382 : {
383 3064752 : Assert(ref->refcount == 0);
384 :
385 3064752 : if (ref >= &PrivateRefCountArray[0] &&
386 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
387 : {
388 3064686 : ref->buffer = InvalidBuffer;
389 :
390 : /*
391 : * Mark the just used entry as reserved - in many scenarios that
392 : * allows us to avoid ever having to search the array/hash for free
393 : * entries.
394 : */
395 3064686 : ReservedRefCountEntry = ref;
396 : }
397 : else
398 : {
399 : bool found;
400 66 : Buffer buffer = ref->buffer;
401 :
402 66 : hash_search(PrivateRefCountHash,
403 : (void *) &buffer,
404 : HASH_REMOVE,
405 : &found);
406 66 : Assert(found);
407 66 : Assert(PrivateRefCountOverflowed > 0);
408 66 : PrivateRefCountOverflowed--;
409 : }
410 3064752 : }
411 :
412 : /*
413 : * BufferIsPinned
414 : * True iff the buffer is pinned (also checks for valid buffer number).
415 : *
416 : * NOTE: what we check here is that *this* backend holds a pin on
417 : * the buffer. We do not care whether some other backend does.
418 : */
419 : #define BufferIsPinned(bufnum) \
420 : ( \
421 : !BufferIsValid(bufnum) ? \
422 : false \
423 : : \
424 : BufferIsLocal(bufnum) ? \
425 : (LocalRefCount[-(bufnum) - 1] > 0) \
426 : : \
427 : (GetPrivateRefCount(bufnum) > 0) \
428 : )
429 :
430 :
431 : static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
432 : ForkNumber forkNum, BlockNumber blockNum,
433 : ReadBufferMode mode, BufferAccessStrategy strategy,
434 : bool *hit);
435 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
436 : static void PinBuffer_Locked(BufferDesc *buf);
437 : static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
438 : static void BufferSync(int flags);
439 : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
440 : static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context);
441 : static void WaitIO(BufferDesc *buf);
442 : static bool StartBufferIO(BufferDesc *buf, bool forInput);
443 : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
444 : uint32 set_flag_bits);
445 : static void shared_buffer_write_error_callback(void *arg);
446 : static void local_buffer_write_error_callback(void *arg);
447 : static BufferDesc *BufferAlloc(SMgrRelation smgr,
448 : char relpersistence,
449 : ForkNumber forkNum,
450 : BlockNumber blockNum,
451 : BufferAccessStrategy strategy,
452 : bool *foundPtr);
453 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
454 : static void AtProcExit_Buffers(int code, Datum arg);
455 : static void CheckForBufferLeaks(void);
456 : static int rnode_comparator(const void *p1, const void *p2);
457 : static int buffertag_comparator(const void *p1, const void *p2);
458 : static int ckpt_buforder_comparator(const void *pa, const void *pb);
459 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
460 :
461 :
462 : /*
463 : * ComputeIoConcurrency -- get the number of pages to prefetch for a given
464 : * number of spindles.
465 : */
466 : bool
467 50 : ComputeIoConcurrency(int io_concurrency, double *target)
468 : {
469 50 : double new_prefetch_pages = 0.0;
470 : int i;
471 :
472 : /*
473 : * Make sure the io_concurrency value is within valid range; it may have
474 : * been forced with a manual pg_tablespace update.
475 : */
476 50 : io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
477 :
478 : /*----------
479 : * The user-visible GUC parameter is the number of drives (spindles),
480 : * which we need to translate to a number-of-pages-to-prefetch target.
481 : * The target value is stashed in *extra and then assigned to the actual
482 : * variable by assign_effective_io_concurrency.
483 : *
484 : * The expected number of prefetch pages needed to keep N drives busy is:
485 : *
486 : * drives | I/O requests
487 : * -------+----------------
488 : * 1 | 1
489 : * 2 | 2/1 + 2/2 = 3
490 : * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
491 : * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
492 : * n | n * H(n)
493 : *
494 : * This is called the "coupon collector problem" and H(n) is called the
495 : * harmonic series. This could be approximated by n * ln(n), but for
496 : * reasonable numbers of drives we might as well just compute the series.
497 : *
498 : * Alternatively we could set the target to the number of pages necessary
499 : * so that the expected number of active spindles is some arbitrary
500 : * percentage of the total. This sounds the same but is actually slightly
501 : * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
502 : * that desired fraction.
503 : *
504 : * Experimental results show that neither of these formulas is aggressive
505 : * enough, but we don't really have any better proposals.
506 : *
507 : * Note that if io_concurrency = 0 (disabled), we must set target = 0.
508 : *----------
509 : */
510 :
511 2305 : for (i = 1; i <= io_concurrency; i++)
512 2255 : new_prefetch_pages += (double) io_concurrency / (double) i;
513 :
514 50 : *target = new_prefetch_pages;
515 :
516 : /* This range check shouldn't fail, but let's be paranoid */
517 50 : return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
518 : }
519 :
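/*
 * Illustrative sketch (hypothetical function, not part of bufmgr.c proper):
 * how a caller might turn the spindle-count setting into the page-count
 * target computed above. The real assignment happens through the
 * effective_io_concurrency GUC hooks; the rounding here is an assumption.
 */
static void
apply_io_concurrency_sketch(int io_concurrency)
{
	double		target;

	/* e.g. io_concurrency = 4 yields 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3 pages */
	if (ComputeIoConcurrency(io_concurrency, &target))
		target_prefetch_pages = (int) (target + 0.5);	/* round to nearest */
}
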
520 : /*
521 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
522 : *
523 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
524 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
525 : * block will not be delayed by the I/O. Prefetching is optional.
526 : * No-op if prefetching isn't compiled in.
527 : */
528 : void
529 27377 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
530 : {
531 : #ifdef USE_PREFETCH
532 27377 : Assert(RelationIsValid(reln));
533 27377 : Assert(BlockNumberIsValid(blockNum));
534 :
535 : /* Open it at the smgr level if not already done */
536 27377 : RelationOpenSmgr(reln);
537 :
538 27377 : if (RelationUsesLocalBuffers(reln))
539 : {
540 : /* see comments in ReadBufferExtended */
541 5 : if (RELATION_IS_OTHER_TEMP(reln))
542 0 : ereport(ERROR,
543 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
544 : errmsg("cannot access temporary tables of other sessions")));
545 :
546 : /* pass it off to localbuf.c */
547 5 : LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
548 : }
549 : else
550 : {
551 : BufferTag newTag; /* identity of requested block */
552 : uint32 newHash; /* hash value for newTag */
553 : LWLock *newPartitionLock; /* buffer partition lock for it */
554 : int buf_id;
555 :
556 : /* create a tag so we can lookup the buffer */
557 27372 : INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
558 : forkNum, blockNum);
559 :
560 : /* determine its hash code and partition lock ID */
561 27372 : newHash = BufTableHashCode(&newTag);
562 27372 : newPartitionLock = BufMappingPartitionLock(newHash);
563 :
564 : /* see if the block is in the buffer pool already */
565 27372 : LWLockAcquire(newPartitionLock, LW_SHARED);
566 27372 : buf_id = BufTableLookup(&newTag, newHash);
567 27372 : LWLockRelease(newPartitionLock);
568 :
569 : /* If not in buffers, initiate prefetch */
570 27372 : if (buf_id < 0)
571 0 : smgrprefetch(reln->rd_smgr, forkNum, blockNum);
572 :
573 : /*
574 : * If the block *is* in buffers, we do nothing. This is not really
575 : * ideal: the block might be just about to be evicted, which would be
576 : * stupid since we know we are going to need it soon. But the only
577 : * easy answer is to bump the usage_count, which does not seem like a
578 : * great solution: when the caller does ultimately touch the block,
579 : * usage_count would get bumped again, resulting in too much
580 : * favoritism for blocks that are involved in a prefetch sequence. A
581 : * real fix would involve some additional per-buffer state, and it's
582 : * not clear that there's enough of a problem to justify that.
583 : */
584 : }
585 : #endif /* USE_PREFETCH */
586 27377 : }
587 :
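/*
 * Illustrative sketch (hypothetical function, not part of bufmgr.c proper):
 * the intended PrefetchBuffer usage pattern, issuing prefetch requests a few
 * blocks ahead of the corresponding ReadBuffer calls. The fixed distance is
 * illustrative only; real callers derive it from target_prefetch_pages or
 * the tablespace's effective_io_concurrency setting.
 */
static void
prefetching_scan_sketch(Relation rel, BlockNumber nblocks)
{
	BlockNumber blkno;
	BlockNumber distance = 8;	/* illustrative prefetch distance */

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* stay ahead of the reads, but don't run past the relation's end */
		if (blkno + distance < nblocks)
			PrefetchBuffer(rel, MAIN_FORKNUM, blkno + distance);

		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine the page here ... */
		UnlockReleaseBuffer(buf);
	}
}
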
588 :
589 : /*
590 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
591 : * fork with RBM_NORMAL mode and default strategy.
592 : */
593 : Buffer
594 3079037 : ReadBuffer(Relation reln, BlockNumber blockNum)
595 : {
596 3079037 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
597 : }
598 :
599 : /*
600 : * ReadBufferExtended -- returns a buffer containing the requested
601 : * block of the requested relation. If the blknum
602 : * requested is P_NEW, extend the relation file and
603 : * allocate a new block. (Caller is responsible for
604 : * ensuring that only one backend tries to extend a
605 : * relation at the same time!)
606 : *
607 : * Returns: the buffer number for the buffer containing
608 : * the block read. The returned buffer has been pinned.
609 : * Does not return on error --- elog's instead.
610 : *
611 : * Assumes that reln has already been opened when this function is called.
612 : *
613 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
614 : * validated. An error is thrown if the page header is not valid. (But
615 : * note that an all-zero page is considered "valid"; see PageIsVerified().)
616 : *
617 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
618 : * valid, the page is zeroed instead of throwing an error. This is intended
619 : * for non-critical data, where the caller is prepared to repair errors.
620 : *
621 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
622 : * filled with zeros instead of reading it from disk. Useful when the caller
623 : * is going to fill the page from scratch, since this saves I/O and avoids
624 : * unnecessary failure if the page-on-disk has corrupt page headers.
625 : * The page is returned locked to ensure that the caller has a chance to
626 : * initialize the page before it's made visible to others.
627 : * Caution: do not use this mode to read a page that is beyond the relation's
628 : * current physical EOF; that is likely to cause problems in md.c when
629 : * the page is modified and written out. P_NEW is OK, though.
630 : *
631 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
632 : * a cleanup-strength lock on the page.
633 : *
634 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
635 : *
636 : * If strategy is not NULL, a nondefault buffer access strategy is used.
637 : * See buffer/README for details.
638 : */
639 : Buffer
640 3240308 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
641 : ReadBufferMode mode, BufferAccessStrategy strategy)
642 : {
643 : bool hit;
644 : Buffer buf;
645 :
646 : /* Open it at the smgr level if not already done */
647 3240308 : RelationOpenSmgr(reln);
648 :
649 : /*
650 : * Reject attempts to read non-local temporary relations; we would be
651 : * likely to get wrong data since we have no visibility into the owning
652 : * session's local buffers.
653 : */
654 3240308 : if (RELATION_IS_OTHER_TEMP(reln))
655 0 : ereport(ERROR,
656 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
657 : errmsg("cannot access temporary tables of other sessions")));
658 :
659 : /*
660 : * Read the buffer, and update pgstat counters to reflect a cache hit or
661 : * miss.
662 : */
663 3240308 : pgstat_count_buffer_read(reln);
664 3240308 : buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
665 : forkNum, blockNum, mode, strategy, &hit);
666 3240308 : if (hit)
667 3221758 : pgstat_count_buffer_hit(reln);
668 3240308 : return buf;
669 : }
670 :
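/*
 * Illustrative sketch (hypothetical function, not part of bufmgr.c proper):
 * the relation-extension pattern the comment above describes. The required
 * relation extension lock, page initialization, and WAL logging are elided;
 * real callers such as the heap access method handle those.
 */
static Buffer
extend_relation_sketch(Relation rel)
{
	Buffer		buf;

	/* P_NEW asks ReadBuffer to allocate and zero-fill a new block at EOF */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* ... PageInit() and initial tuple placement would go here ... */

	return buf;			/* still pinned and exclusively locked */
}
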
671 :
672 : /*
673 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
674 : * a relcache entry for the relation.
675 : *
676 : * NB: At present, this function may only be used on permanent relations, which
677 : * is OK, because we only use it during XLOG replay. If in the future we
678 : * want to use it on temporary or unlogged relations, we could pass additional
679 : * parameters.
680 : */
681 : Buffer
682 0 : ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
683 : BlockNumber blockNum, ReadBufferMode mode,
684 : BufferAccessStrategy strategy)
685 : {
686 : bool hit;
687 :
688 0 : SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
689 :
690 0 : Assert(InRecovery);
691 :
692 0 : return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
693 : mode, strategy, &hit);
694 : }
695 :
696 :
697 : /*
698 : * ReadBuffer_common -- common logic for all ReadBuffer variants
699 : *
700 : * *hit is set to true if the request was satisfied from shared buffer cache.
701 : */
702 : static Buffer
703 3240308 : ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
704 : BlockNumber blockNum, ReadBufferMode mode,
705 : BufferAccessStrategy strategy, bool *hit)
706 : {
707 : BufferDesc *bufHdr;
708 : Block bufBlock;
709 : bool found;
710 : bool isExtend;
711 3240308 : bool isLocalBuf = SmgrIsTemp(smgr);
712 :
713 3240308 : *hit = false;
714 :
715 : /* Make sure we will have room to remember the buffer pin */
716 3240308 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
717 :
718 3240308 : isExtend = (blockNum == P_NEW);
719 :
720 : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
721 : smgr->smgr_rnode.node.spcNode,
722 : smgr->smgr_rnode.node.dbNode,
723 : smgr->smgr_rnode.node.relNode,
724 : smgr->smgr_rnode.backend,
725 : isExtend);
726 :
727 : /* Substitute proper block number if caller asked for P_NEW */
728 3240308 : if (isExtend)
729 14050 : blockNum = smgrnblocks(smgr, forkNum);
730 :
731 3240308 : if (isLocalBuf)
732 : {
733 37373 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
734 37373 : if (found)
735 36028 : pgBufferUsage.local_blks_hit++;
736 : else
737 1345 : pgBufferUsage.local_blks_read++;
738 : }
739 : else
740 : {
741 : /*
742 : * look up the buffer. IO_IN_PROGRESS is set if the requested block is
743 : * not currently in memory.
744 : */
745 3202935 : bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
746 : strategy, &found);
747 3202935 : if (found)
748 3185730 : pgBufferUsage.shared_blks_hit++;
749 : else
750 17205 : pgBufferUsage.shared_blks_read++;
751 : }
752 :
753 : /* At this point we do NOT hold any locks. */
754 :
755 : /* if it was already in the buffer pool, we're done */
756 3240308 : if (found)
757 : {
758 3221758 : if (!isExtend)
759 : {
760 : /* Just need to update stats before we exit */
761 3221758 : *hit = true;
762 3221758 : VacuumPageHit++;
763 :
764 3221758 : if (VacuumCostActive)
765 13027 : VacuumCostBalance += VacuumCostPageHit;
766 :
767 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
768 : smgr->smgr_rnode.node.spcNode,
769 : smgr->smgr_rnode.node.dbNode,
770 : smgr->smgr_rnode.node.relNode,
771 : smgr->smgr_rnode.backend,
772 : isExtend,
773 : found);
774 :
775 : /*
776 : * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
777 : * locked on return.
778 : */
779 3221758 : if (!isLocalBuf)
780 : {
781 3185730 : if (mode == RBM_ZERO_AND_LOCK)
782 8 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
783 : LW_EXCLUSIVE);
784 3185722 : else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
785 0 : LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
786 : }
787 :
788 3221758 : return BufferDescriptorGetBuffer(bufHdr);
789 : }
790 :
791 : /*
792 : * We get here only in the corner case where we are trying to extend
793 : * the relation but we found a pre-existing buffer marked BM_VALID.
794 : * This can happen because mdread doesn't complain about reads beyond
795 : * EOF (when zero_damaged_pages is ON) and so a previous attempt to
796 : * read a block beyond EOF could have left a "valid" zero-filled
797 : * buffer. Unfortunately, we have also seen this case occurring
798 : * because of buggy Linux kernels that sometimes return an
799 : * lseek(SEEK_END) result that doesn't account for a recent write. In
800 : * that situation, the pre-existing buffer would contain valid data
801 : * that we don't want to overwrite. Since the legitimate case should
802 : * always have left a zero-filled buffer, complain if not PageIsNew.
803 : */
804 0 : bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
805 0 : if (!PageIsNew((Page) bufBlock))
806 0 : ereport(ERROR,
807 : (errmsg("unexpected data beyond EOF in block %u of relation %s",
808 : blockNum, relpath(smgr->smgr_rnode, forkNum)),
809 : errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
810 :
811 : /*
812 : * We *must* do smgrextend before succeeding, else the page will not
813 : * be reserved by the kernel, and the next P_NEW call will decide to
814 : * return the same page. Clear the BM_VALID bit, do the StartBufferIO
815 : * call that BufferAlloc didn't, and proceed.
816 : */
817 0 : if (isLocalBuf)
818 : {
819 : /* Only need to adjust flags */
820 0 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
821 :
822 0 : Assert(buf_state & BM_VALID);
823 0 : buf_state &= ~BM_VALID;
824 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
825 : }
826 : else
827 : {
828 : /*
829 : * Loop to handle the very small possibility that someone re-sets
830 : * BM_VALID between our clearing it and StartBufferIO inspecting
831 : * it.
832 : */
833 : do
834 : {
835 0 : uint32 buf_state = LockBufHdr(bufHdr);
836 :
837 0 : Assert(buf_state & BM_VALID);
838 0 : buf_state &= ~BM_VALID;
839 0 : UnlockBufHdr(bufHdr, buf_state);
840 0 : } while (!StartBufferIO(bufHdr, true));
841 : }
842 : }
843 :
844 : /*
845 : * if we have gotten to this point, we have allocated a buffer for the
846 : * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
847 : * if it's a shared buffer.
848 : *
849 : * Note: if smgrextend fails, we will end up with a buffer that is
850 : * allocated but not marked BM_VALID. P_NEW will still select the same
851 : * block number (because the relation didn't get any longer on disk) and
852 : * so future attempts to extend the relation will find the same buffer (if
853 : * it's not been recycled) but come right back here to try smgrextend
854 : * again.
855 : */
856 18550 : Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
857 :
858 18550 : bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
859 :
860 18550 : if (isExtend)
861 : {
862 : /* new buffers are zero-filled */
863 14050 : MemSet((char *) bufBlock, 0, BLCKSZ);
864 : /* don't set checksum for all-zero page */
865 14050 : smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
866 :
867 : /*
868 : * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
869 : * although we're essentially performing a write. At least on linux
870 : * doing so defeats the 'delayed allocation' mechanism, leading to
871 : * increased file fragmentation.
872 : */
873 : }
874 : else
875 : {
876 : /*
877 : * Read in the page, unless the caller intends to overwrite it and
878 : * just wants us to allocate a buffer.
879 : */
880 4500 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
881 73 : MemSet((char *) bufBlock, 0, BLCKSZ);
882 : else
883 : {
884 : instr_time io_start,
885 : io_time;
886 :
887 4427 : if (track_io_timing)
888 0 : INSTR_TIME_SET_CURRENT(io_start);
889 :
890 4427 : smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
891 :
892 4427 : if (track_io_timing)
893 : {
894 0 : INSTR_TIME_SET_CURRENT(io_time);
895 0 : INSTR_TIME_SUBTRACT(io_time, io_start);
896 0 : pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
897 0 : INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
898 : }
899 :
900 : /* check for garbage data */
901 4427 : if (!PageIsVerified((Page) bufBlock, blockNum))
902 : {
903 0 : if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
904 : {
905 0 : ereport(WARNING,
906 : (errcode(ERRCODE_DATA_CORRUPTED),
907 : errmsg("invalid page in block %u of relation %s; zeroing out page",
908 : blockNum,
909 : relpath(smgr->smgr_rnode, forkNum))));
910 0 : MemSet((char *) bufBlock, 0, BLCKSZ);
911 : }
912 : else
913 0 : ereport(ERROR,
914 : (errcode(ERRCODE_DATA_CORRUPTED),
915 : errmsg("invalid page in block %u of relation %s",
916 : blockNum,
917 : relpath(smgr->smgr_rnode, forkNum))));
918 : }
919 : }
920 : }
921 :
922 : /*
923 : * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
924 : * the page as valid, to make sure that no other backend sees the zeroed
925 : * page before the caller has had a chance to initialize it.
926 : *
927 : * Since no-one else can be looking at the page contents yet, there is no
928 : * difference between an exclusive lock and a cleanup-strength lock. (Note
929 : * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
930 : * they assert that the buffer is already valid.)
931 : */
932 18550 : if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
933 : !isLocalBuf)
934 : {
935 73 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
936 : }
937 :
938 18550 : if (isLocalBuf)
939 : {
940 : /* Only need to adjust flags */
941 1345 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
942 :
943 1345 : buf_state |= BM_VALID;
944 1345 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
945 : }
946 : else
947 : {
948 : /* Set BM_VALID, terminate IO, and wake up any waiters */
949 17205 : TerminateBufferIO(bufHdr, false, BM_VALID);
950 : }
951 :
952 18550 : VacuumPageMiss++;
953 18550 : if (VacuumCostActive)
954 120 : VacuumCostBalance += VacuumCostPageMiss;
955 :
956 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
957 : smgr->smgr_rnode.node.spcNode,
958 : smgr->smgr_rnode.node.dbNode,
959 : smgr->smgr_rnode.node.relNode,
960 : smgr->smgr_rnode.backend,
961 : isExtend,
962 : found);
963 :
964 18550 : return BufferDescriptorGetBuffer(bufHdr);
965 : }
966 :
967 : /*
968 : * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
969 : * buffer. If no buffer exists already, selects a replacement
970 : * victim and evicts the old page, but does NOT read in new page.
971 : *
972 : * "strategy" can be a buffer replacement strategy object, or NULL for
973 : * the default strategy. The selected buffer's usage_count is advanced when
974 : * using the default strategy, but otherwise possibly not (see PinBuffer).
975 : *
976 : * The returned buffer is pinned and is already marked as holding the
977 : * desired page. If it already did have the desired page, *foundPtr is
978 : * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
979 : * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
980 : *
981 : * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
982 : * we keep it for simplicity in ReadBuffer.
983 : *
984 : * No locks are held either at entry or exit.
985 : */
986 : static BufferDesc *
987 3202935 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
988 : BlockNumber blockNum,
989 : BufferAccessStrategy strategy,
990 : bool *foundPtr)
991 : {
992 : BufferTag newTag; /* identity of requested block */
993 : uint32 newHash; /* hash value for newTag */
994 : LWLock *newPartitionLock; /* buffer partition lock for it */
995 : BufferTag oldTag; /* previous identity of selected buffer */
996 : uint32 oldHash; /* hash value for oldTag */
997 : LWLock *oldPartitionLock; /* buffer partition lock for it */
998 : uint32 oldFlags;
999 : int buf_id;
1000 : BufferDesc *buf;
1001 : bool valid;
1002 : uint32 buf_state;
1003 :
1004 : /* create a tag so we can lookup the buffer */
1005 3202935 : INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1006 :
1007 : /* determine its hash code and partition lock ID */
1008 3202935 : newHash = BufTableHashCode(&newTag);
1009 3202935 : newPartitionLock = BufMappingPartitionLock(newHash);
1010 :
1011 : /* see if the block is in the buffer pool already */
1012 3202935 : LWLockAcquire(newPartitionLock, LW_SHARED);
1013 3202935 : buf_id = BufTableLookup(&newTag, newHash);
1014 3202935 : if (buf_id >= 0)
1015 : {
1016 : /*
1017 : * Found it. Now, pin the buffer so no one can steal it from the
1018 : * buffer pool, and check to see if the correct data has been loaded
1019 : * into the buffer.
1020 : */
1021 3185730 : buf = GetBufferDescriptor(buf_id);
1022 :
1023 3185730 : valid = PinBuffer(buf, strategy);
1024 :
1025 : /* Can release the mapping lock as soon as we've pinned it */
1026 3185730 : LWLockRelease(newPartitionLock);
1027 :
1028 3185730 : *foundPtr = TRUE;
1029 :
1030 3185730 : if (!valid)
1031 : {
1032 : /*
1033 : * We can only get here if (a) someone else is still reading in
1034 : * the page, or (b) a previous read attempt failed. We have to
1035 : * wait for any active read attempt to finish, and then set up our
1036 : * own read attempt if the page is still not BM_VALID.
1037 : * StartBufferIO does it all.
1038 : */
1039 0 : if (StartBufferIO(buf, true))
1040 : {
1041 : /*
1042 : * If we get here, previous attempts to read the buffer must
1043 : * have failed ... but we shall bravely try again.
1044 : */
1045 0 : *foundPtr = FALSE;
1046 : }
1047 : }
1048 :
1049 3185730 : return buf;
1050 : }
1051 :
1052 : /*
1053 : * Didn't find it in the buffer pool. We'll have to initialize a new
1054 : * buffer. Remember to unlock the mapping lock while doing the work.
1055 : */
1056 17205 : LWLockRelease(newPartitionLock);
1057 :
1058 : /* Loop here in case we have to try another victim buffer */
1059 : for (;;)
1060 : {
1061 : /*
1062 : * Ensure, while the spinlock's not yet held, that there's a free
1063 : * refcount entry.
1064 : */
1065 17205 : ReservePrivateRefCountEntry();
1066 :
1067 : /*
1068 : * Select a victim buffer. The buffer is returned with its header
1069 : * spinlock still held!
1070 : */
1071 17205 : buf = StrategyGetBuffer(strategy, &buf_state);
1072 :
1073 17205 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1074 :
1075 : /* Must copy buffer flags while we still hold the spinlock */
1076 17205 : oldFlags = buf_state & BUF_FLAG_MASK;
1077 :
1078 : /* Pin the buffer and then release the buffer spinlock */
1079 17205 : PinBuffer_Locked(buf);
1080 :
1081 : /*
1082 : * If the buffer was dirty, try to write it out. There is a race
1083 : * condition here, in that someone might dirty it after we released it
1084 : * above, or even while we are writing it out (since our share-lock
1085 : * won't prevent hint-bit updates). We will recheck the dirty bit
1086 : * after re-locking the buffer header.
1087 : */
1088 17205 : if (oldFlags & BM_DIRTY)
1089 : {
1090 : /*
1091 : * We need a share-lock on the buffer contents to write it out
1092 : * (else we might write invalid data, eg because someone else is
1093 : * compacting the page contents while we write). We must use a
1094 : * conditional lock acquisition here to avoid deadlock. Even
1095 : * though the buffer was not pinned (and therefore surely not
1096 : * locked) when StrategyGetBuffer returned it, someone else could
1097 : * have pinned and exclusive-locked it by the time we get here. If
1098 : * we try to get the lock unconditionally, we'd block waiting for
1099 : * them; if they later block waiting for us, deadlock ensues.
1100 : * (This has been observed to happen when two backends are both
1101 : * trying to split btree index pages, and the second one just
1102 : * happens to be trying to split the page the first one got from
1103 : * StrategyGetBuffer.)
1104 : */
1105 26 : if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1106 : LW_SHARED))
1107 : {
1108 : /*
1109 : * If using a nondefault strategy, and writing the buffer
1110 : * would require a WAL flush, let the strategy decide whether
1111 : * to go ahead and write/reuse the buffer or to choose another
1112 : * victim. We need lock to inspect the page LSN, so this
1113 : * can't be done inside StrategyGetBuffer.
1114 : */
1115 26 : if (strategy != NULL)
1116 : {
1117 : XLogRecPtr lsn;
1118 :
1119 : /* Read the LSN while holding buffer header lock */
1120 26 : buf_state = LockBufHdr(buf);
1121 26 : lsn = BufferGetLSN(buf);
1122 26 : UnlockBufHdr(buf, buf_state);
1123 :
1124 34 : if (XLogNeedsFlush(lsn) &&
1125 8 : StrategyRejectBuffer(strategy, buf))
1126 : {
1127 : /* Drop lock/pin and loop around for another buffer */
1128 0 : LWLockRelease(BufferDescriptorGetContentLock(buf));
1129 0 : UnpinBuffer(buf, true);
1130 0 : continue;
1131 : }
1132 : }
1133 :
1134 : /* OK, do the I/O */
1135 : TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1136 : smgr->smgr_rnode.node.spcNode,
1137 : smgr->smgr_rnode.node.dbNode,
1138 : smgr->smgr_rnode.node.relNode);
1139 :
1140 26 : FlushBuffer(buf, NULL);
1141 26 : LWLockRelease(BufferDescriptorGetContentLock(buf));
1142 :
1143 26 : ScheduleBufferTagForWriteback(&BackendWritebackContext,
1144 : &buf->tag);
1145 :
1146 : TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1147 : smgr->smgr_rnode.node.spcNode,
1148 : smgr->smgr_rnode.node.dbNode,
1149 : smgr->smgr_rnode.node.relNode);
1150 : }
1151 : else
1152 : {
1153 : /*
1154 : * Someone else has locked the buffer, so give it up and loop
1155 : * back to get another one.
1156 : */
1157 0 : UnpinBuffer(buf, true);
1158 0 : continue;
1159 : }
1160 : }
1161 :
1162 : /*
1163 : * To change the association of a valid buffer, we'll need to have
1164 : * exclusive lock on both the old and new mapping partitions.
1165 : */
1166 17205 : if (oldFlags & BM_TAG_VALID)
1167 : {
1168 : /*
1169 : * Need to compute the old tag's hashcode and partition lock ID.
1170 : * XXX is it worth storing the hashcode in BufferDesc so we need
1171 : * not recompute it here? Probably not.
1172 : */
1173 553 : oldTag = buf->tag;
1174 553 : oldHash = BufTableHashCode(&oldTag);
1175 553 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1176 :
1177 : /*
1178 : * Must lock the lower-numbered partition first to avoid
1179 : * deadlocks.
1180 : */
1181 553 : if (oldPartitionLock < newPartitionLock)
1182 : {
1183 274 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1184 274 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1185 : }
1186 279 : else if (oldPartitionLock > newPartitionLock)
1187 : {
1188 276 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1189 276 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1190 : }
1191 : else
1192 : {
1193 : /* only one partition, only one lock */
1194 3 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1195 : }
1196 : }
1197 : else
1198 : {
1199 : /* if it wasn't valid, we need only the new partition */
1200 16652 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1201 : /* remember we have no old-partition lock or tag */
1202 16652 : oldPartitionLock = NULL;
1203 : /* this just keeps the compiler quiet about uninit variables */
1204 16652 : oldHash = 0;
1205 : }
1206 :
1207 : /*
1208 : * Try to make a hashtable entry for the buffer under its new tag.
1209 : * This could fail because while we were writing someone else
1210 : * allocated another buffer for the same block we want to read in.
1211 : * Note that we have not yet removed the hashtable entry for the old
1212 : * tag.
1213 : */
1214 17205 : buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1215 :
1216 17205 : if (buf_id >= 0)
1217 : {
1218 : /*
1219 : * Got a collision. Someone has already done what we were about to
1220 : * do. We'll just handle this as if it were found in the buffer
1221 : * pool in the first place. First, give up the buffer we were
1222 : * planning to use.
1223 : */
1224 0 : UnpinBuffer(buf, true);
1225 :
1226 : /* Can give up that buffer's mapping partition lock now */
1227 0 : if (oldPartitionLock != NULL &&
1228 : oldPartitionLock != newPartitionLock)
1229 0 : LWLockRelease(oldPartitionLock);
1230 :
1231 : /* remaining code should match code at top of routine */
1232 :
1233 0 : buf = GetBufferDescriptor(buf_id);
1234 :
1235 0 : valid = PinBuffer(buf, strategy);
1236 :
1237 : /* Can release the mapping lock as soon as we've pinned it */
1238 0 : LWLockRelease(newPartitionLock);
1239 :
1240 0 : *foundPtr = TRUE;
1241 :
1242 0 : if (!valid)
1243 : {
1244 : /*
1245 : * We can only get here if (a) someone else is still reading
1246 : * in the page, or (b) a previous read attempt failed. We
1247 : * have to wait for any active read attempt to finish, and
1248 : * then set up our own read attempt if the page is still not
1249 : * BM_VALID. StartBufferIO does it all.
1250 : */
1251 0 : if (StartBufferIO(buf, true))
1252 : {
1253 : /*
1254 : * If we get here, previous attempts to read the buffer
1255 : * must have failed ... but we shall bravely try again.
1256 : */
1257 0 : *foundPtr = FALSE;
1258 : }
1259 : }
1260 :
1261 0 : return buf;
1262 : }
1263 :
1264 : /*
1265 : * Need to lock the buffer header too in order to change its tag.
1266 : */
1267 17205 : buf_state = LockBufHdr(buf);
1268 :
1269 : /*
1270 : * Somebody could have pinned or re-dirtied the buffer while we were
1271 : * doing the I/O and making the new hashtable entry. If so, we can't
1272 : * recycle this buffer; we must undo everything we've done and start
1273 : * over with a new victim buffer.
1274 : */
1275 17205 : oldFlags = buf_state & BUF_FLAG_MASK;
1276 17205 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1277 17205 : break;
1278 :
1279 0 : UnlockBufHdr(buf, buf_state);
1280 0 : BufTableDelete(&newTag, newHash);
1281 0 : if (oldPartitionLock != NULL &&
1282 : oldPartitionLock != newPartitionLock)
1283 0 : LWLockRelease(oldPartitionLock);
1284 0 : LWLockRelease(newPartitionLock);
1285 0 : UnpinBuffer(buf, true);
1286 0 : }
1287 :
1288 : /*
1289 : * Okay, it's finally safe to rename the buffer.
1290 : *
1291 : * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1292 : * paranoia. We also reset the usage_count since any recency of use of
1293 : * the old content is no longer relevant. (The usage_count starts out at
1294 : * 1 so that the buffer can survive one clock-sweep pass.)
1295 : *
1296 : * Make sure BM_PERMANENT is set for buffers that must be written at every
1297 : * checkpoint. Unlogged buffers only need to be written at shutdown
1298 : * checkpoints, except for their "init" forks, which need to be treated
1299 : * just like permanent relations.
1300 : */
1301 17205 : buf->tag = newTag;
1302 17205 : buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1303 : BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1304 : BUF_USAGECOUNT_MASK);
1305 17205 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1306 17192 : buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1307 : else
1308 13 : buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1309 :
1310 17205 : UnlockBufHdr(buf, buf_state);
1311 :
1312 17205 : if (oldPartitionLock != NULL)
1313 : {
1314 553 : BufTableDelete(&oldTag, oldHash);
1315 553 : if (oldPartitionLock != newPartitionLock)
1316 550 : LWLockRelease(oldPartitionLock);
1317 : }
1318 :
1319 17205 : LWLockRelease(newPartitionLock);
1320 :
1321 : /*
1322 : * Buffer contents are currently invalid. Try to get the io_in_progress
1323 : * lock. If StartBufferIO returns false, then someone else managed to
1324 : * read it before we did, so there's nothing left for BufferAlloc() to do.
1325 : */
1326 17205 : if (StartBufferIO(buf, true))
1327 17205 : *foundPtr = FALSE;
1328 : else
1329 0 : *foundPtr = TRUE;
1330 :
1331 17205 : return buf;
1332 : }
1333 :
1334 : /*
1335 : * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1336 : * freelist.
1337 : *
1338 : * The buffer header spinlock must be held at entry. We drop it before
1339 : * returning. (This is sane because the caller must have locked the
1340 : * buffer in order to be sure it should be dropped.)
1341 : *
1342 : * This is used only in contexts such as dropping a relation. We assume
1343 : * that no other backend could possibly be interested in using the page,
1344 : * so the only reason the buffer might be pinned is if someone else is
1345 : * trying to write it out. We have to let them finish before we can
1346 : * reclaim the buffer.
1347 : *
1348 : * The buffer could get reclaimed by someone else while we are waiting
1349 : * to acquire the necessary locks; if so, don't mess it up.
1350 : */
1351 : static void
1352 7315 : InvalidateBuffer(BufferDesc *buf)
1353 : {
1354 : BufferTag oldTag;
1355 : uint32 oldHash; /* hash value for oldTag */
1356 : LWLock *oldPartitionLock; /* buffer partition lock for it */
1357 : uint32 oldFlags;
1358 : uint32 buf_state;
1359 :
1360 : /* Save the original buffer tag before dropping the spinlock */
1361 7315 : oldTag = buf->tag;
1362 :
1363 7315 : buf_state = pg_atomic_read_u32(&buf->state);
1364 7315 : Assert(buf_state & BM_LOCKED);
1365 7315 : UnlockBufHdr(buf, buf_state);
1366 :
1367 : /*
1368 : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1369 : * worth storing the hashcode in BufferDesc so we need not recompute it
1370 : * here? Probably not.
1371 : */
1372 7315 : oldHash = BufTableHashCode(&oldTag);
1373 7315 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1374 :
1375 : retry:
1376 :
1377 : /*
1378 : * Acquire exclusive mapping lock in preparation for changing the buffer's
1379 : * association.
1380 : */
1381 7315 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1382 :
1383 : /* Re-lock the buffer header */
1384 7315 : buf_state = LockBufHdr(buf);
1385 :
1386 : /* If it's changed while we were waiting for lock, do nothing */
1387 7315 : if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1388 : {
1389 0 : UnlockBufHdr(buf, buf_state);
1390 0 : LWLockRelease(oldPartitionLock);
1391 7315 : return;
1392 : }
1393 :
1394 : /*
1395 : * We assume the only reason for it to be pinned is that someone else is
1396 : * flushing the page out. Wait for them to finish. (This could be an
1397 : * infinite loop if the refcount is messed up... it would be nice to time
1398 : * out after a while, but there seems no way to be sure how many loops may
1399 : * be needed. Note that if the other guy has pinned the buffer but not
1400 : * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1401 : * be busy-looping here.)
1402 : */
1403 7315 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1404 : {
1405 0 : UnlockBufHdr(buf, buf_state);
1406 0 : LWLockRelease(oldPartitionLock);
1407 : /* safety check: should definitely not be our *own* pin */
1408 0 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1409 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
1410 0 : WaitIO(buf);
1411 0 : goto retry;
1412 : }
1413 :
1414 : /*
1415 : * Clear out the buffer's tag and flags. We must do this to ensure that
1416 : * linear scans of the buffer array don't think the buffer is valid.
1417 : */
1418 7315 : oldFlags = buf_state & BUF_FLAG_MASK;
1419 7315 : CLEAR_BUFFERTAG(buf->tag);
1420 7315 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1421 7315 : UnlockBufHdr(buf, buf_state);
1422 :
1423 : /*
1424 : * Remove the buffer from the lookup hashtable, if it was in there.
1425 : */
1426 7315 : if (oldFlags & BM_TAG_VALID)
1427 7315 : BufTableDelete(&oldTag, oldHash);
1428 :
1429 : /*
1430 : * Done with mapping lock.
1431 : */
1432 7315 : LWLockRelease(oldPartitionLock);
1433 :
1434 : /*
1435 : * Insert the buffer at the head of the list of free buffers.
1436 : */
1437 7315 : StrategyFreeBuffer(buf);
1438 : }
1439 :
1440 : /*
1441 : * MarkBufferDirty
1442 : *
1443 : * Marks buffer contents as dirty (actual write happens later).
1444 : *
1445 : * Buffer must be pinned and exclusive-locked. (If the caller does not hold
1446 : * an exclusive lock, then somebody else could be in the process of writing
1447 : * the buffer out, risking bad data being written to disk.)
1448 : */
1449 : void
1450 1560347 : MarkBufferDirty(Buffer buffer)
1451 : {
1452 : BufferDesc *bufHdr;
1453 : uint32 buf_state;
1454 : uint32 old_buf_state;
1455 :
1456 1560347 : if (!BufferIsValid(buffer))
1457 0 : elog(ERROR, "bad buffer ID: %d", buffer);
1458 :
1459 1560347 : if (BufferIsLocal(buffer))
1460 : {
1461 34358 : MarkLocalBufferDirty(buffer);
1462 1594705 : return;
1463 : }
1464 :
1465 1525989 : bufHdr = GetBufferDescriptor(buffer - 1);
1466 :
1467 1525989 : Assert(BufferIsPinned(buffer));
1468 1525989 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1469 : LW_EXCLUSIVE));
1470 :
1471 1525989 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1472 : for (;;)
1473 : {
1474 1525991 : if (old_buf_state & BM_LOCKED)
1475 0 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
1476 :
1477 1525991 : buf_state = old_buf_state;
1478 :
1479 1525991 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1480 1525991 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1481 :
1482 1525991 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1483 : buf_state))
1484 1525989 : break;
1485 2 : }
1486 :
1487 : /*
1488 : * If the buffer was not dirty already, do vacuum accounting.
1489 : */
1490 1525989 : if (!(old_buf_state & BM_DIRTY))
1491 : {
1492 14036 : VacuumPageDirty++;
1493 14036 : pgBufferUsage.shared_blks_dirtied++;
1494 14036 : if (VacuumCostActive)
1495 42 : VacuumCostBalance += VacuumCostPageDirty;
1496 : }
1497 : }
1498 :
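/*
 * Illustrative sketch (hypothetical function, not part of bufmgr.c proper):
 * the typical shape of a caller that dirties a page, per the contract above
 * (pin plus exclusive content lock before MarkBufferDirty). The WAL record
 * construction is elided; see access/transam/README for the full recipe
 * real callers must follow.
 */
static void
dirty_page_sketch(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	START_CRIT_SECTION();
	/* ... modify the page contents here ... */
	MarkBufferDirty(buf);
	/* ... XLogInsert() and PageSetLSN() would normally follow ... */
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);
}
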
1499 : /*
1500 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1501 : *
1502 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1503 : * compared to calling the two routines separately. Now it's mainly just
1504 : * a convenience function. However, if the passed buffer is valid and
1505 : * already contains the desired block, we just return it as-is; and that
1506 : * does save considerable work compared to a full release and reacquire.
1507 : *
1508 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1509 : * buffer actually needs to be released. This case is the same as ReadBuffer,
1510 : * but can save some tests in the caller.
1511 : */
1512 : Buffer
1513 1224688 : ReleaseAndReadBuffer(Buffer buffer,
1514 : Relation relation,
1515 : BlockNumber blockNum)
1516 : {
1517 1224688 : ForkNumber forkNum = MAIN_FORKNUM;
1518 : BufferDesc *bufHdr;
1519 :
1520 1224688 : if (BufferIsValid(buffer))
1521 : {
1522 947450 : Assert(BufferIsPinned(buffer));
1523 947450 : if (BufferIsLocal(buffer))
1524 : {
1525 778 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1526 1328 : if (bufHdr->tag.blockNum == blockNum &&
1527 1650 : RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1528 550 : bufHdr->tag.forkNum == forkNum)
1529 550 : return buffer;
1530 228 : ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1531 228 : LocalRefCount[-buffer - 1]--;
1532 : }
1533 : else
1534 : {
1535 946672 : bufHdr = GetBufferDescriptor(buffer - 1);
1536 : /* we have pin, so it's ok to examine tag without spinlock */
1537 1280283 : if (bufHdr->tag.blockNum == blockNum &&
1538 1000833 : RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1539 333611 : bufHdr->tag.forkNum == forkNum)
1540 333611 : return buffer;
1541 613061 : UnpinBuffer(bufHdr, true);
1542 : }
1543 : }
1544 :
1545 890527 : return ReadBuffer(relation, blockNum);
1546 : }
1547 :
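/*
 * Illustrative sketch (hypothetical function, not part of bufmgr.c proper):
 * the ReleaseAndReadBuffer pattern noted above, starting from InvalidBuffer
 * so the first call behaves like a plain ReadBuffer.
 */
static void
walk_blocks_sketch(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* drops the previous pin (if any) unless the block is unchanged */
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... inspect the page ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}
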
1548 : /*
1549 : * PinBuffer -- make buffer unavailable for replacement.
1550 : *
1551 : * For the default access strategy, the buffer's usage_count is incremented
1552 : * when we first pin it; for other strategies we just make sure the usage_count
1553 : * isn't zero. (The idea of the latter is that we don't want synchronized
1554 : * heap scans to inflate the count, but we need it to not be zero to discourage
1555 : * other backends from stealing buffers from our ring. As long as we cycle
1556 : * through the ring faster than the global clock-sweep cycles, buffers in
1557 : * our ring won't be chosen as victims for replacement by other backends.)
1558 : *
1559 : * This should be applied only to shared buffers, never local ones.
1560 : *
1561 : * Since buffers are pinned/unpinned very frequently, pin buffers without
1562 : * taking the buffer header lock; instead update the state variable in a loop
1563 : * of CAS operations. Hopefully it's just a single CAS.
1564 : *
1565 : * Note that ResourceOwnerEnlargeBuffers must have been done already.
1566 : *
1567 : * Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
1568 : * some callers to avoid an extra spinlock cycle.
1569 : */
1570 : static bool
1571 3185730 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1572 : {
1573 3185730 : Buffer b = BufferDescriptorGetBuffer(buf);
1574 : bool result;
1575 : PrivateRefCountEntry *ref;
1576 :
1577 3185730 : ref = GetPrivateRefCountEntry(b, true);
1578 :
1579 3185730 : if (ref == NULL)
1580 : {
1581 : uint32 buf_state;
1582 : uint32 old_buf_state;
1583 :
1584 3039031 : ReservePrivateRefCountEntry();
1585 3039031 : ref = NewPrivateRefCountEntry(b);
1586 :
1587 3039031 : old_buf_state = pg_atomic_read_u32(&buf->state);
1588 : for (;;)
1589 : {
1590 3039244 : if (old_buf_state & BM_LOCKED)
1591 0 : old_buf_state = WaitBufHdrUnlocked(buf);
1592 :
1593 3039244 : buf_state = old_buf_state;
1594 :
1595 : /* increase refcount */
1596 3039244 : buf_state += BUF_REFCOUNT_ONE;
1597 :
1598 3039244 : if (strategy == NULL)
1599 : {
1600 : /* Default case: increase usagecount unless already max. */
1601 3023444 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
1602 54075 : buf_state += BUF_USAGECOUNT_ONE;
1603 : }
1604 : else
1605 : {
1606 : /*
1607              : 			 * Ring buffers shouldn't evict others from the pool. Thus we
1608              : 			 * don't let the usagecount exceed 1.
1609 : */
1610 15800 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1611 0 : buf_state += BUF_USAGECOUNT_ONE;
1612 : }
1613 :
1614 3039244 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1615 : buf_state))
1616 : {
1617 3039031 : result = (buf_state & BM_VALID) != 0;
1618 3039031 : break;
1619 : }
1620 213 : }
1621 : }
1622 : else
1623 : {
1624 : /* If we previously pinned the buffer, it must surely be valid */
1625 146699 : result = true;
1626 : }
1627 :
1628 3185730 : ref->refcount++;
1629 3185730 : Assert(ref->refcount > 0);
1630 3185730 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1631 3185730 : return result;
1632 : }
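
The CAS loop in PinBuffer is the lock-free pinning pattern described in its header
comment: read the state word once, compute the new value, and retry the compare-and-
exchange until it sticks. A minimal standalone sketch of that shape, using C11 atomics
and an invented state layout (the SK_* macros below are illustrative, not bufmgr's real
BUF_* encoding, and the real function additionally waits out BM_LOCKED and records the
pin in the backend-private refcount), might look like this:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Illustrative layout: low 16 bits = refcount, next 8 bits = usagecount. */
    #define SK_REFCOUNT_ONE      ((uint32_t) 1)
    #define SK_USAGECOUNT_ONE    ((uint32_t) 1 << 16)
    #define SK_GET_USAGECOUNT(s) (((s) >> 16) & 0xFF)
    #define SK_MAX_USAGE_COUNT   5

    /* Pin: bump the refcount, and bump the usagecount unless already at the cap. */
    static void
    sketch_pin(_Atomic uint32_t *state)
    {
        uint32_t old = atomic_load(state);

        for (;;)
        {
            uint32_t new = old + SK_REFCOUNT_ONE;

            if (SK_GET_USAGECOUNT(old) < SK_MAX_USAGE_COUNT)
                new += SK_USAGECOUNT_ONE;

            /* On failure (or a spurious weak-CAS miss) 'old' is refreshed and we retry. */
            if (atomic_compare_exchange_weak(state, &old, new))
                break;
        }
    }
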
1633 :
1634 : /*
1635 : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1636 : * The spinlock is released before return.
1637 : *
1638              :  * As this function is called with the spinlock held, the caller must have
1639              :  * previously called ReservePrivateRefCountEntry().
1640 : *
1641 : * Currently, no callers of this function want to modify the buffer's
1642 : * usage_count at all, so there's no need for a strategy parameter.
1643 : * Also we don't bother with a BM_VALID test (the caller could check that for
1644 : * itself).
1645 : *
1646 : * Also all callers only ever use this function when it's known that the
1647 : * buffer can't have a preexisting pin by this backend. That allows us to skip
1648 : * searching the private refcount array & hash, which is a boon, because the
1649 : * spinlock is still held.
1650 : *
1651 : * Note: use of this routine is frequently mandatory, not just an optimization
1652 : * to save a spin lock/unlock cycle, because we need to pin a buffer before
1653 : * its state can change under us.
1654 : */
1655 : static void
1656 25721 : PinBuffer_Locked(BufferDesc *buf)
1657 : {
1658 : Buffer b;
1659 : PrivateRefCountEntry *ref;
1660 : uint32 buf_state;
1661 :
1662 : /*
1663              : 	 * As explained, we don't expect any preexisting pins. That allows us to
1664              : 	 * manipulate the PrivateRefCount entry after releasing the spinlock.
1665 : */
1666 25721 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1667 :
1668 : /*
1669 : * Since we hold the buffer spinlock, we can update the buffer state and
1670 : * release the lock in one operation.
1671 : */
1672 25721 : buf_state = pg_atomic_read_u32(&buf->state);
1673 25721 : Assert(buf_state & BM_LOCKED);
1674 25721 : buf_state += BUF_REFCOUNT_ONE;
1675 25721 : UnlockBufHdr(buf, buf_state);
1676 :
1677 25721 : b = BufferDescriptorGetBuffer(buf);
1678 :
1679 25721 : ref = NewPrivateRefCountEntry(b);
1680 25721 : ref->refcount++;
1681 :
1682 25721 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1683 25721 : }
1684 :
1685 : /*
1686 : * UnpinBuffer -- make buffer available for replacement.
1687 : *
1688 : * This should be applied only to shared buffers, never local ones.
1689 : *
1690 : * Most but not all callers want CurrentResourceOwner to be adjusted.
1691 : * Those that don't should pass fixOwner = FALSE.
1692 : */
1693 : static void
1694 3537185 : UnpinBuffer(BufferDesc *buf, bool fixOwner)
1695 : {
1696 : PrivateRefCountEntry *ref;
1697 3537185 : Buffer b = BufferDescriptorGetBuffer(buf);
1698 :
1699 : /* not moving as we're likely deleting it soon anyway */
1700 3537185 : ref = GetPrivateRefCountEntry(b, false);
1701 3537185 : Assert(ref != NULL);
1702 :
1703 3537185 : if (fixOwner)
1704 3537185 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1705 :
1706 3537185 : Assert(ref->refcount > 0);
1707 3537185 : ref->refcount--;
1708 3537185 : if (ref->refcount == 0)
1709 : {
1710 : uint32 buf_state;
1711 : uint32 old_buf_state;
1712 :
1713 : /* I'd better not still hold any locks on the buffer */
1714 3064752 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1715 3064752 : Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1716 :
1717 : /*
1718 : * Decrement the shared reference count.
1719 : *
1720              : 		 * Since the buffer header spinlock holder can update the state with just a
1721              : 		 * plain write, it's not safe to use an atomic decrement here; use a CAS loop.
1722 : */
1723 3064752 : old_buf_state = pg_atomic_read_u32(&buf->state);
1724 : for (;;)
1725 : {
1726 3064994 : if (old_buf_state & BM_LOCKED)
1727 0 : old_buf_state = WaitBufHdrUnlocked(buf);
1728 :
1729 3064994 : buf_state = old_buf_state;
1730 :
1731 3064994 : buf_state -= BUF_REFCOUNT_ONE;
1732 :
1733 3064994 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1734 : buf_state))
1735 3064752 : break;
1736 242 : }
1737 :
1738 : /* Support LockBufferForCleanup() */
1739 3064752 : if (buf_state & BM_PIN_COUNT_WAITER)
1740 : {
1741 : /*
1742              : 			 * Acquire the buffer header lock and re-check that there's a waiter.
1743 : * Another backend could have unpinned this buffer, and already
1744 : * woken up the waiter. There's no danger of the buffer being
1745 : * replaced after we unpinned it above, as it's pinned by the
1746 : * waiter.
1747 : */
1748 0 : buf_state = LockBufHdr(buf);
1749 :
1750 0 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
1751 0 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1752 0 : {
1753 : /* we just released the last pin other than the waiter's */
1754 0 : int wait_backend_pid = buf->wait_backend_pid;
1755 :
1756 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
1757 0 : UnlockBufHdr(buf, buf_state);
1758 0 : ProcSendSignal(wait_backend_pid);
1759 : }
1760 : else
1761 0 : UnlockBufHdr(buf, buf_state);
1762 : }
1763 3064752 : ForgetPrivateRefCountEntry(ref);
1764 : }
1765 3537185 : }
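
The tail of UnpinBuffer is a small handshake with LockBufferForCleanup(): only after
the pin count has been dropped atomically do we look at BM_PIN_COUNT_WAITER, and the
flag is re-checked under the buffer header lock because another backend may already
have woken the waiter. A rough standalone sketch of that re-check-under-lock pattern
follows; it simplifies the decrement to atomic_fetch_sub (the real code needs a CAS
loop, as noted above), uses a pthread mutex in place of the header spinlock, and calls
a hypothetical notify_waiter() where bufmgr calls ProcSendSignal():

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>

    #define SK_REFCOUNT_MASK ((uint32_t) 0xFFFF)
    #define SK_WAITER_FLAG   ((uint32_t) 1 << 30)

    static pthread_mutex_t sk_hdr_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Hypothetical wakeup hook; bufmgr sends ProcSendSignal(wait_backend_pid). */
    extern void notify_waiter(void);

    static void
    sketch_unpin(_Atomic uint32_t *state)
    {
        /* Drop our pin; fetch_sub returns the value *before* the subtraction. */
        uint32_t now = atomic_fetch_sub(state, 1u) - 1;

        if (now & SK_WAITER_FLAG)
        {
            /*
             * Re-check under the header lock: another unpinner may already have
             * cleared the flag and woken the waiter.
             */
            pthread_mutex_lock(&sk_hdr_lock);

            uint32_t cur = atomic_load(state);

            if ((cur & SK_WAITER_FLAG) && (cur & SK_REFCOUNT_MASK) == 1)
            {
                /* Only the waiter's own pin remains: clear the flag and wake it. */
                atomic_fetch_and(state, (uint32_t) ~SK_WAITER_FLAG);
                pthread_mutex_unlock(&sk_hdr_lock);
                notify_waiter();
            }
            else
                pthread_mutex_unlock(&sk_hdr_lock);
        }
    }
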
1766 :
1767 : /*
1768 : * BufferSync -- Write out all dirty buffers in the pool.
1769 : *
1770 : * This is called at checkpoint time to write out all dirty shared buffers.
1771 : * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1772 : * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1773 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1774 : * unlogged buffers, which are otherwise skipped. The remaining flags
1775 : * currently have no effect here.
1776 : */
1777 : static void
1778 11 : BufferSync(int flags)
1779 : {
1780 : uint32 buf_state;
1781 : int buf_id;
1782 : int num_to_scan;
1783 : int num_spaces;
1784 : int num_processed;
1785 : int num_written;
1786 11 : CkptTsStatus *per_ts_stat = NULL;
1787 : Oid last_tsid;
1788 : binaryheap *ts_heap;
1789 : int i;
1790 11 : int mask = BM_DIRTY;
1791 : WritebackContext wb_context;
1792 :
1793 : /* Make sure we can handle the pin inside SyncOneBuffer */
1794 11 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1795 :
1796 : /*
1797              : 	 * Unless this is a shutdown checkpoint, or we have been explicitly told to
1798              : 	 * flush everything (CHECKPOINT_FLUSH_ALL), we write only permanent, dirty
1799              : 	 * buffers. But at shutdown or end of recovery, we write all dirty buffers.
1800 : */
1801 11 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1802 : CHECKPOINT_FLUSH_ALL))))
1803 5 : mask |= BM_PERMANENT;
1804 :
1805 : /*
1806 : * Loop over all buffers, and mark the ones that need to be written with
1807 : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1808 : * can estimate how much work needs to be done.
1809 : *
1810 : * This allows us to write only those pages that were dirty when the
1811 : * checkpoint began, and not those that get dirtied while it proceeds.
1812 : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1813 : * later in this function, or by normal backends or the bgwriter cleaning
1814 : * scan, the flag is cleared. Any buffer dirtied after this point won't
1815 : * have the flag set.
1816 : *
1817 : * Note that if we fail to write some buffer, we may leave buffers with
1818 : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1819 : * certainly need to be written for the next checkpoint attempt, too.
1820 : */
1821 11 : num_to_scan = 0;
1822 180235 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
1823 : {
1824 180224 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1825 :
1826 : /*
1827 : * Header spinlock is enough to examine BM_DIRTY, see comment in
1828 : * SyncOneBuffer.
1829 : */
1830 180224 : buf_state = LockBufHdr(bufHdr);
1831 :
1832 180224 : if ((buf_state & mask) == mask)
1833 : {
1834 : CkptSortItem *item;
1835 :
1836 8514 : buf_state |= BM_CHECKPOINT_NEEDED;
1837 :
1838 8514 : item = &CkptBufferIds[num_to_scan++];
1839 8514 : item->buf_id = buf_id;
1840 8514 : item->tsId = bufHdr->tag.rnode.spcNode;
1841 8514 : item->relNode = bufHdr->tag.rnode.relNode;
1842 8514 : item->forkNum = bufHdr->tag.forkNum;
1843 8514 : item->blockNum = bufHdr->tag.blockNum;
1844 : }
1845 :
1846 180224 : UnlockBufHdr(bufHdr, buf_state);
1847 : }
1848 :
1849 11 : if (num_to_scan == 0)
1850 14 : return; /* nothing to do */
1851 :
1852 8 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
1853 :
1854 : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1855 :
1856 : /*
1857 : * Sort buffers that need to be written to reduce the likelihood of random
1858 : * IO. The sorting is also important for the implementation of balancing
1859 : * writes between tablespaces. Without balancing writes we'd potentially
1860 : * end up writing to the tablespaces one-by-one; possibly overloading the
1861 : * underlying system.
1862 : */
1863 8 : qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1864 : ckpt_buforder_comparator);
1865 :
1866 8 : num_spaces = 0;
1867 :
1868 : /*
1869 : * Allocate progress status for each tablespace with buffers that need to
1870 : * be flushed. This requires the to-be-flushed array to be sorted.
1871 : */
1872 8 : last_tsid = InvalidOid;
1873 8522 : for (i = 0; i < num_to_scan; i++)
1874 : {
1875 : CkptTsStatus *s;
1876 : Oid cur_tsid;
1877 :
1878 8514 : cur_tsid = CkptBufferIds[i].tsId;
1879 :
1880 : /*
1881              : 		 * Grow the array of per-tablespace status structs every time a new
1882              : 		 * tablespace is found.
1883 : */
1884 8514 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1885 15 : {
1886 : Size sz;
1887 :
1888 15 : num_spaces++;
1889 :
1890 : /*
1891 : * Not worth adding grow-by-power-of-2 logic here - even with a
1892 : * few hundred tablespaces this should be fine.
1893 : */
1894 15 : sz = sizeof(CkptTsStatus) * num_spaces;
1895 :
1896 15 : if (per_ts_stat == NULL)
1897 8 : per_ts_stat = (CkptTsStatus *) palloc(sz);
1898 : else
1899 7 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1900 :
1901 15 : s = &per_ts_stat[num_spaces - 1];
1902 15 : memset(s, 0, sizeof(*s));
1903 15 : s->tsId = cur_tsid;
1904 :
1905 : /*
1906 : * The first buffer in this tablespace. As CkptBufferIds is sorted
1907              : 			 * by tablespace, all (s->num_to_scan) buffers in this tablespace
1908 : * will follow afterwards.
1909 : */
1910 15 : s->index = i;
1911 :
1912 : /*
1913 : * progress_slice will be determined once we know how many buffers
1914 : * are in each tablespace, i.e. after this loop.
1915 : */
1916 :
1917 15 : last_tsid = cur_tsid;
1918 : }
1919 : else
1920 : {
1921 8499 : s = &per_ts_stat[num_spaces - 1];
1922 : }
1923 :
1924 8514 : s->num_to_scan++;
1925 : }
1926 :
1927 8 : Assert(num_spaces > 0);
1928 :
1929 : /*
1930 : * Build a min-heap over the write-progress in the individual tablespaces,
1931 : * and compute how large a portion of the total progress a single
1932 : * processed buffer is.
1933 : */
1934 8 : ts_heap = binaryheap_allocate(num_spaces,
1935 : ts_ckpt_progress_comparator,
1936 : NULL);
1937 :
1938 23 : for (i = 0; i < num_spaces; i++)
1939 : {
1940 15 : CkptTsStatus *ts_stat = &per_ts_stat[i];
1941 :
1942 15 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1943 :
1944 15 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1945 : }
1946 :
1947 8 : binaryheap_build(ts_heap);
1948 :
1949 : /*
1950 : * Iterate through to-be-checkpointed buffers and write the ones (still)
1951 : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1952 : * tablespaces; otherwise the sorting would lead to only one tablespace
1953 : * receiving writes at a time, making inefficient use of the hardware.
1954 : */
1955 8 : num_processed = 0;
1956 8 : num_written = 0;
1957 8530 : while (!binaryheap_empty(ts_heap))
1958 : {
1959 8514 : BufferDesc *bufHdr = NULL;
1960 8514 : CkptTsStatus *ts_stat = (CkptTsStatus *)
1961 8514 : DatumGetPointer(binaryheap_first(ts_heap));
1962 :
1963 8514 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
1964 8514 : Assert(buf_id != -1);
1965 :
1966 8514 : bufHdr = GetBufferDescriptor(buf_id);
1967 :
1968 8514 : num_processed++;
1969 :
1970 : /*
1971 : * We don't need to acquire the lock here, because we're only looking
1972 : * at a single bit. It's possible that someone else writes the buffer
1973 : * and clears the flag right after we check, but that doesn't matter
1974 : * since SyncOneBuffer will then do nothing. However, there is a
1975 : * further race condition: it's conceivable that between the time we
1976 : * examine the bit here and the time SyncOneBuffer acquires the lock,
1977 : * someone else not only wrote the buffer but replaced it with another
1978 : * page and dirtied it. In that improbable case, SyncOneBuffer will
1979 : * write the buffer though we didn't need to. It doesn't seem worth
1980 : * guarding against this, though.
1981 : */
1982 8514 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1983 : {
1984 8514 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1985 : {
1986 : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1987 8514 : BgWriterStats.m_buf_written_checkpoints++;
1988 8514 : num_written++;
1989 : }
1990 : }
1991 :
1992 : /*
1993              : 		 * Measure progress independently of whether we actually had to flush the
1994              : 		 * buffer - otherwise the writes would become unbalanced.
1995 : */
1996 8514 : ts_stat->progress += ts_stat->progress_slice;
1997 8514 : ts_stat->num_scanned++;
1998 8514 : ts_stat->index++;
1999 :
2000 : /* Have all the buffers from the tablespace been processed? */
2001 8514 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
2002 : {
2003 15 : binaryheap_remove_first(ts_heap);
2004 : }
2005 : else
2006 : {
2007 : /* update heap with the new progress */
2008 8499 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2009 : }
2010 :
2011 : /*
2012 : * Sleep to throttle our I/O rate.
2013 : */
2014 8514 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2015 : }
2016 :
2017 : /* issue all pending flushes */
2018 8 : IssuePendingWritebacks(&wb_context);
2019 :
2020 8 : pfree(per_ts_stat);
2021 8 : per_ts_stat = NULL;
2022 8 : binaryheap_free(ts_heap);
2023 :
2024 : /*
2025 : * Update checkpoint statistics. As noted above, this doesn't include
2026              : 	 * buffers written by other backends or by the bgwriter's cleaning scan.
2027 : */
2028 8 : CheckpointStats.ckpt_bufs_written += num_written;
2029 :
2030 : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2031 : }
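
The progress_slice arithmetic in BufferSync is what keeps checkpoint writes interleaved
across tablespaces: each tablespace's per-buffer increment is scaled by
num_to_scan / num_to_scan_in_this_tablespace, so every tablespace's progress runs on the
same 0..num_to_scan scale and the min-heap can always pick whichever one is furthest
behind. A self-contained illustration with made-up buffer counts (100 and 300 dirty
buffers in two tablespaces) is:

    #include <stdio.h>

    int
    main(void)
    {
        int     num_to_scan = 400;              /* total dirty buffers in the checkpoint */
        int     per_ts[2] = {100, 300};         /* dirty buffers per tablespace */
        double  progress[2] = {0.0, 0.0};
        double  slice[2];

        for (int i = 0; i < 2; i++)
            slice[i] = (double) num_to_scan / per_ts[i];    /* 4.0 and ~1.33 */

        /* Stand-in for the min-heap: always advance the tablespace that lags. */
        for (int written = 0; written < num_to_scan; written++)
        {
            int pick = (progress[0] <= progress[1]) ? 0 : 1;

            progress[pick] += slice[pick];
        }

        /* Both finish at ~400.0: the writes were interleaved roughly 1:3. */
        printf("final progress: %.1f %.1f\n", progress[0], progress[1]);
        return 0;
    }
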
2032 :
2033 : /*
2034 : * BgBufferSync -- Write out some dirty buffers in the pool.
2035 : *
2036 : * This is called periodically by the background writer process.
2037 : *
2038 : * Returns true if it's appropriate for the bgwriter process to go into
2039 : * low-power hibernation mode. (This happens if the strategy clock sweep
2040 : * has been "lapped" and no buffer allocations have occurred recently,
2041 : * or if the bgwriter has been effectively disabled by setting
2042 : * bgwriter_lru_maxpages to 0.)
2043 : */
2044 : bool
2045 388 : BgBufferSync(WritebackContext *wb_context)
2046 : {
2047 : /* info obtained from freelist.c */
2048 : int strategy_buf_id;
2049 : uint32 strategy_passes;
2050 : uint32 recent_alloc;
2051 :
2052 : /*
2053 : * Information saved between calls so we can determine the strategy
2054 : * point's advance rate and avoid scanning already-cleaned buffers.
2055 : */
2056 : static bool saved_info_valid = false;
2057 : static int prev_strategy_buf_id;
2058 : static uint32 prev_strategy_passes;
2059 : static int next_to_clean;
2060 : static uint32 next_passes;
2061 :
2062 : /* Moving averages of allocation rate and clean-buffer density */
2063 : static float smoothed_alloc = 0;
2064 : static float smoothed_density = 10.0;
2065 :
2066 : /* Potentially these could be tunables, but for now, not */
2067 388 : float smoothing_samples = 16;
2068 388 : float scan_whole_pool_milliseconds = 120000.0;
2069 :
2070 : /* Used to compute how far we scan ahead */
2071 : long strategy_delta;
2072 : int bufs_to_lap;
2073 : int bufs_ahead;
2074 : float scans_per_alloc;
2075 : int reusable_buffers_est;
2076 : int upcoming_alloc_est;
2077 : int min_scan_buffers;
2078 :
2079 : /* Variables for the scanning loop proper */
2080 : int num_to_scan;
2081 : int num_written;
2082 : int reusable_buffers;
2083 :
2084 : /* Variables for final smoothed_density update */
2085 : long new_strategy_delta;
2086 : uint32 new_recent_alloc;
2087 :
2088 : /*
2089 : * Find out where the freelist clock sweep currently is, and how many
2090 : * buffer allocations have happened since our last call.
2091 : */
2092 388 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2093 :
2094 : /* Report buffer alloc counts to pgstat */
2095 388 : BgWriterStats.m_buf_alloc += recent_alloc;
2096 :
2097 : /*
2098 : * If we're not running the LRU scan, just stop after doing the stats
2099 : * stuff. We mark the saved state invalid so that we can recover sanely
2100 : * if LRU scan is turned back on later.
2101 : */
2102 388 : if (bgwriter_lru_maxpages <= 0)
2103 : {
2104 0 : saved_info_valid = false;
2105 0 : return true;
2106 : }
2107 :
2108 : /*
2109 : * Compute strategy_delta = how many buffers have been scanned by the
2110 : * clock sweep since last time. If first time through, assume none. Then
2111 : * see if we are still ahead of the clock sweep, and if so, how many
2112 : * buffers we could scan before we'd catch up with it and "lap" it. Note:
2113              : 	 * the weird-looking coding of the xxx_passes comparisons is there to avoid
2114              : 	 * bogus behavior when the passes counts wrap around.
2115 : */
2116 388 : if (saved_info_valid)
2117 : {
2118 387 : int32 passes_delta = strategy_passes - prev_strategy_passes;
2119 :
2120 387 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2121 387 : strategy_delta += (long) passes_delta * NBuffers;
2122 :
2123 387 : Assert(strategy_delta >= 0);
2124 :
2125 387 : if ((int32) (next_passes - strategy_passes) > 0)
2126 : {
2127 : /* we're one pass ahead of the strategy point */
2128 0 : bufs_to_lap = strategy_buf_id - next_to_clean;
2129 : #ifdef BGW_DEBUG
2130 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2131 : next_passes, next_to_clean,
2132 : strategy_passes, strategy_buf_id,
2133 : strategy_delta, bufs_to_lap);
2134 : #endif
2135 : }
2136 774 : else if (next_passes == strategy_passes &&
2137 387 : next_to_clean >= strategy_buf_id)
2138 : {
2139 : /* on same pass, but ahead or at least not behind */
2140 387 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2141 : #ifdef BGW_DEBUG
2142 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2143 : next_passes, next_to_clean,
2144 : strategy_passes, strategy_buf_id,
2145 : strategy_delta, bufs_to_lap);
2146 : #endif
2147 : }
2148 : else
2149 : {
2150 : /*
2151 : * We're behind, so skip forward to the strategy point and start
2152 : * cleaning from there.
2153 : */
2154 : #ifdef BGW_DEBUG
2155 : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2156 : next_passes, next_to_clean,
2157 : strategy_passes, strategy_buf_id,
2158 : strategy_delta);
2159 : #endif
2160 0 : next_to_clean = strategy_buf_id;
2161 0 : next_passes = strategy_passes;
2162 0 : bufs_to_lap = NBuffers;
2163 : }
2164 : }
2165 : else
2166 : {
2167 : /*
2168 : * Initializing at startup or after LRU scanning had been off. Always
2169 : * start at the strategy point.
2170 : */
2171 : #ifdef BGW_DEBUG
2172 : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2173 : strategy_passes, strategy_buf_id);
2174 : #endif
2175 1 : strategy_delta = 0;
2176 1 : next_to_clean = strategy_buf_id;
2177 1 : next_passes = strategy_passes;
2178 1 : bufs_to_lap = NBuffers;
2179 : }
2180 :
2181 : /* Update saved info for next time */
2182 388 : prev_strategy_buf_id = strategy_buf_id;
2183 388 : prev_strategy_passes = strategy_passes;
2184 388 : saved_info_valid = true;
2185 :
2186 : /*
2187              : 	 * Compute how many buffers had to be scanned for each new allocation, i.e.,
2188 : * 1/density of reusable buffers, and track a moving average of that.
2189 : *
2190              : 	 * If the strategy point didn't move, we don't update the density estimate.
2191 : */
2192 388 : if (strategy_delta > 0 && recent_alloc > 0)
2193 : {
2194 0 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2195 0 : smoothed_density += (scans_per_alloc - smoothed_density) /
2196 : smoothing_samples;
2197 : }
2198 :
2199 : /*
2200 : * Estimate how many reusable buffers there are between the current
2201 : * strategy point and where we've scanned ahead to, based on the smoothed
2202 : * density estimate.
2203 : */
2204 388 : bufs_ahead = NBuffers - bufs_to_lap;
2205 388 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2206 :
2207 : /*
2208 : * Track a moving average of recent buffer allocations. Here, rather than
2209 : * a true average we want a fast-attack, slow-decline behavior: we
2210 : * immediately follow any increase.
2211 : */
2212 388 : if (smoothed_alloc <= (float) recent_alloc)
2213 39 : smoothed_alloc = recent_alloc;
2214 : else
2215 349 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2216 : smoothing_samples;
2217 :
2218 : /* Scale the estimate by a GUC to allow more aggressive tuning. */
2219 388 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2220 :
2221 : /*
2222 : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2223 : * eventually underflow to zero, and the underflows produce annoying
2224 : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2225 : * zero, there's no point in tracking smaller and smaller values of
2226 : * smoothed_alloc, so just reset it to exactly zero to avoid this
2227 : * syndrome. It will pop back up as soon as recent_alloc increases.
2228 : */
2229 388 : if (upcoming_alloc_est == 0)
2230 1 : smoothed_alloc = 0;
2231 :
2232 : /*
2233 : * Even in cases where there's been little or no buffer allocation
2234 : * activity, we want to make a small amount of progress through the buffer
2235 : * cache so that as many reusable buffers as possible are clean after an
2236 : * idle period.
2237 : *
2238 : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2239 : * the BGW will be called during the scan_whole_pool time; slice the
2240 : * buffer pool into that many sections.
2241 : */
2242 388 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2243 :
2244 388 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2245 : {
2246 : #ifdef BGW_DEBUG
2247 : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2248 : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2249 : #endif
2250 381 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2251 : }
2252 :
2253 : /*
2254 : * Now write out dirty reusable buffers, working forward from the
2255 : * next_to_clean point, until we have lapped the strategy scan, or cleaned
2256 : * enough buffers to match our estimate of the next cycle's allocation
2257 : * requirements, or hit the bgwriter_lru_maxpages limit.
2258 : */
2259 :
2260 : /* Make sure we can handle the pin inside SyncOneBuffer */
2261 388 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2262 :
2263 388 : num_to_scan = bufs_to_lap;
2264 388 : num_written = 0;
2265 388 : reusable_buffers = reusable_buffers_est;
2266 :
2267 : /* Execute the LRU scan */
2268 13361 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2269 : {
2270 12585 : int sync_state = SyncOneBuffer(next_to_clean, true,
2271 : wb_context);
2272 :
2273 12585 : if (++next_to_clean >= NBuffers)
2274 : {
2275 0 : next_to_clean = 0;
2276 0 : next_passes++;
2277 : }
2278 12585 : num_to_scan--;
2279 :
2280 12585 : if (sync_state & BUF_WRITTEN)
2281 : {
2282 0 : reusable_buffers++;
2283 0 : if (++num_written >= bgwriter_lru_maxpages)
2284 : {
2285 0 : BgWriterStats.m_maxwritten_clean++;
2286 0 : break;
2287 : }
2288 : }
2289 12585 : else if (sync_state & BUF_REUSABLE)
2290 11668 : reusable_buffers++;
2291 : }
2292 :
2293 388 : BgWriterStats.m_buf_written_clean += num_written;
2294 :
2295 : #ifdef BGW_DEBUG
2296 : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2297 : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2298 : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2299 : bufs_to_lap - num_to_scan,
2300 : num_written,
2301 : reusable_buffers - reusable_buffers_est);
2302 : #endif
2303 :
2304 : /*
2305 : * Consider the above scan as being like a new allocation scan.
2306 : * Characterize its density and update the smoothed one based on it. This
2307 : * effectively halves the moving average period in cases where both the
2308 : * strategy and the background writer are doing some useful scanning,
2309 : * which is helpful because a long memory isn't as desirable on the
2310 : * density estimates.
2311 : */
2312 388 : new_strategy_delta = bufs_to_lap - num_to_scan;
2313 388 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
2314 388 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
2315 : {
2316 388 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2317 388 : smoothed_density += (scans_per_alloc - smoothed_density) /
2318 : smoothing_samples;
2319 :
2320 : #ifdef BGW_DEBUG
2321 : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2322 : new_recent_alloc, new_strategy_delta,
2323 : scans_per_alloc, smoothed_density);
2324 : #endif
2325 : }
2326 :
2327 : /* Return true if OK to hibernate */
2328 388 : return (bufs_to_lap == 0 && recent_alloc == 0);
2329 : }
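
BgBufferSync's pacing rests on two exponential moving averages (allocation rate and scan
density) plus a floor that keeps the cleaning scan creeping through the pool even when
nothing is being allocated. The sketch below replays those three formulas with the same
smoothing_samples = 16 and the two-minute scan_whole_pool target, but with invented input
numbers; the values used for BgWriterDelay (200ms) and bgwriter_lru_multiplier (2.0) are
the stock defaults, and nbuffers is just an example pool size:

    #include <stdio.h>

    int
    main(void)
    {
        const float smoothing_samples = 16;
        const float scan_whole_pool_ms = 120000.0;
        const int   bgwriter_delay_ms = 200;        /* BgWriterDelay default */
        const float lru_multiplier = 2.0;           /* bgwriter_lru_multiplier default */
        const int   nbuffers = 16384;               /* example pool: 16384 * 8kB = 128MB */

        float smoothed_alloc = 50.0;                /* carried over from previous rounds */
        float smoothed_density = 10.0;

        int   recent_alloc = 120;                   /* allocations since the last call */
        long  strategy_delta = 900;                 /* clock-sweep advance since last call */

        /* 1/density of reusable buffers, folded into the moving average. */
        float scans_per_alloc = (float) strategy_delta / recent_alloc;     /* 7.5 */
        smoothed_density += (scans_per_alloc - smoothed_density) / smoothing_samples;

        /* Fast-attack, slow-decline average of the allocation rate. */
        if (smoothed_alloc <= (float) recent_alloc)
            smoothed_alloc = recent_alloc;          /* jumps straight up to 120 */
        else
            smoothed_alloc += ((float) recent_alloc - smoothed_alloc) / smoothing_samples;

        int upcoming_alloc_est = (int) (smoothed_alloc * lru_multiplier);   /* 240 */

        /* Floor: enough buffers per round to cover the pool in ~2 minutes. */
        int min_scan_buffers = (int) (nbuffers / (scan_whole_pool_ms / bgwriter_delay_ms));

        printf("density=%.2f alloc=%.1f upcoming=%d floor=%d\n",
               smoothed_density, smoothed_alloc, upcoming_alloc_est, min_scan_buffers);
        return 0;
    }
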
2330 :
2331 : /*
2332 : * SyncOneBuffer -- process a single buffer during syncing.
2333 : *
2334 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2335 : * buffers marked recently used, as these are not replacement candidates.
2336 : *
2337 : * Returns a bitmask containing the following flag bits:
2338 : * BUF_WRITTEN: we wrote the buffer.
2339 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
2340 : * pin count 0 and usage count 0.
2341 : *
2342              :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2343 : * after locking it, but we don't care all that much.)
2344 : *
2345 : * Note: caller must have done ResourceOwnerEnlargeBuffers.
2346 : */
2347 : static int
2348 21099 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2349 : {
2350 21099 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2351 21099 : int result = 0;
2352 : uint32 buf_state;
2353 : BufferTag tag;
2354 :
2355 21099 : ReservePrivateRefCountEntry();
2356 :
2357 : /*
2358 : * Check whether buffer needs writing.
2359 : *
2360 : * We can make this check without taking the buffer content lock so long
2361 : * as we mark pages dirty in access methods *before* logging changes with
2362 : * XLogInsert(): if someone marks the buffer dirty just after our check we
2363              : 	 * don't worry, because our checkpoint.redo points before the log record for
2364              : 	 * the upcoming changes, and so we are not required to write such a dirty buffer.
2365 : */
2366 21099 : buf_state = LockBufHdr(bufHdr);
2367 :
2368 42181 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2369 21082 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2370 : {
2371 11668 : result |= BUF_REUSABLE;
2372 : }
2373 9431 : else if (skip_recently_used)
2374 : {
2375 : /* Caller told us not to write recently-used buffers */
2376 917 : UnlockBufHdr(bufHdr, buf_state);
2377 917 : return result;
2378 : }
2379 :
2380 20182 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2381 : {
2382 : /* It's clean, so nothing to do */
2383 11668 : UnlockBufHdr(bufHdr, buf_state);
2384 11668 : return result;
2385 : }
2386 :
2387 : /*
2388 : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2389 : * buffer is clean by the time we've locked it.)
2390 : */
2391 8514 : PinBuffer_Locked(bufHdr);
2392 8514 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
2393 :
2394 8514 : FlushBuffer(bufHdr, NULL);
2395 :
2396 8514 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
2397 :
2398 8514 : tag = bufHdr->tag;
2399 :
2400 8514 : UnpinBuffer(bufHdr, true);
2401 :
2402 8514 : ScheduleBufferTagForWriteback(wb_context, &tag);
2403 :
2404 8514 : return result | BUF_WRITTEN;
2405 : }
2406 :
2407 : /*
2408 : * AtEOXact_Buffers - clean up at end of transaction.
2409 : *
2410 : * As of PostgreSQL 8.0, buffer pins should get released by the
2411 : * ResourceOwner mechanism. This routine is just a debugging
2412 : * cross-check that no pins remain.
2413 : */
2414 : void
2415 26218 : AtEOXact_Buffers(bool isCommit)
2416 : {
2417 26218 : CheckForBufferLeaks();
2418 :
2419 26218 : AtEOXact_LocalBuffers(isCommit);
2420 :
2421 26218 : Assert(PrivateRefCountOverflowed == 0);
2422 26218 : }
2423 :
2424 : /*
2425 : * Initialize access to shared buffer pool
2426 : *
2427 : * This is called during backend startup (whether standalone or under the
2428 : * postmaster). It sets up for this backend's access to the already-existing
2429 : * buffer pool.
2430 : *
2431 : * NB: this is called before InitProcess(), so we do not have a PGPROC and
2432 : * cannot do LWLockAcquire; hence we can't actually access stuff in
2433 : * shared memory yet. We are only initializing local data here.
2434 : * (See also InitBufferPoolBackend)
2435 : */
2436 : void
2437 344 : InitBufferPoolAccess(void)
2438 : {
2439 : HASHCTL hash_ctl;
2440 :
2441 344 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2442 :
2443 344 : MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2444 344 : hash_ctl.keysize = sizeof(int32);
2445 344 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2446 :
2447 344 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2448 : HASH_ELEM | HASH_BLOBS);
2449 344 : }
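
The PrivateRefCountArray / PrivateRefCountHash pair initialized here is a small-N fast
path: a backend's first few distinct pins live in a cache-line-sized array of eight
entries, and only overflow spills into the dynahash table (which is also why
PinBuffer_Locked may skip the lookup when it knows no prior pin exists). The sketch below
shows only the array side of that idea; the overflow policy is reduced to a hypothetical
spill_to_hash() and does not reproduce bufmgr's actual reserve-then-insert protocol:

    #include <stdint.h>
    #include <stddef.h>

    #define SK_ARRAY_ENTRIES 8              /* mirrors REFCOUNT_ARRAY_ENTRIES */

    typedef struct
    {
        int32_t buffer;                     /* 0 means "slot free" in this sketch */
        int32_t refcount;
    } SkRefCountEntry;

    static SkRefCountEntry sk_array[SK_ARRAY_ENTRIES];

    /* Hypothetical overflow path; the real code uses a dynahash table. */
    extern SkRefCountEntry *spill_to_hash(int32_t buffer);

    static SkRefCountEntry *
    sk_get_entry(int32_t buffer)
    {
        SkRefCountEntry *free_slot = NULL;

        /* Linear scan: eight entries fit in one cache line, so this stays cheap. */
        for (int i = 0; i < SK_ARRAY_ENTRIES; i++)
        {
            if (sk_array[i].buffer == buffer)
                return &sk_array[i];
            if (sk_array[i].buffer == 0 && free_slot == NULL)
                free_slot = &sk_array[i];
        }

        if (free_slot != NULL)
        {
            free_slot->buffer = buffer;
            free_slot->refcount = 0;
            return free_slot;
        }

        return spill_to_hash(buffer);       /* array full: fall back to the hash */
    }
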
2450 :
2451 : /*
2452 : * InitBufferPoolBackend --- second-stage initialization of a new backend
2453 : *
2454 : * This is called after we have acquired a PGPROC and so can safely get
2455 : * LWLocks. We don't currently need to do anything at this stage ...
2456 : * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2457              :  * access, and therefore has to be called at the corresponding phase of
2458 : * backend shutdown.
2459 : */
2460 : void
2461 342 : InitBufferPoolBackend(void)
2462 : {
2463 342 : on_shmem_exit(AtProcExit_Buffers, 0);
2464 342 : }
2465 :
2466 : /*
2467 : * During backend exit, ensure that we released all shared-buffer locks and
2468 : * assert that we have no remaining pins.
2469 : */
2470 : static void
2471 342 : AtProcExit_Buffers(int code, Datum arg)
2472 : {
2473 342 : AbortBufferIO();
2474 342 : UnlockBuffers();
2475 :
2476 342 : CheckForBufferLeaks();
2477 :
2478 : /* localbuf.c needs a chance too */
2479 342 : AtProcExit_LocalBuffers();
2480 342 : }
2481 :
2482 : /*
2483 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
2484 : *
2485 : * As of PostgreSQL 8.0, buffer pins should get released by the
2486 : * ResourceOwner mechanism. This routine is just a debugging
2487 : * cross-check that no pins remain.
2488 : */
2489 : static void
2490 26560 : CheckForBufferLeaks(void)
2491 : {
2492 : #ifdef USE_ASSERT_CHECKING
2493 26560 : int RefCountErrors = 0;
2494 : PrivateRefCountEntry *res;
2495 : int i;
2496 :
2497 : /* check the array */
2498 239040 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2499 : {
2500 212480 : res = &PrivateRefCountArray[i];
2501 :
2502 212480 : if (res->buffer != InvalidBuffer)
2503 : {
2504 0 : PrintBufferLeakWarning(res->buffer);
2505 0 : RefCountErrors++;
2506 : }
2507 : }
2508 :
2509 : /* if necessary search the hash */
2510 26560 : if (PrivateRefCountOverflowed)
2511 : {
2512 : HASH_SEQ_STATUS hstat;
2513 :
2514 0 : hash_seq_init(&hstat, PrivateRefCountHash);
2515 0 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2516 : {
2517 0 : PrintBufferLeakWarning(res->buffer);
2518 0 : RefCountErrors++;
2519 : }
2520 :
2521 : }
2522 :
2523 26560 : Assert(RefCountErrors == 0);
2524 : #endif
2525 26560 : }
2526 :
2527 : /*
2528 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
2529 : */
2530 : void
2531 0 : PrintBufferLeakWarning(Buffer buffer)
2532 : {
2533 : BufferDesc *buf;
2534 : int32 loccount;
2535 : char *path;
2536 : BackendId backend;
2537 : uint32 buf_state;
2538 :
2539 0 : Assert(BufferIsValid(buffer));
2540 0 : if (BufferIsLocal(buffer))
2541 : {
2542 0 : buf = GetLocalBufferDescriptor(-buffer - 1);
2543 0 : loccount = LocalRefCount[-buffer - 1];
2544 0 : backend = MyBackendId;
2545 : }
2546 : else
2547 : {
2548 0 : buf = GetBufferDescriptor(buffer - 1);
2549 0 : loccount = GetPrivateRefCount(buffer);
2550 0 : backend = InvalidBackendId;
2551 : }
2552 :
2553 : /* theoretically we should lock the bufhdr here */
2554 0 : path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2555 0 : buf_state = pg_atomic_read_u32(&buf->state);
2556 0 : elog(WARNING,
2557 : "buffer refcount leak: [%03d] "
2558 : "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2559 : buffer, path,
2560 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2561 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2562 0 : pfree(path);
2563 0 : }
2564 :
2565 : /*
2566 : * CheckPointBuffers
2567 : *
2568 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2569 : *
2570 : * Note: temporary relations do not participate in checkpoints, so they don't
2571 : * need to be flushed.
2572 : */
2573 : void
2574 11 : CheckPointBuffers(int flags)
2575 : {
2576 : TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2577 11 : CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2578 11 : BufferSync(flags);
2579 11 : CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2580 : TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2581 11 : smgrsync();
2582 11 : CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2583 : TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2584 11 : }
2585 :
2586 :
2587 : /*
2588 : * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2589 : */
2590 : void
2591 9913 : BufmgrCommit(void)
2592 : {
2593 : /* Nothing to do in bufmgr anymore... */
2594 9913 : }
2595 :
2596 : /*
2597 : * BufferGetBlockNumber
2598 : * Returns the block number associated with a buffer.
2599 : *
2600 : * Note:
2601 : * Assumes that the buffer is valid and pinned, else the
2602 : * value may be obsolete immediately...
2603 : */
2604 : BlockNumber
2605 10203702 : BufferGetBlockNumber(Buffer buffer)
2606 : {
2607 : BufferDesc *bufHdr;
2608 :
2609 10203702 : Assert(BufferIsPinned(buffer));
2610 :
2611 10203702 : if (BufferIsLocal(buffer))
2612 93377 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2613 : else
2614 10110325 : bufHdr = GetBufferDescriptor(buffer - 1);
2615 :
2616 : /* pinned, so OK to read tag without spinlock */
2617 10203702 : return bufHdr->tag.blockNum;
2618 : }
2619 :
2620 : /*
2621 : * BufferGetTag
2622 : * Returns the relfilenode, fork number and block number associated with
2623 : * a buffer.
2624 : */
2625 : void
2626 1488047 : BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2627 : BlockNumber *blknum)
2628 : {
2629 : BufferDesc *bufHdr;
2630 :
2631 : /* Do the same checks as BufferGetBlockNumber. */
2632 1488047 : Assert(BufferIsPinned(buffer));
2633 :
2634 1488047 : if (BufferIsLocal(buffer))
2635 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2636 : else
2637 1488047 : bufHdr = GetBufferDescriptor(buffer - 1);
2638 :
2639 : /* pinned, so OK to read tag without spinlock */
2640 1488047 : *rnode = bufHdr->tag.rnode;
2641 1488047 : *forknum = bufHdr->tag.forkNum;
2642 1488047 : *blknum = bufHdr->tag.blockNum;
2643 1488047 : }
2644 :
2645 : /*
2646 : * FlushBuffer
2647 : * Physically write out a shared buffer.
2648 : *
2649 : * NOTE: this actually just passes the buffer contents to the kernel; the
2650 : * real write to disk won't happen until the kernel feels like it. This
2651 : * is okay from our point of view since we can redo the changes from WAL.
2652 : * However, we will need to force the changes to disk via fsync before
2653 : * we can checkpoint WAL.
2654 : *
2655 : * The caller must hold a pin on the buffer and have share-locked the
2656 : * buffer contents. (Note: a share-lock does not prevent updates of
2657 : * hint bits in the buffer, so the page could change while the write
2658 : * is in progress, but we assume that that will not invalidate the data
2659 : * written.)
2660 : *
2661 : * If the caller has an smgr reference for the buffer's relation, pass it
2662 : * as the second parameter. If not, pass NULL.
2663 : */
2664 : static void
2665 8542 : FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2666 : {
2667 : XLogRecPtr recptr;
2668 : ErrorContextCallback errcallback;
2669 : instr_time io_start,
2670 : io_time;
2671 : Block bufBlock;
2672 : char *bufToWrite;
2673 : uint32 buf_state;
2674 :
2675 : /*
2676 : * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2677 : * false, then someone else flushed the buffer before we could, so we need
2678 : * not do anything.
2679 : */
2680 8542 : if (!StartBufferIO(buf, false))
2681 8542 : return;
2682 :
2683 : /* Setup error traceback support for ereport() */
2684 8542 : errcallback.callback = shared_buffer_write_error_callback;
2685 8542 : errcallback.arg = (void *) buf;
2686 8542 : errcallback.previous = error_context_stack;
2687 8542 : error_context_stack = &errcallback;
2688 :
2689 : /* Find smgr relation for buffer */
2690 8542 : if (reln == NULL)
2691 8540 : reln = smgropen(buf->tag.rnode, InvalidBackendId);
2692 :
2693 : TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2694 : buf->tag.blockNum,
2695 : reln->smgr_rnode.node.spcNode,
2696 : reln->smgr_rnode.node.dbNode,
2697 : reln->smgr_rnode.node.relNode);
2698 :
2699 8542 : buf_state = LockBufHdr(buf);
2700 :
2701 : /*
2702 : * Run PageGetLSN while holding header lock, since we don't have the
2703 : * buffer locked exclusively in all cases.
2704 : */
2705 8542 : recptr = BufferGetLSN(buf);
2706 :
2707 : /* To check if block content changes while flushing. - vadim 01/17/97 */
2708 8542 : buf_state &= ~BM_JUST_DIRTIED;
2709 8542 : UnlockBufHdr(buf, buf_state);
2710 :
2711 : /*
2712 : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2713 : * rule that log updates must hit disk before any of the data-file changes
2714 : * they describe do.
2715 : *
2716 : * However, this rule does not apply to unlogged relations, which will be
2717 : * lost after a crash anyway. Most unlogged relation pages do not bear
2718 : * LSNs since we never emit WAL records for them, and therefore flushing
2719 : * up through the buffer LSN would be useless, but harmless. However,
2720 : * GiST indexes use LSNs internally to track page-splits, and therefore
2721 : * unlogged GiST pages bear "fake" LSNs generated by
2722 : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2723 : * LSN counter could advance past the WAL insertion point; and if it did
2724 : * happen, attempting to flush WAL through that location would fail, with
2725 : * disastrous system-wide consequences. To make sure that can't happen,
2726 : * skip the flush if the buffer isn't permanent.
2727 : */
2728 8542 : if (buf_state & BM_PERMANENT)
2729 8542 : XLogFlush(recptr);
2730 :
2731 : /*
2732 : * Now it's safe to write buffer to disk. Note that no one else should
2733 : * have been able to write it while we were busy with log flushing because
2734 : * we have the io_in_progress lock.
2735 : */
2736 8542 : bufBlock = BufHdrGetBlock(buf);
2737 :
2738 : /*
2739              : 	 * Update the page checksum if desired. Since we hold only a shared lock on the
2740 : * buffer, other processes might be updating hint bits in it, so we must
2741 : * copy the page to private storage if we do checksumming.
2742 : */
2743 8542 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2744 :
2745 8542 : if (track_io_timing)
2746 0 : INSTR_TIME_SET_CURRENT(io_start);
2747 :
2748 : /*
2749 : * bufToWrite is either the shared buffer or a copy, as appropriate.
2750 : */
2751 8542 : smgrwrite(reln,
2752 : buf->tag.forkNum,
2753 : buf->tag.blockNum,
2754 : bufToWrite,
2755 : false);
2756 :
2757 8542 : if (track_io_timing)
2758 : {
2759 0 : INSTR_TIME_SET_CURRENT(io_time);
2760 0 : INSTR_TIME_SUBTRACT(io_time, io_start);
2761 0 : pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2762 0 : INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2763 : }
2764 :
2765 8542 : pgBufferUsage.shared_blks_written++;
2766 :
2767 : /*
2768 : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2769 : * end the io_in_progress state.
2770 : */
2771 8542 : TerminateBufferIO(buf, true, 0);
2772 :
2773 : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2774 : buf->tag.blockNum,
2775 : reln->smgr_rnode.node.spcNode,
2776 : reln->smgr_rnode.node.dbNode,
2777 : reln->smgr_rnode.node.relNode);
2778 :
2779 : /* Pop the error context stack */
2780 8542 : error_context_stack = errcallback.previous;
2781 : }
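
The XLogFlush(recptr) just before smgrwrite() is the write-ahead-log rule in miniature:
the WAL describing a page version must be durable before that page version reaches disk,
except for non-permanent buffers whose (possibly fake) LSNs must not be used to drive a
WAL flush. A schematic sketch of that ordering, with hypothetical wal_flush_upto() and
write_page() helpers standing in for XLogFlush() and smgrwrite(), would be:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t SkLsn;                 /* stand-in for XLogRecPtr */

    typedef struct
    {
        SkLsn   page_lsn;                   /* LSN of the last WAL record touching the page */
        bool    is_permanent;               /* unlogged relations skip the WAL flush */
        char    data[8192];
    } SkPage;

    /* Hypothetical helpers; the real code calls XLogFlush() and smgrwrite(). */
    extern void wal_flush_upto(SkLsn lsn);
    extern void write_page(const SkPage *page);

    static void
    sk_flush_page(const SkPage *page)
    {
        /*
         * WAL-before-data: make the log durable up to the page's LSN first, so
         * crash recovery can always redo whatever this page version claims to
         * contain.  Unlogged pages carry no trustworthy LSN, so the flush is
         * skipped for them.
         */
        if (page->is_permanent)
            wal_flush_upto(page->page_lsn);

        write_page(page);
    }
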
2782 :
2783 : /*
2784 : * RelationGetNumberOfBlocksInFork
2785 : * Determines the current number of pages in the specified relation fork.
2786 : */
2787 : BlockNumber
2788 66317 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2789 : {
2790 : /* Open it at the smgr level if not already done */
2791 66317 : RelationOpenSmgr(relation);
2792 :
2793 66317 : return smgrnblocks(relation->rd_smgr, forkNum);
2794 : }
2795 :
2796 : /*
2797 : * BufferIsPermanent
2798 : * Determines whether a buffer will potentially still be around after
2799 : * a crash. Caller must hold a buffer pin.
2800 : */
2801 : bool
2802 735990 : BufferIsPermanent(Buffer buffer)
2803 : {
2804 : BufferDesc *bufHdr;
2805 :
2806 : /* Local buffers are used only for temp relations. */
2807 735990 : if (BufferIsLocal(buffer))
2808 28039 : return false;
2809 :
2810 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
2811 707951 : Assert(BufferIsValid(buffer));
2812 707951 : Assert(BufferIsPinned(buffer));
2813 :
2814 : /*
2815 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2816 : * need not bother with the buffer header spinlock. Even if someone else
2817 : * changes the buffer header state while we're doing this, the state is
2818 : * changed atomically, so we'll read the old value or the new value, but
2819 : * not random garbage.
2820 : */
2821 707951 : bufHdr = GetBufferDescriptor(buffer - 1);
2822 707951 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2823 : }
2824 :
2825 : /*
2826 : * BufferGetLSNAtomic
2827 : * Retrieves the LSN of the buffer atomically using a buffer header lock.
2828 : * This is necessary for some callers who may not have an exclusive lock
2829 : * on the buffer.
2830 : */
2831 : XLogRecPtr
2832 6538 : BufferGetLSNAtomic(Buffer buffer)
2833 : {
2834 6538 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2835 6538 : char *page = BufferGetPage(buffer);
2836 : XLogRecPtr lsn;
2837 : uint32 buf_state;
2838 :
2839 : /*
2840 : * If we don't need locking for correctness, fastpath out.
2841 : */
2842 6538 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2843 6538 : return PageGetLSN(page);
2844 :
2845 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
2846 0 : Assert(BufferIsValid(buffer));
2847 0 : Assert(BufferIsPinned(buffer));
2848 :
2849 0 : buf_state = LockBufHdr(bufHdr);
2850 0 : lsn = PageGetLSN(page);
2851 0 : UnlockBufHdr(bufHdr, buf_state);
2852 :
2853 0 : return lsn;
2854 : }
2855 :
2856 : /* ---------------------------------------------------------------------
2857 : * DropRelFileNodeBuffers
2858 : *
2859 : * This function removes from the buffer pool all the pages of the
2860 : * specified relation fork that have block numbers >= firstDelBlock.
2861 : * (In particular, with firstDelBlock = 0, all pages are removed.)
2862 : * Dirty pages are simply dropped, without bothering to write them
2863 : * out first. Therefore, this is NOT rollback-able, and so should be
2864 : * used only with extreme caution!
2865 : *
2866 : * Currently, this is called only from smgr.c when the underlying file
2867 : * is about to be deleted or truncated (firstDelBlock is needed for
2868 : * the truncation case). The data in the affected pages would therefore
2869 : * be deleted momentarily anyway, and there is no point in writing it.
2870 : * It is the responsibility of higher-level code to ensure that the
2871 : * deletion or truncation does not lose any data that could be needed
2872 : * later. It is also the responsibility of higher-level code to ensure
2873 : * that no other process could be trying to load more pages of the
2874 : * relation into buffers.
2875 : *
2876 : * XXX currently it sequentially searches the buffer pool, should be
2877 : * changed to more clever ways of searching. However, this routine
2878 : * is used only in code paths that aren't very performance-critical,
2879 : * and we shouldn't slow down the hot paths to make it faster ...
2880 : * --------------------------------------------------------------------
2881 : */
2882 : void
2883 46 : DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
2884 : BlockNumber firstDelBlock)
2885 : {
2886 : int i;
2887 :
2888 : /* If it's a local relation, it's localbuf.c's problem. */
2889 46 : if (RelFileNodeBackendIsTemp(rnode))
2890 : {
2891 18 : if (rnode.backend == MyBackendId)
2892 18 : DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
2893 64 : return;
2894 : }
2895 :
2896 458780 : for (i = 0; i < NBuffers; i++)
2897 : {
2898 458752 : BufferDesc *bufHdr = GetBufferDescriptor(i);
2899 : uint32 buf_state;
2900 :
2901 : /*
2902 : * We can make this a tad faster by prechecking the buffer tag before
2903 : * we attempt to lock the buffer; this saves a lot of lock
2904 : * acquisitions in typical cases. It should be safe because the
2905 : * caller must have AccessExclusiveLock on the relation, or some other
2906 : * reason to be certain that no one is loading new pages of the rel
2907 : * into the buffer pool. (Otherwise we might well miss such pages
2908 : * entirely.) Therefore, while the tag might be changing while we
2909 : * look at it, it can't be changing *to* a value we care about, only
2910 : * *away* from such a value. So false negatives are impossible, and
2911 : * false positives are safe because we'll recheck after getting the
2912 : * buffer lock.
2913 : *
2914 : * We could check forkNum and blockNum as well as the rnode, but the
2915 : * incremental win from doing so seems small.
2916 : */
2917 458752 : if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2918 457809 : continue;
2919 :
2920 943 : buf_state = LockBufHdr(bufHdr);
2921 1886 : if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2922 1394 : bufHdr->tag.forkNum == forkNum &&
2923 451 : bufHdr->tag.blockNum >= firstDelBlock)
2924 86 : InvalidateBuffer(bufHdr); /* releases spinlock */
2925 : else
2926 857 : UnlockBufHdr(bufHdr, buf_state);
2927 : }
2928 : }
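
The unlocked tag comparison in the loop above is a precheck-then-recheck pattern: the
racy read may yield false positives, which the re-check under the header lock filters
out, while the caller's AccessExclusiveLock guarantees there can be no false negatives
(nothing can be changing a tag *to* the value we are hunting for). The generic shape of
that loop, with a hypothetical matches()/invalidate() pair and a pthread mutex standing
in for the buffer header spinlock, looks like this:

    #include <pthread.h>
    #include <stdbool.h>

    typedef struct
    {
        pthread_mutex_t lock;               /* stands in for the buffer header spinlock */
        int             key;                /* stands in for the buffer tag */
    } SkSlot;

    /* Hypothetical hooks: tag comparison, and the invalidation work itself. */
    extern bool matches(int key, int target);
    extern void invalidate(SkSlot *slot);   /* entered with slot->lock held; releases it,
                                             * like InvalidateBuffer() does */

    static void
    sk_drop_matching(SkSlot *slots, int nslots, int target)
    {
        for (int i = 0; i < nslots; i++)
        {
            SkSlot *slot = &slots[i];

            /* Racy precheck: cheap, and false negatives are impossible here. */
            if (!matches(slot->key, target))
                continue;

            pthread_mutex_lock(&slot->lock);
            if (matches(slot->key, target))
                invalidate(slot);
            else
                pthread_mutex_unlock(&slot->lock);
        }
    }
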
2929 :
2930 : /* ---------------------------------------------------------------------
2931 : * DropRelFileNodesAllBuffers
2932 : *
2933 : * This function removes from the buffer pool all the pages of all
2934 : * forks of the specified relations. It's equivalent to calling
2935 : * DropRelFileNodeBuffers once per fork per relation with
2936 : * firstDelBlock = 0.
2937 : * --------------------------------------------------------------------
2938 : */
2939 : void
2940 1032 : DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
2941 : {
2942 : int i,
2943 1032 : n = 0;
2944 : RelFileNode *nodes;
2945 : bool use_bsearch;
2946 :
2947 1032 : if (nnodes == 0)
2948 0 : return;
2949 :
2950 1032 : nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
2951 :
2952 : /* If it's a local relation, it's localbuf.c's problem. */
2953 4213 : for (i = 0; i < nnodes; i++)
2954 : {
2955 3181 : if (RelFileNodeBackendIsTemp(rnodes[i]))
2956 : {
2957 605 : if (rnodes[i].backend == MyBackendId)
2958 605 : DropRelFileNodeAllLocalBuffers(rnodes[i].node);
2959 : }
2960 : else
2961 2576 : nodes[n++] = rnodes[i].node;
2962 : }
2963 :
2964 : /*
2965 : * If there are no non-local relations, then we're done. Release the
2966 : * memory and return.
2967 : */
2968 1032 : if (n == 0)
2969 : {
2970 131 : pfree(nodes);
2971 131 : return;
2972 : }
2973 :
2974 : /*
2975              : 	 * For a small number of relations to drop, just use a simple walk-through to
2976              : 	 * save the bsearch overhead. The threshold is more of a guess than an
2977              : 	 * exactly determined value, as it depends on many factors (CPU and RAM
2978              : 	 * speeds, amount of shared buffers, etc.).
2979 : */
2980 901 : use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
2981 :
2982 : /* sort the list of rnodes if necessary */
2983 901 : if (use_bsearch)
2984 8 : pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
2985 :
2986 14762885 : for (i = 0; i < NBuffers; i++)
2987 : {
2988 14761984 : RelFileNode *rnode = NULL;
2989 14761984 : BufferDesc *bufHdr = GetBufferDescriptor(i);
2990 : uint32 buf_state;
2991 :
2992 : /*
2993 : * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
2994 : * and saves some cycles.
2995 : */
2996 :
2997 14761984 : if (!use_bsearch)
2998 : {
2999 : int j;
3000 :
3001 51526806 : for (j = 0; j < n; j++)
3002 : {
3003 36903058 : if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3004 : {
3005 7164 : rnode = &nodes[j];
3006 7164 : break;
3007 : }
3008 : }
3009 : }
3010 : else
3011 : {
3012 131072 : rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3013 : nodes, n, sizeof(RelFileNode),
3014 : rnode_comparator);
3015 : }
3016 :
3017 : /* buffer doesn't belong to any of the given relfilenodes; skip it */
3018 14761984 : if (rnode == NULL)
3019 14754755 : continue;
3020 :
3021 7229 : buf_state = LockBufHdr(bufHdr);
3022 7229 : if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3023 7229 : InvalidateBuffer(bufHdr); /* releases spinlock */
3024 : else
3025 0 : UnlockBufHdr(bufHdr, buf_state);
3026 : }
3027 :
3028 901 : pfree(nodes);
3029 : }
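
DROP_RELS_BSEARCH_THRESHOLD trades a short linear scan per buffer against sorting the
target list once and doing a bsearch per buffer. The same idea in a self-contained form,
with plain ints standing in for RelFileNodes, is sketched below; as the comment above
notes, the threshold value itself is a guess rather than a measured constant:

    #include <stdlib.h>
    #include <stdbool.h>

    #define SK_BSEARCH_THRESHOLD 20

    static int
    sk_cmp_int(const void *a, const void *b)
    {
        int ia = *(const int *) a;
        int ib = *(const int *) b;

        return (ia > ib) - (ia < ib);
    }

    /* Is 'key' one of the n targets?  'targets' must be sorted iff use_bsearch. */
    static bool
    sk_is_target(int key, const int *targets, int n, bool use_bsearch)
    {
        if (!use_bsearch)
        {
            for (int i = 0; i < n; i++)
                if (targets[i] == key)
                    return true;
            return false;
        }
        return bsearch(&key, targets, n, sizeof(int), sk_cmp_int) != NULL;
    }

    /*
     * The caller decides the strategy once, before scanning all buffer slots:
     *
     *     bool use_bsearch = (n > SK_BSEARCH_THRESHOLD);
     *     if (use_bsearch)
     *         qsort(targets, n, sizeof(int), sk_cmp_int);
     *     ...then call sk_is_target() once per buffer.
     */
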
3030 :
3031 : /* ---------------------------------------------------------------------
3032 : * DropDatabaseBuffers
3033 : *
3034 : * This function removes all the buffers in the buffer cache for a
3035 : * particular database. Dirty pages are simply dropped, without
3036 : * bothering to write them out first. This is used when we destroy a
3037 : * database, to avoid trying to flush data to disk when the directory
3038              :  *		tree no longer exists. The implementation is pretty similar to
3039              :  *		DropRelFileNodeBuffers(), which destroys the buffers of just one relation.
3040 : * --------------------------------------------------------------------
3041 : */
3042 : void
3043 0 : DropDatabaseBuffers(Oid dbid)
3044 : {
3045 : int i;
3046 :
3047 : /*
3048 : * We needn't consider local buffers, since by assumption the target
3049 : * database isn't our own.
3050 : */
3051 :
3052 0 : for (i = 0; i < NBuffers; i++)
3053 : {
3054 0 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3055 : uint32 buf_state;
3056 :
3057 : /*
3058 : * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3059 : * and saves some cycles.
3060 : */
3061 0 : if (bufHdr->tag.rnode.dbNode != dbid)
3062 0 : continue;
3063 :
3064 0 : buf_state = LockBufHdr(bufHdr);
3065 0 : if (bufHdr->tag.rnode.dbNode == dbid)
3066 0 : InvalidateBuffer(bufHdr); /* releases spinlock */
3067 : else
3068 0 : UnlockBufHdr(bufHdr, buf_state);
3069 : }
3070 0 : }
3071 :
3072 : /* -----------------------------------------------------------------
3073 : * PrintBufferDescs
3074 : *
3075 : * this function prints all the buffer descriptors, for debugging
3076 : * use only.
3077 : * -----------------------------------------------------------------
3078 : */
3079 : #ifdef NOT_USED
3080 : void
3081 : PrintBufferDescs(void)
3082 : {
3083 : int i;
3084 :
3085 : for (i = 0; i < NBuffers; ++i)
3086 : {
3087 : BufferDesc *buf = GetBufferDescriptor(i);
3088 : Buffer b = BufferDescriptorGetBuffer(buf);
3089 :
3090 : /* theoretically we should lock the bufhdr here */
3091 : elog(LOG,
3092 : "[%02d] (freeNext=%d, rel=%s, "
3093 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
3094 : i, buf->freeNext,
3095 : relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
3096 : buf->tag.blockNum, buf->flags,
3097 : buf->refcount, GetPrivateRefCount(b));
3098 : }
3099 : }
3100 : #endif
3101 :
3102 : #ifdef NOT_USED
3103 : void
3104 : PrintPinnedBufs(void)
3105 : {
3106 : int i;
3107 :
3108 : for (i = 0; i < NBuffers; ++i)
3109 : {
3110 : BufferDesc *buf = GetBufferDescriptor(i);
3111 : Buffer b = BufferDescriptorGetBuffer(buf);
3112 :
3113 : if (GetPrivateRefCount(b) > 0)
3114 : {
3115 : /* theoretically we should lock the bufhdr here */
3116 : elog(LOG,
3117 : "[%02d] (freeNext=%d, rel=%s, "
3118 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
3119 : i, buf->freeNext,
3120 : relpathperm(buf->tag.rnode, buf->tag.forkNum),
3121 : buf->tag.blockNum, buf->flags,
3122 : buf->refcount, GetPrivateRefCount(b));
3123 : }
3124 : }
3125 : }
3126 : #endif
3127 :
3128 : /* ---------------------------------------------------------------------
3129 : * FlushRelationBuffers
3130 : *
3131 : * This function writes all dirty pages of a relation out to disk
3132 : * (or more accurately, out to kernel disk buffers), ensuring that the
3133 : * kernel has an up-to-date view of the relation.
3134 : *
3135 : * Generally, the caller should be holding AccessExclusiveLock on the
3136 : * target relation to ensure that no other backend is busy dirtying
3137 : * more blocks of the relation; the effects can't be expected to last
3138 : * after the lock is released.
3139 : *
3140 : * XXX currently it sequentially searches the buffer pool, should be
3141 : * changed to more clever ways of searching. This routine is not
3142 : * used in any performance-critical code paths, so it's not worth
3143 : * adding additional overhead to normal paths to make it go faster;
3144 : * but see also DropRelFileNodeBuffers.
3145 : * --------------------------------------------------------------------
3146 : */
3147 : void
3148 33 : FlushRelationBuffers(Relation rel)
3149 : {
3150 : int i;
3151 : BufferDesc *bufHdr;
3152 :
3153 : /* Open rel at the smgr level if not already done */
3154 33 : RelationOpenSmgr(rel);
3155 :
3156 33 : if (RelationUsesLocalBuffers(rel))
3157 : {
3158 0 : for (i = 0; i < NLocBuffer; i++)
3159 : {
3160 : uint32 buf_state;
3161 :
3162 0 : bufHdr = GetLocalBufferDescriptor(i);
3163 0 : if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3164 0 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3165 : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3166 : {
3167 : ErrorContextCallback errcallback;
3168 : Page localpage;
3169 :
3170 0 : localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3171 :
3172 : /* Setup error traceback support for ereport() */
3173 0 : errcallback.callback = local_buffer_write_error_callback;
3174 0 : errcallback.arg = (void *) bufHdr;
3175 0 : errcallback.previous = error_context_stack;
3176 0 : error_context_stack = &errcallback;
3177 :
3178 0 : PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3179 :
3180 0 : smgrwrite(rel->rd_smgr,
3181 : bufHdr->tag.forkNum,
3182 : bufHdr->tag.blockNum,
3183 : localpage,
3184 : false);
3185 :
3186 0 : buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3187 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3188 :
3189 : /* Pop the error context stack */
3190 0 : error_context_stack = errcallback.previous;
3191 : }
3192 : }
3193 :
3194 33 : return;
3195 : }
3196 :
3197 : /* Make sure we can handle the pin inside the loop */
3198 33 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3199 :
3200 540705 : for (i = 0; i < NBuffers; i++)
3201 : {
3202 : uint32 buf_state;
3203 :
3204 540672 : bufHdr = GetBufferDescriptor(i);
3205 :
3206 : /*
3207 : * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3208 : * and saves some cycles.
3209 : */
3210 540672 : if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3211 540664 : continue;
3212 :
3213 8 : ReservePrivateRefCountEntry();
3214 :
3215 8 : buf_state = LockBufHdr(bufHdr);
3216 16 : if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3217 8 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3218 : {
3219 2 : PinBuffer_Locked(bufHdr);
3220 2 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3221 2 : FlushBuffer(bufHdr, rel->rd_smgr);
3222 2 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3223 2 : UnpinBuffer(bufHdr, true);
3224 : }
3225 : else
3226 6 : UnlockBufHdr(bufHdr, buf_state);
3227 : }
3228 : }
3229 :
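 : /*
 :  * Illustrative sketch, not part of bufmgr.c: a hypothetical caller of
 :  * FlushRelationBuffers(). The helper name and surrounding logic are
 :  * assumptions; the point is only that the caller takes AccessExclusiveLock
 :  * first, so no other backend can re-dirty blocks behind its back, and only
 :  * then accesses the relation's files at the smgr level.
 :  */
 : #ifdef NOT_USED
 : static void
 : example_flush_before_file_level_access(Relation rel)
 : {
 : 	/* caller is assumed to already hold AccessExclusiveLock on rel */
 : 	FlushRelationBuffers(rel);
 :
 : 	/* from here on, the kernel's view of the relation's files is current */
 : }
 : #endif
 :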
3230 : /* ---------------------------------------------------------------------
3231 : * FlushDatabaseBuffers
3232 : *
3233 : * This function writes all dirty pages of a database out to disk
3234 : * (or more accurately, out to kernel disk buffers), ensuring that the
3235 : * kernel has an up-to-date view of the database.
3236 : *
3237 : * Generally, the caller should be holding an appropriate lock to ensure
3238 : * no other backend is active in the target database; otherwise more
3239 : * pages could get dirtied.
3240 : *
3241 : * Note we don't worry about flushing any pages of temporary relations.
3242 : * It's assumed these wouldn't be interesting.
3243 : * --------------------------------------------------------------------
3244 : */
3245 : void
3246 0 : FlushDatabaseBuffers(Oid dbid)
3247 : {
3248 : int i;
3249 : BufferDesc *bufHdr;
3250 :
3251 : /* Make sure we can handle the pin inside the loop */
3252 0 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3253 :
3254 0 : for (i = 0; i < NBuffers; i++)
3255 : {
3256 : uint32 buf_state;
3257 :
3258 0 : bufHdr = GetBufferDescriptor(i);
3259 :
3260 : /*
3261 : * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3262 : * and saves some cycles.
3263 : */
3264 0 : if (bufHdr->tag.rnode.dbNode != dbid)
3265 0 : continue;
3266 :
3267 0 : ReservePrivateRefCountEntry();
3268 :
3269 0 : buf_state = LockBufHdr(bufHdr);
3270 0 : if (bufHdr->tag.rnode.dbNode == dbid &&
3271 0 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3272 : {
3273 0 : PinBuffer_Locked(bufHdr);
3274 0 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3275 0 : FlushBuffer(bufHdr, NULL);
3276 0 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3277 0 : UnpinBuffer(bufHdr, true);
3278 : }
3279 : else
3280 0 : UnlockBufHdr(bufHdr, buf_state);
3281 : }
3282 0 : }
3283 :
3284 : /*
3285 : * Flush a previously pinned buffer, locked in either shared or exclusive
3286 : * mode, to the OS.
3287 : */
3288 : void
3289 0 : FlushOneBuffer(Buffer buffer)
3290 : {
3291 : BufferDesc *bufHdr;
3292 :
3293 : /* currently not needed, but no fundamental reason not to support */
3294 0 : Assert(!BufferIsLocal(buffer));
3295 :
3296 0 : Assert(BufferIsPinned(buffer));
3297 :
3298 0 : bufHdr = GetBufferDescriptor(buffer - 1);
3299 :
3300 0 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3301 :
3302 0 : FlushBuffer(bufHdr, NULL);
3303 0 : }
3304 :
3305 : /*
3306 : * ReleaseBuffer -- release the pin on a buffer
3307 : */
3308 : void
3309 2964002 : ReleaseBuffer(Buffer buffer)
3310 : {
3311 2964002 : if (!BufferIsValid(buffer))
3312 0 : elog(ERROR, "bad buffer ID: %d", buffer);
3313 :
3314 2964002 : if (BufferIsLocal(buffer))
3315 : {
3316 48394 : ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3317 :
3318 48394 : Assert(LocalRefCount[-buffer - 1] > 0);
3319 48394 : LocalRefCount[-buffer - 1]--;
3320 3012396 : return;
3321 : }
3322 :
3323 2915608 : UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3324 : }
3325 :
3326 : /*
3327 : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3328 : *
3329 : * This is just a shorthand for a common combination.
3330 : */
3331 : void
3332 1386959 : UnlockReleaseBuffer(Buffer buffer)
3333 : {
3334 1386959 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3335 1386959 : ReleaseBuffer(buffer);
3336 1386959 : }
3337 :
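 : /*
 :  * Illustrative sketch, not part of bufmgr.c: the common pin/lock/unpin
 :  * pattern that ReleaseBuffer() and UnlockReleaseBuffer() belong to. The
 :  * helper name and block-number argument are hypothetical; the calls shown
 :  * (ReadBuffer, LockBuffer, BufferGetPage, UnlockReleaseBuffer) are the
 :  * standard entry points declared in bufmgr.h.
 :  */
 : #ifdef NOT_USED
 : static void
 : example_examine_block(Relation rel, BlockNumber blkno)
 : {
 : 	Buffer		buf = ReadBuffer(rel, blkno);
 : 	Page		page;
 :
 : 	LockBuffer(buf, BUFFER_LOCK_SHARE);
 : 	page = BufferGetPage(buf);
 : 	/* ... inspect page contents while the content lock is held ... */
 : 	(void) page;				/* keep the compiler quiet in this sketch */
 : 	UnlockReleaseBuffer(buf);	/* drops the content lock and the pin */
 : }
 : #endif
 :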
3338 : /*
3339 : * IncrBufferRefCount
3340 : * Increment the pin count on a buffer that we have *already* pinned
3341 : * at least once.
3342 : *
3343 : * This function cannot be used on a buffer we do not have pinned,
3344 : * because it doesn't change the shared buffer state.
3345 : */
3346 : void
3347 336983 : IncrBufferRefCount(Buffer buffer)
3348 : {
3349 336983 : Assert(BufferIsPinned(buffer));
3350 336983 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3351 336983 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3352 336983 : if (BufferIsLocal(buffer))
3353 11249 : LocalRefCount[-buffer - 1]++;
3354 : else
3355 : {
3356 : PrivateRefCountEntry *ref;
3357 :
3358 325734 : ref = GetPrivateRefCountEntry(buffer, true);
3359 325734 : Assert(ref != NULL);
3360 325734 : ref->refcount++;
3361 : }
3362 336983 : }
3363 :
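 : /*
 :  * Illustrative sketch, not part of bufmgr.c: a hypothetical caller that
 :  * stores a second reference to a buffer it already has pinned, and
 :  * therefore bumps its private pin count so that each reference can later
 :  * be dropped independently with ReleaseBuffer().
 :  */
 : #ifdef NOT_USED
 : static Buffer
 : example_keep_extra_reference(Buffer buf)
 : {
 : 	Assert(BufferIsValid(buf));		/* must already be pinned by us */
 : 	IncrBufferRefCount(buf);
 : 	return buf;						/* the extra pin needs its own ReleaseBuffer() */
 : }
 : #endif
 :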
3364 : /*
3365 : * MarkBufferDirtyHint
3366 : *
3367 : * Mark a buffer dirty for non-critical changes.
3368 : *
3369 : * This is essentially the same as MarkBufferDirty, except:
3370 : *
3371 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
3372 : * to write an XLOG_FPI WAL record to protect against torn pages.
3373 : * 2. The caller might have only share-lock instead of exclusive-lock on the
3374 : * buffer's content lock.
3375 : * 3. This function does not guarantee that the buffer is always marked dirty
3376 : * (due to a race condition), so it cannot be used for important changes.
3377 : */
3378 : void
3379 747223 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3380 : {
3381 : BufferDesc *bufHdr;
3382 747223 : Page page = BufferGetPage(buffer);
3383 :
3384 747223 : if (!BufferIsValid(buffer))
3385 0 : elog(ERROR, "bad buffer ID: %d", buffer);
3386 :
3387 747223 : if (BufferIsLocal(buffer))
3388 : {
3389 28819 : MarkLocalBufferDirty(buffer);
3390 28819 : return;
3391 : }
3392 :
3393 718404 : bufHdr = GetBufferDescriptor(buffer - 1);
3394 :
3395 718404 : Assert(GetPrivateRefCount(buffer) > 0);
3396 : /* here, either share or exclusive lock is OK */
3397 718404 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3398 :
3399 : /*
3400 : * This routine might get called many times on the same page, if we are
3401 : * making the first scan after commit of an xact that added/deleted many
3402 : * tuples. So, be as quick as we can if the buffer is already dirty. We
3403 : * do this by not acquiring spinlock if it looks like the status bits are
3404 : * do this by not acquiring the spinlock if it looks like the status bits are
3405 : * already set. Since we make this test unlocked, there's a chance we
3406 : * might fail to notice that the flags have just been cleared, and fail
3407 : * to reset them, due to memory-ordering issues. But since this function
3408 : * data would be harmless anyway, it doesn't really matter.
3409 : */
3410 718404 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3411 : (BM_DIRTY | BM_JUST_DIRTIED))
3412 : {
3413 688 : XLogRecPtr lsn = InvalidXLogRecPtr;
3414 688 : bool dirtied = false;
3415 688 : bool delayChkpt = false;
3416 : uint32 buf_state;
3417 :
3418 : /*
3419 : * If we need to protect hint bit updates from torn writes, WAL-log a
3420 : * full page image of the page. This full page image is only necessary
3421 : * if the hint bit update is the first change to the page since the
3422 : * last checkpoint.
3423 : *
3424 : * We don't check full_page_writes here because that logic is included
3425 : * when we call XLogInsert() since the value changes dynamically.
3426 : */
3427 688 : if (XLogHintBitIsNeeded() &&
3428 0 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3429 : {
3430 : /*
3431 : * If we're in recovery we cannot dirty a page because of a hint.
3432 : * We can still set the hint; we just can't dirty the page as a result,
3433 : * so the hint is lost when we evict the page or shut down.
3434 : *
3435 : * See src/backend/storage/page/README for longer discussion.
3436 : */
3437 0 : if (RecoveryInProgress())
3438 0 : return;
3439 :
3440 : /*
3441 : * If the block is already dirty because we either made a change
3442 : * or set a hint already, then we don't need to write a full page
3443 : * image. Note that aggressive cleaning of blocks dirtied by hint
3444 : * bit setting would increase the call rate. Bulk setting of hint
3445 : * bits would reduce the call rate...
3446 : *
3447 : * We must issue the WAL record before we mark the buffer dirty.
3448 : * Otherwise we might write the page before we write the WAL. That
3449 : * causes a race condition, since a checkpoint might occur between
3450 : * writing the WAL record and marking the buffer dirty. We solve
3451 : * that with a kluge, but one that is already in use during
3452 : * transaction commit to prevent race conditions. Basically, we
3453 : * simply prevent the checkpoint WAL record from being written
3454 : * until we have marked the buffer dirty. We don't start the
3455 : * checkpoint flush until we have marked dirty, so our checkpoint
3456 : * must flush the change to disk successfully, or else the checkpoint
3457 : * never gets written and crash recovery will fix things up from WAL.
3458 : *
3459 : * It's possible we may enter here without an xid, so it is
3460 : * essential that CreateCheckpoint waits for virtual transactions
3461 : * rather than full transactionids.
3462 : */
3463 0 : MyPgXact->delayChkpt = delayChkpt = true;
3464 0 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
3465 : }
3466 :
3467 688 : buf_state = LockBufHdr(bufHdr);
3468 :
3469 688 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3470 :
3471 688 : if (!(buf_state & BM_DIRTY))
3472 : {
3473 688 : dirtied = true; /* Means "will be dirtied by this action" */
3474 :
3475 : /*
3476 : * Set the page LSN if we wrote a backup block. We aren't supposed
3477 : * to set this when only holding a share lock but as long as we
3478 : * serialise it somehow we're OK. We choose to set LSN while
3479 : * holding the buffer header lock, which causes any reader of an
3480 : * LSN who holds only a share lock to also obtain a buffer header
3481 : * lock before using PageGetLSN(), which is enforced in
3482 : * BufferGetLSNAtomic().
3483 : *
3484 : * If checksums are enabled, you might think we should reset the
3485 : * checksum here. That will happen when the page is written
3486 : * sometime later in this checkpoint cycle.
3487 : */
3488 688 : if (!XLogRecPtrIsInvalid(lsn))
3489 0 : PageSetLSN(page, lsn);
3490 : }
3491 :
3492 688 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3493 688 : UnlockBufHdr(bufHdr, buf_state);
3494 :
3495 688 : if (delayChkpt)
3496 0 : MyPgXact->delayChkpt = false;
3497 :
3498 688 : if (dirtied)
3499 : {
3500 688 : VacuumPageDirty++;
3501 688 : pgBufferUsage.shared_blks_dirtied++;
3502 688 : if (VacuumCostActive)
3503 21 : VacuumCostBalance += VacuumCostPageDirty;
3504 : }
3505 : }
3506 : }
3507 :
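 : /*
 :  * Illustrative sketch, not part of bufmgr.c: how a hypothetical caller uses
 :  * MarkBufferDirtyHint() after changing only advisory state on a page. Note
 :  * that a share lock on the content lock is sufficient here, unlike for
 :  * MarkBufferDirty(). The helper name is an assumption; the "set a hint"
 :  * step is left as a comment because the actual hint bits live in callers
 :  * such as the heap code.
 :  */
 : #ifdef NOT_USED
 : static void
 : example_set_page_hint(Buffer buf)
 : {
 : 	Page		page;
 :
 : 	LockBuffer(buf, BUFFER_LOCK_SHARE);
 : 	page = BufferGetPage(buf);
 : 	/* ... set some non-critical hint bit in the page or a tuple here ... */
 : 	(void) page;
 : 	MarkBufferDirtyHint(buf, true);		/* true: page has a standard layout */
 : 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 : }
 : #endif
 :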
3508 : /*
3509 : * Release buffer content locks for shared buffers.
3510 : *
3511 : * Used to clean up after errors.
3512 : *
3513 : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3514 : * of releasing buffer content locks per se; the only thing we need to deal
3515 : * with here is clearing any PIN_COUNT request that was in progress.
3516 : */
3517 : void
3518 3971 : UnlockBuffers(void)
3519 : {
3520 3971 : BufferDesc *buf = PinCountWaitBuf;
3521 :
3522 3971 : if (buf)
3523 : {
3524 : uint32 buf_state;
3525 :
3526 0 : buf_state = LockBufHdr(buf);
3527 :
3528 : /*
3529 : * Don't complain if flag bit not set; it could have been reset but we
3530 : * got a cancel/die interrupt before getting the signal.
3531 : */
3532 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3533 0 : buf->wait_backend_pid == MyProcPid)
3534 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
3535 :
3536 0 : UnlockBufHdr(buf, buf_state);
3537 :
3538 0 : PinCountWaitBuf = NULL;
3539 : }
3540 3971 : }
3541 :
3542 : /*
3543 : * Acquire or release the content_lock for the buffer.
3544 : */
3545 : void
3546 11231503 : LockBuffer(Buffer buffer, int mode)
3547 : {
3548 : BufferDesc *buf;
3549 :
3550 11231503 : Assert(BufferIsValid(buffer));
3551 11231503 : if (BufferIsLocal(buffer))
3552 11442551 : return; /* local buffers need no lock */
3553 :
3554 11020455 : buf = GetBufferDescriptor(buffer - 1);
3555 :
3556 11020455 : if (mode == BUFFER_LOCK_UNLOCK)
3557 5612958 : LWLockRelease(BufferDescriptorGetContentLock(buf));
3558 5407497 : else if (mode == BUFFER_LOCK_SHARE)
3559 3846291 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3560 1561206 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
3561 1561206 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3562 : else
3563 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3564 : }
3565 :
3566 : /*
3567 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
3568 : *
3569 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3570 : */
3571 : bool
3572 205396 : ConditionalLockBuffer(Buffer buffer)
3573 : {
3574 : BufferDesc *buf;
3575 :
3576 205396 : Assert(BufferIsValid(buffer));
3577 205396 : if (BufferIsLocal(buffer))
3578 7 : return true; /* act as though we got it */
3579 :
3580 205389 : buf = GetBufferDescriptor(buffer - 1);
3581 :
3582 205389 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3583 : LW_EXCLUSIVE);
3584 : }
3585 :
3586 : /*
3587 : * LockBufferForCleanup - lock a buffer in preparation for deleting items
3588 : *
3589 : * Items may be deleted from a disk page only when the caller (a) holds an
3590 : * exclusive lock on the buffer and (b) has observed that no other backend
3591 : * holds a pin on the buffer. If there is a pin, then the other backend
3592 : * might have a pointer into the buffer (for example, a heapscan reference
3593 : * to an item --- see README for more details). It's OK if a pin is added
3594 : * after the cleanup starts, however; the newly-arrived backend will be
3595 : * unable to look at the page until we release the exclusive lock.
3596 : *
3597 : * To implement this protocol, a would-be deleter must pin the buffer and
3598 : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3599 : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3600 : * it has successfully observed pin count = 1.
3601 : */
3602 : void
3603 1878 : LockBufferForCleanup(Buffer buffer)
3604 : {
3605 : BufferDesc *bufHdr;
3606 :
3607 1878 : Assert(BufferIsValid(buffer));
3608 1878 : Assert(PinCountWaitBuf == NULL);
3609 :
3610 1878 : if (BufferIsLocal(buffer))
3611 : {
3612 : /* There should be exactly one pin */
3613 0 : if (LocalRefCount[-buffer - 1] != 1)
3614 0 : elog(ERROR, "incorrect local pin count: %d",
3615 : LocalRefCount[-buffer - 1]);
3616 : /* Nobody else to wait for */
3617 0 : return;
3618 : }
3619 :
3620 : /* There should be exactly one local pin */
3621 1878 : if (GetPrivateRefCount(buffer) != 1)
3622 0 : elog(ERROR, "incorrect local pin count: %d",
3623 : GetPrivateRefCount(buffer));
3624 :
3625 1878 : bufHdr = GetBufferDescriptor(buffer - 1);
3626 :
3627 : for (;;)
3628 : {
3629 : uint32 buf_state;
3630 :
3631 : /* Try to acquire lock */
3632 1878 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3633 1878 : buf_state = LockBufHdr(bufHdr);
3634 :
3635 1878 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3636 1878 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3637 : {
3638 : /* Successfully acquired exclusive lock with pincount 1 */
3639 1878 : UnlockBufHdr(bufHdr, buf_state);
3640 1878 : return;
3641 : }
3642 : /* Failed, so mark myself as waiting for pincount 1 */
3643 0 : if (buf_state & BM_PIN_COUNT_WAITER)
3644 : {
3645 0 : UnlockBufHdr(bufHdr, buf_state);
3646 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3647 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
3648 : }
3649 0 : bufHdr->wait_backend_pid = MyProcPid;
3650 0 : PinCountWaitBuf = bufHdr;
3651 0 : buf_state |= BM_PIN_COUNT_WAITER;
3652 0 : UnlockBufHdr(bufHdr, buf_state);
3653 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3654 :
3655 : /* Wait to be signaled by UnpinBuffer() */
3656 0 : if (InHotStandby)
3657 : {
3658 : /* Publish the bufid that Startup process waits on */
3659 0 : SetStartupBufferPinWaitBufId(buffer - 1);
3660 : /* Set alarm and then wait to be signaled by UnpinBuffer() */
3661 0 : ResolveRecoveryConflictWithBufferPin();
3662 : /* Reset the published bufid */
3663 0 : SetStartupBufferPinWaitBufId(-1);
3664 : }
3665 : else
3666 0 : ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3667 :
3668 : /*
3669 : * Remove flag marking us as waiter. Normally this will not be set
3670 : * anymore, but ProcWaitForSignal() can return for other signals as
3671 : * well. We take care to only reset the flag if we're the waiter, as
3672 : * theoretically another backend could have started waiting. That's
3673 : * impossible with the current usages due to table level locking, but
3674 : * better be safe.
3675 : */
3676 0 : buf_state = LockBufHdr(bufHdr);
3677 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3678 0 : bufHdr->wait_backend_pid == MyProcPid)
3679 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
3680 0 : UnlockBufHdr(bufHdr, buf_state);
3681 :
3682 0 : PinCountWaitBuf = NULL;
3683 : /* Loop back and try again */
3684 0 : }
3685 : }
3686 :
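 : /*
 :  * Illustrative sketch, not part of bufmgr.c: the would-be deleter protocol
 :  * described above, roughly as VACUUM-like code uses it. The helper name and
 :  * the use of a BufferAccessStrategy are assumptions; the sequence
 :  * pin-then-LockBufferForCleanup() is the documented contract.
 :  */
 : #ifdef NOT_USED
 : static void
 : example_cleanup_block(Relation rel, BlockNumber blkno,
 : 					  BufferAccessStrategy strategy)
 : {
 : 	Buffer		buf;
 :
 : 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);
 : 	LockBufferForCleanup(buf);	/* blocks until we hold the only pin */
 : 	/* ... safe to remove dead items / defragment the page here ... */
 : 	UnlockReleaseBuffer(buf);
 : }
 : #endif
 :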
3687 : /*
3688 : * Check called from RecoveryConflictInterrupt handler when Startup
3689 : * process requests cancellation of all pin holders that are blocking it.
3690 : */
3691 : bool
3692 0 : HoldingBufferPinThatDelaysRecovery(void)
3693 : {
3694 0 : int bufid = GetStartupBufferPinWaitBufId();
3695 :
3696 : /*
3697 : * If we get woken slowly then it's possible that the Startup process was
3698 : * already woken by other backends before we got here. It's also possible
3699 : * that we get here via multiple interrupts or interrupts at inappropriate
3700 : * times, so make sure we do nothing if the bufid is not set.
3701 : */
3702 0 : if (bufid < 0)
3703 0 : return false;
3704 :
3705 0 : if (GetPrivateRefCount(bufid + 1) > 0)
3706 0 : return true;
3707 :
3708 0 : return false;
3709 : }
3710 :
3711 : /*
3712 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3713 : *
3714 : * We won't loop, but just check once to see if the pin count is OK. If
3715 : * not, return FALSE with no lock held.
3716 : */
3717 : bool
3718 8249 : ConditionalLockBufferForCleanup(Buffer buffer)
3719 : {
3720 : BufferDesc *bufHdr;
3721 : uint32 buf_state,
3722 : refcount;
3723 :
3724 8249 : Assert(BufferIsValid(buffer));
3725 :
3726 8249 : if (BufferIsLocal(buffer))
3727 : {
3728 8 : refcount = LocalRefCount[-buffer - 1];
3729 : /* There should be exactly one pin */
3730 8 : Assert(refcount > 0);
3731 8 : if (refcount != 1)
3732 7 : return false;
3733 : /* Nobody else to wait for */
3734 1 : return true;
3735 : }
3736 :
3737 : /* There should be exactly one local pin */
3738 8241 : refcount = GetPrivateRefCount(buffer);
3739 8241 : Assert(refcount);
3740 8241 : if (refcount != 1)
3741 2 : return false;
3742 :
3743 : /* Try to acquire lock */
3744 8239 : if (!ConditionalLockBuffer(buffer))
3745 4 : return false;
3746 :
3747 8235 : bufHdr = GetBufferDescriptor(buffer - 1);
3748 8235 : buf_state = LockBufHdr(bufHdr);
3749 8235 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3750 :
3751 8235 : Assert(refcount > 0);
3752 8235 : if (refcount == 1)
3753 : {
3754 : /* Successfully acquired exclusive lock with pincount 1 */
3755 8229 : UnlockBufHdr(bufHdr, buf_state);
3756 8229 : return true;
3757 : }
3758 :
3759 : /* Failed, so release the lock */
3760 6 : UnlockBufHdr(bufHdr, buf_state);
3761 6 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3762 6 : return false;
3763 : }
3764 :
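 : /*
 :  * Illustrative sketch, not part of bufmgr.c: a hypothetical caller that
 :  * skips a page rather than waiting for a cleanup lock, which is the usual
 :  * reason to prefer ConditionalLockBufferForCleanup() over
 :  * LockBufferForCleanup(). Returns true if cleanup was performed.
 :  */
 : #ifdef NOT_USED
 : static bool
 : example_try_cleanup(Buffer buf)
 : {
 : 	if (!ConditionalLockBufferForCleanup(buf))
 : 	{
 : 		ReleaseBuffer(buf);		/* give up the pin and skip this page */
 : 		return false;
 : 	}
 :
 : 	/* ... cleanup-level changes are allowed here ... */
 : 	UnlockReleaseBuffer(buf);
 : 	return true;
 : }
 : #endif
 :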
3765 : /*
3766 : * IsBufferCleanupOK - as above, but we already have the lock
3767 : *
3768 : * Check whether it's OK to perform cleanup on a buffer we've already
3769 : * locked. If we observe that the pin count is 1, our exclusive lock
3770 : * happens to be a cleanup lock, and we can proceed with anything that
3771 : * would have been allowable had we sought a cleanup lock originally.
3772 : */
3773 : bool
3774 213 : IsBufferCleanupOK(Buffer buffer)
3775 : {
3776 : BufferDesc *bufHdr;
3777 : uint32 buf_state;
3778 :
3779 213 : Assert(BufferIsValid(buffer));
3780 :
3781 213 : if (BufferIsLocal(buffer))
3782 : {
3783 : /* There should be exactly one pin */
3784 0 : if (LocalRefCount[-buffer - 1] != 1)
3785 0 : return false;
3786 : /* Nobody else to wait for */
3787 0 : return true;
3788 : }
3789 :
3790 : /* There should be exactly one local pin */
3791 213 : if (GetPrivateRefCount(buffer) != 1)
3792 0 : return false;
3793 :
3794 213 : bufHdr = GetBufferDescriptor(buffer - 1);
3795 :
3796 : /* caller must hold exclusive lock on buffer */
3797 213 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3798 : LW_EXCLUSIVE));
3799 :
3800 213 : buf_state = LockBufHdr(bufHdr);
3801 :
3802 213 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3803 213 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3804 : {
3805 : /* pincount is OK. */
3806 213 : UnlockBufHdr(bufHdr, buf_state);
3807 213 : return true;
3808 : }
3809 :
3810 0 : UnlockBufHdr(bufHdr, buf_state);
3811 0 : return false;
3812 : }
3813 :
3814 :
3815 : /*
3816 : * Functions for buffer I/O handling
3817 : *
3818 : * Note: We assume that nested buffer I/O never occurs,
3819 : * i.e. at most one io_in_progress lock is held per proc.
3820 : *
3821 : * Also note that these are used only for shared buffers, not local ones.
3822 : */
3823 :
3824 : /*
3825 : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
3826 : */
3827 : static void
3828 0 : WaitIO(BufferDesc *buf)
3829 : {
3830 : /*
3831 : * Changed to wait until there's no IO - Inoue 01/13/2000
3832 : *
3833 : * Note this is *necessary* because an error abort in the process doing
3834 : * I/O could release the io_in_progress_lock prematurely. See
3835 : * AbortBufferIO.
3836 : */
3837 : for (;;)
3838 : {
3839 : uint32 buf_state;
3840 :
3841 : /*
3842 : * It may not be necessary to acquire the spinlock to check the flag
3843 : * here, but since this test is essential for correctness, we'd better
3844 : * play it safe.
3845 : */
3846 0 : buf_state = LockBufHdr(buf);
3847 0 : UnlockBufHdr(buf, buf_state);
3848 :
3849 0 : if (!(buf_state & BM_IO_IN_PROGRESS))
3850 0 : break;
3851 0 : LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3852 0 : LWLockRelease(BufferDescriptorGetIOLock(buf));
3853 0 : }
3854 0 : }
3855 :
3856 : /*
3857 : * StartBufferIO: begin I/O on this buffer
3858 : * (Assumptions)
3859 : * My process is executing no IO
3860 : * The buffer is Pinned
3861 : *
3862 : * In some scenarios there are race conditions in which multiple backends
3863 : * could attempt the same I/O operation concurrently. If someone else
3864 : * has already started I/O on this buffer then we will block on the
3865 : * io_in_progress lock until he's done.
3866 : *
3867 : * Input operations are only attempted on buffers that are not BM_VALID,
3868 : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
3869 : * so we can always tell if the work is already done.
3870 : *
3871 : * Returns TRUE if we successfully marked the buffer as I/O busy,
3872 : * FALSE if someone else already did the work.
3873 : */
3874 : static bool
3875 25747 : StartBufferIO(BufferDesc *buf, bool forInput)
3876 : {
3877 : uint32 buf_state;
3878 :
3879 25747 : Assert(!InProgressBuf);
3880 :
3881 : for (;;)
3882 : {
3883 : /*
3884 : * Grab the io_in_progress lock so that other processes can wait for
3885 : * me to finish the I/O.
3886 : */
3887 25747 : LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3888 :
3889 25747 : buf_state = LockBufHdr(buf);
3890 :
3891 25747 : if (!(buf_state & BM_IO_IN_PROGRESS))
3892 25747 : break;
3893 :
3894 : /*
3895 : * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
3896 : * lock isn't held is if the process doing the I/O is recovering from
3897 : * an error (see AbortBufferIO). If that's the case, we must wait for
3898 : * him to get unwedged.
3899 : */
3900 0 : UnlockBufHdr(buf, buf_state);
3901 0 : LWLockRelease(BufferDescriptorGetIOLock(buf));
3902 0 : WaitIO(buf);
3903 0 : }
3904 :
3905 : /* Once we get here, there is definitely no I/O active on this buffer */
3906 :
3907 25747 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
3908 : {
3909 : /* someone else already did the I/O */
3910 0 : UnlockBufHdr(buf, buf_state);
3911 0 : LWLockRelease(BufferDescriptorGetIOLock(buf));
3912 0 : return false;
3913 : }
3914 :
3915 25747 : buf_state |= BM_IO_IN_PROGRESS;
3916 25747 : UnlockBufHdr(buf, buf_state);
3917 :
3918 25747 : InProgressBuf = buf;
3919 25747 : IsForInput = forInput;
3920 :
3921 25747 : return true;
3922 : }
3923 :
3924 : /*
3925 : * TerminateBufferIO: release a buffer we were doing I/O on
3926 : * (Assumptions)
3927 : * My process is executing IO for the buffer
3928 : * BM_IO_IN_PROGRESS bit is set for the buffer
3929 : * We hold the buffer's io_in_progress lock
3930 : * The buffer is Pinned
3931 : *
3932 : * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
3933 : * buffer's BM_DIRTY flag. This is appropriate when terminating a
3934 : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
3935 : * marking the buffer clean if it was re-dirtied while we were writing.
3936 : *
3937 : * set_flag_bits gets ORed into the buffer's flags. It must include
3938 : * BM_IO_ERROR in a failure case. For successful completion it could
3939 : * be 0, or BM_VALID if we just finished reading in the page.
3940 : */
3941 : static void
3942 25747 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
3943 : {
3944 : uint32 buf_state;
3945 :
3946 25747 : Assert(buf == InProgressBuf);
3947 :
3948 25747 : buf_state = LockBufHdr(buf);
3949 :
3950 25747 : Assert(buf_state & BM_IO_IN_PROGRESS);
3951 :
3952 25747 : buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
3953 25747 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
3954 8542 : buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
3955 :
3956 25747 : buf_state |= set_flag_bits;
3957 25747 : UnlockBufHdr(buf, buf_state);
3958 :
3959 25747 : InProgressBuf = NULL;
3960 :
3961 25747 : LWLockRelease(BufferDescriptorGetIOLock(buf));
3962 25747 : }
3963 :
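 : /*
 :  * Illustrative sketch, not part of bufmgr.c: the read-side I/O protocol the
 :  * two functions above implement, in the spirit of ReadBuffer_common(). The
 :  * helper name and its arguments are assumptions; the point is that only the
 :  * backend that wins StartBufferIO() performs the smgrread() and then marks
 :  * the buffer BM_VALID via TerminateBufferIO().
 :  */
 : #ifdef NOT_USED
 : static void
 : example_read_protocol(BufferDesc *buf, SMgrRelation reln)
 : {
 : 	if (StartBufferIO(buf, true))
 : 	{
 : 		/* we won the right to do the read; others wait in WaitIO() */
 : 		smgrread(reln, buf->tag.forkNum, buf->tag.blockNum,
 : 				 (char *) BufHdrGetBlock(buf));
 : 		TerminateBufferIO(buf, false, BM_VALID);
 : 	}
 : 	/* else: someone else already made the buffer valid */
 : }
 : #endif
 :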
3964 : /*
3965 : * AbortBufferIO: Clean up any active buffer I/O after an error.
3966 : *
3967 : * All LWLocks we might have held have been released,
3968 : * but we haven't yet released buffer pins, so the buffer is still pinned.
3969 : *
3970 : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
3971 : * possible the error condition wasn't related to the I/O.
3972 : */
3973 : void
3974 3971 : AbortBufferIO(void)
3975 : {
3976 3971 : BufferDesc *buf = InProgressBuf;
3977 :
3978 3971 : if (buf)
3979 : {
3980 : uint32 buf_state;
3981 :
3982 : /*
3983 : * Since LWLockReleaseAll has already been called, we're not holding
3984 : * the buffer's io_in_progress_lock. We have to re-acquire it so that
3985 : * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
3986 : * buffer will be in a busy spin until we succeed in doing this.
3987 : */
3988 0 : LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3989 :
3990 0 : buf_state = LockBufHdr(buf);
3991 0 : Assert(buf_state & BM_IO_IN_PROGRESS);
3992 0 : if (IsForInput)
3993 : {
3994 0 : Assert(!(buf_state & BM_DIRTY));
3995 :
3996 : /* We'd better not think buffer is valid yet */
3997 0 : Assert(!(buf_state & BM_VALID));
3998 0 : UnlockBufHdr(buf, buf_state);
3999 : }
4000 : else
4001 : {
4002 0 : Assert(buf_state & BM_DIRTY);
4003 0 : UnlockBufHdr(buf, buf_state);
4004 : /* Issue notice if this is not the first failure... */
4005 0 : if (buf_state & BM_IO_ERROR)
4006 : {
4007 : /* Buffer is pinned, so we can read tag without spinlock */
4008 : char *path;
4009 :
4010 0 : path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4011 0 : ereport(WARNING,
4012 : (errcode(ERRCODE_IO_ERROR),
4013 : errmsg("could not write block %u of %s",
4014 : buf->tag.blockNum, path),
4015 : errdetail("Multiple failures --- write error might be permanent.")));
4016 0 : pfree(path);
4017 : }
4018 : }
4019 0 : TerminateBufferIO(buf, false, BM_IO_ERROR);
4020 : }
4021 3971 : }
4022 :
4023 : /*
4024 : * Error context callback for errors occurring during shared buffer writes.
4025 : */
4026 : static void
4027 0 : shared_buffer_write_error_callback(void *arg)
4028 : {
4029 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
4030 :
4031 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
4032 0 : if (bufHdr != NULL)
4033 : {
4034 0 : char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4035 :
4036 0 : errcontext("writing block %u of relation %s",
4037 : bufHdr->tag.blockNum, path);
4038 0 : pfree(path);
4039 : }
4040 0 : }
4041 :
4042 : /*
4043 : * Error context callback for errors occurring during local buffer writes.
4044 : */
4045 : static void
4046 0 : local_buffer_write_error_callback(void *arg)
4047 : {
4048 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
4049 :
4050 0 : if (bufHdr != NULL)
4051 : {
4052 0 : char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4053 : bufHdr->tag.forkNum);
4054 :
4055 0 : errcontext("writing block %u of relation %s",
4056 : bufHdr->tag.blockNum, path);
4057 0 : pfree(path);
4058 : }
4059 0 : }
4060 :
4061 : /*
4062 : * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4063 : */
4064 : static int
4065 793556 : rnode_comparator(const void *p1, const void *p2)
4066 : {
4067 793556 : RelFileNode n1 = *(RelFileNode *) p1;
4068 793556 : RelFileNode n2 = *(RelFileNode *) p2;
4069 :
4070 793556 : if (n1.relNode < n2.relNode)
4071 768162 : return -1;
4072 25394 : else if (n1.relNode > n2.relNode)
4073 6466 : return 1;
4074 :
4075 18928 : if (n1.dbNode < n2.dbNode)
4076 12 : return -1;
4077 18916 : else if (n1.dbNode > n2.dbNode)
4078 38 : return 1;
4079 :
4080 18878 : if (n1.spcNode < n2.spcNode)
4081 0 : return -1;
4082 18878 : else if (n1.spcNode > n2.spcNode)
4083 0 : return 1;
4084 : else
4085 18878 : return 0;
4086 : }
4087 :
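 : /*
 :  * Illustrative sketch, not part of bufmgr.c: how a qsort/bsearch comparator
 :  * such as rnode_comparator() is typically used to turn an array of
 :  * RelFileNodes into something that supports fast membership tests
 :  * (compare DropRelFileNodesAllBuffers). The helper name and arguments are
 :  * assumptions.
 :  */
 : #ifdef NOT_USED
 : static bool
 : example_rnode_member(RelFileNode *nodes, int nnodes, RelFileNode key)
 : {
 : 	/* sort once ... */
 : 	qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);
 :
 : 	/* ... then binary-search per lookup */
 : 	return bsearch(&key, nodes, nnodes, sizeof(RelFileNode),
 : 				   rnode_comparator) != NULL;
 : }
 : #endif
 :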
4088 : /*
4089 : * Lock buffer header - set BM_LOCKED in buffer state.
4090 : */
4091 : uint32
4092 322304 : LockBufHdr(BufferDesc *desc)
4093 : {
4094 : SpinDelayStatus delayStatus;
4095 : uint32 old_buf_state;
4096 :
4097 322304 : init_local_spin_delay(&delayStatus);
4098 :
4099 : while (true)
4100 : {
4101 : /* set BM_LOCKED flag */
4102 322304 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4103 : /* if it wasn't set before we're OK */
4104 322304 : if (!(old_buf_state & BM_LOCKED))
4105 322304 : break;
4106 0 : perform_spin_delay(&delayStatus);
4107 0 : }
4108 322304 : finish_spin_delay(&delayStatus);
4109 322304 : return old_buf_state | BM_LOCKED;
4110 : }
4111 :
4112 : /*
4113 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4114 : * state at that point.
4115 : *
4116 : * Obviously the buffer could be locked by the time the value is returned, so
4117 : * this is primarily useful in CAS style loops.
4118 : */
4119 : static uint32
4120 0 : WaitBufHdrUnlocked(BufferDesc *buf)
4121 : {
4122 : SpinDelayStatus delayStatus;
4123 : uint32 buf_state;
4124 :
4125 0 : init_local_spin_delay(&delayStatus);
4126 :
4127 0 : buf_state = pg_atomic_read_u32(&buf->state);
4128 :
4129 0 : while (buf_state & BM_LOCKED)
4130 : {
4131 0 : perform_spin_delay(&delayStatus);
4132 0 : buf_state = pg_atomic_read_u32(&buf->state);
4133 : }
4134 :
4135 0 : finish_spin_delay(&delayStatus);
4136 :
4137 0 : return buf_state;
4138 : }
4139 :
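 : /*
 :  * Illustrative sketch, not part of bufmgr.c: the kind of CAS-style loop the
 :  * comment above has in mind, similar in spirit to what MarkBufferDirty()
 :  * and PinBuffer() do. The choice of BM_JUST_DIRTIED as the bit to set is
 :  * arbitrary and only for illustration.
 :  */
 : #ifdef NOT_USED
 : static void
 : example_set_flag_with_cas(BufferDesc *buf)
 : {
 : 	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
 : 	uint32		buf_state;
 :
 : 	for (;;)
 : 	{
 : 		/* if the header is spinlocked, wait for that to clear first */
 : 		if (old_buf_state & BM_LOCKED)
 : 			old_buf_state = WaitBufHdrUnlocked(buf);
 :
 : 		buf_state = old_buf_state | BM_JUST_DIRTIED;
 :
 : 		/* retry if the state changed underneath us */
 : 		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
 : 										   buf_state))
 : 			break;
 : 	}
 : }
 : #endif
 :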
4140 : /*
4141 : * BufferTag comparator.
4142 : */
4143 : static int
4144 23223 : buffertag_comparator(const void *a, const void *b)
4145 : {
4146 23223 : const BufferTag *ba = (const BufferTag *) a;
4147 23223 : const BufferTag *bb = (const BufferTag *) b;
4148 : int ret;
4149 :
4150 23223 : ret = rnode_comparator(&ba->rnode, &bb->rnode);
4151 :
4152 23223 : if (ret != 0)
4153 4410 : return ret;
4154 :
4155 18813 : if (ba->forkNum < bb->forkNum)
4156 721 : return -1;
4157 18092 : if (ba->forkNum > bb->forkNum)
4158 461 : return 1;
4159 :
4160 17631 : if (ba->blockNum < bb->blockNum)
4161 12359 : return -1;
4162 5272 : if (ba->blockNum > bb->blockNum)
4163 5272 : return 1;
4164 :
4165 0 : return 0;
4166 : }
4167 :
4168 : /*
4169 : * Comparator determining the writeout order in a checkpoint.
4170 : *
4171 : * It is important that tablespaces are compared first; the logic balancing
4172 : * writes between tablespaces relies on it.
4173 : */
4174 : static int
4175 111598 : ckpt_buforder_comparator(const void *pa, const void *pb)
4176 : {
4177 111598 : const CkptSortItem *a = (CkptSortItem *) pa;
4178 111598 : const CkptSortItem *b = (CkptSortItem *) pb;
4179 :
4180 : /* compare tablespace */
4181 111598 : if (a->tsId < b->tsId)
4182 161 : return -1;
4183 111437 : else if (a->tsId > b->tsId)
4184 625 : return 1;
4185 : /* compare relation */
4186 110812 : if (a->relNode < b->relNode)
4187 27619 : return -1;
4188 83193 : else if (a->relNode > b->relNode)
4189 30223 : return 1;
4190 : /* compare fork */
4191 52970 : else if (a->forkNum < b->forkNum)
4192 1094 : return -1;
4193 51876 : else if (a->forkNum > b->forkNum)
4194 1402 : return 1;
4195 : /* compare block number */
4196 50474 : else if (a->blockNum < b->blockNum)
4197 25383 : return -1;
4198 : else /* should not be the same block ... */
4199 25091 : return 1;
4200 : }
4201 :
4202 : /*
4203 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4204 : * progress.
4205 : */
4206 : static int
4207 8372 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4208 : {
4209 8372 : CkptTsStatus *sa = (CkptTsStatus *) a;
4210 8372 : CkptTsStatus *sb = (CkptTsStatus *) b;
4211 :
4212 : /* we want a min-heap, so return 1 if a < b */
4213 8372 : if (sa->progress < sb->progress)
4214 8082 : return 1;
4215 290 : else if (sa->progress == sb->progress)
4216 9 : return 0;
4217 : else
4218 281 : return -1;
4219 : }
4220 :
4221 : /*
4222 : * Initialize a writeback context, discarding potential previous state.
4223 : *
4224 : * *max_pending is a pointer instead of an immediate value, so the coalesce
4225 : * limits can easily be changed by the GUC mechanism, and so calling code does
4226 : * not have to check the current configuration. A value of 0 means that no
4227 : * writeback control will be performed.
4228 : */
4229 : void
4230 14 : WritebackContextInit(WritebackContext *context, int *max_pending)
4231 : {
4232 14 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4233 :
4234 14 : context->max_pending = max_pending;
4235 14 : context->nr_pending = 0;
4236 14 : }
4237 :
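 : /*
 :  * Illustrative sketch, not part of bufmgr.c: how a writer loop might drive
 :  * the writeback machinery. Real callers (checkpointer, bgwriter, backends)
 :  * pass the address of a *_flush_after GUC as max_pending; the local limit
 :  * and the helper name used here are assumptions for illustration only.
 :  */
 : #ifdef NOT_USED
 : static void
 : example_writeback_usage(BufferTag *tags, int ntags)
 : {
 : 	WritebackContext wb_context;
 : 	int			max_pending = 32;	/* hypothetical, <= WRITEBACK_MAX_PENDING_FLUSHES */
 : 	int			i;
 :
 : 	WritebackContextInit(&wb_context, &max_pending);
 :
 : 	for (i = 0; i < ntags; i++)
 : 		ScheduleBufferTagForWriteback(&wb_context, &tags[i]);
 :
 : 	/* flush whatever is still pending */
 : 	IssuePendingWritebacks(&wb_context);
 : }
 : #endif
 :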
4238 : /*
4239 : * Add buffer to list of pending writeback requests.
4240 : */
4241 : void
4242 8540 : ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4243 : {
4244 : PendingWriteback *pending;
4245 :
4246 : /*
4247 : * Add buffer to the pending writeback array, unless writeback control is
4248 : * disabled.
4249 : */
4250 8540 : if (*context->max_pending > 0)
4251 : {
4252 8514 : Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4253 :
4254 8514 : pending = &context->pending_writebacks[context->nr_pending++];
4255 :
4256 8514 : pending->tag = *tag;
4257 : }
4258 :
4259 : /*
4260 : * Perform pending flushes if the writeback limit is exceeded. This
4261 : * includes the case where previously an item has been added, but control
4262 : * is now disabled.
4263 : */
4264 8540 : if (context->nr_pending >= *context->max_pending)
4265 289 : IssuePendingWritebacks(context);
4266 8540 : }
4267 :
4268 : /*
4269 : * Issue all pending writeback requests, previously scheduled with
4270 : * ScheduleBufferTagForWriteback, to the OS.
4271 : *
4272 : * Because this is only used to improve the OS's I/O scheduling, we try never
4273 : * to error out - it's just a hint.
4274 : */
4275 : void
4276 297 : IssuePendingWritebacks(WritebackContext *context)
4277 : {
4278 : int i;
4279 :
4280 297 : if (context->nr_pending == 0)
4281 324 : return;
4282 :
4283 : /*
4284 : * Executing the writes in order can make them a lot faster, and allows us to
4285 : * merge writeback requests for consecutive blocks into larger writebacks.
4286 : */
4287 270 : qsort(&context->pending_writebacks, context->nr_pending,
4288 : sizeof(PendingWriteback), buffertag_comparator);
4289 :
4290 : /*
4291 : * Coalesce neighbouring writes, but nothing else. For that we iterate
4292 : * through the now-sorted array of pending flushes, and look ahead to
4293 : * find all neighbouring (or identical) writes.
4294 : */
4295 1747 : for (i = 0; i < context->nr_pending; i++)
4296 : {
4297 : PendingWriteback *cur;
4298 : PendingWriteback *next;
4299 : SMgrRelation reln;
4300 : int ahead;
4301 : BufferTag tag;
4302 1477 : Size nblocks = 1;
4303 :
4304 1477 : cur = &context->pending_writebacks[i];
4305 1477 : tag = cur->tag;
4306 :
4307 : /*
4308 : * Peek ahead, into following writeback requests, to see if they can
4309 : * be combined with the current one.
4310 : */
4311 8514 : for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4312 : {
4313 8244 : next = &context->pending_writebacks[i + ahead + 1];
4314 :
4315 : /* different file, stop */
4316 15744 : if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4317 7500 : cur->tag.forkNum != next->tag.forkNum)
4318 : break;
4319 :
4320 : /* ok, block queued twice, skip */
4321 7107 : if (cur->tag.blockNum == next->tag.blockNum)
4322 0 : continue;
4323 :
4324 : /* only merge consecutive writes */
4325 7107 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
4326 70 : break;
4327 :
4328 7037 : nblocks++;
4329 7037 : cur = next;
4330 : }
4331 :
4332 1477 : i += ahead;
4333 :
4334 : /* and finally tell the kernel to write the data to storage */
4335 1477 : reln = smgropen(tag.rnode, InvalidBackendId);
4336 1477 : smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4337 : }
4338 :
4339 270 : context->nr_pending = 0;
4340 : }
4341 :
4342 :
4343 : /*
4344 : * Implement slower/larger portions of TestForOldSnapshot
4345 : *
4346 : * Smaller/faster portions are put inline, but the entire set of logic is too
4347 : * big for that.
4348 : */
4349 : void
4350 0 : TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4351 : {
4352 0 : if (RelationAllowsEarlyPruning(relation)
4353 0 : && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4354 0 : ereport(ERROR,
4355 : (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4356 : errmsg("snapshot too old")));
4357 0 : }
|