Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * slru.c
4 : * Simple LRU buffering for transaction status logfiles
5 : *
6 : * We use a simple least-recently-used scheme to manage a pool of page
7 : * buffers. Under ordinary circumstances we expect that write
8 : * traffic will occur mostly to the latest page (and to the just-prior
9 : * page, soon after a page transition). Read traffic will probably touch
10 : * a larger span of pages, but in any case a fairly small number of page
11 : * buffers should be sufficient. So, we just search the buffers using plain
12 : * linear search; there's no need for a hashtable or anything fancy.
13 : * The management algorithm is straight LRU except that we will never swap
14 : * out the latest page (since we know it's going to be hit again eventually).
15 : *
16 : * We use a control LWLock to protect the shared data structures, plus
17 : * per-buffer LWLocks that synchronize I/O for each buffer. The control lock
18 : * must be held to examine or modify any shared state. A process that is
19 : * reading in or writing out a page buffer does not hold the control lock,
20 : * only the per-buffer lock for the buffer it is working on.
21 : *
22 : * "Holding the control lock" means exclusive lock in all cases except for
23 : * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for
24 : * the implications of that.
25 : *
26 : * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
27 : * before releasing the control lock. The per-buffer lock is released after
28 : * completing the I/O, re-acquiring the control lock, and updating the shared
29 : * state. (Deadlock is not possible here, because we never try to initiate
30 : * I/O when someone else is already doing I/O on the same buffer.)
31 : * To wait for I/O to complete, release the control lock, acquire the
32 : * per-buffer lock in shared mode, immediately release the per-buffer lock,
33 : * reacquire the control lock, and then recheck state (since arbitrary things
34 : * could have happened while we didn't have the lock).
35 : *
36 : * As with the regular buffer manager, it is possible for another process
37 : * to re-dirty a page that is currently being written out. This is handled
38 : * by re-setting the page's page_dirty flag.
39 : *
40 : *
41 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
42 : * Portions Copyright (c) 1994, Regents of the University of California
43 : *
44 : * src/backend/access/transam/slru.c
45 : *
46 : *-------------------------------------------------------------------------
47 : */
48 : #include "postgres.h"
49 :
50 : #include <fcntl.h>
51 : #include <sys/stat.h>
52 : #include <unistd.h>
53 :
54 : #include "access/slru.h"
55 : #include "access/transam.h"
56 : #include "access/xlog.h"
57 : #include "pgstat.h"
58 : #include "storage/fd.h"
59 : #include "storage/shmem.h"
60 : #include "miscadmin.h"
61 :
62 :
/* Build the path of segment file "seg" inside the SLRU's directory. */
#define SlruFileName(ctl, path, seg) \
	snprintf((path), MAXPGPATH, "%s/%04X", (ctl)->Dir, (seg))

/*
 * SimpleLruFlush() normally touches only one or two physical files, though
 * it may write several pages into each of them.  To batch that I/O, files
 * opened while flushing are left open until control gets back to
 * SimpleLruFlush(); the structure below records which ones those are.
 */
#define MAX_FLUSH_BUFFERS	16

typedef struct SlruFlushData
{
	/* number of valid entries in the two arrays below */
	int			num_files;
	/* descriptors of the files currently held open */
	int			fd[MAX_FLUSH_BUFFERS];
	/* log segment number corresponding to each descriptor */
	int			segno[MAX_FLUSH_BUFFERS];
} SlruFlushData;

typedef SlruFlushData *SlruFlush;
83 :
84 : /*
85 : * Macro to mark a buffer slot "most recently used". Note multiple evaluation
86 : * of arguments!
87 : *
88 : * The reason for the if-test is that there are often many consecutive
89 : * accesses to the same page (particularly the latest page). By suppressing
90 : * useless increments of cur_lru_count, we reduce the probability that old
91 : * pages' counts will "wrap around" and make them appear recently used.
92 : *
93 : * We allow this code to be executed concurrently by multiple processes within
94 : * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic,
95 : * this should not cause any completely-bogus values to enter the computation.
96 : * However, it is possible for either cur_lru_count or individual
97 : * page_lru_count entries to be "reset" to lower values than they should have,
98 : * in case a process is delayed while it executes this macro. With care in
99 : * SlruSelectLRUPage(), this does little harm, and in any case the absolute
100 : * worst possible consequence is a nonoptimal choice of page to evict. The
101 : * gain from allowing concurrent reads of SLRU pages seems worth it.
102 : */
#define SlruRecentlyUsed(shared, slotno) \
	do { \
		int		lru_stamp = (shared)->cur_lru_count; \
		if ((shared)->page_lru_count[slotno] != lru_stamp) { \
			lru_stamp++; \
			(shared)->cur_lru_count = lru_stamp; \
			(shared)->page_lru_count[slotno] = lru_stamp; \
		} \
	} while (0)
111 :
112 : /* Saved info for SlruReportIOError */
typedef enum
{
	SLRU_OPEN_FAILED,			/* could not open the segment file */
	SLRU_SEEK_FAILED,			/* could not seek to the page offset */
	SLRU_READ_FAILED,			/* could not read a full page */
	SLRU_WRITE_FAILED,			/* could not write a full page */
	SLRU_FSYNC_FAILED,			/* could not fsync the segment file */
	SLRU_CLOSE_FAILED			/* could not close the segment file */
} SlruErrorCause;

/* which step failed, as recorded by the routine that hit the failure */
static SlruErrorCause slru_errcause;

/* errno captured at the point of failure */
static int	slru_errno;
125 :
126 :
127 : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
128 : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
129 : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
130 : static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
131 : static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
132 : SlruFlush fdata);
133 : static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
134 : static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
135 :
136 : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
137 : int segpage, void *data);
138 : static void SlruInternalDeleteSegment(SlruCtl ctl, char *filename);
139 :
140 : /*
141 : * Initialization of shared memory
142 : */
143 :
144 : Size
145 105 : SimpleLruShmemSize(int nslots, int nlsns)
146 : {
147 : Size sz;
148 :
149 : /* we assume nslots isn't so large as to risk overflow */
150 105 : sz = MAXALIGN(sizeof(SlruSharedData));
151 105 : sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
152 105 : sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
153 105 : sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
154 105 : sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */
155 105 : sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
156 105 : sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
157 :
158 105 : if (nlsns > 0)
159 15 : sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
160 :
161 105 : return BUFFERALIGN(sz) + BLCKSZ * nslots;
162 : }
163 :
164 : void
165 35 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
166 : LWLock *ctllock, const char *subdir, int tranche_id)
167 : {
168 : SlruShared shared;
169 : bool found;
170 :
171 35 : shared = (SlruShared) ShmemInitStruct(name,
172 : SimpleLruShmemSize(nslots, nlsns),
173 : &found);
174 :
175 35 : if (!IsUnderPostmaster)
176 : {
177 : /* Initialize locks and shared memory area */
178 : char *ptr;
179 : Size offset;
180 : int slotno;
181 :
182 35 : Assert(!found);
183 :
184 35 : memset(shared, 0, sizeof(SlruSharedData));
185 :
186 35 : shared->ControlLock = ctllock;
187 :
188 35 : shared->num_slots = nslots;
189 35 : shared->lsn_groups_per_page = nlsns;
190 :
191 35 : shared->cur_lru_count = 0;
192 :
193 : /* shared->latest_page_number will be set later */
194 :
195 35 : ptr = (char *) shared;
196 35 : offset = MAXALIGN(sizeof(SlruSharedData));
197 35 : shared->page_buffer = (char **) (ptr + offset);
198 35 : offset += MAXALIGN(nslots * sizeof(char *));
199 35 : shared->page_status = (SlruPageStatus *) (ptr + offset);
200 35 : offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
201 35 : shared->page_dirty = (bool *) (ptr + offset);
202 35 : offset += MAXALIGN(nslots * sizeof(bool));
203 35 : shared->page_number = (int *) (ptr + offset);
204 35 : offset += MAXALIGN(nslots * sizeof(int));
205 35 : shared->page_lru_count = (int *) (ptr + offset);
206 35 : offset += MAXALIGN(nslots * sizeof(int));
207 :
208 : /* Initialize LWLocks */
209 35 : shared->buffer_locks = (LWLockPadded *) (ptr + offset);
210 35 : offset += MAXALIGN(nslots * sizeof(LWLockPadded));
211 :
212 35 : if (nlsns > 0)
213 : {
214 5 : shared->group_lsn = (XLogRecPtr *) (ptr + offset);
215 5 : offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
216 : }
217 :
218 35 : Assert(strlen(name) + 1 < SLRU_MAX_NAME_LENGTH);
219 35 : strlcpy(shared->lwlock_tranche_name, name, SLRU_MAX_NAME_LENGTH);
220 35 : shared->lwlock_tranche_id = tranche_id;
221 :
222 35 : ptr += BUFFERALIGN(offset);
223 635 : for (slotno = 0; slotno < nslots; slotno++)
224 : {
225 600 : LWLockInitialize(&shared->buffer_locks[slotno].lock,
226 : shared->lwlock_tranche_id);
227 :
228 600 : shared->page_buffer[slotno] = ptr;
229 600 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
230 600 : shared->page_dirty[slotno] = false;
231 600 : shared->page_lru_count[slotno] = 0;
232 600 : ptr += BLCKSZ;
233 : }
234 :
235 : /* Should fit to estimated shmem size */
236 35 : Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
237 : }
238 : else
239 0 : Assert(found);
240 :
241 : /* Register SLRU tranche in the main tranches array */
242 35 : LWLockRegisterTranche(shared->lwlock_tranche_id,
243 35 : shared->lwlock_tranche_name);
244 :
245 : /*
246 : * Initialize the unshared control struct, including directory path. We
247 : * assume caller set PagePrecedes.
248 : */
249 35 : ctl->shared = shared;
250 35 : ctl->do_fsync = true; /* default behavior */
251 35 : StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir));
252 35 : }
253 :
254 : /*
255 : * Initialize (or reinitialize) a page to zeroes.
256 : *
257 : * The page is not actually written, just set up in shared memory.
258 : * The slot number of the new page is returned.
259 : *
260 : * Control lock must be held at entry, and will be held at exit.
261 : */
262 : int
263 21 : SimpleLruZeroPage(SlruCtl ctl, int pageno)
264 : {
265 21 : SlruShared shared = ctl->shared;
266 : int slotno;
267 :
268 : /* Find a suitable buffer slot for the page */
269 21 : slotno = SlruSelectLRUPage(ctl, pageno);
270 21 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
271 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
272 : !shared->page_dirty[slotno]) ||
273 : shared->page_number[slotno] == pageno);
274 :
275 : /* Mark the slot as containing this page */
276 21 : shared->page_number[slotno] = pageno;
277 21 : shared->page_status[slotno] = SLRU_PAGE_VALID;
278 21 : shared->page_dirty[slotno] = true;
279 21 : SlruRecentlyUsed(shared, slotno);
280 :
281 : /* Set the buffer to zeroes */
282 21 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
283 :
284 : /* Set the LSNs for this new page to zero */
285 21 : SimpleLruZeroLSNs(ctl, slotno);
286 :
287 : /* Assume this page is now the latest active page */
288 21 : shared->latest_page_number = pageno;
289 :
290 21 : return slotno;
291 : }
292 :
293 : /*
294 : * Zero all the LSNs we store for this slru page.
295 : *
296 : * This should be called each time we create a new page, and each time we read
297 : * in a page from disk into an existing buffer. (Such an old page cannot
298 : * have any interesting LSNs, since we'd have flushed them before writing
299 : * the page in the first place.)
300 : *
301 : * This assumes that InvalidXLogRecPtr is bitwise-all-0.
302 : */
303 : static void
304 25 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
305 : {
306 25 : SlruShared shared = ctl->shared;
307 :
308 25 : if (shared->lsn_groups_per_page > 0)
309 4 : MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
310 : shared->lsn_groups_per_page * sizeof(XLogRecPtr));
311 25 : }
312 :
313 : /*
314 : * Wait for any active I/O on a page slot to finish. (This does not
315 : * guarantee that new I/O hasn't been started before we return, though.
316 : * In fact the slot might not even contain the same page anymore.)
317 : *
318 : * Control lock must be held at entry, and will be held at exit.
319 : */
320 : static void
321 0 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
322 : {
323 0 : SlruShared shared = ctl->shared;
324 :
325 : /* See notes at top of file */
326 0 : LWLockRelease(shared->ControlLock);
327 0 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
328 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
329 0 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
330 :
331 : /*
332 : * If the slot is still in an io-in-progress state, then either someone
333 : * already started a new I/O on the slot, or a previous I/O failed and
334 : * neglected to reset the page state. That shouldn't happen, really, but
335 : * it seems worth a few extra cycles to check and recover from it. We can
336 : * cheaply test for failure by seeing if the buffer lock is still held (we
337 : * assume that transaction abort would release the lock).
338 : */
339 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
340 0 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
341 : {
342 0 : if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
343 : {
344 : /* indeed, the I/O must have failed */
345 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
346 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
347 : else /* write_in_progress */
348 : {
349 0 : shared->page_status[slotno] = SLRU_PAGE_VALID;
350 0 : shared->page_dirty[slotno] = true;
351 : }
352 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
353 : }
354 : }
355 0 : }
356 :
357 : /*
358 : * Find a page in a shared buffer, reading it in if necessary.
359 : * The page number must correspond to an already-initialized page.
360 : *
361 : * If write_ok is true then it is OK to return a page that is in
362 : * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
363 : * that modification of the page is safe. If write_ok is false then we
364 : * will not return the page until it is not undergoing active I/O.
365 : *
366 : * The passed-in xid is used only for error reporting, and may be
367 : * InvalidTransactionId if no specific xid is associated with the action.
368 : *
369 : * Return value is the shared-buffer slot number now holding the page.
370 : * The buffer's LRU access info is updated.
371 : *
372 : * Control lock must be held at entry, and will be held at exit.
373 : */
374 : int
375 10680 : SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
376 : TransactionId xid)
377 : {
378 10680 : SlruShared shared = ctl->shared;
379 :
380 : /* Outer loop handles restart if we must wait for someone else's I/O */
381 : for (;;)
382 : {
383 : int slotno;
384 : bool ok;
385 :
386 : /* See if page already is in memory; if not, pick victim slot */
387 10680 : slotno = SlruSelectLRUPage(ctl, pageno);
388 :
389 : /* Did we find the page in memory? */
390 21360 : if (shared->page_number[slotno] == pageno &&
391 10680 : shared->page_status[slotno] != SLRU_PAGE_EMPTY)
392 : {
393 : /*
394 : * If page is still being read in, we must wait for I/O. Likewise
395 : * if the page is being written and the caller said that's not OK.
396 : */
397 21352 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
398 10676 : (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
399 : !write_ok))
400 : {
401 0 : SimpleLruWaitIO(ctl, slotno);
402 : /* Now we must recheck state from the top */
403 0 : continue;
404 : }
405 : /* Otherwise, it's ready to use */
406 10676 : SlruRecentlyUsed(shared, slotno);
407 10676 : return slotno;
408 : }
409 :
410 : /* We found no match; assert we selected a freeable slot */
411 4 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
412 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
413 : !shared->page_dirty[slotno]));
414 :
415 : /* Mark the slot read-busy */
416 4 : shared->page_number[slotno] = pageno;
417 4 : shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
418 4 : shared->page_dirty[slotno] = false;
419 :
420 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
421 4 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
422 :
423 : /* Release control lock while doing I/O */
424 4 : LWLockRelease(shared->ControlLock);
425 :
426 : /* Do the read */
427 4 : ok = SlruPhysicalReadPage(ctl, pageno, slotno);
428 :
429 : /* Set the LSNs for this newly read-in page to zero */
430 4 : SimpleLruZeroLSNs(ctl, slotno);
431 :
432 : /* Re-acquire control lock and update page state */
433 4 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
434 :
435 4 : Assert(shared->page_number[slotno] == pageno &&
436 : shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
437 : !shared->page_dirty[slotno]);
438 :
439 4 : shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
440 :
441 4 : LWLockRelease(&shared->buffer_locks[slotno].lock);
442 :
443 : /* Now it's okay to ereport if we failed */
444 4 : if (!ok)
445 0 : SlruReportIOError(ctl, pageno, xid);
446 :
447 4 : SlruRecentlyUsed(shared, slotno);
448 4 : return slotno;
449 0 : }
450 : }
451 :
452 : /*
453 : * Find a page in a shared buffer, reading it in if necessary.
454 : * The page number must correspond to an already-initialized page.
455 : * The caller must intend only read-only access to the page.
456 : *
457 : * The passed-in xid is used only for error reporting, and may be
458 : * InvalidTransactionId if no specific xid is associated with the action.
459 : *
460 : * Return value is the shared-buffer slot number now holding the page.
461 : * The buffer's LRU access info is updated.
462 : *
463 : * Control lock must NOT be held at entry, but will be held at exit.
464 : * It is unspecified whether the lock will be shared or exclusive.
465 : */
466 : int
467 30585 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
468 : {
469 30585 : SlruShared shared = ctl->shared;
470 : int slotno;
471 :
472 : /* Try to find the page while holding only shared lock */
473 30585 : LWLockAcquire(shared->ControlLock, LW_SHARED);
474 :
475 : /* See if page is already in a buffer */
476 30589 : for (slotno = 0; slotno < shared->num_slots; slotno++)
477 : {
478 61174 : if (shared->page_number[slotno] == pageno &&
479 61170 : shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
480 30585 : shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
481 : {
482 : /* See comments for SlruRecentlyUsed macro */
483 30585 : SlruRecentlyUsed(shared, slotno);
484 30585 : return slotno;
485 : }
486 : }
487 :
488 : /* No luck, so switch to normal exclusive lock and do regular read */
489 0 : LWLockRelease(shared->ControlLock);
490 0 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
491 :
492 0 : return SimpleLruReadPage(ctl, pageno, true, xid);
493 : }
494 :
495 : /*
496 : * Write a page from a shared buffer, if necessary.
497 : * Does nothing if the specified slot is not dirty.
498 : *
499 : * NOTE: only one write attempt is made here. Hence, it is possible that
500 : * the page is still dirty at exit (if someone else re-dirtied it during
501 : * the write). However, we *do* attempt a fresh write even if the page
502 : * is already being written; this is for checkpoints.
503 : *
504 : * Control lock must be held at entry, and will be held at exit.
505 : */
506 : static void
507 1465 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
508 : {
509 1465 : SlruShared shared = ctl->shared;
510 1465 : int pageno = shared->page_number[slotno];
511 : bool ok;
512 :
513 : /* If a write is in progress, wait for it to finish */
514 2930 : while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
515 0 : shared->page_number[slotno] == pageno)
516 : {
517 0 : SimpleLruWaitIO(ctl, slotno);
518 : }
519 :
520 : /*
521 : * Do nothing if page is not dirty, or if buffer no longer contains the
522 : * same page we were called for.
523 : */
524 1495 : if (!shared->page_dirty[slotno] ||
525 60 : shared->page_status[slotno] != SLRU_PAGE_VALID ||
526 30 : shared->page_number[slotno] != pageno)
527 2900 : return;
528 :
529 : /*
530 : * Mark the slot write-busy, and clear the dirtybit. After this point, a
531 : * transaction status update on this page will mark it dirty again.
532 : */
533 30 : shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
534 30 : shared->page_dirty[slotno] = false;
535 :
536 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
537 30 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
538 :
539 : /* Release control lock while doing I/O */
540 30 : LWLockRelease(shared->ControlLock);
541 :
542 : /* Do the write */
543 30 : ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
544 :
545 : /* If we failed, and we're in a flush, better close the files */
546 30 : if (!ok && fdata)
547 : {
548 : int i;
549 :
550 0 : for (i = 0; i < fdata->num_files; i++)
551 0 : CloseTransientFile(fdata->fd[i]);
552 : }
553 :
554 : /* Re-acquire control lock and update page state */
555 30 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
556 :
557 30 : Assert(shared->page_number[slotno] == pageno &&
558 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
559 :
560 : /* If we failed to write, mark the page dirty again */
561 30 : if (!ok)
562 0 : shared->page_dirty[slotno] = true;
563 :
564 30 : shared->page_status[slotno] = SLRU_PAGE_VALID;
565 :
566 30 : LWLockRelease(&shared->buffer_locks[slotno].lock);
567 :
568 : /* Now it's okay to ereport if we failed */
569 30 : if (!ok)
570 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
571 : }
572 :
573 : /*
574 : * Wrapper of SlruInternalWritePage, for external callers.
575 : * fdata is always passed a NULL here.
576 : */
577 : void
578 9 : SimpleLruWritePage(SlruCtl ctl, int slotno)
579 : {
580 9 : SlruInternalWritePage(ctl, slotno, NULL);
581 9 : }
582 :
583 : /*
584 : * Return whether the given page exists on disk.
585 : *
586 : * A false return means that either the file does not exist, or that it's not
587 : * large enough to contain the given page.
588 : */
589 : bool
590 0 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno)
591 : {
592 0 : int segno = pageno / SLRU_PAGES_PER_SEGMENT;
593 0 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
594 0 : int offset = rpageno * BLCKSZ;
595 : char path[MAXPGPATH];
596 : int fd;
597 : bool result;
598 : off_t endpos;
599 :
600 0 : SlruFileName(ctl, path, segno);
601 :
602 0 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
603 0 : if (fd < 0)
604 : {
605 : /* expected: file doesn't exist */
606 0 : if (errno == ENOENT)
607 0 : return false;
608 :
609 : /* report error normally */
610 0 : slru_errcause = SLRU_OPEN_FAILED;
611 0 : slru_errno = errno;
612 0 : SlruReportIOError(ctl, pageno, 0);
613 : }
614 :
615 0 : if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
616 : {
617 0 : slru_errcause = SLRU_OPEN_FAILED;
618 0 : slru_errno = errno;
619 0 : SlruReportIOError(ctl, pageno, 0);
620 : }
621 :
622 0 : result = endpos >= (off_t) (offset + BLCKSZ);
623 :
624 0 : CloseTransientFile(fd);
625 0 : return result;
626 : }
627 :
628 : /*
629 : * Physical read of a (previously existing) page into a buffer slot
630 : *
631 : * On failure, we cannot just ereport(ERROR) since caller has put state in
632 : * shared memory that must be undone. So, we return FALSE and save enough
633 : * info in static variables to let SlruReportIOError make the report.
634 : *
635 : * For now, assume it's not worth keeping a file pointer open across
636 : * read/write operations. We could cache one virtual file pointer ...
637 : */
638 : static bool
639 4 : SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
640 : {
641 4 : SlruShared shared = ctl->shared;
642 4 : int segno = pageno / SLRU_PAGES_PER_SEGMENT;
643 4 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
644 4 : int offset = rpageno * BLCKSZ;
645 : char path[MAXPGPATH];
646 : int fd;
647 :
648 4 : SlruFileName(ctl, path, segno);
649 :
650 : /*
651 : * In a crash-and-restart situation, it's possible for us to receive
652 : * commands to set the commit status of transactions whose bits are in
653 : * already-truncated segments of the commit log (see notes in
654 : * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
655 : * where the file doesn't exist, and return zeroes instead.
656 : */
657 4 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
658 4 : if (fd < 0)
659 : {
660 0 : if (errno != ENOENT || !InRecovery)
661 : {
662 0 : slru_errcause = SLRU_OPEN_FAILED;
663 0 : slru_errno = errno;
664 0 : return false;
665 : }
666 :
667 0 : ereport(LOG,
668 : (errmsg("file \"%s\" doesn't exist, reading as zeroes",
669 : path)));
670 0 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
671 0 : return true;
672 : }
673 :
674 4 : if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
675 : {
676 0 : slru_errcause = SLRU_SEEK_FAILED;
677 0 : slru_errno = errno;
678 0 : CloseTransientFile(fd);
679 0 : return false;
680 : }
681 :
682 4 : errno = 0;
683 4 : pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
684 4 : if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
685 : {
686 0 : pgstat_report_wait_end();
687 0 : slru_errcause = SLRU_READ_FAILED;
688 0 : slru_errno = errno;
689 0 : CloseTransientFile(fd);
690 0 : return false;
691 : }
692 4 : pgstat_report_wait_end();
693 :
694 4 : if (CloseTransientFile(fd))
695 : {
696 0 : slru_errcause = SLRU_CLOSE_FAILED;
697 0 : slru_errno = errno;
698 0 : return false;
699 : }
700 :
701 4 : return true;
702 : }
703 :
704 : /*
705 : * Physical write of a page from a buffer slot
706 : *
707 : * On failure, we cannot just ereport(ERROR) since caller has put state in
708 : * shared memory that must be undone. So, we return FALSE and save enough
709 : * info in static variables to let SlruReportIOError make the report.
710 : *
711 : * For now, assume it's not worth keeping a file pointer open across
712 : * independent read/write operations. We do batch operations during
713 : * SimpleLruFlush, though.
714 : *
715 : * fdata is NULL for a standalone write, pointer to open-file info during
716 : * SimpleLruFlush.
717 : */
718 : static bool
719 30 : SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
720 : {
721 30 : SlruShared shared = ctl->shared;
722 30 : int segno = pageno / SLRU_PAGES_PER_SEGMENT;
723 30 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
724 30 : int offset = rpageno * BLCKSZ;
725 : char path[MAXPGPATH];
726 30 : int fd = -1;
727 :
728 : /*
729 : * Honor the write-WAL-before-data rule, if appropriate, so that we do not
730 : * write out data before associated WAL records. This is the same action
731 : * performed during FlushBuffer() in the main buffer manager.
732 : */
733 30 : if (shared->group_lsn != NULL)
734 : {
735 : /*
736 : * We must determine the largest async-commit LSN for the page. This
737 : * is a bit tedious, but since this entire function is a slow path
738 : * anyway, it seems better to do this here than to maintain a per-page
739 : * LSN variable (which'd need an extra comparison in the
740 : * transaction-commit path).
741 : */
742 : XLogRecPtr max_lsn;
743 : int lsnindex,
744 : lsnoff;
745 :
746 9 : lsnindex = slotno * shared->lsn_groups_per_page;
747 9 : max_lsn = shared->group_lsn[lsnindex++];
748 9216 : for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
749 : {
750 9207 : XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
751 :
752 9207 : if (max_lsn < this_lsn)
753 79 : max_lsn = this_lsn;
754 : }
755 :
756 9 : if (!XLogRecPtrIsInvalid(max_lsn))
757 : {
758 : /*
759 : * As noted above, elog(ERROR) is not acceptable here, so if
760 : * XLogFlush were to fail, we must PANIC. This isn't much of a
761 : * restriction because XLogFlush is just about all critical
762 : * section anyway, but let's make sure.
763 : */
764 4 : START_CRIT_SECTION();
765 4 : XLogFlush(max_lsn);
766 4 : END_CRIT_SECTION();
767 : }
768 : }
769 :
770 : /*
771 : * During a Flush, we may already have the desired file open.
772 : */
773 30 : if (fdata)
774 : {
775 : int i;
776 :
777 21 : for (i = 0; i < fdata->num_files; i++)
778 : {
779 4 : if (fdata->segno[i] == segno)
780 : {
781 4 : fd = fdata->fd[i];
782 4 : break;
783 : }
784 : }
785 : }
786 :
787 30 : if (fd < 0)
788 : {
789 : /*
790 : * If the file doesn't already exist, we should create it. It is
791 : * possible for this to need to happen when writing a page that's not
792 : * first in its segment; we assume the OS can cope with that. (Note:
793 : * it might seem that it'd be okay to create files only when
794 : * SimpleLruZeroPage is called for the first page of a segment.
795 : * However, if after a crash and restart the REDO logic elects to
796 : * replay the log from a checkpoint before the latest one, then it's
797 : * possible that we will get commands to set transaction status of
798 : * transactions that have already been truncated from the commit log.
799 : * Easiest way to deal with that is to accept references to
800 : * nonexistent files here and in SlruPhysicalReadPage.)
801 : *
802 : * Note: it is possible for more than one backend to be executing this
803 : * code simultaneously for different pages of the same file. Hence,
804 : * don't use O_EXCL or O_TRUNC or anything like that.
805 : */
806 26 : SlruFileName(ctl, path, segno);
807 26 : fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY,
808 : S_IRUSR | S_IWUSR);
809 26 : if (fd < 0)
810 : {
811 0 : slru_errcause = SLRU_OPEN_FAILED;
812 0 : slru_errno = errno;
813 0 : return false;
814 : }
815 :
816 26 : if (fdata)
817 : {
818 17 : if (fdata->num_files < MAX_FLUSH_BUFFERS)
819 : {
820 17 : fdata->fd[fdata->num_files] = fd;
821 17 : fdata->segno[fdata->num_files] = segno;
822 17 : fdata->num_files++;
823 : }
824 : else
825 : {
826 : /*
827 : * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
828 : * fall back to treating it as a standalone write.
829 : */
830 0 : fdata = NULL;
831 : }
832 : }
833 : }
834 :
835 30 : if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
836 : {
837 0 : slru_errcause = SLRU_SEEK_FAILED;
838 0 : slru_errno = errno;
839 0 : if (!fdata)
840 0 : CloseTransientFile(fd);
841 0 : return false;
842 : }
843 :
844 30 : errno = 0;
845 30 : pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
846 30 : if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
847 : {
848 0 : pgstat_report_wait_end();
849 : /* if write didn't set errno, assume problem is no disk space */
850 0 : if (errno == 0)
851 0 : errno = ENOSPC;
852 0 : slru_errcause = SLRU_WRITE_FAILED;
853 0 : slru_errno = errno;
854 0 : if (!fdata)
855 0 : CloseTransientFile(fd);
856 0 : return false;
857 : }
858 30 : pgstat_report_wait_end();
859 :
860 : /*
861 : * If not part of Flush, need to fsync now. We assume this happens
862 : * infrequently enough that it's not a performance issue.
863 : */
864 30 : if (!fdata)
865 : {
866 9 : pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
867 9 : if (ctl->do_fsync && pg_fsync(fd))
868 : {
869 0 : pgstat_report_wait_end();
870 0 : slru_errcause = SLRU_FSYNC_FAILED;
871 0 : slru_errno = errno;
872 0 : CloseTransientFile(fd);
873 0 : return false;
874 : }
875 9 : pgstat_report_wait_end();
876 :
877 9 : if (CloseTransientFile(fd))
878 : {
879 0 : slru_errcause = SLRU_CLOSE_FAILED;
880 0 : slru_errno = errno;
881 0 : return false;
882 : }
883 : }
884 :
885 30 : return true;
886 : }
887 :
888 : /*
889 : * Issue the error message after failure of SlruPhysicalReadPage or
890 : * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
891 : */
892 : static void
893 0 : SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
894 : {
895 0 : int segno = pageno / SLRU_PAGES_PER_SEGMENT;
896 0 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
897 0 : int offset = rpageno * BLCKSZ;
898 : char path[MAXPGPATH];
899 :
900 0 : SlruFileName(ctl, path, segno);
901 0 : errno = slru_errno;
902 0 : switch (slru_errcause)
903 : {
904 : case SLRU_OPEN_FAILED:
905 0 : ereport(ERROR,
906 : (errcode_for_file_access(),
907 : errmsg("could not access status of transaction %u", xid),
908 : errdetail("Could not open file \"%s\": %m.", path)));
909 : break;
910 : case SLRU_SEEK_FAILED:
911 0 : ereport(ERROR,
912 : (errcode_for_file_access(),
913 : errmsg("could not access status of transaction %u", xid),
914 : errdetail("Could not seek in file \"%s\" to offset %u: %m.",
915 : path, offset)));
916 : break;
917 : case SLRU_READ_FAILED:
918 0 : ereport(ERROR,
919 : (errcode_for_file_access(),
920 : errmsg("could not access status of transaction %u", xid),
921 : errdetail("Could not read from file \"%s\" at offset %u: %m.",
922 : path, offset)));
923 : break;
924 : case SLRU_WRITE_FAILED:
925 0 : ereport(ERROR,
926 : (errcode_for_file_access(),
927 : errmsg("could not access status of transaction %u", xid),
928 : errdetail("Could not write to file \"%s\" at offset %u: %m.",
929 : path, offset)));
930 : break;
931 : case SLRU_FSYNC_FAILED:
932 0 : ereport(ERROR,
933 : (errcode_for_file_access(),
934 : errmsg("could not access status of transaction %u", xid),
935 : errdetail("Could not fsync file \"%s\": %m.",
936 : path)));
937 : break;
938 : case SLRU_CLOSE_FAILED:
939 0 : ereport(ERROR,
940 : (errcode_for_file_access(),
941 : errmsg("could not access status of transaction %u", xid),
942 : errdetail("Could not close file \"%s\": %m.",
943 : path)));
944 : break;
945 : default:
946 : /* can't get here, we trust */
947 0 : elog(ERROR, "unrecognized SimpleLru error cause: %d",
948 : (int) slru_errcause);
949 : break;
950 : }
951 : }
952 :
953 : /*
954 : * Select the slot to re-use when we need a free slot.
955 : *
956 : * The target page number is passed because we need to consider the
957 : * possibility that some other process reads in the target page while
958 : * we are doing I/O to free a slot. Hence, check or recheck to see if
959 : * any slot already holds the target page, and return that slot if so.
960 : * Thus, the returned slot is *either* a slot already holding the pageno
961 : * (could be any state except EMPTY), *or* a freeable slot (state EMPTY
962 : * or CLEAN).
963 : *
964 : * Control lock must be held at entry, and will be held at exit.
965 : */
966 : static int
967 10701 : SlruSelectLRUPage(SlruCtl ctl, int pageno)
968 : {
969 10701 : SlruShared shared = ctl->shared;
970 :
971 : /* Outer loop handles restart after I/O */
972 : for (;;)
973 : {
974 : int slotno;
975 : int cur_count;
976 10701 : int bestvalidslot = 0; /* keep compiler quiet */
977 10701 : int best_valid_delta = -1;
978 10701 : int best_valid_page_number = 0; /* keep compiler quiet */
979 10701 : int bestinvalidslot = 0; /* keep compiler quiet */
980 10701 : int best_invalid_delta = -1;
981 10701 : int best_invalid_page_number = 0; /* keep compiler quiet */
982 :
983 : /* See if page already has a buffer assigned */
984 11310 : for (slotno = 0; slotno < shared->num_slots; slotno++)
985 : {
986 22257 : if (shared->page_number[slotno] == pageno &&
987 10968 : shared->page_status[slotno] != SLRU_PAGE_EMPTY)
988 10680 : return slotno;
989 : }
990 :
991 : /*
992 : * If we find any EMPTY slot, just select that one. Else choose a
993 : * victim page to replace. We normally take the least recently used
994 : * valid page, but we will never take the slot containing
995 : * latest_page_number, even if it appears least recently used. We
996 : * will select a slot that is already I/O busy only if there is no
997 : * other choice: a read-busy slot will not be least recently used once
998 : * the read finishes, and waiting for an I/O on a write-busy slot is
999 : * inferior to just picking some other slot. Testing shows the slot
1000 : * we pick instead will often be clean, allowing us to begin a read at
1001 : * once.
1002 : *
1003 : * Normally the page_lru_count values will all be different and so
1004 : * there will be a well-defined LRU page. But since we allow
1005 : * concurrent execution of SlruRecentlyUsed() within
1006 : * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1007 : * acquire the same lru_count values. In that case we break ties by
1008 : * choosing the furthest-back page.
1009 : *
1010 : * Notice that this next line forcibly advances cur_lru_count to a
1011 : * value that is certainly beyond any value that will be in the
1012 : * page_lru_count array after the loop finishes. This ensures that
1013 : * the next execution of SlruRecentlyUsed will mark the page newly
1014 : * used, even if it's for a page that has the current counter value.
1015 : * That gets us back on the path to having good data when there are
1016 : * multiple pages with the same lru_count.
1017 : */
1018 21 : cur_count = (shared->cur_lru_count)++;
1019 36 : for (slotno = 0; slotno < shared->num_slots; slotno++)
1020 : {
1021 : int this_delta;
1022 : int this_page_number;
1023 :
1024 36 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1025 21 : return slotno;
1026 15 : this_delta = cur_count - shared->page_lru_count[slotno];
1027 15 : if (this_delta < 0)
1028 : {
1029 : /*
1030 : * Clean up in case shared updates have caused cur_count
1031 : * increments to get "lost". We back off the page counts,
1032 : * rather than trying to increase cur_count, to avoid any
1033 : * question of infinite loops or failure in the presence of
1034 : * wrapped-around counts.
1035 : */
1036 0 : shared->page_lru_count[slotno] = cur_count;
1037 0 : this_delta = 0;
1038 : }
1039 15 : this_page_number = shared->page_number[slotno];
1040 15 : if (this_page_number == shared->latest_page_number)
1041 5 : continue;
1042 10 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1043 : {
1044 10 : if (this_delta > best_valid_delta ||
1045 0 : (this_delta == best_valid_delta &&
1046 0 : ctl->PagePrecedes(this_page_number,
1047 : best_valid_page_number)))
1048 : {
1049 4 : bestvalidslot = slotno;
1050 4 : best_valid_delta = this_delta;
1051 4 : best_valid_page_number = this_page_number;
1052 : }
1053 : }
1054 : else
1055 : {
1056 0 : if (this_delta > best_invalid_delta ||
1057 0 : (this_delta == best_invalid_delta &&
1058 0 : ctl->PagePrecedes(this_page_number,
1059 : best_invalid_page_number)))
1060 : {
1061 0 : bestinvalidslot = slotno;
1062 0 : best_invalid_delta = this_delta;
1063 0 : best_invalid_page_number = this_page_number;
1064 : }
1065 : }
1066 : }
1067 :
1068 : /*
1069 : * If all pages (except possibly the latest one) are I/O busy, we'll
1070 : * have to wait for an I/O to complete and then retry. In that
1071 : * unhappy case, we choose to wait for the I/O on the least recently
1072 : * used slot, on the assumption that it was likely initiated first of
1073 : * all the I/Os in progress and may therefore finish first.
1074 : */
1075 0 : if (best_valid_delta < 0)
1076 : {
1077 0 : SimpleLruWaitIO(ctl, bestinvalidslot);
1078 0 : continue;
1079 : }
1080 :
1081 : /*
1082 : * If the selected page is clean, we're set.
1083 : */
1084 0 : if (!shared->page_dirty[bestvalidslot])
1085 0 : return bestvalidslot;
1086 :
1087 : /*
1088 : * Write the page.
1089 : */
1090 0 : SlruInternalWritePage(ctl, bestvalidslot, NULL);
1091 :
1092 : /*
1093 : * Now loop back and try again. This is the easiest way of dealing
1094 : * with corner cases such as the victim page being re-dirtied while we
1095 : * wrote it.
1096 : */
1097 0 : }
1098 : }
1099 :
1100 : /*
1101 : * Flush dirty pages to disk during checkpoint or database shutdown
1102 : */
1103 : void
1104 70 : SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
1105 : {
1106 70 : SlruShared shared = ctl->shared;
1107 : SlruFlushData fdata;
1108 : int slotno;
1109 70 : int pageno = 0;
1110 : int i;
1111 : bool ok;
1112 :
1113 : /*
1114 : * Find and write dirty pages
1115 : */
1116 70 : fdata.num_files = 0;
1117 :
1118 70 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
1119 :
1120 1526 : for (slotno = 0; slotno < shared->num_slots; slotno++)
1121 : {
1122 1456 : SlruInternalWritePage(ctl, slotno, &fdata);
1123 :
1124 : /*
1125 : * In some places (e.g. checkpoints), we cannot assert that the slot
1126 : * is clean now, since another process might have re-dirtied it
1127 : * already. That's okay.
1128 : */
1129 1456 : Assert(allow_redirtied ||
1130 : shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1131 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1132 : !shared->page_dirty[slotno]));
1133 : }
1134 :
1135 70 : LWLockRelease(shared->ControlLock);
1136 :
1137 : /*
1138 : * Now fsync and close any files that were open
1139 : */
1140 70 : ok = true;
1141 87 : for (i = 0; i < fdata.num_files; i++)
1142 : {
1143 17 : pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1144 17 : if (ctl->do_fsync && pg_fsync(fdata.fd[i]))
1145 : {
1146 0 : slru_errcause = SLRU_FSYNC_FAILED;
1147 0 : slru_errno = errno;
1148 0 : pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1149 0 : ok = false;
1150 : }
1151 17 : pgstat_report_wait_end();
1152 :
1153 17 : if (CloseTransientFile(fdata.fd[i]))
1154 : {
1155 0 : slru_errcause = SLRU_CLOSE_FAILED;
1156 0 : slru_errno = errno;
1157 0 : pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1158 0 : ok = false;
1159 : }
1160 : }
1161 70 : if (!ok)
1162 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
1163 70 : }
1164 :
1165 : /*
1166 : * Remove all segments before the one holding the passed page number
1167 : */
1168 : void
1169 11 : SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
1170 : {
1171 11 : SlruShared shared = ctl->shared;
1172 : int slotno;
1173 :
1174 : /*
1175 : * The cutoff point is the start of the segment containing cutoffPage.
1176 : */
1177 11 : cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
1178 :
1179 : /*
1180 : * Scan shared memory and remove any pages preceding the cutoff page, to
1181 : * ensure we won't rewrite them later. (Since this is normally called in
1182 : * or just after a checkpoint, any dirty pages should have been flushed
1183 : * already ... we're just being extra careful here.)
1184 : */
1185 11 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
1186 :
1187 : restart:;
1188 :
1189 : /*
1190 : * While we are holding the lock, make an important safety check: the
1191 : * planned cutoff point must be <= the current endpoint page. Otherwise we
1192 : * have already wrapped around, and proceeding with the truncation would
1193 : * risk removing the current segment.
1194 : */
1195 11 : if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
1196 : {
1197 0 : LWLockRelease(shared->ControlLock);
1198 0 : ereport(LOG,
1199 : (errmsg("could not truncate directory \"%s\": apparent wraparound",
1200 : ctl->Dir)));
1201 11 : return;
1202 : }
1203 :
1204 363 : for (slotno = 0; slotno < shared->num_slots; slotno++)
1205 : {
1206 352 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1207 336 : continue;
1208 16 : if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1209 16 : continue;
1210 :
1211 : /*
1212 : * If page is clean, just change state to EMPTY (expected case).
1213 : */
1214 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1215 0 : !shared->page_dirty[slotno])
1216 : {
1217 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1218 0 : continue;
1219 : }
1220 :
1221 : /*
1222 : * Hmm, we have (or may have) I/O operations acting on the page, so
1223 : * we've got to wait for them to finish and then start again. This is
1224 : * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1225 : * wouldn't it be OK to just discard it without writing it? For now,
1226 : * keep the logic the same as it was.)
1227 : */
1228 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1229 0 : SlruInternalWritePage(ctl, slotno, NULL);
1230 : else
1231 0 : SimpleLruWaitIO(ctl, slotno);
1232 0 : goto restart;
1233 : }
1234 :
1235 11 : LWLockRelease(shared->ControlLock);
1236 :
1237 : /* Now we can remove the old segment(s) */
1238 11 : (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1239 : }
1240 :
1241 : /*
1242 : * Delete an individual SLRU segment, identified by the filename.
1243 : *
1244 : * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1245 : * they either can't yet contain anything, or have already been cleaned out.
1246 : */
1247 : static void
1248 4 : SlruInternalDeleteSegment(SlruCtl ctl, char *filename)
1249 : {
1250 : char path[MAXPGPATH];
1251 :
1252 4 : snprintf(path, MAXPGPATH, "%s/%s", ctl->Dir, filename);
1253 4 : ereport(DEBUG2,
1254 : (errmsg("removing file \"%s\"", path)));
1255 4 : unlink(path);
1256 4 : }
1257 :
1258 : /*
1259 : * Delete an individual SLRU segment, identified by the segment number.
1260 : */
1261 : void
1262 0 : SlruDeleteSegment(SlruCtl ctl, int segno)
1263 : {
1264 0 : SlruShared shared = ctl->shared;
1265 : int slotno;
1266 : char path[MAXPGPATH];
1267 : bool did_write;
1268 :
1269 : /* Clean out any possibly existing references to the segment. */
1270 0 : LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
1271 : restart:
1272 0 : did_write = false;
1273 0 : for (slotno = 0; slotno < shared->num_slots; slotno++)
1274 : {
1275 0 : int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1276 :
1277 0 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1278 0 : continue;
1279 :
1280 : /* not the segment we're looking for */
1281 0 : if (pagesegno != segno)
1282 0 : continue;
1283 :
1284 : /* If page is clean, just change state to EMPTY (expected case). */
1285 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1286 0 : !shared->page_dirty[slotno])
1287 : {
1288 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1289 0 : continue;
1290 : }
1291 :
1292 : /* Same logic as SimpleLruTruncate() */
1293 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1294 0 : SlruInternalWritePage(ctl, slotno, NULL);
1295 : else
1296 0 : SimpleLruWaitIO(ctl, slotno);
1297 :
1298 0 : did_write = true;
1299 : }
1300 :
1301 : /*
1302 : * Be extra careful and re-check. The IO functions release the control
1303 : * lock, so new pages could have been read in.
1304 : */
1305 0 : if (did_write)
1306 0 : goto restart;
1307 :
1308 0 : snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
1309 0 : ereport(DEBUG2,
1310 : (errmsg("removing file \"%s\"", path)));
1311 0 : unlink(path);
1312 :
1313 0 : LWLockRelease(shared->ControlLock);
1314 0 : }
1315 :
1316 : /*
1317 : * SlruScanDirectory callback
1318 : * This callback reports true if there's any segment prior to the one
1319 : * containing the page passed as "data".
1320 : */
1321 : bool
1322 2 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data)
1323 : {
1324 2 : int cutoffPage = *(int *) data;
1325 :
1326 2 : cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
1327 :
1328 2 : if (ctl->PagePrecedes(segpage, cutoffPage))
1329 0 : return true; /* found one; don't iterate any more */
1330 :
1331 2 : return false; /* keep going */
1332 : }
1333 :
1334 : /*
1335 : * SlruScanDirectory callback.
1336 : * This callback deletes segments prior to the one passed in as "data".
1337 : */
1338 : static bool
1339 11 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
1340 : {
1341 11 : int cutoffPage = *(int *) data;
1342 :
1343 11 : if (ctl->PagePrecedes(segpage, cutoffPage))
1344 0 : SlruInternalDeleteSegment(ctl, filename);
1345 :
1346 11 : return false; /* keep going */
1347 : }
1348 :
1349 : /*
1350 : * SlruScanDirectory callback.
1351 : * This callback deletes all segments.
1352 : */
1353 : bool
1354 4 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data)
1355 : {
1356 4 : SlruInternalDeleteSegment(ctl, filename);
1357 :
1358 4 : return false; /* keep going */
1359 : }
1360 :
1361 : /*
1362 : * Scan the SimpleLRU directory and apply a callback to each file found in it.
1363 : *
1364 : * If the callback returns true, the scan is stopped. The last return value
1365 : * from the callback is returned.
1366 : *
1367 : * The callback receives the following arguments: 1. the SlruCtl struct for the
1368 : * slru being truncated; 2. the filename being considered; 3. the page number
1369 : * for the first page of that file; 4. a pointer to the opaque data given to us
1370 : * by the caller.
1371 : *
1372 : * Note that the ordering in which the directory is scanned is not guaranteed.
1373 : *
1374 : * Note that no locking is applied.
1375 : */
1376 : bool
1377 23 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
1378 : {
1379 23 : bool retval = false;
1380 : DIR *cldir;
1381 : struct dirent *clde;
1382 : int segno;
1383 : int segpage;
1384 :
1385 23 : cldir = AllocateDir(ctl->Dir);
1386 23 : while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1387 : {
1388 : size_t len;
1389 :
1390 63 : len = strlen(clde->d_name);
1391 :
1392 80 : if ((len == 4 || len == 5 || len == 6) &&
1393 17 : strspn(clde->d_name, "0123456789ABCDEF") == len)
1394 : {
1395 17 : segno = (int) strtol(clde->d_name, NULL, 16);
1396 17 : segpage = segno * SLRU_PAGES_PER_SEGMENT;
1397 :
1398 17 : elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1399 : ctl->Dir, clde->d_name);
1400 17 : retval = callback(ctl, clde->d_name, segpage, data);
1401 17 : if (retval)
1402 0 : break;
1403 : }
1404 : }
1405 23 : FreeDir(cldir);
1406 :
1407 23 : return retval;
1408 : }
|