Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 256 on many modern
20 : * operating systems, but can be as low as 32 on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a file is
24 : * opened using these interfaces, all subsequent operations must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them, there is no automatic mechanism in fd.c for that.
41 : *
42 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44 : * They behave like the corresponding native functions, except that the handle
45 : * is registered with the current subtransaction, and will be automatically
46 : * closed at abort. These are intended mainly for short operations like
47 : * reading a configuration file; there is a limit on the number of files that
48 : * can be opened using these functions at any one time.
49 : *
50 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
51 : * release file descriptors in use by the virtual file descriptors if
52 : * necessary. There is no automatic cleanup of file descriptors returned by
53 : * BasicOpenFile, it is solely the caller's responsibility to close the file
54 : * descriptor by calling close(2).
55 : *
56 : *-------------------------------------------------------------------------
57 : */
58 :
59 : #include "postgres.h"
60 :
61 : #include <sys/file.h>
62 : #include <sys/param.h>
63 : #include <sys/stat.h>
64 : #ifndef WIN32
65 : #include <sys/mman.h>
66 : #endif
67 : #include <limits.h>
68 : #include <unistd.h>
69 : #include <fcntl.h>
70 : #ifdef HAVE_SYS_RESOURCE_H
71 : #include <sys/resource.h> /* for getrlimit */
72 : #endif
73 :
74 : #include "miscadmin.h"
75 : #include "access/xact.h"
76 : #include "access/xlog.h"
77 : #include "catalog/catalog.h"
78 : #include "catalog/pg_tablespace.h"
79 : #include "pgstat.h"
80 : #include "portability/mem.h"
81 : #include "storage/fd.h"
82 : #include "storage/ipc.h"
83 : #include "utils/guc.h"
84 : #include "utils/resowner_private.h"
85 :
86 :
87 : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
88 : #if defined(HAVE_SYNC_FILE_RANGE)
89 : #define PG_FLUSH_DATA_WORKS 1
90 : #elif !defined(WIN32) && defined(MS_ASYNC)
91 : #define PG_FLUSH_DATA_WORKS 1
92 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
93 : #define PG_FLUSH_DATA_WORKS 1
94 : #endif
95 :
96 : /*
97 : * We must leave some file descriptors free for system(), the dynamic loader,
98 : * and other code that tries to open files without consulting fd.c. This
99 : * is the number left free. (While we can be pretty sure we won't get
100 : * EMFILE, there's never any guarantee that we won't get ENFILE due to
101 : * other processes chewing up FDs. So it's a bad idea to try to open files
102 : * without consulting fd.c. Nonetheless we cannot control all code.)
103 : *
104 : * Because this is just a fixed setting, we are effectively assuming that
105 : * no such code will leave FDs open over the long term; otherwise the slop
106 : * is likely to be insufficient. Note in particular that we expect that
107 : * loading a shared library does not result in any permanent increase in
108 : * the number of open files. (This appears to be true on most if not
109 : * all platforms as of Feb 2004.)
110 : */
111 : #define NUM_RESERVED_FDS 10
112 :
113 : /*
114 : * If we have fewer than this many usable FDs after allowing for the reserved
115 : * ones, choke.
116 : */
117 : #define FD_MINFREE 10
118 :
119 :
120 : /*
121 : * A number of platforms allow individual processes to open many more files
122 : * than they can really support when *many* processes do the same thing.
123 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
124 : * what the postmaster's initial probe suggests will work.
125 : */
126 : int max_files_per_process = 1000;
127 :
128 : /*
129 : * Maximum number of file descriptors to open for either VFD entries or
130 : * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
131 : * to a conservative value, and remains that way indefinitely in bootstrap or
132 : * standalone-backend cases. In normal postmaster operation, the postmaster
133 : * calls set_max_safe_fds() late in initialization to update the value, and
134 : * that value is then inherited by forked subprocesses.
135 : *
136 : * Note: the value of max_files_per_process is taken into account while
137 : * setting this variable, and so need not be tested separately.
138 : */
139 : int max_safe_fds = 32; /* default if not changed */
140 :
141 :
/* Debugging.... */

/*
 * DO_DB(A) wraps a debugging statement A.
 *
 * When FDDEBUG is defined, A is executed with errno saved beforehand and
 * restored afterwards, so that debug output cannot disturb the errno seen
 * by the surrounding code.  When FDDEBUG is not defined, the macro expands
 * to a no-op expression and A is not evaluated at all.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif
155 :
/* Value stored in a Vfd's "fd" field when no kernel FD is assigned to it */
#define VFD_CLOSED (-1)

/*
 * FileIsValid: true if "file" is a plausible index into VfdCache (greater
 * than zero and below SizeVfdCache) and the entry it names has a non-NULL
 * fileName.  Index zero is rejected by the "> 0" test.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/*
 * FileIsNotOpen: true if the entry's fd field is VFD_CLOSED.  No range check
 * is made here; "file" is used directly as an index into VfdCache.
 */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
162 :
163 : /*
164 : * Note: a VFD's seekPos is normally always valid, but if for some reason
165 : * an lseek() fails, it might become set to FileUnknownPos. We can struggle
166 : * along without knowing the seek position in many cases, but in some places
167 : * we have to fail if we don't have it.
168 : */
169 : #define FileUnknownPos ((off_t) -1)
170 : #define FilePosIsUnknown(pos) ((pos) < 0)
171 :
/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY		(1 << 0)	/* T = delete when closed */
#define FD_XACT_TEMPORARY	(1 << 1)	/* T = delete at eoXact */

/*
 * Vfd: one entry of the virtual file descriptor array.
 *
 * The entry records the kernel FD currently assigned (if any), the links
 * used for the free list and for the recency-of-use list, and the name,
 * flags and mode kept for (re)opening the file with open(2).
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		seekPos;		/* current logical file position, or -1 */
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	int			fileMode;		/* mode to pass to open(2) */
} Vfd;
191 :
192 : /*
193 : * Virtual File Descriptor array pointer and size. This grows as
194 : * needed. 'File' values are indexes into this array.
195 : * Note that VfdCache[0] is not a usable VFD, just a list header.
196 : */
197 : static Vfd *VfdCache;
198 : static Size SizeVfdCache = 0;
199 :
200 : /*
201 : * Number of file descriptors known to be in use by VFD entries.
202 : */
203 : static int nfile = 0;
204 :
205 : /*
206 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
207 : * to close
208 : */
209 : static bool have_xact_temporary_files = false;
210 :
211 : /*
212 : * Tracks the total size of all temporary files. Note: when temp_file_limit
213 : * is being enforced, this cannot overflow since the limit cannot be more
214 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
215 : * overflow, but we don't care.
216 : */
217 : static uint64 temporary_files_size = 0;
218 :
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 */

/* Discriminator for the kind of handle an AllocateDesc holds */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD
} AllocateDescKind;

/*
 * One tracked handle.  NOTE(review): presumably "kind" selects which member
 * of "desc" is live (FILE * for File/Pipe, DIR * for Dir, int for RawFD) --
 * confirm against the code that fills these in.
 */
typedef struct
{
	AllocateDescKind kind;
	SubTransactionId create_subid;	/* subtransaction id recorded for this
									 * handle */
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;

/*
 * Bookkeeping for the allocatedDescs array.  NOTE(review): by their names,
 * numAllocatedDescs is the count in use and maxAllocatedDescs the allocated
 * capacity -- verify against reserveAllocatedDesc().
 */
static int	numAllocatedDescs = 0;
static int	maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
246 :
247 : /*
248 : * Number of temporary files opened during the current session;
249 : * this is used in generation of tempfile names.
250 : */
251 : static long tempFileCounter = 0;
252 :
253 : /*
254 : * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
255 : * this has not been set in the current transaction.
256 : */
257 : static Oid *tempTableSpaces = NULL;
258 : static int numTempTableSpaces = -1;
259 : static int nextTempTableSpace = 0;
260 :
261 :
262 : /*--------------------
263 : *
264 : * Private Routines
265 : *
266 : * Delete - delete a file from the Lru ring
267 : * LruDelete - remove a file from the Lru ring and close its FD
268 : * Insert - put a file at the front of the Lru ring
269 : * LruInsert - put a file at the front of the Lru ring and open it
270 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
271 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
272 : * AllocateVfd - grab a free (or new) file record (from VfdArray)
273 : * FreeVfd - free a file record
274 : *
275 : * The Least Recently Used ring is a doubly linked list that begins and
276 : * ends on element zero. Element zero is special -- it doesn't represent
277 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
278 : * anchor that shows us the beginning/end of the ring.
279 : * Only VFD elements that are currently really open (have an FD assigned) are
280 : * in the Lru ring. Elements that are "virtually" open can be recognized
281 : * by having a non-null fileName field.
282 : *
283 : * example:
284 : *
285 : * /--less----\ /---------\
286 : * v \ v \
287 : * #0 --more---> LeastRecentlyUsed --more-\ \
288 : * ^\ | |
289 : * \\less--> MostRecentlyUsedFile <---/ |
290 : * \more---/ \--less--/
291 : *
292 : *--------------------
293 : */
294 : static void Delete(File file);
295 : static void LruDelete(File file);
296 : static void Insert(File file);
297 : static int LruInsert(File file);
298 : static bool ReleaseLruFile(void);
299 : static void ReleaseLruFiles(void);
300 : static File AllocateVfd(void);
301 : static void FreeVfd(File file);
302 :
303 : static int FileAccess(File file);
304 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
305 : static bool reserveAllocatedDesc(void);
306 : static int FreeDesc(AllocateDesc *desc);
307 : static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
308 :
309 : static void AtProcExit_Files(int code, Datum arg);
310 : static void CleanupTempFiles(bool isProcExit);
311 : static void RemovePgTempFilesInDir(const char *tmpdirname);
312 : static void RemovePgTempRelationFiles(const char *tsdirname);
313 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
314 : static bool looks_like_temp_rel_name(const char *name);
315 :
316 : static void walkdir(const char *path,
317 : void (*action) (const char *fname, bool isdir, int elevel),
318 : bool process_symlinks,
319 : int elevel);
320 : #ifdef PG_FLUSH_DATA_WORKS
321 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
322 : #endif
323 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
324 :
325 : static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
326 : static int fsync_parent_path(const char *fname, int elevel);
327 :
328 :
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Hands the descriptor to pg_fsync_writethrough() when the platform has a
 * distinct write-through fsync and sync_method asks for it; in every other
 * case pg_fsync_no_writethrough() does the work.  The chosen routine's
 * result is returned unchanged.
 */
int
pg_fsync(int fd)
{
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	/* compiled in only where testing sync_method can make a difference */
	bool		want_writethrough = (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH);

	if (want_writethrough)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
343 :
344 :
345 : /*
346 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
347 : * enableFsync is off
348 : */
349 : int
350 1436 : pg_fsync_no_writethrough(int fd)
351 : {
352 1436 : if (enableFsync)
353 0 : return fsync(fd);
354 : else
355 1436 : return 0;
356 : }
357 :
358 : /*
359 : * pg_fsync_writethrough
360 : */
361 : int
362 0 : pg_fsync_writethrough(int fd)
363 : {
364 0 : if (enableFsync)
365 : {
366 : #ifdef WIN32
367 : return _commit(fd);
368 : #elif defined(F_FULLFSYNC)
369 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
370 : #else
371 0 : errno = ENOSYS;
372 0 : return -1;
373 : #endif
374 : }
375 : else
376 0 : return 0;
377 : }
378 :
379 : /*
380 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
381 : *
382 : * Not all platforms have fdatasync; treat as fsync if not available.
383 : */
384 : int
385 9577 : pg_fdatasync(int fd)
386 : {
387 9577 : if (enableFsync)
388 : {
389 : #ifdef HAVE_FDATASYNC
390 0 : return fdatasync(fd);
391 : #else
392 : return fsync(fd);
393 : #endif
394 : }
395 : else
396 9577 : return 0;
397 : }
398 :
399 : /*
400 : * pg_flush_data --- advise OS that the described dirty data should be flushed
401 : *
402 : * offset of 0 with nbytes 0 means that the entire file should be flushed;
403 : * in this case, this function may have side-effects on the file's
404 : * seek position!
405 : */
406 : void
407 2302 : pg_flush_data(int fd, off_t offset, off_t nbytes)
408 : {
409 : /*
410 : * Right now file flushing is primarily used to avoid making later
411 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
412 : * if fsyncs are disabled - that's a decision we might want to make
413 : * configurable at some point.
414 : */
415 2302 : if (!enableFsync)
416 2302 : return;
417 :
418 : /*
419 : * We compile all alternatives that are supported on the current platform,
420 : * to find portability problems more easily.
421 : */
422 : #if defined(HAVE_SYNC_FILE_RANGE)
423 : {
424 : int rc;
425 :
426 : /*
427 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
428 : * tells the OS that writeback for the specified blocks should be
429 : * started, but that we don't want to wait for completion. Note that
430 : * this call might block if too much dirty data exists in the range.
431 : * This is the preferable method on OSs supporting it, as it works
432 : * reliably when available (contrast to msync()) and doesn't flush out
433 : * clean data (like FADV_DONTNEED).
434 : */
435 0 : rc = sync_file_range(fd, offset, nbytes,
436 : SYNC_FILE_RANGE_WRITE);
437 :
438 : /* don't error out, this is just a performance optimization */
439 0 : if (rc != 0)
440 : {
441 0 : ereport(WARNING,
442 : (errcode_for_file_access(),
443 : errmsg("could not flush dirty data: %m")));
444 : }
445 :
446 0 : return;
447 : }
448 : #endif
449 : #if !defined(WIN32) && defined(MS_ASYNC)
450 : {
451 : void *p;
452 : static int pagesize = 0;
453 :
454 : /*
455 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
456 : * writeback. On linux it only does so if MS_SYNC is specified, but
457 : * then it does the writeback synchronously. Luckily all common linux
458 : * systems have sync_file_range(). This is preferable over
459 : * FADV_DONTNEED because it doesn't flush out clean data.
460 : *
461 : * We map the file (mmap()), tell the kernel to sync back the contents
462 : * (msync()), and then remove the mapping again (munmap()).
463 : */
464 :
465 : /* mmap() needs actual length if we want to map whole file */
466 : if (offset == 0 && nbytes == 0)
467 : {
468 : nbytes = lseek(fd, 0, SEEK_END);
469 : if (nbytes < 0)
470 : {
471 : ereport(WARNING,
472 : (errcode_for_file_access(),
473 : errmsg("could not determine dirty data size: %m")));
474 : return;
475 : }
476 : }
477 :
478 : /*
479 : * Some platforms reject partial-page mmap() attempts. To deal with
480 : * that, just truncate the request to a page boundary. If any extra
481 : * bytes don't get flushed, well, it's only a hint anyway.
482 : */
483 :
484 : /* fetch pagesize only once */
485 : if (pagesize == 0)
486 : pagesize = sysconf(_SC_PAGESIZE);
487 :
488 : /* align length to pagesize, dropping any fractional page */
489 : if (pagesize > 0)
490 : nbytes = (nbytes / pagesize) * pagesize;
491 :
492 : /* fractional-page request is a no-op */
493 : if (nbytes <= 0)
494 : return;
495 :
496 : /*
497 : * mmap could well fail, particularly on 32-bit platforms where there
498 : * may simply not be enough address space. If so, silently fall
499 : * through to the next implementation.
500 : */
501 : if (nbytes <= (off_t) SSIZE_MAX)
502 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
503 : else
504 : p = MAP_FAILED;
505 :
506 : if (p != MAP_FAILED)
507 : {
508 : int rc;
509 :
510 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
511 : if (rc != 0)
512 : {
513 : ereport(WARNING,
514 : (errcode_for_file_access(),
515 : errmsg("could not flush dirty data: %m")));
516 : /* NB: need to fall through to munmap()! */
517 : }
518 :
519 : rc = munmap(p, (size_t) nbytes);
520 : if (rc != 0)
521 : {
522 : /* FATAL error because mapping would remain */
523 : ereport(FATAL,
524 : (errcode_for_file_access(),
525 : errmsg("could not munmap() while flushing data: %m")));
526 : }
527 :
528 : return;
529 : }
530 : }
531 : #endif
532 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
533 : {
534 : int rc;
535 :
536 : /*
537 : * Signal the kernel that the passed in range should not be cached
538 : * anymore. This has the, desired, side effect of writing out dirty
539 : * data, and the, undesired, side effect of likely discarding useful
540 : * clean cached blocks. For the latter reason this is the least
541 : * preferable method.
542 : */
543 :
544 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
545 :
546 : if (rc != 0)
547 : {
548 : /* don't error out, this is just a performance optimization */
549 : ereport(WARNING,
550 : (errcode_for_file_access(),
551 : errmsg("could not flush dirty data: %m")));
552 : }
553 :
554 : return;
555 : }
556 : #endif
557 : }
558 :
559 :
560 : /*
561 : * fsync_fname -- fsync a file or directory, handling errors properly
562 : *
563 : * Try to fsync a file or directory. When doing the latter, ignore errors that
564 : * indicate the OS just doesn't allow/require fsyncing directories.
565 : */
566 : void
567 33 : fsync_fname(const char *fname, bool isdir)
568 : {
569 33 : fsync_fname_ext(fname, isdir, false, ERROR);
570 33 : }
571 :
572 : /*
573 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
574 : *
575 : * This routine ensures that, after returning, the effect of renaming file
576 : * persists in case of a crash. A crash while this routine is running will
577 : * leave you with either the pre-existing or the moved file in place of the
578 : * new file; no mixed state or truncated files are possible.
579 : *
580 : * It does so by using fsync on the old filename and the possibly existing
581 : * target filename before the rename, and the target file and directory after.
582 : *
583 : * Note that rename() cannot be used across arbitrary directories, as they
584 : * might not be on the same filesystem. Therefore this routine does not
585 : * support renaming across directories.
586 : *
587 : * Log errors with the caller specified severity.
588 : *
589 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
590 : * valid upon return.
591 : */
592 : int
593 11 : durable_rename(const char *oldfile, const char *newfile, int elevel)
594 : {
595 : int fd;
596 :
597 : /*
598 : * First fsync the old and target path (if it exists), to ensure that they
599 : * are properly persistent on disk. Syncing the target file is not
600 : * strictly necessary, but it makes it easier to reason about crashes;
601 : * because it's then guaranteed that either source or target file exists
602 : * after a crash.
603 : */
604 11 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
605 0 : return -1;
606 :
607 11 : fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
608 11 : if (fd < 0)
609 : {
610 1 : if (errno != ENOENT)
611 : {
612 0 : ereport(elevel,
613 : (errcode_for_file_access(),
614 : errmsg("could not open file \"%s\": %m", newfile)));
615 0 : return -1;
616 : }
617 : }
618 : else
619 : {
620 10 : if (pg_fsync(fd) != 0)
621 : {
622 : int save_errno;
623 :
624 : /* close file upon error, might not be in transaction context */
625 0 : save_errno = errno;
626 0 : CloseTransientFile(fd);
627 0 : errno = save_errno;
628 :
629 0 : ereport(elevel,
630 : (errcode_for_file_access(),
631 : errmsg("could not fsync file \"%s\": %m", newfile)));
632 0 : return -1;
633 : }
634 10 : CloseTransientFile(fd);
635 : }
636 :
637 : /* Time to do the real deal... */
638 11 : if (rename(oldfile, newfile) < 0)
639 : {
640 0 : ereport(elevel,
641 : (errcode_for_file_access(),
642 : errmsg("could not rename file \"%s\" to \"%s\": %m",
643 : oldfile, newfile)));
644 0 : return -1;
645 : }
646 :
647 : /*
648 : * To guarantee renaming the file is persistent, fsync the file with its
649 : * new name, and its containing directory.
650 : */
651 11 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
652 0 : return -1;
653 :
654 11 : if (fsync_parent_path(newfile, elevel) != 0)
655 0 : return -1;
656 :
657 11 : return 0;
658 : }
659 :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this routine returns successfully, the removal survives a crash;
 * a crash while it is running does not leave a mixed state.  This is done
 * by fsyncing the file's parent directory once the unlink has happened.
 *
 * Errors are logged at the severity given by the caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is
 * not valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			parent_rc;

	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the directory entry is gone; make that fact persistent */
	parent_rc = fsync_parent_path(fname, elevel);

	return (parent_rc != 0) ? -1 : 0;
}
696 :
697 : /*
698 : * durable_link_or_rename -- rename a file in a durable manner.
699 : *
700 : * Similar to durable_rename(), except that this routine tries (but does not
701 : * guarantee) not to overwrite the target file.
702 : *
703 : * Note that a crash in an unfortunate moment can leave you with two links to
704 : * the target file.
705 : *
706 : * Log errors with the caller specified severity.
707 : *
708 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
709 : * valid upon return.
710 : */
711 : int
712 11 : durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
713 : {
714 : /*
715 : * Ensure that, if we crash directly after the rename/link, a file with
716 : * valid contents is moved into place.
717 : */
718 11 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
719 0 : return -1;
720 :
721 : #if HAVE_WORKING_LINK
722 11 : if (link(oldfile, newfile) < 0)
723 : {
724 0 : ereport(elevel,
725 : (errcode_for_file_access(),
726 : errmsg("could not link file \"%s\" to \"%s\": %m",
727 : oldfile, newfile)));
728 0 : return -1;
729 : }
730 11 : unlink(oldfile);
731 : #else
732 : /* XXX: Add racy file existence check? */
733 : if (rename(oldfile, newfile) < 0)
734 : {
735 : ereport(elevel,
736 : (errcode_for_file_access(),
737 : errmsg("could not rename file \"%s\" to \"%s\": %m",
738 : oldfile, newfile)));
739 : return -1;
740 : }
741 : #endif
742 :
743 : /*
744 : * Make change persistent in case of an OS crash, both the new entry and
745 : * its parent directory need to be flushed.
746 : */
747 11 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
748 0 : return -1;
749 :
750 : /* Same for parent directory */
751 11 : if (fsync_parent_path(newfile, elevel) != 0)
752 0 : return -1;
753 :
754 11 : return 0;
755 : }
756 :
757 : /*
758 : * InitFileAccess --- initialize this module during backend startup
759 : *
760 : * This is called during either normal or standalone backend start.
761 : * It is *not* called in the postmaster.
762 : */
763 : void
764 344 : InitFileAccess(void)
765 : {
766 344 : Assert(SizeVfdCache == 0); /* call me only once */
767 :
768 : /* initialize cache header entry */
769 344 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
770 344 : if (VfdCache == NULL)
771 0 : ereport(FATAL,
772 : (errcode(ERRCODE_OUT_OF_MEMORY),
773 : errmsg("out of memory")));
774 :
775 344 : MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
776 344 : VfdCache->fd = VFD_CLOSED;
777 :
778 344 : SizeVfdCache = 1;
779 :
780 : /* register proc-exit hook to ensure temp files are dropped at exit */
781 344 : on_proc_exit(AtProcExit_Files, 0);
782 344 : }
783 :
784 : /*
785 : * count_usable_fds --- count how many FDs the system will let us open,
786 : * and estimate how many are already open.
787 : *
788 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
789 : * value of max_to_probe might result in an underestimate of already_open;
790 : * we must fill in any "gaps" in the set of used FDs before the calculation
791 : * of already_open will give the right answer. In practice, max_to_probe
792 : * of a couple of dozen should be enough to ensure good results.
793 : *
794 : * We assume stdin (FD 0) is available for dup'ing
795 : */
796 : static void
797 1 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
798 : {
799 : int *fd;
800 : int size;
801 1 : int used = 0;
802 1 : int highestfd = 0;
803 : int j;
804 :
805 : #ifdef HAVE_GETRLIMIT
806 : struct rlimit rlim;
807 : int getrlimit_status;
808 : #endif
809 :
810 1 : size = 1024;
811 1 : fd = (int *) palloc(size * sizeof(int));
812 :
813 : #ifdef HAVE_GETRLIMIT
814 : #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
815 1 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
816 : #else /* but BSD doesn't ... */
817 : getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
818 : #endif /* RLIMIT_NOFILE */
819 1 : if (getrlimit_status != 0)
820 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
821 : #endif /* HAVE_GETRLIMIT */
822 :
823 : /* dup until failure or probe limit reached */
824 : for (;;)
825 : {
826 : int thisfd;
827 :
828 : #ifdef HAVE_GETRLIMIT
829 :
830 : /*
831 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
832 : * some platforms
833 : */
834 1000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
835 0 : break;
836 : #endif
837 :
838 1000 : thisfd = dup(0);
839 1000 : if (thisfd < 0)
840 : {
841 : /* Expect EMFILE or ENFILE, else it's fishy */
842 0 : if (errno != EMFILE && errno != ENFILE)
843 0 : elog(WARNING, "dup(0) failed after %d successes: %m", used);
844 0 : break;
845 : }
846 :
847 1000 : if (used >= size)
848 : {
849 0 : size *= 2;
850 0 : fd = (int *) repalloc(fd, size * sizeof(int));
851 : }
852 1000 : fd[used++] = thisfd;
853 :
854 1000 : if (highestfd < thisfd)
855 1000 : highestfd = thisfd;
856 :
857 1000 : if (used >= max_to_probe)
858 1 : break;
859 999 : }
860 :
861 : /* release the files we opened */
862 1001 : for (j = 0; j < used; j++)
863 1000 : close(fd[j]);
864 :
865 1 : pfree(fd);
866 :
867 : /*
868 : * Return results. usable_fds is just the number of successful dups. We
869 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
870 : * number) and so already_open is highestfd+1 - usable_fds.
871 : */
872 1 : *usable_fds = used;
873 1 : *already_open = highestfd + 1 - used;
874 1 : }
875 :
876 : /*
877 : * set_max_safe_fds
878 : * Determine number of filedescriptors that fd.c is allowed to use
879 : */
880 : void
881 1 : set_max_safe_fds(void)
882 : {
883 : int usable_fds;
884 : int already_open;
885 :
886 : /*----------
887 : * We want to set max_safe_fds to
888 : * MIN(usable_fds, max_files_per_process - already_open)
889 : * less the slop factor for files that are opened without consulting
890 : * fd.c. This ensures that we won't exceed either max_files_per_process
891 : * or the experimentally-determined EMFILE limit.
892 : *----------
893 : */
894 1 : count_usable_fds(max_files_per_process,
895 : &usable_fds, &already_open);
896 :
897 1 : max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
898 :
899 : /*
900 : * Take off the FDs reserved for system() etc.
901 : */
902 1 : max_safe_fds -= NUM_RESERVED_FDS;
903 :
904 : /*
905 : * Make sure we still have enough to get by.
906 : */
907 1 : if (max_safe_fds < FD_MINFREE)
908 0 : ereport(FATAL,
909 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
910 : errmsg("insufficient file descriptors available to start server process"),
911 : errdetail("System allows %d, we need at least %d.",
912 : max_safe_fds + NUM_RESERVED_FDS,
913 : FD_MINFREE + NUM_RESERVED_FDS)));
914 :
915 1 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
916 : max_safe_fds, usable_fds, already_open);
917 1 : }
918 :
919 : /*
920 : * BasicOpenFile --- same as open(2) except can free other FDs if needed
921 : *
922 : * This is exported for use by places that really want a plain kernel FD,
923 : * but need to be proof against running out of FDs. Once an FD has been
924 : * successfully returned, it is the caller's responsibility to ensure that
925 : * it will not be leaked on ereport()! Most users should *not* call this
926 : * routine directly, but instead use the VFD abstraction level, which
927 : * provides protection against descriptor leaks as well as management of
928 : * files that need to be open for more than a short period of time.
929 : *
930 : * Ideally this should be the *only* direct call of open() in the backend.
931 : * In practice, the postmaster calls open() directly, and there are some
932 : * direct open() calls done early in backend startup. Those are OK since
933 : * this module wouldn't have any open files to close at that point anyway.
934 : */
935 : int
936 36910 : BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
937 : {
938 : int fd;
939 :
940 : tryAgain:
941 36910 : fd = open(fileName, fileFlags, fileMode);
942 :
943 36910 : if (fd >= 0)
944 25383 : return fd; /* success! */
945 :
946 11527 : if (errno == EMFILE || errno == ENFILE)
947 : {
948 0 : int save_errno = errno;
949 :
950 0 : ereport(LOG,
951 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
952 : errmsg("out of file descriptors: %m; release and retry")));
953 0 : errno = 0;
954 0 : if (ReleaseLruFile())
955 0 : goto tryAgain;
956 0 : errno = save_errno;
957 : }
958 :
959 11527 : return -1; /* failure */
960 : }
961 :
#if defined(FDDEBUG)

/*
 * Debug aid: log the LRU ring, walking from the most recently used entry
 * towards the least recently used one until the ring returns to slot 0.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		size_t		used;

		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}
	snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
982 :
983 : static void
984 19733 : Delete(File file)
985 : {
986 : Vfd *vfdP;
987 :
988 19733 : Assert(file != 0);
989 :
990 : DO_DB(elog(LOG, "Delete %d (%s)",
991 : file, VfdCache[file].fileName));
992 : DO_DB(_dump_lru());
993 :
994 19733 : vfdP = &VfdCache[file];
995 :
996 19733 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
997 19733 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
998 :
999 : DO_DB(_dump_lru());
1000 19733 : }
1001 :
1002 : static void
1003 1181 : LruDelete(File file)
1004 : {
1005 : Vfd *vfdP;
1006 :
1007 1181 : Assert(file != 0);
1008 :
1009 : DO_DB(elog(LOG, "LruDelete %d (%s)",
1010 : file, VfdCache[file].fileName));
1011 :
1012 1181 : vfdP = &VfdCache[file];
1013 :
1014 : /*
1015 : * Normally we should know the seek position, but if for some reason we
1016 : * have lost track of it, try again to get it. If we still can't get it,
1017 : * we have a problem: we will be unable to restore the file seek position
1018 : * when and if the file is re-opened. But we can't really throw an error
1019 : * and refuse to close the file, or activities such as transaction cleanup
1020 : * will be broken.
1021 : */
1022 1181 : if (FilePosIsUnknown(vfdP->seekPos))
1023 : {
1024 0 : vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1025 0 : if (FilePosIsUnknown(vfdP->seekPos))
1026 0 : elog(LOG, "could not seek file \"%s\" before closing: %m",
1027 : vfdP->fileName);
1028 : }
1029 :
1030 : /*
1031 : * Close the file. We aren't expecting this to fail; if it does, better
1032 : * to leak the FD than to mess up our internal state.
1033 : */
1034 1181 : if (close(vfdP->fd))
1035 0 : elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1036 1181 : vfdP->fd = VFD_CLOSED;
1037 1181 : --nfile;
1038 :
1039 : /* delete the vfd record from the LRU ring */
1040 1181 : Delete(file);
1041 1181 : }
1042 :
1043 : static void
1044 26392 : Insert(File file)
1045 : {
1046 : Vfd *vfdP;
1047 :
1048 26392 : Assert(file != 0);
1049 :
1050 : DO_DB(elog(LOG, "Insert %d (%s)",
1051 : file, VfdCache[file].fileName));
1052 : DO_DB(_dump_lru());
1053 :
1054 26392 : vfdP = &VfdCache[file];
1055 :
1056 26392 : vfdP->lruMoreRecently = 0;
1057 26392 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1058 26392 : VfdCache[0].lruLessRecently = file;
1059 26392 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1060 :
1061 : DO_DB(_dump_lru());
1062 26392 : }
1063 :
1064 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1065 : static int
1066 636 : LruInsert(File file)
1067 : {
1068 : Vfd *vfdP;
1069 :
1070 636 : Assert(file != 0);
1071 :
1072 : DO_DB(elog(LOG, "LruInsert %d (%s)",
1073 : file, VfdCache[file].fileName));
1074 :
1075 636 : vfdP = &VfdCache[file];
1076 :
1077 636 : if (FileIsNotOpen(file))
1078 : {
1079 : /* Close excess kernel FDs. */
1080 636 : ReleaseLruFiles();
1081 :
1082 : /*
1083 : * The open could still fail for lack of file descriptors, eg due to
1084 : * overall system file table being full. So, be prepared to release
1085 : * another FD if necessary...
1086 : */
1087 636 : vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
1088 : vfdP->fileMode);
1089 636 : if (vfdP->fd < 0)
1090 : {
1091 : DO_DB(elog(LOG, "re-open failed: %m"));
1092 0 : return -1;
1093 : }
1094 : else
1095 : {
1096 636 : ++nfile;
1097 : }
1098 :
1099 : /*
1100 : * Seek to the right position. We need no special case for seekPos
1101 : * equal to FileUnknownPos, as lseek() will certainly reject that
1102 : * (thus completing the logic noted in LruDelete() that we will fail
1103 : * to re-open a file if we couldn't get its seek position before
1104 : * closing).
1105 : */
1106 636 : if (vfdP->seekPos != (off_t) 0)
1107 : {
1108 411 : if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1109 : {
1110 : /*
1111 : * If we fail to restore the seek position, treat it like an
1112 : * open() failure.
1113 : */
1114 0 : int save_errno = errno;
1115 :
1116 0 : elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1117 : vfdP->fileName);
1118 0 : (void) close(vfdP->fd);
1119 0 : vfdP->fd = VFD_CLOSED;
1120 0 : --nfile;
1121 0 : errno = save_errno;
1122 0 : return -1;
1123 : }
1124 : }
1125 : }
1126 :
1127 : /*
1128 : * put it at the head of the Lru ring
1129 : */
1130 :
1131 636 : Insert(file);
1132 :
1133 636 : return 0;
1134 : }
1135 :
1136 : /*
1137 : * Release one kernel FD by closing the least-recently-used VFD.
1138 : */
1139 : static bool
1140 1176 : ReleaseLruFile(void)
1141 : {
1142 : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1143 :
1144 1176 : if (nfile > 0)
1145 : {
1146 : /*
1147 : * There are opened files and so there should be at least one used vfd
1148 : * in the ring.
1149 : */
1150 1176 : Assert(VfdCache[0].lruMoreRecently != 0);
1151 1176 : LruDelete(VfdCache[0].lruMoreRecently);
1152 1176 : return true; /* freed a file */
1153 : }
1154 0 : return false; /* no files available to free */
1155 : }
1156 :
1157 : /*
1158 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1159 : * After calling this, it's OK to try to open another file.
1160 : */
1161 : static void
1162 36191 : ReleaseLruFiles(void)
1163 : {
1164 73558 : while (nfile + numAllocatedDescs >= max_safe_fds)
1165 : {
1166 1176 : if (!ReleaseLruFile())
1167 0 : break;
1168 : }
1169 36191 : }
1170 :
1171 : static File
1172 26046 : AllocateVfd(void)
1173 : {
1174 : Index i;
1175 : File file;
1176 :
1177 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1178 :
1179 26046 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1180 :
1181 26046 : if (VfdCache[0].nextFree == 0)
1182 : {
1183 : /*
1184 : * The free list is empty so it is time to increase the size of the
1185 : * array. We choose to double it each time this happens. However,
1186 : * there's not much point in starting *real* small.
1187 : */
1188 443 : Size newCacheSize = SizeVfdCache * 2;
1189 : Vfd *newVfdCache;
1190 :
1191 443 : if (newCacheSize < 32)
1192 339 : newCacheSize = 32;
1193 :
1194 : /*
1195 : * Be careful not to clobber VfdCache ptr if realloc fails.
1196 : */
1197 443 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1198 443 : if (newVfdCache == NULL)
1199 0 : ereport(ERROR,
1200 : (errcode(ERRCODE_OUT_OF_MEMORY),
1201 : errmsg("out of memory")));
1202 443 : VfdCache = newVfdCache;
1203 :
1204 : /*
1205 : * Initialize the new entries and link them into the free list.
1206 : */
1207 17032 : for (i = SizeVfdCache; i < newCacheSize; i++)
1208 : {
1209 16589 : MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1210 16589 : VfdCache[i].nextFree = i + 1;
1211 16589 : VfdCache[i].fd = VFD_CLOSED;
1212 : }
1213 443 : VfdCache[newCacheSize - 1].nextFree = 0;
1214 443 : VfdCache[0].nextFree = SizeVfdCache;
1215 :
1216 : /*
1217 : * Record the new size
1218 : */
1219 443 : SizeVfdCache = newCacheSize;
1220 : }
1221 :
1222 26046 : file = VfdCache[0].nextFree;
1223 :
1224 26046 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1225 :
1226 26046 : return file;
1227 : }
1228 :
1229 : static void
1230 19194 : FreeVfd(File file)
1231 : {
1232 19194 : Vfd *vfdP = &VfdCache[file];
1233 :
1234 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1235 : file, vfdP->fileName ? vfdP->fileName : ""));
1236 :
1237 19194 : if (vfdP->fileName != NULL)
1238 : {
1239 12134 : free(vfdP->fileName);
1240 12134 : vfdP->fileName = NULL;
1241 : }
1242 19194 : vfdP->fdstate = 0x0;
1243 :
1244 19194 : vfdP->nextFree = VfdCache[0].nextFree;
1245 19194 : VfdCache[0].nextFree = file;
1246 19194 : }
1247 :
1248 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1249 : static int
1250 36439 : FileAccess(File file)
1251 : {
1252 : int returnValue;
1253 :
1254 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1255 : file, VfdCache[file].fileName));
1256 :
1257 : /*
1258 : * Is the file open? If not, open it and put it at the head of the LRU
1259 : * ring (possibly closing the least recently used file to get an FD).
1260 : */
1261 :
1262 36439 : if (FileIsNotOpen(file))
1263 : {
1264 636 : returnValue = LruInsert(file);
1265 636 : if (returnValue != 0)
1266 0 : return returnValue;
1267 : }
1268 35803 : else if (VfdCache[0].lruLessRecently != file)
1269 : {
1270 : /*
1271 : * We now know that the file is open and that it is not the last one
1272 : * accessed, so we need to move it to the head of the Lru ring.
1273 : */
1274 :
1275 6770 : Delete(file);
1276 6770 : Insert(file);
1277 : }
1278 :
1279 36439 : return 0;
1280 : }
1281 :
/*
 * Called when a shared invalidation message arrives for some relation:
 * closes the kernel FD of the given VFD if it is currently open.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* nothing to close */

	LruDelete(file);
}
#endif
1294 :
1295 : /*
1296 : * open a file in an arbitrary directory
1297 : *
1298 : * NB: if the passed pathname is relative (which it usually is),
1299 : * it will be interpreted relative to the process' working directory
1300 : * (which should always be $PGDATA when this code is running).
1301 : */
1302 : File
1303 26046 : PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
1304 : {
1305 : char *fnamecopy;
1306 : File file;
1307 : Vfd *vfdP;
1308 :
1309 : DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
1310 : fileName, fileFlags, fileMode));
1311 :
1312 : /*
1313 : * We need a malloc'd copy of the file name; fail cleanly if no room.
1314 : */
1315 26046 : fnamecopy = strdup(fileName);
1316 26046 : if (fnamecopy == NULL)
1317 0 : ereport(ERROR,
1318 : (errcode(ERRCODE_OUT_OF_MEMORY),
1319 : errmsg("out of memory")));
1320 :
1321 26046 : file = AllocateVfd();
1322 26046 : vfdP = &VfdCache[file];
1323 :
1324 : /* Close excess kernel FDs. */
1325 26046 : ReleaseLruFiles();
1326 :
1327 26046 : vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
1328 :
1329 26046 : if (vfdP->fd < 0)
1330 : {
1331 7060 : int save_errno = errno;
1332 :
1333 7060 : FreeVfd(file);
1334 7060 : free(fnamecopy);
1335 7060 : errno = save_errno;
1336 7060 : return -1;
1337 : }
1338 18986 : ++nfile;
1339 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1340 : vfdP->fd));
1341 :
1342 18986 : Insert(file);
1343 :
1344 18986 : vfdP->fileName = fnamecopy;
1345 : /* Saved flags are adjusted to be OK for re-opening file */
1346 18986 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1347 18986 : vfdP->fileMode = fileMode;
1348 18986 : vfdP->seekPos = 0;
1349 18986 : vfdP->fileSize = 0;
1350 18986 : vfdP->fdstate = 0x0;
1351 18986 : vfdP->resowner = NULL;
1352 :
1353 18986 : return file;
1354 : }
1355 :
1356 : /*
1357 : * Open a temporary file that will disappear when we close it.
1358 : *
1359 : * This routine takes care of generating an appropriate tempfile name.
1360 : * There's no need to pass in fileFlags or fileMode either, since only
1361 : * one setting makes any sense for a temp file.
1362 : *
1363 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1364 : * to ensure it's closed and deleted when it's no longer needed, typically at
1365 : * the end-of-transaction. In most cases, you don't want temporary files to
1366 : * outlive the transaction that created them, so this should be false -- but
1367 : * if you need "somewhat" temporary storage, this might be useful. In either
1368 : * case, the file is removed when the File is explicitly closed.
1369 : */
1370 : File
1371 23 : OpenTemporaryFile(bool interXact)
1372 : {
1373 23 : File file = 0;
1374 :
1375 : /*
1376 : * If some temp tablespace(s) have been given to us, try to use the next
1377 : * one. If a given tablespace can't be found, we silently fall back to
1378 : * the database's default tablespace.
1379 : *
1380 : * BUT: if the temp file is slated to outlive the current transaction,
1381 : * force it into the database's default tablespace, so that it will not
1382 : * pose a threat to possible tablespace drop attempts.
1383 : */
1384 23 : if (numTempTableSpaces > 0 && !interXact)
1385 : {
1386 0 : Oid tblspcOid = GetNextTempTableSpace();
1387 :
1388 0 : if (OidIsValid(tblspcOid))
1389 0 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1390 : }
1391 :
1392 : /*
1393 : * If not, or if tablespace is bad, create in database's default
1394 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1395 : * here, but just in case it isn't, fall back to pg_default tablespace.
1396 : */
1397 23 : if (file <= 0)
1398 23 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1399 : MyDatabaseTableSpace :
1400 : DEFAULTTABLESPACE_OID,
1401 : true);
1402 :
1403 : /* Mark it for deletion at close */
1404 23 : VfdCache[file].fdstate |= FD_TEMPORARY;
1405 :
1406 : /* Register it with the current resource owner */
1407 23 : if (!interXact)
1408 : {
1409 23 : VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1410 :
1411 23 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1412 23 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1413 23 : VfdCache[file].resowner = CurrentResourceOwner;
1414 :
1415 : /* ensure cleanup happens at eoxact */
1416 23 : have_xact_temporary_files = true;
1417 : }
1418 :
1419 23 : return file;
1420 : }
1421 :
1422 : /*
1423 : * Open a temporary file in a specific tablespace.
1424 : * Subroutine for OpenTemporaryFile, which see for details.
1425 : */
1426 : static File
1427 23 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1428 : {
1429 : char tempdirpath[MAXPGPATH];
1430 : char tempfilepath[MAXPGPATH];
1431 : File file;
1432 :
1433 : /*
1434 : * Identify the tempfile directory for this tablespace.
1435 : *
1436 : * If someone tries to specify pg_global, use pg_default instead.
1437 : */
1438 23 : if (tblspcOid == DEFAULTTABLESPACE_OID ||
1439 : tblspcOid == GLOBALTABLESPACE_OID)
1440 : {
1441 : /* The default tablespace is {datadir}/base */
1442 23 : snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
1443 : PG_TEMP_FILES_DIR);
1444 : }
1445 : else
1446 : {
1447 : /* All other tablespaces are accessed via symlinks */
1448 0 : snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
1449 : tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
1450 : }
1451 :
1452 : /*
1453 : * Generate a tempfile name that should be unique within the current
1454 : * database instance.
1455 : */
1456 23 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1457 : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1458 :
1459 : /*
1460 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1461 : * temp file that can be reused.
1462 : */
1463 23 : file = PathNameOpenFile(tempfilepath,
1464 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1465 : 0600);
1466 23 : if (file <= 0)
1467 : {
1468 : /*
1469 : * We might need to create the tablespace's tempfile directory, if no
1470 : * one has yet done so.
1471 : *
1472 : * Don't check for error from mkdir; it could fail if someone else
1473 : * just did the same thing. If it doesn't work then we'll bomb out on
1474 : * the second create attempt, instead.
1475 : */
1476 1 : mkdir(tempdirpath, S_IRWXU);
1477 :
1478 1 : file = PathNameOpenFile(tempfilepath,
1479 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1480 : 0600);
1481 1 : if (file <= 0 && rejectError)
1482 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1483 : tempfilepath);
1484 : }
1485 :
1486 23 : return file;
1487 : }
1488 :
/*
 * close a file when done with it
 *
 * Closes the kernel FD if the VFD currently holds one, unlinks the file if
 * it is marked FD_TEMPORARY (reporting its size to the stats collector and,
 * depending on log_temp_files, to the log), unregisters the file from its
 * resource owner if it has one, and finally returns the VFD slot to the
 * free list.  Failures of close(), unlink() and stat() are only logged.
 */
void
FileClose(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileClose: %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (!FileIsNotOpen(file))
	{
		/* close the file */
		if (close(vfdP->fd))
			elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);

		--nfile;
		vfdP->fd = VFD_CLOSED;

		/* remove the file from the lru ring */
		Delete(file);
	}

	/*
	 * Delete the file if it was temporary, and make a log entry if wanted
	 */
	if (vfdP->fdstate & FD_TEMPORARY)
	{
		struct stat filestats;
		int			stat_errno;

		/*
		 * If we get an error, as could happen within the ereport/elog calls,
		 * we'll come right back here during transaction abort.  Reset the
		 * flag to ensure that we can't get into an infinite loop.  This code
		 * is arranged to ensure that the worst-case consequence is failing to
		 * emit log message(s), not failing to attempt the unlink.
		 */
		vfdP->fdstate &= ~FD_TEMPORARY;

		/* Subtract its size from current usage (do first in case of error) */
		temporary_files_size -= vfdP->fileSize;
		vfdP->fileSize = 0;

		/* first try the stat(); remember its errno for reporting below */
		if (stat(vfdP->fileName, &filestats))
			stat_errno = errno;
		else
			stat_errno = 0;

		/* in any case do the unlink */
		if (unlink(vfdP->fileName))
			elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);

		/* and last report the stat results */
		if (stat_errno == 0)
		{
			pgstat_report_tempfile(filestats.st_size);

			/* a negative log_temp_files disables this log message */
			if (log_temp_files >= 0)
			{
				/* st_size is in bytes; compare in kB against the setting */
				if ((filestats.st_size / 1024) >= log_temp_files)
					ereport(LOG,
							(errmsg("temporary file: path \"%s\", size %lu",
									vfdP->fileName,
									(unsigned long) filestats.st_size)));
			}
		}
		else
		{
			/* restore stat()'s errno so that %m reports the right error */
			errno = stat_errno;
			elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
		}
	}

	/* Unregister it from the resource owner */
	if (vfdP->resowner)
		ResourceOwnerForgetFile(vfdP->resowner, file);

	/*
	 * Return the Vfd slot to the free list
	 */
	FreeVfd(file);
}
1578 :
1579 : /*
1580 : * FilePrefetch - initiate asynchronous read of a given range of the file.
1581 : * The logical seek position is unaffected.
1582 : *
1583 : * Currently the only implementation of this function is using posix_fadvise
1584 : * which is the simplest standardized interface that accomplishes this.
1585 : * We could add an implementation using libaio in the future; but note that
1586 : * this API is inappropriate for libaio, which wants to have a buffer provided
1587 : * to read into.
1588 : */
1589 : int
1590 0 : FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1591 : {
1592 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1593 : int returnCode;
1594 :
1595 0 : Assert(FileIsValid(file));
1596 :
1597 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1598 : file, VfdCache[file].fileName,
1599 : (int64) offset, amount));
1600 :
1601 0 : returnCode = FileAccess(file);
1602 0 : if (returnCode < 0)
1603 0 : return returnCode;
1604 :
1605 0 : pgstat_report_wait_start(wait_event_info);
1606 0 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1607 : POSIX_FADV_WILLNEED);
1608 0 : pgstat_report_wait_end();
1609 :
1610 0 : return returnCode;
1611 : #else
1612 : Assert(FileIsValid(file));
1613 : return 0;
1614 : #endif
1615 : }
1616 :
1617 : void
1618 1462 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1619 : {
1620 : int returnCode;
1621 :
1622 1462 : Assert(FileIsValid(file));
1623 :
1624 : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1625 : file, VfdCache[file].fileName,
1626 : (int64) offset, (int64) nbytes));
1627 :
1628 : /*
1629 : * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1630 : * file's seek position. We prefer to define that as a no-op here.
1631 : */
1632 1462 : if (nbytes <= 0)
1633 0 : return;
1634 :
1635 1462 : returnCode = FileAccess(file);
1636 1462 : if (returnCode < 0)
1637 0 : return;
1638 :
1639 1462 : pgstat_report_wait_start(wait_event_info);
1640 1462 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
1641 1462 : pgstat_report_wait_end();
1642 : }
1643 :
/*
 * FileRead - read up to 'amount' bytes from the file's current kernel
 * position into 'buffer'.
 *
 * Returns the result of read(), or the negative result of FileAccess() if
 * the file could not be made accessible.  On success the tracked seek
 * position is advanced by the number of bytes read (unless it was already
 * unknown); on failure other than EINTR it is marked unknown.  The time
 * spent in read() is reported under wait_event_info.
 */
int
FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
{
	int			returnCode;
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
			   file, VfdCache[file].fileName,
			   (int64) VfdCache[file].seekPos,
			   amount, buffer));

	/* make sure the kernel FD is open before touching it */
	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	vfdP = &VfdCache[file];

retry:
	pgstat_report_wait_start(wait_event_info);
	returnCode = read(vfdP->fd, buffer, amount);
	pgstat_report_wait_end();

	if (returnCode >= 0)
	{
		/* if seekPos is unknown, leave it that way */
		if (!FilePosIsUnknown(vfdP->seekPos))
			vfdP->seekPos += returnCode;
	}
	else
	{
		/*
		 * Windows may run out of kernel buffers and return "Insufficient
		 * system resources" error.  Wait a bit and retry to solve it.
		 *
		 * It is rumored that EINTR is also possible on some Unix filesystems,
		 * in which case immediate retry is indicated.
		 */
#ifdef WIN32
		DWORD		error = GetLastError();

		switch (error)
		{
			case ERROR_NO_SYSTEM_RESOURCES:
				pg_usleep(1000L);
				/* pretend we were interrupted so the retry below fires */
				errno = EINTR;
				break;
			default:
				_dosmaperr(error);
				break;
		}
#endif
		/* OK to retry if interrupted */
		if (errno == EINTR)
			goto retry;

		/* Trouble, so assume we don't know the file position anymore */
		vfdP->seekPos = FileUnknownPos;
	}

	return returnCode;
}
1707 :
/*
 * FileWrite - write 'amount' bytes from 'buffer' at the file's current
 * kernel position.
 *
 * Returns the result of write(), or the negative result of FileAccess() if
 * the file could not be made accessible.  If write() comes up short without
 * setting errno, errno is set to ENOSPC.  On success the tracked seek
 * position is advanced and, for temp files, fileSize and
 * temporary_files_size are kept up to date; on failure other than EINTR the
 * seek position is marked unknown.
 *
 * When temp_file_limit is being enforced and the file is temporary, this
 * raises ERROR (rather than returning -1) if the write would exceed the
 * limit, or if the current seek position cannot be determined.
 */
int
FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
{
	int			returnCode;
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
			   file, VfdCache[file].fileName,
			   (int64) VfdCache[file].seekPos,
			   amount, buffer));

	/* make sure the kernel FD is open before touching it */
	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	vfdP = &VfdCache[file];

	/*
	 * If enforcing temp_file_limit and it's a temp file, check to see if the
	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
	 * really a modularity violation to throw error here; we should set errno
	 * and return -1.  However, there's no way to report a suitable error
	 * message if we do that.  All current callers would just throw error
	 * immediately anyway, so this is safe at present.
	 */
	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
	{
		off_t		newPos;

		/*
		 * Normally we should know the seek position, but if for some reason
		 * we have lost track of it, try again to get it.  Here, it's fine to
		 * throw an error if we still can't get it.
		 */
		if (FilePosIsUnknown(vfdP->seekPos))
		{
			vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
			if (FilePosIsUnknown(vfdP->seekPos))
				elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
		}

		/* only growth past the recorded file size counts against the limit */
		newPos = vfdP->seekPos + amount;
		if (newPos > vfdP->fileSize)
		{
			uint64		newTotal = temporary_files_size;

			newTotal += newPos - vfdP->fileSize;
			/* temp_file_limit is in kB; compare in bytes */
			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
								temp_file_limit)));
		}
	}

retry:
	/* clear errno so a short write that leaves it unset can be detected */
	errno = 0;
	pgstat_report_wait_start(wait_event_info);
	returnCode = write(vfdP->fd, buffer, amount);
	pgstat_report_wait_end();

	/* if write didn't set errno, assume problem is no disk space */
	if (returnCode != amount && errno == 0)
		errno = ENOSPC;

	if (returnCode >= 0)
	{
		/* if seekPos is unknown, leave it that way */
		if (!FilePosIsUnknown(vfdP->seekPos))
			vfdP->seekPos += returnCode;

		/*
		 * Maintain fileSize and temporary_files_size if it's a temp file.
		 *
		 * If seekPos is -1 (unknown), this will do nothing; but we could only
		 * get here in that state if we're not enforcing temporary_files_size,
		 * so we don't care.
		 */
		if (vfdP->fdstate & FD_TEMPORARY)
		{
			off_t		newPos = vfdP->seekPos;

			if (newPos > vfdP->fileSize)
			{
				temporary_files_size += newPos - vfdP->fileSize;
				vfdP->fileSize = newPos;
			}
		}
	}
	else
	{
		/*
		 * See comments in FileRead()
		 */
#ifdef WIN32
		DWORD		error = GetLastError();

		switch (error)
		{
			case ERROR_NO_SYSTEM_RESOURCES:
				pg_usleep(1000L);
				/* pretend we were interrupted so the retry below fires */
				errno = EINTR;
				break;
			default:
				_dosmaperr(error);
				break;
		}
#endif
		/* OK to retry if interrupted */
		if (errno == EINTR)
			goto retry;

		/* Trouble, so assume we don't know the file position anymore */
		vfdP->seekPos = FileUnknownPos;
	}

	return returnCode;
}
1828 :
1829 : int
1830 1260 : FileSync(File file, uint32 wait_event_info)
1831 : {
1832 : int returnCode;
1833 :
1834 1260 : Assert(FileIsValid(file));
1835 :
1836 : DO_DB(elog(LOG, "FileSync: %d (%s)",
1837 : file, VfdCache[file].fileName));
1838 :
1839 1260 : returnCode = FileAccess(file);
1840 1260 : if (returnCode < 0)
1841 0 : return returnCode;
1842 :
1843 1260 : pgstat_report_wait_start(wait_event_info);
1844 1260 : returnCode = pg_fsync(VfdCache[file].fd);
1845 1260 : pgstat_report_wait_end();
1846 :
1847 1260 : return returnCode;
1848 : }
1849 :
1850 : off_t
1851 148547 : FileSeek(File file, off_t offset, int whence)
1852 : {
1853 : Vfd *vfdP;
1854 :
1855 148547 : Assert(FileIsValid(file));
1856 :
1857 : DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1858 : file, VfdCache[file].fileName,
1859 : (int64) VfdCache[file].seekPos,
1860 : (int64) offset, whence));
1861 :
1862 148547 : vfdP = &VfdCache[file];
1863 :
1864 148547 : if (FileIsNotOpen(file))
1865 : {
1866 636 : switch (whence)
1867 : {
1868 : case SEEK_SET:
1869 107 : if (offset < 0)
1870 : {
1871 0 : errno = EINVAL;
1872 0 : return (off_t) -1;
1873 : }
1874 107 : vfdP->seekPos = offset;
1875 107 : break;
1876 : case SEEK_CUR:
1877 0 : if (FilePosIsUnknown(vfdP->seekPos) ||
1878 0 : vfdP->seekPos + offset < 0)
1879 : {
1880 0 : errno = EINVAL;
1881 0 : return (off_t) -1;
1882 : }
1883 0 : vfdP->seekPos += offset;
1884 0 : break;
1885 : case SEEK_END:
1886 529 : if (FileAccess(file) < 0)
1887 0 : return (off_t) -1;
1888 529 : vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1889 529 : break;
1890 : default:
1891 0 : elog(ERROR, "invalid whence: %d", whence);
1892 : break;
1893 : }
1894 : }
1895 : else
1896 : {
1897 147911 : switch (whence)
1898 : {
1899 : case SEEK_SET:
1900 31012 : if (offset < 0)
1901 : {
1902 0 : errno = EINVAL;
1903 0 : return (off_t) -1;
1904 : }
1905 31012 : if (vfdP->seekPos != offset)
1906 3669 : vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1907 31012 : break;
1908 : case SEEK_CUR:
1909 0 : if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1910 0 : vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1911 0 : break;
1912 : case SEEK_END:
1913 116899 : vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1914 116899 : break;
1915 : default:
1916 0 : elog(ERROR, "invalid whence: %d", whence);
1917 : break;
1918 : }
1919 : }
1920 :
1921 148547 : return vfdP->seekPos;
1922 : }
1923 :
/*
 * FileTell - report the tracked seek position of a file.
 *
 * XXX not actually used but here for completeness
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
	Vfd		   *entry;

	Assert(FileIsValid(file));
	DO_DB(elog(LOG, "FileTell %d (%s)",
			   file, VfdCache[file].fileName));

	entry = &VfdCache[file];
	return entry->seekPos;
}
#endif
1937 :
1938 : int
1939 23 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
1940 : {
1941 : int returnCode;
1942 :
1943 23 : Assert(FileIsValid(file));
1944 :
1945 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
1946 : file, VfdCache[file].fileName));
1947 :
1948 23 : returnCode = FileAccess(file);
1949 23 : if (returnCode < 0)
1950 0 : return returnCode;
1951 :
1952 23 : pgstat_report_wait_start(wait_event_info);
1953 23 : returnCode = ftruncate(VfdCache[file].fd, offset);
1954 23 : pgstat_report_wait_end();
1955 :
1956 23 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
1957 : {
1958 : /* adjust our state for truncation of a temp file */
1959 0 : Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1960 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
1961 0 : VfdCache[file].fileSize = offset;
1962 : }
1963 :
1964 23 : return returnCode;
1965 : }
1966 :
1967 : /*
1968 : * Return the pathname associated with an open file.
1969 : *
1970 : * The returned string points to an internal buffer, which is valid until
1971 : * the file is closed.
1972 : */
1973 : char *
1974 0 : FilePathName(File file)
1975 : {
1976 0 : Assert(FileIsValid(file));
1977 :
1978 0 : return VfdCache[file].fileName;
1979 : }
1980 :
1981 : /*
1982 : * Return the raw file descriptor of an opened file.
1983 : *
1984 : * The returned file descriptor will be valid until the file is closed, but
1985 : * there are a lot of things that can make that happen. So the caller should
1986 : * be careful not to do much of anything else before it finishes using the
1987 : * returned file descriptor.
1988 : */
1989 : int
1990 0 : FileGetRawDesc(File file)
1991 : {
1992 0 : Assert(FileIsValid(file));
1993 0 : return VfdCache[file].fd;
1994 : }
1995 :
1996 : /*
1997 : * FileGetRawFlags - returns the file flags on open(2)
1998 : */
1999 : int
2000 0 : FileGetRawFlags(File file)
2001 : {
2002 0 : Assert(FileIsValid(file));
2003 0 : return VfdCache[file].fileFlags;
2004 : }
2005 :
2006 : /*
2007 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2008 : */
2009 : int
2010 0 : FileGetRawMode(File file)
2011 : {
2012 0 : Assert(FileIsValid(file));
2013 0 : return VfdCache[file].fileMode;
2014 : }
2015 :
2016 : /*
2017 : * Make room for another allocatedDescs[] array entry if needed and possible.
2018 : * Returns true if an array element is available.
2019 : */
2020 : static bool
2021 9509 : reserveAllocatedDesc(void)
2022 : {
2023 : AllocateDesc *newDescs;
2024 : int newMax;
2025 :
2026 : /* Quick out if array already has a free slot. */
2027 9509 : if (numAllocatedDescs < maxAllocatedDescs)
2028 9504 : return true;
2029 :
2030 : /*
2031 : * If the array hasn't yet been created in the current process, initialize
2032 : * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2033 : * we will ever need, anyway. We don't want to look at max_safe_fds
2034 : * immediately because set_max_safe_fds() may not have run yet.
2035 : */
2036 5 : if (allocatedDescs == NULL)
2037 : {
2038 5 : newMax = FD_MINFREE / 2;
2039 5 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2040 : /* Out of memory already? Treat as fatal error. */
2041 5 : if (newDescs == NULL)
2042 0 : ereport(ERROR,
2043 : (errcode(ERRCODE_OUT_OF_MEMORY),
2044 : errmsg("out of memory")));
2045 5 : allocatedDescs = newDescs;
2046 5 : maxAllocatedDescs = newMax;
2047 5 : return true;
2048 : }
2049 :
2050 : /*
2051 : * Consider enlarging the array beyond the initial allocation used above.
2052 : * By the time this happens, max_safe_fds should be known accurately.
2053 : *
2054 : * We mustn't let allocated descriptors hog all the available FDs, and in
2055 : * practice we'd better leave a reasonable number of FDs for VFD use. So
2056 : * set the maximum to max_safe_fds / 2. (This should certainly be at
2057 : * least as large as the initial size, FD_MINFREE / 2.)
2058 : */
2059 0 : newMax = max_safe_fds / 2;
2060 0 : if (newMax > maxAllocatedDescs)
2061 : {
2062 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2063 : newMax * sizeof(AllocateDesc));
2064 : /* Treat out-of-memory as a non-fatal error. */
2065 0 : if (newDescs == NULL)
2066 0 : return false;
2067 0 : allocatedDescs = newDescs;
2068 0 : maxAllocatedDescs = newMax;
2069 0 : return true;
2070 : }
2071 :
2072 : /* Can't enlarge allocatedDescs[] any more. */
2073 0 : return false;
2074 : }
2075 :
2076 : /*
2077 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2078 : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2079 : * necessary to open the file. When done, call FreeFile rather than fclose.
2080 : *
2081 : * Note that files that will be open for any significant length of time
2082 : * should NOT be handled this way, since they cannot share kernel file
2083 : * descriptors with other files; there is grave risk of running out of FDs
2084 : * if anyone locks down too many FDs. Most callers of this routine are
2085 : * simply reading a config file that they will read and close immediately.
2086 : *
2087 : * fd.c will automatically close all files opened with AllocateFile at
2088 : * transaction commit or abort; this prevents FD leakage if a routine
2089 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2090 : *
2091 : * Ideally this should be the *only* direct call of fopen() in the backend.
2092 : */
2093 : FILE *
2094 3458 : AllocateFile(const char *name, const char *mode)
2095 : {
2096 : FILE *file;
2097 :
2098 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2099 : numAllocatedDescs, name));
2100 :
2101 : /* Can we allocate another non-virtual FD? */
2102 3458 : if (!reserveAllocatedDesc())
2103 0 : ereport(ERROR,
2104 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2105 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2106 : maxAllocatedDescs, name)));
2107 :
2108 : /* Close excess kernel FDs. */
2109 3458 : ReleaseLruFiles();
2110 :
2111 : TryAgain:
2112 3458 : if ((file = fopen(name, mode)) != NULL)
2113 : {
2114 3430 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2115 :
2116 3430 : desc->kind = AllocateDescFile;
2117 3430 : desc->desc.file = file;
2118 3430 : desc->create_subid = GetCurrentSubTransactionId();
2119 3430 : numAllocatedDescs++;
2120 3430 : return desc->desc.file;
2121 : }
2122 :
2123 28 : if (errno == EMFILE || errno == ENFILE)
2124 : {
2125 0 : int save_errno = errno;
2126 :
2127 0 : ereport(LOG,
2128 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2129 : errmsg("out of file descriptors: %m; release and retry")));
2130 0 : errno = 0;
2131 0 : if (ReleaseLruFile())
2132 0 : goto TryAgain;
2133 0 : errno = save_errno;
2134 : }
2135 :
2136 28 : return NULL;
2137 : }
2138 :
2139 :
2140 : /*
2141 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2142 : */
2143 : int
2144 5465 : OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
2145 : {
2146 : int fd;
2147 :
2148 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2149 : numAllocatedDescs, fileName));
2150 :
2151 : /* Can we allocate another non-virtual FD? */
2152 5465 : if (!reserveAllocatedDesc())
2153 0 : ereport(ERROR,
2154 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2155 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2156 : maxAllocatedDescs, fileName)));
2157 :
2158 : /* Close excess kernel FDs. */
2159 5465 : ReleaseLruFiles();
2160 :
2161 5465 : fd = BasicOpenFile(fileName, fileFlags, fileMode);
2162 :
2163 5465 : if (fd >= 0)
2164 : {
2165 5463 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2166 :
2167 5463 : desc->kind = AllocateDescRawFD;
2168 5463 : desc->desc.fd = fd;
2169 5463 : desc->create_subid = GetCurrentSubTransactionId();
2170 5463 : numAllocatedDescs++;
2171 :
2172 5463 : return fd;
2173 : }
2174 :
2175 2 : return -1; /* failure */
2176 : }
2177 :
2178 : /*
2179 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2180 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2181 : * necessary. When done, call ClosePipeStream rather than pclose.
2182 : */
2183 : FILE *
2184 1 : OpenPipeStream(const char *command, const char *mode)
2185 : {
2186 : FILE *file;
2187 :
2188 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2189 : numAllocatedDescs, command));
2190 :
2191 : /* Can we allocate another non-virtual FD? */
2192 1 : if (!reserveAllocatedDesc())
2193 0 : ereport(ERROR,
2194 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2195 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2196 : maxAllocatedDescs, command)));
2197 :
2198 : /* Close excess kernel FDs. */
2199 1 : ReleaseLruFiles();
2200 :
2201 : TryAgain:
2202 1 : fflush(stdout);
2203 1 : fflush(stderr);
2204 1 : errno = 0;
2205 1 : if ((file = popen(command, mode)) != NULL)
2206 : {
2207 1 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2208 :
2209 1 : desc->kind = AllocateDescPipe;
2210 1 : desc->desc.file = file;
2211 1 : desc->create_subid = GetCurrentSubTransactionId();
2212 1 : numAllocatedDescs++;
2213 1 : return desc->desc.file;
2214 : }
2215 :
2216 0 : if (errno == EMFILE || errno == ENFILE)
2217 : {
2218 0 : int save_errno = errno;
2219 :
2220 0 : ereport(LOG,
2221 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2222 : errmsg("out of file descriptors: %m; release and retry")));
2223 0 : errno = 0;
2224 0 : if (ReleaseLruFile())
2225 0 : goto TryAgain;
2226 0 : errno = save_errno;
2227 : }
2228 :
2229 0 : return NULL;
2230 : }
2231 :
2232 : /*
2233 : * Free an AllocateDesc of any type.
2234 : *
2235 : * The argument *must* point into the allocatedDescs[] array.
2236 : */
2237 : static int
2238 9478 : FreeDesc(AllocateDesc *desc)
2239 : {
2240 : int result;
2241 :
2242 : /* Close the underlying object */
2243 9478 : switch (desc->kind)
2244 : {
2245 : case AllocateDescFile:
2246 3430 : result = fclose(desc->desc.file);
2247 3430 : break;
2248 : case AllocateDescPipe:
2249 1 : result = pclose(desc->desc.file);
2250 1 : break;
2251 : case AllocateDescDir:
2252 584 : result = closedir(desc->desc.dir);
2253 584 : break;
2254 : case AllocateDescRawFD:
2255 5463 : result = close(desc->desc.fd);
2256 5463 : break;
2257 : default:
2258 0 : elog(ERROR, "AllocateDesc kind not recognized");
2259 : result = 0; /* keep compiler quiet */
2260 : break;
2261 : }
2262 :
2263 : /* Compact storage in the allocatedDescs array */
2264 9478 : numAllocatedDescs--;
2265 9478 : *desc = allocatedDescs[numAllocatedDescs];
2266 :
2267 9478 : return result;
2268 : }
2269 :
2270 : /*
2271 : * Close a file returned by AllocateFile.
2272 : *
2273 : * Note we do not check fclose's return value --- it is up to the caller
2274 : * to handle close errors.
2275 : */
2276 : int
2277 3429 : FreeFile(FILE *file)
2278 : {
2279 : int i;
2280 :
2281 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2282 :
2283 : /* Remove file from list of allocated files, if it's present */
2284 6858 : for (i = numAllocatedDescs; --i >= 0;)
2285 : {
2286 3429 : AllocateDesc *desc = &allocatedDescs[i];
2287 :
2288 3429 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2289 3429 : return FreeDesc(desc);
2290 : }
2291 :
2292 : /* Only get here if someone passes us a file not in allocatedDescs */
2293 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2294 :
2295 0 : return fclose(file);
2296 : }
2297 :
2298 : /*
2299 : * Close a file returned by OpenTransientFile.
2300 : *
2301 : * Note we do not check close's return value --- it is up to the caller
2302 : * to handle close errors.
2303 : */
2304 : int
2305 5463 : CloseTransientFile(int fd)
2306 : {
2307 : int i;
2308 :
2309 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2310 :
2311 : /* Remove fd from list of allocated files, if it's present */
2312 10926 : for (i = numAllocatedDescs; --i >= 0;)
2313 : {
2314 5463 : AllocateDesc *desc = &allocatedDescs[i];
2315 :
2316 5463 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2317 5463 : return FreeDesc(desc);
2318 : }
2319 :
2320 : /* Only get here if someone passes us a file not in allocatedDescs */
2321 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2322 :
2323 0 : return close(fd);
2324 : }
2325 :
2326 : /*
2327 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2328 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2329 : * necessary to open the directory, and with closing it after an elog.
2330 : * When done, call FreeDir rather than closedir.
2331 : *
2332 : * Ideally this should be the *only* direct call of opendir() in the backend.
2333 : */
2334 : DIR *
2335 585 : AllocateDir(const char *dirname)
2336 : {
2337 : DIR *dir;
2338 :
2339 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2340 : numAllocatedDescs, dirname));
2341 :
2342 : /* Can we allocate another non-virtual FD? */
2343 585 : if (!reserveAllocatedDesc())
2344 0 : ereport(ERROR,
2345 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2346 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2347 : maxAllocatedDescs, dirname)));
2348 :
2349 : /* Close excess kernel FDs. */
2350 585 : ReleaseLruFiles();
2351 :
2352 : TryAgain:
2353 585 : if ((dir = opendir(dirname)) != NULL)
2354 : {
2355 584 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2356 :
2357 584 : desc->kind = AllocateDescDir;
2358 584 : desc->desc.dir = dir;
2359 584 : desc->create_subid = GetCurrentSubTransactionId();
2360 584 : numAllocatedDescs++;
2361 584 : return desc->desc.dir;
2362 : }
2363 :
2364 1 : if (errno == EMFILE || errno == ENFILE)
2365 : {
2366 0 : int save_errno = errno;
2367 :
2368 0 : ereport(LOG,
2369 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2370 : errmsg("out of file descriptors: %m; release and retry")));
2371 0 : errno = 0;
2372 0 : if (ReleaseLruFile())
2373 0 : goto TryAgain;
2374 0 : errno = save_errno;
2375 : }
2376 :
2377 1 : return NULL;
2378 : }
2379 :
2380 : /*
2381 : * Read a directory opened with AllocateDir, ereport'ing any error.
2382 : *
2383 : * This is easier to use than raw readdir() since it takes care of some
2384 : * otherwise rather tedious and error-prone manipulation of errno. Also,
2385 : * if you are happy with a generic error message for AllocateDir failure,
2386 : * you can just do
2387 : *
2388 : * dir = AllocateDir(path);
2389 : * while ((dirent = ReadDir(dir, path)) != NULL)
2390 : * process dirent;
2391 : * FreeDir(dir);
2392 : *
2393 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2394 : * (Make sure errno hasn't been changed since AllocateDir if you use this
2395 : * shortcut.)
2396 : *
2397 : * The pathname passed to AllocateDir must be passed to this routine too,
2398 : * but it is only used for error reporting.
2399 : */
2400 : struct dirent *
2401 18266 : ReadDir(DIR *dir, const char *dirname)
2402 : {
2403 18266 : return ReadDirExtended(dir, dirname, ERROR);
2404 : }
2405 :
2406 : /*
2407 : * Alternate version that allows caller to specify the elevel for any
2408 : * error report. If elevel < ERROR, returns NULL on any error.
2409 : */
2410 : static struct dirent *
2411 18266 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2412 : {
2413 : struct dirent *dent;
2414 :
2415 : /* Give a generic message for AllocateDir failure, if caller didn't */
2416 18266 : if (dir == NULL)
2417 : {
2418 0 : ereport(elevel,
2419 : (errcode_for_file_access(),
2420 : errmsg("could not open directory \"%s\": %m",
2421 : dirname)));
2422 0 : return NULL;
2423 : }
2424 :
2425 18266 : errno = 0;
2426 18266 : if ((dent = readdir(dir)) != NULL)
2427 18124 : return dent;
2428 :
2429 142 : if (errno)
2430 0 : ereport(elevel,
2431 : (errcode_for_file_access(),
2432 : errmsg("could not read directory \"%s\": %m",
2433 : dirname)));
2434 142 : return NULL;
2435 : }
2436 :
2437 : /*
2438 : * Close a directory opened with AllocateDir.
2439 : *
2440 : * Note we do not check closedir's return value --- it is up to the caller
2441 : * to handle close errors.
2442 : */
2443 : int
2444 584 : FreeDir(DIR *dir)
2445 : {
2446 : int i;
2447 :
2448 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2449 :
2450 : /* Remove dir from list of allocated dirs, if it's present */
2451 1168 : for (i = numAllocatedDescs; --i >= 0;)
2452 : {
2453 584 : AllocateDesc *desc = &allocatedDescs[i];
2454 :
2455 584 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2456 584 : return FreeDesc(desc);
2457 : }
2458 :
2459 : /* Only get here if someone passes us a dir not in allocatedDescs */
2460 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2461 :
2462 0 : return closedir(dir);
2463 : }
2464 :
2465 :
2466 : /*
2467 : * Close a pipe stream returned by OpenPipeStream.
2468 : */
2469 : int
2470 1 : ClosePipeStream(FILE *file)
2471 : {
2472 : int i;
2473 :
2474 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2475 :
2476 : /* Remove file from list of allocated files, if it's present */
2477 2 : for (i = numAllocatedDescs; --i >= 0;)
2478 : {
2479 1 : AllocateDesc *desc = &allocatedDescs[i];
2480 :
2481 1 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2482 1 : return FreeDesc(desc);
2483 : }
2484 :
2485 : /* Only get here if someone passes us a file not in allocatedDescs */
2486 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2487 :
2488 0 : return pclose(file);
2489 : }
2490 :
2491 : /*
2492 : * closeAllVfds
2493 : *
2494 : * Force all VFDs into the physically-closed state, so that the fewest
2495 : * possible number of kernel file descriptors are in use. There is no
2496 : * change in the logical state of the VFDs.
2497 : */
2498 : void
2499 1 : closeAllVfds(void)
2500 : {
2501 : Index i;
2502 :
2503 1 : if (SizeVfdCache > 0)
2504 : {
2505 1 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2506 32 : for (i = 1; i < SizeVfdCache; i++)
2507 : {
2508 31 : if (!FileIsNotOpen(i))
2509 5 : LruDelete(i);
2510 : }
2511 : }
2512 1 : }
2513 :
2514 :
2515 : /*
2516 : * SetTempTablespaces
2517 : *
2518 : * Define a list (actually an array) of OIDs of tablespaces to use for
2519 : * temporary files. This list will be used until end of transaction,
2520 : * unless this function is called again before then. It is caller's
2521 : * responsibility that the passed-in array has adequate lifespan (typically
2522 : * it'd be allocated in TopTransactionContext).
2523 : */
2524 : void
2525 338 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2526 : {
2527 338 : Assert(numSpaces >= 0);
2528 338 : tempTableSpaces = tableSpaces;
2529 338 : numTempTableSpaces = numSpaces;
2530 :
2531 : /*
2532 : * Select a random starting point in the list. This is to minimize
2533 : * conflicts between backends that are most likely sharing the same list
2534 : * of temp tablespaces. Note that if we create multiple temp files in the
2535 : * same transaction, we'll advance circularly through the list --- this
2536 : * ensures that large temporary sort files are nicely spread across all
2537 : * available tablespaces.
2538 : */
2539 338 : if (numSpaces > 1)
2540 0 : nextTempTableSpace = random() % numSpaces;
2541 : else
2542 338 : nextTempTableSpace = 0;
2543 338 : }
2544 :
2545 : /*
2546 : * TempTablespacesAreSet
2547 : *
2548 : * Returns TRUE if SetTempTablespaces has been called in current transaction.
2549 : * (This is just so that tablespaces.c doesn't need its own per-transaction
2550 : * state.)
2551 : */
2552 : bool
2553 427 : TempTablespacesAreSet(void)
2554 : {
2555 427 : return (numTempTableSpaces >= 0);
2556 : }
2557 :
2558 : /*
2559 : * GetNextTempTableSpace
2560 : *
2561 : * Select the next temp tablespace to use. A result of InvalidOid means
2562 : * to use the current database's default tablespace.
2563 : */
2564 : Oid
2565 417 : GetNextTempTableSpace(void)
2566 : {
2567 417 : if (numTempTableSpaces > 0)
2568 : {
2569 : /* Advance nextTempTableSpace counter with wraparound */
2570 0 : if (++nextTempTableSpace >= numTempTableSpaces)
2571 0 : nextTempTableSpace = 0;
2572 0 : return tempTableSpaces[nextTempTableSpace];
2573 : }
2574 417 : return InvalidOid;
2575 : }
2576 :
2577 :
2578 : /*
2579 : * AtEOSubXact_Files
2580 : *
2581 : * Take care of subtransaction commit/abort. At abort, we close temp files
2582 : * that the subtransaction may have opened. At commit, we reassign the
2583 : * files that were opened to the parent subtransaction.
2584 : */
2585 : void
2586 372 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2587 : SubTransactionId parentSubid)
2588 : {
2589 : Index i;
2590 :
2591 372 : for (i = 0; i < numAllocatedDescs; i++)
2592 : {
2593 0 : if (allocatedDescs[i].create_subid == mySubid)
2594 : {
2595 0 : if (isCommit)
2596 0 : allocatedDescs[i].create_subid = parentSubid;
2597 : else
2598 : {
2599 : /* have to recheck the item after FreeDesc (ugly) */
2600 0 : FreeDesc(&allocatedDescs[i--]);
2601 : }
2602 : }
2603 : }
2604 372 : }
2605 :
2606 : /*
2607 : * AtEOXact_Files
2608 : *
2609 : * This routine is called during transaction commit or abort (it doesn't
2610 : * particularly care which). All still-open per-transaction temporary file
2611 : * VFDs are closed, which also causes the underlying files to be deleted
2612 : * (although they should've been closed already by the ResourceOwner
2613 : * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2614 : * forget any transaction-local temp tablespace list.
2615 : */
2616 : void
2617 26167 : AtEOXact_Files(void)
2618 : {
2619 26167 : CleanupTempFiles(false);
2620 26167 : tempTableSpaces = NULL;
2621 26167 : numTempTableSpaces = -1;
2622 26167 : }
2623 :
2624 : /*
2625 : * AtProcExit_Files
2626 : *
2627 : * on_proc_exit hook to clean up temp files during backend shutdown.
2628 : * Here, we want to clean up *all* temp files including interXact ones.
2629 : */
2630 : static void
2631 344 : AtProcExit_Files(int code, Datum arg)
2632 : {
2633 344 : CleanupTempFiles(true);
2634 344 : }
2635 :
2636 : /*
2637 : * Close temporary files and delete their underlying files.
2638 : *
2639 : * isProcExit: if true, this is being called as the backend process is
2640 : * exiting. If that's the case, we should remove all temporary files; if
2641 : * that's not the case, we are being called for transaction commit/abort
2642 : * and should only remove transaction-local temp files. In either case,
2643 : * also clean up "allocated" stdio files, dirs and fds.
2644 : */
2645 : static void
2646 26511 : CleanupTempFiles(bool isProcExit)
2647 : {
2648 : Index i;
2649 :
2650 : /*
2651 : * Careful here: at proc_exit we need extra cleanup, not just
2652 : * xact_temporary files.
2653 : */
2654 26511 : if (isProcExit || have_xact_temporary_files)
2655 : {
2656 353 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2657 17733 : for (i = 1; i < SizeVfdCache; i++)
2658 : {
2659 17380 : unsigned short fdstate = VfdCache[i].fdstate;
2660 :
2661 17380 : if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2662 : {
2663 : /*
2664 : * If we're in the process of exiting a backend process, close
2665 : * all temporary files. Otherwise, only close temporary files
2666 : * local to the current transaction. They should be closed by
2667 : * the ResourceOwner mechanism already, so this is just a
2668 : * debugging cross-check.
2669 : */
2670 0 : if (isProcExit)
2671 0 : FileClose(i);
2672 0 : else if (fdstate & FD_XACT_TEMPORARY)
2673 : {
2674 0 : elog(WARNING,
2675 : "temporary file %s not closed at end-of-transaction",
2676 : VfdCache[i].fileName);
2677 0 : FileClose(i);
2678 : }
2679 : }
2680 : }
2681 :
2682 353 : have_xact_temporary_files = false;
2683 : }
2684 :
2685 : /* Clean up "allocated" stdio files, dirs and fds. */
2686 53023 : while (numAllocatedDescs > 0)
2687 1 : FreeDesc(&allocatedDescs[0]);
2688 26511 : }
2689 :
2690 :
2691 : /*
2692 : * Remove temporary and temporary relation files left over from a prior
2693 : * postmaster session
2694 : *
2695 : * This should be called during postmaster startup. It will forcibly
2696 : * remove any leftover files created by OpenTemporaryFile and any leftover
2697 : * temporary relation files created by mdcreate.
2698 : *
2699 : * NOTE: we could, but don't, call this during a post-backend-crash restart
2700 : * cycle. The argument for not doing it is that someone might want to examine
2701 : * the temp files for debugging purposes. This does however mean that
2702 : * OpenTemporaryFile had better allow for collision with an existing temp
2703 : * file name.
2704 : */
2705 : void
2706 1 : RemovePgTempFiles(void)
2707 : {
2708 : char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2709 : DIR *spc_dir;
2710 : struct dirent *spc_de;
2711 :
2712 : /*
2713 : * First process temp files in pg_default ($PGDATA/base)
2714 : */
2715 1 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2716 1 : RemovePgTempFilesInDir(temp_path);
2717 1 : RemovePgTempRelationFiles("base");
2718 :
2719 : /*
2720 : * Cycle through temp directories for all non-default tablespaces.
2721 : */
2722 1 : spc_dir = AllocateDir("pg_tblspc");
2723 :
2724 4 : while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2725 : {
2726 3 : if (strcmp(spc_de->d_name, ".") == 0 ||
2727 1 : strcmp(spc_de->d_name, "..") == 0)
2728 2 : continue;
2729 :
2730 0 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2731 0 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
2732 0 : RemovePgTempFilesInDir(temp_path);
2733 :
2734 0 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2735 0 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
2736 0 : RemovePgTempRelationFiles(temp_path);
2737 : }
2738 :
2739 1 : FreeDir(spc_dir);
2740 :
2741 : /*
2742 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2743 : * DataDir as well.
2744 : */
2745 : #ifdef EXEC_BACKEND
2746 : RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
2747 : #endif
2748 1 : }
2749 :
2750 : /* Process one pgsql_tmp directory for RemovePgTempFiles */
2751 : static void
2752 1 : RemovePgTempFilesInDir(const char *tmpdirname)
2753 : {
2754 : DIR *temp_dir;
2755 : struct dirent *temp_de;
2756 : char rm_path[MAXPGPATH * 2];
2757 :
2758 1 : temp_dir = AllocateDir(tmpdirname);
2759 1 : if (temp_dir == NULL)
2760 : {
2761 : /* anything except ENOENT is fishy */
2762 1 : if (errno != ENOENT)
2763 0 : elog(LOG,
2764 : "could not open temporary-files directory \"%s\": %m",
2765 : tmpdirname);
2766 2 : return;
2767 : }
2768 :
2769 0 : while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2770 : {
2771 0 : if (strcmp(temp_de->d_name, ".") == 0 ||
2772 0 : strcmp(temp_de->d_name, "..") == 0)
2773 0 : continue;
2774 :
2775 0 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
2776 0 : tmpdirname, temp_de->d_name);
2777 :
2778 0 : if (strncmp(temp_de->d_name,
2779 : PG_TEMP_FILE_PREFIX,
2780 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
2781 0 : unlink(rm_path); /* note we ignore any error */
2782 : else
2783 0 : elog(LOG,
2784 : "unexpected file found in temporary-files directory: \"%s\"",
2785 : rm_path);
2786 : }
2787 :
2788 0 : FreeDir(temp_dir);
2789 : }
2790 :
2791 : /* Process one tablespace directory, look for per-DB subdirectories */
2792 : static void
2793 1 : RemovePgTempRelationFiles(const char *tsdirname)
2794 : {
2795 : DIR *ts_dir;
2796 : struct dirent *de;
2797 : char dbspace_path[MAXPGPATH * 2];
2798 :
2799 1 : ts_dir = AllocateDir(tsdirname);
2800 1 : if (ts_dir == NULL)
2801 : {
2802 : /* anything except ENOENT is fishy */
2803 0 : if (errno != ENOENT)
2804 0 : elog(LOG,
2805 : "could not open tablespace directory \"%s\": %m",
2806 : tsdirname);
2807 1 : return;
2808 : }
2809 :
2810 7 : while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2811 : {
2812 5 : int i = 0;
2813 :
2814 : /*
2815 : * We're only interested in the per-database directories, which have
2816 : * numeric names. Note that this code will also (properly) ignore "."
2817 : * and "..".
2818 : */
2819 21 : while (isdigit((unsigned char) de->d_name[i]))
2820 11 : ++i;
2821 5 : if (de->d_name[i] != '\0' || i == 0)
2822 2 : continue;
2823 :
2824 3 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2825 3 : tsdirname, de->d_name);
2826 3 : RemovePgTempRelationFilesInDbspace(dbspace_path);
2827 : }
2828 :
2829 1 : FreeDir(ts_dir);
2830 : }
2831 :
2832 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2833 : static void
2834 3 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2835 : {
2836 : DIR *dbspace_dir;
2837 : struct dirent *de;
2838 : char rm_path[MAXPGPATH * 2];
2839 :
2840 3 : dbspace_dir = AllocateDir(dbspacedirname);
2841 3 : if (dbspace_dir == NULL)
2842 : {
2843 : /* we just saw this directory, so it really ought to be there */
2844 0 : elog(LOG,
2845 : "could not open dbspace directory \"%s\": %m",
2846 : dbspacedirname);
2847 3 : return;
2848 : }
2849 :
2850 891 : while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2851 : {
2852 885 : if (!looks_like_temp_rel_name(de->d_name))
2853 885 : continue;
2854 :
2855 0 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
2856 0 : dbspacedirname, de->d_name);
2857 :
2858 0 : unlink(rm_path); /* note we ignore any error */
2859 : }
2860 :
2861 3 : FreeDir(dbspace_dir);
2862 : }
2863 :
/*
 * Decide whether a file name has the shape of a temporary relation file:
 * t<digits>_<digits>, optionally followed by _<forkname>, optionally
 * followed by .<digits> (a segment number).
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *start;

	/* A leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* Next: one or more digits, terminated by an underscore. */
	start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == start || *cur != '_')
		return false;
	cur++;

	/* Next: another run of one or more digits. */
	start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == start)
		return false;

	/* Optional "_forkname"; forkname_chars reports how long the name is. */
	if (*cur == '_')
	{
		int			forkchar = forkname_chars(cur + 1, NULL);

		if (forkchar <= 0)
			return false;
		cur += forkchar + 1;
	}

	/* Optional ".segment": a dot followed by one or more digits. */
	if (*cur == '.')
	{
		start = cur + 1;
		cur = start;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == start)
			return false;
	}

	/* Nothing may follow. */
	if (*cur != '\0')
		return false;
	return true;
}
2912 :
2913 :
2914 : /*
2915 : * Issue fsync recursively on PGDATA and all its contents.
2916 : *
2917 : * We fsync regular files and directories wherever they are, but we
2918 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
2919 : * Other symlinks are presumed to point at files we're not responsible
2920 : * for fsyncing, and might not have privileges to write at all.
2921 : *
2922 : * Errors are logged but not considered fatal; that's because this is used
2923 : * only during database startup, to deal with the possibility that there are
2924 : * issued-but-unsynced writes pending against the data directory. We want to
2925 : * ensure that such writes reach disk before anything that's done in the new
2926 : * run. However, aborting on error would result in failure to start for
2927 : * harmless cases such as read-only files in the data directory, and that's
2928 : * not good either.
2929 : *
2930 : * Note we assume we're chdir'd into PGDATA to begin with.
2931 : */
2932 : void
2933 0 : SyncDataDirectory(void)
2934 : {
2935 : bool xlog_is_symlink;
2936 :
2937 : /* We can skip this whole thing if fsync is disabled. */
2938 0 : if (!enableFsync)
2939 0 : return;
2940 :
2941 : /*
2942 : * If pg_wal is a symlink, we'll need to recurse into it separately,
2943 : * because the first walkdir below will ignore it.
2944 : */
2945 0 : xlog_is_symlink = false;
2946 :
2947 : #ifndef WIN32
2948 : {
2949 : struct stat st;
2950 :
2951 0 : if (lstat("pg_wal", &st) < 0)
2952 0 : ereport(LOG,
2953 : (errcode_for_file_access(),
2954 : errmsg("could not stat file \"%s\": %m",
2955 : "pg_wal")));
2956 0 : else if (S_ISLNK(st.st_mode))
2957 0 : xlog_is_symlink = true;
2958 : }
2959 : #else
2960 : if (pgwin32_is_junction("pg_wal"))
2961 : xlog_is_symlink = true;
2962 : #endif
2963 :
2964 : /*
2965 : * If possible, hint to the kernel that we're soon going to fsync the data
2966 : * directory and its contents. Errors in this step are even less
2967 : * interesting than normal, so log them only at DEBUG1.
2968 : */
2969 : #ifdef PG_FLUSH_DATA_WORKS
2970 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
2971 0 : if (xlog_is_symlink)
2972 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
2973 0 : walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
2974 : #endif
2975 :
2976 : /*
2977 : * Now we do the fsync()s in the same order.
2978 : *
2979 : * The main call ignores symlinks, so in addition to specially processing
2980 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
2981 : * process_symlinks = true. Note that if there are any plain directories
2982 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
2983 : * so we don't worry about optimizing it.
2984 : */
2985 0 : walkdir(".", datadir_fsync_fname, false, LOG);
2986 0 : if (xlog_is_symlink)
2987 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
2988 0 : walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
2989 : }
2990 :
2991 : /*
2992 : * walkdir: recursively walk a directory, applying the action to each
2993 : * regular file and directory (including the named directory itself).
2994 : *
2995 : * If process_symlinks is true, the action and recursion are also applied
2996 : * to regular files and directories that are pointed to by symlinks in the
2997 : * given directory; otherwise symlinks are ignored. Symlinks are always
2998 : * ignored in subdirectories, ie we intentionally don't pass down the
2999 : * process_symlinks flag to recursive calls.
3000 : *
3001 : * Errors are reported at level elevel, which might be ERROR or less.
3002 : *
3003 : * See also walkdir in initdb.c, which is a frontend version of this logic.
3004 : */
3005 : static void
3006 0 : walkdir(const char *path,
3007 : void (*action) (const char *fname, bool isdir, int elevel),
3008 : bool process_symlinks,
3009 : int elevel)
3010 : {
3011 : DIR *dir;
3012 : struct dirent *de;
3013 :
3014 0 : dir = AllocateDir(path);
3015 0 : if (dir == NULL)
3016 : {
3017 0 : ereport(elevel,
3018 : (errcode_for_file_access(),
3019 : errmsg("could not open directory \"%s\": %m", path)));
3020 0 : return;
3021 : }
3022 :
3023 0 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3024 : {
3025 : char subpath[MAXPGPATH * 2];
3026 : struct stat fst;
3027 : int sret;
3028 :
3029 0 : CHECK_FOR_INTERRUPTS();
3030 :
3031 0 : if (strcmp(de->d_name, ".") == 0 ||
3032 0 : strcmp(de->d_name, "..") == 0)
3033 0 : continue;
3034 :
3035 0 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3036 :
3037 0 : if (process_symlinks)
3038 0 : sret = stat(subpath, &fst);
3039 : else
3040 0 : sret = lstat(subpath, &fst);
3041 :
3042 0 : if (sret < 0)
3043 : {
3044 0 : ereport(elevel,
3045 : (errcode_for_file_access(),
3046 : errmsg("could not stat file \"%s\": %m", subpath)));
3047 0 : continue;
3048 : }
3049 :
3050 0 : if (S_ISREG(fst.st_mode))
3051 0 : (*action) (subpath, false, elevel);
3052 0 : else if (S_ISDIR(fst.st_mode))
3053 0 : walkdir(subpath, action, false, elevel);
3054 : }
3055 :
3056 0 : FreeDir(dir); /* we ignore any error here */
3057 :
3058 : /*
3059 : * It's important to fsync the destination directory itself as individual
3060 : * file fsyncs don't guarantee that the directory entry for the file is
3061 : * synced.
3062 : */
3063 0 : (*action) (path, true, elevel);
3064 : }
3065 :
3066 :
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped.  A file that cannot be opened because of EACCES
 * is skipped silently; any other open failure is reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);
	if (fd >= 0)
	{
		/*
		 * pg_flush_data() ignores errors, which is ok because this is only a
		 * hint.
		 */
		pg_flush_data(fd, 0, 0);

		(void) CloseTransientFile(fd);
		return;
	}

	/* Unreadable files are skipped quietly; everything else gets logged */
	if (errno != EACCES)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3106 :
/*
 * walkdir action that fsyncs one file or directory through
 * fsync_fname_ext().  The result of that call is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so ask
	 * fsync_fname_ext() to skip them.
	 */
	const bool	ignore_perm = true;

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3116 :
3117 : /*
3118 : * fsync_fname_ext -- Try to fsync a file or directory
3119 : *
3120 : * If ignore_perm is true, ignore errors upon trying to open unreadable
3121 : * files. Logs other errors at a caller-specified level.
3122 : *
3123 : * Returns 0 if the operation succeeded, -1 otherwise.
3124 : */
3125 : static int
3126 99 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3127 : {
3128 : int fd;
3129 : int flags;
3130 : int returncode;
3131 :
3132 : /*
3133 : * Some OSs require directories to be opened read-only whereas other
3134 : * systems don't allow us to fsync files opened read-only; so we need both
3135 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3136 : * not writable by our userid, but we assume that's OK.
3137 : */
3138 99 : flags = PG_BINARY;
3139 99 : if (!isdir)
3140 44 : flags |= O_RDWR;
3141 : else
3142 55 : flags |= O_RDONLY;
3143 :
3144 99 : fd = OpenTransientFile((char *) fname, flags, 0);
3145 :
3146 : /*
3147 : * Some OSs don't allow us to open directories at all (Windows returns
3148 : * EACCES), just ignore the error in that case. If desired also silently
3149 : * ignoring errors about unreadable files. Log others.
3150 : */
3151 99 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3152 0 : return 0;
3153 99 : else if (fd < 0 && ignore_perm && errno == EACCES)
3154 0 : return 0;
3155 99 : else if (fd < 0)
3156 : {
3157 0 : ereport(elevel,
3158 : (errcode_for_file_access(),
3159 : errmsg("could not open file \"%s\": %m", fname)));
3160 0 : return -1;
3161 : }
3162 :
3163 99 : returncode = pg_fsync(fd);
3164 :
3165 : /*
3166 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3167 : * those errors. Anything else needs to be logged.
3168 : */
3169 99 : if (returncode != 0 && !(isdir && errno == EBADF))
3170 : {
3171 : int save_errno;
3172 :
3173 : /* close file upon error, might not be in transaction context */
3174 0 : save_errno = errno;
3175 0 : (void) CloseTransientFile(fd);
3176 0 : errno = save_errno;
3177 :
3178 0 : ereport(elevel,
3179 : (errcode_for_file_access(),
3180 : errmsg("could not fsync file \"%s\": %m", fname)));
3181 0 : return -1;
3182 : }
3183 :
3184 99 : (void) CloseTransientFile(fd);
3185 :
3186 99 : return 0;
3187 : }
3188 :
3189 : /*
3190 : * fsync_parent_path -- fsync the parent path of a file or directory
3191 : *
3192 : * This is aimed at making file operations persistent on disk in case of
3193 : * an OS crash or power failure.
3194 : */
3195 : static int
3196 22 : fsync_parent_path(const char *fname, int elevel)
3197 : {
3198 : char parentpath[MAXPGPATH];
3199 :
3200 22 : strlcpy(parentpath, fname, MAXPGPATH);
3201 22 : get_parent_directory(parentpath);
3202 :
3203 : /*
3204 : * get_parent_directory() returns an empty string if the input argument is
3205 : * just a file name (see comments in path.c), so handle that as being the
3206 : * current directory.
3207 : */
3208 22 : if (strlen(parentpath) == 0)
3209 0 : strlcpy(parentpath, ".", MAXPGPATH);
3210 :
3211 22 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3212 0 : return -1;
3213 :
3214 22 : return 0;
3215 : }
|