LCOV - code coverage report
Current view: top level - src/backend/storage/file - fd.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 508 809 62.8 %
Date: 2017-09-29 15:12:54 Functions: 58 68 85.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * fd.c
       4             :  *    Virtual file descriptor code.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/storage/file/fd.c
      11             :  *
      12             :  * NOTES:
      13             :  *
      14             :  * This code manages a cache of 'virtual' file descriptors (VFDs).
      15             :  * The server opens many file descriptors for a variety of reasons,
      16             :  * including base tables, scratch files (e.g., sort and hash spool
      17             :  * files), and random calls to C library routines like system(3); it
      18             :  * is quite easy to exceed system limits on the number of open files a
      19             :  * single process can have.  (This is around 256 on many modern
      20             :  * operating systems, but can be as low as 32 on others.)
      21             :  *
      22             :  * VFDs are managed as an LRU pool, with actual OS file descriptors
      23             :  * being opened and closed as needed.  Obviously, if a file is
      24             :  * opened using these interfaces, all subsequent operations must also
      25             :  * be through these interfaces (the File type is not a real file
      26             :  * descriptor).
      27             :  *
      28             :  * For this scheme to work, most (if not all) routines throughout the
      29             :  * server should use these interfaces instead of calling the C library
      30             :  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
      31             :  * may find ourselves short of real file descriptors anyway.
      32             :  *
      33             :  * INTERFACE ROUTINES
      34             :  *
      35             :  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
      36             :  * A File opened with OpenTemporaryFile is automatically deleted when the
      37             :  * File is closed, either explicitly or implicitly at end of transaction or
      38             :  * process exit. PathNameOpenFile is intended for files that are held open
      39             :  * for a long time, like relation files. It is the caller's responsibility
      40             :  * to close them; there is no automatic mechanism in fd.c for that.
      41             :  *
      42             :  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
      43             :  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
      44             :  * They behave like the corresponding native functions, except that the handle
      45             :  * is registered with the current subtransaction, and will be automatically
      46             :  * closed at abort. These are intended mainly for short operations like
      47             :  * reading a configuration file; there is a limit on the number of files that
      48             :  * can be opened using these functions at any one time.
      49             :  *
      50             :  * Finally, BasicOpenFile is just a thin wrapper around open() that can
      51             :  * release file descriptors in use by the virtual file descriptors if
      52             :  * necessary. There is no automatic cleanup of file descriptors returned by
      53             :  * BasicOpenFile; it is solely the caller's responsibility to close the file
      54             :  * descriptor by calling close(2).
      55             :  *
      56             :  *-------------------------------------------------------------------------
      57             :  */
      58             : 
      59             : #include "postgres.h"
      60             : 
      61             : #include <sys/file.h>
      62             : #include <sys/param.h>
      63             : #include <sys/stat.h>
      64             : #ifndef WIN32
      65             : #include <sys/mman.h>
      66             : #endif
      67             : #include <limits.h>
      68             : #include <unistd.h>
      69             : #include <fcntl.h>
      70             : #ifdef HAVE_SYS_RESOURCE_H
      71             : #include <sys/resource.h>     /* for getrlimit */
      72             : #endif
      73             : 
      74             : #include "miscadmin.h"
      75             : #include "access/xact.h"
      76             : #include "access/xlog.h"
      77             : #include "catalog/catalog.h"
      78             : #include "catalog/pg_tablespace.h"
      79             : #include "pgstat.h"
      80             : #include "portability/mem.h"
      81             : #include "storage/fd.h"
      82             : #include "storage/ipc.h"
      83             : #include "utils/guc.h"
      84             : #include "utils/resowner_private.h"
      85             : 
      86             : 
      87             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
      88             : #if defined(HAVE_SYNC_FILE_RANGE)
      89             : #define PG_FLUSH_DATA_WORKS 1
      90             : #elif !defined(WIN32) && defined(MS_ASYNC)
      91             : #define PG_FLUSH_DATA_WORKS 1
      92             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
      93             : #define PG_FLUSH_DATA_WORKS 1
      94             : #endif
      95             : 
      96             : /*
      97             :  * We must leave some file descriptors free for system(), the dynamic loader,
      98             :  * and other code that tries to open files without consulting fd.c.  This
      99             :  * is the number left free.  (While we can be pretty sure we won't get
     100             :  * EMFILE, there's never any guarantee that we won't get ENFILE due to
     101             :  * other processes chewing up FDs.  So it's a bad idea to try to open files
     102             :  * without consulting fd.c.  Nonetheless we cannot control all code.)
     103             :  *
     104             :  * Because this is just a fixed setting, we are effectively assuming that
     105             :  * no such code will leave FDs open over the long term; otherwise the slop
     106             :  * is likely to be insufficient.  Note in particular that we expect that
     107             :  * loading a shared library does not result in any permanent increase in
     108             :  * the number of open files.  (This appears to be true on most if not
     109             :  * all platforms as of Feb 2004.)
     110             :  */
     111             : #define NUM_RESERVED_FDS        10
     112             : 
     113             : /*
     114             :  * If we have fewer than this many usable FDs after allowing for the reserved
     115             :  * ones, choke.
     116             :  */
     117             : #define FD_MINFREE              10
     118             : 
     119             : 
     120             : /*
     121             :  * A number of platforms allow individual processes to open many more files
     122             :  * than they can really support when *many* processes do the same thing.
     123             :  * This GUC parameter lets the DBA limit max_safe_fds to something less than
     124             :  * what the postmaster's initial probe suggests will work.
     125             :  */
     126             : int         max_files_per_process = 1000;
     127             : 
     128             : /*
     129             :  * Maximum number of file descriptors to open for either VFD entries or
     130             :  * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
     131             :  * to a conservative value, and remains that way indefinitely in bootstrap or
     132             :  * standalone-backend cases.  In normal postmaster operation, the postmaster
     133             :  * calls set_max_safe_fds() late in initialization to update the value, and
     134             :  * that value is then inherited by forked subprocesses.
     135             :  *
     136             :  * Note: the value of max_files_per_process is taken into account while
     137             :  * setting this variable, and so need not be tested separately.
     138             :  */
     139             : int         max_safe_fds = 32;  /* default if not changed */
     140             : 
     141             : 
     142             : /* Debugging.... */
     143             : 
     144             : #ifdef FDDEBUG
     145             : #define DO_DB(A) \
     146             :     do { \
     147             :         int         _do_db_save_errno = errno; \
     148             :         A; \
     149             :         errno = _do_db_save_errno; \
     150             :     } while (0)
     151             : #else
     152             : #define DO_DB(A) \
     153             :     ((void) 0)
     154             : #endif
     155             : 
     156             : #define VFD_CLOSED (-1)
     157             : 
     158             : #define FileIsValid(file) \
     159             :     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
     160             : 
     161             : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
     162             : 
     163             : /*
     164             :  * Note: a VFD's seekPos is normally always valid, but if for some reason
     165             :  * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
     166             :  * along without knowing the seek position in many cases, but in some places
     167             :  * we have to fail if we don't have it.
     168             :  */
     169             : #define FileUnknownPos ((off_t) -1)
     170             : #define FilePosIsUnknown(pos) ((pos) < 0)
     171             : 
     172             : /* these are the assigned bits in fdstate below: */
     173             : #define FD_TEMPORARY        (1 << 0)  /* T = delete when closed */
     174             : #define FD_XACT_TEMPORARY   (1 << 1)  /* T = delete at eoXact */
     175             : 
     176             : typedef struct vfd              /* one entry of the VfdCache array; a File value is its index */
     177             : {
     178             :     int         fd;             /* current FD, or VFD_CLOSED if none */
     179             :     unsigned short fdstate;     /* bitflags for VFD's state: FD_TEMPORARY, FD_XACT_TEMPORARY */
     180             :     ResourceOwner resowner;     /* owner, for automatic cleanup */
     181             :     File        nextFree;       /* link to next free VFD, if in freelist */
     182             :     File        lruMoreRecently;    /* doubly linked recency-of-use list */
     183             :     File        lruLessRecently;    /* opposite-direction link of the same list */
     184             :     off_t       seekPos;        /* current logical file position, or FileUnknownPos (-1) */
     185             :     off_t       fileSize;       /* current size of file (0 if not temporary) */
     186             :     char       *fileName;       /* name of file, or NULL for unused VFD */
     187             :     /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
     188             :     int         fileFlags;      /* open(2) flags for (re)opening the file */
     189             :     int         fileMode;       /* mode to pass to open(2) */
     190             : } Vfd;
     191             : 
     192             : /*
     193             :  * Virtual File Descriptor array pointer and size.  This grows as
     194             :  * needed.  'File' values are indexes into this array.
     195             :  * Note that VfdCache[0] is not a usable VFD, just a list header.
     196             :  */
     197             : static Vfd *VfdCache;
     198             : static Size SizeVfdCache = 0;
     199             : 
     200             : /*
     201             :  * Number of file descriptors known to be in use by VFD entries.
     202             :  */
     203             : static int  nfile = 0;
     204             : 
     205             : /*
     206             :  * Flag to tell whether it's worth scanning VfdCache looking for temp files
     207             :  * to close
     208             :  */
     209             : static bool have_xact_temporary_files = false;
     210             : 
     211             : /*
     212             :  * Tracks the total size of all temporary files.  Note: when temp_file_limit
     213             :  * is being enforced, this cannot overflow since the limit cannot be more
     214             :  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
     215             :  * overflow, but we don't care.
     216             :  */
     217             : static uint64 temporary_files_size = 0;
     218             : 
     219             : /*
     220             :  * List of OS handles opened with AllocateFile, AllocateDir and
     221             :  * OpenTransientFile.
     222             :  */
     223             : typedef enum                    /* discriminator stored in AllocateDesc.kind */
     224             : {
     225             :     AllocateDescFile,           /* presumably a stream from AllocateFile -- confirm */
     226             :     AllocateDescPipe,           /* presumably a stream from OpenPipeStream -- confirm */
     227             :     AllocateDescDir,            /* presumably a directory from AllocateDir -- confirm */
     228             :     AllocateDescRawFD           /* presumably a descriptor from OpenTransientFile -- confirm */
     229             : } AllocateDescKind;
     230             : 
     231             : typedef struct                  /* one element of the allocatedDescs array */
     232             : {
     233             :     AllocateDescKind kind;      /* which kind of handle this entry holds */
     234             :     SubTransactionId create_subid;  /* presumably the subtransaction that opened it -- confirm */
     235             :     union
     236             :     {
     237             :         FILE       *file;       /* stdio stream */
     238             :         DIR        *dir;        /* directory stream */
     239             :         int         fd;         /* raw file descriptor */
     240             :     }           desc;           /* the handle itself; member in use depends on kind */
     241             : } AllocateDesc;
     242             : 
     243             : static int  numAllocatedDescs = 0;
     244             : static int  maxAllocatedDescs = 0;
     245             : static AllocateDesc *allocatedDescs = NULL;
     246             : 
     247             : /*
     248             :  * Number of temporary files opened during the current session;
     249             :  * this is used in generation of tempfile names.
     250             :  */
     251             : static long tempFileCounter = 0;
     252             : 
     253             : /*
     254             :  * Array of OIDs of temp tablespaces.  When numTempTableSpaces is -1,
     255             :  * this has not been set in the current transaction.
     256             :  */
     257             : static Oid *tempTableSpaces = NULL;
     258             : static int  numTempTableSpaces = -1;
     259             : static int  nextTempTableSpace = 0;
     260             : 
     261             : 
     262             : /*--------------------
     263             :  *
     264             :  * Private Routines
     265             :  *
     266             :  * Delete          - delete a file from the Lru ring
     267             :  * LruDelete       - remove a file from the Lru ring and close its FD
     268             :  * Insert          - put a file at the front of the Lru ring
     269             :  * LruInsert       - put a file at the front of the Lru ring and open it
     270             :  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
     271             :  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
     272             :  * AllocateVfd     - grab a free (or new) file record (from VfdCache)
     273             :  * FreeVfd         - free a file record
     274             :  *
     275             :  * The Least Recently Used ring is a doubly linked list that begins and
     276             :  * ends on element zero.  Element zero is special -- it doesn't represent
     277             :  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
     278             :  * anchor that shows us the beginning/end of the ring.
     279             :  * Only VFD elements that are currently really open (have an FD assigned) are
     280             :  * in the Lru ring.  Elements that are "virtually" open can be recognized
     281             :  * by having a non-null fileName field.
     282             :  *
     283             :  * example:
     284             :  *
     285             :  *     /--less----\                /---------\
     286             :  *     v           \              v           \
     287             :  *   #0 --more---> LeastRecentlyUsed --more-\ \
     288             :  *    ^\                                    | |
     289             :  *     \\less--> MostRecentlyUsedFile    <---/ |
     290             :  *      \more---/                    \--less--/
     291             :  *
     292             :  *--------------------
     293             :  */
     294             : static void Delete(File file);
     295             : static void LruDelete(File file);
     296             : static void Insert(File file);
     297             : static int  LruInsert(File file);
     298             : static bool ReleaseLruFile(void);
     299             : static void ReleaseLruFiles(void);
     300             : static File AllocateVfd(void);
     301             : static void FreeVfd(File file);
     302             : 
     303             : static int  FileAccess(File file);
     304             : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
     305             : static bool reserveAllocatedDesc(void);
     306             : static int  FreeDesc(AllocateDesc *desc);
     307             : static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
     308             : 
     309             : static void AtProcExit_Files(int code, Datum arg);
     310             : static void CleanupTempFiles(bool isProcExit);
     311             : static void RemovePgTempFilesInDir(const char *tmpdirname);
     312             : static void RemovePgTempRelationFiles(const char *tsdirname);
     313             : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
     314             : static bool looks_like_temp_rel_name(const char *name);
     315             : 
     316             : static void walkdir(const char *path,
     317             :         void (*action) (const char *fname, bool isdir, int elevel),
     318             :         bool process_symlinks,
     319             :         int elevel);
     320             : #ifdef PG_FLUSH_DATA_WORKS
     321             : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
     322             : #endif
     323             : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
     324             : 
     325             : static int  fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
     326             : static int  fsync_parent_path(const char *fname, int elevel);
     327             : 
     328             : 
     329             : /*
     330             :  * pg_fsync --- do fsync with or without writethrough
                     :  *
                     :  * fd is the open file descriptor to flush.
                     :  *
                     :  * When HAVE_FSYNC_WRITETHROUGH is defined and FSYNC_WRITETHROUGH_IS_FSYNC
                     :  * is not, the sync_method setting is consulted at run time and
                     :  * SYNC_METHOD_FSYNC_WRITETHROUGH routes the call to
                     :  * pg_fsync_writethrough().  In every other case the call goes to
                     :  * pg_fsync_no_writethrough().
                     :  *
                     :  * Returns whatever the selected routine returns.
     331             :  */
     332             : int
     333        1440 : pg_fsync(int fd)
     334             : {
     335             :     /* #if is to skip the sync_method test if there's no need for it */
     336             : #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
     337             :     if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
     338             :         return pg_fsync_writethrough(fd);
     339             :     else
     340             : #endif
     341        1440 :         return pg_fsync_no_writethrough(fd);    /* also the "else" arm of the #if'd test above */
     342             : }
     343             : 
     344             : 
     345             : /*
     346             :  * pg_fsync_no_writethrough --- same as fsync except does nothing if
     347             :  *  enableFsync is off
                     :  *
                     :  * fd is the open file descriptor to flush.
                     :  *
                     :  * Returns the result of fsync(fd) when enableFsync is set; otherwise
                     :  * returns 0 without touching the descriptor.
     348             :  */
     349             : int
     350        1440 : pg_fsync_no_writethrough(int fd)
     351             : {
     352        1440 :     if (enableFsync)
     353           0 :         return fsync(fd);
     354             :     else
     355        1440 :         return 0;               /* syncing disabled: report success */
     356             : }
     357             : 
     358             : /*
     359             :  * pg_fsync_writethrough
                     :  *
                     :  * Flush fd using the platform's write-through primitive, unless
                     :  * enableFsync is off, in which case nothing is done and 0 is returned.
                     :  *
                     :  * With enableFsync set, the implementation is chosen at compile time:
                     :  *  - WIN32: returns the result of _commit(fd);
                     :  *  - F_FULLFSYNC defined: issues fcntl(fd, F_FULLFSYNC, 0) and returns
                     :  *    -1 if that call returned -1, else 0;
                     :  *  - otherwise: sets errno to ENOSYS and returns -1.
     360             :  */
     361             : int
     362           0 : pg_fsync_writethrough(int fd)
     363             : {
     364           0 :     if (enableFsync)
     365             :     {
     366             : #ifdef WIN32
     367             :         return _commit(fd);
     368             : #elif defined(F_FULLFSYNC)
     369             :         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
     370             : #else
     371           0 :         errno = ENOSYS;         /* no write-through primitive compiled in */
     372           0 :         return -1;
     373             : #endif
     374             :     }
     375             :     else
     376           0 :         return 0;               /* syncing disabled: report success */
     377             : }
     378             : 
     379             : /*
     380             :  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
     381             :  *
     382             :  * Not all platforms have fdatasync; treat as fsync if not available.
                     :  *
                     :  * fd is the open file descriptor to flush.
                     :  *
                     :  * Returns the result of fdatasync(fd) (or of fsync(fd) when
                     :  * HAVE_FDATASYNC is not defined) if enableFsync is set; otherwise
                     :  * returns 0 without touching the descriptor.
     383             :  */
     384             : int
     385        9615 : pg_fdatasync(int fd)
     386             : {
     387        9615 :     if (enableFsync)
     388             :     {
     389             : #ifdef HAVE_FDATASYNC
     390           0 :         return fdatasync(fd);
     391             : #else
     392             :         return fsync(fd);
     393             : #endif
     394             :     }
     395             :     else
     396        9615 :         return 0;               /* syncing disabled: report success */
     397             : }
     398             : 
     399             : /*
     400             :  * pg_flush_data --- advise OS that the described dirty data should be flushed
     401             :  *
                     :  * fd is the open file descriptor; offset and nbytes describe the byte
                     :  * range to flush.
                     :  *
     402             :  * offset of 0 with nbytes 0 means that the entire file should be flushed;
     403             :  * in this case, this function may have side-effects on the file's
     404             :  * seek position!
                     :  *
                     :  * Nothing is done when enableFsync is off.  Otherwise the compiled-in
                     :  * implementations are tried in this order: sync_file_range(),
                     :  * mmap() + msync(MS_ASYNC), posix_fadvise(POSIX_FADV_DONTNEED).  The
                     :  * sync_file_range() and posix_fadvise() sections always return; the
                     :  * msync() section returns unless mmap() fails (or is skipped because
                     :  * nbytes exceeds SSIZE_MAX), in which case control falls through to
                     :  * the next section.  If no section is compiled in, the function
                     :  * returns without doing anything.
                     :  *
                     :  * There is no return value.  A failed flush is only reported at WARNING
                     :  * level; the one FATAL case is a failing munmap().
     405             :  */
     406             : void
     407        2317 : pg_flush_data(int fd, off_t offset, off_t nbytes)
     408             : {
     409             :     /*
     410             :      * Right now file flushing is primarily used to make later
     411             :      * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     412             :      * if fsyncs are disabled - that's a decision we might want to make
     413             :      * configurable at some point.
     414             :      */
     415        2317 :     if (!enableFsync)
     416        2317 :         return;
     417             : 
     418             :     /*
     419             :      * We compile all alternatives that are supported on the current platform,
     420             :      * to find portability problems more easily.
     421             :      */
     422             : #if defined(HAVE_SYNC_FILE_RANGE)
     423             :     {
     424             :         int         rc;
     425             : 
     426             :         /*
     427             :          * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
     428             :          * tells the OS that writeback for the specified blocks should be
     429             :          * started, but that we don't want to wait for completion.  Note that
     430             :          * this call might block if too much dirty data exists in the range.
     431             :          * This is the preferable method on OSs supporting it, as it works
     432             :          * reliably when available (contrast to msync()) and doesn't flush out
     433             :          * clean data (like FADV_DONTNEED).
     434             :          */
     435           0 :         rc = sync_file_range(fd, offset, nbytes,
     436             :                              SYNC_FILE_RANGE_WRITE);
     437             : 
     438             :         /* don't error out, this is just a performance optimization */
     439           0 :         if (rc != 0)
     440             :         {
     441           0 :             ereport(WARNING,
     442             :                     (errcode_for_file_access(),
     443             :                      errmsg("could not flush dirty data: %m")));
     444             :         }
     445             : 
     446           0 :         return;                 /* unconditional: later sections are not reached at run time */
     447             :     }
     448             : #endif
     449             : #if !defined(WIN32) && defined(MS_ASYNC)
     450             :     {
     451             :         void       *p;
     452             :         static int  pagesize = 0;   /* cached across calls; 0 means not fetched yet */
     453             : 
     454             :         /*
     455             :          * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
     456             :          * writeback. On linux it only does so if MS_SYNC is specified, but
     457             :          * then it does the writeback synchronously. Luckily all common linux
     458             :          * systems have sync_file_range().  This is preferable over
     459             :          * FADV_DONTNEED because it doesn't flush out clean data.
     460             :          *
     461             :          * We map the file (mmap()), tell the kernel to sync back the contents
     462             :          * (msync()), and then remove the mapping again (munmap()).
     463             :          */
     464             : 
     465             :         /* mmap() needs actual length if we want to map whole file; this lseek() is what can move the seek position */
     466             :         if (offset == 0 && nbytes == 0)
     467             :         {
     468             :             nbytes = lseek(fd, 0, SEEK_END);
     469             :             if (nbytes < 0)
     470             :             {
     471             :                 ereport(WARNING,
     472             :                         (errcode_for_file_access(),
     473             :                          errmsg("could not determine dirty data size: %m")));
     474             :                 return;
     475             :             }
     476             :         }
     477             : 
     478             :         /*
     479             :          * Some platforms reject partial-page mmap() attempts.  To deal with
     480             :          * that, just truncate the request to a page boundary.  If any extra
     481             :          * bytes don't get flushed, well, it's only a hint anyway.
     482             :          */
     483             : 
     484             :         /* fetch pagesize only once */
     485             :         if (pagesize == 0)
     486             :             pagesize = sysconf(_SC_PAGESIZE);
     487             : 
     488             :         /* align length to pagesize, dropping any fractional page */
     489             :         if (pagesize > 0)
     490             :             nbytes = (nbytes / pagesize) * pagesize;
     491             : 
     492             :         /* fractional-page request is a no-op */
     493             :         if (nbytes <= 0)
     494             :             return;
     495             : 
     496             :         /*
     497             :          * mmap could well fail, particularly on 32-bit platforms where there
     498             :          * may simply not be enough address space.  If so, silently fall
     499             :          * through to the next implementation.
                     :          *
                     :          * A length above SSIZE_MAX is not attempted at all and is treated
                     :          * the same way as a failed mmap().
     500             :          */
     501             :         if (nbytes <= (off_t) SSIZE_MAX)
     502             :             p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
     503             :         else
     504             :             p = MAP_FAILED;
     505             : 
     506             :         if (p != MAP_FAILED)
     507             :         {
     508             :             int         rc;
     509             : 
     510             :             rc = msync(p, (size_t) nbytes, MS_ASYNC);
     511             :             if (rc != 0)
     512             :             {
     513             :                 ereport(WARNING,
     514             :                         (errcode_for_file_access(),
     515             :                          errmsg("could not flush dirty data: %m")));
     516             :                 /* NB: need to fall through to munmap()! */
     517             :             }
     518             : 
     519             :             rc = munmap(p, (size_t) nbytes);
     520             :             if (rc != 0)
     521             :             {
     522             :                 /* FATAL error because mapping would remain */
     523             :                 ereport(FATAL,
     524             :                         (errcode_for_file_access(),
     525             :                          errmsg("could not munmap() while flushing data: %m")));
     526             :             }
     527             : 
     528             :             return;
     529             :         }
     530             :     }
     531             : #endif
     532             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     533             :     {
     534             :         int         rc;
     535             : 
     536             :         /*
     537             :          * Signal the kernel that the passed in range should not be cached
     538             :          * anymore. This has the desired side effect of writing out dirty
     539             :          * data, and the undesired side effect of likely discarding useful
     540             :          * clean cached blocks.  For the latter reason this is the least
     541             :          * preferable method.
                     :          *
                     :          * NOTE(review): POSIX specifies that posix_fadvise() returns the
                     :          * error number itself rather than setting errno, so the %m in the
                     :          * warning below may not describe rc -- confirm.
     542             :          */
     543             : 
     544             :         rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
     545             : 
     546             :         if (rc != 0)
     547             :         {
     548             :             /* don't error out, this is just a performance optimization */
     549             :             ereport(WARNING,
     550             :                     (errcode_for_file_access(),
     551             :                      errmsg("could not flush dirty data: %m")));
     552             :         }
     553             : 
     554             :         return;
     555             :     }
     556             : #endif
     557             : }
     558             : 
     559             : 
     560             : /*
     561             :  * fsync_fname -- fsync a file or directory, handling errors properly
     562             :  *
     563             :  * Try to fsync a file or directory. When doing the latter, ignore errors that
     564             :  * indicate the OS just doesn't allow/require fsyncing directories.
                     :  *
                     :  * fname is the path to sync; isdir tells whether it names a directory.
                     :  *
                     :  * This is a thin wrapper: it calls fsync_fname_ext() with ignore_perm
                     :  * set to false and an elevel of ERROR, and discards that function's
                     :  * integer result, so nothing is returned to the caller.
     565             :  */
     566             : void
     567          33 : fsync_fname(const char *fname, bool isdir)
     568             : {
     569          33 :     fsync_fname_ext(fname, isdir, false, ERROR);
     570          33 : }
     571             : 
     572             : /*
     573             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     574             :  *
     575             :  * This routine ensures that, after returning, the effect of renaming file
     576             :  * persists in case of a crash. A crash while this routine is running will
     577             :  * leave you with either the pre-existing or the moved file in place of the
     578             :  * new file; no mixed state or truncated files are possible.
     579             :  *
     580             :  * It does so by using fsync on the old filename and the possibly existing
     581             :  * target filename before the rename, and the target file and directory after.
     582             :  *
     583             :  * Note that rename() cannot be used across arbitrary directories, as they
     584             :  * might not be on the same filesystem. Therefore this routine does not
     585             :  * support renaming across directories.
     586             :  *
     587             :  * Log errors with the caller specified severity.
     588             :  *
     589             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     590             :  * valid upon return.
     591             :  */
     592             : int
     593          11 : durable_rename(const char *oldfile, const char *newfile, int elevel)
     594             : {
     595             :     int         fd;
     596             : 
     597             :     /*
     598             :      * First fsync the old and target path (if it exists), to ensure that they
     599             :      * are properly persistent on disk. Syncing the target file is not
     600             :      * strictly necessary, but it makes it easier to reason about crashes;
     601             :      * because it's then guaranteed that either source or target file exists
     602             :      * after a crash.
     603             :      */
     604          11 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     605           0 :         return -1;
     606             : 
     607          11 :     fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
     608          11 :     if (fd < 0)
     609             :     {
     610           1 :         if (errno != ENOENT)
     611             :         {
     612           0 :             ereport(elevel,
     613             :                     (errcode_for_file_access(),
     614             :                      errmsg("could not open file \"%s\": %m", newfile)));
     615           0 :             return -1;
     616             :         }
     617             :     }
     618             :     else
     619             :     {
     620          10 :         if (pg_fsync(fd) != 0)
     621             :         {
     622             :             int         save_errno;
     623             : 
     624             :             /* close file upon error, might not be in transaction context */
     625           0 :             save_errno = errno;
     626           0 :             CloseTransientFile(fd);
     627           0 :             errno = save_errno;
     628             : 
     629           0 :             ereport(elevel,
     630             :                     (errcode_for_file_access(),
     631             :                      errmsg("could not fsync file \"%s\": %m", newfile)));
     632           0 :             return -1;
     633             :         }
     634          10 :         CloseTransientFile(fd);
     635             :     }
     636             : 
     637             :     /* Time to do the real deal... */
     638          11 :     if (rename(oldfile, newfile) < 0)
     639             :     {
     640           0 :         ereport(elevel,
     641             :                 (errcode_for_file_access(),
     642             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     643             :                         oldfile, newfile)));
     644           0 :         return -1;
     645             :     }
     646             : 
     647             :     /*
     648             :      * To guarantee renaming the file is persistent, fsync the file with its
     649             :      * new name, and its containing directory.
     650             :      */
     651          11 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     652           0 :         return -1;
     653             : 
     654          11 :     if (fsync_parent_path(newfile, elevel) != 0)
     655           0 :         return -1;
     656             : 
     657          11 :     return 0;
     658             : }
     659             : 
     660             : /*
     661             :  * durable_unlink -- remove a file in a durable manner
     662             :  *
     663             :  * This routine ensures that, after returning, the effect of removing file
     664             :  * persists in case of a crash. A crash while this routine is running will
     665             :  * leave the system in no mixed state.
     666             :  *
     667             :  * It does so by using fsync on the parent directory of the file after the
     668             :  * actual removal is done.
     669             :  *
     670             :  * Log errors with the severity specified by caller.
     671             :  *
     672             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     673             :  * valid upon return.
     674             :  */
     675             : int
     676           1 : durable_unlink(const char *fname, int elevel)
     677             : {
     678           1 :     if (unlink(fname) < 0)
     679             :     {
     680           1 :         ereport(elevel,
     681             :                 (errcode_for_file_access(),
     682             :                  errmsg("could not remove file \"%s\": %m",
     683             :                         fname)));
     684           1 :         return -1;
     685             :     }
     686             : 
     687             :     /*
     688             :      * To guarantee that the removal of the file is persistent, fsync its
     689             :      * parent directory.
     690             :      */
     691           0 :     if (fsync_parent_path(fname, elevel) != 0)
     692           0 :         return -1;
     693             : 
     694           0 :     return 0;
     695             : }
     696             : 
     697             : /*
     698             :  * durable_link_or_rename -- rename a file in a durable manner.
     699             :  *
     700             :  * Similar to durable_rename(), except that this routine tries (but does not
     701             :  * guarantee) not to overwrite the target file.
     702             :  *
     703             :  * Note that a crash in an unfortunate moment can leave you with two links to
     704             :  * the target file.
     705             :  *
     706             :  * Log errors with the caller specified severity.
     707             :  *
     708             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     709             :  * valid upon return.
     710             :  */
     711             : int
     712          11 : durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
     713             : {
     714             :     /*
     715             :      * Ensure that, if we crash directly after the rename/link, a file with
     716             :      * valid contents is moved into place.
     717             :      */
     718          11 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     719           0 :         return -1;
     720             : 
     721             : #if HAVE_WORKING_LINK
     722          11 :     if (link(oldfile, newfile) < 0)
     723             :     {
     724           0 :         ereport(elevel,
     725             :                 (errcode_for_file_access(),
     726             :                  errmsg("could not link file \"%s\" to \"%s\": %m",
     727             :                         oldfile, newfile)));
     728           0 :         return -1;
     729             :     }
     730          11 :     unlink(oldfile);
     731             : #else
     732             :     /* XXX: Add racy file existence check? */
     733             :     if (rename(oldfile, newfile) < 0)
     734             :     {
     735             :         ereport(elevel,
     736             :                 (errcode_for_file_access(),
     737             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     738             :                         oldfile, newfile)));
     739             :         return -1;
     740             :     }
     741             : #endif
     742             : 
     743             :     /*
     744             :      * Make change persistent in case of an OS crash, both the new entry and
     745             :      * its parent directory need to be flushed.
     746             :      */
     747          11 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     748           0 :         return -1;
     749             : 
     750             :     /* Same for parent directory */
     751          11 :     if (fsync_parent_path(newfile, elevel) != 0)
     752           0 :         return -1;
     753             : 
     754          11 :     return 0;
     755             : }
     756             : 
     757             : /*
     758             :  * InitFileAccess --- initialize this module during backend startup
     759             :  *
     760             :  * This is called during either normal or standalone backend start.
     761             :  * It is *not* called in the postmaster.
     762             :  */
     763             : void
     764         344 : InitFileAccess(void)
     765             : {
     766         344 :     Assert(SizeVfdCache == 0);  /* call me only once */
     767             : 
     768             :     /* initialize cache header entry */
     769         344 :     VfdCache = (Vfd *) malloc(sizeof(Vfd));
     770         344 :     if (VfdCache == NULL)
     771           0 :         ereport(FATAL,
     772             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     773             :                  errmsg("out of memory")));
     774             : 
     775         344 :     MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
     776         344 :     VfdCache->fd = VFD_CLOSED;
     777             : 
     778         344 :     SizeVfdCache = 1;
     779             : 
     780             :     /* register proc-exit hook to ensure temp files are dropped at exit */
     781         344 :     on_proc_exit(AtProcExit_Files, 0);
     782         344 : }
     783             : 
     784             : /*
     785             :  * count_usable_fds --- count how many FDs the system will let us open,
     786             :  *      and estimate how many are already open.
     787             :  *
     788             :  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
     789             :  * value of max_to_probe might result in an underestimate of already_open;
     790             :  * we must fill in any "gaps" in the set of used FDs before the calculation
     791             :  * of already_open will give the right answer.  In practice, max_to_probe
     792             :  * of a couple of dozen should be enough to ensure good results.
     793             :  *
     794             :  * We assume stdin (FD 0) is available for dup'ing
     795             :  */
     796             : static void
     797           1 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
     798             : {
     799             :     int        *fd;
     800             :     int         size;
     801           1 :     int         used = 0;
     802           1 :     int         highestfd = 0;
     803             :     int         j;
     804             : 
     805             : #ifdef HAVE_GETRLIMIT
     806             :     struct rlimit rlim;
     807             :     int         getrlimit_status;
     808             : #endif
     809             : 
     810           1 :     size = 1024;
     811           1 :     fd = (int *) palloc(size * sizeof(int));
     812             : 
     813             : #ifdef HAVE_GETRLIMIT
     814             : #ifdef RLIMIT_NOFILE            /* most platforms use RLIMIT_NOFILE */
     815           1 :     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
     816             : #else                           /* but BSD doesn't ... */
     817             :     getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
     818             : #endif                          /* RLIMIT_NOFILE */
     819           1 :     if (getrlimit_status != 0)
     820           0 :         ereport(WARNING, (errmsg("getrlimit failed: %m")));
     821             : #endif                          /* HAVE_GETRLIMIT */
     822             : 
     823             :     /* dup until failure or probe limit reached */
     824             :     for (;;)
     825             :     {
     826             :         int         thisfd;
     827             : 
     828             : #ifdef HAVE_GETRLIMIT
     829             : 
     830             :         /*
     831             :          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
     832             :          * some platforms
     833             :          */
     834        1000 :         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
     835           0 :             break;
     836             : #endif
     837             : 
     838        1000 :         thisfd = dup(0);
     839        1000 :         if (thisfd < 0)
     840             :         {
     841             :             /* Expect EMFILE or ENFILE, else it's fishy */
     842           0 :             if (errno != EMFILE && errno != ENFILE)
     843           0 :                 elog(WARNING, "dup(0) failed after %d successes: %m", used);
     844           0 :             break;
     845             :         }
     846             : 
     847        1000 :         if (used >= size)
     848             :         {
     849           0 :             size *= 2;
     850           0 :             fd = (int *) repalloc(fd, size * sizeof(int));
     851             :         }
     852        1000 :         fd[used++] = thisfd;
     853             : 
     854        1000 :         if (highestfd < thisfd)
     855        1000 :             highestfd = thisfd;
     856             : 
     857        1000 :         if (used >= max_to_probe)
     858           1 :             break;
     859         999 :     }
     860             : 
     861             :     /* release the files we opened */
     862        1001 :     for (j = 0; j < used; j++)
     863        1000 :         close(fd[j]);
     864             : 
     865           1 :     pfree(fd);
     866             : 
     867             :     /*
     868             :      * Return results.  usable_fds is just the number of successful dups. We
     869             :      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
     870             :      * number) and so already_open is highestfd+1 - usable_fds.
     871             :      */
     872           1 :     *usable_fds = used;
     873           1 :     *already_open = highestfd + 1 - used;
     874           1 : }
     875             : 
     876             : /*
     877             :  * set_max_safe_fds
     878             :  *      Determine number of filedescriptors that fd.c is allowed to use
     879             :  */
     880             : void
     881           1 : set_max_safe_fds(void)
     882             : {
     883             :     int         usable_fds;
     884             :     int         already_open;
     885             : 
     886             :     /*----------
     887             :      * We want to set max_safe_fds to
     888             :      *          MIN(usable_fds, max_files_per_process - already_open)
     889             :      * less the slop factor for files that are opened without consulting
     890             :      * fd.c.  This ensures that we won't exceed either max_files_per_process
     891             :      * or the experimentally-determined EMFILE limit.
     892             :      *----------
     893             :      */
     894           1 :     count_usable_fds(max_files_per_process,
     895             :                      &usable_fds, &already_open);
     896             : 
     897           1 :     max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
     898             : 
     899             :     /*
     900             :      * Take off the FDs reserved for system() etc.
     901             :      */
     902           1 :     max_safe_fds -= NUM_RESERVED_FDS;
     903             : 
     904             :     /*
     905             :      * Make sure we still have enough to get by.
     906             :      */
     907           1 :     if (max_safe_fds < FD_MINFREE)
     908           0 :         ereport(FATAL,
     909             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
     910             :                  errmsg("insufficient file descriptors available to start server process"),
     911             :                  errdetail("System allows %d, we need at least %d.",
     912             :                            max_safe_fds + NUM_RESERVED_FDS,
     913             :                            FD_MINFREE + NUM_RESERVED_FDS)));
     914             : 
     915           1 :     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
     916             :          max_safe_fds, usable_fds, already_open);
     917           1 : }
     918             : 
     919             : /*
     920             :  * BasicOpenFile --- same as open(2) except can free other FDs if needed
     921             :  *
     922             :  * This is exported for use by places that really want a plain kernel FD,
     923             :  * but need to be proof against running out of FDs.  Once an FD has been
     924             :  * successfully returned, it is the caller's responsibility to ensure that
     925             :  * it will not be leaked on ereport()!  Most users should *not* call this
     926             :  * routine directly, but instead use the VFD abstraction level, which
     927             :  * provides protection against descriptor leaks as well as management of
     928             :  * files that need to be open for more than a short period of time.
     929             :  *
     930             :  * Ideally this should be the *only* direct call of open() in the backend.
     931             :  * In practice, the postmaster calls open() directly, and there are some
     932             :  * direct open() calls done early in backend startup.  Those are OK since
     933             :  * this module wouldn't have any open files to close at that point anyway.
     934             :  */
     935             : int
     936       37406 : BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
     937             : {
     938             :     int         fd;
     939             : 
     940             : tryAgain:
     941       37406 :     fd = open(fileName, fileFlags, fileMode);
     942             : 
     943       37406 :     if (fd >= 0)
     944       25517 :         return fd;              /* success! */
     945             : 
     946       11889 :     if (errno == EMFILE || errno == ENFILE)
     947             :     {
     948           0 :         int         save_errno = errno;
     949             : 
     950           0 :         ereport(LOG,
     951             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
     952             :                  errmsg("out of file descriptors: %m; release and retry")));
     953           0 :         errno = 0;
     954           0 :         if (ReleaseLruFile())
     955           0 :             goto tryAgain;
     956           0 :         errno = save_errno;
     957             :     }
     958             : 
     959       11889 :     return -1;                  /* failure */
     960             : }
     961             : 
     962             : #if defined(FDDEBUG)
     963             : 
     964             : static void
     965             : _dump_lru(void)
     966             : {
     967             :     int         mru = VfdCache[0].lruLessRecently;
     968             :     Vfd        *vfdP = &VfdCache[mru];
     969             :     char        buf[2048];
     970             : 
     971             :     snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
     972             :     while (mru != 0)
     973             :     {
     974             :         mru = vfdP->lruLessRecently;
     975             :         vfdP = &VfdCache[mru];
     976             :         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
     977             :     }
     978             :     snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
     979             :     elog(LOG, "%s", buf);
     980             : }
     981             : #endif                          /* FDDEBUG */
     982             : 
     983             : static void
     984       19625 : Delete(File file)
     985             : {
     986             :     Vfd        *vfdP;
     987             : 
     988       19625 :     Assert(file != 0);
     989             : 
     990             :     DO_DB(elog(LOG, "Delete %d (%s)",
     991             :                file, VfdCache[file].fileName));
     992             :     DO_DB(_dump_lru());
     993             : 
     994       19625 :     vfdP = &VfdCache[file];
     995             : 
     996       19625 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
     997       19625 :     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
     998             : 
     999             :     DO_DB(_dump_lru());
    1000       19625 : }
    1001             : 
    1002             : static void
    1003        1181 : LruDelete(File file)
    1004             : {
    1005             :     Vfd        *vfdP;
    1006             : 
    1007        1181 :     Assert(file != 0);
    1008             : 
    1009             :     DO_DB(elog(LOG, "LruDelete %d (%s)",
    1010             :                file, VfdCache[file].fileName));
    1011             : 
    1012        1181 :     vfdP = &VfdCache[file];
    1013             : 
    1014             :     /*
    1015             :      * Normally we should know the seek position, but if for some reason we
    1016             :      * have lost track of it, try again to get it.  If we still can't get it,
    1017             :      * we have a problem: we will be unable to restore the file seek position
    1018             :      * when and if the file is re-opened.  But we can't really throw an error
    1019             :      * and refuse to close the file, or activities such as transaction cleanup
    1020             :      * will be broken.
    1021             :      */
    1022        1181 :     if (FilePosIsUnknown(vfdP->seekPos))
    1023             :     {
    1024           0 :         vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
    1025           0 :         if (FilePosIsUnknown(vfdP->seekPos))
    1026           0 :             elog(LOG, "could not seek file \"%s\" before closing: %m",
    1027             :                  vfdP->fileName);
    1028             :     }
    1029             : 
    1030             :     /*
    1031             :      * Close the file.  We aren't expecting this to fail; if it does, better
    1032             :      * to leak the FD than to mess up our internal state.
    1033             :      */
    1034        1181 :     if (close(vfdP->fd))
    1035           0 :         elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
    1036        1181 :     vfdP->fd = VFD_CLOSED;
    1037        1181 :     --nfile;
    1038             : 
    1039             :     /* delete the vfd record from the LRU ring */
    1040        1181 :     Delete(file);
    1041        1181 : }
    1042             : 
    1043             : static void
    1044       26628 : Insert(File file)
    1045             : {
    1046             :     Vfd        *vfdP;
    1047             : 
    1048       26628 :     Assert(file != 0);
    1049             : 
    1050             :     DO_DB(elog(LOG, "Insert %d (%s)",
    1051             :                file, VfdCache[file].fileName));
    1052             :     DO_DB(_dump_lru());
    1053             : 
    1054       26628 :     vfdP = &VfdCache[file];
    1055             : 
    1056       26628 :     vfdP->lruMoreRecently = 0;
    1057       26628 :     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
    1058       26628 :     VfdCache[0].lruLessRecently = file;
    1059       26628 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
    1060             : 
    1061             :     DO_DB(_dump_lru());
    1062       26628 : }
    1063             : 
    1064             : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1065             : static int
    1066         636 : LruInsert(File file)
    1067             : {
    1068             :     Vfd        *vfdP;
    1069             : 
    1070         636 :     Assert(file != 0);
    1071             : 
    1072             :     DO_DB(elog(LOG, "LruInsert %d (%s)",
    1073             :                file, VfdCache[file].fileName));
    1074             : 
    1075         636 :     vfdP = &VfdCache[file];
    1076             : 
    1077         636 :     if (FileIsNotOpen(file))
    1078             :     {
    1079             :         /* Close excess kernel FDs. */
    1080         636 :         ReleaseLruFiles();
    1081             : 
    1082             :         /*
    1083             :          * The open could still fail for lack of file descriptors, eg due to
    1084             :          * overall system file table being full.  So, be prepared to release
    1085             :          * another FD if necessary...
    1086             :          */
    1087         636 :         vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
    1088             :                                  vfdP->fileMode);
    1089         636 :         if (vfdP->fd < 0)
    1090             :         {
    1091             :             DO_DB(elog(LOG, "re-open failed: %m"));
    1092           0 :             return -1;
    1093             :         }
    1094             :         else
    1095             :         {
    1096         636 :             ++nfile;
    1097             :         }
    1098             : 
    1099             :         /*
    1100             :          * Seek to the right position.  We need no special case for seekPos
    1101             :          * equal to FileUnknownPos, as lseek() will certainly reject that
    1102             :          * (thus completing the logic noted in LruDelete() that we will fail
    1103             :          * to re-open a file if we couldn't get its seek position before
    1104             :          * closing).
    1105             :          */
    1106         636 :         if (vfdP->seekPos != (off_t) 0)
    1107             :         {
    1108         411 :             if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
    1109             :             {
    1110             :                 /*
    1111             :                  * If we fail to restore the seek position, treat it like an
    1112             :                  * open() failure.
    1113             :                  */
    1114           0 :                 int         save_errno = errno;
    1115             : 
    1116           0 :                 elog(LOG, "could not seek file \"%s\" after re-opening: %m",
    1117             :                      vfdP->fileName);
    1118           0 :                 (void) close(vfdP->fd);
    1119           0 :                 vfdP->fd = VFD_CLOSED;
    1120           0 :                 --nfile;
    1121           0 :                 errno = save_errno;
    1122           0 :                 return -1;
    1123             :             }
    1124             :         }
    1125             :     }
    1126             : 
    1127             :     /*
    1128             :      * put it at the head of the Lru ring
    1129             :      */
    1130             : 
    1131         636 :     Insert(file);
    1132             : 
    1133         636 :     return 0;
    1134             : }
    1135             : 
    1136             : /*
    1137             :  * Release one kernel FD by closing the least-recently-used VFD.
    1138             :  */
    1139             : static bool
    1140        1176 : ReleaseLruFile(void)
    1141             : {
    1142             :     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
    1143             : 
    1144        1176 :     if (nfile > 0)
    1145             :     {
    1146             :         /*
    1147             :          * There are opened files and so there should be at least one used vfd
    1148             :          * in the ring.
    1149             :          */
    1150        1176 :         Assert(VfdCache[0].lruMoreRecently != 0);
    1151        1176 :         LruDelete(VfdCache[0].lruMoreRecently);
    1152        1176 :         return true;            /* freed a file */
    1153             :     }
    1154           0 :     return false;               /* no files available to free */
    1155             : }
    1156             : 
    1157             : /*
    1158             :  * Release kernel FDs as needed to get under the max_safe_fds limit.
    1159             :  * After calling this, it's OK to try to open another file.
    1160             :  */
    1161             : static void
    1162       36640 : ReleaseLruFiles(void)
    1163             : {
    1164       74456 :     while (nfile + numAllocatedDescs >= max_safe_fds)
    1165             :     {
    1166        1176 :         if (!ReleaseLruFile())
    1167           0 :             break;
    1168             :     }
    1169       36640 : }
    1170             : /* AllocateVfd: take an unused slot off the VfdCache free list, enlarging the array first if the list is empty. */
    1171             : static File /* returns the index of the slot removed from the free list; ereport(ERROR)s if realloc fails */
    1172       26561 : AllocateVfd(void)
    1173             : {
    1174             :     Index       i;
    1175             :     File        file;
    1176             : 
    1177             :     DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
    1178             : 
    1179       26561 :     Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */
    1180             : 
    1181       26561 :     if (VfdCache[0].nextFree == 0) /* slot 0 heads the free list; a link of 0 means "no free slot" */
    1182             :     {
    1183             :         /*
    1184             :          * The free list is empty so it is time to increase the size of the
    1185             :          * array.  We choose to double it each time this happens. However,
    1186             :          * there's not much point in starting *real* small, hence the floor of 32 entries.
    1187             :          */
    1188         447 :         Size        newCacheSize = SizeVfdCache * 2;
    1189             :         Vfd        *newVfdCache;
    1190             : 
    1191         447 :         if (newCacheSize < 32)
    1192         339 :             newCacheSize = 32;
    1193             : /* NOTE(review): neither the doubling nor sizeof(Vfd) * newCacheSize below is checked for overflow -- confirm that cannot matter in practice */
    1194             :         /*
    1195             :          * Be careful not to clobber VfdCache ptr if realloc fails.
    1196             :          */
    1197         447 :         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
    1198         447 :         if (newVfdCache == NULL)
    1199           0 :             ereport(ERROR,
    1200             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    1201             :                      errmsg("out of memory")));
    1202         447 :         VfdCache = newVfdCache;
    1203             : 
    1204             :         /*
    1205             :          * Initialize the new entries and link them into the free list.
    1206             :          */
    1207       17132 :         for (i = SizeVfdCache; i < newCacheSize; i++) /* the new slots are [old size, new size) */
    1208             :         {
    1209       16685 :             MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
    1210       16685 :             VfdCache[i].nextFree = i + 1; /* chain each new slot to its successor */
    1211       16685 :             VfdCache[i].fd = VFD_CLOSED;
    1212             :         }
    1213         447 :         VfdCache[newCacheSize - 1].nextFree = 0; /* last new slot terminates the chain */
    1214         447 :         VfdCache[0].nextFree = SizeVfdCache; /* free list now begins at the first new slot */
    1215             : 
    1216             :         /*
    1217             :          * Record the new size
    1218             :          */
    1219         447 :         SizeVfdCache = newCacheSize;
    1220             :     }
    1221             : 
    1222       26561 :     file = VfdCache[0].nextFree;
    1223             : /* unlink the chosen slot by making the list head point at its successor */
    1224       26561 :     VfdCache[0].nextFree = VfdCache[file].nextFree;
    1225             : 
    1226       26561 :     return file;
    1227             : }
    1228             : /* FreeVfd: free the slot's fileName (if set), clear its fdstate, and push the slot onto the free list headed by VfdCache[0]. */
    1229             : static void /* note: vfdP->fd is not modified here */
    1230       19365 : FreeVfd(File file)
    1231             : {
    1232       19365 :     Vfd        *vfdP = &VfdCache[file];
    1233             : 
    1234             :     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
    1235             :                file, vfdP->fileName ? vfdP->fileName : ""));
    1236             : 
    1237       19365 :     if (vfdP->fileName != NULL)
    1238             :     {
    1239       11959 :         free(vfdP->fileName);
    1240       11959 :         vfdP->fileName = NULL; /* leave no dangling pointer in the recycled slot */
    1241             :     }
    1242       19365 :     vfdP->fdstate = 0x0;
    1243             : /* push onto the front of the free list */
    1244       19365 :     vfdP->nextFree = VfdCache[0].nextFree;
    1245       19365 :     VfdCache[0].nextFree = file;
    1246       19365 : }
    1247             : 
    1248             : /* FileAccess: make sure the VFD has an open kernel FD and is at the head of the LRU ring; returns 0 on success, -1 on re-open failure (with errno set) */
    1249             : static int
    1250       37590 : FileAccess(File file)
    1251             : {
    1252             :     int         returnValue;
    1253             : 
    1254             :     DO_DB(elog(LOG, "FileAccess %d (%s)",
    1255             :                file, VfdCache[file].fileName));
    1256             : 
    1257             :     /*
    1258             :      * Is the file open?  If not, open it and put it at the head of the LRU
    1259             :      * ring (possibly closing the least recently used file to get an FD).
    1260             :      */
    1261             : 
    1262       37590 :     if (FileIsNotOpen(file))
    1263             :     {
    1264         636 :         returnValue = LruInsert(file);
    1265         636 :         if (returnValue != 0)
    1266           0 :             return returnValue; /* hand LruInsert's nonzero result straight back to the caller */
    1267             :     }
    1268       36954 :     else if (VfdCache[0].lruLessRecently != file)
    1269             :     {
    1270             :         /*
    1271             :          * We now know that the file is open and that it is not the last one
    1272             :          * accessed, so we need to move it to the head of the Lru ring.
    1273             :          */
    1274             : 
    1275        6837 :         Delete(file);
    1276        6837 :         Insert(file);
    1277             :     }
    1278             : 
    1279       37590 :     return 0;
    1280             : }
    1281             : 
    1282             : /*
    1283             :  *  FileInvalidate: called when we get a shared invalidation message on some relation.  Only compiled when NOT_USED is defined.
    1284             :  */
    1285             : #ifdef NOT_USED
    1286             : void
    1287             : FileInvalidate(File file)
    1288             : {
    1289             :     Assert(FileIsValid(file));
    1290             :     if (!FileIsNotOpen(file)) /* nothing to do unless the VFD currently holds a kernel FD */
    1291             :         LruDelete(file);
    1292             : }
    1293             : #endif
    1294             : 
    1295             : /*
    1296             :  * PathNameOpenFile: open a file in an arbitrary directory and return its VFD
    1297             :  * index, or -1 (with the failed open's errno restored) if BasicOpenFile fails.
    1298             :  * NB: if the passed pathname is relative (which it usually is),
    1299             :  * it will be interpreted relative to the process' working directory
    1300             :  * (which should always be $PGDATA when this code is running).
    1301             :  */
    1302             : File
    1303       26561 : PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
    1304             : {
    1305             :     char       *fnamecopy;
    1306             :     File        file;
    1307             :     Vfd        *vfdP;
    1308             : 
    1309             :     DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
    1310             :                fileName, fileFlags, fileMode));
    1311             : 
    1312             :     /*
    1313             :      * We need a malloc'd copy of the file name; fail cleanly if no room.
    1314             :      */
    1315       26561 :     fnamecopy = strdup(fileName);
    1316       26561 :     if (fnamecopy == NULL)
    1317           0 :         ereport(ERROR,
    1318             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    1319             :                  errmsg("out of memory")));
    1320             : /* NOTE(review): AllocateVfd can ereport(ERROR) on realloc failure, in which case fnamecopy is not freed here -- confirm whether that leak matters */
    1321       26561 :     file = AllocateVfd();
    1322       26561 :     vfdP = &VfdCache[file];
    1323             : 
    1324             :     /* Close excess kernel FDs. */
    1325       26561 :     ReleaseLruFiles();
    1326             : 
    1327       26561 :     vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
    1328             : 
    1329       26561 :     if (vfdP->fd < 0)
    1330             :     {
    1331        7406 :         int         save_errno = errno; /* the cleanup calls below may overwrite errno */
    1332             : 
    1333        7406 :         FreeVfd(file); /* fileName has not been stored in the slot yet, so the copy is freed separately */
    1334        7406 :         free(fnamecopy);
    1335        7406 :         errno = save_errno;
    1336        7406 :         return -1;
    1337             :     }
    1338       19155 :     ++nfile; /* one more kernel FD held by the VFD layer */
    1339             :     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
    1340             :                vfdP->fd));
    1341             : 
    1342       19155 :     Insert(file);
    1343             : 
    1344       19155 :     vfdP->fileName = fnamecopy; /* the slot now owns the copy; FreeVfd releases it */
    1345             :     /* Saved flags are adjusted to be OK for re-opening file */
    1346       19155 :     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    1347       19155 :     vfdP->fileMode = fileMode;
    1348       19155 :     vfdP->seekPos = 0;
    1349       19155 :     vfdP->fileSize = 0;
    1350       19155 :     vfdP->fdstate = 0x0;
    1351       19155 :     vfdP->resowner = NULL;
    1352             : 
    1353       19155 :     return file;
    1354             : }
    1355             : 
    1356             : /*
    1357             :  * OpenTemporaryFile: open a temporary file that will disappear when we close it.
    1358             :  *
    1359             :  * This routine takes care of generating an appropriate tempfile name.
    1360             :  * There's no need to pass in fileFlags or fileMode either, since only
    1361             :  * one setting makes any sense for a temp file.
    1362             :  *
    1363             :  * Unless interXact is true, the file is remembered by CurrentResourceOwner
    1364             :  * to ensure it's closed and deleted when it's no longer needed, typically at
    1365             :  * the end-of-transaction. In most cases, you don't want temporary files to
    1366             :  * outlive the transaction that created them, so this should be false -- but
    1367             :  * if you need "somewhat" temporary storage, this might be useful. In either
    1368             :  * case, the file is removed when the File is explicitly closed.
    1369             :  */
    1370             : File
    1371          24 : OpenTemporaryFile(bool interXact)
    1372             : {
    1373          24 :     File        file = 0; /* 0 doubles as "nothing opened yet"; see the <= 0 test below */
    1374             : 
    1375             :     /*
    1376             :      * If some temp tablespace(s) have been given to us, try to use the next
    1377             :      * one.  If a given tablespace can't be found, we silently fall back to
    1378             :      * the database's default tablespace.
    1379             :      *
    1380             :      * BUT: if the temp file is slated to outlive the current transaction,
    1381             :      * force it into the database's default tablespace, so that it will not
    1382             :      * pose a threat to possible tablespace drop attempts.
    1383             :      */
    1384          24 :     if (numTempTableSpaces > 0 && !interXact)
    1385             :     {
    1386           0 :         Oid         tblspcOid = GetNextTempTableSpace();
    1387             : 
    1388           0 :         if (OidIsValid(tblspcOid))
    1389           0 :             file = OpenTemporaryFileInTablespace(tblspcOid, false); /* rejectError = false: failure falls through to the default tablespace */
    1390             :     }
    1391             : 
    1392             :     /*
    1393             :      * If not, or if tablespace is bad, create in database's default
    1394             :      * tablespace.  MyDatabaseTableSpace should normally be set before we get
    1395             :      * here, but just in case it isn't, fall back to pg_default tablespace.
    1396             :      */
    1397          24 :     if (file <= 0)
    1398          24 :         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
    1399             :                                              MyDatabaseTableSpace :
    1400             :                                              DEFAULTTABLESPACE_OID,
    1401             :                                              true);
    1402             : /* The call above passes rejectError = true, so it raises ERROR instead of returning a File <= 0; the VfdCache[file] accesses below rely on that. */
    1403             :     /* Mark it for deletion at close */
    1404          24 :     VfdCache[file].fdstate |= FD_TEMPORARY;
    1405             : 
    1406             :     /* Register it with the current resource owner */
    1407          24 :     if (!interXact)
    1408             :     {
    1409          24 :         VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
    1410             : 
    1411          24 :         ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1412          24 :         ResourceOwnerRememberFile(CurrentResourceOwner, file);
    1413          24 :         VfdCache[file].resowner = CurrentResourceOwner; /* FileClose uses this to unregister the file */
    1414             : 
    1415             :         /* ensure cleanup happens at eoxact */
    1416          24 :         have_xact_temporary_files = true;
    1417             :     }
    1418             : 
    1419          24 :     return file;
    1420             : }
    1421             : 
    1422             : /*
    1423             :  * Open a temporary file in a specific tablespace.  If rejectError is true, failure raises ERROR; otherwise the failed (<= 0) File is returned.
    1424             :  * Subroutine for OpenTemporaryFile, which see for details.
    1425             :  */
    1426             : static File
    1427          24 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
    1428             : {
    1429             :     char        tempdirpath[MAXPGPATH];
    1430             :     char        tempfilepath[MAXPGPATH];
    1431             :     File        file;
    1432             : 
    1433             :     /*
    1434             :      * Identify the tempfile directory for this tablespace.
    1435             :      *
    1436             :      * If someone tries to specify pg_global, use pg_default instead.
    1437             :      */
    1438          24 :     if (tblspcOid == DEFAULTTABLESPACE_OID ||
    1439             :         tblspcOid == GLOBALTABLESPACE_OID)
    1440             :     {
    1441             :         /* The default tablespace is {datadir}/base */
    1442          24 :         snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
    1443             :                  PG_TEMP_FILES_DIR);
    1444             :     }
    1445             :     else
    1446             :     {
    1447             :         /* All other tablespaces are accessed via symlinks */
    1448           0 :         snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
    1449             :                  tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
    1450             :     }
    1451             : 
    1452             :     /*
    1453             :      * Generate a tempfile name that should be unique within the current
    1454             :      * database instance.  (The name combines MyProcPid with a per-call counter.)
    1455             :      */
    1456          24 :     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
    1457             :              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
    1458             : /* NOTE(review): snprintf results are not checked for truncation -- presumably MAXPGPATH is always ample for these paths; confirm */
    1459             :     /*
    1460             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1461             :      * temp file that can be reused.
    1462             :      */
    1463          24 :     file = PathNameOpenFile(tempfilepath,
    1464             :                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
    1465             :                             0600);
    1466          24 :     if (file <= 0)
    1467             :     {
    1468             :         /*
    1469             :          * We might need to create the tablespace's tempfile directory, if no
    1470             :          * one has yet done so.
    1471             :          *
    1472             :          * Don't check for error from mkdir; it could fail if someone else
    1473             :          * just did the same thing.  If it doesn't work then we'll bomb out on
    1474             :          * the second create attempt, instead.
    1475             :          */
    1476           1 :         mkdir(tempdirpath, S_IRWXU);
    1477             : /* second and final attempt, with the same flags and mode as the first */
    1478           1 :         file = PathNameOpenFile(tempfilepath,
    1479             :                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
    1480             :                                 0600);
    1481           1 :         if (file <= 0 && rejectError)
    1482           0 :             elog(ERROR, "could not create temporary file \"%s\": %m",
    1483             :                  tempfilepath);
    1484             :     }
    1485             : 
    1486          24 :     return file; /* may still be <= 0 when rejectError is false */
    1487             : }
    1488             : 
    1489             : /*
    1490             :  * FileClose: close a file when done with it -- closes the kernel FD if open, unlinks the file if it is FD_TEMPORARY, and frees the VFD slot
    1491             :  */
    1492             : void
    1493       11959 : FileClose(File file)
    1494             : {
    1495             :     Vfd        *vfdP;
    1496             : 
    1497       11959 :     Assert(FileIsValid(file));
    1498             : 
    1499             :     DO_DB(elog(LOG, "FileClose: %d (%s)",
    1500             :                file, VfdCache[file].fileName));
    1501             : 
    1502       11959 :     vfdP = &VfdCache[file];
    1503             : 
    1504       11959 :     if (!FileIsNotOpen(file))
    1505             :     {
    1506             :         /* close the file */
    1507       11607 :         if (close(vfdP->fd))
    1508           0 :             elog(LOG, "could not close file \"%s\": %m", vfdP->fileName); /* only logged; cleanup continues regardless */
    1509             : 
    1510       11607 :         --nfile;
    1511       11607 :         vfdP->fd = VFD_CLOSED;
    1512             : 
    1513             :         /* remove the file from the lru ring */
    1514       11607 :         Delete(file);
    1515             :     }
    1516             : 
    1517             :     /*
    1518             :      * Delete the file if it was temporary, and make a log entry if wanted
    1519             :      */
    1520       11959 :     if (vfdP->fdstate & FD_TEMPORARY)
    1521             :     {
    1522             :         struct stat filestats;
    1523             :         int         stat_errno;
    1524             : 
    1525             :         /*
    1526             :          * If we get an error, as could happen within the ereport/elog calls,
    1527             :          * we'll come right back here during transaction abort.  Reset the
    1528             :          * flag to ensure that we can't get into an infinite loop.  This code
    1529             :          * is arranged to ensure that the worst-case consequence is failing to
    1530             :          * emit log message(s), not failing to attempt the unlink.
    1531             :          */
    1532          24 :         vfdP->fdstate &= ~FD_TEMPORARY;
    1533             : 
    1534             :         /* Subtract its size from current usage (do first in case of error) */
    1535          24 :         temporary_files_size -= vfdP->fileSize;
    1536          24 :         vfdP->fileSize = 0;
    1537             : 
    1538             :         /* first try the stat() */
    1539          24 :         if (stat(vfdP->fileName, &filestats))
    1540           0 :             stat_errno = errno; /* remember the failure; it is reported only after the unlink */
    1541             :         else
    1542          24 :             stat_errno = 0;
    1543             : 
    1544             :         /* in any case do the unlink */
    1545          24 :         if (unlink(vfdP->fileName))
    1546           0 :             elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
    1547             : 
    1548             :         /* and last report the stat results */
    1549          24 :         if (stat_errno == 0)
    1550             :         {
    1551          24 :             pgstat_report_tempfile(filestats.st_size);
    1552             : 
    1553          24 :             if (log_temp_files >= 0) /* a negative setting suppresses the log entry */
    1554             :             {
    1555          24 :                 if ((filestats.st_size / 1024) >= log_temp_files) /* st_size / 1024 is compared against the threshold */
    1556           8 :                     ereport(LOG,
    1557             :                             (errmsg("temporary file: path \"%s\", size %lu",
    1558             :                                     vfdP->fileName,
    1559             :                                     (unsigned long) filestats.st_size)));
    1560             :             }
    1561             :         }
    1562             :         else
    1563             :         {
    1564           0 :             errno = stat_errno; /* restore so that %m describes the stat() failure */
    1565           0 :             elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
    1566             :         }
    1567             :     }
    1568             : 
    1569             :     /* Unregister it from the resource owner */
    1570       11959 :     if (vfdP->resowner)
    1571          24 :         ResourceOwnerForgetFile(vfdP->resowner, file);
    1572             : 
    1573             :     /*
    1574             :      * Return the Vfd slot to the free list
    1575             :      */
    1576       11959 :     FreeVfd(file);
    1577       11959 : }
    1578             : 
    1579             : /*
    1580             :  * FilePrefetch - initiate asynchronous read of a given range of the file.
    1581             :  * The logical seek position is unaffected.
    1582             :  *
    1583             :  * Currently implemented only via posix_fadvise(POSIX_FADV_WILLNEED); a libaio
    1584             :  * version could be added in the future, but this API is inappropriate for
    1585             :  * libaio, which wants to have a buffer provided to read into.  Returns
    1586             :  * FileAccess's negative result if that fails, otherwise posix_fadvise's
    1587             :  * return value; always returns 0 when posix_fadvise support is not compiled in.
    1588             :  */
    1589             : int
    1590           0 : FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
    1591             : {
    1592             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    1593             :     int         returnCode;
    1594             : 
    1595           0 :     Assert(FileIsValid(file));
    1596             : 
    1597             :     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
    1598             :                file, VfdCache[file].fileName,
    1599             :                (int64) offset, amount));
    1600             : 
    1601           0 :     returnCode = FileAccess(file);
    1602           0 :     if (returnCode < 0)
    1603           0 :         return returnCode;
    1604             : 
    1605           0 :     pgstat_report_wait_start(wait_event_info);
    1606           0 :     returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
    1607             :                                POSIX_FADV_WILLNEED);
    1608           0 :     pgstat_report_wait_end();
    1609             : /* NOTE(review): POSIX specifies that posix_fadvise returns an error number (not -1 with errno) on failure -- confirm callers expect that */
    1610           0 :     return returnCode;
    1611             : #else
    1612             :     Assert(FileIsValid(file));
    1613             :     return 0; /* no prefetch support compiled in: report success without doing anything */
    1614             : #endif
    1615             : }
    1616             : /* FileWriteback: pass the range (offset, nbytes) of the file's kernel FD to pg_flush_data; does nothing if nbytes <= 0 or if FileAccess fails. */
    1617             : void
    1618        1477 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    1619             : {
    1620             :     int         returnCode;
    1621             : 
    1622        1477 :     Assert(FileIsValid(file));
    1623             : 
    1624             :     DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    1625             :                file, VfdCache[file].fileName,
    1626             :                (int64) offset, (int64) nbytes));
    1627             : 
    1628             :     /*
    1629             :      * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
    1630             :      * file's seek position.  We prefer to define that as a no-op here.
    1631             :      */
    1632        1477 :     if (nbytes <= 0)
    1633           0 :         return;
    1634             : 
    1635        1477 :     returnCode = FileAccess(file);
    1636        1477 :     if (returnCode < 0)
    1637           0 :         return; /* the failure is not reported: this function returns void */
    1638             : 
    1639        1477 :     pgstat_report_wait_start(wait_event_info);
    1640        1477 :     pg_flush_data(VfdCache[file].fd, offset, nbytes);
    1641        1477 :     pgstat_report_wait_end();
    1642             : }
    1643             : /* FileRead: read() up to amount bytes from the file's kernel FD into buffer, advancing the tracked seekPos; returns read()'s result, or FileAccess's negative result. */
    1644             : int
    1645        5646 : FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
    1646             : {
    1647             :     int         returnCode;
    1648             :     Vfd        *vfdP;
    1649             : 
    1650        5646 :     Assert(FileIsValid(file));
    1651             : 
    1652             :     DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
    1653             :                file, VfdCache[file].fileName,
    1654             :                (int64) VfdCache[file].seekPos,
    1655             :                amount, buffer));
    1656             : 
    1657        5646 :     returnCode = FileAccess(file);
    1658        5646 :     if (returnCode < 0)
    1659           0 :         return returnCode;
    1660             : 
    1661        5646 :     vfdP = &VfdCache[file];
    1662             : 
    1663             : retry:
    1664        5646 :     pgstat_report_wait_start(wait_event_info);
    1665        5646 :     returnCode = read(vfdP->fd, buffer, amount);
    1666        5646 :     pgstat_report_wait_end();
    1667             : 
    1668        5646 :     if (returnCode >= 0)
    1669             :     {
    1670             :         /* if seekPos is unknown, leave it that way */
    1671        5646 :         if (!FilePosIsUnknown(vfdP->seekPos))
    1672        5646 :             vfdP->seekPos += returnCode; /* advance by the bytes actually read */
    1673             :     }
    1674             :     else
    1675             :     {
    1676             :         /*
    1677             :          * Windows may run out of kernel buffers and return "Insufficient
    1678             :          * system resources" error.  Wait a bit and retry to solve it.
    1679             :          *
    1680             :          * It is rumored that EINTR is also possible on some Unix filesystems,
    1681             :          * in which case immediate retry is indicated.
    1682             :          */
    1683             : #ifdef WIN32
    1684             :         DWORD       error = GetLastError();
    1685             : 
    1686             :         switch (error)
    1687             :         {
    1688             :             case ERROR_NO_SYSTEM_RESOURCES:
    1689             :                 pg_usleep(1000L);
    1690             :                 errno = EINTR; /* makes the EINTR test below trigger the retry */
    1691             :                 break;
    1692             :             default:
    1693             :                 _dosmaperr(error);
    1694             :                 break;
    1695             :         }
    1696             : #endif
    1697             :         /* OK to retry if interrupted */
    1698           0 :         if (errno == EINTR)
    1699           0 :             goto retry;
    1700             : 
    1701             :         /* Trouble, so assume we don't know the file position anymore */
    1702           0 :         vfdP->seekPos = FileUnknownPos;
    1703             :     }
    1704             : 
    1705        5646 :     return returnCode;
    1706             : }
    1707             : /* FileWrite: write() amount bytes from buffer to the file's kernel FD, enforcing temp_file_limit for FD_TEMPORARY files; returns write()'s result, or FileAccess's negative result. */
    1708             : int
    1709       28649 : FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
    1710             : {
    1711             :     int         returnCode;
    1712             :     Vfd        *vfdP;
    1713             : 
    1714       28649 :     Assert(FileIsValid(file));
    1715             : 
    1716             :     DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
    1717             :                file, VfdCache[file].fileName,
    1718             :                (int64) VfdCache[file].seekPos,
    1719             :                amount, buffer));
    1720             : 
    1721       28649 :     returnCode = FileAccess(file);
    1722       28649 :     if (returnCode < 0)
    1723           0 :         return returnCode;
    1724             : 
    1725       28649 :     vfdP = &VfdCache[file];
    1726             : 
    1727             :     /*
    1728             :      * If enforcing temp_file_limit and it's a temp file, check to see if the
    1729             :      * write would overrun temp_file_limit, and throw error if so.  Note: it's
    1730             :      * really a modularity violation to throw error here; we should set errno
    1731             :      * and return -1.  However, there's no way to report a suitable error
    1732             :      * message if we do that.  All current callers would just throw error
    1733             :      * immediately anyway, so this is safe at present.
    1734             :      */
    1735       28649 :     if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY)) /* a negative temp_file_limit skips the check */
    1736             :     {
    1737             :         off_t       newPos;
    1738             : 
    1739             :         /*
    1740             :          * Normally we should know the seek position, but if for some reason
    1741             :          * we have lost track of it, try again to get it.  Here, it's fine to
    1742             :          * throw an error if we still can't get it.
    1743             :          */
    1744           0 :         if (FilePosIsUnknown(vfdP->seekPos))
    1745             :         {
    1746           0 :             vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
    1747           0 :             if (FilePosIsUnknown(vfdP->seekPos))
    1748           0 :                 elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
    1749             :         }
    1750             : 
    1751           0 :         newPos = vfdP->seekPos + amount; /* position the file would reach if the whole write succeeds */
    1752           0 :         if (newPos > vfdP->fileSize) /* only growth beyond the recorded size counts against the limit */
    1753             :         {
    1754           0 :             uint64      newTotal = temporary_files_size;
    1755             : 
    1756           0 :             newTotal += newPos - vfdP->fileSize;
    1757           0 :             if (newTotal > (uint64) temp_file_limit * (uint64) 1024) /* limit is scaled by 1024 to compare in bytes (message reports it in kB) */
    1758           0 :                 ereport(ERROR,
    1759             :                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
    1760             :                          errmsg("temporary file size exceeds temp_file_limit (%dkB)",
    1761             :                                 temp_file_limit)));
    1762             :         }
    1763             :     }
    1764             : 
    1765             : retry:
    1766       28649 :     errno = 0; /* cleared so we can tell below whether write() set it */
    1767       28649 :     pgstat_report_wait_start(wait_event_info);
    1768       28649 :     returnCode = write(vfdP->fd, buffer, amount);
    1769       28649 :     pgstat_report_wait_end();
    1770             : 
    1771             :     /* if write didn't set errno, assume problem is no disk space */
    1772       28649 :     if (returnCode != amount && errno == 0)
    1773           0 :         errno = ENOSPC;
    1774             : /* a short write (0 <= returnCode < amount) still takes the success branch below; the caller sees the partial count */
    1775       28649 :     if (returnCode >= 0)
    1776             :     {
    1777             :         /* if seekPos is unknown, leave it that way */
    1778       28649 :         if (!FilePosIsUnknown(vfdP->seekPos))
    1779       28649 :             vfdP->seekPos += returnCode;
    1780             : 
    1781             :         /*
    1782             :          * Maintain fileSize and temporary_files_size if it's a temp file.
    1783             :          *
    1784             :          * If seekPos is -1 (unknown), this will do nothing; but we could only
    1785             :          * get here in that state if we're not enforcing temporary_files_size,
    1786             :          * so we don't care.
    1787             :          */
    1788       28649 :         if (vfdP->fdstate & FD_TEMPORARY)
    1789             :         {
    1790        1225 :             off_t       newPos = vfdP->seekPos;
    1791             : 
    1792        1225 :             if (newPos > vfdP->fileSize)
    1793             :             {
    1794        1128 :                 temporary_files_size += newPos - vfdP->fileSize; /* charge only the newly added bytes */
    1795        1128 :                 vfdP->fileSize = newPos;
    1796             :             }
    1797             :         }
    1798             :     }
    1799             :     else
    1800             :     {
    1801             :         /*
    1802             :          * See comments in FileRead()
    1803             :          */
    1804             : #ifdef WIN32
    1805             :         DWORD       error = GetLastError();
    1806             : 
    1807             :         switch (error)
    1808             :         {
    1809             :             case ERROR_NO_SYSTEM_RESOURCES:
    1810             :                 pg_usleep(1000L);
    1811             :                 errno = EINTR; /* makes the EINTR test below trigger the retry */
    1812             :                 break;
    1813             :             default:
    1814             :                 _dosmaperr(error);
    1815             :                 break;
    1816             :         }
    1817             : #endif
    1818             :         /* OK to retry if interrupted */
    1819           0 :         if (errno == EINTR)
    1820           0 :             goto retry;
    1821             : 
    1822             :         /* Trouble, so assume we don't know the file position anymore */
    1823           0 :         vfdP->seekPos = FileUnknownPos;
    1824             :     }
    1825             : 
    1826       28649 :     return returnCode;
    1827             : }
    1828             : 
    1829             : int
    1830        1264 : FileSync(File file, uint32 wait_event_info)
    1831             : {
    1832             :     int         returnCode;
    1833             : 
    1834        1264 :     Assert(FileIsValid(file));
    1835             : 
    1836             :     DO_DB(elog(LOG, "FileSync: %d (%s)",
    1837             :                file, VfdCache[file].fileName));
    1838             : 
    1839        1264 :     returnCode = FileAccess(file);
    1840        1264 :     if (returnCode < 0)
    1841           0 :         return returnCode;
    1842             : 
    1843        1264 :     pgstat_report_wait_start(wait_event_info);
    1844        1264 :     returnCode = pg_fsync(VfdCache[file].fd);
    1845        1264 :     pgstat_report_wait_end();
    1846             : 
    1847        1264 :     return returnCode;
    1848             : }
    1849             : 
    1850             : off_t
    1851      150818 : FileSeek(File file, off_t offset, int whence)
    1852             : {
    1853             :     Vfd        *vfdP;
    1854             : 
    1855      150818 :     Assert(FileIsValid(file));
    1856             : 
    1857             :     DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
    1858             :                file, VfdCache[file].fileName,
    1859             :                (int64) VfdCache[file].seekPos,
    1860             :                (int64) offset, whence));
    1861             : 
    1862      150818 :     vfdP = &VfdCache[file];
    1863             : 
    1864      150818 :     if (FileIsNotOpen(file))
    1865             :     {
    1866         636 :         switch (whence)
    1867             :         {
    1868             :             case SEEK_SET:
    1869         107 :                 if (offset < 0)
    1870             :                 {
    1871           0 :                     errno = EINVAL;
    1872           0 :                     return (off_t) -1;
    1873             :                 }
    1874         107 :                 vfdP->seekPos = offset;
    1875         107 :                 break;
    1876             :             case SEEK_CUR:
    1877           0 :                 if (FilePosIsUnknown(vfdP->seekPos) ||
    1878           0 :                     vfdP->seekPos + offset < 0)
    1879             :                 {
    1880           0 :                     errno = EINVAL;
    1881           0 :                     return (off_t) -1;
    1882             :                 }
    1883           0 :                 vfdP->seekPos += offset;
    1884           0 :                 break;
    1885             :             case SEEK_END:
    1886         529 :                 if (FileAccess(file) < 0)
    1887           0 :                     return (off_t) -1;
    1888         529 :                 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
    1889         529 :                 break;
    1890             :             default:
    1891           0 :                 elog(ERROR, "invalid whence: %d", whence);
    1892             :                 break;
    1893             :         }
    1894             :     }
    1895             :     else
    1896             :     {
    1897      150182 :         switch (whence)
    1898             :         {
    1899             :             case SEEK_SET:
    1900       32012 :                 if (offset < 0)
    1901             :                 {
    1902           0 :                     errno = EINVAL;
    1903           0 :                     return (off_t) -1;
    1904             :                 }
    1905       32012 :                 if (vfdP->seekPos != offset)
    1906        3891 :                     vfdP->seekPos = lseek(vfdP->fd, offset, whence);
    1907       32012 :                 break;
    1908             :             case SEEK_CUR:
    1909           0 :                 if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
    1910           0 :                     vfdP->seekPos = lseek(vfdP->fd, offset, whence);
    1911           0 :                 break;
    1912             :             case SEEK_END:
    1913      118170 :                 vfdP->seekPos = lseek(vfdP->fd, offset, whence);
    1914      118170 :                 break;
    1915             :             default:
    1916           0 :                 elog(ERROR, "invalid whence: %d", whence);
    1917             :                 break;
    1918             :         }
    1919             :     }
    1920             : 
    1921      150818 :     return vfdP->seekPos;
    1922             : }
    1923             : 
    1924             : /*
    1925             :  * XXX not actually used but here for completeness
    1926             :  */
    1927             : #ifdef NOT_USED
    1928             : off_t
    1929             : FileTell(File file)
    1930             : {
    1931             :     Assert(FileIsValid(file));
    1932             :     DO_DB(elog(LOG, "FileTell %d (%s)",
    1933             :                file, VfdCache[file].fileName));
    1934             :     return VfdCache[file].seekPos;
    1935             : }
    1936             : #endif
    1937             : 
    1938             : int
    1939          25 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
    1940             : {
    1941             :     int         returnCode;
    1942             : 
    1943          25 :     Assert(FileIsValid(file));
    1944             : 
    1945             :     DO_DB(elog(LOG, "FileTruncate %d (%s)",
    1946             :                file, VfdCache[file].fileName));
    1947             : 
    1948          25 :     returnCode = FileAccess(file);
    1949          25 :     if (returnCode < 0)
    1950           0 :         return returnCode;
    1951             : 
    1952          25 :     pgstat_report_wait_start(wait_event_info);
    1953          25 :     returnCode = ftruncate(VfdCache[file].fd, offset);
    1954          25 :     pgstat_report_wait_end();
    1955             : 
    1956          25 :     if (returnCode == 0 && VfdCache[file].fileSize > offset)
    1957             :     {
    1958             :         /* adjust our state for truncation of a temp file */
    1959           0 :         Assert(VfdCache[file].fdstate & FD_TEMPORARY);
    1960           0 :         temporary_files_size -= VfdCache[file].fileSize - offset;
    1961           0 :         VfdCache[file].fileSize = offset;
    1962             :     }
    1963             : 
    1964          25 :     return returnCode;
    1965             : }
    1966             : 
    1967             : /*
    1968             :  * Return the pathname associated with an open file.
    1969             :  *
    1970             :  * The returned string points to an internal buffer, which is valid until
    1971             :  * the file is closed.
    1972             :  */
    1973             : char *
    1974           0 : FilePathName(File file)
    1975             : {
    1976           0 :     Assert(FileIsValid(file));
    1977             : 
    1978           0 :     return VfdCache[file].fileName;
    1979             : }
    1980             : 
    1981             : /*
    1982             :  * Return the raw file descriptor of an opened file.
    1983             :  *
    1984             :  * The returned file descriptor will be valid until the file is closed, but
    1985             :  * there are a lot of things that can make that happen.  So the caller should
    1986             :  * be careful not to do much of anything else before it finishes using the
    1987             :  * returned file descriptor.
    1988             :  */
    1989             : int
    1990           0 : FileGetRawDesc(File file)
    1991             : {
    1992           0 :     Assert(FileIsValid(file));
    1993           0 :     return VfdCache[file].fd;
    1994             : }
    1995             : 
    1996             : /*
    1997             :  * FileGetRawFlags - returns the file flags on open(2)
    1998             :  */
    1999             : int
    2000           0 : FileGetRawFlags(File file)
    2001             : {
    2002           0 :     Assert(FileIsValid(file));
    2003           0 :     return VfdCache[file].fileFlags;
    2004             : }
    2005             : 
    2006             : /*
    2007             :  * FileGetRawMode - returns the mode bitmask passed to open(2)
    2008             :  */
    2009             : int
    2010           0 : FileGetRawMode(File file)
    2011             : {
    2012           0 :     Assert(FileIsValid(file));
    2013           0 :     return VfdCache[file].fileMode;
    2014             : }
    2015             : 
    2016             : /*
    2017             :  * Make room for another allocatedDescs[] array entry if needed and possible.
    2018             :  * Returns true if an array element is available.
    2019             :  */
    2020             : static bool
    2021        9443 : reserveAllocatedDesc(void)
    2022             : {
    2023             :     AllocateDesc *newDescs;
    2024             :     int         newMax;
    2025             : 
    2026             :     /* Quick out if array already has a free slot. */
    2027        9443 :     if (numAllocatedDescs < maxAllocatedDescs)
    2028        9438 :         return true;
    2029             : 
    2030             :     /*
    2031             :      * If the array hasn't yet been created in the current process, initialize
    2032             :      * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
    2033             :      * we will ever need, anyway.  We don't want to look at max_safe_fds
    2034             :      * immediately because set_max_safe_fds() may not have run yet.
    2035             :      */
    2036           5 :     if (allocatedDescs == NULL)
    2037             :     {
    2038           5 :         newMax = FD_MINFREE / 2;
    2039           5 :         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
    2040             :         /* Out of memory already?  Treat as fatal error. */
    2041           5 :         if (newDescs == NULL)
    2042           0 :             ereport(ERROR,
    2043             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    2044             :                      errmsg("out of memory")));
    2045           5 :         allocatedDescs = newDescs;
    2046           5 :         maxAllocatedDescs = newMax;
    2047           5 :         return true;
    2048             :     }
    2049             : 
    2050             :     /*
    2051             :      * Consider enlarging the array beyond the initial allocation used above.
    2052             :      * By the time this happens, max_safe_fds should be known accurately.
    2053             :      *
    2054             :      * We mustn't let allocated descriptors hog all the available FDs, and in
    2055             :      * practice we'd better leave a reasonable number of FDs for VFD use.  So
    2056             :      * set the maximum to max_safe_fds / 2.  (This should certainly be at
    2057             :      * least as large as the initial size, FD_MINFREE / 2.)
    2058             :      */
    2059           0 :     newMax = max_safe_fds / 2;
    2060           0 :     if (newMax > maxAllocatedDescs)
    2061             :     {
    2062           0 :         newDescs = (AllocateDesc *) realloc(allocatedDescs,
    2063             :                                             newMax * sizeof(AllocateDesc));
    2064             :         /* Treat out-of-memory as a non-fatal error. */
    2065           0 :         if (newDescs == NULL)
    2066           0 :             return false;
    2067           0 :         allocatedDescs = newDescs;
    2068           0 :         maxAllocatedDescs = newMax;
    2069           0 :         return true;
    2070             :     }
    2071             : 
    2072             :     /* Can't enlarge allocatedDescs[] any more. */
    2073           0 :     return false;
    2074             : }
    2075             : 
    2076             : /*
    2077             :  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
    2078             :  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
    2079             :  * necessary to open the file.  When done, call FreeFile rather than fclose.
    2080             :  *
    2081             :  * Note that files that will be open for any significant length of time
    2082             :  * should NOT be handled this way, since they cannot share kernel file
    2083             :  * descriptors with other files; there is grave risk of running out of FDs
    2084             :  * if anyone locks down too many FDs.  Most callers of this routine are
    2085             :  * simply reading a config file that they will read and close immediately.
    2086             :  *
    2087             :  * fd.c will automatically close all files opened with AllocateFile at
    2088             :  * transaction commit or abort; this prevents FD leakage if a routine
    2089             :  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
    2090             :  *
    2091             :  * Ideally this should be the *only* direct call of fopen() in the backend.
    2092             :  */
    2093             : FILE *
    2094        3422 : AllocateFile(const char *name, const char *mode)
    2095             : {
    2096             :     FILE       *file;
    2097             : 
    2098             :     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
    2099             :                numAllocatedDescs, name));
    2100             : 
    2101             :     /* Can we allocate another non-virtual FD? */
    2102        3422 :     if (!reserveAllocatedDesc())
    2103           0 :         ereport(ERROR,
    2104             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2105             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2106             :                         maxAllocatedDescs, name)));
    2107             : 
    2108             :     /* Close excess kernel FDs. */
    2109        3422 :     ReleaseLruFiles();
    2110             : 
    2111             : TryAgain:
    2112        3422 :     if ((file = fopen(name, mode)) != NULL)
    2113             :     {
    2114        3388 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2115             : 
    2116        3388 :         desc->kind = AllocateDescFile;
    2117        3388 :         desc->desc.file = file;
    2118        3388 :         desc->create_subid = GetCurrentSubTransactionId();
    2119        3388 :         numAllocatedDescs++;
    2120        3388 :         return desc->desc.file;
    2121             :     }
    2122             : 
    2123          34 :     if (errno == EMFILE || errno == ENFILE)
    2124             :     {
    2125           0 :         int         save_errno = errno;
    2126             : 
    2127           0 :         ereport(LOG,
    2128             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2129             :                  errmsg("out of file descriptors: %m; release and retry")));
    2130           0 :         errno = 0;
    2131           0 :         if (ReleaseLruFile())
    2132           0 :             goto TryAgain;
    2133           0 :         errno = save_errno;
    2134             :     }
    2135             : 
    2136          34 :     return NULL;
    2137             : }
    2138             : 
    2139             : 
    2140             : /*
    2141             :  * Like AllocateFile, but returns an unbuffered fd like open(2)
    2142             :  */
    2143             : int
    2144        5434 : OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
    2145             : {
    2146             :     int         fd;
    2147             : 
    2148             :     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
    2149             :                numAllocatedDescs, fileName));
    2150             : 
    2151             :     /* Can we allocate another non-virtual FD? */
    2152        5434 :     if (!reserveAllocatedDesc())
    2153           0 :         ereport(ERROR,
    2154             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2155             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2156             :                         maxAllocatedDescs, fileName)));
    2157             : 
    2158             :     /* Close excess kernel FDs. */
    2159        5434 :     ReleaseLruFiles();
    2160             : 
    2161        5434 :     fd = BasicOpenFile(fileName, fileFlags, fileMode);
    2162             : 
    2163        5434 :     if (fd >= 0)
    2164             :     {
    2165        5432 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2166             : 
    2167        5432 :         desc->kind = AllocateDescRawFD;
    2168        5432 :         desc->desc.fd = fd;
    2169        5432 :         desc->create_subid = GetCurrentSubTransactionId();
    2170        5432 :         numAllocatedDescs++;
    2171             : 
    2172        5432 :         return fd;
    2173             :     }
    2174             : 
    2175           2 :     return -1;                  /* failure */
    2176             : }
    2177             : 
    2178             : /*
    2179             :  * Routines that want to initiate a pipe stream should use OpenPipeStream
    2180             :  * rather than plain popen().  This lets fd.c deal with freeing FDs if
    2181             :  * necessary.  When done, call ClosePipeStream rather than pclose.
    2182             :  */
    2183             : FILE *
    2184           1 : OpenPipeStream(const char *command, const char *mode)
    2185             : {
    2186             :     FILE       *file;
    2187             : 
    2188             :     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
    2189             :                numAllocatedDescs, command));
    2190             : 
    2191             :     /* Can we allocate another non-virtual FD? */
    2192           1 :     if (!reserveAllocatedDesc())
    2193           0 :         ereport(ERROR,
    2194             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2195             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
    2196             :                         maxAllocatedDescs, command)));
    2197             : 
    2198             :     /* Close excess kernel FDs. */
    2199           1 :     ReleaseLruFiles();
    2200             : 
    2201             : TryAgain:
    2202           1 :     fflush(stdout);
    2203           1 :     fflush(stderr);
    2204           1 :     errno = 0;
    2205           1 :     if ((file = popen(command, mode)) != NULL)
    2206             :     {
    2207           1 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2208             : 
    2209           1 :         desc->kind = AllocateDescPipe;
    2210           1 :         desc->desc.file = file;
    2211           1 :         desc->create_subid = GetCurrentSubTransactionId();
    2212           1 :         numAllocatedDescs++;
    2213           1 :         return desc->desc.file;
    2214             :     }
    2215             : 
    2216           0 :     if (errno == EMFILE || errno == ENFILE)
    2217             :     {
    2218           0 :         int         save_errno = errno;
    2219             : 
    2220           0 :         ereport(LOG,
    2221             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2222             :                  errmsg("out of file descriptors: %m; release and retry")));
    2223           0 :         errno = 0;
    2224           0 :         if (ReleaseLruFile())
    2225           0 :             goto TryAgain;
    2226           0 :         errno = save_errno;
    2227             :     }
    2228             : 
    2229           0 :     return NULL;
    2230             : }
    2231             : 
    2232             : /*
    2233             :  * Free an AllocateDesc of any type.
    2234             :  *
    2235             :  * The argument *must* point into the allocatedDescs[] array.
    2236             :  */
    2237             : static int
    2238        9406 : FreeDesc(AllocateDesc *desc)
    2239             : {
    2240             :     int         result;
    2241             : 
    2242             :     /* Close the underlying object */
    2243        9406 :     switch (desc->kind)
    2244             :     {
    2245             :         case AllocateDescFile:
    2246        3388 :             result = fclose(desc->desc.file);
    2247        3388 :             break;
    2248             :         case AllocateDescPipe:
    2249           1 :             result = pclose(desc->desc.file);
    2250           1 :             break;
    2251             :         case AllocateDescDir:
    2252         585 :             result = closedir(desc->desc.dir);
    2253         585 :             break;
    2254             :         case AllocateDescRawFD:
    2255        5432 :             result = close(desc->desc.fd);
    2256        5432 :             break;
    2257             :         default:
    2258           0 :             elog(ERROR, "AllocateDesc kind not recognized");
    2259             :             result = 0;         /* keep compiler quiet */
    2260             :             break;
    2261             :     }
    2262             : 
    2263             :     /* Compact storage in the allocatedDescs array */
    2264        9406 :     numAllocatedDescs--;
    2265        9406 :     *desc = allocatedDescs[numAllocatedDescs];
    2266             : 
    2267        9406 :     return result;
    2268             : }
    2269             : 
    2270             : /*
    2271             :  * Close a file returned by AllocateFile.
    2272             :  *
    2273             :  * Note we do not check fclose's return value --- it is up to the caller
    2274             :  * to handle close errors.
    2275             :  */
    2276             : int
    2277        3387 : FreeFile(FILE *file)
    2278             : {
    2279             :     int         i;
    2280             : 
    2281             :     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
    2282             : 
    2283             :     /* Remove file from list of allocated files, if it's present */
    2284        6774 :     for (i = numAllocatedDescs; --i >= 0;)
    2285             :     {
    2286        3387 :         AllocateDesc *desc = &allocatedDescs[i];
    2287             : 
    2288        3387 :         if (desc->kind == AllocateDescFile && desc->desc.file == file)
    2289        3387 :             return FreeDesc(desc);
    2290             :     }
    2291             : 
    2292             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2293           0 :     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
    2294             : 
    2295           0 :     return fclose(file);
    2296             : }
    2297             : 
    2298             : /*
    2299             :  * Close a file returned by OpenTransientFile.
    2300             :  *
    2301             :  * Note we do not check close's return value --- it is up to the caller
    2302             :  * to handle close errors.
    2303             :  */
    2304             : int
    2305        5432 : CloseTransientFile(int fd)
    2306             : {
    2307             :     int         i;
    2308             : 
    2309             :     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
    2310             : 
    2311             :     /* Remove fd from list of allocated files, if it's present */
    2312       10864 :     for (i = numAllocatedDescs; --i >= 0;)
    2313             :     {
    2314        5432 :         AllocateDesc *desc = &allocatedDescs[i];
    2315             : 
    2316        5432 :         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
    2317        5432 :             return FreeDesc(desc);
    2318             :     }
    2319             : 
    2320             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2321           0 :     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
    2322             : 
    2323           0 :     return close(fd);
    2324             : }
    2325             : 
    2326             : /*
    2327             :  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
    2328             :  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
    2329             :  * necessary to open the directory, and with closing it after an elog.
    2330             :  * When done, call FreeDir rather than closedir.
    2331             :  *
    2332             :  * Ideally this should be the *only* direct call of opendir() in the backend.
    2333             :  */
    2334             : DIR *
    2335         586 : AllocateDir(const char *dirname)
    2336             : {
    2337             :     DIR        *dir;
    2338             : 
    2339             :     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
    2340             :                numAllocatedDescs, dirname));
    2341             : 
    2342             :     /* Can we allocate another non-virtual FD? */
    2343         586 :     if (!reserveAllocatedDesc())
    2344           0 :         ereport(ERROR,
    2345             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2346             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
    2347             :                         maxAllocatedDescs, dirname)));
    2348             : 
    2349             :     /* Close excess kernel FDs. */
    2350         586 :     ReleaseLruFiles();
    2351             : 
    2352             : TryAgain:
    2353         586 :     if ((dir = opendir(dirname)) != NULL)
    2354             :     {
    2355         585 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2356             : 
    2357         585 :         desc->kind = AllocateDescDir;
    2358         585 :         desc->desc.dir = dir;
    2359         585 :         desc->create_subid = GetCurrentSubTransactionId();
    2360         585 :         numAllocatedDescs++;
    2361         585 :         return desc->desc.dir;
    2362             :     }
    2363             : 
    2364           1 :     if (errno == EMFILE || errno == ENFILE)
    2365             :     {
    2366           0 :         int         save_errno = errno;
    2367             : 
    2368           0 :         ereport(LOG,
    2369             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2370             :                  errmsg("out of file descriptors: %m; release and retry")));
    2371           0 :         errno = 0;
    2372           0 :         if (ReleaseLruFile())
    2373           0 :             goto TryAgain;
    2374           0 :         errno = save_errno;
    2375             :     }
    2376             : 
    2377           1 :     return NULL;
    2378             : }
    2379             : 
    2380             : /*
    2381             :  * Read a directory opened with AllocateDir, ereport'ing any error.
    2382             :  *
    2383             :  * This is easier to use than raw readdir() since it takes care of some
    2384             :  * otherwise rather tedious and error-prone manipulation of errno.  Also,
    2385             :  * if you are happy with a generic error message for AllocateDir failure,
    2386             :  * you can just do
    2387             :  *
    2388             :  *      dir = AllocateDir(path);
    2389             :  *      while ((dirent = ReadDir(dir, path)) != NULL)
    2390             :  *          process dirent;
    2391             :  *      FreeDir(dir);
    2392             :  *
    2393             :  * since a NULL dir parameter is taken as indicating AllocateDir failed.
    2394             :  * (Make sure errno hasn't been changed since AllocateDir if you use this
    2395             :  * shortcut.)
    2396             :  *
    2397             :  * The pathname passed to AllocateDir must be passed to this routine too,
    2398             :  * but it is only used for error reporting.
    2399             :  */
    2400             : struct dirent *
    2401       18292 : ReadDir(DIR *dir, const char *dirname)
    2402             : {
    2403       18292 :     return ReadDirExtended(dir, dirname, ERROR);
    2404             : }
    2405             : 
    2406             : /*
    2407             :  * Alternate version that allows caller to specify the elevel for any
    2408             :  * error report.  If elevel < ERROR, returns NULL on any error.
    2409             :  */
    2410             : static struct dirent *
    2411       18292 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
    2412             : {
    2413             :     struct dirent *dent;
    2414             : 
    2415             :     /* Give a generic message for AllocateDir failure, if caller didn't */
    2416       18292 :     if (dir == NULL)
    2417             :     {
    2418           0 :         ereport(elevel,
    2419             :                 (errcode_for_file_access(),
    2420             :                  errmsg("could not open directory \"%s\": %m",
    2421             :                         dirname)));
    2422           0 :         return NULL;
    2423             :     }
    2424             : 
    2425       18292 :     errno = 0;
    2426       18292 :     if ((dent = readdir(dir)) != NULL)
    2427       18150 :         return dent;
    2428             : 
    2429         142 :     if (errno)
    2430           0 :         ereport(elevel,
    2431             :                 (errcode_for_file_access(),
    2432             :                  errmsg("could not read directory \"%s\": %m",
    2433             :                         dirname)));
    2434         142 :     return NULL;
    2435             : }
    2436             : 
    2437             : /*
    2438             :  * Close a directory opened with AllocateDir.
    2439             :  *
    2440             :  * Note we do not check closedir's return value --- it is up to the caller
    2441             :  * to handle close errors.
    2442             :  */
    2443             : int
    2444         585 : FreeDir(DIR *dir)
    2445             : {
    2446             :     int         i;
    2447             : 
    2448             :     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
    2449             : 
    2450             :     /* Remove dir from list of allocated dirs, if it's present */
    2451        1170 :     for (i = numAllocatedDescs; --i >= 0;)
    2452             :     {
    2453         585 :         AllocateDesc *desc = &allocatedDescs[i];
    2454             : 
    2455         585 :         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
    2456         585 :             return FreeDesc(desc);
    2457             :     }
    2458             : 
    2459             :     /* Only get here if someone passes us a dir not in allocatedDescs */
    2460           0 :     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
    2461             : 
    2462           0 :     return closedir(dir);
    2463             : }
    2464             : 
    2465             : 
    2466             : /*
    2467             :  * Close a pipe stream returned by OpenPipeStream.
    2468             :  */
    2469             : int
    2470           1 : ClosePipeStream(FILE *file)
    2471             : {
    2472             :     int         i;
    2473             : 
    2474             :     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
    2475             : 
    2476             :     /* Remove file from list of allocated files, if it's present */
    2477           2 :     for (i = numAllocatedDescs; --i >= 0;)
    2478             :     {
    2479           1 :         AllocateDesc *desc = &allocatedDescs[i];
    2480             : 
    2481           1 :         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
    2482           1 :             return FreeDesc(desc);
    2483             :     }
    2484             : 
    2485             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2486           0 :     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
    2487             : 
    2488           0 :     return pclose(file);
    2489             : }
    2490             : 
    2491             : /*
    2492             :  * closeAllVfds
    2493             :  *
    2494             :  * Force all VFDs into the physically-closed state, so that the fewest
    2495             :  * possible number of kernel file descriptors are in use.  There is no
    2496             :  * change in the logical state of the VFDs.
    2497             :  */
    2498             : void
    2499           1 : closeAllVfds(void)
    2500             : {
    2501             :     Index       i;
    2502             : 
    2503           1 :     if (SizeVfdCache > 0)
    2504             :     {
    2505           1 :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    2506          32 :         for (i = 1; i < SizeVfdCache; i++)
    2507             :         {
    2508          31 :             if (!FileIsNotOpen(i))
    2509           5 :                 LruDelete(i);
    2510             :         }
    2511             :     }
    2512           1 : }
    2513             : 
    2514             : 
    2515             : /*
    2516             :  * SetTempTablespaces
    2517             :  *
    2518             :  * Define a list (actually an array) of OIDs of tablespaces to use for
    2519             :  * temporary files.  This list will be used until end of transaction,
    2520             :  * unless this function is called again before then.  It is caller's
    2521             :  * responsibility that the passed-in array has adequate lifespan (typically
    2522             :  * it'd be allocated in TopTransactionContext).
    2523             :  */
    2524             : void
    2525         338 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2526             : {
    2527         338 :     Assert(numSpaces >= 0);
    2528         338 :     tempTableSpaces = tableSpaces;
    2529         338 :     numTempTableSpaces = numSpaces;
    2530             : 
    2531             :     /*
    2532             :      * Select a random starting point in the list.  This is to minimize
    2533             :      * conflicts between backends that are most likely sharing the same list
    2534             :      * of temp tablespaces.  Note that if we create multiple temp files in the
    2535             :      * same transaction, we'll advance circularly through the list --- this
    2536             :      * ensures that large temporary sort files are nicely spread across all
    2537             :      * available tablespaces.
    2538             :      */
    2539         338 :     if (numSpaces > 1)
    2540           0 :         nextTempTableSpace = random() % numSpaces;
    2541             :     else
    2542         338 :         nextTempTableSpace = 0;
    2543         338 : }
    2544             : 
    2545             : /*
    2546             :  * TempTablespacesAreSet
    2547             :  *
    2548             :  * Returns TRUE if SetTempTablespaces has been called in current transaction.
    2549             :  * (This is just so that tablespaces.c doesn't need its own per-transaction
    2550             :  * state.)
    2551             :  */
    2552             : bool
    2553         427 : TempTablespacesAreSet(void)
    2554             : {
    2555         427 :     return (numTempTableSpaces >= 0);
    2556             : }
    2557             : 
    2558             : /*
    2559             :  * GetNextTempTableSpace
    2560             :  *
    2561             :  * Select the next temp tablespace to use.  A result of InvalidOid means
    2562             :  * to use the current database's default tablespace.
    2563             :  */
    2564             : Oid
    2565         417 : GetNextTempTableSpace(void)
    2566             : {
    2567         417 :     if (numTempTableSpaces > 0)
    2568             :     {
    2569             :         /* Advance nextTempTableSpace counter with wraparound */
    2570           0 :         if (++nextTempTableSpace >= numTempTableSpaces)
    2571           0 :             nextTempTableSpace = 0;
    2572           0 :         return tempTableSpaces[nextTempTableSpace];
    2573             :     }
    2574         417 :     return InvalidOid;
    2575             : }
    2576             : 
    2577             : 
    2578             : /*
    2579             :  * AtEOSubXact_Files
    2580             :  *
    2581             :  * Take care of subtransaction commit/abort.  At abort, we close temp files
    2582             :  * that the subtransaction may have opened.  At commit, we reassign the
    2583             :  * files that were opened to the parent subtransaction.
    2584             :  */
    2585             : void
    2586         372 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
    2587             :                   SubTransactionId parentSubid)
    2588             : {
    2589             :     Index       i;
    2590             : 
    2591         372 :     for (i = 0; i < numAllocatedDescs; i++)
    2592             :     {
    2593           0 :         if (allocatedDescs[i].create_subid == mySubid)
    2594             :         {
    2595           0 :             if (isCommit)
    2596           0 :                 allocatedDescs[i].create_subid = parentSubid;
    2597             :             else
    2598             :             {
    2599             :                 /* have to recheck the item after FreeDesc (ugly) */
    2600           0 :                 FreeDesc(&allocatedDescs[i--]);
    2601             :             }
    2602             :         }
    2603             :     }
    2604         372 : }
    2605             : 
    2606             : /*
    2607             :  * AtEOXact_Files
    2608             :  *
    2609             :  * This routine is called during transaction commit or abort (it doesn't
    2610             :  * particularly care which).  All still-open per-transaction temporary file
    2611             :  * VFDs are closed, which also causes the underlying files to be deleted
    2612             :  * (although they should've been closed already by the ResourceOwner
    2613             :  * cleanup). Furthermore, all "allocated" stdio files are closed. We also
    2614             :  * forget any transaction-local temp tablespace list.
    2615             :  */
    2616             : void
    2617       26218 : AtEOXact_Files(void)
    2618             : {
    2619       26218 :     CleanupTempFiles(false);
    2620       26218 :     tempTableSpaces = NULL;
    2621       26218 :     numTempTableSpaces = -1;
    2622       26218 : }
    2623             : 
    2624             : /*
    2625             :  * AtProcExit_Files
    2626             :  *
    2627             :  * on_proc_exit hook to clean up temp files during backend shutdown.
    2628             :  * Here, we want to clean up *all* temp files including interXact ones.
    2629             :  */
    2630             : static void
    2631         344 : AtProcExit_Files(int code, Datum arg)
    2632             : {
    2633         344 :     CleanupTempFiles(true);
    2634         344 : }
    2635             : 
    2636             : /*
    2637             :  * Close temporary files and delete their underlying files.
    2638             :  *
    2639             :  * isProcExit: if true, this is being called as the backend process is
    2640             :  * exiting. If that's the case, we should remove all temporary files; if
    2641             :  * that's not the case, we are being called for transaction commit/abort
    2642             :  * and should only remove transaction-local temp files.  In either case,
    2643             :  * also clean up "allocated" stdio files, dirs and fds.
    2644             :  */
    2645             : static void
    2646       26562 : CleanupTempFiles(bool isProcExit)
    2647             : {
    2648             :     Index       i;
    2649             : 
    2650             :     /*
    2651             :      * Careful here: at proc_exit we need extra cleanup, not just
    2652             :      * xact_temporary files.
    2653             :      */
    2654       26562 :     if (isProcExit || have_xact_temporary_files)
    2655             :     {
    2656         354 :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    2657       17893 :         for (i = 1; i < SizeVfdCache; i++)
    2658             :         {
    2659       17539 :             unsigned short fdstate = VfdCache[i].fdstate;
    2660             : 
    2661       17539 :             if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
    2662             :             {
    2663             :                 /*
    2664             :                  * If we're in the process of exiting a backend process, close
    2665             :                  * all temporary files. Otherwise, only close temporary files
    2666             :                  * local to the current transaction. They should be closed by
    2667             :                  * the ResourceOwner mechanism already, so this is just a
    2668             :                  * debugging cross-check.
    2669             :                  */
    2670           0 :                 if (isProcExit)
    2671           0 :                     FileClose(i);
    2672           0 :                 else if (fdstate & FD_XACT_TEMPORARY)
    2673             :                 {
    2674           0 :                     elog(WARNING,
    2675             :                          "temporary file %s not closed at end-of-transaction",
    2676             :                          VfdCache[i].fileName);
    2677           0 :                     FileClose(i);
    2678             :                 }
    2679             :             }
    2680             :         }
    2681             : 
    2682         354 :         have_xact_temporary_files = false;
    2683             :     }
    2684             : 
    2685             :     /* Clean up "allocated" stdio files, dirs and fds. */
    2686       53125 :     while (numAllocatedDescs > 0)
    2687           1 :         FreeDesc(&allocatedDescs[0]);
    2688       26562 : }
    2689             : 
    2690             : 
    2691             : /*
    2692             :  * Remove temporary and temporary relation files left over from a prior
    2693             :  * postmaster session
    2694             :  *
    2695             :  * This should be called during postmaster startup.  It will forcibly
    2696             :  * remove any leftover files created by OpenTemporaryFile and any leftover
    2697             :  * temporary relation files created by mdcreate.
    2698             :  *
    2699             :  * NOTE: we could, but don't, call this during a post-backend-crash restart
    2700             :  * cycle.  The argument for not doing it is that someone might want to examine
    2701             :  * the temp files for debugging purposes.  This does however mean that
    2702             :  * OpenTemporaryFile had better allow for collision with an existing temp
    2703             :  * file name.
    2704             :  */
    2705             : void
    2706           1 : RemovePgTempFiles(void)
    2707             : {
    2708             :     char        temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    2709             :     DIR        *spc_dir;
    2710             :     struct dirent *spc_de;
    2711             : 
    2712             :     /*
    2713             :      * First process temp files in pg_default ($PGDATA/base)
    2714             :      */
    2715           1 :     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
    2716           1 :     RemovePgTempFilesInDir(temp_path);
    2717           1 :     RemovePgTempRelationFiles("base");
    2718             : 
    2719             :     /*
    2720             :      * Cycle through temp directories for all non-default tablespaces.
    2721             :      */
    2722           1 :     spc_dir = AllocateDir("pg_tblspc");
    2723             : 
    2724           4 :     while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
    2725             :     {
    2726           3 :         if (strcmp(spc_de->d_name, ".") == 0 ||
    2727           1 :             strcmp(spc_de->d_name, "..") == 0)
    2728           2 :             continue;
    2729             : 
    2730           0 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
    2731           0 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
    2732           0 :         RemovePgTempFilesInDir(temp_path);
    2733             : 
    2734           0 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
    2735           0 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
    2736           0 :         RemovePgTempRelationFiles(temp_path);
    2737             :     }
    2738             : 
    2739           1 :     FreeDir(spc_dir);
    2740             : 
    2741             :     /*
    2742             :      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
    2743             :      * DataDir as well.
    2744             :      */
    2745             : #ifdef EXEC_BACKEND
    2746             :     RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
    2747             : #endif
    2748           1 : }
    2749             : 
    2750             : /* Process one pgsql_tmp directory for RemovePgTempFiles */
    2751             : static void
    2752           1 : RemovePgTempFilesInDir(const char *tmpdirname)
    2753             : {
    2754             :     DIR        *temp_dir;
    2755             :     struct dirent *temp_de;
    2756             :     char        rm_path[MAXPGPATH * 2];
    2757             : 
    2758           1 :     temp_dir = AllocateDir(tmpdirname);
    2759           1 :     if (temp_dir == NULL)
    2760             :     {
    2761             :         /* anything except ENOENT is fishy */
    2762           1 :         if (errno != ENOENT)
    2763           0 :             elog(LOG,
    2764             :                  "could not open temporary-files directory \"%s\": %m",
    2765             :                  tmpdirname);
    2766           2 :         return;
    2767             :     }
    2768             : 
    2769           0 :     while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
    2770             :     {
    2771           0 :         if (strcmp(temp_de->d_name, ".") == 0 ||
    2772           0 :             strcmp(temp_de->d_name, "..") == 0)
    2773           0 :             continue;
    2774             : 
    2775           0 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    2776           0 :                  tmpdirname, temp_de->d_name);
    2777             : 
    2778           0 :         if (strncmp(temp_de->d_name,
    2779             :                     PG_TEMP_FILE_PREFIX,
    2780             :                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
    2781           0 :             unlink(rm_path);    /* note we ignore any error */
    2782             :         else
    2783           0 :             elog(LOG,
    2784             :                  "unexpected file found in temporary-files directory: \"%s\"",
    2785             :                  rm_path);
    2786             :     }
    2787             : 
    2788           0 :     FreeDir(temp_dir);
    2789             : }
    2790             : 
    2791             : /* Process one tablespace directory, look for per-DB subdirectories */
    2792             : static void
    2793           1 : RemovePgTempRelationFiles(const char *tsdirname)
    2794             : {
    2795             :     DIR        *ts_dir;
    2796             :     struct dirent *de;
    2797             :     char        dbspace_path[MAXPGPATH * 2];
    2798             : 
    2799           1 :     ts_dir = AllocateDir(tsdirname);
    2800           1 :     if (ts_dir == NULL)
    2801             :     {
    2802             :         /* anything except ENOENT is fishy */
    2803           0 :         if (errno != ENOENT)
    2804           0 :             elog(LOG,
    2805             :                  "could not open tablespace directory \"%s\": %m",
    2806             :                  tsdirname);
    2807           1 :         return;
    2808             :     }
    2809             : 
    2810           7 :     while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
    2811             :     {
    2812           5 :         int         i = 0;
    2813             : 
    2814             :         /*
    2815             :          * We're only interested in the per-database directories, which have
    2816             :          * numeric names.  Note that this code will also (properly) ignore "."
    2817             :          * and "..".
    2818             :          */
    2819          21 :         while (isdigit((unsigned char) de->d_name[i]))
    2820          11 :             ++i;
    2821           5 :         if (de->d_name[i] != '\0' || i == 0)
    2822           2 :             continue;
    2823             : 
    2824           3 :         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
    2825           3 :                  tsdirname, de->d_name);
    2826           3 :         RemovePgTempRelationFilesInDbspace(dbspace_path);
    2827             :     }
    2828             : 
    2829           1 :     FreeDir(ts_dir);
    2830             : }
    2831             : 
    2832             : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
    2833             : static void
    2834           3 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
    2835             : {
    2836             :     DIR        *dbspace_dir;
    2837             :     struct dirent *de;
    2838             :     char        rm_path[MAXPGPATH * 2];
    2839             : 
    2840           3 :     dbspace_dir = AllocateDir(dbspacedirname);
    2841           3 :     if (dbspace_dir == NULL)
    2842             :     {
    2843             :         /* we just saw this directory, so it really ought to be there */
    2844           0 :         elog(LOG,
    2845             :              "could not open dbspace directory \"%s\": %m",
    2846             :              dbspacedirname);
    2847           3 :         return;
    2848             :     }
    2849             : 
    2850         891 :     while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
    2851             :     {
    2852         885 :         if (!looks_like_temp_rel_name(de->d_name))
    2853         885 :             continue;
    2854             : 
    2855           0 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    2856           0 :                  dbspacedirname, de->d_name);
    2857             : 
    2858           0 :         unlink(rm_path);        /* note we ignore any error */
    2859             :     }
    2860             : 
    2861           3 :     FreeDir(dbspace_dir);
    2862             : }
    2863             : 
    2864             : /* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
    2865             : static bool
    2866         885 : looks_like_temp_rel_name(const char *name)
    2867             : {
    2868             :     int         pos;
    2869             :     int         savepos;
    2870             : 
    2871             :     /* Must start with "t". */
    2872         885 :     if (name[0] != 't')
    2873         885 :         return false;
    2874             : 
    2875             :     /* Followed by a non-empty string of digits and then an underscore. */
    2876           0 :     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
    2877             :         ;
    2878           0 :     if (pos == 1 || name[pos] != '_')
    2879           0 :         return false;
    2880             : 
    2881             :     /* Followed by another nonempty string of digits. */
    2882           0 :     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
    2883             :         ;
    2884           0 :     if (savepos == pos)
    2885           0 :         return false;
    2886             : 
    2887             :     /* We might have _forkname or .segment or both. */
    2888           0 :     if (name[pos] == '_')
    2889             :     {
    2890           0 :         int         forkchar = forkname_chars(&name[pos + 1], NULL);
    2891             : 
    2892           0 :         if (forkchar <= 0)
    2893           0 :             return false;
    2894           0 :         pos += forkchar + 1;
    2895             :     }
    2896           0 :     if (name[pos] == '.')
    2897             :     {
    2898             :         int         segchar;
    2899             : 
    2900           0 :         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
    2901             :             ;
    2902           0 :         if (segchar <= 1)
    2903           0 :             return false;
    2904           0 :         pos += segchar;
    2905             :     }
    2906             : 
    2907             :     /* Now we should be at the end. */
    2908           0 :     if (name[pos] != '\0')
    2909           0 :         return false;
    2910           0 :     return true;
    2911             : }
    2912             : 
    2913             : 
    2914             : /*
    2915             :  * Issue fsync recursively on PGDATA and all its contents.
    2916             :  *
    2917             :  * We fsync regular files and directories wherever they are, but we
    2918             :  * follow symlinks only for pg_wal and immediately under pg_tblspc.
    2919             :  * Other symlinks are presumed to point at files we're not responsible
    2920             :  * for fsyncing, and might not have privileges to write at all.
    2921             :  *
    2922             :  * Errors are logged but not considered fatal; that's because this is used
    2923             :  * only during database startup, to deal with the possibility that there are
    2924             :  * issued-but-unsynced writes pending against the data directory.  We want to
    2925             :  * ensure that such writes reach disk before anything that's done in the new
    2926             :  * run.  However, aborting on error would result in failure to start for
    2927             :  * harmless cases such as read-only files in the data directory, and that's
    2928             :  * not good either.
    2929             :  *
    2930             :  * Note we assume we're chdir'd into PGDATA to begin with.
    2931             :  */
    2932             : void
    2933           0 : SyncDataDirectory(void)
    2934             : {
    2935             :     bool        xlog_is_symlink;
    2936             : 
    2937             :     /* We can skip this whole thing if fsync is disabled. */
    2938           0 :     if (!enableFsync)
    2939           0 :         return;
    2940             : 
    2941             :     /*
    2942             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
    2943             :      * because the first walkdir below will ignore it.
    2944             :      */
    2945           0 :     xlog_is_symlink = false;
    2946             : 
    2947             : #ifndef WIN32
    2948             :     {
    2949             :         struct stat st;
    2950             : 
    2951           0 :         if (lstat("pg_wal", &st) < 0)
    2952           0 :             ereport(LOG,
    2953             :                     (errcode_for_file_access(),
    2954             :                      errmsg("could not stat file \"%s\": %m",
    2955             :                             "pg_wal")));
    2956           0 :         else if (S_ISLNK(st.st_mode))
    2957           0 :             xlog_is_symlink = true;
    2958             :     }
    2959             : #else
    2960             :     if (pgwin32_is_junction("pg_wal"))
    2961             :         xlog_is_symlink = true;
    2962             : #endif
    2963             : 
    2964             :     /*
    2965             :      * If possible, hint to the kernel that we're soon going to fsync the data
    2966             :      * directory and its contents.  Errors in this step are even less
    2967             :      * interesting than normal, so log them only at DEBUG1.
    2968             :      */
    2969             : #ifdef PG_FLUSH_DATA_WORKS
    2970           0 :     walkdir(".", pre_sync_fname, false, DEBUG1);
    2971           0 :     if (xlog_is_symlink)
    2972           0 :         walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    2973           0 :     walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
    2974             : #endif
    2975             : 
    2976             :     /*
    2977             :      * Now we do the fsync()s in the same order.
    2978             :      *
    2979             :      * The main call ignores symlinks, so in addition to specially processing
    2980             :      * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
    2981             :      * process_symlinks = true.  Note that if there are any plain directories
    2982             :      * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
    2983             :      * so we don't worry about optimizing it.
    2984             :      */
    2985           0 :     walkdir(".", datadir_fsync_fname, false, LOG);
    2986           0 :     if (xlog_is_symlink)
    2987           0 :         walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    2988           0 :     walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
    2989             : }
    2990             : 
    2991             : /*
    2992             :  * walkdir: recursively walk a directory, applying the action to each
    2993             :  * regular file and directory (including the named directory itself).
    2994             :  *
    2995             :  * If process_symlinks is true, the action and recursion are also applied
    2996             :  * to regular files and directories that are pointed to by symlinks in the
    2997             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
    2998             :  * ignored in subdirectories, ie we intentionally don't pass down the
    2999             :  * process_symlinks flag to recursive calls.
    3000             :  *
    3001             :  * Errors are reported at level elevel, which might be ERROR or less.
    3002             :  *
    3003             :  * See also walkdir in initdb.c, which is a frontend version of this logic.
    3004             :  */
    3005             : static void
    3006           0 : walkdir(const char *path,
    3007             :         void (*action) (const char *fname, bool isdir, int elevel),
    3008             :         bool process_symlinks,
    3009             :         int elevel)
    3010             : {
    3011             :     DIR        *dir;
    3012             :     struct dirent *de;
    3013             : 
    3014           0 :     dir = AllocateDir(path);
    3015           0 :     if (dir == NULL)
    3016             :     {
    3017           0 :         ereport(elevel,
    3018             :                 (errcode_for_file_access(),
    3019             :                  errmsg("could not open directory \"%s\": %m", path)));
    3020           0 :         return;                 /* nothing to walk; note the final action on path is skipped too */
    3021             :     }
    3022             : 
    3023           0 :     while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
    3024             :     {
    3025             :         char        subpath[MAXPGPATH * 2]; /* holds path, a separator and the entry name */
    3026             :         struct stat fst;
    3027             :         int         sret;
    3028             : 
    3029           0 :         CHECK_FOR_INTERRUPTS();
    3030             : 
    3031           0 :         if (strcmp(de->d_name, ".") == 0 ||
    3032           0 :             strcmp(de->d_name, "..") == 0)
    3033           0 :             continue;           /* skip the self and parent entries */
    3034             : 
    3035           0 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);  /* NOTE(review): result unchecked, so truncation would go unnoticed */
    3036             : 
    3037           0 :         if (process_symlinks)
    3038           0 :             sret = stat(subpath, &fst);     /* stat() follows a symlink to its target */
    3039             :         else
    3040           0 :             sret = lstat(subpath, &fst);    /* lstat() examines the link itself */
    3041             : 
    3042           0 :         if (sret < 0)
    3043             :         {
    3044           0 :             ereport(elevel,
    3045             :                     (errcode_for_file_access(),
    3046             :                      errmsg("could not stat file \"%s\": %m", subpath)));
    3047           0 :             continue;           /* move on to the next entry */
    3048             :         }
    3049             : 
    3050           0 :         if (S_ISREG(fst.st_mode))
    3051           0 :             (*action) (subpath, false, elevel);
    3052           0 :         else if (S_ISDIR(fst.st_mode))  /* entries that are neither regular files nor directories are skipped */
    3053           0 :             walkdir(subpath, action, false, elevel);    /* recursion always passes process_symlinks = false */
    3054             :     }
    3055             : 
    3056           0 :     FreeDir(dir);               /* we ignore any error here */
    3057             : 
    3058             :     /*
    3059             :      * It's important to fsync the destination directory itself as individual
    3060             :      * file fsyncs don't guarantee that the directory entry for the file is
    3061             :      * synced.
    3062             :      */
    3063           0 :     (*action) (path, true, elevel);     /* isdir = true: applied after all of the directory's contents */
    3064             : }
    3065             : 
    3066             : 
    3067             : /*
    3068             :  * Hint to the OS that it should get ready to fsync() this file.
    3069             :  *
    3070             :  * Ignores errors trying to open unreadable files, and logs other errors at a
    3071             :  * caller-specified level.
    3072             :  */
    3073             : #ifdef PG_FLUSH_DATA_WORKS
    3074             : 
    3075             : static void
    3076           0 : pre_sync_fname(const char *fname, bool isdir, int elevel)
    3077             : {
    3078             :     int         fd;
    3079             : 
    3080             :     /* Don't try to flush directories, it'll likely just fail */
    3081           0 :     if (isdir)
    3082           0 :         return;
    3083             : 
    3084           0 :     fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);
    3085             : 
    3086           0 :     if (fd < 0)
    3087             :     {
    3088           0 :         if (errno == EACCES)
    3089           0 :             return;             /* unreadable file: skip it without logging */
    3090           0 :         ereport(elevel,
    3091             :                 (errcode_for_file_access(),
    3092             :                  errmsg("could not open file \"%s\": %m", fname)));
    3093           0 :         return;
    3094             :     }
    3095             : 
    3096             :     /*
    3097             :      * pg_flush_data() ignores errors, which is ok because this is only a
    3098             :      * hint.
    3099             :      */
    3100           0 :     pg_flush_data(fd, 0, 0);    /* NOTE(review): offset 0 and length 0 presumably mean "whole file" -- confirm against pg_flush_data() */
    3101             : 
    3102           0 :     (void) CloseTransientFile(fd);      /* close result is explicitly discarded */
    3103             : }
    3104             : 
    3105             : #endif                          /* PG_FLUSH_DATA_WORKS */
    3106             : 
    3107             : static void
    3108           0 : datadir_fsync_fname(const char *fname, bool isdir, int elevel)
    3109             : {
    3110             :     /*
    3111             :      * We want to silently ignore errors about unreadable files.  Pass that
    3112             :      * desire on to fsync_fname_ext().
    3113             :      */
    3114           0 :     fsync_fname_ext(fname, isdir, true, elevel);    /* ignore_perm = true; the int result is not propagated */
    3115           0 : }
    3116             : 
    3117             : /*
    3118             :  * fsync_fname_ext -- Try to fsync a file or directory
    3119             :  *
    3120             :  * If ignore_perm is true, ignore errors upon trying to open unreadable
    3121             :  * files. Logs other errors at a caller-specified level.
    3122             :  *
    3123             :  * Returns 0 if the operation succeeded or its failure was ignored, -1 otherwise.
    3124             :  */
    3125             : static int
    3126          99 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
    3127             : {
    3128             :     int         fd;
    3129             :     int         flags;
    3130             :     int         returncode;
    3131             : 
    3132             :     /*
    3133             :      * Some OSs require directories to be opened read-only whereas other
    3134             :      * systems don't allow us to fsync files opened read-only; so we need both
    3135             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
    3136             :      * not writable by our userid, but we assume that's OK.
    3137             :      */
    3138          99 :     flags = PG_BINARY;
    3139          99 :     if (!isdir)
    3140          44 :         flags |= O_RDWR;
    3141             :     else
    3142          55 :         flags |= O_RDONLY;
    3143             : 
    3144          99 :     fd = OpenTransientFile((char *) fname, flags, 0);
    3145             : 
    3146             :     /*
    3147             :      * Some OSs don't allow us to open directories at all (Windows returns
    3148             :      * EACCES), just ignore the error in that case.  If desired, also silently
    3149             :      * ignore errors about unreadable files.  Log others.
    3150             :      */
    3151          99 :     if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
    3152           0 :         return 0;               /* directory could not be opened: reported as success */
    3153          99 :     else if (fd < 0 && ignore_perm && errno == EACCES)
    3154           0 :         return 0;               /* unreadable file, and the caller asked to ignore that */
    3155          99 :     else if (fd < 0)
    3156             :     {
    3157           0 :         ereport(elevel,
    3158             :                 (errcode_for_file_access(),
    3159             :                  errmsg("could not open file \"%s\": %m", fname)));
    3160           0 :         return -1;
    3161             :     }
    3162             : 
    3163          99 :     returncode = pg_fsync(fd);
    3164             : 
    3165             :     /*
    3166             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
    3167             :      * those errors. Anything else needs to be logged.
    3168             :      */
    3169          99 :     if (returncode != 0 && !(isdir && errno == EBADF))  /* NOTE(review): only EBADF is tolerated for directories; check whether other errnos (e.g. EINVAL) need the same treatment */
    3170             :     {
    3171             :         int         save_errno;
    3172             : 
    3173             :         /* close file upon error, might not be in transaction context */
    3174           0 :         save_errno = errno;
    3175           0 :         (void) CloseTransientFile(fd);
    3176           0 :         errno = save_errno;     /* put back the fsync errno so the %m below reports it, not a close error */
    3177             : 
    3178           0 :         ereport(elevel,
    3179             :                 (errcode_for_file_access(),
    3180             :                  errmsg("could not fsync file \"%s\": %m", fname)));
    3181           0 :         return -1;
    3182             :     }
    3183             : 
    3184          99 :     (void) CloseTransientFile(fd);      /* close result is explicitly discarded */
    3185             : 
    3186          99 :     return 0;                   /* also reached when a directory fsync failed with EBADF */
    3187             : }
    3188             : 
    3189             : /*
    3190             :  * fsync_parent_path -- fsync the parent path of a file or directory
    3191             :  *
    3192             :  * This is aimed at making file operations persistent on disk in case of
    3193             :  * an OS crash or power failure.  Returns 0 if fsync_fname_ext() reports success, -1 otherwise.
    3194             :  */
    3195             : static int
    3196          22 : fsync_parent_path(const char *fname, int elevel)
    3197             : {
    3198             :     char        parentpath[MAXPGPATH];
    3199             : 
    3200          22 :     strlcpy(parentpath, fname, MAXPGPATH);  /* work on a local copy: fname is const and the next call edits in place */
    3201          22 :     get_parent_directory(parentpath);
    3202             : 
    3203             :     /*
    3204             :      * get_parent_directory() returns an empty string if the input argument is
    3205             :      * just a file name (see comments in path.c), so handle that as being the
    3206             :      * current directory.
    3207             :      */
    3208          22 :     if (strlen(parentpath) == 0)
    3209           0 :         strlcpy(parentpath, ".", MAXPGPATH);
    3210             : 
    3211          22 :     if (fsync_fname_ext(parentpath, true, false, elevel) != 0)  /* isdir = true, ignore_perm = false */
    3212           0 :         return -1;
    3213             : 
    3214          22 :     return 0;
    3215             : }

Generated by: LCOV version 1.11